Path: blob/master/Natural Language Processing with Classification and Vector Spaces/Week 2 - Sentiment Analysis with Naive Bayes/utils.py
14373 views
import re
import string

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer


def process_tweet(tweet):
    '''
    Clean, tokenize and stem a single tweet.

    Input:
        tweet: a string containing a tweet
    Output:
        tweets_clean: a list of words containing the processed tweet
            (lower-cased, handles stripped, stopwords and punctuation
            removed, each remaining token stemmed)
    '''
    stemmer = PorterStemmer()
    # A set gives O(1) membership tests; the corpus call returns a list.
    stopwords_english = set(stopwords.words('english'))

    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)
    # remove old style retweet text "RT" (only at the very start)
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # remove hyperlinks; `.*` also drops whatever follows the URL on that line
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    # remove hashtags: only the hash sign is removed, the word is kept
    tweet = re.sub(r'#', '', tweet)

    # tokenize tweets
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                               reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)

    # string.punctuation is a str, so `in` is a substring test here:
    # single punctuation characters are dropped.
    tweets_clean = [
        stemmer.stem(word)
        for word in tweet_tokens
        if word not in stopwords_english and word not in string.punctuation
    ]

    return tweets_clean


def test_lookup(func):
    '''
    Sanity-check a lookup implementation against a small fixed table.

    Input:
        func: a callable taking (freqs, word, label) and returning a count
    Output:
        'SUCCESS!!' if func returns 12 for ('happy', 1),
        'Failed Sanity Check!' otherwise.
    '''
    freqs = {('sad', 0): 4,
             ('happy', 1): 12,
             ('oppressed', 0): 7}
    word = 'happy'
    label = 1
    if func(freqs, word, label) == 12:
        return 'SUCCESS!!'
    return 'Failed Sanity Check!'


def lookup(freqs, word, label):
    '''
    Input:
        freqs: a dictionary with the frequency of each pair (or tuple)
        word: the word to look up
        label: the label corresponding to the word
    Output:
        n: the number of times the word with its corresponding label appears
           (0 when the pair is not present in freqs).
    '''
    return freqs.get((word, label), 0)