Path: blob/master/Natural Language Processing with Classification and Vector Spaces/Week 1 - Sentiment Analysis with Logistic Regression/utils.py
14375 views
import re
import string
import numpy as np

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer


def process_tweet(tweet):
    """Process tweet function.

    Strips tickers, the leading retweet marker, hyperlinks and hash signs
    from the tweet, tokenizes it, drops stopwords and punctuation tokens,
    and stems the remaining words.

    Input:
        tweet: a string containing a tweet
    Output:
        tweets_clean: a list of words containing the processed tweet

    """
    stemmer = PorterStemmer()
    # A set gives constant-time membership tests in the filter below;
    # the result is identical to testing against the original list.
    stopwords_english = set(stopwords.words('english'))

    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)
    # remove old style retweet text "RT" (only at the start of the tweet)
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # remove hyperlinks
    # NOTE: the greedy `.*` also removes any text that follows the URL on
    # the same line, not just the URL itself. Kept as-is so that existing
    # results computed with this function do not change.
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    # remove hashtags
    # only removing the hash # sign from the word
    tweet = re.sub(r'#', '', tweet)

    # tokenize tweets
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                               reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)

    # Keep tokens that are neither stopwords nor punctuation, then stem them.
    # `word not in string.punctuation` is a substring test against the
    # punctuation string, so multi-character tokens such as ":)" survive.
    tweets_clean = [
        stemmer.stem(word)
        for word in tweet_tokens
        if word not in stopwords_english and word not in string.punctuation
    ]

    return tweets_clean


def build_freqs(tweets, ys):
    """Build frequencies.

    Input:
        tweets: a list of tweets
        ys: an m x 1 array (or a plain list) with the sentiment label of
            each tweet (either 0 or 1)
    Output:
        freqs: a dictionary mapping each (word, sentiment) pair to its
            frequency
    """
    # Flatten the labels into a plain Python list so they can be zipped with
    # the tweets. np.ravel is used instead of np.squeeze because squeezing a
    # single-label input (e.g. shape (1, 1) or [1]) yields a 0-d array whose
    # tolist() is a bare scalar, which zip() cannot iterate over.
    yslist = np.ravel(ys).tolist()

    # Start with an empty dictionary and populate it by looping over all
    # tweets and over all processed words in each tweet.
    freqs = {}
    for y, tweet in zip(yslist, tweets):
        for word in process_tweet(tweet):
            pair = (word, y)
            freqs[pair] = freqs.get(pair, 0) + 1

    return freqs