Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
y33-j3T
GitHub Repository: y33-j3T/Coursera-Deep-Learning
Path: blob/master/Natural Language Processing with Classification and Vector Spaces/Week 1 - Sentiment Analysis with Logistic Regression/utils.py
14375 views
1
import re
2
import string
3
import numpy as np
4
5
from nltk.corpus import stopwords
6
from nltk.stem import PorterStemmer
7
from nltk.tokenize import TweetTokenizer
8
9
10
def process_tweet(tweet):
    """Clean and tokenize a tweet into a list of stemmed words.

    Input:
        tweet: a string containing a tweet
    Output:
        tweets_clean: a list of words containing the processed tweet
    """
    stemmer = PorterStemmer()
    # A set gives O(1) membership tests; the original list scan was O(n)
    # per token.
    stopwords_english = set(stopwords.words('english'))
    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)
    # remove old style retweet text "RT"
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # remove hyperlinks
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    # remove only the hash # sign, keeping the tag word itself
    tweet = re.sub(r'#', '', tweet)
    # tokenize: lowercase everything, strip @handles, and shorten
    # elongated words (e.g. "loooove" -> "looove")
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                               reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)

    # Keep the stem of every token that is neither an English stopword
    # nor a single punctuation character.
    tweets_clean = [
        stemmer.stem(word)
        for word in tweet_tokens
        if word not in stopwords_english and word not in string.punctuation
    ]

    return tweets_clean
43
44
45
def build_freqs(tweets, ys):
    """Build frequencies of (word, sentiment) pairs over a tweet corpus.

    Input:
        tweets: a list of tweets
        ys: an m x 1 array with the sentiment label of each tweet
            (either 0 or 1)
    Output:
        freqs: a dictionary mapping each (word, sentiment) pair to its
            frequency
    """
    # Flatten the labels to a plain 1-D list so zip can pair each label
    # with its tweet. reshape(-1) is used instead of np.squeeze because
    # squeezing a single-label input yields a 0-d array whose .tolist()
    # is a bare scalar, and zip() would then raise TypeError. For m > 1
    # the result is identical to the squeeze-based version. This is
    # still a NOP-equivalent if ys is already a flat list.
    yslist = np.asarray(ys).reshape(-1).tolist()

    # Populate the dictionary by looping over all tweets and over all
    # processed words in each tweet.
    freqs = {}
    for y, tweet in zip(yslist, tweets):
        for word in process_tweet(tweet):
            pair = (word, y)
            # dict.get collapses the membership-test-then-lookup pattern
            # into a single lookup with a 0 default.
            freqs[pair] = freqs.get(pair, 0) + 1

    return freqs
72
73