Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
y33-j3T
GitHub Repository: y33-j3T/Coursera-Deep-Learning
Path: blob/master/Natural Language Processing with Classification and Vector Spaces/Week 2 - Sentiment Analysis with Naive Bayes/utils.py
14373 views
1
import re
2
import string
3
4
from nltk.corpus import stopwords
5
from nltk.stem import PorterStemmer
6
from nltk.tokenize import TweetTokenizer
7
8
9
def process_tweet(tweet):
    '''
    Clean a tweet with regex substitutions, tokenize it, drop stopwords and
    punctuation tokens, and stem what remains.

    Input:
        tweet: a string containing a tweet
    Output:
        tweets_clean: a list of words containing the processed tweet

    '''
    stemmer = PorterStemmer()
    # Build a set once so each per-token membership test is a hash lookup
    # instead of a scan over the whole stopword collection.
    stopwords_english = set(stopwords.words('english'))

    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)
    # remove old style retweet text "RT" (only at the very start of the text)
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # remove hyperlinks
    # NOTE(review): `.*` runs to the end of the line, so any text that follows
    # a URL on the same line is removed as well. Left unchanged on purpose:
    # results computed downstream may depend on this exact cleaning.
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    # remove hashtags: only the hash # sign is removed, the word is kept
    tweet = re.sub(r'#', '', tweet)

    # tokenize tweets
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                               reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)

    # `word not in string.punctuation` is a substring test against the
    # punctuation string, so single punctuation characters are dropped.
    tweets_clean = [
        stemmer.stem(word)  # stemming word
        for word in tweet_tokens
        if word not in stopwords_english  # remove stopwords
        and word not in string.punctuation  # remove punctuation
    ]

    return tweets_clean
42
43
44
def test_lookup(func):
    '''
    Sanity-check a frequency lookup function against a small fixed table.

    Input:
        func: a callable invoked as func(freqs, word, label), where freqs is
              a dictionary keyed by (word, label) tuples
    Output:
        'SUCCESS!!' if func returns 12 for the pair ('happy', 1),
        otherwise 'Failed Sanity Check!'
    '''
    freqs = {
        ('sad', 0): 4,
        ('happy', 1): 12,
        ('oppressed', 0): 7,
    }
    word = 'happy'
    label = 1
    if func(freqs, word, label) == 12:
        return 'SUCCESS!!'
    return 'Failed Sanity Check!'


# The name matches pytest's default ``test_*`` pattern, but this is a helper
# that needs a ``func`` argument. Opt out of collection so a test module that
# imports it does not fail with "fixture 'func' not found".
test_lookup.__test__ = False
53
54
55
def lookup(freqs, word, label):
    '''
    Return how many times a (word, label) pair was counted.

    Input:
        freqs: a dictionary with the frequency of each pair (or tuple)
        word: the word to look up
        label: the label corresponding to the word
    Output:
        n: the number of times the word with its corresponding label appears;
           0 when the pair is not a key of freqs.
    '''
    # Single lookup with a default instead of a membership test plus indexing.
    return freqs.get((word, label), 0)
71
72