CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
y33-j3T

Real-time collaboration for Jupyter Notebooks, Linux Terminals, LaTeX, VS Code, R IDE, and more,
all in one place. Commercial Alternative to JupyterHub.

GitHub Repository: y33-j3T/Coursera-Deep-Learning
Path: blob/master/Natural Language Processing with Sequence Models/Week 1 - Neural Netowrks for Sentiment Analysis/utils.py
Views: 13373
1
import os
import re
import string

import nltk
from nltk.corpus import stopwords, twitter_samples
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer

# Fetch the NLTK resources this module relies on (runs at import time).
nltk.download('twitter_samples')
nltk.download('stopwords')

# Shared tokenizer: does not preserve case, strips @handles and reduces
# lengthened words (see the constructor flags).
tweet_tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)

# Stop words are a blunt instrument: "very" and "not" are on the list even
# though they clearly express sentiment. It is unclear whether we want to get
# into stop words at all.
stopwords_english = stopwords.words('english')

# Stemming is equally doubtful: the Porter stemmer reduces "was" to "wa".
stemmer = PorterStemmer()
23
24
def process_tweet(tweet):
    '''
    Clean, tokenize and stem a single tweet.

    Input:
        tweet: a string containing a tweet
    Output:
        tweets_clean: a list of stemmed tokens from the tweet; tokens that
            are in stopwords_english or that occur in string.punctuation
            are left out
    '''
    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)
    # remove old style retweet text "RT"
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # remove hyperlinks
    # NOTE(review): the greedy `.*` also removes any text that follows the URL
    # on the same line; kept as-is so existing outputs do not change.
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    # remove hashtags: only the hash # sign is removed, the word itself stays
    tweet = re.sub(r'#', '', tweet)
    # tokenize with the module-level tokenizer instead of building a new,
    # identically configured TweetTokenizer on every call
    tweet_tokens = tweet_tokenizer.tokenize(tweet)

    # NOTE(review): `word not in string.punctuation` is a substring test, so a
    # multi-character token is dropped only if it appears contiguously in
    # string.punctuation; kept as-is so existing outputs do not change.
    tweets_clean = [
        stemmer.stem(word)  # stemming word
        for word in tweet_tokens
        if word not in stopwords_english  # remove stopwords
        and word not in string.punctuation  # remove punctuation
    ]
    return tweets_clean
54
55
56
# let's not reuse variables
57
#all_positive_tweets = twitter_samples.strings('positive_tweets.json')
58
#all_negative_tweets = twitter_samples.strings('negative_tweets.json')
59
60
def load_tweets():
    """Read the positive and negative tweets from the twitter_samples corpus.

    Returns:
        A tuple ``(all_positive_tweets, all_negative_tweets)`` holding the
        results of ``twitter_samples.strings`` for 'positive_tweets.json'
        and 'negative_tweets.json', in that order.
    """
    corpus_files = ('positive_tweets.json', 'negative_tweets.json')
    positive, negative = (twitter_samples.strings(name) for name in corpus_files)
    return positive, negative
64
65
# Layers have weights and a forward function.
66
# They create weights when layer.initialize is called and use them.
67
# remove this or make it optional
68
69
class Layer:
    """Base class for layers.

    A layer stores its parameters in ``self.weights`` (``None`` until a
    subclass sets them) and computes its output in ``forward``.
    """

    def __init__(self):
        # No weights exist until a subclass creates them.
        self.weights = None

    def __call__(self, x):
        """Make the layer callable: ``layer(x)`` delegates to ``forward``."""
        return self.forward(x)

    def forward(self, x):
        """Compute the layer output for ``x``; subclasses must override."""
        raise NotImplementedError()

    def init_weights_and_state(self, input_signature, random_key):
        """Hook for subclasses to create their weights; a no-op here."""
        return None

    def init(self, input_signature, random_key):
        """Run the weight-initialization hook and return ``self.weights``."""
        self.init_weights_and_state(input_signature, random_key)
        return self.weights
86