CoCalc -- utils.py

GitHub Repository: y33-j3T/Coursera-Deep-Learning
Path: blob/master/Natural Language Processing with Sequence Models/Week 1 - Neural Netowrks for Sentiment Analysis/utils.py
¹⁴⁴⁸¹ views
1
import string
2
import re
3
import os
4
import nltk
5
nltk.download('twitter_samples')
6
nltk.download('stopwords')
7
from nltk.tokenize import TweetTokenizer
8
from nltk.corpus import stopwords, twitter_samples 
9

10
tweet_tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
11

12
# Stop words are messy and not that compelling; 
13
# "very" and "not" are considered stop words, but they are obviously expressing sentiment
14

15
# The Porter stemmer stems "was" to "wa".  Seriously???
16

17
# I'm not sure we want to get into stop words
18
stopwords_english = stopwords.words('english')
19

20
# Also have my doubts about stemming...
21
from nltk.stem import PorterStemmer
22
stemmer = PorterStemmer()
23

24
def process_tweet(tweet):
    '''
    Clean, tokenize, filter and stem a single tweet.

    Input:
        tweet: a string containing a tweet
    Output:
        tweets_clean: a list of stemmed words from the tweet, with
            stop words and punctuation tokens removed
    '''
    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)
    # remove old style retweet text "RT" (only at the very start of the tweet)
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # remove hyperlinks; note the pattern also drops whatever follows the
    # link on the same line, plus any line breaks right after it
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    # keep the hashtag word, only drop the hash # sign
    tweet = re.sub(r'#', '', tweet)

    # Reuse the module-level tokenizer (same settings: lowercase, strip
    # handles, reduce lengthened words) instead of building one per call.
    tweet_tokens = tweet_tokenizer.tokenize(tweet)

    # Drop stop words and punctuation tokens, then stem what is left.
    # `word not in string.punctuation` is a substring test against the
    # punctuation string, exactly as in the original filter.
    return [
        stemmer.stem(word)
        for word in tweet_tokens
        if word not in stopwords_english and word not in string.punctuation
    ]
54

55

56
# let's not reuse variables
57
#all_positive_tweets = twitter_samples.strings('positive_tweets.json')
58
#all_negative_tweets = twitter_samples.strings('negative_tweets.json')
59

60
def load_tweets():
    '''
    Read the positive and negative tweet samples from the NLTK
    twitter_samples corpus.

    Output:
        a tuple (positive tweets, negative tweets), each being whatever
        twitter_samples.strings returns for the corresponding JSON file
    '''
    positives = twitter_samples.strings('positive_tweets.json')
    negatives = twitter_samples.strings('negative_tweets.json')
    return (positives, negatives)
64
    
65
# Layers have weights and a forward function.
66
# They create weights when layer.init is called and use them.
67
# remove this or make it optional 
68

69
class Layer(object):
    """Base class for layers.

    A layer holds a ``weights`` attribute (``None`` until a subclass sets
    it) and is callable: calling the layer delegates to ``forward``.
    """

    def __init__(self):
        # No weights exist until a subclass creates them.
        self.weights = None

    def forward(self, x):
        """Compute the layer's output for ``x``; subclasses must override."""
        raise NotImplementedError

    def init_weights_and_state(self, input_signature, random_key):
        """Hook for subclasses to create weights; the base version does nothing."""
        return None

    def init(self, input_signature, random_key):
        """Run the weight-initialization hook and return ``self.weights``."""
        self.init_weights_and_state(input_signature, random_key)
        weights = self.weights
        return weights

    def __call__(self, x):
        """Apply the layer to ``x`` by delegating to ``forward``."""
        result = self.forward(x)
        return result
86
Product

Resources

Company