CoCalc -- utils.py

GitHub Repository: y33-j3T/Coursera-Deep-Learning
Path: blob/master/Natural Language Processing with Classification and Vector Spaces/Week 1 - Sentiment Analysis with Logistic Regression/utils.py
¹⁴³⁷⁵ views
1
import re
2
import string
3
import numpy as np
4

5
from nltk.corpus import stopwords
6
from nltk.stem import PorterStemmer
7
from nltk.tokenize import TweetTokenizer
8

9

10
def process_tweet(tweet):
    """Process tweet function.

    Strips stock tickers, a leading "RT" marker, hyperlinks and '#' signs
    from the raw text, tokenizes the result with NLTK's TweetTokenizer,
    drops English stopwords and punctuation tokens, and stems what is left.

    Input:
        tweet: a string containing a tweet
    Output:
        tweets_clean: a list of words containing the processed tweet

    """
    stemmer = PorterStemmer()
    # Build a set once so that every per-token membership test below is a
    # hash lookup instead of a scan over the whole stopword collection.
    stopwords_english = set(stopwords.words('english'))

    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)
    # remove old style retweet text "RT" (only when it starts the tweet)
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # remove hyperlinks
    # NOTE(review): `.*` runs to the end of the line, so any text that
    # follows a URL on the same line is removed together with the URL.
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    # remove hashtags
    # only removing the hash # sign from the word
    tweet = re.sub(r'#', '', tweet)

    # tokenize tweets: tokens are lower-cased (preserve_case=False),
    # @handles are stripped and elongated character runs are shortened.
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                               reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)

    # string.punctuation is a str, so `in` is a substring test: single
    # punctuation characters (and any token that is a contiguous slice of
    # that string) are dropped, while tokens such as ':)' are kept.
    tweets_clean = [
        stemmer.stem(word)  # stemming word
        for word in tweet_tokens
        if word not in stopwords_english  # remove stopwords
        and word not in string.punctuation  # remove punctuation
    ]

    return tweets_clean
43

44

45
def build_freqs(tweets, ys):
    """Build frequencies.

    Input:
        tweets: a list of tweets
        ys: an m x 1 array (or a flat list) with the sentiment label of
            each tweet (either 0 or 1)
    Output:
        freqs: a dictionary mapping each (word, sentiment) pair to its
        frequency
    """
    # Flatten the labels into a plain 1-D list so they can be zipped with the
    # tweets. ravel() is used instead of squeeze() because squeezing an input
    # that holds a single label produces a 0-d array whose tolist() is a bare
    # scalar, which zip() cannot iterate. For m x 1 arrays and flat lists the
    # result is the same as before.
    yslist = np.asarray(ys).ravel().tolist()

    # Start with an empty dictionary and populate it by looping over all tweets
    # and over all processed words in each tweet.
    freqs = {}
    for y, tweet in zip(yslist, tweets):
        for word in process_tweet(tweet):
            pair = (word, y)
            # Missing pairs start from 0, so first and later sightings share
            # one code path.
            freqs[pair] = freqs.get(pair, 0) + 1

    return freqs
72

73
Product

Resources

Company