CoCalc -- utils.py

GitHub Repository: y33-j3T/Coursera-Deep-Learning
Path: blob/master/Natural Language Processing with Classification and Vector Spaces/Week 2 - Sentiment Analysis with Naive Bayes/utils.py
14373 views

1
import re
2
import string
3

4
from nltk.corpus import stopwords
5
from nltk.stem import PorterStemmer
6
from nltk.tokenize import TweetTokenizer
7

8

9
def process_tweet(tweet):
    '''
    Clean, tokenize and stem a single tweet.

    Input:
        tweet: a string containing a tweet
    Output:
        tweets_clean: a list of words containing the processed tweet
                      (stemmed tokens with stopwords and punctuation removed)
    '''
    stemmer = PorterStemmer()
    # Build a set once so the per-token stopword test below is a hash lookup
    # instead of a scan over the whole stopword list.
    stopwords_english = set(stopwords.words('english'))

    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)
    # remove old style retweet text "RT" at the start of the tweet
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # remove hyperlinks; the match stops at the first whitespace so that any
    # text following the link on the same line is kept
    tweet = re.sub(r'https?://\S*', '', tweet)
    # remove hashtags: only the hash # sign is dropped, the word itself stays
    tweet = re.sub(r'#', '', tweet)

    # tokenize the cleaned tweet
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                               reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)

    # Drop stopwords and punctuation, then stem what is left.
    # NOTE: string.punctuation is a str, so `in` is a substring test here.
    tweets_clean = [
        stemmer.stem(word)
        for word in tweet_tokens
        if word not in stopwords_english and word not in string.punctuation
    ]

    return tweets_clean
42

43

44
def test_lookup(func):
    '''
    Sanity-check a lookup implementation against a small fixed table.

    Input:
        func: a callable invoked as func(freqs, word, label), where freqs is
              a dictionary keyed by (word, label) tuples
    Output:
        'SUCCESS!!' if func returns 12 for the pair ('happy', 1),
        'Failed Sanity Check!' otherwise
    '''
    freqs = {
        ('sad', 0): 4,
        ('happy', 1): 12,
        ('oppressed', 0): 7,
    }
    word = 'happy'
    label = 1
    expected = freqs[(word, label)]

    if func(freqs, word, label) == expected:
        return 'SUCCESS!!'
    return 'Failed Sanity Check!'
53

54

55
def lookup(freqs, word, label):
    '''
    Return how often a (word, label) pair occurs according to freqs.

    Input:
        freqs: a dictionary with the frequency of each pair (or tuple)
        word: the word to look up
        label: the label corresponding to the word
    Output:
        n: the number of times the word with its corresponding label appears,
           or 0 when the pair is not a key of freqs.
    '''
    # A single .get() call covers both the present and the missing case.
    return freqs.get((word, label), 0)
71

72

Product

Resources

Company