CoCalc -- utils.py

GitHub Repository: y33-j3T/Coursera-Deep-Learning
Path: blob/master/Natural Language Processing with Classification and Vector Spaces/Week 4 - Machine Translation and Document Search/utils.py
¹⁴³⁷³ views
1
import re
2
import string
3

4
import numpy as np
5
import pandas as pd
6
from nltk.corpus import stopwords
7
from nltk.stem import PorterStemmer
8
from nltk.tokenize import TweetTokenizer
9

10

11
def process_tweet(tweet):
    '''
    Clean, tokenize and stem a tweet.

    Tickers, a leading "RT" marker, hyperlinks and '#' signs are removed
    from the raw text, the remainder is lower-cased and tokenized with
    TweetTokenizer (handles stripped, repeated characters shortened), and
    every token that is neither an English stopword nor punctuation is
    stemmed with the Porter stemmer.

    Input:
        tweet: a string containing a tweet
    Output:
        tweets_clean: a list of words containing the processed tweet

    '''
    stemmer = PorterStemmer()
    # a set makes the per-token stopword lookup below O(1)
    stopwords_english = set(stopwords.words('english'))
    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)
    # remove old style retweet text "RT"
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # remove hyperlinks: match only up to the next whitespace so that the
    # words following a URL on the same line are kept
    tweet = re.sub(r'https?://\S*', '', tweet)
    # remove hashtags
    # only removing the hash # sign from the word
    tweet = re.sub(r'#', '', tweet)
    # tokenize tweets
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                               reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)

    # NOTE: string.punctuation is a str, so `in` is a substring test; a
    # token is dropped when it occurs as a contiguous run inside it.
    tweets_clean = [
        stemmer.stem(word)  # stemming word
        for word in tweet_tokens
        if (word not in stopwords_english and  # remove stopwords
            word not in string.punctuation)  # remove punctuation
    ]

    return tweets_clean
44

45

46
def get_dict(file_name):
    """
    Return the English-to-French dictionary stored in a text file.

    The file is read with pandas using a single space as the delimiter;
    the first column holds the English word and the second column holds
    its French translation. When an English word appears on several
    lines, the translation from the last such line is kept.

    Input:
        file_name: path of the space-separated word-pair file
    Output:
        etof: dict mapping each first-column word to its second-column word
    """
    # NOTE(review): read_csv is called with its default header handling,
    # so the first line of the file becomes the column names and is not
    # added to the dictionary -- confirm this is intended for the data files.
    my_file = pd.read_csv(file_name, delimiter=' ')

    # Select the columns by position instead of indexing every row Series
    # with integer keys: that fallback is deprecated in pandas, and it
    # built a full row object twice per line.
    english_words = my_file.iloc[:, 0]
    french_words = my_file.iloc[:, 1]

    # the english to french dictionary to be returned
    etof = dict(zip(english_words, french_words))

    return etof
60

61

62
def cosine_similarity(A, B):
    '''
    Compute the cosine of the angle between two word vectors.

    Input:
        A: a numpy array which corresponds to a word vector
        B: A numpy array which corresponds to a word vector
    Output:
        cos: numerical number representing the cosine similarity between A and B.

    Note: if either vector has zero norm the division is 0/0, which NumPy
    evaluates to NaN rather than raising.
    '''
    dot = np.dot(A, B)
    norma = np.linalg.norm(A)
    normb = np.linalg.norm(B)
    # dot product normalised by the product of the two Euclidean norms
    cos = dot / (norma * normb)

    return cos
78

79
Product

Resources

Company