CoCalc -- utils

GitHub Repository: y33-j3T/Coursera-Deep-Learning
Path: blob/master/Natural Language Processing with Classification and Vector Spaces/Week 4 - Machine Translation and Document Search/utils_nb.py
14373 views
1
import re
2
import string
3

4
import numpy as np
5
import pandas as pd
6
from nltk.corpus import stopwords
7
from nltk.stem import PorterStemmer
8
from nltk.tokenize import TweetTokenizer
9
import matplotlib.pyplot as plt 
10

11

12
def process_tweet(tweet):
    '''
    Clean, tokenize and stem a single tweet.

    Input:
        tweet: a string containing a tweet
    Output:
        tweets_clean: a list of words containing the processed tweet
                      (stemmed tokens with stopwords and punctuation
                      tokens filtered out)
    '''
    stemmer = PorterStemmer()
    # Build the stopword lookup once as a set: membership is tested for
    # every token, and scanning the plain list each time is needlessly slow.
    stopwords_english = set(stopwords.words('english'))

    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)
    # remove old style retweet text "RT" (only at the very start of the tweet)
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # remove hyperlinks; `.*` is greedy, so everything from the URL to the
    # end of that line (plus the trailing line breaks) is dropped
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    # remove hashtags: only the hash # sign is stripped, the word is kept
    tweet = re.sub(r'#', '', tweet)

    # tokenize tweets: lower-case, strip @handles, shorten repeated characters
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                               reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)

    # `string.punctuation` is one string, so the second test is a substring
    # check against it; this is kept exactly as before so results are unchanged.
    tweets_clean = [
        stemmer.stem(word)  # stemming word
        for word in tweet_tokens
        if word not in stopwords_english  # remove stopwords
        and word not in string.punctuation  # remove punctuation
    ]

    return tweets_clean
45

46

47
def get_dict(file_name):
    """
    Return the English-to-French dictionary stored in a space-delimited file.

    Each row of the file holds an English word in the first column and its
    French translation in the second column.

    Input:
        file_name: path (or file-like object) accepted by `pd.read_csv`
    Output:
        etof: dict mapping each first-column entry to its second-column
              entry; when a key appears on several rows the last row wins

    NOTE: `pd.read_csv` is called with its default header handling, so the
    first line of the file is consumed as column names and does not end up
    in the dictionary. This matches the previous behaviour.
    """
    my_file = pd.read_csv(file_name, delimiter=' ')

    # Select the columns by position explicitly with `.iloc`. The earlier
    # `row[0]` / `row[1]` form relied on `[]` falling back to positional
    # access on a string-labelled Series, which pandas has deprecated.
    english = my_file.iloc[:, 0]
    french = my_file.iloc[:, 1]

    # zip walks both columns in row order, so later duplicates overwrite
    # earlier ones exactly as the row-by-row assignment did.
    etof = dict(zip(english, french))

    return etof
61

62

63
def cosine_similarity(A, B):
    '''
    Compute the cosine of the angle between two vectors.

    Input:
        A: a numpy array which corresponds to a word vector
        B: A numpy array which corresponds to a word vector
    Output:
        cos: numerical number representing the cosine similarity between A and B.

    If either vector has zero norm the division is 0/0 and NumPy yields
    nan (with a RuntimeWarning) instead of raising.
    '''
    dot = np.dot(A, B)
    norma = np.linalg.norm(A)
    normb = np.linalg.norm(B)
    cos = dot / (norma * normb)

    return cos
79

80
def plot_vectors(vectors, colors=('k', 'b', 'r', 'm', 'c'), axes=None, fname='image.svg', ax=None):
    """
    Plot arrows that represent vectors with pyplot.

    Every vector is drawn as an arrow starting at the origin.

    Input:
        vectors: sequence of vectors; only `vec[0][0]` (x) and `vec[0][1]`
                 (y) of each entry are read
        colors: sequence of matplotlib colour specs, used for both the face
                and the edge of each arrow; reused cyclically when there
                are more vectors than colours
        axes: optional pair `(x_axis, y_axis)` giving the half-width and
              half-height of the plotted region; when None, each is 2 plus
              the largest absolute component along that direction
        fname: file name the figure is saved to (only when `ax` is None)
        ax: optional object to draw on; when None a new figure is created,
            shown and saved to `fname`
    """
    x_dir = [vec[0][0] for vec in vectors]
    y_dir = [vec[0][1] for vec in vectors]

    # Identity checks: `== None` is evaluated element-wise when the argument
    # is a numpy array, which makes the `if` raise instead of branching.
    created_here = ax is None
    if created_here:
        fig, ax2 = plt.subplots()
    else:
        ax2 = ax

    if axes is None:
        x_axis = 2 + np.max(np.abs(x_dir))
        y_axis = 2 + np.max(np.abs(y_dir))
    else:
        x_axis = axes[0]
        y_axis = axes[1]

    # Symmetric limits so the origin sits in the middle of the plot.
    ax2.axis([-x_axis, x_axis, -y_axis, y_axis])

    for i, vec in enumerate(vectors):
        # Wrap around the palette rather than failing on the sixth vector.
        color = colors[i % len(colors)]
        ax2.arrow(0, 0, vec[0][0], vec[0][1],
                  head_width=0.05 * x_axis, head_length=0.05 * y_axis,
                  fc=color, ec=color)

    # Only show/save a figure this function created itself; a caller that
    # supplied `ax` stays in charge of its own figure.
    if created_here:
        plt.show()
        fig.savefig(fname)
111

112
Product

Resources

Company