Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
y33-j3T
GitHub Repository: y33-j3T/Coursera-Deep-Learning
Path: blob/master/Natural Language Processing with Classification and Vector Spaces/Week 4 - Machine Translation and Document Search/utils_nb.py
14373 views
1
import re
2
import string
3
4
import numpy as np
5
import pandas as pd
6
from nltk.corpus import stopwords
7
from nltk.stem import PorterStemmer
8
from nltk.tokenize import TweetTokenizer
9
import matplotlib.pyplot as plt
10
11
12
def process_tweet(tweet):
    '''
    Clean, tokenize and stem a tweet.

    Input:
        tweet: a string containing a tweet
    Output:
        tweets_clean: a list of words containing the processed tweet
                      (lower-cased, stopwords and punctuation removed, stemmed)
    '''
    stemmer = PorterStemmer()
    # Build a set once so each token's stopword check is a hash lookup
    # instead of a linear scan over the whole NLTK word list.
    stopwords_english = set(stopwords.words('english'))

    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)
    # remove old style retweet text "RT"
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # remove hyperlinks
    # NOTE: `.*` runs to the end of the line, so any text following a URL on
    # the same line is removed together with the URL.
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    # remove hashtags: only the hash # sign is removed, the word is kept
    tweet = re.sub(r'#', '', tweet)

    # tokenize the tweet: lower-case, drop @handles, shorten repeated characters
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                               reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)

    # Keep tokens that are neither stopwords nor punctuation, then stem them.
    # `word not in string.punctuation` is a substring test against the
    # punctuation string, so a multi-character token is dropped only when it
    # appears there as a contiguous run.
    tweets_clean = [
        stemmer.stem(word)
        for word in tweet_tokens
        if word not in stopwords_english and word not in string.punctuation
    ]

    return tweets_clean
45
46
47
def get_dict(file_name):
    """
    Return the English-to-French dictionary stored in a space-delimited file.

    The first column of the file holds the English word and the second column
    its French translation.

    Input:
        file_name: path (or file-like object) accepted by `pandas.read_csv`
    Output:
        etof: dict mapping each first-column value to its second-column value;
              when a first-column value occurs more than once, the last row wins

    NOTE: the file is read with pandas' default header handling, so its first
    line is consumed as the column header and does not appear in the result.
    """
    my_file = pd.read_csv(file_name, delimiter=' ')

    # Select the columns by position with `iloc`: the column labels come from
    # the file's first line, so integer keys are positions here, not labels.
    english = my_file.iloc[:, 0]
    french = my_file.iloc[:, 1]

    # the english to french dictionary to be returned
    etof = dict(zip(english, french))

    return etof
61
62
63
def cosine_similarity(A, B):
    '''
    Compute the cosine similarity between two word vectors.

    Input:
        A: a numpy array which corresponds to a word vector
        B: a numpy array which corresponds to a word vector
    Output:
        cos: numerical number representing the cosine similarity between A and B.

    NOTE: no guard is applied for zero-norm inputs; the denominator is then 0
    and the result is whatever NumPy's division produces for that case.
    '''
    dot = np.dot(A, B)
    norma = np.linalg.norm(A)
    normb = np.linalg.norm(B)
    cos = dot / (norma * normb)

    return cos
79
80
def plot_vectors(vectors, colors=('k', 'b', 'r', 'm', 'c'), axes=None, fname='image.svg', ax=None):
    """
    Plot vectors as arrows starting at the origin with pyplot.

    Input:
        vectors: iterable of vectors; each one is read as `vec[0][0]` (x) and
                 `vec[0][1]` (y)
        colors: sequence of matplotlib color specs, used in order and reused
                cyclically when there are more vectors than colors
        axes: optional pair `(x_limit, y_limit)`; the plot spans
              [-x_limit, x_limit] x [-y_limit, y_limit]. When None, each limit
              is 2 plus the largest absolute component on that axis
        fname: file the figure is saved to; only used when `ax` is None
        ax: optional existing matplotlib axes to draw on. When None, a new
            figure is created, shown and saved to `fname`
    """
    # Read every vector once so the limits and the arrows use the same values.
    heads = [(vec[0][0], vec[0][1]) for vec in vectors]

    # `is None` rather than `== None`: comparing an array-like argument with
    # `==` is element-wise and cannot be used as an `if` condition.
    owns_figure = ax is None
    if owns_figure:
        fig, ax2 = plt.subplots()
    else:
        ax2 = ax

    if axes is None:
        x_axis = 2 + np.max(np.abs([x for x, _ in heads]))
        y_axis = 2 + np.max(np.abs([y for _, y in heads]))
    else:
        x_axis = axes[0]
        y_axis = axes[1]

    ax2.axis([-x_axis, x_axis, -y_axis, y_axis])

    for i, (x, y) in enumerate(heads):
        # Wrap around instead of failing when vectors outnumber colors.
        color = colors[i % len(colors)]
        ax2.arrow(0, 0, x, y, head_width=0.05 * x_axis,
                  head_length=0.05 * y_axis, fc=color, ec=color)

    # Only display and save a figure this function created itself.
    if owns_figure:
        plt.show()
        fig.savefig(fname)
111
112