Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
y33-j3T
GitHub Repository: y33-j3T/Coursera-Deep-Learning
Path: blob/master/Natural Language Processing with Classification and Vector Spaces/Week 4 - Machine Translation and Document Search/utils.py
14373 views
1
import re
2
import string
3
4
import numpy as np
5
import pandas as pd
6
from nltk.corpus import stopwords
7
from nltk.stem import PorterStemmer
8
from nltk.tokenize import TweetTokenizer
9
10
11
def process_tweet(tweet):
    """Clean, tokenize and stem a single tweet.

    Input:
        tweet: a string containing a tweet
    Output:
        tweets_clean: a list of words containing the processed tweet

    The tweet is stripped of stock tickers, a leading "RT" marker,
    hyperlinks and '#' characters, then tokenized with NLTK's
    TweetTokenizer. Tokens that are English stopwords or punctuation are
    dropped, and the remaining tokens are stemmed with the Porter stemmer.
    """
    stemmer = PorterStemmer()
    # Build the stopword collection once as a set so that the per-token
    # membership test below is a hash lookup instead of a list scan.
    stopwords_english = frozenset(stopwords.words('english'))

    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)
    # remove old style retweet text "RT" (only at the very start of the tweet)
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # remove hyperlinks
    # NOTE: `.*` is greedy, so everything from "http(s)://" to the end of
    # that line (plus any following line breaks) is removed, not just the URL.
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    # remove hashtags: only the hash # sign is removed, the word is kept
    tweet = re.sub(r'#', '', tweet)

    # tokenize tweets
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                               reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)

    # `word not in string.punctuation` is a substring test against the
    # punctuation string, so it drops single punctuation characters as well
    # as any token that happens to be a contiguous run of that string.
    return [
        stemmer.stem(word)  # stemming word
        for word in tweet_tokens
        if word not in stopwords_english  # remove stopwords
        and word not in string.punctuation  # remove punctuation
    ]
44
45
46
def get_dict(file_name):
    """Return the English-to-French dictionary stored in a two-column file.

    Input:
        file_name: path of a space-delimited text file whose first column
            holds the English word and whose second column holds its
            French translation.
    Output:
        etof: a dict mapping each first-column entry to the second-column
            entry of the same row. If a first-column entry occurs on
            several rows, the last row wins.

    The file is read with pandas' default header handling, so its first
    line is consumed as column names and does not appear in the result.
    """
    my_file = pd.read_csv(file_name, delimiter=' ')
    # Select the columns by position. Indexing a row Series with an integer
    # (`my_file.loc[i][0]`) relies on a positional fallback that pandas has
    # deprecated, and it builds a new Series for every row.
    english_words = my_file.iloc[:, 0]
    french_words = my_file.iloc[:, 1]
    return dict(zip(english_words, french_words))
60
61
62
def cosine_similarity(A, B):
    """Compute the cosine similarity of two vectors.

    Input:
        A: a numpy array which corresponds to a word vector
        B: a numpy array which corresponds to a word vector
    Output:
        cos: numerical number representing the cosine similarity between
            A and B, i.e. their dot product divided by the product of
            their Euclidean norms.
    """
    inner_product = np.dot(A, B)
    norm_product = np.linalg.norm(A) * np.linalg.norm(B)
    return inner_product / norm_product
78
79