Path: blob/master/Natural Language Processing with Classification and Vector Spaces/Week 4 - Machine Translation and Document Search/utils.py
14373 views
import re
import string

import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer


def process_tweet(tweet):
    """Clean, tokenize and stem a tweet.

    Input:
        tweet: a string containing a tweet
    Output:
        tweets_clean: a list of words containing the processed tweet
            (stemmed tokens, with stop words and punctuation removed)
    """
    stemmer = PorterStemmer()
    # Build a set once so each membership test in the filter below is O(1)
    # instead of a linear scan over the stop-word list.
    stopwords_english = set(stopwords.words('english'))

    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)
    # remove old style retweet text "RT"
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # remove hyperlinks
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    # remove hashtags: only the hash sign is dropped, the word itself is kept
    tweet = re.sub(r'#', '', tweet)

    # tokenize tweets
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                               reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)

    # string.punctuation is a str, so `word not in string.punctuation` is a
    # substring test: single punctuation marks are dropped, and so is any
    # multi-character token that appears contiguously in it (e.g. "()").
    tweets_clean = [
        stemmer.stem(word)
        for word in tweet_tokens
        if word not in stopwords_english and word not in string.punctuation
    ]

    return tweets_clean


def get_dict(file_name):
    """Return the English to French dictionary stored in a file.

    The file is read with ``pd.read_csv`` using a single space as the
    delimiter; the first column holds the English word and the second
    column the French word.

    Input:
        file_name: path of the space-delimited file to read
    Output:
        etof: a dict mapping each first-column value to the second-column
            value of the same row. If a first-column value occurs more than
            once, the last row wins.

    NOTE(review): read_csv is called with its default header handling, so
    the first line of the file is used as column names and is not added to
    the dictionary -- confirm this is intended for the data files in use.
    """
    my_file = pd.read_csv(file_name, delimiter=' ')
    # Select the columns by position with iloc. The previous per-row
    # `my_file.loc[i][0]` relied on integer keys falling back to positional
    # access on a string-labelled Series, which pandas has deprecated.
    english_words = my_file.iloc[:, 0]
    french_words = my_file.iloc[:, 1]
    etof = dict(zip(english_words, french_words))

    return etof


def cosine_similarity(A, B):
    """Compute the cosine similarity between two vectors.

    Input:
        A: a numpy array which corresponds to a word vector
        B: a numpy array which corresponds to a word vector
    Output:
        cos: the dot product of A and B divided by the product of their
            norms.

    No guard is applied for zero-norm inputs: in that case the division is
    by zero and the result is not a finite number.
    """
    dot = np.dot(A, B)
    norma = np.linalg.norm(A)
    normb = np.linalg.norm(B)
    cos = dot / (norma * normb)

    return cos