# Path: blob/master/Natural Language Processing with Classification and Vector Spaces/Week 4 - Machine Translation and Document Search/utils_nb.py
# 14373 views
import re
import string

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer


def process_tweet(tweet):
    """Clean, tokenize and stem a single tweet.

    Input:
        tweet: a string containing a tweet
    Output:
        tweets_clean: a list of stemmed, lower-cased tokens with stop words
            and punctuation tokens removed
    """
    stemmer = PorterStemmer()
    # A set gives O(1) membership tests in the filter below.
    stopwords_english = set(stopwords.words('english'))

    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)
    # remove old style retweet text "RT"
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # remove hyperlinks (everything from the URL to the end of its line)
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    # remove hashtags: only the hash sign is dropped, the word is kept
    tweet = re.sub(r'#', '', tweet)

    # tokenize: lower-case, strip @handles, shorten repeated characters
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                               reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)

    # NOTE: `word not in string.punctuation` is a substring test against the
    # punctuation string, so every single punctuation character is dropped
    # while multi-character tokens such as ':)' are kept unless they happen
    # to be a contiguous run of that string.
    return [
        stemmer.stem(word)
        for word in tweet_tokens
        if word not in stopwords_english and word not in string.punctuation
    ]


def get_dict(file_name):
    """Load a space-delimited two-column word file into a dictionary.

    Input:
        file_name: path (or file-like object) of a file whose first column
            holds English words and whose second column holds the
            corresponding French words, separated by a single space
    Output:
        etof: a dict mapping each first-column value to its second-column
            value; if a key repeats, the last row wins

    NOTE: the file is read with pandas' default header handling, so its
    first line is consumed as column names and does not appear in the
    returned dictionary.
    """
    pairs = pd.read_csv(file_name, delimiter=' ')
    # Positional column access; avoids integer-key lookups on label-indexed
    # row Series, which pandas has deprecated.
    return dict(zip(pairs.iloc[:, 0], pairs.iloc[:, 1]))


def cosine_similarity(A, B):
    """Compute the cosine similarity of two vectors.

    Input:
        A: a numpy array which corresponds to a word vector
        B: a numpy array which corresponds to a word vector
    Output:
        cos: dot(A, B) / (||A|| * ||B||)

    A zero-norm input is not guarded against: the division is performed
    as-is.
    """
    dot = np.dot(A, B)
    norma = np.linalg.norm(A)
    normb = np.linalg.norm(B)
    return dot / (norma * normb)


def plot_vectors(vectors, colors=('k', 'b', 'r', 'm', 'c'), axes=None,
                 fname='image.svg', ax=None):
    """Draw each vector as an arrow from the origin using pyplot.

    Input:
        vectors: a sequence of vectors, each indexed as vec[0][0] (x) and
            vec[0][1] (y), e.g. arrays of shape (1, 2)
        colors: colours used for the arrows; reused cyclically when there
            are more vectors than colours
        axes: optional pair (x_limit, y_limit); the plot spans
            [-x_limit, x_limit] by [-y_limit, y_limit]. When omitted, each
            limit is 2 plus the largest absolute component on that axis
        fname: file the figure is saved to when this function creates it
        ax: optional existing matplotlib axes to draw on. When omitted, a
            new figure is created, shown and saved to `fname`
    Output:
        None
    """
    # `is None` (not `== None`) so array-like arguments are not compared
    # element-wise.
    owns_figure = ax is None
    if owns_figure:
        fig, ax2 = plt.subplots()
    else:
        ax2 = ax

    if axes is None:
        x_axis = 2 + np.max(np.abs([vec[0][0] for vec in vectors]))
        y_axis = 2 + np.max(np.abs([vec[0][1] for vec in vectors]))
    else:
        x_axis = axes[0]
        y_axis = axes[1]

    ax2.axis([-x_axis, x_axis, -y_axis, y_axis])

    for i, vec in enumerate(vectors):
        # Cycle through the palette instead of failing past its end.
        color = colors[i % len(colors)]
        ax2.arrow(0, 0, vec[0][0], vec[0][1],
                  head_width=0.05 * x_axis, head_length=0.05 * y_axis,
                  fc=color, ec=color)

    if owns_figure:
        plt.show()
        fig.savefig(fname)