Real-time collaboration for Jupyter Notebooks, Linux Terminals, LaTeX, VS Code, R IDE, and more,
all in one place. Commercial Alternative to JupyterHub.
Real-time collaboration for Jupyter Notebooks, Linux Terminals, LaTeX, VS Code, R IDE, and more,
all in one place. Commercial Alternative to JupyterHub.
Path: blob/master/Natural Language Processing with Sequence Models/Week 1 - Neural Netowrks for Sentiment Analysis/utils.py
Views: 13373
import string
import re
import os

import nltk
from nltk.corpus import stopwords, twitter_samples
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer

# Request the corpora that this module reads below (the stop-word list at
# import time, the tweet samples in load_tweets()).
nltk.download('twitter_samples')
nltk.download('stopwords')

# Shared tokenizer; process_tweet() uses this instance instead of building
# a new one for every tweet.
tweet_tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)

# NOTE: stop-word removal is a blunt tool for sentiment analysis: "very" and
# "not" are on the list even though they clearly carry sentiment.
stopwords_english = stopwords.words('english')

# NOTE: stemming is lossy as well; per the original author's observation the
# Porter stemmer reduces "was" to "wa".
stemmer = PorterStemmer()


def process_tweet(tweet):
    """Clean, tokenize and stem a single tweet.

    The steps are applied in this order:

    1. drop stock-market tickers such as ``$GE``;
    2. drop a leading old-style retweet marker (``RT`` followed by
       whitespace);
    3. drop hyperlinks;
    4. drop the ``#`` character (the hashtag word itself is kept);
    5. tokenize with the module-level ``tweet_tokenizer``;
    6. discard tokens found in ``stopwords_english`` or in
       ``string.punctuation``, and stem whatever is left.

    Input:
        tweet: a string containing a tweet
    Output:
        tweets_clean: a list of stemmed tokens from the processed tweet
    """
    # remove stock market tickers like $GE
    tweet = re.sub(r'\$\w*', '', tweet)
    # remove old style retweet text "RT" (only at the very start of the text)
    tweet = re.sub(r'^RT[\s]+', '', tweet)
    # remove hyperlinks
    # NOTE(review): `.*` is greedy, so everything from the URL up to the end
    # of that line is removed too, not just the link. Left unchanged on
    # purpose because callers may rely on the current output.
    tweet = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet)
    # remove hashtags: only the hash sign is stripped, the word stays
    tweet = re.sub(r'#', '', tweet)

    # The tokenizer configuration is identical for every call, so the shared
    # module-level instance is used rather than constructing a new one.
    tweet_tokens = tweet_tokenizer.tokenize(tweet)

    # `token not in string.punctuation` is a substring test against the
    # punctuation string, which is why multi-character tokens such as ":)"
    # are kept.
    tweets_clean = [
        stemmer.stem(token)
        for token in tweet_tokens
        if token not in stopwords_english and token not in string.punctuation
    ]
    return tweets_clean


def load_tweets():
    """Load the positive and negative tweets from the NLTK twitter samples.

    The tweets are read inside the function (rather than stored in
    module-level variables) so that callers do not end up sharing and
    reusing the same global lists.

    Output:
        all_positive_tweets: strings read from 'positive_tweets.json'
        all_negative_tweets: strings read from 'negative_tweets.json'
    """
    all_positive_tweets = twitter_samples.strings('positive_tweets.json')
    all_negative_tweets = twitter_samples.strings('negative_tweets.json')
    return all_positive_tweets, all_negative_tweets


# Layers have weights and a forward function.
# They create their weights when layer.init is called and use them afterwards.
# TODO: remove this class or make it optional.

class Layer(object):
    """Base class for layers.

    Subclasses are expected to override ``forward`` and, when they have
    weights, ``init_weights_and_state``.
    """

    def __init__(self):
        # No weights exist until init() has been called.
        self.weights = None

    def forward(self, x):
        """Compute the layer output for ``x``; must be overridden."""
        raise NotImplementedError

    def init_weights_and_state(self, input_signature, random_key):
        """Hook for creating ``self.weights``; the base version does nothing."""
        pass

    def init(self, input_signature, random_key):
        """Run the weight-initialization hook and return ``self.weights``."""
        self.init_weights_and_state(input_signature, random_key)
        return self.weights

    def __call__(self, x):
        """Calling the layer is shorthand for ``forward(x)``."""
        return self.forward(x)