📚 The CoCalc Library - books, templates and other resources
License: OTHER
import itertools
import re
from collections import Counter

import numpy as np

"""
Original taken from https://github.com/dennybritz/cnn-text-classification-tf
"""


def clean_str(string):
    """Tokenization/string cleaning for all datasets except for SST.

    Splits contraction suffixes ('s, 've, n't, 're, 'd, 'll) and
    punctuation into separate tokens, collapses runs of whitespace,
    and lowercases the result.

    Original taken from
    https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py

    Args:
        string: raw sentence text.

    Returns:
        The cleaned, lowercased sentence as a single string.
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " 's", string)
    string = re.sub(r"\'ve", " 've", string)
    string = re.sub(r"n\'t", " n't", string)
    string = re.sub(r"\'re", " 're", string)
    string = re.sub(r"\'d", " 'd", string)
    string = re.sub(r"\'ll", " 'll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    # Raw strings avoid the invalid "\(" string-literal escape warning;
    # the emitted tokens deliberately keep the backslash ("\(", "\)",
    # "\?"), byte-identical to the original upstream behavior.
    string = re.sub(r"\(", r" \( ", string)
    string = re.sub(r"\)", r" \) ", string)
    string = re.sub(r"\?", r" \? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()


def load_data_and_labels():
    """Loads MR polarity data from files, splits the data into words and
    generates labels.

    Returns:
        [x_text, y]: x_text is a list of tokenized sentences (lists of
        words, positives first); y is an (n_samples, 2) one-hot array
        where [0, 1] marks a positive example and [1, 0] a negative one.
    """
    # Context managers guarantee the file handles are closed (the
    # original leaked them via bare open()).
    with open("./data/rt-polarity.pos", encoding='ISO-8859-1') as f:
        positive_examples = [s.strip() for s in f]
    with open("./data/rt-polarity.neg", encoding='ISO-8859-1') as f:
        negative_examples = [s.strip() for s in f]
    # Split by words
    x_text = positive_examples + negative_examples
    x_text = [clean_str(sent).split(" ") for sent in x_text]
    # Generate labels
    positive_labels = [[0, 1] for _ in positive_examples]
    negative_labels = [[1, 0] for _ in negative_examples]
    y = np.concatenate([positive_labels, negative_labels], 0)
    return [x_text, y]


def pad_sentences(sentences, padding_word="<PAD/>"):
    """Pads all sentences to the same length.

    The length is defined by the longest sentence.

    Args:
        sentences: list of tokenized sentences (lists of words).
        padding_word: token appended to shorter sentences.

    Returns:
        A new list of padded sentences (inputs are not mutated).
    """
    if not sentences:
        # Guard: max() below would raise ValueError on an empty list.
        return []
    sequence_length = max(len(x) for x in sentences)
    return [
        sentence + [padding_word] * (sequence_length - len(sentence))
        for sentence in sentences
    ]


def build_vocab(sentences):
    """Builds a vocabulary mapping from word to index based on the sentences.

    Words are indexed by descending frequency (most common word gets
    index 0).

    Returns:
        [vocabulary, vocabulary_inv]: word -> index dict and the
        index -> word list.
    """
    word_counts = Counter(itertools.chain(*sentences))
    # Mapping from index to word
    vocabulary_inv = [x[0] for x in word_counts.most_common()]
    # Mapping from word to index
    vocabulary = {x: i for i, x in enumerate(vocabulary_inv)}
    return [vocabulary, vocabulary_inv]


def build_input_data(sentences, labels, vocabulary):
    """Maps sentences and labels to vectors based on a vocabulary.

    Args:
        sentences: equal-length lists of words (pad first!).
        labels: per-sentence label rows.
        vocabulary: word -> index mapping; raises KeyError on OOV words.

    Returns:
        [x, y]: integer index matrix and label array.
    """
    x = np.array([[vocabulary[word] for word in sentence] for sentence in sentences])
    y = np.array(labels)
    return [x, y]


def load_data():
    """Loads and preprocesses data for the MR dataset.

    Returns:
        [x, y, vocabulary, vocabulary_inv]: input vectors, labels,
        vocabulary, and inverse vocabulary.
    """
    sentences, labels = load_data_and_labels()
    sentences_padded = pad_sentences(sentences)
    vocabulary, vocabulary_inv = build_vocab(sentences_padded)
    x, y = build_input_data(sentences_padded, labels, vocabulary)
    return [x, y, vocabulary, vocabulary_inv]


def batch_iter(data, batch_size, num_epochs):
    """Generates a batch iterator for a dataset.

    Reshuffles the data at every epoch and yields batches of at most
    batch_size elements.

    Bug fix: the original computed int(len(data)/batch_size) + 1
    batches, which yielded a spurious EMPTY final batch whenever
    len(data) was an exact multiple of batch_size; ceiling division
    removes it.
    """
    data = np.array(data)
    data_size = len(data)
    # Ceiling division: no trailing empty batch on an even split.
    num_batches_per_epoch = (data_size + batch_size - 1) // batch_size
    for _ in range(num_epochs):
        # Shuffle the data at each epoch
        shuffle_indices = np.random.permutation(np.arange(data_size))
        shuffled_data = data[shuffle_indices]
        for batch_num in range(num_batches_per_epoch):
            start_index = batch_num * batch_size
            end_index = min(start_index + batch_size, data_size)
            yield shuffled_data[start_index:end_index]