import numpy as np
import re
import itertools
from collections import Counter

"""
Original taken from https://github.com/dennybritz/cnn-text-classification-tf
"""


def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " 's", string)
    string = re.sub(r"\'ve", " 've", string)
    string = re.sub(r"n\'t", " n't", string)
    string = re.sub(r"\'re", " 're", string)
    string = re.sub(r"\'d", " 'd", string)
    string = re.sub(r"\'ll", " 'll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " ( ", string)
    string = re.sub(r"\)", " ) ", string)
    string = re.sub(r"\?", " ? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()


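# Example of the cleaning above (illustrative addition, not part of the original file):
#   clean_str("It's great, isn't it?")  ->  "it 's great , is n't it ?"
# Contractions and punctuation become separate whitespace-delimited tokens and the
# text is lower-cased, so a plain split(" ") later recovers the token list.

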
def load_data_and_labels():
    """
    Loads MR polarity data from files, splits the data into words and generates labels.
    Returns split sentences and labels.
    """
    # Load data from files
    with open("./data/rt-polarity.pos", encoding='ISO-8859-1') as f:
        positive_examples = [s.strip() for s in f.readlines()]
    with open("./data/rt-polarity.neg", encoding='ISO-8859-1') as f:
        negative_examples = [s.strip() for s in f.readlines()]
    # Split by words
    x_text = positive_examples + negative_examples
    x_text = [clean_str(sent) for sent in x_text]
    x_text = [s.split(" ") for s in x_text]
    # Generate labels
    positive_labels = [[0, 1] for _ in positive_examples]
    negative_labels = [[1, 0] for _ in negative_examples]
    y = np.concatenate([positive_labels, negative_labels], 0)
    return [x_text, y]


def pad_sentences(sentences, padding_word="<PAD/>"):
    """
    Pads all sentences to the same length. The length is defined by the longest sentence.
    Returns padded sentences.
    """
    sequence_length = max(len(x) for x in sentences)
    padded_sentences = []
    for sentence in sentences:
        num_padding = sequence_length - len(sentence)
        new_sentence = sentence + [padding_word] * num_padding
        padded_sentences.append(new_sentence)
    return padded_sentences


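# Example (illustrative addition): pad_sentences([["a", "b"], ["c"]]) returns
# [["a", "b"], ["c", "<PAD/>"]] -- every sentence is right-padded to the length
# of the longest one.

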
def build_vocab(sentences):
    """
    Builds a vocabulary mapping from word to index based on the sentences.
    Returns vocabulary mapping and inverse vocabulary mapping.
    """
    # Build vocabulary
    word_counts = Counter(itertools.chain(*sentences))
    # Mapping from index to word
    vocabulary_inv = [x[0] for x in word_counts.most_common()]
    # Mapping from word to index
    vocabulary = {x: i for i, x in enumerate(vocabulary_inv)}
    return [vocabulary, vocabulary_inv]


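# Example (illustrative addition): for sentences [["the", "cat", "the"], ["the", "<PAD/>"]],
# the most frequent word gets the lowest index:
#   vocabulary     -> {"the": 0, "cat": 1, "<PAD/>": 2}
#   vocabulary_inv -> ["the", "cat", "<PAD/>"]
# (words with equal counts keep the order in which they were first encountered).

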
def build_input_data(sentences, labels, vocabulary):
    """
    Maps sentences and labels to vectors based on a vocabulary.
    """
    x = np.array([[vocabulary[word] for word in sentence] for sentence in sentences])
    y = np.array(labels)
    return [x, y]


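# Example (illustrative addition): with vocabulary {"the": 0, "cat": 1, "<PAD/>": 2},
#   build_input_data([["the", "cat", "<PAD/>"]], [[0, 1]], vocabulary)
# returns x = array([[0, 1, 2]]) and y = array([[0, 1]]).

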
def load_data():
    """
    Loads and preprocesses data for the MR dataset.
    Returns input vectors, labels, vocabulary, and inverse vocabulary.
    """
    # Load and preprocess data
    sentences, labels = load_data_and_labels()
    sentences_padded = pad_sentences(sentences)
    vocabulary, vocabulary_inv = build_vocab(sentences_padded)
    x, y = build_input_data(sentences_padded, labels, vocabulary)
    return [x, y, vocabulary, vocabulary_inv]


def batch_iter(data, batch_size, num_epochs):
    """
    Generates a batch iterator for a dataset.
    """
    data = np.array(data)
    data_size = len(data)
    # Ceiling division: the last batch may be smaller, but no empty batch is
    # yielded when data_size is an exact multiple of batch_size.
    num_batches_per_epoch = (data_size - 1) // batch_size + 1
    for epoch in range(num_epochs):
        # Shuffle the data at each epoch
        shuffle_indices = np.random.permutation(np.arange(data_size))
        shuffled_data = data[shuffle_indices]
        for batch_num in range(num_batches_per_epoch):
            start_index = batch_num * batch_size
            end_index = min((batch_num + 1) * batch_size, data_size)
            yield shuffled_data[start_index:end_index]
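

# The block below is an illustrative addition (not part of the original module)
# showing how these helpers are typically combined. It assumes the
# ./data/rt-polarity.pos and ./data/rt-polarity.neg files are available.
if __name__ == "__main__":
    x, y, vocabulary, vocabulary_inv = load_data()
    print("vocabulary size:", len(vocabulary))
    print("x shape:", x.shape, "y shape:", y.shape)
    # Iterate over shuffled mini-batches for one epoch; batching row indices
    # keeps x and y aligned without building a ragged array.
    for batch_indices in batch_iter(np.arange(len(y)), batch_size=64, num_epochs=1):
        x_batch, y_batch = x[batch_indices], y[batch_indices]
        # A training step would consume x_batch / y_batch here.
        break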