import numpy as np
import re
import itertools
from collections import Counter

"""
Original taken from https://github.com/dennybritz/cnn-text-classification-tf
"""


def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " 's", string)
    string = re.sub(r"\'ve", " 've", string)
    string = re.sub(r"n\'t", " n't", string)
    string = re.sub(r"\'re", " 're", string)
    string = re.sub(r"\'d", " 'd", string)
    string = re.sub(r"\'ll", " 'll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " ( ", string)
    string = re.sub(r"\)", " ) ", string)
    string = re.sub(r"\?", " ? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()


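# Example of the cleaning above (illustrative addition, not part of the original file):
#   clean_str("It's great, isn't it?")  ->  "it 's great , is n't it ?"
# Contractions and punctuation become separate whitespace-delimited tokens and the
# text is lower-cased, so a plain split(" ") later recovers the token list.

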
def load_data_and_labels():
    """
    Loads MR polarity data from files, splits the data into words and generates labels.
    Returns split sentences and labels.
    """
    # Load data from files
    with open("./data/rt-polarity.pos", encoding='ISO-8859-1') as f:
        positive_examples = [s.strip() for s in f.readlines()]
    with open("./data/rt-polarity.neg", encoding='ISO-8859-1') as f:
        negative_examples = [s.strip() for s in f.readlines()]
    # Split by words
    x_text = positive_examples + negative_examples
    x_text = [clean_str(sent) for sent in x_text]
    x_text = [s.split(" ") for s in x_text]
    # Generate labels
    positive_labels = [[0, 1] for _ in positive_examples]
    negative_labels = [[1, 0] for _ in negative_examples]
    y = np.concatenate([positive_labels, negative_labels], 0)
    return [x_text, y]


def pad_sentences(sentences, padding_word="<PAD/>"):
    """
    Pads all sentences to the same length. The length is defined by the longest sentence.
    Returns padded sentences.
    """
    sequence_length = max(len(x) for x in sentences)
    padded_sentences = []
    for sentence in sentences:
        num_padding = sequence_length - len(sentence)
        new_sentence = sentence + [padding_word] * num_padding
        padded_sentences.append(new_sentence)
    return padded_sentences


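# Example (illustrative addition): pad_sentences([["a", "b"], ["c"]]) returns
# [["a", "b"], ["c", "<PAD/>"]] -- every sentence is right-padded to the length
# of the longest one.

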
def build_vocab(sentences):
    """
    Builds a vocabulary mapping from word to index based on the sentences.
    Returns vocabulary mapping and inverse vocabulary mapping.
    """
    # Build vocabulary
    word_counts = Counter(itertools.chain(*sentences))
    # Mapping from index to word
    vocabulary_inv = [x[0] for x in word_counts.most_common()]
    # Mapping from word to index
    vocabulary = {x: i for i, x in enumerate(vocabulary_inv)}
    return [vocabulary, vocabulary_inv]


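# Example (illustrative addition): for sentences [["the", "cat", "the"], ["the", "<PAD/>"]],
# the most frequent word gets the lowest index:
#   vocabulary     -> {"the": 0, "cat": 1, "<PAD/>": 2}
#   vocabulary_inv -> ["the", "cat", "<PAD/>"]
# (words with equal counts keep the order in which they were first encountered).

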
def build_input_data(sentences, labels, vocabulary):
    """
    Maps sentences and labels to vectors based on a vocabulary.
    """
    x = np.array([[vocabulary[word] for word in sentence] for sentence in sentences])
    y = np.array(labels)
    return [x, y]


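# Example (illustrative addition): with vocabulary {"the": 0, "cat": 1, "<PAD/>": 2},
#   build_input_data([["the", "cat", "<PAD/>"]], [[0, 1]], vocabulary)
# returns x = array([[0, 1, 2]]) and y = array([[0, 1]]).

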
def load_data():
    """
    Loads and preprocesses data for the MR dataset.
    Returns input vectors, labels, vocabulary, and inverse vocabulary.
    """
    # Load and preprocess data
    sentences, labels = load_data_and_labels()
    sentences_padded = pad_sentences(sentences)
    vocabulary, vocabulary_inv = build_vocab(sentences_padded)
    x, y = build_input_data(sentences_padded, labels, vocabulary)
    return [x, y, vocabulary, vocabulary_inv]


def batch_iter(data, batch_size, num_epochs):
    """
    Generates a batch iterator for a dataset.
    """
    data = np.array(data)
    data_size = len(data)
    # Ceiling division: the last batch may be smaller, but no empty batch is
    # yielded when data_size is an exact multiple of batch_size.
    num_batches_per_epoch = (data_size - 1) // batch_size + 1
    for epoch in range(num_epochs):
        # Shuffle the data at each epoch
        shuffle_indices = np.random.permutation(np.arange(data_size))
        shuffled_data = data[shuffle_indices]
        for batch_num in range(num_batches_per_epoch):
            start_index = batch_num * batch_size
            end_index = min((batch_num + 1) * batch_size, data_size)
            yield shuffled_data[start_index:end_index]
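

# The block below is an illustrative addition (not part of the original module)
# showing how these helpers are typically combined. It assumes the
# ./data/rt-polarity.pos and ./data/rt-polarity.neg files are available.
if __name__ == "__main__":
    x, y, vocabulary, vocabulary_inv = load_data()
    print("vocabulary size:", len(vocabulary))
    print("x shape:", x.shape, "y shape:", y.shape)
    # Iterate over shuffled mini-batches for one epoch; batching row indices
    # keeps x and y aligned without building a ragged array.
    for batch_indices in batch_iter(np.arange(len(y)), batch_size=64, num_epochs=1):
        x_batch, y_batch = x[batch_indices], y[batch_indices]
        # A training step would consume x_batch / y_batch here.
        break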