📚 The CoCalc Library - books, templates and other resources
License: OTHER
from gensim.models import word2vec
from os.path import join, exists, split
import os
import numpy as np


def train_word2vec(sentence_matrix, vocabulary_inv,
                   num_features=300, min_word_count=1, context=10):
    """
    Trains, saves, loads Word2Vec model
    Returns initial weights for embedding layer.

    inputs:
    sentence_matrix  # int matrix: num_sentences x max_sentence_len
    vocabulary_inv   # list mapping word index -> word (inverse vocabulary)
    num_features     # Word vector dimensionality
    min_word_count   # Minimum word count
    context          # Context window size
    """
    model_dir = 'word2vec_models'
    model_name = "{:d}features_{:d}minwords_{:d}context".format(
        num_features, min_word_count, context)
    model_name = join(model_dir, model_name)
    if exists(model_name):
        embedding_model = word2vec.Word2Vec.load(model_name)
        print("Loading existing Word2Vec model '%s'" % split(model_name)[-1])
    else:
        # Set values for various parameters
        num_workers = 2      # Number of threads to run in parallel
        downsampling = 1e-3  # Downsample setting for frequent words

        # Map word indices back to words and train the model
        # (pre-4.0 gensim API; see the compatibility sketch below)
        print("Training Word2Vec model...")
        sentences = [[vocabulary_inv[w] for w in s] for s in sentence_matrix]
        embedding_model = word2vec.Word2Vec(sentences, workers=num_workers,
                                            size=num_features, min_count=min_word_count,
                                            window=context, sample=downsampling)

        # If we don't plan to train the model any further, calling
        # init_sims will make the model much more memory-efficient.
        embedding_model.init_sims(replace=True)

        # Save the model for later use; it can be reloaded with Word2Vec.load()
        if not exists(model_dir):
            os.mkdir(model_dir)
        print("Saving Word2Vec model '%s'" % split(model_name)[-1])
        embedding_model.save(model_name)

    # Build the embedding matrix, drawing random vectors for words the
    # Word2Vec model has never seen (e.g. dropped by min_word_count)
    embedding_weights = [np.array([embedding_model[w] if w in embedding_model
                                   else np.random.uniform(-0.25, 0.25, embedding_model.vector_size)
                                   for w in vocabulary_inv])]
    return embedding_weights


if __name__ == '__main__':
    import data_helpers
    print("Loading data...")
    x, _, _, vocabulary_inv = data_helpers.load_data()
    w = train_word2vec(x, vocabulary_inv)
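
# --- gensim >= 4.0 compatibility sketch (assumption: you run gensim 4.x) ---
# The training step above uses the pre-4.0 gensim API: in 4.0 `size` was
# renamed to `vector_size`, direct indexing (model[w], w in model) moved to
# `model.wv`, and init_sims() was deprecated. A rough equivalent, not a
# tested drop-in replacement:
def train_word2vec_gensim4(sentences, num_features=300, min_word_count=1, context=10):
    from gensim.models import word2vec
    model = word2vec.Word2Vec(sentences, workers=2,
                              vector_size=num_features, min_count=min_word_count,
                              window=context, sample=1e-3)
    # KeyedVectors: wv[w] and (w in wv) replace model[w] and (w in model)
    return model.wv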
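
# --- usage sketch: feeding the returned weights into an embedding layer ---
# train_word2vec returns a one-element list of arrays, which matches the
# shape Keras' `weights=` argument (and Layer.set_weights) expects. The
# surrounding model and the sequence_length parameter here are illustrative
# assumptions, not part of the script above:
def build_embedding_layer(x, vocabulary_inv, sequence_length):
    from keras.models import Sequential
    from keras.layers import Embedding
    weights = train_word2vec(x, vocabulary_inv)  # list: [vocab_size x num_features]
    model = Sequential()
    model.add(Embedding(len(vocabulary_inv), weights[0].shape[1],
                        input_length=sequence_length, weights=weights))
    return model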