Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
y33-j3T
GitHub Repository: y33-j3T/Coursera-Deep-Learning
Path: blob/master/Natural Language Processing with Classification and Vector Spaces/Week 3 - Vector Space Models/utils_vecs.py
14375 views
1
# Utilities used to translate English words to French.
2
3
import pandas as pd
4
from gensim.models import KeyedVectors
5
import nltk
6
import unicodedata
7
import string
8
9
# Load the French word embeddings (word2vec text format).
fr_embeddings = KeyedVectors.load_word2vec_format('wiki.multi.fr.vec')

# Read the capitals corpus through a context manager so the file handle is
# closed deterministically; `f` still holds the raw text afterwards.
with open('capitals.txt', 'r') as _capitals_file:
    f = _capitals_file.read()

# Distinct tokens that occur in the capitals corpus.
set_words = set(nltk.word_tokenize(f))
14
15
def load_translations():
    '''
    Build the English-to-French lookup tables for the capitals vocabulary.

    Reads the space-delimited word pairs in 'en-fr.txt', keeps the first
    French translation seen for every capitalized English word that occurs
    in the module-level ``set_words``, and then applies a fixed list of
    manual corrections.

    Returns:
        tuple: ``(en_to_fr, fr_to_vec)``. ``en_to_fr`` maps a capitalized
        English word to a French word; ``fr_to_vec`` maps a French word to
        the value looked up for it in the module-level ``fr_embeddings``.

    Raises:
        KeyError: if one of the hard-coded entries to delete was never added
            to ``fr_to_vec``. Lookups in ``fr_embeddings`` are not guarded
            either, so a missing word propagates that lookup's error.
    '''
    # NOTE(review): read_csv uses the file's first line as the header, so that
    # line is never treated as a word pair -- confirm the file has a header.
    dict_fr = pd.read_csv('en-fr.txt', delimiter=' ')

    en_to_fr = {}
    fr_to_vec = {}
    # Walk the first two columns by position; their labels depend on the file.
    for en, fr in zip(dict_fr.iloc[:, 0], dict_fr.iloc[:, 1]):
        # pandas represents missing cells as float NaN; only real strings can
        # be capitalized and looked up.
        if not isinstance(en, str):
            continue
        en = en.capitalize()
        # Keep only the first translation encountered for each English word.
        if en in set_words and en not in en_to_fr:
            en_to_fr[en] = fr
            fr_to_vec[fr] = fr_embeddings[fr]

    # Drop vectors of translations that are replaced below -- presumably the
    # dictionary maps these country names to demonyms; TODO confirm.
    for stale in ('syrienne', 'iranienne', 'malien',
                  'arménienne', 'chilien', 'équateur'):
        del fr_to_vec[stale]

    # Manual corrections, applied in order:
    # (English key or None, French key, word looked up in fr_embeddings).
    # 'grece' only gets a vector (taken from the accented 'grèce'); its
    # en_to_fr entry is left as loaded from the dictionary.
    corrections = (
        ('Chile', 'chili', 'chili'),
        ('Iran', 'iran', 'iran'),
        ('Turkey', 'turquie', 'turquie'),
        ('Syria', 'syrie', 'syrie'),
        ('Nigeria', 'nigeria', 'nigeria'),
        ('Mali', 'mali', 'mali'),
        (None, 'grece', 'grèce'),
        ('Armenia', 'arménie', 'arménie'),
        ('Ecuador', 'ecuador', 'ecuador'),
        ('Niger', 'niger', 'niger'),
    )
    for en_key, fr_key, embedding_word in corrections:
        if en_key is not None:
            en_to_fr[en_key] = fr_key
        fr_to_vec[fr_key] = fr_embeddings[embedding_word]

    return en_to_fr, fr_to_vec
59
60
def remove_accents(data):
    '''
    Reduce a string to lowercase, unaccented ASCII letters.

    The text is NFKD-normalized so that accented characters decompose into a
    base letter followed by combining marks. Every character that is not an
    ASCII letter is then dropped -- the combining marks, but also digits,
    whitespace and punctuation -- and the remainder is lowercased.

    Args:
        data (str): text to normalize.

    Returns:
        str: the lowercase ASCII letters of ``data``; may be empty.
    '''
    decomposed = unicodedata.normalize('NFKD', data)
    letters_only = ''.join(ch for ch in decomposed if ch in string.ascii_letters)
    return letters_only.lower()
62
63