Path: blob/master/Natural Language Processing with Classification and Vector Spaces/Week 3 - Vector Space Models/utils_vecs.py
14375 views
# This is used to translate English to French.

import pandas as pd
from gensim.models import KeyedVectors
import nltk
import unicodedata
import string

# Loading in the French embeddings.

fr_embeddings = KeyedVectors.load_word2vec_format('wiki.multi.fr.vec')

# Read the capitals text through a context manager so the file handle is
# closed deterministically; `f` still holds the full file contents.
with open('capitals.txt', 'r') as _capitals_file:
    f = _capitals_file.read()
set_words = set(nltk.word_tokenize(f))

# French keys removed from the vector table after the dictionary pass.
# NOTE(review): these look like adjective forms picked up as the first
# translation of a country name -- confirm against en-fr.txt.
_STALE_FR_KEYS = (
    'syrienne',
    'iranienne',
    'malien',
    'arménienne',
    'chilien',
    'équateur',
)

# Manual corrections applied after the dictionary pass, in order.
# Each entry is (english_word_or_None, french_key, embedding_word).
# An English word of None means only the vector table is updated.
# NOTE(review): 'grece' is stored without its accent and gets no English
# mapping here -- presumably intentional, verify against callers.
_MANUAL_ENTRIES = (
    ('Chile', 'chili', 'chili'),
    ('Iran', 'iran', 'iran'),
    ('Turkey', 'turquie', 'turquie'),
    ('Syria', 'syrie', 'syrie'),
    ('Nigeria', 'nigeria', 'nigeria'),
    ('Mali', 'mali', 'mali'),
    (None, 'grece', 'grèce'),
    ('Armenia', 'arménie', 'arménie'),
    ('Ecuador', 'ecuador', 'ecuador'),
    ('Niger', 'niger', 'niger'),
)


def load_translations():
    """Build the English-to-French dictionary and the French word vectors.

    Reads space-delimited word pairs from ``en-fr.txt``. For every pair
    whose capitalised English word occurs in ``set_words``, the first
    French translation encountered is kept and its vector is looked up in
    ``fr_embeddings``. A fixed list of manual corrections is then applied:
    some French keys are deleted from the vector table and some
    English/French pairs are overwritten or added.

    Returns:
        tuple: ``(en_to_fr, fr_to_vec)`` where ``en_to_fr`` maps a
        capitalised English word to a French word and ``fr_to_vec`` maps a
        French word to its entry in ``fr_embeddings``.

    Raises:
        KeyError: if one of the French keys scheduled for deletion was not
            produced by the dictionary pass. Lookups of unknown words in
            ``fr_embeddings`` propagate whatever that object raises.
    """
    # NOTE(review): read_csv is called with its default header handling, so
    # the first line of en-fr.txt is consumed as column names rather than
    # as a word pair -- confirm that this is intended.
    dict_fr = pd.read_csv('en-fr.txt', delimiter=' ')

    en_to_fr = {}
    fr_to_vec = {}
    # Walk the first two columns positionally; .iloc avoids relying on the
    # deprecated positional fallback of Series.__getitem__.
    for en, fr in zip(dict_fr.iloc[:, 0], dict_fr.iloc[:, 1]):
        # Missing cells come back as float NaN; skip anything non-textual.
        if not isinstance(en, str):
            continue
        en = en.capitalize()
        # Keep only the first translation seen for each English word.
        if en in set_words and en not in en_to_fr:
            en_to_fr[en] = fr
            fr_to_vec[fr] = fr_embeddings[fr]

    # Drop the unwanted French forms; a missing key raises KeyError.
    for stale_key in _STALE_FR_KEYS:
        del fr_to_vec[stale_key]

    # Apply the hand-written corrections in their original order.
    for english, french, embedding_word in _MANUAL_ENTRIES:
        if english is not None:
            en_to_fr[english] = french
        fr_to_vec[french] = fr_embeddings[embedding_word]

    return en_to_fr, fr_to_vec


def remove_accents(data):
    """Return ``data`` reduced to lowercase ASCII letters.

    The string is NFKD-normalised, which splits accented characters into a
    base letter plus combining marks. Every character that is not an ASCII
    letter (combining marks, digits, whitespace, punctuation) is then
    dropped and the remainder is lowercased.

    Args:
        data: the text to normalise.

    Returns:
        str: the lowercase ASCII letters that remain.
    """
    decomposed = unicodedata.normalize('NFKD', data)
    return ''.join(ch for ch in decomposed if ch in string.ascii_letters).lower()