Skip to content

Instantly share code, notes, and snippets.

@RoaldSchuring
Created July 9, 2019 02:08
Show Gist options
  • Save RoaldSchuring/7ddbdf3359ff1d0ed9b3fa2c6aafa746 to your computer and use it in GitHub Desktop.
Save RoaldSchuring/7ddbdf3359ff1d0ed9b3fa2c6aafa746 to your computer and use it in GitHub Desktop.
extracting_info_from_vectorstxt
from sklearn.preprocessing import normalize
# Open vectors.txt (the trained word embeddings) and extract each descriptor
# together with its embedding vector. The file is read in a single pass inside
# a `with` block so the handle is always closed.
index_to_word = []
vector_rows = []
with open("vectors.txt", "r") as f:
    # The second field of the header line is used as the embedding width.
    header = f.readline().split()
    if len(header) < 2:
        raise ValueError("vectors.txt: expected a header line with at least two fields")
    dim = int(header[1])
    for line in f:
        fields = line.split()
        if not fields:
            # Blank line (e.g. a stray trailing newline): nothing to parse.
            continue
        values = [float(vec_val) for vec_val in fields[1:]]
        # A row with fewer than `dim` values keeps zeros in the missing slots;
        # a row with more than `dim` values makes the assignment below fail.
        vec = np.zeros(dim, dtype=float)
        vec[:len(values)] = values
        index_to_word.append(fields[0])
        vector_rows.append(vec)

# Exactly one row per parsed word, so row i of word_vecs belongs to
# index_to_word[i] and no all-zero filler rows are allocated.
num_points = len(index_to_word)
if num_points:
    # Rescale the rows with sklearn's `normalize` (library default norm/axis).
    word_vecs = normalize(np.vstack(vector_rows), copy=False, return_norm=False)
else:
    # No embeddings after the header: keep an empty (0, dim) array rather
    # than handing zero samples to `normalize`.
    word_vecs = np.zeros((0, dim), dtype=float)
names_vecs = list(zip(index_to_word, word_vecs))
# Eliminate any words that are not in our overview of accepted wine descriptors
# (contained in the dataframe descriptor_mapping, read from the S3 bucket).
descriptor_mapping = pd.read_csv('s3://{}/descriptor_mapping.csv'.format(bucket)).set_index('raw descriptor')
# Build the lookup set once so every membership test is a hash lookup instead
# of rebuilding and rescanning the whole 'level_3' column per candidate word.
accepted_descriptors = set(descriptor_mapping['level_3'])
# Each entry of names_vecs is a (word, vector) pair; keep those whose word is accepted.
names_vecs_filtered = [n for n in names_vecs if n[0] in accepted_descriptors]
# Persist the accepted descriptor names and their word vectors: write them to
# a local CSV, then upload that file to our S3 bucket under the same key.
output_csv = 'word_vectors.csv'
names_vecs_df = pd.DataFrame(names_vecs_filtered, columns=['word', 'vector'])
names_vecs_df = names_vecs_df.sort_values(by=['word'])
# NOTE(review): the 'vector' cells hold array objects, so the CSV will contain
# their text form — confirm whatever reads this file parses that back.
names_vecs_df.to_csv(output_csv)
s3_bucket = boto3.Session().resource('s3').Bucket(bucket)
s3_bucket.Object(output_csv).upload_file(output_csv)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment