# RoaldSchuring/sagemaker_functions.py

## sagemaker_functions.py
from sklearn.preprocessing import normalize

# open the vectors.txt file containing all the trained word embeddings, extracting the descriptors & embeddings
# the first line is a header whose second whitespace-separated field is the embedding dimension;
# every following line is a word followed by its vector components
with open('vectors.txt', 'r') as f:
    # one row per '\n'-separated piece of the file; this also counts the header, so the
    # matrix has spare rows beyond the data lines
    num_points = len(f.read().split('\n'))

index_to_word = []
with open('vectors.txt', 'r') as f:
    dim = int(f.readline().strip().split()[1])
    word_vecs = np.zeros((num_points, dim), dtype=float)
    for line in f:
        fields = line.split()
        if not fields:
            # a blank line carries no word; skip it instead of failing on fields[0]
            continue
        word = fields[0]
        # fill the next free row so that word_vecs[i] always belongs to index_to_word[i]
        vec = word_vecs[len(index_to_word)]
        for index, vec_val in enumerate(fields[1:]):
            vec[index] = float(vec_val)
        index_to_word.append(word)

# rescale the rows with sklearn's normalize (default norm), reusing the existing array where possible
word_vecs = normalize(word_vecs, copy=False, return_norm=False)

# pair each word with its row; zip stops after the last word, so the spare rows are dropped
names_vecs = list(zip(index_to_word, word_vecs))

# eliminate any words that are not in our overview of accepted wine descriptors (contained in the dataframe descriptor_mapping)
descriptor_mapping = pd.read_csv('s3://{}/descriptor_mapping.csv'.format(bucket)).set_index('raw descriptor')
# build the lookup set once, so each membership test is a hash lookup rather than a scan of the level_3 column
accepted_descriptors = set(descriptor_mapping['level_3'])
names_vecs_filtered = [n for n in names_vecs if n[0] in accepted_descriptors]

# write the filtered descriptors and their word vectors, ordered by word, to a csv file and copy that file to our S3 bucket
names_vecs_df = pd.DataFrame(names_vecs_filtered, columns=['word', 'vector'])
names_vecs_df = names_vecs_df.sort_values(by=['word'])
names_vecs_df.to_csv('word_vectors.csv')
s3_bucket = boto3.Session().resource('s3').Bucket(bucket)
s3_bucket.Object('word_vectors.csv').upload_file('word_vectors.csv')
	from sklearn.preprocessing import normalize

	# open the vectors.txt file containing all the trained word embeddings, extracting the descriptors & embeddings
	# NOTE(review): this repeats the embedding-loading step found earlier in this file; confirm both copies are needed
	# the first line is a header whose second whitespace-separated field is the embedding dimension;
	# every following line is a word followed by its vector components
	with open('vectors.txt', 'r') as f:
	    # one row per '\n'-separated piece of the file; this also counts the header, so the
	    # matrix has spare rows beyond the data lines
	    num_points = len(f.read().split('\n'))

	index_to_word = []
	with open('vectors.txt', 'r') as f:
	    dim = int(f.readline().strip().split()[1])
	    word_vecs = np.zeros((num_points, dim), dtype=float)
	    for line in f:
	        fields = line.split()
	        if not fields:
	            # a blank line carries no word; skip it instead of failing on fields[0]
	            continue
	        word = fields[0]
	        # fill the next free row so that word_vecs[i] always belongs to index_to_word[i]
	        vec = word_vecs[len(index_to_word)]
	        for index, vec_val in enumerate(fields[1:]):
	            vec[index] = float(vec_val)
	        index_to_word.append(word)

	# rescale the rows with sklearn's normalize (default norm), reusing the existing array where possible
	word_vecs = normalize(word_vecs, copy=False, return_norm=False)

	# pair each word with its row; zip stops after the last word, so the spare rows are dropped
	names_vecs = list(zip(index_to_word, word_vecs))

	# eliminate any words that are not in our overview of accepted wine descriptors (contained in the dataframe descriptor_mapping)
	descriptor_mapping = pd.read_csv('s3://{}/descriptor_mapping.csv'.format(bucket)).set_index('raw descriptor')
	# build the lookup set once, so each membership test is a hash lookup rather than a scan of the level_3 column
	accepted_descriptors = set(descriptor_mapping['level_3'])
	names_vecs_filtered = [n for n in names_vecs if n[0] in accepted_descriptors]

	# write the filtered descriptors and their word vectors, ordered by word, to a csv file and copy that file to our S3 bucket
	names_vecs_df = pd.DataFrame(names_vecs_filtered, columns=['word', 'vector'])
	names_vecs_df = names_vecs_df.sort_values(by=['word'])
	names_vecs_df.to_csv('word_vectors.csv')
	s3_bucket = boto3.Session().resource('s3').Bucket(bucket)
	s3_bucket.Object('word_vectors.csv').upload_file('word_vectors.csv')