Skip to content

Instantly share code, notes, and snippets.

@l1m2p3
Last active July 31, 2018 10:04
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save l1m2p3/fe2e355d5af19e17e5a21bcf356b3d45 to your computer and use it in GitHub Desktop.
Save l1m2p3/fe2e355d5af19e17e5a21bcf356b3d45 to your computer and use it in GitHub Desktop.
Functions for updating and accessing word vectors on DynamoDB. (Updated to use spaCy to find the token for each word; see https://spacy.io/usage/ for how to install spaCy.)
import boto3
import numpy
import pickle
import spacy
# Name of the DynamoDB table that every request in this module targets.
table_name = 'wordvec' # table name on DynamoDB
# Batch sizes specified by DynamoDB. See DynamoDB's doc for more details.
# put_words chunks its writes by write_batch_size and get_words chunks its
# reads by read_batch_size.
write_batch_size = 25
read_batch_size = 100
# DynamoDB client, created once at import time and shared by put_words and
# get_words.
client = boto3.client('dynamodb')
# spaCy pipeline loaded from the 'en' model. Calling it on a piece of text
# yields tokens, and word_to_put_req reads the word vector from the first one.
tokenizer = spacy.load('en')
# helper function to divide a list into consecutive sublists
def sublist(l, batch_size):
    """Split ``l`` into consecutive chunks of at most ``batch_size`` items.

    Order is preserved and the last chunk may be shorter than the others.
    An empty input yields an empty list.

    Args:
        l: any sliceable sequence with a length (list, tuple, str, ...).
        batch_size: maximum number of items per chunk; must be positive.

    Returns:
        A list of slices of ``l``.

    Raises:
        ValueError: if ``batch_size`` is zero or negative.
    """
    # A negative step would make range() empty and silently drop every item,
    # so reject non-positive sizes explicitly instead.
    if batch_size <= 0:
        raise ValueError(
            'batch_size must be a positive integer, got %r' % (batch_size,)
        )
    return [l[i:i + batch_size] for i in range(0, len(l), batch_size)]
# helper function to convert a word to a put request
def word_to_put_req(word):
    """Build the DynamoDB ``PutRequest`` entry for a single word.

    The word is passed through the module-level ``tokenizer`` and the vector
    of the first resulting token is stored as a DynamoDB list of numbers.

    Args:
        word: the word to store, either as text or as UTF-8 encoded bytes.

    Returns:
        A dict of the form
        ``{'PutRequest': {'Item': {'word': {'S': ...}, 'vector': {'L': [...]}}}}``
        suitable for the request list passed to ``batch_write_item``.

    NOTE(review): only the first token's vector is kept, so input that the
    tokenizer splits into several tokens is represented by its first token
    only. Input that produces no tokens at all (e.g. an empty string) will
    presumably fail on the ``[0]`` lookup -- confirm against spaCy.
    """
    # Decode only byte strings: text input is already what the tokenizer
    # expects, and decoding it again fails. This form behaves the same on
    # Python 2 (where ``bytes`` is ``str``) and Python 3.
    text = word.decode('utf-8') if isinstance(word, bytes) else word
    vector = tokenizer(text)[0].vector
    return {
        'PutRequest': {
            'Item': {
                'word': {
                    'S': text,
                },
                'vector': {
                    # The low-level client takes numbers as strings.
                    'L': [{'N': str(n)} for n in vector],
                },
            },
        },
    }
#
# upload lookup for word in `words`
#
def put_words(words):
    """Write the vector of every word in ``words`` to the DynamoDB table.

    Duplicate words are removed, the rest are converted with
    ``word_to_put_req`` and sent with ``batch_write_item`` in chunks of
    ``write_batch_size``. Items the service hands back as unprocessed are
    resubmitted, so a successful return means every item was accepted.

    Args:
        words: iterable of words to upload.

    Raises:
        RuntimeError: if some items are still unprocessed after the retry
            budget is exhausted.
    """
    import time  # local import: only needed for the retry back-off

    max_attempts = 8
    # request cannot contain duplicate keys. remove duplicates
    unique_words = list(set(words))
    for batch in sublist(unique_words, write_batch_size):
        pending = {table_name: [word_to_put_req(word) for word in batch]}
        attempt = 0
        while pending:
            response = client.batch_write_item(RequestItems=pending)
            # UnprocessedItems has the same shape as RequestItems, so it can
            # be sent back unchanged. Ignoring it would silently lose writes.
            pending = response.get('UnprocessedItems') or {}
            if not pending:
                break
            attempt += 1
            if attempt >= max_attempts:
                raise RuntimeError(
                    'batch_write_item left %d item(s) unprocessed after %d '
                    'attempts' % (len(pending.get(table_name, [])), attempt)
                )
            # Exponential back-off (capped) before retrying throttled items.
            time.sleep(min(0.05 * 2 ** attempt, 2.0))
#
# returns a lookup for word in `words`
# if a word is absent on DynamoDB, it won't be in the returned lookup's keys
#
def get_words(words):
    """Fetch the stored vectors for ``words`` from the DynamoDB table.

    Duplicate words are removed and the rest are requested with
    ``batch_get_item`` in chunks of ``read_batch_size``. Keys the service
    hands back as unprocessed are requested again, so a word is missing from
    the result only when it is absent from the table.

    Args:
        words: iterable of words to look up.

    Returns:
        A dict mapping each word found to its stored ``vector`` attribute in
        raw DynamoDB form: a list of ``{'N': '<number as string>'}`` dicts.

    Raises:
        RuntimeError: if some keys are still unprocessed after the retry
            budget is exhausted.
    """
    import time  # local import: only needed for the retry back-off

    max_attempts = 8
    # request cannot contain duplicate keys. remove duplicates
    words_no_dup = list(set(words))
    lookup = {}
    for batch in sublist(words_no_dup, read_batch_size):
        pending = {
            table_name: {
                'Keys': [{'word': {'S': word}} for word in batch],
            },
        }
        attempt = 0
        while pending:
            response = client.batch_get_item(RequestItems=pending)
            # Fill the lookup directly instead of rebuilding a list on every
            # batch; a partial response still contributes its items.
            for item in response.get('Responses', {}).get(table_name, []):
                lookup[item['word']['S']] = item['vector']['L']
            # UnprocessedKeys has the same shape as RequestItems, so it can
            # be sent back unchanged. Ignoring it would make throttled words
            # look like they are absent from the table.
            pending = response.get('UnprocessedKeys') or {}
            if not pending:
                break
            attempt += 1
            if attempt >= max_attempts:
                raise RuntimeError(
                    'batch_get_item left keys unprocessed after %d attempts'
                    % (attempt,)
                )
            # Exponential back-off (capped) before retrying throttled keys.
            time.sleep(min(0.05 * 2 ** attempt, 2.0))
    return lookup
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment