Skip to content

Instantly share code, notes, and snippets.

@prayagupa
Last active November 19, 2017 04:12
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save prayagupa/2cbda001d0ca132b9ad63ccb540aaa27 to your computer and use it in GitHub Desktop.
Save prayagupa/2cbda001d0ca132b9ad63ccb540aaa27 to your computer and use it in GitHub Desktop.
python
# http://boto3.readthedocs.io/en/latest/reference/services/kinesis.html
import boto3
import json

client = boto3.client('kinesis')

# Create a stream with 150 shards, then list streams to confirm.
create_stream = client.create_stream(StreamName='GregorSamsa', ShardCount=150)
streams = client.list_streams(Limit=100)


class SensorEvent(object):
    """A simple event carrying a sensor id (used as the partition key) and a payload."""
    def __init__(self, sensorId, data):
        self.sensorId = sensorId
        self.data = data


# Produce: each record is routed to a shard by the MD5 hash of its PartitionKey.
for sensorEvent in [SensorEvent("1", "something happened"), SensorEvent("2", "something again happened")]:
    client.put_record(StreamName='GregorSamsa', Data=json.dumps(sensorEvent.__dict__), PartitionKey=sensorEvent.sensorId)

# Shard routing example: toInt(MD5 of "1") = 261578874264819908609102035485573088411
#
# Sample put_record responses:
#{'SequenceNumber': '49574391176947112863656757427907696591655425940803028786', 'ResponseMetadata': {'HTTPHeaders': {'content-length': '110', 'x-amzn-requestid': 'fd00a7f5-39e1-20f6-acc5-a59bd6ca5be1', 'content-type': 'application/x-amz-json-1.1', 'server': 'Apache-Coyote/1.1', 'date': 'Wed, 21 Jun 2017 23:24:30 GMT', 'x-amz-id-2': 'JYuozovloYQiCyXDxAhmTTY29AHhGIlqiC2zGsj/svtCck2nDe/HS785SY0vdr4OuvGvL4dTwxCvRfMLuv9Wa0YfdJ2n4NUZ'}, 'RetryAttempts': 0, 'RequestId': 'fd00a7f5-39e1-20f6-acc5-a59bd6ca5be1', 'HTTPStatusCode': 200}, 'ShardId': 'shardId-000000000115'}
#{'SequenceNumber': '49574391176991714354053818674191976954020337292989695826', 'ResponseMetadata': {'HTTPHeaders': {'content-length': '110', 'x-amzn-requestid': 'e376b52e-772f-e5d3-b2b3-b74098049ec4', 'content-type': 'application/x-amz-json-1.1', 'server': 'Apache-Coyote/1.1', 'date': 'Wed, 21 Jun 2017 23:24:30 GMT', 'x-amz-id-2': 'C75YgF1cKYIgQc1lSYZvmsdDvPhc+71qDAacRjFGbAtMpTLdZaENtK9lGew0pc09ZiSkfRG5raOHXLNTItyfSBPomKCc/Vnp'}, 'RetryAttempts': 0, 'RequestId': 'e376b52e-772f-e5d3-b2b3-b74098049ec4', 'HTTPStatusCode': 200}, 'ShardId': 'shardId-000000000117'}

# Consume: read from the oldest available record (TRIM_HORIZON) of one shard.
cursor = client.get_shard_iterator(StreamName='GregorSamsa', ShardId='shardId-00000', ShardIteratorType='TRIM_HORIZON')
client.get_records(ShardIterator=cursor["ShardIterator"], Limit=10000)
#
# {'Records': [{'PartitionKey': '1', 'ApproximateArrivalTimestamp': datetime.datetime(2017, 6, 21, 23, 24, 30, 661000, tzinfo=tzlocal()), 'SequenceNumber': '49574391176947112863656757427907696591655425940803028786', 'Data': b'{"data": "something happened", "sensorId": "1"}'}], 'MillisBehindLatest': 0, 'NextShardIterator': 'AAAAAAAAAAHq8axv9lHphjtFpGis3xnexFn73/whruqLDnFR55xjUXleyM4Ci/i0hSld6ISzMb51xUoYkV2EHsjQCBtfNn74Gc25yrIVLZ5kH4VLxf64Q4gCgK3ABJxBTuXThv+UVA+iklVjObroqLE3tUMBke2bpciQp+fnzn4SLs9dARPsZHKqDtR0CZxddAT4HoMQhhUx1clOR81jbaWabyzBY6bG', 'ResponseMetadata': {'HTTPHeaders': {'content-length': '501', 'x-amzn-requestid': 'c87d168f-8704-d00c-99b8-10d42eb35d3e', 'content-type': 'application/x-amz-json-1.1', 'server': 'Apache-Coyote/1.1', 'date': 'Wed, 21 Jun 2017 23:41:15 GMT', 'x-amz-id-2': 'Gmy8eZ4nXfu6tb0WkVvY20AlXnev7L8OIPRvTZDF07EFGlYYpHzWvxvOXRDnIUkX1vx9+QYaUXi6SdoUx7QEUsr7N90RVk7G'}, 'RetryAttempts': 0, 'RequestId': 'c87d168f-8704-d00c-99b8-10d42eb35d3e', 'HTTPStatusCode': 200}}
#

http://www.nltk.org/

pip3 install -U nltk
pip3 install -U numpy

Tokenize the query (below, query = "where is my order i bought last saturday"):

$ python3
Python 3.6.2 (default, Jul 17 2017, 16:44:45) 
[GCC 4.2.1 Compatible Apple LLVM 8.1.0 (clang-802.0.42)] on darwin
Type "help", "copyright", "credits" or "license" for more information.
>>> import nltk
>>> nltk.download('punkt')
[nltk_data] Downloading package punkt to /Users/a1353612/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
True

>>> query_tokens = nltk.word_tokenize(query)
>>> query_tokens
['where', 'is', 'my', 'order', 'i', 'bought', 'last', 'saturday']

Part-of-speech (POS) tagging, using the averaged-perceptron tagger: https://www.kaggle.com/nltkdata/averaged-perceptron-tagger

>>> nltk.download('averaged_perceptron_tagger')
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/a1353612/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
True

>>> perc_tagged = nltk.pos_tag(query_tokens)
>>> perc_tagged[0:6]
[('where', 'WRB'), ('is', 'VBZ'), ('my', 'PRP$'), ('order', 'NN'), ('i', 'NN'), ('bought', 'VBD')]
nltk.download('maxent_ne_chunker')
nltk.download('words')

>>> entities = nltk.chunk.ne_chunk(perc_tagged)
>>> entities
Tree('S', [('where', 'WRB'), ('is', 'VBZ'), ('my', 'PRP$'), ('order', 'NN'), ('i', 'NN'), ('bought', 'VBD'), ('last', 'JJ'), ('saturday', 'NN')])
nltk.download('treebank')

>>> from nltk.corpus import treebank
>>> t = treebank.parsed_sents('wsj_0001.mrg')[0]
>>> t.draw
<bound method Tree.draw of Tree('S', [Tree('NP-SBJ', [Tree('NP', [Tree('NNP', ['Pierre']), Tree('NNP', ['Vinken'])]), Tree(',', [',']), Tree('ADJP', [Tree('NP', [Tree('CD', ['61']), Tree('NNS', ['years'])]), Tree('JJ', ['old'])]), Tree(',', [','])]), Tree('VP', [Tree('MD', ['will']), Tree('VP', [Tree('VB', ['join']), Tree('NP', [Tree('DT', ['the']), Tree('NN', ['board'])]), Tree('PP-CLR', [Tree('IN', ['as']), Tree('NP', [Tree('DT', ['a']), Tree('JJ', ['nonexecutive']), Tree('NN', ['director'])])]), Tree('NP-TMP', [Tree('NNP', ['Nov.']), Tree('CD', ['29'])])])]), Tree('.', ['.'])])>
@prayagupa
Copy link
Author

261578874264819908609102035485573088411 / 150

2 to the power 128

@prayagupa
Copy link
Author

prayagupa commented Jun 22, 2017

// python 3.6

STEP 1 - create s3 bucket in us-west-2 region
STEP 2 - create lambda processor on above bucket

import json
import urllib.parse
import boto3

print('[INFO] Loading function')

# Module-level client: created once at cold start and reused across warm invocations.
s3 = boto3.client('s3')


def lambda_handler(event, context):
    """Handle an S3 event notification: fetch the referenced object and return its Content-Type.

    event: S3 event notification dict (uses Records[0].s3.bucket.name and
           Records[0].s3.object.key).
    context: Lambda context object (unused).
    Returns the object's Content-Type string; re-raises any error from S3.
    """
    print("[INFO] Received event: " + json.dumps(event, indent=2))

    # Get the object from the event and show its content type.
    # S3 event keys are URL-encoded (e.g. spaces become '+'), hence unquote_plus.
    bucket = event['Records'][0]['s3']['bucket']['name']
    key = urllib.parse.unquote_plus(event['Records'][0]['s3']['object']['key'], encoding='utf-8')
    try:
        response = s3.get_object(Bucket=bucket, Key=key)
        print("[INFO] CONTENT TYPE: " + response['ContentType'])
        return response['ContentType']
    except Exception as e:
        print(e)
        print('[ERROR] Error getting object {} from bucket {}. Make sure they exist and your bucket is in the same region as this function.'.format(key, bucket))
        # Bare raise re-raises with the original traceback intact
        # ('raise e' would append this handler frame to the chain).
        raise

STEP 3 - emit some documents to s3 bucket which will trigger the event processor
STEP 4 - see cloudwatch logs

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment