Skip to content

Instantly share code, notes, and snippets.

@prayagupa
Last active November 19, 2017 04:12
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save prayagupa/2cbda001d0ca132b9ad63ccb540aaa27 to your computer and use it in GitHub Desktop.
Save prayagupa/2cbda001d0ca132b9ad63ccb540aaa27 to your computer and use it in GitHub Desktop.
python
# http://boto3.readthedocs.io/en/latest/reference/services/kinesis.html
import boto3
import json

client = boto3.client('kinesis')

# Create a stream with 150 shards, then list streams to confirm.
create_stream = client.create_stream(StreamName='GregorSamsa', ShardCount=150)
streams = client.list_streams(Limit=100)


class SensorEvent(object):
    """A simple event carrying a sensor id (used as the partition key) and a payload."""
    def __init__(self, sensorId, data):
        self.sensorId = sensorId
        self.data = data


# Produce: each record is routed to a shard by the MD5 hash of its PartitionKey.
for sensorEvent in [SensorEvent("1", "something happened"), SensorEvent("2", "something again happened")]:
    client.put_record(StreamName='GregorSamsa', Data=json.dumps(sensorEvent.__dict__), PartitionKey=sensorEvent.sensorId)

# Shard routing example: toInt(MD5 of "1") = 261578874264819908609102035485573088411
#
# Sample put_record responses:
#{'SequenceNumber': '49574391176947112863656757427907696591655425940803028786', 'ResponseMetadata': {'HTTPHeaders': {'content-length': '110', 'x-amzn-requestid': 'fd00a7f5-39e1-20f6-acc5-a59bd6ca5be1', 'content-type': 'application/x-amz-json-1.1', 'server': 'Apache-Coyote/1.1', 'date': 'Wed, 21 Jun 2017 23:24:30 GMT', 'x-amz-id-2': 'JYuozovloYQiCyXDxAhmTTY29AHhGIlqiC2zGsj/svtCck2nDe/HS785SY0vdr4OuvGvL4dTwxCvRfMLuv9Wa0YfdJ2n4NUZ'}, 'RetryAttempts': 0, 'RequestId': 'fd00a7f5-39e1-20f6-acc5-a59bd6ca5be1', 'HTTPStatusCode': 200}, 'ShardId': 'shardId-000000000115'}
#{'SequenceNumber': '49574391176991714354053818674191976954020337292989695826', 'ResponseMetadata': {'HTTPHeaders': {'content-length': '110', 'x-amzn-requestid': 'e376b52e-772f-e5d3-b2b3-b74098049ec4', 'content-type': 'application/x-amz-json-1.1', 'server': 'Apache-Coyote/1.1', 'date': 'Wed, 21 Jun 2017 23:24:30 GMT', 'x-amz-id-2': 'C75YgF1cKYIgQc1lSYZvmsdDvPhc+71qDAacRjFGbAtMpTLdZaENtK9lGew0pc09ZiSkfRG5raOHXLNTItyfSBPomKCc/Vnp'}, 'RetryAttempts': 0, 'RequestId': 'e376b52e-772f-e5d3-b2b3-b74098049ec4', 'HTTPStatusCode': 200}, 'ShardId': 'shardId-000000000117'}

# Consume: read from the oldest available record (TRIM_HORIZON) of one shard.
cursor = client.get_shard_iterator(StreamName='GregorSamsa', ShardId='shardId-00000', ShardIteratorType='TRIM_HORIZON')
client.get_records(ShardIterator=cursor["ShardIterator"], Limit=10000)
#
# {'Records': [{'PartitionKey': '1', 'ApproximateArrivalTimestamp': datetime.datetime(2017, 6, 21, 23, 24, 30, 661000, tzinfo=tzlocal()), 'SequenceNumber': '49574391176947112863656757427907696591655425940803028786', 'Data': b'{"data": "something happened", "sensorId": "1"}'}], 'MillisBehindLatest': 0, 'NextShardIterator': 'AAAAAAAAAAHq8axv9lHphjtFpGis3xnexFn73/whruqLDnFR55xjUXleyM4Ci/i0hSld6ISzMb51xUoYkV2EHsjQCBtfNn74Gc25yrIVLZ5kH4VLxf64Q4gCgK3ABJxBTuXThv+UVA+iklVjObroqLE3tUMBke2bpciQp+fnzn4SLs9dARPsZHKqDtR0CZxddAT4HoMQhhUx1clOR81jbaWabyzBY6bG', 'ResponseMetadata': {'HTTPHeaders': {'content-length': '501', 'x-amzn-requestid': 'c87d168f-8704-d00c-99b8-10d42eb35d3e', 'content-type': 'application/x-amz-json-1.1', 'server': 'Apache-Coyote/1.1', 'date': 'Wed, 21 Jun 2017 23:41:15 GMT', 'x-amz-id-2': 'Gmy8eZ4nXfu6tb0WkVvY20AlXnev7L8OIPRvTZDF07EFGlYYpHzWvxvOXRDnIUkX1vx9+QYaUXi6SdoUx7QEUsr7N90RVk7G'}, 'RetryAttempts': 0, 'RequestId': 'c87d168f-8704-d00c-99b8-10d42eb35d3e', 'HTTPStatusCode': 200}}
#

http://www.nltk.org/

pip3 install -U nltk
pip3 install -U numpy

Tokenize the query (below, query = "where is my order i bought last saturday"):

$ python3
Python 3.6.2 (default, Jul 17 2017, 16:44:45) 
[GCC 4.2.1 Compatible Apple LLVM 8.1.0 (clang-802.0.42)] on darwin
Type "help", "copyright", "credits" or "license" for more information.
>>> import nltk
>>> nltk.download('punkt')
[nltk_data] Downloading package punkt to /Users/a1353612/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
True

>>> query_tokens = nltk.word_tokenize(query)
>>> query_tokens
['where', 'is', 'my', 'order', 'i', 'bought', 'last', 'saturday']

Part-of-speech (POS) tagging, using the averaged-perceptron tagger: https://www.kaggle.com/nltkdata/averaged-perceptron-tagger

>>> nltk.download('averaged_perceptron_tagger')
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/a1353612/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
True

>>> perc_tagged = nltk.pos_tag(query_tokens)
>>> perc_tagged[0:6]
[('where', 'WRB'), ('is', 'VBZ'), ('my', 'PRP$'), ('order', 'NN'), ('i', 'NN'), ('bought', 'VBD')]
nltk.download('maxent_ne_chunker')
nltk.download('words')

>>> entities = nltk.chunk.ne_chunk(perc_tagged)
>>> entities
Tree('S', [('where', 'WRB'), ('is', 'VBZ'), ('my', 'PRP$'), ('order', 'NN'), ('i', 'NN'), ('bought', 'VBD'), ('last', 'JJ'), ('saturday', 'NN')])
nltk.download('treebank')

>>> from nltk.corpus import treebank
>>> t = treebank.parsed_sents('wsj_0001.mrg')[0]
>>> t.draw
<bound method Tree.draw of Tree('S', [Tree('NP-SBJ', [Tree('NP', [Tree('NNP', ['Pierre']), Tree('NNP', ['Vinken'])]), Tree(',', [',']), Tree('ADJP', [Tree('NP', [Tree('CD', ['61']), Tree('NNS', ['years'])]), Tree('JJ', ['old'])]), Tree(',', [','])]), Tree('VP', [Tree('MD', ['will']), Tree('VP', [Tree('VB', ['join']), Tree('NP', [Tree('DT', ['the']), Tree('NN', ['board'])]), Tree('PP-CLR', [Tree('IN', ['as']), Tree('NP', [Tree('DT', ['a']), Tree('JJ', ['nonexecutive']), Tree('NN', ['director'])])]), Tree('NP-TMP', [Tree('NNP', ['Nov.']), Tree('CD', ['29'])])])]), Tree('.', ['.'])])>
@prayagupa
Copy link
Author

261578874264819908609102035485573088411 / 150

2 to the power 128

@prayagupa
Copy link
Author

prayagupa commented Jun 22, 2017

// python 3.6

STEP 1 - create s3 bucket in us-west-2 region
STEP 2 - create lambda processor on above bucket

import json
import urllib.parse
import boto3

print('[INFO] Loading function')

# Module-level client: created once at cold start and reused across warm invocations.
s3 = boto3.client('s3')


def lambda_handler(event, context):
    """Handle an S3 event notification: fetch the referenced object and return its Content-Type.

    event: S3 event notification dict (uses Records[0].s3.bucket.name and
           Records[0].s3.object.key).
    context: Lambda context object (unused).
    Returns the object's Content-Type string; re-raises any error from S3.
    """
    print("[INFO] Received event: " + json.dumps(event, indent=2))

    # Get the object from the event and show its content type.
    # S3 event keys are URL-encoded (e.g. spaces become '+'), hence unquote_plus.
    bucket = event['Records'][0]['s3']['bucket']['name']
    key = urllib.parse.unquote_plus(event['Records'][0]['s3']['object']['key'], encoding='utf-8')
    try:
        response = s3.get_object(Bucket=bucket, Key=key)
        print("[INFO] CONTENT TYPE: " + response['ContentType'])
        return response['ContentType']
    except Exception as e:
        print(e)
        print('[ERROR] Error getting object {} from bucket {}. Make sure they exist and your bucket is in the same region as this function.'.format(key, bucket))
        # Bare raise re-raises with the original traceback intact
        # ('raise e' would append this handler frame to the chain).
        raise

STEP 3 - emit some documents to s3 bucket which will trigger the event processor
STEP 4 - see cloudwatch logs

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment