Skip to content

Instantly share code, notes, and snippets.

@nanvel
Created October 31, 2014 14:37
Show Gist options
  • Save nanvel/4f7696174ac3a9b3554c to your computer and use it in GitHub Desktop.
Save nanvel/4f7696174ac3a9b3554c to your computer and use it in GitHub Desktop.
Spike project with Amazon CloudSearch
"""
Search bebop series.
"""
import arrow
import json
from tornado import options
from tornado.httpclient import HTTPError, HTTPClient, HTTPRequest
from tornado_botocore import Botocore
from tvs import TVS
DOMAIN_NAME = 'test-bebop-domain'
API_VERSION = '2013-01-01'
# Every CloudSearch configuration call in this script targets this region.
REGION_NAME = 'us-west-2'

# Index fields sent, one request each, to the DefineIndexField operation:
# - title   - text, returned in search results
# - content - text, searchable but not returned
# - airdate - int (unix timestamp)
# - genre   - literal-array, searchable, facet enabled
INDEX_FIELDS = [{
    'DomainName': DOMAIN_NAME,
    'IndexField': {
        'IndexFieldName': 'title',
        'IndexFieldType': 'text',
        'TextOptions': {
            'HighlightEnabled': False,
            'DefaultValue': 'untitled',
            'ReturnEnabled': True,
        },
    },
}, {
    'DomainName': DOMAIN_NAME,
    'IndexField': {
        'IndexFieldName': 'content',
        'IndexFieldType': 'text',
        'TextOptions': {
            'HighlightEnabled': False,
            'DefaultValue': '',
            'ReturnEnabled': False,
        },
    },
}, {
    'DomainName': DOMAIN_NAME,
    'IndexField': {
        'IndexFieldName': 'airdate',
        'IndexFieldType': 'int',
        'IntOptions': {
            # 946684800 == 2000-01-01T00:00:00Z
            'DefaultValue': 946684800,
        },
    },
}, {
    'DomainName': DOMAIN_NAME,
    'IndexField': {
        'IndexFieldName': 'genre',
        'IndexFieldType': 'literal-array',
        'LiteralArrayOptions': {
            'DefaultValue': '',
            'FacetEnabled': True,
            'ReturnEnabled': False,
            'SearchEnabled': True,
        },
    },
}]


def service_url(endpoint, resource):
    """Return the versioned HTTP URL of ``resource`` on a domain endpoint.

    ``endpoint`` is a search or document endpoint host name and ``resource``
    the path (plus optional query string) below the API version segment,
    e.g. ``'documents/batch'`` or ``'search?q=bebop'``.
    """
    return 'http://{endpoint}/{api_version}/{resource}'.format(
        endpoint=endpoint, api_version=API_VERSION, resource=resource)


def extract_endpoints(response):
    """Return ``(search_endpoint, document_endpoint)`` from DescribeDomains.

    Either element is ``None`` when the response lists no domain or the
    domain has no endpoint assigned yet (a freshly created domain reports
    empty ``SearchService`` / ``DocService`` objects).

    Sample response:
    {
      "DomainStatusList":[
        {
          "DomainId":"240020657974/test-bebop-domain",
          "Created":true,
          "SearchService":{
            "Endpoint":"search-test-bebop-domain-kmvxd5zzot4opij6zvb6okvrma.us-west-2.cloudsearch.amazonaws.com"
          },
          "SearchInstanceCount":1,
          "DomainName":"test-bebop-domain",
          "DocService":{
            "Endpoint":"doc-test-bebop-domain-kmvxd5zzot4opij6zvb6okvrma.us-west-2.cloudsearch.amazonaws.com"
          },
          "SearchInstanceType":"search.m1.small",
          "Deleted":false,
          "Processing":false,
          "RequiresIndexDocuments":true,
          "ARN":"arn:aws:cloudsearch:us-west-2:240020657974:domain/test-bebop-domain",
          "SearchPartitionCount":1
        }
      ],
      "ResponseMetadata":{
        "RequestId":"7993ac9b-6101-11e4-8510-8ffcccb94f21"
      }
    }
    """
    domains = response.get('DomainStatusList') or []
    if not domains:
        return None, None
    status = domains[0]
    search_endpoint = (status.get('SearchService') or {}).get('Endpoint')
    document_endpoint = (status.get('DocService') or {}).get('Endpoint')
    return search_endpoint, document_endpoint


def error_text(error):
    """Return the response body of an HTTP error, or its message.

    The error may carry no response at all (``response`` is ``None``), in
    which case the string form of the exception is returned instead of
    failing with an ``AttributeError`` inside the error handler.
    """
    response = getattr(error, 'response', None)
    if response is not None:
        return response.body
    return str(error)


def make_client(operation, session=None):
    """Build a tornado_botocore client for one CloudSearch operation.

    ``session`` is forwarded only when given, so the first client creates
    its own session exactly as a call without the argument would.
    """
    kwargs = {
        'service': 'cloudsearch',
        'operation': operation,
        'region_name': REGION_NAME,
    }
    if session is not None:
        kwargs['session'] = session
    return Botocore(**kwargs)


def build_batch(tvs):
    """Convert episode dicts into CloudSearch ``add`` document operations.

    Each episode must provide ``number``, ``title``, ``content``,
    ``airdate`` (either ``YYYY-MM-DD`` or ``MMMM D, YYYY``) and ``genre``.
    """
    # NOTE(review): `.timestamp` is read as an attribute here; newer arrow
    # releases reportedly expose it as a method - confirm against the
    # installed arrow version.
    return [{
        'type': 'add',
        'id': tv['number'],
        'fields': {
            'title': tv['title'],
            'content': tv['content'],
            'airdate': arrow.get(
                tv['airdate'], ['YYYY-MM-DD', 'MMMM D, YYYY']).timestamp,
            'genre': tv['genre'],
        },
    } for tv in tvs]


def signed_request(signer_client, url, method='GET', body=None):
    """Return a JSON ``HTTPRequest`` signed with the client's credentials."""
    request = HTTPRequest(
        url=url, body=body,
        headers={'Content-Type': 'application/json'}, method=method)
    # Presumably the botocore signer reads `request.params`, which tornado's
    # HTTPRequest does not define - verify against the botocore version used.
    request.params = None
    signer_client.endpoint.auth.add_auth(request=request)
    return request


def setup_domain(cs_create_domain, session):
    """Create the search domain and define its index fields.

    The domain is reused if it already exists. Errors are printed and the
    remaining steps still run.

    Sample CreateDomain response:
    {
      "DomainStatus":{
        "DomainId":"240020657974/test-bebop-domain",
        "Created":true,
        "SearchService":{},
        "SearchInstanceCount":0,
        "DomainName":"test-bebop-domain",
        "DocService":{},
        "Deleted":false,
        "Processing":false,
        "RequiresIndexDocuments":false,
        "ARN":"arn:aws:cloudsearch:us-west-2:240020657974:domain/test-bebop-domain",
        "SearchPartitionCount":0
      },
      "ResponseMetadata":{
        "RequestId":"38b0cba7-60f2-11e4-980e-6d6976ea3108"
      }
    }
    """
    try:
        print(cs_create_domain.call(domain_name=DOMAIN_NAME))
    except HTTPError as e:
        print(error_text(e))
    # configure fields
    cs_define_index_field = make_client('DefineIndexField', session)
    try:
        for params in INDEX_FIELDS:
            print(cs_define_index_field.call(**params))
    except HTTPError as e:
        print(error_text(e))


def upload_documents(session, signer_client, httpclient, document_endpoint,
                     batch):
    """Request a reindex, then POST ``batch`` to the document endpoint."""
    # reindex
    cs_index_documents = make_client('IndexDocuments', session)
    print(cs_index_documents.call(domain_name=DOMAIN_NAME))
    # add documents
    url = service_url(document_endpoint, 'documents/batch')
    try:
        request = signed_request(
            signer_client, url, method='POST', body=json.dumps(batch))
        response = httpclient.fetch(request=request)
        print(response.body)
    except HTTPError as e:
        print(error_text(e))


def main():
    """Run the spike: optional setup/upload steps, then search for 'bebop'.

    By default only the search runs. ``--setup_domain`` additionally creates
    the domain and its index fields; ``--upload_documents`` additionally
    reindexes and uploads the TVS episodes before searching.
    """
    options.define(
        'setup_domain', default=False, type=bool,
        help='create the CloudSearch domain and define its index fields')
    options.define(
        'upload_documents', default=False, type=bool,
        help='reindex the domain and upload the TVS documents')
    options.parse_command_line()

    # The CreateDomain client is built unconditionally because its session
    # is shared by every other client.
    cs_create_domain = make_client('CreateDomain')
    session = cs_create_domain.session
    if options.options.setup_domain:
        setup_domain(cs_create_domain, session)

    # get document and search endpoints
    cs_describe_domains = make_client('DescribeDomains', session)
    response = cs_describe_domains.call(domain_names=[DOMAIN_NAME])
    search_endpoint, document_endpoint = extract_endpoints(response)
    if not (search_endpoint and document_endpoint):
        raise SystemExit(
            'Domain {name!r} has no search/document endpoints yet: it does '
            'not exist or is still being provisioned.'.format(
                name=DOMAIN_NAME))

    httpclient = HTTPClient()
    try:
        if options.options.upload_documents:
            upload_documents(
                session, cs_describe_domains, httpclient, document_endpoint,
                build_batch(TVS))

        # search
        url = service_url(search_endpoint, 'search?q=bebop')
        try:
            response = httpclient.fetch(
                request=signed_request(cs_describe_domains, url))
        except HTTPError as e:
            # Show the service's error payload, then fail as before.
            print(error_text(e))
            raise
        print(response.body)
        # {
        #   "status":{
        #     "rid":"st/UtJYpAAoghec=",
        #     "time-ms":82
        #   },
        #   "hits":{
        #     "found":12,
        #     "start":0,
        #     "hit":[
        #       {
        #         "id":"3",
        #         "fields":{
        #           "airdate":"910396800",
        #           "title":"Honky Tonk Women"
        #         }
        #       },
        #       {
        #         "id":"18",
        #         "fields":{
        #           "airdate":"920073600",
        #           "title":"Speak Like a Child"
        #         }
        #       },
        #       ...
        #     ]
        #   }
        # }
    finally:
        # Release the blocking client's resources even when a step fails.
        httpclient.close()


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment