Skip to content

Instantly share code, notes, and snippets.

@andheiberg
Created July 16, 2015 18:59
Show Gist options
  • Save andheiberg/ab5958c077895131acfd to your computer and use it in GitHub Desktop.
Save andheiberg/ab5958c077895131acfd to your computer and use it in GitHub Desktop.
Elasticsearch Tutorial
import requests
from elasticsearch import Elasticsearch
es = Elasticsearch()

# Request the top 100 IAMA Reddit posts of all time.  A custom User-Agent
# header is sent, and a timeout keeps the script from hanging forever if the
# server never answers.
response = requests.get(
    "http://api.reddit.com/r/iama/top/?t=all&limit=100",
    headers={"User-Agent": "TrackMaven"},
    timeout=30,
)
# Fail fast on an HTTP error status instead of running into a confusing
# KeyError/ValueError while decoding the body below.
response.raise_for_status()

# The subset of each post's data that is copied into Elasticsearch.
fields = ['title', 'selftext', 'author', 'score',
          'ups', 'downs', 'num_comments', 'url', 'created']

# Loop through results and add each data dictionary to the ES "reddit" index.
# The position of the post in the listing is used as the document id.
for i, iama in enumerate(response.json()['data']['children']):
    content = iama['data']
    doc = {field: content[field] for field in fields}
    es.index(index="reddit", doc_type='iama', id=i, body=doc)
import csv
from elasticsearch import Elasticsearch
# Field templates shared by the "trip" mapping below.  Each use is copied with
# dict() so no two fields end up sharing the same dictionary object.
_date_field = {"type": "date", "format": "MM/dd/yyyy HH:mm"}
_station_field = {"type": "string", "index": "not_analyzed"}
_string_field = {"type": "string"}

# Mapping for the new "trip" doc_type: one entry per field of a trip document.
mapping = {
    "trip": {
        "properties": {
            "duration": {"type": "integer"},
            "start_date": dict(_date_field),
            "start_station": dict(_station_field),
            "end_date": dict(_date_field),
            "end_station": dict(_station_field),
            "bike_id": dict(_string_field),
            "subscriber_type": dict(_string_field),
        },
    },
}
# Create a new "bikeshare" index that includes "trips" with the above mapping.
# The index is only created when it does not exist yet, so the script can be
# run again without the create call being rejected for an existing index.
es = Elasticsearch()
if not es.indices.exists(index="bikeshare"):
    es.indices.create(index="bikeshare")
es.indices.put_mapping(index="bikeshare", doc_type="trip", body=mapping)
def _duration_to_seconds(text):
    """Convert a trip duration string into a whole number of seconds.

    The text is split on whitespace into exactly three tokens (hours,
    minutes, seconds).  A 1-character suffix is dropped from the first token
    and a 4-character suffix from the other two before each is converted
    with ``int``.

    NOTE(review): the slice widths fit values such as "0h 7min. 55sec." --
    confirm against the actual trips.csv export.

    Raises:
        ValueError: if the text does not split into three tokens, or a
            token is not an integer once its suffix is removed.
    """
    hours, minutes, seconds = text.split()
    return (int(hours[:-1]) * 60 * 60
            + int(minutes[:-4]) * 60
            + int(seconds[:-4]))


# Import a CSV file of trip data - this will take quite a while!
# NOTE: 'rb' is the mode the Python 2 csv module expects; under Python 3 the
# file has to be opened in text mode with newline='' instead.
with open('trips.csv', 'rb') as csvfile:
    reader = csv.reader(csvfile)
    # Skip header row.  The built-in next() with a default works on Python 2
    # and 3 and does not raise StopIteration when the file is empty.
    next(reader, None)
    # The zero-based position of the row (after the header) is the document
    # id; "trip_id" avoids shadowing the built-in id().
    for trip_id, row in enumerate(reader):
        content = {
            "duration": _duration_to_seconds(row[0]),
            "start_date": row[1],
            "end_date": row[2],
            "start_station": row[3],
            "end_station": row[4],
            "bike_id": row[5],
            "subscriber_type": row[6],
        }
        es.index(index="bikeshare", doc_type='trip', id=trip_id, body=content)
from elasticsearch import Elasticsearch
es = Elasticsearch()

# NOTE: every print below passes a single argument in parentheses, which
# behaves the same as a Python 2 print statement and as the Python 3 function.

# Fetch a specific result
res = es.get(index='reddit', doc_type='iama', id=1)
print(res['_source'])

# Update the index to be able to query against it
es.indices.refresh(index="reddit")

# Query for results: nothing will match this author
res = es.search(index="reddit",
                body={"query": {"match": {"author": "no results here!"}}})
print(res)

# Query for all results (no matching criteria)
res = es.search(index="reddit", body={"query": {"match_all": {}}})
print(res['hits']['total'])
hits = res['hits']['hits']
# Only show the second hit when the search actually returned two or more.
if len(hits) > 1:
    print(hits[1]['_source']['title'])

# Query based on text appearing in the title
# (by default matches across capitalization, pluralization, etc)
res = es.search(index="reddit", body={"query": {"match": {"title": "obama"}}})
print(res['hits']['total'])
hits = res['hits']['hits']
# Guard against an empty result list so a miss does not raise IndexError.
if hits:
    print(hits[0]['_source']['title'])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment