andheiberg/import.py

## import.py
import requests
from elasticsearch import Elasticsearch

es = Elasticsearch()

# Request the top 100 IAMA Reddit posts of all time, identifying this client
# with an explicit User-Agent header.
response = requests.get("http://api.reddit.com/r/iama/top/?t=all&limit=100",
                        headers={"User-Agent": "TrackMaven"})

# Fail fast with a clear HTTP error instead of an opaque KeyError/ValueError
# further down when the reply is not the expected JSON listing.
response.raise_for_status()

# Only these keys of each post are copied into the indexed document.
fields = ['title', 'selftext', 'author', 'score',
          'ups', 'downs', 'num_comments', 'url', 'created']

# Add each post to the ES "reddit" index as doc_type "iama"; the post's
# position in the listing is used as the document id.
for i, iama in enumerate(response.json()['data']['children']):
    content = iama['data']
    # A KeyError here means the post lacks one of the expected fields.
    doc = {field: content[field] for field in fields}
    es.index(index="reddit", doc_type='iama', id=i, body=doc)

## import_trips.py
import csv
from elasticsearch import Elasticsearch

# Map the fields of a new "trip" doc_type. Dates are parsed with the
# "MM/dd/yyyy HH:mm" format; station names are declared "not_analyzed".
mapping = {
    "trip": {
        "properties": {
            "duration": {"type": "integer"},
            "start_date": {"type": "date", "format": "MM/dd/yyyy HH:mm"},
            "start_station": {"type": "string", "index": "not_analyzed"},
            "end_date": {"type": "date", "format": "MM/dd/yyyy HH:mm"},
            "end_station": {"type": "string", "index": "not_analyzed"},
            "bike_id": {"type": "string"},
            "subscriber_type": {"type": "string"}
        }
    }
}


def _duration_to_seconds(text):
    """Convert a three-part duration string to a whole number of seconds.

    ``text`` must split on whitespace into exactly three parts: hours
    followed by a one-character suffix, then minutes and seconds each
    followed by a four-character suffix (presumably something like
    ``"0h 12min. 34sec."`` -- confirm against trips.csv).

    Raises ValueError if there are not exactly three parts, or if a part is
    not an integer once its suffix has been stripped.
    """
    hours, minutes, seconds = text.split()
    return (int(hours[:-1]) * 60 * 60
            + int(minutes[:-4]) * 60
            + int(seconds[:-4]))


# Create a new "bikeshare" index that includes "trips" with the above mapping
es = Elasticsearch()
es.indices.create("bikeshare")
es.indices.put_mapping(index="bikeshare", doc_type="trip", body=mapping)

# Import a CSV file of trip data - every row is sent as its own index
# request, so this will take quite a while!
with open('trips.csv', 'rb') as csvfile:
    reader = csv.reader(csvfile)
    next(reader)  # Skip header row
    # The zero-based row number (header excluded) doubles as the document id.
    for trip_id, row in enumerate(reader):
        content = {
            "duration": _duration_to_seconds(row[0]),
            "start_date": row[1],
            "end_date": row[2],
            "start_station": row[3],
            "end_station": row[4],
            "bike_id": row[5],
            "subscriber_type": row[6]
        }
        es.index(index="bikeshare", doc_type='trip', id=trip_id, body=content)

## queries.py
from elasticsearch import Elasticsearch

es = Elasticsearch()

# Fetch a specific document directly by its id
res = es.get(index='reddit', doc_type='iama', id=1)
print(res['_source'])

# Refresh the index before running the searches below
es.indices.refresh(index="reddit")

# Query for results: nothing will match this author
res = es.search(index="reddit",
                body={"query": {"match": {"author": "no results here!"}}})
print(res)

# Query for all results (no matching criteria)
res = es.search(index="reddit", body={"query": {"match_all": {}}})
print(res['hits']['total'])
hits = res['hits']['hits']
# The second returned hit is shown; skip it instead of raising IndexError
# when fewer than two documents came back.
if len(hits) > 1:
    print(hits[1]['_source']['title'])

# Query based on text appearing in the title
# (the match query analyzes the text, so e.g. capitalization does not matter)
res = es.search(index="reddit", body={"query": {"match": {"title": "obama"}}})
print(res['hits']['total'])
hits = res['hits']['hits']
# Only show the best hit when the query matched anything at all.
if hits:
    print(hits[0]['_source']['title'])
	import requests
	from elasticsearch import Elasticsearch

	es = Elasticsearch()

	# Request the top 100 IAMA Reddit posts of all time, identifying this
	# client with an explicit User-Agent header.
	response = requests.get("http://api.reddit.com/r/iama/top/?t=all&limit=100",
	                        headers={"User-Agent": "TrackMaven"})

	# Fail fast with a clear HTTP error instead of an opaque
	# KeyError/ValueError further down when the reply is not the expected
	# JSON listing.
	response.raise_for_status()

	# Only these keys of each post are copied into the indexed document.
	fields = ['title', 'selftext', 'author', 'score',
	          'ups', 'downs', 'num_comments', 'url', 'created']

	# Add each post to the ES "reddit" index as doc_type "iama"; the post's
	# position in the listing is used as the document id.
	for i, iama in enumerate(response.json()['data']['children']):
	    content = iama['data']
	    # A KeyError here means the post lacks one of the expected fields.
	    doc = {field: content[field] for field in fields}
	    es.index(index="reddit", doc_type='iama', id=i, body=doc)
	import csv
	from elasticsearch import Elasticsearch

	# Map the fields of a new "trip" doc_type. Dates are parsed with the
	# "MM/dd/yyyy HH:mm" format; station names are declared "not_analyzed".
	mapping = {
	    "trip": {
	        "properties": {
	            "duration": {"type": "integer"},
	            "start_date": {"type": "date", "format": "MM/dd/yyyy HH:mm"},
	            "start_station": {"type": "string", "index": "not_analyzed"},
	            "end_date": {"type": "date", "format": "MM/dd/yyyy HH:mm"},
	            "end_station": {"type": "string", "index": "not_analyzed"},
	            "bike_id": {"type": "string"},
	            "subscriber_type": {"type": "string"}
	        }
	    }
	}


	def _duration_to_seconds(text):
	    """Convert a three-part duration string to a whole number of seconds.

	    ``text`` must split on whitespace into exactly three parts: hours
	    followed by a one-character suffix, then minutes and seconds each
	    followed by a four-character suffix (presumably something like
	    ``"0h 12min. 34sec."`` -- confirm against trips.csv).

	    Raises ValueError if there are not exactly three parts, or if a part
	    is not an integer once its suffix has been stripped.
	    """
	    hours, minutes, seconds = text.split()
	    # Hours are scaled by 60*60; the multiplication operators had been
	    # lost from this copy of the script.
	    return (int(hours[:-1]) * 60 * 60
	            + int(minutes[:-4]) * 60
	            + int(seconds[:-4]))


	# Create a new "bikeshare" index that includes "trips" with the above mapping
	es = Elasticsearch()
	es.indices.create("bikeshare")
	es.indices.put_mapping(index="bikeshare", doc_type="trip", body=mapping)

	# Import a CSV file of trip data - every row is sent as its own index
	# request, so this will take quite a while!
	with open('trips.csv', 'rb') as csvfile:
	    reader = csv.reader(csvfile)
	    next(reader)  # Skip header row
	    # The zero-based row number (header excluded) doubles as the document id.
	    for trip_id, row in enumerate(reader):
	        content = {
	            "duration": _duration_to_seconds(row[0]),
	            "start_date": row[1],
	            "end_date": row[2],
	            "start_station": row[3],
	            "end_station": row[4],
	            "bike_id": row[5],
	            "subscriber_type": row[6]
	        }
	        es.index(index="bikeshare", doc_type='trip', id=trip_id, body=content)
	from elasticsearch import Elasticsearch

	es = Elasticsearch()

	# Fetch a specific document directly by its id
	res = es.get(index='reddit', doc_type='iama', id=1)
	print(res['_source'])

	# Refresh the index before running the searches below
	es.indices.refresh(index="reddit")

	# Query for results: nothing will match this author
	res = es.search(index="reddit",
	                body={"query": {"match": {"author": "no results here!"}}})
	print(res)

	# Query for all results (no matching criteria)
	res = es.search(index="reddit", body={"query": {"match_all": {}}})
	print(res['hits']['total'])
	hits = res['hits']['hits']
	# The second returned hit is shown; skip it instead of raising IndexError
	# when fewer than two documents came back.
	if len(hits) > 1:
	    print(hits[1]['_source']['title'])

	# Query based on text appearing in the title
	# (the match query analyzes the text, so e.g. capitalization does not matter)
	res = es.search(index="reddit", body={"query": {"match": {"title": "obama"}}})
	print(res['hits']['total'])
	hits = res['hits']['hits']
	# Only show the best hit when the query matched anything at all.
	if hits:
	    print(hits[0]['_source']['title'])