Skip to content

Instantly share code, notes, and snippets.

@mingrammer
Last active January 23, 2018 02:55
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mingrammer/555c070c7cf670d7c32b6850e6088865 to your computer and use it in GitHub Desktop.
Save mingrammer/555c070c7cf670d7c32b6850e6088865 to your computer and use it in GitHub Desktop.
Get the maximum and minimum numbers of people in/out for Seoul Metro using Elasticsearch
import math
from pprint import pprint
import elasticsearch as es
import numpy as np
# Constants
INDEX_NAME = 'seoul-metro-2014'  # index every search in this script targets
THRESHOLD = 10  # cut-off for the per-station highest/lowest day lists
CHUNK_SIZE = 5000  # document chunk size

# Global holders, filled in as the script runs
stations, in_out_of_stations, gap_rank_list = [], [], []

# Single client shared by every query in this script
client = es.Elasticsearch(
    hosts=['http://doit-dev.lkaybob.pe.kr'],
    port=9200,
)
# Discover the stations present in the index: a terms aggregation over
# `station_name` yields one bucket per distinct name together with the number
# of documents carrying that name.
# NOTE(review): `'size': 0` meant "return every bucket" on older Elasticsearch
# releases; confirm the server version still accepts it.
unique_names_agg = {
    'terms': {
        'field': 'station_name',
        'size': 0,
    },
}
results = client.search(
    index=INDEX_NAME,
    body={'aggs': {'unique_station_names': unique_names_agg}},
)

# Record each bucket as a {'name', 'count'} entry in the shared list.
stations.extend(
    {'name': bucket['key'], 'count': bucket['doc_count']}
    for bucket in results['aggregations']['unique_station_names']['buckets']
)
def _daily_totals(buckets, metric):
    """Return ``[{'date', 'total'}, ...]`` for *metric*, busiest day first.

    *buckets* are the per-day buckets of the ``amount_per_day`` aggregation
    and *metric* is the name of the sum sub-aggregation to read
    (``'total_in'`` or ``'total_out'``).
    """
    totals = [
        {'date': bucket['key_as_string'], 'total': bucket[metric]['value']}
        for bucket in buckets
    ]
    totals.sort(key=lambda day: day['total'], reverse=True)
    return totals


def _gap_above_mean(days):
    """Return how far the first (busiest) entry of *days* exceeds the mean.

    *days* must be non-empty and sorted busiest-first.
    """
    return days[0]['total'] - np.mean([day['total'] for day in days])


# For every station, aggregate the people_in / people_out sums per day, keep
# the THRESHOLD busiest and quietest days, and record how far the single
# busiest day sits above the station's daily mean.
for station in stations:
    # One request per station is enough: the aggregation already covers every
    # matching document, so the former CHUNK_SIZE-based loop only re-sent the
    # same request (it never advanced an offset). The hits themselves are
    # never read, hence size=0.
    result = client.search(
        index=INDEX_NAME,
        doc_type='seoul-metro',
        body={
            'query': {
                'constant_score': {
                    'filter': {
                        'term': {
                            'station_name': station['name'],
                        },
                    },
                },
            },
            'aggs': {
                'amount_per_day': {
                    'date_histogram': {
                        'field': 'time_slot',
                        'interval': 'day',
                    },
                    'aggs': {
                        'total_in': {
                            'sum': {
                                'field': 'people_in',
                            },
                        },
                        'total_out': {
                            'sum': {
                                'field': 'people_out',
                            },
                        },
                    },
                },
            },
        },
        size=0,
    )

    buckets = result['aggregations']['amount_per_day']['buckets']
    if not buckets:
        # Nothing to rank for this station; indexing [0] below would raise.
        continue

    people_ins = _daily_totals(buckets, 'total_in')
    people_outs = _daily_totals(buckets, 'total_out')

    in_out_of_a_station = {
        'station': station['name'],
        'highest_people_ins': people_ins[:THRESHOLD],
        'highest_people_outs': people_outs[:THRESHOLD],
        # Walk backwards from the end so the quietest day comes first. The
        # stop index is -THRESHOLD - 1 so that exactly THRESHOLD entries are
        # kept (a stop of -THRESHOLD would drop one).
        'lowest_people_ins': people_ins[:-THRESHOLD - 1:-1],
        'lowest_people_outs': people_outs[:-THRESHOLD - 1:-1],
    }

    highest_gap = {
        'station': station['name'],
        'date_for_ins': people_ins[0]['date'],
        'date_for_outs': people_outs[0]['date'],
        'highest_people_ins_gap': _gap_above_mean(people_ins),
        'highest_people_outs_gap': _gap_above_mean(people_outs),
    }

    pprint(in_out_of_a_station)
    pprint(highest_gap)

    in_out_of_stations.append(in_out_of_a_station)
    gap_rank_list.append(highest_gap)
# Order stations by the gap between their busiest people-in day and their
# daily mean, largest first, then show the top 20.
gap_rank_list.sort(
    key=lambda gap: gap['highest_people_ins_gap'],
    reverse=True,
)
pprint(gap_rank_list[0:20])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment