Skip to content

Instantly share code, notes, and snippets.

@rendoaw
Created February 9, 2018 02:24
Show Gist options
  • Save rendoaw/d212d8ecf0b226e7328434cdad6a8c5c to your computer and use it in GitHub Desktop.
Save rendoaw/d212d8ecf0b226e7328434cdad6a8c5c to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
import os
import json
import datetime
from elasticsearch import Elasticsearch,helpers
import sys
import time
def read_file(filename):
    """Return the lines of *filename* as a list, newline characters removed.

    An empty (or otherwise falsy) filename yields an empty list instead of
    failing on an unbound local, which is what the previous version did.
    """
    if not filename:
        return []
    # The context manager closes the handle even if reading raises.
    with open(filename) as finput:
        return [line.replace('\n', '') for line in finput]
def jsonpretty(text):
    """Serialise *text* to a JSON string with 4-space indent and sorted keys.

    Despite the parameter name, *text* is the object handed to
    ``json.dumps`` (e.g. a dict or list), not an already-encoded string.
    """
    return json.dumps(text, indent=4, sort_keys=True)
def write_json_file(d, outfilename):
    """Write *d* to *outfilename* as JSON with a 4-space indent.

    The file is opened in ``"w"`` mode, so existing content is replaced.
    Returns ``None``.
    """
    # The context manager closes the handle even if json.dump raises
    # (for example on a value that is not JSON-serialisable).
    with open(outfilename, "w") as fo:
        json.dump(d, fo, indent=4)
def _build_action(doc):
    """Enrich one decoded message dict in place and wrap it as a bulk action.

    Adds ``ts``, ``ts_hour`` and ``ts_day`` derived from ``doc['date']``
    (interpreted by ``time.gmtime``, i.e. as seconds since the epoch, UTC),
    fills in default ``media``, ``text_len``, ``from.print_name`` and
    ``from.phone`` values, and returns the action dict for the ``el99``
    index.
    """
    # Convert the timestamp once instead of once per derived field.
    stamp = time.gmtime(doc['date'])
    doc['ts'] = time.strftime('%Y-%m-%dT%H:%M:%S.000Z', stamp)
    doc['ts_hour'] = time.strftime('%H', stamp)
    doc['ts_day'] = time.strftime('%w', stamp)

    if 'text' in doc:
        # A message with text but no media entry is tagged as media type "text".
        if 'media' not in doc:
            doc['media'] = {'type': 'text'}
        doc['text_len'] = len(doc['text'])

    sender = doc['from']
    if 'print_name' not in sender:
        # Either name part may be absent; fall back to an empty string
        # rather than raising KeyError.
        sender['print_name'] = (
            sender.get('first_name', '') + '_' + sender.get('last_name', '')
        )
    if 'phone' not in sender:
        sender['phone'] = "0000000000"

    return {
        "_index": "el99",
        "_type": "telegram",
        # str() on the sender id so a numeric id does not raise TypeError.
        "_id": str(doc['date']) + "_" + str(sender['id']),
        "_source": doc,
    }


if __name__ == "__main__":
    es = Elasticsearch([{'host': 'localhost', 'port': 9200}])
    lines = read_file("json/EL99_Sinergi_Bangun_Negeri.jsonl")

    # One JSON document per line; blank lines are skipped so that a trailing
    # empty line does not make json.loads fail.
    actions = [_build_action(json.loads(line)) for line in lines if line.strip()]

    # Keep a local copy of exactly what is sent to Elasticsearch.
    write_json_file(actions, "dump.json")

    # Recreate the index from scratch; 400/404 from the delete are ignored
    # so that a missing index is not an error.
    es.indices.delete(index='el99', ignore=[400, 404])
    es.indices.create(index='el99')
    res = helpers.bulk(es, actions)
    # Parenthesised form prints the same on Python 2 and Python 3.
    print(res)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment