Skip to content

Instantly share code, notes, and snippets.

@mooreniemi
Created October 23, 2020 02:57
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mooreniemi/0b31f3406e6635557ab41cbb492fb6f0 to your computer and use it in GitHub Desktop.
Save mooreniemi/0b31f3406e6635557ab41cbb492fb6f0 to your computer and use it in GitHub Desktop.
# a script to load seshat data into Elasticsearch
# seshat data is clearly graphdb data, but ES is nifty, so...
# assumes you have set up elasticsearch and kibana
import pandas as pd
from elasticsearch import Elasticsearch
from elasticsearch.helpers import parallel_bulk
# assumes local ES (default host/port)
es = Elasticsearch()
index_name = "seshat"
print(f"setting up index {index_name}")
# Recreate the index from scratch. Deleting a *missing* index returns 404
# (index_not_found_exception), not 400 — the original ignored 400 here and
# would crash on the first run against a fresh cluster. Creating an index
# that already exists returns 400, so 400 is the right status to ignore
# on create.
es.indices.delete(index=index_name, ignore=404)
es.indices.create(index=index_name, ignore=400)
def gendata(records, index=None):
    """Yield bulk-indexable actions for elasticsearch.helpers.parallel_bulk.

    Each record is copied and tagged with the target ``"_index"`` key; the
    original version called ``record.update(...)`` and therefore mutated the
    caller's dicts in place.

    Args:
        records: iterable of dicts, e.g. from ``DataFrame.to_dict("records")``.
        index: target index name; when ``None`` (the default, matching the
            original behavior) falls back to the module-level ``index_name``.

    Yields:
        dict: the record's fields plus an ``"_index"`` entry.
    """
    target = index_name if index is None else index
    for record in records:
        # would be more efficient to do in pandas but w/e
        yield {**record, "_index": target}
# Load the CSV and stream every row into Elasticsearch in parallel.
# fillna("") presumably keeps NaN out of the JSON payload (NaN is not
# valid JSON) — confirm against the dataset.
df = pd.read_csv("download.csv").fillna("")
records = df.to_dict("records")
for success, info in parallel_bulk(
    es, gendata(records), chunk_size=1000, thread_count=5, queue_size=16
):
    if not success:
        # `info` is a dict describing the failed action, so the original
        # `"failed: " + info` raised TypeError on the first failure and
        # aborted the load; format it instead.
        print(f"failed: {info}")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment