Skip to content

Instantly share code, notes, and snippets.

@mooreniemi
Created October 23, 2020 02:57
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mooreniemi/0b31f3406e6635557ab41cbb492fb6f0 to your computer and use it in GitHub Desktop.
Save mooreniemi/0b31f3406e6635557ab41cbb492fb6f0 to your computer and use it in GitHub Desktop.
# a script to load seshat data into Elasticsearch
# seshat data is clearly graphdb data, but ES is nifty, so...
# assumes you have set up elasticsearch and kibana
import pandas as pd
from elasticsearch import Elasticsearch
from elasticsearch.helpers import parallel_bulk
# assumes local ES (default host/port)
es = Elasticsearch()
index_name = "seshat"
print(f"setting up index {index_name}")
# Recreate the index from scratch. Deleting a *missing* index returns 404
# (index_not_found_exception), not 400 — the original ignored 400 here and
# would crash on the first run against a fresh cluster. Creating an index
# that already exists returns 400, so 400 is the right status to ignore
# on create.
es.indices.delete(index=index_name, ignore=404)
es.indices.create(index=index_name, ignore=400)
def gendata(records, index=None):
    """Yield bulk-indexable actions for elasticsearch.helpers.parallel_bulk.

    Each record is copied and tagged with the target ``"_index"`` key; the
    original version called ``record.update(...)`` and therefore mutated the
    caller's dicts in place.

    Args:
        records: iterable of dicts, e.g. from ``DataFrame.to_dict("records")``.
        index: target index name; when ``None`` (the default, matching the
            original behavior) falls back to the module-level ``index_name``.

    Yields:
        dict: the record's fields plus an ``"_index"`` entry.
    """
    target = index_name if index is None else index
    for record in records:
        # would be more efficient to do in pandas but w/e
        yield {**record, "_index": target}
# Load the CSV and stream every row into Elasticsearch in parallel.
# fillna("") presumably keeps NaN out of the JSON payload (NaN is not
# valid JSON) — confirm against the dataset.
df = pd.read_csv("download.csv").fillna("")
records = df.to_dict("records")
for success, info in parallel_bulk(
    es, gendata(records), chunk_size=1000, thread_count=5, queue_size=16
):
    if not success:
        # `info` is a dict describing the failed action, so the original
        # `"failed: " + info` raised TypeError on the first failure and
        # aborted the load; format it instead.
        print(f"failed: {info}")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment