iapilgrim/csv_to_elastic_search_bulk_insert.py

## csv_to_elastic_search_bulk_insert.py
from pyelasticsearch import ElasticSearch
import pandas as pd
from time import time

root_path="/home/clemsos/Dev/mitras/"
raw_data_path=root_path+"data/"
csv_filename="week10.csv"

t0=time()

# size of the bulk
chunksize=5000

# open csv file
f = open(raw_data_path+csv_filename) # read csv

# parse csv with pandas
csvfile=pd.read_csv(f, iterator=True, chunksize=chunksize)

# init ElasticSearch
es = ElasticSearch('http://localhost:9200/')

# init index
try :
    es.delete_index("weiboscope")
except :
    pass

es.create_index("weiboscope")

# start bulk indexing
print "now indexing %s..."%(csv_filename)

for i,df in enumerate(csvfile):
    print i
    records=df.where(pd.notnull(df), None).T.to_dict()
    list_records=[records[it] for it in records]
    try :
        es.bulk_index("weiboscope","tweet",list_records)
    except :
        print "error!, skiping some tweets sorry"
        pass

print "done in %.3fs"%(time()-t0)
	from pyelasticsearch import ElasticSearch
	import pandas as pd
	from time import time

	root_path="/home/clemsos/Dev/mitras/"
	raw_data_path=root_path+"data/"
	csv_filename="week10.csv"

	t0=time()

	# size of the bulk
	chunksize=5000

	# open csv file
	f = open(raw_data_path+csv_filename) # read csv

	# parse csv with pandas
	csvfile=pd.read_csv(f, iterator=True, chunksize=chunksize)

	# init ElasticSearch
	es = ElasticSearch('http://localhost:9200/')

	# init index
	try :
	es.delete_index("weiboscope")
	except :
	pass

	es.create_index("weiboscope")

	# start bulk indexing
	print "now indexing %s..."%(csv_filename)

	for i,df in enumerate(csvfile):
	print i
	records=df.where(pd.notnull(df), None).T.to_dict()
	list_records=[records[it] for it in records]
	try :
	es.bulk_index("weiboscope","tweet",list_records)
	except :
	print "error!, skiping some tweets sorry"
	pass

	print "done in %.3fs"%(time()-t0)