Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save iapilgrim/9068644 to your computer and use it in GitHub Desktop.
Save iapilgrim/9068644 to your computer and use it in GitHub Desktop.
from pyelasticsearch import ElasticSearch
import pandas as pd
from time import time
root_path="/home/clemsos/Dev/mitras/"
raw_data_path=root_path+"data/"
csv_filename="week10.csv"
t0=time()
# size of the bulk
chunksize=5000
# open csv file
f = open(raw_data_path+csv_filename) # read csv
# parse csv with pandas
csvfile=pd.read_csv(f, iterator=True, chunksize=chunksize)
# init ElasticSearch
es = ElasticSearch('http://localhost:9200/')
# init index
try :
es.delete_index("weiboscope")
except :
pass
es.create_index("weiboscope")
# start bulk indexing
print "now indexing %s..."%(csv_filename)
for i,df in enumerate(csvfile):
print i
records=df.where(pd.notnull(df), None).T.to_dict()
list_records=[records[it] for it in records]
try :
es.bulk_index("weiboscope","tweet",list_records)
except :
print "error!, skiping some tweets sorry"
pass
print "done in %.3fs"%(time()-t0)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment