Skip to content

Instantly share code, notes, and snippets.

@stevehanson
Last active November 22, 2020 21:22
Show Gist options
  • Star 19 You must be signed in to star a gist
  • Fork 11 You must be signed in to fork a gist
  • Save stevehanson/7462063 to your computer and use it in GitHub Desktop.
Save stevehanson/7462063 to your computer and use it in GitHub Desktop.
Interactive Python script to recursively index files in directory tree to elasticSearch using the elasticsearch-mapper-attachments (https://github.com/elasticsearch/elasticsearch-mapper-attachments) plugin to index files (pdf, docx, html, etc).
import os
import sys
# constants, configure to match your environment
HOST = 'http://localhost:9200'
INDEX = 'test'
TYPE = 'attachment'
TMP_FILE_NAME = 'tmp.json'
# for supported formats, see apache tika - http://tika.apache.org/1.4/formats.html
INDEX_FILE_TYPES = ['html','pdf', 'doc', 'docx', 'xls', 'xlsx', 'xml']
def main():
indexDirectory = raw_input('Index entire directory [Y/n]: ')
if not indexDirectory:
indexDirectory = 'y'
if indexDirectory.lower() == 'y':
dir = raw_input('Directory to index (relative to script): ')
indexDir(dir)
else:
fname = raw_input('File to index (relative to script): ')
createIndexIfDoesntExist()
indexFile(fname)
def indexFile(fname):
print '\nIndexing ' + fname
createEncodedTempFile(fname)
postFileToTheIndex()
os.remove(TMP_FILE_NAME)
print '\n-----------'
def indexDir(dir):
print 'Indexing dir ' + dir
createIndexIfDoesntExist()
for path, dirs, files in os.walk(dir):
for file in files:
fname = os.path.join(path,file)
base,extension = file.rsplit('.',1)
if extension.lower() in INDEX_FILE_TYPES:
indexFile(fname)
else:
'Skipping {}, not approved file type: {}'.format(fname, extension)
def postFileToTheIndex():
cmd = 'curl -X POST "{}/{}/{}" -d @'.format(HOST,INDEX,TYPE) + TMP_FILE_NAME
print cmd
os.system(cmd)
def createEncodedTempFile(fname):
import json
file64 = open(fname, "rb").read().encode("base64")
print 'writing JSON with base64 encoded file to temp file {}'.format(TMP_FILE_NAME)
f = open(TMP_FILE_NAME, 'w')
data = { 'file': file64, 'title': fname }
json.dump(data, f) # dump json to tmp file
f.close()
def createIndexIfDoesntExist():
import urllib2
class HeadRequest(urllib2.Request):
def get_method(self):
return "HEAD"
# check if type exists by sending HEAD request to index
try:
urllib2.urlopen(HeadRequest(HOST + '/' + INDEX + '/' + TYPE))
except urllib2.HTTPError, e:
if e.code == 404:
print 'Index doesnt exist, creating...'
os.system('curl -X PUT "{}/{}/{}/_mapping" -d'.format(HOST,INDEX,TYPE) + ''' '{
"attachment" : {
"properties" : {
"file" : {
"type" : "attachment",
"fields" : {
"title" : { "store" : "yes" },
"file" : { "term_vector":"with_positions_offsets", "store":"yes" }
}
}
}
}
}' ''')
else:
print 'Failed to retrieve index with error code - %s.' % e.code
# kick off the main function when script loads
main()
@agdemore
Copy link

agdemore commented Jul 6, 2016

Can this script index large files? Like 100mb?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment