"""
Interactive Python script to recursively index the files in a directory tree
into Elasticsearch, using the elasticsearch-mapper-attachments plugin to index
file contents (pdf, docx, html, etc.).
"""
import os
import sys
# Settings -- adjust these to match your environment.
# HOST, INDEX and TYPE are joined as "<HOST>/<INDEX>/<TYPE>" to build request URLs.
HOST = "http://localhost:9200"
INDEX = "test"
TYPE = "attachment"
# Scratch file holding the JSON payload that is handed to curl.
TMP_FILE_NAME = "tmp.json"
# Lower-case file extensions (without the dot) that are eligible for indexing.
# For the full set of formats the plugin can parse, see the Apache Tika
# supported-formats documentation.
INDEX_FILE_TYPES = [
    "html",
    "pdf",
    "doc",
    "docx",
    "xls",
    "xlsx",
    "xml",
]
def main():
    """Ask what to index, then index a directory tree or a single file.

    The first prompt defaults to "y" (index a directory) when the user just
    presses Enter. Any answer other than "y"/"Y" selects single-file mode.
    The index mapping is checked/created before anything is indexed.
    """
    answer = raw_input('Index entire directory [Y/n]: ')
    if not answer:
        # Empty input means "accept the default", which is Y.
        answer = 'y'

    if answer.lower() == 'y':
        # Local is not called `dir`, so the builtin is not shadowed here.
        target_dir = raw_input('Directory to index (relative to script): ')
        createIndexIfDoesntExist()
        indexDir(target_dir)
    else:
        # Only ask for a file when the user declined directory mode.
        fname = raw_input('File to index (relative to script): ')
        createIndexIfDoesntExist()
        indexFile(fname)
def indexFile(fname):
    """Index a single file.

    Writes the file as a base64 JSON payload to TMP_FILE_NAME, posts that
    payload to the index, and removes the temp file afterwards.

    fname -- path of the file to index.
    """
    print('\nIndexing ' + fname)
    try:
        createEncodedTempFile(fname)
        postFileToTheIndex()
    finally:
        # Always clean up the scratch file, even if encoding or posting failed.
        # The existence check avoids masking the original error when the temp
        # file was never created.
        if os.path.exists(TMP_FILE_NAME):
            os.remove(TMP_FILE_NAME)
    print('\n-----------')
def indexDir(dir):
    """Recursively index every approved file found under a directory.

    dir -- root directory to walk (parameter name kept for compatibility,
           although it shadows the builtin).

    A file is indexed when its extension (case-insensitive, without the dot)
    is listed in INDEX_FILE_TYPES; every other file is reported and skipped.
    """
    print('Indexing dir ' + dir)
    for path, _subdirs, files in os.walk(dir):
        for name in files:
            fname = os.path.join(path, name)
            # splitext copes with names that contain no dot at all, where
            # rsplit('.', 1) would fail to unpack into two values.
            extension = os.path.splitext(name)[1][1:]
            if extension.lower() in INDEX_FILE_TYPES:
                indexFile(fname)
            else:
                print('Skipping {}, not approved file type: {}'.format(fname, extension))
def postFileToTheIndex():
    """POST the JSON payload stored in TMP_FILE_NAME to the index using curl.

    The command is echoed before it is run. A non-zero exit status from the
    shell is reported rather than silently ignored.
    """
    cmd = 'curl -X POST "{}/{}/{}" -d @'.format(HOST, INDEX, TYPE) + TMP_FILE_NAME
    print(cmd)
    status = os.system(cmd)
    if status != 0:
        print('curl exited with non-zero status {}'.format(status))
def createEncodedTempFile(fname):
    """Write a JSON document containing the base64-encoded file to TMP_FILE_NAME.

    fname -- path of the file to encode; it is also stored as the document's
             'title'.

    The JSON object has two keys: 'file' (base64 content) and 'title'.
    """
    import json

    # Read in binary mode so the bytes are encoded exactly as stored on disk.
    with open(fname, "rb") as src:
        # NOTE: the "base64" str codec is a Python 2 feature.
        file64 = src.read().encode("base64")

    print('writing JSON with base64 encoded file to temp file {}'.format(TMP_FILE_NAME))

    data = {'file': file64, 'title': fname}
    # The context manager guarantees the payload is flushed and closed before
    # another process reads the temp file.
    with open(TMP_FILE_NAME, 'w') as f:
        json.dump(data, f)  # dump json to tmp file
def createIndexIfDoesntExist():
    """Create the attachment mapping if the index/type does not exist yet.

    Sends a HEAD request to HOST/INDEX/TYPE. On HTTP 404 the mapping is
    created with curl; any other HTTP error is reported and nothing is
    created. A successful HEAD response means there is nothing to do.
    """
    import urllib2  # Python 2 standard library

    class HeadRequest(urllib2.Request):
        """urllib2 request that uses the HEAD method instead of GET."""

        def get_method(self):
            return "HEAD"

    # check if type exists by sending HEAD request to index
    try:
        urllib2.urlopen(HeadRequest(HOST + '/' + INDEX + '/' + TYPE)).close()
    except urllib2.HTTPError as e:
        if e.code == 404:
            print('Index doesnt exist, creating...')

            # The JSON body is passed to curl inside single quotes, so it must
            # not contain single quotes itself.
            os.system('curl -X PUT "{}/{}/{}/_mapping" -d'.format(HOST, INDEX, TYPE) + ''' '{
  "attachment" : {
    "properties" : {
      "file" : {
        "type" : "attachment",
        "fields" : {
          "title" : { "store" : "yes" },
          "file" : { "term_vector":"with_positions_offsets", "store":"yes" }
        }
      }
    }
  }
}' ''')
        else:
            # Only report a failure for errors other than "not found".
            print('Failed to retrieve index with error code - %s.' % e.code)
# kick off the main function when the script is run directly
if __name__ == '__main__':
    main()
jasonheine commented Jun 16, 2015


I'm attempting to index a 50 MB directory (430 files total). After using this script with the following parameters:

        os.system('curl -X PUT "{}/{}/{}/_mapping" -d'.format(HOST, INDEX, TYPE) + ''' '{
              "attachment" : {
                "properties" : {
                  "file" : {
                    "type" : "attachment",
                    "fields" : {
                        "content" : {"store" : "yes"},
                        "title" : {"store" : "yes"},
                        "content_type" : {"store" : "yes"},
                        "content_length" : {"store" : "yes"},
            }' ''')

I've noticed that the index size is about 230 MB. I've been using DTSearch for the longest time, and the index size is about 8 MB.

Am I doing something wrong?

Is there a way to compress the contents of the documents being indexed?

Also, does it store the physical file in the index as base64? Is there a way to prevent this if it does?

Also, I can't seem to search the actual contents of the documents; I only seem to be able to search on the title field or other metadata.


agdemore commented Jul 6, 2016

Can this script index large files, e.g. around 100 MB?

