Last active
February 13, 2017 11:39
-
-
Save stevehanson/7461706 to your computer and use it in GitHub Desktop.
Example using the elasticsearch-mapper-attachments (https://github.com/elasticsearch/elasticsearch-mapper-attachments) plugin to index files (pdf, docx, html, etc). Usage: "python es-attach.py my-filename". Credit to Lukas Vlcek's similar Gist using Perl - https://gist.github.com/lukas-vlcek/1075067.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import base64
import json
import os
import subprocess
import sys
# Constants -- configure these to match your environment.
HOST = 'http://localhost:9200'  # base URL the requests are sent to
INDEX = 'test'                  # index segment of the request path
TYPE = 'attachment'             # type segment of the request path
TMP_FILE_NAME = 'tmp.json'      # scratch file holding the JSON request body
def main():
    """Index the file named on the command line.

    Reads the filename from ``sys.argv[1]``, writes the encoded JSON payload
    to ``TMP_FILE_NAME``, makes sure the mapping exists, posts the payload,
    and finally removes the temp file.

    Prints a usage message and exits with status 1 when no filename is given.
    """
    if len(sys.argv) < 2:
        print('No filename provided.\nUsage: "python es-attach.py filename".\nExiting...')
        # sys.exit instead of the site-provided exit(), which is intended for
        # the interactive prompt; a non-zero status signals the usage error.
        sys.exit(1)

    fname = sys.argv[1]
    createEncodedTempFile(fname)
    try:
        createIndexIfDoesntExist()
        postFileToTheIndex()
    finally:
        # Remove the scratch file even when one of the requests above raised.
        os.remove(TMP_FILE_NAME)
def postFileToTheIndex():
    """POST the JSON document stored in ``TMP_FILE_NAME`` to ``HOST/INDEX/TYPE``.

    The request is made with the external ``curl`` program; the equivalent
    command line is printed before it runs.

    Returns:
        The exit status of the ``curl`` process.

    Raises:
        OSError: if ``curl`` cannot be started.
    """
    url = '{}/{}/{}'.format(HOST, INDEX, TYPE)
    print('curl -X POST "{}" -d @{}'.format(url, TMP_FILE_NAME))
    # An argument list needs no shell, so the URL and file name are passed
    # to curl verbatim with no quoting concerns.
    return subprocess.call(['curl', '-X', 'POST', url, '-d', '@' + TMP_FILE_NAME])
def createEncodedTempFile(fname):
    """Write the base64-encoded contents of ``fname`` as JSON to ``TMP_FILE_NAME``.

    The JSON object has two keys: ``file`` (the base64 text of the file) and
    ``title`` (``fname`` exactly as given).

    Args:
        fname: Path of the file to encode.
    """
    with open(fname, 'rb') as source:
        raw = source.read()
    # base64.b64encode runs on Python 2 and 3 (str.encode("base64") exists
    # only on Python 2) and emits no line breaks; decode so json gets text.
    file64 = base64.b64encode(raw).decode('ascii')

    print('writing JSON with base64 encoded file to temp file {}'.format(TMP_FILE_NAME))
    data = {'file': file64, 'title': fname}
    with open(TMP_FILE_NAME, 'w') as f:
        json.dump(data, f)  # dump json to tmp file
def createIndexIfDoesntExist():
    """PUT the attachment mapping when ``HOST/INDEX/TYPE`` does not exist.

    Sends a HEAD request to ``HOST/INDEX/TYPE``.  On a 404 response the
    mapping is PUT to ``HOST/INDEX/TYPE/_mapping`` with the external ``curl``
    program; any other HTTP error code is reported on stdout.  Nothing is
    done when the HEAD request succeeds.
    """
    # Imported here with a fallback so the script runs on Python 2 and 3.
    try:
        import urllib2 as urlrequest          # Python 2
    except ImportError:
        import urllib.request as urlrequest   # Python 3

    class HeadRequest(urlrequest.Request):
        """Request that reports HEAD as its HTTP method."""

        def get_method(self):
            return "HEAD"

    url = '{}/{}/{}'.format(HOST, INDEX, TYPE)
    mapping = {
        'attachment': {
            'properties': {
                'file': {
                    'type': 'attachment',
                    'fields': {
                        'title': {'store': 'yes'},
                        'file': {
                            'term_vector': 'with_positions_offsets',
                            'store': 'yes',
                        },
                    },
                },
            },
        },
    }

    # check if type exists by sending HEAD request to index
    try:
        # Close the response right away; only the status matters here.
        urlrequest.urlopen(HeadRequest(url)).close()
    except urlrequest.HTTPError as e:
        if e.code == 404:
            print('Index doesnt exist, creating...')
            # Argument list instead of a shell string: the JSON body needs
            # no shell quoting and is serialized by json.dumps.
            subprocess.call(
                ['curl', '-X', 'PUT', url + '/_mapping', '-d', json.dumps(mapping)]
            )
        else:
            print('Failed to retrieve index with error code - %s.' % e.code)
# Kick off the main function only when the file is run as a script, so that
# loading it as a module does not trigger any requests.
if __name__ == '__main__':
    main()
@rboyd, @Analect: Did you install https://github.com/elastic/elasticsearch-mapper-attachments ?
@Analect, did you figure this out? It appears that elasticsearch-mapper-attachments only looks at the first 100,000 chars for indexing. It is possible you are missing parts of the PDF if it is larger than that.
@stevehanson can you please explain the that you have used..I am a bit confused.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
@Analect you ever figure this out? I have similar results.