Created
June 3, 2016 13:42
-
-
Save ael-code/98eed9d336ddafc504768bba03348d5f to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json | |
import string | |
import sys | |
import re, os | |
import logging | |
from archivant import Archivant | |
# Log everything from this script, but silence the chatty client libraries.
logging.basicConfig(level=logging.DEBUG)
logging.getLogger("urllib3").setLevel(logging.WARNING)
logging.getLogger("elasticsearch").setLevel(logging.WARNING)
log = logging.getLogger("importer")

# Configuration passed to Archivant below.
defaults = dict(
    FSDB_PATH="/mnt/data/libreantDB/fsdb/",
    ES_HOSTS=None,
    ES_INDEXNAME="propirata_v1",
)

# Alternative configuration pointing at a throw-away location; it is not
# used by default (swap it in for `defaults` below to try an import safely).
test = dict(
    FSDB_PATH="/tmp/fsdb/",
    ES_HOSTS=None,
    ES_INDEXNAME="test",
)

# Archive handle shared by the rest of the script.
ar = Archivant(defaults)
# Directory to import: first command-line argument, or the current working
# directory when no argument was given.
try:
    dir_path = sys.argv[1]
except IndexError:
    # Only a missing argument is expected here; anything else should surface.
    dir_path = os.getcwd()
def read_from_path(dir_path=dir_path):
    """Walk ``dir_path`` and yield one import record per supported document.

    Files whose name starts with a dot are skipped silently, files without
    an extension are skipped with a debug log entry, and files whose
    extension is not exactly ``pdf``, ``epub`` or ``doc`` are ignored.

    Args:
        dir_path: Root directory to scan. Defaults to the value of the
            module-level ``dir_path`` at the time this function is defined.

    Yields:
        ``(metadata, attachments)`` tuples where ``metadata`` is a dict with

        * ``title``: the file name without its extension,
        * ``original_path``: the file path relative to the scanned root,
        * ``_language``: always ``"it"``,
        * ``tag<i>``: one entry per non-empty directory component between
          the root and the file, ``i`` being the component's position in
          the "/"-split relative path,

        and ``attachments`` is a one-element list ``[{'file': <path>}]``
        holding the path of the file as produced by ``os.walk``.
    """
    supported_exts = ("pdf", "epub", "doc")

    # Drop a single trailing slash so relative paths below start with "/".
    if dir_path.endswith("/"):
        dir_path = dir_path[:-1]
    root_len = len(dir_path)

    for path, _dirnames, files in os.walk(dir_path):
        # Part of the current directory path that lies below the root.
        relative_dir = path[root_len:]
        for _file in files:
            if _file.startswith('.'):
                continue
            # str.rsplit works on both Python 2 and 3; the string-module
            # function used previously no longer exists in Python 3.
            splitted = _file.rsplit('.', 1)
            if len(splitted) != 2:
                log.debug("skipping: '{}'".format(os.path.join(path, _file)))
                continue
            filename, ext = splitted
            if ext not in supported_exts:
                continue

            metadata = {}
            # Every directory level under the root becomes a numbered tag.
            for i, tag in enumerate(relative_dir.split("/")):
                if tag != '':
                    metadata["tag" + str(i)] = tag
            metadata['title'] = filename
            metadata['original_path'] = relative_dir + "/" + _file
            metadata['_language'] = "it"

            attachments = {'file': path + "/" + _file}
            yield metadata, [attachments]
def already_exists(metadata):
    """Tell whether a volume with the same ``original_path`` is already stored.

    Args:
        metadata: Dict containing at least the ``original_path`` key.

    Returns:
        True when searching the archive database for that exact
        ``original_path`` phrase reports at least one hit, False otherwise.
    """
    # Escape backslashes and double quotes so a path containing them cannot
    # terminate the quoted phrase early and corrupt the query.
    # NOTE(review): assumes user_search accepts Lucene query-string syntax,
    # as the field:"phrase" form suggests - confirm against Archivant.
    escaped_path = (
        metadata['original_path'].replace('\\', '\\\\').replace('"', '\\"')
    )
    res = ar._db.user_search('original_path:"{}"'.format(escaped_path))
    return res['hits']['total'] > 0
# Import every discovered document, skipping those whose original path is
# already present in the archive so the script can be re-run safely.
for metadata, attachments in read_from_path():
    log.debug("Adding file: '{}'".format(metadata['original_path']))
    if already_exists(metadata):
        log.debug("Skipping already existing file: '{}'".format(metadata['original_path']))
        continue
    ar.insert_volume(metadata, attachments)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment