Ingest CLTK-formatted JSON files into the CLTK archive MongoDB database.
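The walker in main() below looks for repositories laid out as ./&lt;repo_dir&gt;/cltk_json/*.json. As a rough sketch of the input it parses, here is a hypothetical example file; only the field names are taken from the code, while the path, titles, and text are invented for illustration:

# A hypothetical cltk_json input file, e.g.
# ./latin_text_example_corpus/cltk_json/example_work.json
example_work = {
    "author": "example author",
    "englishTitle": "Example Work",      # main() also accepts "work", "english_title", or "title"
    "originalTitle": "Exemplum",
    "edition": "Example Edition",
    "meta": "book-line",                 # any structure containing "line" is ingested as poetry
    "source": "Example Corpus",
    "sourceLink": "https://example.org/",
    "language": "latin",
    # nested dict keyed by 0-based string indices;
    # each leaf string becomes one document in db.texts
    "text": {
        "0": {"0": "first line of book 1", "1": "second line of book 1"},
        "1": {"0": "first line of book 2"},
    },
}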
import os
import json
import pdb
import datetime
import pymongo
import six
import re
import unicodedata
import concurrent.futures
from concurrent.futures import ProcessPoolExecutor, wait, as_completed
from django.utils.text import slugify
import logging
import time
import hashlib

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
LANGUAGES = [
    'sanskrit',
    'punjabi',
    'pali',
    'old_english',
    'latin',
    'javanese',
    'chinese',
    'hindi',
    'hebrew',
    'bengali',
    'greek',
    'malayalam',
    'prakrit',
    'old_church_slavonic',
    'middle_english',
    'telugu',
    'coptic',
]
client = pymongo.MongoClient('localhost', 27017, maxPoolSize=None)
db = client['cltk_frontend']


def mongo(db, host="localhost", port=27017):
    client = pymongo.MongoClient(host, port, maxPoolSize=None)
    # client = pymongo.MongoClient('mongodb://cltkAdmin:quidfaciatlaetassegetes@52.91.62.113')
    return client[db]
def md5(fname):
    """Return the MD5 hex digest of a file, read in 4 KB chunks."""
    hash_md5 = hashlib.md5()
    with open(fname, "rb") as f:
        for chunk in iter(lambda: f.read(4096), b""):
            hash_md5.update(chunk)
    return hash_md5.hexdigest()
def make_corpus_from_dirname(dirname):
    """Derive a human-readable corpus title from a text repository directory name."""
    corpus = dirname.replace("texts", "").replace("text", "")
    for language in LANGUAGES:
        corpus = re.sub('^' + language, '', corpus)
    corpus = corpus.replace("_", " ")
    corpus = corpus.strip()
    corpus = corpus.title()
    return corpus
def make_corpus_link_from_dirname(corpus):
    """Map a known corpus title to its homepage URL, if one is known."""
    corpusLink = ''
    if corpus.lower() == "Open Greek and Latin".lower():
        corpusLink = "https://github.com/OpenGreekAndLatin"
    elif corpus.lower() == "The First 1k Years Of Greek".lower():
        corpusLink = "http://opengreekandlatin.github.io/First1KGreek/"
    elif corpus.lower() == "Perseus Digital Library".lower():
        corpusLink = "http://www.perseus.tufts.edu/hopper/"
    else:
        logging.info("%s didn't have a link" % corpus)
    return corpusLink
def dictToText(textNodeMeta, textNode, n1=None, n2=None, n3=None, n4=None, n5=None):
    """Recursively walk a nested text dict and insert one db.texts document per leaf
    string, creating the author, corpus, work, and language records on first sight."""
    try:
        for key, node in textNode.items():
            if isinstance(node, dict):
                # Recurse one level deeper, carrying the 1-based index of the current level.
                if n4:
                    dictToText(textNodeMeta, node, n1, n2, n3, n4, (int(key) + 1))
                elif n3:
                    dictToText(textNodeMeta, node, n1, n2, n3, (int(key) + 1))
                elif n2:
                    dictToText(textNodeMeta, node, n1, n2, (int(key) + 1))
                elif n1:
                    dictToText(textNodeMeta, node, n1, (int(key) + 1))
                else:
                    dictToText(textNodeMeta, node, (int(key) + 1))
            else:
                existingAuthor = db.authors.find_one({'slug': slugify(textNodeMeta['author'])})
                existingCorpus = db.corpora.find_one({'slug': slugify(textNodeMeta['corpus'])})
                existingLanguage = db.languages.find_one({'slug': slugify(textNodeMeta['textLanguage'])})

                if existingAuthor:
                    existingWork = db.works.find_one({
                        'slug': slugify(textNodeMeta['workTitle']),
                        'authors': existingAuthor['_id']
                    })
                else:
                    existingWork = None
                    db.authors.insert({
                        'english_name': textNodeMeta['author'].title(),
                        'original_name': textNodeMeta['author'].title(),
                        'slug': slugify(textNodeMeta['author']),
                        'authorLanguages': [slugify(textNodeMeta['textLanguage'])],
                        'createdAt': datetime.datetime.now(),
                        'updatedAt': datetime.datetime.now(),
                    })
                    existingAuthor = db.authors.find_one({'slug': slugify(textNodeMeta['author'])})

                if slugify(textNodeMeta['textLanguage']) not in existingAuthor['authorLanguages']:
                    db.authors.update({
                        '_id': existingAuthor['_id']
                    }, {
                        '$addToSet': {
                            'authorLanguages': slugify(textNodeMeta['textLanguage'])
                        }
                    })

                if not existingCorpus:
                    db.corpora.insert({
                        'title': textNodeMeta['corpus'].title(),
                        'slug': slugify(textNodeMeta['corpus']),
                        'link': textNodeMeta['corpusLink'],
                        'corpusLanguages': [slugify(textNodeMeta['textLanguage'])],
                        'createdAt': datetime.datetime.now(),
                        'updatedAt': datetime.datetime.now(),
                    })
                else:
                    if slugify(textNodeMeta['textLanguage']) not in existingCorpus['corpusLanguages']:
                        db.corpora.update({
                            '_id': existingCorpus['_id']
                        }, {
                            '$addToSet': {
                                'corpusLanguages': slugify(textNodeMeta['textLanguage'])
                            }
                        })

                if not existingWork:
                    db.works.insert({
                        'english_title': textNodeMeta['workTitle'].title(),
                        'original_title': textNodeMeta['originalTitle'].title(),
                        'slug': slugify(textNodeMeta['workTitle']),
                        'workLanguage': slugify(textNodeMeta['textLanguage']),
                        'corpus': slugify(textNodeMeta['corpus']),
                        'authors': [existingAuthor['_id']],
                        'edition': textNodeMeta['edition'],
                        'structure': textNodeMeta['structure'],
                        'form': textNodeMeta['form'],
                        'filename': textNodeMeta['filename'],
                        'hash': textNodeMeta['hash'],
                        'createdAt': datetime.datetime.now(),
                        'updatedAt': datetime.datetime.now(),
                    })
                    existingWork = db.works.find_one({
                        'slug': slugify(textNodeMeta['workTitle']),
                        'authors': existingAuthor['_id']
                    })

                if not existingLanguage:
                    db.languages.insert({
                        'title': textNodeMeta['textLanguage'].title(),
                        'slug': slugify(textNodeMeta['textLanguage']),
                        'createdAt': datetime.datetime.now(),
                        'updatedAt': datetime.datetime.now(),
                    })

                ingestText = {
                    'work': existingWork['_id'],
                    'author': slugify(textNodeMeta['author']),
                    'corpus': slugify(textNodeMeta['corpus']),
                    'textLanguage': slugify(textNodeMeta['textLanguage']),
                    'text': node,
                    'html': node,
                    'comments': [],
                    'entities': [],
                    'createdAt': datetime.datetime.now(),
                    'updatedAt': datetime.datetime.now(),
                }

                # Record the 1-based position of this leaf at each structural level.
                if n4:
                    ingestText['n_5'] = int(key) + 1
                    ingestText['n_4'] = int(n4)
                    ingestText['n_3'] = int(n3)
                    ingestText['n_2'] = int(n2)
                    ingestText['n_1'] = int(n1)
                elif n3:
                    ingestText['n_4'] = int(key) + 1
                    ingestText['n_3'] = int(n3)
                    ingestText['n_2'] = int(n2)
                    ingestText['n_1'] = int(n1)
                elif n2:
                    ingestText['n_3'] = int(key) + 1
                    ingestText['n_2'] = int(n2)
                    ingestText['n_1'] = int(n1)
                elif n1:
                    ingestText['n_2'] = int(key) + 1
                    ingestText['n_1'] = int(n1)
                else:
                    ingestText['n_1'] = int(key) + 1

                db.texts.insert(ingestText)

    except Exception as e:
        logging.info(' --------- broken %s' % textNodeMeta)
        logging.info(' --------- exception %s' % e)
def main():
    """Walk the current directory for */cltk_json/*.json files and ingest them in parallel."""
    start_time = time.time()

    logging.info(' -- resetting db')
    # logging.info(' -- -- authors')
    # db.authors.remove({})
    # logging.info(' -- -- language')
    # db.languages.remove({})
    logging.info(' -- -- corpora')
    db.corpora.remove({})
    logging.info(' -- -- works')
    db.works.remove({})
    logging.info(' -- -- texts')
    db.texts.remove({})

    # multiprocess ingest
    pool = ProcessPoolExecutor(6)
    futures = []

    for root, dirs, files in os.walk("."):
        path = root.split('/')

        if len(path) == 2:
            logging.info((((len(path) - 1) * '---') + os.path.basename(root)))

        if len(path) > 2 and path[2] == 'cltk_json':
            logging.info((((len(path) - 1) * '---') + os.path.basename(root)))

            for fname in files:
                ext = os.path.splitext(fname)
                if ext[1] == ".json":
                    logging.info(((len(path) * '---') + fname))

                    with open(os.path.join(root, fname), 'r') as f:
                        data = json.load(f)

                        # Work title may appear under several keys depending on the corpus.
                        if 'work' in data:
                            work = data['work']
                        elif 'english_title' in data:
                            work = data['english_title']
                        elif 'englishTitle' in data:
                            work = data['englishTitle']
                        elif 'title' in data:
                            work = data['title']
                        else:
                            logging.info(' ############## broken text: %s' % fname)
                            continue

                        if 'originalTitle' in data:
                            originalTitle = data['originalTitle']
                        else:
                            originalTitle = work

                        if 'edition' in data:
                            edition = data['edition']
                        else:
                            edition = ''

                        author = data['author']

                        if 'meta' in data:
                            structure = data['meta'].lower()
                        else:
                            structure = ''

                        if 'source' in data:
                            corpus = data['source']
                        else:
                            corpus = make_corpus_from_dirname(path[1])

                        if 'sourceLink' in data:
                            corpusLink = data['sourceLink']
                        else:
                            corpusLink = make_corpus_link_from_dirname(corpus)

                        language = ''
                        if 'language' in data:
                            language = data['language']

                        form = 'prose'
                        if 'line' in structure:
                            form = 'poetry'

                        filehash = md5(os.path.join(root, fname))

                        textNodeMeta = {
                            'workTitle': work,
                            'filename': fname,
                            'originalTitle': originalTitle,
                            'edition': edition,
                            'author': author,
                            'corpus': corpus,
                            'corpusLink': corpusLink,
                            'textLanguage': language,
                            'structure': structure,
                            'form': form,
                            'hash': filehash,
                        }

                        # dictToText(textNodeMeta, data['text'])
                        futures.append(pool.submit(dictToText, textNodeMeta, data['text']))

    nWorks = len(futures)
    for i, x in enumerate(as_completed(futures)):
        logging.info("--- %s minutes : %s work of %s ---" % ((time.time() - start_time) / 60, i, nWorks))
    wait(futures)

    logging.info("--- %s minutes ---" % ((time.time() - start_time) / 60))


if __name__ == '__main__':
    main()