@lukehollis · Created June 1, 2017
Ingest CLTK-formatted JSON files into the CLTK archive Mongo database.
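Each input file is expected to look roughly like the sketch below. The field names are the ones read in main() (several alternate title keys are accepted: work, english_title, englishTitle, title); the values here are hypothetical, and the nested "text" dicts use zero-based string keys for each citation level:

    {
        "author": "homer",
        "englishTitle": "Iliad",
        "originalTitle": "Ilias",
        "edition": "example edition",
        "meta": "book-line",
        "source": "Perseus Digital Library",
        "sourceLink": "http://www.perseus.tufts.edu/hopper/",
        "language": "greek",
        "text": {
            "0": {
                "0": "first line of book 1 ...",
                "1": "second line of book 1 ..."
            }
        }
    }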
import os
import json
import datetime
import re
import hashlib
import logging
import time

import pymongo
from concurrent.futures import ProcessPoolExecutor, wait, as_completed
from django.utils.text import slugify

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
LANGUAGES = [
    'sanskrit',
    'punjabi',
    'pali',
    'old_english',
    'latin',
    'javanese',
    'chinese',
    'hindi',
    'hebrew',
    'bengali',
    'greek',
    'malayalam',
    'prakrit',
    'old_church_slavonic',
    'middle_english',
    'telugu',
    'coptic',
]
client = pymongo.MongoClient('localhost', 27017, maxPoolSize=None)
db = client['cltk_frontend']


def mongo(db_name, host='localhost', port=27017):
    """Return a handle to the named database on the given host."""
    client = pymongo.MongoClient(host, port, maxPoolSize=None)
    # client = pymongo.MongoClient('mongodb://cltkAdmin:quidfaciatlaetassegetes@52.91.62.113')
    return client[db_name]
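# Note: MongoClient is not fork-safe, and the workers spawned by
# ProcessPoolExecutor in main() inherit this module-level client on fork.
# pymongo's documented recommendation is to create clients after forking;
# treat the shared client here as a known caveat rather than a pattern to copy.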
def md5(fname):
    """Return the md5 hex digest of a file, read in 4KB chunks."""
    hash_md5 = hashlib.md5()
    with open(fname, "rb") as f:
        for chunk in iter(lambda: f.read(4096), b""):
            hash_md5.update(chunk)
    return hash_md5.hexdigest()
def make_corpus_from_dirname(dirname):
    """Derive a human-readable corpus title from a corpus directory name."""
    corpus = dirname.replace("texts", "").replace("text", "")
    for language in LANGUAGES:
        corpus = re.sub('^' + language, '', corpus)
    corpus = corpus.replace("_", " ")
    corpus = corpus.strip()
    corpus = corpus.title()
    return corpus
def make_corpus_link_from_dirname(corpus):
    """Map a known corpus title to its project link, if one is known."""
    corpusLink = ''
    if corpus.lower() == "open greek and latin":
        corpusLink = "https://github.com/OpenGreekAndLatin"
    elif corpus.lower() == "the first 1k years of greek":
        corpusLink = "http://opengreekandlatin.github.io/First1KGreek/"
    elif corpus.lower() == "perseus digital library":
        corpusLink = "http://www.perseus.tufts.edu/hopper/"
    else:
        logger.info("%s didn't have a link" % corpus)
    return corpusLink
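# dictToText walks the nested dicts of a cltk_json 'text' field depth-first.
# Keys at each level are expected to be zero-based string integers ("0", "1",
# ...); each recursion level becomes a 1-based citation number n_1..n_5 on the
# stored text node (e.g. book -> n_1, line -> n_2), up to five levels deep.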
def dictToText(textNodeMeta, textNode, n1=None, n2=None, n3=None, n4=None, n5=None):
    try:
        for key, node in textNode.items():
            if isinstance(node, dict):
                # Recurse one level deeper, carrying the 1-based index of
                # the current level along as the next n_* value.
                if n4:
                    dictToText(textNodeMeta, node, n1, n2, n3, n4, (int(key) + 1))
                elif n3:
                    dictToText(textNodeMeta, node, n1, n2, n3, (int(key) + 1))
                elif n2:
                    dictToText(textNodeMeta, node, n1, n2, (int(key) + 1))
                elif n1:
                    dictToText(textNodeMeta, node, n1, (int(key) + 1))
                else:
                    dictToText(textNodeMeta, node, (int(key) + 1))
            else:
                # Leaf node: ensure author/corpus/work/language records
                # exist, then insert the text node itself.
                existingAuthor = db.authors.find_one({'slug': slugify(textNodeMeta['author'])})
                existingCorpus = db.corpora.find_one({'slug': slugify(textNodeMeta['corpus'])})
                existingLanguage = db.languages.find_one({'slug': slugify(textNodeMeta['textLanguage'])})

                if existingAuthor:
                    existingWork = db.works.find_one({
                        'slug': slugify(textNodeMeta['workTitle']),
                        'authors': existingAuthor['_id'],
                    })
                else:
                    existingWork = None
                    db.authors.insert_one({
                        'english_name': textNodeMeta['author'].title(),
                        'original_name': textNodeMeta['author'].title(),
                        'slug': slugify(textNodeMeta['author']),
                        'authorLanguages': [slugify(textNodeMeta['textLanguage'])],
                        'createdAt': datetime.datetime.now(),
                        'updatedAt': datetime.datetime.now(),
                    })
                    existingAuthor = db.authors.find_one({'slug': slugify(textNodeMeta['author'])})

                if slugify(textNodeMeta['textLanguage']) not in existingAuthor['authorLanguages']:
                    db.authors.update_one({
                        '_id': existingAuthor['_id'],
                    }, {
                        '$addToSet': {
                            'authorLanguages': slugify(textNodeMeta['textLanguage']),
                        },
                    })

                if not existingCorpus:
                    db.corpora.insert_one({
                        'title': textNodeMeta['corpus'].title(),
                        'slug': slugify(textNodeMeta['corpus']),
                        'link': textNodeMeta['corpusLink'],
                        'corpusLanguages': [slugify(textNodeMeta['textLanguage'])],
                        'createdAt': datetime.datetime.now(),
                        'updatedAt': datetime.datetime.now(),
                    })
                elif slugify(textNodeMeta['textLanguage']) not in existingCorpus['corpusLanguages']:
                    db.corpora.update_one({
                        '_id': existingCorpus['_id'],
                    }, {
                        '$addToSet': {
                            'corpusLanguages': slugify(textNodeMeta['textLanguage']),
                        },
                    })

                if not existingWork:
                    db.works.insert_one({
                        'english_title': textNodeMeta['workTitle'].title(),
                        'original_title': textNodeMeta['originalTitle'].title(),
                        'slug': slugify(textNodeMeta['workTitle']),
                        'workLanguage': slugify(textNodeMeta['textLanguage']),
                        'corpus': slugify(textNodeMeta['corpus']),
                        'authors': [existingAuthor['_id']],
                        'edition': textNodeMeta['edition'],
                        'structure': textNodeMeta['structure'],
                        'form': textNodeMeta['form'],
                        'filename': textNodeMeta['filename'],
                        'hash': textNodeMeta['hash'],
                        'createdAt': datetime.datetime.now(),
                        'updatedAt': datetime.datetime.now(),
                    })
                    existingWork = db.works.find_one({
                        'slug': slugify(textNodeMeta['workTitle']),
                        'authors': existingAuthor['_id'],
                    })

                if not existingLanguage:
                    db.languages.insert_one({
                        'title': textNodeMeta['textLanguage'].title(),
                        'slug': slugify(textNodeMeta['textLanguage']),
                        'createdAt': datetime.datetime.now(),
                        'updatedAt': datetime.datetime.now(),
                    })

                ingestText = {
                    'work': existingWork['_id'],
                    'author': slugify(textNodeMeta['author']),
                    'corpus': slugify(textNodeMeta['corpus']),
                    'textLanguage': slugify(textNodeMeta['textLanguage']),
                    'text': node,
                    'html': node,
                    'comments': [],
                    'entities': [],
                    'createdAt': datetime.datetime.now(),
                    'updatedAt': datetime.datetime.now(),
                }

                # Attach the citation scheme: the deepest level is the current
                # key, the shallower levels come down through the recursion.
                if n4:
                    ingestText['n_5'] = int(key) + 1
                    ingestText['n_4'] = int(n4)
                    ingestText['n_3'] = int(n3)
                    ingestText['n_2'] = int(n2)
                    ingestText['n_1'] = int(n1)
                elif n3:
                    ingestText['n_4'] = int(key) + 1
                    ingestText['n_3'] = int(n3)
                    ingestText['n_2'] = int(n2)
                    ingestText['n_1'] = int(n1)
                elif n2:
                    ingestText['n_3'] = int(key) + 1
                    ingestText['n_2'] = int(n2)
                    ingestText['n_1'] = int(n1)
                elif n1:
                    ingestText['n_2'] = int(key) + 1
                    ingestText['n_1'] = int(n1)
                else:
                    ingestText['n_1'] = int(key) + 1

                db.texts.insert_one(ingestText)
    except Exception as e:
        logger.info(' --------- broken %s' % textNodeMeta)
        logger.info(' --------- exception %s' % e)
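# A hypothetical read-back sketch (not part of the original ingest flow): once
# ingestion has run, text nodes can be fetched in citation order by sorting on
# the n_* keys. The function name and arguments are illustrative only.
def sample_text_nodes(work_slug, limit=5):
    work = db.works.find_one({'slug': work_slug})
    if not work:
        return []
    return list(db.texts.find({'work': work['_id']})
                .sort([('n_1', 1), ('n_2', 1), ('n_3', 1)])
                .limit(limit))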
def main():
    start_time = time.time()

    logger.info(' -- resetting db')
    # logger.info(' -- -- authors')
    # db.authors.delete_many({})
    # logger.info(' -- -- language')
    # db.languages.delete_many({})
    logger.info(' -- -- corpora')
    db.corpora.delete_many({})
    logger.info(' -- -- works')
    db.works.delete_many({})
    logger.info(' -- -- texts')
    db.texts.delete_many({})

    # multiprocess ingest
    pool = ProcessPoolExecutor(6)
    futures = []

    for root, dirs, files in os.walk("."):
        path = root.split('/')
        if len(path) == 2:
            logger.info((((len(path) - 1) * '---') + os.path.basename(root)))
        if len(path) > 2 and path[2] == 'cltk_json':
            logger.info((((len(path) - 1) * '---') + os.path.basename(root)))
            for fname in files:
                ext = os.path.splitext(fname)
                if ext[1] == ".json":
                    logger.info(((len(path) * '---') + fname))
                    with open(os.path.join(root, fname), 'r') as f:
                        data = json.load(f)

                    # Work title: corpora use several different keys.
                    if 'work' in data:
                        work = data['work']
                    elif 'english_title' in data:
                        work = data['english_title']
                    elif 'englishTitle' in data:
                        work = data['englishTitle']
                    elif 'title' in data:
                        work = data['title']
                    else:
                        logger.info(' ############## broken text: %s' % fname)
                        continue

                    if 'originalTitle' in data:
                        originalTitle = data['originalTitle']
                    else:
                        originalTitle = work

                    if 'edition' in data:
                        edition = data['edition']
                    else:
                        edition = ''

                    author = data['author']

                    if 'meta' in data:
                        structure = data['meta'].lower()
                    else:
                        structure = ''

                    if 'source' in data:
                        corpus = data['source']
                    else:
                        corpus = make_corpus_from_dirname(path[1])

                    if 'sourceLink' in data:
                        corpusLink = data['sourceLink']
                    else:
                        corpusLink = make_corpus_link_from_dirname(corpus)

                    language = ''
                    if 'language' in data:
                        language = data['language']

                    # Works whose structure mentions lines are treated as poetry.
                    form = 'prose'
                    if 'line' in structure:
                        form = 'poetry'

                    filehash = md5(os.path.join(root, fname))

                    textNodeMeta = {
                        'workTitle': work,
                        'filename': fname,
                        'originalTitle': originalTitle,
                        'edition': edition,
                        'author': author,
                        'corpus': corpus,
                        'corpusLink': corpusLink,
                        'textLanguage': language,
                        'structure': structure,
                        'form': form,
                        'hash': filehash,
                    }

                    # dictToText(textNodeMeta, data['text'])
                    futures.append(pool.submit(dictToText, textNodeMeta, data['text']))

    nWorks = len(futures)
    for i, x in enumerate(as_completed(futures)):
        logger.info("--- %s minutes : work %s of %s ---" % ((time.time() - start_time) / 60, i + 1, nWorks))
    wait(futures)

    logger.info("--- %s minutes ---" % ((time.time() - start_time) / 60))


if __name__ == '__main__':
    main()
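# Usage (an assumption based on the os.walk(".") above): run this script from a
# directory whose immediate children are the cloned corpus repositories, each
# containing a cltk_json/ folder of JSON files, with MongoDB listening on
# localhost:27017, e.g.
#
#     python ingest_cltk_json.py
#
# The script filename is hypothetical; use whatever name the gist was saved as.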