Ingest CLTK-formatted JSON files into the CLTK archive MongoDB database.
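The walker in main() below looks for repositories laid out as ./&lt;repo_dir&gt;/cltk_json/*.json. As a rough sketch of the input it parses, here is a hypothetical example file; only the field names are taken from the code, while the path, titles, and text are invented for illustration:

# A hypothetical cltk_json input file, e.g.
# ./latin_text_example_corpus/cltk_json/example_work.json
example_work = {
    "author": "example author",
    "englishTitle": "Example Work",      # main() also accepts "work", "english_title", or "title"
    "originalTitle": "Exemplum",
    "edition": "Example Edition",
    "meta": "book-line",                 # any structure containing "line" is ingested as poetry
    "source": "Example Corpus",
    "sourceLink": "https://example.org/",
    "language": "latin",
    # nested dict keyed by 0-based string indices;
    # each leaf string becomes one document in db.texts
    "text": {
        "0": {"0": "first line of book 1", "1": "second line of book 1"},
        "1": {"0": "first line of book 2"},
    },
}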
import os
import json
import pdb
import datetime
import pymongo
import six
import re
import unicodedata
import concurrent.futures
from concurrent.futures import ProcessPoolExecutor, wait, as_completed
from django.utils.text import slugify
import logging
import time
import hashlib

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
LANGUAGES = [
    'sanskrit',
    'punjabi',
    'pali',
    'old_english',
    'latin',
    'javanese',
    'chinese',
    'hindi',
    'hebrew',
    'bengali',
    'greek',
    'malayalam',
    'prakrit',
    'old_church_slavonic',
    'middle_english',
    'telugu',
    'coptic',
]
client = pymongo.MongoClient('localhost', 27017, maxPoolSize=None)
db = client['cltk_frontend']


def mongo(db, host="localhost", port=27017):
    client = pymongo.MongoClient(host, port, maxPoolSize=None)
    # client = pymongo.MongoClient('mongodb://cltkAdmin:quidfaciatlaetassegetes@52.91.62.113')
    return client[db]
def md5(fname):
    """Return the MD5 hex digest of a file, read in 4 KB chunks."""
    hash_md5 = hashlib.md5()
    with open(fname, "rb") as f:
        for chunk in iter(lambda: f.read(4096), b""):
            hash_md5.update(chunk)
    return hash_md5.hexdigest()
def make_corpus_from_dirname(dirname):
    """Derive a human-readable corpus title from a text repository directory name."""
    corpus = dirname.replace("texts", "").replace("text", "")
    for language in LANGUAGES:
        corpus = re.sub('^' + language, '', corpus)
    corpus = corpus.replace("_", " ")
    corpus = corpus.strip()
    corpus = corpus.title()
    return corpus
def make_corpus_link_from_dirname(corpus):
    """Map a known corpus title to its homepage URL, if one is known."""
    corpusLink = ''
    if corpus.lower() == "Open Greek and Latin".lower():
        corpusLink = "https://github.com/OpenGreekAndLatin"
    elif corpus.lower() == "The First 1k Years Of Greek".lower():
        corpusLink = "http://opengreekandlatin.github.io/First1KGreek/"
    elif corpus.lower() == "Perseus Digital Library".lower():
        corpusLink = "http://www.perseus.tufts.edu/hopper/"
    else:
        logging.info("%s didn't have a link" % corpus)
    return corpusLink
def dictToText(textNodeMeta, textNode, n1=None, n2=None, n3=None, n4=None, n5=None):
    """Recursively walk a nested text dict and insert one db.texts document per leaf
    string, creating the author, corpus, work, and language records on first sight."""
    try:
        for key, node in textNode.items():
            if isinstance(node, dict):
                # Recurse one level deeper, carrying the 1-based index of the current level.
                if n4:
                    dictToText(textNodeMeta, node, n1, n2, n3, n4, (int(key) + 1))
                elif n3:
                    dictToText(textNodeMeta, node, n1, n2, n3, (int(key) + 1))
                elif n2:
                    dictToText(textNodeMeta, node, n1, n2, (int(key) + 1))
                elif n1:
                    dictToText(textNodeMeta, node, n1, (int(key) + 1))
                else:
                    dictToText(textNodeMeta, node, (int(key) + 1))
            else:
                existingAuthor = db.authors.find_one({'slug': slugify(textNodeMeta['author'])})
                existingCorpus = db.corpora.find_one({'slug': slugify(textNodeMeta['corpus'])})
                existingLanguage = db.languages.find_one({'slug': slugify(textNodeMeta['textLanguage'])})

                if existingAuthor:
                    existingWork = db.works.find_one({
                        'slug': slugify(textNodeMeta['workTitle']),
                        'authors': existingAuthor['_id']
                    })
                else:
                    existingWork = None
                    db.authors.insert({
                        'english_name': textNodeMeta['author'].title(),
                        'original_name': textNodeMeta['author'].title(),
                        'slug': slugify(textNodeMeta['author']),
                        'authorLanguages': [slugify(textNodeMeta['textLanguage'])],
                        'createdAt': datetime.datetime.now(),
                        'updatedAt': datetime.datetime.now(),
                    })
                    existingAuthor = db.authors.find_one({'slug': slugify(textNodeMeta['author'])})

                if slugify(textNodeMeta['textLanguage']) not in existingAuthor['authorLanguages']:
                    db.authors.update({
                        '_id': existingAuthor['_id']
                    }, {
                        '$addToSet': {
                            'authorLanguages': slugify(textNodeMeta['textLanguage'])
                        }
                    })

                if not existingCorpus:
                    db.corpora.insert({
                        'title': textNodeMeta['corpus'].title(),
                        'slug': slugify(textNodeMeta['corpus']),
                        'link': textNodeMeta['corpusLink'],
                        'corpusLanguages': [slugify(textNodeMeta['textLanguage'])],
                        'createdAt': datetime.datetime.now(),
                        'updatedAt': datetime.datetime.now(),
                    })
                else:
                    if slugify(textNodeMeta['textLanguage']) not in existingCorpus['corpusLanguages']:
                        db.corpora.update({
                            '_id': existingCorpus['_id']
                        }, {
                            '$addToSet': {
                                'corpusLanguages': slugify(textNodeMeta['textLanguage'])
                            }
                        })

                if not existingWork:
                    db.works.insert({
                        'english_title': textNodeMeta['workTitle'].title(),
                        'original_title': textNodeMeta['originalTitle'].title(),
                        'slug': slugify(textNodeMeta['workTitle']),
                        'workLanguage': slugify(textNodeMeta['textLanguage']),
                        'corpus': slugify(textNodeMeta['corpus']),
                        'authors': [existingAuthor['_id']],
                        'edition': textNodeMeta['edition'],
                        'structure': textNodeMeta['structure'],
                        'form': textNodeMeta['form'],
                        'filename': textNodeMeta['filename'],
                        'hash': textNodeMeta['hash'],
                        'createdAt': datetime.datetime.now(),
                        'updatedAt': datetime.datetime.now(),
                    })
                    existingWork = db.works.find_one({
                        'slug': slugify(textNodeMeta['workTitle']),
                        'authors': existingAuthor['_id']
                    })

                if not existingLanguage:
                    db.languages.insert({
                        'title': textNodeMeta['textLanguage'].title(),
                        'slug': slugify(textNodeMeta['textLanguage']),
                        'createdAt': datetime.datetime.now(),
                        'updatedAt': datetime.datetime.now(),
                    })

                ingestText = {
                    'work': existingWork['_id'],
                    'author': slugify(textNodeMeta['author']),
                    'corpus': slugify(textNodeMeta['corpus']),
                    'textLanguage': slugify(textNodeMeta['textLanguage']),
                    'text': node,
                    'html': node,
                    'comments': [],
                    'entities': [],
                    'createdAt': datetime.datetime.now(),
                    'updatedAt': datetime.datetime.now(),
                }

                # Record the 1-based position of this leaf at each structural level.
                if n4:
                    ingestText['n_5'] = int(key) + 1
                    ingestText['n_4'] = int(n4)
                    ingestText['n_3'] = int(n3)
                    ingestText['n_2'] = int(n2)
                    ingestText['n_1'] = int(n1)
                elif n3:
                    ingestText['n_4'] = int(key) + 1
                    ingestText['n_3'] = int(n3)
                    ingestText['n_2'] = int(n2)
                    ingestText['n_1'] = int(n1)
                elif n2:
                    ingestText['n_3'] = int(key) + 1
                    ingestText['n_2'] = int(n2)
                    ingestText['n_1'] = int(n1)
                elif n1:
                    ingestText['n_2'] = int(key) + 1
                    ingestText['n_1'] = int(n1)
                else:
                    ingestText['n_1'] = int(key) + 1

                db.texts.insert(ingestText)

    except Exception as e:
        logging.info(' --------- broken %s' % textNodeMeta)
        logging.info(' --------- exception %s' % e)
def main():
    """Walk the current directory for */cltk_json/*.json files and ingest them in parallel."""
    start_time = time.time()

    logging.info(' -- resetting db')
    # logging.info(' -- -- authors')
    # db.authors.remove({})
    # logging.info(' -- -- language')
    # db.languages.remove({})
    logging.info(' -- -- corpora')
    db.corpora.remove({})
    logging.info(' -- -- works')
    db.works.remove({})
    logging.info(' -- -- texts')
    db.texts.remove({})

    # multiprocess ingest
    pool = ProcessPoolExecutor(6)
    futures = []

    for root, dirs, files in os.walk("."):
        path = root.split('/')

        if len(path) == 2:
            logging.info((((len(path) - 1) * '---') + os.path.basename(root)))

        if len(path) > 2 and path[2] == 'cltk_json':
            logging.info((((len(path) - 1) * '---') + os.path.basename(root)))

            for fname in files:
                ext = os.path.splitext(fname)
                if ext[1] == ".json":
                    logging.info(((len(path) * '---') + fname))

                    with open(os.path.join(root, fname), 'r') as f:
                        data = json.load(f)

                        # Work title may appear under several keys depending on the corpus.
                        if 'work' in data:
                            work = data['work']
                        elif 'english_title' in data:
                            work = data['english_title']
                        elif 'englishTitle' in data:
                            work = data['englishTitle']
                        elif 'title' in data:
                            work = data['title']
                        else:
                            logging.info(' ############## broken text: %s' % fname)
                            continue

                        if 'originalTitle' in data:
                            originalTitle = data['originalTitle']
                        else:
                            originalTitle = work

                        if 'edition' in data:
                            edition = data['edition']
                        else:
                            edition = ''

                        author = data['author']

                        if 'meta' in data:
                            structure = data['meta'].lower()
                        else:
                            structure = ''

                        if 'source' in data:
                            corpus = data['source']
                        else:
                            corpus = make_corpus_from_dirname(path[1])

                        if 'sourceLink' in data:
                            corpusLink = data['sourceLink']
                        else:
                            corpusLink = make_corpus_link_from_dirname(corpus)

                        language = ''
                        if 'language' in data:
                            language = data['language']

                        form = 'prose'
                        if 'line' in structure:
                            form = 'poetry'

                        filehash = md5(os.path.join(root, fname))

                        textNodeMeta = {
                            'workTitle': work,
                            'filename': fname,
                            'originalTitle': originalTitle,
                            'edition': edition,
                            'author': author,
                            'corpus': corpus,
                            'corpusLink': corpusLink,
                            'textLanguage': language,
                            'structure': structure,
                            'form': form,
                            'hash': filehash,
                        }

                        # dictToText(textNodeMeta, data['text'])
                        futures.append(pool.submit(dictToText, textNodeMeta, data['text']))

    nWorks = len(futures)
    for i, x in enumerate(as_completed(futures)):
        logging.info("--- %s minutes : %s work of %s ---" % ((time.time() - start_time) / 60, i, nWorks))
    wait(futures)

    logging.info("--- %s minutes ---" % ((time.time() - start_time) / 60))


if __name__ == '__main__':
    main()