Last active
October 15, 2017 01:26
-
-
Save atkm/301a50776b2831a52c58a4b0e397508c to your computer and use it in GitHub Desktop.
Populate Postgres with arXiv articles
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from app import db | |
from models import Article, Author, Category, article_author, article_category | |
import os, gzip, glob | |
from datetime import datetime | |
import xml.etree.ElementTree as ET | |
def get_title(record): | |
arXiv_prefix = '{http://arxiv.org/OAI/arXiv/}' | |
metadata = record.find('metadata').find(arXiv_prefix + "arXiv") | |
title = metadata.find(arXiv_prefix + "title").text | |
return title | |
def get_authors(record): | |
arXiv_prefix = '{http://arxiv.org/OAI/arXiv/}' | |
metadata = record.find('metadata').find(arXiv_prefix + "arXiv") | |
authors = metadata.find(arXiv_prefix + 'authors') | |
ls = [] | |
for a in authors.getchildren(): | |
keyname = a.find(arXiv_prefix + "keyname").text | |
forenames = a.find(arXiv_prefix + "forenames").text | |
ls.append( ' '.join( (forenames, keyname) ) ) | |
return ls | |
def get_categories(record): | |
arXiv_prefix = '{http://arxiv.org/OAI/arXiv/}' | |
metadata = record.find('metadata').find(arXiv_prefix + "arXiv") | |
categories = metadata.find(arXiv_prefix + 'categories').text | |
return categories.split() | |
def insert_record(record): | |
arXiv_prefix = '{http://arxiv.org/OAI/arXiv/}' | |
article = Article() | |
metadata = record.find('metadata').find(arXiv_prefix + "arXiv") | |
if metadata is None: | |
return 'deleted' | |
identifier = metadata.find(arXiv_prefix + "id").text | |
exists = db.session.query(Article).filter( | |
Article.identifier == identifier | |
).exists() | |
if db.session.query(exists).scalar(): | |
return 'exists' | |
submitted = metadata.find(arXiv_prefix + "created").text | |
submitted = datetime.strptime(submitted, "%Y-%m-%d") | |
title = metadata.find(arXiv_prefix + "title").text | |
abstract = metadata.find(arXiv_prefix + "abstract").text.strip() | |
article.submitted = submitted | |
article.title = title | |
article.identifier = identifier | |
article.abstract = abstract | |
authors = metadata.find(arXiv_prefix + 'authors') | |
added_authors = [] | |
for a in authors.getchildren(): | |
keyname = a.find(arXiv_prefix + "keyname") | |
forenames = a.find(arXiv_prefix + "forenames") | |
if keyname is None or forenames is None: | |
continue #bad entry | |
fullname = ' '.join( (forenames.text, keyname.text) ) | |
# authors added in this record. Fix error from ['Wende Liu', 'Wende Liu'] | |
if fullname in added_authors: | |
continue | |
else: | |
added_authors.append(fullname) | |
search = db.session.query(Author).filter_by(name=fullname) | |
if db.session.query(search.exists()).scalar(): | |
article.authors.append(search.one()) | |
else: | |
article.authors.append(Author(fullname)) | |
categories = metadata.find(arXiv_prefix + "categories").text | |
for c in categories.split(): | |
search = db.session.query(Category).filter_by(name=c) | |
if db.session.query(search.exists()).scalar(): | |
article.categories.append(search.one()) | |
else: | |
article.categories.append(Category(c)) | |
db.session.add(article) | |
return 'inserted' | |
def insert_xml(xml): | |
""" | |
xml: ElementTree such that xml.getroot() returns an Element | |
""" | |
root = xml.getroot() | |
for record in root.find('ListRecords').findall("record"): | |
result = insert_record(record) | |
if result == 'exists': | |
print('Skipping file as content already exist.') | |
return | |
db.session.commit() | |
print('Added. Commit.') | |
if __name__=='__main__': | |
# to from scratch | |
#db.metadata.drop_all(db.engine) | |
#db.metadata.create_all(db.engine) | |
files = glob.glob('/home/atkm/.metha/20170923/*.xml.gz') | |
#files = glob.glob('/home/atkm/.metha/20170923/2017-08-31-*.xml.gz') # test. | |
try: | |
for xmlgz in files: | |
print(os.path.basename(xmlgz), end=' ') | |
with gzip.open(xmlgz, 'rb') as f: | |
xml = ET.parse(f) | |
insert_xml(xml) | |
print('Population succeeded.') | |
print('%s articles in db.' % | |
db.session.query(Article).count()) | |
except Exception as e: | |
print('Error in %s.' % xmlgz) | |
print('Rolling back...') | |
db.session.rollback() | |
raise e |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment