Created
March 8, 2015 13:16
-
-
Save gardenunez/1809ee02d2876573ae7a to your computer and use it in GitHub Desktop.
arXiv crawler that fetches entries by category using the arxiv.org web API
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import urllib | |
import argparse | |
import sqlite3 | |
from xml.dom import minidom | |
import datetime | |
from arxiv_subject_classification import SUBJECT_CLASSIFICATION | |
def get_args(argv=None):
    """Parse the crawler's command-line arguments.

    :param argv: list of argument strings to parse; ``None`` (the default)
        lets argparse read ``sys.argv[1:]``
    :return: ``argparse.Namespace`` whose ``cat`` attribute is the value
        passed to ``-c``, or ``None`` when the option is omitted
    """
    parser = argparse.ArgumentParser(description="ArXiv crawler")
    parser.add_argument("-c", type=str, dest="cat",
                        help="Subject Classification or Category of Arxiv")
    # The parser used to be built and then discarded, so callers always got
    # None back; actually parse and hand the result to the caller.
    return parser.parse_args(argv)
def create_db(db_path='arxiv_crawler.db'):
    """Create the ``raw_data`` table if it does not exist yet.

    :param db_path: path of the SQLite database file; defaults to the
        crawler's ``arxiv_crawler.db``
    """
    print('creating table')
    conn = sqlite3.connect(db_path)
    try:
        conn.execute('''CREATE TABLE IF NOT EXISTS raw_data
                     (arxiv_id text, data text, created_date text)''')
        conn.commit()
    finally:
        # Close even when the DDL or commit fails so the handle is not leaked.
        conn.close()
def select_top_ten_raw_data(db_path='arxiv_crawler.db', limit=100):
    """Print rows stored in the ``raw_data`` table.

    NOTE: despite the function name, the query has always returned up to
    100 rows; the default ``limit`` keeps that behaviour.

    :param db_path: path of the SQLite database file
    :param limit: maximum number of rows to print
    """
    print('selecting crawled data')
    conn = sqlite3.connect(db_path)
    try:
        # Bind the limit as a parameter instead of formatting it into the SQL.
        for row in conn.execute('select * from raw_data limit ?', (limit,)):
            print(row)
    finally:
        # Close even if the query fails (e.g. the table does not exist).
        conn.close()
def clean_raw_data(db_path='arxiv_crawler.db'):
    """Delete every row from the ``raw_data`` table.

    :param db_path: path of the SQLite database file; defaults to the
        crawler's ``arxiv_crawler.db``
    """
    print('cleaning raw data table')
    conn = sqlite3.connect(db_path)
    try:
        conn.execute('delete from raw_data')
        conn.commit()
    finally:
        # Close even when the delete or commit fails so the handle is not leaked.
        conn.close()
def save_to_file(data, filename):
    """Write ``data`` to ``filename`` and print a confirmation line.

    The file is opened in ``'w+'`` mode, so existing content is replaced.

    :param data: text to write
    :param filename: path of the file to (over)write
    """
    with open(filename, 'w+') as out_handle:
        out_handle.write(data)
    confirmation = '{} file saved'.format(filename)
    print(confirmation)
def save_arxiv_data_to_db(data, db_path='arxiv_crawler.db'):
    """Save arxiv xml raw data into arxiv_crawler db.

    Each ``<entry>`` element of the feed becomes one ``raw_data`` row holding
    the last path segment of the entry's ``<id>`` URL, the entry's XML with
    newlines removed, and the insertion timestamp.

    :param data: arXiv API response (XML string or bytes)
    :param db_path: path of the SQLite database file
    :raises Exception: parsing and database errors propagate to the caller
    """
    conn = sqlite3.connect(db_path)
    try:
        feed = minidom.parseString(data)
        raw_data_entries = []
        for entry in feed.getElementsByTagName('entry'):
            url = entry.getElementsByTagName('id')[0].childNodes[0].data
            # The id is a URL; only its final path segment is stored.
            arxiv_id = url.rsplit('/', 1)[-1]
            raw_data_entries.append((
                arxiv_id,
                entry.toxml().replace('\n', ''),
                str(datetime.datetime.now()),
            ))
        conn.executemany(
            'INSERT INTO raw_data(arxiv_id, data, created_date) values (?,?,?)',
            raw_data_entries)
        conn.commit()
    finally:
        # Previously the connection was closed only when an exception was
        # raised, leaking one handle per successful call.
        conn.close()
def fetch_arxiv_data(category, offset=0, limit=100):
    """Download one page of arXiv API results for a category.

    :param category: arXiv category identifier used in ``search_query=cat:``
    :param offset: value sent as the ``start`` query parameter
    :param limit: value sent as the ``max_results`` query parameter
    :return: raw response body as returned by ``read()``
    """
    # urlopen lives in different modules on Python 2 and Python 3.
    try:
        from urllib.request import urlopen
    except ImportError:
        from urllib import urlopen

    url = 'http://export.arxiv.org/api/query?search_query=cat:{}&start={}&max_results={}'.format(
        category,
        offset,
        limit
    )
    response = urlopen(url)
    try:
        return response.read()
    finally:
        # Release the socket explicitly instead of waiting for garbage collection.
        response.close()
def _first_child_text(entry, tag):
    """Return the data of the first child node of the first ``tag`` element, or None."""
    elements = entry.getElementsByTagName(tag)
    if not elements or not elements[0].childNodes:
        return None
    return elements[0].childNodes[0].data


def _ascii_single_line(text):
    """Drop non-ASCII characters from ``text`` and turn newlines into spaces."""
    if text is None:
        return None
    flattened = text.encode('ascii', 'ignore')
    # On Python 3 encode() yields bytes; convert back so the str-based
    # replace() below works. On Python 2 the value is already a str.
    if not isinstance(flattened, str):
        flattened = flattened.decode('ascii')
    return flattened.replace('\n', ' ')


def digest_arxiv_entry(xmlsource):
    """
    Parse an arxiv xml string

    Every ``<entry>`` element yields a dict with the keys ``doi``, ``title``,
    ``abstract``, ``published``, ``updated`` and ``url``. Title and abstract
    are reduced to ASCII with newlines replaced by spaces; the abstract is
    additionally stripped of leading whitespace. A field whose element is
    missing or empty is reported as ``None`` instead of raising IndexError.

    :param xmlsource: str
    :return: list of articles
    """
    xmldata = minidom.parseString(xmlsource)
    articles = []
    for entry in xmldata.getElementsByTagName('entry'):
        abstract = _ascii_single_line(_first_child_text(entry, 'summary'))
        if abstract is not None:
            abstract = abstract.lstrip()
        articles.append({
            'doi': _first_child_text(entry, 'arxiv:doi'),
            'title': _ascii_single_line(_first_child_text(entry, 'title')),
            'abstract': abstract,
            'published': _first_child_text(entry, 'published'),
            'updated': _first_child_text(entry, 'updated'),
            'url': _first_child_text(entry, 'id'),
        })
    return articles
def crawl_all_categories(max_categories=5):
    """Crawl entries from the categories listed in SUBJECT_CLASSIFICATION.

    :param max_categories: number of categories to crawl. The default of 5
        keeps the development cap this function has always had; pass
        ``None`` to crawl every category.
    """
    print('start crawling all categories')
    # list() makes the keys sliceable on Python 3 as well, where dict.keys()
    # returns a view; slicing with None selects everything.
    for cat in list(SUBJECT_CLASSIFICATION)[:max_categories]:
        crawl_by_category(cat)
    print('end process!!')
def crawl_by_category(cat, max_results=5, page_size=10):
    """Crawl one arXiv category and store its entries in the database.

    A one-entry probe request is made first to read the feed's
    ``opensearch:totalResults`` value; pages are then fetched and passed to
    ``save_arxiv_data_to_db``.

    :param cat: category identifier; must be a key of SUBJECT_CLASSIFICATION
    :param max_results: upper bound on the number of entries requested. The
        default of 5 keeps the development cap this function has always
        had; pass ``None`` to fetch everything the feed reports.
    :param page_size: number of entries requested per API call
    :raises ValueError: if ``cat`` is not a known category
    """
    if cat not in SUBJECT_CLASSIFICATION:
        raise ValueError("Invalid category")
    cat_name = SUBJECT_CLASSIFICATION[cat]
    print('crawling category: {}'.format(cat_name))

    probe = minidom.parseString(fetch_arxiv_data(cat, 0, 1))
    total_nodes = probe.getElementsByTagName('opensearch:totalResults')
    # Treat a response without a total as "nothing to fetch".
    total_results = int(total_nodes[0].childNodes[0].data) if total_nodes else 0
    if max_results is not None:
        total_results = min(total_results, max_results)
    print('starting to fetch {} entries'.format(total_results))

    # The previous while-loop advanced the offset before fetching and tested
    # the bound afterwards, so it always issued one request past the end.
    for offset in range(0, total_results, page_size):
        # Shrink the last page so no more than total_results are requested.
        batch_size = min(page_size, total_results - offset)
        data = fetch_arxiv_data(cat, offset, batch_size)
        save_arxiv_data_to_db(data)
    print('end crawling {}!!'.format(cat_name))
def main():
    """Run the crawler: create the table, empty it, crawl, then print the stored rows."""
    pipeline = (
        create_db,
        clean_raw_data,
        crawl_all_categories,
        select_top_ten_raw_data,
    )
    for step in pipeline:
        step()


# Run the crawler only when this file is executed as a script.
if __name__ == '__main__':
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# Maps arXiv category identifiers to the display names used by the crawler.
SUBJECT_CLASSIFICATION = {
    "stat.AP": "Statistics - Applications",
    "stat.CO": "Statistics - Computation",
    "stat.ML": "Statistics - Machine Learning",
    "stat.ME": "Statistics - Methodology",
    "stat.TH": "Statistics - Theory",
    "q-bio.BM": "Quantitative Biology - Biomolecules",
    "q-bio.CB": "Quantitative Biology - Cell Behavior",
    "q-bio.GN": "Quantitative Biology - Genomics",
    "q-bio.MN": "Quantitative Biology - Molecular Networks",
    "q-bio.NC": "Quantitative Biology - Neurons and Cognition",
    "q-bio.OT": "Quantitative Biology - Other",
    "q-bio.PE": "Quantitative Biology - Populations and Evolution",
    "q-bio.QM": "Quantitative Biology - Quantitative Methods",
    "q-bio.SC": "Quantitative Biology - Subcellular Processes",
    "q-bio.TO": "Quantitative Biology - Tissues and Organs",
    "cs.AR": "Computer Science - Architecture",
    "cs.AI": "Computer Science - Artificial Intelligence",
    "cs.CL": "Computer Science - Computation and Language",
    "cs.CC": "Computer Science - Computational Complexity",
    "cs.CE": "Computer Science - Computational Engineering; Finance; and Science",
    "cs.CG": "Computer Science - Computational Geometry",
    "cs.GT": "Computer Science - Computer Science and Game Theory",
    "cs.CV": "Computer Science - Computer Vision and Pattern Recognition",
    "cs.CY": "Computer Science - Computers and Society",
    "cs.CR": "Computer Science - Cryptography and Security",
    "cs.DS": "Computer Science - Data Structures and Algorithms",
    "cs.DB": "Computer Science - Databases",
    "cs.DL": "Computer Science - Digital Libraries",
    "cs.DM": "Computer Science - Discrete Mathematics",
    "cs.DC": "Computer Science - Distributed; Parallel; and Cluster Computing",
    "cs.GL": "Computer Science - General Literature",
    "cs.GR": "Computer Science - Graphics",
    "cs.HC": "Computer Science - Human-Computer Interaction",
    "cs.IR": "Computer Science - Information Retrieval",
    "cs.IT": "Computer Science - Information Theory",
    "cs.LG": "Computer Science - Learning",
    "cs.LO": "Computer Science - Logic in Computer Science",
    "cs.MS": "Computer Science - Mathematical Software",
    "cs.MA": "Computer Science - Multiagent Systems",
    "cs.MM": "Computer Science - Multimedia",
    "cs.NI": "Computer Science - Networking and Internet Architecture",
    "cs.NE": "Computer Science - Neural and Evolutionary Computing",
    "cs.NA": "Computer Science - Numerical Analysis",
    "cs.OS": "Computer Science - Operating Systems",
    "cs.OH": "Computer Science - Other",
    "cs.PF": "Computer Science - Performance",
    "cs.PL": "Computer Science - Programming Languages",
    "cs.RO": "Computer Science - Robotics",
    "cs.SE": "Computer Science - Software Engineering",
    "cs.SD": "Computer Science - Sound",
    "cs.SC": "Computer Science - Symbolic Computation",
    "nlin.AO": "Nonlinear Sciences - Adaptation and Self-Organizing Systems",
    "nlin.CG": "Nonlinear Sciences - Cellular Automata and Lattice Gases",
    "nlin.CD": "Nonlinear Sciences - Chaotic Dynamics",
    "nlin.SI": "Nonlinear Sciences - Exactly Solvable and Integrable Systems",
    "nlin.PS": "Nonlinear Sciences - Pattern Formation and Solitons",
    "math.AG": "Mathematics - Algebraic Geometry",
    "math.AT": "Mathematics - Algebraic Topology",
    "math.AP": "Mathematics - Analysis of PDEs",
    "math.CT": "Mathematics - Category Theory",
    "math.CA": "Mathematics - Classical Analysis and ODEs",
    "math.CO": "Mathematics - Combinatorics",
    "math.AC": "Mathematics - Commutative Algebra",
    "math.CV": "Mathematics - Complex Variables",
    "math.DG": "Mathematics - Differential Geometry",
    "math.DS": "Mathematics - Dynamical Systems",
    "math.FA": "Mathematics - Functional Analysis",
    "math.GM": "Mathematics - General Mathematics",
    "math.GN": "Mathematics - General Topology",
    "math.GT": "Mathematics - Geometric Topology",
    "math.GR": "Mathematics - Group Theory",
    "math.HO": "Mathematics - History and Overview",
    "math.IT": "Mathematics - Information Theory",
    "math.KT": "Mathematics - K-Theory and Homology",
    "math.LO": "Mathematics - Logic",
    "math.MP": "Mathematics - Mathematical Physics",
    "math.MG": "Mathematics - Metric Geometry",
    "math.NT": "Mathematics - Number Theory",
    "math.NA": "Mathematics - Numerical Analysis",
    "math.OA": "Mathematics - Operator Algebras",
    "math.OC": "Mathematics - Optimization and Control",
    "math.PR": "Mathematics - Probability",
    "math.QA": "Mathematics - Quantum Algebra",
    "math.RT": "Mathematics - Representation Theory",
    "math.RA": "Mathematics - Rings and Algebras",
    "math.SP": "Mathematics - Spectral Theory",
    "math.ST": "Mathematics - Statistics",
    "math.SG": "Mathematics - Symplectic Geometry",
    "astro-ph": "Astrophysics",
    "cond-mat.dis-nn": "Physics - Disordered Systems and Neural Networks",
    "cond-mat.mes-hall": "Physics - Mesoscopic Systems and Quantum Hall Effect",
    "cond-mat.mtrl-sci": "Physics - Materials Science",
    "cond-mat.other": "Physics - Other",
    "cond-mat.soft": "Physics - Soft Condensed Matter",
    "cond-mat.stat-mech": "Physics - Statistical Mechanics",
    "cond-mat.str-el": "Physics - Strongly Correlated Electrons",
    "cond-mat.supr-con": "Physics - Superconductivity",
    "gr-qc": "General Relativity and Quantum Cosmology",
    "hep-ex": "High Energy Physics - Experiment",
    "hep-lat": "High Energy Physics - Lattice",
    "hep-ph": "High Energy Physics - Phenomenology",
    "hep-th": "High Energy Physics - Theory",
    "math-ph": "Mathematical Physics",
    "nucl-ex": "Nuclear Experiment",
    # The value used to carry a stray leading space (" Nuclear Theory").
    "nucl-th": "Nuclear Theory",
    "physics.acc-ph": "Physics - Accelerator Physics",
    "physics.ao-ph": "Physics - Atmospheric and Oceanic Physics",
    "physics.atom-ph": "Physics - Atomic Physics",
    "physics.atm-clus": "Physics - Atomic and Molecular Clusters",
    "physics.bio-ph": "Physics - Biological Physics",
    "physics.chem-ph": "Physics - Chemical Physics",
    "physics.class-ph": "Physics - Classical Physics",
    "physics.comp-ph": "Physics - Computational Physics",
    "physics.data-an": "Physics - Data Analysis; Statistics and Probability",
    "physics.flu-dyn": "Physics - Fluid Dynamics",
    "physics.gen-ph": "Physics - General Physics",
    "physics.geo-ph": "Physics - Geophysics",
    "physics.hist-ph": "Physics - History of Physics",
    "physics.ins-det": "Physics - Instrumentation and Detectors",
    "physics.med-ph": "Physics - Medical Physics",
    "physics.optics": "Physics - Optics",
    "physics.ed-ph": "Physics - Physics Education",
    "physics.soc-ph": "Physics - Physics and Society",
    "physics.plasm-ph": "Physics - Plasma Physics",
    "physics.pop-ph": "Physics - Popular Physics",
    "physics.space-ph": "Physics - Space Physics",
    "quant-ph": "Quantum Physics",
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment