Skip to content

Instantly share code, notes, and snippets.

@gardenunez
Created March 8, 2015 13:16
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save gardenunez/1809ee02d2876573ae7a to your computer and use it in GitHub Desktop.
Save gardenunez/1809ee02d2876573ae7a to your computer and use it in GitHub Desktop.
arXiv crawler by category, using the arxiv.org web API
#!/usr/bin/env python
import urllib
import argparse
import sqlite3
from xml.dom import minidom
import datetime
from arxiv_subject_classification import SUBJECT_CLASSIFICATION
def get_args():
    """Parse the command-line arguments of the crawler.

    The only option is ``-c``, stored under the name ``cat``.

    :return: ``argparse.Namespace`` whose ``cat`` attribute holds the value
        passed to ``-c``, or ``None`` when the option is omitted
    """
    parser = argparse.ArgumentParser(description="ArXiv crawler")
    parser.add_argument("-c", type=str, dest="cat",
                        help="Subject Classification or Category of Arxiv")
    # The parser used to be built and then thrown away, so callers always
    # received None; hand back the parsed namespace instead.
    return parser.parse_args()
def create_db():
    """Create the ``raw_data`` table in ``arxiv_crawler.db`` if it is missing.

    The table has three text columns: ``arxiv_id``, ``data`` and
    ``created_date``. Calling this again on an existing database is a no-op
    because of ``IF NOT EXISTS``.
    """
    print('creating table')
    conn = sqlite3.connect('arxiv_crawler.db')
    try:
        c = conn.cursor()
        c.execute('''CREATE TABLE IF NOT EXISTS raw_data
                     (arxiv_id text, data text, created_date text)''')
        conn.commit()
    finally:
        # Release the database handle even when the statement fails.
        conn.close()
def select_top_ten_raw_data():
    """Print rows of the ``raw_data`` table in ``arxiv_crawler.db`` to stdout.

    Each row is printed as a tuple, one per line, after a header line.

    NOTE(review): despite the function name, the query is capped at 100 rows,
    not 10 -- confirm which of the two is intended.
    """
    print('selecting crawled data')
    conn = sqlite3.connect('arxiv_crawler.db')
    try:
        c = conn.cursor()
        for row in c.execute('select * from raw_data limit 100'):
            print(row)
    finally:
        # Release the database handle even when the query fails.
        conn.close()
def clean_raw_data():
    """Delete every row from the ``raw_data`` table in ``arxiv_crawler.db``.

    The table itself is kept; only its contents are removed.
    """
    print('cleaning raw data table')
    conn = sqlite3.connect('arxiv_crawler.db')
    try:
        c = conn.cursor()
        c.execute('delete from raw_data')
        conn.commit()
    finally:
        # Release the database handle even when the delete fails.
        conn.close()
def save_to_file(data, filename):
    """Write ``data`` to ``filename`` and report the save on stdout.

    :param data: content to write
    :param filename: path of the file to write; any previous content is
        replaced
    """
    with open(filename, 'w+') as out:
        out.write(data)
    message = '{} file saved'.format(filename)
    print(message)
def save_arxiv_data_to_db(data):
    """Save arxiv xml raw data into arxiv_crawler db.

    Every ``entry`` element of the feed becomes one ``raw_data`` row made of
    the last path segment of the entry's ``id`` URL, the entry's XML with
    newlines removed, and the current local timestamp as a string.

    :param data: XML string containing zero or more ``entry`` elements
    """
    conn = sqlite3.connect('arxiv_crawler.db')
    try:
        feed = minidom.parseString(data)
        raw_data_entries = []
        for entry in feed.getElementsByTagName('entry'):
            url = entry.getElementsByTagName('id')[0].childNodes[0].data
            # Only the last path segment of the id URL is kept.
            arxiv_id = url.rsplit('/', 1)[-1]
            raw_data_entries.append(
                (arxiv_id,
                 entry.toxml().replace('\n', ''),
                 str(datetime.datetime.now())))
        conn.executemany(
            'INSERT INTO raw_data(arxiv_id, data, created_date) values (?,?,?)',
            raw_data_entries)
        conn.commit()
    finally:
        # The connection used to be closed only on failure, so every
        # successful call leaked it; close it on both paths.
        conn.close()
def fetch_arxiv_data(category, offset=0, limit=100):
    """Download one page of query results from the export.arxiv.org API.

    :param category: value placed after ``cat:`` in the search query
    :param offset: value sent as the ``start`` query parameter
    :param limit: value sent as the ``max_results`` query parameter
    :return: the body of the HTTP response
    """
    query_template = 'http://export.arxiv.org/api/query?search_query=cat:{}&start={}&max_results={}'
    request_url = query_template.format(category, offset, limit)
    response = urllib.urlopen(request_url)
    return response.read()
def _first_text(entry, tag):
    """Return the data of the first child node of the first ``tag`` element."""
    return entry.getElementsByTagName(tag)[0].childNodes[0].data


def _ascii_single_line(text):
    """ASCII-encode ``text``, dropping other characters, newlines to spaces."""
    # Bytes literals keep this working on Python 3, where the encoded value
    # is ``bytes`` and cannot be combined with ``str`` arguments.
    return text.encode('ascii', 'ignore').replace(b'\n', b' ')


def digest_arxiv_entry(xmlsource):
    """
    Parse an arxiv xml string
    :param xmlsource: str
    :return: list of articles, one dict per ``entry`` element with the keys
        ``doi``, ``title``, ``abstract``, ``published``, ``updated`` and
        ``url``. ``title`` and ``abstract`` are ASCII-encoded byte strings
        with newlines turned into spaces (``abstract`` is also stripped of
        leading whitespace); ``doi`` is ``None`` when the entry has no
        ``arxiv:doi`` element.
    :raises IndexError: when an entry lacks one of the other elements or the
        element is empty
    """
    xmldata = minidom.parseString(xmlsource)
    articles = []
    for entry in xmldata.getElementsByTagName('entry'):
        # The DOI is the only optional field.
        doi_elem = entry.getElementsByTagName('arxiv:doi')
        doi = doi_elem[0].childNodes[0].data if doi_elem else None
        articles.append({
            'doi': doi,
            'title': _ascii_single_line(_first_text(entry, 'title')),
            'abstract': _ascii_single_line(_first_text(entry, 'summary')).lstrip(),
            'published': _first_text(entry, 'published'),
            'updated': _first_text(entry, 'updated'),
            'url': _first_text(entry, 'id'),
        })
    return articles
def crawl_all_categories(max_categories=5):
    """Crawl the categories listed in ``SUBJECT_CLASSIFICATION``.

    :param max_categories: number of categories to crawl, taken from the
        front of the dict's iteration order; ``None`` crawls every category.
        The default of 5 is the cap that was previously hard-coded.
    """
    print('start crawling all categories')
    # TODO: switch the default to None once a full crawl is wanted.
    # list(...) rather than .keys()[:n]: dict views cannot be sliced on
    # Python 3, while list(dict) yields the keys on both major versions.
    for cat in list(SUBJECT_CLASSIFICATION)[:max_categories]:
        crawl_by_category(cat)
    print('end process!!')
def crawl_by_category(cat, total_results=5, limit=10):
    """Fetch the entries of one category page by page and store them.

    :param cat: category code; must be a key of ``SUBJECT_CLASSIFICATION``
    :param total_results: number of entries to cover; defaults to 5, the
        value that was previously hard-coded
    :param limit: page size requested per call; defaults to 10
    :raises Exception: when ``cat`` is not a known category
    """
    if cat not in SUBJECT_CLASSIFICATION:
        raise Exception("Invalid category")
    cat_name = SUBJECT_CLASSIFICATION[cat]
    print('crawling category: {}'.format(cat_name))
    # One-entry probe request; parsing it raises early on a malformed reply.
    # TODO: read the real total from the probe's opensearch:totalResults
    # element instead of relying on the total_results argument.
    minidom.parseString(fetch_arxiv_data(cat, 0, 1))
    print('starting to fetch {} entries'.format(total_results))
    # The old loop tested the offset before advancing it and therefore always
    # requested one page beyond total_results; range() stops at the last page
    # that starts below the total (and yields nothing for a total of 0).
    for offset in range(0, total_results, limit):
        data = fetch_arxiv_data(cat, offset, limit)
        save_arxiv_data_to_db(data)
    print('end crawling {}!!'.format(cat_name))
def main():
    """Run the whole pipeline: set up the table, empty it, crawl, then print."""
    pipeline = (
        create_db,
        clean_raw_data,
        crawl_all_categories,
        select_top_ten_raw_data,
    )
    for step in pipeline:
        step()


# Only start the pipeline when the file is run as a script.
if __name__ == '__main__':
    main()
#!/usr/bin/env python
# Maps each arXiv category code to its human-readable label.
SUBJECT_CLASSIFICATION = {
    "stat.AP": "Statistics - Applications",
    "stat.CO": "Statistics - Computation",
    "stat.ML": "Statistics - Machine Learning",
    "stat.ME": "Statistics - Methodology",
    "stat.TH": "Statistics - Theory",
    "q-bio.BM": "Quantitative Biology - Biomolecules",
    "q-bio.CB": "Quantitative Biology - Cell Behavior",
    "q-bio.GN": "Quantitative Biology - Genomics",
    "q-bio.MN": "Quantitative Biology - Molecular Networks",
    "q-bio.NC": "Quantitative Biology - Neurons and Cognition",
    "q-bio.OT": "Quantitative Biology - Other",
    "q-bio.PE": "Quantitative Biology - Populations and Evolution",
    "q-bio.QM": "Quantitative Biology - Quantitative Methods",
    "q-bio.SC": "Quantitative Biology - Subcellular Processes",
    "q-bio.TO": "Quantitative Biology - Tissues and Organs",
    "cs.AR": "Computer Science - Architecture",
    "cs.AI": "Computer Science - Artificial Intelligence",
    "cs.CL": "Computer Science - Computation and Language",
    "cs.CC": "Computer Science - Computational Complexity",
    "cs.CE": "Computer Science - Computational Engineering; Finance; and Science",
    "cs.CG": "Computer Science - Computational Geometry",
    "cs.GT": "Computer Science - Computer Science and Game Theory",
    "cs.CV": "Computer Science - Computer Vision and Pattern Recognition",
    "cs.CY": "Computer Science - Computers and Society",
    "cs.CR": "Computer Science - Cryptography and Security",
    "cs.DS": "Computer Science - Data Structures and Algorithms",
    "cs.DB": "Computer Science - Databases",
    "cs.DL": "Computer Science - Digital Libraries",
    "cs.DM": "Computer Science - Discrete Mathematics",
    "cs.DC": "Computer Science - Distributed; Parallel; and Cluster Computing",
    "cs.GL": "Computer Science - General Literature",
    "cs.GR": "Computer Science - Graphics",
    "cs.HC": "Computer Science - Human-Computer Interaction",
    "cs.IR": "Computer Science - Information Retrieval",
    "cs.IT": "Computer Science - Information Theory",
    "cs.LG": "Computer Science - Learning",
    "cs.LO": "Computer Science - Logic in Computer Science",
    "cs.MS": "Computer Science - Mathematical Software",
    "cs.MA": "Computer Science - Multiagent Systems",
    "cs.MM": "Computer Science - Multimedia",
    "cs.NI": "Computer Science - Networking and Internet Architecture",
    "cs.NE": "Computer Science - Neural and Evolutionary Computing",
    "cs.NA": "Computer Science - Numerical Analysis",
    "cs.OS": "Computer Science - Operating Systems",
    "cs.OH": "Computer Science - Other",
    "cs.PF": "Computer Science - Performance",
    "cs.PL": "Computer Science - Programming Languages",
    "cs.RO": "Computer Science - Robotics",
    "cs.SE": "Computer Science - Software Engineering",
    "cs.SD": "Computer Science - Sound",
    "cs.SC": "Computer Science - Symbolic Computation",
    "nlin.AO": "Nonlinear Sciences - Adaptation and Self-Organizing Systems",
    "nlin.CG": "Nonlinear Sciences - Cellular Automata and Lattice Gases",
    "nlin.CD": "Nonlinear Sciences - Chaotic Dynamics",
    "nlin.SI": "Nonlinear Sciences - Exactly Solvable and Integrable Systems",
    "nlin.PS": "Nonlinear Sciences - Pattern Formation and Solitons",
    "math.AG": "Mathematics - Algebraic Geometry",
    "math.AT": "Mathematics - Algebraic Topology",
    "math.AP": "Mathematics - Analysis of PDEs",
    "math.CT": "Mathematics - Category Theory",
    "math.CA": "Mathematics - Classical Analysis and ODEs",
    "math.CO": "Mathematics - Combinatorics",
    "math.AC": "Mathematics - Commutative Algebra",
    "math.CV": "Mathematics - Complex Variables",
    "math.DG": "Mathematics - Differential Geometry",
    "math.DS": "Mathematics - Dynamical Systems",
    "math.FA": "Mathematics - Functional Analysis",
    "math.GM": "Mathematics - General Mathematics",
    "math.GN": "Mathematics - General Topology",
    "math.GT": "Mathematics - Geometric Topology",
    "math.GR": "Mathematics - Group Theory",
    "math.HO": "Mathematics - History and Overview",
    "math.IT": "Mathematics - Information Theory",
    "math.KT": "Mathematics - K-Theory and Homology",
    "math.LO": "Mathematics - Logic",
    "math.MP": "Mathematics - Mathematical Physics",
    "math.MG": "Mathematics - Metric Geometry",
    "math.NT": "Mathematics - Number Theory",
    "math.NA": "Mathematics - Numerical Analysis",
    "math.OA": "Mathematics - Operator Algebras",
    "math.OC": "Mathematics - Optimization and Control",
    "math.PR": "Mathematics - Probability",
    "math.QA": "Mathematics - Quantum Algebra",
    "math.RT": "Mathematics - Representation Theory",
    "math.RA": "Mathematics - Rings and Algebras",
    "math.SP": "Mathematics - Spectral Theory",
    "math.ST": "Mathematics - Statistics",
    "math.SG": "Mathematics - Symplectic Geometry",
    "astro-ph": "Astrophysics",
    "cond-mat.dis-nn": "Physics - Disordered Systems and Neural Networks",
    "cond-mat.mes-hall": "Physics - Mesoscopic Systems and Quantum Hall Effect",
    "cond-mat.mtrl-sci": "Physics - Materials Science",
    "cond-mat.other": "Physics - Other",
    "cond-mat.soft": "Physics - Soft Condensed Matter",
    "cond-mat.stat-mech": "Physics - Statistical Mechanics",
    "cond-mat.str-el": "Physics - Strongly Correlated Electrons",
    "cond-mat.supr-con": "Physics - Superconductivity",
    "gr-qc": "General Relativity and Quantum Cosmology",
    "hep-ex": "High Energy Physics - Experiment",
    "hep-lat": "High Energy Physics - Lattice",
    "hep-ph": "High Energy Physics - Phenomenology",
    "hep-th": "High Energy Physics - Theory",
    "math-ph": "Mathematical Physics",
    "nucl-ex": "Nuclear Experiment",
    # Label previously carried a stray leading space (" Nuclear Theory").
    "nucl-th": "Nuclear Theory",
    "physics.acc-ph": "Physics - Accelerator Physics",
    "physics.ao-ph": "Physics - Atmospheric and Oceanic Physics",
    "physics.atom-ph": "Physics - Atomic Physics",
    "physics.atm-clus": "Physics - Atomic and Molecular Clusters",
    "physics.bio-ph": "Physics - Biological Physics",
    "physics.chem-ph": "Physics - Chemical Physics",
    "physics.class-ph": "Physics - Classical Physics",
    "physics.comp-ph": "Physics - Computational Physics",
    "physics.data-an": "Physics - Data Analysis; Statistics and Probability",
    "physics.flu-dyn": "Physics - Fluid Dynamics",
    "physics.gen-ph": "Physics - General Physics",
    "physics.geo-ph": "Physics - Geophysics",
    "physics.hist-ph": "Physics - History of Physics",
    "physics.ins-det": "Physics - Instrumentation and Detectors",
    "physics.med-ph": "Physics - Medical Physics",
    "physics.optics": "Physics - Optics",
    "physics.ed-ph": "Physics - Physics Education",
    "physics.soc-ph": "Physics - Physics and Society",
    "physics.plasm-ph": "Physics - Plasma Physics",
    "physics.pop-ph": "Physics - Popular Physics",
    "physics.space-ph": "Physics - Space Physics",
    "quant-ph": "Quantum Physics",
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment