Skip to content

Instantly share code, notes, and snippets.

@adzcai
Created March 4, 2023 18:07
Show Gist options
  • Save adzcai/f527e8a1df08c18cf22093e783ab8b9b to your computer and use it in GitHub Desktop.
Save adzcai/f527e8a1df08c18cf22093e783ab8b9b to your computer and use it in GitHub Desktop.
Scrape arXiv categories from https://arxiv.org/category_taxonomy
/**
 * Parse one subcategory entry of the arXiv taxonomy list.
 *
 * The entry is expected to look like entry > wrapper > heading, where the
 * heading's first child node holds the identifier (e.g. "cs.AI") and the
 * heading's first child element holds the title.
 *
 * @param {Element} entry - One subcategory element.
 * @returns {{name: string, description: string} | null} The parsed record, or
 *   null when the entry does not have the expected shape.
 */
function parseSubcategory(entry) {
  const wrapper = entry && entry.children ? entry.children[0] : undefined;
  const heading = wrapper && wrapper.children ? wrapper.children[0] : undefined;
  if (!heading) {
    return null;
  }
  const idNode = heading.firstChild;
  const titleElement = heading.children ? heading.children[0] : undefined;
  if (!idNode || !titleElement) {
    return null;
  }
  return {
    name: idNode.textContent.trim(),
    // Drops the first and last character of the title text; presumably the
    // surrounding parentheses, e.g. "(Artificial Intelligence)" - TODO confirm.
    description: titleElement.textContent.slice(1, -1),
  };
}

/**
 * Collect the subcategories listed inside one category body.
 *
 * The subcategory entries are the children of the element nested two levels
 * below the body (body > div > div > entries).
 *
 * @param {Element} body - The DIV that follows a category's H2 header.
 * @returns {Array<{name: string, description: string}>} Parsed subcategories;
 *   entries that do not match the expected shape are left out.
 */
function collectSubcategories(body) {
  const outer = body.children ? body.children[0] : undefined;
  const list = outer && outer.children ? outer.children[0] : undefined;
  if (!list || !list.children) {
    return [];
  }
  return Array.from(list.children)
    .map((entry) => parseSubcategory(entry))
    .filter((record) => record !== null);
}

/**
 * Extract every category (an H2 header followed by a DIV body) from the
 * taxonomy container.
 *
 * Each H2 is paired with the next DIV sibling instead of pairing by index, so
 * a stray DIV or an H2 without a body cannot shift all the later pairs.
 *
 * @param {Element} container - The element whose direct children are the
 *   alternating H2 / DIV nodes.
 * @returns {Array<{name: string, subcategories: Array<{name: string, description: string}>}>}
 */
function extractCategories(container) {
  const results = [];
  let pendingHeader = null;
  for (const child of Array.from(container.children)) {
    if (child.tagName === "H2") {
      pendingHeader = child;
    } else if (child.tagName === "DIV" && pendingHeader !== null) {
      results.push({
        name: pendingHeader.textContent.trim(),
        subcategories: collectSubcategories(child),
      });
      pendingHeader = null;
    }
  }
  return results;
}

// The `typeof` guard lets the helpers above be loaded where no DOM exists;
// in the browser console this is a plain querySelector call.
const categories =
  typeof document === "undefined"
    ? null
    : document.querySelector("#category_taxonomy_list");
if (!categories) {
  console.warn(
    "#category_taxonomy_list not found; run this on https://arxiv.org/category_taxonomy"
  );
}
// Final result: one record per top-level category.
const categoriesList = categories ? extractCategories(categories) : [];
[
{
"name": "Computer Science",
"subcategories": [
{
"name": "cs.AI",
"description": "Artificial Intelligence"
},
{
"name": "cs.AR",
"description": "Hardware Architecture"
},
{
"name": "cs.CC",
"description": "Computational Complexity"
},
{
"name": "cs.CE",
"description": "Computational Engineering, Finance, and Science"
},
{
"name": "cs.CG",
"description": "Computational Geometry"
},
{
"name": "cs.CL",
"description": "Computation and Language"
},
{
"name": "cs.CR",
"description": "Cryptography and Security"
},
{
"name": "cs.CV",
"description": "Computer Vision and Pattern Recognition"
},
{
"name": "cs.CY",
"description": "Computers and Society"
},
{
"name": "cs.DB",
"description": "Databases"
},
{
"name": "cs.DC",
"description": "Distributed, Parallel, and Cluster Computing"
},
{
"name": "cs.DL",
"description": "Digital Libraries"
},
{
"name": "cs.DM",
"description": "Discrete Mathematics"
},
{
"name": "cs.DS",
"description": "Data Structures and Algorithms"
},
{
"name": "cs.ET",
"description": "Emerging Technologies"
},
{
"name": "cs.FL",
"description": "Formal Languages and Automata Theory"
},
{
"name": "cs.GL",
"description": "General Literature"
},
{
"name": "cs.GR",
"description": "Graphics"
},
{
"name": "cs.GT",
"description": "Computer Science and Game Theory"
},
{
"name": "cs.HC",
"description": "Human-Computer Interaction"
},
{
"name": "cs.IR",
"description": "Information Retrieval"
},
{
"name": "cs.IT",
"description": "Information Theory"
},
{
"name": "cs.LG",
"description": "Machine Learning"
},
{
"name": "cs.LO",
"description": "Logic in Computer Science"
},
{
"name": "cs.MA",
"description": "Multiagent Systems"
},
{
"name": "cs.MM",
"description": "Multimedia"
},
{
"name": "cs.MS",
"description": "Mathematical Software"
},
{
"name": "cs.NA",
"description": "Numerical Analysis"
},
{
"name": "cs.NE",
"description": "Neural and Evolutionary Computing"
},
{
"name": "cs.NI",
"description": "Networking and Internet Architecture"
},
{
"name": "cs.OH",
"description": "Other Computer Science"
},
{
"name": "cs.OS",
"description": "Operating Systems"
},
{
"name": "cs.PF",
"description": "Performance"
},
{
"name": "cs.PL",
"description": "Programming Languages"
},
{
"name": "cs.RO",
"description": "Robotics"
},
{
"name": "cs.SC",
"description": "Symbolic Computation"
},
{
"name": "cs.SD",
"description": "Sound"
},
{
"name": "cs.SE",
"description": "Software Engineering"
},
{
"name": "cs.SI",
"description": "Social and Information Networks"
},
{
"name": "cs.SY",
"description": "Systems and Control"
}
]
},
{
"name": "Economics",
"subcategories": [
{
"name": "econ.EM",
"description": "Econometrics"
},
{
"name": "econ.GN",
"description": "General Economics"
},
{
"name": "econ.TH",
"description": "Theoretical Economics"
}
]
},
{
"name": "Electrical Engineering and Systems Science",
"subcategories": [
{
"name": "eess.AS",
"description": "Audio and Speech Processing"
},
{
"name": "eess.IV",
"description": "Image and Video Processing"
},
{
"name": "eess.SP",
"description": "Signal Processing"
},
{
"name": "eess.SY",
"description": "Systems and Control"
}
]
},
{
"name": "Mathematics",
"subcategories": [
{
"name": "math.AC",
"description": "Commutative Algebra"
},
{
"name": "math.AG",
"description": "Algebraic Geometry"
},
{
"name": "math.AP",
"description": "Analysis of PDEs"
},
{
"name": "math.AT",
"description": "Algebraic Topology"
},
{
"name": "math.CA",
"description": "Classical Analysis and ODEs"
},
{
"name": "math.CO",
"description": "Combinatorics"
},
{
"name": "math.CT",
"description": "Category Theory"
},
{
"name": "math.CV",
"description": "Complex Variables"
},
{
"name": "math.DG",
"description": "Differential Geometry"
},
{
"name": "math.DS",
"description": "Dynamical Systems"
},
{
"name": "math.FA",
"description": "Functional Analysis"
},
{
"name": "math.GM",
"description": "General Mathematics"
},
{
"name": "math.GN",
"description": "General Topology"
},
{
"name": "math.GR",
"description": "Group Theory"
},
{
"name": "math.GT",
"description": "Geometric Topology"
},
{
"name": "math.HO",
"description": "History and Overview"
},
{
"name": "math.IT",
"description": "Information Theory"
},
{
"name": "math.KT",
"description": "K-Theory and Homology"
},
{
"name": "math.LO",
"description": "Logic"
},
{
"name": "math.MG",
"description": "Metric Geometry"
},
{
"name": "math.MP",
"description": "Mathematical Physics"
},
{
"name": "math.NA",
"description": "Numerical Analysis"
},
{
"name": "math.NT",
"description": "Number Theory"
},
{
"name": "math.OA",
"description": "Operator Algebras"
},
{
"name": "math.OC",
"description": "Optimization and Control"
},
{
"name": "math.PR",
"description": "Probability"
},
{
"name": "math.QA",
"description": "Quantum Algebra"
},
{
"name": "math.RA",
"description": "Rings and Algebras"
},
{
"name": "math.RT",
"description": "Representation Theory"
},
{
"name": "math.SG",
"description": "Symplectic Geometry"
},
{
"name": "math.SP",
"description": "Spectral Theory"
},
{
"name": "math.ST",
"description": "Statistics Theory"
}
]
},
{
"name": "Quantitative Biology",
"subcategories": [
{
"name": "q-bio.BM",
"description": "Biomolecules"
},
{
"name": "q-bio.CB",
"description": "Cell Behavior"
},
{
"name": "q-bio.GN",
"description": "Genomics"
},
{
"name": "q-bio.MN",
"description": "Molecular Networks"
},
{
"name": "q-bio.NC",
"description": "Neurons and Cognition"
},
{
"name": "q-bio.OT",
"description": "Other Quantitative Biology"
},
{
"name": "q-bio.PE",
"description": "Populations and Evolution"
},
{
"name": "q-bio.QM",
"description": "Quantitative Methods"
},
{
"name": "q-bio.SC",
"description": "Subcellular Processes"
},
{
"name": "q-bio.TO",
"description": "Tissues and Organs"
}
]
},
{
"name": "Quantitative Finance",
"subcategories": [
{
"name": "q-fin.CP",
"description": "Computational Finance"
},
{
"name": "q-fin.EC",
"description": "Economics"
},
{
"name": "q-fin.GN",
"description": "General Finance"
},
{
"name": "q-fin.MF",
"description": "Mathematical Finance"
},
{
"name": "q-fin.PM",
"description": "Portfolio Management"
},
{
"name": "q-fin.PR",
"description": "Pricing of Securities"
},
{
"name": "q-fin.RM",
"description": "Risk Management"
},
{
"name": "q-fin.ST",
"description": "Statistical Finance"
},
{
"name": "q-fin.TR",
"description": "Trading and Market Microstructure"
}
]
},
{
"name": "Statistics",
"subcategories": [
{
"name": "stat.AP",
"description": "Applications"
},
{
"name": "stat.CO",
"description": "Computation"
},
{
"name": "stat.ME",
"description": "Methodology"
},
{
"name": "stat.ML",
"description": "Machine Learning"
},
{
"name": "stat.OT",
"description": "Other Statistics"
},
{
"name": "stat.TH",
"description": "Statistics Theory"
}
]
}
]
/**
 * Parse one subcategory entry of a physics archive.
 *
 * The entry is expected to look like entry > wrapper > heading, where the
 * heading's first child node holds the identifier (e.g. "astro-ph.CO") and
 * the heading's first child element holds the title.
 *
 * @param {Element} entry - One subcategory element.
 * @returns {{name: string, description: string} | null} The parsed record, or
 *   null when the entry does not have the expected shape.
 */
function parsePhysicsSubcategory(entry) {
  const wrapper = entry && entry.children ? entry.children[0] : undefined;
  const heading = wrapper && wrapper.children ? wrapper.children[0] : undefined;
  if (!heading) {
    return null;
  }
  const idNode = heading.firstChild;
  const titleElement = heading.children ? heading.children[0] : undefined;
  if (!idNode || !titleElement) {
    return null;
  }
  return {
    name: idNode.textContent.trim(),
    // Drops the first and last character of the title text; presumably the
    // surrounding parentheses - TODO confirm against the live page.
    description: titleElement.textContent.slice(1, -1),
  };
}

/**
 * Extract every archive from the container. Each direct child of the
 * container is one group whose first two children are [header, body].
 *
 * - name: trimmed text of the first child node of the header's first element
 * - subtitle: trimmed text of the first <span> inside the header ("" if none)
 * - subcategories: parsed from the direct children of the body
 *
 * Groups without a usable header or body are skipped, and subcategory entries
 * that do not match the expected shape are left out.
 *
 * @param {Element} container - Element whose direct children are the groups.
 * @returns {Array<{name: string, subtitle: string, subcategories: Array<{name: string, description: string}>}>}
 */
function extractPhysicsArchives(container) {
  const archives = [];
  for (const group of Array.from(container.children)) {
    const header = group.children ? group.children[0] : undefined;
    const body = group.children ? group.children[1] : undefined;
    const titleElement = header && header.children ? header.children[0] : undefined;
    const titleNode = titleElement ? titleElement.firstChild : null;
    if (!titleNode || !body) {
      continue;
    }
    const span = header.querySelector("span");
    archives.push({
      name: titleNode.textContent.trim(),
      subtitle: span ? span.textContent.trim() : "",
      subcategories: Array.from(body.children)
        .map((entry) => parsePhysicsSubcategory(entry))
        .filter((record) => record !== null),
    });
  }
  return archives;
}

// NOTE(review): the selector is position-based (10th child of the taxonomy
// list); presumably the Physics group - verify if the page layout changes.
// The `typeof` guard lets the helpers above be loaded where no DOM exists.
const categories =
  typeof document === "undefined"
    ? null
    : document.querySelector("#category_taxonomy_list > div:nth-child(10)");
if (!categories) {
  console.warn(
    "#category_taxonomy_list > div:nth-child(10) not found; run this on https://arxiv.org/category_taxonomy"
  );
}
// Final result: one record per archive in the selected group.
const categoriesList = categories ? extractPhysicsArchives(categories) : [];
[
{
"name": "Astrophysics",
"subtitle": "astro-ph",
"subcategories": [
{
"name": "astro-ph.CO",
"description": "Cosmology and Nongalactic Astrophysics"
},
{
"name": "astro-ph.EP",
"description": "Earth and Planetary Astrophysics"
},
{
"name": "astro-ph.GA",
"description": "Astrophysics of Galaxies"
},
{
"name": "astro-ph.HE",
"description": "High Energy Astrophysical Phenomena"
},
{
"name": "astro-ph.IM",
"description": "Instrumentation and Methods for Astrophysics"
},
{
"name": "astro-ph.SR",
"description": "Solar and Stellar Astrophysics"
}
]
},
{
"name": "Condensed Matter",
"subtitle": "cond-mat",
"subcategories": [
{
"name": "cond-mat.dis-nn",
"description": "Disordered Systems and Neural Networks"
},
{
"name": "cond-mat.mes-hall",
"description": "Mesoscale and Nanoscale Physics"
},
{
"name": "cond-mat.mtrl-sci",
"description": "Materials Science"
},
{
"name": "cond-mat.other",
"description": "Other Condensed Matter"
},
{
"name": "cond-mat.quant-gas",
"description": "Quantum Gases"
},
{
"name": "cond-mat.soft",
"description": "Soft Condensed Matter"
},
{
"name": "cond-mat.stat-mech",
"description": "Statistical Mechanics"
},
{
"name": "cond-mat.str-el",
"description": "Strongly Correlated Electrons"
},
{
"name": "cond-mat.supr-con",
"description": "Superconductivity"
}
]
},
{
"name": "General Relativity and Quantum Cosmology",
"subtitle": "gr-qc",
"subcategories": [
{
"name": "gr-qc",
"description": "General Relativity and Quantum Cosmology"
}
]
},
{
"name": "High Energy Physics - Experiment",
"subtitle": "hep-ex",
"subcategories": [
{
"name": "hep-ex",
"description": "High Energy Physics - Experiment"
}
]
},
{
"name": "High Energy Physics - Lattice",
"subtitle": "hep-lat",
"subcategories": [
{
"name": "hep-lat",
"description": "High Energy Physics - Lattice"
}
]
},
{
"name": "High Energy Physics - Phenomenology",
"subtitle": "hep-ph",
"subcategories": [
{
"name": "hep-ph",
"description": "High Energy Physics - Phenomenology"
}
]
},
{
"name": "High Energy Physics - Theory",
"subtitle": "hep-th",
"subcategories": [
{
"name": "hep-th",
"description": "High Energy Physics - Theory"
}
]
},
{
"name": "Mathematical Physics",
"subtitle": "math-ph",
"subcategories": [
{
"name": "math-ph",
"description": "Mathematical Physics"
}
]
},
{
"name": "Nonlinear Sciences",
"subtitle": "nlin",
"subcategories": [
{
"name": "nlin.AO",
"description": "Adaptation and Self-Organizing Systems"
},
{
"name": "nlin.CD",
"description": "Chaotic Dynamics"
},
{
"name": "nlin.CG",
"description": "Cellular Automata and Lattice Gases"
},
{
"name": "nlin.PS",
"description": "Pattern Formation and Solitons"
},
{
"name": "nlin.SI",
"description": "Exactly Solvable and Integrable Systems"
}
]
},
{
"name": "Nuclear Experiment",
"subtitle": "nucl-ex",
"subcategories": [
{
"name": "nucl-ex",
"description": "Nuclear Experiment"
}
]
},
{
"name": "Nuclear Theory",
"subtitle": "nucl-th",
"subcategories": [
{
"name": "nucl-th",
"description": "Nuclear Theory"
}
]
},
{
"name": "Physics",
"subtitle": "physics",
"subcategories": [
{
"name": "physics.acc-ph",
"description": "Accelerator Physics"
},
{
"name": "physics.ao-ph",
"description": "Atmospheric and Oceanic Physics"
},
{
"name": "physics.app-ph",
"description": "Applied Physics"
},
{
"name": "physics.atm-clus",
"description": "Atomic and Molecular Clusters"
},
{
"name": "physics.atom-ph",
"description": "Atomic Physics"
},
{
"name": "physics.bio-ph",
"description": "Biological Physics"
},
{
"name": "physics.chem-ph",
"description": "Chemical Physics"
},
{
"name": "physics.class-ph",
"description": "Classical Physics"
},
{
"name": "physics.comp-ph",
"description": "Computational Physics"
},
{
"name": "physics.data-an",
"description": "Data Analysis, Statistics and Probability"
},
{
"name": "physics.ed-ph",
"description": "Physics Education"
},
{
"name": "physics.flu-dyn",
"description": "Fluid Dynamics"
},
{
"name": "physics.gen-ph",
"description": "General Physics"
},
{
"name": "physics.geo-ph",
"description": "Geophysics"
},
{
"name": "physics.hist-ph",
"description": "History and Philosophy of Physics"
},
{
"name": "physics.ins-det",
"description": "Instrumentation and Detectors"
},
{
"name": "physics.med-ph",
"description": "Medical Physics"
},
{
"name": "physics.optics",
"description": "Optics"
},
{
"name": "physics.plasm-ph",
"description": "Plasma Physics"
},
{
"name": "physics.pop-ph",
"description": "Popular Physics"
},
{
"name": "physics.soc-ph",
"description": "Physics and Society"
},
{
"name": "physics.space-ph",
"description": "Space Physics"
}
]
},
{
"name": "Quantum Physics",
"subtitle": "quant-ph",
"subcategories": [
{
"name": "quant-ph",
"description": "Quantum Physics"
}
]
}
]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment