Skip to content

Instantly share code, notes, and snippets.

@dginev
Last active April 24, 2019 16:33
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save dginev/ac5c282fbb5aa3fbd99b67029c638835 to your computer and use it in GitHub Desktop.
Save dginev/ac5c282fbb5aa3fbd99b67029c638835 to your computer and use it in GitHub Desktop.
Extracting arXiv category metadata from an OAI-PMH v2.0 XML harvest
//! Convert arXiv's OAI harvested XML files into a lookup table for classification labels
// Step 0. Prerequisite: download all needed arXiv metadata via OAI, e.g.
//```
// $ pip install git+http://github.com/bloomonkey/oai-harvest.git#egg=oaiharvest
// $ mkdir metadata/arxiv; cd metadata/arxiv
// $ oai-reg add arxiv http://export.arxiv.org/oai2?verb=Identify
// $ oai-harvest arxiv --until 2018-09-09
//```
// endpoint documentation at: https://arxiv.org/help/oa
use jwalk::WalkDir;
use libxml::parser::Parser;
use libxml::xpath::Context;
use rayon::prelude::*;
use serde_json;
use std::collections::{HashMap, HashSet};
use std::env;
use std::error::Error;
use std::fs::File;
use std::sync::{Arc, Mutex};
use std::time::SystemTime;
fn main() -> Result<(), Box<Error>> {
let start = SystemTime::now();
// Read input arguments
let mut input_args = env::args();
let _ = input_args.next(); // skip process name
let metadata_path = match input_args.next() {
Some(path) => path,
None => "../../metadata/arxiv/".to_string(),
};
let labels_filepath = match input_args.next() {
Some(path) => path,
None => "categories-arXMLiv-08-2018.json".to_string(),
};
// Extract a dataset (JSON?) of relevant metadata for a given NLP task.
// here, arXiv subject categories
let catalog: HashMap<String, Vec<String>> = HashMap::new();
let catalog_arc = Arc::new(Mutex::new(catalog));
WalkDir::new(metadata_path)
.num_threads(rayon::current_num_threads())
.sort(true)
.into_iter()
.filter_map(|each| {
if let Ok(entry) = each {
let file_name = entry.file_name.to_str().unwrap_or("");
if file_name.ends_with(".xml") {
let path = entry.path().to_str().unwrap_or("").to_owned();
if !path.is_empty() {
return Some(path);
}
}
}
// all other cases
None
})
.enumerate()
.par_bridge()
.for_each(|each| {
let (index, path) = each;
if index % 10000 == 0 {
println!("at document {:?}", index);
}
let parser = Parser::default();
let doc = parser.parse_file(&path).unwrap();
let mut context = Context::new(&doc).unwrap();
let id = context.findvalue("/*/*[local-name()='id']", None).unwrap();
let category = context
.findvalue("/*/*[local-name()='categories']", None)
.unwrap();
let mut categories: HashSet<String> = HashSet::new();
for cat in category.split(' ') {
let dotparts: Vec<&str> = cat.split('.').collect();
if dotparts.len() > 1 {
// also record the lead-in category
categories.insert(dotparts[0].to_lowercase().to_string());
// and the entire category
categories.insert(cat.to_lowercase().to_string());
} else {
// just a single category to record
categories.insert(cat.to_lowercase().to_string());
}
}
let mut categories_vec: Vec<String> = categories.drain().collect();
categories_vec.sort();
let thread_arc = catalog_arc.clone();
let mut catalog_lock = thread_arc.lock().unwrap();
catalog_lock.insert(id, categories_vec);
});
// serialize to json
let file = File::create(labels_filepath)?;
let catalog_lock = catalog_arc.lock().unwrap();
serde_json::to_writer(file, &*catalog_lock)?;
let duration_sec = SystemTime::now().duration_since(start).unwrap().as_secs();
println!("-- metadata packer took {:?} seconds.", duration_sec);
Ok(())
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment