This is a fast, but incomplete, parser for JSON-based wikibase dump files that I wrote to extract large amounts of info without having to load them into a database. I don't think I'll have enough time to turn this into a full-fledged Rust library for the wikibase format, so I'm posting it as a Gist in case it's useful to someone in its current state.

It can be used to filter entries based on a claim and extract only the desired properties. By default, it reads from stdin, expecting a single page per line.

For example, to select the birth and death dates (properties P569 and P570) for every entity that is an instance of (property P31) a human being (Q5):

$ zcat latest-all.json.gz | head -6 | cargo run --release -- --filter-property P31 --filter-claim Q5 --select-properties P569 P570
{"id":"Q23","name":"George Washington","claims":[{"id":"Q23$3BF0223A-D656-435B-9FD1-32E0B8F54A69","mainsnak":{"datatype":"time","snaktype":"value","property":"P569","datavalue":{"type":"time","value":{"time":"+1732-02-22T00:00:00Z","timezone":0,"before":0,"after":0,"precision":11,"calendarmodel":"http://www.wikidata.org/entity/Q1985727"}}},"claim_type":"statement","rank":"normal"},{"id":"q23$423dae3a-4b2a-1e9a-033f-632f0580c92e","mainsnak":{"datatype":"time","snaktype":"value","property":"P570","datavalue":{"type":"time","value":{"time":"+1799-12-14T00:00:00Z","timezone":0,"before":0,"after":0,"precision":11,"calendarmodel":"http://www.wikidata.org/entity/Q1985727"}}},"claim_type":"statement","rank":"normal"}]}
{"id":"Q42","name":"Douglas Adams","claims":[{"id":"q42$D8404CDA-25E4-4334-AF13-A3290BCD9C0F","mainsnak":{"datatype":"time","snaktype":"value","property":"P569","datavalue":{"type":"time","value":{"time":"+1952-03-11T00:00:00Z","timezone":0,"before":0,"after":0,"precision":11,"calendarmodel":"http://www.wikidata.org/entity/Q1985727"}}},"claim_type":"statement","rank":"normal"},{"id":"q42$65EA9C32-B26C-469B-84FE-FC612B71D159","mainsnak":{"datatype":"time","snaktype":"value","property":"P570","datavalue":{"type":"time","value":{"time":"+2001-05-11T00:00:00Z","timezone":0,"before":0,"after":0,"precision":11,"calendarmodel":"http://www.wikidata.org/entity/Q1985727"}}},"claim_type":"statement","rank":"normal"}]}

Only some of the snak types are extracted properly. Adding the other field types so that serde_derive can handle the remaining enum variants shouldn't be too hard, but for my work I haven't needed any of them (eg. tabular-data or musical-notation).
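
As an illustration, the empty Monolingualtext {} placeholder in SnakDataValue could probably be filled in along these lines, since the monolingualtext value object carries a "text" and a "language" field (a sketch I haven't exercised against a full dump):

Monolingualtext {
    text: String,
    language: String
},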

Performance

I'm still a Rust beginner, but if there is one thing I've learned in the few months I've been working with it, it's that if you can get your program to compile at all, it will probably run fast! Without really even trying, this parser can deserialize and extract the desired attributes from the dump almost as fast as gzip can uncompress the data, so the limiting factor is CPU, unless you have a ton of fast disk lying around that you want to dedicate to these dumps.

In practice, on my 6-core i5-8600T, I can break the dump up into a number of parts and run three parsers in parallel, which lets me process the full dump in about 35 minutes. Your mileage may vary.
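
For reference, the splitting itself can be done with standard tools. Something along these lines should work, though the chunk size, file names and output paths here are just an illustration rather than what I actually used:

$ zcat latest-all.json.gz | split -l 20000000 --filter='gzip > $FILE.gz' - part-
$ ls part-*.gz | xargs -P 3 -I{} sh -c 'zcat {} | ./target/release/wbparse --filter-property P31 --filter-claim Q5 --select-properties P569 P570 > {}.out'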

[package]
name = "wbparse"
version = "0.1.0"
authors = ["Alex Tomic <atomic777@gmail.com>"]
edition = "2018"

[dependencies]
serde_json = "1.0"
serde = "1.0"
serde_derive = "1.0"
chrono = { version = "0.4", features = ["serde"] }
clap = "3.0.0-beta.2"

#![feature(option_result_contains)]
use std::collections::HashMap;
use serde_json;
use serde_derive::{Deserialize, Serialize};
use chrono::{DateTime, Utc};
use std::io::{self, BufRead};
use clap::Clap;
use std::fs;
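
/// A sitelink entry for an entity (one per linked wiki site)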
#[derive(Clone, Debug, Deserialize, Serialize)]
struct SiteData {
    site: String,
    title: String,
    badges: Vec<String>,
    url: Option<String>
}

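/// A label, description or alias value in a single language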
#[derive(Clone, Debug, Deserialize, Serialize)]
struct Translation {
    language: String,
    value: String
}

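/// serde helpers for the "+YYYY-MM-DDTHH:MM:SSZ" timestamp format used in the dump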
mod wiki_date_format {
    use chrono::{DateTime, Utc, TimeZone, NaiveDate};
    use serde::{self, Deserialize, Serializer, Deserializer};

    const FORMAT: &'static str = "+%Y-%m-%dT%H:%M:%SZ";

    pub fn serialize<S>(date: &DateTime<Utc>, serializer: S) -> Result<S::Ok, S::Error>
    where S: Serializer,
    {
        let s = format!("{}", date.format(FORMAT));
        serializer.serialize_str(&s)
    }

    /// Chrono does not like dates like +1750-00-00 that are often used for historical
    /// figures where a precise date is not known. In those cases, try to extract out
    /// the year and return the day and month as January 1st
    pub fn deserialize<'de, D>(deserializer: D) -> Result<DateTime<Utc>, D::Error>
    where D: Deserializer<'de>,
    {
        let s = String::deserialize(deserializer)?;
        match Utc.datetime_from_str(&s, FORMAT) {
            Ok(val) => Ok(val),
            Err(_) => {
                let d = NaiveDate::from_ymd(s[1..5].parse::<i32>().unwrap(), 1, 1);
                let dt = d.and_hms(0, 0, 0);
                let datetime_utc = DateTime::<Utc>::from_utc(dt, Utc);
                Ok(datetime_utc)
            }
        }
    }
}

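/// The "datavalue" payload of a snak; the dump tags it with a "type" field and nests the
/// content under "value", hence the adjacently tagged representation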
#[derive(Clone, Debug, Deserialize, Serialize)]
#[serde(tag = "type", content = "value", rename_all = "kebab-case")]
enum SnakDataValue {
    Time {
        #[serde(with = "wiki_date_format")]
        time: DateTime<Utc>,
        timezone: i32,
        before: i32,
        after: i32,
        precision: i32,
        calendarmodel: String
    },
    Quantity {
        amount: String,
        unit: String
    },
    #[serde(rename_all = "kebab-case")]
    WikibaseEntityid {
        entity_type: String,
        id: String,
        numeric_id: i32
    },
    Globecoordinate {
        latitude: f32,
        longitude: f32,
        precision: Option<f32>
        // Ignore globe and altitude
    },
    // TODO Add implementations for these other value types
    String {},
    Monolingualtext {},
    NoValue
}

impl Default for SnakDataValue {
    fn default() -> Self {
        SnakDataValue::NoValue
    }
}

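/// Fields shared by the snak variants that are fully deserialized; datavalue falls back to
/// NoValue when it is absent (eg. for "somevalue"/"novalue" snaks)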
#[derive(Clone, Debug, Deserialize, Serialize)]
struct SnakValue {
    snaktype: String,
    property: String,
    #[serde(default)]
    datavalue: SnakDataValue
}

// TODO Fill out the implementations for the remaining types
#[derive(Clone, Debug, Deserialize, Serialize)]
#[serde(tag = "datatype", rename_all = "kebab-case")]
enum Snak {
    #[serde(alias = "commonsMedia")]
    CommonsMedia {},
    ExternalId {
        snaktype: String,
        property: String
        // FIXME Having trouble deserializing datavalue as a raw string, so we'll ignore it for now
    },
    GeoShape {},
    GlobeCoordinate {
        #[serde(flatten)]
        snak: SnakValue
    },
    Monolingualtext {},
    MusicalNotation {},
    Math {},
    Quantity {
        #[serde(flatten)]
        snak: SnakValue
    },
    String,
    TabularData {},
    Time {
        #[serde(flatten)]
        snak: SnakValue
    },
    Url {
        /*
        snaktype: String,
        property: String,
        datavalue: SnakDataValue
        */
    },
    WikibaseItem {
        // snaktype: String,
        #[serde(flatten)]
        snak: SnakValue
        // property: String,
        // datavalue: SnakDataValue
    },
    WikibaseForm {},
    WikibaseLexeme {},
    WikibaseProperty {},
}

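/// A single statement about an entity; only the main snak is kept, qualifiers and references
/// are ignored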
#[derive(Clone, Debug, Deserialize, Serialize)]
struct Claim {
    id: String,
    mainsnak: Snak,
    #[serde(alias = "type")]
    claim_type: String,
    rank: String,
    // qualifiers: Vec<String>
}

// https://doc.wikimedia.org/Wikibase/master/php/md_docs_topics_json.html
// Top-level structure
#[derive(Clone, Debug, Deserialize, Serialize)]
struct WikiEntity {
    id: String,
    #[serde(alias = "type")]
    item_type: String,
    labels: HashMap<String, Translation>,
    descriptions: HashMap<String, Translation>,
    aliases: HashMap<String, Vec<Translation>>,
    sitelinks: Option<HashMap<String, SiteData>>,
    claims: HashMap<String, Vec<Claim>>
}

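/// Running counters, printed periodically to stderr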
#[derive(Clone, Debug)]
struct ParseStats {
    pages: usize,
    matches: usize,
    failures: usize,
    persons: usize,
}

impl ParseStats {
    fn new() -> ParseStats {
        ParseStats {
            pages: 0,
            matches: 0,
            failures: 0,
            persons: 0,
        }
    }
}

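/// The filtered entity as it is written to stdout, one JSON object per line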
#[derive(Clone, Debug, Deserialize, Serialize)]
struct OutputValue<'a> {
    id: &'a str,
    name: &'a str,
    claims: Vec<Claim>
}

#[derive(Clap, Debug)]
#[clap(version = "0.2", about = "A rust-based parser for wikibase dumps")]
struct Opts {
    /// Optional single json file to load instead of stdin
    #[clap(short, long)]
    json_file: Option<String>,
    /// Property filter, eg. P31 for "instance of"
    #[clap(long)]
    filter_property: Option<String>,
    /// Right-hand side of the property predicate, eg. --filter-claim="Q5" combined with
    /// --filter-property="P31" to select all humans
    #[clap(short, long)]
    filter_claim: Option<String>,
    /// Which properties to retrieve
    #[clap(short, long)]
    select_properties: Option<Vec<String>>,
    /// Specialized selector just for geo-coordinates; clean this up
    #[clap(long)]
    select_geo_property: Option<String>,
}

impl WikiEntity {
    /// Try to find an english label for this entry
    fn get_label(&self) -> &str {
        for l in vec!["en", "en-gb", "en-us", "en-ca"] {
            if let Some(trans) = self.labels.get(l) {
                return &trans.value;
            }
        }
        ""
    }
}

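/// Keep the entity only if it has at least one claim for `prop` whose value is the item
/// `claim`, eg. prop = "P31" and claim = "Q5" keeps only humans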
fn apply_filter(we: WikiEntity, prop: &str, claim: &str) -> Option<WikiEntity> {
    if let Some(claims) = we.claims.get(prop) {
        let filtered_claims = claims.iter().filter(|c| match &c.mainsnak {
            Snak::WikibaseItem { snak, .. } => snak.property == prop,
            _ => false
        }).cloned().collect::<Vec<Claim>>();
        let matches = filtered_claims.into_iter().filter(|c| match &c.mainsnak {
            Snak::WikibaseItem { snak, .. } => match &snak.datavalue {
                SnakDataValue::WikibaseEntityid { id, .. } => id == claim,
                _ => false
            },
            _ => false
        }).collect::<Vec<Claim>>();
        if !matches.is_empty() {
            return Some(we);
        }
    }
    None
}

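/// Deserialize a single dump line, apply the property/claim filter, and print the claims for
/// the selected properties as one JSON object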
fn parse_str(line: &str, ps: &mut ParseStats, opts: &Opts) {
    match serde_json::from_str::<WikiEntity>(line) {
        Ok(wiki_item) => {
            if let Some(we) = apply_filter(wiki_item, opts.filter_property.as_ref().unwrap(), opts.filter_claim.as_ref().unwrap()) {
                ps.persons += 1;
                let matches = opts.select_properties.as_ref().unwrap().iter()
                    .filter_map(|prop| we.claims.get(prop))
                    .cloned()
                    .flatten()
                    .collect::<Vec<Claim>>();
                // Don't return any matches where a property is missing
                if matches.len() >= opts.select_properties.as_ref().unwrap().len() {
                    ps.matches += 1;
                    let out = OutputValue {
                        id: &we.id,
                        name: we.get_label(),
                        claims: matches
                    };
                    println!("{}", serde_json::to_string(&out).unwrap());
                }
            }
            ps.pages += 1;
        },
        Err(e) => {
            eprintln!("Error: {:?}", &e);
            ps.failures += 1;
        }
    }
}

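/// Reduced output record used by the geo-coordinate extractor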
#[derive(Clone, Debug, Deserialize, Serialize)]
struct OutputCoords<'a> {
    id: &'a str,
    name: &'a str,
    latitude: f32,
    longitude: f32
}

/// Quick and dirty extractor just for Geo-coordinates
fn parse_geo_only(line: &str, ps: &mut ParseStats, opts: &Opts) {
    if let Ok(wiki_item) = serde_json::from_str::<WikiEntity>(line) {
        ps.pages += 1;
        if let Some(claims) = wiki_item.claims.get(opts.select_geo_property.as_ref().unwrap()) {
            if let Snak::GlobeCoordinate { snak, .. } = &claims.first().unwrap().mainsnak {
                if let SnakDataValue::Globecoordinate { latitude, longitude, .. } = snak.datavalue {
                    let out = OutputCoords {
                        id: &wiki_item.id,
                        name: wiki_item.get_label(),
                        latitude,
                        longitude
                    };
                    println!("{}", serde_json::to_string(&out).unwrap());
                }
            }
        }
    }
}

fn main() {
    let opts: Opts = Opts::parse();
    eprintln!("{:?}", &opts);
    let mut ps = ParseStats::new();
    let stdin = io::stdin();
    if let Some(f) = &opts.json_file {
        let s = fs::read_to_string(f).unwrap();
        parse_str(&s, &mut ps, &opts);
    } else {
        let start = std::time::Instant::now();
        for l in stdin.lock().lines() {
            let mut line = l.unwrap();
            // Entity lines in the dump have a trailing "," that we need to strip off
            // before trying to deserialize
            if line.ends_with(',') {
                line.truncate(line.len() - 1);
            }
            // Skip blank lines and the "[" / "]" lines that open and close the dump
            if line.is_empty() || line == "[" || line == "]" {
                continue;
            }
            if opts.select_geo_property.is_some() {
                parse_geo_only(&line, &mut ps, &opts);
            } else {
                parse_str(&line, &mut ps, &opts);
            }
            if ps.pages % 50000 == 0 {
                let elapsed = start.elapsed().as_secs_f64();
                eprintln!("{:.1} pg/s, {:?}", ps.pages as f64 / elapsed, ps);
            }
        }
    }
}