|
#![feature(option_result_contains)] |
|
|
|
use std::collections::HashMap; |
|
use serde_json; |
|
use serde_derive::{Deserialize, Serialize}; |
|
use chrono::{DateTime, Utc}; |
|
use std::io::{self, BufRead}; |
|
use clap::Clap; |
|
use std::fs; |
|
|
|
|
|
/// One sitelink entry for an entity: which wiki it links to, the page
/// title there, badges attached to the link, and an optional URL.
#[derive(Clone, Debug, Deserialize, Serialize)]

struct SiteData {

    // Site identifier — presumably e.g. "enwiki"; TODO confirm against dump
    site: String,

    // Page title on that site
    title: String,

    badges: Vec<String>,

    // Not always present in the dump
    url: Option<String>

}
|
|
|
/// A single language-tagged string, used for labels, descriptions and
/// aliases on WikiEntity.
#[derive(Clone, Debug, Deserialize, Serialize)]

struct Translation {

    // Language code, e.g. "en" (duplicates the map key in WikiEntity)
    language: String,

    value: String

}
|
|
|
mod wiki_date_format { |
|
use chrono::{DateTime, Utc, TimeZone, NaiveDate}; |
|
use serde::{self, Deserialize, Serializer, Deserializer}; |
|
|
|
const FORMAT: &'static str = "+%Y-%m-%dT%H:%M:%SZ"; |
|
|
|
pub fn serialize<S>( date: &DateTime<Utc>, serializer: S,) -> Result<S::Ok, S::Error> |
|
where S: Serializer, |
|
{ |
|
let s = format!("{}", date.format(FORMAT)); |
|
serializer.serialize_str(&s) |
|
} |
|
/// Chrono does not like dates like +1750-00-00 that are often used for historical |
|
/// figures where a precise date is not known. In those cases, try to extract out |
|
/// the year and return the day and month as January 1st |
|
pub fn deserialize<'de, D>(deserializer: D) -> Result<DateTime<Utc>, D::Error> |
|
where D: Deserializer<'de>, |
|
{ |
|
let s = String::deserialize(deserializer)?; |
|
match Utc.datetime_from_str(&s, FORMAT) { |
|
Ok(val) => Ok(val), |
|
Err(_) => { |
|
let d = NaiveDate::from_ymd(s[1..5].parse::<i32>().unwrap(), 1, 1); |
|
let dt = d.and_hms(0,0,0); |
|
let datetime_utc = DateTime::<Utc>::from_utc(dt, Utc); |
|
Ok(datetime_utc) |
|
} |
|
} |
|
} |
|
} |
|
|
|
|
|
/// Typed payload of a snak's "datavalue" JSON object. The JSON "type" field
/// selects the variant (kebab-case) and the payload sits under "value".
#[derive(Clone, Debug, Deserialize, Serialize)]

#[serde(tag = "type", content = "value", rename_all = "kebab-case")]

enum SnakDataValue {

    /// A point in time; parsed leniently via wiki_date_format so partial
    /// dates like "+1750-00-00" still yield a year.
    Time {

        #[serde(with = "wiki_date_format")]

        time: DateTime<Utc>,

        timezone: i32,

        before: i32,

        after: i32,

        precision: i32,

        calendarmodel: String

    },

    /// Numeric amount and its unit, both kept as raw strings.
    Quantity {

        amount: String,

        unit: String

    },

    /// A reference to another wikibase entity (e.g. id "Q5").
    #[serde(rename_all = "kebab-case")]

    WikibaseEntityid {

        entity_type: String,

        id: String,

        numeric_id: i32

    },

    /// Geographic coordinate; precision may be absent.
    Globecoordinate {

        latitude: f32,

        longitude: f32,

        precision: Option<f32>

        // Ignore globe and altitude

    },

    // TODO Add implementations for these other value types

    String {},

    Monolingualtext{},

    /// Placeholder used as the Default when a snak carries no datavalue.
    NoValue

}
|
|
|
impl Default for SnakDataValue { |
|
fn default() -> Self { |
|
SnakDataValue::NoValue |
|
} |
|
} |
|
|
|
/// Shared shape of value-bearing snaks, flattened into the Snak variants.
#[derive(Clone, Debug, Deserialize, Serialize)]

struct SnakValue {

    // Snak kind — presumably "value"/"novalue"/"somevalue"; TODO confirm
    snaktype: String,

    // Property id this snak belongs to, e.g. "P31"
    property: String,

    // Missing datavalue deserializes to SnakDataValue::NoValue
    #[serde(default)]

    datavalue: SnakDataValue

}
|
|
|
// TODO Fill out the implementations for the remaining types |
|
/// A claim's "mainsnak". The JSON "datatype" field (kebab-case) selects the
/// variant; value-bearing variants flatten the shared SnakValue fields.
/// Empty-bodied variants are recognised but their payloads are discarded.
#[derive(Clone, Debug, Deserialize, Serialize)]

#[serde(tag = "datatype", rename_all = "kebab-case")]

enum Snak {

    // The dump spells this datatype in camelCase rather than kebab-case
    #[serde(alias = "commonsMedia")]

    CommonsMedia {},

    ExternalId {

        snaktype: String,

        property: String

        // FIXME Having trouble deserializing datavalue as a raw string, so we'll ignore it for now

    },

    GeoShape {},

    GlobeCoordinate {

        #[serde(flatten)]

        snak: SnakValue

    },

    Monolingualtext {},

    MusicalNotation {},

    Math {},

    Quantity {

        #[serde(flatten)]

        snak: SnakValue

    },

    String,

    TabularData {},

    Time {

        #[serde(flatten)]

        snak: SnakValue

    },

    Url {

        /*

        snaktype: String,

        property: String,

        datavalue: SnakDataValue

        */

    },

    WikibaseItem {

        // snaktype: String,

        #[serde(flatten)]

        snak: SnakValue

        // property: String,

        // datavalue: SnakDataValue

    },

    WikibaseForm {},

    WikibaseLexeme {},

    WikibaseProperty {},

}
|
|
|
/// One statement ("claim") about an entity: its id, the main snak carrying
/// the value, the claim kind, and its rank.
#[derive(Clone, Debug, Deserialize, Serialize)]

struct Claim {

    id: String,

    mainsnak: Snak,

    // JSON field "type" (a Rust keyword). NOTE(review): `alias` only affects
    // deserialization — serialized output will carry "claim_type"; confirm
    // downstream consumers expect that (otherwise `rename` is needed).
    #[serde(alias = "type")]

    claim_type: String,

    rank: String,

    // qualifiers: Vec<String>

}
|
|
|
// https://doc.wikimedia.org/Wikibase/master/php/md_docs_topics_json.html |
|
// Top-level structure |
|
#[derive(Clone, Debug, Deserialize, Serialize)]

struct WikiEntity {

    // Entity id, e.g. "Q42"
    id: String,

    // JSON field "type" (a Rust keyword), mapped via serde alias
    #[serde(alias = "type")]

    item_type: String,

    // Language code -> label; see get_label() for the English lookup
    labels: HashMap<String, Translation>,

    descriptions: HashMap<String, Translation>,

    // Language code -> all aliases in that language
    aliases: HashMap<String, Vec<Translation>>,

    // Absent on some entities, hence Option
    sitelinks: Option<HashMap<String, SiteData>>,

    // Property id (e.g. "P31") -> all claims for that property
    claims: HashMap<String, Vec<Claim>>

}
|
|
|
|
|
/// Running counters, periodically reported to stderr while parsing.
#[derive(Clone, Debug)]

struct ParseStats {

    // Entities successfully deserialized
    pages: usize,

    // Filtered entities that also had all selected properties (lines emitted)
    matches: usize,

    // Lines that failed JSON deserialization
    failures: usize,

    // Entities that passed the property/claim filter (named for the P31=Q5 use case)
    persons: usize,

}
|
|
|
impl ParseStats { |
|
fn new() -> ParseStats { |
|
ParseStats { |
|
pages: 0, |
|
matches: 0, |
|
failures: 0, |
|
persons: 0, |
|
} |
|
} |
|
} |
|
|
|
/// Shape of one matching entity emitted to stdout by parse_str
/// (one JSON document per line).
#[derive(Clone, Debug, Deserialize, Serialize)]

struct OutputValue<'a> {

    // Borrowed from the WikiEntity being emitted
    id: &'a str,

    // English label (or "" when none exists)
    name: &'a str,

    claims: Vec<Claim>

}
|
|
|
// Command-line options; field doc comments double as clap help text.
#[derive(Clap, Debug)]

#[clap( version = "0.2", about = "A rust-based parser for wikibase dumps")]

struct Opts {

    /// Optional single json file to load instead of stdin
    #[clap(short, long)]

    json_file: Option<String>,



    /// Property filter, eg. P31 for "instance of"
    #[clap(long)]

    filter_property: Option<String>,



    /// Right hand of property predicate. --filter-claim="Q5" combined with --property="P31"
    /// to select all humans
    #[clap(short, long)]

    filter_claim: Option<String>,



    /// What property to retrieve
    #[clap(short, long)]

    select_properties: Option<Vec<String>>,



    /// Specialized selector; clean this up
    #[clap(long)]

    select_geo_property: Option<String>,



}
|
|
|
impl WikiEntity { |
|
|
|
/// Try to find an english label for this entry |
|
fn get_label(self: &Self) -> &str { |
|
for l in vec!["en", "en-gb", "en-us", "en-ca"] { |
|
if let Some(trans) = self.labels.get(l) { |
|
return &trans.value; |
|
} |
|
} |
|
"" |
|
} |
|
|
|
} |
|
fn apply_filter(we: WikiEntity, prop: &str, claim: &str) -> Option<WikiEntity> { |
|
|
|
if let Some(claims) = we.claims.get(prop) { |
|
let filtered_claims = claims.into_iter().filter(|c| match &c.mainsnak { |
|
Snak::WikibaseItem { snak, ..} => snak.property == prop, |
|
_ => false |
|
}).cloned().collect::<Vec<Claim>>(); |
|
|
|
let matches = filtered_claims.into_iter().filter(|c| match &c.mainsnak { |
|
Snak::WikibaseItem { snak, .. } => match &snak.datavalue { |
|
SnakDataValue::WikibaseEntityid { id, .. } => id == claim, |
|
_ => false |
|
}, |
|
_ => false |
|
}).collect::<Vec<Claim>>(); |
|
|
|
if matches.len() > 0 { |
|
return Some(we); |
|
} |
|
} |
|
None |
|
} |
|
|
|
fn parse_str(line: &str, ps: &mut ParseStats, opts: &Opts) { |
|
match serde_json::from_str::<WikiEntity>(line) { |
|
Ok(wiki_item) => { |
|
|
|
if let Some(we) = apply_filter(wiki_item, &opts.filter_property.as_ref().unwrap(), &opts.filter_claim.as_ref().unwrap()) { |
|
ps.persons += 1; |
|
let matches = opts.select_properties.as_ref().unwrap().into_iter() |
|
.filter_map(|prop| we.claims.get(prop)) |
|
.cloned() |
|
.flatten() |
|
.collect::<Vec<Claim>>(); |
|
|
|
// Don't return any matches where a property is missing |
|
if matches.len() >= opts.select_properties.as_ref().unwrap().len() { |
|
ps.matches += 1; |
|
let out = OutputValue { |
|
id: &we.id.to_owned(), |
|
name: we.get_label(), |
|
claims: matches |
|
}; |
|
println!("{}", serde_json::to_string(&out).unwrap()); |
|
} |
|
} |
|
ps.pages += 1; |
|
}, |
|
Err(e) => { |
|
eprintln!("Error: {:?}", &e); |
|
ps.failures += 1; |
|
} |
|
} |
|
} |
|
|
|
/// Shape of one entity emitted to stdout by parse_geo_only
/// (one JSON document per line).
#[derive(Clone, Debug, Deserialize, Serialize)]

struct OutputCoords<'a> {

    // Borrowed from the WikiEntity being emitted
    id: &'a str,

    // English label (or "" when none exists)
    name: &'a str,

    latitude: f32,

    longitude: f32

}
|
|
|
/// Quick and dirty extractor just for Geo-coordinates |
|
fn parse_geo_only(line: &str, ps: &mut ParseStats, opts: &Opts) { |
|
if let Ok(wiki_item) = serde_json::from_str::<WikiEntity>(line) { |
|
ps.pages += 1; |
|
if let Some(claims) = wiki_item.claims.get(opts.select_geo_property.as_ref().unwrap()) { |
|
if let Snak::GlobeCoordinate { snak, .. } = &claims.first().unwrap().mainsnak { |
|
if let SnakDataValue::Globecoordinate { latitude, longitude, .. } = snak.datavalue { |
|
let out = OutputCoords { |
|
id: &wiki_item.id.to_owned(), |
|
name: wiki_item.get_label(), |
|
latitude: latitude.to_owned(), |
|
longitude: longitude.to_owned() |
|
}; |
|
println!("{}", serde_json::to_string(&out).unwrap()); |
|
} |
|
} |
|
} |
|
} |
|
} |
|
|
|
fn main() { |
|
let opts: Opts = Opts::parse(); |
|
eprintln!("{:?}", &opts); |
|
let mut ps = ParseStats::new(); |
|
let stdin = io::stdin(); |
|
if let Some(f) = &opts.json_file { |
|
let s = fs::read_to_string(f).unwrap(); |
|
parse_str(&s, &mut ps, &opts); |
|
} else { |
|
let start = std::time::Instant::now(); |
|
for l in stdin.lock().lines() { |
|
let mut line = l.unwrap(); |
|
// The string will have a trailing "," we need to strip off before trying to deser |
|
line.truncate(line.len()-1); |
|
if line.len() == 0 { |
|
continue |
|
} |
|
if opts.select_geo_property.is_some() { |
|
parse_geo_only(&line, &mut ps, &opts); |
|
} else { |
|
parse_str(&line, &mut ps, &opts); |
|
} |
|
if ps.pages % 50000 == 0 { |
|
let elapsed = start.elapsed().as_secs_f64(); |
|
eprintln!("{:.1} pg/s, {:?}", ps.pages as f64 / elapsed, ps); |
|
} |
|
} |
|
} |
|
} |