Skip to content

Instantly share code, notes, and snippets.

Created March 8, 2017 12:55
Show Gist options
  • Star 4 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save anonymous/167c2912fe0746f4b05b1fec51ae0f14 to your computer and use it in GitHub Desktop.
Save anonymous/167c2912fe0746f4b05b1fec51ae0f14 to your computer and use it in GitHub Desktop.
HK text extractor
[package]
name = "hk"
version = "0.0.1"
[dependencies]
byteorder = "1.0"
xml-rs = "0.4"
// Save this file as src/main.rs inside the directory that contains Cargo.toml.
extern crate byteorder;
extern crate xml;
use std::env;
use std::fs::{self, File};
use std::io::{self, BufReader, BufRead, BufWriter, Read, ErrorKind, Write};
use std::mem;
use std::path::Path;
use byteorder::{LittleEndian, ReadBytesExt};
use xml::reader::{ParserConfig, XmlEvent};
/// Command-line entry point: `hk <source> <dest>`.
///
/// Reads `<source>` as a sequence of records. Each record is a little-endian
/// `u32` length followed by that many name bytes, a (possibly empty) run of
/// zero bytes, a second length-prefixed data block and another run of zero
/// bytes. Every complete record is handed to `process_item` together with
/// `<dest>`, which is created up front if it does not exist yet.
///
/// # Panics
///
/// Panics when the number of arguments is not exactly two, when `<dest>`
/// cannot be created, when `<source>` cannot be opened, or when a block is
/// shorter than its declared length.
fn main() {
    // Look at a third argument as well, so that surplus arguments are
    // rejected instead of being silently ignored.
    let mut args = env::args().skip(1);
    let (source_file, dest_dir) = match (args.next(), args.next(), args.next()) {
        (Some(source_file), Some(dest_dir), None) => (source_file, dest_dir),
        _ => panic!("Expected exactly two arguments: <source> <dest>"),
    };

    fs::create_dir_all(&dest_dir)
        .unwrap_or_else(|e| panic!("Failed to create directory {}: {}", dest_dir, e));
    let source_handle = File::open(&source_file)
        .unwrap_or_else(|e| panic!("Failed to open source file {}: {}", source_file, e));
    let mut source = BufReader::new(source_handle);

    loop {
        // Running out of input where a new record would start is the normal
        // way for the loop to finish.
        let name_length = match read_length(&mut source) {
            Some(length) => length,
            None => break,
        };
        let mut name_block = vec![0u8; name_length as usize];
        source.read_exact(&mut name_block).expect("Failed to read name block");
        // NOTE(review): skipping zero bytes here would also swallow a low
        // zero byte of the following little-endian length (e.g. a length of
        // 256) - confirm the format guarantees this cannot happen.
        skip_zeros(&mut source).expect("Failed to skip zeros after the name block");

        let data_length = match read_length(&mut source) {
            Some(length) => length,
            None => {
                // The record has a name but no data: report it rather than
                // dropping it without a trace.
                println!("Source ended after a name block, ignoring the incomplete record");
                break;
            }
        };
        let mut data_block = vec![0u8; data_length as usize];
        source.read_exact(&mut data_block).expect("Failed to read data block");
        skip_zeros(&mut source).expect("Failed to skip zeros after the data block");

        process_item(name_block, data_block, &dest_dir);
    }
}
/// Item-name prefixes that `process_item` skips: any item whose name starts
/// with one of these strings is ignored without producing output.
// NOTE(review): "ES", "FR" and "DE" look like language codes - confirm.
const INVALID_PREFIXES: &'static [&'static str] = &[
    "ES",
    "FR",
    "DE",
    "Word Count",
    "_",
    "LineBreaking",
];
/// Handles one record: decodes its name, parses its data block into entries
/// and stores them sorted by entry name.
///
/// The name is decoded lossily as UTF-8. Items whose name starts with one of
/// `INVALID_PREFIXES` are skipped silently. The remaining names are split at
/// the first `_` into a prefix (used as the sub-directory) and the rest (used
/// as the file name); a name without `_` is reported and skipped.
fn process_item(name_block: Vec<u8>, data_block: Vec<u8>, dest_dir: &str) {
    let decoded = String::from_utf8_lossy(&name_block);
    let full_name: &str = &decoded;

    let is_ignored = INVALID_PREFIXES.iter().any(|&p| full_name.starts_with(p));
    if is_ignored {
        return;
    }

    // Everything before the first underscore is the prefix, everything after
    // it is the item name.
    let (prefix, name) = match full_name.find('_') {
        Some(at) => (&full_name[..at], &full_name[at + 1..]),
        None => {
            println!("Strange name: {}", full_name);
            return;
        }
    };

    let mut entries = parse_entries(data_block);
    entries.sort_by(|left, right| left.0.cmp(&right.0));
    println!("{} in {}: {} entries", name, prefix, entries.len());
    store_entries(dest_dir, prefix, name, entries);
}
/// Writes `entries` to `<dest_dir>/<subdir_name>/<file_name>.txt`, creating
/// the directory if necessary and replacing any existing file.
///
/// Each `(name, text)` entry is rendered by `write_entry`. The buffered
/// writer is flushed explicitly at the end, because a flush performed while
/// the writer is dropped would discard any write error.
///
/// # Panics
///
/// Panics when the directory or the file cannot be created, or when writing
/// or flushing fails.
fn store_entries(dest_dir: &str, subdir_name: &str, file_name: &str,
                 entries: Vec<(String, String)>) {
    let dir_path = Path::new(dest_dir).join(subdir_name);
    fs::create_dir_all(&dir_path).unwrap_or_else(|e| {
        panic!("Failed to create directory {}: {}", dir_path.display(), e)
    });

    // NOTE(review): `file_name` is joined as-is; a name containing path
    // separators would end up outside `dir_path` - confirm inputs are trusted.
    let file_path = dir_path.join(file_name.to_owned() + ".txt");
    let file = File::create(&file_path).unwrap_or_else(|e| {
        panic!("Failed to create file {}: {}", file_path.display(), e)
    });
    let mut out = BufWriter::new(file);

    for (entry_name, entry_text) in entries {
        write_entry(&mut out, &entry_name, &entry_text).unwrap_or_else(|e| {
            panic!("failed to write an entry to file {}: {}", file_path.display(), e)
        });
    }

    out.flush().unwrap_or_else(|e| {
        panic!("failed to flush file {}: {}", file_path.display(), e)
    });
}

/// Writes a single entry: an 80-dash rule, the entry name followed by a colon
/// and a blank line, then the text with `<br>` turned into a newline and
/// `<page>` into a blank line. The text is always terminated by a newline.
fn write_entry<W: Write>(out: &mut W, entry_name: &str, entry_text: &str) -> io::Result<()> {
    for _ in 0..8 {
        out.write_all(b"----------")?;
    }
    out.write_all(b"\n")?;
    out.write_all(entry_name.as_bytes())?;
    out.write_all(b":\n\n")?;

    let rendered = entry_text.replace("<br>", "\n").replace("<page>", "\n\n");
    out.write_all(rendered.as_bytes())?;
    if !rendered.ends_with("\n") {
        out.write_all(b"\n")?;
    }
    Ok(())
}
/// Parses an XML data block into `(entry name, entry text)` pairs.
///
/// For every `entry` element the value of its `name` attribute is paired with
/// the first run of character data that follows the start tag. An entry that
/// is closed before any text was seen is recorded with an empty text, so that
/// it is neither lost nor paired with text that follows its end tag.
///
/// Entries without a `name` attribute are reported and produce no pair. If
/// the XML cannot be read, the error is reported and the pairs collected so
/// far are returned.
fn parse_entries(data_block: Vec<u8>) -> Vec<(String, String)> {
    let mut result = Vec::new();
    let parser = ParserConfig::new()
        .whitespace_to_characters(true)
        .coalesce_characters(true)
        .cdata_to_characters(true)
        .create_reader(&*data_block);

    // Name of the entry whose text is still awaited; empty when there is none.
    let mut entry_name = String::new();

    for event in parser {
        let event = match event {
            Ok(event) => event,
            Err(e) => {
                println!("Failed to read an XML document: {}", e);
                return result;
            }
        };
        match event {
            XmlEvent::StartElement { name, attributes, .. } => {
                if name.local_name == "entry" {
                    match attributes.into_iter().find(|attr| attr.name.local_name == "name") {
                        Some(attr) => entry_name = attr.value,
                        None => println!("Failed to determine entry name, attribute is missing"),
                    }
                }
            },
            XmlEvent::Characters(text) => {
                if !entry_name.is_empty() {
                    result.push((mem::replace(&mut entry_name, String::new()), text));
                }
            },
            XmlEvent::EndElement { name } => {
                // A name that is still pending here belongs to an entry
                // without text. Settle it now; otherwise whitespace after the
                // end tag would be taken as its text, or the next entry would
                // overwrite the name.
                if name.local_name == "entry" && !entry_name.is_empty() {
                    result.push((mem::replace(&mut entry_name, String::new()), String::new()));
                }
            },
            _ => {}
        }
    }
    result
}
/// Consumes the run of zero bytes at the current position of `r` and returns
/// how many bytes were skipped.
///
/// Reading stops in front of the first non-zero byte, which is left in the
/// reader, or at the end of input. Reads that fail with
/// `ErrorKind::Interrupted` are retried; any other error is returned.
fn skip_zeros<R: BufRead>(r: &mut R) -> io::Result<usize> {
    let mut skipped = 0;
    loop {
        // Count the leading zeros of the buffered data and note whether a
        // non-zero byte follows them within the same buffer.
        let (zeros, hit_nonzero) = match r.fill_buf() {
            Ok(buffered) => {
                let zeros = buffered.iter().take_while(|&&byte| byte == 0).count();
                (zeros, zeros < buffered.len())
            }
            Err(ref e) if e.kind() == ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        };
        r.consume(zeros);
        skipped += zeros;
        // Nothing consumed without a non-zero byte means the input is exhausted.
        if hit_nonzero || zeros == 0 {
            return Ok(skipped);
        }
    }
}
fn read_length<R: Read>(r: &mut R) -> Option<u32> {
match r.read_u32::<LittleEndian>() {
Ok(length) => Some(length),
Err(ref e) if e.kind() == ErrorKind::UnexpectedEof => None,
Err(e) => panic!("Failed to read length: {}", e),
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment