/* This Rust code scans through the Common Crawl, looking for text that's
 * not English. I suspect I may learn much later that it's terrible,
 * unidiomatic Rust, but it would take me months to learn what good Rust is.
 *
 * We depend on some external libraries:
 *
 * - html5ever: an HTML parser (we only use its low-level tokenizer)
 * - encoding: handles text in all the encodings that WHATWG recognizes
 * - string_cache: interns a bunch of frequently-used strings, like tag
 *   names -- necessary to use the html5ever tokenizer
 * - cld2: our favorite language detector
 */
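/* Usage sketch (an assumption about how this is meant to be run; the binary
 * name here is hypothetical). The program reads an uncompressed WARC stream
 * on stdin and prints "language<TAB>text" for each reliably-detected
 * non-English section:
 *
 *     zcat CC-MAIN-example.warc.gz | ./commoncrawl-language
 */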
extern crate html5ever;
extern crate encoding;
#[macro_use(atom)]
extern crate string_cache;
extern crate cld2;
use std::io;
use std::io::prelude::*;
use std::str;
use html5ever::tokenizer::{Tokenizer, TokenSink, Token, TagToken, CharacterTokens, StartTag, EndTag};
use html5ever::tendril::{Tendril, fmt};
use encoding::{Encoding, EncodingRef, DecoderTrap};
use encoding::label::encoding_from_whatwg_label;
use encoding::all::UTF_8;
use cld2::{detect_language_ext, Format, Reliable, Lang, Hints};
/* The following code is from http://stackoverflow.com/a/34978794/773754.
 * This clever code adds an as_deref() method to Option values, which allows
 * converting an Option<String> into an Option<&str>.
 */
use std::ops::Deref;
trait OptionDeref<T: Deref> {
    fn as_deref(&self) -> Option<&T::Target>;
}
impl<T: Deref> OptionDeref<T> for Option<T> {
    fn as_deref(&self) -> Option<&T::Target> {
        self.as_ref().map(Deref::deref)
    }
}
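/* A small illustration of what this gives us:
 *
 *     let hint: Option<String> = Some("fr".to_string());
 *     let hint_ref: Option<&str> = hint.as_deref();  // Some("fr")
 *
 * (Later versions of Rust added an equivalent Option::as_deref to the
 * standard library, so on a new enough compiler this trait is unnecessary.)
 */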
#[derive(PartialEq, Debug)]
enum ParserState {
    WARC,
    HTTP,
    HTML,
}
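/* These states track where we are within each record of a WARC file, which
 * (for the response records we care about) looks roughly like this:
 *
 *     WARC/1.0
 *     WARC-Type: response
 *     ...more WARC headers...
 *                                          <- blank line
 *     HTTP/1.1 200 OK
 *     Content-Type: text/html; charset=utf-8
 *     ...more HTTP headers...
 *                                          <- blank line
 *     <html>...the payload we tokenize...</html>
 */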
/* The HTML tokenizer works in a streaming way: you hand it a TokenSink that
 * you implement, and it calls the sink back with each token. Our TokenSink
 * is called TokenHandler, and here's what its state looks like.
 */
struct TokenHandler {
    // Visible text accumulated from the current document or language section.
    text: String,
    // The character encoding we currently believe the document uses.
    encoding: EncodingRef,
    // False while we're inside a <script> or <style> tag, whose contents we skip.
    active: bool,
    // The most recent 'lang' attribute or Content-Language value, if any.
    language_hint: Option<String>,
    // See TokenHandler::new() for how this counter works.
    language_depth: i64,
}
/* Methods of the TokenHandler that aren't part of the HTML parser. */
impl TokenHandler {
    fn new() -> TokenHandler {
        TokenHandler {
            text: String::with_capacity(65536),
            encoding: UTF_8,
            active: true,
            language_hint: None,
            // The language_depth is a counter that keeps track of our depth
            // in the tag stack since we've seen a 'lang' attribute. When it
            // reaches 0, it drops that 'lang' value.
            //
            // This makes us forgetful when 'lang' attributes are nested,
            // but that seems rare.
            //
            // When we haven't seen a 'lang' attribute, the counter is set to
            // a high number, so it never hits 0.
            language_depth: 1000
        }
    }
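    /* A worked trace of the counter, derived from process_token below: given
     * <div lang="fr"> <p> Bonjour </p> </div>, the <div> sets language_depth
     * to 0 and then increments it to 1; <p> makes it 2; </p> brings it back
     * to 1; and </div> brings it to 0, which sends "Bonjour" (give or take
     * whitespace) off to language detection with the hint "fr".
     */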
    /* We've started a new document, so language-detect and possibly output
     * the text we've accumulated, then reset all the attributes.
     */
    fn new_document(&mut self) {
        if !self.text.is_empty() {
            handle_language(self.text.clone(), self.language_hint.clone());
        }
        self.text.clear();
        self.encoding = UTF_8;
        self.active = true;
        self.language_hint = None;
        self.language_depth = 1000;
    }
    /* We closed a tag with a 'lang' attribute. Send the text on to language
     * detection, but keep unrelated state such as the encoding.
     */
    fn end_language_section(&mut self) {
        if !self.text.is_empty() {
            handle_language(self.text.clone(), self.language_hint.clone());
        }
        self.text.clear();
        self.language_hint = None;
        self.language_depth = 1000;
    }
}
/* The part of the TokenHandler that makes it a TokenSink, which is the
 * process_token method.
 */
impl TokenSink for TokenHandler {
    fn process_token(&mut self, token: Token) {
        match token {
            TagToken(tag) => {
                match tag.kind {
                    StartTag => {
                        // We've received an HTML opening tag.
                        match tag.name {
                            // If it's a <script> or <style> tag, start disregarding content.
                            atom!("script") | atom!("style") => {
                                self.active = false;
                            },
                            // If it's a <meta> tag, look for a charset or http-equiv attribute.
                            atom!("meta") => {
                                let mut content_type = false;
                                for attr in &tag.attrs {
                                    if attr.name.local == atom!("charset") {
                                        if let Some(new_encoding) = encoding_from_whatwg_label(&attr.value) {
                                            self.encoding = new_encoding;
                                        }
                                    }
                                    if attr.name.local == atom!("http-equiv") && attr.value.to_lowercase() == "content-type" {
                                        content_type = true;
                                    }
                                }
                                if content_type {
                                    for attr in &tag.attrs {
                                        if attr.name.local == atom!("content") {
                                            if let Some(new_encoding) = content_type_to_encoding(&attr.value) {
                                                self.encoding = new_encoding;
                                            }
                                        }
                                    }
                                }
                            }
                            // Other start tags do nothing in particular.
                            _ => {}
                        };
                        // Now look for a 'lang' attribute on the tag. If it has one, and the tag
                        // doesn't close immediately, start a new section of text that we believe
                        // to be in that language.
                        for attr in tag.attrs {
                            if attr.name.local == atom!("lang") {
                                self.end_language_section();
                                self.language_hint = Some(attr.value.to_string());
                                if !tag.self_closing {
                                    self.language_depth = 0;
                                }
                            }
                        }
                        if tag.self_closing {
                            // This could be a <br> tag or something -- it's a token boundary.
                            self.text.push(' ');
                        }
                        else {
                            // Increase our language_depth based on the fact that we saw a start tag --
                            // this is how we will keep track of where a 'lang' attribute ends.
                            self.language_depth += 1;
                        }
                    },
}, | |
EndTag => { | |
// We've received an HTML closing tag. | |
match tag.name { | |
// If a <script> or <style> tag ended, stop ignoring content. | |
atom!("script") | atom!("style") => { | |
self.active = true; | |
}, | |
// Otherwise, the only important thing is that it's a token boundary. | |
_ => { | |
self.text.push(' '); | |
} | |
}; | |
// Decrease our language_depth, and end the language section if it reaches | |
// 0. | |
self.language_depth -= 1; | |
if self.language_depth == 0 { | |
self.end_language_section(); | |
} | |
} | |
}; | |
if tag.self_closing { self.text.push(' '); } | |
}, | |
            CharacterTokens(tendril) => {
                // We've received actual text. It's in the form of a Tendril, which is
                // basically as frightening as it sounds, but a UTF-8 Tendril derefs to
                // &str, so we can push it straight onto the text.
                if self.active {
                    self.text.push_str(&tendril);
                }
            },
            _ => {}
        }
    }
}
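/* A worked example of the sink in action (hypothetical input): feeding the
 * fragment <p lang="eo">Saluton <b>mondo</b></p> produces a start tag for
 * <p> (hint "eo", depth 1), characters "Saluton ", a start tag for <b>
 * (depth 2), characters "mondo", an end tag for </b> (depth 1), and an end
 * tag for </p> (depth 0), at which point "Saluton mondo" (give or take
 * whitespace) goes to handle_language with the hint "eo".
 */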
/* Language-detect the given text, and output it if it's non-English. */
fn handle_language(text: String, language_hint: Option<String>) {
    let hint_ref: Option<&str> = language_hint.as_deref();
    let hints = Hints {
        content_language: hint_ref,
        .. Default::default()
    };
    let detection_result = detect_language_ext(&text, Format::Text, &hints);
    if detection_result.reliability == Reliable {
        if let Some(Lang(language)) = detection_result.language {
            if language != "en" {
                println!("{}\t{}", language, text.replace("\r", "").replace("\n", " "))
            }
        }
    }
}
/* Extract an encoding from a Content-Type value such as
 * "text/html; charset=utf-8". */
fn content_type_to_encoding(content_type: &str) -> Option<EncodingRef> {
    let split1: Vec<&str> = content_type.split("charset=").collect();
    if split1.len() < 2 { return None }
    // The charset value ends at a space or a semicolon, whichever comes first.
    let split2: Vec<&str> = split1[1].split(|c: char| c == ' ' || c == ';').collect();
    let encoding_name: &str = split2[0];
    encoding_from_whatwg_label(encoding_name)
}
fn content_type_bytes_to_encoding(content_type_bytes: &[u8]) -> Option<EncodingRef> {
    match str::from_utf8(content_type_bytes) {
        Ok(content_type) => content_type_to_encoding(content_type.trim()),
        Err(_) => None
    }
}
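/* A minimal sanity check for the header parsing above; run with `cargo test`.
 * It assumes only that "utf-8" and "koi8-r" are labels the WHATWG encoding
 * standard recognizes.
 */
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn finds_charset_in_content_type() {
        // Plain and parameter-trailing charsets should both parse.
        assert!(content_type_to_encoding("text/html; charset=utf-8").is_some());
        assert!(content_type_to_encoding("text/html; charset=utf-8; foo=bar").is_some());
        // No charset parameter means no encoding.
        assert!(content_type_to_encoding("text/html").is_none());
        // The byte-level wrapper should also tolerate a trailing \r.
        assert!(content_type_bytes_to_encoding(b"text/html; charset=koi8-r\r").is_some());
    }
}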
fn main() {
    let stdin = io::stdin();
    let mut state = ParserState::WARC;
    let mut tokenizer = Tokenizer::new(TokenHandler::new(), Default::default());
    // Read stdin as raw bytes and split on newlines ourselves, because the
    // HTML may be in an encoding other than UTF-8.
    for line_bytes_opt in stdin.lock().split(b'\n') {
        let bline: Vec<u8> = line_bytes_opt.unwrap();
        // Each WARC record begins with a literal "WARC/1.0" line. (The
        // trailing \r is there because lines end with \r\n and we split
        // on \n.)
        if bline == b"WARC/1.0\r" {
            state = ParserState::WARC;
            tokenizer.sink_mut().new_document();
        }
        else if bline == b"\r" {
            // A blank line ends the HTTP headers, transitioning
            // to HTML.
            if state == ParserState::HTTP {
                state = ParserState::HTML;
            }
        }
        else if state == ParserState::WARC && bline.starts_with(b"HTTP/") {
            state = ParserState::HTTP;
        }
        else if state == ParserState::HTML {
            // Decode the line with whatever encoding we currently believe in,
            // and skip lines that don't decode.
            if let Ok(sline) = tokenizer.sink().encoding.decode(&bline, DecoderTrap::Strict) {
                let tend: Tendril<fmt::UTF8> = Tendril::from_slice(sline.trim());
                tokenizer.feed(tend);
            }
        }
        else if state == ParserState::HTTP {
            // Note that these header matches are case-sensitive: we only
            // catch the canonical capitalization.
            if bline.starts_with(b"Content-Type:") {
                let content_type_bytes = &bline[13..];
                if let Some(new_encoding) = content_type_bytes_to_encoding(content_type_bytes) {
                    tokenizer.sink_mut().encoding = new_encoding;
                }
            }
            else if bline.starts_with(b"Content-Language:") {
                let content_lang_bytes = &bline[17..];
                if let Ok(content_lang) = str::from_utf8(content_lang_bytes) {
                    tokenizer.sink_mut().language_hint = Some(content_lang.trim().to_string());
                }
            }
        }
    }
}