rspeer/commoncrawl.rs

## commoncrawl.rs
/* This Rust code scans through the Common Crawl, looking for text that's
 * not English. I suspect I may learn much later that it's terrible,
 * unidiomatic Rust, but it would take me months to learn what good Rust is.
 *
 * We depend on some external libraries:
 *
 *   - html5ever: an HTML parser (we only use its low-level tokenizer)
 *   - encoding: handles text in all the encodings that WHATWG recognizes
 *   - string_cache: interns a bunch of frequently-used strings, like tag names -- necessary to use
 *     the html5ever tokenizer
 *   - cld2: our favorite language detector
 */

extern crate html5ever;
extern crate encoding;
#[macro_use(atom)]
extern crate string_cache;
extern crate cld2;

use std::io;
use std::io::prelude::*;
use std::default::Default;
use std::string::String;
use std::str;
use html5ever::tokenizer::{Tokenizer, TokenSink, Token, TagToken, CharacterTokens, StartTag, EndTag};
use html5ever::tendril::{Tendril, fmt};
use encoding::{Encoding, EncodingRef, DecoderTrap};
use encoding::label::encoding_from_whatwg_label;
use encoding::all::{UTF_8};
use cld2::{detect_language_ext, Format, Reliable, Lang, Hints};


/* The following code is from http://stackoverflow.com/a/34978794/773754.
 * This clever code adds an as_deref() method to Option objects, which allows converting an
 * Option<String> into an Option<&str>.
 */

use std::ops::Deref;

/// Extension trait that gives `Option<T>` an `as_deref()` method, borrowing the
/// wrapped value through its `Deref` impl (e.g. `Option<String>` -> `Option<&str>`).
trait OptionDeref<T>
    where T: Deref
{
    /// Returns `Some` of the dereferenced contents, or `None` when `self` is `None`.
    fn as_deref(&self) -> Option<&T::Target>;
}

impl<T> OptionDeref<T> for Option<T>
    where T: Deref
{
    fn as_deref(&self) -> Option<&T::Target> {
        match *self {
            // Borrow the payload in place and hand back what it derefs to.
            Some(ref inner) => Some(T::deref(inner)),
            None => None,
        }
    }
}


// Which part of a WARC record the line-by-line reader in `main` is currently in.
#[derive(PartialEq, Debug)]
enum ParserState {
    // Reading WARC record headers. This is the initial state, and it is
    // re-entered whenever a "WARC/1.0" line is seen.
    WARC,
    // Reading HTTP response headers. Entered when a line starting with "HTTP/"
    // is seen while in the WARC state.
    HTTP,
    // Reading the response body, whose lines are fed to the HTML tokenizer.
    // Entered at the blank line that follows the HTTP headers.
    HTML,
}

/* The HTML tokenizer works in a streaming way by taking in a reference to a
 * TokenSink that you implement. Our TokenSink is called the TokenHandler, and
 * here's what its state looks like.
 */

struct TokenHandler {
    // Text collected from character tokens since the last hand-off to
    // language detection.
    text: String,
    // Encoding that `main` uses to decode body lines. Starts as UTF-8 and is
    // replaced when a Content-Type header or a <meta> tag names a known charset.
    encoding: EncodingRef,
    // False while inside a <script> or <style> element, so that their contents
    // are not appended to `text`.
    active: bool,
    // Language hint passed to the detector: set from a Content-Language header
    // or from the most recent 'lang' attribute, if any.
    language_hint: Option<String>,
    // Tag-nesting counter used to find the end tag that closes a 'lang'
    // section. It is parked at 1000 whenever no such section is open.
    language_depth: i64
}

/* Methods of the TokenHandler that aren't part of the HTML parser. */
impl TokenHandler {
    /// Creates a handler in its start-of-document state: empty text buffer,
    /// UTF-8 encoding, text collection enabled, and no language hint.
    fn new() -> TokenHandler {
        TokenHandler {
            // Pre-size the buffer so typical pages do not force repeated regrowth.
            text: String::with_capacity(65536),
            encoding: UTF_8,
            active: true,
            language_hint: None,

            // `language_depth` counts how deep we are in the tag stack relative
            // to the tag that carried the current 'lang' attribute. Hitting 0
            // means that tag was closed, and the 'lang' value is dropped.
            //
            // Nested 'lang' attributes are therefore forgotten early, but that
            // seems rare.
            //
            // While no 'lang' attribute is in effect, the counter sits at a
            // large value so that it never reaches 0.
            language_depth: 1000
        }
    }

    /// Called when a new document begins: language-detects (and possibly
    /// prints) whatever text is pending, then resets every field to its
    /// start-of-document value.
    fn new_document(&mut self) {
        // Flushing the text and clearing the language state is exactly what
        // closing a language section does; only the encoding and the
        // script/style flag need resetting on top of that.
        self.end_language_section();
        self.encoding = UTF_8;
        self.active = true;
    }

    /// Called when a tag carrying a 'lang' attribute is closed (or superseded):
    /// sends the pending text to language detection and clears the language
    /// state, while leaving the encoding and the script/style flag untouched.
    fn end_language_section(&mut self) {
        if !self.text.is_empty() {
            let pending_text = self.text.clone();
            let hint = self.language_hint.clone();
            handle_language(pending_text, hint);
        }

        self.text.clear();
        self.language_hint = None;
        self.language_depth = 1000;
    }
}

/* The part of the TokenHandler that makes it a TokenSink, which is the
 * process_token method.
 */
impl TokenSink for TokenHandler {
    /// Consumes one token from the HTML tokenizer.
    ///
    /// * Start tags: `<script>`/`<style>` switch text collection off, `<meta>`
    ///   may change the encoding, and a 'lang' attribute on a tag that is not
    ///   self-closing opens a new language section.
    /// * End tags: `</script>`/`</style>` switch text collection back on, any
    ///   other end tag adds a space as a word boundary, and the language
    ///   section is closed once its opening tag has been matched.
    /// * Character tokens are appended to `text` while collection is on.
    /// * Every other token is ignored.
    fn process_token(&mut self, token: Token) {
        match token {
            TagToken(tag) => {
                match tag.kind {
                    StartTag => {
                        // We've received an HTML opening tag.
                        match tag.name {
                            // If it's a <script> or <style> tag, start disregarding content.
                            atom!("script") | atom!("style") => {
                                self.active = false;
                            },

                            // If it's a <meta> tag, look for a charset or http-equiv attribute.
                            atom!("meta") => {
                                let mut content_type = false;
                                for attr in &tag.attrs {
                                    if attr.name.local == atom!("charset") {
                                        if let Some(new_encoding) = encoding_from_whatwg_label(&attr.value) {
                                            self.encoding = new_encoding;
                                        }
                                    }
                                    if attr.name.local == atom!("http-equiv")
                                        && attr.value.to_lowercase() == "content-type"
                                    {
                                        content_type = true;
                                    }
                                }
                                // The 'content' attribute only describes the encoding when the
                                // tag is an http-equiv="content-type" declaration, and the two
                                // attributes may appear in either order -- hence the second pass.
                                if content_type {
                                    for attr in &tag.attrs {
                                        if attr.name.local == atom!("content") {
                                            if let Some(new_encoding) = content_type_to_encoding(&attr.value) {
                                                self.encoding = new_encoding;
                                            }
                                        }
                                    }
                                }
                            }

                            // Other start tags do nothing in particular.
                            _ => {}
                        };

                        if tag.self_closing {
                            // This could be a <br/> tag or something -- it's a token boundary.
                            // A 'lang' attribute here is deliberately ignored: the tag encloses
                            // no text, and since no end tag will ever arrive to close the
                            // section, honouring it would leak the hint onto everything that
                            // follows.
                            self.text.push(' ');
                        }
                        else {
                            // Look for a 'lang' attribute. If there is one, flush the text
                            // gathered so far and start a new section that we believe to be
                            // in that language.
                            for attr in &tag.attrs {
                                if attr.name.local == atom!("lang") {
                                    self.end_language_section();
                                    self.language_hint = Some(attr.value.to_string());
                                    self.language_depth = 0;
                                }
                            }

                            // Increase our language_depth based on the fact that we saw a start
                            // tag -- this is how we keep track of where a 'lang' attribute ends.
                            //
                            // NOTE(review): start tags that are never closed (e.g. <br> or <img>
                            // written without a trailing slash) also bump the counter, so a
                            // section containing them is not closed by its own end tag.
                            self.language_depth += 1;
                        }
                    },
                    EndTag => {
                        // We've received an HTML closing tag.
                        match tag.name {
                            // If a <script> or <style> tag ended, stop ignoring content.
                            atom!("script") | atom!("style") => {
                                self.active = true;
                            },
                            // Otherwise, the only important thing is that it's a token boundary.
                            _ => {
                                self.text.push(' ');
                            }
                        };

                        // Decrease our language_depth, and end the language section if it
                        // reaches 0.
                        self.language_depth -= 1;
                        if self.language_depth == 0 {
                            self.end_language_section();
                        }

                    }
                };
                // Kept from the original: any self-closing tag contributes one more space.
                if tag.self_closing { self.text.push(' '); }
            },
            CharacterTokens(tendril) => {
                // We've received actual text. A UTF-8 Tendril dereferences to &str, so it
                // can be appended directly without first copying it into a String.
                if self.active {
                    self.text.push_str(&tendril);
                }
            },
            _ => {}
        }
    }
}

/* Run language detection on `text`, passing `language_hint` (if any) to the
 * detector as a content-language hint. When the result is reliable and the
 * language is anything other than English, print one line to stdout:
 * the language code, a tab, and the text with carriage returns removed and
 * newlines turned into spaces.
 */
fn handle_language(text: String, language_hint: Option<String>) {
    let hints = Hints {
        content_language: language_hint.as_deref(),
        .. Default::default()
    };
    let detected = detect_language_ext(&text, Format::Text, &hints);

    // Unreliable guesses are dropped.
    if detected.reliability != Reliable {
        return;
    }

    if let Some(Lang(code)) = detected.language {
        if code != "en" {
            // Keep each record on a single output line.
            let single_line = text.replace("\r", "").replace("\n", " ");
            println!("{}\t{}", code, single_line)
        }
    }
}


/// Extracts the charset named in a Content-Type value (from an HTTP header or
/// a `<meta http-equiv>` tag) and resolves it to an encoding.
///
/// The `charset=` parameter is located case-insensitively. Its value ends at
/// the first whitespace character or `;`, and surrounding single or double
/// quotes are stripped, so `text/html; Charset="UTF-8"; foo` resolves the same
/// way as `text/html; charset=utf-8`.
///
/// Returns `None` when there is no `charset=` parameter or when the label is
/// not one that `encoding_from_whatwg_label` recognizes.
fn content_type_to_encoding(content_type: &str) -> Option<EncodingRef> {
    const MARKER: &'static str = "charset=";

    // Search a lowercased copy so that "Charset=" and "CHARSET=" are found too.
    // The label is then taken from that same copy, which avoids mapping byte
    // offsets back onto the original string. Lowercasing the label is assumed
    // harmless because WHATWG label lookup is case-insensitive.
    let lowered = content_type.to_lowercase();
    let value_start = match lowered.find(MARKER) {
        Some(pos) => pos + MARKER.len(),
        None => return None,
    };

    let label = lowered[value_start..]
        .split(|c: char| c == ';' || c.is_whitespace())
        .next()
        .unwrap_or("")
        .trim_matches(|c: char| c == '"' || c == '\'');

    encoding_from_whatwg_label(label)
}

/// Byte-slice front end for `content_type_to_encoding`: the bytes must be valid
/// UTF-8 (otherwise the result is `None`), and surrounding whitespace is
/// trimmed before the value is parsed.
fn content_type_bytes_to_encoding(content_type_bytes: &[u8]) -> Option<EncodingRef> {
    str::from_utf8(content_type_bytes)
        .ok()
        .and_then(|header_value| content_type_to_encoding(header_value.trim()))
}

/// Reads a WARC stream from stdin line by line and drives the HTML tokenizer.
///
/// Each line is interpreted according to a small state machine:
///
/// * `WARC/1.0` starts a new record: the previous document's text is flushed
///   and the state returns to WARC.
/// * In the WARC state, a line starting with `HTTP/` begins the HTTP headers.
/// * In the HTTP state, `Content-Type:` may set the encoding and
///   `Content-Language:` sets the language hint; a blank line begins the body.
/// * In the HTML state, each line is decoded with the current encoding and fed
///   to the tokenizer. Lines that fail to decode are skipped.
///
/// Panics if stdin cannot be read.
fn main() {
    const CONTENT_TYPE: &'static [u8] = b"Content-Type:";
    const CONTENT_LANGUAGE: &'static [u8] = b"Content-Language:";

    let stdin = io::stdin();
    let mut state = ParserState::WARC;
    // NOTE(review): one tokenizer is shared by every document in the stream, so
    // tokenizer state left over from a truncated page carries into the next one.
    let mut tokenizer = Tokenizer::new(TokenHandler::new(), Default::default());

    for line_bytes_opt in stdin.lock().split(b'\n') {
        let bline: Vec<u8> = line_bytes_opt.expect("failed to read a line from stdin");
        if bline == b"WARC/1.0\r" {
            state = ParserState::WARC;
            tokenizer.sink_mut().new_document();
        }
        else if bline == b"\r" {
            // A blank line ends the HTTP headers, transitioning
            // to HTML.
            if state == ParserState::HTTP {
                state = ParserState::HTML;
            }
        }
        else if state == ParserState::WARC && bline.starts_with(b"HTTP/") {
            state = ParserState::HTTP;
        }
        else if state == ParserState::HTML {
            // Best effort: a line that is not valid in the current encoding is dropped.
            let decoded = tokenizer.sink().encoding.decode(&bline, DecoderTrap::Strict);
            if let Ok(sline) = decoded {
                // Splitting on '\n' removed the line break, so put one back after
                // trimming. Without it the last word of this line would be glued to
                // the first word of the next, and a tag split across lines would have
                // its name merged with its first attribute.
                let mut line = String::with_capacity(sline.len() + 1);
                line.push_str(sline.trim());
                line.push('\n');
                let tend: Tendril<fmt::UTF8> = Tendril::from_slice(&line[..]);
                tokenizer.feed(tend);
            }
        }
        else if state == ParserState::HTTP {
            if bline.starts_with(CONTENT_TYPE) {
                let content_type_bytes = &bline[CONTENT_TYPE.len()..];
                if let Some(new_encoding) = content_type_bytes_to_encoding(content_type_bytes) {
                    tokenizer.sink_mut().encoding = new_encoding;
                }
            }
            else if bline.starts_with(CONTENT_LANGUAGE) {
                let content_lang_bytes = &bline[CONTENT_LANGUAGE.len()..];
                if let Ok(content_lang) = str::from_utf8(content_lang_bytes) {
                    tokenizer.sink_mut().language_hint = Some(content_lang.trim().to_string());
                }
            }
        }
    }

    // Text is normally flushed when the *next* record starts, so the final
    // document has to be flushed explicitly once the input is exhausted.
    tokenizer.sink_mut().new_document();
}
	/* This Rust code scans through the Common Crawl, looking for text that's
	* not English. I suspect I may learn much later that it's terrible,
	* unidiomatic Rust, but it would take me months to learn what good Rust is.
	*
	* We depend on some external libraries:
	*
	* - html5ever: an HTML parser (we only use its low-level tokenizer)
	* - encoding: handles text in all the encodings that WHATWG recognizes
	* - string_cache: interns a bunch of frequently-used strings, like tag names -- necessary to use
	* the html5ever tokenizer
	* - cld2: our favorite language detector
	*/

	extern crate html5ever;
	extern crate encoding;
	#[macro_use(atom)]
	extern crate string_cache;
	extern crate cld2;

	use std::io;
	use std::io::prelude::*;
	use std::default::Default;
	use std::string::String;
	use std::str;
	use html5ever::tokenizer::{Tokenizer, TokenSink, Token, TagToken, CharacterTokens, StartTag, EndTag};
	use html5ever::tendril::{Tendril, fmt};
	use encoding::{Encoding, EncodingRef, DecoderTrap};
	use encoding::label::encoding_from_whatwg_label;
	use encoding::all::{UTF_8};
	use cld2::{detect_language_ext, Format, Reliable, Lang, Hints};


	/* The following code is from http://stackoverflow.com/a/34978794/773754.
	* This clever code adds an as_deref() method to Option objects, which allows converting an
	* Option<String> into an Option<&str>.
	*/

	use std::ops::Deref;

	/// Extension trait that gives `Option<T>` an `as_deref()` method, borrowing the
	/// wrapped value through its `Deref` impl (e.g. `Option<String>` -> `Option<&str>`).
	trait OptionDeref<T>
	    where T: Deref
	{
	    /// Returns `Some` of the dereferenced contents, or `None` when `self` is `None`.
	    fn as_deref(&self) -> Option<&T::Target>;
	}

	impl<T> OptionDeref<T> for Option<T>
	    where T: Deref
	{
	    fn as_deref(&self) -> Option<&T::Target> {
	        match *self {
	            // Borrow the payload in place and hand back what it derefs to.
	            Some(ref inner) => Some(T::deref(inner)),
	            None => None,
	        }
	    }
	}


	// Which part of a WARC record the line-by-line reader in `main` is currently in.
	#[derive(PartialEq, Debug)]
	enum ParserState {
	// Reading WARC record headers. This is the initial state, and it is
	// re-entered whenever a "WARC/1.0" line is seen.
	WARC,
	// Reading HTTP response headers. Entered when a line starting with "HTTP/"
	// is seen while in the WARC state.
	HTTP,
	// Reading the response body, whose lines are fed to the HTML tokenizer.
	// Entered at the blank line that follows the HTTP headers.
	HTML,
	}

	/* The HTML tokenizer works in a streaming way by taking in a reference to a
	* TokenSink that you implement. Our TokenSink is called the TokenHandler, and
	* here's what its state looks like.
	*/

	struct TokenHandler {
	// Text collected from character tokens since the last hand-off to
	// language detection.
	text: String,
	// Encoding that `main` uses to decode body lines. Starts as UTF-8 and is
	// replaced when a Content-Type header or a <meta> tag names a known charset.
	encoding: EncodingRef,
	// False while inside a <script> or <style> element, so that their contents
	// are not appended to `text`.
	active: bool,
	// Language hint passed to the detector: set from a Content-Language header
	// or from the most recent 'lang' attribute, if any.
	language_hint: Option<String>,
	// Tag-nesting counter used to find the end tag that closes a 'lang'
	// section. It is parked at 1000 whenever no such section is open.
	language_depth: i64
	}

	/* Methods of the TokenHandler that aren't part of the HTML parser. */
	impl TokenHandler {
	    /// Creates a handler in its start-of-document state: empty text buffer,
	    /// UTF-8 encoding, text collection enabled, and no language hint.
	    fn new() -> TokenHandler {
	        TokenHandler {
	            // Pre-size the buffer so typical pages do not force repeated regrowth.
	            text: String::with_capacity(65536),
	            encoding: UTF_8,
	            active: true,
	            language_hint: None,

	            // `language_depth` counts how deep we are in the tag stack relative
	            // to the tag that carried the current 'lang' attribute. Hitting 0
	            // means that tag was closed, and the 'lang' value is dropped.
	            //
	            // Nested 'lang' attributes are therefore forgotten early, but that
	            // seems rare.
	            //
	            // While no 'lang' attribute is in effect, the counter sits at a
	            // large value so that it never reaches 0.
	            language_depth: 1000
	        }
	    }

	    /// Called when a new document begins: language-detects (and possibly
	    /// prints) whatever text is pending, then resets every field to its
	    /// start-of-document value.
	    fn new_document(&mut self) {
	        // Flushing the text and clearing the language state is exactly what
	        // closing a language section does; only the encoding and the
	        // script/style flag need resetting on top of that.
	        self.end_language_section();
	        self.encoding = UTF_8;
	        self.active = true;
	    }

	    /// Called when a tag carrying a 'lang' attribute is closed (or superseded):
	    /// sends the pending text to language detection and clears the language
	    /// state, while leaving the encoding and the script/style flag untouched.
	    fn end_language_section(&mut self) {
	        if !self.text.is_empty() {
	            let pending_text = self.text.clone();
	            let hint = self.language_hint.clone();
	            handle_language(pending_text, hint);
	        }

	        self.text.clear();
	        self.language_hint = None;
	        self.language_depth = 1000;
	    }
	}

	/* The part of the TokenHandler that makes it a TokenSink, which is the
	* process_token method.
	*/
	impl TokenSink for TokenHandler {
	    /// Consumes one token from the HTML tokenizer.
	    ///
	    /// * Start tags: `<script>`/`<style>` switch text collection off, `<meta>`
	    ///   may change the encoding, and a 'lang' attribute on a tag that is not
	    ///   self-closing opens a new language section.
	    /// * End tags: `</script>`/`</style>` switch text collection back on, any
	    ///   other end tag adds a space as a word boundary, and the language
	    ///   section is closed once its opening tag has been matched.
	    /// * Character tokens are appended to `text` while collection is on.
	    /// * Every other token is ignored.
	    fn process_token(&mut self, token: Token) {
	        match token {
	            TagToken(tag) => {
	                match tag.kind {
	                    StartTag => {
	                        // We've received an HTML opening tag.
	                        match tag.name {
	                            // If it's a <script> or <style> tag, start disregarding content.
	                            atom!("script") | atom!("style") => {
	                                self.active = false;
	                            },

	                            // If it's a <meta> tag, look for a charset or http-equiv attribute.
	                            atom!("meta") => {
	                                let mut content_type = false;
	                                for attr in &tag.attrs {
	                                    if attr.name.local == atom!("charset") {
	                                        if let Some(new_encoding) = encoding_from_whatwg_label(&attr.value) {
	                                            self.encoding = new_encoding;
	                                        }
	                                    }
	                                    if attr.name.local == atom!("http-equiv")
	                                        && attr.value.to_lowercase() == "content-type"
	                                    {
	                                        content_type = true;
	                                    }
	                                }
	                                // The 'content' attribute only describes the encoding when the
	                                // tag is an http-equiv="content-type" declaration, and the two
	                                // attributes may appear in either order -- hence the second pass.
	                                if content_type {
	                                    for attr in &tag.attrs {
	                                        if attr.name.local == atom!("content") {
	                                            if let Some(new_encoding) = content_type_to_encoding(&attr.value) {
	                                                self.encoding = new_encoding;
	                                            }
	                                        }
	                                    }
	                                }
	                            }

	                            // Other start tags do nothing in particular.
	                            _ => {}
	                        };

	                        if tag.self_closing {
	                            // This could be a <br/> tag or something -- it's a token boundary.
	                            // A 'lang' attribute here is deliberately ignored: the tag encloses
	                            // no text, and since no end tag will ever arrive to close the
	                            // section, honouring it would leak the hint onto everything that
	                            // follows.
	                            self.text.push(' ');
	                        }
	                        else {
	                            // Look for a 'lang' attribute. If there is one, flush the text
	                            // gathered so far and start a new section that we believe to be
	                            // in that language.
	                            for attr in &tag.attrs {
	                                if attr.name.local == atom!("lang") {
	                                    self.end_language_section();
	                                    self.language_hint = Some(attr.value.to_string());
	                                    self.language_depth = 0;
	                                }
	                            }

	                            // Increase our language_depth based on the fact that we saw a start
	                            // tag -- this is how we keep track of where a 'lang' attribute ends.
	                            //
	                            // NOTE(review): start tags that are never closed (e.g. <br> or <img>
	                            // written without a trailing slash) also bump the counter, so a
	                            // section containing them is not closed by its own end tag.
	                            self.language_depth += 1;
	                        }
	                    },
	                    EndTag => {
	                        // We've received an HTML closing tag.
	                        match tag.name {
	                            // If a <script> or <style> tag ended, stop ignoring content.
	                            atom!("script") | atom!("style") => {
	                                self.active = true;
	                            },
	                            // Otherwise, the only important thing is that it's a token boundary.
	                            _ => {
	                                self.text.push(' ');
	                            }
	                        };

	                        // Decrease our language_depth, and end the language section if it
	                        // reaches 0.
	                        self.language_depth -= 1;
	                        if self.language_depth == 0 {
	                            self.end_language_section();
	                        }

	                    }
	                };
	                // Kept from the original: any self-closing tag contributes one more space.
	                if tag.self_closing { self.text.push(' '); }
	            },
	            CharacterTokens(tendril) => {
	                // We've received actual text. A UTF-8 Tendril dereferences to &str, so it
	                // can be appended directly without first copying it into a String.
	                if self.active {
	                    self.text.push_str(&tendril);
	                }
	            },
	            _ => {}
	        }
	    }
	}

	/* Run language detection on `text`, passing `language_hint` (if any) to the
	 * detector as a content-language hint. When the result is reliable and the
	 * language is anything other than English, print one line to stdout:
	 * the language code, a tab, and the text with carriage returns removed and
	 * newlines turned into spaces.
	 */
	fn handle_language(text: String, language_hint: Option<String>) {
	    let hints = Hints {
	        content_language: language_hint.as_deref(),
	        .. Default::default()
	    };
	    let detected = detect_language_ext(&text, Format::Text, &hints);

	    // Unreliable guesses are dropped.
	    if detected.reliability != Reliable {
	        return;
	    }

	    if let Some(Lang(code)) = detected.language {
	        if code != "en" {
	            // Keep each record on a single output line.
	            let single_line = text.replace("\r", "").replace("\n", " ");
	            println!("{}\t{}", code, single_line)
	        }
	    }
	}


	/// Extracts the charset named in a Content-Type value (from an HTTP header or
	/// a `<meta http-equiv>` tag) and resolves it to an encoding.
	///
	/// The `charset=` parameter is located case-insensitively. Its value ends at
	/// the first whitespace character or `;`, and surrounding single or double
	/// quotes are stripped, so `text/html; Charset="UTF-8"; foo` resolves the same
	/// way as `text/html; charset=utf-8`.
	///
	/// Returns `None` when there is no `charset=` parameter or when the label is
	/// not one that `encoding_from_whatwg_label` recognizes.
	fn content_type_to_encoding(content_type: &str) -> Option<EncodingRef> {
	    const MARKER: &'static str = "charset=";

	    // Search a lowercased copy so that "Charset=" and "CHARSET=" are found too.
	    // The label is then taken from that same copy, which avoids mapping byte
	    // offsets back onto the original string. Lowercasing the label is assumed
	    // harmless because WHATWG label lookup is case-insensitive.
	    let lowered = content_type.to_lowercase();
	    let value_start = match lowered.find(MARKER) {
	        Some(pos) => pos + MARKER.len(),
	        None => return None,
	    };

	    let label = lowered[value_start..]
	        .split(|c: char| c == ';' || c.is_whitespace())
	        .next()
	        .unwrap_or("")
	        .trim_matches(|c: char| c == '"' || c == '\'');

	    encoding_from_whatwg_label(label)
	}

	/// Byte-slice front end for `content_type_to_encoding`: the bytes must be valid
	/// UTF-8 (otherwise the result is `None`), and surrounding whitespace is
	/// trimmed before the value is parsed.
	fn content_type_bytes_to_encoding(content_type_bytes: &[u8]) -> Option<EncodingRef> {
	    str::from_utf8(content_type_bytes)
	        .ok()
	        .and_then(|header_value| content_type_to_encoding(header_value.trim()))
	}

	/// Reads a WARC stream from stdin line by line and drives the HTML tokenizer.
	///
	/// Each line is interpreted according to a small state machine:
	///
	/// * `WARC/1.0` starts a new record: the previous document's text is flushed
	///   and the state returns to WARC.
	/// * In the WARC state, a line starting with `HTTP/` begins the HTTP headers.
	/// * In the HTTP state, `Content-Type:` may set the encoding and
	///   `Content-Language:` sets the language hint; a blank line begins the body.
	/// * In the HTML state, each line is decoded with the current encoding and fed
	///   to the tokenizer. Lines that fail to decode are skipped.
	///
	/// Panics if stdin cannot be read.
	fn main() {
	    const CONTENT_TYPE: &'static [u8] = b"Content-Type:";
	    const CONTENT_LANGUAGE: &'static [u8] = b"Content-Language:";

	    let stdin = io::stdin();
	    let mut state = ParserState::WARC;
	    // NOTE(review): one tokenizer is shared by every document in the stream, so
	    // tokenizer state left over from a truncated page carries into the next one.
	    let mut tokenizer = Tokenizer::new(TokenHandler::new(), Default::default());

	    for line_bytes_opt in stdin.lock().split(b'\n') {
	        let bline: Vec<u8> = line_bytes_opt.expect("failed to read a line from stdin");
	        if bline == b"WARC/1.0\r" {
	            state = ParserState::WARC;
	            tokenizer.sink_mut().new_document();
	        }
	        else if bline == b"\r" {
	            // A blank line ends the HTTP headers, transitioning
	            // to HTML.
	            if state == ParserState::HTTP {
	                state = ParserState::HTML;
	            }
	        }
	        else if state == ParserState::WARC && bline.starts_with(b"HTTP/") {
	            state = ParserState::HTTP;
	        }
	        else if state == ParserState::HTML {
	            // Best effort: a line that is not valid in the current encoding is dropped.
	            let decoded = tokenizer.sink().encoding.decode(&bline, DecoderTrap::Strict);
	            if let Ok(sline) = decoded {
	                // Splitting on '\n' removed the line break, so put one back after
	                // trimming. Without it the last word of this line would be glued to
	                // the first word of the next, and a tag split across lines would have
	                // its name merged with its first attribute.
	                let mut line = String::with_capacity(sline.len() + 1);
	                line.push_str(sline.trim());
	                line.push('\n');
	                let tend: Tendril<fmt::UTF8> = Tendril::from_slice(&line[..]);
	                tokenizer.feed(tend);
	            }
	        }
	        else if state == ParserState::HTTP {
	            if bline.starts_with(CONTENT_TYPE) {
	                let content_type_bytes = &bline[CONTENT_TYPE.len()..];
	                if let Some(new_encoding) = content_type_bytes_to_encoding(content_type_bytes) {
	                    tokenizer.sink_mut().encoding = new_encoding;
	                }
	            }
	            else if bline.starts_with(CONTENT_LANGUAGE) {
	                let content_lang_bytes = &bline[CONTENT_LANGUAGE.len()..];
	                if let Ok(content_lang) = str::from_utf8(content_lang_bytes) {
	                    tokenizer.sink_mut().language_hint = Some(content_lang.trim().to_string());
	                }
	            }
	        }
	    }

	    // Text is normally flushed when the *next* record starts, so the final
	    // document has to be flushed explicitly once the input is exhausted.
	    tokenizer.sink_mut().new_document();
	}