Created
July 3, 2017 07:24
-
-
Save jameysharp/e2d882cf244013395ec9e4a4c4a7abd7 to your computer and use it in GitHub Desktop.
Demo combining the html5ever parser from Servo with the Tokio wrapper for Curl.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Crawler demo: combines html5ever (Servo's HTML parser) with the
# Tokio wrapper for Curl. Versions are pinned to the 2017-era
# futures-0.1 / tokio-core ecosystem the code below targets.
[package]
name = "crawler"
version = "0.1.0"
authors = ["Jamey Sharp <jamey@minilop.net>"]

[dependencies]
curl = "0.4"
futures = "0.1"
html5ever = "0.5"
scraper = "0.4"
tendril = "0.2"
tokio-core = "0.1"
tokio-curl = "0.1"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Demo combining the html5ever parser from Servo with the Tokio wrapper | |
// for Curl. | |
extern crate curl; | |
extern crate futures; | |
extern crate html5ever; | |
extern crate scraper; | |
extern crate tendril; | |
extern crate tokio_core; | |
extern crate tokio_curl; | |
use curl::easy::Easy; | |
use futures::Future; | |
use futures::stream::Stream; | |
use futures::sync::mpsc::unbounded; | |
use html5ever::driver::parse_document; | |
use scraper::html::Html; | |
use std::env::args; | |
use tendril::{Tendril, TendrilSink}; | |
use tendril::fmt::UTF8; | |
use tokio_core::reactor::Core; | |
use tokio_curl::Session; | |
fn main() { | |
let url = args().nth(1).unwrap(); | |
// We need a Tokio Core event loop and a Curl session. | |
let mut lp = Core::new().unwrap(); | |
let session = Session::new(lp.handle()); | |
// The Tokio Curl wrapper runs Curl in a separate thread, but the | |
// string representation used internally by the HTML parser doesn't | |
// implement `Send` so we can't feed stuff directly into it and then | |
// use the final result in a different thread. So we send the | |
// strings across a channel instead, which also gives us a little | |
// parallelism. | |
let (sink, source) = unbounded(); | |
// Set up a request. The Curl bindings allow for returning lots of | |
// errors that can't actually happen, so here's a pile of unwraps. | |
let mut req = Easy::new(); | |
req.get(true).unwrap(); | |
req.url(&*url).unwrap(); | |
req.write_function(move |data| { | |
println!("sending {} bytes", data.len()); | |
// Tendril doesn't support directly stealing the input byte | |
// slice, so we're forced to let it copy. | |
// FIXME: This could totally fail! But error reporting out | |
// through the callback is tricky, so pretend it can't for now. | |
let tendril: Tendril<UTF8> = Tendril::try_from_byte_slice(data).unwrap(); | |
// Then we have to tell Tendril to make the copy it now owns | |
// safe to Send across a channel. Since it is definitely owned, | |
// the documentation says this should be cheap. | |
if sink.send(tendril.into_send()).is_err() { | |
// Receiver quit, so tell Curl to stop reading. | |
return Ok(0); | |
} | |
Ok(data.len()) | |
}).unwrap(); | |
// If the request completes successfully, we need to reset the | |
// callback so the Curl binding will drop its reference to the sink, | |
// which will allow the other end of the channel to terminate. If | |
// the request failed, the join() call below will stop reading from | |
// the other end of the channel anyway so we don't have to care. | |
let request = session.perform(req).and_then(|mut req| { | |
req.write_function(|_| Ok(0)).unwrap(); | |
Ok(req) | |
}); | |
// The source side of the channel has an error type of (), which | |
// indicates that it (theoretically) can't fail, but we need it to | |
// have the same error type as the Curl request so we can wait for | |
// them both at the same time. Since this shouldn't happen, just | |
// panic if it does. | |
let source = source.map_err(|()| unreachable!()); | |
// Simultaneously, let's feed the data we've read into a parser | |
// that's filling in a new document. To do that, we reduce the | |
// stream of text blocks one at a time, pushing each one into the | |
// parser as we receive it. | |
let builder = source.fold(parse_document(Html::new_document(), Default::default()), |mut parser, tendril| { | |
// The parser requires a StrTendril, which we can get from a | |
// SendTendril using Tendril::from. | |
parser.process(Tendril::from(tendril)); | |
println!("parsed a chunk"); | |
// Each time, we construct a new Future that immediately | |
// resolves to the new parser state. After the last chunk is | |
// processed, calling finish on the parser will give us the | |
// constructed HTML document. | |
Ok(parser) | |
}).and_then(|parser| Ok(parser.finish())); | |
// Now run the Curl request and the HTML parser in parallel, | |
// blocking until both complete. | |
let (mut req, html) = lp.run(request.join(builder)).unwrap(); | |
// Finally we can access whatever HTTP status information and parsed | |
// HTML we care about. | |
println!("{:?}: {} nodes", req.response_code(), html.tree.values().count()); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
That isn't true. If it were `Result<T, !>`, then it couldn't fail, but `()` can be constructed just fine.