rillian/kuchiki-0.8.5-speedreader.diff

## kuchiki-0.8.5-speedreader.diff
diff --git a/.github/workflows/audit.yaml b/.github/workflows/audit.yaml
index f241163..ba66011 100644
--- a/.github/workflows/audit.yaml
+++ b/.github/workflows/audit.yaml
@@ -1,4 +1,4 @@
-name: Security audit
+name: Audit
 on:
   push:
     paths:
@@ -6,7 +6,8 @@ on:
       - '**/Cargo.lock'
   pull_request:
     branches:
-      main
+      - main
+      - speedreader
     paths:
       - '**/Cargo.toml'
       - '**/Cargo.lock'
diff --git a/.github/workflows/lint.yaml b/.github/workflows/lint.yaml
index 25a0496..f83a690 100644
--- a/.github/workflows/lint.yaml
+++ b/.github/workflows/lint.yaml
@@ -3,9 +3,11 @@ on:
   push:
     branches:
       - main
+      - speedreader
   pull_request:
     branches:
-      main
+      - main
+      - speedreader
   schedule:
     - cron: '28 20 2 * *'
 jobs:
diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml
index 25bc5fa..7a46b45 100644
--- a/.github/workflows/tests.yaml
+++ b/.github/workflows/tests.yaml
@@ -4,6 +4,7 @@ on:
   push:
     branches:
       - main
+      - speedreader
     pull_request:

 jobs:
diff --git a/Cargo.toml b/Cargo.toml
index b90cfe8..19a22d8 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "kuchikiki"
-version = "0.8.2"
+version = "0.8.4-speedreader"
 authors = [
   "Brave Authors",
   "Ralph Giles <rgiles@brave.com>",
@@ -18,9 +18,9 @@ doctest = false
 [dependencies]
 cssparser = "0.27"
 matches = "0.1.4"
-html5ever = "0.26.0"
+html5ever = "0.25.1"
 selectors = "0.22"
-indexmap = "1.6.0"
+indexmap = { version = "1.9.3", features = [ "std" ] }

 [dev-dependencies]
 tempfile = "3"
diff --git a/examples/find_matches.rs b/examples/find_matches.rs
index b496d08..54651c2 100644
--- a/examples/find_matches.rs
+++ b/examples/find_matches.rs
@@ -1,4 +1,4 @@
-use kuchikiki::traits::*;
+use kuchikiki::traits::TendrilSink;

 fn main() {
     let html = r"
@@ -14,7 +14,7 @@ fn main() {
     ";
     let css_selector = ".foo";

-    let document = kuchikiki::parse_html().one(html);
+    let document = kuchikiki::parse_html().one(html).document_node;

     for css_match in document.select(css_selector).unwrap() {
         // css_match is a NodeDataRef, but most of the interesting methods are
diff --git a/src/parser.rs b/src/parser.rs
index f160a6a..71bd438 100644
--- a/src/parser.rs
+++ b/src/parser.rs
@@ -61,15 +61,27 @@ pub fn parse_fragment_with_options(

 /// Receives new tree nodes during parsing.
 pub struct Sink {
-    document_node: NodeRef,
-    on_parse_error: Option<Box<dyn FnMut(Cow<'static, str>)>>,
+    /// The `Document` itself.
+    pub document_node: NodeRef,
+
+    /// The Sink will invoke this callback if it encounters a parse error.
+    pub on_parse_error: Option<Box<dyn FnMut(Cow<'static, str>)>>,
+}
+
+impl Default for Sink {
+    fn default() -> Sink {
+        Sink {
+            document_node: NodeRef::new_document(),
+            on_parse_error: None,
+        }
+    }
 }

 impl TreeSink for Sink {
-    type Output = NodeRef;
+    type Output = Self;

-    fn finish(self) -> NodeRef {
-        self.document_node
+    fn finish(self) -> Self {
+        self
     }

     type Handle = NodeRef;
diff --git a/src/tests.rs b/src/tests.rs
index a4f39e5..7031cf7 100644
--- a/src/tests.rs
+++ b/src/tests.rs
@@ -14,7 +14,7 @@ fn text_nodes() {
 <!doctype html>
 <title>Test case</title>
 <p>Content contains <b>Important</b> data</p>";
-    let document = parse_html().one(html);
+    let document = parse_html().one(html).document_node;
     let paragraph = document.select("p").unwrap().collect::<Vec<_>>();
     assert_eq!(paragraph.len(), 1);
     assert_eq!(
@@ -44,7 +44,7 @@ fn parse_and_serialize() {
 <!doctype html>
 <title>Test case</title>
 <p>Content";
-    let document = parse_html().one(html);
+    let document = parse_html().one(html).document_node;
     assert_eq!(
         document.as_document().unwrap().quirks_mode(),
         QuirksMode::NoQuirks
@@ -61,7 +61,7 @@ fn parse_and_serialize_fragment() {
     let html = r"<tbody><tr><td>Test case";

     let ctx_name = QualName::new(None, ns!(html), local_name!("tbody"));
-    let document = parse_fragment(ctx_name, vec![]).one(html);
+    let document = parse_fragment(ctx_name, vec![]).one(html).document_node;
     assert_eq!(
         document.as_document().unwrap().quirks_mode(),
         QuirksMode::NoQuirks
@@ -86,7 +86,11 @@ fn parse_file() {


 </body></html>";
-    let document = parse_html().from_utf8().from_file(&path).unwrap();
+    let document = parse_html()
+        .from_utf8()
+        .from_file(&path)
+        .unwrap()
+        .document_node;
     assert_eq!(document.to_string(), html);
 }

@@ -97,10 +101,14 @@ fn serialize_and_read_file() {
     path.push("temp.html");

     let html = r"<!DOCTYPE html><html><head><title>Title</title></head><body>Body</body></html>";
-    let document = parse_html().one(html);
+    let document = parse_html().one(html).document_node;
     let _ = document.serialize_to_file(path.clone());

-    let document2 = parse_html().from_utf8().from_file(&path).unwrap();
+    let document2 = parse_html()
+        .from_utf8()
+        .from_file(&path)
+        .unwrap()
+        .document_node;
     assert_eq!(document.to_string(), document2.to_string());
 }

@@ -113,7 +121,7 @@ fn select() {
 <p class=foo>Foo
 ";

-    let document = parse_html().one(html);
+    let document = parse_html().one(html).document_node;
     let matching = document.select("p.foo").unwrap().collect::<Vec<_>>();
     assert_eq!(matching.len(), 2);
     let child = matching[0].as_node().first_child().unwrap();
@@ -140,7 +148,7 @@ fn select_first() {
 <p class=foo>Baz
 ";

-    let document = parse_html().one(html);
+    let document = parse_html().one(html).document_node;
     let matching = document.select_first("p.foo").unwrap();
     let child = matching.as_node().first_child().unwrap();
     assert_eq!(&**child.as_text().unwrap().borrow(), "Foo\n");
@@ -165,7 +173,7 @@ fn to_string() {
     </body>
 </html>";

-    let document = parse_html().one(html);
+    let document = parse_html().one(html).document_node;
     assert_eq!(
         document
             .inclusive_descendants()
diff --git a/src/tree.rs b/src/tree.rs
index e3bcc3e..7a336b8 100644
--- a/src/tree.rs
+++ b/src/tree.rs
@@ -59,6 +59,14 @@ pub struct ElementData {
     /// If the element is an HTML `<template>` element,
     /// the document fragment node that is the root of template contents.
     pub template_contents: Option<NodeRef>,
+
+    /// The element's score for whether it should be considered for the
+    /// "readable" tree.
+    pub score: Cell<f32>,
+
+    /// Whether the element is considered a candidate for the
+    /// "readable" tree.
+    pub is_candidate: Cell<bool>,
 }

 /// Data specific to document nodes.
@@ -222,6 +230,8 @@ impl NodeRef {
             attributes: RefCell::new(Attributes {
                 map: attributes.into_iter().collect(),
             }),
+            score: Cell::new(0.0),
+            is_candidate: Cell::new(false),
         }))
     }
	diff --git a/.github/workflows/audit.yaml b/.github/workflows/audit.yaml
	index f241163..ba66011 100644
	--- a/.github/workflows/audit.yaml
	+++ b/.github/workflows/audit.yaml
	@@ -1,4 +1,4 @@
	-name: Security audit
	+name: Audit
	on:
	push:
	paths:
	@@ -6,7 +6,8 @@ on:
	- '**/Cargo.lock'
	pull_request:
	branches:
	- main
	+ - main
	+ - speedreader
	paths:
	- '**/Cargo.toml'
	- '**/Cargo.lock'
	diff --git a/.github/workflows/lint.yaml b/.github/workflows/lint.yaml
	index 25a0496..f83a690 100644
	--- a/.github/workflows/lint.yaml
	+++ b/.github/workflows/lint.yaml
	@@ -3,9 +3,11 @@ on:
	push:
	branches:
	- main
	+ - speedreader
	pull_request:
	branches:
	- main
	+ - main
	+ - speedreader
	schedule:
	- cron: '28 20 2 * *'
	jobs:
	diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml
	index 25bc5fa..7a46b45 100644
	--- a/.github/workflows/tests.yaml
	+++ b/.github/workflows/tests.yaml
	@@ -4,6 +4,7 @@ on:
	push:
	branches:
	- main
	+ - speedreader
	pull_request:

	jobs:
	diff --git a/Cargo.toml b/Cargo.toml
	index b90cfe8..19a22d8 100644
	--- a/Cargo.toml
	+++ b/Cargo.toml
	@@ -1,6 +1,6 @@
	[package]
	name = "kuchikiki"
	-version = "0.8.2"
	+version = "0.8.4-speedreader"
	authors = [
	"Brave Authors",
	"Ralph Giles <rgiles@brave.com>",
	@@ -18,9 +18,9 @@ doctest = false
	[dependencies]
	cssparser = "0.27"
	matches = "0.1.4"
	-html5ever = "0.26.0"
	+html5ever = "0.25.1"
	selectors = "0.22"
	-indexmap = "1.6.0"
	+indexmap = { version = "1.9.3", features = [ "std" ] }

	[dev-dependencies]
	tempfile = "3"
	diff --git a/examples/find_matches.rs b/examples/find_matches.rs
	index b496d08..54651c2 100644
	--- a/examples/find_matches.rs
	+++ b/examples/find_matches.rs
	@@ -1,4 +1,4 @@
	-use kuchikiki::traits::*;
	+use kuchikiki::traits::TendrilSink;

	fn main() {
	let html = r"
	@@ -14,7 +14,7 @@ fn main() {
	";
	let css_selector = ".foo";

	- let document = kuchikiki::parse_html().one(html);
	+ let document = kuchikiki::parse_html().one(html).document_node;

	for css_match in document.select(css_selector).unwrap() {
	// css_match is a NodeDataRef, but most of the interesting methods are
	diff --git a/src/parser.rs b/src/parser.rs
	index f160a6a..71bd438 100644
	--- a/src/parser.rs
	+++ b/src/parser.rs
	@@ -61,15 +61,27 @@ pub fn parse_fragment_with_options(

	/// Receives new tree nodes during parsing.
	pub struct Sink {
	- document_node: NodeRef,
	- on_parse_error: Option<Box<dyn FnMut(Cow<'static, str>)>>,
	+ /// The `Document` itself.
	+ pub document_node: NodeRef,
	+
	+ /// The Sink will invoke this callback if it encounters a parse error.
	+ pub on_parse_error: Option<Box<dyn FnMut(Cow<'static, str>)>>,
	+}
	+
	+impl Default for Sink {
	+ fn default() -> Sink {
	+ Sink {
	+ document_node: NodeRef::new_document(),
	+ on_parse_error: None,
	+ }
	+ }
	}

	impl TreeSink for Sink {
	- type Output = NodeRef;
	+ type Output = Self;

	- fn finish(self) -> NodeRef {
	- self.document_node
	+ fn finish(self) -> Self {
	+ self
	}

	type Handle = NodeRef;
	diff --git a/src/tests.rs b/src/tests.rs
	index a4f39e5..7031cf7 100644
	--- a/src/tests.rs
	+++ b/src/tests.rs
	@@ -14,7 +14,7 @@ fn text_nodes() {
	<!doctype html>
	<title>Test case</title>
	<p>Content contains <b>Important</b> data</p>";
	- let document = parse_html().one(html);
	+ let document = parse_html().one(html).document_node;
	let paragraph = document.select("p").unwrap().collect::<Vec<_>>();
	assert_eq!(paragraph.len(), 1);
	assert_eq!(
	@@ -44,7 +44,7 @@ fn parse_and_serialize() {
	<!doctype html>
	<title>Test case</title>
	<p>Content";
	- let document = parse_html().one(html);
	+ let document = parse_html().one(html).document_node;
	assert_eq!(
	document.as_document().unwrap().quirks_mode(),
	QuirksMode::NoQuirks
	@@ -61,7 +61,7 @@ fn parse_and_serialize_fragment() {
	let html = r"<tbody><tr><td>Test case";

	let ctx_name = QualName::new(None, ns!(html), local_name!("tbody"));
	- let document = parse_fragment(ctx_name, vec![]).one(html);
	+ let document = parse_fragment(ctx_name, vec![]).one(html).document_node;
	assert_eq!(
	document.as_document().unwrap().quirks_mode(),
	QuirksMode::NoQuirks
	@@ -86,7 +86,11 @@ fn parse_file() {


	</body></html>";
	- let document = parse_html().from_utf8().from_file(&path).unwrap();
	+ let document = parse_html()
	+ .from_utf8()
	+ .from_file(&path)
	+ .unwrap()
	+ .document_node;
	assert_eq!(document.to_string(), html);
	}

	@@ -97,10 +101,14 @@ fn serialize_and_read_file() {
	path.push("temp.html");

	let html = r"<!DOCTYPE html><html><head><title>Title</title></head><body>Body</body></html>";
	- let document = parse_html().one(html);
	+ let document = parse_html().one(html).document_node;
	let _ = document.serialize_to_file(path.clone());

	- let document2 = parse_html().from_utf8().from_file(&path).unwrap();
	+ let document2 = parse_html()
	+ .from_utf8()
	+ .from_file(&path)
	+ .unwrap()
	+ .document_node;
	assert_eq!(document.to_string(), document2.to_string());
	}

	@@ -113,7 +121,7 @@ fn select() {
	<p class=foo>Foo
	";

	- let document = parse_html().one(html);
	+ let document = parse_html().one(html).document_node;
	let matching = document.select("p.foo").unwrap().collect::<Vec<_>>();
	assert_eq!(matching.len(), 2);
	let child = matching[0].as_node().first_child().unwrap();
	@@ -140,7 +148,7 @@ fn select_first() {
	<p class=foo>Baz
	";

	- let document = parse_html().one(html);
	+ let document = parse_html().one(html).document_node;
	let matching = document.select_first("p.foo").unwrap();
	let child = matching.as_node().first_child().unwrap();
	assert_eq!(&**child.as_text().unwrap().borrow(), "Foo\n");
	@@ -165,7 +173,7 @@ fn to_string() {
	</body>
	</html>";

	- let document = parse_html().one(html);
	+ let document = parse_html().one(html).document_node;
	assert_eq!(
	document
	.inclusive_descendants()
	diff --git a/src/tree.rs b/src/tree.rs
	index e3bcc3e..7a336b8 100644
	--- a/src/tree.rs
	+++ b/src/tree.rs
	@@ -59,6 +59,14 @@ pub struct ElementData {
	/// If the element is an HTML `<template>` element,
	/// the document fragment node that is the root of template contents.
	pub template_contents: Option<NodeRef>,
	+
	+ /// The element's score for whether it should be considered for the
	+ /// "readable" tree.
	+ pub score: Cell<f32>,
	+
	+ /// Whether the element is considered a candidate for the
	+ /// "readable" tree.
	+ pub is_candidate: Cell<bool>,
	}

	/// Data specific to document nodes.
	@@ -222,6 +230,8 @@ impl NodeRef {
	attributes: RefCell::new(Attributes {
	map: attributes.into_iter().collect(),
	}),
	+ score: Cell::new(0.0),
	+ is_candidate: Cell::new(false),
	}))
	}