Skip to content

Instantly share code, notes, and snippets.

@rillian
Last active February 8, 2024 20:32
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save rillian/ad84b2f1c62560f7c968675f50844a68 to your computer and use it in GitHub Desktop.
Save rillian/ad84b2f1c62560f7c968675f50844a68 to your computer and use it in GitHub Desktop.
git diff v0.8.2..v0.8.4-speedreader
diff --git a/.github/workflows/audit.yaml b/.github/workflows/audit.yaml
index f241163..ba66011 100644
--- a/.github/workflows/audit.yaml
+++ b/.github/workflows/audit.yaml
@@ -1,4 +1,4 @@
-name: Security audit
+name: Audit
on:
push:
paths:
@@ -6,7 +6,8 @@ on:
- '**/Cargo.lock'
pull_request:
branches:
- main
+ - main
+ - speedreader
paths:
- '**/Cargo.toml'
- '**/Cargo.lock'
diff --git a/.github/workflows/lint.yaml b/.github/workflows/lint.yaml
index 25a0496..f83a690 100644
--- a/.github/workflows/lint.yaml
+++ b/.github/workflows/lint.yaml
@@ -3,9 +3,11 @@ on:
push:
branches:
- main
+ - speedreader
pull_request:
branches:
- main
+ - main
+ - speedreader
schedule:
- cron: '28 20 2 * *'
jobs:
diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml
index 25bc5fa..7a46b45 100644
--- a/.github/workflows/tests.yaml
+++ b/.github/workflows/tests.yaml
@@ -4,6 +4,7 @@ on:
push:
branches:
- main
+ - speedreader
pull_request:
jobs:
diff --git a/Cargo.toml b/Cargo.toml
index b90cfe8..19a22d8 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "kuchikiki"
-version = "0.8.2"
+version = "0.8.4-speedreader"
authors = [
"Brave Authors",
"Ralph Giles <rgiles@brave.com>",
@@ -18,9 +18,9 @@ doctest = false
[dependencies]
cssparser = "0.27"
matches = "0.1.4"
-html5ever = "0.26.0"
+html5ever = "0.25.1"
selectors = "0.22"
-indexmap = "1.6.0"
+indexmap = { version = "1.9.3", features = [ "std" ] }
[dev-dependencies]
tempfile = "3"
diff --git a/examples/find_matches.rs b/examples/find_matches.rs
index b496d08..54651c2 100644
--- a/examples/find_matches.rs
+++ b/examples/find_matches.rs
@@ -1,4 +1,4 @@
-use kuchikiki::traits::*;
+use kuchikiki::traits::TendrilSink;
fn main() {
let html = r"
@@ -14,7 +14,7 @@ fn main() {
";
let css_selector = ".foo";
- let document = kuchikiki::parse_html().one(html);
+ let document = kuchikiki::parse_html().one(html).document_node;
for css_match in document.select(css_selector).unwrap() {
// css_match is a NodeDataRef, but most of the interesting methods are
diff --git a/src/parser.rs b/src/parser.rs
index f160a6a..71bd438 100644
--- a/src/parser.rs
+++ b/src/parser.rs
@@ -61,15 +61,27 @@ pub fn parse_fragment_with_options(
/// Receives new tree nodes during parsing.
pub struct Sink {
- document_node: NodeRef,
- on_parse_error: Option<Box<dyn FnMut(Cow<'static, str>)>>,
+ /// The `Document` itself.
+ pub document_node: NodeRef,
+
+ /// The Sink will invoke this callback if it encounters a parse error.
+ pub on_parse_error: Option<Box<dyn FnMut(Cow<'static, str>)>>,
+}
+
+impl Default for Sink {
+ fn default() -> Sink {
+ Sink {
+ document_node: NodeRef::new_document(),
+ on_parse_error: None,
+ }
+ }
}
impl TreeSink for Sink {
- type Output = NodeRef;
+ type Output = Self;
- fn finish(self) -> NodeRef {
- self.document_node
+ fn finish(self) -> Self {
+ self
}
type Handle = NodeRef;
diff --git a/src/tests.rs b/src/tests.rs
index a4f39e5..7031cf7 100644
--- a/src/tests.rs
+++ b/src/tests.rs
@@ -14,7 +14,7 @@ fn text_nodes() {
<!doctype html>
<title>Test case</title>
<p>Content contains <b>Important</b> data</p>";
- let document = parse_html().one(html);
+ let document = parse_html().one(html).document_node;
let paragraph = document.select("p").unwrap().collect::<Vec<_>>();
assert_eq!(paragraph.len(), 1);
assert_eq!(
@@ -44,7 +44,7 @@ fn parse_and_serialize() {
<!doctype html>
<title>Test case</title>
<p>Content";
- let document = parse_html().one(html);
+ let document = parse_html().one(html).document_node;
assert_eq!(
document.as_document().unwrap().quirks_mode(),
QuirksMode::NoQuirks
@@ -61,7 +61,7 @@ fn parse_and_serialize_fragment() {
let html = r"<tbody><tr><td>Test case";
let ctx_name = QualName::new(None, ns!(html), local_name!("tbody"));
- let document = parse_fragment(ctx_name, vec![]).one(html);
+ let document = parse_fragment(ctx_name, vec![]).one(html).document_node;
assert_eq!(
document.as_document().unwrap().quirks_mode(),
QuirksMode::NoQuirks
@@ -86,7 +86,11 @@ fn parse_file() {
</body></html>";
- let document = parse_html().from_utf8().from_file(&path).unwrap();
+ let document = parse_html()
+ .from_utf8()
+ .from_file(&path)
+ .unwrap()
+ .document_node;
assert_eq!(document.to_string(), html);
}
@@ -97,10 +101,14 @@ fn serialize_and_read_file() {
path.push("temp.html");
let html = r"<!DOCTYPE html><html><head><title>Title</title></head><body>Body</body></html>";
- let document = parse_html().one(html);
+ let document = parse_html().one(html).document_node;
let _ = document.serialize_to_file(path.clone());
- let document2 = parse_html().from_utf8().from_file(&path).unwrap();
+ let document2 = parse_html()
+ .from_utf8()
+ .from_file(&path)
+ .unwrap()
+ .document_node;
assert_eq!(document.to_string(), document2.to_string());
}
@@ -113,7 +121,7 @@ fn select() {
<p class=foo>Foo
";
- let document = parse_html().one(html);
+ let document = parse_html().one(html).document_node;
let matching = document.select("p.foo").unwrap().collect::<Vec<_>>();
assert_eq!(matching.len(), 2);
let child = matching[0].as_node().first_child().unwrap();
@@ -140,7 +148,7 @@ fn select_first() {
<p class=foo>Baz
";
- let document = parse_html().one(html);
+ let document = parse_html().one(html).document_node;
let matching = document.select_first("p.foo").unwrap();
let child = matching.as_node().first_child().unwrap();
assert_eq!(&**child.as_text().unwrap().borrow(), "Foo\n");
@@ -165,7 +173,7 @@ fn to_string() {
</body>
</html>";
- let document = parse_html().one(html);
+ let document = parse_html().one(html).document_node;
assert_eq!(
document
.inclusive_descendants()
diff --git a/src/tree.rs b/src/tree.rs
index e3bcc3e..7a336b8 100644
--- a/src/tree.rs
+++ b/src/tree.rs
@@ -59,6 +59,14 @@ pub struct ElementData {
/// If the element is an HTML `<template>` element,
/// the document fragment node that is the root of template contents.
pub template_contents: Option<NodeRef>,
+
+ /// The element's score for whether it should be considered for the
+ /// "readable" tree.
+ pub score: Cell<f32>,
+
+ /// Whether the element has been flagged as a candidate for the
+ /// "readable" tree.
+ pub is_candidate: Cell<bool>,
}
/// Data specific to document nodes.
@@ -222,6 +230,8 @@ impl NodeRef {
attributes: RefCell::new(Attributes {
map: attributes.into_iter().collect(),
}),
+ score: Cell::new(0.0),
+ is_candidate: Cell::new(false),
}))
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment