Last active
February 8, 2024 20:32
-
-
Save rillian/ad84b2f1c62560f7c968675f50844a68 to your computer and use it in GitHub Desktop.
git diff v0.8.2..v0.8.4-speedreader
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
diff --git a/.github/workflows/audit.yaml b/.github/workflows/audit.yaml | |
index f241163..ba66011 100644 | |
--- a/.github/workflows/audit.yaml | |
+++ b/.github/workflows/audit.yaml | |
@@ -1,4 +1,4 @@ | |
-name: Security audit | |
+name: Audit | |
on: | |
push: | |
paths: | |
@@ -6,7 +6,8 @@ on: | |
- '**/Cargo.lock' | |
pull_request: | |
branches: | |
- main | |
+ - main | |
+ - speedreader | |
paths: | |
- '**/Cargo.toml' | |
- '**/Cargo.lock' | |
diff --git a/.github/workflows/lint.yaml b/.github/workflows/lint.yaml | |
index 25a0496..f83a690 100644 | |
--- a/.github/workflows/lint.yaml | |
+++ b/.github/workflows/lint.yaml | |
@@ -3,9 +3,11 @@ on: | |
push: | |
branches: | |
- main | |
+ - speedreader | |
pull_request: | |
branches: | |
- main | |
+ - main | |
+ - speedreader | |
schedule: | |
- cron: '28 20 2 * *' | |
jobs: | |
diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml | |
index 25bc5fa..7a46b45 100644 | |
--- a/.github/workflows/tests.yaml | |
+++ b/.github/workflows/tests.yaml | |
@@ -4,6 +4,7 @@ on: | |
push: | |
branches: | |
- main | |
+ - speedreader | |
pull_request: | |
jobs: | |
diff --git a/Cargo.toml b/Cargo.toml | |
index b90cfe8..19a22d8 100644 | |
--- a/Cargo.toml | |
+++ b/Cargo.toml | |
@@ -1,6 +1,6 @@ | |
[package] | |
name = "kuchikiki" | |
-version = "0.8.2" | |
+version = "0.8.4-speedreader" | |
authors = [ | |
"Brave Authors", | |
"Ralph Giles <rgiles@brave.com>", | |
@@ -18,9 +18,9 @@ doctest = false | |
[dependencies] | |
cssparser = "0.27" | |
matches = "0.1.4" | |
-html5ever = "0.26.0" | |
+html5ever = "0.25.1" | |
selectors = "0.22" | |
-indexmap = "1.6.0" | |
+indexmap = { version = "1.9.3", features = [ "std" ] } | |
[dev-dependencies] | |
tempfile = "3" | |
diff --git a/examples/find_matches.rs b/examples/find_matches.rs | |
index b496d08..54651c2 100644 | |
--- a/examples/find_matches.rs | |
+++ b/examples/find_matches.rs | |
@@ -1,4 +1,4 @@ | |
-use kuchikiki::traits::*; | |
+use kuchikiki::traits::TendrilSink; | |
fn main() { | |
let html = r" | |
@@ -14,7 +14,7 @@ fn main() { | |
"; | |
let css_selector = ".foo"; | |
- let document = kuchikiki::parse_html().one(html); | |
+ let document = kuchikiki::parse_html().one(html).document_node; | |
for css_match in document.select(css_selector).unwrap() { | |
// css_match is a NodeDataRef, but most of the interesting methods are | |
diff --git a/src/parser.rs b/src/parser.rs | |
index f160a6a..71bd438 100644 | |
--- a/src/parser.rs | |
+++ b/src/parser.rs | |
@@ -61,15 +61,27 @@ pub fn parse_fragment_with_options( | |
/// Receives new tree nodes during parsing. | |
pub struct Sink { | |
- document_node: NodeRef, | |
- on_parse_error: Option<Box<dyn FnMut(Cow<'static, str>)>>, | |
+ /// The `Document` itself. | |
+ pub document_node: NodeRef, | |
+ | |
+ /// The Sink will invoke this callback if it encounters a parse error. | |
+ pub on_parse_error: Option<Box<dyn FnMut(Cow<'static, str>)>>, | |
+} | |
+ | |
+impl Default for Sink { | |
+ fn default() -> Sink { | |
+ Sink { | |
+ document_node: NodeRef::new_document(), | |
+ on_parse_error: None, | |
+ } | |
+ } | |
} | |
impl TreeSink for Sink { | |
- type Output = NodeRef; | |
+ type Output = Self; | |
- fn finish(self) -> NodeRef { | |
- self.document_node | |
+ fn finish(self) -> Self { | |
+ self | |
} | |
type Handle = NodeRef; | |
diff --git a/src/tests.rs b/src/tests.rs | |
index a4f39e5..7031cf7 100644 | |
--- a/src/tests.rs | |
+++ b/src/tests.rs | |
@@ -14,7 +14,7 @@ fn text_nodes() { | |
<!doctype html> | |
<title>Test case</title> | |
<p>Content contains <b>Important</b> data</p>"; | |
- let document = parse_html().one(html); | |
+ let document = parse_html().one(html).document_node; | |
let paragraph = document.select("p").unwrap().collect::<Vec<_>>(); | |
assert_eq!(paragraph.len(), 1); | |
assert_eq!( | |
@@ -44,7 +44,7 @@ fn parse_and_serialize() { | |
<!doctype html> | |
<title>Test case</title> | |
<p>Content"; | |
- let document = parse_html().one(html); | |
+ let document = parse_html().one(html).document_node; | |
assert_eq!( | |
document.as_document().unwrap().quirks_mode(), | |
QuirksMode::NoQuirks | |
@@ -61,7 +61,7 @@ fn parse_and_serialize_fragment() { | |
let html = r"<tbody><tr><td>Test case"; | |
let ctx_name = QualName::new(None, ns!(html), local_name!("tbody")); | |
- let document = parse_fragment(ctx_name, vec![]).one(html); | |
+ let document = parse_fragment(ctx_name, vec![]).one(html).document_node; | |
assert_eq!( | |
document.as_document().unwrap().quirks_mode(), | |
QuirksMode::NoQuirks | |
@@ -86,7 +86,11 @@ fn parse_file() { | |
</body></html>"; | |
- let document = parse_html().from_utf8().from_file(&path).unwrap(); | |
+ let document = parse_html() | |
+ .from_utf8() | |
+ .from_file(&path) | |
+ .unwrap() | |
+ .document_node; | |
assert_eq!(document.to_string(), html); | |
} | |
@@ -97,10 +101,14 @@ fn serialize_and_read_file() { | |
path.push("temp.html"); | |
let html = r"<!DOCTYPE html><html><head><title>Title</title></head><body>Body</body></html>"; | |
- let document = parse_html().one(html); | |
+ let document = parse_html().one(html).document_node; | |
let _ = document.serialize_to_file(path.clone()); | |
- let document2 = parse_html().from_utf8().from_file(&path).unwrap(); | |
+ let document2 = parse_html() | |
+ .from_utf8() | |
+ .from_file(&path) | |
+ .unwrap() | |
+ .document_node; | |
assert_eq!(document.to_string(), document2.to_string()); | |
} | |
@@ -113,7 +121,7 @@ fn select() { | |
<p class=foo>Foo | |
"; | |
- let document = parse_html().one(html); | |
+ let document = parse_html().one(html).document_node; | |
let matching = document.select("p.foo").unwrap().collect::<Vec<_>>(); | |
assert_eq!(matching.len(), 2); | |
let child = matching[0].as_node().first_child().unwrap(); | |
@@ -140,7 +148,7 @@ fn select_first() { | |
<p class=foo>Baz | |
"; | |
- let document = parse_html().one(html); | |
+ let document = parse_html().one(html).document_node; | |
let matching = document.select_first("p.foo").unwrap(); | |
let child = matching.as_node().first_child().unwrap(); | |
assert_eq!(&**child.as_text().unwrap().borrow(), "Foo\n"); | |
@@ -165,7 +173,7 @@ fn to_string() { | |
</body> | |
</html>"; | |
- let document = parse_html().one(html); | |
+ let document = parse_html().one(html).document_node; | |
assert_eq!( | |
document | |
.inclusive_descendants() | |
diff --git a/src/tree.rs b/src/tree.rs | |
index e3bcc3e..7a336b8 100644 | |
--- a/src/tree.rs | |
+++ b/src/tree.rs | |
@@ -59,6 +59,14 @@ pub struct ElementData { | |
/// If the element is an HTML `<template>` element, | |
/// the document fragment node that is the root of template contents. | |
pub template_contents: Option<NodeRef>, | |
+ | |
+ /// The element's score for whether it should be considered for the | |
+ /// "readable" tree. | |
+ pub score: Cell<f32>, | |
+ | |
+ /// Whether the element is considered a candidate for the | |
+ /// "readable" tree. | |
+ pub is_candidate: Cell<bool>, | |
} | |
/// Data specific to document nodes. | |
@@ -222,6 +230,8 @@ impl NodeRef { | |
attributes: RefCell::new(Attributes { | |
map: attributes.into_iter().collect(), | |
}), | |
+ score: Cell::new(0.0), | |
+ is_candidate: Cell::new(false), | |
})) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment