From 47cc9de9538022f7b9eadf80be09a95f31e6149e Mon Sep 17 00:00:00 2001 From: Adam Reichold Date: Mon, 2 Dec 2024 08:15:20 +0100 Subject: [PATCH 1/4] Make current nightly version of Clippy happy. --- scraper/src/element_ref/element.rs | 4 ++-- scraper/src/element_ref/mod.rs | 4 ++-- scraper/src/element_ref/serializable.rs | 2 +- scraper/src/error.rs | 4 ++-- scraper/src/html/mod.rs | 4 ++-- scraper/src/html/tree_sink.rs | 19 ++++++++----------- scraper/src/main.rs | 3 +-- 7 files changed, 18 insertions(+), 22 deletions(-) diff --git a/scraper/src/element_ref/element.rs b/scraper/src/element_ref/element.rs index e804d81e..23d2a16e 100644 --- a/scraper/src/element_ref/element.rs +++ b/scraper/src/element_ref/element.rs @@ -9,7 +9,7 @@ use super::ElementRef; use crate::selector::{CssLocalName, CssString, NonTSPseudoClass, PseudoElement, Simple}; /// Note: will never match against non-tree-structure pseudo-classes. -impl<'a> Element for ElementRef<'a> { +impl Element for ElementRef<'_> { type Impl = Simple; fn opaque(&self) -> OpaqueElement { @@ -135,7 +135,7 @@ impl<'a> Element for ElementRef<'a> { fn is_root(&self) -> bool { self.parent() - .map_or(false, |parent| parent.value().is_document()) + .is_some_and(|parent| parent.value().is_document()) } fn apply_selector_flags(&self, _flags: matching::ElementSelectorFlags) {} diff --git a/scraper/src/element_ref/mod.rs b/scraper/src/element_ref/mod.rs index 4e8500e2..cfe2a3d9 100644 --- a/scraper/src/element_ref/mod.rs +++ b/scraper/src/element_ref/mod.rs @@ -117,7 +117,7 @@ impl<'a> ElementRef<'a> { } } -impl<'a> Debug for ElementRef<'a> { +impl Debug for ElementRef<'_> { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { Debug::fmt(self.value(), f) } @@ -160,7 +160,7 @@ impl Clone for Select<'_, '_> { } } -impl<'a, 'b> Iterator for Select<'a, 'b> { +impl<'a> Iterator for Select<'a, '_> { type Item = ElementRef<'a>; fn next(&mut self) -> Option<ElementRef<'a>> { diff --git a/scraper/src/element_ref/serializable.rs
b/scraper/src/element_ref/serializable.rs index 98dda704..0b88c8d3 100644 --- a/scraper/src/element_ref/serializable.rs +++ b/scraper/src/element_ref/serializable.rs @@ -4,7 +4,7 @@ use html5ever::serialize::{Serialize, Serializer, TraversalScope}; use crate::ElementRef; -impl<'a> Serialize for ElementRef<'a> { +impl Serialize for ElementRef<'_> { fn serialize<S: Serializer>( &self, serializer: &mut S, diff --git a/scraper/src/error.rs b/scraper/src/error.rs index 15141eed..ef27dea1 100644 --- a/scraper/src/error.rs +++ b/scraper/src/error.rs @@ -73,7 +73,7 @@ impl<'a> From<SelectorParseErrorKind<'a>> for SelectorErrorKind<'a> { } } -impl<'a> Display for SelectorErrorKind<'a> { +impl Display for SelectorErrorKind<'_> { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!( f, @@ -103,7 +103,7 @@ impl<'a> Display for SelectorErrorKind<'a> { } } -impl<'a> Error for SelectorErrorKind<'a> { +impl Error for SelectorErrorKind<'_> { fn description(&self) -> &str { match self { Self::UnexpectedToken(_) => "Token was not expected", diff --git a/scraper/src/html/mod.rs b/scraper/src/html/mod.rs index 39ad74cf..f64578bb 100644 --- a/scraper/src/html/mod.rs +++ b/scraper/src/html/mod.rs @@ -153,7 +153,7 @@ impl Clone for Select<'_, '_> { } } -impl<'a, 'b> Iterator for Select<'a, 'b> { +impl<'a> Iterator for Select<'a, '_> { type Item = ElementRef<'a>; fn next(&mut self) -> Option<ElementRef<'a>> { @@ -178,7 +178,7 @@ impl<'a, 'b> Iterator for Select<'a, 'b> { } } -impl<'a, 'b> DoubleEndedIterator for Select<'a, 'b> { +impl DoubleEndedIterator for Select<'_, '_> { fn next_back(&mut self) -> Option<Self::Item> { for node in self.inner.by_ref().rev() { if let Some(element) = ElementRef::wrap(node) { diff --git a/scraper/src/html/tree_sink.rs b/scraper/src/html/tree_sink.rs index f9e18720..02d43eb6 100644 --- a/scraper/src/html/tree_sink.rs +++ b/scraper/src/html/tree_sink.rs @@ -141,7 +141,7 @@ impl TreeSink for HtmlTreeSink { NodeOrText::AppendText(text) => { let text = make_tendril(text); - let did_concat =
parent.last_child().map_or(false, |mut n| match n.value() { + let did_concat = parent.last_child().is_some_and(|mut n| match n.value() { Node::Text(t) => { t.text.push_tendril(&text); true @@ -181,16 +181,13 @@ impl TreeSink for HtmlTreeSink { NodeOrText::AppendText(text) => { let text = make_tendril(text); - let did_concat = - sibling - .prev_sibling() - .map_or(false, |mut n| match n.value() { - Node::Text(t) => { - t.text.push_tendril(&text); - true - } - _ => false, - }); + let did_concat = sibling.prev_sibling().is_some_and(|mut n| match n.value() { + Node::Text(t) => { + t.text.push_tendril(&text); + true + } + _ => false, + }); if !did_concat { sibling.insert_before(Node::Text(Text { text })); diff --git a/scraper/src/main.rs b/scraper/src/main.rs index 9d4684b3..8f9de9fa 100644 --- a/scraper/src/main.rs +++ b/scraper/src/main.rs @@ -125,8 +125,7 @@ fn main() { .iter() .map(File::open) .map(Result::unwrap) - .map(|mut f| query(&input, &output, &selector, &mut f)) - .any(|m| m) + .any(|mut f| query(&input, &output, &selector, &mut f)) }; process::exit(i32::from(!matched)); From ee66ee8d23f82eae453f00b507321e2f4819fc50 Mon Sep 17 00:00:00 2001 From: Adam Reichold Date: Mon, 2 Dec 2024 08:34:17 +0100 Subject: [PATCH 2/4] Drop hash table for per-element attributes for more compact sorted vector. --- Cargo.lock | 72 +++++++++++++---------------------- scraper/Cargo.toml | 3 +- scraper/src/html/tree_sink.rs | 11 ++++++ scraper/src/node.rs | 41 ++++++++++++-------- 4 files changed, 64 insertions(+), 63 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index d0ca7422..70b6588d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,24 +2,11 @@ # It is not intended for manual editing. 
version = 4 -[[package]] -name = "ahash" -version = "0.8.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e89da841a80418a9b391ebaea17f5c112ffaaa96f621d2c285b5174da76b9011" -dependencies = [ - "cfg-if", - "getrandom", - "once_cell", - "version_check", - "zerocopy", -] - [[package]] name = "autocfg" -version = "1.3.0" +version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0c4b4d0bd25bd0b74681c0ad21497610ce1b7c91b1022cd21c80c6fbdd9476b0" +checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26" [[package]] name = "bitflags" @@ -141,9 +128,9 @@ dependencies = [ [[package]] name = "hashbrown" -version = "0.15.0" +version = "0.15.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e087f84d4f86bf4b218b927129862374b72199ae7d8657835f1e89000eea4fb" +checksum = "bf151400ff0baff5465007dd2f3e717f3fe502074ca563069ce3a6629d07b289" [[package]] name = "html5ever" @@ -161,9 +148,9 @@ dependencies = [ [[package]] name = "indexmap" -version = "2.6.0" +version = "2.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "707907fe3c25f5424cce2cb7e1cbcafee6bdbe735ca90ef77c29e84591e5b9da" +checksum = "62f822373a4fe84d4bb149bf54e584a7f4abec90e072ed49cda0edea5b95471f" dependencies = [ "equivalent", "hashbrown", @@ -171,15 +158,15 @@ dependencies = [ [[package]] name = "itoa" -version = "1.0.11" +version = "1.0.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b" +checksum = "d75a2a4b1b190afb6f5425f10f6a8f959d2ea0b9c2b1d79553551850539e4674" [[package]] name = "libc" -version = "0.2.158" +version = "0.2.167" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d8adc4bb1803a324070e64a98ae98f38934d91957a99cfb3a43dcbc01bc56439" +checksum = "09d6582e104315a817dff97f75133544b2e094ee22447d2acf4a74e189ba06fc" [[package]] name = 
"lock_api" @@ -225,9 +212,9 @@ checksum = "650eef8c711430f1a879fdd01d4745a7deea475becfb90269c06775983bbf086" [[package]] name = "once_cell" -version = "1.19.0" +version = "1.20.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" +checksum = "1261fe7e33c73b354eab43b1273a57c8f967d0391e80353e51f764ac02cf6775" [[package]] name = "parking_lot" @@ -340,9 +327,9 @@ checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c" [[package]] name = "proc-macro2" -version = "1.0.86" +version = "1.0.92" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5e719e8df665df0d1c8fbfd238015744736151d4445ec0836b8e628aae103b77" +checksum = "37d3544b3f2748c54e147655edb5025752e2303145b5aefb3c3ea2c78b973bb0" dependencies = [ "unicode-ident", ] @@ -388,9 +375,9 @@ dependencies = [ [[package]] name = "redox_syscall" -version = "0.5.3" +version = "0.5.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2a908a6e00f1fdd0dfd9c0eb08ce85126f6d8bbda50017e74bc4a4b7d4a926a4" +checksum = "9b6dfecf2c74bce2466cabf93f6664d6998a69eb21e39f4207930065b27b771f" dependencies = [ "bitflags", ] @@ -405,7 +392,6 @@ checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" name = "scraper" version = "0.21.0" dependencies = [ - "ahash", "cssparser", "ego-tree", "getopts", @@ -437,18 +423,18 @@ dependencies = [ [[package]] name = "serde" -version = "1.0.209" +version = "1.0.215" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "99fce0ffe7310761ca6bf9faf5115afbc19688edd00171d81b1bb1b116c63e09" +checksum = "6513c1ad0b11a9376da888e3e0baa0077f1aed55c17f50e7b2397136129fb88f" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.209" +version = "1.0.215" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"a5831b979fd7b5439637af1752d535ff49f4860c0f341d1baeb6faf0f4242170" +checksum = "ad1e866f866923f252f05c889987993144fb74e722403468a4ebd70c3cd756c0" dependencies = [ "proc-macro2", "quote", @@ -510,9 +496,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.76" +version = "2.0.90" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "578e081a14e0cefc3279b0472138c513f37b41a08d5a3cca9b6e4e8ceb6cd525" +checksum = "919d3b74a5dd0ccd15aeb8f93e7006bd9e14c295087c9896a110f490752bcf31" dependencies = [ "proc-macro2", "quote", @@ -532,15 +518,15 @@ dependencies = [ [[package]] name = "unicode-ident" -version = "1.0.12" +version = "1.0.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" +checksum = "adb9e6ca4f869e1180728b7950e35922a7fc6397f7b641499e8f3ef06e50dc83" [[package]] name = "unicode-width" -version = "0.1.13" +version = "0.1.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0336d538f7abc86d282a4189614dfaa90810dfc2c6f6427eaf88e16311dd225d" +checksum = "7dd6e30e90baa6f72411720665d41d89b9a3d039dc45b8faea1ddd07f617f6af" [[package]] name = "utf-8" @@ -548,12 +534,6 @@ version = "0.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9" -[[package]] -name = "version_check" -version = "0.9.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" - [[package]] name = "wasi" version = "0.11.0+wasi-snapshot-preview1" diff --git a/scraper/Cargo.toml b/scraper/Cargo.toml index 7a0549e3..fe11f8c0 100644 --- a/scraper/Cargo.toml +++ b/scraper/Cargo.toml @@ -13,11 +13,10 @@ repository = "https://github.com/causal-agent/scraper" readme = "README.md" [dependencies] -ahash = "0.8.0" cssparser = "0.34.0" ego-tree = "0.9.0" html5ever = "0.29.0" 
-indexmap = { version = "2.6.0", optional = true } +indexmap = { version = "2.7.0", optional = true } precomputed-hash = "0.1.1" selectors = "0.26.0" tendril = "0.4.3" diff --git a/scraper/src/html/tree_sink.rs b/scraper/src/html/tree_sink.rs index 02d43eb6..49b30b49 100644 --- a/scraper/src/html/tree_sink.rs +++ b/scraper/src/html/tree_sink.rs @@ -223,6 +223,17 @@ impl TreeSink for HtmlTreeSink { }; for attr in attrs { + #[cfg(not(feature = "deterministic"))] + if let Err(idx) = element + .attrs + .binary_search_by(|(name, _)| name.cmp(&attr.name)) + { + element + .attrs + .insert(idx, (attr.name, make_tendril(attr.value))); + } + + #[cfg(feature = "deterministic")] element .attrs .entry(attr.name) diff --git a/scraper/src/node.rs b/scraper/src/node.rs index f2390c3e..4f900857 100644 --- a/scraper/src/node.rs +++ b/scraper/src/node.rs @@ -1,9 +1,5 @@ //! HTML nodes. -#[cfg(not(feature = "deterministic"))] -use ahash::AHashMap as HashMap; -#[cfg(not(feature = "deterministic"))] -use std::collections::hash_map; use std::fmt; use std::ops::Deref; use std::slice::Iter as SliceIter; @@ -219,7 +215,7 @@ pub type Attributes = indexmap::IndexMap<QualName, StrTendril>; /// Please enable the `deterministic` feature for order-preserving /// (de)serialization. #[cfg(not(feature = "deterministic"))] -pub type Attributes = HashMap<QualName, StrTendril>; +pub type Attributes = Vec<(QualName, StrTendril)>; /// An HTML element.
#[derive(Clone, PartialEq, Eq)] @@ -232,16 +228,20 @@ pub struct Element { id: OnceCell<Option<StrTendril>>, - classes: OnceCell<Vec<LocalName>>, + classes: OnceCell<Box<[LocalName]>>, } impl Element { #[doc(hidden)] pub fn new(name: QualName, attributes: Vec<Attribute>) -> Self { - let attrs = attributes + #[allow(unused_mut)] + let mut attrs = attributes .into_iter() - .map(|a| (a.name, crate::tendril_util::make(a.value))) - .collect(); + .map(|attr| (attr.name, crate::tendril_util::make(attr.value))) + .collect::<Attributes>(); + + #[cfg(not(feature = "deterministic"))] + attrs.sort_unstable_by(|lhs, rhs| lhs.0.cmp(&rhs.0)); Element { attrs, @@ -277,17 +277,17 @@ impl Element { /// Returns an iterator over the element's classes. pub fn classes(&self) -> Classes { let classes = self.classes.get_or_init(|| { - let mut classes: Vec<LocalName> = self + let mut classes = self .attrs .iter() .filter(|(name, _)| name.local.as_ref() == "class") - .flat_map(|(_, value)| value.split_whitespace().map(LocalName::from)) - .collect(); + .flat_map(|(_, value)| value.split_ascii_whitespace().map(LocalName::from)) + .collect::<Vec<_>>(); classes.sort_unstable(); classes.dedup(); - classes + classes.into_boxed_slice() }); Classes { @@ -298,7 +298,18 @@ impl Element { /// Returns the value of an attribute. pub fn attr(&self, attr: &str) -> Option<&str> { let qualname = QualName::new(None, ns!(), LocalName::from(attr)); - self.attrs.get(&qualname).map(Deref::deref) + + #[cfg(not(feature = "deterministic"))] + let value = self + .attrs + .binary_search_by(|attr| attr.0.cmp(&qualname)) + .ok() + .map(|idx| &*self.attrs[idx].1); + + #[cfg(feature = "deterministic")] + let value = self.attrs.get(&qualname).map(Deref::deref); + + value } /// Returns an iterator over the element's attributes. @@ -330,7 +341,7 @@ pub type AttributesIter<'a> = indexmap::map::Iter<'a, QualName, StrTendril>; /// An iterator over a node's attributes.
#[cfg(not(feature = "deterministic"))] -pub type AttributesIter<'a> = hash_map::Iter<'a, QualName, StrTendril>; +pub type AttributesIter<'a> = SliceIter<'a, (QualName, StrTendril)>; /// Iterator over attributes. #[allow(missing_debug_implementations)] From 483ecab72112d75a3bb4f8784c9a9d3b20a01507 Mon Sep 17 00:00:00 2001 From: Carlo Federico Vescovo Date: Tue, 10 Dec 2024 00:54:22 +0100 Subject: [PATCH 3/4] Bump ego-tree to version 0.10.0 --- Cargo.lock | 4 ++-- scraper/Cargo.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 70b6588d..5d027678 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -77,9 +77,9 @@ dependencies = [ [[package]] name = "ego-tree" -version = "0.9.0" +version = "0.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7c6ba7d4eec39eaa9ab24d44a0e73a7949a1095a8b3f3abb11eddf27dbb56a53" +checksum = "b2972feb8dffe7bc8c5463b1dacda1b0dfbed3710e50f977d965429692d74cd8" [[package]] name = "equivalent" diff --git a/scraper/Cargo.toml b/scraper/Cargo.toml index fe11f8c0..b2862c94 100644 --- a/scraper/Cargo.toml +++ b/scraper/Cargo.toml @@ -14,7 +14,7 @@ readme = "README.md" [dependencies] cssparser = "0.34.0" -ego-tree = "0.9.0" +ego-tree = "0.10.0" html5ever = "0.29.0" indexmap = { version = "2.7.0", optional = true } precomputed-hash = "0.1.1" From dcf5e0c781f6b1d9a67b48ae0bdf551b04e9bc94 Mon Sep 17 00:00:00 2001 From: Carlo Federico Vescovo Date: Tue, 10 Dec 2024 11:48:43 +0100 Subject: [PATCH 4/4] Version 0.22.0 --- Cargo.lock | 2 +- scraper/Cargo.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 5d027678..5937ff36 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -390,7 +390,7 @@ checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" [[package]] name = "scraper" -version = "0.21.0" +version = "0.22.0" dependencies = [ "cssparser", "ego-tree", diff --git a/scraper/Cargo.toml b/scraper/Cargo.toml index 
b2862c94..0144f3a0 100644 --- a/scraper/Cargo.toml +++ b/scraper/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "scraper" -version = "0.21.0" +version = "0.22.0" edition = "2021" description = "HTML parsing and querying with CSS selectors"
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy