From 05976d0983662723c46db9483b7a6f600540312e Mon Sep 17 00:00:00 2001 From: Adam Reichold Date: Sat, 28 Oct 2023 14:08:09 +0200 Subject: [PATCH 01/14] Add trait to abstract over selectable collections of elements, i.e. Html and ElementRef itself. --- src/lib.rs | 1 + src/selectable.rs | 70 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 71 insertions(+) create mode 100644 src/selectable.rs diff --git a/src/lib.rs b/src/lib.rs index c000283f..7462cb79 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -149,6 +149,7 @@ pub mod element_ref; pub mod error; pub mod html; pub mod node; +pub mod selectable; pub mod selector; #[cfg(feature = "atomic")] diff --git a/src/selectable.rs b/src/selectable.rs new file mode 100644 index 00000000..f77a9976 --- /dev/null +++ b/src/selectable.rs @@ -0,0 +1,70 @@ +//! Provides the [`Selectable`] to abstract over collections of elements + +use crate::{ + element_ref::{self, ElementRef}, + html::{self, Html}, + selector::Selector, +}; + +/// Trait to abstract over collections of elements to which a [CSS selector][Selector] can be applied +/// +/// The mainly enables writing helper functions which are generic over [`Html`] and [`ElementRef`], e.g. +/// +/// ``` +/// use scraper::{selectable::Selectable, selector::Selector}; +/// +/// fn text_of_first_match<'a, S>(selectable: S, selector: &Selector) -> Option +/// where +/// S: Selectable<'a>, +/// { +/// selectable.select(selector).next().map(|element| element.text().collect()) +/// } +/// ``` +pub trait Selectable<'a> { + /// Iterator over [element references][ElementRef] matching a [CSS selector[Selector] + type Select<'b>: Iterator>; + + /// Applies the given `selector` to the collection of elements represented by `self` + fn select(self, selector: &Selector) -> Self::Select<'_>; +} + +impl<'a> Selectable<'a> for &'a Html { + type Select<'b> = html::Select<'a, 'b>; + + fn select(self, selector: &Selector) -> Self::Select<'_> { + Html::select(self, selector) + } +} + +impl<'a> Selectable<'a> for ElementRef<'a> { + type Select<'b> = element_ref::Select<'a, 'b>; + + fn select(self, selector: &Selector) -> Self::Select<'_> { + ElementRef::select(&self, selector) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + fn select_one<'a, S>(selectable: S, selector: &Selector) -> Option> + where + S: Selectable<'a>, + { + selectable.select(selector).next() + } + + #[test] + fn html_and_element_ref_are_selectable() { + let fragment = Html::parse_fragment( + r#""#, + ); + + let selector = Selector::parse("select.foo").unwrap(); + let element = select_one(&fragment, &selector).unwrap(); + + let selector = Selector::parse("select.foo option[value='bar']").unwrap(); + let _element = select_one(element, &selector).unwrap(); + } +} From ee9049a7d35e6581aab3270417fb801a1f374c5e Mon Sep 17 00:00:00 2001 From: Adam Reichold Date: Sun, 29 Oct 2023 20:30:11 +0100 Subject: [PATCH 02/14] Mark both Select iterators as fused to enable downstream optimizations. --- src/element_ref/mod.rs | 3 +++ src/html/mod.rs | 3 +++ 2 files changed, 6 insertions(+) diff --git a/src/element_ref/mod.rs b/src/element_ref/mod.rs index 0485fca6..da3f0620 100644 --- a/src/element_ref/mod.rs +++ b/src/element_ref/mod.rs @@ -1,5 +1,6 @@ //! Element references. +use std::iter::FusedIterator; use std::ops::Deref; use ego_tree::iter::{Edge, Traverse}; @@ -115,6 +116,8 @@ impl<'a, 'b> Iterator for Select<'a, 'b> { } } +impl FusedIterator for Select<'_, '_> {} + /// Iterator over descendent text nodes. #[derive(Debug, Clone)] pub struct Text<'a> { diff --git a/src/html/mod.rs b/src/html/mod.rs index 26ec5ea0..ad7b93f0 100644 --- a/src/html/mod.rs +++ b/src/html/mod.rs @@ -2,6 +2,7 @@ #[cfg(feature = "errors")] use std::borrow::Cow; +use std::iter::FusedIterator; use ego_tree::iter::Nodes; use ego_tree::Tree; @@ -161,6 +162,8 @@ impl<'a, 'b> DoubleEndedIterator for Select<'a, 'b> { } } +impl FusedIterator for Select<'_, '_> {} + mod serializable; mod tree_sink; From dadbf2defac4a4bca59d8212b30e9c2e8294d150 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 30 Oct 2023 06:45:37 +0000 Subject: [PATCH 03/14] Bump ahash from 0.8.3 to 0.8.6 Bumps [ahash](https://github.com/tkaitchuck/ahash) from 0.8.3 to 0.8.6. - [Release notes](https://github.com/tkaitchuck/ahash/releases) - [Commits](https://github.com/tkaitchuck/ahash/commits) --- updated-dependencies: - dependency-name: ahash dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] --- Cargo.lock | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 4148d03a..2f30fc4f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4,14 +4,15 @@ version = 3 [[package]] name = "ahash" -version = "0.8.3" +version = "0.8.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2c99f64d1e06488f620f932677e24bc6e2897582980441ae90a671415bd7ec2f" +checksum = "91429305e9f0a25f6205c5b8e0d2db09e0708a7a6df0f42212bb56c32c8ac97a" dependencies = [ "cfg-if", "getrandom", "once_cell", "version_check", + "zerocopy", ] [[package]] @@ -638,3 +639,23 @@ name = "windows_x86_64_msvc" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" + +[[package]] +name = "zerocopy" +version = "0.7.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dd66a62464e3ffd4e37bd09950c2b9dd6c4f8767380fabba0d523f9a775bc85a" +dependencies = [ + "zerocopy-derive", +] + +[[package]] +name = "zerocopy-derive" +version = "0.7.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "255c4596d41e6916ced49cfafea18727b24d67878fa180ddfd69b9df34fd1726" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.37", +] From 0996a3f930ade8e6987e9185d914a34111ba31ad Mon Sep 17 00:00:00 2001 From: Adam Reichold Date: Mon, 30 Oct 2023 08:04:29 +0100 Subject: [PATCH 04/14] Bump all locked Cargo dependencies include ahash. --- Cargo.lock | 77 ++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 49 insertions(+), 28 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 4148d03a..49083560 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4,14 +4,15 @@ version = 3 [[package]] name = "ahash" -version = "0.8.3" +version = "0.8.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2c99f64d1e06488f620f932677e24bc6e2897582980441ae90a671415bd7ec2f" +checksum = "91429305e9f0a25f6205c5b8e0d2db09e0708a7a6df0f42212bb56c32c8ac97a" dependencies = [ "cfg-if", "getrandom", "once_cell", "version_check", + "zerocopy", ] [[package]] @@ -28,15 +29,15 @@ checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" [[package]] name = "bitflags" -version = "2.4.0" +version = "2.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b4682ae6287fcf752ecaabbfcc7b6f9b72aa33933dc23a554d853aea8eea8635" +checksum = "327762f6e5a765692301e5bb513e0d9fef63be86bbc14528052b1cd3e6f03e07" [[package]] name = "byteorder" -version = "1.4.3" +version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610" +checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" [[package]] name = "cfg-if" @@ -64,7 +65,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "13b588ba4ac1a99f7f2964d24b3d896ddc6bf847ee3855dbd4366f058cfcd331" dependencies = [ "quote", - "syn 2.0.37", + "syn 2.0.38", ] [[package]] @@ -146,9 +147,9 @@ dependencies = [ [[package]] name = "hashbrown" -version = "0.14.1" +version = "0.14.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7dfda62a12f55daeae5015f81b0baea145391cb4520f86c248fc615d72640d12" +checksum = "f93e7192158dbcda357bdec5fb5788eebf8bbac027f3f33e719d29135ae84156" [[package]] name = "html5ever" @@ -182,15 +183,15 @@ checksum = "af150ab688ff2122fcef229be89cb50dd66af9e01a4ff320cc137eecc9bacc38" [[package]] name = "libc" -version = "0.2.148" +version = "0.2.149" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9cdc71e17332e86d2e1d38c1f99edcb6288ee11b815fb1a4b049eaa2114d369b" +checksum = "a08173bc88b7955d1b3145aa561539096c421ac8debde8cbc3612ec635fee29b" [[package]] name = "lock_api" -version = "0.4.10" +version = "0.4.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c1cc9717a20b1bb222f333e6a92fd32f7d8a18ddc5a3191a11af45dcbf4dcd16" +checksum = "3c168f8615b12bc01f9c17e2eb0cc07dcae1940121185446edc3744920e8ef45" dependencies = [ "autocfg", "scopeguard", @@ -246,9 +247,9 @@ dependencies = [ [[package]] name = "parking_lot_core" -version = "0.9.8" +version = "0.9.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "93f00c865fe7cabf650081affecd3871070f26767e7b2070a3ffae14c654b447" +checksum = "4c42a9226546d68acdd9c0a280d17ce19bfe27a46bf68784e4066115788d008e" dependencies = [ "cfg-if", "libc", @@ -316,7 +317,7 @@ dependencies = [ "phf_shared 0.11.2", "proc-macro2", "quote", - "syn 2.0.37", + "syn 2.0.38", ] [[package]] @@ -351,9 +352,9 @@ checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c" [[package]] name = "proc-macro2" -version = "1.0.67" +version = "1.0.69" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3d433d9f1a3e8c1263d9456598b16fec66f4acc9a74dacffd35c7bb09b3a1328" +checksum = "134c189feb4956b20f6f547d2cf727d4c0fe06722b20a0eec87ed445a97f92da" dependencies = [ "unicode-ident", ] @@ -399,9 +400,9 @@ dependencies = [ [[package]] name = "redox_syscall" -version = "0.3.5" +version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "567664f262709473930a4bf9e51bf2ebf3348f2e748ccc50dea20646858f8f29" +checksum = "4722d768eff46b75989dd134e5c353f0d6296e5aaa3132e776cbdb56be7731aa" dependencies = [ "bitflags 1.3.2", ] @@ -433,7 +434,7 @@ version = "0.25.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4eb30575f3638fc8f6815f448d50cb1a2e255b0897985c8c59f4d37b72a07b06" dependencies = [ - "bitflags 2.4.0", + "bitflags 2.4.1", "cssparser", "derive_more", "fxhash", @@ -448,22 +449,22 @@ dependencies = [ [[package]] name = "serde" -version = "1.0.188" +version = "1.0.190" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cf9e0fcba69a370eed61bcf2b728575f726b50b55cba78064753d708ddc7549e" +checksum = "91d3c334ca1ee894a2c6f6ad698fe8c435b76d504b13d436f0685d648d6d96f7" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.188" +version = "1.0.190" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4eca7ac642d82aa35b60049a6eccb4be6be75e599bd2e9adb5f875a737654af2" +checksum = "67c5609f394e5c2bd7fc51efda478004ea80ef42fee983d5c67a65e34f32c0e3" dependencies = [ "proc-macro2", "quote", - "syn 2.0.37", + "syn 2.0.38", ] [[package]] @@ -532,9 +533,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.37" +version = "2.0.38" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7303ef2c05cd654186cb250d29049a24840ca25d2747c25c0381c8d9e2f582e8" +checksum = "e96b79aaa137db8f61e26363a0c9b47d8b4ec75da28b7d1d614c2303e232408b" dependencies = [ "proc-macro2", "quote", @@ -638,3 +639,23 @@ name = "windows_x86_64_msvc" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" + +[[package]] +name = "zerocopy" +version = "0.7.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dd66a62464e3ffd4e37bd09950c2b9dd6c4f8767380fabba0d523f9a775bc85a" +dependencies = [ + "zerocopy-derive", +] + +[[package]] +name = "zerocopy-derive" +version = "0.7.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "255c4596d41e6916ced49cfafea18727b24d67878fa180ddfd69b9df34fd1726" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.38", +] From 4e5b3c638b7580e6462143b0e5c58de4a19c771b Mon Sep 17 00:00:00 2001 From: Adam Reichold Date: Wed, 1 Nov 2023 20:13:50 +0100 Subject: [PATCH 05/14] Add convenience methods to iterate only over child and descendant elements instead of all nodes. --- src/element_ref/mod.rs | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/src/element_ref/mod.rs b/src/element_ref/mod.rs index 0485fca6..20a943db 100644 --- a/src/element_ref/mod.rs +++ b/src/element_ref/mod.rs @@ -81,6 +81,36 @@ impl<'a> ElementRef<'a> { inner: self.traverse(), } } + + /// Iterate over all child nodes which are elements + /// + /// # Example + /// + /// ``` + /// # use scraper::Html; + /// let fragment = Html::parse_fragment("foobarbazqux"); + /// + /// let children = fragment.root_element().child_elements().map(|element| element.value().name()).collect::>(); + /// assert_eq!(children, ["span", "a"]); + /// ``` + pub fn child_elements(&self) -> impl Iterator> { + self.children().filter_map(ElementRef::wrap) + } + + /// Iterate over all descendent nodes which are elements + /// + /// # Example + /// + /// ``` + /// # use scraper::Html; + /// let fragment = Html::parse_fragment("foobarbazqux"); + /// + /// let descendants = fragment.root_element().descendent_elements().map(|element| element.value().name()).collect::>(); + /// assert_eq!(descendants, ["html", "span", "b", "a", "i"]); + /// ``` + pub fn descendent_elements(&self) -> impl Iterator> { + self.descendants().filter_map(ElementRef::wrap) + } } impl<'a> Deref for ElementRef<'a> { From b4ccf39e1ef1068ad32da518d097c40dbbe6e0c2 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 6 Nov 2023 06:48:43 +0000 Subject: [PATCH 06/14] Bump indexmap from 2.0.2 to 2.1.0 Bumps [indexmap](https://github.com/bluss/indexmap) from 2.0.2 to 2.1.0. - [Changelog](https://github.com/bluss/indexmap/blob/master/RELEASES.md) - [Commits](https://github.com/bluss/indexmap/compare/2.0.2...2.1.0) --- updated-dependencies: - dependency-name: indexmap dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- Cargo.lock | 4 ++-- Cargo.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 4148d03a..d60b9016 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -166,9 +166,9 @@ dependencies = [ [[package]] name = "indexmap" -version = "2.0.2" +version = "2.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8adf3ddd720272c6ea8bf59463c04e0f93d0bbf7c5439b691bca2987e0270897" +checksum = "d530e1a18b1cb4c484e6e34556a0d948706958449fca0cab753d649f2bce3d1f" dependencies = [ "equivalent", "hashbrown", diff --git a/Cargo.toml b/Cargo.toml index 21ad948b..4ee87094 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -19,7 +19,7 @@ html5ever = "0.26" selectors = "0.25.0" tendril = "0.4.3" ahash = "0.8" -indexmap = { version = "2.0.2", optional = true } +indexmap = { version = "2.1.0", optional = true } once_cell = "1.0" [dependencies.getopts] From 8068a0ff4d1fe4d4d857ee7b746cd34df24dbca8 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 11 Dec 2023 06:39:54 +0000 Subject: [PATCH 07/14] Bump once_cell from 1.18.0 to 1.19.0 Bumps [once_cell](https://github.com/matklad/once_cell) from 1.18.0 to 1.19.0. - [Changelog](https://github.com/matklad/once_cell/blob/master/CHANGELOG.md) - [Commits](https://github.com/matklad/once_cell/compare/v1.18.0...v1.19.0) --- updated-dependencies: - dependency-name: once_cell dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- Cargo.lock | 4 ++-- Cargo.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 073a1bd6..0880f834 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -231,9 +231,9 @@ checksum = "e4a24736216ec316047a1fc4252e27dabb04218aa4a3f37c6e7ddbf1f9782b54" [[package]] name = "once_cell" -version = "1.18.0" +version = "1.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dd8b5dd2ae5ed71462c540258bedcb51965123ad7e7ccf4b9a8cafaa4a63576d" +checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" [[package]] name = "parking_lot" diff --git a/Cargo.toml b/Cargo.toml index 4ee87094..6bddac82 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -20,7 +20,7 @@ selectors = "0.25.0" tendril = "0.4.3" ahash = "0.8" indexmap = { version = "2.1.0", optional = true } -once_cell = "1.0" +once_cell = "1.19" [dependencies.getopts] version = "0.2.21" From d8af8ead961ae77e86f6cd7a88d3adee7ed8d75b Mon Sep 17 00:00:00 2001 From: Adam Reichold Date: Fri, 22 Dec 2023 11:29:39 +0100 Subject: [PATCH 08/14] Another try at actually using an nth index cache This re-uses the cache only between the match operations for a single select iteration which ensures that the selector is not dropped (and its allocation address re-used) while the cache is alive thereby avoiding the ABA problem fixed by 7fdac0a5c2f23b12dae328837148e19295fad735. This does not require any change in user code to benefit from the increased caching even though some Debug and Clone implementations need to be done manually now. --- src/element_ref/mod.rs | 35 +++++++++++++++++++++++++++++++-- src/html/mod.rs | 44 +++++++++++++++++++++++++++++++++++++----- src/selector.rs | 16 +++++++++++++-- 3 files changed, 86 insertions(+), 9 deletions(-) diff --git a/src/element_ref/mod.rs b/src/element_ref/mod.rs index 3a4df63c..5461041f 100644 --- a/src/element_ref/mod.rs +++ b/src/element_ref/mod.rs @@ -1,11 +1,13 @@ //! Element references. +use std::fmt; use std::iter::FusedIterator; use std::ops::Deref; use ego_tree::iter::{Edge, Traverse}; use ego_tree::NodeRef; use html5ever::serialize::{serialize, SerializeOpts, TraversalScope}; +use selectors::NthIndexCache; use crate::node::Element; use crate::{Node, Selector}; @@ -47,6 +49,7 @@ impl<'a> ElementRef<'a> { scope: *self, inner, selector, + nth_index_cache: NthIndexCache::default(), } } @@ -122,11 +125,33 @@ impl<'a> Deref for ElementRef<'a> { } /// Iterator over descendent elements matching a selector. -#[derive(Debug, Clone)] pub struct Select<'a, 'b> { scope: ElementRef<'a>, inner: Traverse<'a, Node>, selector: &'b Selector, + nth_index_cache: NthIndexCache, +} + +impl fmt::Debug for Select<'_, '_> { + fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { + fmt.debug_struct("Select") + .field("scope", &self.scope) + .field("inner", &self.inner) + .field("selector", &self.selector) + .field("nth_index_cache", &"..") + .finish() + } +} + +impl Clone for Select<'_, '_> { + fn clone(&self) -> Self { + Self { + scope: self.scope, + inner: self.inner.clone(), + selector: self.selector, + nth_index_cache: NthIndexCache::default(), + } + } } impl<'a, 'b> Iterator for Select<'a, 'b> { @@ -136,7 +161,11 @@ impl<'a, 'b> Iterator for Select<'a, 'b> { for edge in &mut self.inner { if let Edge::Open(node) = edge { if let Some(element) = ElementRef::wrap(node) { - if self.selector.matches_with_scope(&element, Some(self.scope)) { + if self.selector.matches_with_scope_and_cache( + &element, + Some(self.scope), + &mut self.nth_index_cache, + ) { return Some(element); } } @@ -169,6 +198,8 @@ impl<'a> Iterator for Text<'a> { } } +impl FusedIterator for Text<'_> {} + mod element; mod serializable; diff --git a/src/html/mod.rs b/src/html/mod.rs index ad7b93f0..5178149c 100644 --- a/src/html/mod.rs +++ b/src/html/mod.rs @@ -2,14 +2,15 @@ #[cfg(feature = "errors")] use std::borrow::Cow; +use std::fmt; use std::iter::FusedIterator; use ego_tree::iter::Nodes; use ego_tree::Tree; use html5ever::serialize::SerializeOpts; use html5ever::tree_builder::QuirksMode; -use html5ever::QualName; -use html5ever::{driver, serialize}; +use html5ever::{driver, serialize, QualName}; +use selectors::NthIndexCache; use tendril::TendrilSink; use crate::selector::Selector; @@ -94,6 +95,7 @@ impl Html { Select { inner: self.tree.nodes(), selector, + nth_index_cache: NthIndexCache::default(), } } @@ -122,10 +124,30 @@ impl Html { } /// Iterator over elements matching a selector. -#[derive(Debug)] pub struct Select<'a, 'b> { inner: Nodes<'a, Node>, selector: &'b Selector, + nth_index_cache: NthIndexCache, +} + +impl fmt::Debug for Select<'_, '_> { + fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { + fmt.debug_struct("Select") + .field("inner", &self.inner) + .field("selector", &self.selector) + .field("nth_index_cache", &"..") + .finish() + } +} + +impl Clone for Select<'_, '_> { + fn clone(&self) -> Self { + Self { + inner: self.inner.clone(), + selector: self.selector, + nth_index_cache: NthIndexCache::default(), + } + } } impl<'a, 'b> Iterator for Select<'a, 'b> { @@ -134,7 +156,13 @@ impl<'a, 'b> Iterator for Select<'a, 'b> { fn next(&mut self) -> Option> { for node in self.inner.by_ref() { if let Some(element) = ElementRef::wrap(node) { - if element.parent().is_some() && self.selector.matches(&element) { + if element.parent().is_some() + && self.selector.matches_with_scope_and_cache( + &element, + None, + &mut self.nth_index_cache, + ) + { return Some(element); } } @@ -153,7 +181,13 @@ impl<'a, 'b> DoubleEndedIterator for Select<'a, 'b> { fn next_back(&mut self) -> Option { for node in self.inner.by_ref().rev() { if let Some(element) = ElementRef::wrap(node) { - if element.parent().is_some() && self.selector.matches(&element) { + if element.parent().is_some() + && self.selector.matches_with_scope_and_cache( + &element, + None, + &mut self.nth_index_cache, + ) + { return Some(element); } } diff --git a/src/selector.rs b/src/selector.rs index 68f880cc..7ef13f2f 100644 --- a/src/selector.rs +++ b/src/selector.rs @@ -8,6 +8,7 @@ use html5ever::{LocalName, Namespace}; use selectors::{ matching, parser::{self, ParseRelative, SelectorList, SelectorParseErrorKind}, + NthIndexCache, }; use crate::error::SelectorErrorKind; @@ -42,11 +43,22 @@ impl Selector { /// The optional `scope` argument is used to specify which element has `:scope` pseudo-class. /// When it is `None`, `:scope` will match the root element. pub fn matches_with_scope(&self, element: &ElementRef, scope: Option) -> bool { - let mut nth_index_cache = Default::default(); + self.matches_with_scope_and_cache(element, scope, &mut NthIndexCache::default()) + } + + // The `nth_index_cache` must not be used after `self` is dropped + // to avoid incorrect results (even though no undefined behaviour is possible) + // due to the usage of selector memory addresses as cache keys. + pub(crate) fn matches_with_scope_and_cache( + &self, + element: &ElementRef, + scope: Option, + nth_index_cache: &mut NthIndexCache, + ) -> bool { let mut context = matching::MatchingContext::new( matching::MatchingMode::Normal, None, - &mut nth_index_cache, + nth_index_cache, matching::QuirksMode::NoQuirks, matching::NeedsSelectorFlags::No, matching::IgnoreNthChildForInvalidation::No, From 8bd467ae32c9fb392d09d9fea3d77bb9804f9c48 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 1 Jan 2024 06:37:46 +0000 Subject: [PATCH 09/14] Bump ahash from 0.8.6 to 0.8.7 Bumps [ahash](https://github.com/tkaitchuck/ahash) from 0.8.6 to 0.8.7. - [Release notes](https://github.com/tkaitchuck/ahash/releases) - [Commits](https://github.com/tkaitchuck/ahash/commits) --- updated-dependencies: - dependency-name: ahash dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] --- Cargo.lock | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 0880f834..e63d2dfe 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4,9 +4,9 @@ version = 3 [[package]] name = "ahash" -version = "0.8.6" +version = "0.8.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "91429305e9f0a25f6205c5b8e0d2db09e0708a7a6df0f42212bb56c32c8ac97a" +checksum = "77c3a9648d43b9cd48db467b3f87fdd6e146bcc88ab0180006cef2179fe11d01" dependencies = [ "cfg-if", "getrandom", @@ -642,18 +642,18 @@ checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" [[package]] name = "zerocopy" -version = "0.7.20" +version = "0.7.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dd66a62464e3ffd4e37bd09950c2b9dd6c4f8767380fabba0d523f9a775bc85a" +checksum = "74d4d3961e53fa4c9a25a8637fc2bfaf2595b3d3ae34875568a5cf64787716be" dependencies = [ "zerocopy-derive", ] [[package]] name = "zerocopy-derive" -version = "0.7.20" +version = "0.7.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "255c4596d41e6916ced49cfafea18727b24d67878fa180ddfd69b9df34fd1726" +checksum = "9ce1b18ccd8e73a9321186f97e46f9f04b778851177567b1975109d26a08d2a6" dependencies = [ "proc-macro2", "quote", From 51485a0dd8ae724c4e992c937c1106f22227682b Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 29 Jan 2024 06:44:26 +0000 Subject: [PATCH 10/14] Bump indexmap from 2.1.0 to 2.2.1 Bumps [indexmap](https://github.com/indexmap-rs/indexmap) from 2.1.0 to 2.2.1. - [Changelog](https://github.com/indexmap-rs/indexmap/blob/master/RELEASES.md) - [Commits](https://github.com/indexmap-rs/indexmap/compare/2.1.0...2.2.1) --- updated-dependencies: - dependency-name: indexmap dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- Cargo.lock | 4 ++-- Cargo.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index e63d2dfe..0126ddde 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -167,9 +167,9 @@ dependencies = [ [[package]] name = "indexmap" -version = "2.1.0" +version = "2.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d530e1a18b1cb4c484e6e34556a0d948706958449fca0cab753d649f2bce3d1f" +checksum = "433de089bd45971eecf4668ee0ee8f4cec17db4f8bd8f7bc3197a6ce37aa7d9b" dependencies = [ "equivalent", "hashbrown", diff --git a/Cargo.toml b/Cargo.toml index 6bddac82..819ed57b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -19,7 +19,7 @@ html5ever = "0.26" selectors = "0.25.0" tendril = "0.4.3" ahash = "0.8" -indexmap = { version = "2.1.0", optional = true } +indexmap = { version = "2.2.1", optional = true } once_cell = "1.19" [dependencies.getopts] From 1775ac7c3acbc4eded30d72454ea0dc4c12677db Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 5 Feb 2024 06:07:11 +0000 Subject: [PATCH 11/14] Bump indexmap from 2.2.1 to 2.2.2 Bumps [indexmap](https://github.com/indexmap-rs/indexmap) from 2.2.1 to 2.2.2. - [Changelog](https://github.com/indexmap-rs/indexmap/blob/master/RELEASES.md) - [Commits](https://github.com/indexmap-rs/indexmap/compare/2.2.1...2.2.2) --- updated-dependencies: - dependency-name: indexmap dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] --- Cargo.lock | 4 ++-- Cargo.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 0126ddde..3030ba45 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -167,9 +167,9 @@ dependencies = [ [[package]] name = "indexmap" -version = "2.2.1" +version = "2.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "433de089bd45971eecf4668ee0ee8f4cec17db4f8bd8f7bc3197a6ce37aa7d9b" +checksum = "824b2ae422412366ba479e8111fd301f7b5faece8149317bb81925979a53f520" dependencies = [ "equivalent", "hashbrown", diff --git a/Cargo.toml b/Cargo.toml index 819ed57b..8e3bcd89 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -19,7 +19,7 @@ html5ever = "0.26" selectors = "0.25.0" tendril = "0.4.3" ahash = "0.8" -indexmap = { version = "2.2.1", optional = true } +indexmap = { version = "2.2.2", optional = true } once_cell = "1.19" [dependencies.getopts] From 6634e9dd14a89e644e3900bdd14c298e16c0c628 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 12 Feb 2024 06:52:07 +0000 Subject: [PATCH 12/14] Bump indexmap from 2.2.2 to 2.2.3 Bumps [indexmap](https://github.com/indexmap-rs/indexmap) from 2.2.2 to 2.2.3. - [Changelog](https://github.com/indexmap-rs/indexmap/blob/master/RELEASES.md) - [Commits](https://github.com/indexmap-rs/indexmap/compare/2.2.2...2.2.3) --- updated-dependencies: - dependency-name: indexmap dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] --- Cargo.lock | 4 ++-- Cargo.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 3030ba45..0f3704f7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -167,9 +167,9 @@ dependencies = [ [[package]] name = "indexmap" -version = "2.2.2" +version = "2.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "824b2ae422412366ba479e8111fd301f7b5faece8149317bb81925979a53f520" +checksum = "233cf39063f058ea2caae4091bf4a3ef70a653afbc026f5c4a4135d114e3c177" dependencies = [ "equivalent", "hashbrown", diff --git a/Cargo.toml b/Cargo.toml index 8e3bcd89..ce55a928 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -19,7 +19,7 @@ html5ever = "0.26" selectors = "0.25.0" tendril = "0.4.3" ahash = "0.8" -indexmap = { version = "2.2.2", optional = true } +indexmap = { version = "2.2.3", optional = true } once_cell = "1.19" [dependencies.getopts] From 67fc720e4b09392ba8ad13ce156c4b29b7b71afb Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 26 Feb 2024 06:46:50 +0000 Subject: [PATCH 13/14] Bump ahash from 0.8.7 to 0.8.9 Bumps [ahash](https://github.com/tkaitchuck/ahash) from 0.8.7 to 0.8.9. - [Release notes](https://github.com/tkaitchuck/ahash/releases) - [Commits](https://github.com/tkaitchuck/ahash/compare/0.8.7...v0.8.9) --- updated-dependencies: - dependency-name: ahash dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] --- Cargo.lock | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 3030ba45..b892d4f4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4,9 +4,9 @@ version = 3 [[package]] name = "ahash" -version = "0.8.7" +version = "0.8.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "77c3a9648d43b9cd48db467b3f87fdd6e146bcc88ab0180006cef2179fe11d01" +checksum = "d713b3834d76b85304d4d525563c1276e2e30dc97cc67bfb4585a4a29fc2c89f" dependencies = [ "cfg-if", "getrandom", From 1e123525b811ca869f9b19c1335235a7fed5a382 Mon Sep 17 00:00:00 2001 From: Carlo Federico Vescovo Date: Thu, 29 Feb 2024 09:55:59 +0100 Subject: [PATCH 14/14] Version 0.19.0 --- Cargo.lock | 2 +- Cargo.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 6c374f38..4f83a3c4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -415,7 +415,7 @@ checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" [[package]] name = "scraper" -version = "0.18.1" +version = "0.19.0" dependencies = [ "ahash", "cssparser", diff --git a/Cargo.toml b/Cargo.toml index ce55a928..304f68d9 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "scraper" -version = "0.18.1" +version = "0.19.0" edition = "2021" description = "HTML parsing and querying with CSS selectors" pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy