diff --git a/Cargo.lock b/Cargo.lock
index 5937ff36..50df5256 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -148,9 +148,9 @@ dependencies = [
 
 [[package]]
 name = "indexmap"
-version = "2.7.0"
+version = "2.7.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "62f822373a4fe84d4bb149bf54e584a7f4abec90e072ed49cda0edea5b95471f"
+checksum = "8c9c992b02b5b4c94ea26e32fe5bccb7aa7d9f390ab5c1221ff895bc7ea8b652"
 dependencies = [
  "equivalent",
  "hashbrown",
@@ -390,7 +390,7 @@ checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49"
 
 [[package]]
 name = "scraper"
-version = "0.22.0"
+version = "0.23.1"
 dependencies = [
  "cssparser",
  "ego-tree",
@@ -399,6 +399,7 @@ dependencies = [
  "indexmap",
  "precomputed-hash",
  "selectors",
+ "serde",
  "tendril",
 ]
 
@@ -423,18 +424,18 @@ dependencies = [
 
 [[package]]
 name = "serde"
-version = "1.0.215"
+version = "1.0.218"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6513c1ad0b11a9376da888e3e0baa0077f1aed55c17f50e7b2397136129fb88f"
+checksum = "e8dfc9d19bdbf6d17e22319da49161d5d0108e4188e8b680aef6299eed22df60"
 dependencies = [
  "serde_derive",
 ]
 
 [[package]]
 name = "serde_derive"
-version = "1.0.215"
+version = "1.0.218"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ad1e866f866923f252f05c889987993144fb74e722403468a4ebd70c3cd756c0"
+checksum = "f09503e191f4e797cb8aac08e9a4a4695c5edf6a2e70e376d961ddd5c969f82b"
 dependencies = [
  "proc-macro2",
  "quote",
diff --git a/LICENSE b/LICENSE
index 3c787528..bb793d8a 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,5 +1,6 @@
 Copyright © 2016, June McEnroe
 Copyright © 2017, Vivek Kushwaha
+Copyright © 2024-2025, rust-scraper Contributors
 
 Permission to use, copy, modify, and/or distribute this software for any
 purpose with or without fee is hereby granted, provided that the above
diff --git a/README.md b/README.md
deleted file mode 100644
index 39450ec2..00000000
--- a/README.md
+++ /dev/null
@@ -1,152 +0,0 @@
-# scraper
-
-[![crates.io](https://img.shields.io/crates/v/scraper?color=dark-green)][crate]
-[![downloads](https://img.shields.io/crates/d/scraper)][crate]
-[![test](https://github.com/causal-agent/scraper/actions/workflows/test.yml/badge.svg)][tests]
-
-HTML parsing and querying with CSS selectors.
-
-`scraper` is on [Crates.io][crate] and [GitHub][github].
-
-[crate]: https://crates.io/crates/scraper
-[github]: https://github.com/causal-agent/scraper
-[tests]: https://github.com/causal-agent/scraper/actions/workflows/test.yml
-
-Scraper provides an interface to Servo's `html5ever` and `selectors` crates, for browser-grade parsing and querying.
-
-## Examples
-
-### Parsing a document
-
-```rust
-use scraper::Html;
-
-let html = r#"
-    <!DOCTYPE html>
-    <meta charset="utf-8">
-    <title>Hello, world!</title>
-    <h1 class="foo">Hello, <i>world!</i></h1>
-"#;
-
-let document = Html::parse_document(html);
-```
-
-### Parsing a fragment
-
-```rust
-use scraper::Html;
-let fragment = Html::parse_fragment("<h1>Hello, <i>world!</i></h1>");
-```
-
-### Parsing a selector
-
-```rust
-use scraper::Selector;
-let selector = Selector::parse("h1.foo").unwrap();
-```
-
-### Selecting elements
-
-```rust
-use scraper::{Html, Selector};
-
-let html = r#"
-    <ul>
-        <li>Foo</li>
-        <li>Bar</li>
-        <li>Baz</li>
-    </ul>
-"#;
-
-let fragment = Html::parse_fragment(html);
-let selector = Selector::parse("li").unwrap();
-
-for element in fragment.select(&selector) {
-    assert_eq!("li", element.value().name());
-}
-```
-
-### Selecting descendent elements
-
-```rust
-use scraper::{Html, Selector};
-
-let html = r#"
-    <ul>
-        <li>Foo</li>
-        <li>Bar</li>
-        <li>Baz</li>
-    </ul>
-"#;
-
-let fragment = Html::parse_fragment(html);
-let ul_selector = Selector::parse("ul").unwrap();
-let li_selector = Selector::parse("li").unwrap();
-
-let ul = fragment.select(&ul_selector).next().unwrap();
-for element in ul.select(&li_selector) {
-    assert_eq!("li", element.value().name());
-}
-```
-
-### Accessing element attributes
-
-```rust
-use scraper::{Html, Selector};
-
-let fragment = Html::parse_fragment(r#"<input name="foo" value="bar">"#);
-let selector = Selector::parse(r#"input[name="foo"]"#).unwrap();
-
-let input = fragment.select(&selector).next().unwrap();
-assert_eq!(Some("bar"), input.value().attr("value"));
-```
-
-### Serializing HTML and inner HTML
-
-```rust
-use scraper::{Html, Selector};
-
-let fragment = Html::parse_fragment("<h1>Hello, <i>world!</i></h1>");
-let selector = Selector::parse("h1").unwrap();
-
-let h1 = fragment.select(&selector).next().unwrap();
-
-assert_eq!("<h1>Hello, <i>world!</i></h1>", h1.html());
-assert_eq!("Hello, <i>world!</i>", h1.inner_html());
-```
-
-### Accessing descendent text
-
-```rust
-use scraper::{Html, Selector};
-
-let fragment = Html::parse_fragment("<h1>Hello, <i>world!</i></h1>");
-let selector = Selector::parse("h1").unwrap();
-
-let h1 = fragment.select(&selector).next().unwrap();
-let text = h1.text().collect::<Vec<_>>();
-
-assert_eq!(vec!["Hello, ", "world!"], text);
-```
-
-### Manipulating the DOM
-
-```rust
-use html5ever::tree_builder::TreeSink;
-use scraper::{Html, Selector};
-
-let html = "<html><body>hello<p class=\"hello\">REMOVE ME</p></body></html>";
-let selector = Selector::parse(".hello").unwrap();
-let mut document = Html::parse_document(html);
-let node_ids: Vec<_> = document.select(&selector).map(|x| x.id()).collect();
-for id in node_ids {
-    document.remove_from_parent(&id);
-}
-assert_eq!(document.html(), "<html><head></head><body>hello</body></html>");
-```
-
-## Contributing
-
-Please feel free to open pull requests. If you're planning on implementing
-something big (i.e. not fixing a typo, a small bug fix, minor refactor, etc)
-then please open an issue first.
diff --git a/README.md b/README.md
new file mode 120000
index 00000000..a6541ddb
--- /dev/null
+++ b/README.md
@@ -0,0 +1 @@
+scraper/README.md
\ No newline at end of file
diff --git a/scraper/Cargo.toml b/scraper/Cargo.toml
index 0144f3a0..b88d6f49 100644
--- a/scraper/Cargo.toml
+++ b/scraper/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "scraper"
-version = "0.22.0"
+version = "0.23.1"
 edition = "2021"
 
 description = "HTML parsing and querying with CSS selectors"
@@ -16,9 +16,10 @@ readme = "README.md"
 cssparser = "0.34.0"
 ego-tree = "0.10.0"
 html5ever = "0.29.0"
-indexmap = { version = "2.7.0", optional = true }
+indexmap = { version = "2.7.1", optional = true }
 precomputed-hash = "0.1.1"
 selectors = "0.26.0"
+serde = { version = "1.0.218", optional = true }
 tendril = "0.4.3"
 
 [dependencies.getopts]
@@ -31,6 +32,7 @@ deterministic = ["indexmap"]
 main = ["getopts"]
 atomic = []
 errors = []
+serde = ["dep:serde"]
 
 [[bin]]
 name = "scraper"
diff --git a/scraper/README.md b/scraper/README.md
index 3dfb7d79..39450ec2 120000
--- a/scraper/README.md
+++ b/scraper/README.md
@@ -1 +1,152 @@
-./../README.md
\ No newline at end of file
+# scraper
+
+[![crates.io](https://img.shields.io/crates/v/scraper?color=dark-green)][crate]
+[![downloads](https://img.shields.io/crates/d/scraper)][crate]
+[![test](https://github.com/causal-agent/scraper/actions/workflows/test.yml/badge.svg)][tests]
+
+HTML parsing and querying with CSS selectors.
+
+`scraper` is on [Crates.io][crate] and [GitHub][github].
+
+[crate]: https://crates.io/crates/scraper
+[github]: https://github.com/causal-agent/scraper
+[tests]: https://github.com/causal-agent/scraper/actions/workflows/test.yml
+
+Scraper provides an interface to Servo's `html5ever` and `selectors` crates, for browser-grade parsing and querying.
+
+## Examples
+
+### Parsing a document
+
+```rust
+use scraper::Html;
+
+let html = r#"
+    <!DOCTYPE html>
+    <meta charset="utf-8">
+    <title>Hello, world!</title>
+    <h1 class="foo">Hello, <i>world!</i></h1>
+"#;
+
+let document = Html::parse_document(html);
+```
+
+### Parsing a fragment
+
+```rust
+use scraper::Html;
+let fragment = Html::parse_fragment("<h1>Hello, <i>world!</i></h1>");
+```
+
+### Parsing a selector
+
+```rust
+use scraper::Selector;
+let selector = Selector::parse("h1.foo").unwrap();
+```
+
+### Selecting elements
+
+```rust
+use scraper::{Html, Selector};
+
+let html = r#"
+    <ul>
+        <li>Foo</li>
+        <li>Bar</li>
+        <li>Baz</li>
+    </ul>
+"#;
+
+let fragment = Html::parse_fragment(html);
+let selector = Selector::parse("li").unwrap();
+
+for element in fragment.select(&selector) {
+    assert_eq!("li", element.value().name());
+}
+```
+
+### Selecting descendent elements
+
+```rust
+use scraper::{Html, Selector};
+
+let html = r#"
+    <ul>
+        <li>Foo</li>
+        <li>Bar</li>
+        <li>Baz</li>
+    </ul>
+"#;
+
+let fragment = Html::parse_fragment(html);
+let ul_selector = Selector::parse("ul").unwrap();
+let li_selector = Selector::parse("li").unwrap();
+
+let ul = fragment.select(&ul_selector).next().unwrap();
+for element in ul.select(&li_selector) {
+    assert_eq!("li", element.value().name());
+}
+```
+
+### Accessing element attributes
+
+```rust
+use scraper::{Html, Selector};
+
+let fragment = Html::parse_fragment(r#"<input name="foo" value="bar">"#);
+let selector = Selector::parse(r#"input[name="foo"]"#).unwrap();
+
+let input = fragment.select(&selector).next().unwrap();
+assert_eq!(Some("bar"), input.value().attr("value"));
+```
+
+### Serializing HTML and inner HTML
+
+```rust
+use scraper::{Html, Selector};
+
+let fragment = Html::parse_fragment("<h1>Hello, <i>world!</i></h1>");
+let selector = Selector::parse("h1").unwrap();
+
+let h1 = fragment.select(&selector).next().unwrap();
+
+assert_eq!("<h1>Hello, <i>world!</i></h1>", h1.html());
+assert_eq!("Hello, <i>world!</i>", h1.inner_html());
+```
+
+### Accessing descendent text
+
+```rust
+use scraper::{Html, Selector};
+
+let fragment = Html::parse_fragment("<h1>Hello, <i>world!</i></h1>");
+let selector = Selector::parse("h1").unwrap();
+
+let h1 = fragment.select(&selector).next().unwrap();
+let text = h1.text().collect::<Vec<_>>();
+
+assert_eq!(vec!["Hello, ", "world!"], text);
+```
+
+### Manipulating the DOM
+
+```rust
+use html5ever::tree_builder::TreeSink;
+use scraper::{Html, Selector};
+
+let html = "<html><body>hello<p class=\"hello\">REMOVE ME</p></body></html>";
+let selector = Selector::parse(".hello").unwrap();
+let mut document = Html::parse_document(html);
+let node_ids: Vec<_> = document.select(&selector).map(|x| x.id()).collect();
+for id in node_ids {
+    document.remove_from_parent(&id);
+}
+assert_eq!(document.html(), "<html><head></head><body>hello</body></html>");
+```
+
+## Contributing
+
+Please feel free to open pull requests. If you're planning on implementing
+something big (i.e. not fixing a typo, a small bug fix, minor refactor, etc)
+then please open an issue first.
diff --git a/scraper/src/html/tree_sink.rs b/scraper/src/html/tree_sink.rs
index 49b30b49..8af46156 100644
--- a/scraper/src/html/tree_sink.rs
+++ b/scraper/src/html/tree_sink.rs
@@ -11,7 +11,7 @@ use std::cell::{Ref, RefCell};
 
 /// Wraps `Html` instances as sinks to drive parsing
 #[derive(Debug)]
-pub struct HtmlTreeSink(RefCell<Html>);
+pub struct HtmlTreeSink(pub RefCell<Html>);
 
 impl HtmlTreeSink {
     /// Wrap a `Html`instance as a sink to drive parsing
diff --git a/scraper/src/selector.rs b/scraper/src/selector.rs
index 4f76c2b8..a8f23e46 100644
--- a/scraper/src/selector.rs
+++ b/scraper/src/selector.rs
@@ -11,6 +11,9 @@ use selectors::{
     parser::{self, ParseRelative, SelectorList, SelectorParseErrorKind},
 };
 
+#[cfg(feature = "serde")]
+use serde::{de::Visitor, Deserialize, Serialize};
+
 use crate::error::SelectorErrorKind;
 use crate::ElementRef;
 
@@ -80,6 +83,36 @@ impl ToCss for Selector {
     }
 }
 
+#[cfg(feature = "serde")]
+impl Serialize for Selector {
+    fn serialize<S: serde::Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
+        serializer.serialize_str(&self.to_css_string())
+    }
+}
+
+#[cfg(feature = "serde")]
+impl<'de> Deserialize<'de> for Selector {
+    fn deserialize<D: serde::Deserializer<'de>>(deserializer: D) -> Result<Self, D::Error> {
+        deserializer.deserialize_str(SelectorVisitor)
+    }
+}
+
+#[cfg(feature = "serde")]
+struct SelectorVisitor;
+
+#[cfg(feature = "serde")]
+impl Visitor<'_> for SelectorVisitor {
+    type Value = Selector;
+
+    fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
+        write!(formatter, "a css selector string")
+    }
+
+    fn visit_str<E: serde::de::Error>(self, v: &str) -> Result<Self::Value, E> {
+        Selector::parse(v).map_err(serde::de::Error::custom)
+    }
+}
+
 /// An implementation of `Parser` for `selectors`
 #[derive(Clone, Copy, Debug)]
 pub struct Parser;

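Note on the new `serde` feature: `Selector` is serialized as its CSS text and deserialized by re-parsing with `Selector::parse`, so a malformed selector surfaces as a serde error. A minimal usage sketch, not part of this patch; it assumes the feature is enabled (`scraper = { version = "0.23.1", features = ["serde"] }`) and pulls in `serde_json` purely for demonstration:

```rust
use scraper::{Html, Selector};

fn main() {
    let selector = Selector::parse("h1.foo").unwrap();

    // Serialize: the new impl writes the selector back out as a CSS string.
    let json = serde_json::to_string(&selector).unwrap();
    println!("serialized selector: {json}");

    // Deserialize: the visitor re-parses the string via Selector::parse,
    // so an invalid selector becomes a serde error instead of a later panic.
    let restored: Selector = serde_json::from_str(&json).unwrap();

    let fragment = Html::parse_fragment(r#"<h1 class="foo">Hello, world!</h1>"#);
    assert!(fragment.select(&restored).next().is_some());

    // Error path: a string that is not a valid selector fails to deserialize.
    assert!(serde_json::from_str::<Selector>("\"<<not a selector\"").is_err());
}
```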