From 52dba3b87428901693ba8e7f65197f8aa3cbf86c Mon Sep 17 00:00:00 2001 From: Tim McNamara Date: Wed, 2 Jun 2021 14:15:10 +1200 Subject: [PATCH] Benchmark other methods mentioned in README The word boundary extensions to &str/String behave in a very similiar, but not identical manner to .graphemes(). For example, Mandarin to slow(ish) on .graphemes() but fast(ish) on .word_boundaries() whereas languages with whitespace-delimited words tend to have the same per- formance characteristics with the latter methods. As the library develops, it would be worthwhile to monitor the speed of the rest of the documented API. --- Cargo.toml | 8 +++++ benches/unicode_words.rs | 64 ++++++++++++++++++++++++++++++++++++++++ benches/word_bounds.rs | 64 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 136 insertions(+) create mode 100644 benches/unicode_words.rs create mode 100644 benches/word_bounds.rs diff --git a/Cargo.toml b/Cargo.toml index bf237cf..981394e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -28,4 +28,12 @@ bencher = "0.1" [[bench]] name = "graphemes" +harness = false + +[[bench]] +name = "unicode_words" +harness = false + +[[bench]] +name = "word_bounds" harness = false \ No newline at end of file diff --git a/benches/unicode_words.rs b/benches/unicode_words.rs new file mode 100644 index 0000000..731e325 --- /dev/null +++ b/benches/unicode_words.rs @@ -0,0 +1,64 @@ +#[macro_use] +extern crate bencher; +extern crate unicode_segmentation; + +use bencher::Bencher; +use unicode_segmentation::UnicodeSegmentation; +use std::fs; + +fn unicode_words(bench: &mut Bencher, path: &str) { + let text = fs::read_to_string(path).unwrap(); + bench.iter(|| { + for w in text.unicode_words() { + bencher::black_box(w); + } + }); + + bench.bytes = text.len() as u64; +} + +fn unicode_words_arabic(bench: &mut Bencher) { + unicode_words(bench, "benches/texts/arabic.txt"); +} + +fn unicode_words_english(bench: &mut Bencher) { + unicode_words(bench, "benches/texts/english.txt"); +} + +fn unicode_words_hindi(bench: &mut Bencher) { + unicode_words(bench, "benches/texts/hindi.txt"); +} + +fn unicode_words_japanese(bench: &mut Bencher) { + unicode_words(bench, "benches/texts/japanese.txt"); +} + +fn unicode_words_korean(bench: &mut Bencher) { + unicode_words(bench, "benches/texts/korean.txt"); +} + +fn unicode_words_mandarin(bench: &mut Bencher) { + unicode_words(bench, "benches/texts/mandarin.txt"); +} + +fn unicode_words_russian(bench: &mut Bencher) { + unicode_words(bench, "benches/texts/russian.txt"); +} + +fn unicode_words_source_code(bench: &mut Bencher) { + unicode_words(bench, "benches/texts/source_code.txt"); +} + +benchmark_group!( + benches, + unicode_words_arabic, + unicode_words_english, + unicode_words_hindi, + unicode_words_japanese, + unicode_words_korean, + unicode_words_mandarin, + unicode_words_russian, + unicode_words_source_code, +); + +benchmark_main!(benches); diff --git a/benches/word_bounds.rs b/benches/word_bounds.rs new file mode 100644 index 0000000..035f57e --- /dev/null +++ b/benches/word_bounds.rs @@ -0,0 +1,64 @@ +#[macro_use] +extern crate bencher; +extern crate unicode_segmentation; + +use bencher::Bencher; +use unicode_segmentation::UnicodeSegmentation; +use std::fs; + +fn word_bounds(bench: &mut Bencher, path: &str) { + let text = fs::read_to_string(path).unwrap(); + bench.iter(|| { + for w in text.split_word_bounds() { + bencher::black_box(w); + } + }); + + bench.bytes = text.len() as u64; +} + +fn word_bounds_arabic(bench: &mut Bencher) { + word_bounds(bench, "benches/texts/arabic.txt"); +} + +fn word_bounds_english(bench: &mut Bencher) { + word_bounds(bench, "benches/texts/english.txt"); +} + +fn word_bounds_hindi(bench: &mut Bencher) { + word_bounds(bench, "benches/texts/hindi.txt"); +} + +fn word_bounds_japanese(bench: &mut Bencher) { + word_bounds(bench, "benches/texts/japanese.txt"); +} + +fn word_bounds_korean(bench: &mut Bencher) { + word_bounds(bench, "benches/texts/korean.txt"); +} + +fn word_bounds_mandarin(bench: &mut Bencher) { + word_bounds(bench, "benches/texts/mandarin.txt"); +} + +fn word_bounds_russian(bench: &mut Bencher) { + word_bounds(bench, "benches/texts/russian.txt"); +} + +fn word_bounds_source_code(bench: &mut Bencher) { + word_bounds(bench, "benches/texts/source_code.txt"); +} + +benchmark_group!( + benches, + word_bounds_arabic, + word_bounds_english, + word_bounds_hindi, + word_bounds_japanese, + word_bounds_korean, + word_bounds_mandarin, + word_bounds_russian, + word_bounds_source_code, +); + +benchmark_main!(benches); pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy