From d6a6eb848214135095e9034c24de748c4dfcbe53 Mon Sep 17 00:00:00 2001 From: Jonas Heinrich Date: Tue, 16 Apr 2024 11:50:57 +0200 Subject: [PATCH] Modify benchmarks to compare against stdlib functions This commit refactors and expands the microbenchmarks in order to evaluate the performance hit of handling full unicode. It is expected that `unicode-segmentation`'s functions are slower since they consider graphemes, the question is just how much. - bump criterion dependency - rename benchmarks to remove unicode/grapheme relationship - move benchmarks into benchmark group - add scalar versions with stdlib "equivalents" (scalars) --- Cargo.toml | 7 ++-- benches/chars.rs | 60 +++++++++++++++++++++++++++ benches/graphemes.rs | 63 ---------------------------- benches/unicode_words.rs | 61 ---------------------------- benches/word_bounds.rs | 88 +++++++++++++++------------------------- benches/words.rs | 59 +++++++++++++++++++++++++++ 6 files changed, 154 insertions(+), 184 deletions(-) create mode 100644 benches/chars.rs delete mode 100644 benches/graphemes.rs delete mode 100644 benches/unicode_words.rs create mode 100644 benches/words.rs diff --git a/Cargo.toml b/Cargo.toml index dda0abf..a8d25db 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -23,17 +23,16 @@ no_std = [] # This is a no-op, preserved for backward compatibility only. [dev-dependencies] quickcheck = "0.7" -criterion = "0.3" +criterion = "0.5" [[bench]] -name = "graphemes" +name = "chars" harness = false [[bench]] -name = "unicode_words" +name = "words" harness = false [[bench]] name = "word_bounds" harness = false - diff --git a/benches/chars.rs b/benches/chars.rs new file mode 100644 index 0000000..d8dc5ea --- /dev/null +++ b/benches/chars.rs @@ -0,0 +1,60 @@ +//! Compares the performance of `UnicodeSegmentation::graphemes` with stdlib's UTF-8 scalar-based +//! `std::str::chars`. +//! +//! It is expected that `std::str::chars` is faster than `UnicodeSegmentation::graphemes` since it +//! does not consider the complexity of grapheme clusters. The question in this benchmark +//! is how much slower full unicode handling is. + +use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion}; +use unicode_segmentation; + +use std::fs; +use unicode_segmentation::UnicodeSegmentation; + +const FILES: &[&str] = &[ + "arabic", + "english", + "hindi", + "japanese", + "korean", + "mandarin", + "russian", + "source_code", +]; + +#[inline(always)] +fn grapheme(text: &str) { + for c in UnicodeSegmentation::graphemes(black_box(&*text), true) { + black_box(c); + } +} + +#[inline(always)] +fn scalar(text: &str) { + for c in black_box(&*text).chars() { + black_box(c); + } +} + +fn bench_all(c: &mut Criterion) { + let mut group = c.benchmark_group("chars"); + + for file in FILES { + group.bench_with_input( + BenchmarkId::new("grapheme", file), + &fs::read_to_string(format!("benches/texts/{}.txt", file)).unwrap(), + |b, content| b.iter(|| grapheme(content)), + ); + } + + for file in FILES { + group.bench_with_input( + BenchmarkId::new("scalar", file), + &fs::read_to_string(format!("benches/texts/{}.txt", file)).unwrap(), + |b, content| b.iter(|| scalar(content)), + ); + } +} + +criterion_group!(benches, bench_all); +criterion_main!(benches); diff --git a/benches/graphemes.rs b/benches/graphemes.rs deleted file mode 100644 index 3a0b9b7..0000000 --- a/benches/graphemes.rs +++ /dev/null @@ -1,63 +0,0 @@ -use criterion::{black_box, criterion_group, criterion_main, Criterion}; -use unicode_segmentation; - -use std::fs; -use unicode_segmentation::UnicodeSegmentation; - -fn graphemes(c: &mut Criterion, lang: &str, path: &str) { - let text = fs::read_to_string(path).unwrap(); - - c.bench_function(&format!("graphemes_{}", lang), |bench| { - bench.iter(|| { - for g in UnicodeSegmentation::graphemes(black_box(&*text), true) { - black_box(g); - } - }) - }); -} - -fn graphemes_arabic(c: &mut Criterion) { - graphemes(c, "arabic", "benches/texts/arabic.txt"); -} - -fn graphemes_english(c: &mut Criterion) { - graphemes(c, "english", "benches/texts/english.txt"); -} - -fn graphemes_hindi(c: &mut Criterion) { - graphemes(c, "hindi", "benches/texts/hindi.txt"); -} - -fn graphemes_japanese(c: &mut Criterion) { - graphemes(c, "japanese", "benches/texts/japanese.txt"); -} - -fn graphemes_korean(c: &mut Criterion) { - graphemes(c, "korean", "benches/texts/korean.txt"); -} - -fn graphemes_mandarin(c: &mut Criterion) { - graphemes(c, "mandarin", "benches/texts/mandarin.txt"); -} - -fn graphemes_russian(c: &mut Criterion) { - graphemes(c, "russian", "benches/texts/russian.txt"); -} - -fn graphemes_source_code(c: &mut Criterion) { - graphemes(c, "source_code", "benches/texts/source_code.txt"); -} - -criterion_group!( - benches, - graphemes_arabic, - graphemes_english, - graphemes_hindi, - graphemes_japanese, - graphemes_korean, - graphemes_mandarin, - graphemes_russian, - graphemes_source_code, -); - -criterion_main!(benches); diff --git a/benches/unicode_words.rs b/benches/unicode_words.rs deleted file mode 100644 index a7f8f41..0000000 --- a/benches/unicode_words.rs +++ /dev/null @@ -1,61 +0,0 @@ -use criterion::{black_box, criterion_group, criterion_main, Criterion}; - -use std::fs; -use unicode_segmentation::UnicodeSegmentation; - -fn unicode_words(c: &mut Criterion, lang: &str, path: &str) { - let text = fs::read_to_string(path).unwrap(); - c.bench_function(&format!("unicode_words_{}", lang), |bench| { - bench.iter(|| { - for w in text.unicode_words() { - black_box(w); - } - }) - }); -} - -fn unicode_words_arabic(c: &mut Criterion) { - unicode_words(c, "arabic", "benches/texts/arabic.txt"); -} - -fn unicode_words_english(c: &mut Criterion) { - unicode_words(c, "english", "benches/texts/english.txt"); -} - -fn unicode_words_hindi(c: &mut Criterion) { - unicode_words(c, "hindi", "benches/texts/hindi.txt"); -} - -fn unicode_words_japanese(c: &mut Criterion) { - unicode_words(c, "japanese", "benches/texts/japanese.txt"); -} - -fn unicode_words_korean(c: &mut Criterion) { - unicode_words(c, "korean", "benches/texts/korean.txt"); -} - -fn unicode_words_mandarin(c: &mut Criterion) { - unicode_words(c, "mandarin", "benches/texts/mandarin.txt"); -} - -fn unicode_words_russian(c: &mut Criterion) { - unicode_words(c, "russian", "benches/texts/russian.txt"); -} - -fn unicode_words_source_code(c: &mut Criterion) { - unicode_words(c, "source_code", "benches/texts/source_code.txt"); -} - -criterion_group!( - benches, - unicode_words_arabic, - unicode_words_english, - unicode_words_hindi, - unicode_words_japanese, - unicode_words_korean, - unicode_words_mandarin, - unicode_words_russian, - unicode_words_source_code, -); - -criterion_main!(benches); diff --git a/benches/word_bounds.rs b/benches/word_bounds.rs index cae7a88..42d50ff 100644 --- a/benches/word_bounds.rs +++ b/benches/word_bounds.rs @@ -1,61 +1,37 @@ -use criterion::{black_box, criterion_group, criterion_main, Criterion}; +use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion}; use std::fs; use unicode_segmentation::UnicodeSegmentation; -fn word_bounds(c: &mut Criterion, lang: &str, path: &str) { - let text = fs::read_to_string(path).unwrap(); - c.bench_function(&format!("word_bounds_{}", lang), |bench| { - bench.iter(|| { - for w in text.split_word_bounds() { - black_box(w); - } - }); - }); -} - -fn word_bounds_arabic(c: &mut Criterion) { - word_bounds(c, "arabic", "benches/texts/arabic.txt"); -} - -fn word_bounds_english(c: &mut Criterion) { - word_bounds(c, "english", "benches/texts/english.txt"); -} - -fn word_bounds_hindi(c: &mut Criterion) { - word_bounds(c, "hindi", "benches/texts/hindi.txt"); -} - -fn word_bounds_japanese(c: &mut Criterion) { - word_bounds(c, "japanese", "benches/texts/japanese.txt"); -} - -fn word_bounds_korean(c: &mut Criterion) { - word_bounds(c, "korean", "benches/texts/korean.txt"); -} - -fn word_bounds_mandarin(c: &mut Criterion) { - word_bounds(c, "mandarin", "benches/texts/mandarin.txt"); -} - -fn word_bounds_russian(c: &mut Criterion) { - word_bounds(c, "russian", "benches/texts/russian.txt"); -} - -fn word_bounds_source_code(c: &mut Criterion) { - word_bounds(c, "source_code", "benches/texts/source_code.txt"); -} - -criterion_group!( - benches, - word_bounds_arabic, - word_bounds_english, - word_bounds_hindi, - word_bounds_japanese, - word_bounds_korean, - word_bounds_mandarin, - word_bounds_russian, - word_bounds_source_code, -); - +const FILES: &[&str] = &[ + "arabic", + "english", + "hindi", + "japanese", + "korean", + "mandarin", + "russian", + "source_code", +]; + +#[inline(always)] +fn grapheme(text: &str) { + for w in text.split_word_bounds() { + black_box(w); + } +} + +fn bench_all(c: &mut Criterion) { + let mut group = c.benchmark_group("word_bounds"); + + for file in FILES { + group.bench_with_input( + BenchmarkId::new("grapheme", file), + &fs::read_to_string(format!("benches/texts/{}.txt", file)).unwrap(), + |b, content| b.iter(|| grapheme(content)), + ); + } +} + +criterion_group!(benches, bench_all); criterion_main!(benches); diff --git a/benches/words.rs b/benches/words.rs new file mode 100644 index 0000000..86785d5 --- /dev/null +++ b/benches/words.rs @@ -0,0 +1,59 @@ +//! Compares the performance of `UnicodeSegmentation::unicode_words` with stdlib's UTF-8 +//! scalar-based `std::str::split_whitespace`. +//! +//! It is expected that `std::str::split_whitespace` is faster than +//! `UnicodeSegmentation::unicode_words` since it does not consider the complexity of grapheme +//! clusters. The question in this benchmark is how much slower full unicode handling is. + +use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion}; + +use std::fs; +use unicode_segmentation::UnicodeSegmentation; + +const FILES: &[&str] = &[ + "arabic", + "english", + "hindi", + "japanese", + "korean", + "mandarin", + "russian", + "source_code", +]; + +#[inline(always)] +fn grapheme(text: &str) { + for w in text.unicode_words() { + black_box(w); + } +} + +#[inline(always)] +fn scalar(text: &str) { + for w in text.split_whitespace() { + black_box(w); + } +} + +fn bench_all(c: &mut Criterion) { + let mut group = c.benchmark_group("words"); + + for file in FILES { + group.bench_with_input( + BenchmarkId::new("grapheme", file), + &fs::read_to_string(format!("benches/texts/{}.txt", file)).unwrap(), + |b, content| b.iter(|| grapheme(content)), + ); + } + + for file in FILES { + group.bench_with_input( + BenchmarkId::new("scalar", file), + &fs::read_to_string(format!("benches/texts/{}.txt", file)).unwrap(), + |b, content| b.iter(|| scalar(content)), + ); + } +} + +criterion_group!(benches, bench_all); +criterion_main!(benches); pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy