From d6a6eb848214135095e9034c24de748c4dfcbe53 Mon Sep 17 00:00:00 2001
From: Jonas Heinrich <Jonas.Heinrich@sap.com>
Date: Tue, 16 Apr 2024 11:50:57 +0200
Subject: [PATCH] Modify benchmarks to compare against stdlib functions

This commit refactors and expands the microbenchmarks in order to
evaluate the performance cost of handling full Unicode. The functions of
`unicode-segmentation` are expected to be slower since they consider
graphemes; the question is just how much slower.

- bump criterion dependency
- rename benchmarks to remove unicode/grapheme relationship
- move benchmarks into benchmark group
- add scalar versions with stdlib "equivalents" (scalars)
---
 Cargo.toml               |  7 ++--
 benches/chars.rs         | 60 +++++++++++++++++++++++++++
 benches/graphemes.rs     | 63 ----------------------------
 benches/unicode_words.rs | 61 ----------------------------
 benches/word_bounds.rs   | 88 +++++++++++++++-------------------------
 benches/words.rs         | 59 +++++++++++++++++++++++++++
 6 files changed, 154 insertions(+), 184 deletions(-)
 create mode 100644 benches/chars.rs
 delete mode 100644 benches/graphemes.rs
 delete mode 100644 benches/unicode_words.rs
 create mode 100644 benches/words.rs

diff --git a/Cargo.toml b/Cargo.toml
index dda0abf..a8d25db 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -23,17 +23,16 @@ no_std = [] # This is a no-op, preserved for backward compatibility only.
 
 [dev-dependencies]
 quickcheck = "0.7"
-criterion = "0.3"
+criterion = "0.5"
 
 [[bench]]
-name = "graphemes"
+name = "chars"
 harness = false
 
 [[bench]]
-name = "unicode_words"
+name = "words"
 harness = false
 
 [[bench]]
 name = "word_bounds"
 harness = false
-
diff --git a/benches/chars.rs b/benches/chars.rs
new file mode 100644
index 0000000..d8dc5ea
--- /dev/null
+++ b/benches/chars.rs
@@ -0,0 +1,60 @@
+//! Compares the performance of `UnicodeSegmentation::graphemes` with the stdlib's `str::chars`,
+//! which iterates over Unicode scalar values.
+//!
+//! It is expected that `str::chars` is faster than `UnicodeSegmentation::graphemes` since it does
+//! not have to determine grapheme cluster boundaries. The question in this benchmark is how much
+//! slower full Unicode handling is.
+
+use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion};
+use unicode_segmentation;
+
+use std::fs;
+use unicode_segmentation::UnicodeSegmentation;
+
+const FILES: &[&str] = &[
+    "arabic",
+    "english",
+    "hindi",
+    "japanese",
+    "korean",
+    "mandarin",
+    "russian",
+    "source_code",
+];
+
+/// Walks the extended grapheme clusters of `text`, sending each one through `black_box`.
+#[inline(always)]
+fn grapheme(text: &str) {
+    let clusters = black_box(text).graphemes(true);
+    clusters.map(black_box).for_each(drop);
+}
+
+/// Walks the `char`s (Unicode scalar values) of `text`, sending each one through `black_box`.
+#[inline(always)]
+fn scalar(text: &str) {
+    let scalars = black_box(text).chars();
+    scalars.map(black_box).for_each(drop);
+}
+
+/// Registers the `chars` benchmark group: for every sample text in `FILES`, one `grapheme`
+/// and one `scalar` benchmark. Each text is read from disk once and shared by both, so the
+/// two implementations are always measured on the same input. Panics if a text is missing.
+fn bench_all(c: &mut Criterion) {
+    let mut group = c.benchmark_group("chars");
+
+    for file in FILES {
+        let text = fs::read_to_string(format!("benches/texts/{}.txt", file)).unwrap();
+        group.bench_with_input(BenchmarkId::new("grapheme", file), &text, |b, s| {
+            b.iter(|| grapheme(s))
+        });
+        group.bench_with_input(BenchmarkId::new("scalar", file), &text, |b, s| {
+            b.iter(|| scalar(s))
+        });
+    }
+
+    // Close the group explicitly instead of relying on it being dropped.
+    group.finish();
+}
+
+criterion_group!(benches, bench_all);
+criterion_main!(benches);
diff --git a/benches/graphemes.rs b/benches/graphemes.rs
deleted file mode 100644
index 3a0b9b7..0000000
--- a/benches/graphemes.rs
+++ /dev/null
@@ -1,63 +0,0 @@
-use criterion::{black_box, criterion_group, criterion_main, Criterion};
-use unicode_segmentation;
-
-use std::fs;
-use unicode_segmentation::UnicodeSegmentation;
-
-fn graphemes(c: &mut Criterion, lang: &str, path: &str) {
-    let text = fs::read_to_string(path).unwrap();
-
-    c.bench_function(&format!("graphemes_{}", lang), |bench| {
-        bench.iter(|| {
-            for g in UnicodeSegmentation::graphemes(black_box(&*text), true) {
-                black_box(g);
-            }
-        })
-    });
-}
-
-fn graphemes_arabic(c: &mut Criterion) {
-    graphemes(c, "arabic", "benches/texts/arabic.txt");
-}
-
-fn graphemes_english(c: &mut Criterion) {
-    graphemes(c, "english", "benches/texts/english.txt");
-}
-
-fn graphemes_hindi(c: &mut Criterion) {
-    graphemes(c, "hindi", "benches/texts/hindi.txt");
-}
-
-fn graphemes_japanese(c: &mut Criterion) {
-    graphemes(c, "japanese", "benches/texts/japanese.txt");
-}
-
-fn graphemes_korean(c: &mut Criterion) {
-    graphemes(c, "korean", "benches/texts/korean.txt");
-}
-
-fn graphemes_mandarin(c: &mut Criterion) {
-    graphemes(c, "mandarin", "benches/texts/mandarin.txt");
-}
-
-fn graphemes_russian(c: &mut Criterion) {
-    graphemes(c, "russian", "benches/texts/russian.txt");
-}
-
-fn graphemes_source_code(c: &mut Criterion) {
-    graphemes(c, "source_code", "benches/texts/source_code.txt");
-}
-
-criterion_group!(
-    benches,
-    graphemes_arabic,
-    graphemes_english,
-    graphemes_hindi,
-    graphemes_japanese,
-    graphemes_korean,
-    graphemes_mandarin,
-    graphemes_russian,
-    graphemes_source_code,
-);
-
-criterion_main!(benches);
diff --git a/benches/unicode_words.rs b/benches/unicode_words.rs
deleted file mode 100644
index a7f8f41..0000000
--- a/benches/unicode_words.rs
+++ /dev/null
@@ -1,61 +0,0 @@
-use criterion::{black_box, criterion_group, criterion_main, Criterion};
-
-use std::fs;
-use unicode_segmentation::UnicodeSegmentation;
-
-fn unicode_words(c: &mut Criterion, lang: &str, path: &str) {
-    let text = fs::read_to_string(path).unwrap();
-    c.bench_function(&format!("unicode_words_{}", lang), |bench| {
-        bench.iter(|| {
-            for w in text.unicode_words() {
-                black_box(w);
-            }
-        })
-    });
-}
-
-fn unicode_words_arabic(c: &mut Criterion) {
-    unicode_words(c, "arabic", "benches/texts/arabic.txt");
-}
-
-fn unicode_words_english(c: &mut Criterion) {
-    unicode_words(c, "english", "benches/texts/english.txt");
-}
-
-fn unicode_words_hindi(c: &mut Criterion) {
-    unicode_words(c, "hindi", "benches/texts/hindi.txt");
-}
-
-fn unicode_words_japanese(c: &mut Criterion) {
-    unicode_words(c, "japanese", "benches/texts/japanese.txt");
-}
-
-fn unicode_words_korean(c: &mut Criterion) {
-    unicode_words(c, "korean", "benches/texts/korean.txt");
-}
-
-fn unicode_words_mandarin(c: &mut Criterion) {
-    unicode_words(c, "mandarin", "benches/texts/mandarin.txt");
-}
-
-fn unicode_words_russian(c: &mut Criterion) {
-    unicode_words(c, "russian", "benches/texts/russian.txt");
-}
-
-fn unicode_words_source_code(c: &mut Criterion) {
-    unicode_words(c, "source_code", "benches/texts/source_code.txt");
-}
-
-criterion_group!(
-    benches,
-    unicode_words_arabic,
-    unicode_words_english,
-    unicode_words_hindi,
-    unicode_words_japanese,
-    unicode_words_korean,
-    unicode_words_mandarin,
-    unicode_words_russian,
-    unicode_words_source_code,
-);
-
-criterion_main!(benches);
diff --git a/benches/word_bounds.rs b/benches/word_bounds.rs
index cae7a88..42d50ff 100644
--- a/benches/word_bounds.rs
+++ b/benches/word_bounds.rs
@@ -1,61 +1,37 @@
-use criterion::{black_box, criterion_group, criterion_main, Criterion};
+use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion};
 
 use std::fs;
 use unicode_segmentation::UnicodeSegmentation;
 
-fn word_bounds(c: &mut Criterion, lang: &str, path: &str) {
-    let text = fs::read_to_string(path).unwrap();
-    c.bench_function(&format!("word_bounds_{}", lang), |bench| {
-        bench.iter(|| {
-            for w in text.split_word_bounds() {
-                black_box(w);
-            }
-        });
-    });
-}
-
-fn word_bounds_arabic(c: &mut Criterion) {
-    word_bounds(c, "arabic", "benches/texts/arabic.txt");
-}
-
-fn word_bounds_english(c: &mut Criterion) {
-    word_bounds(c, "english", "benches/texts/english.txt");
-}
-
-fn word_bounds_hindi(c: &mut Criterion) {
-    word_bounds(c, "hindi", "benches/texts/hindi.txt");
-}
-
-fn word_bounds_japanese(c: &mut Criterion) {
-    word_bounds(c, "japanese", "benches/texts/japanese.txt");
-}
-
-fn word_bounds_korean(c: &mut Criterion) {
-    word_bounds(c, "korean", "benches/texts/korean.txt");
-}
-
-fn word_bounds_mandarin(c: &mut Criterion) {
-    word_bounds(c, "mandarin", "benches/texts/mandarin.txt");
-}
-
-fn word_bounds_russian(c: &mut Criterion) {
-    word_bounds(c, "russian", "benches/texts/russian.txt");
-}
-
-fn word_bounds_source_code(c: &mut Criterion) {
-    word_bounds(c, "source_code", "benches/texts/source_code.txt");
-}
-
-criterion_group!(
-    benches,
-    word_bounds_arabic,
-    word_bounds_english,
-    word_bounds_hindi,
-    word_bounds_japanese,
-    word_bounds_korean,
-    word_bounds_mandarin,
-    word_bounds_russian,
-    word_bounds_source_code,
-);
-
+const FILES: &[&str] = &[
+    "arabic",
+    "english",
+    "hindi",
+    "japanese",
+    "korean",
+    "mandarin",
+    "russian",
+    "source_code",
+];
+
+/// Splits `text` at its word boundaries, sending every segment through `black_box`.
+#[inline(always)]
+fn grapheme(text: &str) {
+    let segments = text.split_word_bounds();
+    segments.map(black_box).for_each(drop);
+}
+
+/// Registers one `grapheme` benchmark per sample text in `FILES` under the `word_bounds` group.
+fn bench_all(c: &mut Criterion) {
+    let mut group = c.benchmark_group("word_bounds");
+
+    for file in FILES {
+        let text = fs::read_to_string(format!("benches/texts/{}.txt", file)).unwrap();
+        group.bench_with_input(BenchmarkId::new("grapheme", file), &text, |b, s| {
+            b.iter(|| grapheme(s))
+        });
+    }
+}
+
+criterion_group!(benches, bench_all);
 criterion_main!(benches);
diff --git a/benches/words.rs b/benches/words.rs
new file mode 100644
index 0000000..86785d5
--- /dev/null
+++ b/benches/words.rs
@@ -0,0 +1,59 @@
+//! Compares the performance of `UnicodeSegmentation::unicode_words` with the stdlib's
+//! `str::split_whitespace`, which only splits on Unicode whitespace.
+//!
+//! It is expected that `str::split_whitespace` is faster than
+//! `UnicodeSegmentation::unicode_words` since it does not apply the Unicode word-boundary rules
+//! (UAX #29). The question in this benchmark is how much slower full Unicode handling is.
+
+use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion};
+
+use std::fs;
+use unicode_segmentation::UnicodeSegmentation;
+
+const FILES: &[&str] = &[
+    "arabic",
+    "english",
+    "hindi",
+    "japanese",
+    "korean",
+    "mandarin",
+    "russian",
+    "source_code",
+];
+
+/// Walks the items yielded by `unicode_words` for `text`, sending each through `black_box`.
+#[inline(always)]
+fn grapheme(text: &str) {
+    let words = text.unicode_words();
+    words.map(black_box).for_each(drop);
+}
+
+/// Walks the whitespace-separated pieces of `text`, sending each through `black_box`.
+#[inline(always)]
+fn scalar(text: &str) {
+    let pieces = text.split_whitespace();
+    pieces.map(black_box).for_each(drop);
+}
+
+/// Registers the `words` benchmark group: for every sample text in `FILES`, one `grapheme`
+/// and one `scalar` benchmark. Each text is read from disk once and shared by both, so the
+/// two implementations are always measured on the same input. Panics if a text is missing.
+fn bench_all(c: &mut Criterion) {
+    let mut group = c.benchmark_group("words");
+
+    for file in FILES {
+        let text = fs::read_to_string(format!("benches/texts/{}.txt", file)).unwrap();
+        group.bench_with_input(BenchmarkId::new("grapheme", file), &text, |b, s| {
+            b.iter(|| grapheme(s))
+        });
+        group.bench_with_input(BenchmarkId::new("scalar", file), &text, |b, s| {
+            b.iter(|| scalar(s))
+        });
+    }
+
+    // Close the group explicitly instead of relying on it being dropped.
+    group.finish();
+}
+
+criterion_group!(benches, bench_all);
+criterion_main!(benches);

<!DOCTYPE html PUBLIC '-//W3C//DTD XHTML 1.0 Transitional//EN' 'http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd'>
<html xmlns='http://www.w3.org/1999/xhtml'>
<head>
<title>pFad - Phonifier reborn</title>
<meta http-equiv='Content-Type' content='text/html; charset=utf-8' />
</head>
<body>
<h1>Pfad - The Proxy pFad of &#169; 2024 Garber Painting. All rights reserved.</h1>


<!-- Disclaimer -->
<p>Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.</p>
<br>
<p>Alternative Proxies:</p><p><a href="http://rainy.clevelandohioweatherforecast.com/php-proxy/index.php?q=https://patch-diff.githubusercontent.com/raw/unicode-rs/unicode-segmentation/pull/133.patch" target="_blank">Alternative Proxy</a></p><p><a href="http://rainy.clevelandohioweatherforecast.com/pFad/index.php?u=https://patch-diff.githubusercontent.com/raw/unicode-rs/unicode-segmentation/pull/133.patch" target="_blank">pFad Proxy</a></p><p><a href="http://rainy.clevelandohioweatherforecast.com/pFad/v3index.php?u=https://patch-diff.githubusercontent.com/raw/unicode-rs/unicode-segmentation/pull/133.patch" target="_blank">pFad v3 Proxy</a></p><p><a href="http://rainy.clevelandohioweatherforecast.com/pFad/v4index.php?u=https://patch-diff.githubusercontent.com/raw/unicode-rs/unicode-segmentation/pull/133.patch" target="_blank">pFad v4 Proxy</a></p></body>
</html>