diff --git a/Cargo.toml b/Cargo.toml
index dda0abf..a8d25db 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -23,17 +23,16 @@ no_std = [] # This is a no-op, preserved for backward compatibility only.
 
 [dev-dependencies]
 quickcheck = "0.7"
-criterion = "0.3"
+criterion = "0.5"
 
 [[bench]]
-name = "graphemes"
+name = "chars"
 harness = false
 
 [[bench]]
-name = "unicode_words"
+name = "words"
 harness = false
 
 [[bench]]
 name = "word_bounds"
 harness = false
-
diff --git a/benches/chars.rs b/benches/chars.rs
new file mode 100644
index 0000000..d8dc5ea
--- /dev/null
+++ b/benches/chars.rs
@@ -0,0 +1,60 @@
+//! Compares the performance of `UnicodeSegmentation::graphemes` with the standard library's
+//! `std::str::chars`, which iterates over Unicode scalar values.
+//!
+//! `std::str::chars` is expected to be faster than `UnicodeSegmentation::graphemes`, since it
+//! does not have to handle the complexity of grapheme clusters. The question this benchmark
+//! answers is how much slower full Unicode handling is.
+
+use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion};
+use unicode_segmentation;
+
+use std::fs;
+use unicode_segmentation::UnicodeSegmentation;
+
+const FILES: &[&str] = &[
+    "arabic",
+    "english",
+    "hindi",
+    "japanese",
+    "korean",
+    "mandarin",
+    "russian",
+    "source_code",
+];
+
+#[inline(always)]
+fn grapheme(text: &str) {
+    for c in UnicodeSegmentation::graphemes(black_box(&*text), true) {
+        black_box(c);
+    }
+}
+
+#[inline(always)]
+fn scalar(text: &str) {
+    for c in black_box(&*text).chars() {
+        black_box(c);
+    }
+}
+
+fn bench_all(c: &mut Criterion) {
+    let mut group = c.benchmark_group("chars");
+
+    for file in FILES {
+        group.bench_with_input(
+            BenchmarkId::new("grapheme", file),
+            &fs::read_to_string(format!("benches/texts/{}.txt", file)).unwrap(),
+            |b, content| b.iter(|| grapheme(content)),
+        );
+    }
+
+    for file in FILES {
+        group.bench_with_input(
+            BenchmarkId::new("scalar", file),
+            &fs::read_to_string(format!("benches/texts/{}.txt", file)).unwrap(),
+            |b, content| b.iter(|| scalar(content)),
+        );
+    }
+}
+
+criterion_group!(benches, bench_all);
+criterion_main!(benches);
diff --git a/benches/graphemes.rs b/benches/graphemes.rs
deleted file mode 100644
index 3a0b9b7..0000000
--- a/benches/graphemes.rs
+++ /dev/null
@@ -1,63 +0,0 @@
-use criterion::{black_box, criterion_group, criterion_main, Criterion};
-use unicode_segmentation;
-
-use std::fs;
-use unicode_segmentation::UnicodeSegmentation;
-
-fn graphemes(c: &mut Criterion, lang: &str, path: &str) {
-    let text = fs::read_to_string(path).unwrap();
-
-    c.bench_function(&format!("graphemes_{}", lang), |bench| {
-        bench.iter(|| {
-            for g in UnicodeSegmentation::graphemes(black_box(&*text), true) {
-                black_box(g);
-            }
-        })
-    });
-}
-
-fn graphemes_arabic(c: &mut Criterion) {
-    graphemes(c, "arabic", "benches/texts/arabic.txt");
-}
-
-fn graphemes_english(c: &mut Criterion) {
-    graphemes(c, "english", "benches/texts/english.txt");
-}
-
-fn graphemes_hindi(c: &mut Criterion) {
-    graphemes(c, "hindi", "benches/texts/hindi.txt");
-}
-
-fn graphemes_japanese(c: &mut Criterion) {
-    graphemes(c, "japanese", "benches/texts/japanese.txt");
-}
-
-fn graphemes_korean(c: &mut Criterion) {
-    graphemes(c, "korean", "benches/texts/korean.txt");
-}
-
-fn graphemes_mandarin(c: &mut Criterion) {
-    graphemes(c, "mandarin", "benches/texts/mandarin.txt");
-}
-
-fn graphemes_russian(c: &mut Criterion) {
-    graphemes(c, "russian", "benches/texts/russian.txt");
-}
-
-fn graphemes_source_code(c: &mut Criterion) {
-    graphemes(c, "source_code", "benches/texts/source_code.txt");
-}
-
-criterion_group!(
-    benches,
-    graphemes_arabic,
-    graphemes_english,
-    graphemes_hindi,
-    graphemes_japanese,
-    graphemes_korean,
-    graphemes_mandarin,
-    graphemes_russian,
-    graphemes_source_code,
-);
-
-criterion_main!(benches);
diff --git a/benches/unicode_words.rs b/benches/unicode_words.rs
deleted file mode 100644
index a7f8f41..0000000
--- a/benches/unicode_words.rs
+++ /dev/null
@@ -1,61 +0,0 @@
-use criterion::{black_box, criterion_group, criterion_main, Criterion};
-
-use std::fs;
-use unicode_segmentation::UnicodeSegmentation;
-
-fn unicode_words(c: &mut Criterion, lang: &str, path: &str) {
-    let text = fs::read_to_string(path).unwrap();
-    c.bench_function(&format!("unicode_words_{}", lang), |bench| {
-        bench.iter(|| {
-            for w in text.unicode_words() {
-                black_box(w);
-            }
-        })
-    });
-}
-
-fn unicode_words_arabic(c: &mut Criterion) {
-    unicode_words(c, "arabic", "benches/texts/arabic.txt");
-}
-
-fn unicode_words_english(c: &mut Criterion) {
-    unicode_words(c, "english", "benches/texts/english.txt");
-}
-
-fn unicode_words_hindi(c: &mut Criterion) {
-    unicode_words(c, "hindi", "benches/texts/hindi.txt");
-}
-
-fn unicode_words_japanese(c: &mut Criterion) {
-    unicode_words(c, "japanese", "benches/texts/japanese.txt");
-}
-
-fn unicode_words_korean(c: &mut Criterion) {
-    unicode_words(c, "korean", "benches/texts/korean.txt");
-}
-
-fn unicode_words_mandarin(c: &mut Criterion) {
-    unicode_words(c, "mandarin", "benches/texts/mandarin.txt");
-}
-
-fn unicode_words_russian(c: &mut Criterion) {
-    unicode_words(c, "russian", "benches/texts/russian.txt");
-}
-
-fn unicode_words_source_code(c: &mut Criterion) {
-    unicode_words(c, "source_code", "benches/texts/source_code.txt");
-}
-
-criterion_group!(
-    benches,
-    unicode_words_arabic,
-    unicode_words_english,
-    unicode_words_hindi,
-    unicode_words_japanese,
-    unicode_words_korean,
-    unicode_words_mandarin,
-    unicode_words_russian,
-    unicode_words_source_code,
-);
-
-criterion_main!(benches);
diff --git a/benches/word_bounds.rs b/benches/word_bounds.rs
index cae7a88..42d50ff 100644
--- a/benches/word_bounds.rs
+++ b/benches/word_bounds.rs
@@ -1,61 +1,37 @@
-use criterion::{black_box, criterion_group, criterion_main, Criterion};
+use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion};
 
 use std::fs;
 use unicode_segmentation::UnicodeSegmentation;
 
-fn word_bounds(c: &mut Criterion, lang: &str, path: &str) {
-    let text = fs::read_to_string(path).unwrap();
-    c.bench_function(&format!("word_bounds_{}", lang), |bench| {
-        bench.iter(|| {
-            for w in text.split_word_bounds() {
-                black_box(w);
-            }
-        });
-    });
-}
-
-fn word_bounds_arabic(c: &mut Criterion) {
-    word_bounds(c, "arabic", "benches/texts/arabic.txt");
-}
-
-fn word_bounds_english(c: &mut Criterion) {
-    word_bounds(c, "english", "benches/texts/english.txt");
-}
-
-fn word_bounds_hindi(c: &mut Criterion) {
-    word_bounds(c, "hindi", "benches/texts/hindi.txt");
-}
-
-fn word_bounds_japanese(c: &mut Criterion) {
-    word_bounds(c, "japanese", "benches/texts/japanese.txt");
-}
-
-fn word_bounds_korean(c: &mut Criterion) {
-    word_bounds(c, "korean", "benches/texts/korean.txt");
-}
-
-fn word_bounds_mandarin(c: &mut Criterion) {
-    word_bounds(c, "mandarin", "benches/texts/mandarin.txt");
-}
-
-fn word_bounds_russian(c: &mut Criterion) {
-    word_bounds(c, "russian", "benches/texts/russian.txt");
-}
-
-fn word_bounds_source_code(c: &mut Criterion) {
-    word_bounds(c, "source_code", "benches/texts/source_code.txt");
-}
-
-criterion_group!(
-    benches,
-    word_bounds_arabic,
-    word_bounds_english,
-    word_bounds_hindi,
-    word_bounds_japanese,
-    word_bounds_korean,
-    word_bounds_mandarin,
-    word_bounds_russian,
-    word_bounds_source_code,
-);
-
+const FILES: &[&str] = &[
+    "arabic",
+    "english",
+    "hindi",
+    "japanese",
+    "korean",
+    "mandarin",
+    "russian",
+    "source_code",
+];
+
+#[inline(always)]
+fn grapheme(text: &str) {
+    for w in text.split_word_bounds() {
+        black_box(w);
+    }
+}
+
+fn bench_all(c: &mut Criterion) {
+    let mut group = c.benchmark_group("word_bounds");
+
+    for file in FILES {
+        group.bench_with_input(
+            BenchmarkId::new("grapheme", file),
+            &fs::read_to_string(format!("benches/texts/{}.txt", file)).unwrap(),
+            |b, content| b.iter(|| grapheme(content)),
+        );
+    }
+}
+
+criterion_group!(benches, bench_all);
 criterion_main!(benches);
diff --git a/benches/words.rs b/benches/words.rs
new file mode 100644
index 0000000..86785d5
--- /dev/null
+++ b/benches/words.rs
@@ -0,0 +1,59 @@
+//! Compares the performance of `UnicodeSegmentation::unicode_words` with the standard
+//! library's whitespace-based `std::str::split_whitespace`.
+//!
+//! `std::str::split_whitespace` is expected to be faster, since it does not have to handle
+//! the complexity of Unicode word boundaries. The question this benchmark answers is how
+//! much slower full Unicode handling is.
+
+use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion};
+
+use std::fs;
+use unicode_segmentation::UnicodeSegmentation;
+
+const FILES: &[&str] = &[
+    "arabic",
+    "english",
+    "hindi",
+    "japanese",
+    "korean",
+    "mandarin",
+    "russian",
+    "source_code",
+];
+
+#[inline(always)]
+fn grapheme(text: &str) {
+    for w in text.unicode_words() {
+        black_box(w);
+    }
+}
+
+#[inline(always)]
+fn scalar(text: &str) {
+    for w in text.split_whitespace() {
+        black_box(w);
+    }
+}
+
+fn bench_all(c: &mut Criterion) {
+    let mut group = c.benchmark_group("words");
+
+    for file in FILES {
+        group.bench_with_input(
+            BenchmarkId::new("grapheme", file),
+            &fs::read_to_string(format!("benches/texts/{}.txt", file)).unwrap(),
+            |b, content| b.iter(|| grapheme(content)),
+        );
+    }
+
+    for file in FILES {
+        group.bench_with_input(
+            BenchmarkId::new("scalar", file),
+            &fs::read_to_string(format!("benches/texts/{}.txt", file)).unwrap(),
+            |b, content| b.iter(|| scalar(content)),
+        );
+    }
+}
+
+criterion_group!(benches, bench_all);
+criterion_main!(benches);
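For context on what the `chars` benchmark measures: `str::chars` yields individual Unicode scalar values, while `graphemes(_, true)` yields extended grapheme clusters, which may span several scalar values. A minimal illustrative sketch of that difference (not part of the patch; it uses only the APIs exercised above):

```rust
use unicode_segmentation::UnicodeSegmentation;

fn main() {
    // "é" spelled as 'e' + U+0301 COMBINING ACUTE ACCENT:
    // two scalar values, but one user-perceived character.
    let s = "e\u{301}";
    assert_eq!(s.chars().count(), 2);
    assert_eq!(s.graphemes(true).count(), 1);

    // A regional-indicator flag is likewise two scalars, one grapheme cluster.
    let flag = "\u{1F1F7}\u{1F1F8}"; // 🇷🇸
    assert_eq!(flag.chars().count(), 2);
    assert_eq!(flag.graphemes(true).count(), 1);
}
```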
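Likewise for the `words` benchmark: `unicode_words` applies UAX #29 word segmentation and drops punctuation-only segments, whereas `split_whitespace` merely splits on whitespace. A small sketch (the example sentence is hypothetical; the APIs are the ones benchmarked above):

```rust
use unicode_segmentation::UnicodeSegmentation;

fn main() {
    let text = "The fox can't jump 32.3 feet, right?";

    // UAX #29 word boundaries: punctuation is dropped,
    // and "can't" and "32.3" each remain a single word.
    let words: Vec<&str> = text.unicode_words().collect();
    assert_eq!(words, &["The", "fox", "can't", "jump", "32.3", "feet", "right"]);

    // Whitespace splitting keeps punctuation attached to its token.
    let tokens: Vec<&str> = text.split_whitespace().collect();
    assert_eq!(tokens, &["The", "fox", "can't", "jump", "32.3", "feet,", "right?"]);
}
```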