Skip to content

Commit 52dba3b

Browse files
committed
Benchmark other methods mentioned in README
The word boundary extensions to &str/String behave in a very similar, but not identical, manner to .graphemes(). For example, Mandarin is slow(ish) on .graphemes() but fast(ish) on .word_boundaries(), whereas languages with whitespace-delimited words tend to have the same performance characteristics with the latter methods. As the library develops, it would be worthwhile to monitor the speed of the rest of the documented API.
1 parent 573b7bb commit 52dba3b

File tree

3 files changed

+136
-0
lines changed

3 files changed

+136
-0
lines changed

Cargo.toml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,4 +28,12 @@ bencher = "0.1"
2828

2929
[[bench]]
3030
name = "graphemes"
31+
harness = false
32+
33+
[[bench]]
34+
name = "unicode_words"
35+
harness = false
36+
37+
[[bench]]
38+
name = "word_bounds"
3139
harness = false

benches/unicode_words.rs

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
#[macro_use]
2+
extern crate bencher;
3+
extern crate unicode_segmentation;
4+
5+
use bencher::Bencher;
6+
use unicode_segmentation::UnicodeSegmentation;
7+
use std::fs;
8+
9+
fn unicode_words(bench: &mut Bencher, path: &str) {
10+
let text = fs::read_to_string(path).unwrap();
11+
bench.iter(|| {
12+
for w in text.unicode_words() {
13+
bencher::black_box(w);
14+
}
15+
});
16+
17+
bench.bytes = text.len() as u64;
18+
}
19+
20+
fn unicode_words_arabic(bench: &mut Bencher) {
21+
unicode_words(bench, "benches/texts/arabic.txt");
22+
}
23+
24+
fn unicode_words_english(bench: &mut Bencher) {
25+
unicode_words(bench, "benches/texts/english.txt");
26+
}
27+
28+
fn unicode_words_hindi(bench: &mut Bencher) {
29+
unicode_words(bench, "benches/texts/hindi.txt");
30+
}
31+
32+
fn unicode_words_japanese(bench: &mut Bencher) {
33+
unicode_words(bench, "benches/texts/japanese.txt");
34+
}
35+
36+
fn unicode_words_korean(bench: &mut Bencher) {
37+
unicode_words(bench, "benches/texts/korean.txt");
38+
}
39+
40+
fn unicode_words_mandarin(bench: &mut Bencher) {
41+
unicode_words(bench, "benches/texts/mandarin.txt");
42+
}
43+
44+
fn unicode_words_russian(bench: &mut Bencher) {
45+
unicode_words(bench, "benches/texts/russian.txt");
46+
}
47+
48+
fn unicode_words_source_code(bench: &mut Bencher) {
49+
unicode_words(bench, "benches/texts/source_code.txt");
50+
}
51+
52+
benchmark_group!(
53+
benches,
54+
unicode_words_arabic,
55+
unicode_words_english,
56+
unicode_words_hindi,
57+
unicode_words_japanese,
58+
unicode_words_korean,
59+
unicode_words_mandarin,
60+
unicode_words_russian,
61+
unicode_words_source_code,
62+
);
63+
64+
benchmark_main!(benches);

benches/word_bounds.rs

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
#[macro_use]
2+
extern crate bencher;
3+
extern crate unicode_segmentation;
4+
5+
use bencher::Bencher;
6+
use unicode_segmentation::UnicodeSegmentation;
7+
use std::fs;
8+
9+
fn word_bounds(bench: &mut Bencher, path: &str) {
10+
let text = fs::read_to_string(path).unwrap();
11+
bench.iter(|| {
12+
for w in text.split_word_bounds() {
13+
bencher::black_box(w);
14+
}
15+
});
16+
17+
bench.bytes = text.len() as u64;
18+
}
19+
20+
fn word_bounds_arabic(bench: &mut Bencher) {
21+
word_bounds(bench, "benches/texts/arabic.txt");
22+
}
23+
24+
fn word_bounds_english(bench: &mut Bencher) {
25+
word_bounds(bench, "benches/texts/english.txt");
26+
}
27+
28+
fn word_bounds_hindi(bench: &mut Bencher) {
29+
word_bounds(bench, "benches/texts/hindi.txt");
30+
}
31+
32+
fn word_bounds_japanese(bench: &mut Bencher) {
33+
word_bounds(bench, "benches/texts/japanese.txt");
34+
}
35+
36+
fn word_bounds_korean(bench: &mut Bencher) {
37+
word_bounds(bench, "benches/texts/korean.txt");
38+
}
39+
40+
fn word_bounds_mandarin(bench: &mut Bencher) {
41+
word_bounds(bench, "benches/texts/mandarin.txt");
42+
}
43+
44+
fn word_bounds_russian(bench: &mut Bencher) {
45+
word_bounds(bench, "benches/texts/russian.txt");
46+
}
47+
48+
fn word_bounds_source_code(bench: &mut Bencher) {
49+
word_bounds(bench, "benches/texts/source_code.txt");
50+
}
51+
52+
benchmark_group!(
53+
benches,
54+
word_bounds_arabic,
55+
word_bounds_english,
56+
word_bounds_hindi,
57+
word_bounds_japanese,
58+
word_bounds_korean,
59+
word_bounds_mandarin,
60+
word_bounds_russian,
61+
word_bounds_source_code,
62+
);
63+
64+
benchmark_main!(benches);

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy