Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .cargo/config.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
[env]
# Only takes effect when the optional `pcre2` feature is enabled (which always
# builds `pcre2-sys`). Keeps the runtime free of a dynamic libpcre2 dependency.
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

and pcre2-sys is built

which is always the case whenever the `pcre2` feature is enabled, isn't it? If so, the second clause is redundant.

PCRE2_SYS_STATIC = { value = "1", force = false }
2 changes: 1 addition & 1 deletion bindings/python/src/utils/normalization.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ impl Pattern for PyPattern {
s.find_matches(inside)
}
}
PyPattern::Regex(r) => Python::attach(|py| (&r.borrow(py).inner).find_matches(inside)),
PyPattern::Regex(r) => Python::attach(|py| r.borrow(py).inner.find_matches(inside)),
}
}
}
Expand Down
2 changes: 2 additions & 0 deletions tokenizers/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -75,11 +75,13 @@ getrandom = { version = "0.3" }
esaxx-rs = { version = "0.1.10", default-features = false, features=[]}
monostate = "0.1.12"
ahash = { version = "0.8.11", features = ["serde"] }
pcre2 = { version = "0.2", optional = true }
dary_heap = { version = "0.3.6", features = ["serde"] }
compact_str = { version = "0.9", features = ["serde"] }

[features]
default = ["progressbar", "onig", "esaxx_fast"]
pcre2 = ["dep:pcre2", "fancy-regex"]
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why is one declared with the dep: prefix and not the other?

esaxx_fast = ["esaxx-rs/cpp"]
progressbar = ["indicatif"]
http = ["hf-hub"]
Expand Down
35 changes: 35 additions & 0 deletions tokenizers/benches/llama3_benchmark.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ mod common;
use common::{iter_bench_encode, iter_bench_encode_batch, iter_bench_train};
use criterion::{Criterion, Throughput};
use std::hint::black_box;
use std::sync::Arc;
use tokenizers::{
models::{bpe::BpeTrainerBuilder, TrainerWrapper},
EncodeInput, Tokenizer,
Expand Down Expand Up @@ -43,6 +44,40 @@ pub fn llama3(c: &mut Criterion) {
group.bench_function("llama3-batch", |b| {
b.iter_custom(|iters| iter_bench_encode_batch(iters, &tokenizer, &batches))
});
// Concurrent long-context: N threads each encode a different large input (80k chars)
let all_lines: Vec<&str> = data.lines().collect();
let lines_per_thread = 1000;
let tokenizer_arc = Arc::new(tokenizer.clone());
for num_threads in [1, 2, 4, 8] {
let inputs: Vec<String> = (0..num_threads)
.map(|i| {
let start = i * lines_per_thread;
all_lines[start..start + lines_per_thread].join("\n")
})
.collect();
let total_bytes: usize = inputs.iter().map(|s| s.len()).sum();
let tok = tokenizer_arc.clone();
group.throughput(Throughput::Bytes(total_bytes as u64));
group.bench_function(format!("llama3-concurrent-long-{num_threads}t"), move |b| {
b.iter(|| {
std::thread::scope(|s| {
let handles: Vec<_> = inputs
.iter()
.map(|input| {
let tok = &tok;
s.spawn(move || {
black_box(tok.encode(black_box(input.as_str()), false).unwrap())
})
})
.collect();
for h in handles {
h.join().unwrap();
}
});
})
});
}

let mut trainer: TrainerWrapper = BpeTrainerBuilder::default()
.show_progress(false)
.build()
Expand Down
1 change: 0 additions & 1 deletion tokenizers/src/normalizers/replace.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
use crate::tokenizer::pattern::Pattern;
use crate::tokenizer::Decoder;
use crate::tokenizer::{NormalizedString, Normalizer, Result};
use crate::utils::SysRegex;
Expand Down
18 changes: 1 addition & 17 deletions tokenizers/src/tokenizer/pattern.rs
Original file line number Diff line number Diff line change
Expand Up @@ -62,23 +62,7 @@ impl Pattern for &Regex {

impl Pattern for &SysRegex {
fn find_matches(&self, inside: &str) -> Result<Vec<(Offsets, bool)>> {
if inside.is_empty() {
return Ok(vec![((0, 0), false)]);
}

let mut prev = 0;
let mut splits = Vec::with_capacity(inside.len());
for (start, end) in self.find_iter(inside) {
if prev != start {
splits.push(((prev, start), false));
}
splits.push(((start, end), true));
prev = end;
}
if prev != inside.len() {
splits.push(((prev, inside.len()), false))
}
Ok(splits)
SysRegex::find_matches(self, inside)
}
}

Expand Down
24 changes: 24 additions & 0 deletions tokenizers/src/utils/fancy.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,30 @@ impl SysRegex {
regex: Regex::new(regex_str)?,
})
}

pub fn find_matches(
&self,
inside: &str,
) -> Result<Vec<(Offsets, bool)>, Box<dyn Error + Send + Sync + 'static>> {
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this doesn't need to be a Result or am I missing something?

if inside.is_empty() {
return Ok(vec![((0, 0), false)]);
}

let mut prev = 0;
let mut splits = Vec::with_capacity(inside.len());
for matched in self.regex.find_iter(inside) {
let matched = matched?;
if prev != matched.start() {
splits.push(((prev, matched.start()), false));
}
splits.push(((matched.start(), matched.end()), true));
prev = matched.end();
}
if prev != inside.len() {
splits.push(((prev, inside.len()), false));
}
Ok(splits)
}
}

pub struct Matches<'r, 't>(fancy_regex::Matches<'r, 't>);
Expand Down
23 changes: 15 additions & 8 deletions tokenizers/src/utils/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,17 +2,24 @@ pub(crate) mod cache;
#[cfg(feature = "http")]
pub(crate) mod from_pretrained;

#[cfg(all(feature = "fancy-regex", not(feature = "onig")))]
mod fancy;
#[cfg(all(feature = "fancy-regex", not(feature = "onig")))]
pub use fancy::SysRegex;
#[cfg(feature = "onig")]
// Regex backend priority: pcre2 (JIT) > onig > fancy-regex
#[cfg(feature = "pcre2")]
mod pcre2_backend;
#[cfg(feature = "pcre2")]
pub use pcre2_backend::SysRegex;

#[cfg(all(feature = "onig", not(feature = "pcre2")))]
mod onig;
#[cfg(feature = "onig")]
#[cfg(all(feature = "onig", not(feature = "pcre2")))]
pub use crate::utils::onig::SysRegex;

#[cfg(not(any(feature = "onig", feature = "fancy-regex")))]
compile_error!("One of the `onig`, or `fancy-regex` features must be enabled");
#[cfg(all(feature = "fancy-regex", not(feature = "onig"), not(feature = "pcre2")))]
mod fancy;
#[cfg(all(feature = "fancy-regex", not(feature = "onig"), not(feature = "pcre2")))]
pub use fancy::SysRegex;

#[cfg(not(any(feature = "onig", feature = "fancy-regex", feature = "pcre2")))]
compile_error!("One of the `pcre2`, `onig`, or `fancy-regex` features must be enabled");

pub mod iter;
pub mod padding;
Expand Down
20 changes: 20 additions & 0 deletions tokenizers/src/utils/onig.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,26 @@ impl SysRegex {
regex: Regex::new(regex_str)?,
})
}

pub fn find_matches(&self, inside: &str) -> Result<Vec<(Offsets, bool)>> {
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

same here for the Result?

if inside.is_empty() {
return Ok(vec![((0, 0), false)]);
}

let mut prev = 0;
let mut splits = Vec::with_capacity(inside.len());
for (start, end) in self.regex.find_iter(inside) {
if prev != start {
splits.push(((prev, start), false));
}
splits.push(((start, end), true));
prev = end;
}
if prev != inside.len() {
splits.push(((prev, inside.len()), false));
}
Ok(splits)
}
}

impl Pattern for &Regex {
Expand Down
Loading