Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions tokenizers/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ monostate = "0.1.12"
ahash = { version = "0.8.11", features = ["serde"] }
dary_heap = { version = "0.3.6", features = ["serde"] }
compact_str = { version = "0.9", features = ["serde"] }
pcre2 = { version = "0.2.11", optional = true }

[features]
default = ["progressbar", "onig", "esaxx_fast"]
Expand Down
16 changes: 10 additions & 6 deletions tokenizers/src/utils/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,17 +2,21 @@ pub(crate) mod cache;
#[cfg(feature = "http")]
pub(crate) mod from_pretrained;

#[cfg(all(feature = "fancy-regex", not(feature = "onig")))]
#[cfg(all(feature = "fancy-regex", not(feature = "onig"), not(feature = "pcre2")))]
mod fancy;
#[cfg(all(feature = "fancy-regex", not(feature = "onig")))]
#[cfg(all(feature = "fancy-regex", not(feature = "onig"), not(feature = "pcre2")))]
pub use fancy::SysRegex;
#[cfg(feature = "onig")]
#[cfg(all(feature = "onig", not(feature = "pcre2")))]
mod onig;
#[cfg(feature = "onig")]
#[cfg(all(feature = "onig", not(feature = "pcre2")))]
pub use crate::utils::onig::SysRegex;
#[cfg(feature = "pcre2")]
mod pcre2;
#[cfg(feature = "pcre2")]
pub use crate::utils::pcre2::SysRegex;

#[cfg(not(any(feature = "onig", feature = "fancy-regex")))]
compile_error!("One of the `onig`, or `fancy-regex` features must be enabled");
#[cfg(not(any(feature = "onig", feature = "fancy-regex", feature = "pcre2")))]
compile_error!("One of the `onig`, `fancy-regex`, or `pcre2` features must be enabled");

pub mod iter;
pub mod padding;
Expand Down
36 changes: 36 additions & 0 deletions tokenizers/src/utils/pcre2.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
use pcre2::bytes::RegexBuilder;

/// Regular-expression wrapper backed by a compiled `pcre2::bytes::Regex`.
///
/// Built through `SysRegex::new` and searched through `SysRegex::find_iter`.
#[derive(Debug)]
pub struct SysRegex {
/// The compiled PCRE2 pattern used for every search.
regex: pcre2::bytes::Regex,
}

impl SysRegex {
    /// Compiles `regex_str` into a [`SysRegex`].
    ///
    /// The pattern is built with the `jit_if_available`, `utf` and `ucp`
    /// builder options all switched on.
    ///
    /// # Errors
    ///
    /// Returns the error produced by the PCRE2 builder, boxed, when the
    /// pattern fails to build.
    pub fn new(
        regex_str: &str,
    ) -> std::result::Result<Self, Box<dyn std::error::Error + Send + Sync + 'static>> {
        let mut builder = RegexBuilder::new();
        builder.jit_if_available(true).utf(true).ucp(true);

        let compiled = builder.build(regex_str)?;
        Ok(Self { regex: compiled })
    }

    /// Returns an iterator over the matches of this regex inside `inside`.
    ///
    /// The search runs over `inside.as_bytes()`, so the positions yielded by
    /// the returned [`Matches`] are offsets into that byte slice.
    pub fn find_iter<'r, 't>(&'r self, inside: &'t str) -> Matches<'r, 't> {
        let haystack = inside.as_bytes();
        let raw_matches = self.regex.find_iter(haystack);
        Matches(raw_matches)
    }
}

/// Iterator adapter around `pcre2::bytes::Matches`; its `Iterator` impl yields
/// `(start, end)` pairs instead of the underlying `Result`-wrapped matches.
pub struct Matches<'r, 't>(pcre2::bytes::Matches<'r, 't>);

impl Iterator for Matches<'_, '_> {
    /// `(start, end)` positions of a single match.
    type Item = (usize, usize);

    /// Advances the wrapped PCRE2 iterator and returns the next match span.
    ///
    /// Yields `None` both when the wrapped iterator is exhausted and when it
    /// reports a matching error: the error value is discarded and the caller
    /// only sees the iteration stop.
    fn next(&mut self) -> Option<Self::Item> {
        // First `?`: underlying iterator is exhausted.
        // Second `?`: underlying iterator reported an error, which is dropped.
        let found = self.0.next()?.ok()?;
        Some((found.start(), found.end()))
    }
}
Loading