4 changes: 2 additions & 2 deletions bindings/node/src/processors.rs
@@ -71,10 +71,10 @@ pub fn roberta_processing(
 
 #[napi]
 pub fn byte_level_processing(trim_offsets: Option<bool>) -> Result<Processor> {
-    let mut byte_level = tk::processors::byte_level::ByteLevel::default();
+    let mut byte_level = tk::processors::byte_level::ByteLevelPostProcessor::default();
 
     if let Some(trim_offsets) = trim_offsets {
-        byte_level = byte_level.trim_offsets(trim_offsets);
+        byte_level.0 = byte_level.0.trim_offsets(trim_offsets);
     }
 
     Ok(Processor {
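
The `byte_level.0` access above suggests, though the hunks shown here never include the definition, that the new post-processor is a tuple struct wrapping the existing byte-level configuration. A minimal sketch of that assumed shape, purely for orientation and not copied from the PR:

// Assumed shape, inferred from the `byte_level.0` access and the
// `ByteLevelPostProcessor::from(byte_level)` call below; the real definition
// lives elsewhere in the PR.
pub struct ByteLevelPostProcessor(pub tokenizers::pre_tokenizers::byte_level::ByteLevel);
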
4 changes: 2 additions & 2 deletions bindings/python/src/decoders.rs
@@ -10,7 +10,7 @@ use serde::de::Error;
 use serde::{Deserialize, Deserializer, Serialize, Serializer};
 use tk::decoders::bpe::BPEDecoder;
 use tk::decoders::byte_fallback::ByteFallback;
-use tk::decoders::byte_level::ByteLevel;
+use tk::decoders::byte_level::ByteLevelDecoder;
 use tk::decoders::ctc::CTC;
 use tk::decoders::fuse::Fuse;
 use tk::decoders::metaspace::{Metaspace, PrependScheme};
@@ -187,7 +187,7 @@ impl PyByteLevelDec {
     #[new]
     #[pyo3(signature = (**_kwargs), text_signature = "(self)")]
     fn new(_kwargs: Option<&Bound<'_, PyDict>>) -> (Self, PyDecoder) {
-        (PyByteLevelDec {}, ByteLevel::default().into())
+        (PyByteLevelDec {}, ByteLevelDecoder::default().into())
     }
 }
 
7 changes: 5 additions & 2 deletions bindings/python/src/processors.rs
@@ -14,7 +14,7 @@ use serde::Deserializer;
 use serde::Serializer;
 use serde::{Deserialize, Serialize};
 use tk::processors::bert::BertProcessing;
-use tk::processors::byte_level::ByteLevel;
+use tk::processors::byte_level::{ByteLevel, ByteLevelPostProcessor};
 use tk::processors::roberta::RobertaProcessing;
 use tk::processors::template::{SpecialToken, Template};
 use tk::processors::PostProcessorWrapper;
@@ -538,7 +538,10 @@ impl PyByteLevel {
             byte_level = byte_level.use_regex(ur);
         }
 
-        (PyByteLevel {}, byte_level.into())
+        (
+            PyByteLevel {},
+            ByteLevelPostProcessor::from(byte_level).into(),
+        )
     }
 
     #[getter]
2 changes: 1 addition & 1 deletion bindings/python/tests/bindings/test_tokenizer.py
@@ -813,7 +813,7 @@ def test_repr_complete(self):
         out = repr(tokenizer)
         assert (
             out
-            == 'Tokenizer(version="1.0", truncation=None, padding=None, added_tokens=[], normalizer=Sequence(normalizers=[Lowercase(), Strip(strip_left=True, strip_right=True)]), pre_tokenizer=ByteLevel(add_prefix_space=True, trim_offsets=True, use_regex=True), post_processor=TemplateProcessing(single=[SpecialToken(id="[CLS]", type_id=0), Sequence(id=A, type_id=0), SpecialToken(id="[SEP]", type_id=0)], pair=[SpecialToken(id="[CLS]", type_id=0), Sequence(id=A, type_id=0), SpecialToken(id="[SEP]", type_id=0), Sequence(id=B, type_id=1), SpecialToken(id="[SEP]", type_id=1)], special_tokens={"[CLS]":SpecialToken(id="[CLS]", ids=[1], tokens=["[CLS]"]), "[SEP]":SpecialToken(id="[SEP]", ids=[0], tokens=["[SEP]"])}), decoder=None, model=BPE(dropout=None, unk_token=None, continuing_subword_prefix=None, end_of_word_suffix=None, fuse_unk=False, byte_fallback=False, ignore_merges=False, vocab={}, merges=[]))'
+            == 'Tokenizer(version="1.0", truncation=None, padding=None, added_tokens=[], normalizer=Sequence(normalizers=[Lowercase(), Strip(strip_left=True, strip_right=True)]), pre_tokenizer=ByteLevel(add_prefix_space=True, use_regex=True), post_processor=TemplateProcessing(single=[SpecialToken(id="[CLS]", type_id=0), Sequence(id=A, type_id=0), SpecialToken(id="[SEP]", type_id=0)], pair=[SpecialToken(id="[CLS]", type_id=0), Sequence(id=A, type_id=0), SpecialToken(id="[SEP]", type_id=0), Sequence(id=B, type_id=1), SpecialToken(id="[SEP]", type_id=1)], special_tokens={"[CLS]":SpecialToken(id="[CLS]", ids=[1], tokens=["[CLS]"]), "[SEP]":SpecialToken(id="[SEP]", ids=[0], tokens=["[SEP]"])}), decoder=None, model=BPE(dropout=None, unk_token=None, continuing_subword_prefix=None, end_of_word_suffix=None, fuse_unk=False, byte_fallback=False, ignore_merges=False, vocab={}, merges=[]))'
         )
 
 
5 changes: 3 additions & 2 deletions tokenizers/benches/bpe_benchmark.rs
@@ -4,9 +4,10 @@ extern crate criterion;
 mod common;
 
 use criterion::{Criterion, Throughput};
+use tokenizers::decoders::byte_level::ByteLevel as ByteLevelDecoder;
 use tokenizers::models::bpe::{BpeTrainerBuilder, BPE};
 use tokenizers::models::TrainerWrapper;
-use tokenizers::pre_tokenizers::byte_level::ByteLevel;
+use tokenizers::pre_tokenizers::byte_level::{ByteLevel, ByteLevelDecoder};
 use tokenizers::pre_tokenizers::whitespace::Whitespace;
 use tokenizers::tokenizer::{AddedToken, EncodeInput};
 use tokenizers::Tokenizer;
@@ -19,7 +20,7 @@ static BATCH_SIZE: usize = 1_000;
 fn create_gpt2_tokenizer(bpe: BPE) -> Tokenizer {
     let mut tokenizer = Tokenizer::new(bpe);
     tokenizer.with_pre_tokenizer(Some(ByteLevel::default()));
-    tokenizer.with_decoder(Some(ByteLevel::default()));
+    tokenizer.with_decoder(Some(ByteLevelDecoder::default()));
     tokenizer.add_tokens(&[AddedToken::from("ing", false).single_word(false)]);
     tokenizer.add_special_tokens(&[AddedToken::from("[ENT]", true).single_word(true)]);
     tokenizer
4 changes: 2 additions & 2 deletions tokenizers/benches/truncation_benchmark.rs
@@ -4,7 +4,7 @@ extern crate criterion;
 use criterion::{BenchmarkId, Criterion, Throughput};
 use std::hint::black_box;
 use tokenizers::models::bpe::BPE;
-use tokenizers::pre_tokenizers::byte_level::ByteLevel;
+use tokenizers::pre_tokenizers::byte_level::{ByteLevel, ByteLevelDecoder};
 use tokenizers::tokenizer::{
     AddedToken, TruncationDirection, TruncationParams, TruncationStrategy,
 };
@@ -16,7 +16,7 @@ fn create_gpt2_tokenizer() -> Tokenizer {
         .unwrap();
     let mut tokenizer = Tokenizer::new(bpe);
     tokenizer.with_pre_tokenizer(Some(ByteLevel::default()));
-    tokenizer.with_decoder(Some(ByteLevel::default()));
+    tokenizer.with_decoder(Some(ByteLevelDecoder::default()));
     tokenizer.add_tokens(&[AddedToken::from("ing", false).single_word(false)]);
     tokenizer.add_special_tokens(&[AddedToken::from("[ENT]", true).single_word(true)]);
     tokenizer
8 changes: 4 additions & 4 deletions tokenizers/src/decoders/mod.rs
@@ -20,15 +20,15 @@ use crate::decoders::sequence::Sequence;
 use crate::decoders::strip::Strip;
 use crate::decoders::wordpiece::WordPiece;
 use crate::normalizers::replace::Replace;
-use crate::pre_tokenizers::byte_level::ByteLevel;
+use crate::pre_tokenizers::byte_level::ByteLevelDecoder;
 use crate::pre_tokenizers::metaspace::Metaspace;
 use crate::{Decoder, Result};
 
 #[derive(Serialize, Clone, Debug)]
 #[serde(untagged)]
 pub enum DecoderWrapper {
     BPE(BPEDecoder),
-    ByteLevel(ByteLevel),
+    ByteLevel(ByteLevelDecoder),
     WordPiece(WordPiece),
     Metaspace(Metaspace),
     CTC(CTC),
@@ -76,7 +76,7 @@ impl<'de> Deserialize<'de> for DecoderWrapper {
         #[serde(untagged)]
         pub enum DecoderUntagged {
             BPE(BPEDecoder),
-            ByteLevel(ByteLevel),
+            ByteLevel(ByteLevelDecoder),
             WordPiece(WordPiece),
             Metaspace(Metaspace),
             CTC(CTC),
@@ -167,7 +167,7 @@ impl Decoder for DecoderWrapper {
 }
 
 impl_enum_from!(BPEDecoder, DecoderWrapper, BPE);
-impl_enum_from!(ByteLevel, DecoderWrapper, ByteLevel);
+impl_enum_from!(ByteLevelDecoder, DecoderWrapper, ByteLevel);
 impl_enum_from!(ByteFallback, DecoderWrapper, ByteFallback);
 impl_enum_from!(Fuse, DecoderWrapper, Fuse);
 impl_enum_from!(Strip, DecoderWrapper, Strip);
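
Taken together, these hunks appear to split the byte-level component by role: the pre-tokenizer keeps the ByteLevel name, decoding moves to ByteLevelDecoder, and post-processing to ByteLevelPostProcessor. A minimal sketch of how a GPT-2-style pipeline would be wired up after this change, assuming the paths and builder calls visible in the hunks above (with_post_processor and the Into conversions for the new types are assumptions, not shown in this diff):

// Illustrative sketch only; not part of the PR.
use tokenizers::models::bpe::BPE;
use tokenizers::pre_tokenizers::byte_level::{ByteLevel, ByteLevelDecoder};
use tokenizers::processors::byte_level::ByteLevelPostProcessor;
use tokenizers::Tokenizer;

fn build_byte_level_tokenizer(bpe: BPE) -> Tokenizer {
    let mut tokenizer = Tokenizer::new(bpe);
    // Pre-tokenization keeps the original ByteLevel type.
    tokenizer.with_pre_tokenizer(Some(ByteLevel::default()));
    // Decoding now goes through the dedicated ByteLevelDecoder type.
    tokenizer.with_decoder(Some(ByteLevelDecoder::default()));
    // Post-processing uses the new ByteLevelPostProcessor wrapper.
    tokenizer.with_post_processor(Some(ByteLevelPostProcessor::default()));
    tokenizer
}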