diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 7c5e5a1f0..04c41b04c 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -67,6 +67,8 @@ jobs:
       # Run tests
       - name: Run tests
         run: cargo test
+      - name: Run percent-encoding IRI (iri) tests
+        run: cargo test -p percent-encoding --features iri
       # Run tests enabling the serde feature
       - name: Run tests with the serde feature
         run: cargo test --features "url/serde,url/expose_internals"
diff --git a/percent_encoding/Cargo.toml b/percent_encoding/Cargo.toml
index 562ba5f86..88501a4e0 100644
--- a/percent_encoding/Cargo.toml
+++ b/percent_encoding/Cargo.toml
@@ -13,6 +13,9 @@ rust-version = "1.51"
 default = ["std"]
 std = ["alloc"]
 alloc = []
+# Encode only ASCII code units in `AsciiSet`; leave UTF-8 non-ASCII bytes literal (IRI-style).
+# Used by Anki for Unicode paths in file:/media references; see `AsciiSet::should_percent_encode`.
+iri = []
 
 [package.metadata.docs.rs]
 rustdoc-args = ["--generate-link-to-definition"]
diff --git a/percent_encoding/src/ascii_set.rs b/percent_encoding/src/ascii_set.rs
index 26c66dde7..ad4ef22ae 100644
--- a/percent_encoding/src/ascii_set.rs
+++ b/percent_encoding/src/ascii_set.rs
@@ -50,7 +50,14 @@ impl AsciiSet {
     }
 
     pub(crate) fn should_percent_encode(&self, byte: u8) -> bool {
-        !byte.is_ascii() || self.contains(byte)
+        #[cfg(feature = "iri")]
+        {
+            byte.is_ascii() && self.contains(byte)
+        }
+        #[cfg(not(feature = "iri"))]
+        {
+            !byte.is_ascii() || self.contains(byte)
+        }
     }
 
     pub const fn add(&self, byte: u8) -> Self {
@@ -211,3 +218,28 @@ mod tests {
         assert!(COMPLEMENT.contains(b'C'));
     }
 }
+
+#[cfg(all(test, feature = "iri"))]
+mod iri_tests {
+    use super::*;
+
+    #[test]
+    fn should_percent_encode_leaves_non_ascii_utf8_unencoded() {
+        let set = AsciiSet::EMPTY.add(b'/').add(b'%');
+        for &byte in "日本語.mp3".as_bytes() {
+            assert!(
+                !set.should_percent_encode(byte),
+                "byte {:#x} should not be percent-encoded",
+                byte
+            );
+        }
+    }
+
+    #[test]
+    fn should_percent_encode_still_encodes_ascii_in_set() {
+        let set = AsciiSet::EMPTY.add(b' ').add(b'?');
+        assert!(set.should_percent_encode(b' '));
+        assert!(set.should_percent_encode(b'?'));
+        assert!(!set.should_percent_encode(b'a'));
+    }
+}
diff --git a/percent_encoding/src/lib.rs b/percent_encoding/src/lib.rs
index ee36e0c5a..5750b4636 100644
--- a/percent_encoding/src/lib.rs
+++ b/percent_encoding/src/lib.rs
@@ -98,7 +98,8 @@ pub fn percent_encode_byte(byte: u8) -> &'static str {
 
 /// Percent-encode the given bytes with the given set.
 ///
-/// Non-ASCII bytes and bytes in `ascii_set` are encoded.
+/// Bytes in `ascii_set` are encoded. Non-ASCII bytes are also encoded unless the crate
+/// feature `iri` is enabled (IRI-style: UTF-8 non-ASCII octets pass through).
 ///
 /// The return type:
 ///
@@ -418,6 +419,16 @@ mod tests {
         );
     }
 
+    #[cfg(feature = "iri")]
+    #[test]
+    fn utf8_percent_encode_unicode_filename_unchanged_for_path_set() {
+        const PATHISH: &AsciiSet = &CONTROLS.add(b'#').add(b'?').add(b'{').add(b'}');
+        assert_eq!(
+            super::utf8_percent_encode("日本語.mp3", PATHISH).collect::<String>(),
+            "日本語.mp3"
+        );
+    }
+
     #[test]
     fn percent_decode() {
         assert_eq!(