From 0a8e4b18a58c939186d18cd25848d0fb56718e17 Mon Sep 17 00:00:00 2001 From: James Seo Date: Thu, 19 Mar 2026 23:10:30 -0700 Subject: [PATCH 1/8] gh-146192: Add base32 support to binascii Add base32 encoder and decoder functions implemented in C to `binascii` and use them to greatly improve the performance and reduce the memory usage of the existing base32 codec functions in `base64`. No API or documentation changes are necessary with respect to any functions in `base64`, and all existing unit tests for those functions continue to pass without modification. Resolves: gh-146192 --- Doc/library/binascii.rst | 43 ++ Lib/base64.py | 85 +--- Lib/test/test_binascii.py | 304 +++++++++++- ...-03-20-00-39-25.gh-issue-146192.8aQ6sC.rst | 2 + Modules/binascii.c | 455 ++++++++++++++++++ Modules/clinic/binascii.c.h | 128 ++++- 6 files changed, 938 insertions(+), 79 deletions(-) create mode 100644 Misc/NEWS.d/next/Core_and_Builtins/2026-03-20-00-39-25.gh-issue-146192.8aQ6sC.rst diff --git a/Doc/library/binascii.rst b/Doc/library/binascii.rst index 70ba036756ff32..9137b7203698df 100644 --- a/Doc/library/binascii.rst +++ b/Doc/library/binascii.rst @@ -182,6 +182,49 @@ The :mod:`!binascii` module defines the following functions: .. versionadded:: 3.15 +.. function:: a2b_base32(string, /) + + Convert base32 data back to binary and return the binary data. + + Valid base32 data: + + * Conforms to :rfc:`4648`. + * Contains only characters from the base32 alphabet. + * Contains no excess data after padding (including excess padding, newlines, etc.). + * Does not start with padding. + + Invalid base32 data will raise :exc:`binascii.Error`. + + .. versionadded:: 3.15 + +.. function:: b2a_base32(data, /) + + Convert binary data to a line(s) of ASCII characters in base32 coding, + as specified in :rfc:`4648`. The return value is the converted line. + + .. versionadded:: 3.15 + +.. function:: a2b_base32hex(string, /) + + Convert base32hex data back to binary and return the binary data. 
+ + Valid base32hex data: + + * Conforms to :rfc:`4648`. + * Contains only characters from the base32hex alphabet. + * Contains no excess data after padding (including excess padding, newlines, etc.). + * Does not start with padding. + + Invalid base32hex data will raise :exc:`binascii.Error`. + + .. versionadded:: 3.15 + +.. function:: b2a_base32hex(data, /) + + Convert binary data to a line(s) of ASCII characters in base32hex coding, + as specified in :rfc:`4648`. The return value is the converted line. + + .. versionadded:: 3.15 .. function:: a2b_qp(data, header=False) diff --git a/Lib/base64.py b/Lib/base64.py index a429760da79f2a..576d429522ba31 100644 --- a/Lib/base64.py +++ b/Lib/base64.py @@ -206,51 +206,8 @@ def urlsafe_b64decode(s): the letter O). For security purposes the default is None, so that 0 and 1 are not allowed in the input. ''' -_b32alphabet = b'ABCDEFGHIJKLMNOPQRSTUVWXYZ234567' -_b32hexalphabet = b'0123456789ABCDEFGHIJKLMNOPQRSTUV' -_b32tab2 = {} -_b32rev = {} - -def _b32encode(alphabet, s): - # Delay the initialization of the table to not waste memory - # if the function is never called - if alphabet not in _b32tab2: - b32tab = [bytes((i,)) for i in alphabet] - _b32tab2[alphabet] = [a + b for a in b32tab for b in b32tab] - b32tab = None - - if not isinstance(s, bytes_types): - s = memoryview(s).tobytes() - leftover = len(s) % 5 - # Pad the last quantum with zero bits if necessary - if leftover: - s = s + b'\0' * (5 - leftover) # Don't use += ! 
- encoded = bytearray() - from_bytes = int.from_bytes - b32tab2 = _b32tab2[alphabet] - for i in range(0, len(s), 5): - c = from_bytes(s[i: i + 5]) # big endian - encoded += (b32tab2[c >> 30] + # bits 1 - 10 - b32tab2[(c >> 20) & 0x3ff] + # bits 11 - 20 - b32tab2[(c >> 10) & 0x3ff] + # bits 21 - 30 - b32tab2[c & 0x3ff] # bits 31 - 40 - ) - # Adjust for any leftover partial quanta - if leftover == 1: - encoded[-6:] = b'======' - elif leftover == 2: - encoded[-4:] = b'====' - elif leftover == 3: - encoded[-3:] = b'===' - elif leftover == 4: - encoded[-1:] = b'=' - return encoded.take_bytes() - -def _b32decode(alphabet, s, casefold=False, map01=None): - # Delay the initialization of the table to not waste memory - # if the function is never called - if alphabet not in _b32rev: - _b32rev[alphabet] = {v: k for k, v in enumerate(alphabet)} + +def _b32decode_prepare(s, casefold=False, map01=None): s = _bytes_from_decode_data(s) if len(s) % 8: raise binascii.Error('Incorrect padding') @@ -263,51 +220,27 @@ def _b32decode(alphabet, s, casefold=False, map01=None): s = s.translate(bytes.maketrans(b'01', b'O' + map01)) if casefold: s = s.upper() - # Strip off pad characters from the right. We need to count the pad - # characters because this will tell us how many null bytes to remove from - # the end of the decoded string. 
- l = len(s) - s = s.rstrip(b'=') - padchars = l - len(s) - # Now decode the full quanta - decoded = bytearray() - b32rev = _b32rev[alphabet] - for i in range(0, len(s), 8): - quanta = s[i: i + 8] - acc = 0 - try: - for c in quanta: - acc = (acc << 5) + b32rev[c] - except KeyError: - raise binascii.Error('Non-base32 digit found') from None - decoded += acc.to_bytes(5) # big endian - # Process the last, partial quanta - if l % 8 or padchars not in {0, 1, 3, 4, 6}: - raise binascii.Error('Incorrect padding') - if padchars and decoded: - acc <<= 5 * padchars - last = acc.to_bytes(5) # big endian - leftover = (43 - 5 * padchars) // 8 # 1: 4, 3: 3, 4: 2, 6: 1 - decoded[-5:] = last[:leftover] - return decoded.take_bytes() + return s def b32encode(s): - return _b32encode(_b32alphabet, s) + return binascii.b2a_base32(s) b32encode.__doc__ = _B32_ENCODE_DOCSTRING.format(encoding='base32') def b32decode(s, casefold=False, map01=None): - return _b32decode(_b32alphabet, s, casefold, map01) + s = _b32decode_prepare(s, casefold, map01) + return binascii.a2b_base32(s) b32decode.__doc__ = _B32_DECODE_DOCSTRING.format(encoding='base32', extra_args=_B32_DECODE_MAP01_DOCSTRING) def b32hexencode(s): - return _b32encode(_b32hexalphabet, s) + return binascii.b2a_base32hex(s) b32hexencode.__doc__ = _B32_ENCODE_DOCSTRING.format(encoding='base32hex') def b32hexdecode(s, casefold=False): # base32hex does not have the 01 mapping - return _b32decode(_b32hexalphabet, s, casefold) + s = _b32decode_prepare(s, casefold) + return binascii.a2b_base32hex(s) b32hexdecode.__doc__ = _B32_DECODE_DOCSTRING.format(encoding='base32hex', extra_args='') diff --git a/Lib/test/test_binascii.py b/Lib/test/test_binascii.py index 667ec9b5241aa9..3ac468d636d203 100644 --- a/Lib/test/test_binascii.py +++ b/Lib/test/test_binascii.py @@ -10,10 +10,10 @@ # Note: "*_hex" functions are aliases for "(un)hexlify" -b2a_functions = ['b2a_ascii85', 'b2a_base64', 'b2a_base85', +b2a_functions = ['b2a_ascii85', 'b2a_base32', 
'b2a_base32hex', 'b2a_base64', 'b2a_base85', 'b2a_hex', 'b2a_qp', 'b2a_uu', 'hexlify'] -a2b_functions = ['a2b_ascii85', 'a2b_base64', 'a2b_base85', +a2b_functions = ['a2b_ascii85', 'a2b_base32', 'a2b_base32hex', 'a2b_base64', 'a2b_base85', 'a2b_hex', 'a2b_qp', 'a2b_uu', 'unhexlify'] all_functions = a2b_functions + b2a_functions + ['crc32', 'crc_hqx'] @@ -670,6 +670,306 @@ def test_base85_alphabet(self): with self.assertRaises(TypeError): binascii.a2b_base64(data, alphabet=bytearray(alphabet)) + def test_base32_valid(self): + # Test base32 with valid data + lines = [] + step = 0 + i = 0 + while i < len(self.rawdata): + b = self.type2test(self.rawdata[i:i + step]) + a = binascii.b2a_base32(b) + lines.append(a) + i += step + step += 1 + res = bytes() + for line in lines: + a = self.type2test(line) + b = binascii.a2b_base32(a) + res += b + self.assertEqual(res, self.rawdata) + + def test_base32_errors(self): + def _fixPadding(data): + fixed = data.replace(b"=", b"") + len_8 = len(fixed) % 8 + p = 8 - len_8 if len_8 else 0 + return fixed + b"=" * p + + def _assertRegexTemplate(assert_regex, data, good_padding_result=None): + with self.assertRaisesRegex(binascii.Error, assert_regex): + binascii.a2b_base32(self.type2test(data)) + if good_padding_result: + fixed = self.type2test(_fixPadding(data)) + self.assertEqual(binascii.a2b_base32(fixed), good_padding_result) + + def assertNonBase32Data(*args): + _assertRegexTemplate(r"(?i)Only base32 data", *args) + + def assertExcessData(*args): + _assertRegexTemplate(r"(?i)Excess data", *args) + + def assertExcessPadding(*args): + _assertRegexTemplate(r"(?i)Excess padding", *args) + + def assertLeadingPadding(*args): + _assertRegexTemplate(r"(?i)Leading padding", *args) + + def assertIncorrectPadding(*args): + _assertRegexTemplate(r"(?i)Incorrect padding", *args) + + def assertDiscontinuousPadding(*args): + _assertRegexTemplate(r"(?i)Discontinuous padding", *args) + + def assertInvalidLength(*args): + 
_assertRegexTemplate(r"(?i)Invalid.+number of data characters", *args) + + assertNonBase32Data(b"a") + assertNonBase32Data(b"AA-") + assertNonBase32Data(b"ABCDE==!") + assertNonBase32Data(b"ab:(){:|:&};:==") + + assertExcessData(b"AB======C") + assertExcessData(b"AB======CD") + assertExcessData(b"ABCD====E") + assertExcessData(b"ABCDE===FGH") + assertExcessData(b"ABCDEFG=H") + assertExcessData(b"432Z====55555555") + + assertExcessData(b"BE======EF", b"\t\x08") + assertExcessData(b"BEEF====C", b"\t\x08Q") + assertExcessData(b"BEEFC===AK", b"\t\x08Q\x01") + assertExcessData(b"BEEFCAK=E", b"\t\x08Q\x01D") + + assertExcessPadding(b"BE=======", b"\t") + assertExcessPadding(b"BE========", b"\t") + assertExcessPadding(b"BEEF=====", b"\t\x08") + assertExcessPadding(b"BEEF======", b"\t\x08") + assertExcessPadding(b"BEEFC====", b"\t\x08Q") + assertExcessPadding(b"BEEFC=====", b"\t\x08Q") + assertExcessPadding(b"BEEFCAK==", b"\t\x08Q\x01") + assertExcessPadding(b"BEEFCAK===", b"\t\x08Q\x01") + assertExcessPadding(b"BEEFCAKE=", b"\t\x08Q\x01D") + assertExcessPadding(b"BEEFCAKE==", b"\t\x08Q\x01D") + assertExcessPadding(b"BEEFCAKE===", b"\t\x08Q\x01D") + assertExcessPadding(b"BEEFCAKE====", b"\t\x08Q\x01D") + assertExcessPadding(b"BEEFCAKE=====", b"\t\x08Q\x01D") + assertExcessPadding(b"BEEFCAKE======", b"\t\x08Q\x01D") + assertExcessPadding(b"BEEFCAKE=======", b"\t\x08Q\x01D") + assertExcessPadding(b"BEEFCAKE========", b"\t\x08Q\x01D") + assertExcessPadding(b"BEEFCAKE=========", b"\t\x08Q\x01D") + + assertLeadingPadding(b"=", b"") + assertLeadingPadding(b"==", b"") + assertLeadingPadding(b"===", b"") + assertLeadingPadding(b"====", b"") + assertLeadingPadding(b"=====", b"") + assertLeadingPadding(b"======", b"") + assertLeadingPadding(b"=======", b"") + assertLeadingPadding(b"========", b"") + assertLeadingPadding(b"=========", b"") + assertLeadingPadding(b"=BEEFCAKE", b"\t\x08Q\x01D") + assertLeadingPadding(b"==BEEFCAKE", b"\t\x08Q\x01D") + 
assertLeadingPadding(b"===BEEFCAKE", b"\t\x08Q\x01D") + assertLeadingPadding(b"====BEEFCAKE", b"\t\x08Q\x01D") + assertLeadingPadding(b"=====BEEFCAKE", b"\t\x08Q\x01D") + assertLeadingPadding(b"======BEEFCAKE", b"\t\x08Q\x01D") + assertLeadingPadding(b"=======BEEFCAKE", b"\t\x08Q\x01D") + assertLeadingPadding(b"========BEEFCAKE", b"\t\x08Q\x01D") + assertLeadingPadding(b"=========BEEFCAKE", b"\t\x08Q\x01D") + + assertIncorrectPadding(b"A") + assertIncorrectPadding(b"AB") + assertIncorrectPadding(b"ABC") + assertIncorrectPadding(b"ABCD") + assertIncorrectPadding(b"ABCDE") + assertIncorrectPadding(b"ABCDEF") + assertIncorrectPadding(b"ABCDEFG") + + assertIncorrectPadding(b"BE=", b"\t") + assertIncorrectPadding(b"BE==", b"\t") + assertIncorrectPadding(b"BE===", b"\t") + assertIncorrectPadding(b"BE====", b"\t") + assertIncorrectPadding(b"BE=====", b"\t") + assertIncorrectPadding(b"BEEF=", b"\t\x08") + assertIncorrectPadding(b"BEEF==", b"\t\x08") + assertIncorrectPadding(b"BEEF===", b"\t\x08") + assertIncorrectPadding(b"BEEFC=", b"\t\x08Q") + assertIncorrectPadding(b"BEEFC==", b"\t\x08Q") + + assertDiscontinuousPadding(b"BE=EF===", b"\t\x08") + assertDiscontinuousPadding(b"BE==EF==", b"\t\x08") + assertDiscontinuousPadding(b"BEEF=C==", b"\t\x08Q") + assertDiscontinuousPadding(b"BEEFC=AK", b"\t\x08Q\x01") + + assertInvalidLength(b"A=") + assertInvalidLength(b"A==") + assertInvalidLength(b"A===") + assertInvalidLength(b"A====") + assertInvalidLength(b"A=====") + assertInvalidLength(b"A======") + assertInvalidLength(b"ABC=") + assertInvalidLength(b"ABC==") + assertInvalidLength(b"ABC===") + assertInvalidLength(b"ABC====") + assertInvalidLength(b"ABCDEF=") + + assertInvalidLength(b"B=E=====", b"\t") + assertInvalidLength(b"B==E====", b"\t") + assertInvalidLength(b"BEE=F===", b"\t\x08") + assertInvalidLength(b"BEE==F==", b"\t\x08") + assertInvalidLength(b"BEEFCA=K", b"\t\x08Q\x01") + assertInvalidLength(b"BEEFCA=====K", b"\t\x08Q\x01") + + def test_base32hex_valid(self): + # 
Test base32hex with valid data + lines = [] + step = 0 + i = 0 + while i < len(self.rawdata): + b = self.type2test(self.rawdata[i:i + step]) + a = binascii.b2a_base32hex(b) + lines.append(a) + i += step + step += 1 + res = bytes() + for line in lines: + a = self.type2test(line) + b = binascii.a2b_base32hex(a) + res += b + self.assertEqual(res, self.rawdata) + + def test_base32hex_errors(self): + def _fixPadding(data): + fixed = data.replace(b"=", b"") + len_8 = len(fixed) % 8 + p = 8 - len_8 if len_8 else 0 + return fixed + b"=" * p + + def _assertRegexTemplate(assert_regex, data, good_padding_result=None): + with self.assertRaisesRegex(binascii.Error, assert_regex): + binascii.a2b_base32hex(self.type2test(data)) + if good_padding_result: + fixed = self.type2test(_fixPadding(data)) + self.assertEqual(binascii.a2b_base32hex(fixed), good_padding_result) + + def assertNonBase32HexData(*args): + _assertRegexTemplate(r"(?i)Only base32hex data", *args) + + def assertExcessData(*args): + _assertRegexTemplate(r"(?i)Excess data", *args) + + def assertExcessPadding(*args): + _assertRegexTemplate(r"(?i)Excess padding", *args) + + def assertLeadingPadding(*args): + _assertRegexTemplate(r"(?i)Leading padding", *args) + + def assertIncorrectPadding(*args): + _assertRegexTemplate(r"(?i)Incorrect padding", *args) + + def assertDiscontinuousPadding(*args): + _assertRegexTemplate(r"(?i)Discontinuous padding", *args) + + def assertInvalidLength(*args): + _assertRegexTemplate(r"(?i)Invalid.+number of data characters", *args) + + assertNonBase32HexData(b"a") + assertNonBase32HexData(b"AA-") + assertNonBase32HexData(b"ABCDE==!") + assertNonBase32HexData(b"ab:(){:|:&};:==") + + assertExcessData(b"AB======C") + assertExcessData(b"AB======CD") + assertExcessData(b"ABCD====E") + assertExcessData(b"ABCDE===FGH") + assertExcessData(b"ABCDEFG=H") + assertExcessData(b"4321====55555555") + + assertExcessData(b"BE======EF", b"[\x9c") + assertExcessData(b"BEEF====C", b"[\x9c\xf6") + 
assertExcessData(b"BEEFC===AK", b"[\x9c\xf6*") + assertExcessData(b"BEEFCAK=E", b"[\x9c\xf6*\x8e") + + assertExcessPadding(b"BE=======", b"[") + assertExcessPadding(b"BE========", b"[") + assertExcessPadding(b"BEEF=====", b"[\x9c") + assertExcessPadding(b"BEEF======", b"[\x9c") + assertExcessPadding(b"BEEFC====", b"[\x9c\xf6") + assertExcessPadding(b"BEEFC=====", b"[\x9c\xf6") + assertExcessPadding(b"BEEFCAK==", b"[\x9c\xf6*") + assertExcessPadding(b"BEEFCAK===", b"[\x9c\xf6*") + assertExcessPadding(b"BEEFCAKE=", b"[\x9c\xf6*\x8e") + assertExcessPadding(b"BEEFCAKE==", b"[\x9c\xf6*\x8e") + assertExcessPadding(b"BEEFCAKE===", b"[\x9c\xf6*\x8e") + assertExcessPadding(b"BEEFCAKE====", b"[\x9c\xf6*\x8e") + assertExcessPadding(b"BEEFCAKE=====", b"[\x9c\xf6*\x8e") + assertExcessPadding(b"BEEFCAKE======", b"[\x9c\xf6*\x8e") + assertExcessPadding(b"BEEFCAKE=======", b"[\x9c\xf6*\x8e") + assertExcessPadding(b"BEEFCAKE========", b"[\x9c\xf6*\x8e") + assertExcessPadding(b"BEEFCAKE=========", b"[\x9c\xf6*\x8e") + + assertLeadingPadding(b"=", b"") + assertLeadingPadding(b"==", b"") + assertLeadingPadding(b"===", b"") + assertLeadingPadding(b"====", b"") + assertLeadingPadding(b"=====", b"") + assertLeadingPadding(b"======", b"") + assertLeadingPadding(b"=======", b"") + assertLeadingPadding(b"========", b"") + assertLeadingPadding(b"=========", b"") + assertLeadingPadding(b"=BEEFCAKE", b"[\x9c\xf6*\x8e") + assertLeadingPadding(b"==BEEFCAKE", b"[\x9c\xf6*\x8e") + assertLeadingPadding(b"===BEEFCAKE", b"[\x9c\xf6*\x8e") + assertLeadingPadding(b"====BEEFCAKE", b"[\x9c\xf6*\x8e") + assertLeadingPadding(b"=====BEEFCAKE", b"[\x9c\xf6*\x8e") + assertLeadingPadding(b"======BEEFCAKE", b"[\x9c\xf6*\x8e") + assertLeadingPadding(b"=======BEEFCAKE", b"[\x9c\xf6*\x8e") + assertLeadingPadding(b"========BEEFCAKE", b"[\x9c\xf6*\x8e") + assertLeadingPadding(b"=========BEEFCAKE", b"[\x9c\xf6*\x8e") + + assertIncorrectPadding(b"A") + assertIncorrectPadding(b"AB") + assertIncorrectPadding(b"ABC") + 
assertIncorrectPadding(b"ABCD") + assertIncorrectPadding(b"ABCDE") + assertIncorrectPadding(b"ABCDEF") + assertIncorrectPadding(b"ABCDEFG") + + assertIncorrectPadding(b"BE=", b"[") + assertIncorrectPadding(b"BE==", b"[") + assertIncorrectPadding(b"BE===", b"[") + assertIncorrectPadding(b"BE====", b"[") + assertIncorrectPadding(b"BE=====", b"[") + assertIncorrectPadding(b"BEEF=", b"[\x9c") + assertIncorrectPadding(b"BEEF==", b"[\x9c") + assertIncorrectPadding(b"BEEF===", b"[\x9c") + assertIncorrectPadding(b"BEEFC=", b"[\x9c\xf6") + assertIncorrectPadding(b"BEEFC==", b"[\x9c\xf6") + + assertDiscontinuousPadding(b"BE=EF===", b"[\x9c") + assertDiscontinuousPadding(b"BE==EF==", b"[\x9c") + assertDiscontinuousPadding(b"BEEF=C==", b"[\x9c\xf6") + assertDiscontinuousPadding(b"BEEFC=AK", b"[\x9c\xf6*") + + assertInvalidLength(b"A=") + assertInvalidLength(b"A==") + assertInvalidLength(b"A===") + assertInvalidLength(b"A====") + assertInvalidLength(b"A=====") + assertInvalidLength(b"A======") + assertInvalidLength(b"ABC=") + assertInvalidLength(b"ABC==") + assertInvalidLength(b"ABC===") + assertInvalidLength(b"ABC====") + assertInvalidLength(b"ABCDEF=") + + assertInvalidLength(b"B=E=====", b"[") + assertInvalidLength(b"B==E====", b"[") + assertInvalidLength(b"BEE=F===", b"[\x9c") + assertInvalidLength(b"BEE==F==", b"[\x9c") + assertInvalidLength(b"BEEFCA=K", b"[\x9c\xf6*") + assertInvalidLength(b"BEEFCA=====K", b"[\x9c\xf6*") + def test_uu(self): MAX_UU = 45 for backtick in (True, False): diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2026-03-20-00-39-25.gh-issue-146192.8aQ6sC.rst b/Misc/NEWS.d/next/Core_and_Builtins/2026-03-20-00-39-25.gh-issue-146192.8aQ6sC.rst new file mode 100644 index 00000000000000..a27639d2908651 --- /dev/null +++ b/Misc/NEWS.d/next/Core_and_Builtins/2026-03-20-00-39-25.gh-issue-146192.8aQ6sC.rst @@ -0,0 +1,2 @@ +Add base32 support to :mod:`binascii` and improve the performance of the +base-32 converters in :mod:`base64`. Patch by James Seo. 
diff --git a/Modules/binascii.c b/Modules/binascii.c index f85f32b32e962c..241aeac400063e 100644 --- a/Modules/binascii.c +++ b/Modules/binascii.c @@ -244,6 +244,152 @@ static const unsigned char table_b2a_base85_a85[] Py_ALIGNED(64) = #define BASE85_A85_Z 0x00000000 #define BASE85_A85_Y 0x20202020 + +static const unsigned char table_a2b_base32[] Py_ALIGNED(64) = { + -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, + -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, + -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, + -1,-1,26,27, 28,29,30,31, -1,-1,-1,-1, -1,-1,-1,-1, + -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10, 11,12,13,14, + 15,16,17,18, 19,20,21,22, 23,24,25,-1, -1,-1,-1,-1, + -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, + -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, + + -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, + -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, + -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, + -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, + -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, + -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, + -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, + -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, +}; + +static const unsigned char table_a2b_base32hex[] Py_ALIGNED(64) = { + -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, + -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, + -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,-1,-1, -1,-1,-1,-1, + -1,10,11,12, 13,14,15,16, 17,18,19,20, 21,22,23,24, + 25,26,27,28, 29,30,31,-1, -1,-1,-1,-1, -1,-1,-1,-1, + -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, + -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, + + -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, + -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, + -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, + -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, + -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, 
-1,-1,-1,-1, + -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, + -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, + -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, +}; + +static const unsigned char table_b2a_base32[] Py_ALIGNED(64) = + "ABCDEFGHIJKLMNOPQRSTUVWXYZ234567"; + +static const unsigned char table_b2a_base32hex[] Py_ALIGNED(64) = + "0123456789ABCDEFGHIJKLMNOPQRSTUV"; + +#define BASE32_PAD '=' + +/* + * Fast base32 encoding/decoding helpers. + * + * Analogous to the helpers for base64. + */ + +/* Encode 5 bytes into 8 base32 characters. */ +static inline void +base32_encode_quintet(const unsigned char *in, unsigned char *out, + const unsigned char table[]) +{ + uint64_t combined = ((uint64_t)in[0] << 32) | + ((uint64_t)in[1] << 24) | + ((uint64_t)in[2] << 16) | + ((uint64_t)in[3] << 8) | + (uint64_t)in[4]; + out[0] = table[(combined >> 35) & 0x1f]; + out[1] = table[(combined >> 30) & 0x1f]; + out[2] = table[(combined >> 25) & 0x1f]; + out[3] = table[(combined >> 20) & 0x1f]; + out[4] = table[(combined >> 15) & 0x1f]; + out[5] = table[(combined >> 10) & 0x1f]; + out[6] = table[(combined >> 5) & 0x1f]; + out[7] = table[combined & 0x1f]; +} + +/* + * Encode multiple complete 5-byte groups. + * Returns the number of input bytes processed (always a multiple of 5). + */ +static inline Py_ssize_t +base32_encode_fast(const unsigned char *in, Py_ssize_t in_len, + unsigned char *out, const unsigned char table[]) +{ + Py_ssize_t n_quintets = in_len / 5; + const unsigned char *in_end = in + n_quintets * 5; + + while (in < in_end) { + base32_encode_quintet(in, out, table); + in += 5; + out += 8; + } + + return n_quintets * 5; +} + +/* + * Decode 8 base32 characters into 5 bytes. + * Returns 1 on success, 0 if any character is invalid. 
+ */ +static inline int +base32_decode_octet(const unsigned char *in, unsigned char *out, + const unsigned char table[]) +{ + unsigned char v0 = table[in[0]]; + unsigned char v1 = table[in[1]]; + unsigned char v2 = table[in[2]]; + unsigned char v3 = table[in[3]]; + unsigned char v4 = table[in[4]]; + unsigned char v5 = table[in[5]]; + unsigned char v6 = table[in[6]]; + unsigned char v7 = table[in[7]]; + + if ((v0 | v1 | v2 | v3 | v4 | v5 | v6 | v7) & 0xe0) { + return 0; + } + + out[0] = (v0 << 3) | (v1 >> 2); + out[1] = (v1 << 6) | (v2 << 1) | (v3 >> 4); + out[2] = (v3 << 4) | (v4 >> 1); + out[3] = (v4 << 7) | (v5 << 2) | (v6 >> 3); + out[4] = (v6 << 5) | v7; + return 1; +} + +/* + * Decode multiple complete 8-character groups (no padding allowed). + * Returns the number of input characters processed. + * Stops at the first invalid character, padding, or incomplete group. + */ +static inline Py_ssize_t +base32_decode_fast(const unsigned char *in, Py_ssize_t in_len, + unsigned char *out, const unsigned char table[]) +{ + Py_ssize_t n_quintets = in_len / 8; + Py_ssize_t i; + + for (i = 0; i < n_quintets; i++) { + if (!base32_decode_octet(in + i * 8, out + i * 5, table)) { + break; + } + } + + return i * 8; +} + + static const unsigned short crctab_hqx[256] = { 0x0000, 0x1021, 0x2042, 0x3063, 0x4084, 0x50a5, 0x60c6, 0x70e7, 0x8108, 0x9129, 0xa14a, 0xb16b, 0xc18c, 0xd1ad, 0xe1ce, 0xf1ef, @@ -1367,6 +1513,311 @@ binascii_b2a_base85_impl(PyObject *module, Py_buffer *data, int pad, return PyBytesWriter_FinishWithPointer(writer, ascii_data); } +static PyObject * +base32_decode_impl(PyObject *module, Py_buffer *data, + const unsigned char table_a2b[], const char *name) +{ + const unsigned char *ascii_data = data->buf; + Py_ssize_t ascii_len = data->len; + binascii_state *state = NULL; + + assert(ascii_len >= 0); + + /* Allocate output buffer. 
*/ + size_t bin_len = ((size_t)ascii_len + 7) / 8 * 5; + PyBytesWriter *writer = PyBytesWriter_Create(bin_len); + if (writer == NULL) { + return NULL; + } + unsigned char *bin_data = PyBytesWriter_GetData(writer); + + /* + * Fast path: use optimized decoder for complete octets (groups of 8 bytes). + * The fast path stops at padding, invalid chars, or incomplete octets. + */ + if (ascii_len >= 8) { + Py_ssize_t fast_chars = base32_decode_fast(ascii_data, ascii_len, + bin_data, table_a2b); + if (fast_chars > 0) { + ascii_data += fast_chars; + ascii_len -= fast_chars; + bin_data += (fast_chars / 8) * 5; + } + } + + /* Slow path: handle remaining input (padding, invalid chars, incomplete octets). */ + unsigned char leftchar = 0; + int octet_pos = 0; + int pads = 0; + for (; ascii_len; ascii_len--, ascii_data++) { + unsigned char this_ch = *ascii_data; + + /* Check for pad sequences. They may only occur at certain positions. */ + if (this_ch == BASE32_PAD) { + pads++; + + if ((octet_pos == 2 || octet_pos == 4 + || octet_pos == 5 || octet_pos == 7) + && octet_pos + pads <= 8) + { + continue; + } + + state = get_binascii_state(module); + if (state) { + if (octet_pos == 1 || octet_pos == 3 || octet_pos == 6) { + const unsigned char *ascii_data_start = data->buf; + PyErr_Format(state->Error, + "Invalid %s-encoded string: " + "number of data characters (%zd) " + "cannot be 1, 3, or 6 more " + "than a multiple of 8", + name, (ascii_data - ascii_data_start)); + } + else { + PyErr_SetString(state->Error, + (octet_pos == 0 && ascii_data == data->buf) + ? "Leading padding not allowed" + : "Excess padding not allowed"); + } + } + goto error; + } + + unsigned char v = table_a2b[this_ch]; + if (v >= 32) { + state = get_binascii_state(module); + if (state) { + PyErr_Format(state->Error, "Only %s data is allowed", name); + } + goto error; + } + + /* Data in the middle of/after the padding is not allowed. 
*/ + if (pads) { + state = get_binascii_state(module); + if (state) { + PyErr_SetString(state->Error, (octet_pos + pads == 8) + ? "Excess data after padding" + : "Discontinuous padding not allowed"); + } + goto error; + } + + switch (octet_pos) { + case 0: + octet_pos = 1; + leftchar = v; + break; + case 1: + octet_pos = 2; + *bin_data++ = (leftchar << 3) | (v >> 2); + leftchar = v & 0x03; + break; + case 2: + octet_pos = 3; + leftchar = (leftchar << 5) | v; + break; + case 3: + octet_pos = 4; + *bin_data++ = (leftchar << 1) | (v >> 4); + leftchar = v & 0x0f; + break; + case 4: + octet_pos = 5; + *bin_data++ = (leftchar << 4) | (v >> 1); + leftchar = v & 0x01; + break; + case 5: + octet_pos = 6; + leftchar = (leftchar << 5) | v; + break; + case 6: + octet_pos = 7; + *bin_data++ = (leftchar << 2) | (v >> 3); + leftchar = v & 0x07; + break; + case 7: + octet_pos = 0; + *bin_data++ = (leftchar << 5) | v; + leftchar = 0; + } + } + + if ((octet_pos != 0 && octet_pos + pads != 8) + || (octet_pos == 0 && pads != 0)) + { + state = get_binascii_state(module); + if (state) { + PyErr_SetString(state->Error, "Incorrect padding"); + } + goto error; + } + + return PyBytesWriter_FinishWithPointer(writer, bin_data); + +error: + PyBytesWriter_Discard(writer); + return NULL; +} + +static PyObject * +base32_encode_impl(PyObject *module, Py_buffer *data, + const unsigned char table_b2a[], const char *name) +{ + const unsigned char *bin_data = data->buf; + Py_ssize_t bin_len = data->len; + binascii_state *state = NULL; + + assert(bin_len >= 0); + + /* + * Each group of 5 bytes (rounded up) gets encoded as 8 characters. + * Use unsigned integer arithmetic to avoid signed integer overflow. 
+ */ + size_t ascii_len = ((size_t)bin_len + 4u) / 5u * 8u; + if (ascii_len > PY_SSIZE_T_MAX) { + state = get_binascii_state(module); + if (state) { + PyErr_Format(state->Error, "Too much data for %s", name); + } + return NULL; + } + PyBytesWriter *writer = PyBytesWriter_Create(ascii_len); + if (writer == NULL) { + return NULL; + } + unsigned char *ascii_data = PyBytesWriter_GetData(writer); + + /* Use the optimized fast path for complete 5-byte groups. */ + Py_ssize_t fast_bytes = base32_encode_fast(bin_data, bin_len, ascii_data, + table_b2a); + bin_data += fast_bytes; + ascii_data += (fast_bytes / 5) * 8; + bin_len -= fast_bytes; + + /* Handle the remaining 0-4 bytes. */ + if (bin_len == 1) { + /* 1 byte remaining: produces 2 encoded + 6 padding chars. */ + uint32_t val = bin_data[0]; + *ascii_data++ = table_b2a[(val >> 3) & 0x1f]; + *ascii_data++ = table_b2a[(val << 2) & 0x1f]; + *ascii_data++ = BASE32_PAD; + *ascii_data++ = BASE32_PAD; + *ascii_data++ = BASE32_PAD; + *ascii_data++ = BASE32_PAD; + *ascii_data++ = BASE32_PAD; + *ascii_data++ = BASE32_PAD; + } + else if (bin_len == 2) { + /* 2 bytes remaining: produces 4 encoded + 4 padding chars. */ + uint32_t val = ((uint32_t)bin_data[0] << 8) | bin_data[1]; + *ascii_data++ = table_b2a[(val >> 11) & 0x1f]; + *ascii_data++ = table_b2a[(val >> 6) & 0x1f]; + *ascii_data++ = table_b2a[(val >> 1) & 0x1f]; + *ascii_data++ = table_b2a[(val << 4) & 0x1f]; + *ascii_data++ = BASE32_PAD; + *ascii_data++ = BASE32_PAD; + *ascii_data++ = BASE32_PAD; + *ascii_data++ = BASE32_PAD; + } + else if (bin_len == 3) { + /* 3 bytes remaining: produces 5 encoded + 3 padding chars. 
*/ + uint32_t val = ((uint32_t)bin_data[0] << 16) + | ((uint32_t)bin_data[1] << 8) + | bin_data[2]; + *ascii_data++ = table_b2a[(val >> 19) & 0x1f]; + *ascii_data++ = table_b2a[(val >> 14) & 0x1f]; + *ascii_data++ = table_b2a[(val >> 9) & 0x1f]; + *ascii_data++ = table_b2a[(val >> 4) & 0x1f]; + *ascii_data++ = table_b2a[(val << 1) & 0x1f]; + *ascii_data++ = BASE32_PAD; + *ascii_data++ = BASE32_PAD; + *ascii_data++ = BASE32_PAD; + } + else if (bin_len == 4) { + /* 4 bytes remaining: produces 7 encoded + 1 padding chars. */ + uint32_t val = ((uint32_t)bin_data[0] << 24) + | ((uint32_t)bin_data[1] << 16) + | ((uint32_t)bin_data[2] << 8) + | bin_data[3]; + *ascii_data++ = table_b2a[(val >> 27) & 0x1f]; + *ascii_data++ = table_b2a[(val >> 22) & 0x1f]; + *ascii_data++ = table_b2a[(val >> 17) & 0x1f]; + *ascii_data++ = table_b2a[(val >> 12) & 0x1f]; + *ascii_data++ = table_b2a[(val >> 7) & 0x1f]; + *ascii_data++ = table_b2a[(val >> 2) & 0x1f]; + *ascii_data++ = table_b2a[(val << 3) & 0x1f]; + *ascii_data++ = BASE32_PAD; + } + + return PyBytesWriter_FinishWithPointer(writer, ascii_data); +} + +/*[clinic input] +binascii.a2b_base32 + + data: ascii_buffer + / + +Decode a line of base32 data. +[clinic start generated code]*/ + +static PyObject * +binascii_a2b_base32_impl(PyObject *module, Py_buffer *data) +/*[clinic end generated code: output=978d91ce9fadedf9 input=9137b28791447ce7]*/ +{ + return base32_decode_impl(module, data, table_a2b_base32, "base32"); +} + +/*[clinic input] +binascii.b2a_base32 + + data: Py_buffer + / + +base32-code line of data. +[clinic start generated code]*/ + +static PyObject * +binascii_b2a_base32_impl(PyObject *module, Py_buffer *data) +/*[clinic end generated code: output=c44b684c550a24cc input=0c6cbb86d32086f5]*/ +{ + return base32_encode_impl(module, data, table_b2a_base32, "base32"); +} + +/*[clinic input] +binascii.a2b_base32hex + + data: ascii_buffer + / + +Decode a line of base32hex data. 
+[clinic start generated code]*/ + +static PyObject * +binascii_a2b_base32hex_impl(PyObject *module, Py_buffer *data) +/*[clinic end generated code: output=29133f84416e93cf input=178fe8e8fb212206]*/ +{ + return base32_decode_impl(module, data, table_a2b_base32hex, "base32hex"); +} + +/*[clinic input] +binascii.b2a_base32hex + + data: Py_buffer + / + +base32hex-code line of data. +[clinic start generated code]*/ + +static PyObject * +binascii_b2a_base32hex_impl(PyObject *module, Py_buffer *data) +/*[clinic end generated code: output=8ab2f6742ed918cb input=01108fc686630e91]*/ +{ + return base32_encode_impl(module, data, table_b2a_base32hex, "base32hex"); +} + /*[clinic input] binascii.crc_hqx @@ -2028,6 +2479,10 @@ static struct PyMethodDef binascii_module_methods[] = { BINASCII_A2B_ASCII85_METHODDEF BINASCII_A2B_BASE85_METHODDEF BINASCII_B2A_BASE85_METHODDEF + BINASCII_A2B_BASE32_METHODDEF + BINASCII_B2A_BASE32_METHODDEF + BINASCII_A2B_BASE32HEX_METHODDEF + BINASCII_B2A_BASE32HEX_METHODDEF BINASCII_A2B_HEX_METHODDEF BINASCII_B2A_HEX_METHODDEF BINASCII_HEXLIFY_METHODDEF diff --git a/Modules/clinic/binascii.c.h b/Modules/clinic/binascii.c.h index 2fdecc2efbf9d4..ae6bd7813f89e7 100644 --- a/Modules/clinic/binascii.c.h +++ b/Modules/clinic/binascii.c.h @@ -711,6 +711,132 @@ binascii_b2a_base85(PyObject *module, PyObject *const *args, Py_ssize_t nargs, P return return_value; } +PyDoc_STRVAR(binascii_a2b_base32__doc__, +"a2b_base32($module, data, /)\n" +"--\n" +"\n" +"Decode a line of base32 data."); + +#define BINASCII_A2B_BASE32_METHODDEF \ + {"a2b_base32", (PyCFunction)binascii_a2b_base32, METH_O, binascii_a2b_base32__doc__}, + +static PyObject * +binascii_a2b_base32_impl(PyObject *module, Py_buffer *data); + +static PyObject * +binascii_a2b_base32(PyObject *module, PyObject *arg) +{ + PyObject *return_value = NULL; + Py_buffer data = {NULL, NULL}; + + if (!ascii_buffer_converter(arg, &data)) { + goto exit; + } + return_value = binascii_a2b_base32_impl(module, &data); 
+ +exit: + /* Cleanup for data */ + if (data.obj) + PyBuffer_Release(&data); + + return return_value; +} + +PyDoc_STRVAR(binascii_b2a_base32__doc__, +"b2a_base32($module, data, /)\n" +"--\n" +"\n" +"base32-code line of data."); + +#define BINASCII_B2A_BASE32_METHODDEF \ + {"b2a_base32", (PyCFunction)binascii_b2a_base32, METH_O, binascii_b2a_base32__doc__}, + +static PyObject * +binascii_b2a_base32_impl(PyObject *module, Py_buffer *data); + +static PyObject * +binascii_b2a_base32(PyObject *module, PyObject *arg) +{ + PyObject *return_value = NULL; + Py_buffer data = {NULL, NULL}; + + if (PyObject_GetBuffer(arg, &data, PyBUF_SIMPLE) != 0) { + goto exit; + } + return_value = binascii_b2a_base32_impl(module, &data); + +exit: + /* Cleanup for data */ + if (data.obj) { + PyBuffer_Release(&data); + } + + return return_value; +} + +PyDoc_STRVAR(binascii_a2b_base32hex__doc__, +"a2b_base32hex($module, data, /)\n" +"--\n" +"\n" +"Decode a line of base32hex data."); + +#define BINASCII_A2B_BASE32HEX_METHODDEF \ + {"a2b_base32hex", (PyCFunction)binascii_a2b_base32hex, METH_O, binascii_a2b_base32hex__doc__}, + +static PyObject * +binascii_a2b_base32hex_impl(PyObject *module, Py_buffer *data); + +static PyObject * +binascii_a2b_base32hex(PyObject *module, PyObject *arg) +{ + PyObject *return_value = NULL; + Py_buffer data = {NULL, NULL}; + + if (!ascii_buffer_converter(arg, &data)) { + goto exit; + } + return_value = binascii_a2b_base32hex_impl(module, &data); + +exit: + /* Cleanup for data */ + if (data.obj) + PyBuffer_Release(&data); + + return return_value; +} + +PyDoc_STRVAR(binascii_b2a_base32hex__doc__, +"b2a_base32hex($module, data, /)\n" +"--\n" +"\n" +"base32hex-code line of data."); + +#define BINASCII_B2A_BASE32HEX_METHODDEF \ + {"b2a_base32hex", (PyCFunction)binascii_b2a_base32hex, METH_O, binascii_b2a_base32hex__doc__}, + +static PyObject * +binascii_b2a_base32hex_impl(PyObject *module, Py_buffer *data); + +static PyObject * +binascii_b2a_base32hex(PyObject *module, 
PyObject *arg) +{ + PyObject *return_value = NULL; + Py_buffer data = {NULL, NULL}; + + if (PyObject_GetBuffer(arg, &data, PyBUF_SIMPLE) != 0) { + goto exit; + } + return_value = binascii_b2a_base32hex_impl(module, &data); + +exit: + /* Cleanup for data */ + if (data.obj) { + PyBuffer_Release(&data); + } + + return return_value; +} + PyDoc_STRVAR(binascii_crc_hqx__doc__, "crc_hqx($module, data, crc, /)\n" "--\n" @@ -1256,4 +1382,4 @@ binascii_b2a_qp(PyObject *module, PyObject *const *args, Py_ssize_t nargs, PyObj return return_value; } -/*[clinic end generated code: output=84c97096b0fb3819 input=a9049054013a1b77]*/ +/*[clinic end generated code: output=bafd226511187580 input=a9049054013a1b77]*/ From bf1308f1f1139fe99411908dc575c17f18d983fa Mon Sep 17 00:00:00 2001 From: James Seo Date: Fri, 20 Mar 2026 08:54:02 -0700 Subject: [PATCH 2/8] Update PR for #145981 - Use the new `alphabet` parameter in `binascii` - Remove `binascii.a2b_base32hex()` and `binascii.b2a_base32hex()` - Change value for `.. versionadded::` ReST directive in docs for new `binascii` functions to "next" instead of "3.15" --- Doc/library/binascii.rst | 46 +++++----- Lib/base64.py | 4 +- Lib/test/test_binascii.py | 174 ++++++------------------------------ Modules/binascii.c | 162 ++++++++++++++------------------- Modules/clinic/binascii.c.h | 171 ++++++++++++++++++++--------------- 5 files changed, 213 insertions(+), 344 deletions(-) diff --git a/Doc/library/binascii.rst b/Doc/library/binascii.rst index 9137b7203698df..3facb139e17d43 100644 --- a/Doc/library/binascii.rst +++ b/Doc/library/binascii.rst @@ -182,7 +182,7 @@ The :mod:`!binascii` module defines the following functions: .. versionadded:: 3.15 -.. function:: a2b_base32(string, /) +.. function:: a2b_base32(string, /, *, alphabet=BASE32_ALPHABET) Convert base32 data back to binary and return the binary data. 
@@ -193,38 +193,22 @@ The :mod:`!binascii` module defines the following functions: * Contains no excess data after padding (including excess padding, newlines, etc.). * Does not start with padding. + Optional *alphabet* must be a :class:`bytes` object of length 32 which + specifies an alternative alphabet. + Invalid base32 data will raise :exc:`binascii.Error`. - .. versionadded:: 3.15 + .. versionadded:: next -.. function:: b2a_base32(data, /) +.. function:: b2a_base32(data, /, *, alphabet=BASE32_ALPHABET) Convert binary data to a line(s) of ASCII characters in base32 coding, as specified in :rfc:`4648`. The return value is the converted line. - .. versionadded:: 3.15 - -.. function:: a2b_base32hex(string, /) - - Convert base32hex data back to binary and return the binary data. - - Valid base32hex: - - * Conforms to :rfc:`4648`. - * Contains only characters from the base32hex alphabet. - * Contains no excess data after padding (including excess padding, newlines, etc.). - * Does not start with padding. - - Invalid base32hex data will raise :exc:`binascii.Error`. - - .. versionadded:: 3.15 - -.. function:: b2a_base32hex(data, /) - - Convert binary data to a line(s) of ASCII characters in base32hex coding, - as specified in :rfc:`4648`. The return value is the converted line. + Optional *alphabet* must be a :term:`bytes-like object` of length 32 which + specifies an alternative alphabet. - .. versionadded:: 3.15 + .. versionadded:: next .. function:: a2b_qp(data, header=False) @@ -370,6 +354,18 @@ The :mod:`!binascii` module defines the following functions: .. versionadded:: next +.. data:: BASE32_ALPHABET + + The base32 alphabet according to :rfc:`4648`. + + .. versionadded:: next + +.. data:: BASE32HEX_ALPHABET + + The "Extended Hex" base32hex alphabet according to :rfc:`4648`. + + .. versionadded:: next + .. 
seealso:: diff --git a/Lib/base64.py b/Lib/base64.py index 576d429522ba31..8c88add3c2595b 100644 --- a/Lib/base64.py +++ b/Lib/base64.py @@ -234,13 +234,13 @@ def b32decode(s, casefold=False, map01=None): extra_args=_B32_DECODE_MAP01_DOCSTRING) def b32hexencode(s): - return binascii.b2a_base32hex(s) + return binascii.b2a_base32(s, alphabet=binascii.BASE32HEX_ALPHABET) b32hexencode.__doc__ = _B32_ENCODE_DOCSTRING.format(encoding='base32hex') def b32hexdecode(s, casefold=False): # base32hex does not have the 01 mapping s = _b32decode_prepare(s, casefold) - return binascii.a2b_base32hex(s) + return binascii.a2b_base32(s, alphabet=binascii.BASE32HEX_ALPHABET) b32hexdecode.__doc__ = _B32_DECODE_DOCSTRING.format(encoding='base32hex', extra_args='') diff --git a/Lib/test/test_binascii.py b/Lib/test/test_binascii.py index 3ac468d636d203..7fedd8e1a8b3ca 100644 --- a/Lib/test/test_binascii.py +++ b/Lib/test/test_binascii.py @@ -10,10 +10,10 @@ # Note: "*_hex" functions are aliases for "(un)hexlify" -b2a_functions = ['b2a_ascii85', 'b2a_base32', 'b2a_base32hex', 'b2a_base64', 'b2a_base85', +b2a_functions = ['b2a_ascii85', 'b2a_base32', 'b2a_base64', 'b2a_base85', 'b2a_hex', 'b2a_qp', 'b2a_uu', 'hexlify'] -a2b_functions = ['a2b_ascii85', 'a2b_base32', 'a2b_base32hex', 'a2b_base64', 'a2b_base85', +a2b_functions = ['a2b_ascii85', 'a2b_base32', 'a2b_base64', 'a2b_base85', 'a2b_hex', 'a2b_qp', 'a2b_uu', 'unhexlify'] all_functions = a2b_functions + b2a_functions + ['crc32', 'crc_hqx'] @@ -820,155 +820,31 @@ def assertInvalidLength(*args): assertInvalidLength(b"BEEFCA=K", b"\t\x08Q\x01") assertInvalidLength(b"BEEFCA=====K", b"\t\x08Q\x01") - def test_base32hex_valid(self): - # Test base32hex with valid data - lines = [] - step = 0 - i = 0 - while i < len(self.rawdata): - b = self.type2test(self.rawdata[i:i + step]) - a = binascii.b2a_base32hex(b) - lines.append(a) - i += step - step += 1 - res = bytes() - for line in lines: - a = self.type2test(line) - b = binascii.a2b_base32hex(a) 
- res += b - self.assertEqual(res, self.rawdata) - - def test_base32hex_errors(self): - def _fixPadding(data): - fixed = data.replace(b"=", b"") - len_8 = len(fixed) % 8 - p = 8 - len_8 if len_8 else 0 - return fixed + b"=" * p - - def _assertRegexTemplate(assert_regex, data, good_padding_result=None): - with self.assertRaisesRegex(binascii.Error, assert_regex): - binascii.a2b_base32hex(self.type2test(data)) - if good_padding_result: - fixed = self.type2test(_fixPadding(data)) - self.assertEqual(binascii.a2b_base32hex(fixed), good_padding_result) - - def assertNonBase32HexData(*args): - _assertRegexTemplate(r"(?i)Only base32hex data", *args) - - def assertExcessData(*args): - _assertRegexTemplate(r"(?i)Excess data", *args) - - def assertExcessPadding(*args): - _assertRegexTemplate(r"(?i)Excess padding", *args) - - def assertLeadingPadding(*args): - _assertRegexTemplate(r"(?i)Leading padding", *args) - - def assertIncorrectPadding(*args): - _assertRegexTemplate(r"(?i)Incorrect padding", *args) - - def assertDiscontinuousPadding(*args): - _assertRegexTemplate(r"(?i)Discontinuous padding", *args) - - def assertInvalidLength(*args): - _assertRegexTemplate(r"(?i)Invalid.+number of data characters", *args) - - assertNonBase32HexData(b"a") - assertNonBase32HexData(b"AA-") - assertNonBase32HexData(b"ABCDE==!") - assertNonBase32HexData(b"ab:(){:|:&};:==") - - assertExcessData(b"AB======C") - assertExcessData(b"AB======CD") - assertExcessData(b"ABCD====E") - assertExcessData(b"ABCDE===FGH") - assertExcessData(b"ABCDEFG=H") - assertExcessData(b"4321====55555555") - - assertExcessData(b"BE======EF", b"[\x9c") - assertExcessData(b"BEEF====C", b"[\x9c\xf6") - assertExcessData(b"BEEFC===AK", b"[\x9c\xf6*") - assertExcessData(b"BEEFCAK=E", b"[\x9c\xf6*\x8e") - - assertExcessPadding(b"BE=======", b"[") - assertExcessPadding(b"BE========", b"[") - assertExcessPadding(b"BEEF=====", b"[\x9c") - assertExcessPadding(b"BEEF======", b"[\x9c") - assertExcessPadding(b"BEEFC====", 
b"[\x9c\xf6") - assertExcessPadding(b"BEEFC=====", b"[\x9c\xf6") - assertExcessPadding(b"BEEFCAK==", b"[\x9c\xf6*") - assertExcessPadding(b"BEEFCAK===", b"[\x9c\xf6*") - assertExcessPadding(b"BEEFCAKE=", b"[\x9c\xf6*\x8e") - assertExcessPadding(b"BEEFCAKE==", b"[\x9c\xf6*\x8e") - assertExcessPadding(b"BEEFCAKE===", b"[\x9c\xf6*\x8e") - assertExcessPadding(b"BEEFCAKE====", b"[\x9c\xf6*\x8e") - assertExcessPadding(b"BEEFCAKE=====", b"[\x9c\xf6*\x8e") - assertExcessPadding(b"BEEFCAKE======", b"[\x9c\xf6*\x8e") - assertExcessPadding(b"BEEFCAKE=======", b"[\x9c\xf6*\x8e") - assertExcessPadding(b"BEEFCAKE========", b"[\x9c\xf6*\x8e") - assertExcessPadding(b"BEEFCAKE=========", b"[\x9c\xf6*\x8e") - - assertLeadingPadding(b"=", b"") - assertLeadingPadding(b"==", b"") - assertLeadingPadding(b"===", b"") - assertLeadingPadding(b"====", b"") - assertLeadingPadding(b"=====", b"") - assertLeadingPadding(b"======", b"") - assertLeadingPadding(b"=======", b"") - assertLeadingPadding(b"========", b"") - assertLeadingPadding(b"=========", b"") - assertLeadingPadding(b"=BEEFCAKE", b"[\x9c\xf6*\x8e") - assertLeadingPadding(b"==BEEFCAKE", b"[\x9c\xf6*\x8e") - assertLeadingPadding(b"===BEEFCAKE", b"[\x9c\xf6*\x8e") - assertLeadingPadding(b"====BEEFCAKE", b"[\x9c\xf6*\x8e") - assertLeadingPadding(b"=====BEEFCAKE", b"[\x9c\xf6*\x8e") - assertLeadingPadding(b"======BEEFCAKE", b"[\x9c\xf6*\x8e") - assertLeadingPadding(b"=======BEEFCAKE", b"[\x9c\xf6*\x8e") - assertLeadingPadding(b"========BEEFCAKE", b"[\x9c\xf6*\x8e") - assertLeadingPadding(b"=========BEEFCAKE", b"[\x9c\xf6*\x8e") - - assertIncorrectPadding(b"A") - assertIncorrectPadding(b"AB") - assertIncorrectPadding(b"ABC") - assertIncorrectPadding(b"ABCD") - assertIncorrectPadding(b"ABCDE") - assertIncorrectPadding(b"ABCDEF") - assertIncorrectPadding(b"ABCDEFG") - - assertIncorrectPadding(b"BE=", b"[") - assertIncorrectPadding(b"BE==", b"[") - assertIncorrectPadding(b"BE===", b"[") - assertIncorrectPadding(b"BE====", b"[") - 
assertIncorrectPadding(b"BE=====", b"[") - assertIncorrectPadding(b"BEEF=", b"[\x9c") - assertIncorrectPadding(b"BEEF==", b"[\x9c") - assertIncorrectPadding(b"BEEF===", b"[\x9c") - assertIncorrectPadding(b"BEEFC=", b"[\x9c\xf6") - assertIncorrectPadding(b"BEEFC==", b"[\x9c\xf6") - - assertDiscontinuousPadding(b"BE=EF===", b"[\x9c") - assertDiscontinuousPadding(b"BE==EF==", b"[\x9c") - assertDiscontinuousPadding(b"BEEF=C==", b"[\x9c\xf6") - assertDiscontinuousPadding(b"BEEFC=AK", b"[\x9c\xf6*") + def test_base32_alphabet(self): + alphabet = b'0Aa1Bb2Cc3Dd4Ee5Ff6Gg7Hh8Ii9JjKk' + data = self.type2test(self.rawdata) + encoded = binascii.b2a_base32(data, alphabet=alphabet) + trans = bytes.maketrans(binascii.BASE32_ALPHABET, alphabet) + expected = binascii.b2a_base32(data).translate(trans) + self.assertEqual(encoded, expected) + self.assertEqual(binascii.a2b_base32(encoded, alphabet=alphabet), self.rawdata) + self.assertEqual(binascii.b2a_base32(data, alphabet=self.type2test(alphabet)), expected) - assertInvalidLength(b"A=") - assertInvalidLength(b"A==") - assertInvalidLength(b"A===") - assertInvalidLength(b"A====") - assertInvalidLength(b"A=====") - assertInvalidLength(b"A======") - assertInvalidLength(b"ABC=") - assertInvalidLength(b"ABC==") - assertInvalidLength(b"ABC===") - assertInvalidLength(b"ABC====") - assertInvalidLength(b"ABCDEF=") + data = self.type2test(b'') + self.assertEqual(binascii.b2a_base32(data, alphabet=alphabet), b'') + self.assertEqual(binascii.a2b_base32(data, alphabet=alphabet), b'') - assertInvalidLength(b"B=E=====", b"[") - assertInvalidLength(b"B==E====", b"[") - assertInvalidLength(b"BEE=F===", b"[\x9c") - assertInvalidLength(b"BEE==F==", b"[\x9c") - assertInvalidLength(b"BEEFCA=K", b"[\x9c\xf6*") - assertInvalidLength(b"BEEFCA=====K", b"[\x9c\xf6*") + for func in binascii.b2a_base32, binascii.a2b_base32: + with self.assertRaises(TypeError): + func(data, alphabet=None) + with self.assertRaises(TypeError): + func(data, 
alphabet=alphabet.decode()) + with self.assertRaises(ValueError): + func(data, alphabet=alphabet[:-1]) + with self.assertRaises(ValueError): + func(data, alphabet=alphabet+b'?') + with self.assertRaises(TypeError): + binascii.a2b_base32(data, alphabet=bytearray(alphabet)) def test_uu(self): MAX_UU = 45 diff --git a/Modules/binascii.c b/Modules/binascii.c index 241aeac400063e..44d7986b6c0415 100644 --- a/Modules/binascii.c +++ b/Modules/binascii.c @@ -265,32 +265,9 @@ static const unsigned char table_a2b_base32[] Py_ALIGNED(64) = { -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, }; -static const unsigned char table_a2b_base32hex[] Py_ALIGNED(64) = { - -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, - -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, - -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,-1,-1, -1,-1,-1,-1, - -1,10,11,12, 13,14,15,16, 17,18,19,20, 21,22,23,24, - 25,26,27,28, 29,30,31,-1, -1,-1,-1,-1, -1,-1,-1,-1, - -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, - -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, - - -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, - -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, - -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, - -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, - -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, - -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, - -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, - -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -}; - static const unsigned char table_b2a_base32[] Py_ALIGNED(64) = "ABCDEFGHIJKLMNOPQRSTUVWXYZ234567"; -static const unsigned char table_b2a_base32hex[] Py_ALIGNED(64) = - "0123456789ABCDEFGHIJKLMNOPQRSTUV"; - #define BASE32_PAD '=' /* @@ -1513,20 +1490,44 @@ binascii_b2a_base85_impl(PyObject *module, Py_buffer *data, int pad, return PyBytesWriter_FinishWithPointer(writer, ascii_data); } +/*[clinic input] +binascii.a2b_base32 + + data: ascii_buffer + / + * + alphabet: 
PyBytesObject(c_default="NULL") = BASE32_ALPHABET + +Decode a line of base32 data. +[clinic start generated code]*/ + static PyObject * -base32_decode_impl(PyObject *module, Py_buffer *data, - const unsigned char table_a2b[], const char *name) +binascii_a2b_base32_impl(PyObject *module, Py_buffer *data, + PyBytesObject *alphabet) +/*[clinic end generated code: output=12cb58bf547237e2 input=426055ea49ac147e]*/ { const unsigned char *ascii_data = data->buf; Py_ssize_t ascii_len = data->len; binascii_state *state = NULL; + PyObject *table_obj = NULL; + const unsigned char *table_a2b = table_a2b_base32; assert(ascii_len >= 0); + if (alphabet != NULL) { + state = get_binascii_state(module); + table_obj = get_reverse_table(state, (PyObject *)alphabet, 32, BASE32_PAD); + if (table_obj == NULL) { + return NULL; + } + table_a2b = (const unsigned char *)PyBytes_AS_STRING(table_obj); + } + /* Allocate output buffer. */ size_t bin_len = ((size_t)ascii_len + 7) / 8 * 5; PyBytesWriter *writer = PyBytesWriter_Create(bin_len); if (writer == NULL) { + Py_XDECREF(table_obj); return NULL; } unsigned char *bin_data = PyBytesWriter_GetData(writer); @@ -1568,11 +1569,11 @@ base32_decode_impl(PyObject *module, Py_buffer *data, if (octet_pos == 1 || octet_pos == 3 || octet_pos == 6) { const unsigned char *ascii_data_start = data->buf; PyErr_Format(state->Error, - "Invalid %s-encoded string: " + "Invalid base32-encoded string: " "number of data characters (%zd) " "cannot be 1, 3, or 6 more " "than a multiple of 8", - name, (ascii_data - ascii_data_start)); + ascii_data - ascii_data_start); } else { PyErr_SetString(state->Error, @@ -1588,7 +1589,7 @@ base32_decode_impl(PyObject *module, Py_buffer *data, if (v >= 32) { state = get_binascii_state(module); if (state) { - PyErr_Format(state->Error, "Only %s data is allowed", name); + PyErr_SetString(state->Error, "Only base32 data is allowed"); } goto error; } @@ -1654,23 +1655,46 @@ base32_decode_impl(PyObject *module, Py_buffer *data, goto 
error; } + Py_XDECREF(table_obj); return PyBytesWriter_FinishWithPointer(writer, bin_data); error: PyBytesWriter_Discard(writer); + Py_XDECREF(table_obj); return NULL; } +/*[clinic input] +binascii.b2a_base32 + + data: Py_buffer + / + * + alphabet: Py_buffer(c_default="{NULL, NULL}") = BASE32_ALPHABET + +base32-code line of data. +[clinic start generated code]*/ + static PyObject * -base32_encode_impl(PyObject *module, Py_buffer *data, - const unsigned char table_b2a[], const char *name) +binascii_b2a_base32_impl(PyObject *module, Py_buffer *data, + Py_buffer *alphabet) +/*[clinic end generated code: output=058d0d1aeb014d3b input=ffd4fa162a6e1cb5]*/ { + const unsigned char *table_b2a = table_b2a_base32; const unsigned char *bin_data = data->buf; Py_ssize_t bin_len = data->len; binascii_state *state = NULL; assert(bin_len >= 0); + if (alphabet->buf != NULL) { + if (alphabet->len != 32) { + PyErr_SetString(PyExc_ValueError, "alphabet must have length 32"); + return NULL; + } + table_b2a = alphabet->buf; + } + /* * Each group of 5 bytes (rounded up) gets encoded as 8 characters. * Use unsigned integer arithmetic to avoid signed integer overflow. @@ -1679,7 +1703,7 @@ base32_encode_impl(PyObject *module, Py_buffer *data, if (ascii_len > PY_SSIZE_T_MAX) { state = get_binascii_state(module); if (state) { - PyErr_Format(state->Error, "Too much data for %s", name); + PyErr_SetString(state->Error, "Too much data for base32"); } return NULL; } @@ -1754,70 +1778,6 @@ base32_encode_impl(PyObject *module, Py_buffer *data, return PyBytesWriter_FinishWithPointer(writer, ascii_data); } -/*[clinic input] -binascii.a2b_base32 - - data: ascii_buffer - / - -Decode a line of base32 data. 
-[clinic start generated code]*/ - -static PyObject * -binascii_a2b_base32_impl(PyObject *module, Py_buffer *data) -/*[clinic end generated code: output=978d91ce9fadedf9 input=9137b28791447ce7]*/ -{ - return base32_decode_impl(module, data, table_a2b_base32, "base32"); -} - -/*[clinic input] -binascii.b2a_base32 - - data: Py_buffer - / - -base32-code line of data. -[clinic start generated code]*/ - -static PyObject * -binascii_b2a_base32_impl(PyObject *module, Py_buffer *data) -/*[clinic end generated code: output=c44b684c550a24cc input=0c6cbb86d32086f5]*/ -{ - return base32_encode_impl(module, data, table_b2a_base32, "base32"); -} - -/*[clinic input] -binascii.a2b_base32hex - - data: ascii_buffer - / - -Decode a line of base32hex data. -[clinic start generated code]*/ - -static PyObject * -binascii_a2b_base32hex_impl(PyObject *module, Py_buffer *data) -/*[clinic end generated code: output=29133f84416e93cf input=178fe8e8fb212206]*/ -{ - return base32_decode_impl(module, data, table_a2b_base32hex, "base32hex"); -} - -/*[clinic input] -binascii.b2a_base32hex - - data: Py_buffer - / - -base32hex-code line of data. 
-[clinic start generated code]*/ - -static PyObject * -binascii_b2a_base32hex_impl(PyObject *module, Py_buffer *data) -/*[clinic end generated code: output=8ab2f6742ed918cb input=01108fc686630e91]*/ -{ - return base32_encode_impl(module, data, table_b2a_base32hex, "base32hex"); -} - /*[clinic input] binascii.crc_hqx @@ -2481,8 +2441,6 @@ static struct PyMethodDef binascii_module_methods[] = { BINASCII_B2A_BASE85_METHODDEF BINASCII_A2B_BASE32_METHODDEF BINASCII_B2A_BASE32_METHODDEF - BINASCII_A2B_BASE32HEX_METHODDEF - BINASCII_B2A_BASE32HEX_METHODDEF BINASCII_A2B_HEX_METHODDEF BINASCII_B2A_HEX_METHODDEF BINASCII_HEXLIFY_METHODDEF @@ -2569,6 +2527,16 @@ binascii_exec(PyObject *module) { return -1; } + if (PyModule_Add(module, "BASE32_ALPHABET", + PyBytes_FromStringAndSize((const char *)table_b2a_base32, 32)) < 0) + { + return -1; + } + if (PyModule_Add(module, "BASE32HEX_ALPHABET", + PyBytes_FromString("0123456789ABCDEFGHIJKLMNOPQRSTUV")) < 0) + { + return -1; + } state->reverse_table_cache = PyDict_New(); if (state->reverse_table_cache == NULL) { diff --git a/Modules/clinic/binascii.c.h b/Modules/clinic/binascii.c.h index ae6bd7813f89e7..8057d94a1fb934 100644 --- a/Modules/clinic/binascii.c.h +++ b/Modules/clinic/binascii.c.h @@ -712,27 +712,72 @@ binascii_b2a_base85(PyObject *module, PyObject *const *args, Py_ssize_t nargs, P } PyDoc_STRVAR(binascii_a2b_base32__doc__, -"a2b_base32($module, data, /)\n" +"a2b_base32($module, data, /, *, alphabet=BASE32_ALPHABET)\n" "--\n" "\n" "Decode a line of base32 data."); #define BINASCII_A2B_BASE32_METHODDEF \ - {"a2b_base32", (PyCFunction)binascii_a2b_base32, METH_O, binascii_a2b_base32__doc__}, + {"a2b_base32", _PyCFunction_CAST(binascii_a2b_base32), METH_FASTCALL|METH_KEYWORDS, binascii_a2b_base32__doc__}, static PyObject * -binascii_a2b_base32_impl(PyObject *module, Py_buffer *data); +binascii_a2b_base32_impl(PyObject *module, Py_buffer *data, + PyBytesObject *alphabet); static PyObject * -binascii_a2b_base32(PyObject 
*module, PyObject *arg) +binascii_a2b_base32(PyObject *module, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames) { PyObject *return_value = NULL; + #if defined(Py_BUILD_CORE) && !defined(Py_BUILD_CORE_MODULE) + + #define NUM_KEYWORDS 1 + static struct { + PyGC_Head _this_is_not_used; + PyObject_VAR_HEAD + Py_hash_t ob_hash; + PyObject *ob_item[NUM_KEYWORDS]; + } _kwtuple = { + .ob_base = PyVarObject_HEAD_INIT(&PyTuple_Type, NUM_KEYWORDS) + .ob_hash = -1, + .ob_item = { &_Py_ID(alphabet), }, + }; + #undef NUM_KEYWORDS + #define KWTUPLE (&_kwtuple.ob_base.ob_base) + + #else // !Py_BUILD_CORE + # define KWTUPLE NULL + #endif // !Py_BUILD_CORE + + static const char * const _keywords[] = {"", "alphabet", NULL}; + static _PyArg_Parser _parser = { + .keywords = _keywords, + .fname = "a2b_base32", + .kwtuple = KWTUPLE, + }; + #undef KWTUPLE + PyObject *argsbuf[2]; + Py_ssize_t noptargs = nargs + (kwnames ? PyTuple_GET_SIZE(kwnames) : 0) - 1; Py_buffer data = {NULL, NULL}; + PyBytesObject *alphabet = NULL; - if (!ascii_buffer_converter(arg, &data)) { + args = _PyArg_UnpackKeywords(args, nargs, NULL, kwnames, &_parser, + /*minpos*/ 1, /*maxpos*/ 1, /*minkw*/ 0, /*varpos*/ 0, argsbuf); + if (!args) { + goto exit; + } + if (!ascii_buffer_converter(args[0], &data)) { + goto exit; + } + if (!noptargs) { + goto skip_optional_kwonly; + } + if (!PyBytes_Check(args[1])) { + _PyArg_BadArgument("a2b_base32", "argument 'alphabet'", "bytes", args[1]); goto exit; } - return_value = binascii_a2b_base32_impl(module, &data); + alphabet = (PyBytesObject *)args[1]; +skip_optional_kwonly: + return_value = binascii_a2b_base32_impl(module, &data, alphabet); exit: /* Cleanup for data */ @@ -743,96 +788,80 @@ binascii_a2b_base32(PyObject *module, PyObject *arg) } PyDoc_STRVAR(binascii_b2a_base32__doc__, -"b2a_base32($module, data, /)\n" +"b2a_base32($module, data, /, *, alphabet=BASE32_ALPHABET)\n" "--\n" "\n" "base32-code line of data."); #define BINASCII_B2A_BASE32_METHODDEF \ - 
{"b2a_base32", (PyCFunction)binascii_b2a_base32, METH_O, binascii_b2a_base32__doc__}, + {"b2a_base32", _PyCFunction_CAST(binascii_b2a_base32), METH_FASTCALL|METH_KEYWORDS, binascii_b2a_base32__doc__}, static PyObject * -binascii_b2a_base32_impl(PyObject *module, Py_buffer *data); +binascii_b2a_base32_impl(PyObject *module, Py_buffer *data, + Py_buffer *alphabet); static PyObject * -binascii_b2a_base32(PyObject *module, PyObject *arg) +binascii_b2a_base32(PyObject *module, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames) { PyObject *return_value = NULL; - Py_buffer data = {NULL, NULL}; - - if (PyObject_GetBuffer(arg, &data, PyBUF_SIMPLE) != 0) { - goto exit; - } - return_value = binascii_b2a_base32_impl(module, &data); - -exit: - /* Cleanup for data */ - if (data.obj) { - PyBuffer_Release(&data); - } - - return return_value; -} - -PyDoc_STRVAR(binascii_a2b_base32hex__doc__, -"a2b_base32hex($module, data, /)\n" -"--\n" -"\n" -"Decode a line of base32hex data."); + #if defined(Py_BUILD_CORE) && !defined(Py_BUILD_CORE_MODULE) -#define BINASCII_A2B_BASE32HEX_METHODDEF \ - {"a2b_base32hex", (PyCFunction)binascii_a2b_base32hex, METH_O, binascii_a2b_base32hex__doc__}, + #define NUM_KEYWORDS 1 + static struct { + PyGC_Head _this_is_not_used; + PyObject_VAR_HEAD + Py_hash_t ob_hash; + PyObject *ob_item[NUM_KEYWORDS]; + } _kwtuple = { + .ob_base = PyVarObject_HEAD_INIT(&PyTuple_Type, NUM_KEYWORDS) + .ob_hash = -1, + .ob_item = { &_Py_ID(alphabet), }, + }; + #undef NUM_KEYWORDS + #define KWTUPLE (&_kwtuple.ob_base.ob_base) -static PyObject * -binascii_a2b_base32hex_impl(PyObject *module, Py_buffer *data); + #else // !Py_BUILD_CORE + # define KWTUPLE NULL + #endif // !Py_BUILD_CORE -static PyObject * -binascii_a2b_base32hex(PyObject *module, PyObject *arg) -{ - PyObject *return_value = NULL; + static const char * const _keywords[] = {"", "alphabet", NULL}; + static _PyArg_Parser _parser = { + .keywords = _keywords, + .fname = "b2a_base32", + .kwtuple = KWTUPLE, + }; 
+ #undef KWTUPLE + PyObject *argsbuf[2]; + Py_ssize_t noptargs = nargs + (kwnames ? PyTuple_GET_SIZE(kwnames) : 0) - 1; Py_buffer data = {NULL, NULL}; + Py_buffer alphabet = {NULL, NULL}; - if (!ascii_buffer_converter(arg, &data)) { + args = _PyArg_UnpackKeywords(args, nargs, NULL, kwnames, &_parser, + /*minpos*/ 1, /*maxpos*/ 1, /*minkw*/ 0, /*varpos*/ 0, argsbuf); + if (!args) { goto exit; } - return_value = binascii_a2b_base32hex_impl(module, &data); - -exit: - /* Cleanup for data */ - if (data.obj) - PyBuffer_Release(&data); - - return return_value; -} - -PyDoc_STRVAR(binascii_b2a_base32hex__doc__, -"b2a_base32hex($module, data, /)\n" -"--\n" -"\n" -"base32hex-code line of data."); - -#define BINASCII_B2A_BASE32HEX_METHODDEF \ - {"b2a_base32hex", (PyCFunction)binascii_b2a_base32hex, METH_O, binascii_b2a_base32hex__doc__}, - -static PyObject * -binascii_b2a_base32hex_impl(PyObject *module, Py_buffer *data); - -static PyObject * -binascii_b2a_base32hex(PyObject *module, PyObject *arg) -{ - PyObject *return_value = NULL; - Py_buffer data = {NULL, NULL}; - - if (PyObject_GetBuffer(arg, &data, PyBUF_SIMPLE) != 0) { + if (PyObject_GetBuffer(args[0], &data, PyBUF_SIMPLE) != 0) { goto exit; } - return_value = binascii_b2a_base32hex_impl(module, &data); + if (!noptargs) { + goto skip_optional_kwonly; + } + if (PyObject_GetBuffer(args[1], &alphabet, PyBUF_SIMPLE) != 0) { + goto exit; + } +skip_optional_kwonly: + return_value = binascii_b2a_base32_impl(module, &data, &alphabet); exit: /* Cleanup for data */ if (data.obj) { PyBuffer_Release(&data); } + /* Cleanup for alphabet */ + if (alphabet.obj) { + PyBuffer_Release(&alphabet); + } return return_value; } @@ -1382,4 +1411,4 @@ binascii_b2a_qp(PyObject *module, PyObject *const *args, Py_ssize_t nargs, PyObj return return_value; } -/*[clinic end generated code: output=bafd226511187580 input=a9049054013a1b77]*/ +/*[clinic end generated code: output=5be0c5d9b116ee17 input=a9049054013a1b77]*/ From 
a9a7d26463b57fd2a25b79b2466539c438169aa1 Mon Sep 17 00:00:00 2001 From: James Seo Date: Sat, 21 Mar 2026 07:52:29 -0700 Subject: [PATCH 3/8] Address reviewer feedback - Update docs to refer to "Base 32" and "Base32" - Update docs to better explain `binascii.a2b_base32()` - Inline helper function in `base64` - Add forgotten tests for presence of alphabet module globals --- Doc/library/binascii.rst | 21 +++++++++++++-------- Lib/base64.py | 21 ++++++++------------- Lib/test/test_binascii.py | 4 ++++ Modules/binascii.c | 2 +- 4 files changed, 26 insertions(+), 22 deletions(-) diff --git a/Doc/library/binascii.rst b/Doc/library/binascii.rst index 3facb139e17d43..47c021a85c13ad 100644 --- a/Doc/library/binascii.rst +++ b/Doc/library/binascii.rst @@ -182,16 +182,21 @@ The :mod:`!binascii` module defines the following functions: .. versionadded:: 3.15 + .. function:: a2b_base32(string, /, *, alphabet=BASE32_ALPHABET) Convert base32 data back to binary and return the binary data. - Valid base32 data: + Valid base32 data contains characters from the base32 alphabet specified + in :rfc:`4648` in groups of eight (if necessary, the final group is padded + to eight characters with ``=``). Each group encodes 40 bits of binary data + in the range from ``0`` to ``2 ** 40 - 1``, inclusive. - * Conforms to :rfc:`4648`. - * Contains only characters from the base32 alphabet. - * Contains no excess data after padding (including excess padding, newlines, etc.). - * Does not start with padding. + .. note:: + By default, this function does not map lowercase characters (which are + invalid in standard base32) to their uppercase counterparts, nor does + it contextually map ``0`` to ``O`` and ``1`` to ``I``/``L`` as + :rfc:`4648` allows. Optional *alphabet* must be a :class:`bytes` object of length 32 which specifies an alternative alphabet. @@ -202,7 +207,7 @@ The :mod:`!binascii` module defines the following functions: .. 
function:: b2a_base32(data, /, *, alphabet=BASE32_ALPHABET) - Convert binary data to a line(s) of ASCII characters in base32 coding, + Convert binary data to a line of ASCII characters in base32 coding, as specified in :rfc:`4648`. The return value is the converted line. Optional *alphabet* must be a :term:`bytes-like object` of length 32 which @@ -356,13 +361,13 @@ The :mod:`!binascii` module defines the following functions: .. data:: BASE32_ALPHABET - The base32 alphabet according to :rfc:`4648`. + The Base 32 alphabet according to :rfc:`4648`. .. versionadded:: next .. data:: BASE32HEX_ALPHABET - The "Extended Hex" base32hex alphabet according to :rfc:`4648`. + The "Extended Hex" Base 32 alphabet according to :rfc:`4648`. .. versionadded:: next diff --git a/Lib/base64.py b/Lib/base64.py index 8c88add3c2595b..9b57cdfefce1e6 100644 --- a/Lib/base64.py +++ b/Lib/base64.py @@ -207,10 +207,12 @@ def urlsafe_b64decode(s): 0 and 1 are not allowed in the input. ''' -def _b32decode_prepare(s, casefold=False, map01=None): +def b32encode(s): + return binascii.b2a_base32(s) +b32encode.__doc__ = _B32_ENCODE_DOCSTRING.format(encoding='base32') + +def b32decode(s, casefold=False, map01=None): s = _bytes_from_decode_data(s) - if len(s) % 8: - raise binascii.Error('Incorrect padding') # Handle section 2.4 zero and one mapping. The flag map01 will be either # False, or the character to map the digit 1 (one) to. It should be # either L (el) or I (eye). 
@@ -220,15 +222,6 @@ def _b32decode_prepare(s, casefold=False, map01=None): s = s.translate(bytes.maketrans(b'01', b'O' + map01)) if casefold: s = s.upper() - return s - - -def b32encode(s): - return binascii.b2a_base32(s) -b32encode.__doc__ = _B32_ENCODE_DOCSTRING.format(encoding='base32') - -def b32decode(s, casefold=False, map01=None): - s = _b32decode_prepare(s, casefold, map01) return binascii.a2b_base32(s) b32decode.__doc__ = _B32_DECODE_DOCSTRING.format(encoding='base32', extra_args=_B32_DECODE_MAP01_DOCSTRING) @@ -238,8 +231,10 @@ def b32hexencode(s): b32hexencode.__doc__ = _B32_ENCODE_DOCSTRING.format(encoding='base32hex') def b32hexdecode(s, casefold=False): + s = _bytes_from_decode_data(s) # base32hex does not have the 01 mapping - s = _b32decode_prepare(s, casefold) + if casefold: + s = s.upper() return binascii.a2b_base32(s, alphabet=binascii.BASE32HEX_ALPHABET) b32hexdecode.__doc__ = _B32_DECODE_DOCSTRING.format(encoding='base32hex', extra_args='') diff --git a/Lib/test/test_binascii.py b/Lib/test/test_binascii.py index 7fedd8e1a8b3ca..638a4cce0509d0 100644 --- a/Lib/test/test_binascii.py +++ b/Lib/test/test_binascii.py @@ -74,6 +74,10 @@ def test_constants(self): b'abcdefghijklmnopqrstuvwxyz' b'ABCDEFGHIJKLMNOPQRSTUVWXYZ' b'.-:+=^!/*?&<>()[]{}@%$#') + self.assertEqual(binascii.BASE32_ALPHABET, + b'ABCDEFGHIJKLMNOPQRSTUVWXYZ234567') + self.assertEqual(binascii.BASE32HEX_ALPHABET, + b'0123456789ABCDEFGHIJKLMNOPQRSTUV') def test_functions(self): # Check presence of all functions diff --git a/Modules/binascii.c b/Modules/binascii.c index 44d7986b6c0415..786a95f8bfb42c 100644 --- a/Modules/binascii.c +++ b/Modules/binascii.c @@ -1672,7 +1672,7 @@ binascii.b2a_base32 * alphabet: Py_buffer(c_default="{NULL, NULL}") = BASE32_ALPHABET -base32-code line of data. +Base32-code line of data. 
[clinic start generated code]*/ static PyObject * From 6f80c549a6e34762cde54502d24c9cee98e7079f Mon Sep 17 00:00:00 2001 From: James Seo Date: Sat, 21 Mar 2026 08:01:31 -0700 Subject: [PATCH 4/8] Update generated files --- Modules/binascii.c | 2 +- Modules/clinic/binascii.c.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Modules/binascii.c b/Modules/binascii.c index 786a95f8bfb42c..e98cce10f8c58f 100644 --- a/Modules/binascii.c +++ b/Modules/binascii.c @@ -1678,7 +1678,7 @@ Base32-code line of data. static PyObject * binascii_b2a_base32_impl(PyObject *module, Py_buffer *data, Py_buffer *alphabet) -/*[clinic end generated code: output=058d0d1aeb014d3b input=ffd4fa162a6e1cb5]*/ +/*[clinic end generated code: output=058d0d1aeb014d3b input=99cbe7194799d368]*/ { const unsigned char *table_b2a = table_b2a_base32; const unsigned char *bin_data = data->buf; diff --git a/Modules/clinic/binascii.c.h b/Modules/clinic/binascii.c.h index 8057d94a1fb934..7a411bfc829943 100644 --- a/Modules/clinic/binascii.c.h +++ b/Modules/clinic/binascii.c.h @@ -791,7 +791,7 @@ PyDoc_STRVAR(binascii_b2a_base32__doc__, "b2a_base32($module, data, /, *, alphabet=BASE32_ALPHABET)\n" "--\n" "\n" -"base32-code line of data."); +"Base32-code line of data."); #define BINASCII_B2A_BASE32_METHODDEF \ {"b2a_base32", _PyCFunction_CAST(binascii_b2a_base32), METH_FASTCALL|METH_KEYWORDS, binascii_b2a_base32__doc__}, @@ -1411,4 +1411,4 @@ binascii_b2a_qp(PyObject *module, PyObject *const *args, Py_ssize_t nargs, PyObj return return_value; } -/*[clinic end generated code: output=5be0c5d9b116ee17 input=a9049054013a1b77]*/ +/*[clinic end generated code: output=242c0c56b918bd33 input=a9049054013a1b77]*/ From 4c8207076deddadc0521ea5d3b3e733c98280711 Mon Sep 17 00:00:00 2001 From: James Seo Date: Sun, 22 Mar 2026 02:20:53 -0700 Subject: [PATCH 5/8] Address more reviewer feedback - Revise docs - Add whatsnew entry - Minor whitespace change in tests --- Doc/library/binascii.rst | 10 ++++++---- 
Doc/whatsnew/3.15.rst | 6 ++++++ Lib/test/test_binascii.py | 1 + 3 files changed, 13 insertions(+), 4 deletions(-) diff --git a/Doc/library/binascii.rst b/Doc/library/binascii.rst index 47c021a85c13ad..64c1ce948d2d32 100644 --- a/Doc/library/binascii.rst +++ b/Doc/library/binascii.rst @@ -193,10 +193,10 @@ The :mod:`!binascii` module defines the following functions: in the range from ``0`` to ``2 ** 40 - 1``, inclusive. .. note:: - By default, this function does not map lowercase characters (which are - invalid in standard base32) to their uppercase counterparts, nor does - it contextually map ``0`` to ``O`` and ``1`` to ``I``/``L`` as - :rfc:`4648` allows. + This function does not map lowercase characters (which are invalid in + standard base32) to their uppercase counterparts, nor does it + contextually map ``0`` to ``O`` and ``1`` to ``I``/``L`` as :rfc:`4648` + allows. Optional *alphabet* must be a :class:`bytes` object of length 32 which specifies an alternative alphabet. @@ -368,6 +368,8 @@ The :mod:`!binascii` module defines the following functions: .. data:: BASE32HEX_ALPHABET The "Extended Hex" Base 32 alphabet according to :rfc:`4648`. + Data encoded with this alphabet maintains its sort order during bitwise + comparisons. .. versionadded:: next diff --git a/Doc/whatsnew/3.15.rst b/Doc/whatsnew/3.15.rst index 5e6265a45231db..2d959fa1b3fd8d 100644 --- a/Doc/whatsnew/3.15.rst +++ b/Doc/whatsnew/3.15.rst @@ -662,6 +662,12 @@ binascii * Added the *ignorechars* parameter in :func:`~binascii.a2b_base64`. (Contributed by Serhiy Storchaka in :gh:`144001`.) +* Added functions for Base 32 encoding: + + - :func:`~binascii.b2a_base32` and :func:`~binascii.a2b_base32` + + (Contributed by James Seo in :gh:`146192`.) 
+ calendar -------- diff --git a/Lib/test/test_binascii.py b/Lib/test/test_binascii.py index 638a4cce0509d0..095920d0ec84b9 100644 --- a/Lib/test/test_binascii.py +++ b/Lib/test/test_binascii.py @@ -74,6 +74,7 @@ def test_constants(self): b'abcdefghijklmnopqrstuvwxyz' b'ABCDEFGHIJKLMNOPQRSTUVWXYZ' b'.-:+=^!/*?&<>()[]{}@%$#') + self.assertEqual(binascii.BASE32_ALPHABET, b'ABCDEFGHIJKLMNOPQRSTUVWXYZ234567') self.assertEqual(binascii.BASE32HEX_ALPHABET, From 58707ea61a6bcf24bf83fe0c486b50b4b062c1cb Mon Sep 17 00:00:00 2001 From: James Seo Date: Sun, 22 Mar 2026 02:24:33 -0700 Subject: [PATCH 6/8] Rename references to groups of 5 and 8 bytes Referring to a group of 8 bytes as an "octet" may cause confusion, because the term is already commonly used in some languages to refer to a group of 8 bits (i.e. a byte). "Octa" is a suitable preexisting alternative for a group of 64 bits [1] (used by Knuth himself, at that). "Octad" was considered, but it, too, historically refers to a byte. Also rename "quintet" to "quint". "Pentad" was considered, but it historically refers to a group of 5 bits. [1] https://en.wikipedia.org/wiki/Units_of_information --- Modules/binascii.c | 63 +++++++++++++++++++++++----------------------- 1 file changed, 31 insertions(+), 32 deletions(-) diff --git a/Modules/binascii.c b/Modules/binascii.c index e98cce10f8c58f..5c4d1c3250141e 100644 --- a/Modules/binascii.c +++ b/Modules/binascii.c @@ -278,8 +278,8 @@ static const unsigned char table_b2a_base32[] Py_ALIGNED(64) = /* Encode 5 bytes into 8 base32 characters. 
*/ static inline void -base32_encode_quintet(const unsigned char *in, unsigned char *out, - const unsigned char table[]) +base32_encode_quint(const unsigned char *in, unsigned char *out, + const unsigned char table[]) { uint64_t combined = ((uint64_t)in[0] << 32) | ((uint64_t)in[1] << 24) | @@ -304,16 +304,16 @@ static inline Py_ssize_t base32_encode_fast(const unsigned char *in, Py_ssize_t in_len, unsigned char *out, const unsigned char table[]) { - Py_ssize_t n_quintets = in_len / 5; - const unsigned char *in_end = in + n_quintets * 5; + Py_ssize_t n_quints = in_len / 5; + const unsigned char *in_end = in + n_quints * 5; while (in < in_end) { - base32_encode_quintet(in, out, table); + base32_encode_quint(in, out, table); in += 5; out += 8; } - return n_quintets * 5; + return n_quints * 5; } /* @@ -321,8 +321,8 @@ base32_encode_fast(const unsigned char *in, Py_ssize_t in_len, * Returns 1 on success, 0 if any character is invalid. */ static inline int -base32_decode_octet(const unsigned char *in, unsigned char *out, - const unsigned char table[]) +base32_decode_octa(const unsigned char *in, unsigned char *out, + const unsigned char table[]) { unsigned char v0 = table[in[0]]; unsigned char v1 = table[in[1]]; @@ -354,11 +354,11 @@ static inline Py_ssize_t base32_decode_fast(const unsigned char *in, Py_ssize_t in_len, unsigned char *out, const unsigned char table[]) { - Py_ssize_t n_quintets = in_len / 8; + Py_ssize_t n_quints = in_len / 8; Py_ssize_t i; - for (i = 0; i < n_quintets; i++) { - if (!base32_decode_octet(in + i * 8, out + i * 5, table)) { + for (i = 0; i < n_quints; i++) { + if (!base32_decode_octa(in + i * 8, out + i * 5, table)) { break; } } @@ -1533,8 +1533,8 @@ binascii_a2b_base32_impl(PyObject *module, Py_buffer *data, unsigned char *bin_data = PyBytesWriter_GetData(writer); /* - * Fast path: use optimized decoder for complete octets (groups of 8 bytes). - * The fast path stops at padding, invalid chars, or incomplete octets. 
+ * Fast path: use optimized decoder for complete octas (groups of 8 bytes). + * The fast path stops at padding, invalid chars, or incomplete octas. */ if (ascii_len >= 8) { Py_ssize_t fast_chars = base32_decode_fast(ascii_data, ascii_len, @@ -1546,9 +1546,9 @@ binascii_a2b_base32_impl(PyObject *module, Py_buffer *data, } } - /* Slow path: handle remaining input (padding, invalid chars, incomplete octets). */ + /* Slow path: handle remaining input (padding, invalid chars, incomplete octas). */ unsigned char leftchar = 0; - int octet_pos = 0; + int octa_pos = 0; int pads = 0; for (; ascii_len; ascii_len--, ascii_data++) { unsigned char this_ch = *ascii_data; @@ -1557,16 +1557,15 @@ binascii_a2b_base32_impl(PyObject *module, Py_buffer *data, if (this_ch == BASE32_PAD) { pads++; - if ((octet_pos == 2 || octet_pos == 4 - || octet_pos == 5 || octet_pos == 7) - && octet_pos + pads <= 8) + if ((octa_pos == 2 || octa_pos == 4 || octa_pos == 5 || octa_pos == 7) + && octa_pos + pads <= 8) { continue; } state = get_binascii_state(module); if (state) { - if (octet_pos == 1 || octet_pos == 3 || octet_pos == 6) { + if (octa_pos == 1 || octa_pos == 3 || octa_pos == 6) { const unsigned char *ascii_data_start = data->buf; PyErr_Format(state->Error, "Invalid base32-encoded string: " @@ -1577,7 +1576,7 @@ binascii_a2b_base32_impl(PyObject *module, Py_buffer *data, } else { PyErr_SetString(state->Error, - (octet_pos == 0 && ascii_data == data->buf) + (octa_pos == 0 && ascii_data == data->buf) ? "Leading padding not allowed" : "Excess padding not allowed"); } @@ -1598,55 +1597,55 @@ binascii_a2b_base32_impl(PyObject *module, Py_buffer *data, if (pads) { state = get_binascii_state(module); if (state) { - PyErr_SetString(state->Error, (octet_pos + pads == 8) + PyErr_SetString(state->Error, (octa_pos + pads == 8) ? 
"Excess data after padding" : "Discontinuous padding not allowed"); } goto error; } - switch (octet_pos) { + switch (octa_pos) { case 0: - octet_pos = 1; + octa_pos = 1; leftchar = v; break; case 1: - octet_pos = 2; + octa_pos = 2; *bin_data++ = (leftchar << 3) | (v >> 2); leftchar = v & 0x03; break; case 2: - octet_pos = 3; + octa_pos = 3; leftchar = (leftchar << 5) | v; break; case 3: - octet_pos = 4; + octa_pos = 4; *bin_data++ = (leftchar << 1) | (v >> 4); leftchar = v & 0x0f; break; case 4: - octet_pos = 5; + octa_pos = 5; *bin_data++ = (leftchar << 4) | (v >> 1); leftchar = v & 0x01; break; case 5: - octet_pos = 6; + octa_pos = 6; leftchar = (leftchar << 5) | v; break; case 6: - octet_pos = 7; + octa_pos = 7; *bin_data++ = (leftchar << 2) | (v >> 3); leftchar = v & 0x07; break; case 7: - octet_pos = 0; + octa_pos = 0; *bin_data++ = (leftchar << 5) | v; leftchar = 0; } } - if ((octet_pos != 0 && octet_pos + pads != 8) - || (octet_pos == 0 && pads != 0)) + if ((octa_pos != 0 && octa_pos + pads != 8) + || (octa_pos == 0 && pads != 0)) { state = get_binascii_state(module); if (state) { From e3ee6df8be3f197fd3cb1fc11dd68b37317bdd58 Mon Sep 17 00:00:00 2001 From: James Seo Date: Sun, 22 Mar 2026 06:45:09 -0700 Subject: [PATCH 7/8] Minor polishing changes - Reword NEWS.d entry to "Base32" instead of "base-32". No prior entries have ever mentioned "base-64", etc., but they have mentioned "Base64", etc., so this is more consistent. - Reword whatsnew entry to "Base32" instead of "Base 32". No prior entries have ever mentioned "Base 64", etc., and there is an entry a little further up mentioning "Ascii85, Base85, and Z85", so this is more consistent. - Add a whatsnew entry in Optimizations > base64 & binascii section. - Whitespace change in `binascii.c`. 
--- Doc/whatsnew/3.15.rst | 6 +++++- .../2026-03-20-00-39-25.gh-issue-146192.8aQ6sC.rst | 4 ++-- Modules/binascii.c | 4 ++-- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/Doc/whatsnew/3.15.rst b/Doc/whatsnew/3.15.rst index 2d959fa1b3fd8d..b40c75060a4336 100644 --- a/Doc/whatsnew/3.15.rst +++ b/Doc/whatsnew/3.15.rst @@ -662,7 +662,7 @@ binascii * Added the *ignorechars* parameter in :func:`~binascii.a2b_base64`. (Contributed by Serhiy Storchaka in :gh:`144001`.) -* Added functions for Base 32 encoding: +* Added functions for Base32 encoding: - :func:`~binascii.b2a_base32` and :func:`~binascii.a2b_base32` @@ -1285,6 +1285,10 @@ base64 & binascii two orders of magnitude less memory. (Contributed by James Seo and Serhiy Storchaka in :gh:`101178`.) +* Implementation for Base32 has been rewritten in C. + Encoding and decoding is now two orders of magnitude faster. + (Contributed by James Seo in :gh:`146192`.) + csv --- diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2026-03-20-00-39-25.gh-issue-146192.8aQ6sC.rst b/Misc/NEWS.d/next/Core_and_Builtins/2026-03-20-00-39-25.gh-issue-146192.8aQ6sC.rst index a27639d2908651..304a7cd62102a7 100644 --- a/Misc/NEWS.d/next/Core_and_Builtins/2026-03-20-00-39-25.gh-issue-146192.8aQ6sC.rst +++ b/Misc/NEWS.d/next/Core_and_Builtins/2026-03-20-00-39-25.gh-issue-146192.8aQ6sC.rst @@ -1,2 +1,2 @@ -Add base32 support to :mod:`binascii` and improve the performance of the -base-32 converters in :mod:`base64`. Patch by James Seo. +Add Base32 support to :mod:`binascii` and improve the performance of the +Base32 converters in :mod:`base64`. Patch by James Seo. diff --git a/Modules/binascii.c b/Modules/binascii.c index 5c4d1c3250141e..de6917a7bf267d 100644 --- a/Modules/binascii.c +++ b/Modules/binascii.c @@ -1598,8 +1598,8 @@ binascii_a2b_base32_impl(PyObject *module, Py_buffer *data, state = get_binascii_state(module); if (state) { PyErr_SetString(state->Error, (octa_pos + pads == 8) - ? 
"Excess data after padding" - : "Discontinuous padding not allowed"); + ? "Excess data after padding" + : "Discontinuous padding not allowed"); } goto error; } From 6c9db0d673925e941a206d312e78a6d6c20df27b Mon Sep 17 00:00:00 2001 From: James Seo Date: Sun, 22 Mar 2026 06:54:38 -0700 Subject: [PATCH 8/8] binascii.c: Modify exception behavior on invalid encoded length When decoding invalid length (1, 3 or 6 mod 8) + no padding, mention the invalid length instead of the improper padding in the exception message to match what the base64 decoder does. Additionally, move the logic for setting the exception message (back) outside the "slow path" loop; if we do end up checking canonicity of decoder input, it will feel (subjectively) better to have several checks grouped together after the loop. --- Lib/test/test_binascii.py | 7 ++++--- Modules/binascii.c | 37 +++++++++++++++++++++---------------- 2 files changed, 25 insertions(+), 19 deletions(-) diff --git a/Lib/test/test_binascii.py b/Lib/test/test_binascii.py index 095920d0ec84b9..d4879667c71461 100644 --- a/Lib/test/test_binascii.py +++ b/Lib/test/test_binascii.py @@ -782,12 +782,9 @@ def assertInvalidLength(*args): assertLeadingPadding(b"========BEEFCAKE", b"\t\x08Q\x01D") assertLeadingPadding(b"=========BEEFCAKE", b"\t\x08Q\x01D") - assertIncorrectPadding(b"A") assertIncorrectPadding(b"AB") - assertIncorrectPadding(b"ABC") assertIncorrectPadding(b"ABCD") assertIncorrectPadding(b"ABCDE") - assertIncorrectPadding(b"ABCDEF") assertIncorrectPadding(b"ABCDEFG") assertIncorrectPadding(b"BE=", b"\t") @@ -806,6 +803,10 @@ def assertInvalidLength(*args): assertDiscontinuousPadding(b"BEEF=C==", b"\t\x08Q") assertDiscontinuousPadding(b"BEEFC=AK", b"\t\x08Q\x01") + assertInvalidLength(b"A") + assertInvalidLength(b"ABC") + assertInvalidLength(b"ABCDEF") + assertInvalidLength(b"A=") assertInvalidLength(b"A==") assertInvalidLength(b"A===") diff --git a/Modules/binascii.c b/Modules/binascii.c index de6917a7bf267d..d124efeeb8577a 
100644 --- a/Modules/binascii.c +++ b/Modules/binascii.c @@ -1562,24 +1562,16 @@ binascii_a2b_base32_impl(PyObject *module, Py_buffer *data, { continue; } - + if (octa_pos == 1 || octa_pos == 3 || octa_pos == 6) { + /* Set an error below. */ + break; + } state = get_binascii_state(module); if (state) { - if (octa_pos == 1 || octa_pos == 3 || octa_pos == 6) { - const unsigned char *ascii_data_start = data->buf; - PyErr_Format(state->Error, - "Invalid base32-encoded string: " - "number of data characters (%zd) " - "cannot be 1, 3, or 6 more " - "than a multiple of 8", - ascii_data - ascii_data_start); - } - else { - PyErr_SetString(state->Error, - (octa_pos == 0 && ascii_data == data->buf) - ? "Leading padding not allowed" - : "Excess padding not allowed"); - } + PyErr_SetString(state->Error, + (octa_pos == 0 && ascii_data == data->buf) + ? "Leading padding not allowed" + : "Excess padding not allowed"); } goto error; } @@ -1644,6 +1636,19 @@ binascii_a2b_base32_impl(PyObject *module, Py_buffer *data, } } + if (octa_pos == 1 || octa_pos == 3 || octa_pos == 6) { + state = get_binascii_state(module); + if (state) { + const unsigned char *ascii_data_start = data->buf; + PyErr_Format(state->Error, + "Invalid base32-encoded string: " + "number of data characters (%zd) " + "cannot be 1, 3, or 6 more than a multiple of 8", + ascii_data - ascii_data_start); + } + goto error; + } + if ((octa_pos != 0 && octa_pos + pads != 8) || (octa_pos == 0 && pads != 0)) {