From 0a8e4b18a58c939186d18cd25848d0fb56718e17 Mon Sep 17 00:00:00 2001 From: James Seo Date: Thu, 19 Mar 2026 23:10:30 -0700 Subject: [PATCH 1/4] gh-146192: Add base32 support to binascii Add base32 encoder and decoder functions implemented in C to `binascii` and use them to greatly improve the performance and reduce the memory usage of the existing base32 codec functions in `base64`. No API or documentation changes are necessary with respect to any functions in `base64`, and all existing unit tests for those functions continue to pass without modification. Resolves: gh-146192 --- Doc/library/binascii.rst | 43 ++ Lib/base64.py | 85 +--- Lib/test/test_binascii.py | 304 +++++++++++- ...-03-20-00-39-25.gh-issue-146192.8aQ6sC.rst | 2 + Modules/binascii.c | 455 ++++++++++++++++++ Modules/clinic/binascii.c.h | 128 ++++- 6 files changed, 938 insertions(+), 79 deletions(-) create mode 100644 Misc/NEWS.d/next/Core_and_Builtins/2026-03-20-00-39-25.gh-issue-146192.8aQ6sC.rst diff --git a/Doc/library/binascii.rst b/Doc/library/binascii.rst index 70ba036756ff32..9137b7203698df 100644 --- a/Doc/library/binascii.rst +++ b/Doc/library/binascii.rst @@ -182,6 +182,49 @@ The :mod:`!binascii` module defines the following functions: .. versionadded:: 3.15 +.. function:: a2b_base32(string, /) + + Convert base32 data back to binary and return the binary data. + + Valid base32 data: + + * Conforms to :rfc:`4648`. + * Contains only characters from the base32 alphabet. + * Contains no excess data after padding (including excess padding, newlines, etc.). + * Does not start with padding. + + Invalid base32 data will raise :exc:`binascii.Error`. + + .. versionadded:: 3.15 + +.. function:: b2a_base32(data, /) + + Convert binary data to a line of ASCII characters in base32 coding, + as specified in :rfc:`4648`. The return value is the converted line. + + .. versionadded:: 3.15 + +.. function:: a2b_base32hex(string, /) + + Convert base32hex data back to binary and return the binary data. 
+ + Valid base32hex data: + + * Conforms to :rfc:`4648`. + * Contains only characters from the base32hex alphabet. + * Contains no excess data after padding (including excess padding, newlines, etc.). + * Does not start with padding. + + Invalid base32hex data will raise :exc:`binascii.Error`. + + .. versionadded:: 3.15 + +.. function:: b2a_base32hex(data, /) + + Convert binary data to a line of ASCII characters in base32hex coding, + as specified in :rfc:`4648`. The return value is the converted line. + + .. versionadded:: 3.15 .. function:: a2b_qp(data, header=False) diff --git a/Lib/base64.py b/Lib/base64.py index a429760da79f2a..576d429522ba31 100644 --- a/Lib/base64.py +++ b/Lib/base64.py @@ -206,51 +206,8 @@ def urlsafe_b64decode(s): the letter O). For security purposes the default is None, so that 0 and 1 are not allowed in the input. ''' -_b32alphabet = b'ABCDEFGHIJKLMNOPQRSTUVWXYZ234567' -_b32hexalphabet = b'0123456789ABCDEFGHIJKLMNOPQRSTUV' -_b32tab2 = {} -_b32rev = {} - -def _b32encode(alphabet, s): - # Delay the initialization of the table to not waste memory - # if the function is never called - if alphabet not in _b32tab2: - b32tab = [bytes((i,)) for i in alphabet] - _b32tab2[alphabet] = [a + b for a in b32tab for b in b32tab] - b32tab = None - - if not isinstance(s, bytes_types): - s = memoryview(s).tobytes() - leftover = len(s) % 5 - # Pad the last quantum with zero bits if necessary - if leftover: - s = s + b'\0' * (5 - leftover) # Don't use += ! 
- encoded = bytearray() - from_bytes = int.from_bytes - b32tab2 = _b32tab2[alphabet] - for i in range(0, len(s), 5): - c = from_bytes(s[i: i + 5]) # big endian - encoded += (b32tab2[c >> 30] + # bits 1 - 10 - b32tab2[(c >> 20) & 0x3ff] + # bits 11 - 20 - b32tab2[(c >> 10) & 0x3ff] + # bits 21 - 30 - b32tab2[c & 0x3ff] # bits 31 - 40 - ) - # Adjust for any leftover partial quanta - if leftover == 1: - encoded[-6:] = b'======' - elif leftover == 2: - encoded[-4:] = b'====' - elif leftover == 3: - encoded[-3:] = b'===' - elif leftover == 4: - encoded[-1:] = b'=' - return encoded.take_bytes() - -def _b32decode(alphabet, s, casefold=False, map01=None): - # Delay the initialization of the table to not waste memory - # if the function is never called - if alphabet not in _b32rev: - _b32rev[alphabet] = {v: k for k, v in enumerate(alphabet)} + +def _b32decode_prepare(s, casefold=False, map01=None): s = _bytes_from_decode_data(s) if len(s) % 8: raise binascii.Error('Incorrect padding') @@ -263,51 +220,27 @@ def _b32decode(alphabet, s, casefold=False, map01=None): s = s.translate(bytes.maketrans(b'01', b'O' + map01)) if casefold: s = s.upper() - # Strip off pad characters from the right. We need to count the pad - # characters because this will tell us how many null bytes to remove from - # the end of the decoded string. 
- l = len(s) - s = s.rstrip(b'=') - padchars = l - len(s) - # Now decode the full quanta - decoded = bytearray() - b32rev = _b32rev[alphabet] - for i in range(0, len(s), 8): - quanta = s[i: i + 8] - acc = 0 - try: - for c in quanta: - acc = (acc << 5) + b32rev[c] - except KeyError: - raise binascii.Error('Non-base32 digit found') from None - decoded += acc.to_bytes(5) # big endian - # Process the last, partial quanta - if l % 8 or padchars not in {0, 1, 3, 4, 6}: - raise binascii.Error('Incorrect padding') - if padchars and decoded: - acc <<= 5 * padchars - last = acc.to_bytes(5) # big endian - leftover = (43 - 5 * padchars) // 8 # 1: 4, 3: 3, 4: 2, 6: 1 - decoded[-5:] = last[:leftover] - return decoded.take_bytes() + return s def b32encode(s): - return _b32encode(_b32alphabet, s) + return binascii.b2a_base32(s) b32encode.__doc__ = _B32_ENCODE_DOCSTRING.format(encoding='base32') def b32decode(s, casefold=False, map01=None): - return _b32decode(_b32alphabet, s, casefold, map01) + s = _b32decode_prepare(s, casefold, map01) + return binascii.a2b_base32(s) b32decode.__doc__ = _B32_DECODE_DOCSTRING.format(encoding='base32', extra_args=_B32_DECODE_MAP01_DOCSTRING) def b32hexencode(s): - return _b32encode(_b32hexalphabet, s) + return binascii.b2a_base32hex(s) b32hexencode.__doc__ = _B32_ENCODE_DOCSTRING.format(encoding='base32hex') def b32hexdecode(s, casefold=False): # base32hex does not have the 01 mapping - return _b32decode(_b32hexalphabet, s, casefold) + s = _b32decode_prepare(s, casefold) + return binascii.a2b_base32hex(s) b32hexdecode.__doc__ = _B32_DECODE_DOCSTRING.format(encoding='base32hex', extra_args='') diff --git a/Lib/test/test_binascii.py b/Lib/test/test_binascii.py index 667ec9b5241aa9..3ac468d636d203 100644 --- a/Lib/test/test_binascii.py +++ b/Lib/test/test_binascii.py @@ -10,10 +10,10 @@ # Note: "*_hex" functions are aliases for "(un)hexlify" -b2a_functions = ['b2a_ascii85', 'b2a_base64', 'b2a_base85', +b2a_functions = ['b2a_ascii85', 'b2a_base32', 
'b2a_base32hex', 'b2a_base64', 'b2a_base85', 'b2a_hex', 'b2a_qp', 'b2a_uu', 'hexlify'] -a2b_functions = ['a2b_ascii85', 'a2b_base64', 'a2b_base85', +a2b_functions = ['a2b_ascii85', 'a2b_base32', 'a2b_base32hex', 'a2b_base64', 'a2b_base85', 'a2b_hex', 'a2b_qp', 'a2b_uu', 'unhexlify'] all_functions = a2b_functions + b2a_functions + ['crc32', 'crc_hqx'] @@ -670,6 +670,306 @@ def test_base85_alphabet(self): with self.assertRaises(TypeError): binascii.a2b_base64(data, alphabet=bytearray(alphabet)) + def test_base32_valid(self): + # Test base32 with valid data + lines = [] + step = 0 + i = 0 + while i < len(self.rawdata): + b = self.type2test(self.rawdata[i:i + step]) + a = binascii.b2a_base32(b) + lines.append(a) + i += step + step += 1 + res = bytes() + for line in lines: + a = self.type2test(line) + b = binascii.a2b_base32(a) + res += b + self.assertEqual(res, self.rawdata) + + def test_base32_errors(self): + def _fixPadding(data): + fixed = data.replace(b"=", b"") + len_8 = len(fixed) % 8 + p = 8 - len_8 if len_8 else 0 + return fixed + b"=" * p + + def _assertRegexTemplate(assert_regex, data, good_padding_result=None): + with self.assertRaisesRegex(binascii.Error, assert_regex): + binascii.a2b_base32(self.type2test(data)) + if good_padding_result: + fixed = self.type2test(_fixPadding(data)) + self.assertEqual(binascii.a2b_base32(fixed), good_padding_result) + + def assertNonBase32Data(*args): + _assertRegexTemplate(r"(?i)Only base32 data", *args) + + def assertExcessData(*args): + _assertRegexTemplate(r"(?i)Excess data", *args) + + def assertExcessPadding(*args): + _assertRegexTemplate(r"(?i)Excess padding", *args) + + def assertLeadingPadding(*args): + _assertRegexTemplate(r"(?i)Leading padding", *args) + + def assertIncorrectPadding(*args): + _assertRegexTemplate(r"(?i)Incorrect padding", *args) + + def assertDiscontinuousPadding(*args): + _assertRegexTemplate(r"(?i)Discontinuous padding", *args) + + def assertInvalidLength(*args): + 
_assertRegexTemplate(r"(?i)Invalid.+number of data characters", *args) + + assertNonBase32Data(b"a") + assertNonBase32Data(b"AA-") + assertNonBase32Data(b"ABCDE==!") + assertNonBase32Data(b"ab:(){:|:&};:==") + + assertExcessData(b"AB======C") + assertExcessData(b"AB======CD") + assertExcessData(b"ABCD====E") + assertExcessData(b"ABCDE===FGH") + assertExcessData(b"ABCDEFG=H") + assertExcessData(b"432Z====55555555") + + assertExcessData(b"BE======EF", b"\t\x08") + assertExcessData(b"BEEF====C", b"\t\x08Q") + assertExcessData(b"BEEFC===AK", b"\t\x08Q\x01") + assertExcessData(b"BEEFCAK=E", b"\t\x08Q\x01D") + + assertExcessPadding(b"BE=======", b"\t") + assertExcessPadding(b"BE========", b"\t") + assertExcessPadding(b"BEEF=====", b"\t\x08") + assertExcessPadding(b"BEEF======", b"\t\x08") + assertExcessPadding(b"BEEFC====", b"\t\x08Q") + assertExcessPadding(b"BEEFC=====", b"\t\x08Q") + assertExcessPadding(b"BEEFCAK==", b"\t\x08Q\x01") + assertExcessPadding(b"BEEFCAK===", b"\t\x08Q\x01") + assertExcessPadding(b"BEEFCAKE=", b"\t\x08Q\x01D") + assertExcessPadding(b"BEEFCAKE==", b"\t\x08Q\x01D") + assertExcessPadding(b"BEEFCAKE===", b"\t\x08Q\x01D") + assertExcessPadding(b"BEEFCAKE====", b"\t\x08Q\x01D") + assertExcessPadding(b"BEEFCAKE=====", b"\t\x08Q\x01D") + assertExcessPadding(b"BEEFCAKE======", b"\t\x08Q\x01D") + assertExcessPadding(b"BEEFCAKE=======", b"\t\x08Q\x01D") + assertExcessPadding(b"BEEFCAKE========", b"\t\x08Q\x01D") + assertExcessPadding(b"BEEFCAKE=========", b"\t\x08Q\x01D") + + assertLeadingPadding(b"=", b"") + assertLeadingPadding(b"==", b"") + assertLeadingPadding(b"===", b"") + assertLeadingPadding(b"====", b"") + assertLeadingPadding(b"=====", b"") + assertLeadingPadding(b"======", b"") + assertLeadingPadding(b"=======", b"") + assertLeadingPadding(b"========", b"") + assertLeadingPadding(b"=========", b"") + assertLeadingPadding(b"=BEEFCAKE", b"\t\x08Q\x01D") + assertLeadingPadding(b"==BEEFCAKE", b"\t\x08Q\x01D") + 
assertLeadingPadding(b"===BEEFCAKE", b"\t\x08Q\x01D") + assertLeadingPadding(b"====BEEFCAKE", b"\t\x08Q\x01D") + assertLeadingPadding(b"=====BEEFCAKE", b"\t\x08Q\x01D") + assertLeadingPadding(b"======BEEFCAKE", b"\t\x08Q\x01D") + assertLeadingPadding(b"=======BEEFCAKE", b"\t\x08Q\x01D") + assertLeadingPadding(b"========BEEFCAKE", b"\t\x08Q\x01D") + assertLeadingPadding(b"=========BEEFCAKE", b"\t\x08Q\x01D") + + assertIncorrectPadding(b"A") + assertIncorrectPadding(b"AB") + assertIncorrectPadding(b"ABC") + assertIncorrectPadding(b"ABCD") + assertIncorrectPadding(b"ABCDE") + assertIncorrectPadding(b"ABCDEF") + assertIncorrectPadding(b"ABCDEFG") + + assertIncorrectPadding(b"BE=", b"\t") + assertIncorrectPadding(b"BE==", b"\t") + assertIncorrectPadding(b"BE===", b"\t") + assertIncorrectPadding(b"BE====", b"\t") + assertIncorrectPadding(b"BE=====", b"\t") + assertIncorrectPadding(b"BEEF=", b"\t\x08") + assertIncorrectPadding(b"BEEF==", b"\t\x08") + assertIncorrectPadding(b"BEEF===", b"\t\x08") + assertIncorrectPadding(b"BEEFC=", b"\t\x08Q") + assertIncorrectPadding(b"BEEFC==", b"\t\x08Q") + + assertDiscontinuousPadding(b"BE=EF===", b"\t\x08") + assertDiscontinuousPadding(b"BE==EF==", b"\t\x08") + assertDiscontinuousPadding(b"BEEF=C==", b"\t\x08Q") + assertDiscontinuousPadding(b"BEEFC=AK", b"\t\x08Q\x01") + + assertInvalidLength(b"A=") + assertInvalidLength(b"A==") + assertInvalidLength(b"A===") + assertInvalidLength(b"A====") + assertInvalidLength(b"A=====") + assertInvalidLength(b"A======") + assertInvalidLength(b"ABC=") + assertInvalidLength(b"ABC==") + assertInvalidLength(b"ABC===") + assertInvalidLength(b"ABC====") + assertInvalidLength(b"ABCDEF=") + + assertInvalidLength(b"B=E=====", b"\t") + assertInvalidLength(b"B==E====", b"\t") + assertInvalidLength(b"BEE=F===", b"\t\x08") + assertInvalidLength(b"BEE==F==", b"\t\x08") + assertInvalidLength(b"BEEFCA=K", b"\t\x08Q\x01") + assertInvalidLength(b"BEEFCA=====K", b"\t\x08Q\x01") + + def test_base32hex_valid(self): + # 
Test base32hex with valid data + lines = [] + step = 0 + i = 0 + while i < len(self.rawdata): + b = self.type2test(self.rawdata[i:i + step]) + a = binascii.b2a_base32hex(b) + lines.append(a) + i += step + step += 1 + res = bytes() + for line in lines: + a = self.type2test(line) + b = binascii.a2b_base32hex(a) + res += b + self.assertEqual(res, self.rawdata) + + def test_base32hex_errors(self): + def _fixPadding(data): + fixed = data.replace(b"=", b"") + len_8 = len(fixed) % 8 + p = 8 - len_8 if len_8 else 0 + return fixed + b"=" * p + + def _assertRegexTemplate(assert_regex, data, good_padding_result=None): + with self.assertRaisesRegex(binascii.Error, assert_regex): + binascii.a2b_base32hex(self.type2test(data)) + if good_padding_result: + fixed = self.type2test(_fixPadding(data)) + self.assertEqual(binascii.a2b_base32hex(fixed), good_padding_result) + + def assertNonBase32HexData(*args): + _assertRegexTemplate(r"(?i)Only base32hex data", *args) + + def assertExcessData(*args): + _assertRegexTemplate(r"(?i)Excess data", *args) + + def assertExcessPadding(*args): + _assertRegexTemplate(r"(?i)Excess padding", *args) + + def assertLeadingPadding(*args): + _assertRegexTemplate(r"(?i)Leading padding", *args) + + def assertIncorrectPadding(*args): + _assertRegexTemplate(r"(?i)Incorrect padding", *args) + + def assertDiscontinuousPadding(*args): + _assertRegexTemplate(r"(?i)Discontinuous padding", *args) + + def assertInvalidLength(*args): + _assertRegexTemplate(r"(?i)Invalid.+number of data characters", *args) + + assertNonBase32HexData(b"a") + assertNonBase32HexData(b"AA-") + assertNonBase32HexData(b"ABCDE==!") + assertNonBase32HexData(b"ab:(){:|:&};:==") + + assertExcessData(b"AB======C") + assertExcessData(b"AB======CD") + assertExcessData(b"ABCD====E") + assertExcessData(b"ABCDE===FGH") + assertExcessData(b"ABCDEFG=H") + assertExcessData(b"4321====55555555") + + assertExcessData(b"BE======EF", b"[\x9c") + assertExcessData(b"BEEF====C", b"[\x9c\xf6") + 
assertExcessData(b"BEEFC===AK", b"[\x9c\xf6*") + assertExcessData(b"BEEFCAK=E", b"[\x9c\xf6*\x8e") + + assertExcessPadding(b"BE=======", b"[") + assertExcessPadding(b"BE========", b"[") + assertExcessPadding(b"BEEF=====", b"[\x9c") + assertExcessPadding(b"BEEF======", b"[\x9c") + assertExcessPadding(b"BEEFC====", b"[\x9c\xf6") + assertExcessPadding(b"BEEFC=====", b"[\x9c\xf6") + assertExcessPadding(b"BEEFCAK==", b"[\x9c\xf6*") + assertExcessPadding(b"BEEFCAK===", b"[\x9c\xf6*") + assertExcessPadding(b"BEEFCAKE=", b"[\x9c\xf6*\x8e") + assertExcessPadding(b"BEEFCAKE==", b"[\x9c\xf6*\x8e") + assertExcessPadding(b"BEEFCAKE===", b"[\x9c\xf6*\x8e") + assertExcessPadding(b"BEEFCAKE====", b"[\x9c\xf6*\x8e") + assertExcessPadding(b"BEEFCAKE=====", b"[\x9c\xf6*\x8e") + assertExcessPadding(b"BEEFCAKE======", b"[\x9c\xf6*\x8e") + assertExcessPadding(b"BEEFCAKE=======", b"[\x9c\xf6*\x8e") + assertExcessPadding(b"BEEFCAKE========", b"[\x9c\xf6*\x8e") + assertExcessPadding(b"BEEFCAKE=========", b"[\x9c\xf6*\x8e") + + assertLeadingPadding(b"=", b"") + assertLeadingPadding(b"==", b"") + assertLeadingPadding(b"===", b"") + assertLeadingPadding(b"====", b"") + assertLeadingPadding(b"=====", b"") + assertLeadingPadding(b"======", b"") + assertLeadingPadding(b"=======", b"") + assertLeadingPadding(b"========", b"") + assertLeadingPadding(b"=========", b"") + assertLeadingPadding(b"=BEEFCAKE", b"[\x9c\xf6*\x8e") + assertLeadingPadding(b"==BEEFCAKE", b"[\x9c\xf6*\x8e") + assertLeadingPadding(b"===BEEFCAKE", b"[\x9c\xf6*\x8e") + assertLeadingPadding(b"====BEEFCAKE", b"[\x9c\xf6*\x8e") + assertLeadingPadding(b"=====BEEFCAKE", b"[\x9c\xf6*\x8e") + assertLeadingPadding(b"======BEEFCAKE", b"[\x9c\xf6*\x8e") + assertLeadingPadding(b"=======BEEFCAKE", b"[\x9c\xf6*\x8e") + assertLeadingPadding(b"========BEEFCAKE", b"[\x9c\xf6*\x8e") + assertLeadingPadding(b"=========BEEFCAKE", b"[\x9c\xf6*\x8e") + + assertIncorrectPadding(b"A") + assertIncorrectPadding(b"AB") + assertIncorrectPadding(b"ABC") + 
assertIncorrectPadding(b"ABCD") + assertIncorrectPadding(b"ABCDE") + assertIncorrectPadding(b"ABCDEF") + assertIncorrectPadding(b"ABCDEFG") + + assertIncorrectPadding(b"BE=", b"[") + assertIncorrectPadding(b"BE==", b"[") + assertIncorrectPadding(b"BE===", b"[") + assertIncorrectPadding(b"BE====", b"[") + assertIncorrectPadding(b"BE=====", b"[") + assertIncorrectPadding(b"BEEF=", b"[\x9c") + assertIncorrectPadding(b"BEEF==", b"[\x9c") + assertIncorrectPadding(b"BEEF===", b"[\x9c") + assertIncorrectPadding(b"BEEFC=", b"[\x9c\xf6") + assertIncorrectPadding(b"BEEFC==", b"[\x9c\xf6") + + assertDiscontinuousPadding(b"BE=EF===", b"[\x9c") + assertDiscontinuousPadding(b"BE==EF==", b"[\x9c") + assertDiscontinuousPadding(b"BEEF=C==", b"[\x9c\xf6") + assertDiscontinuousPadding(b"BEEFC=AK", b"[\x9c\xf6*") + + assertInvalidLength(b"A=") + assertInvalidLength(b"A==") + assertInvalidLength(b"A===") + assertInvalidLength(b"A====") + assertInvalidLength(b"A=====") + assertInvalidLength(b"A======") + assertInvalidLength(b"ABC=") + assertInvalidLength(b"ABC==") + assertInvalidLength(b"ABC===") + assertInvalidLength(b"ABC====") + assertInvalidLength(b"ABCDEF=") + + assertInvalidLength(b"B=E=====", b"[") + assertInvalidLength(b"B==E====", b"[") + assertInvalidLength(b"BEE=F===", b"[\x9c") + assertInvalidLength(b"BEE==F==", b"[\x9c") + assertInvalidLength(b"BEEFCA=K", b"[\x9c\xf6*") + assertInvalidLength(b"BEEFCA=====K", b"[\x9c\xf6*") + def test_uu(self): MAX_UU = 45 for backtick in (True, False): diff --git a/Misc/NEWS.d/next/Core_and_Builtins/2026-03-20-00-39-25.gh-issue-146192.8aQ6sC.rst b/Misc/NEWS.d/next/Core_and_Builtins/2026-03-20-00-39-25.gh-issue-146192.8aQ6sC.rst new file mode 100644 index 00000000000000..a27639d2908651 --- /dev/null +++ b/Misc/NEWS.d/next/Core_and_Builtins/2026-03-20-00-39-25.gh-issue-146192.8aQ6sC.rst @@ -0,0 +1,2 @@ +Add base32 support to :mod:`binascii` and improve the performance of the +base-32 converters in :mod:`base64`. Patch by James Seo. 
diff --git a/Modules/binascii.c b/Modules/binascii.c index f85f32b32e962c..241aeac400063e 100644 --- a/Modules/binascii.c +++ b/Modules/binascii.c @@ -244,6 +244,152 @@ static const unsigned char table_b2a_base85_a85[] Py_ALIGNED(64) = #define BASE85_A85_Z 0x00000000 #define BASE85_A85_Y 0x20202020 + +static const unsigned char table_a2b_base32[] Py_ALIGNED(64) = { + -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, + -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, + -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, + -1,-1,26,27, 28,29,30,31, -1,-1,-1,-1, -1,-1,-1,-1, + -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10, 11,12,13,14, + 15,16,17,18, 19,20,21,22, 23,24,25,-1, -1,-1,-1,-1, + -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, + -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, + + -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, + -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, + -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, + -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, + -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, + -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, + -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, + -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, +}; + +static const unsigned char table_a2b_base32hex[] Py_ALIGNED(64) = { + -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, + -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, + -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,-1,-1, -1,-1,-1,-1, + -1,10,11,12, 13,14,15,16, 17,18,19,20, 21,22,23,24, + 25,26,27,28, 29,30,31,-1, -1,-1,-1,-1, -1,-1,-1,-1, + -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, + -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, + + -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, + -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, + -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, + -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, + -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, 
-1,-1,-1,-1, + -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, + -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, + -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, +}; + +static const unsigned char table_b2a_base32[] Py_ALIGNED(64) = + "ABCDEFGHIJKLMNOPQRSTUVWXYZ234567"; + +static const unsigned char table_b2a_base32hex[] Py_ALIGNED(64) = + "0123456789ABCDEFGHIJKLMNOPQRSTUV"; + +#define BASE32_PAD '=' + +/* + * Fast base32 encoding/decoding helpers. + * + * Analogous to the helpers for base64. + */ + +/* Encode 5 bytes into 8 base32 characters. */ +static inline void +base32_encode_quintet(const unsigned char *in, unsigned char *out, + const unsigned char table[]) +{ + uint64_t combined = ((uint64_t)in[0] << 32) | + ((uint64_t)in[1] << 24) | + ((uint64_t)in[2] << 16) | + ((uint64_t)in[3] << 8) | + (uint64_t)in[4]; + out[0] = table[(combined >> 35) & 0x1f]; + out[1] = table[(combined >> 30) & 0x1f]; + out[2] = table[(combined >> 25) & 0x1f]; + out[3] = table[(combined >> 20) & 0x1f]; + out[4] = table[(combined >> 15) & 0x1f]; + out[5] = table[(combined >> 10) & 0x1f]; + out[6] = table[(combined >> 5) & 0x1f]; + out[7] = table[combined & 0x1f]; +} + +/* + * Encode multiple complete 5-byte groups. + * Returns the number of input bytes processed (always a multiple of 5). + */ +static inline Py_ssize_t +base32_encode_fast(const unsigned char *in, Py_ssize_t in_len, + unsigned char *out, const unsigned char table[]) +{ + Py_ssize_t n_quintets = in_len / 5; + const unsigned char *in_end = in + n_quintets * 5; + + while (in < in_end) { + base32_encode_quintet(in, out, table); + in += 5; + out += 8; + } + + return n_quintets * 5; +} + +/* + * Decode 8 base32 characters into 5 bytes. + * Returns 1 on success, 0 if any character is invalid. 
+ */ +static inline int +base32_decode_octet(const unsigned char *in, unsigned char *out, + const unsigned char table[]) +{ + unsigned char v0 = table[in[0]]; + unsigned char v1 = table[in[1]]; + unsigned char v2 = table[in[2]]; + unsigned char v3 = table[in[3]]; + unsigned char v4 = table[in[4]]; + unsigned char v5 = table[in[5]]; + unsigned char v6 = table[in[6]]; + unsigned char v7 = table[in[7]]; + + if ((v0 | v1 | v2 | v3 | v4 | v5 | v6 | v7) & 0xe0) { + return 0; + } + + out[0] = (v0 << 3) | (v1 >> 2); + out[1] = (v1 << 6) | (v2 << 1) | (v3 >> 4); + out[2] = (v3 << 4) | (v4 >> 1); + out[3] = (v4 << 7) | (v5 << 2) | (v6 >> 3); + out[4] = (v6 << 5) | v7; + return 1; +} + +/* + * Decode multiple complete 8-character groups (no padding allowed). + * Returns the number of input characters processed. + * Stops at the first invalid character, padding, or incomplete group. + */ +static inline Py_ssize_t +base32_decode_fast(const unsigned char *in, Py_ssize_t in_len, + unsigned char *out, const unsigned char table[]) +{ + Py_ssize_t n_quintets = in_len / 8; + Py_ssize_t i; + + for (i = 0; i < n_quintets; i++) { + if (!base32_decode_octet(in + i * 8, out + i * 5, table)) { + break; + } + } + + return i * 8; +} + + static const unsigned short crctab_hqx[256] = { 0x0000, 0x1021, 0x2042, 0x3063, 0x4084, 0x50a5, 0x60c6, 0x70e7, 0x8108, 0x9129, 0xa14a, 0xb16b, 0xc18c, 0xd1ad, 0xe1ce, 0xf1ef, @@ -1367,6 +1513,311 @@ binascii_b2a_base85_impl(PyObject *module, Py_buffer *data, int pad, return PyBytesWriter_FinishWithPointer(writer, ascii_data); } +static PyObject * +base32_decode_impl(PyObject *module, Py_buffer *data, + const unsigned char table_a2b[], const char *name) +{ + const unsigned char *ascii_data = data->buf; + Py_ssize_t ascii_len = data->len; + binascii_state *state = NULL; + + assert(ascii_len >= 0); + + /* Allocate output buffer. 
*/ + size_t bin_len = ((size_t)ascii_len + 7) / 8 * 5; + PyBytesWriter *writer = PyBytesWriter_Create(bin_len); + if (writer == NULL) { + return NULL; + } + unsigned char *bin_data = PyBytesWriter_GetData(writer); + + /* + * Fast path: use optimized decoder for complete octets (groups of 8 bytes). + * The fast path stops at padding, invalid chars, or incomplete octets. + */ + if (ascii_len >= 8) { + Py_ssize_t fast_chars = base32_decode_fast(ascii_data, ascii_len, + bin_data, table_a2b); + if (fast_chars > 0) { + ascii_data += fast_chars; + ascii_len -= fast_chars; + bin_data += (fast_chars / 8) * 5; + } + } + + /* Slow path: handle remaining input (padding, invalid chars, incomplete octets). */ + unsigned char leftchar = 0; + int octet_pos = 0; + int pads = 0; + for (; ascii_len; ascii_len--, ascii_data++) { + unsigned char this_ch = *ascii_data; + + /* Check for pad sequences. They may only occur at certain positions. */ + if (this_ch == BASE32_PAD) { + pads++; + + if ((octet_pos == 2 || octet_pos == 4 + || octet_pos == 5 || octet_pos == 7) + && octet_pos + pads <= 8) + { + continue; + } + + state = get_binascii_state(module); + if (state) { + if (octet_pos == 1 || octet_pos == 3 || octet_pos == 6) { + const unsigned char *ascii_data_start = data->buf; + PyErr_Format(state->Error, + "Invalid %s-encoded string: " + "number of data characters (%zd) " + "cannot be 1, 3, or 6 more " + "than a multiple of 8", + name, (ascii_data - ascii_data_start)); + } + else { + PyErr_SetString(state->Error, + (octet_pos == 0 && ascii_data == data->buf) + ? "Leading padding not allowed" + : "Excess padding not allowed"); + } + } + goto error; + } + + unsigned char v = table_a2b[this_ch]; + if (v >= 32) { + state = get_binascii_state(module); + if (state) { + PyErr_Format(state->Error, "Only %s data is allowed", name); + } + goto error; + } + + /* Data in the middle of/after the padding is not allowed. 
*/ + if (pads) { + state = get_binascii_state(module); + if (state) { + PyErr_SetString(state->Error, (octet_pos + pads == 8) + ? "Excess data after padding" + : "Discontinuous padding not allowed"); + } + goto error; + } + + switch (octet_pos) { + case 0: + octet_pos = 1; + leftchar = v; + break; + case 1: + octet_pos = 2; + *bin_data++ = (leftchar << 3) | (v >> 2); + leftchar = v & 0x03; + break; + case 2: + octet_pos = 3; + leftchar = (leftchar << 5) | v; + break; + case 3: + octet_pos = 4; + *bin_data++ = (leftchar << 1) | (v >> 4); + leftchar = v & 0x0f; + break; + case 4: + octet_pos = 5; + *bin_data++ = (leftchar << 4) | (v >> 1); + leftchar = v & 0x01; + break; + case 5: + octet_pos = 6; + leftchar = (leftchar << 5) | v; + break; + case 6: + octet_pos = 7; + *bin_data++ = (leftchar << 2) | (v >> 3); + leftchar = v & 0x07; + break; + case 7: + octet_pos = 0; + *bin_data++ = (leftchar << 5) | v; + leftchar = 0; + } + } + + if ((octet_pos != 0 && octet_pos + pads != 8) + || (octet_pos == 0 && pads != 0)) + { + state = get_binascii_state(module); + if (state) { + PyErr_SetString(state->Error, "Incorrect padding"); + } + goto error; + } + + return PyBytesWriter_FinishWithPointer(writer, bin_data); + +error: + PyBytesWriter_Discard(writer); + return NULL; +} + +static PyObject * +base32_encode_impl(PyObject *module, Py_buffer *data, + const unsigned char table_b2a[], const char *name) +{ + const unsigned char *bin_data = data->buf; + Py_ssize_t bin_len = data->len; + binascii_state *state = NULL; + + assert(bin_len >= 0); + + /* + * Each group of 5 bytes (rounded up) gets encoded as 8 characters. + * Use unsigned integer arithmetic to avoid signed integer overflow. 
+ */ + size_t ascii_len = ((size_t)bin_len + 4u) / 5u * 8u; + if (ascii_len > PY_SSIZE_T_MAX) { + state = get_binascii_state(module); + if (state) { + PyErr_Format(state->Error, "Too much data for %s", name); + } + return NULL; + } + PyBytesWriter *writer = PyBytesWriter_Create(ascii_len); + if (writer == NULL) { + return NULL; + } + unsigned char *ascii_data = PyBytesWriter_GetData(writer); + + /* Use the optimized fast path for complete 5-byte groups. */ + Py_ssize_t fast_bytes = base32_encode_fast(bin_data, bin_len, ascii_data, + table_b2a); + bin_data += fast_bytes; + ascii_data += (fast_bytes / 5) * 8; + bin_len -= fast_bytes; + + /* Handle the remaining 0-4 bytes. */ + if (bin_len == 1) { + /* 1 byte remaining: produces 2 encoded + 6 padding chars. */ + uint32_t val = bin_data[0]; + *ascii_data++ = table_b2a[(val >> 3) & 0x1f]; + *ascii_data++ = table_b2a[(val << 2) & 0x1f]; + *ascii_data++ = BASE32_PAD; + *ascii_data++ = BASE32_PAD; + *ascii_data++ = BASE32_PAD; + *ascii_data++ = BASE32_PAD; + *ascii_data++ = BASE32_PAD; + *ascii_data++ = BASE32_PAD; + } + else if (bin_len == 2) { + /* 2 bytes remaining: produces 4 encoded + 4 padding chars. */ + uint32_t val = ((uint32_t)bin_data[0] << 8) | bin_data[1]; + *ascii_data++ = table_b2a[(val >> 11) & 0x1f]; + *ascii_data++ = table_b2a[(val >> 6) & 0x1f]; + *ascii_data++ = table_b2a[(val >> 1) & 0x1f]; + *ascii_data++ = table_b2a[(val << 4) & 0x1f]; + *ascii_data++ = BASE32_PAD; + *ascii_data++ = BASE32_PAD; + *ascii_data++ = BASE32_PAD; + *ascii_data++ = BASE32_PAD; + } + else if (bin_len == 3) { + /* 3 bytes remaining: produces 5 encoded + 3 padding chars. 
*/ + uint32_t val = ((uint32_t)bin_data[0] << 16) + | ((uint32_t)bin_data[1] << 8) + | bin_data[2]; + *ascii_data++ = table_b2a[(val >> 19) & 0x1f]; + *ascii_data++ = table_b2a[(val >> 14) & 0x1f]; + *ascii_data++ = table_b2a[(val >> 9) & 0x1f]; + *ascii_data++ = table_b2a[(val >> 4) & 0x1f]; + *ascii_data++ = table_b2a[(val << 1) & 0x1f]; + *ascii_data++ = BASE32_PAD; + *ascii_data++ = BASE32_PAD; + *ascii_data++ = BASE32_PAD; + } + else if (bin_len == 4) { + /* 4 bytes remaining: produces 7 encoded + 1 padding chars. */ + uint32_t val = ((uint32_t)bin_data[0] << 24) + | ((uint32_t)bin_data[1] << 16) + | ((uint32_t)bin_data[2] << 8) + | bin_data[3]; + *ascii_data++ = table_b2a[(val >> 27) & 0x1f]; + *ascii_data++ = table_b2a[(val >> 22) & 0x1f]; + *ascii_data++ = table_b2a[(val >> 17) & 0x1f]; + *ascii_data++ = table_b2a[(val >> 12) & 0x1f]; + *ascii_data++ = table_b2a[(val >> 7) & 0x1f]; + *ascii_data++ = table_b2a[(val >> 2) & 0x1f]; + *ascii_data++ = table_b2a[(val << 3) & 0x1f]; + *ascii_data++ = BASE32_PAD; + } + + return PyBytesWriter_FinishWithPointer(writer, ascii_data); +} + +/*[clinic input] +binascii.a2b_base32 + + data: ascii_buffer + / + +Decode a line of base32 data. +[clinic start generated code]*/ + +static PyObject * +binascii_a2b_base32_impl(PyObject *module, Py_buffer *data) +/*[clinic end generated code: output=978d91ce9fadedf9 input=9137b28791447ce7]*/ +{ + return base32_decode_impl(module, data, table_a2b_base32, "base32"); +} + +/*[clinic input] +binascii.b2a_base32 + + data: Py_buffer + / + +base32-code line of data. +[clinic start generated code]*/ + +static PyObject * +binascii_b2a_base32_impl(PyObject *module, Py_buffer *data) +/*[clinic end generated code: output=c44b684c550a24cc input=0c6cbb86d32086f5]*/ +{ + return base32_encode_impl(module, data, table_b2a_base32, "base32"); +} + +/*[clinic input] +binascii.a2b_base32hex + + data: ascii_buffer + / + +Decode a line of base32hex data. 
+[clinic start generated code]*/ + +static PyObject * +binascii_a2b_base32hex_impl(PyObject *module, Py_buffer *data) +/*[clinic end generated code: output=29133f84416e93cf input=178fe8e8fb212206]*/ +{ + return base32_decode_impl(module, data, table_a2b_base32hex, "base32hex"); +} + +/*[clinic input] +binascii.b2a_base32hex + + data: Py_buffer + / + +base32hex-code line of data. +[clinic start generated code]*/ + +static PyObject * +binascii_b2a_base32hex_impl(PyObject *module, Py_buffer *data) +/*[clinic end generated code: output=8ab2f6742ed918cb input=01108fc686630e91]*/ +{ + return base32_encode_impl(module, data, table_b2a_base32hex, "base32hex"); +} + /*[clinic input] binascii.crc_hqx @@ -2028,6 +2479,10 @@ static struct PyMethodDef binascii_module_methods[] = { BINASCII_A2B_ASCII85_METHODDEF BINASCII_A2B_BASE85_METHODDEF BINASCII_B2A_BASE85_METHODDEF + BINASCII_A2B_BASE32_METHODDEF + BINASCII_B2A_BASE32_METHODDEF + BINASCII_A2B_BASE32HEX_METHODDEF + BINASCII_B2A_BASE32HEX_METHODDEF BINASCII_A2B_HEX_METHODDEF BINASCII_B2A_HEX_METHODDEF BINASCII_HEXLIFY_METHODDEF diff --git a/Modules/clinic/binascii.c.h b/Modules/clinic/binascii.c.h index 2fdecc2efbf9d4..ae6bd7813f89e7 100644 --- a/Modules/clinic/binascii.c.h +++ b/Modules/clinic/binascii.c.h @@ -711,6 +711,132 @@ binascii_b2a_base85(PyObject *module, PyObject *const *args, Py_ssize_t nargs, P return return_value; } +PyDoc_STRVAR(binascii_a2b_base32__doc__, +"a2b_base32($module, data, /)\n" +"--\n" +"\n" +"Decode a line of base32 data."); + +#define BINASCII_A2B_BASE32_METHODDEF \ + {"a2b_base32", (PyCFunction)binascii_a2b_base32, METH_O, binascii_a2b_base32__doc__}, + +static PyObject * +binascii_a2b_base32_impl(PyObject *module, Py_buffer *data); + +static PyObject * +binascii_a2b_base32(PyObject *module, PyObject *arg) +{ + PyObject *return_value = NULL; + Py_buffer data = {NULL, NULL}; + + if (!ascii_buffer_converter(arg, &data)) { + goto exit; + } + return_value = binascii_a2b_base32_impl(module, &data); 
+ +exit: + /* Cleanup for data */ + if (data.obj) + PyBuffer_Release(&data); + + return return_value; +} + +PyDoc_STRVAR(binascii_b2a_base32__doc__, +"b2a_base32($module, data, /)\n" +"--\n" +"\n" +"base32-code line of data."); + +#define BINASCII_B2A_BASE32_METHODDEF \ + {"b2a_base32", (PyCFunction)binascii_b2a_base32, METH_O, binascii_b2a_base32__doc__}, + +static PyObject * +binascii_b2a_base32_impl(PyObject *module, Py_buffer *data); + +static PyObject * +binascii_b2a_base32(PyObject *module, PyObject *arg) +{ + PyObject *return_value = NULL; + Py_buffer data = {NULL, NULL}; + + if (PyObject_GetBuffer(arg, &data, PyBUF_SIMPLE) != 0) { + goto exit; + } + return_value = binascii_b2a_base32_impl(module, &data); + +exit: + /* Cleanup for data */ + if (data.obj) { + PyBuffer_Release(&data); + } + + return return_value; +} + +PyDoc_STRVAR(binascii_a2b_base32hex__doc__, +"a2b_base32hex($module, data, /)\n" +"--\n" +"\n" +"Decode a line of base32hex data."); + +#define BINASCII_A2B_BASE32HEX_METHODDEF \ + {"a2b_base32hex", (PyCFunction)binascii_a2b_base32hex, METH_O, binascii_a2b_base32hex__doc__}, + +static PyObject * +binascii_a2b_base32hex_impl(PyObject *module, Py_buffer *data); + +static PyObject * +binascii_a2b_base32hex(PyObject *module, PyObject *arg) +{ + PyObject *return_value = NULL; + Py_buffer data = {NULL, NULL}; + + if (!ascii_buffer_converter(arg, &data)) { + goto exit; + } + return_value = binascii_a2b_base32hex_impl(module, &data); + +exit: + /* Cleanup for data */ + if (data.obj) + PyBuffer_Release(&data); + + return return_value; +} + +PyDoc_STRVAR(binascii_b2a_base32hex__doc__, +"b2a_base32hex($module, data, /)\n" +"--\n" +"\n" +"base32hex-code line of data."); + +#define BINASCII_B2A_BASE32HEX_METHODDEF \ + {"b2a_base32hex", (PyCFunction)binascii_b2a_base32hex, METH_O, binascii_b2a_base32hex__doc__}, + +static PyObject * +binascii_b2a_base32hex_impl(PyObject *module, Py_buffer *data); + +static PyObject * +binascii_b2a_base32hex(PyObject *module, 
PyObject *arg) +{ + PyObject *return_value = NULL; + Py_buffer data = {NULL, NULL}; + + if (PyObject_GetBuffer(arg, &data, PyBUF_SIMPLE) != 0) { + goto exit; + } + return_value = binascii_b2a_base32hex_impl(module, &data); + +exit: + /* Cleanup for data */ + if (data.obj) { + PyBuffer_Release(&data); + } + + return return_value; +} + PyDoc_STRVAR(binascii_crc_hqx__doc__, "crc_hqx($module, data, crc, /)\n" "--\n" @@ -1256,4 +1382,4 @@ binascii_b2a_qp(PyObject *module, PyObject *const *args, Py_ssize_t nargs, PyObj return return_value; } -/*[clinic end generated code: output=84c97096b0fb3819 input=a9049054013a1b77]*/ +/*[clinic end generated code: output=bafd226511187580 input=a9049054013a1b77]*/ From bf1308f1f1139fe99411908dc575c17f18d983fa Mon Sep 17 00:00:00 2001 From: James Seo Date: Fri, 20 Mar 2026 08:54:02 -0700 Subject: [PATCH 2/4] Update PR for #145981 - Use the new `alphabet` parameter in `binascii` - Remove `binascii.a2b_base32hex()` and `binascii.b2a_base32hex()` - Change value for `.. versionadded::` ReST directive in docs for new `binascii` functions to "next" instead of "3.15" --- Doc/library/binascii.rst | 46 +++++----- Lib/base64.py | 4 +- Lib/test/test_binascii.py | 174 ++++++------------------------------ Modules/binascii.c | 162 ++++++++++++++------------------- Modules/clinic/binascii.c.h | 171 ++++++++++++++++++++--------------- 5 files changed, 213 insertions(+), 344 deletions(-) diff --git a/Doc/library/binascii.rst b/Doc/library/binascii.rst index 9137b7203698df..3facb139e17d43 100644 --- a/Doc/library/binascii.rst +++ b/Doc/library/binascii.rst @@ -182,7 +182,7 @@ The :mod:`!binascii` module defines the following functions: .. versionadded:: 3.15 -.. function:: a2b_base32(string, /) +.. function:: a2b_base32(string, /, *, alphabet=BASE32_ALPHABET) Convert base32 data back to binary and return the binary data. 
@@ -193,38 +193,22 @@ The :mod:`!binascii` module defines the following functions: * Contains no excess data after padding (including excess padding, newlines, etc.). * Does not start with padding. + Optional *alphabet* must be a :class:`bytes` object of length 32 which + specifies an alternative alphabet. + Invalid base32 data will raise :exc:`binascii.Error`. - .. versionadded:: 3.15 + .. versionadded:: next -.. function:: b2a_base32(data, /) +.. function:: b2a_base32(data, /, *, alphabet=BASE32_ALPHABET) Convert binary data to a line(s) of ASCII characters in base32 coding, as specified in :rfc:`4648`. The return value is the converted line. - .. versionadded:: 3.15 - -.. function:: a2b_base32hex(string, /) - - Convert base32hex data back to binary and return the binary data. - - Valid base32hex: - - * Conforms to :rfc:`4648`. - * Contains only characters from the base32hex alphabet. - * Contains no excess data after padding (including excess padding, newlines, etc.). - * Does not start with padding. - - Invalid base32hex data will raise :exc:`binascii.Error`. - - .. versionadded:: 3.15 - -.. function:: b2a_base32hex(data, /) - - Convert binary data to a line(s) of ASCII characters in base32hex coding, - as specified in :rfc:`4648`. The return value is the converted line. + Optional *alphabet* must be a :term:`bytes-like object` of length 32 which + specifies an alternative alphabet. - .. versionadded:: 3.15 + .. versionadded:: next .. function:: a2b_qp(data, header=False) @@ -370,6 +354,18 @@ The :mod:`!binascii` module defines the following functions: .. versionadded:: next +.. data:: BASE32_ALPHABET + + The base32 alphabet according to :rfc:`4648`. + + .. versionadded:: next + +.. data:: BASE32HEX_ALPHABET + + The "Extended Hex" base32hex alphabet according to :rfc:`4648`. + + .. versionadded:: next + .. 
seealso:: diff --git a/Lib/base64.py b/Lib/base64.py index 576d429522ba31..8c88add3c2595b 100644 --- a/Lib/base64.py +++ b/Lib/base64.py @@ -234,13 +234,13 @@ def b32decode(s, casefold=False, map01=None): extra_args=_B32_DECODE_MAP01_DOCSTRING) def b32hexencode(s): - return binascii.b2a_base32hex(s) + return binascii.b2a_base32(s, alphabet=binascii.BASE32HEX_ALPHABET) b32hexencode.__doc__ = _B32_ENCODE_DOCSTRING.format(encoding='base32hex') def b32hexdecode(s, casefold=False): # base32hex does not have the 01 mapping s = _b32decode_prepare(s, casefold) - return binascii.a2b_base32hex(s) + return binascii.a2b_base32(s, alphabet=binascii.BASE32HEX_ALPHABET) b32hexdecode.__doc__ = _B32_DECODE_DOCSTRING.format(encoding='base32hex', extra_args='') diff --git a/Lib/test/test_binascii.py b/Lib/test/test_binascii.py index 3ac468d636d203..7fedd8e1a8b3ca 100644 --- a/Lib/test/test_binascii.py +++ b/Lib/test/test_binascii.py @@ -10,10 +10,10 @@ # Note: "*_hex" functions are aliases for "(un)hexlify" -b2a_functions = ['b2a_ascii85', 'b2a_base32', 'b2a_base32hex', 'b2a_base64', 'b2a_base85', +b2a_functions = ['b2a_ascii85', 'b2a_base32', 'b2a_base64', 'b2a_base85', 'b2a_hex', 'b2a_qp', 'b2a_uu', 'hexlify'] -a2b_functions = ['a2b_ascii85', 'a2b_base32', 'a2b_base32hex', 'a2b_base64', 'a2b_base85', +a2b_functions = ['a2b_ascii85', 'a2b_base32', 'a2b_base64', 'a2b_base85', 'a2b_hex', 'a2b_qp', 'a2b_uu', 'unhexlify'] all_functions = a2b_functions + b2a_functions + ['crc32', 'crc_hqx'] @@ -820,155 +820,31 @@ def assertInvalidLength(*args): assertInvalidLength(b"BEEFCA=K", b"\t\x08Q\x01") assertInvalidLength(b"BEEFCA=====K", b"\t\x08Q\x01") - def test_base32hex_valid(self): - # Test base32hex with valid data - lines = [] - step = 0 - i = 0 - while i < len(self.rawdata): - b = self.type2test(self.rawdata[i:i + step]) - a = binascii.b2a_base32hex(b) - lines.append(a) - i += step - step += 1 - res = bytes() - for line in lines: - a = self.type2test(line) - b = binascii.a2b_base32hex(a) 
- res += b - self.assertEqual(res, self.rawdata) - - def test_base32hex_errors(self): - def _fixPadding(data): - fixed = data.replace(b"=", b"") - len_8 = len(fixed) % 8 - p = 8 - len_8 if len_8 else 0 - return fixed + b"=" * p - - def _assertRegexTemplate(assert_regex, data, good_padding_result=None): - with self.assertRaisesRegex(binascii.Error, assert_regex): - binascii.a2b_base32hex(self.type2test(data)) - if good_padding_result: - fixed = self.type2test(_fixPadding(data)) - self.assertEqual(binascii.a2b_base32hex(fixed), good_padding_result) - - def assertNonBase32HexData(*args): - _assertRegexTemplate(r"(?i)Only base32hex data", *args) - - def assertExcessData(*args): - _assertRegexTemplate(r"(?i)Excess data", *args) - - def assertExcessPadding(*args): - _assertRegexTemplate(r"(?i)Excess padding", *args) - - def assertLeadingPadding(*args): - _assertRegexTemplate(r"(?i)Leading padding", *args) - - def assertIncorrectPadding(*args): - _assertRegexTemplate(r"(?i)Incorrect padding", *args) - - def assertDiscontinuousPadding(*args): - _assertRegexTemplate(r"(?i)Discontinuous padding", *args) - - def assertInvalidLength(*args): - _assertRegexTemplate(r"(?i)Invalid.+number of data characters", *args) - - assertNonBase32HexData(b"a") - assertNonBase32HexData(b"AA-") - assertNonBase32HexData(b"ABCDE==!") - assertNonBase32HexData(b"ab:(){:|:&};:==") - - assertExcessData(b"AB======C") - assertExcessData(b"AB======CD") - assertExcessData(b"ABCD====E") - assertExcessData(b"ABCDE===FGH") - assertExcessData(b"ABCDEFG=H") - assertExcessData(b"4321====55555555") - - assertExcessData(b"BE======EF", b"[\x9c") - assertExcessData(b"BEEF====C", b"[\x9c\xf6") - assertExcessData(b"BEEFC===AK", b"[\x9c\xf6*") - assertExcessData(b"BEEFCAK=E", b"[\x9c\xf6*\x8e") - - assertExcessPadding(b"BE=======", b"[") - assertExcessPadding(b"BE========", b"[") - assertExcessPadding(b"BEEF=====", b"[\x9c") - assertExcessPadding(b"BEEF======", b"[\x9c") - assertExcessPadding(b"BEEFC====", 
b"[\x9c\xf6") - assertExcessPadding(b"BEEFC=====", b"[\x9c\xf6") - assertExcessPadding(b"BEEFCAK==", b"[\x9c\xf6*") - assertExcessPadding(b"BEEFCAK===", b"[\x9c\xf6*") - assertExcessPadding(b"BEEFCAKE=", b"[\x9c\xf6*\x8e") - assertExcessPadding(b"BEEFCAKE==", b"[\x9c\xf6*\x8e") - assertExcessPadding(b"BEEFCAKE===", b"[\x9c\xf6*\x8e") - assertExcessPadding(b"BEEFCAKE====", b"[\x9c\xf6*\x8e") - assertExcessPadding(b"BEEFCAKE=====", b"[\x9c\xf6*\x8e") - assertExcessPadding(b"BEEFCAKE======", b"[\x9c\xf6*\x8e") - assertExcessPadding(b"BEEFCAKE=======", b"[\x9c\xf6*\x8e") - assertExcessPadding(b"BEEFCAKE========", b"[\x9c\xf6*\x8e") - assertExcessPadding(b"BEEFCAKE=========", b"[\x9c\xf6*\x8e") - - assertLeadingPadding(b"=", b"") - assertLeadingPadding(b"==", b"") - assertLeadingPadding(b"===", b"") - assertLeadingPadding(b"====", b"") - assertLeadingPadding(b"=====", b"") - assertLeadingPadding(b"======", b"") - assertLeadingPadding(b"=======", b"") - assertLeadingPadding(b"========", b"") - assertLeadingPadding(b"=========", b"") - assertLeadingPadding(b"=BEEFCAKE", b"[\x9c\xf6*\x8e") - assertLeadingPadding(b"==BEEFCAKE", b"[\x9c\xf6*\x8e") - assertLeadingPadding(b"===BEEFCAKE", b"[\x9c\xf6*\x8e") - assertLeadingPadding(b"====BEEFCAKE", b"[\x9c\xf6*\x8e") - assertLeadingPadding(b"=====BEEFCAKE", b"[\x9c\xf6*\x8e") - assertLeadingPadding(b"======BEEFCAKE", b"[\x9c\xf6*\x8e") - assertLeadingPadding(b"=======BEEFCAKE", b"[\x9c\xf6*\x8e") - assertLeadingPadding(b"========BEEFCAKE", b"[\x9c\xf6*\x8e") - assertLeadingPadding(b"=========BEEFCAKE", b"[\x9c\xf6*\x8e") - - assertIncorrectPadding(b"A") - assertIncorrectPadding(b"AB") - assertIncorrectPadding(b"ABC") - assertIncorrectPadding(b"ABCD") - assertIncorrectPadding(b"ABCDE") - assertIncorrectPadding(b"ABCDEF") - assertIncorrectPadding(b"ABCDEFG") - - assertIncorrectPadding(b"BE=", b"[") - assertIncorrectPadding(b"BE==", b"[") - assertIncorrectPadding(b"BE===", b"[") - assertIncorrectPadding(b"BE====", b"[") - 
assertIncorrectPadding(b"BE=====", b"[") - assertIncorrectPadding(b"BEEF=", b"[\x9c") - assertIncorrectPadding(b"BEEF==", b"[\x9c") - assertIncorrectPadding(b"BEEF===", b"[\x9c") - assertIncorrectPadding(b"BEEFC=", b"[\x9c\xf6") - assertIncorrectPadding(b"BEEFC==", b"[\x9c\xf6") - - assertDiscontinuousPadding(b"BE=EF===", b"[\x9c") - assertDiscontinuousPadding(b"BE==EF==", b"[\x9c") - assertDiscontinuousPadding(b"BEEF=C==", b"[\x9c\xf6") - assertDiscontinuousPadding(b"BEEFC=AK", b"[\x9c\xf6*") + def test_base32_alphabet(self): + alphabet = b'0Aa1Bb2Cc3Dd4Ee5Ff6Gg7Hh8Ii9JjKk' + data = self.type2test(self.rawdata) + encoded = binascii.b2a_base32(data, alphabet=alphabet) + trans = bytes.maketrans(binascii.BASE32_ALPHABET, alphabet) + expected = binascii.b2a_base32(data).translate(trans) + self.assertEqual(encoded, expected) + self.assertEqual(binascii.a2b_base32(encoded, alphabet=alphabet), self.rawdata) + self.assertEqual(binascii.b2a_base32(data, alphabet=self.type2test(alphabet)), expected) - assertInvalidLength(b"A=") - assertInvalidLength(b"A==") - assertInvalidLength(b"A===") - assertInvalidLength(b"A====") - assertInvalidLength(b"A=====") - assertInvalidLength(b"A======") - assertInvalidLength(b"ABC=") - assertInvalidLength(b"ABC==") - assertInvalidLength(b"ABC===") - assertInvalidLength(b"ABC====") - assertInvalidLength(b"ABCDEF=") + data = self.type2test(b'') + self.assertEqual(binascii.b2a_base32(data, alphabet=alphabet), b'') + self.assertEqual(binascii.a2b_base32(data, alphabet=alphabet), b'') - assertInvalidLength(b"B=E=====", b"[") - assertInvalidLength(b"B==E====", b"[") - assertInvalidLength(b"BEE=F===", b"[\x9c") - assertInvalidLength(b"BEE==F==", b"[\x9c") - assertInvalidLength(b"BEEFCA=K", b"[\x9c\xf6*") - assertInvalidLength(b"BEEFCA=====K", b"[\x9c\xf6*") + for func in binascii.b2a_base32, binascii.a2b_base32: + with self.assertRaises(TypeError): + func(data, alphabet=None) + with self.assertRaises(TypeError): + func(data, 
alphabet=alphabet.decode()) + with self.assertRaises(ValueError): + func(data, alphabet=alphabet[:-1]) + with self.assertRaises(ValueError): + func(data, alphabet=alphabet+b'?') + with self.assertRaises(TypeError): + binascii.a2b_base32(data, alphabet=bytearray(alphabet)) def test_uu(self): MAX_UU = 45 diff --git a/Modules/binascii.c b/Modules/binascii.c index 241aeac400063e..44d7986b6c0415 100644 --- a/Modules/binascii.c +++ b/Modules/binascii.c @@ -265,32 +265,9 @@ static const unsigned char table_a2b_base32[] Py_ALIGNED(64) = { -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, }; -static const unsigned char table_a2b_base32hex[] Py_ALIGNED(64) = { - -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, - -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, - -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,-1,-1, -1,-1,-1,-1, - -1,10,11,12, 13,14,15,16, 17,18,19,20, 21,22,23,24, - 25,26,27,28, 29,30,31,-1, -1,-1,-1,-1, -1,-1,-1,-1, - -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, - -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, - - -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, - -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, - -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, - -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, - -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, - -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, - -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, - -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -}; - static const unsigned char table_b2a_base32[] Py_ALIGNED(64) = "ABCDEFGHIJKLMNOPQRSTUVWXYZ234567"; -static const unsigned char table_b2a_base32hex[] Py_ALIGNED(64) = - "0123456789ABCDEFGHIJKLMNOPQRSTUV"; - #define BASE32_PAD '=' /* @@ -1513,20 +1490,44 @@ binascii_b2a_base85_impl(PyObject *module, Py_buffer *data, int pad, return PyBytesWriter_FinishWithPointer(writer, ascii_data); } +/*[clinic input] +binascii.a2b_base32 + + data: ascii_buffer + / + * + alphabet: 
PyBytesObject(c_default="NULL") = BASE32_ALPHABET + +Decode a line of base32 data. +[clinic start generated code]*/ + static PyObject * -base32_decode_impl(PyObject *module, Py_buffer *data, - const unsigned char table_a2b[], const char *name) +binascii_a2b_base32_impl(PyObject *module, Py_buffer *data, + PyBytesObject *alphabet) +/*[clinic end generated code: output=12cb58bf547237e2 input=426055ea49ac147e]*/ { const unsigned char *ascii_data = data->buf; Py_ssize_t ascii_len = data->len; binascii_state *state = NULL; + PyObject *table_obj = NULL; + const unsigned char *table_a2b = table_a2b_base32; assert(ascii_len >= 0); + if (alphabet != NULL) { + state = get_binascii_state(module); + table_obj = get_reverse_table(state, (PyObject *)alphabet, 32, BASE32_PAD); + if (table_obj == NULL) { + return NULL; + } + table_a2b = (const unsigned char *)PyBytes_AS_STRING(table_obj); + } + /* Allocate output buffer. */ size_t bin_len = ((size_t)ascii_len + 7) / 8 * 5; PyBytesWriter *writer = PyBytesWriter_Create(bin_len); if (writer == NULL) { + Py_XDECREF(table_obj); return NULL; } unsigned char *bin_data = PyBytesWriter_GetData(writer); @@ -1568,11 +1569,11 @@ base32_decode_impl(PyObject *module, Py_buffer *data, if (octet_pos == 1 || octet_pos == 3 || octet_pos == 6) { const unsigned char *ascii_data_start = data->buf; PyErr_Format(state->Error, - "Invalid %s-encoded string: " + "Invalid base32-encoded string: " "number of data characters (%zd) " "cannot be 1, 3, or 6 more " "than a multiple of 8", - name, (ascii_data - ascii_data_start)); + ascii_data - ascii_data_start); } else { PyErr_SetString(state->Error, @@ -1588,7 +1589,7 @@ base32_decode_impl(PyObject *module, Py_buffer *data, if (v >= 32) { state = get_binascii_state(module); if (state) { - PyErr_Format(state->Error, "Only %s data is allowed", name); + PyErr_SetString(state->Error, "Only base32 data is allowed"); } goto error; } @@ -1654,23 +1655,46 @@ base32_decode_impl(PyObject *module, Py_buffer *data, goto 
error; } + Py_XDECREF(table_obj); return PyBytesWriter_FinishWithPointer(writer, bin_data); error: PyBytesWriter_Discard(writer); + Py_XDECREF(table_obj); return NULL; } +/*[clinic input] +binascii.b2a_base32 + + data: Py_buffer + / + * + alphabet: Py_buffer(c_default="{NULL, NULL}") = BASE32_ALPHABET + +base32-code line of data. +[clinic start generated code]*/ + static PyObject * -base32_encode_impl(PyObject *module, Py_buffer *data, - const unsigned char table_b2a[], const char *name) +binascii_b2a_base32_impl(PyObject *module, Py_buffer *data, + Py_buffer *alphabet) +/*[clinic end generated code: output=058d0d1aeb014d3b input=ffd4fa162a6e1cb5]*/ { + const unsigned char *table_b2a = table_b2a_base32; const unsigned char *bin_data = data->buf; Py_ssize_t bin_len = data->len; binascii_state *state = NULL; assert(bin_len >= 0); + if (alphabet->buf != NULL) { + if (alphabet->len != 32) { + PyErr_SetString(PyExc_ValueError, "alphabet must have length 32"); + return NULL; + } + table_b2a = alphabet->buf; + } + /* * Each group of 5 bytes (rounded up) gets encoded as 8 characters. * Use unsigned integer arithmetic to avoid signed integer overflow. @@ -1679,7 +1703,7 @@ base32_encode_impl(PyObject *module, Py_buffer *data, if (ascii_len > PY_SSIZE_T_MAX) { state = get_binascii_state(module); if (state) { - PyErr_Format(state->Error, "Too much data for %s", name); + PyErr_SetString(state->Error, "Too much data for base32"); } return NULL; } @@ -1754,70 +1778,6 @@ base32_encode_impl(PyObject *module, Py_buffer *data, return PyBytesWriter_FinishWithPointer(writer, ascii_data); } -/*[clinic input] -binascii.a2b_base32 - - data: ascii_buffer - / - -Decode a line of base32 data. 
-[clinic start generated code]*/ - -static PyObject * -binascii_a2b_base32_impl(PyObject *module, Py_buffer *data) -/*[clinic end generated code: output=978d91ce9fadedf9 input=9137b28791447ce7]*/ -{ - return base32_decode_impl(module, data, table_a2b_base32, "base32"); -} - -/*[clinic input] -binascii.b2a_base32 - - data: Py_buffer - / - -base32-code line of data. -[clinic start generated code]*/ - -static PyObject * -binascii_b2a_base32_impl(PyObject *module, Py_buffer *data) -/*[clinic end generated code: output=c44b684c550a24cc input=0c6cbb86d32086f5]*/ -{ - return base32_encode_impl(module, data, table_b2a_base32, "base32"); -} - -/*[clinic input] -binascii.a2b_base32hex - - data: ascii_buffer - / - -Decode a line of base32hex data. -[clinic start generated code]*/ - -static PyObject * -binascii_a2b_base32hex_impl(PyObject *module, Py_buffer *data) -/*[clinic end generated code: output=29133f84416e93cf input=178fe8e8fb212206]*/ -{ - return base32_decode_impl(module, data, table_a2b_base32hex, "base32hex"); -} - -/*[clinic input] -binascii.b2a_base32hex - - data: Py_buffer - / - -base32hex-code line of data. 
-[clinic start generated code]*/ - -static PyObject * -binascii_b2a_base32hex_impl(PyObject *module, Py_buffer *data) -/*[clinic end generated code: output=8ab2f6742ed918cb input=01108fc686630e91]*/ -{ - return base32_encode_impl(module, data, table_b2a_base32hex, "base32hex"); -} - /*[clinic input] binascii.crc_hqx @@ -2481,8 +2441,6 @@ static struct PyMethodDef binascii_module_methods[] = { BINASCII_B2A_BASE85_METHODDEF BINASCII_A2B_BASE32_METHODDEF BINASCII_B2A_BASE32_METHODDEF - BINASCII_A2B_BASE32HEX_METHODDEF - BINASCII_B2A_BASE32HEX_METHODDEF BINASCII_A2B_HEX_METHODDEF BINASCII_B2A_HEX_METHODDEF BINASCII_HEXLIFY_METHODDEF @@ -2569,6 +2527,16 @@ binascii_exec(PyObject *module) { return -1; } + if (PyModule_Add(module, "BASE32_ALPHABET", + PyBytes_FromStringAndSize((const char *)table_b2a_base32, 32)) < 0) + { + return -1; + } + if (PyModule_Add(module, "BASE32HEX_ALPHABET", + PyBytes_FromString("0123456789ABCDEFGHIJKLMNOPQRSTUV")) < 0) + { + return -1; + } state->reverse_table_cache = PyDict_New(); if (state->reverse_table_cache == NULL) { diff --git a/Modules/clinic/binascii.c.h b/Modules/clinic/binascii.c.h index ae6bd7813f89e7..8057d94a1fb934 100644 --- a/Modules/clinic/binascii.c.h +++ b/Modules/clinic/binascii.c.h @@ -712,27 +712,72 @@ binascii_b2a_base85(PyObject *module, PyObject *const *args, Py_ssize_t nargs, P } PyDoc_STRVAR(binascii_a2b_base32__doc__, -"a2b_base32($module, data, /)\n" +"a2b_base32($module, data, /, *, alphabet=BASE32_ALPHABET)\n" "--\n" "\n" "Decode a line of base32 data."); #define BINASCII_A2B_BASE32_METHODDEF \ - {"a2b_base32", (PyCFunction)binascii_a2b_base32, METH_O, binascii_a2b_base32__doc__}, + {"a2b_base32", _PyCFunction_CAST(binascii_a2b_base32), METH_FASTCALL|METH_KEYWORDS, binascii_a2b_base32__doc__}, static PyObject * -binascii_a2b_base32_impl(PyObject *module, Py_buffer *data); +binascii_a2b_base32_impl(PyObject *module, Py_buffer *data, + PyBytesObject *alphabet); static PyObject * -binascii_a2b_base32(PyObject 
*module, PyObject *arg) +binascii_a2b_base32(PyObject *module, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames) { PyObject *return_value = NULL; + #if defined(Py_BUILD_CORE) && !defined(Py_BUILD_CORE_MODULE) + + #define NUM_KEYWORDS 1 + static struct { + PyGC_Head _this_is_not_used; + PyObject_VAR_HEAD + Py_hash_t ob_hash; + PyObject *ob_item[NUM_KEYWORDS]; + } _kwtuple = { + .ob_base = PyVarObject_HEAD_INIT(&PyTuple_Type, NUM_KEYWORDS) + .ob_hash = -1, + .ob_item = { &_Py_ID(alphabet), }, + }; + #undef NUM_KEYWORDS + #define KWTUPLE (&_kwtuple.ob_base.ob_base) + + #else // !Py_BUILD_CORE + # define KWTUPLE NULL + #endif // !Py_BUILD_CORE + + static const char * const _keywords[] = {"", "alphabet", NULL}; + static _PyArg_Parser _parser = { + .keywords = _keywords, + .fname = "a2b_base32", + .kwtuple = KWTUPLE, + }; + #undef KWTUPLE + PyObject *argsbuf[2]; + Py_ssize_t noptargs = nargs + (kwnames ? PyTuple_GET_SIZE(kwnames) : 0) - 1; Py_buffer data = {NULL, NULL}; + PyBytesObject *alphabet = NULL; - if (!ascii_buffer_converter(arg, &data)) { + args = _PyArg_UnpackKeywords(args, nargs, NULL, kwnames, &_parser, + /*minpos*/ 1, /*maxpos*/ 1, /*minkw*/ 0, /*varpos*/ 0, argsbuf); + if (!args) { + goto exit; + } + if (!ascii_buffer_converter(args[0], &data)) { + goto exit; + } + if (!noptargs) { + goto skip_optional_kwonly; + } + if (!PyBytes_Check(args[1])) { + _PyArg_BadArgument("a2b_base32", "argument 'alphabet'", "bytes", args[1]); goto exit; } - return_value = binascii_a2b_base32_impl(module, &data); + alphabet = (PyBytesObject *)args[1]; +skip_optional_kwonly: + return_value = binascii_a2b_base32_impl(module, &data, alphabet); exit: /* Cleanup for data */ @@ -743,96 +788,80 @@ binascii_a2b_base32(PyObject *module, PyObject *arg) } PyDoc_STRVAR(binascii_b2a_base32__doc__, -"b2a_base32($module, data, /)\n" +"b2a_base32($module, data, /, *, alphabet=BASE32_ALPHABET)\n" "--\n" "\n" "base32-code line of data."); #define BINASCII_B2A_BASE32_METHODDEF \ - 
{"b2a_base32", (PyCFunction)binascii_b2a_base32, METH_O, binascii_b2a_base32__doc__}, + {"b2a_base32", _PyCFunction_CAST(binascii_b2a_base32), METH_FASTCALL|METH_KEYWORDS, binascii_b2a_base32__doc__}, static PyObject * -binascii_b2a_base32_impl(PyObject *module, Py_buffer *data); +binascii_b2a_base32_impl(PyObject *module, Py_buffer *data, + Py_buffer *alphabet); static PyObject * -binascii_b2a_base32(PyObject *module, PyObject *arg) +binascii_b2a_base32(PyObject *module, PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames) { PyObject *return_value = NULL; - Py_buffer data = {NULL, NULL}; - - if (PyObject_GetBuffer(arg, &data, PyBUF_SIMPLE) != 0) { - goto exit; - } - return_value = binascii_b2a_base32_impl(module, &data); - -exit: - /* Cleanup for data */ - if (data.obj) { - PyBuffer_Release(&data); - } - - return return_value; -} - -PyDoc_STRVAR(binascii_a2b_base32hex__doc__, -"a2b_base32hex($module, data, /)\n" -"--\n" -"\n" -"Decode a line of base32hex data."); + #if defined(Py_BUILD_CORE) && !defined(Py_BUILD_CORE_MODULE) -#define BINASCII_A2B_BASE32HEX_METHODDEF \ - {"a2b_base32hex", (PyCFunction)binascii_a2b_base32hex, METH_O, binascii_a2b_base32hex__doc__}, + #define NUM_KEYWORDS 1 + static struct { + PyGC_Head _this_is_not_used; + PyObject_VAR_HEAD + Py_hash_t ob_hash; + PyObject *ob_item[NUM_KEYWORDS]; + } _kwtuple = { + .ob_base = PyVarObject_HEAD_INIT(&PyTuple_Type, NUM_KEYWORDS) + .ob_hash = -1, + .ob_item = { &_Py_ID(alphabet), }, + }; + #undef NUM_KEYWORDS + #define KWTUPLE (&_kwtuple.ob_base.ob_base) -static PyObject * -binascii_a2b_base32hex_impl(PyObject *module, Py_buffer *data); + #else // !Py_BUILD_CORE + # define KWTUPLE NULL + #endif // !Py_BUILD_CORE -static PyObject * -binascii_a2b_base32hex(PyObject *module, PyObject *arg) -{ - PyObject *return_value = NULL; + static const char * const _keywords[] = {"", "alphabet", NULL}; + static _PyArg_Parser _parser = { + .keywords = _keywords, + .fname = "b2a_base32", + .kwtuple = KWTUPLE, + }; 
+ #undef KWTUPLE + PyObject *argsbuf[2]; + Py_ssize_t noptargs = nargs + (kwnames ? PyTuple_GET_SIZE(kwnames) : 0) - 1; Py_buffer data = {NULL, NULL}; + Py_buffer alphabet = {NULL, NULL}; - if (!ascii_buffer_converter(arg, &data)) { + args = _PyArg_UnpackKeywords(args, nargs, NULL, kwnames, &_parser, + /*minpos*/ 1, /*maxpos*/ 1, /*minkw*/ 0, /*varpos*/ 0, argsbuf); + if (!args) { goto exit; } - return_value = binascii_a2b_base32hex_impl(module, &data); - -exit: - /* Cleanup for data */ - if (data.obj) - PyBuffer_Release(&data); - - return return_value; -} - -PyDoc_STRVAR(binascii_b2a_base32hex__doc__, -"b2a_base32hex($module, data, /)\n" -"--\n" -"\n" -"base32hex-code line of data."); - -#define BINASCII_B2A_BASE32HEX_METHODDEF \ - {"b2a_base32hex", (PyCFunction)binascii_b2a_base32hex, METH_O, binascii_b2a_base32hex__doc__}, - -static PyObject * -binascii_b2a_base32hex_impl(PyObject *module, Py_buffer *data); - -static PyObject * -binascii_b2a_base32hex(PyObject *module, PyObject *arg) -{ - PyObject *return_value = NULL; - Py_buffer data = {NULL, NULL}; - - if (PyObject_GetBuffer(arg, &data, PyBUF_SIMPLE) != 0) { + if (PyObject_GetBuffer(args[0], &data, PyBUF_SIMPLE) != 0) { goto exit; } - return_value = binascii_b2a_base32hex_impl(module, &data); + if (!noptargs) { + goto skip_optional_kwonly; + } + if (PyObject_GetBuffer(args[1], &alphabet, PyBUF_SIMPLE) != 0) { + goto exit; + } +skip_optional_kwonly: + return_value = binascii_b2a_base32_impl(module, &data, &alphabet); exit: /* Cleanup for data */ if (data.obj) { PyBuffer_Release(&data); } + /* Cleanup for alphabet */ + if (alphabet.obj) { + PyBuffer_Release(&alphabet); + } return return_value; } @@ -1382,4 +1411,4 @@ binascii_b2a_qp(PyObject *module, PyObject *const *args, Py_ssize_t nargs, PyObj return return_value; } -/*[clinic end generated code: output=bafd226511187580 input=a9049054013a1b77]*/ +/*[clinic end generated code: output=5be0c5d9b116ee17 input=a9049054013a1b77]*/ From 
a9a7d26463b57fd2a25b79b2466539c438169aa1 Mon Sep 17 00:00:00 2001 From: James Seo Date: Sat, 21 Mar 2026 07:52:29 -0700 Subject: [PATCH 3/4] Address reviewer feedback - Update docs to refer to "Base 32" and "Base32" - Update docs to better explain `binascii.a2b_base32()` - Inline helper function in `base64` - Add forgotten tests for presence of alphabet module globals --- Doc/library/binascii.rst | 21 +++++++++++++-------- Lib/base64.py | 21 ++++++++------------- Lib/test/test_binascii.py | 4 ++++ Modules/binascii.c | 2 +- 4 files changed, 26 insertions(+), 22 deletions(-) diff --git a/Doc/library/binascii.rst b/Doc/library/binascii.rst index 3facb139e17d43..47c021a85c13ad 100644 --- a/Doc/library/binascii.rst +++ b/Doc/library/binascii.rst @@ -182,16 +182,21 @@ The :mod:`!binascii` module defines the following functions: .. versionadded:: 3.15 + .. function:: a2b_base32(string, /, *, alphabet=BASE32_ALPHABET) Convert base32 data back to binary and return the binary data. - Valid base32 data: + Valid base32 data contains characters from the base32 alphabet specified + in :rfc:`4648` in groups of eight (if necessary, the final group is padded + to eight characters with ``=``). Each group encodes 40 bits of binary data + in the range from ``0`` to ``2 ** 40 - 1``, inclusive. - * Conforms to :rfc:`4648`. - * Contains only characters from the base32 alphabet. - * Contains no excess data after padding (including excess padding, newlines, etc.). - * Does not start with padding. + .. note:: + By default, this function does not map lowercase characters (which are + invalid in standard base32) to their uppercase counterparts, nor does + it contextually map ``0`` to ``O`` and ``1`` to ``I``/``L`` as + :rfc:`4648` allows. Optional *alphabet* must be a :class:`bytes` object of length 32 which specifies an alternative alphabet. @@ -202,7 +207,7 @@ The :mod:`!binascii` module defines the following functions: .. 
function:: b2a_base32(data, /, *, alphabet=BASE32_ALPHABET) - Convert binary data to a line(s) of ASCII characters in base32 coding, + Convert binary data to a line of ASCII characters in base32 coding, as specified in :rfc:`4648`. The return value is the converted line. Optional *alphabet* must be a :term:`bytes-like object` of length 32 which @@ -356,13 +361,13 @@ The :mod:`!binascii` module defines the following functions: .. data:: BASE32_ALPHABET - The base32 alphabet according to :rfc:`4648`. + The Base 32 alphabet according to :rfc:`4648`. .. versionadded:: next .. data:: BASE32HEX_ALPHABET - The "Extended Hex" base32hex alphabet according to :rfc:`4648`. + The "Extended Hex" Base 32 alphabet according to :rfc:`4648`. .. versionadded:: next diff --git a/Lib/base64.py b/Lib/base64.py index 8c88add3c2595b..9b57cdfefce1e6 100644 --- a/Lib/base64.py +++ b/Lib/base64.py @@ -207,10 +207,12 @@ def urlsafe_b64decode(s): 0 and 1 are not allowed in the input. ''' -def _b32decode_prepare(s, casefold=False, map01=None): +def b32encode(s): + return binascii.b2a_base32(s) +b32encode.__doc__ = _B32_ENCODE_DOCSTRING.format(encoding='base32') + +def b32decode(s, casefold=False, map01=None): s = _bytes_from_decode_data(s) - if len(s) % 8: - raise binascii.Error('Incorrect padding') # Handle section 2.4 zero and one mapping. The flag map01 will be either # False, or the character to map the digit 1 (one) to. It should be # either L (el) or I (eye). 
@@ -220,15 +222,6 @@ def _b32decode_prepare(s, casefold=False, map01=None): s = s.translate(bytes.maketrans(b'01', b'O' + map01)) if casefold: s = s.upper() - return s - - -def b32encode(s): - return binascii.b2a_base32(s) -b32encode.__doc__ = _B32_ENCODE_DOCSTRING.format(encoding='base32') - -def b32decode(s, casefold=False, map01=None): - s = _b32decode_prepare(s, casefold, map01) return binascii.a2b_base32(s) b32decode.__doc__ = _B32_DECODE_DOCSTRING.format(encoding='base32', extra_args=_B32_DECODE_MAP01_DOCSTRING) @@ -238,8 +231,10 @@ def b32hexencode(s): b32hexencode.__doc__ = _B32_ENCODE_DOCSTRING.format(encoding='base32hex') def b32hexdecode(s, casefold=False): + s = _bytes_from_decode_data(s) # base32hex does not have the 01 mapping - s = _b32decode_prepare(s, casefold) + if casefold: + s = s.upper() return binascii.a2b_base32(s, alphabet=binascii.BASE32HEX_ALPHABET) b32hexdecode.__doc__ = _B32_DECODE_DOCSTRING.format(encoding='base32hex', extra_args='') diff --git a/Lib/test/test_binascii.py b/Lib/test/test_binascii.py index 7fedd8e1a8b3ca..638a4cce0509d0 100644 --- a/Lib/test/test_binascii.py +++ b/Lib/test/test_binascii.py @@ -74,6 +74,10 @@ def test_constants(self): b'abcdefghijklmnopqrstuvwxyz' b'ABCDEFGHIJKLMNOPQRSTUVWXYZ' b'.-:+=^!/*?&<>()[]{}@%$#') + self.assertEqual(binascii.BASE32_ALPHABET, + b'ABCDEFGHIJKLMNOPQRSTUVWXYZ234567') + self.assertEqual(binascii.BASE32HEX_ALPHABET, + b'0123456789ABCDEFGHIJKLMNOPQRSTUV') def test_functions(self): # Check presence of all functions diff --git a/Modules/binascii.c b/Modules/binascii.c index 44d7986b6c0415..786a95f8bfb42c 100644 --- a/Modules/binascii.c +++ b/Modules/binascii.c @@ -1672,7 +1672,7 @@ binascii.b2a_base32 * alphabet: Py_buffer(c_default="{NULL, NULL}") = BASE32_ALPHABET -base32-code line of data. +Base32-code line of data. 
[clinic start generated code]*/ static PyObject * From 6f80c549a6e34762cde54502d24c9cee98e7079f Mon Sep 17 00:00:00 2001 From: James Seo Date: Sat, 21 Mar 2026 08:01:31 -0700 Subject: [PATCH 4/4] Update generated files --- Modules/binascii.c | 2 +- Modules/clinic/binascii.c.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Modules/binascii.c b/Modules/binascii.c index 786a95f8bfb42c..e98cce10f8c58f 100644 --- a/Modules/binascii.c +++ b/Modules/binascii.c @@ -1678,7 +1678,7 @@ Base32-code line of data. static PyObject * binascii_b2a_base32_impl(PyObject *module, Py_buffer *data, Py_buffer *alphabet) -/*[clinic end generated code: output=058d0d1aeb014d3b input=ffd4fa162a6e1cb5]*/ +/*[clinic end generated code: output=058d0d1aeb014d3b input=99cbe7194799d368]*/ { const unsigned char *table_b2a = table_b2a_base32; const unsigned char *bin_data = data->buf; diff --git a/Modules/clinic/binascii.c.h b/Modules/clinic/binascii.c.h index 8057d94a1fb934..7a411bfc829943 100644 --- a/Modules/clinic/binascii.c.h +++ b/Modules/clinic/binascii.c.h @@ -791,7 +791,7 @@ PyDoc_STRVAR(binascii_b2a_base32__doc__, "b2a_base32($module, data, /, *, alphabet=BASE32_ALPHABET)\n" "--\n" "\n" -"base32-code line of data."); +"Base32-code line of data."); #define BINASCII_B2A_BASE32_METHODDEF \ {"b2a_base32", _PyCFunction_CAST(binascii_b2a_base32), METH_FASTCALL|METH_KEYWORDS, binascii_b2a_base32__doc__}, @@ -1411,4 +1411,4 @@ binascii_b2a_qp(PyObject *module, PyObject *const *args, Py_ssize_t nargs, PyObj return return_value; } -/*[clinic end generated code: output=5be0c5d9b116ee17 input=a9049054013a1b77]*/ +/*[clinic end generated code: output=242c0c56b918bd33 input=a9049054013a1b77]*/