The Wayback Machine - https://web.archive.org/web/20211012231910/https://github.com/python/cpython/pull/28752/files
Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

bpo-29410: Change the default hash algorithm to SipHash13. #28752

Merged
merged 18 commits into from Oct 10, 2021
@@ -416,15 +416,19 @@ Libraries options
Security Options
----------------

.. cmdoption:: --with-hash-algorithm=[fnv|siphash24]
.. cmdoption:: --with-hash-algorithm=[fnv|siphash13|siphash24]

Select hash algorithm for use in ``Python/pyhash.c``:

* ``siphash24`` (default).
* ``fnv``;
* ``siphash13`` (default);
* ``siphash24``;
* ``fnv``.

.. versionadded:: 3.4

.. versionadded:: 3.11
``siphash13`` is added and it is the new default.

.. cmdoption:: --with-builtin-hashlib-hashes=md5,sha1,sha256,sha512,sha3,blake2

Built-in hash modules:
@@ -175,6 +175,11 @@ Other CPython Implementation Changes
support :class:`typing.SupportsComplex` and :class:`typing.SupportsBytes` protocols.
(Contributed by Mark Dickinson and Dong-hee Na in :issue:`24234`.)

* ``siphash13`` is added as a new internal hashing algorithms. It's has similar security
properties as ``siphash24`` but it is slightly faster for long inputs. ``str``, ``bytes``,
and some other types now use it as default algorithm for ``hash()``. :pep:`552`
hash-based pyc files now use ``siphash13``, too.
(Contributed by Inada Naoki in :issue:`29410`.)

New Modules
===========
@@ -114,11 +114,10 @@ PyAPI_FUNC(PyHash_FuncDef*) PyHash_GetFuncDef(void);

/* hash algorithm selection
*
* The values for Py_HASH_SIPHASH24 and Py_HASH_FNV are hard-coded in the
* The values for Py_HASH_* are hard-coded in the
* configure script.
*
* - FNV is available on all platforms and architectures.
* - SIPHASH24 only works on platforms that don't require aligned memory for integers.
* - FNV and SIPHASH* are available on all platforms and architectures.
* - With EXTERNAL embedders can provide an alternative implementation with::
*
* PyHash_FuncDef PyHash_Func = {...};
@@ -128,10 +127,11 @@ PyAPI_FUNC(PyHash_FuncDef*) PyHash_GetFuncDef(void);
#define Py_HASH_EXTERNAL 0
#define Py_HASH_SIPHASH24 1
#define Py_HASH_FNV 2
#define Py_HASH_SIPHASH13 3

#ifndef Py_HASH_ALGORITHM
# ifndef HAVE_ALIGNED_REQUIRED
# define Py_HASH_ALGORITHM Py_HASH_SIPHASH24
# define Py_HASH_ALGORITHM Py_HASH_SIPHASH13
# else
# define Py_HASH_ALGORITHM Py_HASH_FNV
# endif /* uint64_t && uint32_t && aligned */
@@ -42,8 +42,8 @@ def pysiphash(uint64):

def skip_unless_internalhash(test):
"""Skip decorator for tests that depend on SipHash24 or FNV"""
ok = sys.hash_info.algorithm in {"fnv", "siphash24"}
msg = "Requires SipHash24 or FNV"
ok = sys.hash_info.algorithm in {"fnv", "siphash13", "siphash24"}
msg = "Requires SipHash13, SipHash24 or FNV"
return test if ok else unittest.skip(msg)(test)


@@ -206,6 +206,19 @@ class StringlikeHashRandomizationTests(HashRandomizationTests):
# seed 42, 'abc'
[-678966196, 573763426263223372, -820489388, -4282905804826039665],
],
'siphash13': [

This comment has been minimized.

# NOTE: PyUCS2 layout depends on endianness
# seed 0, 'abc'
[69611762, -4594863902769663758, 69611762, -4594863902769663758],
# seed 42, 'abc'
[-975800855, 3869580338025362921, -975800855, 3869580338025362921],
# seed 42, 'abcdefghijk'
[-595844228, 7764564197781545852, -595844228, 7764564197781545852],
# seed 0, 'äú∑ℇ'
[-1093288643, -2810468059467891395, -1041341092, 4925090034378237276],
# seed 42, 'äú∑ℇ'
[-585999602, -2845126246016066802, -817336969, -2219421378907968137],
],
'siphash24': [
# NOTE: PyUCS2 layout depends on endianness
# seed 0, 'abc'
@@ -351,8 +351,8 @@ def test_issue_35321(self):
self.assertEqual(_frozen_importlib.__spec__.origin, "frozen")

def test_source_hash(self):
self.assertEqual(_imp.source_hash(42, b'hi'), b'\xc6\xe7Z\r\x03:}\xab')
self.assertEqual(_imp.source_hash(43, b'hi'), b'\x85\x9765\xf8\x9a\x8b9')
self.assertEqual(_imp.source_hash(42, b'hi'), b'\xfb\xd9G\x05\xaf$\x9b~')
self.assertEqual(_imp.source_hash(43, b'hi'), b'\xd0/\x87C\xccC\xff\xe2')

def test_pyc_invalidation_mode_from_cmdline(self):
cases = [
@@ -508,16 +508,18 @@ def test_attributes(self):
self.assertIsInstance(sys.hash_info.nan, int)
self.assertIsInstance(sys.hash_info.imag, int)
algo = sysconfig.get_config_var("Py_HASH_ALGORITHM")
if sys.hash_info.algorithm in {"fnv", "siphash24"}:
if sys.hash_info.algorithm in {"fnv", "siphash13", "siphash24"}:
self.assertIn(sys.hash_info.hash_bits, {32, 64})
self.assertIn(sys.hash_info.seed_bits, {32, 64, 128})

if algo == 1:
self.assertEqual(sys.hash_info.algorithm, "siphash24")
elif algo == 2:
self.assertEqual(sys.hash_info.algorithm, "fnv")
elif algo == 3:
self.assertEqual(sys.hash_info.algorithm, "siphash13")
else:
self.assertIn(sys.hash_info.algorithm, {"fnv", "siphash24"})
self.assertIn(sys.hash_info.algorithm, {"fnv", "siphash13", "siphash24"})
else:
# PY_HASH_EXTERNAL
self.assertEqual(algo, 0)
@@ -0,0 +1 @@
Add SipHash13 for string hash algorithm and use it by default.
@@ -358,19 +358,72 @@ static PyHash_FuncDef PyHash_Func = {fnv, "fnv", 8 * SIZEOF_PY_HASH_T,
# define ROTATE(x, b) (uint64_t)( ((x) << (b)) | ( (x) >> (64 - (b))) )
#endif

#define HALF_ROUND(a,b,c,d,s,t) \
a += b; c += d; \
#define HALF_ROUND(a,b,c,d,s,t) \
a += b; c += d; \
b = ROTATE(b, s) ^ a; \
d = ROTATE(d, t) ^ c; \
a = ROTATE(a, 32);

#define DOUBLE_ROUND(v0,v1,v2,v3) \
HALF_ROUND(v0,v1,v2,v3,13,16); \
HALF_ROUND(v2,v1,v0,v3,17,21); \
HALF_ROUND(v0,v1,v2,v3,13,16); \
#define SINGLE_ROUND(v0,v1,v2,v3) \
HALF_ROUND(v0,v1,v2,v3,13,16); \
HALF_ROUND(v2,v1,v0,v3,17,21);

#define DOUBLE_ROUND(v0,v1,v2,v3) \
SINGLE_ROUND(v0,v1,v2,v3); \
SINGLE_ROUND(v0,v1,v2,v3);


static uint64_t
siphash13(uint64_t k0, uint64_t k1, const void *src, Py_ssize_t src_sz) {
uint64_t b = (uint64_t)src_sz << 56;
const uint8_t *in = (const uint8_t*)src;

uint64_t v0 = k0 ^ 0x736f6d6570736575ULL;
uint64_t v1 = k1 ^ 0x646f72616e646f6dULL;
uint64_t v2 = k0 ^ 0x6c7967656e657261ULL;
uint64_t v3 = k1 ^ 0x7465646279746573ULL;

uint64_t t;
uint8_t *pt;

while (src_sz >= 8) {
uint64_t mi;
memcpy(&mi, in, sizeof(mi));
mi = _le64toh(mi);
in += sizeof(mi);
src_sz -= sizeof(mi);
v3 ^= mi;
SINGLE_ROUND(v0,v1,v2,v3);
v0 ^= mi;
}

t = 0;
pt = (uint8_t *)&t;
switch (src_sz) {
case 7: pt[6] = in[6]; /* fall through */
case 6: pt[5] = in[5]; /* fall through */
case 5: pt[4] = in[4]; /* fall through */
case 4: memcpy(pt, in, sizeof(uint32_t)); break;
case 3: pt[2] = in[2]; /* fall through */
case 2: pt[1] = in[1]; /* fall through */
case 1: pt[0] = in[0]; /* fall through */
}
b |= _le64toh(t);

v3 ^= b;
SINGLE_ROUND(v0,v1,v2,v3);
v0 ^= b;
v2 ^= 0xff;
SINGLE_ROUND(v0,v1,v2,v3);
SINGLE_ROUND(v0,v1,v2,v3);
SINGLE_ROUND(v0,v1,v2,v3);

/* modified */
t = (v0 ^ v1) ^ (v2 ^ v3);
return t;
}

#if Py_HASH_ALGORITHM == Py_HASH_SIPHASH24
static uint64_t
siphash24(uint64_t k0, uint64_t k1, const void *src, Py_ssize_t src_sz) {
uint64_t b = (uint64_t)src_sz << 56;
@@ -419,14 +472,26 @@ siphash24(uint64_t k0, uint64_t k1, const void *src, Py_ssize_t src_sz) {
t = (v0 ^ v1) ^ (v2 ^ v3);
return t;
}
#endif

uint64_t
_Py_KeyedHash(uint64_t key, const void *src, Py_ssize_t src_sz)
{
return siphash24(key, 0, src, src_sz);
return siphash13(key, 0, src, src_sz);
This conversation was marked as resolved by tiran

This comment has been minimized.

@tiran

tiran Oct 8, 2021
Member

I'm not sure that it's a good idea to change the keyed hash format. PEP 552 kinda implies that PYC format uses SipHash-2-4. This change breaks backwards compatibility of public API importlib.util.source_hash. The output of source_hash() on 3.11 would no longer be compatible with Python 3.10 and older releases.

@benjaminp @brettcannon What do you think?

This comment has been minimized.

@methane

methane Oct 8, 2021
Author Member

This change breaks backwards compatibility of public API importlib.util.source_hash.

If "breaks backward compatibility" means changing hash value, incrementing MAGIC number is also backward incompatible.

I changed this hash function because speeding up hash validation may increase startup time when hash validation is used.

This comment has been minimized.

@tiran

tiran Oct 8, 2021
Member

Good point!

May I suggest that we either wait for feedback from Brett and Benjamin or move the change into a separate BPO? This change may need an update of PEP 552, too.

This comment has been minimized.

@benjaminp

benjaminp Oct 8, 2021
Contributor

No application I'm aware of would care about this changing. importlib.util.source_hash just exists to have the same result as what the importlib system would compute.

This comment has been minimized.

@tiran

tiran Oct 9, 2021
Member

Thanks for the feedback, Benjamin!

}


#if Py_HASH_ALGORITHM == Py_HASH_SIPHASH13
static Py_hash_t
pysiphash(const void *src, Py_ssize_t src_sz) {
return (Py_hash_t)siphash13(
_le64toh(_Py_HashSecret.siphash.k0), _le64toh(_Py_HashSecret.siphash.k1),
src, src_sz);
}

static PyHash_FuncDef PyHash_Func = {pysiphash, "siphash13", 64, 128};
#endif

#if Py_HASH_ALGORITHM == Py_HASH_SIPHASH24
static Py_hash_t
pysiphash(const void *src, Py_ssize_t src_sz) {
@@ -1556,9 +1556,9 @@ Optional Packages:
--with-undefined-behavior-sanitizer
enable UndefinedBehaviorSanitizer undefined
behaviour detector, 'ubsan' (default is no)
--with-hash-algorithm=[fnv|siphash24]
--with-hash-algorithm=[fnv|siphash13|siphash24]
select hash algorithm for use in Python/pyhash.c
(default is SipHash24)
(default is SipHash13)
--with-tzpath=<list of absolute paths separated by pathsep>
Select the default time zone search path for zoneinfo.TZPATH
@@ -10420,6 +10420,10 @@ if test "${with_hash_algorithm+set}" = set; then :
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $withval" >&5
$as_echo "$withval" >&6; }
case "$withval" in
siphash13)
$as_echo "#define Py_HASH_ALGORITHM 3" >>confdefs.h

;;
siphash24)
$as_echo "#define Py_HASH_ALGORITHM 1" >>confdefs.h

@@ -3036,16 +3036,19 @@ fi
# str, bytes and memoryview hash algorithm
AH_TEMPLATE(Py_HASH_ALGORITHM,
[Define hash algorithm for str, bytes and memoryview.
SipHash24: 1, FNV: 2, externally defined: 0])
SipHash24: 1, FNV: 2, SipHash13: 3, externally defined: 0])

AC_MSG_CHECKING(for --with-hash-algorithm)
dnl quadrigraphs "@<:@" and "@:>@" produce "[" and "]" in the output
AC_ARG_WITH(hash_algorithm,
AS_HELP_STRING([--with-hash-algorithm=@<:@fnv|siphash24@:>@],
[select hash algorithm for use in Python/pyhash.c (default is SipHash24)]),
AS_HELP_STRING([--with-hash-algorithm=@<:@fnv|siphash13|siphash24@:>@],
[select hash algorithm for use in Python/pyhash.c (default is SipHash13)]),
[
AC_MSG_RESULT($withval)
case "$withval" in
siphash13)
AC_DEFINE(Py_HASH_ALGORITHM, 3)
;;
siphash24)
AC_DEFINE(Py_HASH_ALGORITHM, 1)
;;
@@ -136,15 +136,15 @@
/* Define to 1 if you have the `clock' function. */
#undef HAVE_CLOCK

/* Define to 1 if you have the `clock_nanosleep' function. */
#undef HAVE_CLOCK_NANOSLEEP

/* Define to 1 if you have the `clock_getres' function. */
#undef HAVE_CLOCK_GETRES

/* Define to 1 if you have the `clock_gettime' function. */
#undef HAVE_CLOCK_GETTIME

/* Define to 1 if you have the `clock_nanosleep' function. */
#undef HAVE_CLOCK_NANOSLEEP

/* Define to 1 if you have the `clock_settime' function. */
#undef HAVE_CLOCK_SETTIME

@@ -1436,7 +1436,7 @@
#undef Py_ENABLE_SHARED

/* Define hash algorithm for str, bytes and memoryview. SipHash24: 1, FNV: 2,
externally defined: 0 */
SipHash13: 3, externally defined: 0 */
#undef Py_HASH_ALGORITHM

/* Define if you want to enable tracing references for debugging purpose */