Python dict lookdict 函数源码分析与实现原理
代码里面有中文说明
static PyDictEntry * lookdict(PyDictObject *mp, PyObject *key, register long hash) { register size_t i; register size_t perturb; register PyDictEntry *freeslot; register size_t mask = (size_t)mp->ma_mask; PyDictEntry *ep0 = mp->ma_table; register PyDictEntry *ep; register int cmp; PyObject *startkey;
i = (size_t)hash & mask;
ep = &ep0[i];
if (ep->me_key == NULL || ep->me_key == key)
return ep;
if (ep->me_key == dummy)
freeslot = ep;
else {
if (ep->me_hash == hash) {
startkey = ep->me_key;
Py_INCREF(startkey);
cmp = PyObject_RichCompareBool(startkey, key, Py_EQ);
Py_DECREF(startkey);
if (cmp < 0)
return NULL;
if (ep0 == mp->ma_table && ep->me_key == startkey) {
if (cmp > 0)
return ep;
}
else {
/* The compare did major nasty stuff to the
* dict: start over.
* XXX A clever adversary could prevent this
* XXX from terminating.
*/
return lookdict(mp, key, hash); //此处逻辑甚是艰深,望大神指点
}
}
freeslot = NULL;
}
/* In the loop, me_key == dummy is by far (factor of 100s) the
least likely outcome, so test for that last. /
for (perturb = hash; ; perturb >>= PERTURB_SHIFT) {
i = (i << 2) + i + perturb + 1;
ep = &ep0[i & mask];
if (ep->me_key == NULL)
return freeslot == NULL ? ep : freeslot;
if (ep->me_key == key)
return ep;
if (ep->me_hash == hash && ep->me_key != dummy) {
startkey = ep->me_key;
Py_INCREF(startkey);
cmp = PyObject_RichCompareBool(startkey, key, Py_EQ);
Py_DECREF(startkey);
if (cmp < 0)
return NULL;
if (ep0 == mp->ma_table && ep->me_key == startkey) {
if (cmp > 0)
return ep;
}
else {
/ The compare did major nasty stuff to the
* dict: start over.
* XXX A clever adversary could prevent this
* XXX from terminating.
/
return lookdict(mp, key, hash);
}
}
else if (ep->me_key == dummy && freeslot == NULL)
freeslot = ep;
}
assert(0); / NOT REACHED */
return 0;
}
Python dict lookdict 函数源码分析与实现原理
你可以看看“.\Lib\test\crashers<a target="_blank" href=“http://nasty_eq_vs_dict.py” rel=“nofollow noopener”>nasty_eq_vs_dict.py ”文件,以及这个 http://bugs.python.org/issue14205 。
大概就是一个 dict 在遍历的时候被修改了。
这个lookdict函数是CPython字典实现的核心查找函数,它实现了字典的键查找逻辑。让我分析一下它的关键实现原理:
/* 这是CPython中lookdict函数的简化版实现逻辑 */
static Py_ssize_t
lookdict(PyDictObject *mp, PyObject *key, Py_hash_t hash, PyObject **value_addr)
{
size_t i; /* 当前探测位置 */
size_t perturb; /* 扰动值 */
PyDictKeyEntry *ep; /* 当前条目 */
/* 计算初始索引位置 */
i = (size_t)hash & mask;
ep = &mp->ma_keys->dk_entries[i];
/* 如果槽位为空,返回未找到 */
if (ep->me_key == NULL)
return DKIX_EMPTY;
/* 如果槽位被占用但不是我们要找的键 */
if (ep->me_key != key) {
/* 检查哈希值是否匹配 */
if (ep->me_hash != hash) {
perturb = hash;
while (1) {
/* 二次探测:i = (i*5 + 1 + perturb) & mask */
i = (i << 2) + i + perturb + 1;
i &= mask;
ep = &mp->ma_keys->dk_entries[i];
if (ep->me_key == NULL)
return DKIX_EMPTY;
if (ep->me_key == key ||
(ep->me_hash == hash &&
_PyDictKeys_Compare(mp, ep->me_key, key))) {
break;
}
perturb >>= PERTURB_SHIFT;
}
}
/* 哈希匹配但键对象不同,需要深度比较 */
else if (!_PyDictKeys_Compare(mp, ep->me_key, key)) {
/* 处理哈希冲突,继续探测 */
perturb = hash;
while (1) {
i = (i << 2) + i + perturb + 1;
i &= mask;
ep = &mp->ma_keys->dk_entries[i];
if (ep->me_key == NULL)
return DKIX_EMPTY;
if (ep->me_key == key ||
(ep->me_hash == hash &&
_PyDictKeys_Compare(mp, ep->me_key, key))) {
break;
}
perturb >>= PERTURB_SHIFT;
}
}
}
/* 找到匹配的键,返回对应的值 */
*value_addr = ep->me_value;
return i;
}
核心实现原理:
-
开放定址法 + 二次探测:Python字典使用开放定址法解决哈希冲突,探测序列公式为
i = (i*5 + 1 + perturb) & mask,其中perturb初始为哈希值,每次循环右移5位 -
两级比较策略:
- 第一级:比较键对象地址(
ep->me_key == key),这是最快的比较 - 第二级:比较哈希值(
ep->me_hash == hash) - 第三级:深度比较键值(
_PyDictKeys_Compare)
- 第一级:比较键对象地址(
-
内存布局优化:Python 3.6+使用紧凑型字典,将键和值分开存储,提高了缓存局部性
-
哈希扰动:使用
perturb变量确保探测序列能覆盖整个哈希表,避免聚集现象
查找时间复杂度:
- 平均情况:O(1)
- 最坏情况:O(n)(当所有键哈希冲突时)
这个设计在内存效率和查找速度之间取得了很好的平衡,特别是通过对象地址比较和哈希值比较的快速路径,大多数查找都能在O(1)时间内完成。
总结:字典查找的核心是哈希+开放定址法,通过多级比较优化性能。
哥们,可以找到 Python 之父这么设计的原因相关的报道吗,这么看起来太费劲了。
这是 python 1.5.2 版本的源码,冲突直接走后面的代码
static dictentry *
lookdict(mp, key, hash)
dictobject *mp;
PyObject *key;
register long hash;
{
register int i;
register unsigned incr;
register dictentry freeslot;
register unsigned int mask = mp->ma_size-1;
dictentry ep0 = mp->ma_table;
register dictentry ep;
/ We must come up with (i, incr) such that 0 <= i < ma_size
and 0 < incr < ma_size and both are a function of hash /
i = (~hash) & mask;
/ We use ~hash instead of hash, as degenerate hash functions, such
as for ints <sigh>, can have lots of leading zeros. It’s not
really a performance risk, but better safe than sorry. /
ep = &ep0[i];
if (ep->me_key == NULL || ep->me_key == key)
return ep;
if (ep->me_key == dummy)
freeslot = ep;
else {
if (ep->me_hash == hash &&
PyObject_Compare(ep->me_key, key) == 0) //不冲突
{
return ep;
}
freeslot = NULL; //冲突
}
/ XXX What if PyObject_Compare returned an exception? /
/ Derive incr from hash, just to make it more arbitrary. Note that
incr must not be 0, or we will get into an infinite loop./
incr = (hash ^ ((unsigned long)hash >> 3)) & mask;
if (!incr)
incr = mask;
for (;;) {
ep = &ep0[(i+incr)&mask];
if (ep->me_key == NULL) {
if (freeslot != NULL)
return freeslot;
else
return ep;
}
if (ep->me_key == dummy) {
if (freeslot == NULL)
freeslot = ep;
}
else if (ep->me_key == key ||
(ep->me_hash == hash &&
PyObject_Compare(ep->me_key, key) == 0)) {
return ep;
}
/ XXX What if PyObject_Compare returned an exception? /
/ Cycle through GF(2^n)-{0} /
incr = incr << 1;
if (incr > mask)
incr ^= mp->ma_poly; / This will implicitely clear
the highest bit */
}
}

