Python dict lookdict 函数源码分析与实现原理

代码里面有中文说明

static PyDictEntry * lookdict(PyDictObject *mp, PyObject *key, register long hash) { register size_t i; register size_t perturb; register PyDictEntry *freeslot; register size_t mask = (size_t)mp->ma_mask; PyDictEntry *ep0 = mp->ma_table; register PyDictEntry *ep; register int cmp; PyObject *startkey;

i = (size_t)hash & mask;
ep = &ep0[i];
if (ep->me_key == NULL || ep->me_key == key)
    return ep;

if (ep->me_key == dummy) freeslot = ep; else { if (ep->me_hash == hash) { startkey = ep->me_key; Py_INCREF(startkey); cmp = PyObject_RichCompareBool(startkey, key, Py_EQ); Py_DECREF(startkey); if (cmp < 0) return NULL; if (ep0 == mp->ma_table && ep->me_key == startkey) { if (cmp > 0) return ep; } else { /* The compare did major nasty stuff to the * dict: start over. * XXX A clever adversary could prevent this * XXX from terminating. */ return lookdict(mp, key, hash); //此处逻辑甚是艰深,望大神指点 } } freeslot = NULL; }

/* In the loop, me_key == dummy is by far (factor of 100s) the least likely outcome, so test for that last. / for (perturb = hash; ; perturb >>= PERTURB_SHIFT) { i = (i << 2) + i + perturb + 1; ep = &ep0[i & mask]; if (ep->me_key == NULL) return freeslot == NULL ? ep : freeslot; if (ep->me_key == key) return ep; if (ep->me_hash == hash && ep->me_key != dummy) { startkey = ep->me_key; Py_INCREF(startkey); cmp = PyObject_RichCompareBool(startkey, key, Py_EQ); Py_DECREF(startkey); if (cmp < 0) return NULL; if (ep0 == mp->ma_table && ep->me_key == startkey) { if (cmp > 0) return ep; } else { / The compare did major nasty stuff to the * dict: start over. * XXX A clever adversary could prevent this * XXX from terminating. / return lookdict(mp, key, hash); } } else if (ep->me_key == dummy && freeslot == NULL) freeslot = ep; } assert(0); / NOT REACHED */ return 0;

}


Python dict lookdict 函数源码分析与实现原理

4 回复

你可以看看“.\Lib\test\crashers<a target="_blank" href=“http://nasty_eq_vs_dict.py” rel=“nofollow noopener”>nasty_eq_vs_dict.py ”文件,以及这个 http://bugs.python.org/issue14205
大概就是一个 dict 在遍历的时候被修改了。


这个lookdict函数是CPython字典实现的核心查找函数,它实现了字典的键查找逻辑。让我分析一下它的关键实现原理:

/* 这是CPython中lookdict函数的简化版实现逻辑 */
static Py_ssize_t
lookdict(PyDictObject *mp, PyObject *key, Py_hash_t hash, PyObject **value_addr)
{
    size_t i;           /* 当前探测位置 */
    size_t perturb;     /* 扰动值 */
    PyDictKeyEntry *ep; /* 当前条目 */
    
    /* 计算初始索引位置 */
    i = (size_t)hash & mask;
    ep = &mp->ma_keys->dk_entries[i];
    
    /* 如果槽位为空,返回未找到 */
    if (ep->me_key == NULL)
        return DKIX_EMPTY;
    
    /* 如果槽位被占用但不是我们要找的键 */
    if (ep->me_key != key) {
        /* 检查哈希值是否匹配 */
        if (ep->me_hash != hash) {
            perturb = hash;
            while (1) {
                /* 二次探测:i = (i*5 + 1 + perturb) & mask */
                i = (i << 2) + i + perturb + 1;
                i &= mask;
                ep = &mp->ma_keys->dk_entries[i];
                
                if (ep->me_key == NULL)
                    return DKIX_EMPTY;
                    
                if (ep->me_key == key || 
                    (ep->me_hash == hash && 
                     _PyDictKeys_Compare(mp, ep->me_key, key))) {
                    break;
                }
                perturb >>= PERTURB_SHIFT;
            }
        }
        /* 哈希匹配但键对象不同,需要深度比较 */
        else if (!_PyDictKeys_Compare(mp, ep->me_key, key)) {
            /* 处理哈希冲突,继续探测 */
            perturb = hash;
            while (1) {
                i = (i << 2) + i + perturb + 1;
                i &= mask;
                ep = &mp->ma_keys->dk_entries[i];
                
                if (ep->me_key == NULL)
                    return DKIX_EMPTY;
                    
                if (ep->me_key == key || 
                    (ep->me_hash == hash && 
                     _PyDictKeys_Compare(mp, ep->me_key, key))) {
                    break;
                }
                perturb >>= PERTURB_SHIFT;
            }
        }
    }
    
    /* 找到匹配的键,返回对应的值 */
    *value_addr = ep->me_value;
    return i;
}

核心实现原理:

  1. 开放定址法 + 二次探测:Python字典使用开放定址法解决哈希冲突,探测序列公式为 i = (i*5 + 1 + perturb) & mask,其中perturb初始为哈希值,每次循环右移5位

  2. 两级比较策略

    • 第一级:比较键对象地址(ep->me_key == key),这是最快的比较
    • 第二级:比较哈希值(ep->me_hash == hash
    • 第三级:深度比较键值(_PyDictKeys_Compare
  3. 内存布局优化:Python 3.6+使用紧凑型字典,将键和值分开存储,提高了缓存局部性

  4. 哈希扰动:使用perturb变量确保探测序列能覆盖整个哈希表,避免聚集现象

查找时间复杂度

  • 平均情况:O(1)
  • 最坏情况:O(n)(当所有键哈希冲突时)

这个设计在内存效率和查找速度之间取得了很好的平衡,特别是通过对象地址比较和哈希值比较的快速路径,大多数查找都能在O(1)时间内完成。

总结:字典查找的核心是哈希+开放定址法,通过多级比较优化性能。

哥们,可以找到 Python 之父这么设计的原因相关的报道吗,这么看起来太费劲了。

这是 python 1.5.2 版本的源码,冲突直接走后面的代码

static dictentry *
lookdict(mp, key, hash)
dictobject *mp;
PyObject *key;
register long hash;
{
register int i;
register unsigned incr;
register dictentry freeslot;
register unsigned int mask = mp->ma_size-1;
dictentry ep0 = mp->ma_table;
register dictentry ep;
/
We must come up with (i, incr) such that 0 <= i < ma_size
and 0 < incr < ma_size and both are a function of hash /
i = (~hash) & mask;
/
We use ~hash instead of hash, as degenerate hash functions, such
as for ints <sigh>, can have lots of leading zeros. It’s not
really a performance risk, but better safe than sorry. /
ep = &ep0[i];
if (ep->me_key == NULL || ep->me_key == key)
return ep;
if (ep->me_key == dummy)
freeslot = ep;
else {
if (ep->me_hash == hash &&
PyObject_Compare(ep->me_key, key) == 0) //不冲突
{
return ep;
}
freeslot = NULL; //冲突
}
/
XXX What if PyObject_Compare returned an exception? /
/
Derive incr from hash, just to make it more arbitrary. Note that
incr must not be 0, or we will get into an infinite loop.
/
incr = (hash ^ ((unsigned long)hash >> 3)) & mask;
if (!incr)
incr = mask;
for (;;) {
ep = &ep0[(i+incr)&mask];
if (ep->me_key == NULL) {
if (freeslot != NULL)
return freeslot;
else
return ep;
}
if (ep->me_key == dummy) {
if (freeslot == NULL)
freeslot = ep;
}
else if (ep->me_key == key ||
(ep->me_hash == hash &&
PyObject_Compare(ep->me_key, key) == 0)) {
return ep;
}
/
XXX What if PyObject_Compare returned an exception? /
/
Cycle through GF(2^n)-{0} /
incr = incr << 1;
if (incr > mask)
incr ^= mp->ma_poly; /
This will implicitely clear
the highest bit */
}
}

回到顶部