Python内存结构模型源码详细解析

2025-6-28 评论(0) 分类:人工智能 Tags:

1. 核心对象模型 (Include/object.h)

1.1 基础对象结构 PyObject

// Include/object.h
typedef struct _object {
    _PyObject_HEAD_EXTRA    // 调试模式下的额外字段
    Py_ssize_t ob_refcnt;   // 引用计数
    PyTypeObject *ob_type;  // 类型对象指针
} PyObject;

// 变长对象 PyVarObject是Python中用于表示可变长度对象(如列表、元组、字符串等)的基础结构。它扩展了基本的PyObject,增加了一个表示对象中元素数量的字段(ob_size)
typedef struct {
    PyObject ob_base;       // 基础对象
    Py_ssize_t ob_size;     // 对象大小
} PyVarObject;

源码位置: Include/object.h:106-120

内存布局:

PyObject:
┌─────────────┬──────────────┬─────────────┐
│ ob_refcnt   │   ob_type    │   data...   │
│   8 bytes   │   8 bytes    │  variable   │
└─────────────┴──────────────┴─────────────┘

1.2 类型对象 PyTypeObject

// Include/cpython/object.h
typedef struct _typeobject {
    PyVarObject ob_base;
    const char *tp_name;             // 类型名称
    Py_ssize_t tp_basicsize;         // 基本大小
    Py_ssize_t tp_itemsize;          // 元素大小

    // 析构和打印
    destructor tp_dealloc;
    Py_ssize_t tp_vectorcall_offset;

    // 标准方法
    getattrfunc tp_getattr;
    setattrfunc tp_setattr;
    PyAsyncMethods *tp_as_async;
    reprfunc tp_repr;

    // 数值方法、序列方法、映射方法
    PyNumberMethods *tp_as_number;
    PySequenceMethods *tp_as_sequence;
    PyMappingMethods *tp_as_mapping;

    // 更多字段...
} PyTypeObject;

2. 整数对象内存模型 (Objects/longobject.c)

2.1 长整数结构

// Include/cpython/longintrepr.h
struct _longobject {
    PyVarObject ob_base;
    digit ob_digit[1];      // 数字位数组
};

typedef struct _longobject PyLongObject;

// digit 是 30 位或 15 位的无符号整数
#if PYLONG_BITS_IN_DIGIT == 30
typedef uint32_t digit;
typedef int32_t sdigit;
typedef uint64_t twodigits;
#elif PYLONG_BITS_IN_DIGIT == 15
typedef unsigned short digit;
typedef short sdigit;
typedef unsigned long twodigits;
#endif

小整数缓存机制:

// Objects/longobject.c
#define IS_SMALL_INT(ival) (-_PY_NSMALLNEGINTS <= (ival) && (ival) < _PY_NSMALLPOSINTS)
#define IS_SMALL_UINT(ival) ((ival) < _PY_NSMALLPOSINTS)
// Objects/longobject.c
#define NSMALLPOSINTS           257
#define NSMALLNEGINTS           5

// 小整数对象池 [-5, 256]
static PyLongObject small_ints[NSMALLNEGINTS + NSMALLPOSINTS];

PyObject *
PyLong_FromLong(long ival)
{
    // 使用小整数缓存
    if (IS_SMALL_INT(ival)) {
        return get_small_int((sdigit)ival);
    }
    // 创建新的长整数对象
    return PyLong_FromLongLong(ival);
}

2.2 内存布局示例

小整数 (42):
┌───────────┬──────────┬────────┬──────────┐
│ ob_refcnt │ ob_type  │ob_size │ ob_digit │
│     ?     │  &Long   │   1    │    42    │
└───────────┴──────────┴────────┴──────────┘

大整数 (2^100):
┌───────────┬──────────┬────────┬─────────────────────┐
│ ob_refcnt │ ob_type  │ob_size │    ob_digit[]       │
│     1     │  &Long   │   4    │ [低位...高位]       │
└───────────┴──────────┴────────┴─────────────────────┘

3. 字符串对象内存模型 (Objects/unicodeobject.c)

3.1 Unicode 对象结构

// Include/cpython/unicodeobject.h
typedef struct {
    PyObject_HEAD
    Py_ssize_t length;          // 字符串长度
    Py_hash_t hash;            // 哈希值缓存
    struct {
        unsigned int interned:2;    // 字符串驻留状态
        unsigned int kind:3;        // 字符串类型
        unsigned int compact:1;     // 是否紧凑存储
        unsigned int ascii:1;       // 是否纯ASCII
        unsigned int ready:1;       // 是否就绪
        unsigned int :24;
    } state;
    wchar_t *wstr;             // 宽字符表示(可选)
} PyASCIIObject;

// 紧凑 Unicode 对象
typedef struct {
    PyASCIIObject _base;
    Py_ssize_t utf8_length;    // UTF-8 长度
    char *utf8;                // UTF-8 缓存
    Py_ssize_t wstr_length;    // 宽字符长度
} PyCompactUnicodeObject;

3.2 字符串驻留机制

// Objects/unicodeobject.c
static PyObject *interned = NULL;  // 驻留字符串字典

void
PyUnicode_InternInPlace(PyObject **p)
{
    PyObject *s = *p;
    if (PyUnicode_READY(s) == -1) {
        return;
    }

    // 检查是否已驻留
    if (PyUnicode_CHECK_INTERNED(s)) {
        return;
    }

    // 添加到驻留字典
    PyObject *t = PyDict_SetDefault(interned, s, s);
    if (t != s) {
        Py_SETREF(*p, Py_NewRef(t));
    }
    PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
}

4. 列表对象内存模型 (Objects/listobject.c)

4.1 列表结构

// Include/cpython/listobject.h
typedef struct {
    PyVarObject ob_base;
    PyObject **ob_item;        // 指向元素数组的指针
    Py_ssize_t allocated;      // 已分配的空间大小
} PyListObject;

4.2 动态扩容机制

// Objects/listobject.c
static int
list_resize(PyListObject *self, Py_ssize_t newsize)
{
    PyObject **items;
    size_t new_allocated, num_allocated_bytes;
    Py_ssize_t allocated = self->allocated;

    // 扩容策略: newsize + (newsize >> 3) + (newsize < 9 ? 3 : 6)
    new_allocated = ((size_t)newsize + (newsize >> 3) + 6) & ~(size_t)3;

    if (newsize == 0) {
        PyMem_FREE(self->ob_item);
        self->ob_item = NULL;
        self->allocated = 0;
        return 0;
    }

    // 重新分配内存
    items = (PyObject **)PyMem_Realloc(self->ob_item,
                                       new_allocated * sizeof(PyObject *));
    if (items == NULL) {
        PyErr_NoMemory();
        return -1;
    }

    self->ob_item = items;
    self->allocated = new_allocated;
    return 0;
}

4.3 内存布局

PyObject:
┌─────────────┬──────────────┬─────────────┐
│ ob_refcnt   │   ob_type    │   data...   │
│   8 bytes   │   8 bytes    │  variable   │
└─────────────┴──────────────┴─────────────┘

1.2 类型对象 PyTypeObject

// Include/cpython/object.h
typedef struct _typeobject {
    PyVarObject ob_base;
    const char *tp_name;             // 类型名称
    Py_ssize_t tp_basicsize;         // 基本大小
    Py_ssize_t tp_itemsize;          // 元素大小

    // 析构和打印
    destructor tp_dealloc;
    Py_ssize_t tp_vectorcall_offset;

    // 标准方法
    getattrfunc tp_getattr;
    setattrfunc tp_setattr;
    PyAsyncMethods *tp_as_async;
    reprfunc tp_repr;

    // 数值方法、序列方法、映射方法
    PyNumberMethods *tp_as_number;
    PySequenceMethods *tp_as_sequence;
    PyMappingMethods *tp_as_mapping;

    // 更多字段...
} PyTypeObject;

2. 整数对象内存模型 (Objects/longobject.c)

2.1 长整数结构

// Include/cpython/longintrepr.h
struct _longobject {
    PyVarObject ob_base;
    digit ob_digit[1];      // 数字位数组
};

typedef struct _longobject PyLongObject;

// digit 是 30 位或 15 位的无符号整数
#if PYLONG_BITS_IN_DIGIT == 30
typedef uint32_t digit;
typedef int32_t sdigit;
typedef uint64_t twodigits;
#elif PYLONG_BITS_IN_DIGIT == 15
typedef unsigned short digit;
typedef short sdigit;
typedef unsigned long twodigits;
#endif

小整数缓存机制:

// Objects/longobject.c
#define IS_SMALL_INT(ival) (-_PY_NSMALLNEGINTS <= (ival) && (ival) < _PY_NSMALLPOSINTS)
#define IS_SMALL_UINT(ival) ((ival) < _PY_NSMALLPOSINTS)
// Objects/longobject.c
#define NSMALLPOSINTS           257
#define NSMALLNEGINTS           5

// 小整数对象池 [-5, 256]
static PyLongObject small_ints[NSMALLNEGINTS + NSMALLPOSINTS];

PyObject *
PyLong_FromLong(long ival)
{
    // 使用小整数缓存
    if (IS_SMALL_INT(ival)) {
        return get_small_int((sdigit)ival);
    }
    // 创建新的长整数对象
    return PyLong_FromLongLong(ival);
}

2.2 内存布局示例

小整数 (42):
┌───────────┬──────────┬────────┬──────────┐
│ ob_refcnt │ ob_type  │ob_size │ ob_digit │
│     ?     │  &Long   │   1    │    42    │
└───────────┴──────────┴────────┴──────────┘

大整数 (2^100):
┌───────────┬──────────┬────────┬─────────────────────┐
│ ob_refcnt │ ob_type  │ob_size │    ob_digit[]       │
│     1     │  &Long   │   4    │ [低位...高位]       │
└───────────┴──────────┴────────┴─────────────────────┘

3. 字符串对象内存模型 (Objects/unicodeobject.c)

3.1 Unicode 对象结构

// Include/cpython/unicodeobject.h
typedef struct {
    PyObject_HEAD
    Py_ssize_t length;          // 字符串长度
    Py_hash_t hash;            // 哈希值缓存
    struct {
        unsigned int interned:2;    // 字符串驻留状态
        unsigned int kind:3;        // 字符串类型
        unsigned int compact:1;     // 是否紧凑存储
        unsigned int ascii:1;       // 是否纯ASCII
        unsigned int ready:1;       // 是否就绪
        unsigned int :24;
    } state;
    wchar_t *wstr;             // 宽字符表示(可选)
} PyASCIIObject;

// 紧凑 Unicode 对象
typedef struct {
    PyASCIIObject _base;
    Py_ssize_t utf8_length;    // UTF-8 长度
    char *utf8;                // UTF-8 缓存
    Py_ssize_t wstr_length;    // 宽字符长度
} PyCompactUnicodeObject;

3.2 字符串驻留机制

// Objects/unicodeobject.c
static PyObject *interned = NULL;  // 驻留字符串字典

void
PyUnicode_InternInPlace(PyObject **p)
{
    PyObject *s = *p;
    if (PyUnicode_READY(s) == -1) {
        return;
    }

    // 检查是否已驻留
    if (PyUnicode_CHECK_INTERNED(s)) {
        return;
    }

    // 添加到驻留字典
    PyObject *t = PyDict_SetDefault(interned, s, s);
    if (t != s) {
        Py_SETREF(*p, Py_NewRef(t));
    }
    PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
}

4. 列表对象内存模型 (Objects/listobject.c)

4.1 列表结构

// Include/cpython/listobject.h
typedef struct {
    PyVarObject ob_base;
    PyObject **ob_item;        // 指向元素数组的指针
    Py_ssize_t allocated;      // 已分配的空间大小
} PyListObject;

4.2 动态扩容机制

// Objects/listobject.c
static int
list_resize(PyListObject *self, Py_ssize_t newsize)
{
    PyObject **items;
    size_t new_allocated, num_allocated_bytes;
    Py_ssize_t allocated = self->allocated;

    // 扩容策略: newsize + (newsize >> 3) + (newsize < 9 ? 3 : 6)
    new_allocated = ((size_t)newsize + (newsize >> 3) + 6) & ~(size_t)3;

    if (newsize == 0) {
        PyMem_FREE(self->ob_item);
        self->ob_item = NULL;
        self->allocated = 0;
        return 0;
    }

    // 重新分配内存
    items = (PyObject **)PyMem_Realloc(self->ob_item,
                                       new_allocated * sizeof(PyObject *));
    if (items == NULL) {
        PyErr_NoMemory();
        return -1;
    }

    self->ob_item = items;
    self->allocated = new_allocated;
    return 0;
}

4.3 内存布局

空列表:
┌───────────┬──────────┬────────┬──────────┬───────────┐
│ ob_refcnt │ ob_type  │ob_size │ ob_item  │ allocated │
│     1     │  &List   │   0    │   NULL   │     0     │
└───────────┴──────────┴────────┴──────────┴───────────┘

包含元素的列表 [1, 2, 3]:
┌───────────┬──────────┬────────┬──────────┬───────────┐
│ ob_refcnt │ ob_type  │ob_size │ ob_item  │ allocated │
│     1     │  &List   │   3    │   ptr    │     4     │
└───────────┴──────────┴────────┴──────────┴───────────┘
                                      │
                                      ▼
                               ┌──────────────────┐
                               │ PyObject *[4]    │
                               │ [0]: &int(1)     │
                               │ [1]: &int(2)     │
                               │ [2]: &int(3)     │
                               │ [3]: NULL        │
                               └──────────────────┘

5. 字典对象内存模型 (Objects/dictobject.c)

5.1 字典结构 (紧凑表示)

// Objects/dictobject.c
struct _dictkeysobject {
    Py_ssize_t dk_refcnt;      // 键对象引用计数
    Py_ssize_t dk_size;        // 哈希表大小
    dict_lookup_func dk_lookup; // 查找函数
    Py_ssize_t dk_usable;      // 可用槽位数
    Py_ssize_t dk_nentries;    // 已使用条目数
    char dk_indices[];         // 索引数组 + 条目数组
};

typedef struct {
    PyObject_HEAD
    Py_ssize_t ma_used;        // 已使用条目数
    uint64_t ma_version_tag;   // 版本标签
    PyDictKeysObject *ma_keys; // 键对象
    PyObject **ma_values;      // 值数组(分离表示)
} PyDictObject;

5.2 哈希冲突解决

// Objects/dictobject.c
static Py_ssize_t
lookdict(PyDictObject *mp, PyObject *key, Py_hash_t hash, PyObject **value_addr)
{
    PyDictKeysObject *dk = mp->ma_keys;
    size_t mask = DK_MASK(dk);
    size_t perturb = hash;
    size_t i = (size_t)hash & mask;

    // 开放寻址法解决冲突
    for (;;) {
        Py_ssize_t ix = dk_get_index(dk, i);
        if (ix == DKIX_EMPTY) {
            *value_addr = NULL;
            return ix;
        }

        PyDictKeyEntry *ep = &DK_ENTRIES(dk)[ix];
        assert(ep->me_key != NULL);

        if (ep->me_key == key) {
            *value_addr = ep->me_value;
            return ix;
        }

        // 扰动探测
        perturb >>= PERTURB_SHIFT;
        i = (i*5 + 1 + perturb) & mask;
    }
}

6. 内存管理机制

6.1 对象分配器 (Objects/obmalloc.c)

// Objects/obmalloc.c

// 内存池结构
struct pool_header {
    union { block *_padding;
            uint count; } ref;          // 引用计数
    block *freeblock;                  // 空闲块链表
    struct pool_header *nextpool;      // 下一个池
    struct pool_header *prevpool;      // 前一个池
    uint arenaindex;                   // 竞技场索引
    uint szidx;                        // 大小索引
    uint nextoffset;                   // 下一个偏移
    uint maxnextoffset;                // 最大偏移
};

// PyObject_Malloc 实现
void *
PyObject_Malloc(size_t size)
{
    if (size > SMALL_REQUEST_THRESHOLD) {
        return PyMem_RawMalloc(size);
    }

    // 使用对象分配器
    uint pool_idx = size_to_pool_idx(size);
    pool_header *pool = usedpools[pool_idx];

    if (pool != NULL) {
        // 从现有池分配
        block *bp = pool->freeblock;
        if (bp != NULL) {
            pool->freeblock = *(block **)bp;
            return (void *)bp;
        }
    }

    // 创建新池或使用系统分配器
    return allocate_from_new_pool(size);
}

6.2 垃圾回收机制 (Modules/gcmodule.c)

// Modules/gcmodule.c

// GC 头结构
typedef union _gc_head {
    struct {
        union _gc_head *gc_next;
        union _gc_head *gc_prev;
        Py_ssize_t gc_refs;
    } gc;
    double dummy;  // 对齐
} PyGC_Head;

// 垃圾回收主函数
static Py_ssize_t
gc_collect_main(PyThreadState *tstate, int generation, Py_ssize_t *n_collected, Py_ssize_t *n_uncollectable, int nofail)
{
    Py_ssize_t m = 0; // 收集的对象数
    Py_ssize_t n = 0; // 无法收集的对象数
    PyGC_Head *young; // 年轻代
    PyGC_Head *old; // 老年代
    PyGC_Head unreachable; // 不可达对象
    PyGC_Head finalizers; // 终结器对象

    // 标记-清扫算法
    // 1. 将引用计数复制到gc_refs
    update_refs(young);

    // 2. 减少内部引用
    subtract_refs(young);

    // 3. 标记可达对象
    move_unreachable(young, &unreachable);

    // 4. 处理终结器
    move_legacy_finalizers(&unreachable, &finalizers);

    // 5. 删除不可达对象
    delete_garbage(tstate, &unreachable, generation);

    return n+m;
}
// Modules/gcmodule.c 详细展开
static Py_ssize_t
gc_collect_main(PyThreadState *tstate, int generation,
                Py_ssize_t *n_collected, Py_ssize_t *n_uncollectable,
                int nofail)
{
    int i;
    Py_ssize_t m = 0; /* # objects collected */
    Py_ssize_t n = 0; /* # unreachable objects that couldn't be collected */
    PyGC_Head *young; /* the generation we are examining */
    PyGC_Head *old; /* next older generation */
    PyGC_Head unreachable; /* non-problematic unreachable trash */
    PyGC_Head finalizers;  /* objects with, & reachable from, __del__ */
    PyGC_Head *gc;
    _PyTime_t t1 = 0;   /* initialize to prevent a compiler warning */
    GCState *gcstate = &tstate->interp->gc;

    // gc_collect_main() must not be called before _PyGC_Init
    // or after _PyGC_Fini()
    assert(gcstate->garbage != NULL);
    assert(!_PyErr_Occurred(tstate));

    if (gcstate->debug & DEBUG_STATS) {
        PySys_WriteStderr("gc: collecting generation %d...\n", generation);
        show_stats_each_generations(gcstate);
        t1 = _PyTime_GetPerfCounter();
    }

    if (PyDTrace_GC_START_ENABLED())
        PyDTrace_GC_START(generation);

        **//重置当前代及更年轻代的计数器,增加下一代的计数器,之所以重制当前代的计数器,是为了完成垃圾回收后,当前的计数器还处于较高的峰值,再次触发回收,给下一代增加1 ,这个很好理解,为了出发老年代的对象**
        
    /* update collection and allocation counters */
    if (generation+1 < NUM_GENERATIONS)
        gcstate->generations[generation+1].count += 1;
    for (i = 0; i <= generation; i++)
        gcstate->generations[i].count = 0;

        **//这是分代收集的关键:收集第N代时,会同时收集所有更年轻的代(0到N-1代)。**
    /* merge younger generations with one we are currently collecting */
    for (i = 0; i < generation; i++) {
        gc_list_merge(GEN_HEAD(gcstate, i), GEN_HEAD(gcstate, generation));
    }

        **//young: 指向当前正在收集的代
        //old: 指向下一个更老的代(或在特殊情况下指向当前代)**
    /* handy references */
    young = GEN_HEAD(gcstate, generation);
    if (generation < NUM_GENERATIONS-1)
        old = GEN_HEAD(gcstate, generation+1);
    else
        old = young;
    validate_list(old, collecting_clear_unreachable_clear);

        **//将不可达的对象封装到unreachable链表里,可达对象保留在young链表中,分离不可达对象(move_unreachable): 将`gc_refs`为0的对象移到不可达链表,大于0的对象留在可达链表。**
    deduce_unreachable(young, &unreachable);

    untrack_tuples(young);
    /* Move reachable objects to next generation. */
    if (young != old) {
        if (generation == NUM_GENERATIONS - 2) {
            gcstate->long_lived_pending += gc_list_size(young);
        }
        gc_list_merge(young, old);
    }
    else {
        /* We only un-track dicts in full collections, to avoid quadratic
           dict build-up. See issue #14775. */
        untrack_dicts(young);
        gcstate->long_lived_pending = 0;
        gcstate->long_lived_total = gc_list_size(young);
    }

        **//创建一个空的链表用于存放有终结器的对象
        //这个列表将包含所有不能立即删除的"问题"对象**
    /* All objects in unreachable are trash, but objects reachable from
     * legacy finalizers (e.g. tp_del) can't safely be deleted.
     */
    gc_list_init(&finalizers);
    // NEXT_MASK_UNREACHABLE is cleared here.
    // After move_legacy_finalizers(), unreachable is normal list.
    
        //   从unreachable列表中找出所有有遗留终结器的对象,移动到finalizers列表
        //具体过程:
        //	遍历unreachable列表中的每个对象
        //	检查对象是否有__del__方法或tp_del函数
        //	如果有,将该对象从unreachable移动到finalizers
        //	同时清除对象的NEXT_MASK_UNREACHABLE标记

    move_legacy_finalizers(&unreachable, &finalizers);
    
    **//找出从终结器对象可达的其他对象,也移动到finalizers列表**
    /* finalizers contains the unreachable objects with a legacy finalizer;
     * unreachable objects reachable *from* those are also uncollectable,
     * and we move those into the finalizers list too.
     */
    move_legacy_finalizer_reachable(&finalizers);

    validate_list(&finalizers, collecting_clear_unreachable_clear);
    validate_list(&unreachable, collecting_set_unreachable_clear);

    /* Print debugging information. */
    if (gcstate->debug & DEBUG_COLLECTABLE) {
        for (gc = GC_NEXT(&unreachable); gc != &unreachable; gc = GC_NEXT(gc)) {
            debug_cycle("collectable", FROM_GC(gc));
        }
    }

        **//清理弱引用并调用相关回调函数。**
    /* Clear weakrefs and invoke callbacks as necessary. */
    m += handle_weakrefs(&unreachable, old);

    validate_list(old, collecting_clear_unreachable_clear);
    validate_list(&unreachable, collecting_set_unreachable_clear);

        **//调用对象的tp_finalize方法**
    /* Call tp_finalize on objects which have one. */
    finalize_garbage(tstate, &unreachable);

    /* Handle any objects that may have resurrected after the call
     * to 'finalize_garbage' and continue the collection with the
     * objects that are still unreachable */
    PyGC_Head final_unreachable;
    handle_resurrected_objects(&unreachable, &final_unreachable, old);

    /* Call tp_clear on objects in the final_unreachable set.  This will cause
    * the reference cycles to be broken.  It may also cause some objects
    * in finalizers to be freed.
    */
    m += gc_list_size(&final_unreachable);
    delete_garbage(tstate, gcstate, &final_unreachable, old);

    /* Collect statistics on uncollectable objects found and print
     * debugging information. */
    for (gc = GC_NEXT(&finalizers); gc != &finalizers; gc = GC_NEXT(gc)) {
        n++;
        if (gcstate->debug & DEBUG_UNCOLLECTABLE)
            debug_cycle("uncollectable", FROM_GC(gc));
    }
    if (gcstate->debug & DEBUG_STATS) {
        double d = _PyTime_AsSecondsDouble(_PyTime_GetPerfCounter() - t1);
        PySys_WriteStderr(
            "gc: done, %zd unreachable, %zd uncollectable, %.4fs elapsed\n",
            n+m, n, d);
    }

    /* Append instances in the uncollectable set to a Python
     * reachable list of garbage.  The programmer has to deal with
     * this if they insist on creating this type of structure.
     */
    handle_legacy_finalizers(tstate, gcstate, &finalizers, old);
    validate_list(old, collecting_clear_unreachable_clear);

    /* Clear free list only during the collection of the highest
     * generation */
    if (generation == NUM_GENERATIONS-1) {
        clear_freelists(tstate->interp);
    }

    if (_PyErr_Occurred(tstate)) {
        if (nofail) {
            _PyErr_Clear(tstate);
        }
        else {
            _PyErr_WriteUnraisableMsg("in garbage collection", NULL);
        }
    }

    /* Update stats */
    if (n_collected) {
        *n_collected = m;
    }
    if (n_uncollectable) {
        *n_uncollectable = n;
    }

    struct gc_generation_stats *stats = &gcstate->generation_stats[generation];
    stats->collections++;
    stats->collected += m;
    stats->uncollectable += n;

    if (PyDTrace_GC_DONE_ENABLED()) {
        PyDTrace_GC_DONE(n + m);
    }

    assert(!_PyErr_Occurred(tstate));
    return n + m;
}
move_legacy_finalizers(&unreachable, &finalizers)作用

功能:找出从终结器对象可达的其他对象,也移动到finalizers列表
为什么需要这步?
Container:
    def __del__(self):
        # 可能访问self.data
        print(f"删除容器: {self.data}")

class Data:
    pass

# 创建循环引用
container = Container()
data = Data()
container.data = data
data.container = container
在这个例子中:

Container有__del__方法,会被放入finalizers
Data对象虽然没有终结器,但从Container可达
如果先删除Data,Container.__del__执行时会出错
因此Data也必须被标记为不可收集

执行过程:

从finalizers中的每个对象开始
通过BFS/DFS遍历所有可达对象
将这些可达对象也移动到finalizers列表
确保终结器执行时不会访问到已删除的对象

6.2.2. finalize_garbage方法

finalize_garbage(PyThreadState *tstate, PyGC_Head *collectable)
{
    destructor finalize;
    PyGC_Head seen;

    /* While we're going through the loop, `finalize(op)` may cause op, or
     * other objects, to be reclaimed via refcounts falling to zero.  So
     * there's little we can rely on about the structure of the input
     * `collectable` list across iterations.  For safety, we always take the
     * first object in that list and move it to a temporary `seen` list.
     * If objects vanish from the `collectable` and `seen` lists we don't
     * care.
     */
    gc_list_init(&seen);

    while (!gc_list_is_empty(collectable)) {
        PyGC_Head *gc = GC_NEXT(collectable);
        PyObject *op = FROM_GC(gc);
        gc_list_move(gc, &seen);
        if (!_PyGCHead_FINALIZED(gc) &&
                (finalize = Py_TYPE(op)->tp_finalize) != NULL) {
            _PyGCHead_SET_FINALIZED(gc);
            Py_INCREF(op);
            finalize(op);
            assert(!_PyErr_Occurred(tstate));
            Py_DECREF(op);
        }
    }
    gc_list_merge(&seen, collectable);
}
(Objects/typeobject.c)

FLSLOT(__init__, tp_init, slot_tp_init, (wrapperfunc)(void(*)(void))wrap_init,
           "__init__($self, /, *args, **kwargs)\n--\n\n"
           "Initialize self.  See help(type(self)) for accurate signature.",
           PyWrapperFlag_KEYWORDS),
    TPSLOT(__new__, tp_new, slot_tp_new, NULL,
           "__new__(type, /, *args, **kwargs)\n--\n\n"
           "Create and return new object.  See help(type) for accurate signature."),
    **TPSLOT(__del__, tp_finalize, slot_tp_finalize, (wrapperfunc)wrap_del, ""),**

    BUFSLOT(__buffer__, bf_getbuffer, slot_bf_getbuffer, wrap_buffer,
            "__buffer__($self, flags, /)\n--\n\n"
            "Return a buffer object that exposes the underlying memory of the object."),
    BUFSLOT(__release_buffer__, bf_releasebuffer, slot_bf_releasebuffer, wrap_releasebuffer,
            "__release_buffer__($self, buffer, /)\n--\n\n"
            "Release the buffer object that exposes the underlying memory of the object."),
                     
......

static void
slot_tp_finalize(PyObject *self)
{
    int unbound;
    PyObject *del, *res;

    /* Save the current exception, if any. */
    PyObject *exc = PyErr_GetRaisedException();

    /* Execute __del__ method, if any. */
    del = lookup_maybe_method(self, &_Py_ID(__del__), &unbound);
    if (del != NULL) {
        res = call_unbound_noarg(unbound, del, self);
        if (res == NULL)
            PyErr_WriteUnraisable(del);
        else
            Py_DECREF(res);
        Py_DECREF(del);
    }

    /* Restore the saved exception. */
    PyErr_SetRaisedException(exc);
}

// 调用del方法

lookup_maybe_method(PyObject *self, PyObject *attr, int *unbound)
{
    PyObject *res = _PyType_Lookup(Py_TYPE(self), attr);
    if (res == NULL) {
        return NULL;
    }

    if (_PyType_HasFeature(Py_TYPE(res), Py_TPFLAGS_METHOD_DESCRIPTOR)) {
        /* Avoid temporary PyMethodObject */
        *unbound = 1;
        Py_INCREF(res);
    }
    else {
        *unbound = 0;
        descrgetfunc f = Py_TYPE(res)->tp_descr_get;
        if (f == NULL) {
            Py_INCREF(res);
        }
        else {
            res = f(res, self, (PyObject *)(Py_TYPE(self)));
        }
    }
    return res;
}
(Objects/typeobject.c)

FLSLOT(__init__, tp_init, slot_tp_init, (wrapperfunc)(void(*)(void))wrap_init,
           "__init__($self, /, *args, **kwargs)\n--\n\n"
           "Initialize self.  See help(type(self)) for accurate signature.",
           PyWrapperFlag_KEYWORDS),
    TPSLOT(__new__, tp_new, slot_tp_new, NULL,
           "__new__(type, /, *args, **kwargs)\n--\n\n"
           "Create and return new object.  See help(type) for accurate signature."),
    **TPSLOT(__del__, tp_finalize, slot_tp_finalize, (wrapperfunc)wrap_del, ""),**

    BUFSLOT(__buffer__, bf_getbuffer, slot_bf_getbuffer, wrap_buffer,
            "__buffer__($self, flags, /)\n--\n\n"
            "Return a buffer object that exposes the underlying memory of the object."),
    BUFSLOT(__release_buffer__, bf_releasebuffer, slot_bf_releasebuffer, wrap_releasebuffer,
            "__release_buffer__($self, buffer, /)\n--\n\n"
            "Release the buffer object that exposes the underlying memory of the object."),
                     
......

static void
slot_tp_finalize(PyObject *self)
{
    int unbound;
    PyObject *del, *res;

    /* Save the current exception, if any. */
    PyObject *exc = PyErr_GetRaisedException();

    /* Execute __del__ method, if any. */
    del = lookup_maybe_method(self, &_Py_ID(__del__), &unbound);
    if (del != NULL) {
        res = call_unbound_noarg(unbound, del, self);
        if (res == NULL)
            PyErr_WriteUnraisable(del);
        else
            Py_DECREF(res);
        Py_DECREF(del);
    }

    /* Restore the saved exception. */
    PyErr_SetRaisedException(exc);
}

// 调用del方法

lookup_maybe_method(PyObject *self, PyObject *attr, int *unbound)
{
    PyObject *res = _PyType_Lookup(Py_TYPE(self), attr);
    if (res == NULL) {
        return NULL;
    }

    if (_PyType_HasFeature(Py_TYPE(res), Py_TPFLAGS_METHOD_DESCRIPTOR)) {
        /* Avoid temporary PyMethodObject */
        *unbound = 1;
        Py_INCREF(res);
    }
    else {
        *unbound = 0;
        descrgetfunc f = Py_TYPE(res)->tp_descr_get;
        if (f == NULL) {
            Py_INCREF(res);
        }
        else {
            res = f(res, self, (PyObject *)(Py_TYPE(self)));
        }
    }
    return res;
}

在 CPython 中,slotdefs 数组用于将 Python 的特殊方法(如 __getitem____add__ 等)映射到类型对象(PyTypeObject)的相应槽位(slots)。注册过程发生在类型创建时,具体步骤如下:

/* 伪代码:简化版类型创建流程 */
PyObject* type_new(...) {
    // 1. 创建类型对象
    PyTypeObject *type = ...;
    
    // 2. 遍历 slotdefs 数组
    for (PySlotDef *slot = slotdefs; slot->name; slot++) {
        // 3. 在类字典中查找特殊方法
        PyObject *method = _PyDict_GetItemId(class_dict, slot->name);
        if (method) {
            // 4. 将方法绑定到槽位
            int res = _PyType_SetSlotFromSpec(type, slot, method);
            // 处理错误...
        }
    }
}
/* Cpython里实际代码 */
/* Update the slots after assignment to a class (type) attribute. */
static int
update_slot(PyTypeObject *type, PyObject *name)
{
    pytype_slotdef *ptrs[MAX_EQUIV];
    pytype_slotdef *p;
    pytype_slotdef **pp;
    int offset;

    assert(PyUnicode_CheckExact(name));
    assert(PyUnicode_CHECK_INTERNED(name));

    pp = ptrs;
    for (p = slotdefs; p->name; p++) {
        assert(PyUnicode_CheckExact(p->name_strobj));
        assert(PyUnicode_CHECK_INTERNED(p->name_strobj));
        assert(PyUnicode_CheckExact(name));
        /* bpo-40521: Using interned strings. */
        if (p->name_strobj == name) {
            *pp++ = p;
        }
    }
    *pp = NULL;
    for (pp = ptrs; *pp; pp++) {
        p = *pp;
        offset = p->offset;
        while (p > slotdefs && (p-1)->offset == offset)
            --p;
        *pp = p;
    }
    if (ptrs[0] == NULL)
        return 0; /* Not an attribute that affects any slots */
    return update_subclasses(type, name,
                             update_slots_callback, (void *)ptrs);
}
  • 核心函数,负责:

    1. 验证方法签名

    2. 使用包装函数 将 Python 方法转为 C 可调用形式

    3. 将函数指针赋给类型槽位(如 type->tp_as_sequence->sq_item

  • 包装函数的作用(如 wrap_binaryfunc

    将 Python 层面的函数调用适配到底层 C 接口:

static PyObject* wrap_binaryfunc(PyObject *self, PyObject *args) {
    // 解包参数,调用用户定义的 __getitem__ 等
}

CPython 通过 slotdefs 数组在类型创建阶段自动完成特殊方法到槽位的注册:

  1. 遍历 slotdefs 匹配类字典中的方法

  2. 使用包装函数桥接 Python/C 接口

  3. 填充类型对象的槽位函数指针

  4. 为未定义的方法设置默认实现

这一机制使得 Python 层面的特殊方法能直接影响对象在解释器中的底层行为,是 CPython 实现面向对象特性的核心机制。

6.2.3 可达对象如何决策的

update_refs(PyGC_Head *containers)
{
    PyGC_Head *next;
    PyGC_Head *gc = GC_NEXT(containers);

    while (gc != containers) {
        next = GC_NEXT(gc);
        /* Move any object that might have become immortal to the
         * permanent generation as the reference count is not accurately
         * reflecting the actual number of live references to this object
         */
        if (_Py_IsImmortal(FROM_GC(gc))) {
           gc_list_move(gc, &get_gc_state()->permanent_generation.head);
           gc = next;
           continue;
        }
        gc_reset_refs(gc, Py_REFCNT(FROM_GC(gc)));
        /* Python's cyclic gc should never see an incoming refcount
         * of 0:  if something decref'ed to 0, it should have been
         * deallocated immediately at that time.
         * Possible cause (if the assert triggers):  a tp_dealloc
         * routine left a gc-aware object tracked during its teardown
         * phase, and did something-- or allowed something to happen --
         * that called back into Python.  gc can trigger then, and may
         * see the still-tracked dying object.  Before this assert
         * was added, such mistakes went on to allow gc to try to
         * delete the object again.  In a debug build, that caused
         * a mysterious segfault, when _Py_ForgetReference tried
         * to remove the object from the doubly-linked list of all
         * objects a second time.  In a release build, an actual
         * double deallocation occurred, which leads to corruption
         * of the allocator's internal bookkeeping pointers.  That's
         * so serious that maybe this should be a release-build
         * check instead of an assert?
         */
        _PyObject_ASSERT(FROM_GC(gc), gc_get_refs(gc) != 0);
        gc = next;
    }
}

将每个对象的 gc_refs 初始化为其 ob_refcnt(原始引用计数)。ob_refcnt 包含了来自 所有来源 的引用,包括:

栈上的变量(函数局部变量)
全局/静态变量
其他代(非当前回收代)的对象
非容器对象(如整数、字符串等)
这些来源共同构成了 GC 的根。
subtract_refs(base):
遍历当前代(base 链表)的对象,对每个对象引用的 其他同代对象,减少其 gc_refs。这相当于减去了 同代对象间的内部引用。
结果:
若对象最终 gc_refs > 0:表示存在 来自根(外部)的引用。

若 gc_refs = 0:对象仅被同代对象引用(形成孤岛),判定为不可达。
  • GC 的根roots 包括:

    • 栈上的变量(局部变量)

    • 全局/静态变量

    • 其他代的对象

    • 非容器对象

  • 这些根通过 ob_refcnt 的初始值 在 update_refs 中间接捕获,并在 subtract_refs 后通过 gc_refs > 0 体现。显式的根扫描(如遍历栈、全局区)发生在 GC 的其他阶段(如分代回收触发前)。

7. 引用计数机制

7.1 引用计数宏定义

// Include/object.h

// 增加引用计数
static inline void _Py_INCREF(PyObject *op)
{
#ifdef Py_REF_DEBUG
    _Py_RefTotal++;
#endif
    op->ob_refcnt++;
}

// 减少引用计数
static inline void _Py_DECREF(PyObject *op)
{
#ifdef Py_REF_DEBUG
    _Py_RefTotal--;
#endif
    if (--op->ob_refcnt != 0) {
        return;
    }
    _Py_Dealloc(op);  // 引用计数为0时销毁对象
}

#define Py_INCREF(op) _Py_INCREF(_PyObject_CAST(op))
#define Py_DECREF(op) _Py_DECREF(_PyObject_CAST(op))

7.2 对象销毁机制

// Objects/object.c
void
_Py_Dealloc(PyObject *op)
{
    destructor dealloc = Py_TYPE(op)->tp_dealloc;

#ifdef Py_TRACE_REFS
    _Py_ForgetReference(op);
#endif

    // 调用类型特定的析构函数
    (*dealloc)(op);
}

8. 内存布局总结

8.1 对象内存开销

对象类型 固定开销 可变部分 总大小示例
int (小) 28 bytes 0 28 bytes
int (大) 28 bytes 4*n bytes 28+4n bytes
str (ASCII) 49 bytes 1*n bytes 49+n bytes
str (Unicode) 80 bytes 4*n bytes 80+4n bytes
list 56 bytes 8*cap bytes 56+8*cap bytes
dict 240 bytes 24*n bytes 240+24n bytes

 

8.2 内存对齐和填充

// 内存对齐宏
#define _PyObject_SIZE(typeobj) ( (typeobj)->tp_basicsize )
#define _PyObject_VAR_SIZE(typeobj, nitems) \
    (size_t) \
    ( ( (typeobj)->tp_basicsize + \
        (nitems)*(typeobj)->tp_itemsize + \
        (SIZEOF_VOID_P-1) \
      ) & ~(SIZEOF_VOID_P-1) \
    )

9 执行流程

graph TD
    A[开始回收] --> B[合并年轻代]
    B --> C[标记可达对象]
    C --> D[分离不可达对象]
    D --> E[处理终结器对象]
    E --> F[处理弱引用]
    F --> G[执行__del__方法]
    G --> H[检查复活对象]
    H --> I[回收最终不可达对象]
    I --> J[处理不可回收对象]
    J --> K[更新统计信息]
    K --> L[结束]