/*
 * The pg_data_t structure is used in machines with CONFIG_DISCONTIGMEM
 * (mostly NUMA machines?) to denote a higher-level memory zone than the
 * zone denotes.
 *
 * On NUMA machines, each NUMA node would have a pg_data_t to describe
 * its memory layout.
 *
 * Memory statistics and page replacement data structures are maintained on a
 * per-zone basis.
 */
struct bootmem_data;
typedef struct pglist_data {
        /* The zones of this node; the possible zone types are listed in zone_type */
        struct zone node_zones[MAX_NR_ZONES];
        /* Fallback list of nodes and their zones, used to allocate memory from
         * another node when the current node has no free space left */
        struct zonelist node_zonelists[MAX_ZONELISTS];
        int nr_zones;                   /* number of different memory zones in this node */
#ifdef CONFIG_FLAT_NODE_MEM_MAP /* means !SPARSEMEM */
        struct page *node_mem_map;      /* pointer to the array of struct page
                                         * instances describing every physical
                                         * page of the node, across all of its
                                         * zones */
#ifdef CONFIG_PAGE_EXTENSION
        struct page_ext *node_page_ext;
#endif
#endif
#ifndef CONFIG_NO_BOOTMEM
        /* During early boot, before the memory-management subsystem is
         * initialised, the kernel already needs memory (and must also reserve
         * memory for initialising that subsystem).  The boot memory allocator
         * is used for this phase, and this structure describes its state. */
        struct bootmem_data *bdata;
#endif
#ifdef CONFIG_MEMORY_HOTPLUG
        /*
         * Must be held any time you expect node_start_pfn, node_present_pages
         * or node_spanned_pages stay constant.  Holding this will also
         * guarantee that any pfn_valid() stays that way.
         *
         * pgdat_resize_lock() and pgdat_resize_unlock() are provided to
         * manipulate node_size_lock without checking for CONFIG_MEMORY_HOTPLUG.
         *
         * Nests above zone->lock and zone->span_seqlock
         *
         * With memory hotplug enabled, this lock protects the node-size
         * related fields of this structure; take it around code that uses
         * node_start_pfn, node_present_pages or node_spanned_pages.
         */
        spinlock_t node_size_lock;
#endif
        /* First page frame number of the node, i.e. its offset into the global
         * mem_map.  All page frames in the system are numbered consecutively,
         * and every frame number is globally unique, not just unique within
         * the node. */
        unsigned long node_start_pfn;
        unsigned long node_present_pages; /* total number of physical pages
                                           * (page frames) in the node */
        unsigned long node_spanned_pages; /* total size of the physical page
                                           * range of the node, in page frames,
                                           * including holes */
        int node_id;                    /* global node ID; NUMA nodes are numbered from 0 */
        wait_queue_head_t kswapd_wait;  /* wait queue of the swap daemon, used
                                         * when page frames are swapped out of
                                         * the node; covered in detail in a
                                         * later article */
        wait_queue_head_t pfmemalloc_wait;
        struct task_struct *kswapd;     /* Protected by mem_hotplug_begin/end();
                                         * task_struct of the kswapd daemon
                                         * responsible for this node */
        int kswapd_max_order;           /* size (order) of the area to be freed */
        enum zone_type classzone_idx;
#ifdef CONFIG_COMPACTION
        int kcompactd_max_order;
        enum zone_type kcompactd_classzone_idx;
        wait_queue_head_t kcompactd_wait;
        struct task_struct *kcompactd;
#endif
#ifdef CONFIG_NUMA_BALANCING
        /* Rate limiting time interval */
        unsigned long numabalancing_migrate_next_window;
        /* Number of pages migrated during the rate limiting time interval */
        unsigned long numabalancing_migrate_nr_pages;
#endif
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
        /*
         * If memory initialisation on large machines is deferred then this
         * is the first PFN that needs to be initialised.
         */
        unsigned long first_deferred_pfn;
#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
} pg_data_t;
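node_start_pfn and node_mem_map together let the kernel translate between a page
frame number and its struct page on a node that uses a flat memory map. The helper
below is a minimal sketch of that relationship, assuming CONFIG_FLAT_NODE_MEM_MAP;
it is not the kernel's real pfn_to_page(), and the function name is made up for
illustration.

/* Minimal sketch (hypothetical helper, not the kernel's pfn_to_page()):
 * node_mem_map[0] describes the frame at node_start_pfn, and the node's
 * frames are numbered consecutively from there. */
static inline struct page *node_pfn_to_page(struct pglist_data *pgdat,
                                            unsigned long pfn)
{
        return pgdat->node_mem_map + (pfn - pgdat->node_start_pfn);
}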
struct zone {
        /* zone watermarks, access with *_wmark_pages(zone) macros */
        unsigned long watermark[NR_WMARK];
        unsigned long nr_reserved_highatomic;
        /*
         * We don't know if the memory that we're going to allocate will be
         * freeable or/and it will be released eventually, so to avoid totally
         * wasting several GB of ram we must reserve some of the lower zone
         * memory (otherwise we risk to run OOM on the lower zones despite
         * there being tons of freeable ram on the higher zones). This array is
         * recalculated at runtime if the sysctl_lowmem_reserve_ratio sysctl
         * changes.
         *
         * Reserves a number of pages in each zone for critical allocations
         * that must not fail under any circumstances.
         */
        long lowmem_reserve[MAX_NR_ZONES];
#ifdef CONFIG_NUMA
        int node;
#endif
        /*
         * The target ratio of ACTIVE_ANON to INACTIVE_ANON pages on
         * this zone's LRU. Maintained by the pageout code.
         *
         * Ratio of inactive pages. The fields that follow are rarely used or
         * mostly read-only: wait_table, wait_table_hash_nr_entries and
         * wait_table_bits form wait queues on which processes can wait for a
         * page to become available.
         */
        unsigned int inactive_ratio;
        /*
         * This is a per-zone reserve of pages that are not available
         * to userspace allocations.
         */
        unsigned long totalreserve_pages;
#ifndef CONFIG_SPARSEMEM
        /*
         * Flags for a pageblock_nr_pages block. See pageblock-flags.h.
         * In SPARSEMEM, this map is stored in struct mem_section
         */
        unsigned long *pageblock_flags;
#endif /* CONFIG_SPARSEMEM */
#ifdef CONFIG_NUMA
        /*
         * zone reclaim becomes active if more unmapped pages exist.
         */
        unsigned long min_unmapped_pages;
        unsigned long min_slab_pages;
#endif /* CONFIG_NUMA */
        /*
         * spanned_pages is the total pages spanned by the zone, including
         * holes, which is calculated as:
         *      spanned_pages = zone_end_pfn - zone_start_pfn;
         *
         * present_pages is physical pages existing within the zone, which
         * is calculated as:
         *      present_pages = spanned_pages - absent_pages(pages in holes);
         *
         * managed_pages is present pages managed by the buddy system, which
         * is calculated as (reserved_pages includes pages allocated by the
         * bootmem allocator):
         *      managed_pages = present_pages - reserved_pages;
         *
         * So present_pages may be used by memory hotplug or memory power
         * management logic to figure out unmanaged pages by checking
         * (present_pages - managed_pages). And managed_pages should be used
         * by page allocator and vm scanner to calculate all kinds of watermarks
         * and thresholds.
         *
         * Locking rules:
         *
         * zone_start_pfn and spanned_pages are protected by span_seqlock.
         * It is a seqlock because it has to be read outside of zone->lock,
         * and it is done in the main allocator path. But, it is written
         * quite infrequently.
         *
         * The span_seq lock is declared along with zone->lock because it is
         * frequently read in proximity to zone->lock. It's good to
         * give them a chance of being in the same cacheline.
         *
         * Write access to present_pages at runtime should be protected by
         * mem_hotplug_begin/end(). Any reader who can't tolerate drift of
         * present_pages should get_online_mems() to get a stable value.
         *
         * Read access to managed_pages should be safe because it's unsigned
         * long. Write access to zone->managed_pages and totalram_pages are
         * protected by managed_page_count_lock at runtime. Ideally only
         * adjust_managed_page_count() should be used instead of directly
         * touching zone->managed_pages and totalram_pages.
         */
        unsigned long managed_pages;
        unsigned long spanned_pages;    /* total pages spanned by the zone, including holes */
        unsigned long present_pages;    /* pages actually present, excluding holes */
#ifdef CONFIG_MEMORY_ISOLATION
        /*
         * Number of isolated pageblock. It is used to solve incorrect
         * freepage counting problem due to racy retrieving migratetype
         * of pageblock. Protected by zone->lock.
         */
        unsigned long nr_isolate_pageblock;
#endif
#ifdef CONFIG_MEMORY_HOTPLUG
        /* see spanned/present_pages for more description */
        seqlock_t span_seqlock;
#endif
        /*
         * wait_table           -- the array holding the hash table
         * wait_table_hash_nr_entries  -- the size of the hash table array
         * wait_table_bits      -- wait_table_size == (1 << wait_table_bits)
         *
         * The purpose of all these is to keep track of the people
         * waiting for a page to become available and make them
         * runnable again when possible. The trouble is that this
         * consumes a lot of space, especially when so few things
         * wait on pages at a given time. So instead of using
         * per-page waitqueues, we use a waitqueue hash table.
         *
         * The bucket discipline is to sleep on the same queue when
         * colliding and wake all in that wait queue when removing.
         * When something wakes, it must check to be sure its page is
         * truly available, a la thundering herd. The cost of a
         * collision is great, but given the expected load of the
         * table, they should be so rare as to be outweighed by the
         * benefits from the saved space.
         *
         * __wait_on_page_locked() and unlock_page() in mm/filemap.c, are the
         * primary users of these fields, and in mm/page_alloc.c
         * free_area_init_core() performs the initialization of them.
         */
        /* hash table of wait queues for processes waiting on a page of this zone */
        wait_queue_head_t *wait_table;
        /* number of entries (waiters) in the wait-queue hash table */
        unsigned long wait_table_hash_nr_entries;
        /* size order of the hash table array: size == 2^wait_table_bits */
        unsigned long wait_table_bits;
        ZONE_PADDING(_pad1_)
        /* free areas of different sizes, used by the buddy system: each array
         * element heads the free lists of blocks of the corresponding order.
         *
         * The fields below are accessed by the page reclaim scanner, which
         * classifies the pages used in this zone by their activity: a
         * frequently referenced frame counts as active, otherwise as inactive.
         * That information matters when page frames have to be swapped out. */
        struct free_area free_area[MAX_ORDER];
        /* zone flags: describe the current state of the zone,
         * see the enum zone_flags below */
        unsigned long flags;
        /* Write-intensive fields used from the page allocator;
         * spinlock protecting this zone descriptor */
        spinlock_t lock;
        ZONE_PADDING(_pad2_)

        /* Write-intensive fields used by page reclaim */
        /* Fields commonly accessed by the page reclaim scanner */
        spinlock_t lru_lock;    /* spinlock protecting the LRU (least recently
                                 * used) active and inactive lists */
        struct lruvec lruvec;
        /*
         * When free pages are below this point, additional steps are taken
         * when reading the number of free pages to avoid per-cpu counter
         * drift allowing watermarks to be breached.
         */
        unsigned long percpu_drift_mark;
#if defined CONFIG_COMPACTION || defined CONFIG_CMA
        /* pfn where compaction free scanner should start */
        unsigned long compact_cached_free_pfn;
        /* pfn where async and sync compaction migration scanner should start */
        unsigned long compact_cached_migrate_pfn[2];
#endif
#ifdef CONFIG_COMPACTION
        /*
         * On compaction failure, 1<<compact_defer_shift compactions
         * are skipped before trying again. The number attempted since
         * last failure is tracked with compact_considered.
         */
        unsigned int compact_considered;
        unsigned int compact_defer_shift;
        int compact_order_failed;
#endif
#if defined CONFIG_COMPACTION || defined CONFIG_CMA
        /* Set to true when the PG_migrate_skip bits should be cleared */
        bool compact_blockskip_flush;
#endif
} ____cacheline_internodealigned_in_smp;
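The interplay of watermark[] and lowmem_reserve[] becomes clearer with a small
example. The function below is a simplified sketch, loosely modelled on the
kernel's watermark check but omitting the high-order and ALLOC_* handling; the
function name and the free_pages parameter are illustrative only.

/* Simplified sketch of a zone watermark check (not the kernel's actual code). */
static bool zone_watermark_ok_sketch(struct zone *z, unsigned long free_pages,
                                     unsigned long mark,
                                     enum zone_type classzone_idx)
{
        /* lowmem_reserve[classzone_idx] keeps a cushion of pages in this
         * (lower) zone for requests that can only be satisfied here, so
         * that fallbacks from higher zones cannot exhaust it. */
        return free_pages > mark + z->lowmem_reserve[classzone_idx];
}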
/*
 * Each physical page in the system has a struct page associated with
 * it to keep track of whatever it is we are using the page for at the
 * moment. Note that we have no way to track which tasks are using
 * a page, though if it is a pagecache page, rmap structures can tell us
 * who is mapping it.
 *
 * The objects in struct page are organized in double word blocks in
 * order to allow us to use atomic double word operations on portions
 * of struct page. That is currently only used by slub but the arrangement
 * allows the use of atomic double word operations on the flags/mapping
 * and lru list pointers also.
 */
struct page {
        /* First double word block */
        unsigned long flags;            /* Atomic flags, some possibly
                                         * updated asynchronously; describe
                                         * the state of the page and carry
                                         * further information */
        union {
                struct address_space *mapping;  /* If low bit clear, points to
                                                 * inode address_space, or NULL.
                                                 * If page mapped as anonymous
                                                 * memory, low bit is set, and
                                                 * it points to anon_vma object:
                                                 * see PAGE_MAPPING_ANON below.
                                                 */
                void *s_mem;                    /* slab first object (moved into
                                                 * struct slab in newer kernels) */
                atomic_t compound_mapcount;     /* first tail page */
                /* page_deferred_list().next -- second tail page */
        };
        /* Second double word */
        struct {
                union {
                        pgoff_t index;  /* Our offset within mapping, in
                                         * page-size units: for a page-cache
                                         * page this is the page's offset
                                         * within the file's address_space,
                                         * for anonymous memory it is the
                                         * linear page index within the
                                         * mapping. */
                        void *freelist; /* sl[aou]b first free object */
                        /* page_deferred_list().prev -- second tail page */
                };
                union {
#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
        defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
                        /* Used for cmpxchg_double in slub */
                        unsigned long counters;
#else
                        /*
                         * Keep _refcount separate from slub cmpxchg_double
                         * data. As the rest of the double word is protected by
                         * slab_lock but _refcount is not.
                         */
                        unsigned counters;
#endif
                        struct {
                                union {
                                        /*
                                         * Count of ptes mapped in mms, to show
                                         * when page is mapped & limit reverse
                                         * map searches.
                                         * (mapping count of the page)
                                         */
                                        atomic_t _mapcount;
                                };

                                /*
                                 * Usage count, *USE WRAPPER FUNCTION*
                                 * when manual accounting. See page_ref.h
                                 */
                                atomic_t _refcount;
                        };
                };
        };
        /*
         * Third double word block
         *
         * WARNING: bit 0 of the first word encodes PageTail(). That means
         * the rest of the users of the storage space MUST NOT use the bit to
         * avoid collision and false-positive PageTail().
         */
        union {
                struct list_head lru;   /* Pageout list, eg. active_list
                                         * protected by zone->lru_lock !
                                         * Can be used as a generic list
                                         * by the page owner.
                                         */
                struct dev_pagemap *pgmap; /* ZONE_DEVICE pages are never on an
                                            * lru or handled by a slab
                                            * allocator, this points to the
                                            * hosting device page map.
                                            */
                struct {                /* slub per cpu partial pages */
                        struct page *next;      /* Next partial slab */
#ifdef CONFIG_64BIT
                        int pages;              /* Nr of partial slabs left */
                        int pobjects;           /* Approximate # of objects */
#else
                        short int pages;
                        short int pobjects;
#endif
                };
                struct rcu_head rcu_head;       /* Used by SLAB
                                                 * when destroying via RCU
                                                 */
                /* Tail pages of compound page */
                struct {
                        unsigned long compound_head; /* If bit zero is set */
                        /* First tail page only */
#ifdef CONFIG_64BIT
                        /*
                         * On 64 bit system we have enough space in struct page
                         * to encode compound_dtor and compound_order with
                         * unsigned int. It can help compiler generate better or
                         * smaller code on some architectures.
                         */
                        unsigned int compound_dtor;
                        unsigned int compound_order;
#else
                        unsigned short int compound_dtor;
                        unsigned short int compound_order;
#endif
                };
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && USE_SPLIT_PMD_PTLOCKS
                struct {
                        unsigned long __pad;    /* do not overlay pmd_huge_pte
                                                 * with compound_head to avoid
                                                 * possible bit 0 collision.
                                                 */
                        pgtable_t pmd_huge_pte; /* protected by page->ptl */
                };
#endif
        };
        /* Remainder is not double word aligned */
        union {
                unsigned long private;  /* Mapping-private opaque data:
                                         * usually used for buffer_heads
                                         * if PagePrivate set; used for
                                         * swp_entry_t if PageSwapCache;
                                         * indicates order in the buddy
                                         * system if PG_buddy is set.
                                         * A private pointer whose meaning
                                         * depends on how the page is used. */
#if USE_SPLIT_PTE_PTLOCKS
#if ALLOC_SPLIT_PTLOCKS
                spinlock_t *ptl;
#else
                spinlock_t ptl;
#endif
#endif
                struct kmem_cache *slab_cache;  /* SL[AU]B: Pointer to slab */
        };
        /*
         * On machines where all RAM is mapped into kernel address space,
         * we can simply calculate the virtual address. On machines with
         * highmem some memory is mapped into kernel virtual memory
         * dynamically, so we need a place to store that address.
         * Note that this field could be 16 bits on x86 ... ;)
         *
         * Architectures with slow multiplication can define
         * WANT_PAGE_VIRTUAL in asm/page.h
         */
#if defined(WANT_PAGE_VIRTUAL)
        void *virtual;                  /* Kernel virtual address (NULL if
                                           not kmapped, ie. highmem) */
#endif /* WANT_PAGE_VIRTUAL */
#ifdef CONFIG_KMEMCHECK
        /*
         * kmemcheck wants to track the status of each byte in a page; this
         * is a pointer to such a status block. NULL if not tracked.
         */
        void *shadow;
#endif
#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
        int _last_cpupid;
#endif
}
/*
 * The struct page can be forced to be double word aligned so that atomic ops
 * on double words work. The SLUB allocator can make use of such a feature.
 */
#ifdef CONFIG_HAVE_ALIGNED_STRUCT_PAGE
        __aligned(2 * sizeof(unsigned long))
#endif
;
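The comment on the mapping field above notes that bit 0 of page->mapping
distinguishes anonymous pages from page-cache pages. The helpers below sketch
that encoding; they mirror what PageAnon() and page_mapping() do in the kernel,
but the names and the locally defined constant are hypothetical, for
illustration only.

/* Illustrative only: if bit 0 of page->mapping is set, the page is anonymous
 * and the pointer really refers to an anon_vma, not an address_space. */
#define EXAMPLE_PAGE_MAPPING_ANON       0x1UL   /* hypothetical local constant */

static inline bool example_page_is_anon(struct page *page)
{
        return ((unsigned long)page->mapping & EXAMPLE_PAGE_MAPPING_ANON) != 0;
}

static inline struct address_space *example_page_mapping(struct page *page)
{
        unsigned long m = (unsigned long)page->mapping;

        if (m & EXAMPLE_PAGE_MAPPING_ANON)
                return NULL;    /* anonymous memory: no backing address_space */
        return (struct address_space *)m;
}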
enum migratetype {
        MIGRATE_UNMOVABLE,
        MIGRATE_MOVABLE,
        MIGRATE_RECLAIMABLE,
        MIGRATE_PCPTYPES,       /* the number of types on the pcp lists */
        MIGRATE_HIGHATOMIC = MIGRATE_PCPTYPES,
#ifdef CONFIG_CMA
        /*
         * MIGRATE_CMA migration type is designed to mimic the way
         * ZONE_MOVABLE works. Only movable pages can be allocated
         * from MIGRATE_CMA pageblocks and page allocator never
         * implicitly change migration type of MIGRATE_CMA pageblock.
         *
         * The way to use it is to change migratetype of a range of
         * pageblocks to MIGRATE_CMA which can be done by
         * __free_pageblock_cma() function.
         */
        MIGRATE_CMA,
#endif
#ifdef CONFIG_MEMORY_ISOLATION
        MIGRATE_ISOLATE,        /* can't allocate from here */
#endif
        MIGRATE_TYPES
};
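Grouping pages by mobility means each allocation is steered towards pageblocks
of a matching migratetype. The helper below is a rough sketch of that mapping;
in the kernel the type is derived from the GFP flags of the request (see
gfpflags_to_migratetype()), and the boolean parameters here are a simplification
for illustration.

/* Rough sketch, not kernel code: map an allocation's mobility to a migratetype. */
static enum migratetype mobility_to_migratetype(bool movable, bool reclaimable)
{
        if (movable)
                return MIGRATE_MOVABLE;         /* e.g. anonymous pages, page cache */
        if (reclaimable)
                return MIGRATE_RECLAIMABLE;     /* e.g. reclaimable slab allocations */
        return MIGRATE_UNMOVABLE;               /* kernel data that cannot be migrated */
}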