您的位置 首页 > 数码极客

KSM(Kernel Samepage Merging)是什么?

转自简书

导读

本文适合有基本Linux内存管理概念的新手阅读,且本文旨在从工作流程和设计思想上介绍KSM,在涉及到源代码的地方,进行了部分删减,如果想详细了解KSM,推荐阅读源代码及源代码中的注释。

作者也是初次接触Linux内核源码,所以文章中难免出现纰漏,欢迎在评论中纠正。

一、KSM概述

KSM的全称是 Kernel Samepage Merging,主要应用在虚拟化环境中,它允许内核通过合并内存页面来节省内存,从而可以增加虚拟机的并发数量。

KSM核心设计思想是基于写时复制机制COW,也就是将内容相同的页面合并成一个只读页面,从而释放出空闲物理页面。

二、核心数据结构

KSM的核心数据结构有三个:

  • struct rmap_item
  • struct mm_slot
  • struct ksm_scan

rmap_item

从虚拟地址到一个 mm 的反向映射

同时,对于 stable tree,它负责组织其结构;对于 unstable tree,保存其校验和

struct rmap_item { struct rmap_item *rmap_list; // 所有rmap_item连成一个链表,k union { struct anon_vma *anon_vma; // rmap_item在stable树中时,指向VMA #ifdef CONFIG_NUMA int nid; // when node of unstable tree #endif }; struct mm_struct *mm; // 进程的struct mm_struct数据结构 unsigned long address; // rmap_item所跟踪的用户地址空间 unsigned int oldchecksum; // 虚拟地址对应的物理页面的旧校验值 union { struct rb_node node; // rmap_item加入unstable红黑树的节点 struct { /* 在 stable tree 中时才有效 */ struct stable_node *head; // 加入stable红黑树的节点 struct hlist_node hlist; // stable链表 }; }; };

struct mm_slot

描述添加到KSM系统中将来要被扫描的进程mm_struct数据结构

/*
 * struct mm_slot - one mm_struct registered with KSM for future scanning.
 */
struct mm_slot {
	struct hlist_node link;		/* link into the mm_slots hash table */
	struct list_head mm_list;	/* link into the mm_slot list headed at ksm_mm_head */
	struct rmap_item *rmap_list;	/* head of this mm's rmap_item list */
	struct mm_struct *mm;		/* the process's mm_struct */
};

struct ksm_scan

当前的扫描状态

/*
 * struct ksm_scan - cursor describing the current state of the KSM scan.
 */
struct ksm_scan {
	struct mm_slot *mm_slot;	/* the mm_slot currently being scanned */
	unsigned long address;		/* next address to scan (page-aligned) */
	struct rmap_item **rmap_list;	/* link to the next rmap_item to scan */
	unsigned long seqnr;		/* bumped after each full pass; used to age out unstable nodes */
};

三、工作流程

上层的应用通过 madvise() 给某内存区域增加 MADV_MERGEABLE 或者 MADV_UNMERGEABLE 标记,造成对系统调用的访问,该系统调用由 SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) 定义。

SYSCALL_DEFINE3

在这里会进行一个预处理,如找到该内存区域的所有VMA,并调用 madvise_vma 进行进一步处理。

/*
 * madvise(2) entry point: validate the range, then walk every VMA that
 * overlaps [start, end) and apply the advice to each via madvise_vma().
 * "..." marks parts elided by this excerpt.
 *
 * NOTE(review): the article's "¤t" is HTML-entity damage for
 * "&current" (&curren; + t); restored below.
 */
SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
{
	unsigned long end, tmp;
	struct vm_area_struct *vma, *prev;
	int unmapped_error = 0;
	int error = -EINVAL;
	int write;	/* set in elided code — presumably madvise_need_mmap_write(behavior); TODO confirm */
	size_t len;
	struct blk_plug plug;
	...
	vma = find_vma_prev(current->mm, start, &prev);
	if (vma && start > vma->vm_start)
		prev = vma;

	blk_start_plug(&plug);
	for (;;) {
		/* walk all VMAs overlapping the requested range */
		error = -ENOMEM;
		if (!vma)
			goto out;

		/* Here start < (end|vma->vm_end). */
		if (start < vma->vm_start) {
			/* hole before this VMA: remember the error, clamp start up */
			unmapped_error = -ENOMEM;
			start = vma->vm_start;
			if (start >= end)
				goto out;
		}

		/* clip this VMA's end to the requested range */
		tmp = vma->vm_end;
		if (end < tmp)
			tmp = end;

		/* hand [start, tmp) of this VMA to madvise_vma, along with the advice */
		error = madvise_vma(vma, &prev, start, tmp, behavior);
		if (error)
			goto out;
		start = tmp;
		if (prev && start < prev->vm_end)
			start = prev->vm_end;
		error = unmapped_error;
		if (start >= end)
			goto out;	/* the normal exit */
		if (prev)
			vma = prev->vm_next;
		else
			vma = find_vma(current->mm, start);
	}
out:
	blk_finish_plug(&plug);
	if (write)
		up_write(&current->mm->mmap_sem);
	else
		up_read(&current->mm->mmap_sem);

	return error;
}

madvise_behavior

如果在某个VMA上发现 MADV_MERGEABLE 或者 MADV_UNMERGEABLE 标记,则触发 ksm_madvise

/*
 * madvise_behavior - apply one advice value to a single VMA.
 *
 * For MADV_MERGEABLE / MADV_UNMERGEABLE this hands off to ksm_madvise(),
 * which may set or clear VM_MERGEABLE in new_flags; the new flags are
 * committed to the VMA at the "success" label. "..." marks parts elided
 * by this excerpt.
 */
static long madvise_behavior(struct vm_area_struct *vma,
		struct vm_area_struct **prev,
		unsigned long start, unsigned long end, int behavior)
{
	struct mm_struct *mm = vma->vm_mm;
	int error = 0;
	pgoff_t pgoff;
	unsigned long new_flags = vma->vm_flags;

	switch (behavior) {
	...
	case MADV_MERGEABLE:
	case MADV_UNMERGEABLE:
		/* KSM advice: register/unregister this range with KSM */
		error = ksm_madvise(vma, start, end, behavior, &new_flags);
		if (error) {
			/*
			 * madvise() returns EAGAIN if kernel resources, such as
			 * slab, are temporarily unavailable.
			 */
			if (error == -ENOMEM)
				error = -EAGAIN;
			goto out;
		}
		break;
	case MADV_HUGEPAGE:
	case MADV_NOHUGEPAGE:
		error = hugepage_madvise(vma, &new_flags, behavior);
		if (error) {
			/*
			 * madvise() returns EAGAIN if kernel resources, such as
			 * slab, are temporarily unavailable.
			 */
			if (error == -ENOMEM)
				error = -EAGAIN;
			goto out;
		}
		break;
	}
	...
success:
	vma->vm_flags = new_flags;
out:
	return error;
}

ksm_madvise

在这一步会找到 vma 所属进程(mm),并判断标记决定是否对页面进行共享

如果需要共享,调用 __ksm_enter()并传递当前 vma 所属的 mm 地址。

/*
 * ksm_madvise - handle MADV_MERGEABLE / MADV_UNMERGEABLE for one VMA.
 *
 * MERGEABLE registers the owning mm with KSM (via __ksm_enter) the first
 * time and sets VM_MERGEABLE; UNMERGEABLE breaks existing KSM pages in
 * the range and clears the flag. "..." marks parts elided by this excerpt.
 */
int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
		unsigned long end, int advice, unsigned long *vm_flags)
{
	struct mm_struct *mm = vma->vm_mm;
	int err;

	switch (advice) {
	case MADV_MERGEABLE:
		...
		if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) {
			err = __ksm_enter(mm);	/* the real entry point into KSM */
			if (err)
				return err;
		}

		*vm_flags |= VM_MERGEABLE;
		break;

	case MADV_UNMERGEABLE:
		if (!(*vm_flags & VM_MERGEABLE))
			return 0;	/* just ignore the advice */

		if (vma->anon_vma) {
			/* undo any merges already made inside this range */
			err = unmerge_ksm_pages(vma, start, end);
			if (err)
				return err;
		}

		*vm_flags &= ~VM_MERGEABLE;
		break;
	}

	return 0;
}

__ksm_enter

将进程的 mm 加入到 ksm_mm_head 链表中

int __ksm_enter(struct mm_struct *mm) { struct mm_slot *mm_slot; int needs_wakeup; mm_slot = alloc_mm_slot(); // 分配一个mm_slot,表示当前进程mm_struct数据结构 if (!mm_slot) return -ENOMEM; /* Check ksm_run too? Would need tighter locking */ needs_wakeup = list_empty(&k); // 为空表示当前没有正在被扫描的mm_slot spin_lock(&ksm_mmlist_lock); insert_to_mm_slots_hash(mm, mm_slot); // 将当前进程mm赋给mm_slot->mm /* * When KSM_RUN_MERGE (or KSM_RUN_STOP), * insert just behind the scanning cursor, to let the area settle * down a little; when fork is followed by immediate exec, we don't * want ksmd to waste time setting up and tearing down an rmap_list. * * But when KSM_RUN_UNMERGE, it's important to insert ahead of its * scanning cursor, otherwise KSM pages in newly forked mms will be * missed: then we might as well insert at the end of the list. */ if (ksm_run & KSM_RUN_UNMERGE) list_add_tail(&mm_slot->mm_list, &k); else list_add_tail(&mm_slot->mm_list, &k;mm_list); // mm_slot添加到k;mm_list链表中 spin_unlock(&ksm_mmlist_lock); set_bit(MMF_VM_MERGEABLE, &mm->flags); // 表示这个进程已经添加到KSM系统中 mmgrab(mm); if (needs_wakeup) wake_up_interruptible(&ksm_thread_wait); // 则唤醒ksmd内核线程 return 0; }

ksm_init

创建内核线程ksmd

/*
 * ksm_init - module initialisation: create the slab caches, start the
 * ksmd kernel thread, and register the KSM sysfs attributes.
 */
static int __init ksm_init(void)
{
	struct task_struct *ksm_thread;
	int err;

	/* The correct value depends on page size and endianness */
	zero_checksum = calc_checksum(ZERO_PAGE(0));
	/* Default to false for backwards compatibility */
	ksm_use_zero_pages = false;

	/* create the ksm_rmap_item, ksm_stable_node and ksm_mm_slot slab caches */
	err = ksm_slab_init();
	if (err)
		goto out;

	/* spawn the ksmd kernel thread; its thread function is ksm_scan_thread */
	ksm_thread = kthread_run(ksm_scan_thread, NULL, "ksmd");
	if (IS_ERR(ksm_thread)) {
		pr_err("ksm: creating kthread failed\n");
		err = PTR_ERR(ksm_thread);
		goto out_free;
	}

#ifdef CONFIG_SYSFS
	/* create the KSM nodes under /sys/kernel/mm */
	err = sysfs_create_group(mm_kobj, &ksm_attr_group);
	if (err) {
		pr_err("ksm: register sysfs failed\n");
		kthread_stop(ksm_thread);
		goto out_free;
	}
#else
	ksm_run = KSM_RUN_MERGE;	/* no way for user to start it */
#endif /* CONFIG_SYSFS */

#ifdef CONFIG_MEMORY_HOTREMOVE
	/* There is no significance to this priority 100 */
	hotplug_memory_notifier(ksm_memory_callback, 100);
#endif
	return 0;

out_free:
	ksm_slab_free();
out:
	return err;
}

ksm_scan_thread

只需要知道这里进入ksm_do_scan()进行实际的操作就可以了

/*
 * ksm_scan_thread - main loop of the ksmd kernel thread.
 *
 * While ksmd should run, repeatedly calls ksm_do_scan() for a batch of
 * pages and then sleeps for ksm_thread_sleep_millisecs; otherwise blocks
 * until woken (by __ksm_enter or sysfs) or asked to stop.
 */
static int ksm_scan_thread(void *nothing)
{
	set_freezable();
	set_user_nice(current, 5);

	while (!kthread_should_stop()) {
		mutex_lock(&ksm_thread_mutex);
		wait_while_offlining();
		if (ksmd_should_run())
			/* the real worker; argument is pages to scan per batch */
			ksm_do_scan(ksm_thread_pages_to_scan);
		mutex_unlock(&ksm_thread_mutex);

		try_to_freeze();

		if (ksmd_should_run()) {
			schedule_timeout_interruptible(
				msecs_to_jiffies(ksm_thread_sleep_millisecs));
		} else {
			wait_event_freezable(ksm_thread_wait,
				ksmd_should_run() || kthread_should_stop());
		}
	}
	return 0;
}

ksm_do_scan

/*
 * ksm_do_scan - scan one batch of pages.
 * @scan_npages: number of pages to scan in this batch.
 */
static void ksm_do_scan(unsigned int scan_npages)
{
	struct rmap_item *rmap_item;
	struct page *uninitialized_var(page);

	/* scan scan_npages pages per invocation */
	while (scan_npages-- && likely(!freezing(current))) {
		cond_resched();
		/* pick the next suitable anonymous page and set up its reverse mapping */
		rmap_item = scan_get_next_rmap_item(&page);
		if (!rmap_item)
			return;
		/* consult the two trees to see whether this page can be merged */
		cmp_and_merge_page(page, rmap_item);
		put_page(page);
	}
}

cmp_and_merge_page

首先查看这个 page 是不是可以和 stable tree 中的 page 合并,不可以的话,检查下它的检验值和之前是否有变化,如果变化则认为是写频繁的页面,跳过它;否则,决定是将它插入到 unstable tree 中,或与其中节点合并加入到 stable tree 中。

/*
 * cmp_and_merge_page - try to merge @page with an existing KSM page.
 *
 * First searches the stable tree; if no match, rechecks the page's
 * checksum (a changed checksum means the page is written frequently and
 * is skipped this pass), then either merges it with a matching
 * unstable-tree page into a new stable node, or leaves it inserted in
 * the unstable tree for a future pass.
 */
static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
{
	struct rmap_item *tree_rmap_item;
	struct page *tree_page = NULL;
	struct stable_node *stable_node;
	struct page *kpage;
	unsigned int checksum;
	int err;

	stable_node = page_stable_node(page);
	if (stable_node) {
		/* a stable node on the wrong NUMA node is parked on migrate_nodes */
		if (stable_node->head != &migrate_nodes &&
		    get_kpfn_nid(stable_node->kpfn) != NUMA(stable_node->nid)) {
			rb_erase(&stable_node->node,
				 root_stable_tree + NUMA(stable_node->nid));
			stable_node->head = &migrate_nodes;
			list_add(&stable_node->list, stable_node->head);
		}
		if (stable_node->head != &migrate_nodes &&
		    rmap_item->head == stable_node)
			return;
	}

	kpage = stable_tree_search(page);	/* search the stable tree */
	if (kpage == page && rmap_item->head == stable_node) {
		/* the lookup found the page itself: nothing to merge */
		put_page(kpage);
		return;
	}

	/*
	 * Unlink the item from any tree before re-evaluating it: the page
	 * may have changed since the last pass, so stale tree placement
	 * must not survive into this one.
	 */
	remove_rmap_item_from_tree(rmap_item);

	if (kpage) {
		/* kpage has the same content as page but is a different page */
		err = try_to_merge_with_ksm_page(rmap_item, page, kpage);	/* try to merge them */
		if (!err) {
			lock_page(kpage);
			/* merged: hang rmap_item on stable_node->hlist */
			stable_tree_append(rmap_item, page_stable_node(kpage));
			unlock_page(kpage);
		}
		put_page(kpage);
		return;
	}

	/* -------------------- unstable tree handling below -------------------- */

	/*
	 * Recompute the checksum; if it differs from the stored one the
	 * page is written frequently and is a poor KSM candidate.
	 */
	checksum = calc_checksum(page);
	if (rmap_item->oldchecksum != checksum) {
		rmap_item->oldchecksum = checksum;
		return;
	}

	/*
	 * Same checksum as an empty page. We attempt to merge it with the
	 * appropriate zero page if the user enabled this via sysfs.
	 */
	if (ksm_use_zero_pages && (checksum == zero_checksum)) {
		struct vm_area_struct *vma;

		vma = find_mergeable_vma(rmap_item->mm, rmap_item->address);
		err = try_to_merge_one_page(vma, page,
					    ZERO_PAGE(rmap_item->address));
		/*
		 * In case of failure, the page was not really empty, so we
		 * need to continue. Otherwise we're done.
		 */
		if (!err)
			return;
	}

	/* search the unstable tree for a page with identical content */
	tree_rmap_item =
		unstable_tree_search_insert(rmap_item, page, &tree_page);
	if (tree_rmap_item) {
		/* try to merge page and tree_page into a single kpage */
		kpage = try_to_merge_two_pages(rmap_item, page,
						tree_rmap_item, tree_page);
		put_page(tree_page);
		if (kpage) {
			/*
			 * Merge succeeded: insert kpage into the stable tree
			 * and append both rmap_items to its stable node.
			 */
			lock_page(kpage);
			stable_node = stable_tree_insert(kpage);
			if (stable_node) {
				stable_tree_append(tree_rmap_item, stable_node);
				stable_tree_append(rmap_item, stable_node);
			}
			unlock_page(kpage);

			/*
			 * If inserting into the stable tree failed, call
			 * break_cow() to deliberately trigger a page fault
			 * and split this KSM page apart again.
			 */
			if (!stable_node) {
				break_cow(tree_rmap_item);
				break_cow(rmap_item);
			}
		}
	}
}

try_to_merge_one_page

将两个 page 进行合并

/*
 * try_to_merge_one_page - take two pages and merge them into one
 * @vma: the vma that holds the pte pointing to page
 * @page: the PageAnon page that we want to replace with kpage
 * @kpage: the PageKsm page that we want to map instead of page,
 *         or NULL the first time when we want to use page as kpage.
 */
static int try_to_merge_one_page(struct vm_area_struct *vma,
				 struct page *page, struct page *kpage)
{
	pte_t orig_pte = __pte(0);
	int err = -EFAULT;

	if (page == kpage)			/* ksm page forked */
		return 0;

	if (!PageAnon(page))
		goto out;

	/*
	 * We need the page lock to read a stable PageSwapCache in
	 * write_protect_page().
	 */
	if (!trylock_page(page))	/* don't wait here: skip this page */
		goto out;		/* and move on to another page */
	if (PageTransCompound(page)) {		/* true if page is a transparent huge page */
		if (split_huge_page(page))	/* split into normal pages; returns 0 on success */
			goto out_unlock;
	}

	/*
	 * If this anonymous page is mapped only here, its pte may need
	 * to be write-protected. If it's mapped elsewhere, all of its
	 * ptes are necessarily already write-protected. But in either
	 * case, we need to lock and check page_count is not raised.
	 */
	if (write_protect_page(vma, page, &orig_pte) == 0) {
		if (!kpage) {
			/*
			 * While we hold page lock, upgrade page from
			 * PageAnon+anon_vma to PageKsm+NULL stable_node:
			 * stable_tree_insert() will update stable_node.
			 */
			set_page_stable_node(page, NULL);
			mark_page_accessed(page);
			/*
			 * Page reclaim just frees a clean page with no dirty
			 * ptes: make sure that the ksm page would be swapped.
			 */
			if (!PageDirty(page))
				SetPageDirty(page);
			err = 0;
		} else if (pages_identical(page, kpage))
			/* contents match: point the pte at kpage instead of page */
			err = replace_page(vma, page, kpage, orig_pte);
	}

	if ((vma->vm_flags & VM_LOCKED) && kpage && !err) {
		munlock_vma_page(page);
		if (!PageMlocked(kpage)) {
			unlock_page(page);
			lock_page(kpage);
			mlock_vma_page(kpage);
			page = kpage;		/* for final unlock */
		}
	}

out_unlock:
	unlock_page(page);
out:
	return err;
}

附:

scan_get_next_rmap_item()

遍历ksm_mm_head链表(即进程的mm),然后再遍历进程地址空间的每个VMA,然后通过get_next_rmap_item()返回rmap_item

static struct rmap_item *scan_get_next_rmap_item(struct page **page) { struct mm_struct *mm; struct mm_slot *slot; struct vm_area_struct *vma; struct rmap_item *rmap_item; int nid; if (list_empty(&k)) // 没有mm_slot,直接退出 return NULL; slot = k; if (slot == &ksm_mm_head) { // 第一次运行ksmd,进行初始化 ... } mm = slot->mm; down_read(&mm->mmap_sem); if (ksm_test_exit(mm)) vma = NULL; else vma = find_vma(mm, k); // 根据 address 在 mm 中找到 vma for (; vma; vma = vma->vm_next) { // 遍历当前mm的所有VMA if (!(vma->vm_flags & VM_MERGEABLE)) continue; if (k < vma->vm_start) k = vma->vm_start; if (!vma->anon_vma) k = vma->vm_end; while (k < vma->vm_end) { // 遍历VMA中的所有页面 if (ksm_test_exit(mm)) break; *page = follow_page(vma, k, FOLL_GET); // 根据虚拟地址找到物理页 if (IS_ERR_OR_NULL(*page)) { k += PAGE_SIZE; cond_resched(); continue; } if (PageAnon(*page)) { // 只考虑匿名页面 flush_anon_page(vma, *page, k); flush_dcache_page(*page); rmap_item = get_next_rmap_item(slot, k, k); // 去找mm_slot->rmap_list链表上是否有该虚拟地址对应的rmap_item,没有就新建一个 if (rmap_item) { k = &rmap_item->rmap_list; k += PAGE_SIZE; } else put_page(*page); up_read(&mm->mmap_sem); return rmap_item; } put_page(*page); k += PAGE_SIZE; cond_resched(); } } if (ksm_test_exit(mm)) { // for循环里扫描该进程所有的VMA都没找到合适的匿名页面 k = 0; k = &slot->rmap_list; } /* * Nuke all the rmap_items that are above this current rmap: * because there were no VM_MERGEABLE vmas with such addresses. */ remove_trailing_rmap_items(slot, k); // 在该进程中没找到合适的匿名页面时,那么对应的rmap_item已经没用必要占用空间,直接删除 spin_lock(&ksm_mmlist_lock); k = list_entry(slot->mm_li, struct mm_slot, mm_list); // 取下一个mm_slot if (k == 0) { // 处理该进程被销毁的情况,把mm_slot从ksm_mm_head链表中删除,释放mm_slot数据结构,清MMF_VM_MERGEABLE标志位。 /* * We've completed a full scan of all vmas, holding mmap_sem * throughout, and found no VM_MERGEABLE: so do the same as * __ksm_exit does to remove this mm from all our lists now. 
* This applies either when cleaning up after __ksm_exit * (but beware: we can reach here even before __ksm_exit), * or when all VM_MERGEABLE areas have been unmapped (and * mmap_sem then protects against race with MADV_MERGEABLE). */ hash_del(&slot->link); list_del(&slot->mm_list); spin_unlock(&ksm_mmlist_lock); free_mm_slot(slot); clear_bit(MMF_VM_MERGEABLE, &mm->flags); up_read(&mm->mmap_sem); mmdrop(mm); } else { up_read(&mm->mmap_sem); /* * up_read(&mm->mmap_sem) first because after * spin_unlock(&ksm_mmlist_lock) run, the "mm" may * already have been freed under us by __ksm_exit() * because the "mm_slot" is still hashed and * k doesn't point to it anymore. */ spin_unlock(&ksm_mmlist_lock); } /* Repeat until we've completed scanning the whole list */ slot = k; if (slot != &ksm_mm_head) goto next_mm; // 继续扫描下一个mm_slot k; // 扫描完一轮mm_slot,增加计数 return NULL; }

责任编辑: 鲁达

1.内容基于多重复合算法人工智能语言模型创作,旨在以深度学习研究为目的传播信息知识,内容观点与本网站无关,反馈举报请
2.仅供读者参考,本网站未对该内容进行证实,对其原创性、真实性、完整性、及时性不作任何保证;
3.本站属于非营利性站点无毒无广告,请读者放心使用!

“KSM,ksms是谁,ksm是什么意思,ksm是哪个动漫人物,ksm化疗药物是什么”边界阅读