[packages/kernel/LINUX_3_18] - up to 3.18.37 - import uksm patch to git (source page is not accessible)
baggins
baggins at pld-linux.org
Sat Jul 16 11:08:52 CEST 2016
commit 04bb9bf19db0e810b33b9e19d511652bf4fe0b9d
Author: Jan Rękorajski <baggins at pld-linux.org>
Date: Sat Jul 16 11:08:38 2016 +0200
- up to 3.18.37
- import uksm patch to git (source page is not accessible)
kernel.spec | 10 +-
uksm-0.1.2.3-for-v3.18.patch | 6897 ++++++++++++++++++++++++++++++++++++++++++
2 files changed, 6901 insertions(+), 6 deletions(-)
---
diff --git a/kernel.spec b/kernel.spec
index 1d3f3f2..cdf0db4 100644
--- a/kernel.spec
+++ b/kernel.spec
@@ -72,7 +72,7 @@
%define rel 1
%define basever 3.18
-%define postver .36
+%define postver .37
# define this to '-%{basever}' for longterm branch
%define versuffix -%{basever}
@@ -121,7 +121,7 @@ Source0: http://www.kernel.org/pub/linux/kernel/v3.x/linux-%{basever}.tar.xz
# Source0-md5: 9e854df51ca3fef8bfe566dbd7b89241
%if "%{postver}" != ".0"
Patch0: http://www.kernel.org/pub/linux/kernel/v3.x/patch-%{version}.xz
-# Patch0-md5: e173543ed461d4e0c5b08cfd546c085f
+# Patch0-md5: 912cbca7383b99d74958ec9d72f2853d
%endif
Source1: kernel.sysconfig
@@ -212,10 +212,8 @@ Patch101: kernel-vserver-fixes.patch
Patch145: kernel-aufs3.patch
Patch146: kernel-aufs3+vserver.patch
-%define uksm_major_version 0.1.2.3
-%define uksm_version %{uksm_major_version}-for-v3.18
-Patch150: http://kerneldedup.org/download/uksm/%{uksm_major_version}/uksm-%{uksm_version}.patch
-# Patch150-md5: b6a2b2aae9c2844d0c74690632d7019e
+# http://kerneldedup.org/download/uksm/0.1.2.3/uksm-0.1.2.3-for-v3.18.patch
+Patch150: uksm-0.1.2.3-for-v3.18.patch
# Show normal colors in menuconfig with ncurses ABI 6
Patch250: kernel-fix_256colors_menuconfig.patch
diff --git a/uksm-0.1.2.3-for-v3.18.patch b/uksm-0.1.2.3-for-v3.18.patch
new file mode 100644
index 0000000..15e1d40
--- /dev/null
+++ b/uksm-0.1.2.3-for-v3.18.patch
@@ -0,0 +1,6897 @@
+diff --git a/Documentation/vm/00-INDEX b/Documentation/vm/00-INDEX
+index 081c497..75bde3d 100644
+--- a/Documentation/vm/00-INDEX
++++ b/Documentation/vm/00-INDEX
+@@ -16,6 +16,8 @@ hwpoison.txt
+ - explains what hwpoison is
+ ksm.txt
+ - how to use the Kernel Samepage Merging feature.
++uksm.txt
++ - Introduction to Ultra KSM
+ numa
+ - information about NUMA specific code in the Linux vm.
+ numa_memory_policy.txt
+diff --git a/Documentation/vm/uksm.txt b/Documentation/vm/uksm.txt
+new file mode 100644
+index 0000000..08bd645
+--- /dev/null
++++ b/Documentation/vm/uksm.txt
+@@ -0,0 +1,58 @@
++The Ultra Kernel Samepage Merging feature
++----------------------------------------------
++/*
++ * Ultra KSM. Copyright (C) 2011-2012 Nai Xia
++ *
++ * This is an improvement upon KSM. Some basic data structures and routines
++ * are borrowed from ksm.c .
++ *
++ * Its new features:
++ * 1. Full system scan:
++ * It automatically scans all user processes' anonymous VMAs. Kernel-user
++ * interaction to submit a memory area to KSM is no longer needed.
++ *
++ * 2. Rich area detection:
++ * It automatically detects rich areas containing abundant duplicated
++ * pages based. Rich areas are given a full scan speed. Poor areas are
++ * sampled at a reasonable speed with very low CPU consumption.
++ *
++ * 3. Ultra Per-page scan speed improvement:
++ * A new hash algorithm is proposed. As a result, on a machine with
++ * Core(TM)2 Quad Q9300 CPU in 32-bit mode and 800MHZ DDR2 main memory, it
++ * can scan memory areas that does not contain duplicated pages at speed of
++ * 627MB/sec ~ 2445MB/sec and can merge duplicated areas at speed of
++ * 477MB/sec ~ 923MB/sec.
++ *
++ * 4. Thrashing area avoidance:
++ * Thrashing area(an VMA that has frequent Ksm page break-out) can be
++ * filtered out. My benchmark shows it's more efficient than KSM's per-page
++ * hash value based volatile page detection.
++ *
++ *
++ * 5. Misc changes upon KSM:
++ * * It has a fully x86-opitmized memcmp dedicated for 4-byte-aligned page
++ * comparison. It's much faster than default C version on x86.
++ * * rmap_item now has an struct *page member to loosely cache a
++ * address-->page mapping, which reduces too much time-costly
++ * follow_page().
++ * * The VMA creation/exit procedures are hooked to let the Ultra KSM know.
++ * * try_to_merge_two_pages() now can revert a pte if it fails. No break_
++ * ksm is needed for this case.
++ *
++ * 6. Full Zero Page consideration(contributed by Figo Zhang)
++ * Now uksmd consider full zero pages as special pages and merge them to an
++ * special unswappable uksm zero page.
++ */
++
++ChangeLog:
++
++2012-05-05 The creation of this Doc
++2012-05-08 UKSM 0.1.1.1 libc crash bug fix, api clean up, doc clean up.
++2012-05-28 UKSM 0.1.1.2 bug fix release
++2012-06-26 UKSM 0.1.2-beta1 first beta release for 0.1.2
++2012-07-2 UKSM 0.1.2-beta2
++2012-07-10 UKSM 0.1.2-beta3
++2012-07-26 UKSM 0.1.2 Fine grained speed control, more scan optimization.
++2012-10-13 UKSM 0.1.2.1 Bug fixes.
++2012-12-31 UKSM 0.1.2.2 Minor bug fixes
++2014-07-02 UKSM 0.1.2.3 Fix a " __this_cpu_read() in preemptible bug"
+diff --git a/fs/exec.c b/fs/exec.c
+index 7302b75..84b0df5 100644
+--- a/fs/exec.c
++++ b/fs/exec.c
+@@ -19,7 +19,7 @@
+ * current->executable is only used by the procfs. This allows a dispatch
+ * table to check for several different types of binary formats. We keep
+ * trying until we recognize the file or we run out of supported binary
+- * formats.
++ * formats.
+ */
+
+ #include <linux/slab.h>
+@@ -56,6 +56,7 @@
+ #include <linux/pipe_fs_i.h>
+ #include <linux/oom.h>
+ #include <linux/compat.h>
++#include <linux/ksm.h>
+
+ #include <asm/uaccess.h>
+ #include <asm/mmu_context.h>
+@@ -1128,6 +1129,7 @@ void setup_new_exec(struct linux_binprm * bprm)
+ /* An exec changes our domain. We are no longer part of the thread
+ group */
+ current->self_exec_id++;
++
+ flush_signal_handlers(current, 0);
+ do_close_on_exec(current->files);
+ }
+diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
+index aa1eee0..5a9149c 100644
+--- a/fs/proc/meminfo.c
++++ b/fs/proc/meminfo.c
+@@ -121,6 +121,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
+ "SUnreclaim: %8lu kB\n"
+ "KernelStack: %8lu kB\n"
+ "PageTables: %8lu kB\n"
++#ifdef CONFIG_UKSM
++ "KsmZeroPages: %8lu kB\n"
++#endif
+ #ifdef CONFIG_QUICKLIST
+ "Quicklists: %8lu kB\n"
+ #endif
+@@ -175,6 +178,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
+ K(global_page_state(NR_SLAB_UNRECLAIMABLE)),
+ global_page_state(NR_KERNEL_STACK) * THREAD_SIZE / 1024,
+ K(global_page_state(NR_PAGETABLE)),
++#ifdef CONFIG_UKSM
++ K(global_page_state(NR_UKSM_ZERO_PAGES)),
++#endif
+ #ifdef CONFIG_QUICKLIST
+ K(quicklist_total_size()),
+ #endif
+diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
+index 752e30d..1e7c826 100644
+--- a/include/asm-generic/pgtable.h
++++ b/include/asm-generic/pgtable.h
+@@ -537,12 +537,25 @@ extern void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
+ unsigned long size);
+ #endif
+
++#ifdef CONFIG_UKSM
++static inline int is_uksm_zero_pfn(unsigned long pfn)
++{
++ extern unsigned long uksm_zero_pfn;
++ return pfn == uksm_zero_pfn;
++}
++#else
++static inline int is_uksm_zero_pfn(unsigned long pfn)
++{
++ return 0;
++}
++#endif
++
+ #ifdef __HAVE_COLOR_ZERO_PAGE
+ static inline int is_zero_pfn(unsigned long pfn)
+ {
+ extern unsigned long zero_pfn;
+ unsigned long offset_from_zero_pfn = pfn - zero_pfn;
+- return offset_from_zero_pfn <= (zero_page_mask >> PAGE_SHIFT);
++ return offset_from_zero_pfn <= (zero_page_mask >> PAGE_SHIFT) || is_uksm_zero_pfn(pfn);
+ }
+
+ #define my_zero_pfn(addr) page_to_pfn(ZERO_PAGE(addr))
+@@ -551,7 +564,7 @@ static inline int is_zero_pfn(unsigned long pfn)
+ static inline int is_zero_pfn(unsigned long pfn)
+ {
+ extern unsigned long zero_pfn;
+- return pfn == zero_pfn;
++ return (pfn == zero_pfn) || (is_uksm_zero_pfn(pfn));
+ }
+
+ static inline unsigned long my_zero_pfn(unsigned long addr)
+diff --git a/include/linux/ksm.h b/include/linux/ksm.h
+index 3be6bb1..51557d1 100644
+--- a/include/linux/ksm.h
++++ b/include/linux/ksm.h
+@@ -19,21 +19,6 @@ struct mem_cgroup;
+ #ifdef CONFIG_KSM
+ int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
+ unsigned long end, int advice, unsigned long *vm_flags);
+-int __ksm_enter(struct mm_struct *mm);
+-void __ksm_exit(struct mm_struct *mm);
+-
+-static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
+-{
+- if (test_bit(MMF_VM_MERGEABLE, &oldmm->flags))
+- return __ksm_enter(mm);
+- return 0;
+-}
+-
+-static inline void ksm_exit(struct mm_struct *mm)
+-{
+- if (test_bit(MMF_VM_MERGEABLE, &mm->flags))
+- __ksm_exit(mm);
+-}
+
+ /*
+ * A KSM page is one of those write-protected "shared pages" or "merged pages"
+@@ -76,6 +61,33 @@ struct page *ksm_might_need_to_copy(struct page *page,
+ int rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc);
+ void ksm_migrate_page(struct page *newpage, struct page *oldpage);
+
++#ifdef CONFIG_KSM_LEGACY
++int __ksm_enter(struct mm_struct *mm);
++void __ksm_exit(struct mm_struct *mm);
++static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
++{
++ if (test_bit(MMF_VM_MERGEABLE, &oldmm->flags))
++ return __ksm_enter(mm);
++ return 0;
++}
++
++static inline void ksm_exit(struct mm_struct *mm)
++{
++ if (test_bit(MMF_VM_MERGEABLE, &mm->flags))
++ __ksm_exit(mm);
++}
++
++#elif defined(CONFIG_UKSM)
++static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
++{
++ return 0;
++}
++
++static inline void ksm_exit(struct mm_struct *mm)
++{
++}
++#endif /* !CONFIG_UKSM */
++
+ #else /* !CONFIG_KSM */
+
+ static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
+@@ -123,4 +135,6 @@ static inline void ksm_migrate_page(struct page *newpage, struct page *oldpage)
+ #endif /* CONFIG_MMU */
+ #endif /* !CONFIG_KSM */
+
++#include <linux/uksm.h>
++
+ #endif /* __LINUX_KSM_H */
+diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
+index 6e0b286..ebd6243 100644
+--- a/include/linux/mm_types.h
++++ b/include/linux/mm_types.h
+@@ -308,6 +308,9 @@ struct vm_area_struct {
+ #ifdef CONFIG_NUMA
+ struct mempolicy *vm_policy; /* NUMA policy for the VMA */
+ #endif
++#ifdef CONFIG_UKSM
++ struct vma_slot *uksm_vma_slot;
++#endif
+ };
+
+ struct core_thread {
+diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
+index ffe66e3..355f0e9 100644
+--- a/include/linux/mmzone.h
++++ b/include/linux/mmzone.h
+@@ -157,6 +157,9 @@ enum zone_stat_item {
+ WORKINGSET_NODERECLAIM,
+ NR_ANON_TRANSPARENT_HUGEPAGES,
+ NR_FREE_CMA_PAGES,
++#ifdef CONFIG_UKSM
++ NR_UKSM_ZERO_PAGES,
++#endif
+ NR_VM_ZONE_STAT_ITEMS };
+
+ /*
+@@ -865,7 +868,7 @@ static inline int is_highmem_idx(enum zone_type idx)
+ }
+
+ /**
+- * is_highmem - helper function to quickly check if a struct zone is a
++ * is_highmem - helper function to quickly check if a struct zone is a
+ * highmem zone or not. This is an attempt to keep references
+ * to ZONE_{DMA/NORMAL/HIGHMEM/etc} in general code to a minimum.
+ * @zone - pointer to struct zone variable
+diff --git a/include/linux/sradix-tree.h b/include/linux/sradix-tree.h
+new file mode 100644
+index 0000000..6780fdb
+--- /dev/null
++++ b/include/linux/sradix-tree.h
+@@ -0,0 +1,77 @@
++#ifndef _LINUX_SRADIX_TREE_H
++#define _LINUX_SRADIX_TREE_H
++
++
++#define INIT_SRADIX_TREE(root, mask) \
++do { \
++ (root)->height = 0; \
++ (root)->gfp_mask = (mask); \
++ (root)->rnode = NULL; \
++} while (0)
++
++#define ULONG_BITS (sizeof(unsigned long) * 8)
++#define SRADIX_TREE_INDEX_BITS (8 /* CHAR_BIT */ * sizeof(unsigned long))
++//#define SRADIX_TREE_MAP_SHIFT 6
++//#define SRADIX_TREE_MAP_SIZE (1UL << SRADIX_TREE_MAP_SHIFT)
++//#define SRADIX_TREE_MAP_MASK (SRADIX_TREE_MAP_SIZE-1)
++
++struct sradix_tree_node {
++ unsigned int height; /* Height from the bottom */
++ unsigned int count;
++ unsigned int fulls; /* Number of full sublevel trees */
++ struct sradix_tree_node *parent;
++ void *stores[0];
++};
++
++/* A simple radix tree implementation */
++struct sradix_tree_root {
++ unsigned int height;
++ struct sradix_tree_node *rnode;
++
++ /* Where found to have available empty stores in its sublevels */
++ struct sradix_tree_node *enter_node;
++ unsigned int shift;
++ unsigned int stores_size;
++ unsigned int mask;
++ unsigned long min; /* The first hole index */
++ unsigned long num;
++ //unsigned long *height_to_maxindex;
++
++ /* How the node is allocated and freed. */
++ struct sradix_tree_node *(*alloc)(void);
++ void (*free)(struct sradix_tree_node *node);
++
++ /* When a new node is added and removed */
++ void (*extend)(struct sradix_tree_node *parent, struct sradix_tree_node *child);
++ void (*assign)(struct sradix_tree_node *node, unsigned index, void *item);
++ void (*rm)(struct sradix_tree_node *node, unsigned offset);
++};
++
++struct sradix_tree_path {
++ struct sradix_tree_node *node;
++ int offset;
++};
++
++static inline
++void init_sradix_tree_root(struct sradix_tree_root *root, unsigned long shift)
++{
++ root->height = 0;
++ root->rnode = NULL;
++ root->shift = shift;
++ root->stores_size = 1UL << shift;
++ root->mask = root->stores_size - 1;
++}
++
++
++extern void *sradix_tree_next(struct sradix_tree_root *root,
++ struct sradix_tree_node *node, unsigned long index,
++ int (*iter)(void *, unsigned long));
++
++extern int sradix_tree_enter(struct sradix_tree_root *root, void **item, int num);
++
++extern void sradix_tree_delete_from_leaf(struct sradix_tree_root *root,
++ struct sradix_tree_node *node, unsigned long index);
++
++extern void *sradix_tree_lookup(struct sradix_tree_root *root, unsigned long index);
++
++#endif /* _LINUX_SRADIX_TREE_H */
+diff --git a/include/linux/uksm.h b/include/linux/uksm.h
+new file mode 100644
+index 0000000..a644bca
+--- /dev/null
++++ b/include/linux/uksm.h
+@@ -0,0 +1,146 @@
++#ifndef __LINUX_UKSM_H
++#define __LINUX_UKSM_H
++/*
++ * Memory merging support.
++ *
++ * This code enables dynamic sharing of identical pages found in different
++ * memory areas, even if they are not shared by fork().
++ */
++
++/* if !CONFIG_UKSM this file should not be compiled at all. */
++#ifdef CONFIG_UKSM
++
++#include <linux/bitops.h>
++#include <linux/mm.h>
++#include <linux/pagemap.h>
++#include <linux/rmap.h>
++#include <linux/sched.h>
++
++extern unsigned long zero_pfn __read_mostly;
++extern unsigned long uksm_zero_pfn __read_mostly;
++extern struct page *empty_uksm_zero_page;
++
++/* must be done before linked to mm */
++extern void uksm_vma_add_new(struct vm_area_struct *vma);
++extern void uksm_remove_vma(struct vm_area_struct *vma);
++
++#define UKSM_SLOT_NEED_SORT (1 << 0)
++#define UKSM_SLOT_NEED_RERAND (1 << 1)
++#define UKSM_SLOT_SCANNED (1 << 2) /* It's scanned in this round */
++#define UKSM_SLOT_FUL_SCANNED (1 << 3)
++#define UKSM_SLOT_IN_UKSM (1 << 4)
++
++struct vma_slot {
++ struct sradix_tree_node *snode;
++ unsigned long sindex;
++
++ struct list_head slot_list;
++ unsigned long fully_scanned_round;
++ unsigned long dedup_num;
++ unsigned long pages_scanned;
++ unsigned long last_scanned;
++ unsigned long pages_to_scan;
++ struct scan_rung *rung;
++ struct page **rmap_list_pool;
++ unsigned int *pool_counts;
++ unsigned long pool_size;
++ struct vm_area_struct *vma;
++ struct mm_struct *mm;
++ unsigned long ctime_j;
++ unsigned long pages;
++ unsigned long flags;
++ unsigned long pages_cowed; /* pages cowed this round */
++ unsigned long pages_merged; /* pages merged this round */
++ unsigned long pages_bemerged;
++
++ /* when it has page merged in this eval round */
++ struct list_head dedup_list;
++};
++
++static inline void uksm_unmap_zero_page(pte_t pte)
++{
++ if (pte_pfn(pte) == uksm_zero_pfn)
++ __dec_zone_page_state(empty_uksm_zero_page, NR_UKSM_ZERO_PAGES);
++}
++
++static inline void uksm_map_zero_page(pte_t pte)
++{
++ if (pte_pfn(pte) == uksm_zero_pfn)
++ __inc_zone_page_state(empty_uksm_zero_page, NR_UKSM_ZERO_PAGES);
++}
++
++static inline void uksm_cow_page(struct vm_area_struct *vma, struct page *page)
++{
++ if (vma->uksm_vma_slot && PageKsm(page))
++ vma->uksm_vma_slot->pages_cowed++;
++}
++
++static inline void uksm_cow_pte(struct vm_area_struct *vma, pte_t pte)
++{
++ if (vma->uksm_vma_slot && pte_pfn(pte) == uksm_zero_pfn)
++ vma->uksm_vma_slot->pages_cowed++;
++}
++
++static inline int uksm_flags_can_scan(unsigned long vm_flags)
++{
++#ifndef VM_SAO
++#define VM_SAO 0
++#endif
++ return !(vm_flags & (VM_PFNMAP | VM_IO | VM_DONTEXPAND |
++ VM_HUGETLB | VM_NONLINEAR | VM_MIXEDMAP |
++ VM_SHARED | VM_MAYSHARE | VM_GROWSUP | VM_GROWSDOWN | VM_SAO));
++}
++
++static inline void uksm_vm_flags_mod(unsigned long *vm_flags_p)
++{
++ if (uksm_flags_can_scan(*vm_flags_p))
++ *vm_flags_p |= VM_MERGEABLE;
++}
++
++/*
++ * Just a wrapper for BUG_ON for where ksm_zeropage must not be. TODO: it will
++ * be removed when uksm zero page patch is stable enough.
++ */
++static inline void uksm_bugon_zeropage(pte_t pte)
++{
++ BUG_ON(pte_pfn(pte) == uksm_zero_pfn);
++}
++#else
++static inline void uksm_vma_add_new(struct vm_area_struct *vma)
++{
++}
++
++static inline void uksm_remove_vma(struct vm_area_struct *vma)
++{
++}
++
++static inline void uksm_unmap_zero_page(pte_t pte)
++{
++}
++
++static inline void uksm_map_zero_page(pte_t pte)
++{
++}
++
++static inline void uksm_cow_page(struct vm_area_struct *vma, struct page *page)
++{
++}
++
++static inline void uksm_cow_pte(struct vm_area_struct *vma, pte_t pte)
++{
++}
++
++static inline int uksm_flags_can_scan(unsigned long vm_flags)
++{
++ return 0;
++}
++
++static inline void uksm_vm_flags_mod(unsigned long *vm_flags_p)
++{
++}
++
++static inline void uksm_bugon_zeropage(pte_t pte)
++{
++}
++#endif /* !CONFIG_UKSM */
++#endif /* __LINUX_UKSM_H */
+diff --git a/kernel/fork.c b/kernel/fork.c
+index 9b7d746..73ad90d 100644
+--- a/kernel/fork.c
++++ b/kernel/fork.c
+@@ -412,7 +412,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
+ goto fail_nomem;
+ charge = len;
+ }
+- tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
++ tmp = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
+ if (!tmp)
+ goto fail_nomem;
+ *tmp = *mpnt;
+@@ -467,7 +467,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
+ __vma_link_rb(mm, tmp, rb_link, rb_parent);
+ rb_link = &tmp->vm_rb.rb_right;
+ rb_parent = &tmp->vm_rb;
+-
++ uksm_vma_add_new(tmp);
+ mm->map_count++;
+ retval = copy_page_range(mm, oldmm, mpnt);
+
+diff --git a/lib/Makefile b/lib/Makefile
+index 0211d2b..426536f 100644
+--- a/lib/Makefile
++++ b/lib/Makefile
+@@ -8,7 +8,7 @@ KBUILD_CFLAGS = $(subst -pg,,$(ORIG_CFLAGS))
+ endif
+
+ lib-y := ctype.o string.o vsprintf.o cmdline.o \
+- rbtree.o radix-tree.o dump_stack.o timerqueue.o\
++ rbtree.o radix-tree.o sradix-tree.o dump_stack.o timerqueue.o\
+ idr.o int_sqrt.o extable.o \
+ sha1.o md5.o irq_regs.o argv_split.o \
+ proportions.o flex_proportions.o ratelimit.o show_mem.o \
+diff --git a/lib/sradix-tree.c b/lib/sradix-tree.c
+new file mode 100644
+index 0000000..8d06329
+--- /dev/null
++++ b/lib/sradix-tree.c
+@@ -0,0 +1,476 @@
++#include <linux/errno.h>
++#include <linux/mm.h>
++#include <linux/mman.h>
++#include <linux/spinlock.h>
++#include <linux/slab.h>
++#include <linux/gcd.h>
++#include <linux/sradix-tree.h>
++
++static inline int sradix_node_full(struct sradix_tree_root *root, struct sradix_tree_node *node)
++{
++ return node->fulls == root->stores_size ||
++ (node->height == 1 && node->count == root->stores_size);
++}
++
++/*
++ * Extend a sradix tree so it can store key @index.
++ */
++static int sradix_tree_extend(struct sradix_tree_root *root, unsigned long index)
++{
++ struct sradix_tree_node *node;
++ unsigned int height;
++
++ if (unlikely(root->rnode == NULL)) {
++ if (!(node = root->alloc()))
++ return -ENOMEM;
++
++ node->height = 1;
++ root->rnode = node;
++ root->height = 1;
++ }
++
++ /* Figure out what the height should be. */
++ height = root->height;
++ index >>= root->shift * height;
++
++ while (index) {
++ index >>= root->shift;
++ height++;
++ }
++
++ while (height > root->height) {
++ unsigned int newheight;
++ if (!(node = root->alloc()))
++ return -ENOMEM;
++
++ /* Increase the height. */
++ node->stores[0] = root->rnode;
++ root->rnode->parent = node;
++ if (root->extend)
++ root->extend(node, root->rnode);
++
++ newheight = root->height + 1;
++ node->height = newheight;
++ node->count = 1;
++ if (sradix_node_full(root, root->rnode))
++ node->fulls = 1;
++
++ root->rnode = node;
++ root->height = newheight;
++ }
++
++ return 0;
++}
++
++/*
++ * Search the next item from the current node, that is not NULL
++ * and can satify root->iter().
++ */
++void *sradix_tree_next(struct sradix_tree_root *root,
++ struct sradix_tree_node *node, unsigned long index,
++ int (*iter)(void *item, unsigned long height))
++{
++ unsigned long offset;
++ void *item;
++
++ if (unlikely(node == NULL)) {
++ node = root->rnode;
++ for (offset = 0; offset < root->stores_size; offset++) {
++ item = node->stores[offset];
++ if (item && (!iter || iter(item, node->height)))
++ break;
++ }
++
++ if (unlikely(offset >= root->stores_size))
++ return NULL;
++
++ if (node->height == 1)
++ return item;
++ else
++ goto go_down;
++ }
++
++ while (node) {
++ offset = (index & root->mask) + 1;
++ for (;offset < root->stores_size; offset++) {
++ item = node->stores[offset];
++ if (item && (!iter || iter(item, node->height)))
++ break;
++ }
++
++ if (offset < root->stores_size)
++ break;
++
++ node = node->parent;
++ index >>= root->shift;
++ }
++
++ if (!node)
++ return NULL;
++
++ while (node->height > 1) {
++go_down:
++ node = item;
++ for (offset = 0; offset < root->stores_size; offset++) {
++ item = node->stores[offset];
++ if (item && (!iter || iter(item, node->height)))
++ break;
++ }
++
++ if (unlikely(offset >= root->stores_size))
++ return NULL;
++ }
++
++ BUG_ON(offset > root->stores_size);
++
++ return item;
++}
++
++/*
++ * Blindly insert the item to the tree. Typically, we reuse the
++ * first empty store item.
++ */
++int sradix_tree_enter(struct sradix_tree_root *root, void **item, int num)
++{
++ unsigned long index;
++ unsigned int height;
++ struct sradix_tree_node *node, *tmp = NULL;
++ int offset, offset_saved;
++ void **store = NULL;
++ int error, i, j, shift;
++
++go_on:
++ index = root->min;
++
++ if (root->enter_node && !sradix_node_full(root, root->enter_node)) {
++ node = root->enter_node;
++ BUG_ON((index >> (root->shift * root->height)));
++ } else {
++ node = root->rnode;
++ if (node == NULL || (index >> (root->shift * root->height))
++ || sradix_node_full(root, node)) {
++ error = sradix_tree_extend(root, index);
++ if (error)
++ return error;
++
++ node = root->rnode;
++ }
++ }
++
++
++ height = node->height;
++ shift = (height - 1) * root->shift;
++ offset = (index >> shift) & root->mask;
++ while (shift > 0) {
++ offset_saved = offset;
++ for (; offset < root->stores_size; offset++) {
++ store = &node->stores[offset];
++ tmp = *store;
++
++ if (!tmp || !sradix_node_full(root, tmp))
++ break;
++ }
++ BUG_ON(offset >= root->stores_size);
++
++ if (offset != offset_saved) {
++ index += (offset - offset_saved) << shift;
++ index &= ~((1UL << shift) - 1);
++ }
++
++ if (!tmp) {
++ if (!(tmp = root->alloc()))
++ return -ENOMEM;
++
++ tmp->height = shift / root->shift;
++ *store = tmp;
++ tmp->parent = node;
++ node->count++;
++// if (root->extend)
++// root->extend(node, tmp);
++ }
++
++ node = tmp;
++ shift -= root->shift;
++ offset = (index >> shift) & root->mask;
++ }
++
++ BUG_ON(node->height != 1);
++
++
++ store = &node->stores[offset];
++ for (i = 0, j = 0;
++ j < root->stores_size - node->count &&
++ i < root->stores_size - offset && j < num; i++) {
++ if (!store[i]) {
++ store[i] = item[j];
++ if (root->assign)
++ root->assign(node, index + i, item[j]);
++ j++;
++ }
++ }
++
++ node->count += j;
++ root->num += j;
++ num -= j;
++
++ while (sradix_node_full(root, node)) {
++ node = node->parent;
++ if (!node)
++ break;
++
++ node->fulls++;
++ }
++
++ if (unlikely(!node)) {
++ /* All nodes are full */
++ root->min = 1 << (root->height * root->shift);
++ root->enter_node = NULL;
++ } else {
++ root->min = index + i - 1;
++ root->min |= (1UL << (node->height - 1)) - 1;
++ root->min++;
++ root->enter_node = node;
++ }
++
++ if (num) {
++ item += j;
++ goto go_on;
++ }
++
++ return 0;
++}
++
++
++/**
++ * sradix_tree_shrink - shrink height of a sradix tree to minimal
++ * @root sradix tree root
++ *
++ */
++static inline void sradix_tree_shrink(struct sradix_tree_root *root)
++{
++ /* try to shrink tree height */
++ while (root->height > 1) {
++ struct sradix_tree_node *to_free = root->rnode;
++
++ /*
++ * The candidate node has more than one child, or its child
++ * is not at the leftmost store, we cannot shrink.
++ */
++ if (to_free->count != 1 || !to_free->stores[0])
++ break;
++
++ root->rnode = to_free->stores[0];
++ root->rnode->parent = NULL;
++ root->height--;
++ if (unlikely(root->enter_node == to_free)) {
++ root->enter_node = NULL;
++ }
++ root->free(to_free);
++ }
++}
++
++/*
++ * Del the item on the known leaf node and index
++ */
++void sradix_tree_delete_from_leaf(struct sradix_tree_root *root,
++ struct sradix_tree_node *node, unsigned long index)
++{
++ unsigned int offset;
++ struct sradix_tree_node *start, *end;
++
++ BUG_ON(node->height != 1);
++
++ start = node;
++ while (node && !(--node->count))
++ node = node->parent;
++
++ end = node;
++ if (!node) {
++ root->rnode = NULL;
++ root->height = 0;
++ root->min = 0;
++ root->num = 0;
++ root->enter_node = NULL;
++ } else {
++ offset = (index >> (root->shift * (node->height - 1))) & root->mask;
++ if (root->rm)
++ root->rm(node, offset);
++ node->stores[offset] = NULL;
++ root->num--;
++ if (root->min > index) {
++ root->min = index;
++ root->enter_node = node;
++ }
++ }
++
++ if (start != end) {
++ do {
++ node = start;
++ start = start->parent;
++ if (unlikely(root->enter_node == node))
++ root->enter_node = end;
++ root->free(node);
++ } while (start != end);
++
++ /*
++ * Note that shrink may free "end", so enter_node still need to
++ * be checked inside.
++ */
++ sradix_tree_shrink(root);
++ } else if (node->count == root->stores_size - 1) {
++ /* It WAS a full leaf node. Update the ancestors */
++ node = node->parent;
++ while (node) {
++ node->fulls--;
++ if (node->fulls != root->stores_size - 1)
++ break;
++
++ node = node->parent;
++ }
++ }
++}
++
++void *sradix_tree_lookup(struct sradix_tree_root *root, unsigned long index)
++{
++ unsigned int height, offset;
++ struct sradix_tree_node *node;
++ int shift;
++
++ node = root->rnode;
++ if (node == NULL || (index >> (root->shift * root->height)))
++ return NULL;
++
++ height = root->height;
++ shift = (height - 1) * root->shift;
++
++ do {
++ offset = (index >> shift) & root->mask;
++ node = node->stores[offset];
++ if (!node)
++ return NULL;
++
++ shift -= root->shift;
++ } while (shift >= 0);
++
++ return node;
++}
++
++/*
++ * Return the item if it exists, otherwise create it in place
++ * and return the created item.
++ */
++void *sradix_tree_lookup_create(struct sradix_tree_root *root,
++ unsigned long index, void *(*item_alloc)(void))
++{
++ unsigned int height, offset;
++ struct sradix_tree_node *node, *tmp;
++ void *item;
++ int shift, error;
++
++ if (root->rnode == NULL || (index >> (root->shift * root->height))) {
++ if (item_alloc) {
++ error = sradix_tree_extend(root, index);
++ if (error)
++ return NULL;
++ } else {
++ return NULL;
++ }
++ }
++
++ node = root->rnode;
++ height = root->height;
++ shift = (height - 1) * root->shift;
++
++ do {
++ offset = (index >> shift) & root->mask;
++ if (!node->stores[offset]) {
++ if (!(tmp = root->alloc()))
++ return NULL;
++
++ tmp->height = shift / root->shift;
++ node->stores[offset] = tmp;
++ tmp->parent = node;
++ node->count++;
++ node = tmp;
++ } else {
++ node = node->stores[offset];
++ }
++
++ shift -= root->shift;
++ } while (shift > 0);
++
++ BUG_ON(node->height != 1);
++ offset = index & root->mask;
++ if (node->stores[offset]) {
++ return node->stores[offset];
++ } else if (item_alloc) {
++ if (!(item = item_alloc()))
++ return NULL;
++
++ node->stores[offset] = item;
++
++ /*
++ * NOTE: we do NOT call root->assign here, since this item is
++ * newly created by us having no meaning. Caller can call this
++ * if it's necessary to do so.
++ */
++
++ node->count++;
++ root->num++;
++
++ while (sradix_node_full(root, node)) {
++ node = node->parent;
++ if (!node)
++ break;
++
++ node->fulls++;
++ }
++
++ if (unlikely(!node)) {
++ /* All nodes are full */
++ root->min = 1 << (root->height * root->shift);
++ } else {
++ if (root->min == index) {
++ root->min |= (1UL << (node->height - 1)) - 1;
++ root->min++;
++ root->enter_node = node;
++ }
++ }
++
++ return item;
++ } else {
++ return NULL;
++ }
++
++}
++
++int sradix_tree_delete(struct sradix_tree_root *root, unsigned long index)
++{
++ unsigned int height, offset;
++ struct sradix_tree_node *node;
++ int shift;
++
++ node = root->rnode;
++ if (node == NULL || (index >> (root->shift * root->height)))
++ return -ENOENT;
++
++ height = root->height;
++ shift = (height - 1) * root->shift;
++
++ do {
++ offset = (index >> shift) & root->mask;
++ node = node->stores[offset];
++ if (!node)
++ return -ENOENT;
++
++ shift -= root->shift;
++ } while (shift > 0);
++
++ offset = index & root->mask;
++ if (!node->stores[offset])
++ return -ENOENT;
++
++ sradix_tree_delete_from_leaf(root, node, index);
++
++ return 0;
++}
+diff --git a/mm/Kconfig b/mm/Kconfig
+index 1d1ae6b..511dde6 100644
+--- a/mm/Kconfig
++++ b/mm/Kconfig
+@@ -339,6 +339,32 @@ config KSM
+ See Documentation/vm/ksm.txt for more information: KSM is inactive
+ until a program has madvised that an area is MADV_MERGEABLE, and
+ root has set /sys/kernel/mm/ksm/run to 1 (if CONFIG_SYSFS is set).
++choice
++ prompt "Choose UKSM/KSM strategy"
++ default UKSM
++ depends on KSM
++ help
++ This option allows to select a UKSM/KSM stragety.
++
++config UKSM
++ bool "Ultra-KSM for page merging"
++ depends on KSM
++ help
++ UKSM is inspired by the Linux kernel project \u2014 KSM(Kernel Same
++ page Merging), but with a fundamentally rewritten core algorithm. With
++ an advanced algorithm, UKSM now can transparently scans all anonymously
++ mapped user space applications with an significantly improved scan speed
++ and CPU efficiency. Since KVM is friendly to KSM, KVM can also benefit from
++ UKSM. Now UKSM has its first stable release and first real world enterprise user.
++ For more information, please goto its project page.
++ (www.kerneldedup.org)
++
++config KSM_LEGACY
++ bool "Legacy KSM implementation"
++ depends on KSM
++ help
++ The legacy KSM implementation from Redhat.
++endchoice
+
+ config DEFAULT_MMAP_MIN_ADDR
+ int "Low address space to protect from user allocation"
+diff --git a/mm/Makefile b/mm/Makefile
+index 8405eb0..7689f0c 100644
+--- a/mm/Makefile
++++ b/mm/Makefile
+@@ -44,7 +44,8 @@ obj-$(CONFIG_SPARSEMEM) += sparse.o
+ obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o
+ obj-$(CONFIG_SLOB) += slob.o
+ obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
+-obj-$(CONFIG_KSM) += ksm.o
++obj-$(CONFIG_KSM_LEGACY) += ksm.o
++obj-$(CONFIG_UKSM) += uksm.o
+ obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o
+ obj-$(CONFIG_SLAB) += slab.o
+ obj-$(CONFIG_SLUB) += slub.o
+diff --git a/mm/memory.c b/mm/memory.c
+index d5f2ae9..86b5d09 100644
+--- a/mm/memory.c
++++ b/mm/memory.c
+@@ -120,6 +120,28 @@ unsigned long highest_memmap_pfn __read_mostly;
+
+ EXPORT_SYMBOL(zero_pfn);
+
++#ifdef CONFIG_UKSM
++unsigned long uksm_zero_pfn __read_mostly;
++EXPORT_SYMBOL_GPL(uksm_zero_pfn);
++struct page *empty_uksm_zero_page;
++
++static int __init setup_uksm_zero_page(void)
++{
++ unsigned long addr;
++ addr = __get_free_pages(GFP_KERNEL | __GFP_ZERO, 0);
++ if (!addr)
++ panic("Oh boy, that early out of memory?");
++
++ empty_uksm_zero_page = virt_to_page((void *) addr);
++ SetPageReserved(empty_uksm_zero_page);
++
++ uksm_zero_pfn = page_to_pfn(empty_uksm_zero_page);
++
++ return 0;
++}
++core_initcall(setup_uksm_zero_page);
++#endif
++
+ /*
+ * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init()
+ */
+@@ -131,6 +153,7 @@ static int __init init_zero_pfn(void)
+ core_initcall(init_zero_pfn);
+
+
++
+ #if defined(SPLIT_RSS_COUNTING)
+
+ void sync_mm_rss(struct mm_struct *mm)
+@@ -878,6 +901,11 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
+ rss[MM_ANONPAGES]++;
+ else
+ rss[MM_FILEPAGES]++;
++
++ /* Should return NULL in vm_normal_page() */
++ uksm_bugon_zeropage(pte);
++ } else {
++ uksm_map_zero_page(pte);
+ }
+
+ out_set_pte:
+@@ -1120,8 +1148,10 @@ again:
+ ptent = ptep_get_and_clear_full(mm, addr, pte,
+ tlb->fullmm);
+ tlb_remove_tlb_entry(tlb, pte, addr);
+- if (unlikely(!page))
++ if (unlikely(!page)) {
++ uksm_unmap_zero_page(ptent);
+ continue;
++ }
+ if (unlikely(details) && details->nonlinear_vma
+ && linear_page_index(details->nonlinear_vma,
+ addr) != page->index) {
+@@ -1983,8 +2013,10 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo
+ clear_page(kaddr);
+ kunmap_atomic(kaddr);
+ flush_dcache_page(dst);
+- } else
++ } else {
+ copy_user_highpage(dst, src, va, vma);
++ uksm_cow_page(vma, src);
++ }
+ }
+
+ /*
+@@ -2198,6 +2230,7 @@ gotten:
+ new_page = alloc_zeroed_user_highpage_movable(vma, address);
+ if (!new_page)
+ goto oom;
++ uksm_cow_pte(vma, orig_pte);
+ } else {
+ new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
+ if (!new_page)
+@@ -2223,8 +2256,11 @@ gotten:
+ dec_mm_counter_fast(mm, MM_FILEPAGES);
+ inc_mm_counter_fast(mm, MM_ANONPAGES);
+ }
+- } else
++ uksm_bugon_zeropage(orig_pte);
++ } else {
++ uksm_unmap_zero_page(orig_pte);
+ inc_mm_counter_fast(mm, MM_ANONPAGES);
++ }
+ flush_cache_page(vma, address, pte_pfn(orig_pte));
+ entry = mk_pte(new_page, vma->vm_page_prot);
+ entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+diff --git a/mm/mmap.c b/mm/mmap.c
+index ae91989..844f366 100644
+--- a/mm/mmap.c
++++ b/mm/mmap.c
+@@ -41,6 +41,7 @@
+ #include <linux/notifier.h>
+ #include <linux/memory.h>
+ #include <linux/printk.h>
++#include <linux/ksm.h>
+
+ #include <asm/uaccess.h>
+ #include <asm/cacheflush.h>
+@@ -279,6 +280,7 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
+ if (vma->vm_file)
+ fput(vma->vm_file);
+ mpol_put(vma_policy(vma));
++ uksm_remove_vma(vma);
+ kmem_cache_free(vm_area_cachep, vma);
+ return next;
+ }
+@@ -739,9 +741,16 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start,
+ long adjust_next = 0;
+ int remove_next = 0;
+
++/*
++ * to avoid deadlock, ksm_remove_vma must be done before any spin_lock is
++ * acquired
++ */
++ uksm_remove_vma(vma);
++
+ if (next && !insert) {
+ struct vm_area_struct *exporter = NULL;
+
++ uksm_remove_vma(next);
+ if (end >= next->vm_end) {
+ /*
+ * vma expands, overlapping all the next, and
+@@ -838,6 +847,7 @@ again: remove_next = 1 + (end > next->vm_end);
+ end_changed = true;
+ }
+ vma->vm_pgoff = pgoff;
++
+ if (adjust_next) {
+ next->vm_start += adjust_next << PAGE_SHIFT;
+ next->vm_pgoff += adjust_next;
+@@ -908,16 +918,22 @@ again: remove_next = 1 + (end > next->vm_end);
+ * up the code too much to do both in one go.
+ */
+ next = vma->vm_next;
+- if (remove_next == 2)
++ if (remove_next == 2) {
++ uksm_remove_vma(next);
+ goto again;
+- else if (next)
++ } else if (next) {
+ vma_gap_update(next);
+- else
++ } else {
+ mm->highest_vm_end = end;
++ }
++ } else {
++ if (next && !insert)
++ uksm_vma_add_new(next);
+ }
+ if (insert && file)
+ uprobe_mmap(insert);
+
++ uksm_vma_add_new(vma);
+ validate_mm(mm);
+
+ return 0;
+@@ -1310,6 +1326,9 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
+ vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) |
+ mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
+
++ /* If uksm is enabled, we add VM_MERGABLE to new VMAs. */
++ uksm_vm_flags_mod(&vm_flags);
++
+ if (flags & MAP_LOCKED)
+ if (!can_do_mlock())
+ return -EPERM;
+@@ -1651,6 +1670,7 @@ munmap_back:
+ allow_write_access(file);
+ }
+ file = vma->vm_file;
++ uksm_vma_add_new(vma);
+ out:
+ perf_event_mmap(vma);
+
+@@ -1692,6 +1712,7 @@ allow_write_and_free_vma:
+ if (vm_flags & VM_DENYWRITE)
+ allow_write_access(file);
+ free_vma:
++ uksm_remove_vma(vma);
+ kmem_cache_free(vm_area_cachep, vma);
+ unacct_error:
+ if (charged)
+@@ -2488,6 +2509,8 @@ static int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
+ else
+ err = vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new);
+
++ uksm_vma_add_new(new);
++
+ /* Success. */
+ if (!err)
+ return 0;
+@@ -2654,6 +2677,7 @@ static unsigned long do_brk(unsigned long addr, unsigned long len)
+ return addr;
+
+ flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
++ uksm_vm_flags_mod(&flags);
+
+ error = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED);
+ if (error & ~PAGE_MASK)
+@@ -2712,6 +2736,7 @@ static unsigned long do_brk(unsigned long addr, unsigned long len)
+ vma->vm_flags = flags;
+ vma->vm_page_prot = vm_get_page_prot(flags);
+ vma_link(mm, vma, prev, rb_link, rb_parent);
++ uksm_vma_add_new(vma);
+ out:
+ perf_event_mmap(vma);
+ mm->total_vm += len >> PAGE_SHIFT;
+@@ -2747,6 +2772,12 @@ void exit_mmap(struct mm_struct *mm)
+ /* mm's last user has gone, and its about to be pulled down */
+ mmu_notifier_release(mm);
+
++ /*
++ * Taking write lock on mmap_sem does not harm others,
++ * but it's crucial for uksm to avoid races.
++ */
++ down_write(&mm->mmap_sem);
++
+ if (mm->locked_vm) {
+ vma = mm->mmap;
+ while (vma) {
+@@ -2783,6 +2814,11 @@ void exit_mmap(struct mm_struct *mm)
+ }
+ vm_unacct_memory(nr_accounted);
+
++ mm->mmap = NULL;
++ mm->mm_rb = RB_ROOT;
++ vmacache_invalidate(mm);
++ up_write(&mm->mmap_sem);
++
+ WARN_ON(atomic_long_read(&mm->nr_ptes) >
+ (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT);
+ }
+@@ -2891,6 +2927,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
+ new_vma->vm_ops->open(new_vma);
+ vma_link(mm, new_vma, prev, rb_link, rb_parent);
+ *need_rmap_locks = false;
++ uksm_vma_add_new(new_vma);
+ }
+ }
+ return new_vma;
+@@ -3004,10 +3041,10 @@ static struct vm_area_struct *__install_special_mapping(
+ ret = insert_vm_struct(mm, vma);
+ if (ret)
+ goto out;
+-
+ mm->total_vm += len >> PAGE_SHIFT;
+
+ perf_event_mmap(vma);
++ uksm_vma_add_new(vma);
+
+ return vma;
+
+diff --git a/mm/rmap.c b/mm/rmap.c
+index 3e4c721..d39d8a3 100644
+--- a/mm/rmap.c
++++ b/mm/rmap.c
+@@ -908,9 +908,9 @@ void page_move_anon_rmap(struct page *page,
+
+ /**
+ * __page_set_anon_rmap - set up new anonymous rmap
+- * @page: Page to add to rmap
++ * @page: Page to add to rmap
+ * @vma: VM area to add page to.
+- * @address: User virtual address of the mapping
++ * @address: User virtual address of the mapping
+ * @exclusive: the page is exclusively owned by the current process
+ */
+ static void __page_set_anon_rmap(struct page *page,
+diff --git a/mm/uksm.c b/mm/uksm.c
+new file mode 100644
+index 0000000..c76fcfc
+--- /dev/null
++++ b/mm/uksm.c
+@@ -0,0 +1,5519 @@
++/*
++ * Ultra KSM. Copyright (C) 2011-2012 Nai Xia
++ *
++ * This is an improvement upon KSM. Some basic data structures and routines
++ * are borrowed from ksm.c .
++ *
++ * Its new features:
++ * 1. Full system scan:
++ * It automatically scans all user processes' anonymous VMAs. Kernel-user
++ * interaction to submit a memory area to KSM is no longer needed.
++ *
++ * 2. Rich area detection:
++ * It automatically detects rich areas containing abundant duplicated
++ * pages based. Rich areas are given a full scan speed. Poor areas are
++ * sampled at a reasonable speed with very low CPU consumption.
++ *
++ * 3. Ultra Per-page scan speed improvement:
++ * A new hash algorithm is proposed. As a result, on a machine with
++ * Core(TM)2 Quad Q9300 CPU in 32-bit mode and 800MHZ DDR2 main memory, it
++ * can scan memory areas that does not contain duplicated pages at speed of
++ * 627MB/sec ~ 2445MB/sec and can merge duplicated areas at speed of
++ * 477MB/sec ~ 923MB/sec.
++ *
++ * 4. Thrashing area avoidance:
++ * Thrashing area(an VMA that has frequent Ksm page break-out) can be
++ * filtered out. My benchmark shows it's more efficient than KSM's per-page
++ * hash value based volatile page detection.
++ *
++ *
++ * 5. Misc changes upon KSM:
++ * * It has a fully x86-opitmized memcmp dedicated for 4-byte-aligned page
++ * comparison. It's much faster than default C version on x86.
++ * * rmap_item now has an struct *page member to loosely cache a
++ * address-->page mapping, which reduces too much time-costly
++ * follow_page().
++ * * The VMA creation/exit procedures are hooked to let the Ultra KSM know.
++ * * try_to_merge_two_pages() now can revert a pte if it fails. No break_
++ * ksm is needed for this case.
++ *
++ * 6. Full Zero Page consideration(contributed by Figo Zhang)
++ * Now uksmd consider full zero pages as special pages and merge them to an
++ * special unswappable uksm zero page.
++ */
++
++#include <linux/errno.h>
++#include <linux/mm.h>
++#include <linux/fs.h>
++#include <linux/mman.h>
++#include <linux/sched.h>
++#include <linux/rwsem.h>
++#include <linux/pagemap.h>
++#include <linux/rmap.h>
++#include <linux/spinlock.h>
++#include <linux/jhash.h>
++#include <linux/delay.h>
++#include <linux/kthread.h>
++#include <linux/wait.h>
++#include <linux/slab.h>
++#include <linux/rbtree.h>
++#include <linux/memory.h>
++#include <linux/mmu_notifier.h>
++#include <linux/swap.h>
++#include <linux/ksm.h>
++#include <linux/crypto.h>
++#include <linux/scatterlist.h>
++#include <crypto/hash.h>
++#include <linux/random.h>
++#include <linux/math64.h>
++#include <linux/gcd.h>
++#include <linux/freezer.h>
++#include <linux/sradix-tree.h>
++
++#include <asm/tlbflush.h>
++#include "internal.h"
++
++#ifdef CONFIG_X86
++#undef memcmp
++
++#ifdef CONFIG_X86_32
++#define memcmp memcmpx86_32
++/*
++ * Compare 4-byte-aligned address s1 and s2, with length n
++ */
++int memcmpx86_32(void *s1, void *s2, size_t n)
++{
++ size_t num = n / 4;
++ register int res;
++
++ __asm__ __volatile__
++ (
++ "testl %3,%3\n\t"
++ "repe; cmpsd\n\t"
++ "je 1f\n\t"
++ "sbbl %0,%0\n\t"
++ "orl $1,%0\n"
++ "1:"
++ : "=&a" (res), "+&S" (s1), "+&D" (s2), "+&c" (num)
++ : "0" (0)
++ : "cc");
++
++ return res;
++}
++
++/*
++ * Check the page is all zero ?
++ */
++static int is_full_zero(const void *s1, size_t len)
++{
++ unsigned char same;
++
++ len /= 4;
++
++ __asm__ __volatile__
++ ("repe; scasl;"
++ "sete %0"
++ : "=qm" (same), "+D" (s1), "+c" (len)
++ : "a" (0)
++ : "cc");
++
++ return same;
++}
++
++
++#elif defined(CONFIG_X86_64)
++#define memcmp memcmpx86_64
++/*
++ * Compare 8-byte-aligned address s1 and s2, with length n
++ */
++int memcmpx86_64(void *s1, void *s2, size_t n)
++{
++ size_t num = n / 8;
++ register int res;
++
++ __asm__ __volatile__
++ (
++ "testq %q3,%q3\n\t"
++ "repe; cmpsq\n\t"
++ "je 1f\n\t"
++ "sbbq %q0,%q0\n\t"
++ "orq $1,%q0\n"
++ "1:"
++ : "=&a" (res), "+&S" (s1), "+&D" (s2), "+&c" (num)
++ : "0" (0)
++ : "cc");
++
++ return res;
++}
++
++static int is_full_zero(const void *s1, size_t len)
++{
++ unsigned char same;
++
++ len /= 8;
++
++ __asm__ __volatile__
++ ("repe; scasq;"
++ "sete %0"
++ : "=qm" (same), "+D" (s1), "+c" (len)
++ : "a" (0)
++ : "cc");
++
++ return same;
++}
++
++#endif
++#else
++static int is_full_zero(const void *s1, size_t len)
++{
++ unsigned long *src = s1;
++ int i;
++
++ len /= sizeof(*src);
++
++ for (i = 0; i < len; i++) {
++ if (src[i])
++ return 0;
++ }
++
++ return 1;
++}
++#endif
++
++#define UKSM_RUNG_ROUND_FINISHED (1 << 0)
++#define TIME_RATIO_SCALE 10000
++
++#define SLOT_TREE_NODE_SHIFT 8
++#define SLOT_TREE_NODE_STORE_SIZE (1UL << SLOT_TREE_NODE_SHIFT)
++struct slot_tree_node {
++ unsigned long size;
++ struct sradix_tree_node snode;
++ void *stores[SLOT_TREE_NODE_STORE_SIZE];
++};
++
++static struct kmem_cache *slot_tree_node_cachep;
++
++static struct sradix_tree_node *slot_tree_node_alloc(void)
++{
++ struct slot_tree_node *p;
++ p = kmem_cache_zalloc(slot_tree_node_cachep, GFP_KERNEL);
++ if (!p)
++ return NULL;
++
++ return &p->snode;
++}
++
++static void slot_tree_node_free(struct sradix_tree_node *node)
++{
++ struct slot_tree_node *p;
++
++ p = container_of(node, struct slot_tree_node, snode);
++ kmem_cache_free(slot_tree_node_cachep, p);
++}
++
++static void slot_tree_node_extend(struct sradix_tree_node *parent,
++ struct sradix_tree_node *child)
++{
++ struct slot_tree_node *p, *c;
++
++ p = container_of(parent, struct slot_tree_node, snode);
++ c = container_of(child, struct slot_tree_node, snode);
++
++ p->size += c->size;
++}
++
++void slot_tree_node_assign(struct sradix_tree_node *node,
++ unsigned index, void *item)
++{
++ struct vma_slot *slot = item;
++ struct slot_tree_node *cur;
++
++ slot->snode = node;
++ slot->sindex = index;
++
++ while (node) {
++ cur = container_of(node, struct slot_tree_node, snode);
++ cur->size += slot->pages;
++ node = node->parent;
++ }
++}
++
++void slot_tree_node_rm(struct sradix_tree_node *node, unsigned offset)
++{
++ struct vma_slot *slot;
++ struct slot_tree_node *cur;
++ unsigned long pages;
++
++ if (node->height == 1) {
++ slot = node->stores[offset];
++ pages = slot->pages;
++ } else {
++ cur = container_of(node->stores[offset],
++ struct slot_tree_node, snode);
++ pages = cur->size;
++ }
++
++ while (node) {
++ cur = container_of(node, struct slot_tree_node, snode);
++ cur->size -= pages;
++ node = node->parent;
++ }
++}
++
++unsigned long slot_iter_index;
++int slot_iter(void *item, unsigned long height)
++{
++ struct slot_tree_node *node;
++ struct vma_slot *slot;
++
++ if (height == 1) {
++ slot = item;
++ if (slot_iter_index < slot->pages) {
++ /*in this one*/
++ return 1;
++ } else {
++ slot_iter_index -= slot->pages;
++ return 0;
++ }
++
++ } else {
++ node = container_of(item, struct slot_tree_node, snode);
++ if (slot_iter_index < node->size) {
++ /*in this one*/
++ return 1;
++ } else {
++ slot_iter_index -= node->size;
++ return 0;
++ }
++ }
++}
++
++
++static inline void slot_tree_init_root(struct sradix_tree_root *root)
++{
++ init_sradix_tree_root(root, SLOT_TREE_NODE_SHIFT);
++ root->alloc = slot_tree_node_alloc;
++ root->free = slot_tree_node_free;
++ root->extend = slot_tree_node_extend;
++ root->assign = slot_tree_node_assign;
++ root->rm = slot_tree_node_rm;
++}
++
++void slot_tree_init(void)
++{
++ slot_tree_node_cachep = kmem_cache_create("slot_tree_node",
++ sizeof(struct slot_tree_node), 0,
++ SLAB_PANIC | SLAB_RECLAIM_ACCOUNT,
++ NULL);
++}
++
++
++/* Each rung of this ladder is a list of VMAs having a same scan ratio */
++struct scan_rung {
++ //struct list_head scanned_list;
++ struct sradix_tree_root vma_root;
++ struct sradix_tree_root vma_root2;
++
++ struct vma_slot *current_scan;
++ unsigned long current_offset;
++
++ /*
++ * The initial value for current_offset, it should loop over
++ * [0~ step - 1] to let all slot have its chance to be scanned.
++ */
++ unsigned long offset_init;
++ unsigned long step; /* dynamic step for current_offset */
++ unsigned int flags;
++ unsigned long pages_to_scan;
++ //unsigned long fully_scanned_slots;
++ /*
++ * a little bit tricky - if cpu_time_ratio > 0, then the value is the
++ * the cpu time ratio it can spend in rung_i for every scan
++ * period. if < 0, then it is the cpu time ratio relative to the
++ * max cpu percentage user specified. Both in unit of
++ * 1/TIME_RATIO_SCALE
++ */
++ int cpu_ratio;
++
++ /*
++ * How long it will take for all slots in this rung to be fully
++ * scanned? If it's zero, we don't care about the cover time:
++ * it's fully scanned.
++ */
++ unsigned int cover_msecs;
++ //unsigned long vma_num;
++ //unsigned long pages; /* Sum of all slot's pages in rung */
++};
++
++/**
++ * node of either the stable or unstale rbtree
++ *
++ */
++struct tree_node {
++ struct rb_node node; /* link in the main (un)stable rbtree */
++ struct rb_root sub_root; /* rb_root for sublevel collision rbtree */
++ u32 hash;
++ unsigned long count; /* TODO: merged with sub_root */
++ struct list_head all_list; /* all tree nodes in stable/unstable tree */
++};
++
++/**
++ * struct stable_node - node of the stable rbtree
++ * @node: rb node of this ksm page in the stable tree
++ * @hlist: hlist head of rmap_items using this ksm page
++ * @kpfn: page frame number of this ksm page
++ */
++struct stable_node {
++ struct rb_node node; /* link in sub-rbtree */
++ struct tree_node *tree_node; /* it's tree node root in stable tree, NULL if it's in hell list */
++ struct hlist_head hlist;
++ unsigned long kpfn;
++ u32 hash_max; /* if ==0 then it's not been calculated yet */
++ struct list_head all_list; /* in a list for all stable nodes */
++};
++
++/**
++ * struct node_vma - group rmap_items linked in a same stable
++ * node together.
++ */
++struct node_vma {
++ union {
++ struct vma_slot *slot;
++ unsigned long key; /* slot is used as key sorted on hlist */
++ };
++ struct hlist_node hlist;
++ struct hlist_head rmap_hlist;
++ struct stable_node *head;
++};
++
++/**
++ * struct rmap_item - reverse mapping item for virtual addresses
++ * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list
++ * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree
++ * @mm: the memory structure this rmap_item is pointing into
++ * @address: the virtual address this rmap_item tracks (+ flags in low bits)
++ * @node: rb node of this rmap_item in the unstable tree
++ * @head: pointer to stable_node heading this list in the stable tree
++ * @hlist: link into hlist of rmap_items hanging off that stable_node
++ */
++struct rmap_item {
++ struct vma_slot *slot;
++ struct page *page;
++ unsigned long address; /* + low bits used for flags below */
++ unsigned long hash_round;
++ unsigned long entry_index;
++ union {
++ struct {/* when in unstable tree */
++ struct rb_node node;
++ struct tree_node *tree_node;
++ u32 hash_max;
++ };
++ struct { /* when in stable tree */
++ struct node_vma *head;
++ struct hlist_node hlist;
++ struct anon_vma *anon_vma;
++ };
++ };
++} __attribute__((aligned(4)));
++
++struct rmap_list_entry {
++ union {
++ struct rmap_item *item;
++ unsigned long addr;
++ };
++ /* lowest bit is used for is_addr tag */
++} __attribute__((aligned(4))); /* 4 aligned to fit in to pages*/
++
++
++/* Basic data structure definition ends */
++
++
++/*
++ * Flags for rmap_item to judge if it's listed in the stable/unstable tree.
++ * The flags use the low bits of rmap_item.address
++ */
++#define UNSTABLE_FLAG 0x1
++#define STABLE_FLAG 0x2
++#define get_rmap_addr(x) ((x)->address & PAGE_MASK)
++
++/*
++ * rmap_list_entry helpers
++ */
++#define IS_ADDR_FLAG 1
++#define is_addr(ptr) ((unsigned long)(ptr) & IS_ADDR_FLAG)
++#define set_is_addr(ptr) ((ptr) |= IS_ADDR_FLAG)
++#define get_clean_addr(ptr) (((ptr) & ~(__typeof__(ptr))IS_ADDR_FLAG))
++
++
++/*
++ * High speed caches for frequently allocated and freed structs
++ */
++static struct kmem_cache *rmap_item_cache;
++static struct kmem_cache *stable_node_cache;
++static struct kmem_cache *node_vma_cache;
++static struct kmem_cache *vma_slot_cache;
++static struct kmem_cache *tree_node_cache;
++#define UKSM_KMEM_CACHE(__struct, __flags) kmem_cache_create("uksm_"#__struct,\
++ sizeof(struct __struct), __alignof__(struct __struct),\
++ (__flags), NULL)
++
++/* Array of all scan_rung, uksm_scan_ladder[0] having the minimum scan ratio */
++#define SCAN_LADDER_SIZE 4
++static struct scan_rung uksm_scan_ladder[SCAN_LADDER_SIZE];
++
++/* The evaluation rounds uksmd has finished */
++static unsigned long long uksm_eval_round = 1;
++
++/*
++ * we add 1 to this var when we consider we should rebuild the whole
++ * unstable tree.
++ */
++static unsigned long uksm_hash_round = 1;
++
++/*
++ * How many times the whole memory is scanned.
++ */
++static unsigned long long fully_scanned_round = 1;
++
++/* The total number of virtual pages of all vma slots */
++static u64 uksm_pages_total;
++
++/* The number of pages has been scanned since the start up */
++static u64 uksm_pages_scanned;
++
++static u64 scanned_virtual_pages;
++
++/* The number of pages has been scanned since last encode_benefit call */
++static u64 uksm_pages_scanned_last;
++
++/* If the scanned number is tooo large, we encode it here */
++static u64 pages_scanned_stored;
++
++static unsigned long pages_scanned_base;
++
++/* The number of nodes in the stable tree */
++static unsigned long uksm_pages_shared;
++
++/* The number of page slots additionally sharing those nodes */
++static unsigned long uksm_pages_sharing;
++
++/* The number of nodes in the unstable tree */
++static unsigned long uksm_pages_unshared;
++
++/*
++ * Milliseconds ksmd should sleep between scans,
++ * >= 100ms to be consistent with
++ * scan_time_to_sleep_msec()
++ */
++static unsigned int uksm_sleep_jiffies;
++
++/* The real value for the uksmd next sleep */
++static unsigned int uksm_sleep_real;
++
++/* Saved value for user input uksm_sleep_jiffies when it's enlarged */
++static unsigned int uksm_sleep_saved;
++
++/* Max percentage of cpu utilization ksmd can take to scan in one batch */
++static unsigned int uksm_max_cpu_percentage;
++
++static int uksm_cpu_governor;
++
++static char *uksm_cpu_governor_str[4] = { "full", "medium", "low", "quiet" };
++
++struct uksm_cpu_preset_s {
++ int cpu_ratio[SCAN_LADDER_SIZE];
++ unsigned int cover_msecs[SCAN_LADDER_SIZE];
++ unsigned int max_cpu; /* percentage */
++};
++
++struct uksm_cpu_preset_s uksm_cpu_preset[4] = {
++ { {20, 40, -2500, -10000}, {1000, 500, 200, 50}, 95},
++ { {20, 30, -2500, -10000}, {1000, 500, 400, 100}, 50},
++ { {10, 20, -5000, -10000}, {1500, 1000, 1000, 250}, 20},
++ { {10, 20, 40, 75}, {2000, 1000, 1000, 1000}, 1},
++};
++
++/* The default value for uksm_ema_page_time if it's not initialized */
++#define UKSM_PAGE_TIME_DEFAULT 500
++
++/*cost to scan one page by expotional moving average in nsecs */
++static unsigned long uksm_ema_page_time = UKSM_PAGE_TIME_DEFAULT;
++
++/* The expotional moving average alpha weight, in percentage. */
++#define EMA_ALPHA 20
++
++/*
++ * The threshold used to filter out thrashing areas,
++ * If it == 0, filtering is disabled, otherwise it's the percentage up-bound
++ * of the thrashing ratio of all areas. Any area with a bigger thrashing ratio
++ * will be considered as having a zero duplication ratio.
++ */
++static unsigned int uksm_thrash_threshold = 50;
++
++/* How much dedup ratio is considered to be abundant*/
++static unsigned int uksm_abundant_threshold = 10;
++
++/* All slots having merged pages in this eval round. */
++struct list_head vma_slot_dedup = LIST_HEAD_INIT(vma_slot_dedup);
++
++/* How many times the ksmd has slept since startup */
++static unsigned long long uksm_sleep_times;
++
++#define UKSM_RUN_STOP 0
++#define UKSM_RUN_MERGE 1
++static unsigned int uksm_run = 1;
++
++static DECLARE_WAIT_QUEUE_HEAD(uksm_thread_wait);
++static DEFINE_MUTEX(uksm_thread_mutex);
++
++/*
++ * List vma_slot_new is for newly created vma_slot waiting to be added by
++ * ksmd. If one cannot be added(e.g. due to it's too small), it's moved to
++ * vma_slot_noadd. vma_slot_del is the list for vma_slot whose corresponding
++ * VMA has been removed/freed.
++ */
++struct list_head vma_slot_new = LIST_HEAD_INIT(vma_slot_new);
++struct list_head vma_slot_noadd = LIST_HEAD_INIT(vma_slot_noadd);
++struct list_head vma_slot_del = LIST_HEAD_INIT(vma_slot_del);
++static DEFINE_SPINLOCK(vma_slot_list_lock);
++
++/* The unstable tree heads */
++static struct rb_root root_unstable_tree = RB_ROOT;
++
++/*
++ * All tree_nodes are in a list to be freed at once when unstable tree is
++ * freed after each scan round.
++ */
++static struct list_head unstable_tree_node_list =
++ LIST_HEAD_INIT(unstable_tree_node_list);
++
++/* List contains all stable nodes */
++static struct list_head stable_node_list = LIST_HEAD_INIT(stable_node_list);
++
++/*
++ * When the hash strength is changed, the stable tree must be delta_hashed and
++ * re-structured. We use two set of below structs to speed up the
++ * re-structuring of stable tree.
++ */
++static struct list_head
++stable_tree_node_list[2] = {LIST_HEAD_INIT(stable_tree_node_list[0]),
++ LIST_HEAD_INIT(stable_tree_node_list[1])};
++
++static struct list_head *stable_tree_node_listp = &stable_tree_node_list[0];
++static struct rb_root root_stable_tree[2] = {RB_ROOT, RB_ROOT};
++static struct rb_root *root_stable_treep = &root_stable_tree[0];
++static unsigned long stable_tree_index;
++
++/* The hash strength needed to hash a full page */
++#define HASH_STRENGTH_FULL (PAGE_SIZE / sizeof(u32))
++
++/* The hash strength needed for loop-back hashing */
++#define HASH_STRENGTH_MAX (HASH_STRENGTH_FULL + 10)
++
++/* The random offsets in a page */
++static u32 *random_nums;
++
++/* The hash strength */
++static unsigned long hash_strength = HASH_STRENGTH_FULL >> 4;
++
++/* The delta value each time the hash strength increases or decreases */
++static unsigned long hash_strength_delta;
++#define HASH_STRENGTH_DELTA_MAX 5
++
++/* The time we have saved due to random_sample_hash */
++static u64 rshash_pos;
++
++/* The time we have wasted due to hash collision */
++static u64 rshash_neg;
++
++struct uksm_benefit {
++ u64 pos;
++ u64 neg;
++ u64 scanned;
++ unsigned long base;
++} benefit;
++
++/*
++ * The relative cost of memcmp, compared to 1 time unit of random sample
++ * hash, this value is tested when ksm module is initialized
++ */
++static unsigned long memcmp_cost;
++
++static unsigned long rshash_neg_cont_zero;
++static unsigned long rshash_cont_obscure;
++
++/* The possible states of hash strength adjustment heuristic */
++enum rshash_states {
++ RSHASH_STILL,
++ RSHASH_TRYUP,
++ RSHASH_TRYDOWN,
++ RSHASH_NEW,
++ RSHASH_PRE_STILL,
++};
++
++/* The possible direction we are about to adjust hash strength */
++enum rshash_direct {
++ GO_UP,
++ GO_DOWN,
++ OBSCURE,
++ STILL,
++};
++
++/* random sampling hash state machine */
++static struct {
++ enum rshash_states state;
++ enum rshash_direct pre_direct;
++ u8 below_count;
++ /* Keep a lookup window of size 5, iff above_count/below_count > 3
++ * in this window we stop trying.
++ */
++ u8 lookup_window_index;
++ u64 stable_benefit;
++ unsigned long turn_point_down;
++ unsigned long turn_benefit_down;
++ unsigned long turn_point_up;
++ unsigned long turn_benefit_up;
++ unsigned long stable_point;
++} rshash_state;
++
++/*zero page hash table, hash_strength [0 ~ HASH_STRENGTH_MAX]*/
++static u32 *zero_hash_table;
++
++static inline struct node_vma *alloc_node_vma(void)
++{
++ struct node_vma *node_vma;
++ node_vma = kmem_cache_zalloc(node_vma_cache, GFP_KERNEL);
++ if (node_vma) {
++ INIT_HLIST_HEAD(&node_vma->rmap_hlist);
++ INIT_HLIST_NODE(&node_vma->hlist);
++ }
++ return node_vma;
++}
++
++static inline void free_node_vma(struct node_vma *node_vma)
++{
++ kmem_cache_free(node_vma_cache, node_vma);
++}
++
++
++static inline struct vma_slot *alloc_vma_slot(void)
++{
++ struct vma_slot *slot;
++
++ /*
++ * In case ksm is not initialized by now.
++ * Oops, we need to consider the call site of uksm_init() in the future.
++ */
++ if (!vma_slot_cache)
++ return NULL;
++
++ slot = kmem_cache_zalloc(vma_slot_cache, GFP_KERNEL);
++ if (slot) {
++ INIT_LIST_HEAD(&slot->slot_list);
++ INIT_LIST_HEAD(&slot->dedup_list);
++ slot->flags |= UKSM_SLOT_NEED_RERAND;
++ }
++ return slot;
++}
++
++static inline void free_vma_slot(struct vma_slot *vma_slot)
++{
++ kmem_cache_free(vma_slot_cache, vma_slot);
++}
++
++
++
++static inline struct rmap_item *alloc_rmap_item(void)
++{
++ struct rmap_item *rmap_item;
++
++ rmap_item = kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL);
++ if (rmap_item) {
++ /* bug on lowest bit is not clear for flag use */
++ BUG_ON(is_addr(rmap_item));
++ }
++ return rmap_item;
++}
++
++static inline void free_rmap_item(struct rmap_item *rmap_item)
++{
++ rmap_item->slot = NULL; /* debug safety */
++ kmem_cache_free(rmap_item_cache, rmap_item);
++}
++
++static inline struct stable_node *alloc_stable_node(void)
++{
++ struct stable_node *node;
++ node = kmem_cache_alloc(stable_node_cache, GFP_KERNEL | GFP_ATOMIC);
++ if (!node)
++ return NULL;
++
++ INIT_HLIST_HEAD(&node->hlist);
++ list_add(&node->all_list, &stable_node_list);
++ return node;
++}
++
++static inline void free_stable_node(struct stable_node *stable_node)
++{
++ list_del(&stable_node->all_list);
++ kmem_cache_free(stable_node_cache, stable_node);
++}
++
++static inline struct tree_node *alloc_tree_node(struct list_head *list)
++{
++ struct tree_node *node;
++ node = kmem_cache_zalloc(tree_node_cache, GFP_KERNEL | GFP_ATOMIC);
++ if (!node)
++ return NULL;
++
++ list_add(&node->all_list, list);
++ return node;
++}
++
++static inline void free_tree_node(struct tree_node *node)
++{
++ list_del(&node->all_list);
++ kmem_cache_free(tree_node_cache, node);
++}
++
++static void uksm_drop_anon_vma(struct rmap_item *rmap_item)
++{
++ struct anon_vma *anon_vma = rmap_item->anon_vma;
++
++ put_anon_vma(anon_vma);
++}
++
++
++/**
++ * Remove a stable node from stable_tree, may unlink from its tree_node and
++ * may remove its parent tree_node if no other stable node is pending.
++ *
++ * @stable_node The node need to be removed
++ * @unlink_rb Will this node be unlinked from the rbtree?
++ * @remove_tree_ node Will its tree_node be removed if empty?
++ */
++static void remove_node_from_stable_tree(struct stable_node *stable_node,
++ int unlink_rb, int remove_tree_node)
++{
++ struct node_vma *node_vma;
++ struct rmap_item *rmap_item;
++ struct hlist_node *n;
++
++ if (!hlist_empty(&stable_node->hlist)) {
++ hlist_for_each_entry_safe(node_vma, n,
++ &stable_node->hlist, hlist) {
++ hlist_for_each_entry(rmap_item, &node_vma->rmap_hlist, hlist) {
++ uksm_pages_sharing--;
++
++ uksm_drop_anon_vma(rmap_item);
++ rmap_item->address &= PAGE_MASK;
++ }
++ free_node_vma(node_vma);
++ cond_resched();
++ }
++
++ /* the last one is counted as shared */
++ uksm_pages_shared--;
++ uksm_pages_sharing++;
++ }
++
++ if (stable_node->tree_node && unlink_rb) {
++ rb_erase(&stable_node->node,
++ &stable_node->tree_node->sub_root);
++
++ if (RB_EMPTY_ROOT(&stable_node->tree_node->sub_root) &&
++ remove_tree_node) {
++ rb_erase(&stable_node->tree_node->node,
++ root_stable_treep);
++ free_tree_node(stable_node->tree_node);
++ } else {
++ stable_node->tree_node->count--;
++ }
++ }
++
++ free_stable_node(stable_node);
++}
++
++
++/*
++ * get_uksm_page: checks if the page indicated by the stable node
++ * is still its ksm page, despite having held no reference to it.
++ * In which case we can trust the content of the page, and it
++ * returns the gotten page; but if the page has now been zapped,
++ * remove the stale node from the stable tree and return NULL.
++ *
++ * You would expect the stable_node to hold a reference to the ksm page.
++ * But if it increments the page's count, swapping out has to wait for
++ * ksmd to come around again before it can free the page, which may take
++ * seconds or even minutes: much too unresponsive. So instead we use a
++ * "keyhole reference": access to the ksm page from the stable node peeps
++ * out through its keyhole to see if that page still holds the right key,
++ * pointing back to this stable node. This relies on freeing a PageAnon
++ * page to reset its page->mapping to NULL, and relies on no other use of
++ * a page to put something that might look like our key in page->mapping.
++ *
++ * include/linux/pagemap.h page_cache_get_speculative() is a good reference,
++ * but this is different - made simpler by uksm_thread_mutex being held, but
++ * interesting for assuming that no other use of the struct page could ever
++ * put our expected_mapping into page->mapping (or a field of the union which
++ * coincides with page->mapping). The RCU calls are not for KSM at all, but
++ * to keep the page_count protocol described with page_cache_get_speculative.
++ *
++ * Note: it is possible that get_uksm_page() will return NULL one moment,
++ * then page the next, if the page is in between page_freeze_refs() and
++ * page_unfreeze_refs(): this shouldn't be a problem anywhere, the page
++ * is on its way to being freed; but it is an anomaly to bear in mind.
++ *
++ * @unlink_rb: if the removal of this node will firstly unlink from
++ * its rbtree. stable_node_reinsert will prevent this when restructuring the
++ * node from its old tree.
++ *
++ * @remove_tree_node: if this is the last one of its tree_node, will the
++ * tree_node be freed ? If we are inserting stable node, this tree_node may
++ * be reused, so don't free it.
++ */
++static struct page *get_uksm_page(struct stable_node *stable_node,
++ int unlink_rb, int remove_tree_node)
++{
++ struct page *page;
++ void *expected_mapping;
++
++ page = pfn_to_page(stable_node->kpfn);
++ expected_mapping = (void *)stable_node +
++ (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM);
++ rcu_read_lock();
++ if (page->mapping != expected_mapping)
++ goto stale;
++ if (!get_page_unless_zero(page))
++ goto stale;
++ if (page->mapping != expected_mapping) {
++ put_page(page);
++ goto stale;
++ }
++ rcu_read_unlock();
++ return page;
++stale:
++ rcu_read_unlock();
++ remove_node_from_stable_tree(stable_node, unlink_rb, remove_tree_node);
++
++ return NULL;
++}
++
++/*
++ * Removing rmap_item from stable or unstable tree.
++ * This function will clean the information from the stable/unstable tree.
++ */
++static inline void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
++{
++ if (rmap_item->address & STABLE_FLAG) {
++ struct stable_node *stable_node;
++ struct node_vma *node_vma;
++ struct page *page;
++
++ node_vma = rmap_item->head;
++ stable_node = node_vma->head;
++ page = get_uksm_page(stable_node, 1, 1);
++ if (!page)
++ goto out;
++
++ /*
++ * page lock is needed because it's racing with
++ * try_to_unmap_ksm(), etc.
++ */
++ lock_page(page);
++ hlist_del(&rmap_item->hlist);
++
++ if (hlist_empty(&node_vma->rmap_hlist)) {
++ hlist_del(&node_vma->hlist);
++ free_node_vma(node_vma);
++ }
++ unlock_page(page);
++
++ put_page(page);
++ if (hlist_empty(&stable_node->hlist)) {
++ /* do NOT call remove_node_from_stable_tree() here,
++ * it's possible for a forked rmap_item not in
++ * stable tree while the in-tree rmap_items were
++ * deleted.
++ */
++ uksm_pages_shared--;
++ } else
++ uksm_pages_sharing--;
++
++
++ uksm_drop_anon_vma(rmap_item);
++ } else if (rmap_item->address & UNSTABLE_FLAG) {
++ if (rmap_item->hash_round == uksm_hash_round) {
++
++ rb_erase(&rmap_item->node,
++ &rmap_item->tree_node->sub_root);
++ if (RB_EMPTY_ROOT(&rmap_item->tree_node->sub_root)) {
++ rb_erase(&rmap_item->tree_node->node,
++ &root_unstable_tree);
++
++ free_tree_node(rmap_item->tree_node);
++ } else
++ rmap_item->tree_node->count--;
++ }
++ uksm_pages_unshared--;
++ }
++
++ rmap_item->address &= PAGE_MASK;
++ rmap_item->hash_max = 0;
++
++out:
++ cond_resched(); /* we're called from many long loops */
++}
++
++static inline int slot_in_uksm(struct vma_slot *slot)
++{
++ return list_empty(&slot->slot_list);
++}
++
++/*
++ * Test if the mm is exiting
++ */
++static inline bool uksm_test_exit(struct mm_struct *mm)
++{
++ return atomic_read(&mm->mm_users) == 0;
++}
++
++/**
++ * Need to do two things:
++ * 1. check if slot was moved to del list
++ * 2. make sure the mmap_sem is manipulated under valid vma.
++ *
++ * My concern here is that in some cases, this may make
++ * vma_slot_list_lock() waiters to serialized further by some
++ * sem->wait_lock, can this really be expensive?
++ *
++ *
++ * @return
++ * 0: if successfully locked mmap_sem
++ * -ENOENT: this slot was moved to del list
++ * -EBUSY: vma lock failed
++ */
++static int try_down_read_slot_mmap_sem(struct vma_slot *slot)
++{
++ struct vm_area_struct *vma;
++ struct mm_struct *mm;
++ struct rw_semaphore *sem;
++
++ spin_lock(&vma_slot_list_lock);
++
++ /* the slot_list was removed and inited from new list, when it enters
++ * uksm_list. If now it's not empty, then it must be moved to del list
++ */
++ if (!slot_in_uksm(slot)) {
++ spin_unlock(&vma_slot_list_lock);
++ return -ENOENT;
++ }
++
++ BUG_ON(slot->pages != vma_pages(slot->vma));
++ /* Ok, vma still valid */
++ vma = slot->vma;
++ mm = vma->vm_mm;
++ sem = &mm->mmap_sem;
++
++ if (uksm_test_exit(mm)) {
++ spin_unlock(&vma_slot_list_lock);
++ return -ENOENT;
++ }
++
++ if (down_read_trylock(sem)) {
++ spin_unlock(&vma_slot_list_lock);
++ return 0;
++ }
++
++ spin_unlock(&vma_slot_list_lock);
++ return -EBUSY;
++}
++
++static inline unsigned long
++vma_page_address(struct page *page, struct vm_area_struct *vma)
++{
++ pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
++ unsigned long address;
++
++ address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
++ if (unlikely(address < vma->vm_start || address >= vma->vm_end)) {
++ /* page should be within @vma mapping range */
++ return -EFAULT;
++ }
++ return address;
++}
++
++
++/* return 0 on success with the item's mmap_sem locked */
++static inline int get_mergeable_page_lock_mmap(struct rmap_item *item)
++{
++ struct mm_struct *mm;
++ struct vma_slot *slot = item->slot;
++ int err = -EINVAL;
++
++ struct page *page;
++
++ /*
++ * try_down_read_slot_mmap_sem() returns non-zero if the slot
++ * has been removed by uksm_remove_vma().
++ */
++ if (try_down_read_slot_mmap_sem(slot))
++ return -EBUSY;
++
++ mm = slot->vma->vm_mm;
++
++ if (uksm_test_exit(mm))
++ goto failout_up;
++
++ page = item->page;
++ rcu_read_lock();
++ if (!get_page_unless_zero(page)) {
++ rcu_read_unlock();
++ goto failout_up;
++ }
++
++ /* No need to consider huge page here. */
++ if (item->slot->vma->anon_vma != page_anon_vma(page) ||
++ vma_page_address(page, item->slot->vma) != get_rmap_addr(item)) {
++ /*
++ * TODO:
++ * should we release this item becase of its stale page
++ * mapping?
++ */
++ put_page(page);
++ rcu_read_unlock();
++ goto failout_up;
++ }
++ rcu_read_unlock();
++ return 0;
++
++failout_up:
++ up_read(&mm->mmap_sem);
++ return err;
++}
++
++/*
++ * What kind of VMA is considered ?
++ */
++static inline int vma_can_enter(struct vm_area_struct *vma)
++{
++ return uksm_flags_can_scan(vma->vm_flags);
++}
++
++/*
++ * Called whenever a fresh new vma is created A new vma_slot.
++ * is created and inserted into a global list Must be called.
++ * after vma is inserted to its mm .
++ */
++void uksm_vma_add_new(struct vm_area_struct *vma)
++{
++ struct vma_slot *slot;
++
++ if (!vma_can_enter(vma)) {
++ vma->uksm_vma_slot = NULL;
++ return;
++ }
++
++ slot = alloc_vma_slot();
++ if (!slot) {
++ vma->uksm_vma_slot = NULL;
++ return;
++ }
++
++ vma->uksm_vma_slot = slot;
++ vma->vm_flags |= VM_MERGEABLE;
++ slot->vma = vma;
++ slot->mm = vma->vm_mm;
++ slot->ctime_j = jiffies;
++ slot->pages = vma_pages(vma);
++ spin_lock(&vma_slot_list_lock);
++ list_add_tail(&slot->slot_list, &vma_slot_new);
++ spin_unlock(&vma_slot_list_lock);
++}
++
++/*
++ * Called after vma is unlinked from its mm
++ */
++void uksm_remove_vma(struct vm_area_struct *vma)
++{
++ struct vma_slot *slot;
++
++ if (!vma->uksm_vma_slot)
++ return;
++
++ slot = vma->uksm_vma_slot;
++ spin_lock(&vma_slot_list_lock);
++ if (slot_in_uksm(slot)) {
++ /**
++ * This slot has been added by ksmd, so move to the del list
++ * waiting ksmd to free it.
++ */
++ list_add_tail(&slot->slot_list, &vma_slot_del);
++ } else {
++ /**
++ * It's still on new list. It's ok to free slot directly.
++ */
++ list_del(&slot->slot_list);
++ free_vma_slot(slot);
++ }
++ spin_unlock(&vma_slot_list_lock);
++ vma->uksm_vma_slot = NULL;
++}
++
++/* 32/3 < they < 32/2 */
++#define shiftl 8
++#define shiftr 12
++
++#define HASH_FROM_TO(from, to) \
++for (index = from; index < to; index++) { \
++ pos = random_nums[index]; \
++ hash += key[pos]; \
++ hash += (hash << shiftl); \
++ hash ^= (hash >> shiftr); \
++}
++
++
++#define HASH_FROM_DOWN_TO(from, to) \
++for (index = from - 1; index >= to; index--) { \
++ hash ^= (hash >> shiftr); \
++ hash ^= (hash >> (shiftr*2)); \
++ hash -= (hash << shiftl); \
++ hash += (hash << (shiftl*2)); \
++ pos = random_nums[index]; \
++ hash -= key[pos]; \
++}
++
++/*
++ * The main random sample hash function.
++ */
++static u32 random_sample_hash(void *addr, u32 hash_strength)
++{
++ u32 hash = 0xdeadbeef;
++ int index, pos, loop = hash_strength;
++ u32 *key = (u32 *)addr;
++
++ if (loop > HASH_STRENGTH_FULL)
++ loop = HASH_STRENGTH_FULL;
++
++ HASH_FROM_TO(0, loop);
++
++ if (hash_strength > HASH_STRENGTH_FULL) {
++ loop = hash_strength - HASH_STRENGTH_FULL;
++ HASH_FROM_TO(0, loop);
++ }
++
++ return hash;
++}
++
++
++/**
++ * It's used when hash strength is adjusted
++ *
++ * @addr The page's virtual address
++ * @from The original hash strength
++ * @to The hash strength changed to
++ * @hash The hash value generated with "from" hash value
++ *
++ * return the hash value
++ */
++static u32 delta_hash(void *addr, int from, int to, u32 hash)
++{
++ u32 *key = (u32 *)addr;
++ int index, pos; /* make sure they are int type */
++
++ if (to > from) {
++ if (from >= HASH_STRENGTH_FULL) {
++ from -= HASH_STRENGTH_FULL;
++ to -= HASH_STRENGTH_FULL;
++ HASH_FROM_TO(from, to);
++ } else if (to <= HASH_STRENGTH_FULL) {
++ HASH_FROM_TO(from, to);
++ } else {
++ HASH_FROM_TO(from, HASH_STRENGTH_FULL);
++ HASH_FROM_TO(0, to - HASH_STRENGTH_FULL);
++ }
++ } else {
++ if (from <= HASH_STRENGTH_FULL) {
++ HASH_FROM_DOWN_TO(from, to);
++ } else if (to >= HASH_STRENGTH_FULL) {
++ from -= HASH_STRENGTH_FULL;
++ to -= HASH_STRENGTH_FULL;
++ HASH_FROM_DOWN_TO(from, to);
++ } else {
++ HASH_FROM_DOWN_TO(from - HASH_STRENGTH_FULL, 0);
++ HASH_FROM_DOWN_TO(HASH_STRENGTH_FULL, to);
++ }
++ }
++
++ return hash;
++}
++
++
++
++
++#define CAN_OVERFLOW_U64(x, delta) (U64_MAX - (x) < (delta))
++
++/**
++ *
++ * Called when: rshash_pos or rshash_neg is about to overflow or a scan round
++ * has finished.
++ *
++ * return 0 if no page has been scanned since last call, 1 otherwise.
++ */
++static inline int encode_benefit(void)
++{
++ u64 scanned_delta, pos_delta, neg_delta;
++ unsigned long base = benefit.base;
++
++ scanned_delta = uksm_pages_scanned - uksm_pages_scanned_last;
++
++ if (!scanned_delta)
++ return 0;
++
++ scanned_delta >>= base;
++ pos_delta = rshash_pos >> base;
++ neg_delta = rshash_neg >> base;
++
++ if (CAN_OVERFLOW_U64(benefit.pos, pos_delta) ||
++ CAN_OVERFLOW_U64(benefit.neg, neg_delta) ||
++ CAN_OVERFLOW_U64(benefit.scanned, scanned_delta)) {
++ benefit.scanned >>= 1;
++ benefit.neg >>= 1;
++ benefit.pos >>= 1;
++ benefit.base++;
++ scanned_delta >>= 1;
++ pos_delta >>= 1;
++ neg_delta >>= 1;
++ }
++
++ benefit.pos += pos_delta;
++ benefit.neg += neg_delta;
++ benefit.scanned += scanned_delta;
++
++ BUG_ON(!benefit.scanned);
++
++ rshash_pos = rshash_neg = 0;
++ uksm_pages_scanned_last = uksm_pages_scanned;
++
++ return 1;
++}
++
++static inline void reset_benefit(void)
++{
++ benefit.pos = 0;
++ benefit.neg = 0;
++ benefit.base = 0;
++ benefit.scanned = 0;
++}
++
++static inline void inc_rshash_pos(unsigned long delta)
++{
++ if (CAN_OVERFLOW_U64(rshash_pos, delta))
++ encode_benefit();
++
++ rshash_pos += delta;
++}
++
++static inline void inc_rshash_neg(unsigned long delta)
++{
++ if (CAN_OVERFLOW_U64(rshash_neg, delta))
++ encode_benefit();
++
++ rshash_neg += delta;
++}
++
++
++static inline u32 page_hash(struct page *page, unsigned long hash_strength,
++ int cost_accounting)
++{
++ u32 val;
++ unsigned long delta;
++
++ void *addr = kmap_atomic(page);
++
++ val = random_sample_hash(addr, hash_strength);
++ kunmap_atomic(addr);
++
++ if (cost_accounting) {
++ if (HASH_STRENGTH_FULL > hash_strength)
++ delta = HASH_STRENGTH_FULL - hash_strength;
++ else
++ delta = 0;
++
++ inc_rshash_pos(delta);
++ }
++
++ return val;
++}
++
++static int memcmp_pages(struct page *page1, struct page *page2,
++ int cost_accounting)
++{
++ char *addr1, *addr2;
++ int ret;
++
++ addr1 = kmap_atomic(page1);
++ addr2 = kmap_atomic(page2);
++ ret = memcmp(addr1, addr2, PAGE_SIZE);
++ kunmap_atomic(addr2);
++ kunmap_atomic(addr1);
++
++ if (cost_accounting)
++ inc_rshash_neg(memcmp_cost);
++
++ return ret;
++}
++
++static inline int pages_identical(struct page *page1, struct page *page2)
++{
++ return !memcmp_pages(page1, page2, 0);
++}
++
++static inline int is_page_full_zero(struct page *page)
++{
++ char *addr;
++ int ret;
++
++ addr = kmap_atomic(page);
++ ret = is_full_zero(addr, PAGE_SIZE);
++ kunmap_atomic(addr);
++
++ return ret;
++}
++
++static int write_protect_page(struct vm_area_struct *vma, struct page *page,
++ pte_t *orig_pte, pte_t *old_pte)
++{
++ struct mm_struct *mm = vma->vm_mm;
++ unsigned long addr;
++ pte_t *ptep;
++ spinlock_t *ptl;
++ int swapped;
++ int err = -EFAULT;
++ unsigned long mmun_start; /* For mmu_notifiers */
++ unsigned long mmun_end; /* For mmu_notifiers */
++
++ addr = page_address_in_vma(page, vma);
++ if (addr == -EFAULT)
++ goto out;
++
++ BUG_ON(PageTransCompound(page));
++
++ mmun_start = addr;
++ mmun_end = addr + PAGE_SIZE;
++ mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
++
++ ptep = page_check_address(page, mm, addr, &ptl, 0);
++ if (!ptep)
++ goto out_mn;
++
++ if (old_pte)
++ *old_pte = *ptep;
++
++ if (pte_write(*ptep) || pte_dirty(*ptep)) {
++ pte_t entry;
++
++ swapped = PageSwapCache(page);
++ flush_cache_page(vma, addr, page_to_pfn(page));
++ /*
++ * Ok this is tricky, when get_user_pages_fast() run it doesnt
++ * take any lock, therefore the check that we are going to make
++ * with the pagecount against the mapcount is racey and
++ * O_DIRECT can happen right after the check.
++ * So we clear the pte and flush the tlb before the check
++ * this assure us that no O_DIRECT can happen after the check
++ * or in the middle of the check.
++ */
++ entry = ptep_clear_flush(vma, addr, ptep);
++ /*
++ * Check that no O_DIRECT or similar I/O is in progress on the
++ * page
++ */
++ if (page_mapcount(page) + 1 + swapped != page_count(page)) {
++ set_pte_at(mm, addr, ptep, entry);
++ goto out_unlock;
++ }
++ if (pte_dirty(entry))
++ set_page_dirty(page);
++ entry = pte_mkclean(pte_wrprotect(entry));
++ set_pte_at_notify(mm, addr, ptep, entry);
++ }
++ *orig_pte = *ptep;
++ err = 0;
++
++out_unlock:
++ pte_unmap_unlock(ptep, ptl);
++out_mn:
++ mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
++out:
++ return err;
++}
++
++#define MERGE_ERR_PGERR 1 /* the page is invalid cannot continue */
++#define MERGE_ERR_COLLI 2 /* there is a collision */
++#define MERGE_ERR_COLLI_MAX 3 /* collision at the max hash strength */
++#define MERGE_ERR_CHANGED 4 /* the page has changed since last hash */
++
++
++/**
++ * replace_page - replace page in vma by new ksm page
++ * @vma: vma that holds the pte pointing to page
++ * @page: the page we are replacing by kpage
++ * @kpage: the ksm page we replace page by
++ * @orig_pte: the original value of the pte
++ *
++ * Returns 0 on success, MERGE_ERR_PGERR on failure.
++ */
++static int replace_page(struct vm_area_struct *vma, struct page *page,
++ struct page *kpage, pte_t orig_pte)
++{
++ struct mm_struct *mm = vma->vm_mm;
++ pgd_t *pgd;
++ pud_t *pud;
++ pmd_t *pmd;
++ pte_t *ptep;
++ spinlock_t *ptl;
++ pte_t entry;
++
++ unsigned long addr;
++ int err = MERGE_ERR_PGERR;
++ unsigned long mmun_start; /* For mmu_notifiers */
++ unsigned long mmun_end; /* For mmu_notifiers */
++
++ addr = page_address_in_vma(page, vma);
++ if (addr == -EFAULT)
++ goto out;
++
++ pgd = pgd_offset(mm, addr);
++ if (!pgd_present(*pgd))
++ goto out;
++
++ pud = pud_offset(pgd, addr);
++ if (!pud_present(*pud))
++ goto out;
++
++ pmd = pmd_offset(pud, addr);
++ BUG_ON(pmd_trans_huge(*pmd));
++ if (!pmd_present(*pmd))
++ goto out;
++
++ mmun_start = addr;
++ mmun_end = addr + PAGE_SIZE;
++ mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
++
++ ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
++ if (!pte_same(*ptep, orig_pte)) {
++ pte_unmap_unlock(ptep, ptl);
++ goto out_mn;
++ }
++
++ flush_cache_page(vma, addr, pte_pfn(*ptep));
++ ptep_clear_flush(vma, addr, ptep);
++ entry = mk_pte(kpage, vma->vm_page_prot);
++
++ /* special treatment is needed for zero_page */
++ if ((page_to_pfn(kpage) == uksm_zero_pfn) ||
++ (page_to_pfn(kpage) == zero_pfn))
++ entry = pte_mkspecial(entry);
++ else {
++ get_page(kpage);
++ page_add_anon_rmap(kpage, vma, addr);
++ }
++
++ set_pte_at_notify(mm, addr, ptep, entry);
++
++ page_remove_rmap(page);
++ if (!page_mapped(page))
++ try_to_free_swap(page);
++ put_page(page);
++
++ pte_unmap_unlock(ptep, ptl);
++ err = 0;
++out_mn:
++ mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
++out:
++ return err;
++}
++
++
++/**
++ * Fully hash a page with HASH_STRENGTH_MAX return a non-zero hash value. The
++ * zero hash value at HASH_STRENGTH_MAX is used to indicated that its
++ * hash_max member has not been calculated.
++ *
++ * @page The page needs to be hashed
++ * @hash_old The hash value calculated with current hash strength
++ *
++ * return the new hash value calculated at HASH_STRENGTH_MAX
++ */
++static inline u32 page_hash_max(struct page *page, u32 hash_old)
++{
++ u32 hash_max = 0;
++ void *addr;
++
++ addr = kmap_atomic(page);
++ hash_max = delta_hash(addr, hash_strength,
++ HASH_STRENGTH_MAX, hash_old);
++
++ kunmap_atomic(addr);
++
++ if (!hash_max)
++ hash_max = 1;
++
++ inc_rshash_neg(HASH_STRENGTH_MAX - hash_strength);
++ return hash_max;
++}
++
++/*
++ * We compare the hash again, to ensure that it is really a hash collision
++ * instead of being caused by page write.
++ */
++static inline int check_collision(struct rmap_item *rmap_item,
++ u32 hash)
++{
++ int err;
++ struct page *page = rmap_item->page;
++
++ /* if this rmap_item has already been hash_maxed, then the collision
++ * must appears in the second-level rbtree search. In this case we check
++ * if its hash_max value has been changed. Otherwise, the collision
++ * happens in the first-level rbtree search, so we check against it's
++ * current hash value.
++ */
++ if (rmap_item->hash_max) {
++ inc_rshash_neg(memcmp_cost);
++ inc_rshash_neg(HASH_STRENGTH_MAX - hash_strength);
++
++ if (rmap_item->hash_max == page_hash_max(page, hash))
++ err = MERGE_ERR_COLLI;
++ else
++ err = MERGE_ERR_CHANGED;
++ } else {
++ inc_rshash_neg(memcmp_cost + hash_strength);
++
++ if (page_hash(page, hash_strength, 0) == hash)
++ err = MERGE_ERR_COLLI;
++ else
++ err = MERGE_ERR_CHANGED;
++ }
++
++ return err;
++}
++
++static struct page *page_trans_compound_anon(struct page *page)
++{
++ if (PageTransCompound(page)) {
++ struct page *head = compound_head(page);
++ /*
++ * head may actually be splitted and freed from under
++ * us but it's ok here.
++ */
++ if (PageAnon(head))
++ return head;
++ }
++ return NULL;
++}
++
++static int page_trans_compound_anon_split(struct page *page)
++{
++ int ret = 0;
++ struct page *transhuge_head = page_trans_compound_anon(page);
++ if (transhuge_head) {
++ /* Get the reference on the head to split it. */
++ if (get_page_unless_zero(transhuge_head)) {
++ /*
++ * Recheck we got the reference while the head
++ * was still anonymous.
++ */
++ if (PageAnon(transhuge_head))
++ ret = split_huge_page(transhuge_head);
++ else
++ /*
++ * Retry later if split_huge_page run
++ * from under us.
++ */
++ ret = 1;
++ put_page(transhuge_head);
++ } else
++ /* Retry later if split_huge_page run from under us. */
++ ret = 1;
++ }
++ return ret;
++}
++
++/**
++ * Try to merge a rmap_item.page with a kpage in stable node. kpage must
++ * already be a ksm page.
++ *
++ * @return 0 if the pages were merged, -EFAULT otherwise.
++ */
++static int try_to_merge_with_uksm_page(struct rmap_item *rmap_item,
++ struct page *kpage, u32 hash)
++{
++ struct vm_area_struct *vma = rmap_item->slot->vma;
++ struct mm_struct *mm = vma->vm_mm;
++ pte_t orig_pte = __pte(0);
++ int err = MERGE_ERR_PGERR;
++ struct page *page;
++
++ if (uksm_test_exit(mm))
++ goto out;
++
++ page = rmap_item->page;
++
++ if (page == kpage) { /* ksm page forked */
++ err = 0;
++ goto out;
++ }
++
++ if (PageTransCompound(page) && page_trans_compound_anon_split(page))
++ goto out;
++ BUG_ON(PageTransCompound(page));
++
++ if (!PageAnon(page) || !PageKsm(kpage))
++ goto out;
++
++ /*
++ * We need the page lock to read a stable PageSwapCache in
++ * write_protect_page(). We use trylock_page() instead of
++ * lock_page() because we don't want to wait here - we
++ * prefer to continue scanning and merging different pages,
++ * then come back to this page when it is unlocked.
++ */
++ if (!trylock_page(page))
++ goto out;
++ /*
++ * If this anonymous page is mapped only here, its pte may need
++ * to be write-protected. If it's mapped elsewhere, all of its
++ * ptes are necessarily already write-protected. But in either
++ * case, we need to lock and check page_count is not raised.
++ */
++ if (write_protect_page(vma, page, &orig_pte, NULL) == 0) {
++ if (pages_identical(page, kpage))
++ err = replace_page(vma, page, kpage, orig_pte);
++ else
++ err = check_collision(rmap_item, hash);
++ }
++
++ if ((vma->vm_flags & VM_LOCKED) && kpage && !err) {
++ munlock_vma_page(page);
++ if (!PageMlocked(kpage)) {
++ unlock_page(page);
++ lock_page(kpage);
++ mlock_vma_page(kpage);
++ page = kpage; /* for final unlock */
++ }
++ }
++
++ unlock_page(page);
++out:
++ return err;
++}
++
++
++
++/**
++ * If two pages fail to merge in try_to_merge_two_pages, then we have a chance
++ * to restore a page mapping that has been changed in try_to_merge_two_pages.
++ *
++ * @return 0 on success.
++ */
++static int restore_uksm_page_pte(struct vm_area_struct *vma, unsigned long addr,
++ pte_t orig_pte, pte_t wprt_pte)
++{
++ struct mm_struct *mm = vma->vm_mm;
++ pgd_t *pgd;
++ pud_t *pud;
++ pmd_t *pmd;
++ pte_t *ptep;
++ spinlock_t *ptl;
++
++ int err = -EFAULT;
++
++ pgd = pgd_offset(mm, addr);
++ if (!pgd_present(*pgd))
++ goto out;
++
++ pud = pud_offset(pgd, addr);
++ if (!pud_present(*pud))
++ goto out;
++
++ pmd = pmd_offset(pud, addr);
++ if (!pmd_present(*pmd))
++ goto out;
++
++ ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
++ if (!pte_same(*ptep, wprt_pte)) {
++ /* already copied, let it be */
++ pte_unmap_unlock(ptep, ptl);
++ goto out;
++ }
++
++ /*
++ * Good boy, still here. When we still get the ksm page, it does not
++ * return to the free page pool, there is no way that a pte was changed
++ * to other page and gets back to this page. And remind that ksm page
++ * do not reuse in do_wp_page(). So it's safe to restore the original
++ * pte.
++ */
++ flush_cache_page(vma, addr, pte_pfn(*ptep));
++ ptep_clear_flush(vma, addr, ptep);
++ set_pte_at_notify(mm, addr, ptep, orig_pte);
++
++ pte_unmap_unlock(ptep, ptl);
++ err = 0;
++out:
++ return err;
++}
++
++/**
++ * try_to_merge_two_pages() - take two identical pages and prepare
++ * them to be merged into one page(rmap_item->page)
++ *
++ * @return 0 if we successfully merged two identical pages into
++ * one ksm page. MERGE_ERR_COLLI if it's only a hash collision
++ * search in rbtree. MERGE_ERR_CHANGED if rmap_item has been
++ * changed since it's hashed. MERGE_ERR_PGERR otherwise.
++ *
++ */
++static int try_to_merge_two_pages(struct rmap_item *rmap_item,
++ struct rmap_item *tree_rmap_item,
++ u32 hash)
++{
++ pte_t orig_pte1 = __pte(0), orig_pte2 = __pte(0);
++ pte_t wprt_pte1 = __pte(0), wprt_pte2 = __pte(0);
++ struct vm_area_struct *vma1 = rmap_item->slot->vma;
++ struct vm_area_struct *vma2 = tree_rmap_item->slot->vma;
++ struct page *page = rmap_item->page;
++ struct page *tree_page = tree_rmap_item->page;
++ int err = MERGE_ERR_PGERR;
++ struct address_space *saved_mapping;
++
++
++ if (rmap_item->page == tree_rmap_item->page)
++ goto out;
++
++ if (PageTransCompound(page) && page_trans_compound_anon_split(page))
++ goto out;
++ BUG_ON(PageTransCompound(page));
++
++ if (PageTransCompound(tree_page) && page_trans_compound_anon_split(tree_page))
++ goto out;
++ BUG_ON(PageTransCompound(tree_page));
++
++ if (!PageAnon(page) || !PageAnon(tree_page))
++ goto out;
++
++ if (!trylock_page(page))
++ goto out;
++
++
++ if (write_protect_page(vma1, page, &wprt_pte1, &orig_pte1) != 0) {
++ unlock_page(page);
++ goto out;
++ }
++
++ /*
++ * While we hold page lock, upgrade page from
++ * PageAnon+anon_vma to PageKsm+NULL stable_node:
++ * stable_tree_insert() will update stable_node.
++ */
++ saved_mapping = page->mapping;
++ set_page_stable_node(page, NULL);
++ mark_page_accessed(page);
++ unlock_page(page);
++
++ if (!trylock_page(tree_page))
++ goto restore_out;
++
++ if (write_protect_page(vma2, tree_page, &wprt_pte2, &orig_pte2) != 0) {
++ unlock_page(tree_page);
++ goto restore_out;
++ }
++
++ if (pages_identical(page, tree_page)) {
++ err = replace_page(vma2, tree_page, page, wprt_pte2);
++ if (err) {
++ unlock_page(tree_page);
++ goto restore_out;
++ }
++
++ if ((vma2->vm_flags & VM_LOCKED)) {
++ munlock_vma_page(tree_page);
++ if (!PageMlocked(page)) {
++ unlock_page(tree_page);
++ lock_page(page);
++ mlock_vma_page(page);
++ tree_page = page; /* for final unlock */
++ }
++ }
++
++ unlock_page(tree_page);
++
++ goto out; /* success */
++
++ } else {
++ if (tree_rmap_item->hash_max &&
++ tree_rmap_item->hash_max == rmap_item->hash_max) {
++ err = MERGE_ERR_COLLI_MAX;
++ } else if (page_hash(page, hash_strength, 0) ==
++ page_hash(tree_page, hash_strength, 0)) {
++ inc_rshash_neg(memcmp_cost + hash_strength * 2);
++ err = MERGE_ERR_COLLI;
++ } else {
++ err = MERGE_ERR_CHANGED;
++ }
++
++ unlock_page(tree_page);
++ }
++
++restore_out:
++ lock_page(page);
++ if (!restore_uksm_page_pte(vma1, get_rmap_addr(rmap_item),
++ orig_pte1, wprt_pte1))
++ page->mapping = saved_mapping;
++
++ unlock_page(page);
++out:
++ return err;
++}
++
++static inline int hash_cmp(u32 new_val, u32 node_val)
++{
++ if (new_val > node_val)
++ return 1;
++ else if (new_val < node_val)
++ return -1;
++ else
++ return 0;
++}
++
++static inline u32 rmap_item_hash_max(struct rmap_item *item, u32 hash)
++{
++ u32 hash_max = item->hash_max;
++
++ if (!hash_max) {
++ hash_max = page_hash_max(item->page, hash);
++
++ item->hash_max = hash_max;
++ }
++
++ return hash_max;
++}
++
++
++
++/**
++ * stable_tree_search() - search the stable tree for a page
++ *
++ * @item: the rmap_item we are comparing with
++ * @hash: the hash value of this item->page already calculated
++ *
++ * @return the page we have found, NULL otherwise. The page returned has
++ * been gotten.
++ */
++static struct page *stable_tree_search(struct rmap_item *item, u32 hash)
++{
++ struct rb_node *node = root_stable_treep->rb_node;
++ struct tree_node *tree_node;
++ unsigned long hash_max;
++ struct page *page = item->page;
++ struct stable_node *stable_node;
++
++ stable_node = page_stable_node(page);
++ if (stable_node) {
++ /* ksm page forked, that is
++ * if (PageKsm(page) && !in_stable_tree(rmap_item))
++ * it's actually gotten once outside.
++ */
++ get_page(page);
++ return page;
++ }
++
++ while (node) {
++ int cmp;
++
++ tree_node = rb_entry(node, struct tree_node, node);
++
++ cmp = hash_cmp(hash, tree_node->hash);
++
++ if (cmp < 0)
++ node = node->rb_left;
++ else if (cmp > 0)
++ node = node->rb_right;
++ else
++ break;
++ }
++
++ if (!node)
++ return NULL;
++
++ if (tree_node->count == 1) {
++ stable_node = rb_entry(tree_node->sub_root.rb_node,
++ struct stable_node, node);
++ BUG_ON(!stable_node);
++
++ goto get_page_out;
++ }
++
++ /*
++ * ok, we have to search the second
++ * level subtree, hash the page to a
++ * full strength.
++ */
++ node = tree_node->sub_root.rb_node;
++ BUG_ON(!node);
++ hash_max = rmap_item_hash_max(item, hash);
++
++ while (node) {
++ int cmp;
++
++ stable_node = rb_entry(node, struct stable_node, node);
++
++ cmp = hash_cmp(hash_max, stable_node->hash_max);
++
++ if (cmp < 0)
++ node = node->rb_left;
++ else if (cmp > 0)
++ node = node->rb_right;
++ else
++ goto get_page_out;
++ }
++
++ return NULL;
++
++get_page_out:
++ page = get_uksm_page(stable_node, 1, 1);
++ return page;
++}
++
++static int try_merge_rmap_item(struct rmap_item *item,
++ struct page *kpage,
++ struct page *tree_page)
++{
++ spinlock_t *ptl;
++ pte_t *ptep;
++ unsigned long addr;
++ struct vm_area_struct *vma = item->slot->vma;
++
++ addr = get_rmap_addr(item);
++ ptep = page_check_address(kpage, vma->vm_mm, addr, &ptl, 0);
++ if (!ptep)
++ return 0;
++
++ if (pte_write(*ptep)) {
++ /* has changed, abort! */
++ pte_unmap_unlock(ptep, ptl);
++ return 0;
++ }
++
++ get_page(tree_page);
++ page_add_anon_rmap(tree_page, vma, addr);
++
++ flush_cache_page(vma, addr, pte_pfn(*ptep));
++ ptep_clear_flush(vma, addr, ptep);
++ set_pte_at_notify(vma->vm_mm, addr, ptep,
++ mk_pte(tree_page, vma->vm_page_prot));
++
++ page_remove_rmap(kpage);
++ put_page(kpage);
++
++ pte_unmap_unlock(ptep, ptl);
++
++ return 1;
++}
++
++/**
++ * try_to_merge_with_stable_page() - when two rmap_items need to be inserted
++ * into stable tree, the page was found to be identical to a stable ksm page,
++ * this is the last chance we can merge them into one.
++ *
++ * @item1: the rmap_item holding the page which we wanted to insert
++ * into stable tree.
++ * @item2: the other rmap_item we found when unstable tree search
++ * @oldpage: the page currently mapped by the two rmap_items
++ * @tree_page: the page we found identical in stable tree node
++ * @success1: return if item1 is successfully merged
++ * @success2: return if item2 is successfully merged
++ */
++static void try_merge_with_stable(struct rmap_item *item1,
++ struct rmap_item *item2,
++ struct page **kpage,
++ struct page *tree_page,
++ int *success1, int *success2)
++{
++ struct vm_area_struct *vma1 = item1->slot->vma;
++ struct vm_area_struct *vma2 = item2->slot->vma;
++ *success1 = 0;
++ *success2 = 0;
++
++ if (unlikely(*kpage == tree_page)) {
++ /* I don't think this can really happen */
++ printk(KERN_WARNING "UKSM: unexpected condition detected in "
++ "try_merge_with_stable() -- *kpage == tree_page !\n");
++ *success1 = 1;
++ *success2 = 1;
++ return;
++ }
++
++ if (!PageAnon(*kpage) || !PageKsm(*kpage))
++ goto failed;
++
++ if (!trylock_page(tree_page))
++ goto failed;
++
++ /* If the oldpage is still ksm and still pointed
++ * to in the right place, and still write protected,
++ * we are confident it's not changed, no need to
++ * memcmp anymore.
++ * be ware, we cannot take nested pte locks,
++ * deadlock risk.
++ */
++ if (!try_merge_rmap_item(item1, *kpage, tree_page))
++ goto unlock_failed;
++
++ /* ok, then vma2, remind that pte1 already set */
++ if (!try_merge_rmap_item(item2, *kpage, tree_page))
++ goto success_1;
++
++ *success2 = 1;
++success_1:
++ *success1 = 1;
++
++
++ if ((*success1 && vma1->vm_flags & VM_LOCKED) ||
++ (*success2 && vma2->vm_flags & VM_LOCKED)) {
++ munlock_vma_page(*kpage);
++ if (!PageMlocked(tree_page))
++ mlock_vma_page(tree_page);
++ }
++
++ /*
++ * We do not need oldpage any more in the caller, so can break the lock
++ * now.
++ */
++ unlock_page(*kpage);
++ *kpage = tree_page; /* Get unlocked outside. */
++ return;
++
++unlock_failed:
++ unlock_page(tree_page);
++failed:
++ return;
++}
++
++static inline void stable_node_hash_max(struct stable_node *node,
++ struct page *page, u32 hash)
++{
++ u32 hash_max = node->hash_max;
++
++ if (!hash_max) {
++ hash_max = page_hash_max(page, hash);
++ node->hash_max = hash_max;
++ }
++}
++
++static inline
++struct stable_node *new_stable_node(struct tree_node *tree_node,
++ struct page *kpage, u32 hash_max)
++{
++ struct stable_node *new_stable_node;
++
++ new_stable_node = alloc_stable_node();
++ if (!new_stable_node)
++ return NULL;
++
++ new_stable_node->kpfn = page_to_pfn(kpage);
++ new_stable_node->hash_max = hash_max;
++ new_stable_node->tree_node = tree_node;
++ set_page_stable_node(kpage, new_stable_node);
++
++ return new_stable_node;
++}
++
++static inline
++struct stable_node *first_level_insert(struct tree_node *tree_node,
++ struct rmap_item *rmap_item,
++ struct rmap_item *tree_rmap_item,
++ struct page **kpage, u32 hash,
++ int *success1, int *success2)
++{
++ int cmp;
++ struct page *tree_page;
++ u32 hash_max = 0;
++ struct stable_node *stable_node, *new_snode;
++ struct rb_node *parent = NULL, **new;
++
++ /* this tree node contains no sub-tree yet */
++ stable_node = rb_entry(tree_node->sub_root.rb_node,
++ struct stable_node, node);
++
++ tree_page = get_uksm_page(stable_node, 1, 0);
++ if (tree_page) {
++ cmp = memcmp_pages(*kpage, tree_page, 1);
++ if (!cmp) {
++ try_merge_with_stable(rmap_item, tree_rmap_item, kpage,
++ tree_page, success1, success2);
++ put_page(tree_page);
++ if (!*success1 && !*success2)
++ goto failed;
++
++ return stable_node;
++
++ } else {
++ /*
++ * collision in first level try to create a subtree.
++ * A new node need to be created.
++ */
++ put_page(tree_page);
++
++ stable_node_hash_max(stable_node, tree_page,
++ tree_node->hash);
++ hash_max = rmap_item_hash_max(rmap_item, hash);
++ cmp = hash_cmp(hash_max, stable_node->hash_max);
++
++ parent = &stable_node->node;
++ if (cmp < 0) {
++ new = &parent->rb_left;
++ } else if (cmp > 0) {
++ new = &parent->rb_right;
++ } else {
++ goto failed;
++ }
++ }
++
++ } else {
++ /* the only stable_node deleted, we reuse its tree_node.
++ */
++ parent = NULL;
++ new = &tree_node->sub_root.rb_node;
++ }
++
++ new_snode = new_stable_node(tree_node, *kpage, hash_max);
++ if (!new_snode)
++ goto failed;
++
++ rb_link_node(&new_snode->node, parent, new);
++ rb_insert_color(&new_snode->node, &tree_node->sub_root);
++ tree_node->count++;
++ *success1 = *success2 = 1;
++
++ return new_snode;
++
++failed:
++ return NULL;
++}
++
++static inline
++struct stable_node *stable_subtree_insert(struct tree_node *tree_node,
++ struct rmap_item *rmap_item,
++ struct rmap_item *tree_rmap_item,
++ struct page **kpage, u32 hash,
++ int *success1, int *success2)
++{
++ struct page *tree_page;
++ u32 hash_max;
++ struct stable_node *stable_node, *new_snode;
++ struct rb_node *parent, **new;
++
++research:
++ parent = NULL;
++ new = &tree_node->sub_root.rb_node;
++ BUG_ON(!*new);
++ hash_max = rmap_item_hash_max(rmap_item, hash);
++ while (*new) {
++ int cmp;
++
++ stable_node = rb_entry(*new, struct stable_node, node);
++
++ cmp = hash_cmp(hash_max, stable_node->hash_max);
++
++ if (cmp < 0) {
++ parent = *new;
++ new = &parent->rb_left;
++ } else if (cmp > 0) {
++ parent = *new;
++ new = &parent->rb_right;
++ } else {
++ tree_page = get_uksm_page(stable_node, 1, 0);
++ if (tree_page) {
++ cmp = memcmp_pages(*kpage, tree_page, 1);
++ if (!cmp) {
++ try_merge_with_stable(rmap_item,
++ tree_rmap_item, kpage,
++ tree_page, success1, success2);
++
++ put_page(tree_page);
++ if (!*success1 && !*success2)
++ goto failed;
++ /*
++ * successfully merged with a stable
++ * node
++ */
++ return stable_node;
++ } else {
++ put_page(tree_page);
++ goto failed;
++ }
++ } else {
++ /*
++ * stable node may be deleted,
++ * and subtree maybe
++ * restructed, cannot
++ * continue, research it.
++ */
++ if (tree_node->count) {
++ goto research;
++ } else {
++ /* reuse the tree node*/
++ parent = NULL;
++ new = &tree_node->sub_root.rb_node;
++ }
++ }
++ }
++ }
++
++ new_snode = new_stable_node(tree_node, *kpage, hash_max);
++ if (!new_snode)
++ goto failed;
++
++ rb_link_node(&new_snode->node, parent, new);
++ rb_insert_color(&new_snode->node, &tree_node->sub_root);
++ tree_node->count++;
++ *success1 = *success2 = 1;
++
++ return new_snode;
++
++failed:
++ return NULL;
++}
++
++
++/**
++ * stable_tree_insert() - try to insert a merged page in unstable tree to
++ * the stable tree
++ *
++ * @kpage: the page need to be inserted
++ * @hash: the current hash of this page
++ * @rmap_item: the rmap_item being scanned
++ * @tree_rmap_item: the rmap_item found on unstable tree
++ * @success1: return if rmap_item is merged
++ * @success2: return if tree_rmap_item is merged
++ *
++ * @return the stable_node on stable tree if at least one
++ * rmap_item is inserted into stable tree, NULL
++ * otherwise.
++ */
++static struct stable_node *
++stable_tree_insert(struct page **kpage, u32 hash,
++ struct rmap_item *rmap_item,
++ struct rmap_item *tree_rmap_item,
++ int *success1, int *success2)
++{
++ struct rb_node **new = &root_stable_treep->rb_node;
++ struct rb_node *parent = NULL;
++ struct stable_node *stable_node;
++ struct tree_node *tree_node;
++ u32 hash_max = 0;
++
++ *success1 = *success2 = 0;
++
++ while (*new) {
++ int cmp;
++
++ tree_node = rb_entry(*new, struct tree_node, node);
++
++ cmp = hash_cmp(hash, tree_node->hash);
++
++ if (cmp < 0) {
++ parent = *new;
++ new = &parent->rb_left;
++ } else if (cmp > 0) {
++ parent = *new;
++ new = &parent->rb_right;
++ } else
++ break;
++ }
++
++ if (*new) {
++ if (tree_node->count == 1) {
++ stable_node = first_level_insert(tree_node, rmap_item,
++ tree_rmap_item, kpage,
++ hash, success1, success2);
++ } else {
++ stable_node = stable_subtree_insert(tree_node,
++ rmap_item, tree_rmap_item, kpage,
++ hash, success1, success2);
++ }
++ } else {
++
++ /* no tree node found */
++ tree_node = alloc_tree_node(stable_tree_node_listp);
++ if (!tree_node) {
++ stable_node = NULL;
++ goto out;
++ }
++
++ stable_node = new_stable_node(tree_node, *kpage, hash_max);
++ if (!stable_node) {
++ free_tree_node(tree_node);
++ goto out;
++ }
++
++ tree_node->hash = hash;
++ rb_link_node(&tree_node->node, parent, new);
++ rb_insert_color(&tree_node->node, root_stable_treep);
++ parent = NULL;
++ new = &tree_node->sub_root.rb_node;
++
++ rb_link_node(&stable_node->node, parent, new);
++ rb_insert_color(&stable_node->node, &tree_node->sub_root);
++ tree_node->count++;
++ *success1 = *success2 = 1;
++ }
++
++out:
++ return stable_node;
++}
++
++
++/**
++ * get_tree_rmap_item_page() - try to get the page and lock the mmap_sem
++ *
++ * @return 0 on success, -EBUSY if unable to lock the mmap_sem,
++ * -EINVAL if the page mapping has been changed.
++ */
++static inline int get_tree_rmap_item_page(struct rmap_item *tree_rmap_item)
++{
++ int err;
++
++ err = get_mergeable_page_lock_mmap(tree_rmap_item);
++
++ if (err == -EINVAL) {
++ /* its page map has been changed, remove it */
++ remove_rmap_item_from_tree(tree_rmap_item);
++ }
++
++ /* The page is gotten and mmap_sem is locked now. */
++ return err;
++}
++
++
++/**
++ * unstable_tree_search_insert() - search an unstable tree rmap_item with the
++ * same hash value. Get its page and trylock the mmap_sem
++ */
++static inline
++struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item,
++ u32 hash)
++
++{
++ struct rb_node **new = &root_unstable_tree.rb_node;
++ struct rb_node *parent = NULL;
++ struct tree_node *tree_node;
++ u32 hash_max;
++ struct rmap_item *tree_rmap_item;
++
++ while (*new) {
++ int cmp;
++
++ tree_node = rb_entry(*new, struct tree_node, node);
++
++ cmp = hash_cmp(hash, tree_node->hash);
++
++ if (cmp < 0) {
++ parent = *new;
++ new = &parent->rb_left;
++ } else if (cmp > 0) {
++ parent = *new;
++ new = &parent->rb_right;
++ } else
++ break;
++ }
++
++ if (*new) {
++ /* got the tree_node */
++ if (tree_node->count == 1) {
++ tree_rmap_item = rb_entry(tree_node->sub_root.rb_node,
++ struct rmap_item, node);
++ BUG_ON(!tree_rmap_item);
++
++ goto get_page_out;
++ }
++
++ /* well, search the collision subtree */
++ new = &tree_node->sub_root.rb_node;
++ BUG_ON(!*new);
++ hash_max = rmap_item_hash_max(rmap_item, hash);
++
++ while (*new) {
++ int cmp;
++
++ tree_rmap_item = rb_entry(*new, struct rmap_item,
++ node);
++
++ cmp = hash_cmp(hash_max, tree_rmap_item->hash_max);
++ parent = *new;
++ if (cmp < 0)
++ new = &parent->rb_left;
++ else if (cmp > 0)
++ new = &parent->rb_right;
++ else
++ goto get_page_out;
++ }
++ } else {
++ /* alloc a new tree_node */
++ tree_node = alloc_tree_node(&unstable_tree_node_list);
++ if (!tree_node)
++ return NULL;
++
++ tree_node->hash = hash;
++ rb_link_node(&tree_node->node, parent, new);
++ rb_insert_color(&tree_node->node, &root_unstable_tree);
++ parent = NULL;
++ new = &tree_node->sub_root.rb_node;
++ }
++
++ /* did not found even in sub-tree */
++ rmap_item->tree_node = tree_node;
++ rmap_item->address |= UNSTABLE_FLAG;
++ rmap_item->hash_round = uksm_hash_round;
++ rb_link_node(&rmap_item->node, parent, new);
++ rb_insert_color(&rmap_item->node, &tree_node->sub_root);
++
++ uksm_pages_unshared++;
++ return NULL;
++
++get_page_out:
++ if (tree_rmap_item->page == rmap_item->page)
++ return NULL;
++
++ if (get_tree_rmap_item_page(tree_rmap_item))
++ return NULL;
++
++ return tree_rmap_item;
++}
++
++static void hold_anon_vma(struct rmap_item *rmap_item,
++ struct anon_vma *anon_vma)
++{
++ rmap_item->anon_vma = anon_vma;
++ get_anon_vma(anon_vma);
++}
++
++
++/**
++ * stable_tree_append() - append a rmap_item to a stable node. Deduplication
++ * ratio statistics is done in this function.
++ *
++ */
++static void stable_tree_append(struct rmap_item *rmap_item,
++ struct stable_node *stable_node, int logdedup)
++{
++ struct node_vma *node_vma = NULL, *new_node_vma, *node_vma_cont = NULL;
++ unsigned long key = (unsigned long)rmap_item->slot;
++ unsigned long factor = rmap_item->slot->rung->step;
++
++ BUG_ON(!stable_node);
++ rmap_item->address |= STABLE_FLAG;
++
++ if (hlist_empty(&stable_node->hlist)) {
++ uksm_pages_shared++;
++ goto node_vma_new;
++ } else {
++ uksm_pages_sharing++;
++ }
++
++ hlist_for_each_entry(node_vma, &stable_node->hlist, hlist) {
++ if (node_vma->key >= key)
++ break;
++
++ if (logdedup) {
++ node_vma->slot->pages_bemerged += factor;
++ if (list_empty(&node_vma->slot->dedup_list))
++ list_add(&node_vma->slot->dedup_list,
++ &vma_slot_dedup);
++ }
++ }
++
++ if (node_vma) {
++ if (node_vma->key == key) {
++ node_vma_cont = hlist_entry_safe(node_vma->hlist.next, struct node_vma, hlist);
++ goto node_vma_ok;
++ } else if (node_vma->key > key) {
++ node_vma_cont = node_vma;
++ }
++ }
++
++node_vma_new:
++ /* no same vma already in node, alloc a new node_vma */
++ new_node_vma = alloc_node_vma();
++ BUG_ON(!new_node_vma);
++ new_node_vma->head = stable_node;
++ new_node_vma->slot = rmap_item->slot;
++
++ if (!node_vma) {
++ hlist_add_head(&new_node_vma->hlist, &stable_node->hlist);
++ } else if (node_vma->key != key) {
++ if (node_vma->key < key)
++ hlist_add_behind(&new_node_vma->hlist, &node_vma->hlist);
++ else {
++ hlist_add_before(&new_node_vma->hlist,
++ &node_vma->hlist);
++ }
++
++ }
++ node_vma = new_node_vma;
++
++node_vma_ok: /* ok, ready to add to the list */
<Skipped 2964 lines>
================================================================
---- gitweb:
http://git.pld-linux.org/gitweb.cgi/packages/kernel.git/commitdiff/04bb9bf19db0e810b33b9e19d511652bf4fe0b9d
More information about the pld-cvs-commit
mailing list