cregit-Linux how code gets into the kernel

Release 4.8 mm/memory.c

Directory: mm
 *  linux/mm/memory.c
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds

 * demand-loading started 01.12.91 - seems it is high on the list of
 * things wanted, and it should be easy to implement. - Linus

 * Ok, demand-loading was easy, shared pages a little bit tricker. Shared
 * pages started 02.12.91, seems to work. - Linus.
 * Tested sharing by executing about 30 /bin/sh: under the old kernel it
 * would have taken more than the 6M I have free, but it worked well as
 * far as I could see.
 * Also corrected some "invalidate()"s - I wasn't doing enough of them.

 * Real VM (paging to/from disk) started 18.12.91. Much more work and
 * thought has to go into this. Oh, well..
 * 19.12.91  -  works, somewhat. Sometimes I get faults, don't know why.
 *              Found it. Everything seems to work now.
 * 20.12.91  -  Ok, making the swap-device changeable like the root.

 * 05.04.94  -  Multi-page memory management added for v1.1.
 *              Idea by Alex Bligh (
 * 16.07.99  -  Support of BIGMEM added by Gerhard Wichert, Siemens AG
 *              (
 * Aug/Sep 2004 Changed to four level page tables (Andi Kleen)

#include <linux/kernel_stat.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/mman.h>
#include <linux/swap.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/export.h>
#include <linux/delayacct.h>
#include <linux/init.h>
#include <linux/pfn_t.h>
#include <linux/writeback.h>
#include <linux/memcontrol.h>
#include <linux/mmu_notifier.h>
#include <linux/kallsyms.h>
#include <linux/swapops.h>
#include <linux/elf.h>
#include <linux/gfp.h>
#include <linux/migrate.h>
#include <linux/string.h>
#include <linux/dma-debug.h>
#include <linux/debugfs.h>
#include <linux/userfaultfd_k.h>
#include <linux/dax.h>

#include <asm/io.h>
#include <asm/mmu_context.h>
#include <asm/pgalloc.h>
#include <asm/uaccess.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
#include <asm/pgtable.h>

#include "internal.h"

#warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid.

/* use the per-pgdat data instead for discontigmem - mbligh */

unsigned long max_mapnr;

struct page *mem_map;



 * A number of key systems in x86 including ioremap() rely on the assumption
 * that high_memory defines the upper bound on direct map memory, then end
 * of ZONE_NORMAL.  Under CONFIG_DISCONTIG this means that max_low_pfn and
 * highstart_pfn must be the same; there must be no gap between ZONE_NORMAL

void * high_memory;


 * Randomize the address space (stacks, mmaps, brk, etc.).
 * ( When CONFIG_COMPAT_BRK=y we exclude brk from randomization,
 *   as ancient (libc5 based) binaries can segfault. )

int randomize_va_space __read_mostly =

static int __init disable_randmaps(char *s) { randomize_va_space = 0; return 1; }


andi kleenandi kleen1794.44%150.00%
hirofumi ogawahirofumi ogawa15.56%150.00%

__setup("norandmaps", disable_randmaps); unsigned long zero_pfn __read_mostly; unsigned long highest_memmap_pfn __read_mostly; EXPORT_SYMBOL(zero_pfn); /* * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init() */
static int __init init_zero_pfn(void) { zero_pfn = page_to_pfn(ZERO_PAGE(0)); return 0; }


hugh dickinshugh dickins22100.00%1100.00%

core_initcall(init_zero_pfn); #if defined(SPLIT_RSS_COUNTING)
void sync_mm_rss(struct mm_struct *mm) { int i; for (i = 0; i < NR_MM_COUNTERS; i++) { if (current->rss_stat.count[i]) { add_mm_counter(mm, i, current->rss_stat.count[i]); current->rss_stat.count[i] = 0; } } current-> = 0; }


kamezawa hiroyukikamezawa hiroyuki7193.42%133.33%
david rientjesdavid rientjes56.58%266.67%

static void add_mm_counter_fast(struct mm_struct *mm, int member, int val) { struct task_struct *task = current; if (likely(task->mm == mm)) task->rss_stat.count[member] += val; else add_mm_counter(mm, member, val); }


kamezawa hiroyukikamezawa hiroyuki56100.00%1100.00%

#define inc_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, 1) #define dec_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, -1) /* sync counter once per 64 page faults */ #define TASK_RSS_EVENTS_THRESH (64)
static void check_sync_rss_stat(struct task_struct *task) { if (unlikely(task != current)) return; if (unlikely(task-> > TASK_RSS_EVENTS_THRESH)) sync_mm_rss(task->mm); }


kamezawa hiroyukikamezawa hiroyuki42100.00%1100.00%

#else /* SPLIT_RSS_COUNTING */ #define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member) #define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member)
static void check_sync_rss_stat(struct task_struct *task) { }


kamezawa hiroyukikamezawa hiroyuki10100.00%1100.00%

static bool tlb_next_batch(struct mmu_gather *tlb) { struct mmu_gather_batch *batch; batch = tlb->active; if (batch->next) { tlb->active = batch->next; return true; } if (tlb->batch_count == MAX_GATHER_BATCH_COUNT) return false; batch = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0); if (!batch) return false; tlb->batch_count++; batch->next = NULL; batch->nr = 0; batch->max = MAX_GATHER_BATCH; tlb->active->next = batch; tlb->active = batch; return true; }


peter zijlstrapeter zijlstra9582.61%133.33%
michal hockomichal hocko1513.04%133.33%
nicholas krausenicholas krause54.35%133.33%

/* tlb_gather_mmu * Called to initialize an (on-stack) mmu_gather structure for page-table * tear-down from @mm. The @fullmm argument is used when @mm is without * users and we're going to destroy the full address space (exit/execve). */
void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long start, unsigned long end) { tlb->mm = mm; /* Is it from 0 to ~0? */ tlb->fullmm = !(start | (end+1)); tlb->need_flush_all = 0; tlb-> = NULL; tlb-> = 0; tlb->local.max = ARRAY_SIZE(tlb->__pages); tlb->active = &tlb->local; tlb->batch_count = 0; #ifdef CONFIG_HAVE_RCU_TABLE_FREE tlb->batch = NULL; #endif tlb->page_size = 0; __tlb_reset_range(tlb); }


peter zijlstrapeter zijlstra7664.96%116.67%
linus torvaldslinus torvalds1815.38%116.67%
michal hockomichal hocko65.13%116.67%
aneesh kumaraneesh kumar65.13%116.67%
dave hansendave hansen65.13%116.67%
will deaconwill deacon54.27%116.67%

static void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb) { if (!tlb->end) return; tlb_flush(tlb); mmu_notifier_invalidate_range(tlb->mm, tlb->start, tlb->end); #ifdef CONFIG_HAVE_RCU_TABLE_FREE tlb_table_flush(tlb); #endif __tlb_reset_range(tlb); }


peter zijlstrapeter zijlstra2342.59%120.00%
joerg roedeljoerg roedel1527.78%120.00%
will deaconwill deacon1324.07%240.00%
linus torvaldslinus torvalds35.56%120.00%

static void tlb_flush_mmu_free(struct mmu_gather *tlb) { struct mmu_gather_batch *batch; for (batch = &tlb->local; batch && batch->nr; batch = batch->next) { free_pages_and_swap_cache(batch->pages, batch->nr); batch->nr = 0; } tlb->active = &tlb->local; }


peter zijlstrapeter zijlstra4670.77%133.33%
linus torvaldslinus torvalds1523.08%133.33%
will deaconwill deacon46.15%133.33%

void tlb_flush_mmu(struct mmu_gather *tlb) { tlb_flush_mmu_tlbonly(tlb); tlb_flush_mmu_free(tlb); }


linus torvaldslinus torvalds20100.00%1100.00%

/* tlb_finish_mmu * Called at the end of the shootdown operation to free up any resources * that were required. */
void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end) { struct mmu_gather_batch *batch, *next; tlb_flush_mmu(tlb); /* keep the page table cache within bounds */ check_pgt_cache(); for (batch = tlb->; batch; batch = next) { next = batch->next; free_pages((unsigned long)batch, 0); } tlb-> = NULL; }


peter zijlstrapeter zijlstra78100.00%1100.00%

/* __tlb_remove_page * Must perform the equivalent to __free_pte(pte_get_and_clear(ptep)), while * handling the additional races in SMP caused by other CPUs caching valid * mappings in their TLBs. Returns the number of free page slots left. * When out of page slots we must call tlb_flush_mmu(). *returns true if the caller should flush. */
bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size) { struct mmu_gather_batch *batch; VM_BUG_ON(!tlb->end); if (!tlb->page_size) tlb->page_size = page_size; else { if (page_size != tlb->page_size) return true; } batch = tlb->active; if (batch->nr == batch->max) { if (!tlb_next_batch(tlb)) return true; batch = tlb->active; } VM_BUG_ON_PAGE(batch->nr > batch->max, page); batch->pages[batch->nr++] = page; return false; }


peter zijlstrapeter zijlstra6553.72%114.29%
aneesh kumaraneesh kumar4234.71%228.57%
li shaohuali shaohua108.26%228.57%
sasha levinsasha levin32.48%114.29%
will deaconwill deacon10.83%114.29%

#endif /* HAVE_GENERIC_MMU_GATHER */ #ifdef CONFIG_HAVE_RCU_TABLE_FREE /* * See the comment near struct mmu_table_batch. */
static void tlb_remove_table_smp_sync(void *arg) { /* Simply deliver the interrupt */ }


peter zijlstrapeter zijlstra11100.00%1100.00%

static void tlb_remove_table_one(void *table) { /* * This isn't an RCU grace period and hence the page-tables cannot be * assumed to be actually RCU-freed. * * It is however sufficient for software page-table walkers that rely on * IRQ disabling. See the comment near struct mmu_table_batch. */ smp_call_function(tlb_remove_table_smp_sync, NULL, 1); __tlb_remove_table(table); }


peter zijlstrapeter zijlstra25100.00%1100.00%

static void tlb_remove_table_rcu(struct rcu_head *head) { struct mmu_table_batch *batch; int i; batch = container_of(head, struct mmu_table_batch, rcu); for (i = 0; i < batch->nr; i++) __tlb_remove_table(batch->tables[i]); free_page((unsigned long)batch); }


peter zijlstrapeter zijlstra65100.00%1100.00%

void tlb_table_flush(struct mmu_gather *tlb) { struct mmu_table_batch **batch = &tlb->batch; if (*batch) { call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu); *batch = NULL; } }


peter zijlstrapeter zijlstra46100.00%1100.00%

void tlb_remove_table(struct mmu_gather *tlb, void *table) { struct mmu_table_batch **batch = &tlb->batch; /* * When there's less then two users of this mm there cannot be a * concurrent page-table walk. */ if (atomic_read(&tlb->mm->mm_users) < 2) { __tlb_remove_table(table); return; } if (*batch == NULL) { *batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN); if (*batch == NULL) { tlb_remove_table_one(table); return; } (*batch)->nr = 0; } (*batch)->tables[(*batch)->nr++] = table; if ((*batch)->nr == MAX_TABLE_BATCH) tlb_table_flush(tlb); }


peter zijlstrapeter zijlstra130100.00%1100.00%

#endif /* CONFIG_HAVE_RCU_TABLE_FREE */ /* * Note: this doesn't free the actual pages themselves. That * has been handled earlier when unmapping all the memory regions. */
static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd, unsigned long addr) { pgtable_t token = pmd_pgtable(*pmd); pmd_clear(pmd); pte_free_tlb(tlb, token, addr); atomic_long_dec(&tlb->mm->nr_ptes); }


hugh dickinshugh dickins713.46%214.29%
benjamin herrenschmidtbenjamin herrenschmidt611.54%17.14%
william lee irwin iiiwilliam lee irwin iii611.54%17.14%
andrew mortonandrew morton47.69%214.29%
kirill a. shutemovkirill a. shutemov47.69%17.14%
martin schwidefskymartin schwidefsky47.69%17.14%
nick pigginnick piggin47.69%17.14%
linus torvaldslinus torvalds35.77%17.14%
ingo molnaringo molnar11.92%17.14%

static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling) { pmd_t *pmd; unsigned long next; unsigned long start; start = addr; pmd = pmd_offset(pud, addr); do { next = pmd_addr_end(addr, end); if (pmd_none_or_clear_bad(pmd)) continue; free_pte_range(tlb, pmd, addr); } while (pmd++, addr = next, addr != end); start &= PUD_MASK; if (start < floor) return; if (ceiling) { ceiling &= PUD_MASK; if (!ceiling) return; } if (end - 1 > ceiling - 1) return; pmd = pmd_offset(pud, start); pud_clear(pud); pmd_free_tlb(tlb, pmd, start); mm_dec_nr_pmds(tlb->mm); }


hugh dickinshugh dickins8852.38%350.00%
andi kleenandi kleen6941.07%116.67%
kirill a. shutemovkirill a. shutemov74.17%116.67%
benjamin herrenschmidtbenjamin herrenschmidt42.38%116.67%

static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling) { pud_t *pud; unsigned long next; unsigned long start; start = addr; pud = pud_offset(pgd, addr); do { next = pud_addr_end(addr, end); if (pud_none_or_clear_bad(pud)) continue; free_pmd_range(tlb, pud, addr, next, floor, ceiling); } while (pud++, addr = next, addr != end); start &= PGDIR_MASK; if (start < floor) return; if (ceiling) { ceiling &= PGDIR_MASK; if (!ceiling) return; } if (end - 1 > ceiling - 1) return; pud = pud_offset(pgd, start); pgd_clear(pgd); pud_free_tlb(tlb, pud, start); }


hugh dickinshugh dickins9255.09%323.08%
nick pigginnick piggin3621.56%17.69%
linus torvaldslinus torvalds74.19%17.69%
andi kleenandi kleen52.99%17.69%
benjamin herrenschmidtbenjamin herrenschmidt21.20%17.69%
andrew mortonandrew morton21.20%17.69%

/* * This function frees user-level page tables of a process. */
void free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling) { pgd_t *pgd; unsigned long next; /* * The next few lines have given us lots of grief... * * Why are we testing PMD* at this top level? Because often * there will be no work to do at all, and we'd prefer not to * go all the way down to the bottom just to discover that. * * Why all these "- 1"s? Because 0 represents both the bottom * of the address space and the top of it (using -1 for the * top wouldn't help much: the masks would do the wrong thing). * The rule is that addr 0 and floor 0 refer to the bottom of * the address space, but end 0 and ceiling 0 refer to the top * Comparisons need to use "end - 1" and "ceiling - 1" (though * that end 0 case should be mythical). * * Wherever addr is brought up or ceiling brought down, we must * be careful to reject "the opposite 0" before it confuses the * subsequent tests. But what about where end is brought down * by PMD_SIZE below? no, end can't go down to 0 there. * * Whereas we round start (addr) and ceiling down, by different * masks at different levels, in order to test whether a table * now has no other vmas using it, so can be freed, we don't * bother to round floor or end up - the tests don't need that. */ addr &= PMD_MASK; if (addr < floor) { addr += PMD_SIZE; if (!addr) return; } if (ceiling) { ceiling &= PMD_MASK; if (!ceiling) return; } if (end - 1 > ceiling - 1) end -= PMD_SIZE; if (addr > end - 1) return; pgd = pgd_offset(tlb->mm, addr); do { next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) continue; free_pud_range(tlb, pgd, addr, next, floor, ceiling); } while (pgd++, addr = next, addr != end); }


hugh dickinshugh dickins10064.10%325.00%
nick pigginnick piggin2516.03%18.33%
linus torvaldslinus torvalds138.33%216.67%
andrew mortonandrew morton21.28%18.33%
martin schwidefskymartin schwidefsky10.64%18.33%

void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long floor, unsigned long ceiling) { while (vma) { struct vm_area_struct *next = vma->vm_next; unsigned long addr = vma->vm_start; /* * Hide vma from rmap and truncate_pagecache before freeing * pgtables */ unlink_anon_vmas(vma); unlink_file_vma(vma); if (is_vm_hugetlb_page(vma)) { hugetlb_free_pgd_range(tlb, addr, vma->vm_end, floor, next? next->vm_start: ceiling); } else { /* * Optimization: gather nearby vmas into one call down */ while (next && next->vm_start <= vma->vm_end + PMD_SIZE && !is_vm_hugetlb_page(next)) { vma = next; next = vma->vm_next; unlink_anon_vmas(vma); unlink_file_vma(vma); } free_pgd_range(tlb, addr, vma->vm_end, floor, next? next->vm_start: ceiling); } vma = next; } }


hugh dickinshugh dickins15195.57%337.50%
david gibsondavid gibson31.90%225.00%
rik van rielrik van riel21.27%112.50%
nick pigginnick piggin10.63%112.50%
ingo molnaringo molnar10.63%112.50%

int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address) { spinlock_t *ptl; pgtable_t new = pte_alloc_one(mm, address); if (!new) return -ENOMEM; /* * Ensure all pte setup (eg. pte page lock and page clearing) are * visible before the pte is made visible to other CPUs by being * put into page tables. * * The other side of the story is the pointer chasing in the page * table walking code (when walking the page table without locking; * ie. most of the time). Fortunately, these data accesses consist * of a chain of data-dependent loads, meaning most CPUs (alpha * being the notable exception) will already guarantee loads are * seen in-order. See the alpha page table accessors for the * smp_read_barrier_depends() barriers in page table walking code. */ smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */ ptl = pmd_lock(mm, pmd); if (likely(pmd_none(*pmd))) { /* Has another populated it ? */ atomic_long_inc(&mm->nr_ptes); pmd_populate(mm, pmd, new); new = NULL; } spin_unlock(ptl); if (new) pte_free(mm, new); return 0; }


ingo molnaringo molnar4944.95%110.00%
martin schwidefskymartin schwidefsky1715.60%110.00%
hugh dickinshugh dickins1614.68%330.00%
kirill a. shutemovkirill a. shutemov1412.84%220.00%
nick pigginnick piggin54.59%110.00%
andrea arcangeliandrea arcangeli43.67%110.00%
william lee irwin iiiwilliam lee irwin iii43.67%110.00%

int __pte_alloc_kernel(pmd_t *pmd, unsigned long address) { pte_t *new = pte_alloc_one_kernel(&init_mm, address); if (!new) return -ENOMEM; smp_wmb(); /* See comment in __pte_alloc */ spin_lock(&init_mm.page_table_lock); if (likely(pmd_none(*pmd))) { /* Has another populated it ? */ pmd_populate_kernel(&init_mm, pmd, new); new = NULL; } spin_unlock(&init_mm.page_table_lock); if (new) pte_free_kernel(&init_mm, new); return 0; }


ingo molnaringo molnar4243.30%114.29%
hugh dickinshugh dickins2727.84%228.57%
martin schwidefskymartin schwidefsky1919.59%114.29%
nick pigginnick piggin44.12%114.29%
andrea arcangeliandrea arcangeli44.12%114.29%

static inline void init_rss_vec(int *rss) { memset(rss, 0, sizeof(int) * NR_MM_COUNTERS); }


kamezawa hiroyukikamezawa hiroyuki1352.00%150.00%
hugh dickinshugh dickins1248.00%150.00%

static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss) { int i; if (current->mm == mm) sync_mm_rss(mm); for (i = 0; i < NR_MM_COUNTERS; i++) if (rss[i]) add_mm_counter(mm, i, rss[i]); }


kamezawa hiroyukikamezawa hiroyuki5382.81%266.67%
hugh dickinshugh dickins1117.19%133.33%

/* * This function is called to print an error when a bad pte * is found. For example, we might have a PFN-mapped pte in * a region that doesn't allow it. * * The calling function must still handle the error. */
static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, pte_t pte, struct page *page) { pgd_t *pgd = pgd_offset(vma->vm_mm, addr); pud_t *pud = pud_offset(pgd, addr); pmd_t *pmd = pmd_offset(pud, addr); struct address_space *mapping; pgoff_t index; static unsigned long resume; static unsigned long nr_shown; static unsigned long nr_unshown; /* * Allow a burst of 60 reports, then keep quiet for that minute; * or allow a steady drip of one report per second. */ if (nr_shown == 60) { if (time_before(jiffies, resume)) { nr_unshown++; return; } if (nr_unshown) { pr_alert("BUG: Bad page map: %lu messages suppressed\n", nr_unshown); nr_unshown = 0; } nr_shown = 0; } if (nr_shown++ == 0) resume = jiffies + 60 * HZ; mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL; index = linear_page_index(vma, addr); pr_alert("BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n", current->comm, (long long)pte_val(pte), (long long)pmd_val(*pmd)); if (page) dump_page(page, "bad pte"); pr_alert("addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n", (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index); /* * Choose text because data symbols depend on CONFIG_KALLSYMS_ALL=y */ pr_alert("file:%pD fault:%pf mmap:%pf readpage:%pf\n", vma->vm_file, vma->vm_ops ? vma->vm_ops->fault : NULL, vma->vm_file ? vma->vm_file->f_op->mmap : NULL, mapping ? mapping->a_ops->readpage : NULL); dump_stack(); add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); }


hugh dickinshugh dickins21174.82%330.00%
nick pigginnick piggin3713.12%110.00%
konstantin khlebnikovkonstantin khlebnikov258.87%110.00%
joe perchesjoe perches31.06%110.00%
dave hansendave hansen20.71%110.00%
rusty russellrusty russell20.71%110.00%
fengguang wufengguang wu10.35%110.00%
adrian bunkadrian bunk10.35%110.00%

/* * vm_normal_page -- This function gets the "struct page" associated with a pte. * * "Special" mappings do not wish to be associated with a "struct page" (either * it doesn't exist, or it exists but they don't want to touch it). In this * case, NULL is returned here. "Normal" mappings do have a struct page. * * There are 2 broad cases. Firstly, an architecture may define a pte_special() * pte bit, in which case this function is trivial. Secondly, an architecture * may not have a spare pte bit, which requires a more complicated scheme, * described below. * * A raw VM_PFNMAP mapping (ie. one that is not COWed) is always considered a * special mapping (even if there are underlying and valid "struct pages"). * COWed pages of a VM_PFNMAP are always normal. * * The way we recognize COWed pages within VM_PFNMAP mappings is through the * rules set up by "remap_pfn_range()": the vma will have the VM_PFNMAP bit * set, and the vm_pgoff will point to the first PFN mapped: thus every special * mapping will always honor the rule * * pfn_of_page == vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT) * * And for normal mappings this is false. * * This restricts such mappings to be a linear translation from virtual address * to pfn. To get around this restriction, we allow arbitrary mappings so long * as the vma is not a COW mapping; in that case, we know that all ptes are * special (because none can have been COWed). * * * In order to support COW of arbitrary special mappings, we have VM_MIXEDMAP. * * VM_MIXEDMAP mappings can likewise contain memory with or without "struct * page" backing, however the difference is that _all_ pages with a struct * page (that is, those where pfn_valid is true) are refcounted and considered * normal pages by the VM. The disadvantage is that pages are refcounted * (which can be slower and simply not an option for some PFNMAP users). The * advantage is that we don't have to follow the strict linearity rule of * PFNMAP mappings in order to support COWable mappings. * */ #ifdef __HAVE_ARCH_PTE_SPECIAL # define HAVE_PTE_SPECIAL 1 #else # define HAVE_PTE_SPECIAL 0 #endif
struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_t pte) { unsigned long pfn = pte_pfn(pte); if (HAVE_PTE_SPECIAL) { if (likely(!pte_special(pte))) goto check_pfn; if (vma->vm_ops && vma->vm_ops->find_special_page) return vma->vm_ops->find_special_page(vma, addr); if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)) return NULL; if (!is_zero_pfn(pfn)) print_bad_pte(vma, addr, pte, NULL); return NULL; } /* !HAVE_PTE_SPECIAL case follows: */ if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) { if (vma->vm_flags & VM_MIXEDMAP) { if (!pfn_valid(pfn)) return NULL; goto out; } else { unsigned long off; off = (addr - vma->vm_start) >> PAGE_SHIFT; if (pfn == vma->vm_pgoff + off) return NULL; if (!is_cow_mapping(vma->vm_flags)) return NULL; } } if (is_zero_pfn(pfn))