cregit-Linux: how code gets into the kernel

Release 4.8: mm/vmscan.c (directory: mm)
/*
 *  linux/mm/vmscan.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *
 *  Swap reorganised 29.12.95, Stephen Tweedie.
 *  kswapd added: 7.1.96  sct
 *  Removed kswapd_ctl limits, and swap out as many pages as needed
 *  to bring the system back to freepages.high: 2.4.97, Rik van Riel.
 *  Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com).
 *  Multiqueue VM started 5.8.00, Rik van Riel.
 */


#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/mm.h>
#include <linux/module.h>
#include <linux/gfp.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/pagemap.h>
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/vmpressure.h>
#include <linux/vmstat.h>
#include <linux/file.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h>	/* for try_to_release_page(),
                                        buffer_heads_over_limit */
#include <linux/mm_inline.h>
#include <linux/backing-dev.h>
#include <linux/rmap.h>
#include <linux/topology.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/compaction.h>
#include <linux/notifier.h>
#include <linux/rwsem.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/memcontrol.h>
#include <linux/delayacct.h>
#include <linux/sysctl.h>
#include <linux/oom.h>
#include <linux/prefetch.h>
#include <linux/printk.h>
#include <linux/dax.h>

#include <asm/tlbflush.h>
#include <asm/div64.h>

#include <linux/swapops.h>
#include <linux/balloon_compaction.h>

#include "internal.h"


#define CREATE_TRACE_POINTS
#include <trace/events/vmscan.h>


struct scan_control {
	/* How many pages shrink_list() should reclaim */
	unsigned long nr_to_reclaim;

	/* This context's GFP mask */
	gfp_t gfp_mask;

	/* Allocation order */
	int order;

	/*
	 * Nodemask of nodes allowed by the caller. If NULL, all nodes
	 * are scanned.
	 */
	nodemask_t	*nodemask;

	/*
	 * The memory cgroup that hit its limit and as a result is the
	 * primary target of this reclaim invocation.
	 */
	struct mem_cgroup *target_mem_cgroup;

	/* Scan (total_size >> priority) pages at once */
	int priority;

	/* The highest zone to isolate pages for reclaim from */
	enum zone_type reclaim_idx;

	unsigned int may_writepage:1;

	/* Can mapped pages be reclaimed? */
	unsigned int may_unmap:1;

	/* Can pages be swapped as part of reclaim? */
	unsigned int may_swap:1;

	/* Can cgroups be reclaimed below their normal consumption range? */
	unsigned int may_thrash:1;

	unsigned int hibernation_mode:1;

	/* One of the zones is ready for compaction */
	unsigned int compaction_ready:1;

	/* Incremented by the number of inactive pages that were scanned */
	unsigned long nr_scanned;

	/* Number of pages freed so far during a call to shrink_zones() */
	unsigned long nr_reclaimed;
};
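
For orientation, the sketch below shows roughly how a direct-reclaim entry point in the spirit of try_to_free_pages() might populate a scan_control before handing it to the reclaim machinery. It is illustrative only; the helper name and field values are assumptions, not a copy of that function.

/*
 * Illustrative sketch only: approximate scan_control setup for direct
 * reclaim. The helper and the exact field values are examples.
 */
static void example_init_scan_control(struct scan_control *sc,
				      gfp_t gfp_mask, int order,
				      nodemask_t *nodemask)
{
	*sc = (struct scan_control) {
		.nr_to_reclaim	= SWAP_CLUSTER_MAX,	/* reclaim in small batches */
		.gfp_mask	= gfp_mask,
		.reclaim_idx	= gfp_zone(gfp_mask),	/* highest zone usable by caller */
		.order		= order,
		.nodemask	= nodemask,
		.priority	= DEF_PRIORITY,		/* start with a shallow scan */
		.may_writepage	= !laptop_mode,
		.may_unmap	= 1,
		.may_swap	= 1,
	};
}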

#ifdef ARCH_HAS_PREFETCH

#define prefetch_prev_lru_page(_page, _base, _field)			\
	do {                                                            \
                if ((_page)->lru.prev != _base) {                       \
                        struct page *prev;                              \
                                                                        \
                        prev = lru_to_page(&(_page->lru));              \
                        prefetch(&prev->_field);                        \
                }                                                       \
        } while (0)
#else

#define prefetch_prev_lru_page(_page, _base, _field) do { } while (0)
#endif

#ifdef ARCH_HAS_PREFETCHW

#define prefetchw_prev_lru_page(_page, _base, _field)			\
	do {                                                            \
                if ((_page)->lru.prev != _base) {                       \
                        struct page *prev;                              \
                                                                        \
                        prev = lru_to_page(&(_page->lru));              \
                        prefetchw(&prev->_field);                       \
                }                                                       \
        } while (0)
#else

#define prefetchw_prev_lru_page(_page, _base, _field) do { } while (0)
#endif

/*
 * From 0 .. 100.  Higher means more swappy.
 */

int vm_swappiness = 60;
/*
 * The total number of pages which are beyond the high watermark within all
 * zones.
 */

unsigned long vm_total_pages;

static LIST_HEAD(shrinker_list);
static DECLARE_RWSEM(shrinker_rwsem);

#ifdef CONFIG_MEMCG

static bool global_reclaim(struct scan_control *sc)
{
	return !sc->target_mem_cgroup;
}

Contributors

Person                   Tokens  Prop     Commits  CommitProp
johannes weiner          17      100.00%  2        100.00%
Total                    17      100.00%  2        100.00%

/**
 * sane_reclaim - is the usual dirty throttling mechanism operational?
 * @sc: scan_control in question
 *
 * The normal page dirty throttling mechanism in balance_dirty_pages() is
 * completely broken with the legacy memcg and direct stalling in
 * shrink_page_list() is used for throttling instead, which lacks all the
 * niceties such as fairness, adaptive pausing, bandwidth proportional
 * allocation and configurability.
 *
 * This function tests whether the vmscan currently in progress can assume
 * that the normal dirty throttling mechanism is operational.
 */
static bool sane_reclaim(struct scan_control *sc)
{
	struct mem_cgroup *memcg = sc->target_mem_cgroup;

	if (!memcg)
		return true;
#ifdef CONFIG_CGROUP_WRITEBACK
	if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
		return true;
#endif
	return false;
}

Contributors

Person                   Tokens  Prop     Commits  CommitProp
tejun heo                42      91.30%   1        33.33%
vladimir davydov         2       4.35%    1        33.33%
linus torvalds           2       4.35%    1        33.33%
Total                    46      100.00%  3        100.00%

#else
static bool global_reclaim(struct scan_control *sc)
{
	return true;
}

Contributors

Person                   Tokens  Prop     Commits  CommitProp
johannes weiner          14      100.00%  1        100.00%
Total                    14      100.00%  1        100.00%


static bool sane_reclaim(struct scan_control *sc)
{
	return true;
}

Contributors

Person                   Tokens  Prop     Commits  CommitProp
tejun heo                14      100.00%  1        100.00%
Total                    14      100.00%  1        100.00%

#endif

/*
 * This misses isolated pages which are not accounted for to save counters.
 * As the data only determines if reclaim or compaction continues, it is
 * not expected that isolated pages will be a dominating factor.
 */
unsigned long zone_reclaimable_pages(struct zone *zone)
{
	unsigned long nr;

	nr = zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_FILE) +
		zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_FILE);
	if (get_nr_swap_pages() > 0)
		nr += zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_ANON) +
			zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_ANON);

	return nr;
}

Contributors

Person                   Tokens  Prop     Commits  CommitProp
mel gorman               57      100.00%  1        100.00%
Total                    57      100.00%  1        100.00%


unsigned long pgdat_reclaimable_pages(struct pglist_data *pgdat)
{
	unsigned long nr;

	nr = node_page_state_snapshot(pgdat, NR_ACTIVE_FILE) +
	     node_page_state_snapshot(pgdat, NR_INACTIVE_FILE) +
	     node_page_state_snapshot(pgdat, NR_ISOLATED_FILE);

	if (get_nr_swap_pages() > 0)
		nr += node_page_state_snapshot(pgdat, NR_ACTIVE_ANON) +
		      node_page_state_snapshot(pgdat, NR_INACTIVE_ANON) +
		      node_page_state_snapshot(pgdat, NR_ISOLATED_ANON);

	return nr;
}

Contributors

Person                   Tokens  Prop     Commits  CommitProp
mel gorman               35      49.30%   1        33.33%
lisa du                  26      36.62%   1        33.33%
michal hocko             10      14.08%   1        33.33%
Total                    71      100.00%  3        100.00%


bool pgdat_reclaimable(struct pglist_data *pgdat)
{
	return node_page_state_snapshot(pgdat, NR_PAGES_SCANNED) <
		pgdat_reclaimable_pages(pgdat) * 6;
}

Contributors

Person                   Tokens  Prop     Commits  CommitProp
lisa du                  14      56.00%   1        33.33%
mel gorman               11      44.00%   2        66.67%
Total                    25      100.00%  3        100.00%


unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru)
{
	if (!mem_cgroup_disabled())
		return mem_cgroup_get_lru_size(lruvec, lru);

	return node_page_state(lruvec_pgdat(lruvec), NR_LRU_BASE + lru);
}

Contributors

Person                   Tokens  Prop     Commits  CommitProp
kosaki motohiro          29      69.05%   2        25.00%
konstantin khlebnikov    6       14.29%   1        12.50%
hugh dickins             3       7.14%    2        25.00%
mel gorman               2       4.76%    1        12.50%
kamezawa hiroyuki        1       2.38%    1        12.50%
johannes weiner          1       2.38%    1        12.50%
Total                    42      100.00%  8        100.00%

/*
 * Add a shrinker callback to be called from the vm.
 */
int register_shrinker(struct shrinker *shrinker)
{
	size_t size = sizeof(*shrinker->nr_deferred);

	if (shrinker->flags & SHRINKER_NUMA_AWARE)
		size *= nr_node_ids;

	shrinker->nr_deferred = kzalloc(size, GFP_KERNEL);
	if (!shrinker->nr_deferred)
		return -ENOMEM;

	down_write(&shrinker_rwsem);
	list_add_tail(&shrinker->list, &shrinker_list);
	up_write(&shrinker_rwsem);
	return 0;
}

Contributors

Person                   Tokens  Prop     Commits  CommitProp
glauber costa            42      51.85%   1        16.67%
andrew morton            28      34.57%   1        16.67%
konstantin khlebnikov    4       4.94%    1        16.67%
nick piggin              4       4.94%    1        16.67%
rusty russell            2       2.47%    1        16.67%
christoph hellwig        1       1.23%    1        16.67%
Total                    81      100.00%  6        100.00%

EXPORT_SYMBOL(register_shrinker);

/*
 * Remove one
 */
void unregister_shrinker(struct shrinker *shrinker)
{
	down_write(&shrinker_rwsem);
	list_del(&shrinker->list);
	up_write(&shrinker_rwsem);
	kfree(shrinker->nr_deferred);
}

Contributors

Person                   Tokens  Prop     Commits  CommitProp
andrew morton            25      67.57%   1        25.00%
andrey vagin             7       18.92%   1        25.00%
nick piggin              4       10.81%   1        25.00%
rusty russell            1       2.70%    1        25.00%
Total                    37      100.00%  4        100.00%

EXPORT_SYMBOL(unregister_shrinker);

#define SHRINK_BATCH 128
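
As a usage illustration for the register_shrinker()/unregister_shrinker() API above, a cache owner supplies count_objects/scan_objects callbacks along the lines of the hypothetical sketch below; the "example cache" and its helpers are invented for illustration and are not part of vmscan.c, but the callback contract is what do_shrink_slab() below relies on.

/*
 * Hypothetical example only: a minimal shrinker for an imaginary cache.
 */
static unsigned long example_cache_count(struct shrinker *s,
					 struct shrink_control *sc)
{
	/* Report how many objects could be freed; 0 means nothing to do. */
	return example_cache_nr_objects();		/* hypothetical helper */
}

static unsigned long example_cache_scan(struct shrinker *s,
					struct shrink_control *sc)
{
	/* Try to free up to sc->nr_to_scan objects, return how many were freed. */
	return example_cache_trim(sc->nr_to_scan);	/* hypothetical helper */
}

static struct shrinker example_shrinker = {
	.count_objects	= example_cache_count,
	.scan_objects	= example_cache_scan,
	.seeks		= DEFAULT_SEEKS,
};

/*
 * Registration/unregistration, e.g. at module init/exit:
 *	register_shrinker(&example_shrinker);
 *	unregister_shrinker(&example_shrinker);
 */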
static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, struct shrinker *shrinker, unsigned long nr_scanned, unsigned long nr_eligible) { unsigned long freed = 0; unsigned long long delta; long total_scan; long freeable; long nr; long new_nr; int nid = shrinkctl->nid; long batch_size = shrinker->batch ? shrinker->batch : SHRINK_BATCH; freeable = shrinker->count_objects(shrinker, shrinkctl); if (freeable == 0) return 0; /* * copy the current shrinker scan count into a local variable * and zero it so that other concurrent shrinker invocations * don't also do this scanning work. */ nr = atomic_long_xchg(&shrinker->nr_deferred[nid], 0); total_scan = nr; delta = (4 * nr_scanned) / shrinker->seeks; delta *= freeable; do_div(delta, nr_eligible + 1); total_scan += delta; if (total_scan < 0) { pr_err("shrink_slab: %pF negative objects to delete nr=%ld\n", shrinker->scan_objects, total_scan); total_scan = freeable; } /* * We need to avoid excessive windup on filesystem shrinkers * due to large numbers of GFP_NOFS allocations causing the * shrinkers to return -1 all the time. This results in a large * nr being built up so when a shrink that can do some work * comes along it empties the entire cache due to nr >>> * freeable. This is bad for sustaining a working set in * memory. * * Hence only allow the shrinker to scan the entire cache when * a large delta change is calculated directly. */ if (delta < freeable / 4) total_scan = min(total_scan, freeable / 2); /* * Avoid risking looping forever due to too large nr value: * never try to free more than twice the estimate number of * freeable entries. */ if (total_scan > freeable * 2) total_scan = freeable * 2; trace_mm_shrink_slab_start(shrinker, shrinkctl, nr, nr_scanned, nr_eligible, freeable, delta, total_scan); /* * Normally, we should not scan less than batch_size objects in one * pass to avoid too frequent shrinker calls, but if the slab has less * than batch_size objects in total and we are really tight on memory, * we will try to reclaim all available objects, otherwise we can end * up failing allocations although there are plenty of reclaimable * objects spread over several slabs with usage less than the * batch_size. * * We detect the "tight on memory" situations by looking at the total * number of objects we want to scan (total_scan). If it is greater * than the total number of objects on slab (freeable), we must be * scanning at high prio and therefore should try to reclaim as much as * possible. */ while (total_scan >= batch_size || total_scan >= freeable) { unsigned long ret; unsigned long nr_to_scan = min(batch_size, total_scan); shrinkctl->nr_to_scan = nr_to_scan; ret = shrinker->scan_objects(shrinker, shrinkctl); if (ret == SHRINK_STOP) break; freed += ret; count_vm_events(SLABS_SCANNED, nr_to_scan); total_scan -= nr_to_scan; cond_resched(); } /* * move the unused scan count back into the shrinker in a * manner that handles concurrent updates. If we exhausted the * scan, there is no need to do an update. */ if (total_scan > 0) new_nr = atomic_long_add_return(total_scan, &shrinker->nr_deferred[nid]); else new_nr = atomic_long_read(&shrinker->nr_deferred[nid]); trace_mm_shrink_slab_end(shrinker, nid, freed, nr, new_nr, total_scan); return freed; }

Contributors

Person                   Tokens  Prop     Commits  CommitProp
dave chinner             121     35.80%   7        20.59%
andrew morton            71      21.01%   10       29.41%
glauber costa            34      10.06%   1        2.94%
vladimir davydov         31      9.17%    3        8.82%
konstantin khlebnikov    28      8.28%    2        5.88%
andrea arcangeli         18      5.33%    1        2.94%
nick piggin              13      3.85%    1        2.94%
ying han                 6       1.78%    2        5.88%
johannes weiner          6       1.78%    1        2.94%
dave hansen              4       1.18%    2        5.88%
christoph lameter        2       0.59%    1        2.94%
david rientjes           2       0.59%    1        2.94%
bernardo innocenti       1       0.30%    1        2.94%
pintu kumar              1       0.30%    1        2.94%
Total                    338     100.00%  34       100.00%
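
To make the proportional-scan arithmetic in do_shrink_slab() above concrete, here is a worked example (the numbers are chosen purely for illustration) that mirrors delta = (4 * nr_scanned / seeks) * freeable / (nr_eligible + 1).

/*
 * Worked example (illustrative numbers only) of the pressure calculation
 * in do_shrink_slab() above:
 *
 *	nr_scanned  = 1000	LRU pages scanned this round
 *	nr_eligible = 10000	LRU pages that were eligible
 *	seeks       = 2		DEFAULT_SEEKS
 *	freeable    = 500	objects reported by ->count_objects()
 *
 *	delta  = (4 * 1000) / 2		= 2000
 *	delta *= 500			= 1000000
 *	delta /= (10000 + 1)		≈ 99
 *
 * So roughly 99 of the 500 objects (~20%) are targeted for scanning,
 * about twice the 10% LRU pressure, which is what DEFAULT_SEEKS encodes.
 * Any unscanned remainder is parked in nr_deferred[] for the next call.
 */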

/**
 * shrink_slab - shrink slab caches
 * @gfp_mask: allocation context
 * @nid: node whose slab caches to target
 * @memcg: memory cgroup whose slab caches to target
 * @nr_scanned: pressure numerator
 * @nr_eligible: pressure denominator
 *
 * Call the shrink functions to age shrinkable caches.
 *
 * @nid is passed along to shrinkers with SHRINKER_NUMA_AWARE set,
 * unaware shrinkers will receive a node id of 0 instead.
 *
 * @memcg specifies the memory cgroup to target. If it is not NULL,
 * only shrinkers with SHRINKER_MEMCG_AWARE set will be called to scan
 * objects from the memory cgroup specified. Otherwise, only unaware
 * shrinkers are called.
 *
 * @nr_scanned and @nr_eligible form a ratio that indicates how much of
 * the available objects should be scanned. Page reclaim for example
 * passes the number of pages scanned and the number of pages on the
 * LRU lists that it considered on @nid, plus a bias in @nr_scanned
 * when it encountered mapped pages. The ratio is further biased by
 * the ->seeks setting of the shrink function, which indicates the
 * cost to recreate an object relative to that of an LRU page.
 *
 * Returns the number of reclaimed slab objects.
 */
static unsigned long shrink_slab(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg, unsigned long nr_scanned, unsigned long nr_eligible) { struct shrinker *shrinker; unsigned long freed = 0; if (memcg && (!memcg_kmem_enabled() || !mem_cgroup_online(memcg))) return 0; if (nr_scanned == 0) nr_scanned = SWAP_CLUSTER_MAX; if (!down_read_trylock(&shrinker_rwsem)) { /* * If we would return 0, our callers would understand that we * have nothing else to shrink and give up trying. By returning * 1 we keep it going and assume we'll be able to shrink next * time. */ freed = 1; goto out; } list_for_each_entry(shrinker, &shrinker_list, list) { struct shrink_control sc = { .gfp_mask = gfp_mask, .nid = nid, .memcg = memcg, }; /* * If kernel memory accounting is disabled, we ignore * SHRINKER_MEMCG_AWARE flag and call all shrinkers * passing NULL for memcg. */ if (memcg_kmem_enabled() && !!memcg != !!(shrinker->flags & SHRINKER_MEMCG_AWARE)) continue; if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) sc.nid = 0; freed += do_shrink_slab(&sc, shrinker, nr_scanned, nr_eligible); } up_read(&shrinker_rwsem); out: cond_resched(); return freed; }

Contributors

Person                   Tokens  Prop     Commits  CommitProp
vladimir davydov         70      38.46%   3        33.33%
glauber costa            66      36.26%   1        11.11%
johannes weiner          31      17.03%   1        11.11%
andrew morton            7       3.85%    1        11.11%
minchan kim              5       2.75%    1        11.11%
nick piggin              2       1.10%    1        11.11%
dave chinner             1       0.55%    1        11.11%
Total                    182     100.00%  9        100.00%


void drop_slab_node(int nid)
{
	unsigned long freed;

	do {
		struct mem_cgroup *memcg = NULL;

		freed = 0;
		do {
			freed += shrink_slab(GFP_KERNEL, nid, memcg,
					     1000, 1000);
		} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
	} while (freed > 10);
}

Contributors

Person                   Tokens  Prop     Commits  CommitProp
vladimir davydov         69      100.00%  1        100.00%
Total                    69      100.00%  1        100.00%


void drop_slab(void)
{
	int nid;

	for_each_online_node(nid)
		drop_slab_node(nid);
}

Contributors

Person                   Tokens  Prop     Commits  CommitProp
vladimir davydov         19      100.00%  1        100.00%
Total                    19      100.00%  1        100.00%


static inline int is_page_cache_freeable(struct page *page)
{
	/*
	 * A freeable page cache page is referenced only by the caller
	 * that isolated the page, the page cache radix tree and
	 * optional buffer heads at page->private.
	 */
	return page_count(page) - page_has_private(page) == 2;
}

Contributors

Person                   Tokens  Prop     Commits  CommitProp
andrew morton            22      84.62%   1        25.00%
linus torvalds           2       7.69%    1        25.00%
david howells            1       3.85%    1        25.00%
johannes weiner          1       3.85%    1        25.00%
Total                    26      100.00%  4        100.00%


static int may_write_to_inode(struct inode *inode, struct scan_control *sc)
{
	if (current->flags & PF_SWAPWRITE)
		return 1;
	if (!inode_write_congested(inode))
		return 1;
	if (inode_to_bdi(inode) == current->backing_dev_info)
		return 1;
	return 0;
}

Contributors

Person                   Tokens  Prop     Commits  CommitProp
andrew morton            36      65.45%   1        25.00%
tejun heo                9       16.36%   1        25.00%
christoph lameter        5       9.09%    1        25.00%
kosaki motohiro          5       9.09%    1        25.00%
Total                    55      100.00%  4        100.00%

/*
 * We detected a synchronous write error writing a page out. Probably
 * -ENOSPC. We need to propagate that into the address_space for a subsequent
 * fsync(), msync() or close().
 *
 * The tricky part is that after writepage we cannot touch the mapping: nothing
 * prevents it from being freed up. But we have a ref on the page and once
 * that page is locked, the mapping is pinned.
 *
 * We're allowed to run sleeping lock_page() here because we know the caller has
 * __GFP_FS.
 */
static void handle_write_error(struct address_space *mapping,
				struct page *page, int error)
{
	lock_page(page);
	if (page_mapping(page) == mapping)
		mapping_set_error(mapping, error);
	unlock_page(page);
}

Contributors

Person                   Tokens  Prop     Commits  CommitProp
andrew morton            42      93.33%   3        60.00%
guillaume chazarain      2       4.44%    1        20.00%
jens axboe               1       2.22%    1        20.00%
Total                    45      100.00%  5        100.00%

/* possible outcome of pageout() */
typedef enum {
	/* failed to write page out, page is locked */
	PAGE_KEEP,
	/* move page to the active list, page is locked */
	PAGE_ACTIVATE,
	/* page has been sent to the disk successfully, page is unlocked */
	PAGE_SUCCESS,
	/* page is clean and locked */
	PAGE_CLEAN,
} pageout_t;

/*
 * pageout is called by shrink_page_list() for each dirty page.
 * Calls ->writepage().
 */
static pageout_t pageout(struct page *page, struct address_space *mapping, struct scan_control *sc) { /* * If the page is dirty, only perform writeback if that write * will be non-blocking. To prevent this allocation from being * stalled by pagecache activity. But note that there may be * stalls if we need to run get_block(). We could test * PagePrivate for that. * * If this process is currently in __generic_file_write_iter() against * this page's queue, we can perform writeback even if that * will block. * * If the page is swapcache, write it back even if that would * block, for some throttling. This happens by accident, because * swap_backing_dev_info is bust: it doesn't reflect the * congestion state of the swapdevs. Easy to fix, if needed. */ if (!is_page_cache_freeable(page)) return PAGE_KEEP; if (!mapping) { /* * Some data journaling orphaned pages can have * page->mapping == NULL while being dirty with clean buffers. */ if (page_has_private(page)) { if (try_to_free_buffers(page)) { ClearPageDirty(page); pr_info("%s: orphaned page\n", __func__); return PAGE_CLEAN; } } return PAGE_KEEP; } if (mapping->a_ops->writepage == NULL) return PAGE_ACTIVATE; if (!may_write_to_inode(mapping->host, sc)) return PAGE_KEEP; if (clear_page_dirty_for_io(page)) { int res; struct writeback_control wbc = { .sync_mode = WB_SYNC_NONE, .nr_to_write = SWAP_CLUSTER_MAX, .range_start = 0, .range_end = LLONG_MAX, .for_reclaim = 1, }; SetPageReclaim(page); res = mapping->a_ops->writepage(page, &wbc); if (res < 0) handle_write_error(mapping, page, res); if (res == AOP_WRITEPAGE_ACTIVATE) { ClearPageReclaim(page); return PAGE_ACTIVATE; } if (!PageWriteback(page)) { /* synchronous write or broken a_ops? */ ClearPageReclaim(page); } trace_mm_vmscan_writepage(page); inc_node_page_state(page, NR_VMSCAN_WRITE); return PAGE_SUCCESS; } return PAGE_CLEAN; }

Contributors

Person                   Tokens  Prop     Commits  CommitProp
andrew morton            167     71.98%   2        12.50%
andrea arcangeli         33      14.22%   1        6.25%
hirofumi ogawa           10      4.31%    1        6.25%
kosaki motohiro          6       2.59%    1        6.25%
mel gorman               6       2.59%    2        12.50%
christoph hellwig        2       0.86%    1        6.25%
zach brown               1       0.43%    1        6.25%
christoph lameter        1       0.43%    1        6.25%
david howells            1       0.43%    1        6.25%
mitchel humpherys        1       0.43%    1        6.25%
tejun heo                1       0.43%    1        6.25%
andy whitcroft           1       0.43%    1        6.25%
al viro                  1       0.43%    1        6.25%
harvey harrison          1       0.43%    1        6.25%
Total                    232     100.00%  16       100.00%

/*
 * Same as remove_mapping, but if the page is removed from the mapping, it
 * gets returned with a refcount of 0.
 */
static int __remove_mapping(struct address_space *mapping, struct page *page, bool reclaimed) { unsigned long flags; BUG_ON(!PageLocked(page)); BUG_ON(mapping != page_mapping(page)); spin_lock_irqsave(&mapping->tree_lock, flags); /* * The non racy check for a busy page. * * Must be careful with the order of the tests. When someone has * a ref to the page, it may be possible that they dirty it then * drop the reference. So if PageDirty is tested before page_count * here, then the following race may occur: * * get_user_pages(&page); * [user mapping goes away] * write_to(page); * !PageDirty(page) [good] * SetPageDirty(page); * put_page(page); * !page_count(page) [good, discard it] * * [oops, our write_to data is lost] * * Reversing the order of the tests ensures such a situation cannot * escape unnoticed. The smp_rmb is needed to ensure the page->flags * load is not satisfied before that of page->_refcount. * * Note that if SetPageDirty is always performed via set_page_dirty, * and thus under tree_lock, then this ordering is not required. */ if (!page_ref_freeze(page, 2)) goto cannot_free; /* note: atomic_cmpxchg in page_freeze_refs provides the smp_rmb */ if (unlikely(PageDirty(page))) { page_ref_unfreeze(page, 2); goto cannot_free; } if (PageSwapCache(page)) { swp_entry_t swap = { .val = page_private(page) }; mem_cgroup_swapout(page, swap); __delete_from_swap_cache(page); spin_unlock_irqrestore(&mapping->tree_lock, flags); swapcache_free(swap); } else { void (*freepage)(struct page *); void *shadow = NULL; freepage = mapping->a_ops->freepage; /* * Remember a shadow entry for reclaimed file cache in * order to detect refaults, thus thrashing, later on. * * But don't store shadows in an address space that is * already exiting. This is not just an optizimation, * inode reclaim needs to empty out the radix tree or * the nodes are lost. Don't plant shadows behind its * back. * * We also don't store shadows for DAX mappings because the * only page cache pages found in these are zero pages * covering holes, and because we don't want to mix DAX * exceptional entries and shadow exceptional entries in the * same page_tree. */ if (reclaimed && page_is_file_cache(page) && !mapping_exiting(mapping) && !dax_mapping(mapping)) shadow = workingset_eviction(mapping, page); __delete_from_page_cache(page, shadow); spin_unlock_irqrestore(&mapping->tree_lock, flags); if (freepage != NULL) freepage(page); } return 1; cannot_free: spin_unlock_irqrestore(&mapping->tree_lock, flags); return 0; }

Contributors

Person                   Tokens  Prop     Commits  CommitProp
christoph lameter        104     42.80%   1        7.69%
johannes weiner          42      17.28%   3        23.08%
nick piggin              33      13.58%   2        15.38%
linus torvalds           30      12.35%   1        7.69%
greg thelen              22      9.05%    1        7.69%
ross zwisler             7       2.88%    1        7.69%
joonsoo kim              3       1.23%    2        15.38%
minchan kim              1       0.41%    1        7.69%
kamezawa hiroyuki        1       0.41%    1        7.69%
Total                    243     100.00%  13       100.00%

/*
 * Attempt to detach a locked page from its ->mapping. If it is dirty or if
 * someone else has a ref on the page, abort and return 0. If it was
 * successfully detached, return 1. Assumes the caller has a single ref on
 * this page.
 */
int remove_mapping(struct address_space *mapping, struct page *page)
{
	if (__remove_mapping(mapping, page, false)) {
		/*
		 * Unfreezing the refcount with 1 rather than 2 effectively
		 * drops the pagecache ref for us without requiring another
		 * atomic operation.
		 */
		page_ref_unfreeze(page, 1);
		return 1;
	}
	return 0;
}

Contributors

Person                   Tokens  Prop     Commits  CommitProp
nick piggin              39      92.86%   1        33.33%
johannes weiner          2       4.76%    1        33.33%
joonsoo kim              1       2.38%    1        33.33%
Total                    42      100.00%  3        100.00%

/**
 * putback_lru_page - put previously isolated page onto appropriate LRU list
 * @page: page to be put back to appropriate lru list
 *
 * Add previously isolated @page to appropriate LRU list.
 * Page may still be unevictable for other reasons.
 *
 * lru_lock must not be held, interrupts must be enabled.
 */
void putback_lru_page(struct page *page) { bool is_unevictable; int was_unevictable = PageUnevictable(page); VM_BUG_ON_PAGE(PageLRU(page), page); redo: ClearPageUnevictable(page); if (page_evictable(page)) { /* * For evictable pages, we can use the cache. * In event of a race, worst case is we end up with an * unevictable page on [in]active list. * We know how to handle that. */ is_unevictable = false; lru_cache_add(page); } else { /* * Put unevictable pages directly on zone's unevictable * list. */ is_unevictable = true; add_page_to_unevictable_list(page); /* * When racing with an mlock or AS_UNEVICTABLE clearing * (page is unlocked) make sure that if the other thread * does not observe our setting of PG_lru and fails * isolation/check_move_unevictable_pages, * we see PG_mlocked/AS_UNEVICTABLE cleared below and move * the page back to the evictable list. * * The other side is TestClearPageMlocked() or shmem_lock(). */ smp_mb(); } /* * page's status can change while we move it among lru. If an evictable * page is on unevictable list, it never be freed. To avoid that, * check after we added it to the list, again. */ if (is_unevictable && page_evictable(page)) { if (!isolate_lru_page(page)) { put_page(page); goto redo; } /* This means someone else dropped this page from LRU * So, it will be freed or putback to LRU again. There is * nothing to do here. */ } if (was_unevictable && !is_unevictable) count_vm_event(UNEVICTABLE_PGRESCUED); else if (!was_unevictable && is_unevictable) count_vm_event(UNEVICTABLE_PGCULLED); put_page(page); /* drop ref from isolate */ }

Contributors

Person                   Tokens  Prop     Commits  CommitProp
lee schermerhorn         118     86.76%   2        28.57%
vlastimil babka          10      7.35%    1        14.29%
sasha levin              3       2.21%    1        14.29%
johannes weiner          3       2.21%    1        14.29%
hugh dickins             1       0.74%    1        14.29%
mel gorman               1       0.74%    1        14.29%
Total                    136     100.00%  7        100.00%

enum page_references {
	PAGEREF_RECLAIM,
	PAGEREF_RECLAIM_CLEAN,
	PAGEREF_KEEP,
	PAGEREF_ACTIVATE,
};

static enum page_references page_check_references(struct page *page,
						  struct scan_control *sc)
{
	int referenced_ptes, referenced_page;
	unsigned long vm_flags;

	referenced_ptes = page_referenced(page, 1, sc->target_mem_cgroup,
					  &vm_flags);
	referenced_page = TestClearPageReferenced(page);

	/*
	 * Mlock lost the isolation race with us. Let try_to_unmap()
	 * move the page to the unevictable list.
	 */
	if (vm_flags & VM_LOCKED)
		return PAGEREF_RECLAIM;

	if (referenced_ptes) {
		if (PageSwapBacked(page))
			return PAGEREF_ACTIVATE;
		/*
		 * All mapped pages start out with page table
		 * references from the instantiating fault, so we need
		 * to look twice if a mapped file page is used more
		 * than once.
		 *
		 * Mark it and spare it for another trip around the
		 * inactive list. Another page table reference will
		 * lead to its activation.
		 *
		 * Note: the mark is set for activated pages as well
		 * so that recently deactivated but used pages are
		 * quickly recovered.
		 */
		SetPageReferenced(page);

		if (referenced_page || referenced_ptes > 1)
			return PAGEREF_ACTIVATE;

		/*
		 * Activate file-backed executable pages after first usage.
		 */
		if (vm_flags & VM_EXEC)
			return PAGEREF_ACTIVATE;

		return PAGEREF_KEEP;
	}

	/* Reclaim if clean, defer dirty pages to writeback */
	if (referenced_page && !PageSwapBacked(page))
		return PAGEREF_RECLAIM_CLEAN;

	return PAGEREF_RECLAIM;
}

Contributors

Person                   Tokens  Prop     Commits  CommitProp
johannes weiner          101     82.79%   3        42.86%
konstantin khlebnikov    14      11.48%   2        28.57%
kosaki motohiro          6       4.92%    1        14.29%
michal hocko             1       0.82%    1        14.29%
Total                    122     100.00%  7        100.00%

/* Check if a page is dirty or under writeback */
static void page_check_dirty_writeback(struct page *page,
				       bool *dirty, bool *writeback)
{
	struct address_space *mapping;

	/*
	 * Anonymous pages are not handled by flushers and must be written
	 * from reclaim context. Do not stall reclaim based on them
	 */
	if (!page_is_file_cache(page)) {
		*dirty = false;
		*writeback = false;
		return;
	}

	/* By default assume that the page flags are accurate */
	*dirty = PageDirty(page);
	*writeback = PageWriteback(page);

	/* Verify dirty/writeback state if the filesystem supports it */
	if (!page_has_private(page))
		return;

	mapping = page_mapping(page);
	if (mapping && mapping->a_ops->is_dirty_writeback)
		mapping->a_ops->is_dirty_writeback(page, dirty, writeback);
}

Contributors

Person                   Tokens  Prop     Commits  CommitProp
mel gorman               103     100.00%  2        100.00%
Total                    103     100.00%  2        100.00%

/*
 * shrink_page_list() returns the number of reclaimed pages
 */
static unsigned long shrink_page_list(struct list_head *page_list, struct pglist_data *pgdat, struct scan_control *sc, enum ttu_flags ttu_flags, unsigned long *ret_nr_dirty, unsigned long *ret_nr_unqueued_dirty, unsigned long *ret_nr_congested, unsigned long *ret_nr_writeback, unsigned long *ret_nr_immediate, bool force_reclaim) { LIST_HEAD(ret_pages); LIST_HEAD(free_pages); int pgactivate = 0; unsigned long nr_unqueued_dirty = 0; unsigned long nr_dirty = 0; unsigned long nr_congested = 0; unsigned long nr_reclaimed = 0; unsigned long nr_writeback = 0; unsigned long nr_immediate = 0; cond_resched(); while (!list_empty(page_list)) { struct address_space *mapping; struct page *page; int may_enter_fs; enum page_references references = PAGEREF_RECLAIM_CLEAN; bool dirty, writeback; bool lazyfree = false; int ret = SWAP_SUCCESS; cond_resched(); page = lru_to_page(page_list); list_del(&page->lru); if (!trylock_page(page)) goto keep; VM_BUG_ON_PAGE(PageActive(page), page); sc->nr_scanned++; if (unlikely(!page_evictable(page))) goto cull_mlocked; if (!sc->may_unmap && page_mapped(page)) goto keep_locked; /* Double the slab pressure for mapped and swapcache pages */ if (page_mapped(page) || PageSwapCache(page)) sc->nr_scanned++; may_enter_fs = (sc->gfp_mask & __GFP_FS) || (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO)); /* * The number of dirty pages determines if a zone is marked * reclaim_congested which affects wait_iff_congested. kswapd * will stall and start writing pages if the tail of the LRU * is all dirty unqueued pages. */ page_check_dirty_writeback(page, &dirty, &writeback); if (dirty || writeback) nr_dirty++; if (dirty && !writeback) nr_unqueued_dirty++; /* * Treat this page as congested if the underlying BDI is or if * pages are cycling through the LRU so quickly that the * pages marked for immediate reclaim are making it to the * end of the LRU a second time. */ mapping = page_mapping(page); if (((dirty || writeback) && mapping && inode_write_congested(mapping->host)) || (writeback && PageReclaim(page))) nr_congested++; /* * If a page at the tail of the LRU is under writeback, there * are three cases to consider. * * 1) If reclaim is encountering an excessive number of pages * under writeback and this page is both under writeback and * PageReclaim then it indicates that pages are being queued * for IO but are being recycled through the LRU before the * IO can complete. Waiting on the page itself risks an * indefinite stall if it is impossible to writeback the * page due to IO error or disconnected storage so instead * note that the LRU is being scanned too quickly and the * caller can stall after page list has been processed. * * 2) Global or new memcg reclaim encounters a page that is * not marked for immediate reclaim, or the caller does not * have __GFP_FS (or __GFP_IO if it's simply going to swap, * not to fs). In this case mark the page for immediate * reclaim and continue scanning. * * Require may_enter_fs because we would wait on fs, which * may not have submitted IO yet. And the loop driver might * enter reclaim, and deadlock if it waits on a page for * which it is needed to do the write (loop masks off * __GFP_IO|__GFP_FS for this reason); but more thought * would probably show more reasons. * * 3) Legacy memcg encounters a page that is already marked * PageReclaim. memcg does not have any dirty pages * throttling so we could easily OOM just because too many * pages are in writeback and there is nothing else to * reclaim. Wait for the writeback to complete. 
*/ if (PageWriteback(page)) { /* Case 1 above */ if (current_is_kswapd() && PageReclaim(page) && test_bit(PGDAT_WRITEBACK, &pgdat->flags)) { nr_immediate++; goto keep_locked; /* Case 2 above */ } else if (sane_reclaim(sc) || !