cregit-Linux how code gets into the kernel

Release 4.17 mm/memcontrol.c

Directory: mm
/* memcontrol.c - Memory Controller
 * Copyright IBM Corporation, 2007
 * Author Balbir Singh <>
 * Copyright 2007 OpenVZ SWsoft Inc
 * Author: Pavel Emelianov <>
 * Memory thresholds
 * Copyright (C) 2009 Nokia Corporation
 * Author: Kirill A. Shutemov
 * Kernel Memory Controller
 * Copyright (C) 2012 Parallels Inc. and Google Inc.
 * Authors: Glauber Costa and Suleiman Souhlal
 * Native page reclaim
 * Charge lifetime sanitation
 * Lockless page tracking & accounting
 * Unified hierarchy configuration model
 * Copyright (C) 2015 Red Hat, Inc., Johannes Weiner
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 */

#include <linux/page_counter.h>
#include <linux/memcontrol.h>
#include <linux/cgroup.h>
#include <linux/mm.h>
#include <linux/sched/mm.h>
#include <linux/shmem_fs.h>
#include <linux/hugetlb.h>
#include <linux/pagemap.h>
#include <linux/smp.h>
#include <linux/page-flags.h>
#include <linux/backing-dev.h>
#include <linux/bit_spinlock.h>
#include <linux/rcupdate.h>
#include <linux/limits.h>
#include <linux/export.h>
#include <linux/mutex.h>
#include <linux/rbtree.h>
#include <linux/slab.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/spinlock.h>
#include <linux/eventfd.h>
#include <linux/poll.h>
#include <linux/sort.h>
#include <linux/fs.h>
#include <linux/seq_file.h>
#include <linux/vmpressure.h>
#include <linux/mm_inline.h>
#include <linux/swap_cgroup.h>
#include <linux/cpu.h>
#include <linux/oom.h>
#include <linux/lockdep.h>
#include <linux/file.h>
#include <linux/tracehook.h>
#include "internal.h"
#include <net/sock.h>
#include <net/ip.h>
#include "slab.h"

#include <linux/uaccess.h>

#include <trace/events/vmscan.h>

/* The cgroup subsystem descriptor for the memory controller. */
struct cgroup_subsys memory_cgrp_subsys __read_mostly;

/*
 * The root memory cgroup. Helpers in this file fall back to it when they
 * are handed a NULL memcg.
 */
struct mem_cgroup *root_mem_cgroup __read_mostly;


/* Socket memory accounting disabled? */

static bool cgroup_memory_nosocket;

/* Kernel memory accounting disabled? */

static bool cgroup_memory_nokmem;

/*
 * Whether the swap controller is active.
 *
 * With CONFIG_MEMCG_SWAP this is a real, runtime-settable variable.  Without
 * it the name collapses to the constant 0 so that every test of it compiles
 * away.  The two forms must be mutually exclusive: defining the macro
 * unconditionally after the variable would turn every later reference into
 * the constant 0 and silently disable swap accounting.
 */
#ifdef CONFIG_MEMCG_SWAP
int do_swap_account __read_mostly;
#else
#define do_swap_account		0
#endif

/*
 * Whether legacy memory+swap accounting is active: the memory controller
 * must not be on the default hierarchy and swap accounting must be enabled.
 */
static bool do_memsw_account(void)
{
	if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
		return false;

	return do_swap_account;
}


Johannes Weiner17100.00%1100.00%

/* Printable names of the LRU lists, one string per list. */
static const char *const mem_cgroup_lru_names[] = {
	"inactive_anon",
	"active_anon",
	"inactive_file",
	"active_file",
	"unevictable",
};

/*
 * Number of page events between two checks of the respective target
 * (see mem_cgroup_event_ratelimit()).
 */
#define THRESHOLDS_EVENTS_TARGET 128
#define SOFTLIMIT_EVENTS_TARGET 1024
#define NUMAINFO_EVENTS_TARGET 1024

/*
 * Cgroups above their limits are maintained in a RB-Tree, independent of
 * their hierarchy representation
 */
struct mem_cgroup_tree_per_node {
	struct rb_root rb_root;
	struct rb_node *rb_rightmost;
	spinlock_t lock;
};

struct mem_cgroup_tree {
	struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
};

static struct mem_cgroup_tree soft_limit_tree __read_mostly;

/* for OOM */
struct mem_cgroup_eventfd_list {
	struct list_head list;
	struct eventfd_ctx *eventfd;
};

/*
 * cgroup_event represents events which userspace wants to receive.
 */
struct mem_cgroup_event {
	/*
	 * memcg which the event belongs to.
	 */
	struct mem_cgroup *memcg;
	/*
	 * eventfd to signal userspace about the event.
	 */
	struct eventfd_ctx *eventfd;
	/*
	 * Each of these stored in a list by the cgroup.
	 */
	struct list_head list;
	/*
	 * register_event() callback will be used to add new userspace
	 * waiter for changes related to this event.  Use eventfd_signal()
	 * on eventfd to send notification to userspace.
	 */
	int (*register_event)(struct mem_cgroup *memcg,
			      struct eventfd_ctx *eventfd, const char *args);
	/*
	 * unregister_event() callback will be called when userspace closes
	 * the eventfd or on cgroup removing.  This callback must be set,
	 * if you want to provide notification functionality.
	 */
	void (*unregister_event)(struct mem_cgroup *memcg,
				 struct eventfd_ctx *eventfd);
	/*
	 * All fields below needed to unregister event when
	 * userspace closes eventfd.
	 */
	poll_table pt;
	wait_queue_head_t *wqh;
	wait_queue_entry_t wait;
	struct work_struct remove;
};

static void mem_cgroup_threshold(struct mem_cgroup *memcg);
static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);

/* Stuff for moving charges at task migration. */
/*
 * Types of charges to be moved.
 */
#define MOVE_ANON 0x1U
#define MOVE_FILE 0x2U
#define MOVE_MASK (MOVE_ANON | MOVE_FILE)

/* "mc" and its members are protected by cgroup_mutex */
static struct move_charge_struct {
	spinlock_t lock; /* for from, to */
	struct mm_struct *mm;
	struct mem_cgroup *from;
	struct mem_cgroup *to;
	unsigned long flags;
	unsigned long precharge;
	unsigned long moved_charge;
	unsigned long moved_swap;
	struct task_struct *moving_task; /* a task moving charges */
	wait_queue_head_t waitq; /* a waitq for other context */
} mc = {
	.lock = __SPIN_LOCK_UNLOCKED(mc.lock),
	.waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
};

/*
 * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
 * limit reclaim to prevent infinite loops, if they ever occur.
 */
#define MEM_CGROUP_MAX_RECLAIM_LOOPS 100
#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2

enum charge_type {
	MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
	MEM_CGROUP_CHARGE_TYPE_ANON,
	MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */
	MEM_CGROUP_CHARGE_TYPE_DROP, /* a page was unused swap cache */
	NR_CHARGE_TYPE,
};

/* for encoding cft->private value on file */
enum res_type {
	_MEM,
	_MEMSWAP,
	_OOM_TYPE,
	_KMEM,
	_TCP,
};

/*
 * A private value packs the resource type into the bits above bit 15 and
 * the attribute into the low 16 bits.
 */
#define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val))
#define MEMFILE_TYPE(val) ((val) >> 16 & 0xffff)
#define MEMFILE_ATTR(val) ((val) & 0xffff)
/* Used for OOM notifier */
#define OOM_CONTROL (0)

/* Some nice accessors for the vmpressure. */
/*
 * Return the vmpressure state embedded in @memcg; a NULL @memcg selects the
 * root memory cgroup.
 */
struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
{
	struct mem_cgroup *target = memcg ? memcg : root_mem_cgroup;

	return &target->vmpressure;
}


Anton Vorontsov27100.00%1100.00%

struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr) { return &container_of(vmpr, struct mem_cgroup, vmpressure)->css; }


Anton Vorontsov26100.00%1100.00%

/* True when @memcg is the root memory cgroup. */
static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
{
	return root_mem_cgroup == memcg;
}


Michal Hocko19100.00%1100.00%

#ifndef CONFIG_SLOB
/*
 * This will be the memcg's index in each cache's ->memcg_params.memcg_caches.
 * The main reason for not using cgroup id for this:
 *  this works better in sparse environments, where we have a lot of memcgs,
 *  but only a few kmem-limited. Or also, if we have, for instance, 200
 *  memcgs, and none but the 200th is kmem-limited, we'd have to have a
 *  200 entry array for that.
 *
 * The current size of the caches array is stored in memcg_nr_cache_ids. It
 * will double each time we have to increase it.
 */
static DEFINE_IDA(memcg_cache_ida);
int memcg_nr_cache_ids;

/* Protects memcg_nr_cache_ids */
static DECLARE_RWSEM(memcg_cache_ids_sem);
/*
 * Acquire memcg_cache_ids_sem for reading.  Pair with
 * memcg_put_cache_ids().
 */
void memcg_get_cache_ids(void)
{
	down_read(&memcg_cache_ids_sem);
}


Glauber de Oliveira Costa753.85%240.00%
Johannes Weiner430.77%120.00%
Li Zefan17.69%120.00%
Daisuke Nishimura17.69%120.00%

/*
 * Drop the read hold on memcg_cache_ids_sem taken by
 * memcg_get_cache_ids().
 */
void memcg_put_cache_ids(void)
{
	up_read(&memcg_cache_ids_sem);
}


Johannes Weiner861.54%133.33%
Glauber de Oliveira Costa430.77%133.33%
Li Zefan17.69%133.33%

/* * MIN_SIZE is different than 1, because we would like to avoid going through * the alloc/free process all the time. In a small machine, 4 kmem-limited * cgroups is a reasonable guess. In the future, it could be a parameter or * tunable, but that is strictly not necessary. * * MAX_SIZE should be as large as the number of cgrp_ids. Ideally, we could get * this constant directly from cgroup, but it is understandable that this is * better kept as an internal representation in cgroup.c. In any case, the * cgrp_id space is not getting any smaller, and we don't have to necessarily * increase ours as well if it increases. */ #define MEMCG_CACHES_MIN_SIZE 4 #define MEMCG_CACHES_MAX_SIZE MEM_CGROUP_ID_MAX /* * A lot of the calls to the cache allocation functions are expected to be * inlined by the compiler. Since the calls to memcg_kmem_get_cache are * conditional to this static branch, we'll have to allow modules that does * kmem_cache_alloc and the such to see this symbol as well */ DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key); EXPORT_SYMBOL(memcg_kmem_enabled_key); struct workqueue_struct *memcg_kmem_cache_wq; #endif /* !CONFIG_SLOB */ /** * mem_cgroup_css_from_page - css of the memcg associated with a page * @page: page of interest * * If memcg is bound to the default hierarchy, css of the memcg associated * with @page is returned. The returned css remains associated with @page * until it is released. * * If memcg is bound to a traditional hierarchy, the css of root_mem_cgroup * is returned. */
struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page)
{
	struct mem_cgroup *memcg = page->mem_cgroup;

	/* Uncharged page, or memcg not on the default hierarchy: use root. */
	if (!memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
		return &root_mem_cgroup->css;

	return &memcg->css;
}


Johannes Weiner44100.00%3100.00%

/**
 * page_cgroup_ino - return inode number of the memcg a page is charged to
 * @page: the page
 *
 * Starting from the memory cgroup @page is charged to, walk up through the
 * parents until an online cgroup is found and return its inode number.
 * Returns 0 if @page is not charged to any cgroup.  It is safe to call this
 * function without holding a reference to @page.
 *
 * Note, this function is inherently racy, because there is nothing to prevent
 * the cgroup inode from getting torn down and potentially reallocated a moment
 * after page_cgroup_ino() returns, so it only should be used by callers that
 * do not care (such as procfs interfaces).
 */
ino_t page_cgroup_ino(struct page *page)
{
	struct mem_cgroup *memcg;
	unsigned long ino;

	rcu_read_lock();

	/* Skip over offline cgroups until an online ancestor turns up. */
	for (memcg = READ_ONCE(page->mem_cgroup); memcg;
	     memcg = parent_mem_cgroup(memcg)) {
		if (memcg->css.flags & CSS_ONLINE)
			break;
	}

	ino = memcg ? cgroup_ino(memcg->css.cgroup) : 0;

	rcu_read_unlock();

	return ino;
}


Johannes Weiner76100.00%3100.00%

/* Per-node memcg state of @memcg for the node that @page belongs to. */
static struct mem_cgroup_per_node *
mem_cgroup_page_nodeinfo(struct mem_cgroup *memcg, struct page *page)
{
	return memcg->nodeinfo[page_to_nid(page)];
}


Johannes Weiner2882.35%133.33%
Glauber de Oliveira Costa411.76%133.33%
Mel Gorman25.88%133.33%

/* Soft-limit tree of node @nid, as stored in soft_limit_tree. */
static struct mem_cgroup_tree_per_node *
soft_limit_tree_node(int nid)
{
	struct mem_cgroup_tree_per_node **per_node;

	per_node = soft_limit_tree.rb_tree_per_node;
	return per_node[nid];
}


Johannes Weiner1368.42%133.33%
Vladimir Davydov421.05%133.33%
Mel Gorman210.53%133.33%

/* Soft-limit tree of the node that @page belongs to. */
static struct mem_cgroup_tree_per_node *
soft_limit_tree_from_page(struct page *page)
{
	return soft_limit_tree.rb_tree_per_node[page_to_nid(page)];
}


Johannes Weiner1137.93%120.00%
Jianyu Zhan827.59%120.00%
Kamezawa Hiroyuki827.59%120.00%
Mel Gorman13.45%120.00%
Vladimir Davydov13.45%120.00%

static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz, struct mem_cgroup_tree_per_node *mctz, unsigned long new_usage_in_excess) { struct rb_node **p = &mctz->rb_root.rb_node; struct rb_node *parent = NULL; struct mem_cgroup_per_node *mz_node; bool rightmost = true; if (mz->on_tree) return; mz->usage_in_excess = new_usage_in_excess; if (!mz->usage_in_excess) return; while (*p) { parent = *p; mz_node = rb_entry(parent, struct mem_cgroup_per_node, tree_node); if (mz->usage_in_excess < mz_node->usage_in_excess) { p = &(*p)->rb_left; rightmost = false; } /* * We can't avoid mem cgroups that are over their soft * limit by the same amount */ else if (mz->usage_in_excess >= mz_node->usage_in_excess) p = &(*p)->rb_right; } if (rightmost) mctz->rb_rightmost = &mz->tree_node; rb_link_node(&mz->tree_node, parent, p); rb_insert_color(&mz->tree_node, &mctz->rb_root); mz->on_tree = true; }


Andrew Morton7741.18%116.67%
Johannes Weiner5931.55%116.67%
Davidlohr Bueso A2412.83%116.67%
Vladimir Davydov136.95%116.67%
Tejun Heo105.35%116.67%
Mel Gorman42.14%116.67%

/*
 * Unlink @mz from the soft-limit tree @mctz if it is on it.  When @mz is
 * the cached rightmost node, the cache is moved to its predecessor first.
 * No locking is done here.
 */
static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
					 struct mem_cgroup_tree_per_node *mctz)
{
	struct rb_node *node = &mz->tree_node;

	if (!mz->on_tree)
		return;

	if (mctz->rb_rightmost == node)
		mctz->rb_rightmost = rb_prev(node);

	rb_erase(node, &mctz->rb_root);
	mz->on_tree = false;
}


Andrew Morton4162.12%133.33%
Davidlohr Bueso A2334.85%133.33%
Mel Gorman23.03%133.33%

/*
 * Locked wrapper around __mem_cgroup_remove_exceeded(): takes mctz->lock
 * with interrupts saved and disabled for the duration of the removal.
 */
static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
				       struct mem_cgroup_tree_per_node *mctz)
{
	unsigned long irq_flags;

	spin_lock_irqsave(&mctz->lock, irq_flags);
	__mem_cgroup_remove_exceeded(mz, mctz);
	spin_unlock_irqrestore(&mctz->lock, irq_flags);
}


Andrew Morton3574.47%133.33%
Johannes Weiner1021.28%133.33%
Mel Gorman24.26%133.33%

/*
 * Number of pages by which @memcg's memory counter exceeds its soft limit,
 * or 0 when it is at or below the limit.
 */
static unsigned long soft_limit_excess(struct mem_cgroup *memcg)
{
	unsigned long usage = page_counter_read(&memcg->memory);
	unsigned long limit = READ_ONCE(memcg->soft_limit);

	return usage > limit ? usage - limit : 0;
}


Johannes Weiner5598.21%150.00%
Jason Low11.79%150.00%

static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page) { unsigned long excess; struct mem_cgroup_per_node *mz; struct mem_cgroup_tree_per_node *mctz; mctz = soft_limit_tree_from_page(page); if (!mctz) return; /* * Necessary to update all ancestors when hierarchy is used. * because their event counter is not touched. */ for (; memcg; memcg = parent_mem_cgroup(memcg)) { mz = mem_cgroup_page_nodeinfo(memcg, page); excess = soft_limit_excess(memcg); /* * We have to update the tree if mz is on RB-tree or * mem is over its softlimit. */ if (excess || mz->on_tree) { unsigned long flags; spin_lock_irqsave(&mctz->lock, flags); /* if on-tree, remove it */ if (mz->on_tree) __mem_cgroup_remove_exceeded(mz, mctz); /* * Insert again. mz->usage_in_excess will be updated. * If excess is 0, no tree ops. */ __mem_cgroup_insert_exceeded(mz, mctz, excess); spin_unlock_irqrestore(&mctz->lock, flags); } } }


Andrew Morton11284.21%116.67%
Johannes Weiner118.27%233.33%
Laurent Dufour64.51%116.67%
Mel Gorman32.26%116.67%
Jianyu Zhan10.75%116.67%

/*
 * Take @memcg off the soft-limit tree of every node that has one.
 */
static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
{
	int nid;

	for_each_node(nid) {
		struct mem_cgroup_per_node *mz;
		struct mem_cgroup_tree_per_node *mctz;

		mz = mem_cgroup_nodeinfo(memcg, nid);
		mctz = soft_limit_tree_node(nid);
		if (!mctz)
			continue;

		mem_cgroup_remove_exceeded(mz, mctz);
	}
}


Andrew Morton3866.67%125.00%
Jianyu Zhan814.04%125.00%
Mel Gorman712.28%125.00%
Laurent Dufour47.02%125.00%

/*
 * Pop rightmost entries off @mctz until one is found whose memcg still
 * exceeds its soft limit and whose css reference can be taken.  Returns
 * that entry (with the css reference held) or NULL once the tree has no
 * rightmost node left.  No locking is done here.
 */
static struct mem_cgroup_per_node *
__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
{
	struct mem_cgroup_per_node *mz;

	while (mctz->rb_rightmost) {
		mz = rb_entry(mctz->rb_rightmost,
			      struct mem_cgroup_per_node, tree_node);
		/*
		 * Remove the node now but someone else can add it back,
		 * we will add it back at the end of reclaim to its correct
		 * position in the tree.
		 */
		__mem_cgroup_remove_exceeded(mz, mctz);

		if (soft_limit_excess(mz->memcg) &&
		    css_tryget_online(&mz->memcg->css))
			return mz;
	}

	return NULL;	/* Nothing to reclaim from */
}


Andrew Morton6272.09%112.50%
Balbir Singh1011.63%112.50%
Davidlohr Bueso A66.98%112.50%
Mel Gorman44.65%112.50%
Johannes Weiner11.16%112.50%
Fengguang Wu11.16%112.50%
Hugh Dickins11.16%112.50%
Tejun Heo11.16%112.50%

/*
 * Locked wrapper around __mem_cgroup_largest_soft_limit_node(): holds
 * mctz->lock with interrupts disabled while the tree is searched.
 */
static struct mem_cgroup_per_node *
mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
{
	struct mem_cgroup_per_node *largest;

	spin_lock_irq(&mctz->lock);
	largest = __mem_cgroup_largest_soft_limit_node(mctz);
	spin_unlock_irq(&mctz->lock);

	return largest;
}


Andrew Morton2352.27%120.00%
Balbir Singh1431.82%120.00%
Mel Gorman36.82%120.00%
Johannes Weiner24.55%120.00%
Michal Hocko24.55%120.00%

/* Current value of @memcg's counter for event index @event. */
static unsigned long memcg_sum_events(struct mem_cgroup *memcg,
				      int event)
{
	unsigned long count = atomic_long_read(&memcg->events[event]);

	return count;
}


Johannes Weiner2488.89%466.67%
Raghavendra K T27.41%116.67%
Matthias Kaehlcke13.70%116.67%

static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, struct page *page, bool compound, int nr_pages) { /* * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is * counted as CACHE even if it's on ANON LRU. */ if (PageAnon(page)) __mod_memcg_state(memcg, MEMCG_RSS, nr_pages); else { __mod_memcg_state(memcg, MEMCG_CACHE, nr_pages); if (PageSwapBacked(page)) __mod_memcg_state(memcg, NR_SHMEM, nr_pages); } if (compound) { VM_BUG_ON_PAGE(!PageTransHuge(page), page); __mod_memcg_state(memcg, MEMCG_RSS_HUGE, nr_pages); } /* pagein of a big page is an event. So, ignore page size */ if (nr_pages > 0) __count_memcg_events(memcg, PGPGIN, 1); else { __count_memcg_events(memcg, PGPGOUT, 1); nr_pages = -nr_pages; /* for event */ } __this_cpu_add(memcg->stat_cpu->nr_page_events, nr_pages); }


Johannes Weiner4331.16%738.89%
Balbir Singh2719.57%15.56%
Kamezawa Hiroyuki2518.12%527.78%
Kirill A. Shutemov1712.32%211.11%
David Rientjes1712.32%15.56%
Raghavendra K T64.35%15.56%
Motohiro Kosaki32.17%15.56%

/*
 * Sum the sizes of @memcg's LRU lists on node @nid, counting only the
 * lists whose bit is set in @lru_mask.
 */
unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
					   int nid, unsigned int lru_mask)
{
	struct lruvec *lruvec = mem_cgroup_lruvec(NODE_DATA(nid), memcg);
	unsigned long total = 0;
	enum lru_list lru;

	VM_BUG_ON((unsigned)nid >= nr_node_ids);

	for_each_lru(lru) {
		if (BIT(lru) & lru_mask)
			total += mem_cgroup_get_lru_size(lruvec, lru);
	}

	return total;
}


Kamezawa Hiroyuki3744.05%222.22%
Michal Hocko2023.81%111.11%
Jianyu Zhan1517.86%111.11%
Ying Han33.57%111.11%
Hugh Dickins33.57%111.11%
Mel Gorman33.57%111.11%
Konstantin Khlebnikov22.38%111.11%
Raghavendra K T11.19%111.11%

static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg, unsigned int lru_mask) { unsigned long nr = 0; int nid; for_each_node_state(nid, N_MEMORY) nr += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask); return nr; }


Ying Han2657.78%120.00%
Kamezawa Hiroyuki817.78%120.00%
Jianyu Zhan817.78%120.00%
Raghavendra K T24.44%120.00%
Lai Jiangshan12.22%120.00%

static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, enum mem_cgroup_events_target target) { unsigned long val, next; val = __this_cpu_read(memcg->stat_cpu->nr_page_events); next = __this_cpu_read(memcg->stat_cpu->targets[target]); /* from time_after() in jiffies.h */ if ((long)(next - val) < 0) { switch (target) { case MEM_CGROUP_TARGET_THRESH: next = val + THRESHOLDS_EVENTS_TARGET; break; case MEM_CGROUP_TARGET_SOFTLIMIT: next = val + SOFTLIMIT_EVENTS_TARGET; break; case MEM_CGROUP_TARGET_NUMAINFO: next = val + NUMAINFO_EVENTS_TARGET; break; default: break; } __this_cpu_write(memcg->stat_cpu->targets[target], next); return true; } return false; }


Johannes Weiner7159.17%545.45%
Kamezawa Hiroyuki2823.33%218.18%
Andrew Morton108.33%19.09%
Michal Hocko43.33%19.09%
Raghavendra K T43.33%19.09%
Steven Rostedt32.50%19.09%

/*
 * Check events in order: the threshold target gates the soft limit and
 * NUMA info targets, so those are only consulted when the threshold
 * target has fired.
 */
static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
{
	/* threshold event is triggered in finer grain than soft limit */
	if (unlikely(mem_cgroup_event_ratelimit(memcg,
						MEM_CGROUP_TARGET_THRESH))) {
		bool softlimit_due;
		bool numainfo_due __maybe_unused;

		/* Sample both ratelimits before acting on any of them. */
		softlimit_due = mem_cgroup_event_ratelimit(memcg,
						MEM_CGROUP_TARGET_SOFTLIMIT);
#if MAX_NUMNODES > 1
		numainfo_due = mem_cgroup_event_ratelimit(memcg,
						MEM_CGROUP_TARGET_NUMAINFO);
#endif
		mem_cgroup_threshold(memcg);

		if (unlikely(softlimit_due))
			mem_cgroup_update_tree(memcg, page);
#if MAX_NUMNODES > 1
		if (unlikely(numainfo_due))
			atomic_inc(&memcg->numainfo_events);
#endif
	}
}


Kamezawa Hiroyuki4745.19%228.57%
Andrew Morton2725.96%228.57%
Johannes Weiner2625.00%228.57%
Raghavendra K T43.85%114.29%

/*
 * Return the memory cgroup that task @p belongs to, or NULL when @p is
 * NULL.
 */
struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
{
	struct cgroup_subsys_state *css;

	/*
	 * mm_update_next_owner() may clear mm->owner to NULL
	 * if it races with swapoff, page migration, etc.
	 * So this can be called with p == NULL.
	 */
	if (unlikely(!p))
		return NULL;

	css = task_css(p, memory_cgrp_id);
	return mem_cgroup_from_css(css);
}


Pavel Emelyanov2057.14%120.00%
Balbir Singh1234.29%120.00%
Tejun Heo25.71%240.00%
Wanpeng Li12.86%120.00%

static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm) { struct mem_cgroup *memcg = NULL; rcu_read_lock(); do { /* * Page cache insertions can happen withou an * actual mm context, e.g. during disk probing * on boot, loopback IO, acct() writes etc. */ if (unlikely(!mm)) memcg = root_mem_cgroup; else { memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); if (unlikely(!memcg)) memcg = root_mem_cgroup; } } while (!css_tryget_online(&memcg->css)); rcu_read_unlock(); return memcg; }


Kamezawa Hiroyuki5666.67%120.00%
Michal Hocko1619.05%120.00%
Johannes Weiner67.14%120.00%
Raghavendra K T55.95%120.00%
Tejun Heo11.19%120.00%

/**
 * mem_cgroup_iter - iterate over memory cgroup hierarchy
 * @root: hierarchy root
 * @prev: previously returned memcg, NULL on first invocation
 * @reclaim: cookie for shared reclaim walks, NULL for full walks
 *
 * Returns references to children of the hierarchy below @root, or
 * @root itself, or %NULL after a full round-trip.
 *
 * Caller must pass the return value in @prev on subsequent
 * invocations for reference counting, or use mem_cgroup_iter_break()
 * to cancel a hierarchy walk before the round-trip is complete.
 *
 * Reclaimers can specify a node and a priority level in @reclaim to
 * divide up the memcgs in the hierarchy among all concurrent
 * reclaimers operating on the same node and priority.
 */
struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
				   struct mem_cgroup *prev,
				   struct mem_cgroup_reclaim_cookie *reclaim)
{
	struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
	struct cgroup_subsys_state *css = NULL;
	struct mem_cgroup *memcg = NULL;
	struct mem_cgroup *pos = NULL;

	if (mem_cgroup_disabled())
		return NULL;

	/* A NULL @root means: walk from the root memory cgroup. */
	if (!root)
		root = root_mem_cgroup;

	/* Full (non-shared) walks simply resume after @prev. */
	if (prev && !reclaim)
		pos = prev;

	/*
	 * A non-hierarchical, non-root @root is a walk of exactly one
	 * group: hand it out on the first call, finish on the second.
	 */
	if (!root->use_hierarchy && root != root_mem_cgroup) {
		if (prev)
			goto out;
		return root;
	}

	rcu_read_lock();

	if (reclaim) {
		struct mem_cgroup_per_node *mz;

		/* The shared iterator is selected by node and priority. */
		mz = mem_cgroup_nodeinfo(root, reclaim->pgdat->node_id);
		iter = &mz->iter[reclaim->priority];

		/*
		 * The shared walk has moved to another generation since
		 * this caller joined it: end the caller's walk.
		 */
		if (prev && reclaim->generation != iter->generation)
			goto out_unlock;

		while (1) {
			pos = READ_ONCE(iter->position);
			if (!pos || css_tryget(&pos->css))
				break;
			/*
			 * css reference reached zero, so iter->position will
			 * be cleared by ->css_released. However, we should not
			 * rely on this happening soon, because ->css_released
			 * is called from a work queue, and by busy-waiting we
			 * might block it. So we clear iter->position right
			 * away.
			 */
			(void)cmpxchg(&iter->position, pos, NULL);
		}
	}

	if (pos)
		css = &pos->css;

	for (;;) {
		css = css_next_descendant_pre(css, &root->css);
		if (!css) {
			/*
			 * Reclaimers share the hierarchy walk, and a
			 * new one might jump in right at the end of
			 * the hierarchy - make sure they see at least
			 * one group and restart from the beginning.
			 */
			if (!prev)
				continue;
			break;
		}

		/*
		 * Verify the css and acquire a reference.  The root
		 * is provided by the caller, so we know it's alive
		 * and kicking, and don't take an extra reference.
		 */
		memcg = mem_cgroup_from_css(css);

		if (css == &root->css)
			break;

		if (css_tryget(css))
			break;

		/* Could not get a reference: skip this css. */
		memcg = NULL;
	}

	if (reclaim) {
		/*
		 * The position could have already been updated by a competing
		 * thread, so check that the value hasn't changed since we read
		 * it to avoid reclaiming from the same cgroup twice.
		 */
		(void)cmpxchg(&iter->position, pos, memcg);

		/* Drop the reference taken on the old position above. */
		if (pos)
			css_put(&pos->css);

		/*
		 * The end of a round-trip starts a new generation; a
		 * caller on its first invocation records the generation
		 * it joined.
		 */
		if (!memcg)
			iter->generation++;
		else if (!prev)
			reclaim->generation = iter->generation;
	}

out_unlock:
	rcu_read_unlock();
out:
	/* @prev's reference is consumed here; @root never carried one. */
	if (prev && prev != root)
		css_put(&prev->css);

	return memcg;
}


Johannes Weiner23063.89%730.43%
Michal Hocko7621.11%730.43%
Vladimir Davydov298.06%14.35%
Kamezawa Hiroyuki143.89%28.70%
Mel Gorman51.39%14.35%
Jianyu Zhan20.56%14.35%
Raghavendra K T10.28%14.35%
Jason Low10.28%14.35%
Tejun Heo10.28%14.35%
Andrew Morton10.28%14.35%

/**
 * mem_cgroup_iter_break - abort a hierarchy walk prematurely
 * @root: hierarchy root
 * @prev: last visited hierarchy member as returned by mem_cgroup_iter()
 *
 * Drops the css reference held on @prev, unless @prev is NULL or is the
 * hierarchy root.  A NULL @root stands for the root memory cgroup.
 */
void mem_cgroup_iter_break(struct mem_cgroup *root,
			   struct mem_cgroup *prev)
{
	struct mem_cgroup *top = root ? root : root_mem_cgroup;

	if (!prev || prev == top)
		return;

	css_put(&prev->css);
}


Kamezawa Hiroyuki2870.00%375.00%
Johannes Weiner1230.00%125.00%

static void invalidate_reclaim_iterators(struct