cregit-Linux how code gets into the kernel

Release 4.10 arch/x86/mm/tlb.c

Directory: arch/x86/mm
#include <linux/init.h>

#include <linux/mm.h>
#include <linux/spinlock.h>
#include <linux/smp.h>
#include <linux/interrupt.h>
#include <linux/export.h>
#include <linux/cpu.h>

#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
#include <asm/cache.h>
#include <asm/apic.h>
#include <asm/uv/uv.h>
#include <linux/debugfs.h>

/*
 *      Smarter SMP flushing macros.
 *              c/o Linus Torvalds.
 *
 *      These mean you can really definitely utterly forget about
 *      writing to user space from interrupts. (It's not allowed anyway).
 *
 *      Optimizations Manfred Spraul <manfred@colorfullife.com>
 *
 *      More scalable flush, from Andi Kleen
 *
 *      Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi
 */

#ifdef CONFIG_SMP


/*
 * Argument block handed to the cross-CPU flush callbacks in this file
 * (flush_tlb_func() and do_kernel_range_flush()).
 */
struct flush_tlb_info {
	/*
	 * mm being flushed.  flush_tlb_func() ignores the request when this
	 * is non-NULL and differs from the receiving CPU's active_mm.
	 */
	struct mm_struct *flush_mm;
	/* First virtual address to flush. */
	unsigned long flush_start;
	/*
	 * End of the range (exclusive; the callbacks loop while
	 * addr < flush_end), or TLB_FLUSH_ALL, which flush_tlb_func()
	 * treats as a request for a full flush.
	 */
	unsigned long flush_end;
};

/*
 * We cannot call mmdrop() because we are in interrupt context,
 * instead update mm->cpu_vm_mask.
 */

void leave_mm(int cpu) { struct mm_struct *active_mm = this_cpu_read(cpu_tlbstate.active_mm); if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) BUG(); if (cpumask_test_cpu(cpu, mm_cpumask(active_mm))) { cpumask_clear_cpu(cpu, mm_cpumask(active_mm)); load_cr3(swapper_pg_dir); /* * This gets called in the idle path where RCU * functions differently. Tracing normally * uses RCU, so we have to call the tracepoint * specially here. */ trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); } }

Contributors

| Person | Tokens | Prop | Commits | CommitProp |
|---|---|---|---|---|
| glauber de oliveira costa | 30 | 42.25% | 1 | 12.50% |
| suresh siddha | 25 | 35.21% | 1 | 12.50% |
| dave hansen | 8 | 11.27% | 2 | 25.00% |
| rusty russell | 3 | 4.23% | 1 | 12.50% |
| brian gerst | 3 | 4.23% | 1 | 12.50% |
| linus torvalds | 1 | 1.41% | 1 | 12.50% |
| alex shi | 1 | 1.41% | 1 | 12.50% |
| Total | 71 | 100.00% | 8 | 100.00% |

/* Export leave_mm() for use by GPL-licensed modules. */
EXPORT_SYMBOL_GPL(leave_mm);
#endif /* CONFIG_SMP */
/*
 * switch_mm - interrupt-safe wrapper around switch_mm_irqs_off().
 * @prev: mm being switched away from
 * @next: mm being switched to
 * @tsk:  passed through unchanged to switch_mm_irqs_off()
 *
 * Saves the local interrupt state, performs the switch, then restores
 * the saved state.
 */
void switch_mm(struct mm_struct *prev, struct mm_struct *next,
	       struct task_struct *tsk)
{
	unsigned long irq_state;

	local_irq_save(irq_state);
	switch_mm_irqs_off(prev, next, tsk);
	local_irq_restore(irq_state);
}

Contributors

| Person | Tokens | Prop | Commits | CommitProp |
|---|---|---|---|---|
| andy lutomirski | 43 | 100.00% | 2 | 100.00% |
| Total | 43 | 100.00% | 2 | 100.00% |


/*
 * switch_mm_irqs_off - make @next the address space loaded on this CPU.
 * @prev: mm being switched away from
 * @next: mm being switched to
 * @tsk:  not referenced in this function body
 *
 * Two cases are handled:
 *  - prev != next: publish the new per-CPU TLB state (SMP only), set this
 *    CPU in next's cpumask, load next's page tables, clear this CPU from
 *    prev's cpumask, then load the per-mm CR4 state and, if it differs,
 *    the LDT.
 *  - prev == next (SMP only): mark the CPU TLBSTATE_OK again and, if
 *    this CPU's bit is no longer set in next's cpumask, set it and
 *    reload CR3, CR4 state and LDT.
 *
 * The in-body comments state that irqs are blocked while this runs;
 * switch_mm() is the wrapper in this file that disables them itself.
 */
void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
			struct task_struct *tsk)
{
	unsigned cpu = smp_processor_id();

	if (likely(prev != next)) {
		if (IS_ENABLED(CONFIG_VMAP_STACK)) {
			/*
			 * If our current stack is in vmalloc space and isn't
			 * mapped in the new pgd, we'll double-fault.  Forcibly
			 * map it.
			 */
			unsigned int stack_pgd_index = pgd_index(current_stack_pointer());

			pgd_t *pgd = next->pgd + stack_pgd_index;

			if (unlikely(pgd_none(*pgd)))
				set_pgd(pgd, init_mm.pgd[stack_pgd_index]);
		}

#ifdef CONFIG_SMP
		this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK);
		this_cpu_write(cpu_tlbstate.active_mm, next);
#endif

		cpumask_set_cpu(cpu, mm_cpumask(next));

		/*
		 * Re-load page tables.
		 *
		 * This logic has an ordering constraint:
		 *
		 *  CPU 0: Write to a PTE for 'next'
		 *  CPU 0: load bit 1 in mm_cpumask.  if nonzero, send IPI.
		 *  CPU 1: set bit 1 in next's mm_cpumask
		 *  CPU 1: load from the PTE that CPU 0 writes (implicit)
		 *
		 * We need to prevent an outcome in which CPU 1 observes
		 * the new PTE value and CPU 0 observes bit 1 clear in
		 * mm_cpumask.  (If that occurs, then the IPI will never
		 * be sent, and CPU 0's TLB will contain a stale entry.)
		 *
		 * The bad outcome can occur if either CPU's load is
		 * reordered before that CPU's store, so both CPUs must
		 * execute full barriers to prevent this from happening.
		 *
		 * Thus, switch_mm needs a full barrier between the
		 * store to mm_cpumask and any operation that could load
		 * from next->pgd.  TLB fills are special and can happen
		 * due to instruction fetches or for no reason at all,
		 * and neither LOCK nor MFENCE orders them.
		 * Fortunately, load_cr3() is serializing and gives the
		 * ordering guarantee we need.
		 *
		 */
		load_cr3(next->pgd);

		trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);

		/* Stop flush ipis for the previous mm */
		cpumask_clear_cpu(cpu, mm_cpumask(prev));

		/* Load per-mm CR4 state */
		load_mm_cr4(next);

#ifdef CONFIG_MODIFY_LDT_SYSCALL
		/*
		 * Load the LDT, if the LDT is different.
		 *
		 * It's possible that prev->context.ldt doesn't match
		 * the LDT register.  This can happen if leave_mm(prev)
		 * was called and then modify_ldt changed
		 * prev->context.ldt but suppressed an IPI to this CPU.
		 * In this case, prev->context.ldt != NULL, because we
		 * never set context.ldt to NULL while the mm still
		 * exists.  That means that next->context.ldt !=
		 * prev->context.ldt, because mms never share an LDT.
		 */
		if (unlikely(prev->context.ldt != next->context.ldt))
			load_mm_ldt(next);
#endif
	}
#ifdef CONFIG_SMP
	  else {
		this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK);
		/* Same-mm switch: the per-CPU active_mm must already be next. */
		BUG_ON(this_cpu_read(cpu_tlbstate.active_mm) != next);

		if (!cpumask_test_cpu(cpu, mm_cpumask(next))) {
			/*
			 * On established mms, the mm_cpumask is only changed
			 * from irq context, from ptep_clear_flush() while in
			 * lazy tlb mode, and here.  Irqs are blocked during
			 * schedule, protecting us from simultaneous changes.
			 */
			cpumask_set_cpu(cpu, mm_cpumask(next));

			/*
			 * We were in lazy tlb mode and leave_mm disabled
			 * tlb flush IPI delivery.  We must reload CR3
			 * to make sure to use no freed page tables.
			 *
			 * As above, load_cr3() is serializing and orders TLB
			 * fills with respect to the mm_cpumask write.
			 */
			load_cr3(next->pgd);
			trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
			load_mm_cr4(next);
			load_mm_ldt(next);
		}
	}
#endif
}

Contributors

| Person | Tokens | Prop | Commits | CommitProp |
|---|---|---|---|---|
| andy lutomirski | 263 | 100.00% | 3 | 100.00% |
| Total | 263 | 100.00% | 3 | 100.00% |

#ifdef CONFIG_SMP

/*
 * The flush IPI assumes that a thread switch happens in this order:
 * [cpu0: the cpu that switches]
 * 1) switch_mm() either 1a) or 1b)
 * 1a) thread switch to a different mm
 * 1a1) set cpu_tlbstate to TLBSTATE_OK
 *	Now the tlb flush NMI handler flush_tlb_func won't call leave_mm
 *	if cpu0 was in lazy tlb mode.
 * 1a2) update cpu active_mm
 *	Now cpu0 accepts tlb flushes for the new mm.
 * 1a3) cpu_set(cpu, new_mm->cpu_vm_mask);
 *	Now the other cpus will send tlb flush ipis.
 * 1a4) change cr3.
 * 1a5) cpu_clear(cpu, old_mm->cpu_vm_mask);
 *	Stop ipi delivery for the old mm. This is not synchronized with
 *	the other cpus, but flush_tlb_func ignores flush ipis for the wrong
 *	mm, and in the worst case we perform a superfluous tlb flush.
 * 1b) thread switch without mm change
 *	cpu active_mm is correct, cpu0 already handles flush ipis.
 * 1b1) set cpu_tlbstate to TLBSTATE_OK
 * 1b2) test_and_set the cpu bit in cpu_vm_mask.
 *	Atomically set the bit [other cpus will start sending flush ipis],
 *	and test the bit.
 * 1b3) if the bit was 0: leave_mm was called, flush the tlb.
 * 2) switch %%esp, ie current
 *
 * The interrupt must handle 2 special cases:
 * - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm.
 * - the cpu performs speculative tlb reads, i.e. even if the cpu only
 *   runs in kernel space, the cpu could load tlb entries for user space
 *   pages.
 *
 * The good news is that cpu_tlbstate is local to each cpu, no
 * write/read ordering problems.
 */

/*
 * TLB flush function:
 * 1) Flush the tlb entries if the cpu uses the mm that's being flushed.
 * 2) Leave the mm if we are in the lazy tlb mode.
 */
static void flush_tlb_func(void *info) { struct flush_tlb_info *f = info; inc_irq_stat(irq_tlb_count); if (f->flush_mm && f->flush_mm != this_cpu_read(cpu_tlbstate.active_mm)) return; count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED); if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) { if (f->flush_end == TLB_FLUSH_ALL) { local_flush_tlb(); trace_tlb_flush(TLB_REMOTE_SHOOTDOWN, TLB_FLUSH_ALL); } else { unsigned long addr; unsigned long nr_pages = (f->flush_end - f->flush_start) / PAGE_SIZE; addr = f->flush_start; while (addr < f->flush_end) { __flush_tlb_single(addr); addr += PAGE_SIZE; } trace_tlb_flush(TLB_REMOTE_SHOOTDOWN, nr_pages); } } else leave_mm(smp_processor_id()); }

Contributors

| Person | Tokens | Prop | Commits | CommitProp |
|---|---|---|---|---|
| glauber de oliveira costa | 44 | 31.43% | 1 | 8.33% |
| alex shi | 44 | 31.43% | 3 | 25.00% |
| dave hansen | 36 | 25.71% | 3 | 25.00% |
| brian gerst | 5 | 3.57% | 1 | 8.33% |
| tomoki sekiyama | 5 | 3.57% | 1 | 8.33% |
| nadav amit | 4 | 2.86% | 1 | 8.33% |
| tejun heo | 1 | 0.71% | 1 | 8.33% |
| mel gorman | 1 | 0.71% | 1 | 8.33% |
| Total | 140 | 100.00% | 12 | 100.00% |


void native_flush_tlb_others(const struct cpumask *cpumask, struct mm_struct *mm, unsigned long start, unsigned long end) { struct flush_tlb_info info; if (end == 0) end = start + PAGE_SIZE; info.flush_mm = mm; info.flush_start = start; info.flush_end = end; count_vm_tlb_event(NR_TLB_REMOTE_FLUSH); if (end == TLB_FLUSH_ALL) trace_tlb_flush(TLB_REMOTE_SEND_IPI, TLB_FLUSH_ALL); else trace_tlb_flush(TLB_REMOTE_SEND_IPI, (end - start) >> PAGE_SHIFT); if (is_uv_system()) { unsigned int cpu; cpu = smp_processor_id(); cpumask = uv_flush_tlb_others(cpumask, mm, start, end, cpu); if (cpumask) smp_call_function_many(cpumask, flush_tlb_func, &info, 1); return; } smp_call_function_many(cpumask, flush_tlb_func, &info, 1); }

Contributors

| Person | Tokens | Prop | Commits | CommitProp |
|---|---|---|---|---|
| alex shi | 34 | 22.67% | 2 | 15.38% |
| nadav amit | 30 | 20.00% | 1 | 7.69% |
| rusty russell | 26 | 17.33% | 1 | 7.69% |
| glauber de oliveira costa | 23 | 15.33% | 1 | 7.69% |
| tejun heo | 15 | 10.00% | 1 | 7.69% |
| mel gorman | 10 | 6.67% | 2 | 15.38% |
| li shaohua | 4 | 2.67% | 1 | 7.69% |
| dave hansen | 4 | 2.67% | 1 | 7.69% |
| linus torvalds | 2 | 1.33% | 1 | 7.69% |
| xiao guangrong | 1 | 0.67% | 1 | 7.69% |
| mike travis | 1 | 0.67% | 1 | 7.69% |
| Total | 150 | 100.00% | 13 | 100.00% |


void flush_tlb_current_task(void) { struct mm_struct *mm = current->mm; preempt_disable(); count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); /* This is an implicit full barrier that synchronizes with switch_mm. */ local_flush_tlb(); trace_tlb_flush(TLB_LOCAL_SHOOTDOWN, TLB_FLUSH_ALL); if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL); preempt_enable(); }

Contributors

| Person | Tokens | Prop | Commits | CommitProp |
|---|---|---|---|---|
| glauber de oliveira costa | 38 | 56.72% | 1 | 12.50% |
| rusty russell | 14 | 20.90% | 2 | 25.00% |
| dave hansen | 11 | 16.42% | 2 | 25.00% |
| alex shi | 2 | 2.99% | 1 | 12.50% |
| andy lutomirski | 1 | 1.49% | 1 | 12.50% |
| mel gorman | 1 | 1.49% | 1 | 12.50% |
| Total | 67 | 100.00% | 8 | 100.00% |

/*
 * See Documentation/x86/tlb.txt for details.  We choose 33
 * because it is large enough to cover the vast majority (at
 * least 95%) of allocations, and is small enough that we are
 * confident it will not cause too much overhead.  Each single
 * flush is about 100 ns, so this caps the maximum overhead at
 * _about_ 3,000 ns.
 *
 * This is in units of pages.
 *
 * Ranges larger than this are flushed with a full TLB flush rather than
 * page by page (see flush_tlb_mm_range() and flush_tlb_kernel_range()).
 * The value can be read and written at runtime through the debugfs file
 * set up by create_tlb_single_page_flush_ceiling().
 */
static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33;
void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, unsigned long end, unsigned long vmflag) { unsigned long addr; /* do a global flush by default */ unsigned long base_pages_to_flush = TLB_FLUSH_ALL; preempt_disable(); if (current->active_mm != mm) { /* Synchronize with switch_mm. */ smp_mb(); goto out; } if (!current->mm) { leave_mm(smp_processor_id()); /* Synchronize with switch_mm. */ smp_mb(); goto out; } if ((end != TLB_FLUSH_ALL) && !(vmflag & VM_HUGETLB)) base_pages_to_flush = (end - start) >> PAGE_SHIFT; /* * Both branches below are implicit full barriers (MOV to CR or * INVLPG) that synchronize with switch_mm. */ if (base_pages_to_flush > tlb_single_page_flush_ceiling) { base_pages_to_flush = TLB_FLUSH_ALL; count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); local_flush_tlb(); } else { /* flush range by one by one 'invlpg' */ for (addr = start; addr < end; addr += PAGE_SIZE) { count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE); __flush_tlb_single(addr); } } trace_tlb_flush(TLB_LOCAL_MM_SHOOTDOWN, base_pages_to_flush); out: if (base_pages_to_flush == TLB_FLUSH_ALL) { start = 0UL; end = TLB_FLUSH_ALL; } if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) flush_tlb_others(mm_cpumask(mm), mm, start, end); preempt_enable(); }

Contributors

| Person | Tokens | Prop | Commits | CommitProp |
|---|---|---|---|---|
| alex shi | 125 | 60.10% | 3 | 23.08% |
| dave hansen | 56 | 26.92% | 5 | 38.46% |
| andy lutomirski | 11 | 5.29% | 1 | 7.69% |
| mel gorman | 9 | 4.33% | 2 | 15.38% |
| glauber de oliveira costa | 6 | 2.88% | 1 | 7.69% |
| joonsoo kim | 1 | 0.48% | 1 | 7.69% |
| Total | 208 | 100.00% | 13 | 100.00% |


void flush_tlb_page(struct vm_area_struct *vma, unsigned long start) { struct mm_struct *mm = vma->vm_mm; preempt_disable(); if (current->active_mm == mm) { if (current->mm) { /* * Implicit full barrier (INVLPG) that synchronizes * with switch_mm. */ __flush_tlb_one(start); } else { leave_mm(smp_processor_id()); /* Synchronize with switch_mm. */ smp_mb(); } } if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) flush_tlb_others(mm_cpumask(mm), mm, start, 0UL); preempt_enable(); }

Contributors

| Person | Tokens | Prop | Commits | CommitProp |
|---|---|---|---|---|
| glauber de oliveira costa | 67 | 70.53% | 1 | 20.00% |
| rusty russell | 14 | 14.74% | 2 | 40.00% |
| andy lutomirski | 9 | 9.47% | 1 | 20.00% |
| alex shi | 5 | 5.26% | 1 | 20.00% |
| Total | 95 | 100.00% | 5 | 100.00% |


/*
 * do_flush_tlb_all - per-CPU callback that flushes the entire TLB.
 * @info: unused
 *
 * After flushing, a CPU whose state is TLBSTATE_LAZY also calls
 * leave_mm() for itself.
 */
static void do_flush_tlb_all(void *info)
{
	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
	__flush_tlb_all();

	if (this_cpu_read(cpu_tlbstate.state) != TLBSTATE_LAZY)
		return;

	leave_mm(smp_processor_id());
}

Contributors

| Person | Tokens | Prop | Commits | CommitProp |
|---|---|---|---|---|
| glauber de oliveira costa | 24 | 68.57% | 1 | 16.67% |
| dave hansen | 4 | 11.43% | 1 | 16.67% |
| brian gerst | 3 | 8.57% | 1 | 16.67% |
| borislav petkov | 2 | 5.71% | 1 | 16.67% |
| alex shi | 1 | 2.86% | 1 | 16.67% |
| mel gorman | 1 | 2.86% | 1 | 16.67% |
| Total | 35 | 100.00% | 6 | 100.00% |


/*
 * flush_tlb_all - run do_flush_tlb_all() on every CPU.
 *
 * Accounts one NR_TLB_REMOTE_FLUSH event, then dispatches the callback
 * via on_each_cpu() with a NULL argument and a final argument of 1.
 */
void flush_tlb_all(void)
{
	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);

	on_each_cpu(do_flush_tlb_all, NULL, 1);
}

Contributors

| Person | Tokens | Prop | Commits | CommitProp |
|---|---|---|---|---|
| glauber de oliveira costa | 16 | 76.19% | 1 | 33.33% |
| dave hansen | 4 | 19.05% | 1 | 33.33% |
| mel gorman | 1 | 4.76% | 1 | 33.33% |
| Total | 21 | 100.00% | 3 | 100.00% |


/*
 * do_kernel_range_flush - per-CPU callback for flush_tlb_kernel_range().
 * @info: the sender's struct flush_tlb_info; only flush_start and
 *        flush_end are read
 *
 * Invalidates [flush_start, flush_end) one page at a time.
 */
static void do_kernel_range_flush(void *info)
{
	struct flush_tlb_info *req = info;
	unsigned long va = req->flush_start;

	/* flush range by one by one 'invlpg' */
	while (va < req->flush_end) {
		__flush_tlb_single(va);
		va += PAGE_SIZE;
	}
}

Contributors

| Person | Tokens | Prop | Commits | CommitProp |
|---|---|---|---|---|
| alex shi | 45 | 100.00% | 1 | 100.00% |
| Total | 45 | 100.00% | 1 | 100.00% |


/*
 * flush_tlb_kernel_range - flush kernel translations for [@start, @end).
 * @start: first virtual address of the range
 * @end:   end of the range, or TLB_FLUSH_ALL
 *
 * Runs on every CPU through on_each_cpu().  A request for everything, or
 * a range longer than tlb_single_page_flush_ceiling pages, becomes a full
 * flush (do_flush_tlb_all()); anything smaller is flushed page by page
 * (do_kernel_range_flush()).
 */
void flush_tlb_kernel_range(unsigned long start, unsigned long end)
{
	struct flush_tlb_info info;

	/* Balance as user space task's flush, a bit conservative */
	if (end == TLB_FLUSH_ALL ||
	    (end - start) > tlb_single_page_flush_ceiling * PAGE_SIZE) {
		on_each_cpu(do_flush_tlb_all, NULL, 1);
		return;
	}

	/* flush_mm is left unset: do_kernel_range_flush() never reads it. */
	info.flush_start = start;
	info.flush_end = end;
	on_each_cpu(do_kernel_range_flush, &info, 1);
}

Contributors

| Person | Tokens | Prop | Commits | CommitProp |
|---|---|---|---|---|
| alex shi | 61 | 87.14% | 1 | 50.00% |
| dave hansen | 9 | 12.86% | 1 | 50.00% |
| Total | 70 | 100.00% | 2 | 100.00% |


/*
 * tlbflush_read_file - debugfs read handler for the flush ceiling.
 * @file:     unused
 * @user_buf: user buffer to copy the text into
 * @count:    size of @user_buf
 * @ppos:     file position, advanced by simple_read_from_buffer()
 *
 * Formats tlb_single_page_flush_ceiling as a decimal number followed by
 * a newline and hands it to simple_read_from_buffer(), whose result is
 * returned unchanged.
 */
static ssize_t tlbflush_read_file(struct file *file, char __user *user_buf,
				  size_t count, loff_t *ppos)
{
	char buf[32];
	unsigned int len;

	/*
	 * The ceiling is an unsigned long, so print it with %lu (the old
	 * %ld mismatched the argument type).  The write is bounded by the
	 * buffer size; 32 bytes is more than the longest 64-bit decimal
	 * plus newline and NUL, so the output is never truncated.
	 */
	len = snprintf(buf, sizeof(buf), "%lu\n",
		       tlb_single_page_flush_ceiling);

	return simple_read_from_buffer(user_buf, count, ppos, buf, len);
}

Contributors

| Person | Tokens | Prop | Commits | CommitProp |
|---|---|---|---|---|
| dave hansen | 58 | 100.00% | 1 | 100.00% |
| Total | 58 | 100.00% | 1 | 100.00% |


/*
 * tlbflush_write_file - debugfs write handler for the flush ceiling.
 * @file:     unused
 * @user_buf: user buffer holding the new value as text
 * @count:    number of bytes supplied by the writer
 * @ppos:     unused
 *
 * At most sizeof(buffer) - 1 bytes are copied in and parsed with
 * kstrtoint() (base 0).  Returns @count on success, -EFAULT if the copy
 * from user space fails, and -EINVAL if the text does not parse or the
 * value is negative.
 */
static ssize_t tlbflush_write_file(struct file *file,
		 const char __user *user_buf, size_t count, loff_t *ppos)
{
	char kbuf[32];
	ssize_t nbytes;
	int value;

	/* Leave room for the terminating NUL. */
	nbytes = min(count, sizeof(kbuf) - 1);
	if (copy_from_user(kbuf, user_buf, nbytes))
		return -EFAULT;
	kbuf[nbytes] = '\0';

	/* value is only inspected once kstrtoint() has succeeded. */
	if (kstrtoint(kbuf, 0, &value) || value < 0)
		return -EINVAL;

	tlb_single_page_flush_ceiling = value;
	return count;
}

Contributors

| Person | Tokens | Prop | Commits | CommitProp |
|---|---|---|---|---|
| dave hansen | 105 | 100.00% | 1 | 100.00% |
| Total | 105 | 100.00% | 1 | 100.00% |

/*
 * File operations for the tlb_single_page_flush_ceiling debugfs file:
 * reads and writes go to the two handlers above, seeking uses
 * default_llseek.
 */
static const struct file_operations fops_tlbflush = {
	.read = tlbflush_read_file,
	.write = tlbflush_write_file,
	.llseek = default_llseek,
};
/*
 * create_tlb_single_page_flush_ceiling - create the debugfs tunable.
 *
 * Creates "tlb_single_page_flush_ceiling" under arch_debugfs_dir with
 * owner read/write permission, backed by fops_tlbflush.  The result of
 * debugfs_create_file() is not checked; the function always returns 0.
 */
static int __init create_tlb_single_page_flush_ceiling(void)
{
	debugfs_create_file("tlb_single_page_flush_ceiling",
			    S_IRUSR | S_IWUSR,
			    arch_debugfs_dir,
			    NULL,
			    &fops_tlbflush);

	return 0;
}

Contributors

| Person | Tokens | Prop | Commits | CommitProp |
|---|---|---|---|---|
| dave hansen | 28 | 100.00% | 1 | 100.00% |
| Total | 28 | 100.00% | 1 | 100.00% |

/* Register the debugfs setup function as a late initcall. */
late_initcall(create_tlb_single_page_flush_ceiling);

#endif /* CONFIG_SMP */

Overall Contributors

| Person | Tokens | Prop | Commits | CommitProp |
|---|---|---|---|---|
| dave hansen | 358 | 23.65% | 9 | 20.00% |
| andy lutomirski | 339 | 22.39% | 5 | 11.11% |
| alex shi | 330 | 21.80% | 7 | 15.56% |
| glauber de oliveira costa | 293 | 19.35% | 2 | 4.44% |
| rusty russell | 57 | 3.76% | 2 | 4.44% |
| nadav amit | 34 | 2.25% | 2 | 4.44% |
| suresh siddha | 25 | 1.65% | 1 | 2.22% |
| mel gorman | 23 | 1.52% | 3 | 6.67% |
| tejun heo | 18 | 1.19% | 2 | 4.44% |
| brian gerst | 11 | 0.73% | 1 | 2.22% |
| li shaohua | 7 | 0.46% | 1 | 2.22% |
| tomoki sekiyama | 5 | 0.33% | 1 | 2.22% |
| linus torvalds | 3 | 0.20% | 2 | 4.44% |
| jan beulich | 3 | 0.20% | 1 | 2.22% |
| jeremiah mahler | 2 | 0.13% | 1 | 2.22% |
| borislav petkov | 2 | 0.13% | 1 | 2.22% |
| joonsoo kim | 1 | 0.07% | 1 | 2.22% |
| xiao guangrong | 1 | 0.07% | 1 | 2.22% |
| mike travis | 1 | 0.07% | 1 | 2.22% |
| paul gortmaker | 1 | 0.07% | 1 | 2.22% |
| Total | 1514 | 100.00% | 45 | 100.00% |
Directory: arch/x86/mm
Information contained on this website is for historical information purposes only and does not indicate or represent copyright ownership.