cregit-Linux: how code gets into the kernel

Release 4.14 arch/powerpc/kvm/book3s_64_mmu_hv.c

Directory: arch/powerpc/kvm
/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License, version 2, as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
 *
 * Copyright 2010 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
 */

#include <linux/types.h>
#include <linux/string.h>
#include <linux/kvm.h>
#include <linux/kvm_host.h>
#include <linux/highmem.h>
#include <linux/gfp.h>
#include <linux/slab.h>
#include <linux/hugetlb.h>
#include <linux/vmalloc.h>
#include <linux/srcu.h>
#include <linux/anon_inodes.h>
#include <linux/file.h>
#include <linux/debugfs.h>

#include <asm/tlbflush.h>
#include <asm/kvm_ppc.h>
#include <asm/kvm_book3s.h>
#include <asm/book3s/64/mmu-hash.h>
#include <asm/hvcall.h>
#include <asm/synch.h>
#include <asm/ppc-opcode.h>
#include <asm/cputable.h>
#include <asm/pte-walk.h>

#include "trace_hv.h"

//#define DEBUG_RESIZE_HPT      1

#ifdef DEBUG_RESIZE_HPT
#define resize_hpt_debug(resize, ...)                           \
        do {                                                    \
                printk(KERN_DEBUG "RESIZE HPT %p: ", resize);   \
                printk(__VA_ARGS__);                            \
        } while (0)
#else
#define resize_hpt_debug(resize, ...)                           \
        do { } while (0)
#endif

static long kvmppc_virtmode_do_h_enter(struct kvm *kvm, unsigned long flags,
				long pte_index, unsigned long pteh,
				unsigned long ptel, unsigned long *pte_idx_ret);


struct kvm_resize_hpt {
        /* These fields read-only after init */
        struct kvm *kvm;
        struct work_struct work;
        u32 order;

        /* These fields protected by kvm->lock */
        int error;
        bool prepare_done;

        /* Private to the work thread, until prepare_done is true,
         * then protected by kvm->resize_hpt_sem */
        struct kvm_hpt_info hpt;
};

static void kvmppc_rmap_reset(struct kvm *kvm);


int kvmppc_allocate_hpt(struct kvm_hpt_info *info, u32 order)
{
        unsigned long hpt = 0;
        int cma = 0;
        struct page *page = NULL;
        struct revmap_entry *rev;
        unsigned long npte;

        if ((order < PPC_MIN_HPT_ORDER) || (order > PPC_MAX_HPT_ORDER))
                return -EINVAL;

        page = kvm_alloc_hpt_cma(1ul << (order - PAGE_SHIFT));
        if (page) {
                hpt = (unsigned long)pfn_to_kaddr(page_to_pfn(page));
                memset((void *)hpt, 0, (1ul << order));
                cma = 1;
        }

        if (!hpt)
                hpt = __get_free_pages(GFP_KERNEL|__GFP_ZERO|__GFP_RETRY_MAYFAIL
                                       |__GFP_NOWARN, order - PAGE_SHIFT);

        if (!hpt)
                return -ENOMEM;

        /* HPTEs are 2**4 bytes long */
        npte = 1ul << (order - 4);

        /* Allocate reverse map array */
        rev = vmalloc(sizeof(struct revmap_entry) * npte);
        if (!rev) {
                pr_err("kvmppc_allocate_hpt: Couldn't alloc reverse map array\n");
                if (cma)
                        kvm_free_hpt_cma(page, 1 << (order - PAGE_SHIFT));
                else
                        free_pages(hpt, order - PAGE_SHIFT);
                return -ENOMEM;
        }

        info->order = order;
        info->virt = hpt;
        info->cma = cma;
        info->rev = rev;

        return 0;
}

Contributors

Person              Tokens    Prop  Commits  CommitProp
David Gibson           100  42.02%        3      23.08%
Paul Mackerras          75  31.51%        3      23.08%
Aneesh Kumar K.V        27  11.34%        2      15.38%
JoonSoo Kim             16   6.72%        1       7.69%
Alexander Graf          16   6.72%        1       7.69%
Laurent Dufour           2   0.84%        1       7.69%
Yongji Xie               1   0.42%        1       7.69%
Michal Hocko             1   0.42%        1       7.69%
Total                  238 100.00%       13     100.00%
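
The order argument fixes both the byte size of the hash page table (2^order) and, because each HPTE is 16 bytes, the number of entries (2^(order-4)) that the reverse-map array must cover. A minimal standalone sketch of that arithmetic, using an assumed order of 24 purely as an example (this is illustrative userspace code, not part of the kernel source above):

#include <stdio.h>

int main(void)
{
        unsigned int order = 24;                   /* assumed example: 16 MiB HPT */
        unsigned long bytes  = 1ul << order;       /* total HPT size in bytes */
        unsigned long npte   = 1ul << (order - 4); /* HPTEs are 2**4 bytes long */
        unsigned long hptegs = npte >> 3;          /* 8 HPTEs per HPTEG */

        printf("order %u: %lu bytes, %lu HPTEs, %lu HPTEGs\n",
               order, bytes, npte, hptegs);
        return 0;
}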


void kvmppc_set_hpt(struct kvm *kvm, struct kvm_hpt_info *info)
{
        atomic64_set(&kvm->arch.mmio_update, 0);
        kvm->arch.hpt = *info;
        kvm->arch.sdr1 = __pa(info->virt) | (info->order - 18);

        pr_debug("KVM guest htab at %lx (order %ld), LPID %x\n",
                 info->virt, (long)info->order, kvm->arch.lpid);
}

Contributors

Person              Tokens    Prop  Commits  CommitProp
David Gibson            42  53.16%        2      33.33%
Paul Mackerras          36  45.57%        3      50.00%
Thomas Huth              1   1.27%        1      16.67%
Total                   79 100.00%        6     100.00%


long kvmppc_alloc_reset_hpt(struct kvm *kvm, int order)
{
        long err = -EBUSY;
        struct kvm_hpt_info info;

        if (kvm_is_radix(kvm))
                return -EINVAL;

        mutex_lock(&kvm->lock);
        if (kvm->arch.hpte_setup_done) {
                kvm->arch.hpte_setup_done = 0;
                /* order hpte_setup_done vs. vcpus_running */
                smp_mb();
                if (atomic_read(&kvm->arch.vcpus_running)) {
                        kvm->arch.hpte_setup_done = 1;
                        goto out;
                }
        }
        if (kvm->arch.hpt.order == order) {
                /* We already have a suitable HPT */

                /* Set the entire HPT to 0, i.e. invalid HPTEs */
                memset((void *)kvm->arch.hpt.virt, 0, 1ul << order);
                /*
                 * Reset all the reverse-mapping chains for all memslots
                 */
                kvmppc_rmap_reset(kvm);
                /* Ensure that each vcpu will flush its TLB on next entry. */
                cpumask_setall(&kvm->arch.need_tlb_flush);
                err = 0;
                goto out;
        }

        if (kvm->arch.hpt.virt) {
                kvmppc_free_hpt(&kvm->arch.hpt);
                kvmppc_rmap_reset(kvm);
        }

        err = kvmppc_allocate_hpt(&info, order);
        if (err < 0)
                goto out;
        kvmppc_set_hpt(kvm, &info);

out:
        mutex_unlock(&kvm->lock);
        return err;
}

Contributors

Person              Tokens    Prop  Commits  CommitProp
Paul Mackerras         155  71.43%        5      55.56%
David Gibson            58  26.73%        3      33.33%
Aneesh Kumar K.V         4   1.84%        1      11.11%
Total                  217 100.00%        9     100.00%


void kvmppc_free_hpt(struct kvm_hpt_info *info)
{
        vfree(info->rev);
        if (info->cma)
                kvm_free_hpt_cma(virt_to_page(info->virt),
                                 1 << (info->order - PAGE_SHIFT));
        else if (info->virt)
                free_pages(info->virt, info->order - PAGE_SHIFT);
        info->virt = 0;
        info->order = 0;
}

Contributors

Person              Tokens    Prop  Commits  CommitProp
David Gibson            29  38.67%        3      33.33%
Paul Mackerras          27  36.00%        4      44.44%
Aneesh Kumar K.V        10  13.33%        1      11.11%
Alexander Graf           9  12.00%        1      11.11%
Total                   75 100.00%        9     100.00%

/* Bits in first HPTE dword for pagesize 4k, 64k or 16M */
static inline unsigned long hpte0_pgsize_encoding(unsigned long pgsize)
{
        return (pgsize > 0x1000) ? HPTE_V_LARGE : 0;
}

Contributors

Person              Tokens    Prop  Commits  CommitProp
Paul Mackerras          23 100.00%        1     100.00%
Total                   23 100.00%        1     100.00%

/* Bits in second HPTE dword for pagesize 4k, 64k or 16M */
static inline unsigned long hpte1_pgsize_encoding(unsigned long pgsize)
{
        return (pgsize == 0x10000) ? 0x1000 : 0;
}

Contributors

Person              Tokens    Prop  Commits  CommitProp
Paul Mackerras          23 100.00%        1     100.00%
Total                   23 100.00%        1     100.00%
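
Taken together, these two helpers say that any base page size larger than 4k sets HPTE_V_LARGE in the first HPTE dword, and only 64k pages set bit 0x1000 in the second. A small standalone sketch that prints the encodings for 4k, 64k and 16M; the HPTE_V_LARGE value below is a stand-in used purely for the printout, the real constant comes from the mmu-hash headers:

#include <stdio.h>

#define EX_HPTE_V_LARGE (1UL << 2)      /* assumed stand-in for HPTE_V_LARGE */

static unsigned long hpte0_enc(unsigned long pgsize)
{
        return (pgsize > 0x1000) ? EX_HPTE_V_LARGE : 0;
}

static unsigned long hpte1_enc(unsigned long pgsize)
{
        return (pgsize == 0x10000) ? 0x1000 : 0;
}

int main(void)
{
        unsigned long sizes[] = { 0x1000, 0x10000, 0x1000000 };
        int i;

        for (i = 0; i < 3; i++)
                printf("pgsize 0x%lx: hpte0 bits 0x%lx, hpte1 bits 0x%lx\n",
                       sizes[i], hpte0_enc(sizes[i]), hpte1_enc(sizes[i]));
        return 0;
}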


void kvmppc_map_vrma(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot,
                     unsigned long porder)
{
        unsigned long i;
        unsigned long npages;
        unsigned long hp_v, hp_r;
        unsigned long addr, hash;
        unsigned long psize;
        unsigned long hp0, hp1;
        unsigned long idx_ret;
        long ret;
        struct kvm *kvm = vcpu->kvm;

        psize = 1ul << porder;
        npages = memslot->npages >> (porder - PAGE_SHIFT);

        /* VRMA can't be > 1TB */
        if (npages > 1ul << (40 - porder))
                npages = 1ul << (40 - porder);
        /* Can't use more than 1 HPTE per HPTEG */
        if (npages > kvmppc_hpt_mask(&kvm->arch.hpt) + 1)
                npages = kvmppc_hpt_mask(&kvm->arch.hpt) + 1;

        hp0 = HPTE_V_1TB_SEG | (VRMA_VSID << (40 - 16)) |
                HPTE_V_BOLTED | hpte0_pgsize_encoding(psize);
        hp1 = hpte1_pgsize_encoding(psize) |
                HPTE_R_R | HPTE_R_C | HPTE_R_M | PP_RWXX;

        for (i = 0; i < npages; ++i) {
                addr = i << porder;
                /* can't use hpt_hash since va > 64 bits */
                hash = (i ^ (VRMA_VSID ^ (VRMA_VSID << 25))) &
                        kvmppc_hpt_mask(&kvm->arch.hpt);
                /*
                 * We assume that the hash table is empty and no
                 * vcpus are using it at this stage.  Since we create
                 * at most one HPTE per HPTEG, we just assume entry 7
                 * is available and use it.
                 */
                hash = (hash << 3) + 7;
                hp_v = hp0 | ((addr >> 16) & ~0x7fUL);
                hp_r = hp1 | addr;
                ret = kvmppc_virtmode_do_h_enter(kvm, H_EXACT, hash, hp_v, hp_r,
                                                 &idx_ret);
                if (ret != H_SUCCESS) {
                        pr_err("KVM: map_vrma at %lx failed, ret=%ld\n",
                               addr, ret);
                        break;
                }
        }
}

Contributors

Person              Tokens    Prop  Commits  CommitProp
Paul Mackerras         274  94.81%        7      77.78%
David Gibson            15   5.19%        2      22.22%
Total                  289 100.00%        9     100.00%
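
The interesting part of kvmppc_map_vrma() is the slot choice: each VRMA page is hashed into an HPTEG, and the HPTE is always placed in entry 7 of that group, since the table is assumed empty and at most one HPTE per group is created. A standalone sketch of that index arithmetic, with an assumed VRMA_VSID value and a mask for an order-24 table (128-byte HPTEGs, so 2^17 groups); treat both constants as illustrative stand-ins:

#include <stdio.h>

#define EX_VRMA_VSID 0x1ffffffUL             /* assumed value for illustration */

int main(void)
{
        unsigned long hpt_mask = (1ul << (24 - 7)) - 1;  /* assumed order-24 HPT */
        unsigned long i;

        for (i = 0; i < 4; i++) {
                unsigned long hash = (i ^ (EX_VRMA_VSID ^ (EX_VRMA_VSID << 25))) &
                        hpt_mask;
                unsigned long pte_index = (hash << 3) + 7;  /* entry 7 of the HPTEG */

                printf("VRMA page %lu -> HPTEG %lu, pte_index %lu\n",
                       i, hash, pte_index);
        }
        return 0;
}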


int kvmppc_mmu_hv_init(void)
{
        unsigned long host_lpid, rsvd_lpid;

        if (!cpu_has_feature(CPU_FTR_HVMODE))
                return -EINVAL;

        /* POWER7 has 10-bit LPIDs (12-bit in POWER8) */
        host_lpid = mfspr(SPRN_LPID);
        rsvd_lpid = LPID_RSVD;

        kvmppc_init_lpid(rsvd_lpid + 1);

        kvmppc_claim_lpid(host_lpid);
        /* rsvd_lpid is reserved for use in partition switching */
        kvmppc_claim_lpid(rsvd_lpid);

        return 0;
}

Contributors

Person              Tokens    Prop  Commits  CommitProp
Paul Mackerras          49  84.48%        4      80.00%
Scott Wood               9  15.52%        1      20.00%
Total                   58 100.00%        5     100.00%


static void kvmppc_mmu_book3s_64_hv_reset_msr(struct kvm_vcpu *vcpu)
{
        unsigned long msr = vcpu->arch.intr_msr;

        /* If transactional, change to suspend mode on IRQ delivery */
        if (MSR_TM_TRANSACTIONAL(vcpu->arch.shregs.msr))
                msr |= MSR_TS_S;
        else
                msr |= vcpu->arch.shregs.msr & MSR_TS_MASK;
        kvmppc_set_msr(vcpu, msr);
}

Contributors

Person              Tokens    Prop  Commits  CommitProp
Michael Neuling         40  67.80%        1      33.33%
Paul Mackerras          15  25.42%        1      33.33%
Anton Blanchard          4   6.78%        1      33.33%
Total                   59 100.00%        3     100.00%


static long kvmppc_virtmode_do_h_enter(struct kvm *kvm, unsigned long flags,
                                long pte_index, unsigned long pteh,
                                unsigned long ptel, unsigned long *pte_idx_ret)
{
        long ret;

        /* Protect linux PTE lookup from page table destruction */
        rcu_read_lock_sched();  /* this disables preemption too */
        ret = kvmppc_do_h_enter(kvm, flags, pte_index, pteh, ptel,
                                current->mm->pgd, false, pte_idx_ret);
        rcu_read_unlock_sched();
        if (ret == H_TOO_HARD) {
                /* this can't happen */
                pr_err("KVM: Oops, kvmppc_h_enter returned too hard!\n");
                ret = H_RESOURCE;       /* or something */
        }
        return ret;
}

Contributors

Person              Tokens    Prop  Commits  CommitProp
Paul Mackerras          88  98.88%        7      87.50%
Daniel Axtens            1   1.12%        1      12.50%
Total                   89 100.00%        8     100.00%


static struct kvmppc_slb *kvmppc_mmu_book3s_hv_find_slbe(struct kvm_vcpu *vcpu,
                                                         gva_t eaddr)
{
        u64 mask;
        int i;

        for (i = 0; i < vcpu->arch.slb_nr; i++) {
                if (!(vcpu->arch.slb[i].orige & SLB_ESID_V))
                        continue;

                if (vcpu->arch.slb[i].origv & SLB_VSID_B_1T)
                        mask = ESID_MASK_1T;
                else
                        mask = ESID_MASK;

                if (((vcpu->arch.slb[i].orige ^ eaddr) & mask) == 0)
                        return &vcpu->arch.slb[i];
        }
        return NULL;
}

Contributors

Person              Tokens    Prop  Commits  CommitProp
Paul Mackerras         121 100.00%        1     100.00%
Total                  121 100.00%        1     100.00%


static unsigned long kvmppc_mmu_get_real_addr(unsigned long v, unsigned long r,
                        unsigned long ea)
{
        unsigned long ra_mask;

        ra_mask = hpte_page_size(v, r) - 1;
        return (r & HPTE_R_RPN & ~ra_mask) | (ea & ra_mask);
}

Contributors

Person              Tokens    Prop  Commits  CommitProp
Paul Mackerras          50 100.00%        1     100.00%
Total                   50 100.00%        1     100.00%
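
In other words, the guest real address is the HPTE's real page number with the low, within-page bits taken from the effective address. A sketch with made-up values; the RPN mask and the 4k page size below are stand-ins for this example, not the kernel's HPTE_R_RPN or hpte_page_size():

#include <stdio.h>

#define EX_HPTE_R_RPN 0x0ffffffffffff000UL  /* stand-in RPN mask for illustration */

int main(void)
{
        unsigned long r  = 0x00000000deadb196UL; /* made-up second HPTE dword */
        unsigned long ea = 0x0000000000042abcUL; /* made-up effective address */
        unsigned long ra_mask = 0x1000 - 1;      /* assume a 4k base page */
        unsigned long ra = (r & EX_HPTE_R_RPN & ~ra_mask) | (ea & ra_mask);

        printf("real address: 0x%lx\n", ra);     /* prints 0xdeadbabc */
        return 0;
}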


static int kvmppc_mmu_book3s_64_hv_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
                        struct kvmppc_pte *gpte, bool data, bool iswrite)
{
        struct kvm *kvm = vcpu->kvm;
        struct kvmppc_slb *slbe;
        unsigned long slb_v;
        unsigned long pp, key;
        unsigned long v, orig_v, gr;
        __be64 *hptep;
        int index;
        int virtmode = vcpu->arch.shregs.msr & (data ? MSR_DR : MSR_IR);

        /* Get SLB entry */
        if (virtmode) {
                slbe = kvmppc_mmu_book3s_hv_find_slbe(vcpu, eaddr);
                if (!slbe)
                        return -EINVAL;
                slb_v = slbe->origv;
        } else {
                /* real mode access */
                slb_v = vcpu->kvm->arch.vrma_slb_v;
        }

        preempt_disable();
        /* Find the HPTE in the hash table */
        index = kvmppc_hv_find_lock_hpte(kvm, eaddr, slb_v,
                                         HPTE_V_VALID | HPTE_V_ABSENT);
        if (index < 0) {
                preempt_enable();
                return -ENOENT;
        }
        hptep = (__be64 *)(kvm->arch.hpt.virt + (index << 4));
        v = orig_v = be64_to_cpu(hptep[0]) & ~HPTE_V_HVLOCK;
        if (cpu_has_feature(CPU_FTR_ARCH_300))
                v = hpte_new_to_old_v(v, be64_to_cpu(hptep[1]));
        gr = kvm->arch.hpt.rev[index].guest_rpte;

        unlock_hpte(hptep, orig_v);
        preempt_enable();

        gpte->eaddr = eaddr;
        gpte->vpage = ((v & HPTE_V_AVPN) << 4) | ((eaddr >> 12) & 0xfff);

        /* Get PP bits and key for permission check */
        pp = gr & (HPTE_R_PP0 | HPTE_R_PP);
        key = (vcpu->arch.shregs.msr & MSR_PR) ? SLB_VSID_KP : SLB_VSID_KS;
        key &= slb_v;

        /* Calculate permissions */
        gpte->may_read = hpte_read_permission(pp, key);
        gpte->may_write = hpte_write_permission(pp, key);
        gpte->may_execute = gpte->may_read && !(gr & (HPTE_R_N | HPTE_R_G));

        /* Storage key permission check for POWER7 */
        if (data && virtmode) {
                int amrfield = hpte_get_skey_perm(gr, vcpu->arch.amr);
                if (amrfield & 1)
                        gpte->may_read = 0;
                if (amrfield & 2)
                        gpte->may_write = 0;
        }

        /* Get the guest physical address */
        gpte->raddr = kvmppc_mmu_get_real_addr(v, gr, eaddr);
        return 0;
}

Contributors

Person              Tokens    Prop  Commits  CommitProp
Paul Mackerras         388  93.49%        4      50.00%
pingfan liu             11   2.65%        1      12.50%
Alexander Graf           7   1.69%        1      12.50%
David Gibson             6   1.45%        1      12.50%
Aneesh Kumar K.V         3   0.72%        1      12.50%
Total                  415 100.00%        8     100.00%

/*
 * Quick test for whether an instruction is a load or a store.
 * If the instruction is a load or a store, then this will indicate
 * which it is, at least on server processors.  (Embedded processors
 * have some external PID instructions that don't follow the rule
 * embodied here.)  If the instruction isn't a load or store, then
 * this doesn't return anything useful.
 */
static int instruction_is_store(unsigned int instr)
{
        unsigned int mask;

        mask = 0x10000000;
        if ((instr & 0xfc000000) == 0x7c000000)
                mask = 0x100;           /* major opcode 31 */
        return (instr & mask) != 0;
}

Contributors

Person              Tokens    Prop  Commits  CommitProp
Paul Mackerras          42 100.00%        1     100.00%
Total                   42 100.00%        1     100.00%
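
As a quick check of the mask logic: for D-form loads and stores the store opcodes have bit 0x10000000 set, while for major opcode 31 the distinction moves into bit 0x100 of the extended opcode field. The two instruction words in the standalone sketch below are meant to encode stw r3,0(r4) and lwz r3,0(r4); treat them as illustrative constants rather than authoritative encodings:

#include <stdio.h>

static int is_store(unsigned int instr)
{
        unsigned int mask = 0x10000000;

        if ((instr & 0xfc000000) == 0x7c000000)
                mask = 0x100;           /* major opcode 31 */
        return (instr & mask) != 0;
}

int main(void)
{
        printf("0x90640000 -> %s\n", is_store(0x90640000) ? "store" : "load");
        printf("0x80640000 -> %s\n", is_store(0x80640000) ? "store" : "load");
        return 0;
}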


int kvmppc_hv_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu,
                           unsigned long gpa, gva_t ea, int is_store)
{
        u32 last_inst;

        /*
         * If we fail, we just return to the guest and try executing it again.
         */
        if (kvmppc_get_last_inst(vcpu, INST_GENERIC, &last_inst) !=
                EMULATE_DONE)
                return RESUME_GUEST;

        /*
         * WARNING: We do not know for sure whether the instruction we just
         * read from memory is the same that caused the fault in the first
         * place.  If the instruction we read is neither an load or a store,
         * then it can't access memory, so we don't need to worry about
         * enforcing access permissions.  So, assuming it is a load or
         * store, we just check that its direction (load or store) is
         * consistent with the original fault, since that's what we
         * checked the access permissions against.  If there is a mismatch
         * we just return and retry the instruction.
         */
        if (instruction_is_store(last_inst) != !!is_store)
                return RESUME_GUEST;

        /*
         * Emulated accesses are emulated by looking at the hash for
         * translation once, then performing the access later. The
         * translation could be invalidated in the meantime in which
         * point performing the subsequent memory access on the old
         * physical address could possibly be a security hole for the
         * guest (but not the host).
         *
         * This is less of an issue for MMIO stores since they aren't
         * globally visible. It could be an issue for MMIO loads to
         * a certain extent but we'll ignore it for now.
         */

        vcpu->arch.paddr_accessed = gpa;
        vcpu->arch.vaddr_accessed = ea;

        return kvmppc_emulate_mmio(run, vcpu);
}

Contributors

Person              Tokens    Prop  Commits  CommitProp
Paul Mackerras          71  82.56%        1      33.33%
Alexander Graf          11  12.79%        1      33.33%
Mihai Caraman            4   4.65%        1      33.33%
Total                   86 100.00%        3     100.00%


int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
                                unsigned long ea, unsigned long dsisr)
{
        struct kvm *kvm = vcpu->kvm;
        unsigned long hpte[3], r;
        unsigned long hnow_v, hnow_r;
        __be64 *hptep;
        unsigned long mmu_seq, psize, pte_size;
        unsigned long gpa_base, gfn_base;
        unsigned long gpa, gfn, hva, pfn;
        struct kvm_memory_slot *memslot;
        unsigned long *rmap;
        struct revmap_entry *rev;
        struct page *page, *pages[1];
        long index, ret, npages;
        bool is_ci;
        unsigned int writing, write_ok;
        struct vm_area_struct *vma;
        unsigned long rcbits;
        long mmio_update;

        if (kvm_is_radix(kvm))
                return kvmppc_book3s_radix_page_fault(run, vcpu, ea, dsisr);

        /*
         * Real-mode code has already searched the HPT and found the
         * entry we're interested in.  Lock the entry and check that
         * it hasn't changed.  If it has, just return and re-execute the
         * instruction.
         */
        if (ea != vcpu->arch.pgfault_addr)
                return RESUME_GUEST;

        if (vcpu->arch.pgfault_cache) {
                mmio_update = atomic64_read(&kvm->arch.mmio_update);
                if (mmio_update == vcpu->arch.pgfault_cache->mmio_update) {
                        r = vcpu->arch.pgfault_cache->rpte;
                        psize = hpte_page_size(vcpu->arch.pgfault_hpte[0], r);
                        gpa_base = r & HPTE_R_RPN & ~(psize - 1);
                        gfn_base = gpa_base >> PAGE_SHIFT;
                        gpa = gpa_base | (ea & (psize - 1));
                        return kvmppc_hv_emulate_mmio(run, vcpu, gpa, ea,
                                                dsisr & DSISR_ISSTORE);
                }
        }
        index = vcpu->arch.pgfault_index;
        hptep = (__be64 *)(kvm->arch.hpt.virt + (index << 4));
        rev = &kvm->arch.hpt.rev[index];
        preempt_disable();
        while (!try_lock_hpte(hptep, HPTE_V_HVLOCK))
                cpu_relax();
        hpte[0] = be64_to_cpu(hptep[0]) & ~HPTE_V_HVLOCK;
        hpte[1] = be64_to_cpu(hptep[1]);
        hpte[2] = r = rev->guest_rpte;
        unlock_hpte(hptep, hpte[0]);
        preempt_enable();

        if (cpu_has_feature(CPU_FTR_ARCH_300)) {
                hpte[0] = hpte_new_to_old_v(hpte[0], hpte[1]);
                hpte[1] = hpte_new_to_old_r(hpte[1]);
        }
        if (hpte[0] != vcpu->arch.pgfault_hpte[0] ||
            hpte[1] != vcpu->arch.pgfault_hpte[1])
                return RESUME_GUEST;

        /* Translate the logical address and get the page */
        psize = hpte_page_size(hpte[0], r);
        gpa_base = r & HPTE_R_RPN & ~(psize - 1);
        gfn_base = gpa_base >> PAGE_SHIFT;
        gpa = gpa_base | (ea & (psize - 1));
        gfn = gpa >> PAGE_SHIFT;
        memslot = gfn_to_memslot(kvm, gfn);

        trace_kvm_page_fault_enter(vcpu, hpte, memslot, ea, dsisr);

        /* No memslot means it's an emulated MMIO region */
        if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID))
                return kvmppc_hv_emulate_mmio(run, vcpu, gpa, ea,
                                              dsisr & DSISR_ISSTORE);

        /*
         * This should never happen, because of the slot_is_aligned()
         * check in kvmppc_do_h_enter().
         */
        if (gfn_base < memslot->base_gfn)
                return -EFAULT;

        /* used to check for invalidations in progress */
        mmu_seq = kvm->mmu_notifier_seq;
        smp_rmb();

        ret = -EFAULT;
        is_ci = false;
        pfn = 0;
        page = NULL;
        pte_size = PAGE_SIZE;
        writing = (dsisr & DSISR_ISSTORE) != 0;
        /* If writing != 0, then the HPTE must allow writing, if we get here */
        write_ok = writing;
        hva = gfn_to_hva_memslot(memslot, gfn);
        npages = get_user_pages_fast(hva, 1, writing, pages);
        if (npages < 1) {
                /* Check if it's an I/O mapping */
                down_read(&current->mm->mmap_sem);
                vma = find_vma(current->mm, hva);
                if (vma && vma->vm_start <= hva && hva + psize <= vma->vm_end &&
                    (vma->vm_flags & VM_PFNMAP)) {
                        pfn = vma->vm_pgoff +
                                ((hva - vma->vm_start) >> PAGE_SHIFT);
                        pte_size = psize;
                        is_ci = pte_ci(__pte((pgprot_val(vma->vm_page_prot))));
                        write_ok = vma->vm_flags & VM_WRITE;
                }
                up_read(&current->mm->mmap_sem);
                if (!pfn)
                        goto out_put;
        } else {
                page = pages[0];
                pfn = page_to_pfn(page);
                if (PageHuge(page)) {
                        page = compound_head(page);
                        pte_size <<= compound_order(page);
                }
                /* if the guest wants write access, see if that is OK */
                if (!writing && hpte_is_writable(r)) {
                        pte_t *ptep, pte;
                        unsigned long flags;
                        /*
                         * We need to protect against page table destruction
                         * hugepage split and collapse.
                         */
                        local_irq_save(flags);
                        ptep = find_current_mm_pte(current->mm->pgd, hva,