lguest: use a special 1:1 linear pagetable mode until first switch.
The Host used to create some page tables for the Guest to use at the
top of Guest memory; it would then tell the Guest where this was. In
particular, it created linear mappings for 0 and 0xC0000000 addresses
because lguest used to switch to its real page tables quite late in
boot.
However, since commit d50d8fe19, Linux has initialized its boot page tables in
head_32.S even before the "are we lguest?" boot jump. So, now we can
simplify things: the Host pagetable code assumes 1:1 linear mapping
until it first calls the LHCALL_NEW_PGTABLE hypercall, which we now do
before we reach C code.
This also means that the Host doesn't need to know anything about the
Guest's PAGE_OFFSET. (Non-Linux guests might not even have such a
thing).
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
This commit is contained in:
parent
e0377e2520
commit
5dea1c88ed
6 changed files with 98 additions and 205 deletions
|
@ -63,7 +63,6 @@ void foo(void)
|
|||
BLANK();
|
||||
OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled);
|
||||
OFFSET(LGUEST_DATA_irq_pending, lguest_data, irq_pending);
|
||||
OFFSET(LGUEST_DATA_pgdir, lguest_data, pgdir);
|
||||
|
||||
BLANK();
|
||||
OFFSET(LGUEST_PAGES_host_gdt_desc, lguest_pages, state.host_gdt_desc);
|
||||
|
|
|
@ -520,17 +520,16 @@ static unsigned long lguest_read_cr2(void)
|
|||
|
||||
/* See lguest_set_pte() below. */
|
||||
static bool cr3_changed = false;
|
||||
static unsigned long current_cr3;
|
||||
|
||||
/*
|
||||
* cr3 is the current toplevel pagetable page: the principle is the same as
|
||||
* cr0. Keep a local copy, and tell the Host when it changes. The only
|
||||
* difference is that our local copy is in lguest_data because the Host needs
|
||||
* to set it upon our initial hypercall.
|
||||
* cr0. Keep a local copy, and tell the Host when it changes.
|
||||
*/
|
||||
static void lguest_write_cr3(unsigned long cr3)
|
||||
{
|
||||
lguest_data.pgdir = cr3;
|
||||
lazy_hcall1(LHCALL_NEW_PGTABLE, cr3);
|
||||
current_cr3 = cr3;
|
||||
|
||||
/* These two page tables are simple, linear, and used during boot */
|
||||
if (cr3 != __pa(swapper_pg_dir) && cr3 != __pa(initial_page_table))
|
||||
|
@ -539,7 +538,7 @@ static void lguest_write_cr3(unsigned long cr3)
|
|||
|
||||
static unsigned long lguest_read_cr3(void)
|
||||
{
|
||||
return lguest_data.pgdir;
|
||||
return current_cr3;
|
||||
}
|
||||
|
||||
/* cr4 is used to enable and disable PGE, but we don't care. */
|
||||
|
@ -758,7 +757,7 @@ static void lguest_pmd_clear(pmd_t *pmdp)
|
|||
static void lguest_flush_tlb_single(unsigned long addr)
|
||||
{
|
||||
/* Simply set it to zero: if it was not, it will fault back in. */
|
||||
lazy_hcall3(LHCALL_SET_PTE, lguest_data.pgdir, addr, 0);
|
||||
lazy_hcall3(LHCALL_SET_PTE, current_cr3, addr, 0);
|
||||
}
|
||||
|
||||
/*
|
||||
|
|
|
@ -27,13 +27,18 @@
|
|||
.section .init.text, "ax", @progbits
|
||||
ENTRY(lguest_entry)
|
||||
/*
|
||||
* We make the "initialization" hypercall now to tell the Host about
|
||||
* us, and also find out where it put our page tables.
|
||||
* We make the "initialization" hypercall now to tell the Host where
|
||||
* our lguest_data struct is.
|
||||
*/
|
||||
movl $LHCALL_LGUEST_INIT, %eax
|
||||
movl $lguest_data - __PAGE_OFFSET, %ebx
|
||||
int $LGUEST_TRAP_ENTRY
|
||||
|
||||
/* Now turn our pagetables on; setup by arch/x86/kernel/head_32.S. */
|
||||
movl $LHCALL_NEW_PGTABLE, %eax
|
||||
movl $(initial_page_table - __PAGE_OFFSET), %ebx
|
||||
int $LGUEST_TRAP_ENTRY
|
||||
|
||||
/* Set up the initial stack so we can run C code. */
|
||||
movl $(init_thread_union+THREAD_SIZE),%esp
|
||||
|
||||
|
|
|
@ -59,6 +59,8 @@ struct lg_cpu {
|
|||
|
||||
struct lguest_pages *last_pages;
|
||||
|
||||
/* Initialization mode: linear map everything. */
|
||||
bool linear_pages;
|
||||
int cpu_pgd; /* Which pgd this cpu is currently using */
|
||||
|
||||
/* If a hypercall was asked for, this points to the arguments. */
|
||||
|
|
|
@ -17,7 +17,6 @@
|
|||
#include <linux/percpu.h>
|
||||
#include <asm/tlbflush.h>
|
||||
#include <asm/uaccess.h>
|
||||
#include <asm/bootparam.h>
|
||||
#include "lg.h"
|
||||
|
||||
/*M:008
|
||||
|
@ -325,10 +324,15 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
|
|||
#endif
|
||||
|
||||
/* First step: get the top-level Guest page table entry. */
|
||||
gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t);
|
||||
/* Toplevel not present? We can't map it in. */
|
||||
if (!(pgd_flags(gpgd) & _PAGE_PRESENT))
|
||||
return false;
|
||||
if (unlikely(cpu->linear_pages)) {
|
||||
/* Faking up a linear mapping. */
|
||||
gpgd = __pgd(CHECK_GPGD_MASK);
|
||||
} else {
|
||||
gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t);
|
||||
/* Toplevel not present? We can't map it in. */
|
||||
if (!(pgd_flags(gpgd) & _PAGE_PRESENT))
|
||||
return false;
|
||||
}
|
||||
|
||||
/* Now look at the matching shadow entry. */
|
||||
spgd = spgd_addr(cpu, cpu->cpu_pgd, vaddr);
|
||||
|
@ -353,10 +357,15 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
|
|||
}
|
||||
|
||||
#ifdef CONFIG_X86_PAE
|
||||
gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t);
|
||||
/* Middle level not present? We can't map it in. */
|
||||
if (!(pmd_flags(gpmd) & _PAGE_PRESENT))
|
||||
return false;
|
||||
if (unlikely(cpu->linear_pages)) {
|
||||
/* Faking up a linear mapping. */
|
||||
gpmd = __pmd(_PAGE_TABLE);
|
||||
} else {
|
||||
gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t);
|
||||
/* Middle level not present? We can't map it in. */
|
||||
if (!(pmd_flags(gpmd) & _PAGE_PRESENT))
|
||||
return false;
|
||||
}
|
||||
|
||||
/* Now look at the matching shadow entry. */
|
||||
spmd = spmd_addr(cpu, *spgd, vaddr);
|
||||
|
@ -397,8 +406,13 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
|
|||
gpte_ptr = gpte_addr(cpu, gpgd, vaddr);
|
||||
#endif
|
||||
|
||||
/* Read the actual PTE value. */
|
||||
gpte = lgread(cpu, gpte_ptr, pte_t);
|
||||
if (unlikely(cpu->linear_pages)) {
|
||||
/* Linear? Make up a PTE which points to same page. */
|
||||
gpte = __pte((vaddr & PAGE_MASK) | _PAGE_RW | _PAGE_PRESENT);
|
||||
} else {
|
||||
/* Read the actual PTE value. */
|
||||
gpte = lgread(cpu, gpte_ptr, pte_t);
|
||||
}
|
||||
|
||||
/* If this page isn't in the Guest page tables, we can't page it in. */
|
||||
if (!(pte_flags(gpte) & _PAGE_PRESENT))
|
||||
|
@ -454,7 +468,8 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
|
|||
* Finally, we write the Guest PTE entry back: we've set the
|
||||
* _PAGE_ACCESSED and maybe the _PAGE_DIRTY flags.
|
||||
*/
|
||||
lgwrite(cpu, gpte_ptr, pte_t, gpte);
|
||||
if (likely(!cpu->linear_pages))
|
||||
lgwrite(cpu, gpte_ptr, pte_t, gpte);
|
||||
|
||||
/*
|
||||
* The fault is fixed, the page table is populated, the mapping
|
||||
|
@ -612,6 +627,11 @@ unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr)
|
|||
#ifdef CONFIG_X86_PAE
|
||||
pmd_t gpmd;
|
||||
#endif
|
||||
|
||||
/* Still not set up? Just map 1:1. */
|
||||
if (unlikely(cpu->linear_pages))
|
||||
return vaddr;
|
||||
|
||||
/* First step: get the top-level Guest page table entry. */
|
||||
gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t);
|
||||
/* Toplevel not present? We can't map it in. */
|
||||
|
@ -708,32 +728,6 @@ static unsigned int new_pgdir(struct lg_cpu *cpu,
|
|||
return next;
|
||||
}
|
||||
|
||||
/*H:430
|
||||
* (iv) Switching page tables
|
||||
*
|
||||
* Now we've seen all the page table setting and manipulation, let's see
|
||||
* what happens when the Guest changes page tables (ie. changes the top-level
|
||||
* pgdir). This occurs on almost every context switch.
|
||||
*/
|
||||
void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable)
|
||||
{
|
||||
int newpgdir, repin = 0;
|
||||
|
||||
/* Look to see if we have this one already. */
|
||||
newpgdir = find_pgdir(cpu->lg, pgtable);
|
||||
/*
|
||||
* If not, we allocate or mug an existing one: if it's a fresh one,
|
||||
* repin gets set to 1.
|
||||
*/
|
||||
if (newpgdir == ARRAY_SIZE(cpu->lg->pgdirs))
|
||||
newpgdir = new_pgdir(cpu, pgtable, &repin);
|
||||
/* Change the current pgd index to the new one. */
|
||||
cpu->cpu_pgd = newpgdir;
|
||||
/* If it was completely blank, we map in the Guest kernel stack */
|
||||
if (repin)
|
||||
pin_stack_pages(cpu);
|
||||
}
|
||||
|
||||
/*H:470
|
||||
* Finally, a routine which throws away everything: all PGD entries in all
|
||||
* the shadow page tables, including the Guest's kernel mappings. This is used
|
||||
|
@ -780,6 +774,44 @@ void guest_pagetable_clear_all(struct lg_cpu *cpu)
|
|||
/* We need the Guest kernel stack mapped again. */
|
||||
pin_stack_pages(cpu);
|
||||
}
|
||||
|
||||
/*H:430
|
||||
* (iv) Switching page tables
|
||||
*
|
||||
* Now we've seen all the page table setting and manipulation, let's see
|
||||
* what happens when the Guest changes page tables (ie. changes the top-level
|
||||
* pgdir). This occurs on almost every context switch.
|
||||
*/
|
||||
void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable)
|
||||
{
|
||||
int newpgdir, repin = 0;
|
||||
|
||||
/*
|
||||
* The very first time they call this, we're actually running without
|
||||
* any page tables; we've been making it up. Throw them away now.
|
||||
*/
|
||||
if (unlikely(cpu->linear_pages)) {
|
||||
release_all_pagetables(cpu->lg);
|
||||
cpu->linear_pages = false;
|
||||
/* Force allocation of a new pgdir. */
|
||||
newpgdir = ARRAY_SIZE(cpu->lg->pgdirs);
|
||||
} else {
|
||||
/* Look to see if we have this one already. */
|
||||
newpgdir = find_pgdir(cpu->lg, pgtable);
|
||||
}
|
||||
|
||||
/*
|
||||
* If not, we allocate or mug an existing one: if it's a fresh one,
|
||||
* repin gets set to 1.
|
||||
*/
|
||||
if (newpgdir == ARRAY_SIZE(cpu->lg->pgdirs))
|
||||
newpgdir = new_pgdir(cpu, pgtable, &repin);
|
||||
/* Change the current pgd index to the new one. */
|
||||
cpu->cpu_pgd = newpgdir;
|
||||
/* If it was completely blank, we map in the Guest kernel stack */
|
||||
if (repin)
|
||||
pin_stack_pages(cpu);
|
||||
}
|
||||
/*:*/
|
||||
|
||||
/*M:009
|
||||
|
@ -919,168 +951,26 @@ void guest_set_pmd(struct lguest *lg, unsigned long pmdp, u32 idx)
|
|||
}
|
||||
#endif
|
||||
|
||||
/*H:505
|
||||
* To get through boot, we construct simple identity page mappings (which
|
||||
* set virtual == physical) and linear mappings which will get the Guest far
|
||||
* enough into the boot to create its own. The linear mapping means we
|
||||
* simplify the Guest boot, but it makes assumptions about their PAGE_OFFSET,
|
||||
* as you'll see.
|
||||
*
|
||||
* We lay them out of the way, just below the initrd (which is why we need to
|
||||
* know its size here).
|
||||
*/
|
||||
static unsigned long setup_pagetables(struct lguest *lg,
|
||||
unsigned long mem,
|
||||
unsigned long initrd_size)
|
||||
{
|
||||
pgd_t __user *pgdir;
|
||||
pte_t __user *linear;
|
||||
unsigned long mem_base = (unsigned long)lg->mem_base;
|
||||
unsigned int mapped_pages, i, linear_pages;
|
||||
#ifdef CONFIG_X86_PAE
|
||||
pmd_t __user *pmds;
|
||||
unsigned int j;
|
||||
pgd_t pgd;
|
||||
pmd_t pmd;
|
||||
#else
|
||||
unsigned int phys_linear;
|
||||
#endif
|
||||
|
||||
/*
|
||||
* We have mapped_pages frames to map, so we need linear_pages page
|
||||
* tables to map them.
|
||||
*/
|
||||
mapped_pages = mem / PAGE_SIZE;
|
||||
linear_pages = (mapped_pages + PTRS_PER_PTE - 1) / PTRS_PER_PTE;
|
||||
|
||||
/* We put the toplevel page directory page at the top of memory. */
|
||||
pgdir = (pgd_t *)(mem + mem_base - initrd_size - PAGE_SIZE);
|
||||
|
||||
/* Now we use the next linear_pages pages as pte pages */
|
||||
linear = (void *)pgdir - linear_pages * PAGE_SIZE;
|
||||
|
||||
#ifdef CONFIG_X86_PAE
|
||||
/*
|
||||
* And the single mid page goes below that. We only use one, but
|
||||
* that's enough to map 1G, which definitely gets us through boot.
|
||||
*/
|
||||
pmds = (void *)linear - PAGE_SIZE;
|
||||
#endif
|
||||
/*
|
||||
* Linear mapping is easy: put every page's address into the
|
||||
* mapping in order.
|
||||
*/
|
||||
for (i = 0; i < mapped_pages; i++) {
|
||||
pte_t pte;
|
||||
pte = pfn_pte(i, __pgprot(_PAGE_PRESENT|_PAGE_RW|_PAGE_USER));
|
||||
if (copy_to_user(&linear[i], &pte, sizeof(pte)) != 0)
|
||||
return -EFAULT;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_X86_PAE
|
||||
/*
|
||||
* Make the Guest PMD entries point to the corresponding place in the
|
||||
* linear mapping (up to one page worth of PMD).
|
||||
*/
|
||||
for (i = j = 0; i < mapped_pages && j < PTRS_PER_PMD;
|
||||
i += PTRS_PER_PTE, j++) {
|
||||
pmd = pfn_pmd(((unsigned long)&linear[i] - mem_base)/PAGE_SIZE,
|
||||
__pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER));
|
||||
|
||||
if (copy_to_user(&pmds[j], &pmd, sizeof(pmd)) != 0)
|
||||
return -EFAULT;
|
||||
}
|
||||
|
||||
/* One PGD entry, pointing to that PMD page. */
|
||||
pgd = __pgd(((unsigned long)pmds - mem_base) | _PAGE_PRESENT);
|
||||
/* Copy it in as the first PGD entry (ie. addresses 0-1G). */
|
||||
if (copy_to_user(&pgdir[0], &pgd, sizeof(pgd)) != 0)
|
||||
return -EFAULT;
|
||||
/*
|
||||
* And the other PGD entry to make the linear mapping at PAGE_OFFSET
|
||||
*/
|
||||
if (copy_to_user(&pgdir[KERNEL_PGD_BOUNDARY], &pgd, sizeof(pgd)))
|
||||
return -EFAULT;
|
||||
#else
|
||||
/*
|
||||
* The top level points to the linear page table pages above.
|
||||
* We setup the identity and linear mappings here.
|
||||
*/
|
||||
phys_linear = (unsigned long)linear - mem_base;
|
||||
for (i = 0; i < mapped_pages; i += PTRS_PER_PTE) {
|
||||
pgd_t pgd;
|
||||
/*
|
||||
* Create a PGD entry which points to the right part of the
|
||||
* linear PTE pages.
|
||||
*/
|
||||
pgd = __pgd((phys_linear + i * sizeof(pte_t)) |
|
||||
(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER));
|
||||
|
||||
/*
|
||||
* Copy it into the PGD page at 0 and PAGE_OFFSET.
|
||||
*/
|
||||
if (copy_to_user(&pgdir[i / PTRS_PER_PTE], &pgd, sizeof(pgd))
|
||||
|| copy_to_user(&pgdir[pgd_index(PAGE_OFFSET)
|
||||
+ i / PTRS_PER_PTE],
|
||||
&pgd, sizeof(pgd)))
|
||||
return -EFAULT;
|
||||
}
|
||||
#endif
|
||||
|
||||
/*
|
||||
* We return the top level (guest-physical) address: we remember where
|
||||
* this is to write it into lguest_data when the Guest initializes.
|
||||
*/
|
||||
return (unsigned long)pgdir - mem_base;
|
||||
}
|
||||
|
||||
/*H:500
|
||||
* (vii) Setting up the page tables initially.
|
||||
*
|
||||
* When a Guest is first created, the Launcher tells us where the toplevel of
|
||||
* its first page table is. We set some things up here:
|
||||
* When a Guest is first created, we initialize a shadow page table which
|
||||
* we will populate on future faults. The Guest doesn't have any actual
|
||||
* pagetables yet, so we set linear_pages to tell demand_page() to fake it
|
||||
* for the moment.
|
||||
*/
|
||||
int init_guest_pagetable(struct lguest *lg)
|
||||
{
|
||||
u64 mem;
|
||||
u32 initrd_size;
|
||||
struct boot_params __user *boot = (struct boot_params *)lg->mem_base;
|
||||
#ifdef CONFIG_X86_PAE
|
||||
pgd_t *pgd;
|
||||
pmd_t *pmd_table;
|
||||
#endif
|
||||
/*
|
||||
* Get the Guest memory size and the ramdisk size from the boot header
|
||||
* located at lg->mem_base (Guest address 0).
|
||||
*/
|
||||
if (copy_from_user(&mem, &boot->e820_map[0].size, sizeof(mem))
|
||||
|| get_user(initrd_size, &boot->hdr.ramdisk_size))
|
||||
return -EFAULT;
|
||||
struct lg_cpu *cpu = &lg->cpus[0];
|
||||
int allocated = 0;
|
||||
|
||||
/*
|
||||
* We start on the first shadow page table, and give it a blank PGD
|
||||
* page.
|
||||
*/
|
||||
lg->pgdirs[0].gpgdir = setup_pagetables(lg, mem, initrd_size);
|
||||
if (IS_ERR_VALUE(lg->pgdirs[0].gpgdir))
|
||||
return lg->pgdirs[0].gpgdir;
|
||||
lg->pgdirs[0].pgdir = (pgd_t *)get_zeroed_page(GFP_KERNEL);
|
||||
if (!lg->pgdirs[0].pgdir)
|
||||
/* lg (and lg->cpus[]) starts zeroed: this allocates a new pgdir */
|
||||
cpu->cpu_pgd = new_pgdir(cpu, 0, &allocated);
|
||||
if (!allocated)
|
||||
return -ENOMEM;
|
||||
|
||||
#ifdef CONFIG_X86_PAE
|
||||
/* For PAE, we also create the initial mid-level. */
|
||||
pgd = lg->pgdirs[0].pgdir;
|
||||
pmd_table = (pmd_t *) get_zeroed_page(GFP_KERNEL);
|
||||
if (!pmd_table)
|
||||
return -ENOMEM;
|
||||
|
||||
set_pgd(pgd + SWITCHER_PGD_INDEX,
|
||||
__pgd(__pa(pmd_table) | _PAGE_PRESENT));
|
||||
#endif
|
||||
|
||||
/* This is the current page table. */
|
||||
lg->cpus[0].cpu_pgd = 0;
|
||||
/* We start with a linear mapping until the Guest's first LHCALL_NEW_PGTABLE. */
|
||||
cpu->linear_pages = true;
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@ -1095,10 +985,10 @@ void page_table_guest_data_init(struct lg_cpu *cpu)
|
|||
* of virtual addresses used by the Switcher.
|
||||
*/
|
||||
|| put_user(RESERVE_MEM * 1024 * 1024,
|
||||
&cpu->lg->lguest_data->reserve_mem)
|
||||
|| put_user(cpu->lg->pgdirs[0].gpgdir,
|
||||
&cpu->lg->lguest_data->pgdir))
|
||||
&cpu->lg->lguest_data->reserve_mem)) {
|
||||
kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data);
|
||||
return;
|
||||
}
|
||||
|
||||
/*
|
||||
* In flush_user_mappings() we loop from 0 to
|
||||
|
|
|
@ -59,8 +59,6 @@ struct lguest_data {
|
|||
unsigned long reserve_mem;
|
||||
/* KHz for the TSC clock. */
|
||||
u32 tsc_khz;
|
||||
/* Page where the top-level pagetable is */
|
||||
unsigned long pgdir;
|
||||
|
||||
/* Fields initialized by the Guest at boot: */
|
||||
/* Instruction range to suppress interrupts even if enabled */
|
||||
|
|
Loading…
Reference in a new issue