2006-12-06 18:14:07 -07:00
|
|
|
/*  Paravirtualization interfaces
    Copyright (C) 2006 Rusty Russell IBM Corporation

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
*/
|
|
|
|
#include <linux/errno.h>
|
|
|
|
#include <linux/module.h>
|
|
|
|
#include <linux/efi.h>
|
|
|
|
#include <linux/bcd.h>
|
2007-05-02 11:27:15 -06:00
|
|
|
#include <linux/highmem.h>
|
2006-12-06 18:14:07 -07:00
|
|
|
|
|
|
|
#include <asm/bug.h>
|
|
|
|
#include <asm/paravirt.h>
|
|
|
|
#include <asm/desc.h>
|
|
|
|
#include <asm/setup.h>
|
|
|
|
#include <asm/arch_hooks.h>
|
|
|
|
#include <asm/time.h>
|
|
|
|
#include <asm/irq.h>
|
|
|
|
#include <asm/delay.h>
|
2006-12-06 18:14:08 -07:00
|
|
|
#include <asm/fixmap.h>
|
|
|
|
#include <asm/apic.h>
|
2006-12-06 18:14:08 -07:00
|
|
|
#include <asm/tlbflush.h>
|
2007-03-05 01:30:35 -07:00
|
|
|
#include <asm/timer.h>
|
2006-12-06 18:14:07 -07:00
|
|
|
|
|
|
|
/*
 * nop stub: takes no arguments, does nothing and returns nothing.
 * NOTE(review): presumably this is what the paravirt_nop name used for
 * no-op hooks in this file resolves to (via asm/paravirt.h) -- confirm.
 */
void _paravirt_nop(void)
{
}
|
|
|
|
|
|
|
|
/*
 * Default banner hook: log at KERN_INFO level which platform
 * (paravirt_ops.name) the kernel is booting on.
 */
static void __init default_banner(void)
{
	printk(KERN_INFO "Booting paravirtualized kernel on %s\n",
	       paravirt_ops.name);
}
|
|
|
|
|
|
|
|
char *memory_setup(void)
|
|
|
|
{
|
|
|
|
return paravirt_ops.memory_setup();
|
|
|
|
}
|
|
|
|
|
2006-12-06 18:14:08 -07:00
|
|
|
/* Simple instruction patching code. */
|
|
|
|
#define DEF_NATIVE(name, code) \
|
|
|
|
extern const char start_##name[], end_##name[]; \
|
|
|
|
asm("start_" #name ": " code "; end_" #name ":")
|
2007-05-02 11:27:14 -06:00
|
|
|
|
|
|
|
DEF_NATIVE(irq_disable, "cli");
|
|
|
|
DEF_NATIVE(irq_enable, "sti");
|
|
|
|
DEF_NATIVE(restore_fl, "push %eax; popf");
|
|
|
|
DEF_NATIVE(save_fl, "pushf; pop %eax");
|
2006-12-06 18:14:08 -07:00
|
|
|
DEF_NATIVE(iret, "iret");
|
2007-05-02 11:27:14 -06:00
|
|
|
DEF_NATIVE(irq_enable_sysexit, "sti; sysexit");
|
|
|
|
DEF_NATIVE(read_cr2, "mov %cr2, %eax");
|
|
|
|
DEF_NATIVE(write_cr3, "mov %eax, %cr3");
|
|
|
|
DEF_NATIVE(read_cr3, "mov %cr3, %eax");
|
|
|
|
DEF_NATIVE(clts, "clts");
|
|
|
|
DEF_NATIVE(read_tsc, "rdtsc");
|
2006-12-06 18:14:08 -07:00
|
|
|
|
2007-05-02 11:27:14 -06:00
|
|
|
DEF_NATIVE(ud2a, "ud2a");
|
2006-12-06 18:14:08 -07:00
|
|
|
|
|
|
|
/*
 * Patch hook for bare hardware.
 *
 * @type:     identifies the operation the site invokes; compared against
 *            PARAVIRT_PATCH(op) values
 * @clobbers: registers the site allows to be clobbered; only forwarded to
 *            paravirt_patch_default()
 * @insns:    the patch site to rewrite
 * @len:      number of bytes available at the site
 *
 * Operations that have a DEF_NATIVE() template get that template copied
 * into the site.  The page-table value conversions are patched out as
 * nops.  Everything else is delegated to paravirt_patch_default().
 *
 * Returns the value produced by whichever patch helper was selected.
 */
static unsigned native_patch(u8 type, u16 clobbers, void *insns, unsigned len)
{
	/* Same type as the start_/end_ labels and paravirt_patch_insns() args */
	const char *start, *end;
	unsigned ret;

	switch(type) {
/* Each SITE(x) is a case label that selects x's template and jumps to the copy */
#define SITE(x)	case PARAVIRT_PATCH(x):	start = start_##x; end = end_##x; goto patch_site
		SITE(irq_disable);
		SITE(irq_enable);
		SITE(restore_fl);
		SITE(save_fl);
		SITE(iret);
		SITE(irq_enable_sysexit);
		SITE(read_cr2);
		SITE(read_cr3);
		SITE(write_cr3);
		SITE(clts);
		SITE(read_tsc);
#undef SITE

	patch_site:
		ret = paravirt_patch_insns(insns, len, start, end);
		break;

	case PARAVIRT_PATCH(make_pgd):
	case PARAVIRT_PATCH(make_pte):
	case PARAVIRT_PATCH(pgd_val):
	case PARAVIRT_PATCH(pte_val):
#ifdef CONFIG_X86_PAE
	case PARAVIRT_PATCH(make_pmd):
	case PARAVIRT_PATCH(pmd_val):
#endif
		/* These functions end up returning exactly what
		   they're passed, in the same registers. */
		ret = paravirt_patch_nop();
		break;

	default:
		ret = paravirt_patch_default(type, clobbers, insns, len);
		break;
	}

	return ret;
}
|
|
|
|
|
|
|
|
/*
 * Patch helper for operations that need no instructions at all: writes
 * nothing and returns 0.
 * NOTE(review): presumably the caller pads the unused remainder of the
 * site with nops -- confirm against the patching caller.
 */
unsigned paravirt_patch_nop(void)
{
	return 0;
}
|
|
|
|
|
|
|
|
/*
 * Patch helper that leaves the site as it is.
 *
 * @len: number of bytes available at the site
 *
 * Returns @len, the same value the other helpers in this file return
 * when they decline to modify a site.
 */
unsigned paravirt_patch_ignore(unsigned len)
{
	return len;
}
|
|
|
|
|
|
|
|
/*
 * Overwrite a patch site with a 5-byte relative call to @target.
 *
 * @target:        function to call
 * @tgt_clobbers:  registers @target may clobber
 * @site:          start of the patch site
 * @site_clobbers: registers the site allows to be clobbered
 * @len:           number of bytes available at the site
 *
 * Returns 5 when the call was written.  Returns @len, leaving the site
 * untouched, when @target would clobber a register the site does not
 * allow or when the site is shorter than 5 bytes.
 */
unsigned paravirt_patch_call(void *target, u16 tgt_clobbers,
			     void *site, u16 site_clobbers,
			     unsigned len)
{
	unsigned char *call = site;
	/* Displacement is relative to the end of the 5-byte instruction */
	unsigned long delta = (unsigned long)target - (unsigned long)(call+5);
	/* The encoded displacement is exactly 32 bits wide */
	unsigned int rel32 = (unsigned int)delta;

	if (tgt_clobbers & ~site_clobbers)
		return len;	/* target would clobber too much for this site */
	if (len < 5)
		return len;	/* call too long for patch site */

	call[0] = 0xe8;	/* call */
	/*
	 * Store exactly four displacement bytes.  memcpy avoids a
	 * misaligned, type-punned store and never writes past the 5-byte
	 * instruction, whatever the width of unsigned long.
	 */
	memcpy(call + 1, &rel32, sizeof(rel32));

	return 5;
}
|
|
|
|
|
|
|
|
/*
 * Overwrite a patch site with a 5-byte relative jmp to @target.
 *
 * @target: address to jump to
 * @site:   start of the patch site
 * @len:    number of bytes available at the site
 *
 * Returns 5 when the jmp was written.  Returns @len, leaving the site
 * untouched, when the site is shorter than 5 bytes.
 */
unsigned paravirt_patch_jmp(void *target, void *site, unsigned len)
{
	unsigned char *jmp = site;
	/* Displacement is relative to the end of the 5-byte instruction */
	unsigned long delta = (unsigned long)target - (unsigned long)(jmp+5);
	/* The encoded displacement is exactly 32 bits wide */
	unsigned int rel32 = (unsigned int)delta;

	if (len < 5)
		return len;	/* jmp too long for patch site */

	jmp[0] = 0xe9;	/* jmp */
	/*
	 * Store exactly four displacement bytes.  memcpy avoids a
	 * misaligned, type-punned store and never writes past the 5-byte
	 * instruction, whatever the width of unsigned long.
	 */
	memcpy(jmp + 1, &rel32, sizeof(rel32));

	return 5;
}
|
|
|
|
|
|
|
|
/*
 * Generic patching policy for one call site.
 *
 * @type:     index of the operation; paravirt_ops is viewed as an array
 *            of pointers and @type selects the entry
 * @clobbers: registers the site allows to be clobbered
 * @site:     start of the patch site
 * @len:      number of bytes available at the site
 *
 * Chooses, in order: a ud2a when the op has no function, a nop when the
 * op is paravirt_nop, a jmp for iret and irq_enable_sysexit, and a call
 * for everything else.
 *
 * Returns the value produced by the chosen patch helper.
 */
unsigned paravirt_patch_default(u8 type, u16 clobbers, void *site, unsigned len)
{
	void *opfunc = *((void **)&paravirt_ops + type);
	unsigned ret;

	if (opfunc == NULL)
		/* If there's no function, patch it with a ud2a (BUG) */
		ret = paravirt_patch_insns(site, len, start_ud2a, end_ud2a);
	else if (opfunc == paravirt_nop)
		/* If the operation is a nop, then nop the callsite */
		ret = paravirt_patch_nop();
	else if (type == PARAVIRT_PATCH(iret) ||
		 type == PARAVIRT_PATCH(irq_enable_sysexit))
		/* If operation requires a jmp, then jmp */
		ret = paravirt_patch_jmp(opfunc, site, len);
	else
		/* Otherwise call the function; assume target could
		   clobber any caller-save reg */
		ret = paravirt_patch_call(opfunc, CLBR_ANY,
					  site, clobbers, len);

	return ret;
}
|
|
|
|
|
|
|
|
/*
 * Copy an instruction template into a patch site.
 *
 * @site:  start of the patch site
 * @len:   number of bytes available at the site
 * @start: first byte of the template, or NULL when there is none
 * @end:   one past the last byte of the template
 *
 * Returns the number of template bytes copied.  Returns @len, leaving
 * the site untouched, when there is no template or it does not fit.
 */
unsigned paravirt_patch_insns(void *site, unsigned len,
			      const char *start, const char *end)
{
	unsigned insn_len;

	/* Check for a missing template before doing pointer arithmetic on it */
	if (start == NULL)
		return len;

	insn_len = end - start;
	if (insn_len > len)
		return len;	/* template too long for patch site */

	memcpy(site, start, insn_len);

	return insn_len;
}
|
|
|
|
|
2006-12-06 18:14:07 -07:00
|
|
|
void init_IRQ(void)
|
|
|
|
{
|
|
|
|
paravirt_ops.init_IRQ();
|
|
|
|
}
|
|
|
|
|
2007-02-13 05:26:25 -07:00
|
|
|
/*
 * Thin wrapper around __native_flush_tlb(); this is the function stored
 * in paravirt_ops.flush_tlb_user.
 */
static void native_flush_tlb(void)
{
	__native_flush_tlb();
}
|
|
|
|
|
|
|
|
/*
 * Global pages have to be flushed a bit differently. Not a real
 * performance problem because this does not happen often.
 *
 * Thin wrapper around __native_flush_tlb_global(); this is the function
 * stored in paravirt_ops.flush_tlb_kernel.
 */
static void native_flush_tlb_global(void)
{
	__native_flush_tlb_global();
}
|
|
|
|
|
2007-05-02 11:27:14 -06:00
|
|
|
/*
 * Thin wrapper that passes @addr straight to __native_flush_tlb_single();
 * this is the function stored in paravirt_ops.flush_tlb_single.
 */
static void native_flush_tlb_single(unsigned long addr)
{
	__native_flush_tlb_single(addr);
}
|
|
|
|
|
2006-12-06 18:14:07 -07:00
|
|
|
/*
 * These are in entry.S.  They are only declared here so that their
 * addresses can be stored in the iret and irq_enable_sysexit slots of
 * paravirt_ops.
 */
extern void native_iret(void);
extern void native_irq_enable_sysexit(void);
|
2006-12-06 18:14:07 -07:00
|
|
|
|
|
|
|
/*
 * Initcall that runs the banner hook of the active paravirt_ops.
 * Always returns 0.
 */
static int __init print_banner(void)
{
	paravirt_ops.banner();
	return 0;
}
core_initcall(print_banner);
|
|
|
|
|
|
|
|
struct paravirt_ops paravirt_ops = {
|
|
|
|
.name = "bare hardware",
|
|
|
|
.paravirt_enabled = 0,
|
|
|
|
.kernel_rpl = 0,
|
[PATCH] i386: PARAVIRT: Allow paravirt backend to choose kernel PMD sharing
Normally when running in PAE mode, the 4th PMD maps the kernel address space,
which can be shared among all processes (since they all need the same kernel
mappings).
Xen, however, does not allow guests to have the kernel pmd shared between page
tables, so parameterize pgtable.c to allow both modes of operation.
There are several side-effects of this. One is that vmalloc will update the
kernel address space mappings, and those updates need to be propagated into
all processes if the kernel mappings are not intrinsically shared. In the
non-PAE case, this is done by maintaining a pgd_list of all processes; this
list is used when all process pagetables must be updated. pgd_list is
threaded via otherwise unused entries in the page structure for the pgd, which
means that the pgd must be page-sized for this to work.
Normally the PAE pgd is only 4x64 byte entries large, but Xen requires the PAE
pgd to page aligned anyway, so this patch forces the pgd to be page
aligned+sized when the kernel pmd is unshared, to accomodate both these
requirements.
Also, since there may be several distinct kernel pmds (if the user/kernel
split is below 3G), there's no point in allocating them from a slab cache;
they're just allocated with get_free_page and initialized appropriately. (Of
course the could be cached if there is just a single kernel pmd - which is the
default with a 3G user/kernel split - but it doesn't seem worthwhile to add
yet another case into this code).
[ Many thanks to wli for review comments. ]
Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Signed-off-by: William Lee Irwin III <wli@holomorphy.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Cc: Zachary Amsden <zach@vmware.com>
Cc: Christoph Lameter <clameter@sgi.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2007-05-02 11:27:13 -06:00
|
|
|
.shared_kernel_pmd = 1, /* Only used when CONFIG_X86_PAE is set */
|
2006-12-06 18:14:07 -07:00
|
|
|
|
2006-12-06 18:14:08 -07:00
|
|
|
.patch = native_patch,
|
2006-12-06 18:14:07 -07:00
|
|
|
.banner = default_banner,
|
2007-05-02 11:27:13 -06:00
|
|
|
.arch_setup = paravirt_nop,
|
2006-12-06 18:14:07 -07:00
|
|
|
.memory_setup = machine_specific_memory_setup,
|
|
|
|
.get_wallclock = native_get_wallclock,
|
|
|
|
.set_wallclock = native_set_wallclock,
|
2007-03-05 01:30:39 -07:00
|
|
|
.time_init = hpet_time_init,
|
2006-12-06 18:14:07 -07:00
|
|
|
.init_IRQ = native_init_IRQ,
|
|
|
|
|
|
|
|
.cpuid = native_cpuid,
|
|
|
|
.get_debugreg = native_get_debugreg,
|
|
|
|
.set_debugreg = native_set_debugreg,
|
|
|
|
.clts = native_clts,
|
|
|
|
.read_cr0 = native_read_cr0,
|
|
|
|
.write_cr0 = native_write_cr0,
|
|
|
|
.read_cr2 = native_read_cr2,
|
|
|
|
.write_cr2 = native_write_cr2,
|
|
|
|
.read_cr3 = native_read_cr3,
|
|
|
|
.write_cr3 = native_write_cr3,
|
|
|
|
.read_cr4 = native_read_cr4,
|
|
|
|
.read_cr4_safe = native_read_cr4_safe,
|
|
|
|
.write_cr4 = native_write_cr4,
|
|
|
|
.save_fl = native_save_fl,
|
|
|
|
.restore_fl = native_restore_fl,
|
|
|
|
.irq_disable = native_irq_disable,
|
|
|
|
.irq_enable = native_irq_enable,
|
|
|
|
.safe_halt = native_safe_halt,
|
|
|
|
.halt = native_halt,
|
|
|
|
.wbinvd = native_wbinvd,
|
2007-05-02 11:27:10 -06:00
|
|
|
.read_msr = native_read_msr_safe,
|
|
|
|
.write_msr = native_write_msr_safe,
|
2006-12-06 18:14:07 -07:00
|
|
|
.read_tsc = native_read_tsc,
|
|
|
|
.read_pmc = native_read_pmc,
|
2007-03-05 01:30:35 -07:00
|
|
|
.get_scheduled_cycles = native_read_tsc,
|
2007-03-05 01:30:36 -07:00
|
|
|
.get_cpu_khz = native_calculate_cpu_khz,
|
2006-12-06 18:14:07 -07:00
|
|
|
.load_tr_desc = native_load_tr_desc,
|
|
|
|
.set_ldt = native_set_ldt,
|
|
|
|
.load_gdt = native_load_gdt,
|
|
|
|
.load_idt = native_load_idt,
|
|
|
|
.store_gdt = native_store_gdt,
|
|
|
|
.store_idt = native_store_idt,
|
|
|
|
.store_tr = native_store_tr,
|
|
|
|
.load_tls = native_load_tls,
|
2007-05-02 11:27:10 -06:00
|
|
|
.write_ldt_entry = write_dt_entry,
|
|
|
|
.write_gdt_entry = write_dt_entry,
|
|
|
|
.write_idt_entry = write_dt_entry,
|
2006-12-06 18:14:07 -07:00
|
|
|
.load_esp0 = native_load_esp0,
|
|
|
|
|
|
|
|
.set_iopl_mask = native_set_iopl_mask,
|
|
|
|
.io_delay = native_io_delay,
|
|
|
|
|
2006-12-06 18:14:08 -07:00
|
|
|
#ifdef CONFIG_X86_LOCAL_APIC
|
|
|
|
.apic_write = native_apic_write,
|
|
|
|
.apic_write_atomic = native_apic_write_atomic,
|
|
|
|
.apic_read = native_apic_read,
|
2007-02-13 05:26:21 -07:00
|
|
|
.setup_boot_clock = setup_boot_APIC_clock,
|
|
|
|
.setup_secondary_clock = setup_secondary_APIC_clock,
|
2007-05-02 11:27:18 -06:00
|
|
|
.startup_ipi_hook = paravirt_nop,
|
2006-12-06 18:14:08 -07:00
|
|
|
#endif
|
2007-05-02 11:27:13 -06:00
|
|
|
.set_lazy_mode = paravirt_nop,
|
2006-12-06 18:14:08 -07:00
|
|
|
|
[PATCH] i386: PARAVIRT: Hooks to set up initial pagetable
This patch introduces paravirt_ops hooks to control how the kernel's
initial pagetable is set up.
In the case of a native boot, the very early bootstrap code creates a
simple non-PAE pagetable to map the kernel and physical memory. When
the VM subsystem is initialized, it creates a proper pagetable which
respects the PAE mode, large pages, etc.
When booting under a hypervisor, there are many possibilities for what
paging environment the hypervisor establishes for the guest kernel, so
the constructon of the kernel's pagetable depends on the hypervisor.
In the case of Xen, the hypervisor boots the kernel with a fully
constructed pagetable, which is already using PAE if necessary. Also,
Xen requires particular care when constructing pagetables to make sure
all pagetables are always mapped read-only.
In order to make this easier, kernel's initial pagetable construction
has been changed to only allocate and initialize a pagetable page if
there's no page already present in the pagetable. This allows the Xen
paravirt backend to make a copy of the hypervisor-provided pagetable,
allowing the kernel to establish any more mappings it needs while
keeping the existing ones.
A slightly subtle point which is worth highlighting here is that Xen
requires all kernel mappings to share the same pte_t pages between all
pagetables, so that updating a kernel page's mapping in one pagetable
is reflected in all other pagetables. This makes it possible to
allocate a page and attach it to a pagetable without having to
explicitly enumerate that page's mapping in all pagetables.
And:
+From: "Eric W. Biederman" <ebiederm@xmission.com>
If we don't set the leaf page table entries it is quite possible that
will inherit and incorrect page table entry from the initial boot
page table setup in head.S. So we need to redo the effort here,
so we pick up PSE, PGE and the like.
Hypervisors like Xen require that their page tables be read-only,
which is slightly incompatible with our low identity mappings, however
I discussed this with Jeremy he has modified the Xen early set_pte
function to avoid problems in this area.
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Jeremy Fitzhardinge <jeremy@xensource.com>
Signed-off-by: Andi Kleen <ak@suse.de>
Acked-by: William Irwin <bill.irwin@oracle.com>
Cc: Ingo Molnar <mingo@elte.hu>
2007-05-02 11:27:13 -06:00
|
|
|
.pagetable_setup_start = native_pagetable_setup_start,
|
|
|
|
.pagetable_setup_done = native_pagetable_setup_done,
|
|
|
|
|
2006-12-06 18:14:08 -07:00
|
|
|
.flush_tlb_user = native_flush_tlb,
|
|
|
|
.flush_tlb_kernel = native_flush_tlb_global,
|
|
|
|
.flush_tlb_single = native_flush_tlb_single,
|
2007-05-02 11:27:15 -06:00
|
|
|
.flush_tlb_others = native_flush_tlb_others,
|
2006-12-06 18:14:08 -07:00
|
|
|
|
2007-05-02 11:27:13 -06:00
|
|
|
.alloc_pt = paravirt_nop,
|
|
|
|
.alloc_pd = paravirt_nop,
|
|
|
|
.alloc_pd_clone = paravirt_nop,
|
|
|
|
.release_pt = paravirt_nop,
|
|
|
|
.release_pd = paravirt_nop,
|
2007-02-13 05:26:21 -07:00
|
|
|
|
2006-12-06 18:14:08 -07:00
|
|
|
.set_pte = native_set_pte,
|
|
|
|
.set_pte_at = native_set_pte_at,
|
|
|
|
.set_pmd = native_set_pmd,
|
2007-05-02 11:27:13 -06:00
|
|
|
.pte_update = paravirt_nop,
|
|
|
|
.pte_update_defer = paravirt_nop,
|
2007-05-02 11:27:13 -06:00
|
|
|
|
2007-05-02 11:27:15 -06:00
|
|
|
#ifdef CONFIG_HIGHPTE
|
|
|
|
.kmap_atomic_pte = kmap_atomic,
|
|
|
|
#endif
|
|
|
|
|
2006-12-06 18:14:08 -07:00
|
|
|
#ifdef CONFIG_X86_PAE
|
|
|
|
.set_pte_atomic = native_set_pte_atomic,
|
|
|
|
.set_pte_present = native_set_pte_present,
|
|
|
|
.set_pud = native_set_pud,
|
|
|
|
.pte_clear = native_pte_clear,
|
|
|
|
.pmd_clear = native_pmd_clear,
|
2007-05-02 11:27:13 -06:00
|
|
|
|
|
|
|
.pmd_val = native_pmd_val,
|
|
|
|
.make_pmd = native_make_pmd,
|
2006-12-06 18:14:08 -07:00
|
|
|
#endif
|
|
|
|
|
2007-05-02 11:27:13 -06:00
|
|
|
.pte_val = native_pte_val,
|
|
|
|
.pgd_val = native_pgd_val,
|
|
|
|
|
|
|
|
.make_pte = native_make_pte,
|
|
|
|
.make_pgd = native_make_pgd,
|
|
|
|
|
2006-12-06 18:14:07 -07:00
|
|
|
.irq_enable_sysexit = native_irq_enable_sysexit,
|
|
|
|
.iret = native_iret,
|
2007-02-13 05:26:21 -07:00
|
|
|
|
2007-05-02 11:27:14 -06:00
|
|
|
.dup_mmap = paravirt_nop,
|
|
|
|
.exit_mmap = paravirt_nop,
|
|
|
|
.activate_mm = paravirt_nop,
|
2006-12-06 18:14:07 -07:00
|
|
|
};
|
2007-01-22 21:40:36 -07:00
|
|
|
|
2007-05-02 11:27:17 -06:00
|
|
|
EXPORT_SYMBOL(paravirt_ops);
|