lguest: send trap 13 through to userspace.
We copy 7 bytes at eip for userspace's instruction decode; we have to carefully handle the case where eip is at the end of a page. We can't leave this to userspace since the kernel has all the page table decode logic. The decode logic moves to userspace, basically unchanged. Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
This commit is contained in:
parent
c9e433e4b8
commit
c565650b10
2 changed files with 190 additions and 88 deletions
|
@ -314,95 +314,52 @@ void lguest_arch_run_guest(struct lg_cpu *cpu)
|
||||||
* usually attached to a PC.
|
* usually attached to a PC.
|
||||||
*
|
*
|
||||||
* When the Guest uses one of these instructions, we get a trap (General
|
* When the Guest uses one of these instructions, we get a trap (General
|
||||||
* Protection Fault) and come here. We see if it's one of those troublesome
|
* Protection Fault) and come here. We queue this to be sent out to the
|
||||||
* instructions and skip over it. We return true if we did.
|
* Launcher to handle.
|
||||||
*/
|
*/
|
||||||
static int emulate_insn(struct lg_cpu *cpu)
|
|
||||||
|
/*
|
||||||
|
* The eip contains the *virtual* address of the Guest's instruction:
|
||||||
|
* we copy the instruction here so the Launcher doesn't have to walk
|
||||||
|
* the page tables to decode it. We handle the case (eg. in a kernel
|
||||||
|
* module) where the instruction is over two pages, and the pages are
|
||||||
|
* virtually but not physically contiguous.
|
||||||
|
*
|
||||||
|
* The longest possible x86 instruction is 15 bytes, but we don't handle
|
||||||
|
* anything that strange.
|
||||||
|
*/
|
||||||
|
static void copy_from_guest(struct lg_cpu *cpu,
|
||||||
|
void *dst, unsigned long vaddr, size_t len)
|
||||||
{
|
{
|
||||||
u8 insn;
|
size_t to_page_end = PAGE_SIZE - (vaddr % PAGE_SIZE);
|
||||||
unsigned int insnlen = 0, in = 0, small_operand = 0;
|
unsigned long paddr;
|
||||||
/*
|
|
||||||
* The eip contains the *virtual* address of the Guest's instruction:
|
|
||||||
* walk the Guest's page tables to find the "physical" address.
|
|
||||||
*/
|
|
||||||
unsigned long physaddr = guest_pa(cpu, cpu->regs->eip);
|
|
||||||
|
|
||||||
/*
|
BUG_ON(len > PAGE_SIZE);
|
||||||
* This must be the Guest kernel trying to do something, not userspace!
|
|
||||||
* The bottom two bits of the CS segment register are the privilege
|
|
||||||
* level.
|
|
||||||
*/
|
|
||||||
if ((cpu->regs->cs & 3) != GUEST_PL)
|
|
||||||
return 0;
|
|
||||||
|
|
||||||
/* Decoding x86 instructions is icky. */
|
/* If it goes over a page, copy in two parts. */
|
||||||
insn = lgread(cpu, physaddr, u8);
|
if (len > to_page_end) {
|
||||||
|
/* But make sure the next page is mapped! */
|
||||||
/*
|
if (__guest_pa(cpu, vaddr + to_page_end, &paddr))
|
||||||
* Around 2.6.33, the kernel started using an emulation for the
|
copy_from_guest(cpu, dst + to_page_end,
|
||||||
* cmpxchg8b instruction in early boot on many configurations. This
|
vaddr + to_page_end,
|
||||||
* code isn't paravirtualized, and it tries to disable interrupts.
|
len - to_page_end);
|
||||||
* Ignore it, which will Mostly Work.
|
else
|
||||||
*/
|
/* Otherwise fill with zeroes. */
|
||||||
if (insn == 0xfa) {
|
memset(dst + to_page_end, 0, len - to_page_end);
|
||||||
/* "cli", or Clear Interrupt Enable instruction. Skip it. */
|
len = to_page_end;
|
||||||
cpu->regs->eip++;
|
|
||||||
return 1;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/* This will kill the guest if it isn't mapped, but that
|
||||||
* 0x66 is an "operand prefix". It means a 16, not 32 bit in/out.
|
* shouldn't happen. */
|
||||||
*/
|
__lgread(cpu, dst, guest_pa(cpu, vaddr), len);
|
||||||
if (insn == 0x66) {
|
}
|
||||||
small_operand = 1;
|
|
||||||
/* The instruction is 1 byte so far, read the next byte. */
|
|
||||||
insnlen = 1;
|
|
||||||
insn = lgread(cpu, physaddr + insnlen, u8);
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
|
||||||
* We can ignore the lower bit for the moment and decode the 4 opcodes
|
|
||||||
* we need to emulate.
|
|
||||||
*/
|
|
||||||
switch (insn & 0xFE) {
|
|
||||||
case 0xE4: /* in <next byte>,%al */
|
|
||||||
insnlen += 2;
|
|
||||||
in = 1;
|
|
||||||
break;
|
|
||||||
case 0xEC: /* in (%dx),%al */
|
|
||||||
insnlen += 1;
|
|
||||||
in = 1;
|
|
||||||
break;
|
|
||||||
case 0xE6: /* out %al,<next byte> */
|
|
||||||
insnlen += 2;
|
|
||||||
break;
|
|
||||||
case 0xEE: /* out %al,(%dx) */
|
|
||||||
insnlen += 1;
|
|
||||||
break;
|
|
||||||
default:
|
|
||||||
/* OK, we don't know what this is, can't emulate. */
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
static void setup_emulate_insn(struct lg_cpu *cpu)
|
||||||
* If it was an "IN" instruction, they expect the result to be read
|
{
|
||||||
* into %eax, so we change %eax. We always return all-ones, which
|
cpu->pending.trap = 13;
|
||||||
* traditionally means "there's nothing there".
|
copy_from_guest(cpu, cpu->pending.insn, cpu->regs->eip,
|
||||||
*/
|
sizeof(cpu->pending.insn));
|
||||||
if (in) {
|
|
||||||
/* Lower bit set means it's a 32/16 bit access */
|
|
||||||
if (insn & 0x1) {
|
|
||||||
if (small_operand)
|
|
||||||
cpu->regs->eax |= 0xFFFF;
|
|
||||||
else
|
|
||||||
cpu->regs->eax = 0xFFFFFFFF;
|
|
||||||
} else
|
|
||||||
cpu->regs->eax |= 0xFF;
|
|
||||||
}
|
|
||||||
/* Finally, we've "done" the instruction, so move past it. */
|
|
||||||
cpu->regs->eip += insnlen;
|
|
||||||
/* Success! */
|
|
||||||
return 1;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/*H:050 Once we've re-enabled interrupts, we look at why the Guest exited. */
|
/*H:050 Once we've re-enabled interrupts, we look at why the Guest exited. */
|
||||||
|
@ -410,14 +367,10 @@ void lguest_arch_handle_trap(struct lg_cpu *cpu)
|
||||||
{
|
{
|
||||||
switch (cpu->regs->trapnum) {
|
switch (cpu->regs->trapnum) {
|
||||||
case 13: /* We've intercepted a General Protection Fault. */
|
case 13: /* We've intercepted a General Protection Fault. */
|
||||||
/*
|
/* Hand to Launcher to emulate those pesky IN and OUT insns */
|
||||||
* Check if this was one of those annoying IN or OUT
|
|
||||||
* instructions which we need to emulate. If so, we just go
|
|
||||||
* back into the Guest after we've done it.
|
|
||||||
*/
|
|
||||||
if (cpu->regs->errcode == 0) {
|
if (cpu->regs->errcode == 0) {
|
||||||
if (emulate_insn(cpu))
|
setup_emulate_insn(cpu);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
case 14: /* We've intercepted a Page Fault. */
|
case 14: /* We've intercepted a Page Fault. */
|
||||||
|
|
|
@ -41,6 +41,7 @@
|
||||||
#include <signal.h>
|
#include <signal.h>
|
||||||
#include <pwd.h>
|
#include <pwd.h>
|
||||||
#include <grp.h>
|
#include <grp.h>
|
||||||
|
#include <sys/user.h>
|
||||||
|
|
||||||
#ifndef VIRTIO_F_ANY_LAYOUT
|
#ifndef VIRTIO_F_ANY_LAYOUT
|
||||||
#define VIRTIO_F_ANY_LAYOUT 27
|
#define VIRTIO_F_ANY_LAYOUT 27
|
||||||
|
@ -1143,6 +1144,150 @@ static void handle_output(unsigned long addr)
|
||||||
strnlen(from_guest_phys(addr), guest_limit - addr));
|
strnlen(from_guest_phys(addr), guest_limit - addr));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*L:216
|
||||||
|
* This is where we emulate a handful of Guest instructions. It's ugly
|
||||||
|
* and we used to do it in the kernel but it grew over time.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/*
|
||||||
|
* We use the ptrace syscall's pt_regs struct to talk about registers
|
||||||
|
* to lguest: these macros convert the names to the offsets.
|
||||||
|
*/
|
||||||
|
#define getreg(name) getreg_off(offsetof(struct user_regs_struct, name))
|
||||||
|
#define setreg(name, val) \
|
||||||
|
setreg_off(offsetof(struct user_regs_struct, name), (val))
|
||||||
|
|
||||||
|
static u32 getreg_off(size_t offset)
|
||||||
|
{
|
||||||
|
u32 r;
|
||||||
|
unsigned long args[] = { LHREQ_GETREG, offset };
|
||||||
|
|
||||||
|
if (pwrite(lguest_fd, args, sizeof(args), cpu_id) < 0)
|
||||||
|
err(1, "Getting register %u", offset);
|
||||||
|
if (pread(lguest_fd, &r, sizeof(r), cpu_id) != sizeof(r))
|
||||||
|
err(1, "Reading register %u", offset);
|
||||||
|
|
||||||
|
return r;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
 * Store a new value into one Guest register through the lguest file
 * descriptor.
 *
 * @offset: byte offset of the target register; callers normally produce it
 *          with the setreg() macro, i.e. offsetof(struct user_regs_struct, ...).
 * @val:    the 32-bit value to store.
 *
 * The request { LHREQ_SETREG, offset, val } is written to lguest_fd at file
 * position cpu_id.  A failed write is reported with err(), so this function
 * does not return on error.
 */
static void setreg_off(size_t offset, u32 val)
{
	unsigned long args[] = { LHREQ_SETREG, offset, val };

	/* offset is a size_t, so it must be printed with %zu, not %u. */
	if (pwrite(lguest_fd, args, sizeof(args), cpu_id) < 0)
		err(1, "Setting register %zu", offset);
}
|
||||||
|
|
||||||
|
static void emulate_insn(const u8 insn[])
|
||||||
|
{
|
||||||
|
unsigned long args[] = { LHREQ_TRAP, 13 };
|
||||||
|
unsigned int insnlen = 0, in = 0, small_operand = 0, byte_access;
|
||||||
|
unsigned int eax, port, mask;
|
||||||
|
/*
|
||||||
|
* We always return all-ones on IO port reads, which traditionally
|
||||||
|
* means "there's nothing there".
|
||||||
|
*/
|
||||||
|
u32 val = 0xFFFFFFFF;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* This must be the Guest kernel trying to do something, not userspace!
|
||||||
|
* The bottom two bits of the CS segment register are the privilege
|
||||||
|
* level.
|
||||||
|
*/
|
||||||
|
if ((getreg(xcs) & 3) != 0x1)
|
||||||
|
goto no_emulate;
|
||||||
|
|
||||||
|
/* Decoding x86 instructions is icky. */
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Around 2.6.33, the kernel started using an emulation for the
|
||||||
|
* cmpxchg8b instruction in early boot on many configurations. This
|
||||||
|
* code isn't paravirtualized, and it tries to disable interrupts.
|
||||||
|
* Ignore it, which will Mostly Work.
|
||||||
|
*/
|
||||||
|
if (insn[insnlen] == 0xfa) {
|
||||||
|
/* "cli", or Clear Interrupt Enable instruction. Skip it. */
|
||||||
|
insnlen = 1;
|
||||||
|
goto skip_insn;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* 0x66 is an "operand prefix". It means a 16, not 32 bit in/out.
|
||||||
|
*/
|
||||||
|
if (insn[insnlen] == 0x66) {
|
||||||
|
small_operand = 1;
|
||||||
|
/* The instruction is 1 byte so far, read the next byte. */
|
||||||
|
insnlen = 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* If the lower bit isn't set, it's a single byte access */
|
||||||
|
byte_access = !(insn[insnlen] & 1);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Now we can ignore the lower bit and decode the 4 opcodes
|
||||||
|
* we need to emulate.
|
||||||
|
*/
|
||||||
|
switch (insn[insnlen] & 0xFE) {
|
||||||
|
case 0xE4: /* in <next byte>,%al */
|
||||||
|
port = insn[insnlen+1];
|
||||||
|
insnlen += 2;
|
||||||
|
in = 1;
|
||||||
|
break;
|
||||||
|
case 0xEC: /* in (%dx),%al */
|
||||||
|
port = getreg(edx) & 0xFFFF;
|
||||||
|
insnlen += 1;
|
||||||
|
in = 1;
|
||||||
|
break;
|
||||||
|
case 0xE6: /* out %al,<next byte> */
|
||||||
|
port = insn[insnlen+1];
|
||||||
|
insnlen += 2;
|
||||||
|
break;
|
||||||
|
case 0xEE: /* out %al,(%dx) */
|
||||||
|
port = getreg(edx) & 0xFFFF;
|
||||||
|
insnlen += 1;
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
/* OK, we don't know what this is, can't emulate. */
|
||||||
|
goto no_emulate;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Set a mask of the 1, 2 or 4 bytes, depending on size of IO */
|
||||||
|
if (byte_access)
|
||||||
|
mask = 0xFF;
|
||||||
|
else if (small_operand)
|
||||||
|
mask = 0xFFFF;
|
||||||
|
else
|
||||||
|
mask = 0xFFFFFFFF;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* If it was an "IN" instruction, they expect the result to be read
|
||||||
|
* into %eax, so we change %eax.
|
||||||
|
*/
|
||||||
|
eax = getreg(eax);
|
||||||
|
|
||||||
|
if (in) {
|
||||||
|
/* Clear the bits we're about to read */
|
||||||
|
eax &= ~mask;
|
||||||
|
/* Copy bits in from val. */
|
||||||
|
eax |= val & mask;
|
||||||
|
/* Now update the register. */
|
||||||
|
setreg(eax, eax);
|
||||||
|
}
|
||||||
|
|
||||||
|
verbose("IO %s of %x to %u: %#08x\n",
|
||||||
|
in ? "IN" : "OUT", mask, port, eax);
|
||||||
|
skip_insn:
|
||||||
|
/* Finally, we've "done" the instruction, so move past it. */
|
||||||
|
setreg(eip, getreg(eip) + insnlen);
|
||||||
|
return;
|
||||||
|
|
||||||
|
no_emulate:
|
||||||
|
/* Inject trap into Guest. */
|
||||||
|
if (write(lguest_fd, args, sizeof(args)) < 0)
|
||||||
|
err(1, "Reinjecting trap 13 for fault at %#x", getreg(eip));
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/*L:190
|
/*L:190
|
||||||
* Device Setup
|
* Device Setup
|
||||||
*
|
*
|
||||||
|
@ -1832,6 +1977,10 @@ static void __attribute__((noreturn)) run_guest(void)
|
||||||
verbose("Notify on address %#08x\n",
|
verbose("Notify on address %#08x\n",
|
||||||
notify.addr);
|
notify.addr);
|
||||||
handle_output(notify.addr);
|
handle_output(notify.addr);
|
||||||
|
} else if (notify.trap == 13) {
|
||||||
|
verbose("Emulating instruction at %#x\n",
|
||||||
|
getreg(eip));
|
||||||
|
emulate_insn(notify.insn);
|
||||||
} else
|
} else
|
||||||
errx(1, "Unknown trap %i addr %#08x\n",
|
errx(1, "Unknown trap %i addr %#08x\n",
|
||||||
notify.trap, notify.addr);
|
notify.trap, notify.addr);
|
||||||
|
|
Loading…
Reference in a new issue