Merge branch 'stable/bug-fixes-for-rc5' of git://git.kernel.org/pub/scm/linux/kernel/git/konrad/xen

* 'stable/bug-fixes-for-rc5' of git://git.kernel.org/pub/scm/linux/kernel/git/konrad/xen:
  xen: mask_rw_pte mark RO all pagetable pages up to pgt_buf_top
  xen/mmu: Add workaround "x86-64, mm: Put early page table high"
This commit is contained in:
Linus Torvalds 2011-05-03 09:25:42 -07:00
commit 609cfda586

View file

@ -1463,6 +1463,119 @@ static int xen_pgd_alloc(struct mm_struct *mm)
return ret;
}
#ifdef CONFIG_X86_64
static __initdata u64 __last_pgt_set_rw = 0;
static __initdata u64 __pgt_buf_start = 0;
static __initdata u64 __pgt_buf_end = 0;
static __initdata u64 __pgt_buf_top = 0;
/*
* As a consequence of the commit:
*
* commit 4b239f458c229de044d6905c2b0f9fe16ed9e01e
* Author: Yinghai Lu <yinghai@kernel.org>
* Date: Fri Dec 17 16:58:28 2010 -0800
*
* x86-64, mm: Put early page table high
*
* at some point init_memory_mapping is going to reach the pagetable pages
* area and map those pages too (mapping them as normal memory that falls
* in the range of addresses passed to init_memory_mapping as argument).
* Some of those pages are already pagetable pages (they are in the range
* pgt_buf_start-pgt_buf_end) therefore they are going to be mapped RO and
* everything is fine.
* Some of these pages are not pagetable pages yet (they fall in the range
* pgt_buf_end-pgt_buf_top; for example the page at pgt_buf_end) so they
* are going to be mapped RW. When these pages become pagetable pages and
* are hooked into the pagetable, xen will find that the guest has already
* a RW mapping of them somewhere and fail the operation.
* The reason Xen requires pagetables to be RO is that the hypervisor needs
* to verify that the pagetables are valid before using them. The validation
* operations are called "pinning".
*
* In order to fix the issue we mark all the pages in the entire range
* pgt_buf_start-pgt_buf_top as RO, however when the pagetable allocation
* is completed only the range pgt_buf_start-pgt_buf_end is reserved by
* init_memory_mapping. Hence the kernel is going to crash as soon as one
* of the pages in the range pgt_buf_end-pgt_buf_top is reused (b/c those
* ranges are RO).
*
* For this reason, 'mark_rw_past_pgt' is introduced which is called _after_
* the init_memory_mapping has completed (in a perfect world we would
* call this function from init_memory_mapping, but lets ignore that).
*
* Because we are called _after_ init_memory_mapping the pgt_buf_[start,
* end,top] have all changed to new values (b/c init_memory_mapping
* is called and setting up another new page-table). Hence, the first time
* we enter this function, we save away the pgt_buf_start value and update
* the pgt_buf_[end,top].
*
* When we detect that the "old" pgt_buf_start through pgt_buf_end
* PFNs have been reserved (so memblock_x86_reserve_range has been called),
* we immediately set out to RW the "old" pgt_buf_end through pgt_buf_top.
*
* And then we update those "old" pgt_buf_[end|top] with the new ones
* so that we can redo this on the next pagetable.
*/
static __init void mark_rw_past_pgt(void) {
if (pgt_buf_end > pgt_buf_start) {
u64 addr, size;
/* Save it away. */
if (!__pgt_buf_start) {
__pgt_buf_start = pgt_buf_start;
__pgt_buf_end = pgt_buf_end;
__pgt_buf_top = pgt_buf_top;
return;
}
/* If we get the range that starts at __pgt_buf_end that means
* the range is reserved, and that in 'init_memory_mapping'
* the 'memblock_x86_reserve_range' has been called with the
* outdated __pgt_buf_start, __pgt_buf_end (the "new"
* pgt_buf_[start|end|top] refer now to a new pagetable.
* Note: we are called _after_ the pgt_buf_[..] have been
* updated.*/
addr = memblock_x86_find_in_range_size(PFN_PHYS(__pgt_buf_start),
&size, PAGE_SIZE);
/* Still not reserved, meaning 'memblock_x86_reserve_range'
* hasn't been called yet. Update the _end and _top.*/
if (addr == PFN_PHYS(__pgt_buf_start)) {
__pgt_buf_end = pgt_buf_end;
__pgt_buf_top = pgt_buf_top;
return;
}
/* OK, the area is reserved, meaning it is time for us to
* set RW for the old end->top PFNs. */
/* ..unless we had already done this. */
if (__pgt_buf_end == __last_pgt_set_rw)
return;
addr = PFN_PHYS(__pgt_buf_end);
/* set as RW the rest */
printk(KERN_DEBUG "xen: setting RW the range %llx - %llx\n",
PFN_PHYS(__pgt_buf_end), PFN_PHYS(__pgt_buf_top));
while (addr < PFN_PHYS(__pgt_buf_top)) {
make_lowmem_page_readwrite(__va(addr));
addr += PAGE_SIZE;
}
/* And update everything so that we are ready for the next
* pagetable (the one created for regions past 4GB) */
__last_pgt_set_rw = __pgt_buf_end;
__pgt_buf_start = pgt_buf_start;
__pgt_buf_end = pgt_buf_end;
__pgt_buf_top = pgt_buf_top;
}
return;
}
#else
static __init void mark_rw_past_pgt(void) { }
#endif
static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
{
#ifdef CONFIG_X86_64
@ -1488,6 +1601,14 @@ static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
{
unsigned long pfn = pte_pfn(pte);
/*
* A bit of optimization. We do not need to call the workaround
* when xen_set_pte_init is called with a PTE with 0 as PFN.
* That is b/c the pagetable at that point are just being populated
* with empty values and we can save some cycles by not calling
* the 'memblock' code.*/
if (pfn)
mark_rw_past_pgt();
/*
* If the new pfn is within the range of the newly allocated
* kernel pagetable, and it isn't being mapped into an
@ -1495,7 +1616,7 @@ static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
* it is RO.
*/
if (((!is_early_ioremap_ptep(ptep) &&
pfn >= pgt_buf_start && pfn < pgt_buf_end)) ||
pfn >= pgt_buf_start && pfn < pgt_buf_top)) ||
(is_early_ioremap_ptep(ptep) && pfn != (pgt_buf_end - 1)))
pte = pte_wrprotect(pte);
@ -1997,6 +2118,8 @@ __init void xen_ident_map_ISA(void)
static __init void xen_post_allocator_init(void)
{
mark_rw_past_pgt();
#ifdef CONFIG_XEN_DEBUG
pv_mmu_ops.make_pte = PV_CALLEE_SAVE(xen_make_pte_debug);
#endif