/******************************************************************************
* xen/mm/hypervisor.c
*
* Update page tables via the hypervisor.
*
* Copyright (c) 2002, K A Fraser
*/
#ifdef linux
#include <linux/config.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/vmalloc.h>
#include <asm/hypervisor.h>
#include <asm/hypervisor-ifs/dom_mem_ops.h>
#include <asm/page.h>
#include <asm/pgtable.h>
#include <asm/multicall.h>
#endif
#include "u.h"
#include "../port/lib.h"
#include "mem.h"
#include "dat.h"
#include "fns.h"
#include "io.h"
#include "../xen/xen.h"
#define LOG(a)
/* wmb is a write memory barrier. On a uniprocessor guest it can be a no-op;
 * it is kept here as a marker in case we hit trouble that a real barrier would cure.
 */
#define wmb()
#define physpfn_to_mfn(a) (xen_mm_mfn((void *) (a)))
#define xen_pa_to_ma(a) (xen_va_to_ma((void *)(a)))
/*
* This suffices to protect us if we ever move to SMP domains.
* Further, it protects us against interrupts. At the very least, this is
* required for the network driver which flushes the update queue before
* pushing new receive buffers.
*/
static Lock update_lock;
//static spinlock_t update_lock = SPIN_LOCK_UNLOCKED;
#define QUEUE_SIZE 2048
static mmu_update_t update_queue[QUEUE_SIZE];
unsigned int mmu_update_queue_idx = 0;
#define idx mmu_update_queue_idx
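/*
 * Batched page-table update queue: each queue_* routine below appends one
 * mmu_update_t request under update_lock, and increment_index() flushes the
 * whole batch to Xen via HYPERVISOR_mmu_update() when the queue fills.
 * Callers that need the updates visible immediately call
 * _flush_page_update_queue() themselves.
 */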
#undef MMU_UPDATE_DEBUG
#undef MMU_UPDATE_DEBUG_RECORD
#ifdef MMU_UPDATE_DEBUG_RECORD
/* page_update_debug_t isn't defined anywhere, so this stays commented out:
page_update_debug_t update_debug_queue[QUEUE_SIZE] = {{0}};
*/
#undef queue_l1_entry_update
#undef queue_l2_entry_update
/* Plan 9 doesn't go overboard with the pte_t stuff. We're in the arch
 * directory here; PTEs are unsigned longs and that's that.
 */
static void DEBUG_allow_pt_reads(void)
{
unsigned long *pte;
mmu_update_t update;
int i;
for ( i = idx-1; i >= 0; i-- )
{
pte = update_debug_queue[i].ptep;
if ( pte == NULL ) continue;
update_debug_queue[i].ptep = NULL;
update.ptr = xen_va_to_ma(pte);
update.val = update_debug_queue[i].pteval;
HYPERVISOR_mmu_update(&update, 1, NULL);
}
}
static void DEBUG_disallow_pt_read(unsigned long va)
{
ulong *pte;
unsigned long pteval;
Mach *mach0 = (Mach *) MACHADDR;
/*
* We may fault because of an already outstanding update.
* That's okay -- it'll get fixed up in the fault handler.
*/
mmu_update_t update;
pte = mmuwalk(mach0->pdb, va, 2, 0);
update.ptr = xen_va_to_ma(pte);
pteval = *(unsigned long *)pte;
update.val = pteval & ~_PAGE_PRESENT;
HYPERVISOR_mmu_update(&update, 1, NULL);
update_debug_queue[idx].ptep = pte;
update_debug_queue[idx].pteval = pteval;
}
#endif
#ifdef MMU_UPDATE_DEBUG_RECORD
#undef queue_pt_switch
#undef queue_tlb_flush
#undef queue_invlpg
#undef queue_pgd_pin
#undef queue_pgd_unpin
#undef queue_pte_pin
#undef queue_pte_unpin
#endif
#ifdef NOTYET
/*
* MULTICALL_flush_page_update_queue:
* This is a version of the flush which queues as part of a multicall.
*/
void MULTICALL_flush_page_update_queue(void)
{
unsigned int _idx;
ilock(&update_lock);
if ( (_idx = idx) != 0 )
{
#ifdef MMU_UPDATE_DEBUG
dp("Flushing %d entries from pt update queue\n", idx);
#endif
#ifdef MMU_UPDATE_DEBUG_RECORD
DEBUG_allow_pt_reads();
#endif
idx = 0;
wmb(); /* Make sure index is cleared first to avoid double updates. */
queue_multicall3(__HYPERVISOR_mmu_update,
(unsigned long)update_queue,
(unsigned long)_idx,
(unsigned long)0);
}
iunlock(&update_lock);
}
#endif
static void __flush_page_update_queue(void)
{
unsigned int _idx = idx;
#ifdef MMU_UPDATE_DEBUG
dp("Flushing %d entries from pt update queue\n", idx);
#endif
#ifdef MMU_UPDATE_DEBUG_RECORD
DEBUG_allow_pt_reads();
#endif
idx = 0;
wmb(); /* Make sure index is cleared first to avoid double updates. */
if ((HYPERVISOR_mmu_update(update_queue, _idx, 0) < 0) )
panic("Failed to execute MMU updates");
}
void _flush_page_update_queue(void)
{
ilock(&update_lock);
if ( idx != 0 ) __flush_page_update_queue();
iunlock(&update_lock);
}
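/*
 * Illustrative sketch only (pteslot and pa are hypothetical names, not from
 * this file): a caller hands queue_l1_entry_update the PTE slot to rewrite
 * and the physical address of the target page plus permission bits, then
 * flushes explicitly when the mapping must take effect:
 *
 *	queue_l1_entry_update(pteslot, pa|PTEVALID|PTEWRITE);
 *	_flush_page_update_queue();
 */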
static void increment_index(void)
{
idx++;
if ((idx == QUEUE_SIZE) ) __flush_page_update_queue();
}
/* the 'pval' here is a physical address (pfn<<PGSHIFT) plus permission bits; it
 * must be turned into the corresponding machine address before Xen will accept it. */
void queue_l1_entry_update(unsigned long *pteptr, unsigned long pval)
{
unsigned long mval;
ilock(&update_lock);
#ifdef MMU_UPDATE_DEBUG_RECORD
DEBUG_disallow_pt_read((unsigned long)pteptr);
#endif
mval = xen_pa_to_ma(pval);
// dp("ql1ue: P 0x%ulx xmfn 0x%ulx mval 0x%ulx\n",
// PADDR(pteptr), xen_va_to_ma(pteptr), mval);
update_queue[idx].ptr = xen_va_to_ma(pteptr);
update_queue[idx].val = mval;
increment_index();
iunlock(&update_lock);
}
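/*
 * set_va_mfn bypasses the update queue: HYPERVISOR_update_va_mapping rewrites
 * the leaf PTE for the page containing va directly, and the UVMF_INVLPG flag
 * makes Xen invalidate the stale TLB entry in the same hypercall.
 */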
int set_va_mfn(void *va, unsigned long mfn, unsigned long perm)
{
unsigned long *pte;
Mach *mach0 = (Mach *) MACHADDR;
// dp("set_va_mfn: mach0 is %p\n", mach0);
// dp("Try to mmuwalk ... probably will fail\n");
pte = mmuwalk(mach0->pdb, (unsigned long) va, 2, 0);
// dp("pte for %p is %p\n", va, pte);
if (! pte)
return -1;
// dp("queue request for va %p to be 0x%ulx\n",
// (va), mfn<<PGSHIFT|perm);
HYPERVISOR_update_va_mapping(((unsigned long) va)>>PGSHIFT,
(mfn<<PGSHIFT)|perm, UVMF_INVLPG);
// queue_l1_entry_update(pte, pfn|perm);
// dp("Flush update queue\n");
// __flush_page_update_queue();
// dp("set_va_mfn: done\n");
return 0;
}
void queue_l2_entry_update(unsigned long *ptr, unsigned long val)
{
ilock(&update_lock);
update_queue[idx].ptr = xen_va_to_ma(ptr);
update_queue[idx].val = val;
increment_index();
iunlock(&update_lock);
}
void queue_pt_switch(unsigned long ptr)
{
ilock(&update_lock);
update_queue[idx].ptr = xen_pa_to_ma(ptr);
update_queue[idx].ptr |= MMU_EXTENDED_COMMAND;
update_queue[idx].val = MMUEXT_NEW_BASEPTR;
increment_index();
iunlock(&update_lock);
}
void queue_tlb_flush(void)
{
ilock(&update_lock);
update_queue[idx].ptr = MMU_EXTENDED_COMMAND;
update_queue[idx].val = MMUEXT_TLB_FLUSH;
increment_index();
iunlock(&update_lock);
}
void queue_invlpg(unsigned long ptr)
{
ilock(&update_lock);
update_queue[idx].ptr = MMU_EXTENDED_COMMAND;
update_queue[idx].ptr |= PPN(ptr);
update_queue[idx].val = MMUEXT_INVLPG;
increment_index();
iunlock(&update_lock);
}
void queue_pgd_pin(unsigned long *ptr)
{
ilock(&update_lock);
update_queue[idx].ptr = xen_va_to_ma(ptr);
update_queue[idx].ptr |= MMU_EXTENDED_COMMAND;
update_queue[idx].val = MMUEXT_PIN_L2_TABLE;
increment_index();
iunlock(&update_lock);
}
void queue_pgd_unpin(unsigned long *ptr)
{
ilock(&update_lock);
update_queue[idx].ptr = xen_va_to_ma(ptr);
update_queue[idx].ptr |= MMU_EXTENDED_COMMAND;
update_queue[idx].val = MMUEXT_UNPIN_TABLE;
increment_index();
iunlock(&update_lock);
}
/* these two should probably take a VA, not a PA, but they're not even used! */
void queue_pte_pin(unsigned long ptr)
{
ilock(&update_lock);
update_queue[idx].ptr = xen_pa_to_ma(ptr);
update_queue[idx].ptr |= MMU_EXTENDED_COMMAND;
update_queue[idx].val = MMUEXT_PIN_L1_TABLE;
increment_index();
iunlock(&update_lock);
}
void queue_pte_unpin(unsigned long ptr)
{
ilock(&update_lock);
update_queue[idx].ptr = xen_pa_to_ma(ptr);
update_queue[idx].ptr |= MMU_EXTENDED_COMMAND;
update_queue[idx].val = MMUEXT_UNPIN_TABLE;
increment_index();
iunlock(&update_lock);
}
void queue_set_ldt(unsigned long ptr, unsigned long len)
{
ilock(&update_lock);
update_queue[idx].ptr = MMU_EXTENDED_COMMAND | ptr;
update_queue[idx].val = MMUEXT_SET_LDT | (len << MMUEXT_CMD_SHIFT);
increment_index();
iunlock(&update_lock);
}
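/*
 * queue_machphys_update asks Xen to record, in its machine-to-physical table,
 * that machine frame 'mfn' now backs pseudo-physical frame 'pfn'; this is the
 * inverse direction of the pfn-to-mfn list kept in mfn[] below.
 */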
void queue_machphys_update(unsigned long mfn, unsigned long pfn)
{
ilock(&update_lock);
update_queue[idx].ptr = (mfn << PGSHIFT) | MMU_MACHPHYS_UPDATE;
update_queue[idx].val = pfn;
increment_index();
iunlock(&update_lock);
}
#ifdef CONFIG_XEN_PHYSDEV_ACCESS
unsigned long allocate_empty_lowmem_region(unsigned long pages)
{
pgd_t *pgd;
pmd_t *pmd;
pte_t *pte;
unsigned long *pfn_array;
unsigned long vstart;
unsigned long i;
int ret;
unsigned int order = get_order(pages*PAGE_SIZE);
dom_mem_op_t dom_mem_op;
vstart = __get_free_pages(GFP_KERNEL, order);
if ( vstart == 0 )
return 0UL;
pfn_array = vmalloc((1<<order) * sizeof(*pfn_array));
if ( pfn_array == NULL )
BUG();
for ( i = 0; i < (1<<order); i++ )
{
pgd = pgd_offset_k(vstart + (i*PAGE_SIZE));
pmd = pmd_offset(pgd, vstart + (i*PAGE_SIZE));
pte = pte_offset(pmd, vstart + (i*PAGE_SIZE));
pfn_array[i] = pte->pte_low >> PAGE_SHIFT;
queue_l1_entry_update(pte, 0);
phys_to_machine_mapping[__pa(vstart + (i*PAGE_SIZE))>>PAGE_SHIFT] = 0xdeadbeef;
}
flush_page_update_queue();
dom_mem_op.op = MEMOP_RESERVATION_DECREASE;
dom_mem_op.u.decrease.size = 1<<order;
dom_mem_op.u.decrease.pages = pfn_array;
if ( (ret = HYPERVISOR_dom_mem_op(&dom_mem_op)) != (1<<order) )
{
printk(KERN_WARNING "Unable to reduce memory reservation (%d)\n", ret);
BUG();
}
vfree(pfn_array);
return vstart;
}
void deallocate_lowmem_region(unsigned long vstart, unsigned long pages)
{
pgd_t *pgd;
pmd_t *pmd;
pte_t *pte;
unsigned long *pfn_array;
unsigned long i;
int ret;
unsigned int order = get_order(pages*PAGE_SIZE);
dom_mem_op_t dom_mem_op;
pfn_array = vmalloc((1<<order) * sizeof(*pfn_array));
if ( pfn_array == NULL )
BUG();
dom_mem_op.op = MEMOP_RESERVATION_INCREASE;
dom_mem_op.u.increase.size = 1<<order;
dom_mem_op.u.increase.pages = pfn_array;
if ( (ret = HYPERVISOR_dom_mem_op(&dom_mem_op)) != (1<<order) )
{
printk(KERN_WARNING "Unable to increase memory reservation (%d)\n",
ret);
BUG();
}
for ( i = 0; i < (1<<order); i++ )
{
pgd = pgd_offset_k(vstart + (i*PAGE_SIZE));
pmd = pmd_offset(pgd, vstart + (i*PAGE_SIZE));
pte = pte_offset(pmd, vstart + (i*PAGE_SIZE));
queue_l1_entry_update(pte, (pfn_array[i]<<PAGE_SHIFT)|__PAGE_KERNEL);
queue_machphys_update(pfn_array[i], __pa(vstart + (i*PAGE_SIZE))>>PAGE_SHIFT);
phys_to_machine_mapping[__pa(vstart + (i*PAGE_SIZE))>>PAGE_SHIFT] = pfn_array[i];
}
flush_page_update_queue();
vfree(pfn_array);
free_pages(vstart, order);
}
#endif /* CONFIG_XEN_PHYSDEV_ACCESS */
/* startup stuff; it is here because we don't want to reference the mfn list outside of this file */
extended_start_info_t xen_start_info;
volatile shared_info_t *HYPERVISOR_shared_info = 0;
unsigned long *mfn;
void
xen_mm_startup(void) {
/* start_info is at first page. */
xen_start_info = *((extended_start_info_t *) KZERO);
mfn = (unsigned long *) xen_start_info.mfn_list;
}
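/*
 * The mfn[] pointer saved above is the domain's pfn-to-mfn list provided via
 * start_info; xen_mm_mfn() below indexes it to translate pseudo-physical
 * frames to machine frames.
 */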
void xen_mm_shared_info(void) {
#ifdef NOT
int i, j;
volatile unsigned char *cp;
#endif
HYPERVISOR_shared_info = (shared_info_t *)0x80002000;
/* set by xc_plan9_build */
#ifdef NOT
/**/
set_va_mfn(HYPERVISOR_shared_info, xen_start_info.shared_info,
PTEWRITE|PTEVALID);
/**/
#endif
#ifdef NOT
cp = (unsigned char *)HYPERVISOR_shared_info;
for(i = 0; i < 4096; i += 16) {
dp("0x%x: ", i);
for(j = 0; j < 16; j++) {
volatile unsigned char dpv;
// if (((i+j)<8) && ((i+j)>0))
// cp[i+j] = 0;
if (cp[i+j])
dp("%02x ", cp[i+j]);
dpv = cp[i+j];
cp[i+j] = dpv;
}
dp("\n");
}
#endif
}
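/*
 * Reverse (machine-to-pseudo-physical) map: one entry per machine frame for
 * the first 4GB (1<<20 frames), filled in by xen_mm_meminit() from the
 * pfn-to-mfn list. At 4 bytes per entry on a 32-bit build this table itself
 * occupies 4MB.
 */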
static unsigned long ma_to_pa_map[1<<20];
extern void xen_meminit(unsigned long, unsigned long, unsigned long, unsigned long);
void xen_mm_meminit(void) {
int i;
xen_meminit(
xen_start_info.pt_base, xen_start_info.nr_pt_frames,
xen_start_info.mfn_list, xen_start_info.nr_pages);
for(i = 0; i < xen_start_info.nr_pages; i++)
ma_to_pa_map[mfn[i]] = i;
}
void
xen_mm_info(void){
extended_start_info_t *x = &xen_start_info;
dp("xen_start_info\n");
dp(" nr_pages %uld\n", x->nr_pages);
dp(" shared_info 0x%ulx\n", x->shared_info);
dp(" flags 0x%ux\n", x->flags);
dp(" pt_base 0x%ulx\n", x->pt_base);
dp(" nr_pt_frames %uld\n", x->nr_pt_frames);
dp(" mfn_list 0x%ulx\n", x->mfn_list);
dp(" shared info %p\n", HYPERVISOR_shared_info);
dp(" mfn %p\n", mfn);
dp(" mfn[0] 0x%ulx\n", mfn[0]);
}
/* note that because of the Plan 9 KADDR/PADDR scheme, this function actually
 * works fine for BOTH kernel virtual addresses and physical addresses.
 */
/* this one should get the frame, but a VA-to-MA function is needed too; see xen_va_to_ma below. */
extern unsigned long *mfn;
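/*
 * Note: despite the name, xen_mm_mfn() returns the machine address of the
 * frame (mfn << PGSHIFT), not the bare frame number, so callers can simply
 * OR in a page offset or permission bits.
 */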
unsigned long xen_mm_mfn(void *va) {
unsigned long pmfn;
pmfn = mfn[PADDR(va)>>PGSHIFT];
// LOG(dp("PMFN: 0x%ulx\n", pmfn));
pmfn <<= PGSHIFT;
// LOG(dp("PMFN: return 0x%lx\n", pmfn));
return pmfn;
}
/* well, this sucks, but a prebuilt table like ma_to_pa_map can go stale,
 * since the machine mappings can change out from under you
 */
unsigned long
xen_ma_to_pa(unsigned long ma) {
unsigned long offset = ((unsigned long)ma) & (BY2PG-1);
unsigned long pfn, pa;
ma >>= PGSHIFT;
pfn = ma_to_pa_map[ma];
pa = pfn << PGSHIFT;
pa |= offset;
// dp("xen_ma_to_pa: ma 0x%ulx, pa 0x%ulx\n", ma, pa);
return pa;
}
unsigned long
xen_va_to_ma(void *va) {
unsigned long frame = xen_mm_mfn(va);
unsigned long offset = ((unsigned long)va) & (BY2PG -1);
unsigned long retval = frame | offset;
return retval;
}
void
xen_mm_readonly(void *vva) {
unsigned long va = (unsigned long) vva;
/*
dp("xen_readonly: 0x%ulx set to 0x%ulx flags 0x%x\n", ((unsigned long)va)>>PGSHIFT,
(xen_mm_mfn(vva))|PTEVALID|PTERONLY, UVMF_INVLPG);
*/
HYPERVISOR_update_va_mapping(((unsigned long)va)>>PGSHIFT,
(xen_mm_mfn(vva))|PTEVALID|PTERONLY, UVMF_INVLPG);
}
void
xen_mm_readwrite(void *vva) {
unsigned long va = (unsigned long) vva;
/*
dp("xen_readwrite: 0x%ulx set to 0x%ulx flags 0x%x\n", ((unsigned long)va)>>PGSHIFT,
(xen_mm_mfn(vva))|PTEVALID|PTEWRITE, UVMF_INVLPG);
*/
HYPERVISOR_update_va_mapping(((unsigned long)va)>>PGSHIFT,
(xen_mm_mfn(vva))|PTEVALID|PTEWRITE, UVMF_INVLPG);
}
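/*
 * xen_mm_setl2 installs a new page-table page in an L2 (page directory) slot.
 * The page is first remapped read-only because Xen will not accept a
 * page-table page that the guest still maps writably; only then is the L2
 * entry update queued and flushed.
 */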
void
xen_mm_setl2(void *l2, unsigned long *pteptr) {
LOG(dp(" quee l2 entry update for 0x%ulx\n", pteptr));
LOG(dp("0x%ulx set to 0x%ulx flags 0x%x\n", ((unsigned long)l2)>>PGSHIFT,
xen_mm_mfn(l2)|PTEVALID|PTEWRITE, UVMF_INVLPG));
HYPERVISOR_update_va_mapping(((unsigned long)l2)>>PGSHIFT,
xen_mm_mfn(l2)|PTEVALID|PTERONLY, UVMF_INVLPG);
queue_l2_entry_update(pteptr,
xen_mm_mfn(l2)|PTEUSER|PTEWRITE|PTEVALID);
/* have to do this here! */
/* could be fancy and do tricks but won't. */
_flush_page_update_queue();
}
int
xen_mm_decrease_reservation(unsigned long *pfn_array, int npfn) {
int ret;
ret = HYPERVISOR_dom_mem_op(MEMOP_decrease_reservation,
pfn_array, npfn, 0 );
if (ret < 0)
{
dp( "Unable to reduce memory reservation (%d)\n", ret);
}
return ret;
}