#include "u.h"
#include "../port/lib.h"
#include "mem.h"
#include "dat.h"
#include "fns.h"
#include "io.h"
#include "../xen/xen.h"
#define LOG(a)
#define DATASEGM(p) { 0xFFFF, SEGG|SEGB|(0xF<<16)|SEGP|SEGPL(p)|SEGDATA|SEGW }
#define EXECSEGM(p) { 0xFFFF, SEGG|SEGD|(0xF<<16)|SEGP|SEGPL(p)|SEGEXEC|SEGR }
#define TSSSEGM(b,p) { ((b)<<16)|sizeof(Tss),\
((b)&0xFF000000)|(((b)>>16)&0xFF)|SEGTSS|SEGPL(p)|SEGP }
Segdesc gdt[NGDT] =
{
[NULLSEG] { 0, 0}, /* null descriptor */
[KDSEG] DATASEGM(0), /* kernel data/stack */
[KESEG] EXECSEGM(0), /* kernel code */
[UDSEG] DATASEGM(3), /* user data/stack */
[UESEG] EXECSEGM(3), /* user code */
[TSSSEG] TSSSEGM(0,0), /* tss segment */
};
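/*
 * Point the TSS kernel-stack fields at the new stack and switch to the
 * given page directory.  Under Xen the kernel stack is registered with
 * the hypervisor via HYPERVISOR_stack_switch(); the TSS itself is not
 * loaded with ltr here.
 */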
static void
taskswitch(ulong* pdb, ulong stack)
{
Tss *tss;
tss = m->tss;
tss->ss0 = KDSEL;
tss->esp0 = stack;
tss->ss1 = KDSEL;
tss->esp1 = stack;
tss->ss2 = KDSEL;
tss->esp2 = stack;
tss->cr3 = PADDR(pdb);
HYPERVISOR_stack_switch(KDSEL, stack);
putcr3(pdb);
}
/*
* On processors that support it, we set the PTEGLOBAL bit in
* page table and page directory entries that map kernel memory.
* Doing this tells the processor not to bother flushing them
* from the TLB when doing the TLB flush associated with a
* context switch (write to CR3). Since kernel memory mappings
* are never removed, this is safe. (If we ever remove kernel memory
* mappings, we can do a full flush by turning off the PGE bit in CR4,
* writing to CR3, and then turning the PGE bit back on.)
*
* See also mmukmap below.
*
* Processor support for the PTEGLOBAL bit is enabled in devarch.c.
*/
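/*
 * For reference, a full flush that also drops global entries would look
 * roughly like this (a sketch only; under Xen, CR3/CR4 manipulation
 * would have to go through the hypervisor rather than the raw
 * instructions):
 *
 *	x = getcr4();
 *	putcr4(x & ~0x80);	// clear CR4.PGE so global entries can be flushed
 *	putcr3(getcr3());	// reload CR3, flushing the whole TLB
 *	putcr4(x);		// restore CR4.PGE
 */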
static void
memglobal(void)
{
int i, j;
ulong *pde, *pte;
/* only need to do this once, on bootstrap processor */
if(m->machno != 0)
return;
if(!m->havepge)
return;
pde = m->pdb;
for(i=512; i<1024; i++){ /* 512: start at entry for virtual 0x80000000 */
if(pde[i] & PTEVALID){
pde[i] |= PTEGLOBAL;
if(!(pde[i] & PTESIZE)){
pte = KADDR(pde[i]&~(BY2PG-1));
for(j=0; j<1024; j++)
if(pte[j] & PTEVALID)
pte[j] |= PTEGLOBAL;
}
}
}
}
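/*
 * Per-processor MMU setup: mark kernel mappings global, allocate the
 * Tss and copy the prototype GDT into the Mach structure.  The lgdt,
 * lidt, kernel-text write-protection and ltr done by the native pc
 * kernel are left disabled below (see the LOG markers).
 */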
void
mmuinit(void)
{
ulong x;
ushort ptr[3];
extern int rtsr(void);
memglobal();
m->tss = malloc(sizeof(Tss));
memset(m->tss, 0, sizeof(Tss));
/*
* We used to keep the GDT in the Mach structure, but it
* turns out that that slows down access to the rest of the
* page. Since the Mach structure is accessed quite often,
* it pays off anywhere from a factor of 1.25 to 2 on real
* hardware to separate them (the AMDs are more sensitive
* than Intels in this regard). Under VMware it pays off
* a factor of about 10 to 100.
*/
memmove(m->gdt, gdt, sizeof gdt);
x = (ulong)m->tss;
m->gdt[TSSSEG].d0 = (x<<16)|sizeof(Tss);
m->gdt[TSSSEG].d1 = (x&0xFF000000)|((x>>16)&0xFF)|SEGTSS|SEGPL(0)|SEGP;
ptr[0] = sizeof(gdt)-1;
x = (ulong)m->gdt;
ptr[1] = x & 0xFFFF;
ptr[2] = (x>>16) & 0xFFFF;
LOG(dp("NOT DOING lgdt\n"));
// lgdt(ptr);
ptr[0] = sizeof(Segdesc)*256-1;
x = IDTADDR;
ptr[1] = x & 0xFFFF;
ptr[2] = (x>>16) & 0xFFFF;
LOG(dp("NOT DOING lidt\n"));
// lidt(ptr);
/* make kernel text unwritable */
LOG(dp("NOT MAKING KERNEL TEXT UNWRITABLE\n"));
#ifdef not
	{
		ulong *p;

		for(x = KTZERO; x < (ulong)etext; x += BY2PG){
			p = mmuwalk(m->pdb, x, 2, 0);
			if(p == nil)
				panic("mmuinit");
			*p &= ~PTEWRITE;
		}
	}
#endif
LOG(dp("NOT DOING task switch or ltr\n"));
// taskswitch(PADDR(m->pdb), (ulong)m + BY2PG);
taskswitch(m->pdb, (ulong)m+BY2PG);
#ifdef NOT
ltr(TSSSEL);
#endif
LOG(dp("ltr is 0x%x\n", rtsr()));
}
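/*
 * Force the current process's page tables to be rebuilt: flag newtlb
 * and switch immediately so mmuswitch() frees the stale entries.
 */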
void
flushmmu(void)
{
int s;
s = splhi();
up->newtlb = 1;
mmuswitch(up);
splx(s);
}
/* This can be called while the pdb is the active page table, so the
 * entries are cleared through the Xen update queue rather than stored
 * to directly.
 */
static void
mmuptefree(Proc* proc)
{
ulong *pdb;
Page **last, *page;
LOG(dp("mmuptefree\n"));
if(proc->mmupdb && proc->mmuused){
pdb = (ulong*)proc->mmupdb->va;
LOG(dp("mmuptefree: pdb %p\n", pdb));
last = &proc->mmuused;
for(page = *last; page; page = page->next){
LOG(dp("mmuptefree: free page 0x%ulx index 0x%ulx\n",
page->pa, page->daddr));
queue_l2_entry_update(&pdb[page->daddr], 0);
			_flush_page_update_queue();
			/* the page is no longer a page table; make it writable again */
			xen_mm_readwrite((void *)page->va);
//pdb[page->daddr] = 0;
last = &page->next;
}
*last = proc->mmufree;
proc->mmufree = proc->mmuused;
proc->mmuused = 0;
}
_flush_page_update_queue();
}
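/*
 * Switch the MMU to proc: free its stale page-table pages if newtlb is
 * set, refresh the MACHADDR entry in its pdb through the Xen update
 * queue and switch to it; a process without its own pdb runs on the
 * prototype m->pdb.
 */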
void
mmuswitch(Proc* proc)
{
ulong *pdb;
LOG(dp("mmuswitch\n"));
if(proc->newtlb){
mmuptefree(proc);
proc->newtlb = 0;
}
if(proc->mmupdb){
pdb = (ulong*)proc->mmupdb->va;
// pdb[PDX(MACHADDR)] = m->pdb[PDX(MACHADDR)];
queue_l2_entry_update(&pdb[PDX(MACHADDR)],
m->pdb[PDX(MACHADDR)]);
_flush_page_update_queue();
// pdb[PDX(MACHADDR)] = m->pdb[PDX(MACHADDR)];
LOG(dp("MMUSWITCH: pdb[PDX(MACHADDR)] = 0x%ulx\n", m->pdb[PDX(MACHADDR)]));
taskswitch((ulong *) proc->mmupdb->va /*pa*/, (ulong)(proc->kstack+KSTACK));
}
else
taskswitch(/*PADDR*/(m->pdb), (ulong)(proc->kstack+KSTACK));
}
void
mmurelease(Proc* proc)
{
Page *page, *next;
/*
* Release any pages allocated for a page directory base or page-tables
* for this process:
* switch to the prototype pdb for this processor (m->pdb);
* call mmuptefree() to place all pages used for page-tables (proc->mmuused)
* onto the process' free list (proc->mmufree). This has the side-effect of
* cleaning any user entries in the pdb (proc->mmupdb);
* if there's a pdb put it in the cache of pre-initialised pdb's
* for this processor (m->pdbpool) or on the process' free list;
* finally, place any pages freed back into the free pool (palloc).
* This routine is only called from sched() with palloc locked.
*/
taskswitch(/*PADDR*/(m->pdb), (ulong)m + BY2PG);
mmuptefree(proc);
if(proc->mmupdb){
xen_mm_readwrite((void *)proc->mmupdb->va);
if(m->pdbcnt > 10){
proc->mmupdb->next = proc->mmufree;
proc->mmufree = proc->mmupdb;
}
else{
proc->mmupdb->next = m->pdbpool;
m->pdbpool = proc->mmupdb;
m->pdbcnt++;
}
proc->mmupdb = 0;
}
for(page = proc->mmufree; page; page = next){
next = page->next;
if(--page->ref)
panic("mmurelease: page->ref %d\n", page->ref);
pagechainhead(page);
}
if(proc->mmufree && palloc.r.p)
wakeup(&palloc.r);
proc->mmufree = 0;
}
/*
 * Rules: pdb pages come out of mmupdballoc() write-protected and only
 * become writable again when they are freed.  The same rules apply to
 * page-table pages.
 */
static Page*
mmupdballoc(void)
{
int s;
Page *page;
s = splhi();
if(m->pdbpool == 0){
spllo();
page = newpage(0, 0, 0);
page->va = VA(kmap(page));
memmove((void*)page->va, m->pdb, BY2PG);
}
else{
page = m->pdbpool;
m->pdbpool = page->next;
m->pdbcnt--;
}
splx(s);
LOG(dp("pdballoc ... do the update ... \n"));
/* have to make it readonly */
xen_mm_readonly(((void *)page->va));
LOG(dp("pdballoc returns %p\n", page));
return page;
}
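/*
 * Install the mapping va -> pa for the current process, allocating a
 * pdb and a page-table page as needed.  The tables are live and
 * write-protected, so every entry is written through the Xen update
 * queue rather than stored to directly.
 */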
void
putmmu(ulong va, ulong pa, Page*)
{
int pdbx;
Page *page;
ulong *pdb, *pte;
int s;
LOG(dp("putmmu for 0x%ulx, 0x%ulx, page %p, up %p\n", va, pa, p, up));
LOG(dp(" mmupdb is %p\n", up->mmupdb));
if(up->mmupdb == 0)
up->mmupdb = mmupdballoc();
LOG(dp("pdb is %p\n", up->mmupdb));
pdb = (ulong*)up->mmupdb->va;
pdbx = PDX(va);
LOG(dp("putmmu: pdbx is 0x%x\n", pdbx));
LOG(dp("PPN(pdb[pdbx] is 0x%ulx\n", PPN(pdb[pdbx])));
if(PPN(pdb[pdbx]) == 0){
LOG(dp("putmmu: up %p\n", up));
LOG(dp("putmmu: up->mmufree %p\n", up->mmufree));
if(up->mmufree == 0){
page = newpage(1, 0, 0);
page->va = VA(kmap(page));
LOG(dp("newpage, page is %p, va 0x%ulx\n", page, page->va));
}
else {
LOG(dp("old page, page %p, va 0x%ulx\n", page, page->va));
page = up->mmufree;
up->mmufree = page->next;
memset((void*)page->va, 0, BY2PG);
}
LOG(dp("got something ... page is %p\n"));
// pdb[pdbx] = PPN(page->pa)|PTEUSER|PTEWRITE|PTEVALID;
LOG(dp(" quee l2 entry update for %p\n", &pdb[pdbx]));
xen_mm_setl2((void *)page->va, &pdb[pdbx]);
page->daddr = pdbx;
page->next = up->mmuused;
up->mmuused = page;
}
pte = KADDR(PPN(xen_ma_to_pa(pdb[pdbx])));
LOG(dp("pte is %p\n", pte));
LOG(dp("pdb[pdbx] is now 0x%ulx, pte[PTX(va]] is 0x%ulx\n",
pdb[pdbx], pte[PTX(va)]));
LOG(dp("PTX is 0x%x, &pte[PTX(val)] is %p, set 0x%ulx\n",
PTX(va), &pte[PTX(va)], pa|PTEUSER));
queue_l1_entry_update(&pte[PTX(va)], pa|PTEUSER);
// pte[PTX(va)] = pa|PTEUSER;
s = splhi();
queue_l2_entry_update(&pdb[PDX(MACHADDR)],
m->pdb[PDX(MACHADDR)]);
// pdb[PDX(MACHADDR)] = m->pdb[PDX(MACHADDR)];
LOG(dp("pdb[PDX(MACHADDR)] = 0x%ulx\n", m->pdb[PDX(MACHADDR)]));
mmuflushtlb((ulong *) up->mmupdb->/*pa*/va);
LOG(dp("end of day, va 0x%ulx, pdb[pdbx] is 0x%ulx, pte[PTX] is 0x%ulx\n",
va, pdb[pdbx], pte[PTX(va)]));
LOG(dp("putmmu ends\n"));
splx(s);
}
ulong*
mmuwalk(ulong* pdb, ulong va, int level, int create)
{
ulong pa, *table;
/*
* Walk the page-table pointed to by pdb and return a pointer
* to the entry for virtual address va at the requested level.
* If the entry is invalid and create isn't requested then bail
* out early. Otherwise, for the 2nd level walk, allocate a new
* page-table page and register it in the 1st level.
*/
LOG(dp("pdb is %p\n", pdb));
table = &pdb[PDX(va)];
LOG(dp("table %p\n", table));
if(!(*table & PTEVALID) && create == 0)
return 0;
LOG(dp("switch on level\n"));
switch(level){
default:
return 0;
case 1:
return table;
case 2:
LOG(dp("case 2, table %p\n", table));
if(*table & PTESIZE)
panic("mmuwalk2: va %luX entry %luX\n", va, *table);
if(!(*table & PTEVALID)){
pa = PADDR(xspanalloc(BY2PG, BY2PG, 0));
*table = pa|PTEWRITE|PTEVALID;
}
table = KADDR(PPN(*table));
return &table[PTX(va)];
}
}
static Lock mmukmaplock;
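/*
 * Propagate the page-directory entry for a kernel va from the boot
 * processor's pdb into this processor's prototype pdb and into the
 * current process's pdb, then flush the TLB.  Returns 0 if va is not
 * mapped in mach0->pdb.
 */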
int
mmukmapsync(ulong va)
{
Mach *mach0;
ulong entry, *pte;
mach0 = MACHP(0);
LOG(dp("mmukmapsync: va 0x%ulx, mach0 %p\n", va, mach0));
LOG(dp("mach0->pdb is %p\n", mach0->pdb));
/* don't need this any more ...
if (va == 0)
panic("va is 0\n");
*/
LOG(dp("mmuwalk to there is %p\n", mmuwalk(mach0->pdb, va, 1, 0)));
ilock(&mmukmaplock);
if((pte = mmuwalk(mach0->pdb, va, 1, 0)) == nil){
iunlock(&mmukmaplock);
return 0;
}
if(!(*pte & PTESIZE) && mmuwalk(mach0->pdb, va, 2, 0) == nil){
iunlock(&mmukmaplock);
return 0;
}
entry = *pte;
if(!(m->pdb[PDX(va)] & PTEVALID))
m->pdb[PDX(va)] = entry;
if(up && up->mmupdb){
((ulong*)up->mmupdb->va)[PDX(va)] = entry;
mmuflushtlb((ulong *)up->mmupdb->/*pa*/va);
}
else
mmuflushtlb(/*PADDR*/(m->pdb));
iunlock(&mmukmaplock);
return 1;
}
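/*
 * Map size bytes of physical memory at pa to the kernel virtual address
 * va (KADDR(pa) if va is 0), adding entries to the boot processor's pdb
 * through the Xen update queue.  Regions reaching at or above TOM are
 * refused.  Returns the physical address past the end of the region
 * mapped, or 0 on failure.
 */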
ulong
mmukmap(ulong pa, ulong va, int size)
{
Mach *mach0;
ulong ova, pae, *table, pgsz, *pte, x;
int pse, sync;
ulong vae;
// panic("mmukmap");
mach0 = MACHP(0);
#ifdef NOT
if((mach0->cpuiddx & 0x08) && (getcr4() & 0x10))
pse = 1;
else
#endif
pse = 0;
sync = 0;
pa = PPN(pa);
if(va == 0)
va = (ulong)KADDR(pa);
else
va = PPN(va);
ova = va;
	/* Under Xen the top 64MB of the virtual address space belongs to the
	 * hypervisor.  For now just refuse anything at or above TOM.
	 */
pae = pa + size;
vae = va + size;
if (pa > TOM) {
LOG(dp("pa 0x%ulx not allowed in XEN mode\n", pa));
return 0;
}
if (pae > TOM) {
LOG(dp("pa end 0x%ulx not allowed in XEN mode\n", pae));
return 0;
}
if (va > TOM) {
LOG(dp("va 0x%ulx not allowed in XEN mode\n", va));
return 0;
}
if (vae > TOM) {
LOG(dp("vae 0x%ulx not allowed in XEN mode\n", vae));
return 0;
}
ilock(&mmukmaplock);
while(pa < pae){
table = &mach0->pdb[PDX(va)];
/*
* Possibly already mapped.
*/
if(*table & PTEVALID){
if(*table & PTESIZE){
panic("NO BIG PAGES");
/*
* Big page. Does it fit within?
* If it does, adjust pgsz so the correct end can be
* returned and get out.
* If not, adjust pgsz up to the next 4MB boundary
* and continue.
*/
x = PPN(*table);
if(x != pa)
panic("mmukmap1: pa %luX entry %luX\n",
pa, *table);
x += 4*MB;
if(pae <= x){
pa = pae;
break;
}
pgsz = x - pa;
pa += pgsz;
va += pgsz;
continue;
}
else{
/*
* Little page. Walk to the entry.
* If the entry is valid, set pgsz and continue.
* If not, make it so, set pgsz, sync and continue.
*/
pte = mmuwalk(mach0->pdb, va, 2, 0);
if(pte && *pte & PTEVALID){
x = PPN(*pte);
if(x != pa)
panic("mmukmap2: pa %luX entry %luX\n",
pa, *pte);
pgsz = BY2PG;
pa += pgsz;
va += pgsz;
sync++;
continue;
}
}
}
/*
* Not mapped. Check if it can be mapped using a big page -
* starts on a 4MB boundary, size >= 4MB and processor can do it.
* If not a big page, walk the walk, talk the talk.
* Sync is set.
*
* If we're creating a kernel mapping, we know that it will never
* expire and thus we can set the PTEGLOBAL bit to make the entry
* persist in the TLB across flushes. If we do add support later for
* unmapping kernel addresses, see devarch.c for instructions on
* how to do a full TLB flush.
*/
if(pse && (pa % (4*MB)) == 0 && (pae >= pa+4*MB)){
*table = pa|PTESIZE|PTEWRITE|PTEUNCACHED|PTEVALID;
if((va&KZERO) && m->havepge)
*table |= PTEGLOBAL;
pgsz = 4*MB;
}
else{
ulong pteval;
pte = mmuwalk(mach0->pdb, va, 2, 1);
pteval = pa|PTEWRITE|PTEUNCACHED|PTEVALID;
if((va&KZERO) && m->havepge)
pteval |= PTEGLOBAL;
queue_l1_entry_update(pte, pteval);
pgsz = BY2PG;
}
pa += pgsz;
va += pgsz;
sync++;
}
iunlock(&mmukmaplock);
if (sync)
_flush_page_update_queue();
/*
* If something was added
* then need to sync up.
*/
if(sync)
mmukmapsync(ova);
return pa;
}