/*
 * Plan 9 from Bell Labs’s /usr/web/sources/xen/xen2/9/xenpc/mmu.c
 *
 * Copyright © 2021 Plan 9 Foundation.
 * Distributed under the MIT License.
 * Download the Plan 9 distribution.
 */


#include	"u.h"
#include	"../port/lib.h"
#include	"mem.h"
#include	"dat.h"
#include	"fns.h"
#include	"io.h"
#include "../xen/xen.h"

/*
 * Debug logging hook.  It expands to nothing, so every LOG(...) in
 * this file, together with its argument expression, is compiled out.
 */
#define LOG(a)  

/*
 * Initialisers for two-word Segdesc entries.
 * DATASEGM(p) and EXECSEGM(p) take a privilege level p (passed to
 * SEGPL()); both put 0xFFFF in the first word and 0xF<<16 in the second.
 * TSSSEGM(b,p) takes a base address b and privilege level p; the base
 * is scattered over both words and sizeof(Tss) fills the low half of
 * the first word.
 */
#define	DATASEGM(p) 	{ 0xFFFF, SEGG|SEGB|(0xF<<16)|SEGP|SEGPL(p)|SEGDATA|SEGW }
#define	EXECSEGM(p) 	{ 0xFFFF, SEGG|SEGD|(0xF<<16)|SEGP|SEGPL(p)|SEGEXEC|SEGR }
#define	TSSSEGM(b,p)	{ ((b)<<16)|sizeof(Tss),\
			  ((b)&0xFF000000)|(((b)>>16)&0xFF)|SEGTSS|SEGPL(p)|SEGP }

/*
 * Prototype descriptor table.  mmuinit() copies it into m->gdt and
 * then rewrites the TSSSEG entry there with the address of the
 * processor's own Tss, so the TSSSEGM(0,0) below is only a placeholder.
 */
Segdesc gdt[NGDT] =
{
[NULLSEG]	{ 0, 0},		/* null descriptor */
[KDSEG]		DATASEGM(0),		/* kernel data/stack */
[KESEG]		EXECSEGM(0),		/* kernel code */
[UDSEG]		DATASEGM(3),		/* user data/stack */
[UESEG]		EXECSEGM(3),		/* user code */
[TSSSEG]	TSSSEGM(0,0),		/* tss segment */
};

/*
 * Install a new kernel stack and page directory on this processor.
 * The stack pointer is recorded in the Tss for rings 0, 1 and 2 and
 * announced to the hypervisor; pdb (a kernel virtual address, its
 * PADDR() is what goes into the Tss) is then handed to putcr3().
 */
static void
taskswitch(ulong *pdb, ulong stack)
{
	Tss *t;

	t = m->tss;

	/* one stack serves all three privileged rings */
	t->esp0 = stack;
	t->esp1 = stack;
	t->esp2 = stack;
	t->ss0 = KDSEL;
	t->ss1 = KDSEL;
	t->ss2 = KDSEL;

	t->cr3 = PADDR(pdb);

	HYPERVISOR_stack_switch(KDSEL, stack);
	putcr3(pdb);
}

/*
 * Where the processor supports it, the PTEGLOBAL bit is set in the
 * page directory and page table entries that map kernel memory.
 * A global entry is not thrown out of the TLB by the flush that
 * accompanies a context switch (a write to CR3).  That is safe
 * because kernel memory mappings are never removed.  (Should they
 * ever be removed, a full flush can be had by clearing the PGE bit
 * in CR4, writing CR3, and setting PGE again.)
 *
 * See also mmukmap below.
 *
 * Processor support for the PTEGLOBAL bit is enabled in devarch.c.
 */
static void
memglobal(void)
{
	int pdx, ptx;
	ulong *pdb, *ptab;

	/* done once, on the bootstrap processor, and only with PGE available */
	if(m->machno != 0 || !m->havepge)
		return;

	pdb = m->pdb;
	/* entry 512 is the one for virtual 0x80000000 */
	for(pdx = 512; pdx < 1024; pdx++){
		if(!(pdb[pdx] & PTEVALID))
			continue;
		pdb[pdx] |= PTEGLOBAL;

		/* a PTESIZE entry has no second-level table to visit */
		if(pdb[pdx] & PTESIZE)
			continue;

		ptab = KADDR(pdb[pdx] & ~(BY2PG-1));
		for(ptx = 0; ptx < 1024; ptx++){
			if(ptab[ptx] & PTEVALID)
				ptab[ptx] |= PTEGLOBAL;
		}
	}
}

void
mmuinit(void)
{
	ulong x;
	ushort ptr[3];
	extern int rtsr(void);

	memglobal();

	m->tss = malloc(sizeof(Tss));
	memset(m->tss, 0, sizeof(Tss));

	/*
	 * We used to keep the GDT in the Mach structure, but it
	 * turns out that that slows down access to the rest of the
	 * page.  Since the Mach structure is accessed quite often,
	 * it pays off anywhere from a factor of 1.25 to 2 on real
	 * hardware to separate them (the AMDs are more sensitive
	 * than Intels in this regard).  Under VMware it pays off
	 * a factor of about 10 to 100.
	 */

	memmove(m->gdt, gdt, sizeof gdt);
	x = (ulong)m->tss;
	m->gdt[TSSSEG].d0 = (x<<16)|sizeof(Tss);
	m->gdt[TSSSEG].d1 = (x&0xFF000000)|((x>>16)&0xFF)|SEGTSS|SEGPL(0)|SEGP;

	ptr[0] = sizeof(gdt)-1;
	x = (ulong)m->gdt;
	ptr[1] = x & 0xFFFF;
	ptr[2] = (x>>16) & 0xFFFF;
	LOG(dp("NOT DOING lgdt\n"));
//	lgdt(ptr);

	ptr[0] = sizeof(Segdesc)*256-1;
	x = IDTADDR;
	ptr[1] = x & 0xFFFF;
	ptr[2] = (x>>16) & 0xFFFF;
	LOG(dp("NOT DOING lidt\n"));
//	lidt(ptr);

	/* make kernel text unwritable */
	LOG(dp("NOT MAKING KERNEL TEXT UNWRITABLE\n"));
#ifdef not
	for(x = KTZERO; x < (ulong)etext; x += BY2PG){
		p = mmuwalk(m->pdb, x, 2, 0);
		if(p == nil)
			panic("mmuinit");
		*p &= ~PTEWRITE;
	}
#endif

	LOG(dp("NOT DOING task switch or ltr\n"));

//	taskswitch(PADDR(m->pdb),  (ulong)m + BY2PG);
	taskswitch(m->pdb, (ulong)m+BY2PG);
#ifdef NOT
	ltr(TSSSEL);
#endif
	LOG(dp("ltr is 0x%x\n", rtsr()));
}

/*
 * Drop the current process' user mappings: at splhi, flag up as
 * needing a new TLB and run mmuswitch() on it, which frees its
 * page-table pages and reloads the page directory.
 */
void
flushmmu(void)
{
	int pl;

	pl = splhi();
	up->newtlb = 1;
	mmuswitch(up);
	splx(pl);
}

/*
 * Unhook every page-table page on proc->mmuused from the process'
 * page directory and move the whole list to the front of proc->mmufree.
 * The pdb may be the active one, so its entries are zeroed through
 * the Xen update queue rather than by storing into it directly.
 */
static void
mmuptefree(Proc* proc)
{
	ulong *pdb;
	Page *pg, **tail;

	LOG(dp("mmuptefree\n"));
	if(proc->mmupdb && proc->mmuused){
		pdb = (ulong*)proc->mmupdb->va;
		LOG(dp("mmuptefree: pdb %p\n", pdb));

		tail = &proc->mmuused;
		pg = proc->mmuused;
		while(pg != nil){
			LOG(dp("mmuptefree: free page 0x%ulx index 0x%ulx\n", 
							pg->pa, pg->daddr));
			/* clear the directory slot and push the change out ... */
			queue_l2_entry_update(&pdb[pg->daddr], 0);
			_flush_page_update_queue();
			/* ... only then is this no longer a pte page, so make it readwrite */
			xen_mm_readwrite((void *)pg->va);

			tail = &pg->next;
			pg = pg->next;
		}

		/* splice: used list followed by the old free list becomes the free list */
		*tail = proc->mmufree;
		proc->mmufree = proc->mmuused;
		proc->mmuused = 0;
	}
	_flush_page_update_queue();
}

/*
 * Make proc's address space current on this processor.
 * If proc->newtlb is set its page-table pages are released first.
 * A process with its own page directory gets this processor's
 * MACHADDR entry copied in (through the Xen update queue) before
 * the switch; a process without one runs on the prototype m->pdb.
 */
void
mmuswitch(Proc* proc)
{
	ulong *pdb;

	LOG(dp("mmuswitch\n"));
	if(proc->newtlb){
		mmuptefree(proc);
		proc->newtlb = 0;
	}

	if(proc->mmupdb == 0){
		taskswitch(m->pdb, (ulong)(proc->kstack+KSTACK));
		return;
	}

	pdb = (ulong*)proc->mmupdb->va;
	queue_l2_entry_update(&pdb[PDX(MACHADDR)], m->pdb[PDX(MACHADDR)]);
	_flush_page_update_queue();
	LOG(dp("MMUSWITCH: pdb[PDX(MACHADDR)] = 0x%ulx\n", m->pdb[PDX(MACHADDR)]));
	taskswitch(pdb, (ulong)(proc->kstack+KSTACK));
}

/*
 * Give back every page a process holds for its page directory and
 * page tables.  In order:
 *   - switch to this processor's prototype pdb (m->pdb);
 *   - mmuptefree() moves the page-table pages from proc->mmuused to
 *     proc->mmufree, clearing the user entries of proc->mmupdb as a
 *     side effect;
 *   - a pdb, if any, is made writable again and parked in the
 *     processor's cache of pre-initialised pdbs (m->pdbpool), or put
 *     on proc->mmufree when the cache already holds more than 10;
 *   - everything on proc->mmufree goes back to the free pool (palloc).
 * This routine is only called from sched() with palloc locked.
 */
void
mmurelease(Proc* proc)
{
	Page *pg, *nxt, *pdbpg;

	taskswitch(m->pdb, (ulong)m + BY2PG);
	mmuptefree(proc);

	pdbpg = proc->mmupdb;
	if(pdbpg != nil){
		xen_mm_readwrite((void *)pdbpg->va);
		if(m->pdbcnt <= 10){
			pdbpg->next = m->pdbpool;
			m->pdbpool = pdbpg;
			m->pdbcnt++;
		}
		else{
			pdbpg->next = proc->mmufree;
			proc->mmufree = pdbpg;
		}
		proc->mmupdb = 0;
	}

	pg = proc->mmufree;
	while(pg != nil){
		nxt = pg->next;	/* saved before the page is handed to pagechainhead */
		if(--pg->ref)
			panic("mmurelease: page->ref %d\n", pg->ref);
		pagechainhead(pg);
		pg = nxt;
	}
	if(proc->mmufree && palloc.r.p)
		wakeup(&palloc.r);
	proc->mmufree = 0;
}

/*
 * Hand out a page directory page, either from this processor's pool
 * of pre-initialised pdbs or freshly allocated and filled with a copy
 * of the prototype m->pdb.
 *
 * Rule: a pdb page is write-protected when it leaves this routine and
 * only becomes writable again when it is freed.  The same rule holds
 * for pte pages.
 */
static Page*
mmupdballoc(void)
{
	int pl;
	Page *pg;

	pl = splhi();
	if(m->pdbpool != 0){
		pg = m->pdbpool;
		m->pdbpool = pg->next;
		m->pdbcnt--;
	}
	else{
		spllo();
		pg = newpage(0, 0, 0);
		pg->va = VA(kmap(pg));
		memmove((void*)pg->va, m->pdb, BY2PG);
	}
	splx(pl);

	LOG(dp("pdballoc ... do the update ... \n"));
	/* have to make it readonly */
	xen_mm_readonly(((void *)pg->va));
	LOG(dp("pdballoc returns %p\n", pg));
	return pg;
}

/*
 * Enter the translation va -> pa into the current process' page tables.
 *   va: user virtual address to map.
 *   pa: entry value to install; PTEUSER is or'ed in here.
 *   the third (Page*) argument is unused.
 * A page directory is allocated for up on first use, and a page-table
 * page is taken from up->mmufree (or newly allocated) when the
 * directory slot for va is empty.  All page table stores go through
 * the Xen update queue.  The MACHADDR directory entry is refreshed
 * from m->pdb and mmuflushtlb() is called on the process' pdb at splhi.
 *
 * The LOG() statements are compiled out, but are kept well-formed:
 * they refer only to initialised, in-scope variables and their
 * formats match their arguments.
 */
void
putmmu(ulong va, ulong pa, Page*)
{
	int pdbx;
	Page *page;
	ulong *pdb, *pte;
	int s;

	LOG(dp("putmmu for 0x%ulx, 0x%ulx, up %p\n", va, pa, up));
	LOG(dp("   mmupdb is %p\n", up->mmupdb));
	if(up->mmupdb == 0)
		up->mmupdb = mmupdballoc();
	LOG(dp("pdb is %p\n", up->mmupdb));
	pdb = (ulong*)up->mmupdb->va;
	pdbx = PDX(va);

	LOG(dp("putmmu: pdbx is 0x%x\n", pdbx));
	LOG(dp("PPN(pdb[pdbx] is 0x%ulx\n", PPN(pdb[pdbx])));
	if(PPN(pdb[pdbx]) == 0){
		/* no page table for this slot yet: find a page for one */
		LOG(dp("putmmu: up %p\n", up));
		LOG(dp("putmmu: up->mmufree %p\n", up->mmufree));
		if(up->mmufree == 0){
			page = newpage(1, 0, 0);
			page->va = VA(kmap(page));
			LOG(dp("newpage, page is %p, va 0x%ulx\n", page, page->va));
		}
		else {
			page = up->mmufree;
			LOG(dp("old page, page %p, va 0x%ulx\n", page, page->va));
			up->mmufree = page->next;
			/* a recycled page still holds its old contents */
			memset((void*)page->va, 0, BY2PG);
		}
		LOG(dp("got something ... page is %p\n", page));
//		pdb[pdbx] = PPN(page->pa)|PTEUSER|PTEWRITE|PTEVALID;
		LOG(dp("   quee l2 entry update for %p\n", &pdb[pdbx]));
		xen_mm_setl2((void *)page->va, &pdb[pdbx]);
		/* remember the slot so mmuptefree can clear it later */
		page->daddr = pdbx;
		page->next = up->mmuused;
		up->mmuused = page;
	}

	/* the directory entry is run through xen_ma_to_pa before KADDR */
	pte = KADDR(PPN(xen_ma_to_pa(pdb[pdbx])));
	LOG(dp("pte is %p\n", pte));
	LOG(dp("pdb[pdbx] is now 0x%ulx, pte[PTX(va]] is 0x%ulx\n", 
			pdb[pdbx], pte[PTX(va)]));
	LOG(dp("PTX is 0x%x, &pte[PTX(val)] is %p, set 0x%ulx\n", 
		PTX(va), &pte[PTX(va)], pa|PTEUSER));
	queue_l1_entry_update(&pte[PTX(va)], pa|PTEUSER);
//	pte[PTX(va)] = pa|PTEUSER;

	s = splhi();
	queue_l2_entry_update(&pdb[PDX(MACHADDR)], 
					m->pdb[PDX(MACHADDR)]);
//	pdb[PDX(MACHADDR)] = m->pdb[PDX(MACHADDR)];
	LOG(dp("pdb[PDX(MACHADDR)] = 0x%ulx\n", m->pdb[PDX(MACHADDR)]));
	mmuflushtlb((ulong *) up->mmupdb->/*pa*/va);
	LOG(dp("end of day, va 0x%ulx, pdb[pdbx] is 0x%ulx, pte[PTX] is 0x%ulx\n", 
			va, pdb[pdbx], pte[PTX(va)]));
	LOG(dp("putmmu ends\n"));
	splx(s);
}

/*
 * Walk the page table rooted at pdb and return a pointer to the entry
 * for virtual address va at the requested level:
 *   level 1 - the page directory entry;
 *   level 2 - the page table entry;
 *   any other level yields 0.
 * An invalid directory entry yields 0 unless create is set; with
 * create set, a level 2 walk allocates a fresh page-table page and
 * registers it in the directory.  A level 2 walk through a PTESIZE
 * (big page) directory entry panics.
 */
ulong*
mmuwalk(ulong* pdb, ulong va, int level, int create)
{
	ulong pa, *pde, *ptab;

	LOG(dp("pdb is %p\n", pdb)); 
	pde = &pdb[PDX(va)];
	LOG(dp("table %p\n", pde));
	if((*pde & PTEVALID) == 0 && create == 0)
		return 0;

	LOG(dp("switch on level\n"));
	if(level == 1)
		return pde;
	if(level != 2)
		return 0;

	LOG(dp("case 2, table %p\n", pde));
	if(*pde & PTESIZE)
		panic("mmuwalk2: va %luX entry %luX\n", va, *pde);
	if((*pde & PTEVALID) == 0){
		/* only reached with create set: make the second-level table */
		pa = PADDR(xspanalloc(BY2PG, BY2PG, 0));
		*pde = pa|PTEWRITE|PTEVALID;
	}
	ptab = KADDR(PPN(*pde));

	return &ptab[PTX(va)];
}

/*
 * Taken with ilock() by mmukmapsync() and mmukmap() while they walk
 * and update processor 0's page directory (MACHP(0)->pdb).
 */
static Lock mmukmaplock;

/*
 * Propagate processor 0's kernel mapping for va to this processor's
 * prototype page directory and to the current process' page directory.
 * Returns 0 if processor 0 has no mapping for va (no valid directory
 * entry, or no page table behind a small-page entry), 1 otherwise.
 *
 * A process' pdb page is write-protected by mmupdballoc(), so its
 * entry is updated through the Xen update queue, exactly as
 * mmuswitch() and putmmu() do, rather than by a direct store.
 */
int
mmukmapsync(ulong va)
{
	Mach *mach0;
	ulong entry, *pte, *pdb;

	mach0 = MACHP(0);
	LOG(dp("mmukmapsync: va 0x%ulx, mach0 %p\n", va, mach0));
	LOG(dp("mach0->pdb is %p\n", mach0->pdb));
	/* don't need this any more ...
	if (va == 0) 
		panic("va is 0\n");
	*/
	LOG(dp("mmuwalk to there is %p\n", mmuwalk(mach0->pdb, va, 1, 0)));

	ilock(&mmukmaplock);

	if((pte = mmuwalk(mach0->pdb, va, 1, 0)) == nil){
		iunlock(&mmukmaplock);
		return 0;
	}
	/* pte still points at the directory entry; the level 2 walk is only a check */
	if(!(*pte & PTESIZE) && mmuwalk(mach0->pdb, va, 2, 0) == nil){
		iunlock(&mmukmaplock);
		return 0;
	}
	entry = *pte;

	/*
	 * NOTE(review): m->pdb is still written directly here; confirm
	 * whether the prototype pdb is writable under Xen at this point.
	 */
	if(!(m->pdb[PDX(va)] & PTEVALID))
		m->pdb[PDX(va)] = entry;

	if(up && up->mmupdb){
		pdb = (ulong*)up->mmupdb->va;
		queue_l2_entry_update(&pdb[PDX(va)], entry);
		_flush_page_update_queue();
		mmuflushtlb(pdb);
	}
	else
		mmuflushtlb(/*PADDR*/(m->pdb));

	iunlock(&mmukmaplock);

	return 1;
}

/*
 * Map the physical range [pa, pa+size) into processor 0's page
 * directory at virtual address va, or at KADDR(pa) when va is 0.
 * Both addresses are first reduced with PPN().
 *
 * Returns 0 without mapping anything if pa, pa+size, va or va+size
 * lies above TOM.  Otherwise returns the value of pa on leaving the
 * loop, i.e. the physical address just past what was covered.
 *
 * New small-page entries are queued with queue_l1_entry_update() and
 * the queue is flushed after mmukmaplock has been released; then
 * mmukmapsync() is called for the starting virtual address.
 */
ulong
mmukmap(ulong pa, ulong va, int size)
{
	/*
	 * NOTE(review): this declares __flush_page_update_queue (two
	 * underscores) but the calls below use _flush_page_update_queue;
	 * the prototype looks unused - confirm against xen.h.
	 */
	void __flush_page_update_queue(void);
	Mach *mach0;
	ulong ova, pae, *table, pgsz, *pte, x;
	int pse, sync;
	ulong vae;

//	panic("mmukmap");
	mach0 = MACHP(0);
	/* the big-page probe is compiled out unless NOT is defined, leaving pse 0 */
#ifdef NOT
	if((mach0->cpuiddx & 0x08) && (getcr4() & 0x10))
		pse = 1;
	else
#endif
		pse = 0;
	sync = 0;

	pa = PPN(pa);
	if(va == 0)
		va = (ulong)KADDR(pa);
	else
		va = PPN(va);
	ova = va;	/* remembered for the mmukmapsync() call at the end */

	/* for xen, the last 64 MB of virtual is disallowed. Just disallow 
	  * anything for now.
	  */

	
	pae = pa + size;
	vae = va + size;
	if (pa > TOM) {
		LOG(dp("pa 0x%ulx not allowed in XEN mode\n", pa));
		return 0;
	}
	if (pae > TOM) {
		LOG(dp("pa end 0x%ulx not allowed in XEN mode\n", pae));
		return 0;
	}
	if (va > TOM) {
		LOG(dp("va 0x%ulx not allowed in XEN mode\n", va));
		return 0;
	}
	if (vae > TOM) {
		LOG(dp("vae 0x%ulx not allowed in XEN mode\n", vae));
		return 0;
	}

	ilock(&mmukmaplock);
	while(pa < pae){
		table = &mach0->pdb[PDX(va)];
		/*
		 * Possibly already mapped.
		 */
		if(*table & PTEVALID){
			if(*table & PTESIZE){
				/* big pages are refused outright; the code after the panic is kept from the native port */
				panic("NO BIG PAGES");
				/*
				 * Big page. Does it fit within?
				 * If it does, adjust pgsz so the correct end can be
				 * returned and get out.
				 * If not, adjust pgsz up to the next 4MB boundary
				 * and continue.
				 */
				x = PPN(*table);
				if(x != pa)
					panic("mmukmap1: pa %luX  entry %luX\n",
						pa, *table);
				x += 4*MB;
				if(pae <= x){
					pa = pae;
					break;
				}
				pgsz = x - pa;
				pa += pgsz;
				va += pgsz;

				continue;
			}
			else{
				/*
				 * Little page. Walk to the entry.
				 * If the entry is valid, set pgsz and continue.
				 * If not, make it so, set pgsz, sync and continue.
				 */
				pte = mmuwalk(mach0->pdb, va, 2, 0);
				if(pte && *pte & PTEVALID){
					/* an existing mapping must already point at pa */
					x = PPN(*pte);
					if(x != pa)
						panic("mmukmap2: pa %luX entry %luX\n",
							pa, *pte);
					pgsz = BY2PG;
					pa += pgsz;
					va += pgsz;
					sync++;

					continue;
				}
				/* directory entry valid but pte not: fall through and map it */
			}
		}

		/*
		 * Not mapped. Check if it can be mapped using a big page -
		 * starts on a 4MB boundary, size >= 4MB and processor can do it.
		 * If not a big page, walk the walk, talk the talk.
		 * Sync is set.
		 *
		 * If we're creating a kernel mapping, we know that it will never
		 * expire and thus we can set the PTEGLOBAL bit to make the entry
	 	 * persist in the TLB across flushes.  If we do add support later for
		 * unmapping kernel addresses, see devarch.c for instructions on
		 * how to do a full TLB flush.
		 */
		if(pse && (pa % (4*MB)) == 0 && (pae >= pa+4*MB)){
			*table = pa|PTESIZE|PTEWRITE|PTEUNCACHED|PTEVALID;
			if((va&KZERO) && m->havepge)
				*table |= PTEGLOBAL;
			pgsz = 4*MB;
		}
		else{
			ulong pteval;
			/* create the page table if needed, then queue the new pte */
			pte = mmuwalk(mach0->pdb, va, 2, 1);
			pteval = pa|PTEWRITE|PTEUNCACHED|PTEVALID;
			if((va&KZERO) && m->havepge)
				pteval |= PTEGLOBAL;
			queue_l1_entry_update(pte, pteval);
			pgsz = BY2PG;
		}
		pa += pgsz;
		va += pgsz;
		sync++;
	}
	iunlock(&mmukmaplock);
	/* push out the queued pte updates (done after the lock is dropped) */
	if (sync)
		_flush_page_update_queue();
	/*
	 * If something was added
	 * then need to sync up.
	 */
	if(sync)
		mmukmapsync(ova);

	return pa;
}

/*
 * Bell Labs OSI certified Powered by Plan 9
 *
 * (Return to Plan 9 Home Page)
 *
 * Copyright © 2021 Plan 9 Foundation. All Rights Reserved.
 * Comments to [email protected].
 */