#include "u.h"
#include "tos.h"
#include "../port/lib.h"
#include "mem.h"
#include "dat.h"
#include "fns.h"
#include "../port/error.h"
#include "edf.h"
#include <a.out.h>
int shargs(char*, int, char**);
extern void checkpages(void);
extern void checkpagerefs(void);
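/*
 * sysr1: spare system call; here it runs the page-reference
 * consistency check as a debugging aid.
 */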
long
sysr1(ulong*)
{
checkpagerefs();
return 0;
}
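/*
 * sysrfork: create a new process, or change resources of the calling
 * process, according to the RF* flags in arg[0].
 */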
long
sysrfork(ulong *arg)
{
Proc *p;
int n, i;
Fgrp *ofg;
Pgrp *opg;
Rgrp *org;
Egrp *oeg;
ulong pid, flag;
Mach *wm;
flag = arg[0];
/* Check flags before we commit */
if((flag & (RFFDG|RFCFDG)) == (RFFDG|RFCFDG))
error(Ebadarg);
if((flag & (RFNAMEG|RFCNAMEG)) == (RFNAMEG|RFCNAMEG))
error(Ebadarg);
if((flag & (RFENVG|RFCENVG)) == (RFENVG|RFCENVG))
error(Ebadarg);
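	/* No RFPROC: apply the requested changes to the calling process and return */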
if((flag&RFPROC) == 0) {
if(flag & (RFMEM|RFNOWAIT))
error(Ebadarg);
if(flag & (RFFDG|RFCFDG)) {
ofg = up->fgrp;
if(flag & RFFDG)
up->fgrp = dupfgrp(ofg);
else
up->fgrp = dupfgrp(nil);
closefgrp(ofg);
}
if(flag & (RFNAMEG|RFCNAMEG)) {
opg = up->pgrp;
up->pgrp = newpgrp();
if(flag & RFNAMEG)
pgrpcpy(up->pgrp, opg);
/* inherit noattach */
up->pgrp->noattach = opg->noattach;
closepgrp(opg);
}
if(flag & RFNOMNT)
up->pgrp->noattach = 1;
if(flag & RFREND) {
org = up->rgrp;
up->rgrp = newrgrp();
closergrp(org);
}
if(flag & (RFENVG|RFCENVG)) {
oeg = up->egrp;
up->egrp = smalloc(sizeof(Egrp));
up->egrp->ref = 1;
if(flag & RFENVG)
envcpy(up->egrp, oeg);
closeegrp(oeg);
}
if(flag & RFNOTEG)
			up->noteid = incref(&noteidalloc);
return 0;
}
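	/* Fork a full process: allocate it and copy inherited state from up */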
p = newproc();
clonefpu(p, up);
p->scallnr = up->scallnr;
p->s = up->s;
p->nerrlab = 0;
p->slash = up->slash;
p->dot = up->dot;
incref(p->dot);
memmove(p->note, up->note, sizeof(p->note));
p->privatemem = up->privatemem;
p->noswap = up->noswap;
p->nnote = up->nnote;
p->notified = 0;
p->lastnote = up->lastnote;
p->notify = up->notify;
p->ureg = up->ureg;
p->dbgreg = 0;
/* Make a new set of memory segments */
n = flag & RFMEM;
qlock(&p->seglock);
if(waserror()){
qunlock(&p->seglock);
nexterror();
}
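	/* If the parent's segment array is larger than NSEG, give the child one of the same size */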
i = up->seglen;
if (i > NSEG) {
free(p->seg);
p->seg = 0;
p->seglen = 0;
p->seg = malloc(sizeof *p->seg * i);
p->seglen = i;
}
for(i = 0; i < up->seglen; i++)
if(up->seg[i]) {
p->seg[i] = dupseg(up->seg, i, n);
p->nseg++;
}
if (p->nseg != up->nseg) {
print("SRC nseg len (%d %d) CHLD nseg chld (%d %d) fail %d \n",
up->nseg, up->seglen, p->nseg, p->seglen, p->kp);
}
assert(p->seglen == up->seglen);
assert(p->nseg == up->nseg);
qunlock(&p->seglock);
poperror();
if (flag & RFMEM) {
/*
* If we are using RFMEM, share the segmap; it is safe to blindly
* increment the segmap's refcount, as we ourselves (up) hold a reference
* to it, keeping it live. We do not need to lock the segment map,
* as we are not changing any of the segments or counts.
*/
incref(up->segmap);
p->segmap = up->segmap;
qlock(&p->segmap->seglock);
for(i = 0; i < p->segmap->nseg; i++)
if (p->segmap->seg[i] != nil) {
lock(p->segmap->seg[i]);
p->segmap->seg[i]->ref++;
unlock(p->segmap->seg[i]);
}
qunlock(&p->segmap->seglock);
} else {
/* Allocate a new segment map */
p->segmap = malloc(sizeof *p->segmap);
		p->segmap->seg = malloc(sizeof *p->segmap->seg * NSEG);
p->segmap->seglen = NSEG;
incref(p->segmap);
}
/* File descriptors */
if(flag & (RFFDG|RFCFDG)) {
if(flag & RFFDG)
p->fgrp = dupfgrp(up->fgrp);
else
p->fgrp = dupfgrp(nil);
}
else {
p->fgrp = up->fgrp;
incref(p->fgrp);
}
/* Process groups */
if(flag & (RFNAMEG|RFCNAMEG)) {
p->pgrp = newpgrp();
if(flag & RFNAMEG)
pgrpcpy(p->pgrp, up->pgrp);
/* inherit noattach */
p->pgrp->noattach = up->pgrp->noattach;
}
else {
p->pgrp = up->pgrp;
incref(p->pgrp);
}
if(flag & RFNOMNT)
up->pgrp->noattach = 1;
if(flag & RFREND)
p->rgrp = newrgrp();
else {
incref(up->rgrp);
p->rgrp = up->rgrp;
}
/* Environment group */
if(flag & (RFENVG|RFCENVG)) {
p->egrp = smalloc(sizeof(Egrp));
p->egrp->ref = 1;
if(flag & RFENVG)
envcpy(p->egrp, up->egrp);
}
else {
p->egrp = up->egrp;
incref(p->egrp);
}
p->hang = up->hang;
p->procmode = up->procmode;
	/*
	 * Craft a return frame which will cause the child to pop out of
	 * the scheduler in user mode with the return register zero
	 */
forkchild(p, up->dbgreg);
p->parent = up;
p->parentpid = up->pid;
if(flag&RFNOWAIT)
p->parentpid = 0;
else {
lock(&up->exl);
up->nchild++;
unlock(&up->exl);
}
if((flag&RFNOTEG) == 0)
p->noteid = up->noteid;
p->fpstate = up->fpstate;
pid = p->pid;
memset(p->time, 0, sizeof(p->time));
p->time[TReal] = MACHP(0)->ticks;
kstrdup(&p->text, up->text);
kstrdup(&p->user, up->user);
/*
* since the bss/data segments are now shareable,
* any mmu info about this process is now stale
* (i.e. has bad properties) and has to be discarded.
*/
flushmmu();
p->basepri = up->basepri;
p->priority = up->basepri;
p->fixedpri = up->fixedpri;
p->mp = up->mp;
wm = up->wired;
if(wm)
procwired(p, wm->machno);
ready(p);
sched();
return pid;
}
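/* Interpret the 4 bytes of l, stored big-endian in the a.out header, as a ulong */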
static ulong
l2be(long l)
{
uchar *cp;
cp = (uchar*)&l;
return (cp[0]<<24) | (cp[1]<<16) | (cp[2]<<8) | cp[3];
}
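/*
 * sysexec: replace the current image with the program named by arg[0],
 * passing it the argument vector pointed to by arg[1].
 */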
long
sysexec(ulong *arg)
{
Segment *s, *ts;
ulong t, d, b;
int i;
Chan *tc;
char **argv, **argp;
char *a, *charp, *args, *file, *file0;
char *progarg[sizeof(Exec)/2+1], *elem, progelem[64];
ulong ssize, spage, nargs, nbytes, n, bssend;
int indir;
Exec exec;
char line[sizeof(Exec)];
Fgrp *f;
Image *img;
ulong magic, text, entry, data, bss;
Tos *tos;
Segment **ssegmap;
uintptr sgbase, sgtop, diff;
int rc;
indir = 0;
elem = nil;
validaddr(arg[0], 1, 0);
file0 = validnamedup((char*)arg[0], 1);
if(waserror()){
free(file0);
free(elem);
nexterror();
}
file = file0;
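	/* Open the file for execution; a #! header restarts the loop with the named interpreter */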
for(;;){
tc = namec(file, Aopen, OEXEC, 0);
if(waserror()){
cclose(tc);
nexterror();
}
if(!indir)
kstrdup(&elem, up->genbuf);
n = devtab[tc->type]->read(tc, &exec, sizeof(Exec), 0);
if(n < 2)
error(Ebadexec);
magic = l2be(exec.magic);
text = l2be(exec.text);
entry = l2be(exec.entry);
if(n==sizeof(Exec) && (magic == AOUT_MAGIC)){
if(text >= USTKTOP-UTZERO
|| entry < UTZERO+sizeof(Exec)
|| entry >= UTZERO+sizeof(Exec)+text)
error(Ebadexec);
break; /* for binary */
}
/*
* Process #! /bin/sh args ...
*/
memmove(line, &exec, sizeof(Exec));
if(indir || line[0]!='#' || line[1]!='!')
error(Ebadexec);
n = shargs(line, n, progarg);
if(n == 0)
error(Ebadexec);
indir = 1;
/*
* First arg becomes complete file name
*/
progarg[n++] = file;
progarg[n] = 0;
validaddr(arg[1], BY2WD, 1);
arg[1] += BY2WD;
file = progarg[0];
if(strlen(elem) >= sizeof progelem)
error(Ebadexec);
strcpy(progelem, elem);
progarg[0] = progelem;
poperror();
cclose(tc);
}
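	/* Round the text, data and bss boundaries up to page multiples */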
data = l2be(exec.data);
bss = l2be(exec.bss);
t = (UTZERO+sizeof(Exec)+text+(BY2PG-1)) & ~(BY2PG-1);
d = (t + data + (BY2PG-1)) & ~(BY2PG-1);
bssend = t + data + bss;
b = (bssend + (BY2PG-1)) & ~(BY2PG-1);
if(t >= KZERO || d >= KZERO || b >= KZERO)
error(Ebadexec);
/*
* Args: pass 1: count
*/
nbytes = sizeof(Tos); /* hole for profiling clock at top of stack (and more) */
nargs = 0;
if(indir){
argp = progarg;
while(*argp){
a = *argp++;
nbytes += strlen(a) + 1;
nargs++;
}
}
evenaddr(arg[1]);
argp = (char**)arg[1];
validaddr((ulong)argp, BY2WD, 0);
while(*argp){
a = *argp++;
if(((ulong)argp&(BY2PG-1)) < BY2WD)
validaddr((ulong)argp, BY2WD, 0);
validaddr((ulong)a, 1, 0);
nbytes += ((char*)vmemchr(a, 0, 0x7FFFFFFF) - a) + 1;
nargs++;
}
ssize = BY2WD*(nargs+1) + ((nbytes+(BY2WD-1)) & ~(BY2WD-1));
/*
* 8-byte align SP for those (e.g. sparc) that need it.
* execregs() will subtract another 4 bytes for argc.
*/
if((ssize+4) & 7)
ssize += 4;
spage = (ssize+(BY2PG-1)) >> PGSHIFT;
/*
* Build the stack segment, putting it in kernel virtual for the moment
*/
if(spage > TSTKSIZ)
error(Enovmem);
qlock(&up->seglock);
if(waserror()){
qunlock(&up->seglock);
nexterror();
}
up->seg[ESEG] = newseg(SG_STACK, TSTKTOP-USTKSIZE, USTKSIZE/BY2PG);
/*
* Args: pass 2: assemble; the pages will be faulted in
*/
tos = (Tos*)(TSTKTOP - sizeof(Tos));
tos->cyclefreq = m->cyclefreq;
cycles((uvlong*)&tos->pcycles);
tos->pcycles = -tos->pcycles;
tos->kcycles = tos->pcycles;
tos->clock = 0;
argv = (char**)(TSTKTOP - ssize);
charp = (char*)(TSTKTOP - nbytes);
args = charp;
if(indir)
argp = progarg;
else
argp = (char**)arg[1];
for(i=0; i<nargs; i++){
if(indir && *argp==0) {
indir = 0;
argp = (char**)arg[1];
}
*argv++ = charp + (USTKTOP-TSTKTOP);
n = strlen(*argp) + 1;
memmove(charp, *argp++, n);
charp += n;
}
free(file0);
free(up->text);
up->text = elem;
elem = nil; /* so waserror() won't free elem */
USED(elem);
/* copy args; easiest from new process's stack */
n = charp - args;
if(n > 128) /* don't waste too much space on huge arg lists */
n = 128;
a = up->args;
up->args = nil;
free(a);
up->args = smalloc(n);
memmove(up->args, args, n);
if(n>0 && up->args[n-1]!='\0'){
/* make sure last arg is NUL-terminated */
/* put NUL at UTF-8 character boundary */
for(i=n-1; i>0; --i)
if(fullrune(up->args+i, n-i))
break;
up->args[i] = 0;
n = i+1;
}
up->nargs = n;
/*
* Committed.
* Free old memory.
* Special segments are maintained across exec
*/
for(i = SSEG; i <= BSEG; i++) {
putseg(up->seg[i]);
/* prevent a second free if we have an error */
if (up->seg[i] != nil)
up->nseg--;
up->seg[i] = 0;
}
for(i = BSEG+1; i < up->seglen; i++) {
s = up->seg[i];
if(s != 0 && (s->type&SG_CEXEC)) {
putseg(s);
up->seg[i] = 0;
up->nseg--;
}
}
/*
* Close on exec
*/
f = up->fgrp;
for(i=0; i<=f->maxfd; i++)
fdclose(i, CCEXEC);
/*
* Allocate a new shared segment map and free the old one
*
	 * It _is_ safe to decrement and possibly free the segment map here:
	 * decref can only return 0 if no one else holds _or can get_ a
	 * reference to the segment map.
	 *
	 * We do not violate Memetian's rule ("An Object cannot synchronize its
	 * own visibility") because:
*
* 1) If there are multiple processes sharing this segment map, decref
* will not return zero; we are safe, we do not free the structure.
* 2) If we are currently the only process with a reference to the segment
* map, no other process may gain a reference to it unless we call rfork().
* There is no way we can call rfork in this shared map after we are done
* with exec, as exec always switches shared maps. There are no races
* where we start a child via rfork and then exec before the child has
* a chance to run, as we do the incref on the shared map ourselves in
* rfork.
* 3) There is no race with regard to exit - either a process in the same
* shared map calls pexit() before we call decref, at which point our
	 * decref will return zero _and it is appropriate for us to free it_,
	 * or we will do the same. We must take care in both pexit and
* in here to not use the segmap after we decref. Between our decref
* and the test of i, the segmap may disappear.
*/
ssegmap = up->segmap->seg;
n = up->segmap->nseg;
i = decref(up->segmap);
if (i == 0) {
free(up->segmap);
up->segmap = nil;
for (i = 0; i < n; i++)
if (ssegmap[i] != nil) {
putseg(ssegmap[i]);
ssegmap[i] = nil;
}
free(ssegmap);
}
up->segmap = malloc(sizeof *up->segmap);
up->segmap->seg = malloc(sizeof *up->segmap->seg * NSEG);
up->segmap->seglen = NSEG;
incref(up->segmap);
/* Text. Shared. Attaches to cache image if possible */
/* attachimage returns a locked cache image */
img = attachimage(SG_TEXT|SG_RONLY, tc, UTZERO, (t-UTZERO)>>PGSHIFT);
ts = img->s;
up->seg[TSEG] = ts;
up->nseg++;
ts->flushme = 1;
ts->fstart = 0;
ts->flen = sizeof(Exec)+text;
unlock(img);
/* Data. Shared. */
s = newseg(SG_DATA, t, (d-t)>>PGSHIFT);
up->seg[DSEG] = s;
up->nseg++;
/* Attached by hand */
incref(img);
s->image = img;
s->fstart = ts->fstart+ts->flen;
s->flen = data;
/* BSS. Zero fill on demand */
up->seg[BSEG] = newseg(SG_BSS, d, (b-d)>>PGSHIFT);
up->nseg++;
/*
* Move the stack
*/
s = up->seg[ESEG];
up->seg[ESEG] = 0;
up->seg[SSEG] = s;
qunlock(&up->seglock);
poperror(); /* seglock */
poperror(); /* elem */
s->base = USTKTOP-USTKSIZE;
s->top = USTKTOP;
relocateseg(s, USTKTOP-TSTKTOP);
up->nseg++;
/*
* '/' processes are higher priority (hack to make /ip more responsive).
*/
if(devtab[tc->type]->dc == L'/')
up->basepri = PriRoot;
up->priority = up->basepri;
poperror();
cclose(tc);
/*
* At this point, the mmu contains info about the old address
* space and needs to be flushed
*/
flushmmu();
qlock(&up->debug);
up->nnote = 0;
up->notify = 0;
up->notified = 0;
up->privatemem = 0;
procsetup(up);
qunlock(&up->debug);
if(up->hang)
up->procctl = Proc_stopme;
/*
* Committed segments: fault them in again, as this process has a
* different pmap.
*/
for (i = 0; i < up->seglen; i++) {
if ((up->seg[i]) && (up->seg[i]->type & SG_COMMIT)) {
s = up->seg[i];
sgbase = s->base;
sgtop = s->top;
for (diff = sgbase; diff < sgtop; diff += BY2PG) {
qlock(&s->lk);
rc = fixfault(s, diff, 0, 1);
if (rc != 0)
print("Warn: prefault-exec fail on commit\n");
}
}
}
return execregs(entry, ssize, nargs);
}
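/*
 * Parse the interpreter and arguments from a #! line into ap;
 * returns the argument count, or 0 if no complete line was found.
 */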
int
shargs(char *s, int n, char **ap)
{
int i;
s += 2;
n -= 2; /* skip #! */
for(i=0; s[i]!='\n'; i++)
if(i == n-1)
return 0;
s[i] = 0;
*ap = 0;
i = 0;
for(;;) {
while(*s==' ' || *s=='\t')
s++;
if(*s == 0)
break;
i++;
*ap++ = s;
*ap = 0;
while(*s && *s!=' ' && *s!='\t')
s++;
if(*s == 0)
break;
else
*s++ = 0;
}
return i;
}
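/* Always-false sleep condition: lets tsleep in syssleep wait out the full interval */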
int
return0(void*)
{
return 0;
}
long
syssleep(ulong *arg)
{
int n;
n = arg[0];
if(n <= 0) {
if (up->edf && (up->edf->flags & Admitted))
edfyield();
else
yield();
return 0;
}
if(n < TK2MS(1))
n = TK2MS(1);
tsleep(&up->sleep, return0, 0, n);
return 0;
}
long
sysalarm(ulong *arg)
{
return procalarm(arg[0]);
}
long
sysexits(ulong *arg)
{
char *status;
char *inval = "invalid exit string";
char buf[ERRMAX];
status = (char*)arg[0];
if(status){
if(waserror())
status = inval;
else{
validaddr((ulong)status, 1, 0);
if(vmemchr(status, 0, ERRMAX) == 0){
memmove(buf, status, ERRMAX);
buf[ERRMAX-1] = 0;
status = buf;
}
poperror();
}
}
pexit(status, 1);
return 0; /* not reached */
}
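/* Old wait interface: report an exited child in an OWaitmsg, with numbers as fixed-width decimal strings */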
long
sys_wait(ulong *arg)
{
int pid;
Waitmsg w;
OWaitmsg *ow;
if(arg[0] == 0)
return pwait(nil);
validaddr(arg[0], sizeof(OWaitmsg), 1);
evenaddr(arg[0]);
pid = pwait(&w);
if(pid >= 0){
ow = (OWaitmsg*)arg[0];
readnum(0, ow->pid, NUMSIZE, w.pid, NUMSIZE);
readnum(0, ow->time+TUser*NUMSIZE, NUMSIZE, w.time[TUser], NUMSIZE);
readnum(0, ow->time+TSys*NUMSIZE, NUMSIZE, w.time[TSys], NUMSIZE);
readnum(0, ow->time+TReal*NUMSIZE, NUMSIZE, w.time[TReal], NUMSIZE);
strncpy(ow->msg, w.msg, sizeof(ow->msg));
ow->msg[sizeof(ow->msg)-1] = '\0';
}
return pid;
}
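/* Await interface: write the exited child's pid, times and exit string as one formatted line into the caller's buffer */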
long
sysawait(ulong *arg)
{
int i;
int pid;
Waitmsg w;
ulong n;
n = arg[1];
validaddr(arg[0], n, 1);
pid = pwait(&w);
if(pid < 0)
return -1;
i = snprint((char*)arg[0], n, "%d %lud %lud %lud %q",
w.pid,
w.time[TUser], w.time[TSys], w.time[TReal],
w.msg);
return i;
}
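/* Format an error string into the current process's syserrstr */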
void
werrstr(char *fmt, ...)
{
va_list va;
if(up == nil)
return;
va_start(va, fmt);
vseprint(up->syserrstr, up->syserrstr+ERRMAX, fmt, va);
va_end(va);
}
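/*
 * Exchange error strings: copy the caller's buffer into syserrstr and
 * return the previous syserrstr contents in the caller's buffer.
 */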
static long
generrstr(char *buf, uint nbuf)
{
char tmp[ERRMAX];
if(nbuf == 0)
error(Ebadarg);
validaddr((ulong)buf, nbuf, 1);
if(nbuf > sizeof tmp)
nbuf = sizeof tmp;
memmove(tmp, buf, nbuf);
/* make sure it's NUL-terminated */
tmp[nbuf-1] = '\0';
memmove(buf, up->syserrstr, nbuf);
buf[nbuf-1] = '\0';
memmove(up->syserrstr, tmp, nbuf);
return 0;
}
long
syserrstr(ulong *arg)
{
return generrstr((char*)arg[0], arg[1]);
}
/* compatibility for old binaries */
long
sys_errstr(ulong *arg)
{
return generrstr((char*)arg[0], 64);
}
long
sysnotify(ulong *arg)
{
if(arg[0] != 0)
validaddr(arg[0], sizeof(ulong), 0);
up->notify = (int(*)(void*, char*))(arg[0]);
return 0;
}
long
sysnoted(ulong *arg)
{
if(arg[0]!=NRSTR && !up->notified)
error(Egreg);
return 0;
}
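/*
 * Resize the segment containing arg[0] via ibrk; text, data and
 * stack segments may not be resized this way.
 */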
long
syssegbrk(ulong *arg)
{
int i;
ulong addr;
Segment *s;
addr = arg[0];
for(i = 0; i < up->seglen; i++) {
s = up->seg[i];
if(s == 0 || addr < s->base || addr >= s->top)
continue;
switch(s->type&SG_TYPE) {
case SG_TEXT:
case SG_DATA:
case SG_STACK:
error(Ebadarg);
default:
return ibrk(arg[1], i);
}
}
error(Ebadarg);
return 0; /* not reached */
}
long
syssegattach(ulong *arg)
{
return segattach(up, arg[0], (char*)arg[1], arg[2], arg[3]);
}
long
syssegdetach(ulong *arg)
{
int i;
ulong addr;
Segment *s;
qlock(&up->seglock);
if(waserror()){
qunlock(&up->seglock);
nexterror();
}
s = 0;
addr = arg[0];
for(i = 0; i < up->seglen; i++)
if(s = up->seg[i]) {
qlock(&s->lk);
if((addr >= s->base && addr < s->top) ||
(s->top == s->base && addr == s->base))
goto found;
qunlock(&s->lk);
}
error(Ebadarg);
found:
/*
* Check we are not detaching the initial stack segment.
*/
if(s == up->seg[SSEG]){
qunlock(&s->lk);
error(Ebadarg);
}
up->seg[i] = 0;
qunlock(&s->lk);
putseg(s);
up->nseg--;
	/*
	 * There is now a hole in the segment array; we know the offset, so
	 * move everything down one and fill in the rest of the array with
	 * zeros. This upholds the invariant that the user part of the
	 * segment table is contiguous.
	 */
if (i != (up->seglen - 1)) {
memmove(&up->seg[i], &up->seg[i + 1], (up->seglen - i - 1) * sizeof(Segment *));
memset(&up->seg[up->nseg + 1], 0, (up->seglen - up->nseg - 1) * sizeof(Segment *));
}
qunlock(&up->seglock);
poperror();
/* Ensure we flush any entries from the lost segment */
flushmmu();
return 0;
}
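/*
 * Free the physical pages backing the page-aligned portion of
 * [arg[0], arg[0]+arg[1]) within the containing segment.
 */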
long
syssegfree(ulong *arg)
{
Segment *s;
ulong from, to;
from = arg[0];
s = seg(up, from, 1);
if(s == nil)
error(Ebadarg);
to = (from + arg[1]) & ~(BY2PG-1);
from = PGROUND(from);
if(to > s->top) {
qunlock(&s->lk);
error(Ebadarg);
}
mfreeseg(s, from, (to - from) / BY2PG);
qunlock(&s->lk);
flushmmu();
return 0;
}
/* For binary compatibility */
long
sysbrk_(ulong *arg)
{
return ibrk(arg[0], BSEG);
}
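/*
 * Rendezvous: if a partner is already waiting on this tag, exchange
 * values and wake it; otherwise queue up and sleep until one arrives.
 */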
long
sysrendezvous(ulong *arg)
{
uintptr tag, val;
Proc *p, **l;
tag = arg[0];
l = &REND(up->rgrp, tag);
up->rendval = ~(uintptr)0;
lock(up->rgrp);
for(p = *l; p; p = p->rendhash) {
if(p->rendtag == tag) {
*l = p->rendhash;
val = p->rendval;
p->rendval = arg[1];
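			/*
			 * the peer may still be running on another processor;
			 * wait until it is off before readying it
			 */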
while(p->mach != 0)
;
ready(p);
unlock(up->rgrp);
return val;
}
l = &p->rendhash;
}
/* Going to sleep here */
up->rendtag = tag;
up->rendval = arg[1];
up->rendhash = *l;
*l = up;
up->state = Rendezvous;
unlock(up->rgrp);
sched();
return up->rendval;
}
/*
* The implementation of semaphores is complicated by needing
* to avoid rescheduling in syssemrelease, so that it is safe
* to call from real-time processes. This means syssemrelease
* cannot acquire any qlocks, only spin locks.
*
* Semacquire and semrelease must both manipulate the semaphore
* wait list. Lock-free linked lists only exist in theory, not
* in practice, so the wait list is protected by a spin lock.
*
* The semaphore value *addr is stored in user memory, so it
* cannot be read or written while holding spin locks.
*
* Thus, we can access the list only when holding the lock, and
* we can access the semaphore only when not holding the lock.
* This makes things interesting. Note that sleep's condition function
* is called while holding two locks - r and up->rlock - so it cannot
* access the semaphore value either.
*
* An acquirer announces its intention to try for the semaphore
* by putting a Sema structure onto the wait list and then
 * setting Sema.waiting. After one last check of the semaphore,
* the acquirer sleeps until Sema.waiting==0. A releaser of n
* must wake up n acquirers who have Sema.waiting set. It does
* this by clearing Sema.waiting and then calling wakeup.
*
* There are three interesting races here.
* The first is that in this particular sleep/wakeup usage, a single
* wakeup can rouse a process from two consecutive sleeps!
* The ordering is:
*
* (a) set Sema.waiting = 1
* (a) call sleep
* (b) set Sema.waiting = 0
* (a) check Sema.waiting inside sleep, return w/o sleeping
* (a) try for semaphore, fail
* (a) set Sema.waiting = 1
* (a) call sleep
* (b) call wakeup(a)
* (a) wake up again
*
* This is okay - semacquire will just go around the loop
* again. It does mean that at the top of the for(;;) loop in
* semacquire, phore.waiting might already be set to 1.
*
* The second is that a releaser might wake an acquirer who is
* interrupted before he can acquire the lock. Since
* release(n) issues only n wakeup calls -- only n can be used
* anyway -- if the interrupted process is not going to use his
* wakeup call he must pass it on to another acquirer.
*
* The third race is similar to the second but more subtle. An
* acquirer sets waiting=1 and then does a final canacquire()
* before going to sleep. The opposite order would result in
* missing wakeups that happen between canacquire and
* waiting=1. (In fact, the whole point of Sema.waiting is to
* avoid missing wakeups between canacquire() and sleep().) But
* there can be spurious wakeups between a successful
* canacquire() and the following semdequeue(). This wakeup is
* not useful to the acquirer, since he has already acquired
* the semaphore. Like in the previous case, though, the
* acquirer must pass the wakeup call along.
*
* This is all rather subtle. The code below has been verified
* with the spin model /sys/src/9/port/semaphore.p. The
* original code anticipated the second race but not the first
* or third, which were caught only with spin. The first race
* is mentioned in /sys/doc/sleep.ps, but I'd forgotten about it.
* It was lucky that my abstract model of sleep/wakeup still managed
* to preserve that behavior.
*
* I remain slightly concerned about memory coherence
* outside of locks. The spin model does not take
* queued processor writes into account so we have to
* think hard. The only variables accessed outside locks
* are the semaphore value itself and the boolean flag
* Sema.waiting. The value is only accessed with cmpswap,
* whose job description includes doing the right thing as
* far as memory coherence across processors. That leaves
* Sema.waiting. To handle it, we call coherence() before each
* read and after each write. - rsc
*/
/* Add semaphore p with addr a to list in seg. */
static void
semqueue(Segment *s, long *a, Sema *p)
{
memset(p, 0, sizeof *p);
p->addr = a;
lock(&s->sema); /* uses s->sema.Rendez.Lock, but no one else is */
p->next = &s->sema;
p->prev = s->sema.prev;
p->next->prev = p;
p->prev->next = p;
unlock(&s->sema);
}
/* Remove semaphore p from list in seg. */
static void
semdequeue(Segment *s, Sema *p)
{
lock(&s->sema);
p->next->prev = p->prev;
p->prev->next = p->next;
unlock(&s->sema);
}
/* Wake up n waiters with addr a on list in seg. */
static void
semwakeup(Segment *s, long *a, long n)
{
Sema *p;
lock(&s->sema);
for(p=s->sema.next; p!=&s->sema && n>0; p=p->next){
if(p->addr == a && p->waiting){
p->waiting = 0;
coherence();
wakeup(p);
n--;
}
}
unlock(&s->sema);
}
/* Add delta to semaphore and wake up waiters as appropriate. */
static long
semrelease(Segment *s, long *addr, long delta)
{
long value;
do
value = *addr;
while(!cmpswap(addr, value, value+delta));
semwakeup(s, addr, delta);
return value+delta;
}
/* Try to acquire semaphore using compare-and-swap */
static int
canacquire(long *addr)
{
long value;
while((value=*addr) > 0)
if(cmpswap(addr, value, value-1))
return 1;
return 0;
}
/* Should we wake up? */
static int
semawoke(void *p)
{
coherence();
return !((Sema*)p)->waiting;
}
/* Acquire semaphore (subtract 1). */
static int
semacquire(Segment *s, long *addr, int block)
{
int acquired;
Sema phore;
if(canacquire(addr))
return 1;
if(!block)
return 0;
acquired = 0;
semqueue(s, addr, &phore);
for(;;){
phore.waiting = 1;
coherence();
if(canacquire(addr)){
acquired = 1;
break;
}
if(waserror())
break;
sleep(&phore, semawoke, &phore);
poperror();
}
semdequeue(s, &phore);
coherence(); /* not strictly necessary due to lock in semdequeue */
if(!phore.waiting)
semwakeup(s, addr, 1);
if(!acquired)
nexterror();
return 1;
}
long
syssemacquire(ulong *arg)
{
int block;
long *addr;
Segment *s;
validaddr(arg[0], sizeof(long), 1);
evenaddr(arg[0]);
addr = (long*)arg[0];
block = arg[1];
if((s = seg(up, (ulong)addr, 0)) == nil)
error(Ebadarg);
if(*addr < 0)
error(Ebadarg);
return semacquire(s, addr, block);
}
long
syssemrelease(ulong *arg)
{
long *addr, delta;
Segment *s;
validaddr(arg[0], sizeof(long), 1);
evenaddr(arg[0]);
addr = (long*)arg[0];
delta = arg[1];
if((s = seg(up, (ulong)addr, 0)) == nil)
error(Ebadarg);
if(delta < 0 || *addr < 0)
error(Ebadarg);
	return semrelease(s, addr, delta);
}
|