#include <u.h>
#include <libc.h>
#include <bio.h>
#include <ctype.h>
#include "msgdb.h"
/*
 * Write the command synopsis to file descriptor 2 and
 * terminate the process with exit status "usage".
 */
void
usage(void)
{
	static char synopsis[] =
		"usage: upas/msgclass [-a] [-d name dbfile]... [-l lockfile] [-m mul] [-t thresh] [tokenfile ...]\n";

	fprint(2, "%s", synopsis);
	exits("usage");
}
/*
 * Fixed table sizes used throughout this file.
 */
enum
{
/* capacity of the best[] array of retained words */
MAXBEST = 32,
/* size in bytes of the token buffer Word.s (terminator included) */
MAXLEN = 64,
/* capacity of db[] and of the per-class arrays inside Word */
MAXTAB = 256,
};
/*
 * One message class, as given by a "-d name dbfile" option.
 */
typedef struct Ndb Ndb;
struct Ndb
{
/* class name: first argument of -d; printed in the result lines */
char *name;
/* database file name: second argument of -d; passed to mdopen */
char *file;
/* handle returned by mdopen(file, 0) */
Msgdb *db;
/* class score computed in main, normalized so the scores of all classes sum to 1 */
double p;
/*
 * value stored under the key "*From*" in the database; main divides
 * token counts by it (presumably the number of messages recorded
 * for the class -- confirm against the database writer)
 */
long nmsg;
};
/*
 * Per-token statistics gathered in main and kept, for the most
 * specific tokens, in best[] by noteword.
 */
typedef struct Word Word;
struct Word
{
/* token text; filled in by noteword with strecpy, truncated to fit */
char s[MAXLEN];
/* count[i]: count of the token in class i's database (class 0 is weighted by 3 in main) */
int count[MAXTAB];
/* p[i]: share of the token attributed to class i, clamped to [0.001, 0.999] */
double p[MAXTAB];
/* largest value among p[0..ndb-1] */
double mp;
int mi; /* w.p[w.mi] = w.mp */
/* neither set nor read in the visible code */
int nmsg;
};
/* message classes, one per -d option, in command-line order */
Ndb db[MAXTAB];
/* number of entries of db[] in use */
int ndb;
/* set by -a: after classifying, add the input tokens to the winning class's database */
int add;
/* value of -m: factor applied to each input token count when adding to a database */
int mul;
/* database opened with mdopen(nil, 1) that accumulates the input tokens and their counts */
Msgdb *indb;
/* words retained by noteword, ordered by decreasing mp */
Word best[MAXBEST];
/* maximum number of words noteword keeps in best[] */
int mbest = 15;
/* number of entries of best[] in use */
int nbest;
/* read "token [count]" lines from a Biobuf into indb */
void process(Biobuf*, char*);
/* open the named lock file (if any) and keep it open for the life of the process */
void lockfile(char*);
/*
 * Offer the word statistics *w, for the token s, to the best[] table.
 *
 * best[0..nbest-1] is kept ordered by decreasing mp and holds at most
 * mbest entries.  The new word is placed ahead of every existing entry
 * whose mp is not greater than its own; if that position lies at or
 * beyond mbest the word is discarded.  When the table is already full
 * the last entry is dropped to make room.  The token text is copied
 * into the stored entry, truncated to MAXLEN-1 bytes.
 */
void
noteword(Word *w, char *s)
{
	int pos, ntail;
	Word *slot;

	/* walk up from the tail past every entry that does not outrank w */
	pos = nbest;
	while(pos > 0 && !(w->mp < best[pos-1].mp))
		pos--;

	if(pos >= mbest)
		return;

	/* table full: forget the current last entry */
	if(nbest == mbest)
		nbest--;

	/* shift the lower-ranked entries down by one to open the slot */
	ntail = nbest - pos;
	if(ntail > 0)
		memmove(&best[pos+1], &best[pos], ntail*sizeof(best[0]));

	slot = &best[pos];
	*slot = *w;
	strecpy(slot->s, slot->s+MAXLEN, s);
	nbest++;
}
/*
 * Classify a tokenized message against the databases given with -d.
 *
 * Input is read from the named token files, or standard input when
 * none are given; each line is handed to process(), which accumulates
 * token counts in indb.  After taking the optional lock file (-l), the
 * class databases are opened and, for every input token, the share of
 * that token attributed to each class is computed.  The mbest tokens
 * with the highest single-class share are retained by noteword(), and
 * the per-class product of their shares, normalized to sum to 1, is
 * the class score.
 *
 * Output on standard output:
 *	line 1: winning class name, or "inconclusive" if its score is
 *	        below the -t threshold, followed by " name=score" for
 *	        every class;
 *	then one line per retained token with its per-class shares.
 *
 * With -a and a conclusive result, every input token count times the
 * -m multiplier (default 1) is added to the winning class's database.
 *
 * Exit status: nil on success, "usage", "open inputs" or
 * "open databases" on the corresponding failures.
 */
void
main(int argc, char **argv)
{
	int i, bad, m, tot, nn, j;
	Biobuf bin, *b, bout;
	char *s, *lf;
	double totp, p, thresh;
	long n;
	Word w;

	lf = nil;
	thresh = 0;
	/* default multiplier; left at 0, -a without -m would add n*0 to every count */
	mul = 1;
	ARGBEGIN{
	case 'a':
		add = 1;
		break;
	case 'd':
		if(ndb >= MAXTAB)
			sysfatal("too many db classes");
		db[ndb].name = EARGF(usage());
		db[ndb].file = EARGF(usage());
		ndb++;
		break;
	case 'l':
		lf = EARGF(usage());
		break;
	case 'm':
		mul = atoi(EARGF(usage()));
		break;
	case 't':
		thresh = atof(EARGF(usage()));
		break;
	default:
		usage();
	}ARGEND

	if(ndb == 0){
		fprint(2, "must have at least one -d option\n");
		usage();
	}

	/* gather the input tokens */
	indb = mdopen(nil, 1);
	if(indb == nil)
		sysfatal("mdopen: %r");
	if(argc == 0){
		Binit(&bin, 0, OREAD);
		process(&bin, "<stdin>");
		Bterm(&bin);
	}else{
		bad = 0;
		for(i=0; i<argc; i++){
			if((b = Bopen(argv[i], OREAD)) == nil){
				fprint(2, "opening %s: %r\n", argv[i]);
				bad = 1;
				continue;
			}
			process(b, argv[i]);
			Bterm(b);
		}
		if(bad)
			exits("open inputs");
	}

	lockfile(lf);

	/* open every class database; report all failures before giving up */
	bad = 0;
	for(i=0; i<ndb; i++){
		if((db[i].db = mdopen(db[i].file, 0)) == nil){
			fprint(2, "opendb %s: %r\n", db[i].file);
			bad = 1;
			/* do not query a database that failed to open */
			continue;
		}
		db[i].nmsg = mdget(db[i].db, "*From*");
	}
	if(bad)
		exits("open databases");

	/* run conditional probabilities of input words, getting mbest most specific */
	memset(&w, 0, sizeof w);
	mdenum(indb);
	nbest = 0;
	while(mdnext(indb, &s, &n) >= 0){
		tot = 0;
		totp = 0.0;
		for(i=0; i<ndb; i++){
			/* counts from the first class are weighted by 3 */
			nn = mdget(db[i].db, s)*(i==0 ? 3 : 1);
			tot += nn;
			w.count[i] = nn;
			/* an empty class must not cause a division by zero */
			if(db[i].nmsg > 0)
				p = nn/(double)db[i].nmsg;
			else
				p = nn > 0 ? 1.0 : 0.0;
			if(p >= 1.0)
				p = 1.0;
			w.p[i] = p;
			totp += p;
		}
		/* too rare across all classes to be meaningful */
		if(tot < 2)
			continue;

		/* normalize across classes, clamp, and remember the strongest class */
		w.mp = 0.0;
		for(i=0; i<ndb; i++){
			p = w.p[i];
			p /= totp;
			if(p < 0.001)
				p = 0.001;
			else if(p > 0.999)
				p = 0.999;
			if(p > w.mp){
				w.mp = p;
				w.mi = i;
			}
			w.p[i] = p;
		}
		noteword(&w, s);
	}

	/* compute conditional probabilities of message classes using the retained words */
	totp = 0.0;
	for(i=0; i<ndb; i++){
		p = 1.0;
		for(j=0; j<nbest; j++)
			p *= best[j].p[i];
		db[i].p = p;
		totp += p;
	}
	for(i=0; i<ndb; i++)
		db[i].p /= totp;

	/* pick the highest-scoring class; -1 means below threshold */
	m = 0;
	for(i=1; i<ndb; i++)
		if(db[i].p > db[m].p)
			m = i;
	if(db[m].p < thresh)
		m = -1;

	Binit(&bout, 1, OWRITE);
	if(m >= 0)
		Bprint(&bout, "%s", db[m].name);
	else
		Bprint(&bout, "inconclusive");
	for(j=0; j<ndb; j++)
		Bprint(&bout, " %s=%g", db[j].name, db[j].p);
	Bprint(&bout, "\n");
	for(i=0; i<nbest; i++){
		Bprint(&bout, "%s", best[i].s);
		for(j=0; j<ndb; j++)
			Bprint(&bout, " %s=%g", db[j].name, best[i].p[j]);
		Bprint(&bout, "\n");
	}
	Bterm(&bout);

	/* optionally fold the input tokens into the winning class */
	if(m >= 0 && add){
		mdenum(indb);
		while(mdnext(indb, &s, &n) >= 0)
			mdput(db[m].db, s, mdget(db[m].db, s)+n*mul);
		mdclose(db[m].db);
	}
	exits(nil);
}
/*
 * Read newline-terminated lines from b and accumulate them in indb.
 *
 * Each line is "token" or "token count": the text after the last
 * space, if there is one, is converted with atoi and used as the
 * count; otherwise the count is 1.  The count is added to whatever
 * indb already holds for the token.  The second argument (an input
 * name) is unused.
 */
void
process(Biobuf *b, char*)
{
	char *line, *sep;
	long count;

	for(;;){
		line = Brdline(b, '\n');
		if(line == nil)
			break;
		/* overwrite the trailing newline with a terminator */
		line[Blinelen(b)-1] = 0;

		sep = strrchr(line, ' ');
		if(sep == nil)
			count = 1;
		else{
			*sep = 0;
			count = atoi(sep+1);
		}
		mdput(indb, line, mdget(indb, line)+count);
	}
}
/* process id of the child forked by lockfile; set from fork's return value there */
int tpid;
/*
 * Post the note "die" to process tpid.  lockfile registers this
 * function with atexit so the forked child is told to stop when
 * this process exits.
 */
void
killtickle(void)
{
postnote(PNPROC, tpid, "die");
}
/*
 * Acquire the lock file s and hold it until this process exits.
 * A nil s means no locking was requested and nothing is done.
 *
 * The file is opened with OREAD.  While the open fails with an error
 * string containing "file is locked" or "exclusive lock" the open is
 * retried, sleeping 50ms at first and growing the delay by half each
 * round until it reaches at least one second; after more than 120
 * seconds of accumulated sleep the attempt is abandoned.  Any other
 * open error stops the retries immediately.  Failure to obtain the
 * descriptor is fatal.
 *
 * Once the file is open a child process is forked that calls dirfstat
 * on the descriptor every 30 seconds (presumably so the lock is not
 * considered stale -- confirm against the file server's exclusive-use
 * rules).  The parent closes its own copy of the descriptor and
 * registers killtickle with atexit so the child is sent a "die" note
 * when the parent exits.
 */
void
lockfile(char *s)
{
	int fd, t, w;
	char err[ERRMAX];

	if(s == nil)
		return;

	w = 50;		/* current delay, ms */
	t = 0;		/* total time slept, ms */
	for(;;){
		fd = open(s, OREAD);
		if(fd >= 0)
			break;
		rerrstr(err, sizeof err);
		/* only contention on the lock is worth retrying */
		if(strstr(err, "file is locked")==nil && strstr(err, "exclusive lock")==nil)
			break;
		sleep(w);
		t += w;
		if(w < 1000)
			w = (w*3)/2;
		if(t > 120*1000)
			break;
	}
	if(fd < 0)
		sysfatal("could not lock %s", s);

	switch(tpid = fork()){
	case -1:
		sysfatal("fork: %r");
	case 0:
		/* child: touch the descriptor periodically until killed */
		for(;;){
			sleep(30*1000);
			free(dirfstat(fd));
		}
		_exits(nil);
	default:
		break;
	}
	/* the child's copy of fd keeps the file open */
	close(fd);
	atexit(killtickle);
}
|