#!/bin/sh
# --------------------------------------------------------------------------
# This is the script to create the unicode chars property table
# Written by Dimitry Golubovsky ([email protected]) as part
# of the Partial Unicode Support patch
#
# Adopted for use with GHC.
# License: see libraries/base/LICENSE
#
# -------------------------------------------------------------------------
# The script reads the file from the standard input,
# and outputs C code into the standard output.
# The C code contains the chars property table, and basic functions
# to access properties.
# Output the file header
echo "/*-------------------------------------------------------------------------"
echo "This is an automatically generated file: do not edit"
echo "Generated by `basename $0` at `date`"
echo "-------------------------------------------------------------------------*/"
echo
echo "#include \"WCsubst.h\""
# Define structures
cat <<EOF
/* Unicode general categories, listed in the same order as in the Unicode
* standard -- this must be the same order as in GHC.Unicode.
*/
enum {
NUMCAT_LU, /* Letter, Uppercase */
NUMCAT_LL, /* Letter, Lowercase */
NUMCAT_LT, /* Letter, Titlecase */
NUMCAT_LM, /* Letter, Modifier */
NUMCAT_LO, /* Letter, Other */
NUMCAT_MN, /* Mark, Non-Spacing */
NUMCAT_MC, /* Mark, Spacing Combining */
NUMCAT_ME, /* Mark, Enclosing */
NUMCAT_ND, /* Number, Decimal */
NUMCAT_NL, /* Number, Letter */
NUMCAT_NO, /* Number, Other */
NUMCAT_PC, /* Punctuation, Connector */
NUMCAT_PD, /* Punctuation, Dash */
NUMCAT_PS, /* Punctuation, Open */
NUMCAT_PE, /* Punctuation, Close */
NUMCAT_PI, /* Punctuation, Initial quote */
NUMCAT_PF, /* Punctuation, Final quote */
NUMCAT_PO, /* Punctuation, Other */
NUMCAT_SM, /* Symbol, Math */
NUMCAT_SC, /* Symbol, Currency */
NUMCAT_SK, /* Symbol, Modifier */
NUMCAT_SO, /* Symbol, Other */
NUMCAT_ZS, /* Separator, Space */
NUMCAT_ZL, /* Separator, Line */
NUMCAT_ZP, /* Separator, Paragraph */
NUMCAT_CC, /* Other, Control */
NUMCAT_CF, /* Other, Format */
NUMCAT_CS, /* Other, Surrogate */
NUMCAT_CO, /* Other, Private Use */
NUMCAT_CN /* Other, Not Assigned */
};
struct _convrule_
{
unsigned int category;
unsigned int catnumber;
int possible;
int updist;
int lowdist;
int titledist;
};
struct _charblock_
{
int start;
int length;
const struct _convrule_ *rule;
};
EOF
# Convert the stdin file to the C table
awk '
BEGIN {
FS=";"
catidx=0
rulidx=0
blockidx=0
cblckidx=0
sblckidx=0
blockb=-1
blockl=0
digs="0123456789ABCDEF"
for(i=0;i<16;i++)
{
hex[substr(digs,i+1,1)]=i;
}
}
function em1(a)
{
if(a=="") return "-1"
return "0x"a
}
function h2d(a)
{
l=length(a)
acc=0
for(i=1;i<=l;i++)
{
acc=acc*16+hex[substr(a,i,1)];
}
return acc
}
function dumpblock()
{
blkd=blockb ", " blockl ", &rule" rules[blockr]
blocks[blockidx]=blkd
blockidx++
if(blockb<=256) lat1idx++
split(blockr,rsp,",")
if(substr(rsp[3],2,1)=="1")
{
cblcks[cblckidx]=blkd
cblckidx++
}
if(rsp[1]=="GENCAT_ZS")
{
sblcks[sblckidx]=blkd
sblckidx++
}
blockb=self
blockl=1
blockr=rule
}
{
name=$2
cat=toupper($3)
self=h2d($1)
up=h2d($13)
low=h2d($14)
title=h2d($15)
convpos=1
if((up==0)&&(low==0)&&(title==0)) convpos=0
if(up==0) up=self
if(low==0) low=self
if(title==0) title=self
updist=up-self
lowdist=low-self
titledist=title-self
rule="GENCAT_"cat", NUMCAT_"cat", "((convpos==1)?
("1, " updist ", " lowdist ", " titledist):
("0, 0, 0, 0"))
if(cats[cat]=="")
{
cats[cat]=(2^catidx);
catidx++;
}
if(rules[rule]=="")
{
rules[rule]=rulidx;
rulidx++;
}
if(blockb==-1)
{
blockb=self
blockl=1
blockr=rule
}
else
{
if (index(name,"First>")!=0)
{
dumpblock()
}
else if (index(name,"Last>")!=0)
{
blockl+=(self-blockb)
}
else if((self==blockb+blockl)&&(rule==blockr)) blockl++
else
{
dumpblock()
}
}
}
END {
dumpblock()
for(c in cats) print "#define GENCAT_"c" "cats[c]
print "#define MAX_UNI_CHAR " self
print "#define NUM_BLOCKS " blockidx
print "#define NUM_CONVBLOCKS " cblckidx
print "#define NUM_SPACEBLOCKS " sblckidx
print "#define NUM_LAT1BLOCKS " lat1idx
print "#define NUM_RULES " rulidx
for(r in rules)
{
printf "static const struct _convrule_ rule" rules[r] "={" r "};\n"
}
print "static const struct _charblock_ allchars[]={"
for(i=0;i<blockidx;i++)
{
printf "\t{" blocks[i] "}"
print (i<(blockidx-1))?",":""
}
print "};"
print "static const struct _charblock_ convchars[]={"
for(i=0;i<cblckidx;i++)
{
printf "\t{" cblcks[i] "}"
print (i<(cblckidx-1))?",":""
}
print "};"
print "static const struct _charblock_ spacechars[]={"
for(i=0;i<sblckidx;i++)
{
printf "\t{" sblcks[i] "}"
print (i<(sblckidx-1))?",":""
}
print "};"
}
'
# Output the C procedures code
cat <<EOF
/*
Obtain the reference to character rule by doing
binary search over the specified array of blocks.
To make checkattr shorter, the address of
nullrule is returned if the search fails:
this rule defines no category and no conversion
distances. The compare function returns 0 when
key->start is within the block. Otherwise
result of comparison of key->start and start of the
current block is returned as usual.
*/
static const struct _convrule_ nullrule={0,NUMCAT_CN,0,0,0,0};
int blkcmp(const void *vk,const void *vb)
{
const struct _charblock_ *key,*cur;
key=vk;
cur=vb;
if((key->start>=cur->start)&&(key->start<(cur->start+cur->length)))
{
return 0;
}
if(key->start>cur->start) return 1;
return -1;
}
static const struct _convrule_ *getrule(
const struct _charblock_ *blocks,
int numblocks,
int unichar)
{
struct _charblock_ key={unichar,1,(void *)0};
struct _charblock_ *cb=bsearch(&key,blocks,numblocks,sizeof(key),blkcmp);
if(cb==(void *)0) return &nullrule;
return cb->rule;
}
/*
Check whether a character (internal code) has certain attributes.
Attributes (category flags) may be ORed. The function ANDs
character category flags and the mask and returns the result.
If the character belongs to one of the categories requested,
the result will be nonzero.
*/
inline static int checkattr(int c,unsigned int catmask)
{
return (catmask & (getrule(allchars,(c<256)?NUM_LAT1BLOCKS:NUM_BLOCKS,c)->category));
}
inline static int checkattr_s(int c,unsigned int catmask)
{
return (catmask & (getrule(spacechars,NUM_SPACEBLOCKS,c)->category));
}
/*
Define predicate functions for some combinations of categories.
*/
#define unipred(p,m) \\
int p(int c) \\
{ \\
return checkattr(c,m); \\
}
#define unipred_s(p,m) \\
int p(int c) \\
{ \\
return checkattr_s(c,m); \\
}
/*
Make these rules as close to Hugs as possible.
*/
unipred(u_iswcntrl,GENCAT_CC)
unipred(u_iswprint, \
(GENCAT_MC | GENCAT_NO | GENCAT_SK | GENCAT_ME | GENCAT_ND | \
GENCAT_PO | GENCAT_LT | GENCAT_PC | GENCAT_SM | GENCAT_ZS | \
GENCAT_LU | GENCAT_PD | GENCAT_SO | GENCAT_PE | GENCAT_PF | \
GENCAT_PS | GENCAT_SC | GENCAT_LL | GENCAT_LM | GENCAT_PI | \
GENCAT_NL | GENCAT_MN | GENCAT_LO))
unipred_s(u_iswspace,GENCAT_ZS)
unipred(u_iswupper,(GENCAT_LU|GENCAT_LT))
unipred(u_iswlower,GENCAT_LL)
unipred(u_iswalpha,(GENCAT_LL|GENCAT_LU|GENCAT_LT|GENCAT_LM|GENCAT_LO))
unipred(u_iswdigit,GENCAT_ND)
unipred(u_iswalnum,(GENCAT_LT|GENCAT_LU|GENCAT_LL|GENCAT_LM|GENCAT_LO|
GENCAT_MC|GENCAT_ME|GENCAT_MN|
GENCAT_NO|GENCAT_ND|GENCAT_NL))
#define caseconv(p,to) \\
int p(int c) \\
{ \\
const struct _convrule_ *rule=getrule(convchars,NUM_CONVBLOCKS,c);\\
if(rule==&nullrule) return c;\\
return c+rule->##to;\\
}
caseconv(u_towupper,updist)
caseconv(u_towlower,lowdist)
caseconv(u_towtitle,titledist)
int u_gencat(int c)
{
return getrule(allchars,NUM_BLOCKS,c)->catnumber;
}
EOF
|