Plan 9 from Bell Labs’s /usr/web/sources/contrib/fernan/nhc98/src/libraries/base/cbits/ubconfc

Copyright © 2021 Plan 9 Foundation.
Distributed under the MIT License.
Download the Plan 9 distribution.


#!/bin/sh

# --------------------------------------------------------------------------
# This is the script to create the unicode chars property table 
# Written by Dimitry Golubovsky ([email protected]) as part
# of the Partial Unicode Support patch
#
# Adapted for use with GHC.
# License: see libraries/base/LICENSE
#
# -------------------------------------------------------------------------

#	The script reads the file from the standard input,
#	and outputs C code into the standard output.
#	The C code contains the chars property table, and basic functions
#	to access properties.

#	Output the file header

#	Emit the banner comment and the include line of the generated C file.
#	"$0" is quoted so that a script path containing blanks or glob
#	characters still yields the right program name.
progname=`basename "$0"`
echo "/*-------------------------------------------------------------------------"
echo "This is an automatically generated file: do not edit"
echo "Generated by $progname at `date`"
echo "-------------------------------------------------------------------------*/"
echo
echo "#include \"WCsubst.h\""

#	Define structures

cat <<EOF

/* Unicode general categories, listed in the same order as in the Unicode
 * standard -- this must be the same order as in GHC.Unicode.
 *
 * The enumerators take the implicit values 0 (NUMCAT_LU) through
 * 29 (NUMCAT_CN).  Each generated rule stores one of them in its
 * catnumber field, which is the value u_gencat() returns.
 */

enum {
    NUMCAT_LU,  /* Letter, Uppercase */
    NUMCAT_LL,  /* Letter, Lowercase */
    NUMCAT_LT,  /* Letter, Titlecase */
    NUMCAT_LM,  /* Letter, Modifier */
    NUMCAT_LO,  /* Letter, Other */
    NUMCAT_MN,  /* Mark, Non-Spacing */
    NUMCAT_MC,  /* Mark, Spacing Combining */
    NUMCAT_ME,  /* Mark, Enclosing */
    NUMCAT_ND,  /* Number, Decimal */
    NUMCAT_NL,  /* Number, Letter */
    NUMCAT_NO,  /* Number, Other */
    NUMCAT_PC,  /* Punctuation, Connector */
    NUMCAT_PD,  /* Punctuation, Dash */
    NUMCAT_PS,  /* Punctuation, Open */
    NUMCAT_PE,  /* Punctuation, Close */
    NUMCAT_PI,  /* Punctuation, Initial quote */
    NUMCAT_PF,  /* Punctuation, Final quote */
    NUMCAT_PO,  /* Punctuation, Other */
    NUMCAT_SM,  /* Symbol, Math */
    NUMCAT_SC,  /* Symbol, Currency */
    NUMCAT_SK,  /* Symbol, Modifier */
    NUMCAT_SO,  /* Symbol, Other */
    NUMCAT_ZS,  /* Separator, Space */
    NUMCAT_ZL,  /* Separator, Line */
    NUMCAT_ZP,  /* Separator, Paragraph */
    NUMCAT_CC,  /* Other, Control */
    NUMCAT_CF,  /* Other, Format */
    NUMCAT_CS,  /* Other, Surrogate */
    NUMCAT_CO,  /* Other, Private Use */
    NUMCAT_CN   /* Other, Not Assigned */
};

/*
 * Properties shared by a set of characters.  The generator emits one
 * constant of this type per distinct combination of field values.
 */
struct _convrule_ 
{ 
	unsigned int category;	/* GENCAT_ bit flag of the general category */
	unsigned int catnumber;	/* NUMCAT_ enumerator of the same category */
	int possible;		/* 1 if any case mapping is defined, else 0 */
	int updist;		/* added to a code to get its upper case form */
	int lowdist; 		/* added to a code to get its lower case form */
	int titledist;		/* added to a code to get its title case form */
};

/*
 * A run of consecutive character codes that all follow the same rule.
 */
struct _charblock_ 
{ 
	int start;			/* first character code of the block */
	int length;			/* number of codes in the block */
	const struct _convrule_ *rule;	/* rule applying to every code in it */
};

EOF

#	Convert the stdin file to the C table

awk '
# Set up the field separator, the counters and the hex digit table
# before the first input line is read.
BEGIN {
	# input fields are separated by semicolons
	FS=";"
	# number of distinct categories and of distinct rules seen so far
	catidx=0
	rulidx=0
	# number of blocks recorded: all, with case conversion, space
	# characters, and those starting in the low (Latin-1) range
	blockidx=0
	cblckidx=0
	sblckidx=0
	lat1idx=0
	# block being accumulated: start (-1 means none yet) and length
	blockb=-1
	blockl=0
	# digit values for h2d(); lower case digits are accepted as well
	# so that such input is not silently converted to 0
	digs="0123456789ABCDEF"
	for(i=0;i<16;i++)
	{
		dig=substr(digs,i+1,1)
		hex[dig]=i
		hex[tolower(dig)]=i
	}
}
# Turn a hexadecimal field into C literal text: an empty field gives
# "-1", anything else gets the prefix "0x".
# This helper is not called anywhere in this awk program.
function em1(a)
{
	return (a=="") ? "-1" : ("0x" a)
}
# Convert a string of hexadecimal digits to a number, using the hex[]
# table filled in by BEGIN.  An empty string gives 0, and a character
# that has no entry in hex[] contributes 0.
# The names after the gap in the parameter list are awk locals, so the
# function does not overwrite global variables of the same name (the
# END block also uses i as a loop index).
function h2d(a,    n,acc,i)
{
	n=length(a)
	acc=0
	for(i=1;i<=n;i++)
	{
		acc=acc*16+hex[substr(a,i,1)]
	}
	return acc
}
# Record the block described by blockb/blockl/blockr in the output
# tables, then open a new one-character block at the current input
# character (self, rule).
function dumpblock()
{
	# C initializer text: start, length, address of the shared rule
	entry=blockb ", " blockl ", &rule" rules[blockr]
	blocks[blockidx++]=entry
	# blocks starting at or below 256 are counted for NUM_LAT1BLOCKS
	if(blockb<=256) lat1idx++
	# rule text is "GENCAT_xx, NUMCAT_xx, flag, up, low, title"
	split(blockr,part,",")
	# part[3] is " 1" when the rule carries case conversion distances
	if(substr(part[3],2,1)=="1") cblcks[cblckidx++]=entry
	# space separators also go into their own table
	if(part[1]=="GENCAT_ZS") sblcks[sblckidx++]=entry
	# the current character starts the next block
	blockb=self
	blockl=1
	blockr=rule
}
# Main rule, run for every input line.  Field 1 is the character code,
# field 2 its name, field 3 its general category, and fields 13 to 15
# its upper, lower and title case mappings (all codes in hexadecimal).
{
	charname=$2
	cat=toupper($3)
	self=h2d($1)
	upper=h2d($13)
	lower=h2d($14)
	titlec=h2d($15)
	# a mapping that converts to 0 is treated as absent; the character
	# then maps to itself
	hasconv=((upper!=0)||(lower!=0)||(titlec!=0))
	if(upper==0) upper=self
	if(lower==0) lower=self
	if(titlec==0) titlec=self
	du=upper-self
	dl=lower-self
	dt=titlec-self
	# the rule text doubles as the C initializer of struct _convrule_
	rule="GENCAT_" cat ", NUMCAT_" cat ", "
	if(hasconv)
		rule=rule "1, " du ", " dl ", " dt
	else
		rule=rule "0, 0, 0, 0"
	# each new category gets the next power of two as its flag value
	if(!(cat in cats))
	{
		cats[cat]=(2^catidx)
		catidx++
	}
	# each new rule text gets the next rule number
	if(!(rule in rules))
	{
		rules[rule]=rulidx
		rulidx++
	}
	if(blockb==-1)
	{
		# very first character: open the first block
		blockb=self
		blockl=1
		blockr=rule
	}
	else if(index(charname,"First>")!=0)
	{
		# first line of a First/Last pair: close the previous block
		dumpblock()
	}
	else if(index(charname,"Last>")!=0)
	{
		# last line of the pair: stretch the block up to this code
		blockl+=(self-blockb)
	}
	else if((self==blockb+blockl)&&(rule==blockr))
	{
		# next consecutive code with the same rule: extend the block
		blockl++
	}
	else
	{
		dumpblock()
	}
}
# Print one table of blocks as a C array of struct _charblock_.
#   tabname  name of the C array
#   tab      awk array holding the initializer text of each block
#   n        number of entries in tab (indices 0 to n-1)
# The data is passed to printf as an argument, never as the format.
function dumptable(tabname,tab,n,    i)
{
	print "static const struct _charblock_ " tabname "[]={"
	for(i=0;i<n;i++)
	{
		# every entry but the last is followed by a comma
		printf "\t{%s}%s\n", tab[i], ((i<(n-1))?",":"")
	}
	print "};"
}
# After the last input line: flush the pending block, then print the
# category flags, the table sizes, the rules and the three block tables.
END {
	dumpblock()
	for(c in cats) print "#define GENCAT_"c" "cats[c]
	# self still holds the code of the last input line
	print "#define MAX_UNI_CHAR " self
	print "#define NUM_BLOCKS " blockidx
	print "#define NUM_CONVBLOCKS " cblckidx
	print "#define NUM_SPACEBLOCKS " sblckidx
	print "#define NUM_LAT1BLOCKS " lat1idx
	print "#define NUM_RULES " rulidx
	# rules[] maps the initializer text of a rule to its number
	for(r in rules)
	{
		printf "static const struct _convrule_ rule%s={%s};\n", rules[r], r
	}
	dumptable("allchars",blocks,blockidx)
	dumptable("convchars",cblcks,cblckidx)
	dumptable("spacechars",sblcks,sblckidx)
}
'
#	Output the C procedures code

cat <<EOF

/*
	Obtain the reference to character rule by doing
	binary search over the specified array of blocks.
	To make checkattr shorter, the address of
	nullrule is returned if the search fails:
	this rule defines no category and no conversion
	distances. The compare function returns 0 when
	key->start is within the block. Otherwise
	result of comparison of key->start and start of the
	current block is returned as usual.
*/

/* Fallback rule: no category flag, NUMCAT_CN, no case conversion. */
static const struct _convrule_ nullrule={0,NUMCAT_CN,0,0,0,0};

/*
	Comparison callback for bsearch. vk points to the key block, of
	which only start is examined; vb points to a table entry.
	Returns 0 when the key code lies inside the entry, otherwise 1
	when it is greater than the entry start and -1 when it is not.
*/
int blkcmp(const void *vk,const void *vb)
{
	const struct _charblock_ *key=vk;
	const struct _charblock_ *cur=vb;
	int code=key->start;
	int inside=(code>=cur->start)&&(code<(cur->start+cur->length));

	if(inside) return 0;
	return (code>cur->start)?1:-1;
}

static const struct _convrule_ *getrule(
	const struct _charblock_ *blocks,
	int numblocks,
	int unichar)
{
	struct _charblock_ key={unichar,1,(void *)0};
	struct _charblock_ *cb=bsearch(&key,blocks,numblocks,sizeof(key),blkcmp);
	if(cb==(void *)0) return &nullrule;
	return cb->rule;
}
	


/*
	Check whether a character (internal code) has certain attributes.
	Attributes (category flags) may be ORed. The function ANDs
	character category flags and the mask and returns the result.
	If the character belongs to one of the categories requested,
	the result will be nonzero.
*/

inline static int checkattr(int c,unsigned int catmask)
{
	return (catmask & (getrule(allchars,(c<256)?NUM_LAT1BLOCKS:NUM_BLOCKS,c)->category));
}

inline static int checkattr_s(int c,unsigned int catmask)
{
        return (catmask & (getrule(spacechars,NUM_SPACEBLOCKS,c)->category));
}

/*
	Define predicate functions for some combinations of categories.
*/

/* unipred(p,m) defines int p(int c) returning checkattr(c,m). */
#define unipred(p,m) \\
int p(int c) { return checkattr(c,m); }

/* unipred_s(p,m) defines int p(int c) returning checkattr_s(c,m). */
#define unipred_s(p,m) \\
int p(int c) { return checkattr_s(c,m); }

/*
	Make these rules as close to Hugs as possible.
*/

/* Control characters. */
unipred(u_iswcntrl,GENCAT_CC)

/* Printable: letters, marks, numbers, punctuation, symbols, spaces. */
unipred(u_iswprint,
	(GENCAT_LU | GENCAT_LL | GENCAT_LT | GENCAT_LM | GENCAT_LO |
	 GENCAT_MN | GENCAT_MC | GENCAT_ME |
	 GENCAT_ND | GENCAT_NL | GENCAT_NO |
	 GENCAT_PC | GENCAT_PD | GENCAT_PS | GENCAT_PE |
	 GENCAT_PI | GENCAT_PF | GENCAT_PO |
	 GENCAT_SM | GENCAT_SC | GENCAT_SK | GENCAT_SO |
	 GENCAT_ZS))

/* Space separators, looked up in the dedicated space table. */
unipred_s(u_iswspace,GENCAT_ZS)

/* Upper case and title case letters. */
unipred(u_iswupper,(GENCAT_LU|GENCAT_LT))

/* Lower case letters. */
unipred(u_iswlower,GENCAT_LL)

/* All letter categories. */
unipred(u_iswalpha,(GENCAT_LU|GENCAT_LL|GENCAT_LT|GENCAT_LM|GENCAT_LO))

/* Decimal digits. */
unipred(u_iswdigit,GENCAT_ND)

/* Letters, marks and all number categories. */
unipred(u_iswalnum,
	(GENCAT_LU|GENCAT_LL|GENCAT_LT|GENCAT_LM|GENCAT_LO|
	 GENCAT_MN|GENCAT_MC|GENCAT_ME|
	 GENCAT_ND|GENCAT_NL|GENCAT_NO))

/*
	caseconv(p,to) defines int p(int c), which looks c up in the
	table of blocks that have case conversions and returns c plus
	the distance stored in member to of the rule found. A code
	that is in no such block is returned unchanged.
	The member name is written after the arrow as an ordinary
	macro parameter. It must not be pasted onto the arrow with the
	paste operator, because the arrow followed by an identifier is
	not a single preprocessing token and compilers reject that.
*/

#define caseconv(p,to) \\
int p(int c) \\
{ \\
	const struct _convrule_ *rule=getrule(convchars,NUM_CONVBLOCKS,c);\\
	if(rule==&nullrule) return c;\\
	return c+rule->to;\\
}

caseconv(u_towupper,updist)
caseconv(u_towlower,lowdist)
caseconv(u_towtitle,titledist)

int u_gencat(int c)
{
	return getrule(allchars,NUM_BLOCKS,c)->catnumber;
}

EOF

Bell Labs OSI certified Powered by Plan 9

(Return to Plan 9 Home Page)

Copyright © 2021 Plan 9 Foundation. All Rights Reserved.
Comments to [email protected].