Plan 9 from Bell Labs’s /usr/web/sources/contrib/fgb/root/sys/src/cmd/tcl/generic/tclRegexp.c

Copyright © 2021 Plan 9 Foundation.
Distributed under the MIT License.
Download the Plan 9 distribution.


/*
 * tclRegexp.c --
 *
 *	This file contains the public interfaces to the Tcl regular expression
 *	mechanism.
 *
 * Copyright (c) 1998 by Sun Microsystems, Inc.
 * Copyright (c) 1998-1999 by Scriptics Corporation.
 *
 * See the file "license.terms" for information on usage and redistribution of
 * this file, and for a DISCLAIMER OF ALL WARRANTIES.
 *
 * RCS: @(#) $Id: tclRegexp.c,v 1.28 2007/12/13 15:23:20 dgp Exp $
 */

#include "tclInt.h"
#include "tclRegexp.h"

/*
 *----------------------------------------------------------------------
 * The routines in this file use Henry Spencer's regular expression package
 * contained in the following additional source files:
 *
 *	regc_color.c	regc_cvec.c	regc_lex.c
 *	regc_nfa.c	regcomp.c	regcustom.h
 *	rege_dfa.c	regerror.c	regerrs.h
 *	regex.h		regexec.c	regfree.c
 *	regfronts.c	regguts.h
 *
 * Copyright (c) 1998 Henry Spencer.  All rights reserved.
 *
 * Development of this software was funded, in part, by Cray Research Inc.,
 * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
 * Corporation, none of whom are responsible for the results. The author
 * thanks all of them.
 *
 * Redistribution and use in source and binary forms -- with or without
 * modification -- are permitted for any purpose, provided that
 * redistributions in source form retain this entire copyright notice and
 * indicate the origin and nature of any modifications.
 *
 * I'd appreciate being given credit for this package in the documentation of
 * software which uses it, but that is not a requirement.
 *
 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
 * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
 * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * *** NOTE: this code has been altered slightly for use in Tcl: ***
 * *** 1. Names have been changed, e.g. from re_comp to		 ***
 * ***    TclRegComp, to avoid clashes with other 		 ***
 * ***    regexp implementations used by applications. 		 ***
 */

/*
 * Thread local storage used to maintain a per-thread cache of compiled
 * regular expressions.
 */

#define NUM_REGEXPS 30

typedef struct ThreadSpecificData {
    int initialized;		/* Set to 1 when the module is initialized. */
    char *patterns[NUM_REGEXPS];/* Strings corresponding to compiled regular
				 * expression patterns. NULL means that this
				 * slot isn't used. Malloc-ed. */
    int patLengths[NUM_REGEXPS];/* Number of non-null characters in
				 * corresponding entry in patterns. -1 means
				 * entry isn't used. */
    struct TclRegexp *regexps[NUM_REGEXPS];
				/* Compiled forms of above strings. Also
				 * malloc-ed, or NULL if not in use yet. */
} ThreadSpecificData;

static Tcl_ThreadDataKey dataKey;

/*
 * Declarations for functions used only in this file.
 */

static TclRegexp *	CompileRegexp(Tcl_Interp *interp, const char *pattern,
			    int length, int flags);
static void		DupRegexpInternalRep(Tcl_Obj *srcPtr,
			    Tcl_Obj *copyPtr);
static void		FinalizeRegexp(ClientData clientData);
static void		FreeRegexp(TclRegexp *regexpPtr);
static void		FreeRegexpInternalRep(Tcl_Obj *objPtr);
static int		RegExpExecUniChar(Tcl_Interp *interp, Tcl_RegExp re,
			    const Tcl_UniChar *uniString, int numChars,
			    int nmatches, int flags);
static int		SetRegexpFromAny(Tcl_Interp *interp, Tcl_Obj *objPtr);

/*
 * The regular expression Tcl object type. This serves as a cache of the
 * compiled form of the regular expression.
 */

Tcl_ObjType tclRegexpType = {
    "regexp",				/* name */
    FreeRegexpInternalRep,		/* freeIntRepProc */
    DupRegexpInternalRep,		/* dupIntRepProc */
    NULL,				/* updateStringProc */
    SetRegexpFromAny			/* setFromAnyProc */
};

/*
 *----------------------------------------------------------------------
 *
 * Tcl_RegExpCompile --
 *
 *	Compile a regular expression into a form suitable for fast matching.
 *	This function is DEPRECATED in favor of the object version of the
 *	command.
 *
 * Results:
 *	The return value is a pointer to the compiled form of string, suitable
 *	for passing to Tcl_RegExpExec. This compiled form is only valid up
 *	until the next call to this function, so don't keep these around for a
 *	long time! If an error occurred while compiling the pattern, then NULL
 *	is returned and an error message is left in the interp's result.
 *
 * Side effects:
 *	Updates the cache of compiled regexps.
 *
 *----------------------------------------------------------------------
 */

Tcl_RegExp
Tcl_RegExpCompile(
    Tcl_Interp *interp,		/* For use in error reporting and to access
				 * the interp regexp cache. */
    const char *pattern)	/* String for which to produce compiled
				 * regular expression. */
{
    return (Tcl_RegExp) CompileRegexp(interp, pattern, (int) strlen(pattern),
	    REG_ADVANCED);
}

/*
 *----------------------------------------------------------------------
 *
 * Tcl_RegExpExec --
 *
 *	Execute the regular expression matcher using a compiled form of a
 *	regular expression and save information about any match that is found.
 *
 * Results:
 *	If an error occurs during the matching operation then -1 is returned
 *	and the interp's result contains an error message. Otherwise the
 *	return value is 1 if a matching range is found and 0 if there is no
 *	matching range.
 *
 * Side effects:
 *	None.
 *
 *----------------------------------------------------------------------
 */

int
Tcl_RegExpExec(
    Tcl_Interp *interp,		/* Interpreter to use for error reporting. */
    Tcl_RegExp re,		/* Compiled regular expression; must have been
				 * returned by previous call to
				 * Tcl_GetRegExpFromObj. */
    const char *text,		/* Text against which to match re. */
    const char *start)		/* If text is part of a larger string, this
				 * identifies beginning of larger string, so
				 * that "^" won't match. */
{
    int flags, result, numChars;
    TclRegexp *regexp = (TclRegexp *)re;
    Tcl_DString ds;
    const Tcl_UniChar *ustr;

    /*
     * If the starting point is offset from the beginning of the buffer, then
     * we need to tell the regexp engine not to match "^".
     */

    if (text > start) {
	flags = REG_NOTBOL;
    } else {
	flags = 0;
    }

    /*
     * Remember the string for use by Tcl_RegExpRange().
     */

    regexp->string = text;
    regexp->objPtr = NULL;

    /*
     * Convert the string to Unicode and perform the match.
     */

    Tcl_DStringInit(&ds);
    ustr = Tcl_UtfToUniCharDString(text, -1, &ds);
    numChars = Tcl_DStringLength(&ds) / sizeof(Tcl_UniChar);
    result = RegExpExecUniChar(interp, re, ustr, numChars, -1 /* nmatches */,
	    flags);
    Tcl_DStringFree(&ds);

    return result;
}

/*
 *---------------------------------------------------------------------------
 *
 * Tcl_RegExpRange --
 *
 *	Returns pointers describing the range of a regular expression match,
 *	or one of the subranges within the match.
 *
 * Results:
 *	The variables at *startPtr and *endPtr are modified to hold the
 *	addresses of the endpoints of the range given by index. If the
 *	specified range doesn't exist then NULLs are returned.
 *
 * Side effects:
 *	None.
 *
 *---------------------------------------------------------------------------
 */

void
Tcl_RegExpRange(
    Tcl_RegExp re,		/* Compiled regular expression that has been
				 * passed to Tcl_RegExpExec. */
    int index,			/* 0 means give the range of the entire match,
				 * > 0 means give the range of a matching
				 * subrange. */
    const char **startPtr,	/* Store address of first character in
				 * (sub-)range here. */
    const char **endPtr)	/* Store address of character just after last
				 * in (sub-)range here. */
{
    TclRegexp *regexpPtr = (TclRegexp *) re;
    const char *string;

    if ((size_t) index > regexpPtr->re.re_nsub) {
	*startPtr = *endPtr = NULL;
    } else if (regexpPtr->matches[index].rm_so < 0) {
	*startPtr = *endPtr = NULL;
    } else {
	if (regexpPtr->objPtr) {
	    string = TclGetString(regexpPtr->objPtr);
	} else {
	    string = regexpPtr->string;
	}
	*startPtr = Tcl_UtfAtIndex(string, regexpPtr->matches[index].rm_so);
	*endPtr = Tcl_UtfAtIndex(string, regexpPtr->matches[index].rm_eo);
    }
}

/*
 *---------------------------------------------------------------------------
 *
 * RegExpExecUniChar --
 *
 *	Execute the regular expression matcher using a compiled form of a
 *	regular expression and save information about any match that is found.
 *
 * Results:
 *	If an error occurs during the matching operation then -1 is returned
 *	and an error message is left in interp's result. Otherwise the return
 *	value is 1 if a matching range was found or 0 if there was no matching
 *	range.
 *
 * Side effects:
 *	None.
 *
 *----------------------------------------------------------------------
 */

static int
RegExpExecUniChar(
    Tcl_Interp *interp,		/* Interpreter to use for error reporting. */
    Tcl_RegExp re,		/* Compiled regular expression; returned by a
				 * previous call to Tcl_GetRegExpFromObj */
    const Tcl_UniChar *wString,	/* String against which to match re. */
    int numChars,		/* Length of Tcl_UniChar string (must be
				 * >=0). */
    int nmatches,		/* How many subexpression matches (counting
				 * the whole match as subexpression 0) are of
				 * interest. -1 means "don't know". */
    int flags)			/* Regular expression flags. */
{
    int status;
    TclRegexp *regexpPtr = (TclRegexp *) re;
    size_t last = regexpPtr->re.re_nsub + 1;
    size_t nm = last;

    if (nmatches >= 0 && (size_t) nmatches < nm) {
	nm = (size_t) nmatches;
    }

    status = TclReExec(&regexpPtr->re, wString, (size_t) numChars,
	    &regexpPtr->details, nm, regexpPtr->matches, flags);

    /*
     * Check for errors.
     */

    if (status != REG_OKAY) {
	if (status == REG_NOMATCH) {
	    return 0;
	}
	if (interp != NULL) {
	    TclRegError(interp, "error while matching regular expression: ",
		    status);
	}
	return -1;
    }
    return 1;
}

/*
 *---------------------------------------------------------------------------
 *
 * TclRegExpRangeUniChar --
 *
 *	Returns pointers describing the range of a regular expression match,
 *	or one of the subranges within the match, or the hypothetical range
 *	represented by the rm_extend field of the rm_detail_t.
 *
 * Results:
 *	The variables at *startPtr and *endPtr are modified to hold the
 *	offsets of the endpoints of the range given by index. If the specified
 *	range doesn't exist then -1s are supplied.
 *
 * Side effects:
 *	None.
 *
 *---------------------------------------------------------------------------
 */

void
TclRegExpRangeUniChar(
    Tcl_RegExp re,		/* Compiled regular expression that has been
				 * passed to Tcl_RegExpExec. */
    int index,			/* 0 means give the range of the entire match,
				 * > 0 means give the range of a matching
				 * subrange, -1 means the range of the
				 * rm_extend field. */
    int *startPtr,		/* Store address of first character in
				 * (sub-)range here. */
    int *endPtr)		/* Store address of character just after last
				 * in (sub-)range here. */
{
    TclRegexp *regexpPtr = (TclRegexp *) re;

    if ((regexpPtr->flags&REG_EXPECT) && index == -1) {
	*startPtr = regexpPtr->details.rm_extend.rm_so;
	*endPtr = regexpPtr->details.rm_extend.rm_eo;
    } else if ((size_t) index > regexpPtr->re.re_nsub) {
	*startPtr = -1;
	*endPtr = -1;
    } else {
	*startPtr = regexpPtr->matches[index].rm_so;
	*endPtr = regexpPtr->matches[index].rm_eo;
    }
}

/*
 *----------------------------------------------------------------------
 *
 * Tcl_RegExpMatch --
 *
 *	See if a string matches a regular expression.
 *
 * Results:
 *	If an error occurs during the matching operation then -1 is returned
 *	and the interp's result contains an error message. Otherwise the
 *	return value is 1 if "text" matches "pattern" and 0 otherwise.
 *
 * Side effects:
 *	None.
 *
 *----------------------------------------------------------------------
 */

int
Tcl_RegExpMatch(
    Tcl_Interp *interp,		/* Used for error reporting. May be NULL. */
    const char *text,		/* Text to search for pattern matches. */
    const char *pattern)	/* Regular expression to match against text. */
{
    Tcl_RegExp re;

    re = Tcl_RegExpCompile(interp, pattern);
    if (re == NULL) {
	return -1;
    }
    return Tcl_RegExpExec(interp, re, text, text);
}

/*
 *----------------------------------------------------------------------
 *
 * Tcl_RegExpExecObj --
 *
 *	Execute a precompiled regexp against the given object.
 *
 * Results:
 *	If an error occurs during the matching operation then -1 is returned
 *	and the interp's result contains an error message. Otherwise the
 *	return value is 1 if "string" matches "pattern" and 0 otherwise.
 *
 * Side effects:
 *	Converts the object to a Unicode object.
 *
 *----------------------------------------------------------------------
 */

int
Tcl_RegExpExecObj(
    Tcl_Interp *interp,		/* Interpreter to use for error reporting. */
    Tcl_RegExp re,		/* Compiled regular expression; must have been
				 * returned by previous call to
				 * Tcl_GetRegExpFromObj. */
    Tcl_Obj *textObj,		/* Text against which to match re. */
    int offset,			/* Character index that marks where matching
				 * should begin. */
    int nmatches,		/* How many subexpression matches (counting
				 * the whole match as subexpression 0) are of
				 * interest. -1 means all of them. */
    int flags)			/* Regular expression execution flags. */
{
    TclRegexp *regexpPtr = (TclRegexp *) re;
    Tcl_UniChar *udata;
    int length;
    int reflags = regexpPtr->flags;
#define TCL_REG_GLOBOK_FLAGS (TCL_REG_ADVANCED | TCL_REG_NOSUB | TCL_REG_NOCASE)

    /*
     * Take advantage of the equivalent glob pattern, if one exists.
     * This is possible based only on the right mix of incoming flags (0)
     * and regexp compile flags.
     */
    if ((offset == 0) && (nmatches == 0) && (flags == 0)
	    && !(reflags & ~TCL_REG_GLOBOK_FLAGS)
	    && (regexpPtr->globObjPtr != NULL)) {
	int nocase = (reflags & TCL_REG_NOCASE) ? TCL_MATCH_NOCASE : 0;

	/*
	 * Pass to TclStringMatchObj for obj-specific handling.
	 * XXX: Currently doesn't take advantage of exact-ness that
	 * XXX: TclReToGlob tells us about
	 */

	return TclStringMatchObj(textObj, regexpPtr->globObjPtr, nocase);
    }

    /*
     * Save the target object so we can extract strings from it later.
     */

    regexpPtr->string = NULL;
    regexpPtr->objPtr = textObj;

    udata = Tcl_GetUnicodeFromObj(textObj, &length);

    if (offset > length) {
	offset = length;
    }
    udata += offset;
    length -= offset;

    return RegExpExecUniChar(interp, re, udata, length, nmatches, flags);
}

/*
 *----------------------------------------------------------------------
 *
 * Tcl_RegExpMatchObj --
 *
 *	See if an object matches a regular expression.
 *
 * Results:
 *	If an error occurs during the matching operation then -1 is returned
 *	and the interp's result contains an error message. Otherwise the
 *	return value is 1 if "text" matches "pattern" and 0 otherwise.
 *
 * Side effects:
 *	Changes the internal rep of the pattern and string objects.
 *
 *----------------------------------------------------------------------
 */

int
Tcl_RegExpMatchObj(
    Tcl_Interp *interp,		/* Used for error reporting. May be NULL. */
    Tcl_Obj *textObj,		/* Object containing the String to search. */
    Tcl_Obj *patternObj)	/* Regular expression to match against
				 * string. */
{
    Tcl_RegExp re;

    re = Tcl_GetRegExpFromObj(interp, patternObj,
	    TCL_REG_ADVANCED | TCL_REG_NOSUB);
    if (re == NULL) {
	return -1;
    }
    return Tcl_RegExpExecObj(interp, re, textObj, 0 /* offset */,
	    0 /* nmatches */, 0 /* flags */);
}

/*
 *----------------------------------------------------------------------
 *
 * Tcl_RegExpGetInfo --
 *
 *	Retrieve information about the current match.
 *
 * Results:
 *	None.
 *
 * Side effects:
 *	None.
 *
 *----------------------------------------------------------------------
 */

void
Tcl_RegExpGetInfo(
    Tcl_RegExp regexp,		/* Pattern from which to get subexpressions. */
    Tcl_RegExpInfo *infoPtr)	/* Match information is stored here. */
{
    TclRegexp *regexpPtr = (TclRegexp *) regexp;

    infoPtr->nsubs = regexpPtr->re.re_nsub;
    infoPtr->matches = (Tcl_RegExpIndices *) regexpPtr->matches;
    infoPtr->extendStart = regexpPtr->details.rm_extend.rm_so;
}

/*
 *----------------------------------------------------------------------
 *
 * Tcl_GetRegExpFromObj --
 *
 *	Compile a regular expression into a form suitable for fast matching.
 *	This function caches the result in a Tcl_Obj.
 *
 * Results:
 *	The return value is a pointer to the compiled form of string, suitable
 *	for passing to Tcl_RegExpExec. If an error occurred while compiling
 *	the pattern, then NULL is returned and an error message is left in the
 *	interp's result.
 *
 * Side effects:
 *	Updates the native rep of the Tcl_Obj.
 *
 *----------------------------------------------------------------------
 */

Tcl_RegExp
Tcl_GetRegExpFromObj(
    Tcl_Interp *interp,		/* For use in error reporting, and to access
				 * the interp regexp cache. */
    Tcl_Obj *objPtr,		/* Object whose string rep contains regular
				 * expression pattern. Internal rep will be
				 * changed to compiled form of this regular
				 * expression. */
    int flags)			/* Regular expression compilation flags. */
{
    int length;
    TclRegexp *regexpPtr;
    char *pattern;

    /*
     * This is OK because we only actually interpret this value properly as a
     * TclRegexp* when the type is tclRegexpType.
     */

    regexpPtr = (TclRegexp *) objPtr->internalRep.otherValuePtr;

    if ((objPtr->typePtr != &tclRegexpType) || (regexpPtr->flags != flags)) {
	pattern = TclGetStringFromObj(objPtr, &length);

	regexpPtr = CompileRegexp(interp, pattern, length, flags);
	if (regexpPtr == NULL) {
	    return NULL;
	}

	/*
	 * Add a reference to the regexp so it will persist even if it is
	 * pushed out of the current thread's regexp cache. This reference
	 * will be removed when the object's internal rep is freed.
	 */

	regexpPtr->refCount++;

	/*
	 * Free the old representation and set our type.
	 */

	TclFreeIntRep(objPtr);
	objPtr->internalRep.otherValuePtr = (void *) regexpPtr;
	objPtr->typePtr = &tclRegexpType;
    }
    return (Tcl_RegExp) regexpPtr;
}

/*
 *----------------------------------------------------------------------
 *
 * TclRegAbout --
 *
 *	Return information about a compiled regular expression.
 *
 * Results:
 *	The return value is -1 for failure, 0 for success, although at the
 *	moment there's nothing that could fail. On success, a list is left in
 *	the interp's result: first element is the subexpression count, second
 *	is a list of re_info bit names.
 *
 * Side effects:
 *	None.
 *
 *----------------------------------------------------------------------
 */

int
TclRegAbout(
    Tcl_Interp *interp,		/* For use in variable assignment. */
    Tcl_RegExp re)		/* The compiled regular expression. */
{
    TclRegexp *regexpPtr = (TclRegexp *) re;
    struct infoname {
	int bit;
	const char *text;
    };
    static const struct infoname infonames[] = {
	{REG_UBACKREF,		"REG_UBACKREF"},
	{REG_ULOOKAHEAD,	"REG_ULOOKAHEAD"},
	{REG_UBOUNDS,		"REG_UBOUNDS"},
	{REG_UBRACES,		"REG_UBRACES"},
	{REG_UBSALNUM,		"REG_UBSALNUM"},
	{REG_UPBOTCH,		"REG_UPBOTCH"},
	{REG_UBBS,		"REG_UBBS"},
	{REG_UNONPOSIX,		"REG_UNONPOSIX"},
	{REG_UUNSPEC,		"REG_UUNSPEC"},
	{REG_UUNPORT,		"REG_UUNPORT"},
	{REG_ULOCALE,		"REG_ULOCALE"},
	{REG_UEMPTYMATCH,	"REG_UEMPTYMATCH"},
	{REG_UIMPOSSIBLE,	"REG_UIMPOSSIBLE"},
	{REG_USHORTEST,		"REG_USHORTEST"},
	{0,			NULL}
    };
    const struct infoname *inf;
    Tcl_Obj *infoObj;

    /*
     * The reset here guarantees that the interpreter result is empty and
     * unshared. This means that we can use Tcl_ListObjAppendElement on the
     * result object quite safely.
     */

    Tcl_ResetResult(interp);

    /*
     * Assume that there will never be more than INT_MAX subexpressions. This
     * is a pretty reasonable assumption; the RE engine doesn't scale _that_
     * well and Tcl has other limits that constrain things as well...
     */

    Tcl_ListObjAppendElement(NULL, Tcl_GetObjResult(interp),
	    Tcl_NewIntObj((int) regexpPtr->re.re_nsub));

    /*
     * Now append a list of all the bit-flags set for the RE.
     */

    TclNewObj(infoObj);
    for (inf=infonames ; inf->bit != 0 ; inf++) {
	if (regexpPtr->re.re_info & inf->bit) {
	    Tcl_ListObjAppendElement(NULL, infoObj,
		    Tcl_NewStringObj(inf->text, -1));
	}
    }
    Tcl_ListObjAppendElement(NULL, Tcl_GetObjResult(interp), infoObj);

    return 0;
}

/*
 *----------------------------------------------------------------------
 *
 * TclRegError --
 *
 *	Generate an error message based on the regexp status code.
 *
 * Results:
 *	Places an error in the interpreter.
 *
 * Side effects:
 *	Sets errorCode as well.
 *
 *----------------------------------------------------------------------
 */

void
TclRegError(
    Tcl_Interp *interp,		/* Interpreter for error reporting. */
    const char *msg,		/* Message to prepend to error. */
    int status)			/* Status code to report. */
{
    char buf[100];		/* ample in practice */
    char cbuf[100];		/* lots in practice */
    size_t n;
    const char *p;

    Tcl_ResetResult(interp);
    n = TclReError(status, NULL, buf, sizeof(buf));
    p = (n > sizeof(buf)) ? "..." : "";
    Tcl_AppendResult(interp, msg, buf, p, NULL);

    sprintf(cbuf, "%d", status);
    (void) TclReError(REG_ITOA, NULL, cbuf, sizeof(cbuf));
    Tcl_SetErrorCode(interp, "REGEXP", cbuf, buf, NULL);
}

/*
 *----------------------------------------------------------------------
 *
 * FreeRegexpInternalRep --
 *
 *	Deallocate the storage associated with a regexp object's internal
 *	representation.
 *
 * Results:
 *	None.
 *
 * Side effects:
 *	Frees the compiled regular expression.
 *
 *----------------------------------------------------------------------
 */

static void
FreeRegexpInternalRep(
    Tcl_Obj *objPtr)		/* Regexp object with internal rep to free. */
{
    TclRegexp *regexpRepPtr = (TclRegexp *) objPtr->internalRep.otherValuePtr;

    /*
     * If this is the last reference to the regexp, free it.
     */

    if (--(regexpRepPtr->refCount) <= 0) {
	FreeRegexp(regexpRepPtr);
    }
}

/*
 *----------------------------------------------------------------------
 *
 * DupRegexpInternalRep --
 *
 *	We copy the reference to the compiled regexp and bump its reference
 *	count.
 *
 * Results:
 *	None.
 *
 * Side effects:
 *	Increments the reference count of the regexp.
 *
 *----------------------------------------------------------------------
 */

static void
DupRegexpInternalRep(
    Tcl_Obj *srcPtr,		/* Object with internal rep to copy. */
    Tcl_Obj *copyPtr)		/* Object with internal rep to set. */
{
    TclRegexp *regexpPtr = (TclRegexp *) srcPtr->internalRep.otherValuePtr;

    regexpPtr->refCount++;
    copyPtr->internalRep.otherValuePtr = srcPtr->internalRep.otherValuePtr;
    copyPtr->typePtr = &tclRegexpType;
}

/*
 *----------------------------------------------------------------------
 *
 * SetRegexpFromAny --
 *
 *	Attempt to generate a compiled regular expression for the Tcl object
 *	"objPtr".
 *
 * Results:
 *	The return value is TCL_OK or TCL_ERROR. If an error occurs during
 *	conversion, an error message is left in the interpreter's result
 *	unless "interp" is NULL.
 *
 * Side effects:
 *	If no error occurs, a regular expression is stored as "objPtr"s
 *	internal representation.
 *
 *----------------------------------------------------------------------
 */

static int
SetRegexpFromAny(
    Tcl_Interp *interp,		/* Used for error reporting if not NULL. */
    Tcl_Obj *objPtr)		/* The object to convert. */
{
    if (Tcl_GetRegExpFromObj(interp, objPtr, REG_ADVANCED) == NULL) {
	return TCL_ERROR;
    }
    return TCL_OK;
}

/*
 *---------------------------------------------------------------------------
 *
 * CompileRegexp --
 *
 *	Attempt to compile the given regexp pattern. If the compiled regular
 *	expression can be found in the per-thread cache, it will be used
 *	instead of compiling a new copy.
 *
 * Results:
 *	The return value is a pointer to a newly allocated TclRegexp that
 *	represents the compiled pattern, or NULL if the pattern could not be
 *	compiled. If NULL is returned, an error message is left in the
 *	interp's result.
 *
 * Side effects:
 *	The thread-local regexp cache is updated and a new TclRegexp may be
 *	allocated.
 *
 *----------------------------------------------------------------------
 */

static TclRegexp *
CompileRegexp(
    Tcl_Interp *interp,		/* Used for error reporting if not NULL. */
    const char *string,		/* The regexp to compile (UTF-8). */
    int length,			/* The length of the string in bytes. */
    int flags)			/* Compilation flags. */
{
    TclRegexp *regexpPtr;
    const Tcl_UniChar *uniString;
    int numChars, status, i, exact;
    Tcl_DString stringBuf;
    ThreadSpecificData *tsdPtr = TCL_TSD_INIT(&dataKey);

    if (!tsdPtr->initialized) {
	tsdPtr->initialized = 1;
	Tcl_CreateThreadExitHandler(FinalizeRegexp, NULL);
    }

    /*
     * This routine maintains a second-level regular expression cache in
     * addition to the per-object regexp cache. The per-thread cache is needed
     * to handle the case where for various reasons the object is lost between
     * invocations of the regexp command, but the literal pattern is the same.
     */

    /*
     * Check the per-thread compiled regexp cache. We can only reuse a regexp
     * if it has the same pattern and the same flags.
     */

    for (i = 0; (i < NUM_REGEXPS) && (tsdPtr->patterns[i] != NULL); i++) {
	if ((length == tsdPtr->patLengths[i])
		&& (tsdPtr->regexps[i]->flags == flags)
		&& (strcmp(string, tsdPtr->patterns[i]) == 0)) {
	    /*
	     * Move the matched pattern to the first slot in the cache and
	     * shift the other patterns down one position.
	     */

	    if (i != 0) {
		int j;
		char *cachedString;

		cachedString = tsdPtr->patterns[i];
		regexpPtr = tsdPtr->regexps[i];
		for (j = i-1; j >= 0; j--) {
		    tsdPtr->patterns[j+1] = tsdPtr->patterns[j];
		    tsdPtr->patLengths[j+1] = tsdPtr->patLengths[j];
		    tsdPtr->regexps[j+1] = tsdPtr->regexps[j];
		}
		tsdPtr->patterns[0] = cachedString;
		tsdPtr->patLengths[0] = length;
		tsdPtr->regexps[0] = regexpPtr;
	    }
	    return tsdPtr->regexps[0];
	}
    }

    /*
     * This is a new expression, so compile it and add it to the cache.
     */

    regexpPtr = (TclRegexp *) ckalloc(sizeof(TclRegexp));
    regexpPtr->objPtr = NULL;
    regexpPtr->string = NULL;
    regexpPtr->details.rm_extend.rm_so = -1;
    regexpPtr->details.rm_extend.rm_eo = -1;

    /*
     * Get the up-to-date string representation and map to unicode.
     */

    Tcl_DStringInit(&stringBuf);
    uniString = Tcl_UtfToUniCharDString(string, length, &stringBuf);
    numChars = Tcl_DStringLength(&stringBuf) / sizeof(Tcl_UniChar);

    /*
     * Compile the string and check for errors.
     */

    regexpPtr->flags = flags;
    status = TclReComp(&regexpPtr->re, uniString, (size_t) numChars, flags);
    Tcl_DStringFree(&stringBuf);

    if (status != REG_OKAY) {
	/*
	 * Clean up and report errors in the interpreter, if possible.
	 */

	ckfree((char *)regexpPtr);
	if (interp) {
	    TclRegError(interp,
		    "couldn't compile regular expression pattern: ", status);
	}
	return NULL;
    }

    /*
     * Convert RE to a glob pattern equivalent, if any, and cache it.  If this
     * is not possible, then globObjPtr will be NULL.  This is used by
     * Tcl_RegExpExecObj to optionally do a fast match (avoids RE engine).
     */

    if (TclReToGlob(NULL, string, length, &stringBuf, &exact) == TCL_OK) {
	regexpPtr->globObjPtr = Tcl_NewStringObj(Tcl_DStringValue(&stringBuf),
		Tcl_DStringLength(&stringBuf));
	Tcl_IncrRefCount(regexpPtr->globObjPtr);
	Tcl_DStringFree(&stringBuf);
    } else {
	regexpPtr->globObjPtr = NULL;
    }

    /*
     * Allocate enough space for all of the subexpressions, plus one extra for
     * the entire pattern.
     */

    regexpPtr->matches = (regmatch_t *) ckalloc(
	    sizeof(regmatch_t) * (regexpPtr->re.re_nsub + 1));

    /*
     * Initialize the refcount to one initially, since it is in the cache.
     */

    regexpPtr->refCount = 1;

    /*
     * Free the last regexp, if necessary, and make room at the head of the
     * list for the new regexp.
     */

    if (tsdPtr->patterns[NUM_REGEXPS-1] != NULL) {
	TclRegexp *oldRegexpPtr = tsdPtr->regexps[NUM_REGEXPS-1];
	if (--(oldRegexpPtr->refCount) <= 0) {
	    FreeRegexp(oldRegexpPtr);
	}
	ckfree(tsdPtr->patterns[NUM_REGEXPS-1]);
    }
    for (i = NUM_REGEXPS - 2; i >= 0; i--) {
	tsdPtr->patterns[i+1] = tsdPtr->patterns[i];
	tsdPtr->patLengths[i+1] = tsdPtr->patLengths[i];
	tsdPtr->regexps[i+1] = tsdPtr->regexps[i];
    }
    tsdPtr->patterns[0] = (char *) ckalloc((unsigned) (length+1));
    strcpy(tsdPtr->patterns[0], string);
    tsdPtr->patLengths[0] = length;
    tsdPtr->regexps[0] = regexpPtr;

    return regexpPtr;
}

/*
 *----------------------------------------------------------------------
 *
 * FreeRegexp --
 *
 *	Release the storage associated with a TclRegexp.
 *
 * Results:
 *	None.
 *
 * Side effects:
 *	None.
 *
 *----------------------------------------------------------------------
 */

static void
FreeRegexp(
    TclRegexp *regexpPtr)	/* Compiled regular expression to free. */
{
    TclReFree(&regexpPtr->re);
    if (regexpPtr->globObjPtr) {
	TclDecrRefCount(regexpPtr->globObjPtr);
    }
    if (regexpPtr->matches) {
	ckfree((char *) regexpPtr->matches);
    }
    ckfree((char *) regexpPtr);
}

/*
 *----------------------------------------------------------------------
 *
 * FinalizeRegexp --
 *
 *	Release the storage associated with the per-thread regexp cache.
 *
 * Results:
 *	None.
 *
 * Side effects:
 *	None.
 *
 *----------------------------------------------------------------------
 */

static void
FinalizeRegexp(
    ClientData clientData)	/* Not used. */
{
    int i;
    TclRegexp *regexpPtr;
    ThreadSpecificData *tsdPtr = TCL_TSD_INIT(&dataKey);

    for (i = 0; (i < NUM_REGEXPS) && (tsdPtr->patterns[i] != NULL); i++) {
	regexpPtr = tsdPtr->regexps[i];
	if (--(regexpPtr->refCount) <= 0) {
	    FreeRegexp(regexpPtr);
	}
	ckfree(tsdPtr->patterns[i]);
	tsdPtr->patterns[i] = NULL;
    }
    /*
     * We may find ourselves reinitialized if another finalization routine
     * invokes regexps.
     */
    tsdPtr->initialized = 0;
}

/*
 * Local Variables:
 * mode: c
 * c-basic-offset: 4
 * fill-column: 78
 * End:
 */

Bell Labs OSI certified Powered by Plan 9

(Return to Plan 9 Home Page)

Copyright © 2021 Plan 9 Foundation. All Rights Reserved.
Comments to [email protected].