Home | History | Annotate | Download | only in expect-5.43
      1 #if 0 /*WHOLE FILE*/
      2 
      3 /*
      4  * regcomp and regexec -- regsub and regerror are elsewhere
      5  *
      6  *	Copyright (c) 1986 by University of Toronto.
      7  *	Written by Henry Spencer.  Not derived from licensed software.
      8  *
      9  *	Permission is granted to anyone to use this software for any
     10  *	purpose on any computer system, and to redistribute it freely,
     11  *	subject to the following restrictions:
     12  *
     13  *	1. The author is not responsible for the consequences of use of
     14  *		this software, no matter how awful, even if they arise
     15  *		from defects in it.
     16  *
     17  *	2. The origin of this software must not be misrepresented, either
     18  *		by explicit claim or by omission.
     19  *
     20  *	3. Altered versions must be plainly marked as such, and must not
     21  *		be misrepresented as being the original software.
     22  *
     23  * Beware that some of this code is subtly aware of the way operator
     24  * precedence is structured in regular expressions.  Serious changes in
     25  * regular-expression syntax might require a total rethink.
     26  *
     27  * *** NOTE: this code has been altered slightly for use in Tcl. ***
     28  * *** The only change is to use ckalloc and ckfree instead of   ***
     29  * *** malloc and free.						 ***
     30 
     31  * *** and again for Expect!!! - DEL
     32 
     33  * *** More minor corrections stolen from tcl7.5p1/regexp.c - DEL
     34 
     35  */
     36 
     37 #include "tcl.h"
     38 #include "expect_cf.h"
     39 #include "exp_prog.h"
     40 #include "tclRegexp.h"
     41 #include "exp_regexp.h"
     42 #include "string.h"
     43 
     44 #define NOTSTATIC	/* was at one time, but Expect needs access */
     45 
     46 /*
     47  * The "internal use only" fields in regexp.h are present to pass info from
     48  * compile to execute that permits the execute phase to run lots faster on
     49  * simple cases.  They are:
     50  *
     51  * regstart	char that must begin a match; '\0' if none obvious
     52  * reganch	is the match anchored (at beginning-of-line only)?
     53  * regmust	string (pointer into program) that match must include, or NULL
     54  * regmlen	length of regmust string
     55  *
     56  * Regstart and reganch permit very fast decisions on suitable starting points
     57  * for a match, cutting down the work a lot.  Regmust permits fast rejection
     58  * of lines that cannot possibly match.  The regmust tests are costly enough
     59  * that regcomp() supplies a regmust only if the r.e. contains something
     60  * potentially expensive (at present, the only such thing detected is * or +
     61  * at the start of the r.e., which can involve a lot of backup).  Regmlen is
     62  * supplied because the test in regexec() needs it and regcomp() is computing
     63  * it anyway.
     64  */
     65 
     66 /*
     67  * Structure for regexp "program".  This is essentially a linear encoding
     68  * of a nondeterministic finite-state machine (aka syntax charts or
     69  * "railroad normal form" in parsing technology).  Each node is an opcode
     70  * plus a "next" pointer, possibly plus an operand.  "Next" pointers of
     71  * all nodes except BRANCH implement concatenation; a "next" pointer with
     72  * a BRANCH on both ends of it is connecting two alternatives.  (Here we
     73  * have one of the subtle syntax dependencies:  an individual BRANCH (as
     74  * opposed to a collection of them) is never concatenated with anything
     75  * because of operator precedence.)  The operand of some types of node is
     76  * a literal string; for others, it is a node leading into a sub-FSM.  In
     77  * particular, the operand of a BRANCH node is the first node of the branch.
     78  * (NB this is *not* a tree structure:  the tail of the branch connects
     79  * to the thing following the set of BRANCHes.)  The opcodes are:
     80  */
     81 
     82 /* definition	number	opnd?	meaning */
     83 #define	END	0	/* no	End of program. */
     84 #define	BOL	1	/* no	Match "" at beginning of line. */
     85 #define	EOL	2	/* no	Match "" at end of line. */
     86 #define	ANY	3	/* no	Match any one character. */
     87 #define	ANYOF	4	/* str	Match any character in this string. */
     88 #define	ANYBUT	5	/* str	Match any character not in this string. */
     89 #define	BRANCH	6	/* node	Match this alternative, or the next... */
     90 #define	BACK	7	/* no	Match "", "next" ptr points backward. */
     91 #define	EXACTLY	8	/* str	Match this string. */
     92 #define	NOTHING	9	/* no	Match empty string. */
     93 #define	STAR	10	/* node	Match this (simple) thing 0 or more times. */
     94 #define	PLUS	11	/* node	Match this (simple) thing 1 or more times. */
     95 #define	OPEN	20	/* no	Mark this point in input as start of #n. */
     96 			/*	OPEN+1 is number 1, etc. */
     97 #define	CLOSE	(OPEN+NSUBEXP)	/* no	Analogous to OPEN. */
     98 
     99 /*
    100  * Opcode notes:
    101  *
    102  * BRANCH	The set of branches constituting a single choice are hooked
    103  *		together with their "next" pointers, since precedence prevents
    104  *		anything being concatenated to any individual branch.  The
    105  *		"next" pointer of the last BRANCH in a choice points to the
    106  *		thing following the whole choice.  This is also where the
    107  *		final "next" pointer of each individual branch points; each
    108  *		branch starts with the operand node of a BRANCH node.
    109  *
    110  * BACK		Normal "next" pointers all implicitly point forward; BACK
    111  *		exists to make loop structures possible.
    112  *
    113  * STAR,PLUS	'?', and complex '*' and '+', are implemented as circular
    114  *		BRANCH structures using BACK.  Simple cases (one character
    115  *		per match) are implemented with STAR and PLUS for speed
    116  *		and to minimize recursive plunges.
    117  *
    118  * OPEN,CLOSE	...are numbered at compile time.
    119  */
    120 
    121 /*
    122  * A node is one char of opcode followed by two chars of "next" pointer.
    123  * "Next" pointers are stored as two 8-bit pieces, high order first.  The
    124  * value is a positive offset from the opcode of the node containing it.
    125  * An operand, if any, simply follows the node.  (Note that much of the
    126  * code generation knows about this implicit relationship.)
    127  *
    128  * Using two bytes for the "next" pointer is vast overkill for most things,
    129  * but allows patterns to get big without disasters.
    130  */
    131 #define	OP(p)	(*(p))
    132 #define	NEXT(p)	(((*((p)+1)&0377)<<8) + (*((p)+2)&0377))
    133 #define	OPERAND(p)	((p) + 3)
    134 
    135 /*
    136  * See regmagic.h for one further detail of program structure.
    137  */
    138 
    139 
    140 /*
    141  * Utility definitions.
    142  */
    143 #ifndef CHARBITS
    144 #define	UCHARAT(p)	((int)*(unsigned char *)(p))
    145 #else
    146 #define	UCHARAT(p)	((int)*(p)&CHARBITS)
    147 #endif
    148 
    149 #define	FAIL(m)	{ regerror(m); return(NULL); }
    150 #define	ISMULT(c)	((c) == '*' || (c) == '+' || (c) == '?')
    151 #define	META	"^$.[()|?+*\\"
    152 
    153 /*
    154  * Flags to be passed up and down.
    155  */
    156 #define	HASWIDTH	01	/* Known never to match null string. */
    157 #define	SIMPLE		02	/* Simple enough to be STAR/PLUS operand. */
    158 #define	SPSTART		04	/* Starts with * or +. */
    159 #define	WORST		0	/* Worst case. */
    160 
    161 /*
    162  * Global work variables for regcomp().
    163  */
    164 static char *regparse;		/* Input-scan pointer. */
    165 static int regnpar;		/* () count. */
    166 static char regdummy;
    167 static char *regcode;		/* Code-emit pointer; &regdummy = don't. */
    168 static long regsize;		/* Code size. */
    169 
    170 /*
    171  * The first byte of the regexp internal "program" is actually this magic
    172  * number; the start node begins in the second byte.
    173  */
    174 #define	MAGIC	0234
    175 
    176 
    177 /*
    178  * Forward declarations for regcomp()'s friends.
    179  */
    180 #ifndef STATIC
    181 #define	STATIC	static
    182 #endif
    183 STATIC char *reg();
    184 STATIC char *regbranch();
    185 STATIC char *regpiece();
    186 STATIC char *regatom();
    187 STATIC char *regnode();
    188 STATIC char *regnext();
    189 STATIC void regc();
    190 STATIC void reginsert();
    191 STATIC void regtail();
    192 STATIC void regoptail();
    193 #ifdef STRCSPN
    194 STATIC int strcspn();
    195 #endif
    196 
    197 /* regcomp originally appeared here - DEL */
    198 
    199 /*
    200  - reg - regular expression, i.e. main body or parenthesized thing
    201  *
    202  * Caller must absorb opening parenthesis.
    203  *
    204  * Combining parenthesis handling with the base level of regular expression
    205  * is a trifle forced, but the need to tie the tails of the branches to what
    206  * follows makes it hard to avoid.
    207  */
    208 static char *
    209 reg(paren, flagp)
    210 int paren;			/* Parenthesized? */
    211 int *flagp;
    212 {
    213 	register char *ret;
    214 	register char *br;
    215 	register char *ender;
    216 	register int parno = 0;
    217 	int flags;
    218 
    219 	*flagp = HASWIDTH;	/* Tentatively. */
    220 
    221 	/* Make an OPEN node, if parenthesized. */
    222 	if (paren) {
    223 		if (regnpar >= NSUBEXP)
    224 			FAIL("too many ()");
    225 		parno = regnpar;
    226 		regnpar++;
    227 		ret = regnode(OPEN+parno);
    228 	} else
    229 		ret = NULL;
    230 
    231 	/* Pick up the branches, linking them together. */
    232 	br = regbranch(&flags);
    233 	if (br == NULL)
    234 		return(NULL);
    235 	if (ret != NULL)
    236 		regtail(ret, br);	/* OPEN -> first. */
    237 	else
    238 		ret = br;
    239 	if (!(flags&HASWIDTH))
    240 		*flagp &= ~HASWIDTH;
    241 	*flagp |= flags&SPSTART;
    242 	while (*regparse == '|') {
    243 		regparse++;
    244 		br = regbranch(&flags);
    245 		if (br == NULL)
    246 			return(NULL);
    247 		regtail(ret, br);	/* BRANCH -> BRANCH. */
    248 		if (!(flags&HASWIDTH))
    249 			*flagp &= ~HASWIDTH;
    250 		*flagp |= flags&SPSTART;
    251 	}
    252 
    253 	/* Make a closing node, and hook it on the end. */
    254 	ender = regnode((paren) ? CLOSE+parno : END);
    255 	regtail(ret, ender);
    256 
    257 	/* Hook the tails of the branches to the closing node. */
    258 	for (br = ret; br != NULL; br = regnext(br))
    259 		regoptail(br, ender);
    260 
    261 	/* Check for proper termination. */
    262 	if (paren && *regparse++ != ')') {
    263 		FAIL("unmatched ()");
    264 	} else if (!paren && *regparse != '\0') {
    265 		if (*regparse == ')') {
    266 			FAIL("unmatched ()");
    267 		} else
    268 			FAIL("junk on end");	/* "Can't happen". */
    269 		/* NOTREACHED */
    270 	}
    271 
    272 	return(ret);
    273 }
    274 
    275 /*
    276  - regbranch - one alternative of an | operator
    277  *
    278  * Implements the concatenation operator.
    279  */
    280 static char *
    281 regbranch(flagp)
    282 int *flagp;
    283 {
    284 	register char *ret;
    285 	register char *chain;
    286 	register char *latest;
    287 	int flags;
    288 
    289 	*flagp = WORST;		/* Tentatively. */
    290 
    291 	ret = regnode(BRANCH);
    292 	chain = NULL;
    293 	while (*regparse != '\0' && *regparse != '|' && *regparse != ')') {
    294 		latest = regpiece(&flags);
    295 		if (latest == NULL)
    296 			return(NULL);
    297 		*flagp |= flags&HASWIDTH;
    298 		if (chain == NULL)	/* First piece. */
    299 			*flagp |= flags&SPSTART;
    300 		else
    301 			regtail(chain, latest);
    302 		chain = latest;
    303 	}
    304 	if (chain == NULL)	/* Loop ran zero times. */
    305 		(void) regnode(NOTHING);
    306 
    307 	return(ret);
    308 }
    309 
    310 /*
    311  - regpiece - something followed by possible [*+?]
    312  *
    313  * Note that the branching code sequences used for ? and the general cases
    314  * of * and + are somewhat optimized:  they use the same NOTHING node as
    315  * both the endmarker for their branch list and the body of the last branch.
    316  * It might seem that this node could be dispensed with entirely, but the
    317  * endmarker role is not redundant.
    318  */
    319 static char *
    320 regpiece(flagp)
    321 int *flagp;
    322 {
    323 	register char *ret;
    324 	register char op;
    325 	register char *next;
    326 	int flags;
    327 
    328 	ret = regatom(&flags);
    329 	if (ret == NULL)
    330 		return(NULL);
    331 
    332 	op = *regparse;
    333 	if (!ISMULT(op)) {
    334 		*flagp = flags;
    335 		return(ret);
    336 	}
    337 
    338 	if (!(flags&HASWIDTH) && op != '?')
    339 		FAIL("*+ operand could be empty");
    340 	*flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH);
    341 
    342 	if (op == '*' && (flags&SIMPLE))
    343 		reginsert(STAR, ret);
    344 	else if (op == '*') {
    345 		/* Emit x* as (x&|), where & means "self". */
    346 		reginsert(BRANCH, ret);			/* Either x */
    347 		regoptail(ret, regnode(BACK));		/* and loop */
    348 		regoptail(ret, ret);			/* back */
    349 		regtail(ret, regnode(BRANCH));		/* or */
    350 		regtail(ret, regnode(NOTHING));		/* null. */
    351 	} else if (op == '+' && (flags&SIMPLE))
    352 		reginsert(PLUS, ret);
    353 	else if (op == '+') {
    354 		/* Emit x+ as x(&|), where & means "self". */
    355 		next = regnode(BRANCH);			/* Either */
    356 		regtail(ret, next);
    357 		regtail(regnode(BACK), ret);		/* loop back */
    358 		regtail(next, regnode(BRANCH));		/* or */
    359 		regtail(ret, regnode(NOTHING));		/* null. */
    360 	} else if (op == '?') {
    361 		/* Emit x? as (x|) */
    362 		reginsert(BRANCH, ret);			/* Either x */
    363 		regtail(ret, regnode(BRANCH));		/* or */
    364 		next = regnode(NOTHING);		/* null. */
    365 		regtail(ret, next);
    366 		regoptail(ret, next);
    367 	}
    368 	regparse++;
    369 	if (ISMULT(*regparse))
    370 		FAIL("nested *?+");
    371 
    372 	return(ret);
    373 }
    374 
    375 /*
    376  - regatom - the lowest level
    377  *
    378  * Optimization:  gobbles an entire sequence of ordinary characters so that
    379  * it can turn them into a single node, which is smaller to store and
    380  * faster to run.  Backslashed characters are exceptions, each becoming a
    381  * separate node; the code is simpler that way and it's not worth fixing.
    382  */
    383 static char *
    384 regatom(flagp)
    385 int *flagp;
    386 {
    387 	register char *ret;
    388 	int flags;
    389 
    390 	*flagp = WORST;		/* Tentatively. */
    391 
    392 	switch (*regparse++) {
    393 	case '^':
    394 		ret = regnode(BOL);
    395 		break;
    396 	case '$':
    397 		ret = regnode(EOL);
    398 		break;
    399 	case '.':
    400 		ret = regnode(ANY);
    401 		*flagp |= HASWIDTH|SIMPLE;
    402 		break;
    403 	case '[': {
    404 			register int clss;
    405 			register int classend;
    406 
    407 			if (*regparse == '^') {	/* Complement of range. */
    408 				ret = regnode(ANYBUT);
    409 				regparse++;
    410 			} else
    411 				ret = regnode(ANYOF);
    412 			if (*regparse == ']' || *regparse == '-')
    413 				regc(*regparse++);
    414 			while (*regparse != '\0' && *regparse != ']') {
    415 				if (*regparse == '-') {
    416 					regparse++;
    417 					if (*regparse == ']' || *regparse == '\0')
    418 						regc('-');
    419 					else {
    420 						clss = UCHARAT(regparse-2)+1;
    421 						classend = UCHARAT(regparse);
    422 						if (clss > classend+1)
    423 							FAIL("invalid [] range");
    424 						for (; clss <= classend; clss++)
    425 							regc((char)clss);
    426 						regparse++;
    427 					}
    428 				} else
    429 					regc(*regparse++);
    430 			}
    431 			regc('\0');
    432 			if (*regparse != ']')
    433 				FAIL("unmatched []");
    434 			regparse++;
    435 			*flagp |= HASWIDTH|SIMPLE;
    436 		}
    437 		break;
    438 	case '(':
    439 		ret = reg(1, &flags);
    440 		if (ret == NULL)
    441 			return(NULL);
    442 		*flagp |= flags&(HASWIDTH|SPSTART);
    443 		break;
    444 	case '\0':
    445 	case '|':
    446 	case ')':
    447 		FAIL("internal urp");	/* Supposed to be caught earlier. */
    448 		/* NOTREACHED */
    449 		break;
    450 	case '?':
    451 	case '+':
    452 	case '*':
    453 		FAIL("?+* follows nothing");
    454 		/* NOTREACHED */
    455 		break;
    456 	case '\\':
    457 		if (*regparse == '\0')
    458 			FAIL("trailing \\");
    459 		ret = regnode(EXACTLY);
    460 		regc(*regparse++);
    461 		regc('\0');
    462 		*flagp |= HASWIDTH|SIMPLE;
    463 		break;
    464 	default: {
    465 			register int len;
    466 			register char ender;
    467 
    468 			regparse--;
    469 			len = strcspn(regparse, META);
    470 			if (len <= 0)
    471 				FAIL("internal disaster");
    472 			ender = *(regparse+len);
    473 			if (len > 1 && ISMULT(ender))
    474 				len--;		/* Back off clear of ?+* operand. */
    475 			*flagp |= HASWIDTH;
    476 			if (len == 1)
    477 				*flagp |= SIMPLE;
    478 			ret = regnode(EXACTLY);
    479 			while (len > 0) {
    480 				regc(*regparse++);
    481 				len--;
    482 			}
    483 			regc('\0');
    484 		}
    485 		break;
    486 	}
    487 
    488 	return(ret);
    489 }
    490 
    491 /*
    492  - regnode - emit a node
    493  */
    494 static char *			/* Location. */
    495 regnode(op)
    496 int op;
    497 {
    498 	register char *ret;
    499 	register char *ptr;
    500 
    501 	ret = regcode;
    502 	if (ret == &regdummy) {
    503 		regsize += 3;
    504 		return(ret);
    505 	}
    506 
    507 	ptr = ret;
    508 	*ptr++ = (char)op;
    509 	*ptr++ = '\0';		/* Null "next" pointer. */
    510 	*ptr++ = '\0';
    511 	regcode = ptr;
    512 
    513 	return(ret);
    514 }
    515 
    516 /*
    517  - regc - emit (if appropriate) a byte of code
    518  */
    519 static void
    520 regc(b)
    521 int b;
    522 {
    523 	if (regcode != &regdummy)
    524 		*regcode++ = (char)b;
    525 	else
    526 		regsize++;
    527 }
    528 
    529 /*
    530  - reginsert - insert an operator in front of already-emitted operand
    531  *
    532  * Means relocating the operand.
    533  */
    534 static void
    535 reginsert(op, opnd)
    536 int op;
    537 char *opnd;
    538 {
    539 	register char *src;
    540 	register char *dst;
    541 	register char *place;
    542 
    543 	if (regcode == &regdummy) {
    544 		regsize += 3;
    545 		return;
    546 	}
    547 
    548 	src = regcode;
    549 	regcode += 3;
    550 	dst = regcode;
    551 	while (src > opnd)
    552 		*--dst = *--src;
    553 
    554 	place = opnd;		/* Op node, where operand used to be. */
    555 	*place++ = (char)op;
    556 	*place++ = '\0';
    557 	*place = '\0';
    558 }
    559 
    560 /*
    561  - regtail - set the next-pointer at the end of a node chain
    562  */
    563 static void
    564 regtail(p, val)
    565 char *p;
    566 char *val;
    567 {
    568 	register char *scan;
    569 	register char *temp;
    570 	register int offset;
    571 
    572 	if (p == &regdummy)
    573 		return;
    574 
    575 	/* Find last node. */
    576 	scan = p;
    577 	for (;;) {
    578 		temp = regnext(scan);
    579 		if (temp == NULL)
    580 			break;
    581 		scan = temp;
    582 	}
    583 
    584 	if (OP(scan) == BACK)
    585 		offset = scan - val;
    586 	else
    587 		offset = val - scan;
    588 	*(scan+1) = (char)(offset>>8)&0377;
    589 	*(scan+2) = (char)offset&0377;
    590 }
    591 
    592 /*
    593  - regoptail - regtail on operand of first argument; nop if operandless
    594  */
    595 static void
    596 regoptail(p, val)
    597 char *p;
    598 char *val;
    599 {
    600 	/* "Operandless" and "op != BRANCH" are synonymous in practice. */
    601 	if (p == NULL || p == &regdummy || OP(p) != BRANCH)
    602 		return;
    603 	regtail(OPERAND(p), val);
    604 }
    605 
    606 /*
    607  * regexec and friends
    608  */
    609 
    610 /*
    611  * Global work variables for regexec().
    612  */
    613 static char *reginput;		/* String-input pointer. */
    614 NOTSTATIC char *regbol;		/* Beginning of input, for ^ check. */
    615 static char **regstartp;	/* Pointer to startp array. */
    616 static char **regendp;		/* Ditto for endp. */
    617 
    618 /*
    619  * Forwards.
    620  */
    621 
    622 NOTSTATIC int regtry();
    623 STATIC int regmatch();
    624 STATIC int regrepeat();
    625 
    626 #ifdef DEBUG
    627 int regnarrate = 0;
    628 void regdump();
    629 STATIC char *regprop();
    630 #endif
    631 
    632 #if 0
    633 /*
    634  - regexec - match a regexp against a string
    635  */
    636 int
    637 regexec(prog, string, stringlength, matchlength)
    638 register regexp *prog;
    639 register char *string;	/* note: CURRENTLY ASSUMED TO BE NULL-TERMINATED!!! */
    640 int stringlength;	/* length of string */
    641 int *matchlength;	/* number of chars matched (or to be skipped) */
    642 			/* set when MATCH or CANT_MATCH */
    643 {
    644 	register char *s;
    645 	extern char *strchr();
    646 
    647 	/* Be paranoid... */
    648 	if (prog == NULL || string == NULL) {
    649 		regerror("NULL parameter");
    650 		return(EXP_TCLERROR);
    651 	}
    652 
    653 	/* Check validity of program. */
    654 	if (UCHARAT(prog->program) != MAGIC) {
    655 		regerror("corrupted program");
    656 		return(EXP_KM_ERROR);
    657 	}
    658 
    659 #if THIS_RUINS_EXP
    660 /* no need for this shortcut anyway */
    661 	/* If there is a "must appear" string, look for it. */
    662 	if (prog->regmust != NULL) {
    663 		s = string;
    664 		while ((s = strchr(s, prog->regmust[0])) != NULL) {
    665 			if (strncmp(s, prog->regmust, prog->regmlen) == 0)
    666 				break;	/* Found it. */
    667 			s++;
    668 		}
    669 		if (s == NULL)	/* Not present. */
    670 			return(0);
    671 	}
    672 #endif
    673 
    674 	/* Mark beginning of line for ^ . */
    675 	regbol = string;
    676 
    677 	/* Simplest case:  anchored match need be tried only once. */
    678 	if (prog->reganch) {
    679 		int r = regtry(prog,string,matchlength);
    680 		if (r == CANT_MATCH) *matchlength = stringlength;
    681 		return(r);
    682 	}
    683 
    684 	/* Messy cases:  unanchored match. */
    685 	s = string;
    686 	if (prog->regstart != '\0') {
    687 		register char *s2 = s;
    688 
    689 		/* We know what char it must start with. */
    690 		while (1) {
    691 			int r;
    692 
    693 			s2 = strchr(s2,prog->regstart);
    694 			if (s2 == 0) {
    695 				*matchlength = stringlength;
    696 				return(CANT_MATCH);
    697 			}
    698 			r = regtry(prog,s2,matchlength);
    699 			if (r == CANT_MATCH) {
    700 				s2++;
    701 				continue;
    702 			}
    703 			if (s2 == s) return(r);
    704 			*matchlength = s2-s;
    705 			return CANT_MATCH;
    706 		}
    707 	} else {
    708 		/* We don't -- general case. */
    709 		register char *s2 = s;
    710 		int r = regtry(prog,s,matchlength);
    711 		if (r == EXP_MATCH) return(r);
    712 		else if (r == EXP_CANMATCH) return(r);
    713 		/* at this point, we know some characters at front */
    714 		/* of string don't match */
    715 		for (s2++;*s2;s2++) {
    716 			r = regtry(prog,s2,matchlength);
    717 			if (r == CANT_MATCH) continue;
    718 			/* if we match or can_match, say cant_match and */
    719 			/* record the number of chars at front that don't match */
    720 			*matchlength = s2-s;
    721 			return(CANT_MATCH);
    722 		}
    723 		/* made it thru string with CANT_MATCH all the way */
    724 		*matchlength = stringlength;
    725 		return(CANT_MATCH);
    726 	}
    727 }
    728 #endif
    729 
    730 /*
    731  - regtry - try match at specific point
    732  */
    733 /* return CAN_MATCH, CANT_MATCH or MATCH */
    734 int			/* 0 failure, 1 success */
    735 regtry(prog, string, matchlength)
    736 regexp *prog;
    737 char *string;
    738 int *matchlength;	/* only set for MATCH */
    739 {
    740 	register int i;
    741 	register char **sp;
    742 	register char **ep;
    743 	int r;		/* result of regmatch */
    744 
    745 	reginput = string;
    746 	regstartp = prog->startp;
    747 	regendp = prog->endp;
    748 
    749 	sp = prog->startp;
    750 	ep = prog->endp;
    751 	for (i = NSUBEXP; i > 0; i--) {
    752 		*sp++ = NULL;
    753 		*ep++ = NULL;
    754 	}
    755 	r = regmatch(prog->program + 1);
    756 	if (EXP_MATCH == r) {
    757 		prog->startp[0] = string;
    758 		prog->endp[0] = reginput;
    759 		*matchlength = reginput-string;
    760 		return(EXP_MATCH);
    761 	}
    762 	return(r);	/* CAN_MATCH or CANT_MATCH */
    763 }
    764 
    765 /*
    766  - regmatch - main matching routine
    767  *
    768  * Conceptually the strategy is simple:  check to see whether the current
    769  * node matches, call self recursively to see whether the rest matches,
    770  * and then act accordingly.  In practice we make some effort to avoid
    771  * recursion, in particular by going through "ordinary" nodes (that don't
    772  * need to know whether the rest of the match failed) by a loop instead of
    773  * by recursion.
    774  */
    775 /* returns CAN, CANT or MATCH */
    776 static int			/* 0 failure, 1 success */
    777 regmatch(prog)
    778 char *prog;
    779 {
    780 	register char *scan;	/* Current node. */
    781 	char *next;		/* Next node. */
    782 #ifndef strchr	/* May be #defined to something else */
    783 	extern char *strchr();
    784 #endif
    785 
    786 	scan = prog;
    787 #ifdef DEBUG
    788 	if (scan != NULL && regnarrate)
    789 		fprintf(stderr, "%s(\n", regprop(scan));
    790 #endif
    791 	while (scan != NULL) {
    792 #ifdef DEBUG
    793 		if (regnarrate)
    794 			fprintf(stderr, "%s...\n", regprop(scan));
    795 #endif
    796 		next = regnext(scan);
    797 
    798 		switch (OP(scan)) {
    799 		case BOL:
    800 			if (reginput != regbol)
    801 /*				return(0);*/
    802 				return(EXP_CANTMATCH);
    803 			break;
    804 		case EOL:
    805 			if (*reginput != '\0')
    806 /*				return(0);*/
    807 /* note this implies that "$" must match everything received to this point! */
    808 				return(EXP_CANTMATCH);
    809 			break;
    810 		case ANY:
    811 			if (*reginput == '\0')
    812 /*				return(0);*/
    813 				return(EXP_CANMATCH);
    814 			reginput++;
    815 			break;
    816 		case EXACTLY: {
    817 /*				register int len;*/
    818 				register char *opnd;
    819 
    820 				opnd = OPERAND(scan);
    821 
    822 				/* this section of code is totally rewritten - DEL */
    823 				/* group of literal chars in pattern */
    824 				/* compare each one */
    825 				do {
    826 					if (*opnd != *reginput) {
    827 						if (*reginput == '\0') {
    828 							return EXP_CANMATCH;
    829 						} else 	return EXP_CANTMATCH;
    830 					}
    831 
    832 					reginput++;
    833 					opnd++;
    834 				} while (*opnd != '\0');
    835 			}
    836 			break;
    837 		case ANYOF:
    838 /* 			if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) == NULL)
    839 				return(0);
    840 */
    841 			if (*reginput == '\0')
    842 				return(EXP_CANMATCH);
    843 			if (strchr(OPERAND(scan),*reginput) == NULL)
    844 				return(EXP_CANTMATCH);
    845 			reginput++;
    846 			break;
    847 		case ANYBUT:
    848 /* 			if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) != NULL)
    849 				return(0);
    850 */
    851 			if (*reginput == '\0')
    852 				return(EXP_CANMATCH);
    853 			if (strchr(OPERAND(scan),*reginput) != NULL)
    854 				return(EXP_CANTMATCH);
    855 			reginput++;
    856 			break;
    857 		case NOTHING:
    858 			break;
    859 		case BACK:
    860 			break;
    861 		case OPEN+1:
    862 		case OPEN+2:
    863 		case OPEN+3:
    864 		case OPEN+4:
    865 		case OPEN+5:
    866 		case OPEN+6:
    867 		case OPEN+7:
    868 		case OPEN+8:
    869 		case OPEN+9: {
    870 				register int no;
    871 				register char *save;
    872 				int r;	/* result of regmatch */
    873 
    874 	doOpen:
    875 				no = OP(scan) - OPEN;
    876 				save = reginput;
    877 
    878 				r = regmatch(next);
    879 				if (r == EXP_MATCH) {
    880 					/*
    881 					 * Don't set startp if some later
    882 					 * invocation of the same parentheses
    883 					 * already has.
    884 					 */
    885 					if (regstartp[no] == NULL)
    886 						regstartp[no] = save;
    887 				}
    888 				return(r);
    889 			}
    890 			/* NOTREACHED */
    891 			break;
    892 		case CLOSE+1:
    893 		case CLOSE+2:
    894 		case CLOSE+3:
    895 		case CLOSE+4:
    896 		case CLOSE+5:
    897 		case CLOSE+6:
    898 		case CLOSE+7:
    899 		case CLOSE+8:
    900 		case CLOSE+9: {
    901 				register int no;
    902 				register char *save;
    903 				int r;	/* result of regmatch */
    904 
    905 	doClose:
    906 				no = OP(scan) - CLOSE;
    907 				save = reginput;
    908 
    909 				r = regmatch(next);
    910 				if (r == EXP_MATCH) {
    911 					/*
    912 					 * Don't set endp if some later
    913 					 * invocation of the same parentheses
    914 					 * already has.
    915 					 */
    916 					if (regendp[no] == NULL)
    917 						regendp[no] = save;
    918 				}
    919 				return(r);
    920 			}
    921 			/* NOTREACHED */
    922 			break;
    923 		case BRANCH: {
    924 				register char *save;
    925 				int match_status;
    926 
    927 				if (OP(next) != BRANCH)		/* No choice. */
    928 					next = OPERAND(scan);	/* Avoid recursion. */
    929 				else {
    930 					match_status = EXP_CANTMATCH;
    931 
    932 					do {
    933 						int r;
    934 
    935 						save = reginput;
    936 						r = regmatch(OPERAND(scan));
    937 						if (r == EXP_MATCH) return(r);
    938 						if (r == EXP_CANMATCH) {
    939 							match_status = r;
    940 						}
    941 						reginput = save;
    942 						scan = regnext(scan);
    943 					} while (scan != NULL && OP(scan) == BRANCH);
    944 					return(match_status);
    945 					/* NOTREACHED */
    946 				}
    947 			}
    948 			/* NOTREACHED */
    949 			break;
    950 		case STAR:
    951 		case PLUS: {
    952 				register char nextch;
    953 				register int no;
    954 				register char *save;
    955 				register int min;
    956 				int match_status;
    957 
    958 				if (*reginput == '\0') return EXP_CANMATCH;
    959 
    960 				/*
    961 				 * Lookahead to avoid useless match attempts
    962 				 * when we know what character comes next.
    963 				 */
    964 				match_status = EXP_CANTMATCH;
    965 				nextch = '\0';
    966 				if (OP(next) == EXACTLY)
    967 					nextch = *OPERAND(next);
    968 				min = (OP(scan) == STAR) ? 0 : 1;
    969 				save = reginput;
    970 				no = regrepeat(OPERAND(scan));
    971 				while (no >= min) {
    972 					/* If it could work, try it. */
    973 					/* 3rd condition allows for CAN_MATCH */
    974 					if (nextch == '\0' || *reginput == nextch || *reginput == '\0') {
    975 						int r = regmatch(next);
    976 						if (r == EXP_MATCH)
    977 							return(EXP_MATCH);
    978 						if (r == EXP_CANMATCH)
    979 /*							match_status = r;*/
    980 							return(EXP_CANMATCH);
    981 					}
    982 					/* Couldn't or didn't -- back up. */
    983 					no--;
    984 					reginput = save + no;
    985 				}
    986 				return(match_status);
    987 			}
    988 			/* NOTREACHED */
    989 			break;
    990 		case END:
    991 			/* Success! */
    992 			if (*reginput == '\0') {
    993 				return(EXP_CANMATCH);
    994 			} else {
    995 				return(EXP_MATCH);
    996 			}
    997 			/* return(EXP_CANMATCH);  Success! */
    998 			/* NOTREACHED */
    999 			break;
   1000 		default:
   1001 			if (OP(scan) > OPEN && OP(scan) < OPEN+NSUBEXP) {
   1002 				goto doOpen;
   1003 			} else if (OP(scan) > CLOSE && OP(scan) < CLOSE+NSUBEXP) {
   1004 				goto doClose;
   1005 			}
   1006 			regerror("memory corruption");
   1007 			return(EXP_TCLERROR);
   1008 			/* NOTREACHED */
   1009 			break;
   1010 		}
   1011 
   1012 		scan = next;
   1013 	}
   1014 
   1015 	/*
   1016 	 * We get here only if there's trouble -- normally "case END" is
   1017 	 * the terminating point.
   1018 	 */
   1019 	regerror("corrupted pointers");
   1020 	return(EXP_TCLERROR);
   1021 }
   1022 
   1023 /*
   1024  - regrepeat - repeatedly match something simple, report how many
   1025  */
   1026 static int
   1027 regrepeat(p)
   1028 char *p;
   1029 {
   1030 	register int count = 0;
   1031 	register char *scan;
   1032 	register char *opnd;
   1033 #ifndef strchr	/* May be #defined to something else */
   1034 /*DEL*/	extern char *strchr();
   1035 #endif
   1036 
   1037 	scan = reginput;
   1038 	opnd = OPERAND(p);
   1039 	switch (OP(p)) {
   1040 	case ANY:
   1041 		count = strlen(scan);
   1042 		scan += count;
   1043 		break;
   1044 	case EXACTLY:
   1045 		while (*opnd == *scan) {
   1046 			count++;
   1047 			scan++;
   1048 		}
   1049 		break;
   1050 	case ANYOF:
   1051 		while (*scan != '\0' && strchr(opnd, *scan) != NULL) {
   1052 			count++;
   1053 			scan++;
   1054 		}
   1055 		break;
   1056 	case ANYBUT:
   1057 		while (*scan != '\0' && strchr(opnd, *scan) == NULL) {
   1058 			count++;
   1059 			scan++;
   1060 		}
   1061 		break;
   1062 	default:		/* Oh dear.  Called inappropriately. */
   1063 		regerror("internal foulup");
   1064 		count = 0;	/* Best compromise. */
   1065 		break;
   1066 	}
   1067 	reginput = scan;
   1068 
   1069 	return(count);
   1070 }
   1071 
   1072 /*
   1073  - regnext - dig the "next" pointer out of a node
   1074  */
   1075 static char *
   1076 regnext(p)
   1077 register char *p;
   1078 {
   1079 	register int offset;
   1080 
   1081 	if (p == &regdummy)
   1082 		return(NULL);
   1083 
   1084 	offset = NEXT(p);
   1085 	if (offset == 0)
   1086 		return(NULL);
   1087 
   1088 	if (OP(p) == BACK)
   1089 		return(p-offset);
   1090 	else
   1091 		return(p+offset);
   1092 }
   1093 
   1094 #ifdef DEBUG
   1095 
   1096 STATIC char *regprop();
   1097 
   1098 /*
   1099  - regdump - dump a regexp onto stdout in vaguely comprehensible form
   1100  */
   1101 void
   1102 regdump(r)
   1103 regexp *r;
   1104 {
   1105 	register char *s;
   1106 	register char op = EXACTLY;	/* Arbitrary non-END op. */
   1107 	register char *next;
   1108 	extern char *strchr();
   1109 
   1110 
   1111 	s = r->program + 1;
   1112 	while (op != END) {	/* While that wasn't END last time... */
   1113 		op = OP(s);
   1114 		printf("%2d%s", s-r->program, regprop(s));	/* Where, what. */
   1115 		next = regnext(s);
   1116 		if (next == NULL)		/* Next ptr. */
   1117 			printf("(0)");
   1118 		else
   1119 			printf("(%d)", (s-r->program)+(next-s));
   1120 		s += 3;
   1121 		if (op == ANYOF || op == ANYBUT || op == EXACTLY) {
   1122 			/* Literal string, where present. */
   1123 			while (*s != '\0') {
   1124 				putchar(*s);
   1125 				s++;
   1126 			}
   1127 			s++;
   1128 		}
   1129 		putchar('\n');
   1130 	}
   1131 
   1132 	/* Header fields of interest. */
   1133 	if (r->regstart != '\0')
   1134 		printf("start `%c' ", r->regstart);
   1135 	if (r->reganch)
   1136 		printf("anchored ");
   1137 	if (r->regmust != NULL)
   1138 		printf("must have \"%s\"", r->regmust);
   1139 	printf("\n");
   1140 }
   1141 
   1142 /*
   1143  - regprop - printable representation of opcode
   1144  */
   1145 static char *
   1146 regprop(op)
   1147 char *op;
   1148 {
   1149 	register char *p;
   1150 	static char buf[50];
   1151 
   1152 	(void) strcpy(buf, ":");
   1153 
   1154 	switch (OP(op)) {
   1155 	case BOL:
   1156 		p = "BOL";
   1157 		break;
   1158 	case EOL:
   1159 		p = "EOL";
   1160 		break;
   1161 	case ANY:
   1162 		p = "ANY";
   1163 		break;
   1164 	case ANYOF:
   1165 		p = "ANYOF";
   1166 		break;
   1167 	case ANYBUT:
   1168 		p = "ANYBUT";
   1169 		break;
   1170 	case BRANCH:
   1171 		p = "BRANCH";
   1172 		break;
   1173 	case EXACTLY:
   1174 		p = "EXACTLY";
   1175 		break;
   1176 	case NOTHING:
   1177 		p = "NOTHING";
   1178 		break;
   1179 	case BACK:
   1180 		p = "BACK";
   1181 		break;
   1182 	case END:
   1183 		p = "END";
   1184 		break;
   1185 	case OPEN+1:
   1186 	case OPEN+2:
   1187 	case OPEN+3:
   1188 	case OPEN+4:
   1189 	case OPEN+5:
   1190 	case OPEN+6:
   1191 	case OPEN+7:
   1192 	case OPEN+8:
   1193 	case OPEN+9:
   1194 		sprintf(buf+strlen(buf), "OPEN%d", OP(op)-OPEN);
   1195 		p = NULL;
   1196 		break;
   1197 	case CLOSE+1:
   1198 	case CLOSE+2:
   1199 	case CLOSE+3:
   1200 	case CLOSE+4:
   1201 	case CLOSE+5:
   1202 	case CLOSE+6:
   1203 	case CLOSE+7:
   1204 	case CLOSE+8:
   1205 	case CLOSE+9:
   1206 		sprintf(buf+strlen(buf), "CLOSE%d", OP(op)-CLOSE);
   1207 		p = NULL;
   1208 		break;
   1209 	case STAR:
   1210 		p = "STAR";
   1211 		break;
   1212 	case PLUS:
   1213 		p = "PLUS";
   1214 		break;
   1215 	default:
   1216 		if (OP(op) > OPEN && OP(op) < OPEN+NSUBEXP) {
   1217 		    sprintf(buf+strlen(buf), "OPEN%d", OP(op)-OPEN);
   1218 		    p = NULL;
   1219 		    break;
   1220 		} else if (OP(op) > CLOSE && OP(op) < CLOSE+NSUBEXP) {
   1221 		    sprintf(buf+strlen(buf), "CLOSE%d", OP(op)-CLOSE);
   1222 		    p = NULL;
   1223 		} else {
   1224 		    TclRegError("corrupted opcode");
   1225 		}
   1226 		break;
   1227 	}
   1228 	if (p != NULL)
   1229 		(void) strcat(buf, p);
   1230 	return(buf);
   1231 }
   1232 #endif
   1233 
   1234 /*
   1235  * The following is provided for those people who do not have strcspn() in
   1236  * their C libraries.  They should get off their butts and do something
   1237  * about it; at least one public-domain implementation of those (highly
   1238  * useful) string routines has been published on Usenet.
   1239  */
   1240 #ifdef STRCSPN
   1241 /*
   1242  * strcspn - find length of initial segment of s1 consisting entirely
   1243  * of characters not from s2
   1244  */
   1245 
   1246 static int
   1247 strcspn(s1, s2)
   1248 char *s1;
   1249 char *s2;
   1250 {
   1251 	register char *scan1;
   1252 	register char *scan2;
   1253 	register int count;
   1254 
   1255 	count = 0;
   1256 	for (scan1 = s1; *scan1 != '\0'; scan1++) {
   1257 		for (scan2 = s2; *scan2 != '\0';)	/* ++ moved down. */
   1258 			if (*scan1 == *scan2++)
   1259 				return(count);
   1260 		count++;
   1261 	}
   1262 	return(count);
   1263 }
   1264 #endif
   1265 #endif /* 0 WHOLE FILE */
   1266