1 #if 0 /*WHOLE FILE*/ 2 3 /* 4 * regcomp and regexec -- regsub and regerror are elsewhere 5 * 6 * Copyright (c) 1986 by University of Toronto. 7 * Written by Henry Spencer. Not derived from licensed software. 8 * 9 * Permission is granted to anyone to use this software for any 10 * purpose on any computer system, and to redistribute it freely, 11 * subject to the following restrictions: 12 * 13 * 1. The author is not responsible for the consequences of use of 14 * this software, no matter how awful, even if they arise 15 * from defects in it. 16 * 17 * 2. The origin of this software must not be misrepresented, either 18 * by explicit claim or by omission. 19 * 20 * 3. Altered versions must be plainly marked as such, and must not 21 * be misrepresented as being the original software. 22 * 23 * Beware that some of this code is subtly aware of the way operator 24 * precedence is structured in regular expressions. Serious changes in 25 * regular-expression syntax might require a total rethink. 26 * 27 * *** NOTE: this code has been altered slightly for use in Tcl. *** 28 * *** The only change is to use ckalloc and ckfree instead of *** 29 * *** malloc and free. *** 30 31 * *** and again for Expect!!! - DEL 32 33 * *** More minor corrections stolen from tcl7.5p1/regexp.c - DEL 34 35 */ 36 37 #include "tcl.h" 38 #include "expect_cf.h" 39 #include "exp_prog.h" 40 #include "tclRegexp.h" 41 #include "exp_regexp.h" 42 #include "string.h" 43 44 #define NOTSTATIC /* was at one time, but Expect needs access */ 45 46 /* 47 * The "internal use only" fields in regexp.h are present to pass info from 48 * compile to execute that permits the execute phase to run lots faster on 49 * simple cases. They are: 50 * 51 * regstart char that must begin a match; '\0' if none obvious 52 * reganch is the match anchored (at beginning-of-line only)? 53 * regmust string (pointer into program) that match must include, or NULL 54 * regmlen length of regmust string 55 * 56 * Regstart and reganch permit very fast decisions on suitable starting points 57 * for a match, cutting down the work a lot. Regmust permits fast rejection 58 * of lines that cannot possibly match. The regmust tests are costly enough 59 * that regcomp() supplies a regmust only if the r.e. contains something 60 * potentially expensive (at present, the only such thing detected is * or + 61 * at the start of the r.e., which can involve a lot of backup). Regmlen is 62 * supplied because the test in regexec() needs it and regcomp() is computing 63 * it anyway. 64 */ 65 66 /* 67 * Structure for regexp "program". This is essentially a linear encoding 68 * of a nondeterministic finite-state machine (aka syntax charts or 69 * "railroad normal form" in parsing technology). Each node is an opcode 70 * plus a "next" pointer, possibly plus an operand. "Next" pointers of 71 * all nodes except BRANCH implement concatenation; a "next" pointer with 72 * a BRANCH on both ends of it is connecting two alternatives. (Here we 73 * have one of the subtle syntax dependencies: an individual BRANCH (as 74 * opposed to a collection of them) is never concatenated with anything 75 * because of operator precedence.) The operand of some types of node is 76 * a literal string; for others, it is a node leading into a sub-FSM. In 77 * particular, the operand of a BRANCH node is the first node of the branch. 78 * (NB this is *not* a tree structure: the tail of the branch connects 79 * to the thing following the set of BRANCHes.) The opcodes are: 80 */ 81 82 /* definition number opnd? meaning */ 83 #define END 0 /* no End of program. */ 84 #define BOL 1 /* no Match "" at beginning of line. */ 85 #define EOL 2 /* no Match "" at end of line. */ 86 #define ANY 3 /* no Match any one character. */ 87 #define ANYOF 4 /* str Match any character in this string. */ 88 #define ANYBUT 5 /* str Match any character not in this string. */ 89 #define BRANCH 6 /* node Match this alternative, or the next... */ 90 #define BACK 7 /* no Match "", "next" ptr points backward. */ 91 #define EXACTLY 8 /* str Match this string. */ 92 #define NOTHING 9 /* no Match empty string. */ 93 #define STAR 10 /* node Match this (simple) thing 0 or more times. */ 94 #define PLUS 11 /* node Match this (simple) thing 1 or more times. */ 95 #define OPEN 20 /* no Mark this point in input as start of #n. */ 96 /* OPEN+1 is number 1, etc. */ 97 #define CLOSE (OPEN+NSUBEXP) /* no Analogous to OPEN. */ 98 99 /* 100 * Opcode notes: 101 * 102 * BRANCH The set of branches constituting a single choice are hooked 103 * together with their "next" pointers, since precedence prevents 104 * anything being concatenated to any individual branch. The 105 * "next" pointer of the last BRANCH in a choice points to the 106 * thing following the whole choice. This is also where the 107 * final "next" pointer of each individual branch points; each 108 * branch starts with the operand node of a BRANCH node. 109 * 110 * BACK Normal "next" pointers all implicitly point forward; BACK 111 * exists to make loop structures possible. 112 * 113 * STAR,PLUS '?', and complex '*' and '+', are implemented as circular 114 * BRANCH structures using BACK. Simple cases (one character 115 * per match) are implemented with STAR and PLUS for speed 116 * and to minimize recursive plunges. 117 * 118 * OPEN,CLOSE ...are numbered at compile time. 119 */ 120 121 /* 122 * A node is one char of opcode followed by two chars of "next" pointer. 123 * "Next" pointers are stored as two 8-bit pieces, high order first. The 124 * value is a positive offset from the opcode of the node containing it. 125 * An operand, if any, simply follows the node. (Note that much of the 126 * code generation knows about this implicit relationship.) 127 * 128 * Using two bytes for the "next" pointer is vast overkill for most things, 129 * but allows patterns to get big without disasters. 130 */ 131 #define OP(p) (*(p)) 132 #define NEXT(p) (((*((p)+1)&0377)<<8) + (*((p)+2)&0377)) 133 #define OPERAND(p) ((p) + 3) 134 135 /* 136 * See regmagic.h for one further detail of program structure. 137 */ 138 139 140 /* 141 * Utility definitions. 142 */ 143 #ifndef CHARBITS 144 #define UCHARAT(p) ((int)*(unsigned char *)(p)) 145 #else 146 #define UCHARAT(p) ((int)*(p)&CHARBITS) 147 #endif 148 149 #define FAIL(m) { regerror(m); return(NULL); } 150 #define ISMULT(c) ((c) == '*' || (c) == '+' || (c) == '?') 151 #define META "^$.[()|?+*\\" 152 153 /* 154 * Flags to be passed up and down. 155 */ 156 #define HASWIDTH 01 /* Known never to match null string. */ 157 #define SIMPLE 02 /* Simple enough to be STAR/PLUS operand. */ 158 #define SPSTART 04 /* Starts with * or +. */ 159 #define WORST 0 /* Worst case. */ 160 161 /* 162 * Global work variables for regcomp(). 163 */ 164 static char *regparse; /* Input-scan pointer. */ 165 static int regnpar; /* () count. */ 166 static char regdummy; 167 static char *regcode; /* Code-emit pointer; ®dummy = don't. */ 168 static long regsize; /* Code size. */ 169 170 /* 171 * The first byte of the regexp internal "program" is actually this magic 172 * number; the start node begins in the second byte. 173 */ 174 #define MAGIC 0234 175 176 177 /* 178 * Forward declarations for regcomp()'s friends. 179 */ 180 #ifndef STATIC 181 #define STATIC static 182 #endif 183 STATIC char *reg(); 184 STATIC char *regbranch(); 185 STATIC char *regpiece(); 186 STATIC char *regatom(); 187 STATIC char *regnode(); 188 STATIC char *regnext(); 189 STATIC void regc(); 190 STATIC void reginsert(); 191 STATIC void regtail(); 192 STATIC void regoptail(); 193 #ifdef STRCSPN 194 STATIC int strcspn(); 195 #endif 196 197 /* regcomp originally appeared here - DEL */ 198 199 /* 200 - reg - regular expression, i.e. main body or parenthesized thing 201 * 202 * Caller must absorb opening parenthesis. 203 * 204 * Combining parenthesis handling with the base level of regular expression 205 * is a trifle forced, but the need to tie the tails of the branches to what 206 * follows makes it hard to avoid. 207 */ 208 static char * 209 reg(paren, flagp) 210 int paren; /* Parenthesized? */ 211 int *flagp; 212 { 213 register char *ret; 214 register char *br; 215 register char *ender; 216 register int parno = 0; 217 int flags; 218 219 *flagp = HASWIDTH; /* Tentatively. */ 220 221 /* Make an OPEN node, if parenthesized. */ 222 if (paren) { 223 if (regnpar >= NSUBEXP) 224 FAIL("too many ()"); 225 parno = regnpar; 226 regnpar++; 227 ret = regnode(OPEN+parno); 228 } else 229 ret = NULL; 230 231 /* Pick up the branches, linking them together. */ 232 br = regbranch(&flags); 233 if (br == NULL) 234 return(NULL); 235 if (ret != NULL) 236 regtail(ret, br); /* OPEN -> first. */ 237 else 238 ret = br; 239 if (!(flags&HASWIDTH)) 240 *flagp &= ~HASWIDTH; 241 *flagp |= flags&SPSTART; 242 while (*regparse == '|') { 243 regparse++; 244 br = regbranch(&flags); 245 if (br == NULL) 246 return(NULL); 247 regtail(ret, br); /* BRANCH -> BRANCH. */ 248 if (!(flags&HASWIDTH)) 249 *flagp &= ~HASWIDTH; 250 *flagp |= flags&SPSTART; 251 } 252 253 /* Make a closing node, and hook it on the end. */ 254 ender = regnode((paren) ? CLOSE+parno : END); 255 regtail(ret, ender); 256 257 /* Hook the tails of the branches to the closing node. */ 258 for (br = ret; br != NULL; br = regnext(br)) 259 regoptail(br, ender); 260 261 /* Check for proper termination. */ 262 if (paren && *regparse++ != ')') { 263 FAIL("unmatched ()"); 264 } else if (!paren && *regparse != '\0') { 265 if (*regparse == ')') { 266 FAIL("unmatched ()"); 267 } else 268 FAIL("junk on end"); /* "Can't happen". */ 269 /* NOTREACHED */ 270 } 271 272 return(ret); 273 } 274 275 /* 276 - regbranch - one alternative of an | operator 277 * 278 * Implements the concatenation operator. 279 */ 280 static char * 281 regbranch(flagp) 282 int *flagp; 283 { 284 register char *ret; 285 register char *chain; 286 register char *latest; 287 int flags; 288 289 *flagp = WORST; /* Tentatively. */ 290 291 ret = regnode(BRANCH); 292 chain = NULL; 293 while (*regparse != '\0' && *regparse != '|' && *regparse != ')') { 294 latest = regpiece(&flags); 295 if (latest == NULL) 296 return(NULL); 297 *flagp |= flags&HASWIDTH; 298 if (chain == NULL) /* First piece. */ 299 *flagp |= flags&SPSTART; 300 else 301 regtail(chain, latest); 302 chain = latest; 303 } 304 if (chain == NULL) /* Loop ran zero times. */ 305 (void) regnode(NOTHING); 306 307 return(ret); 308 } 309 310 /* 311 - regpiece - something followed by possible [*+?] 312 * 313 * Note that the branching code sequences used for ? and the general cases 314 * of * and + are somewhat optimized: they use the same NOTHING node as 315 * both the endmarker for their branch list and the body of the last branch. 316 * It might seem that this node could be dispensed with entirely, but the 317 * endmarker role is not redundant. 318 */ 319 static char * 320 regpiece(flagp) 321 int *flagp; 322 { 323 register char *ret; 324 register char op; 325 register char *next; 326 int flags; 327 328 ret = regatom(&flags); 329 if (ret == NULL) 330 return(NULL); 331 332 op = *regparse; 333 if (!ISMULT(op)) { 334 *flagp = flags; 335 return(ret); 336 } 337 338 if (!(flags&HASWIDTH) && op != '?') 339 FAIL("*+ operand could be empty"); 340 *flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH); 341 342 if (op == '*' && (flags&SIMPLE)) 343 reginsert(STAR, ret); 344 else if (op == '*') { 345 /* Emit x* as (x&|), where & means "self". */ 346 reginsert(BRANCH, ret); /* Either x */ 347 regoptail(ret, regnode(BACK)); /* and loop */ 348 regoptail(ret, ret); /* back */ 349 regtail(ret, regnode(BRANCH)); /* or */ 350 regtail(ret, regnode(NOTHING)); /* null. */ 351 } else if (op == '+' && (flags&SIMPLE)) 352 reginsert(PLUS, ret); 353 else if (op == '+') { 354 /* Emit x+ as x(&|), where & means "self". */ 355 next = regnode(BRANCH); /* Either */ 356 regtail(ret, next); 357 regtail(regnode(BACK), ret); /* loop back */ 358 regtail(next, regnode(BRANCH)); /* or */ 359 regtail(ret, regnode(NOTHING)); /* null. */ 360 } else if (op == '?') { 361 /* Emit x? as (x|) */ 362 reginsert(BRANCH, ret); /* Either x */ 363 regtail(ret, regnode(BRANCH)); /* or */ 364 next = regnode(NOTHING); /* null. */ 365 regtail(ret, next); 366 regoptail(ret, next); 367 } 368 regparse++; 369 if (ISMULT(*regparse)) 370 FAIL("nested *?+"); 371 372 return(ret); 373 } 374 375 /* 376 - regatom - the lowest level 377 * 378 * Optimization: gobbles an entire sequence of ordinary characters so that 379 * it can turn them into a single node, which is smaller to store and 380 * faster to run. Backslashed characters are exceptions, each becoming a 381 * separate node; the code is simpler that way and it's not worth fixing. 382 */ 383 static char * 384 regatom(flagp) 385 int *flagp; 386 { 387 register char *ret; 388 int flags; 389 390 *flagp = WORST; /* Tentatively. */ 391 392 switch (*regparse++) { 393 case '^': 394 ret = regnode(BOL); 395 break; 396 case '$': 397 ret = regnode(EOL); 398 break; 399 case '.': 400 ret = regnode(ANY); 401 *flagp |= HASWIDTH|SIMPLE; 402 break; 403 case '[': { 404 register int clss; 405 register int classend; 406 407 if (*regparse == '^') { /* Complement of range. */ 408 ret = regnode(ANYBUT); 409 regparse++; 410 } else 411 ret = regnode(ANYOF); 412 if (*regparse == ']' || *regparse == '-') 413 regc(*regparse++); 414 while (*regparse != '\0' && *regparse != ']') { 415 if (*regparse == '-') { 416 regparse++; 417 if (*regparse == ']' || *regparse == '\0') 418 regc('-'); 419 else { 420 clss = UCHARAT(regparse-2)+1; 421 classend = UCHARAT(regparse); 422 if (clss > classend+1) 423 FAIL("invalid [] range"); 424 for (; clss <= classend; clss++) 425 regc((char)clss); 426 regparse++; 427 } 428 } else 429 regc(*regparse++); 430 } 431 regc('\0'); 432 if (*regparse != ']') 433 FAIL("unmatched []"); 434 regparse++; 435 *flagp |= HASWIDTH|SIMPLE; 436 } 437 break; 438 case '(': 439 ret = reg(1, &flags); 440 if (ret == NULL) 441 return(NULL); 442 *flagp |= flags&(HASWIDTH|SPSTART); 443 break; 444 case '\0': 445 case '|': 446 case ')': 447 FAIL("internal urp"); /* Supposed to be caught earlier. */ 448 /* NOTREACHED */ 449 break; 450 case '?': 451 case '+': 452 case '*': 453 FAIL("?+* follows nothing"); 454 /* NOTREACHED */ 455 break; 456 case '\\': 457 if (*regparse == '\0') 458 FAIL("trailing \\"); 459 ret = regnode(EXACTLY); 460 regc(*regparse++); 461 regc('\0'); 462 *flagp |= HASWIDTH|SIMPLE; 463 break; 464 default: { 465 register int len; 466 register char ender; 467 468 regparse--; 469 len = strcspn(regparse, META); 470 if (len <= 0) 471 FAIL("internal disaster"); 472 ender = *(regparse+len); 473 if (len > 1 && ISMULT(ender)) 474 len--; /* Back off clear of ?+* operand. */ 475 *flagp |= HASWIDTH; 476 if (len == 1) 477 *flagp |= SIMPLE; 478 ret = regnode(EXACTLY); 479 while (len > 0) { 480 regc(*regparse++); 481 len--; 482 } 483 regc('\0'); 484 } 485 break; 486 } 487 488 return(ret); 489 } 490 491 /* 492 - regnode - emit a node 493 */ 494 static char * /* Location. */ 495 regnode(op) 496 int op; 497 { 498 register char *ret; 499 register char *ptr; 500 501 ret = regcode; 502 if (ret == ®dummy) { 503 regsize += 3; 504 return(ret); 505 } 506 507 ptr = ret; 508 *ptr++ = (char)op; 509 *ptr++ = '\0'; /* Null "next" pointer. */ 510 *ptr++ = '\0'; 511 regcode = ptr; 512 513 return(ret); 514 } 515 516 /* 517 - regc - emit (if appropriate) a byte of code 518 */ 519 static void 520 regc(b) 521 int b; 522 { 523 if (regcode != ®dummy) 524 *regcode++ = (char)b; 525 else 526 regsize++; 527 } 528 529 /* 530 - reginsert - insert an operator in front of already-emitted operand 531 * 532 * Means relocating the operand. 533 */ 534 static void 535 reginsert(op, opnd) 536 int op; 537 char *opnd; 538 { 539 register char *src; 540 register char *dst; 541 register char *place; 542 543 if (regcode == ®dummy) { 544 regsize += 3; 545 return; 546 } 547 548 src = regcode; 549 regcode += 3; 550 dst = regcode; 551 while (src > opnd) 552 *--dst = *--src; 553 554 place = opnd; /* Op node, where operand used to be. */ 555 *place++ = (char)op; 556 *place++ = '\0'; 557 *place = '\0'; 558 } 559 560 /* 561 - regtail - set the next-pointer at the end of a node chain 562 */ 563 static void 564 regtail(p, val) 565 char *p; 566 char *val; 567 { 568 register char *scan; 569 register char *temp; 570 register int offset; 571 572 if (p == ®dummy) 573 return; 574 575 /* Find last node. */ 576 scan = p; 577 for (;;) { 578 temp = regnext(scan); 579 if (temp == NULL) 580 break; 581 scan = temp; 582 } 583 584 if (OP(scan) == BACK) 585 offset = scan - val; 586 else 587 offset = val - scan; 588 *(scan+1) = (char)(offset>>8)&0377; 589 *(scan+2) = (char)offset&0377; 590 } 591 592 /* 593 - regoptail - regtail on operand of first argument; nop if operandless 594 */ 595 static void 596 regoptail(p, val) 597 char *p; 598 char *val; 599 { 600 /* "Operandless" and "op != BRANCH" are synonymous in practice. */ 601 if (p == NULL || p == ®dummy || OP(p) != BRANCH) 602 return; 603 regtail(OPERAND(p), val); 604 } 605 606 /* 607 * regexec and friends 608 */ 609 610 /* 611 * Global work variables for regexec(). 612 */ 613 static char *reginput; /* String-input pointer. */ 614 NOTSTATIC char *regbol; /* Beginning of input, for ^ check. */ 615 static char **regstartp; /* Pointer to startp array. */ 616 static char **regendp; /* Ditto for endp. */ 617 618 /* 619 * Forwards. 620 */ 621 622 NOTSTATIC int regtry(); 623 STATIC int regmatch(); 624 STATIC int regrepeat(); 625 626 #ifdef DEBUG 627 int regnarrate = 0; 628 void regdump(); 629 STATIC char *regprop(); 630 #endif 631 632 #if 0 633 /* 634 - regexec - match a regexp against a string 635 */ 636 int 637 regexec(prog, string, stringlength, matchlength) 638 register regexp *prog; 639 register char *string; /* note: CURRENTLY ASSUMED TO BE NULL-TERMINATED!!! */ 640 int stringlength; /* length of string */ 641 int *matchlength; /* number of chars matched (or to be skipped) */ 642 /* set when MATCH or CANT_MATCH */ 643 { 644 register char *s; 645 extern char *strchr(); 646 647 /* Be paranoid... */ 648 if (prog == NULL || string == NULL) { 649 regerror("NULL parameter"); 650 return(EXP_TCLERROR); 651 } 652 653 /* Check validity of program. */ 654 if (UCHARAT(prog->program) != MAGIC) { 655 regerror("corrupted program"); 656 return(EXP_KM_ERROR); 657 } 658 659 #if THIS_RUINS_EXP 660 /* no need for this shortcut anyway */ 661 /* If there is a "must appear" string, look for it. */ 662 if (prog->regmust != NULL) { 663 s = string; 664 while ((s = strchr(s, prog->regmust[0])) != NULL) { 665 if (strncmp(s, prog->regmust, prog->regmlen) == 0) 666 break; /* Found it. */ 667 s++; 668 } 669 if (s == NULL) /* Not present. */ 670 return(0); 671 } 672 #endif 673 674 /* Mark beginning of line for ^ . */ 675 regbol = string; 676 677 /* Simplest case: anchored match need be tried only once. */ 678 if (prog->reganch) { 679 int r = regtry(prog,string,matchlength); 680 if (r == CANT_MATCH) *matchlength = stringlength; 681 return(r); 682 } 683 684 /* Messy cases: unanchored match. */ 685 s = string; 686 if (prog->regstart != '\0') { 687 register char *s2 = s; 688 689 /* We know what char it must start with. */ 690 while (1) { 691 int r; 692 693 s2 = strchr(s2,prog->regstart); 694 if (s2 == 0) { 695 *matchlength = stringlength; 696 return(CANT_MATCH); 697 } 698 r = regtry(prog,s2,matchlength); 699 if (r == CANT_MATCH) { 700 s2++; 701 continue; 702 } 703 if (s2 == s) return(r); 704 *matchlength = s2-s; 705 return CANT_MATCH; 706 } 707 } else { 708 /* We don't -- general case. */ 709 register char *s2 = s; 710 int r = regtry(prog,s,matchlength); 711 if (r == EXP_MATCH) return(r); 712 else if (r == EXP_CANMATCH) return(r); 713 /* at this point, we know some characters at front */ 714 /* of string don't match */ 715 for (s2++;*s2;s2++) { 716 r = regtry(prog,s2,matchlength); 717 if (r == CANT_MATCH) continue; 718 /* if we match or can_match, say cant_match and */ 719 /* record the number of chars at front that don't match */ 720 *matchlength = s2-s; 721 return(CANT_MATCH); 722 } 723 /* made it thru string with CANT_MATCH all the way */ 724 *matchlength = stringlength; 725 return(CANT_MATCH); 726 } 727 } 728 #endif 729 730 /* 731 - regtry - try match at specific point 732 */ 733 /* return CAN_MATCH, CANT_MATCH or MATCH */ 734 int /* 0 failure, 1 success */ 735 regtry(prog, string, matchlength) 736 regexp *prog; 737 char *string; 738 int *matchlength; /* only set for MATCH */ 739 { 740 register int i; 741 register char **sp; 742 register char **ep; 743 int r; /* result of regmatch */ 744 745 reginput = string; 746 regstartp = prog->startp; 747 regendp = prog->endp; 748 749 sp = prog->startp; 750 ep = prog->endp; 751 for (i = NSUBEXP; i > 0; i--) { 752 *sp++ = NULL; 753 *ep++ = NULL; 754 } 755 r = regmatch(prog->program + 1); 756 if (EXP_MATCH == r) { 757 prog->startp[0] = string; 758 prog->endp[0] = reginput; 759 *matchlength = reginput-string; 760 return(EXP_MATCH); 761 } 762 return(r); /* CAN_MATCH or CANT_MATCH */ 763 } 764 765 /* 766 - regmatch - main matching routine 767 * 768 * Conceptually the strategy is simple: check to see whether the current 769 * node matches, call self recursively to see whether the rest matches, 770 * and then act accordingly. In practice we make some effort to avoid 771 * recursion, in particular by going through "ordinary" nodes (that don't 772 * need to know whether the rest of the match failed) by a loop instead of 773 * by recursion. 774 */ 775 /* returns CAN, CANT or MATCH */ 776 static int /* 0 failure, 1 success */ 777 regmatch(prog) 778 char *prog; 779 { 780 register char *scan; /* Current node. */ 781 char *next; /* Next node. */ 782 #ifndef strchr /* May be #defined to something else */ 783 extern char *strchr(); 784 #endif 785 786 scan = prog; 787 #ifdef DEBUG 788 if (scan != NULL && regnarrate) 789 fprintf(stderr, "%s(\n", regprop(scan)); 790 #endif 791 while (scan != NULL) { 792 #ifdef DEBUG 793 if (regnarrate) 794 fprintf(stderr, "%s...\n", regprop(scan)); 795 #endif 796 next = regnext(scan); 797 798 switch (OP(scan)) { 799 case BOL: 800 if (reginput != regbol) 801 /* return(0);*/ 802 return(EXP_CANTMATCH); 803 break; 804 case EOL: 805 if (*reginput != '\0') 806 /* return(0);*/ 807 /* note this implies that "$" must match everything received to this point! */ 808 return(EXP_CANTMATCH); 809 break; 810 case ANY: 811 if (*reginput == '\0') 812 /* return(0);*/ 813 return(EXP_CANMATCH); 814 reginput++; 815 break; 816 case EXACTLY: { 817 /* register int len;*/ 818 register char *opnd; 819 820 opnd = OPERAND(scan); 821 822 /* this section of code is totally rewritten - DEL */ 823 /* group of literal chars in pattern */ 824 /* compare each one */ 825 do { 826 if (*opnd != *reginput) { 827 if (*reginput == '\0') { 828 return EXP_CANMATCH; 829 } else return EXP_CANTMATCH; 830 } 831 832 reginput++; 833 opnd++; 834 } while (*opnd != '\0'); 835 } 836 break; 837 case ANYOF: 838 /* if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) == NULL) 839 return(0); 840 */ 841 if (*reginput == '\0') 842 return(EXP_CANMATCH); 843 if (strchr(OPERAND(scan),*reginput) == NULL) 844 return(EXP_CANTMATCH); 845 reginput++; 846 break; 847 case ANYBUT: 848 /* if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) != NULL) 849 return(0); 850 */ 851 if (*reginput == '\0') 852 return(EXP_CANMATCH); 853 if (strchr(OPERAND(scan),*reginput) != NULL) 854 return(EXP_CANTMATCH); 855 reginput++; 856 break; 857 case NOTHING: 858 break; 859 case BACK: 860 break; 861 case OPEN+1: 862 case OPEN+2: 863 case OPEN+3: 864 case OPEN+4: 865 case OPEN+5: 866 case OPEN+6: 867 case OPEN+7: 868 case OPEN+8: 869 case OPEN+9: { 870 register int no; 871 register char *save; 872 int r; /* result of regmatch */ 873 874 doOpen: 875 no = OP(scan) - OPEN; 876 save = reginput; 877 878 r = regmatch(next); 879 if (r == EXP_MATCH) { 880 /* 881 * Don't set startp if some later 882 * invocation of the same parentheses 883 * already has. 884 */ 885 if (regstartp[no] == NULL) 886 regstartp[no] = save; 887 } 888 return(r); 889 } 890 /* NOTREACHED */ 891 break; 892 case CLOSE+1: 893 case CLOSE+2: 894 case CLOSE+3: 895 case CLOSE+4: 896 case CLOSE+5: 897 case CLOSE+6: 898 case CLOSE+7: 899 case CLOSE+8: 900 case CLOSE+9: { 901 register int no; 902 register char *save; 903 int r; /* result of regmatch */ 904 905 doClose: 906 no = OP(scan) - CLOSE; 907 save = reginput; 908 909 r = regmatch(next); 910 if (r == EXP_MATCH) { 911 /* 912 * Don't set endp if some later 913 * invocation of the same parentheses 914 * already has. 915 */ 916 if (regendp[no] == NULL) 917 regendp[no] = save; 918 } 919 return(r); 920 } 921 /* NOTREACHED */ 922 break; 923 case BRANCH: { 924 register char *save; 925 int match_status; 926 927 if (OP(next) != BRANCH) /* No choice. */ 928 next = OPERAND(scan); /* Avoid recursion. */ 929 else { 930 match_status = EXP_CANTMATCH; 931 932 do { 933 int r; 934 935 save = reginput; 936 r = regmatch(OPERAND(scan)); 937 if (r == EXP_MATCH) return(r); 938 if (r == EXP_CANMATCH) { 939 match_status = r; 940 } 941 reginput = save; 942 scan = regnext(scan); 943 } while (scan != NULL && OP(scan) == BRANCH); 944 return(match_status); 945 /* NOTREACHED */ 946 } 947 } 948 /* NOTREACHED */ 949 break; 950 case STAR: 951 case PLUS: { 952 register char nextch; 953 register int no; 954 register char *save; 955 register int min; 956 int match_status; 957 958 if (*reginput == '\0') return EXP_CANMATCH; 959 960 /* 961 * Lookahead to avoid useless match attempts 962 * when we know what character comes next. 963 */ 964 match_status = EXP_CANTMATCH; 965 nextch = '\0'; 966 if (OP(next) == EXACTLY) 967 nextch = *OPERAND(next); 968 min = (OP(scan) == STAR) ? 0 : 1; 969 save = reginput; 970 no = regrepeat(OPERAND(scan)); 971 while (no >= min) { 972 /* If it could work, try it. */ 973 /* 3rd condition allows for CAN_MATCH */ 974 if (nextch == '\0' || *reginput == nextch || *reginput == '\0') { 975 int r = regmatch(next); 976 if (r == EXP_MATCH) 977 return(EXP_MATCH); 978 if (r == EXP_CANMATCH) 979 /* match_status = r;*/ 980 return(EXP_CANMATCH); 981 } 982 /* Couldn't or didn't -- back up. */ 983 no--; 984 reginput = save + no; 985 } 986 return(match_status); 987 } 988 /* NOTREACHED */ 989 break; 990 case END: 991 /* Success! */ 992 if (*reginput == '\0') { 993 return(EXP_CANMATCH); 994 } else { 995 return(EXP_MATCH); 996 } 997 /* return(EXP_CANMATCH); Success! */ 998 /* NOTREACHED */ 999 break; 1000 default: 1001 if (OP(scan) > OPEN && OP(scan) < OPEN+NSUBEXP) { 1002 goto doOpen; 1003 } else if (OP(scan) > CLOSE && OP(scan) < CLOSE+NSUBEXP) { 1004 goto doClose; 1005 } 1006 regerror("memory corruption"); 1007 return(EXP_TCLERROR); 1008 /* NOTREACHED */ 1009 break; 1010 } 1011 1012 scan = next; 1013 } 1014 1015 /* 1016 * We get here only if there's trouble -- normally "case END" is 1017 * the terminating point. 1018 */ 1019 regerror("corrupted pointers"); 1020 return(EXP_TCLERROR); 1021 } 1022 1023 /* 1024 - regrepeat - repeatedly match something simple, report how many 1025 */ 1026 static int 1027 regrepeat(p) 1028 char *p; 1029 { 1030 register int count = 0; 1031 register char *scan; 1032 register char *opnd; 1033 #ifndef strchr /* May be #defined to something else */ 1034 /*DEL*/ extern char *strchr(); 1035 #endif 1036 1037 scan = reginput; 1038 opnd = OPERAND(p); 1039 switch (OP(p)) { 1040 case ANY: 1041 count = strlen(scan); 1042 scan += count; 1043 break; 1044 case EXACTLY: 1045 while (*opnd == *scan) { 1046 count++; 1047 scan++; 1048 } 1049 break; 1050 case ANYOF: 1051 while (*scan != '\0' && strchr(opnd, *scan) != NULL) { 1052 count++; 1053 scan++; 1054 } 1055 break; 1056 case ANYBUT: 1057 while (*scan != '\0' && strchr(opnd, *scan) == NULL) { 1058 count++; 1059 scan++; 1060 } 1061 break; 1062 default: /* Oh dear. Called inappropriately. */ 1063 regerror("internal foulup"); 1064 count = 0; /* Best compromise. */ 1065 break; 1066 } 1067 reginput = scan; 1068 1069 return(count); 1070 } 1071 1072 /* 1073 - regnext - dig the "next" pointer out of a node 1074 */ 1075 static char * 1076 regnext(p) 1077 register char *p; 1078 { 1079 register int offset; 1080 1081 if (p == ®dummy) 1082 return(NULL); 1083 1084 offset = NEXT(p); 1085 if (offset == 0) 1086 return(NULL); 1087 1088 if (OP(p) == BACK) 1089 return(p-offset); 1090 else 1091 return(p+offset); 1092 } 1093 1094 #ifdef DEBUG 1095 1096 STATIC char *regprop(); 1097 1098 /* 1099 - regdump - dump a regexp onto stdout in vaguely comprehensible form 1100 */ 1101 void 1102 regdump(r) 1103 regexp *r; 1104 { 1105 register char *s; 1106 register char op = EXACTLY; /* Arbitrary non-END op. */ 1107 register char *next; 1108 extern char *strchr(); 1109 1110 1111 s = r->program + 1; 1112 while (op != END) { /* While that wasn't END last time... */ 1113 op = OP(s); 1114 printf("%2d%s", s-r->program, regprop(s)); /* Where, what. */ 1115 next = regnext(s); 1116 if (next == NULL) /* Next ptr. */ 1117 printf("(0)"); 1118 else 1119 printf("(%d)", (s-r->program)+(next-s)); 1120 s += 3; 1121 if (op == ANYOF || op == ANYBUT || op == EXACTLY) { 1122 /* Literal string, where present. */ 1123 while (*s != '\0') { 1124 putchar(*s); 1125 s++; 1126 } 1127 s++; 1128 } 1129 putchar('\n'); 1130 } 1131 1132 /* Header fields of interest. */ 1133 if (r->regstart != '\0') 1134 printf("start `%c' ", r->regstart); 1135 if (r->reganch) 1136 printf("anchored "); 1137 if (r->regmust != NULL) 1138 printf("must have \"%s\"", r->regmust); 1139 printf("\n"); 1140 } 1141 1142 /* 1143 - regprop - printable representation of opcode 1144 */ 1145 static char * 1146 regprop(op) 1147 char *op; 1148 { 1149 register char *p; 1150 static char buf[50]; 1151 1152 (void) strcpy(buf, ":"); 1153 1154 switch (OP(op)) { 1155 case BOL: 1156 p = "BOL"; 1157 break; 1158 case EOL: 1159 p = "EOL"; 1160 break; 1161 case ANY: 1162 p = "ANY"; 1163 break; 1164 case ANYOF: 1165 p = "ANYOF"; 1166 break; 1167 case ANYBUT: 1168 p = "ANYBUT"; 1169 break; 1170 case BRANCH: 1171 p = "BRANCH"; 1172 break; 1173 case EXACTLY: 1174 p = "EXACTLY"; 1175 break; 1176 case NOTHING: 1177 p = "NOTHING"; 1178 break; 1179 case BACK: 1180 p = "BACK"; 1181 break; 1182 case END: 1183 p = "END"; 1184 break; 1185 case OPEN+1: 1186 case OPEN+2: 1187 case OPEN+3: 1188 case OPEN+4: 1189 case OPEN+5: 1190 case OPEN+6: 1191 case OPEN+7: 1192 case OPEN+8: 1193 case OPEN+9: 1194 sprintf(buf+strlen(buf), "OPEN%d", OP(op)-OPEN); 1195 p = NULL; 1196 break; 1197 case CLOSE+1: 1198 case CLOSE+2: 1199 case CLOSE+3: 1200 case CLOSE+4: 1201 case CLOSE+5: 1202 case CLOSE+6: 1203 case CLOSE+7: 1204 case CLOSE+8: 1205 case CLOSE+9: 1206 sprintf(buf+strlen(buf), "CLOSE%d", OP(op)-CLOSE); 1207 p = NULL; 1208 break; 1209 case STAR: 1210 p = "STAR"; 1211 break; 1212 case PLUS: 1213 p = "PLUS"; 1214 break; 1215 default: 1216 if (OP(op) > OPEN && OP(op) < OPEN+NSUBEXP) { 1217 sprintf(buf+strlen(buf), "OPEN%d", OP(op)-OPEN); 1218 p = NULL; 1219 break; 1220 } else if (OP(op) > CLOSE && OP(op) < CLOSE+NSUBEXP) { 1221 sprintf(buf+strlen(buf), "CLOSE%d", OP(op)-CLOSE); 1222 p = NULL; 1223 } else { 1224 TclRegError("corrupted opcode"); 1225 } 1226 break; 1227 } 1228 if (p != NULL) 1229 (void) strcat(buf, p); 1230 return(buf); 1231 } 1232 #endif 1233 1234 /* 1235 * The following is provided for those people who do not have strcspn() in 1236 * their C libraries. They should get off their butts and do something 1237 * about it; at least one public-domain implementation of those (highly 1238 * useful) string routines has been published on Usenet. 1239 */ 1240 #ifdef STRCSPN 1241 /* 1242 * strcspn - find length of initial segment of s1 consisting entirely 1243 * of characters not from s2 1244 */ 1245 1246 static int 1247 strcspn(s1, s2) 1248 char *s1; 1249 char *s2; 1250 { 1251 register char *scan1; 1252 register char *scan2; 1253 register int count; 1254 1255 count = 0; 1256 for (scan1 = s1; *scan1 != '\0'; scan1++) { 1257 for (scan2 = s2; *scan2 != '\0';) /* ++ moved down. */ 1258 if (*scan1 == *scan2++) 1259 return(count); 1260 count++; 1261 } 1262 return(count); 1263 } 1264 #endif 1265 #endif /* 0 WHOLE FILE */ 1266