Home | History | Annotate | Download | only in awk_xpg4
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 /*
     27  * Copyright 1986, 1994 by Mortice Kern Systems Inc.  All rights reserved.
     28  */
     29 
     30 #pragma ident	"%Z%%M%	%I%	%E% SMI"
     31 
     32 /*
     33  * awk -- mainline, yylex, etc.
     34  *
     35  * Based on MKS awk(1) ported to be /usr/xpg4/bin/awk with POSIX/XCU4 changes
     36  */
     37 
     38 #include "awk.h"
     39 #include "y.tab.h"
     40 #include <stdarg.h>
     41 #include <unistd.h>
     42 #include <locale.h>
     43 #include <search.h>
     44 
        /*
         * Lexer input state.  The program text comes either from the files
         * named with -f (progfiles[] / progfp) or from an in-memory wide string
         * (progptr / proglen); see lexgetc().
         */
     45 static char	*progfiles[NPFILE];	/* Program files named by -f, read by lexgetc */
     46 static char	**progfilep = &progfiles[0]; /* Next free slot (one past last file) */
     47 static wchar_t	*progptr;		/* In-memory program, or string being unescaped */
     48 static int	proglen;		/* Characters remaining at progptr */
     49 static wchar_t	context[NCONTEXT];	/* Circular buffer of recently read characters */
     50 static wchar_t	*conptr = &context[0];	/* Next write position in context[] */
     51 static FILE	*progfp;		/* Stdio stream for the current program file */
     52 static char	*filename;		/* Program source name used in error messages */
     53 #ifdef	DEBUG
     54 static int	dflag;			/* Set by -d: yylex prints each token value */
     55 #endif
     56 
     57 #define	AWK_EXEC_MAGIC	"<MKS AWKC>"
     58 #define	LEN_EXEC_MAGIC	10
     59 
        /*
         * Message used by yylex for unmatched (), {} and [].  The text holds no
         * conversion specifier, so the bracket-pair argument that yylex passes
         * along with it is not printed.
         */
     60 static char	unbal[] = "unbalanced E char";
     61 
        /* Helpers private to this file. */
     62 static void	awkarginit(int c, char **av);
     63 static int	lexid(wint_t c);
     64 static int	lexnumber(wint_t c);
     65 static int	lexstring(wint_t endc);
     66 static int	lexregexp(wint_t endc);
     67 
     68 static void	awkvarinit(void);
     69 static wint_t	lexgetc(void);
     70 static void	lexungetc(wint_t c);
     71 static size_t	lexescape(wint_t endc, int regx, int cmd_line_operand);
     72 static void	awkierr(int perr, char *fmt, va_list ap);
     73 static int	usage(void);
     74 void		strescape(wchar_t *str);
     75 static const char	*toprint(wint_t);
        /* Command name (argv[0]) used as the prefix of every error message. */
     76 char *_cmdname;
     77 static wchar_t *mbconvert(char *str);
     78 
     79 extern int	isclvar(wchar_t *arg);
     80 
     81 /*
     82  * mainline for awk
     83  */
     84 int
     85 main(int argc, char *argv[])
     86 {
     87 	wchar_t *ap;
     88 	char *cmd;
     89 
     90 	cmd = argv[0];
     91 	_cmdname = cmd;
     92 
     93 	linebuf = emalloc(NLINE * sizeof (wchar_t));
     94 
     95 	/*
     96 	 * At this point only messaging should be internationalized.
     97 	 * numbers are still scanned as in the Posix locale.
     98 	 */
     99 	(void) setlocale(LC_ALL, "");
    100 	(void) setlocale(LC_NUMERIC, "C");
    101 #if !defined(TEXT_DOMAIN)
    102 #define	TEXT_DOMAIN	"SYS_TEST"
    103 #endif
    104 	(void) textdomain(TEXT_DOMAIN);
    105 
    106 	awkvarinit();
    107 	/* running = 1; */
    108 	while (argc > 1 && *argv[1] == '-') {
    109 		void *save_ptr = NULL;
    110 		ap = mbstowcsdup(&argv[1][1]);
    111 		if (ap == NULL)
    112 			break;
    113 		if (*ap == '\0') {
    114 			free(ap);
    115 			break;
    116 		}
    117 		save_ptr = (void *) ap;
    118 		++argv;
    119 		--argc;
    120 		if (*ap == '-' && ap[1] == '\0')
    121 			break;
    122 		for (; *ap != '\0'; ++ap) {
    123 			switch (*ap) {
    124 #ifdef DEBUG
    125 			case 'd':
    126 				dflag = 1;
    127 				continue;
    128 
    129 #endif
    130 			case 'f':
    131 				if (argc < 2) {
    132 					(void) fprintf(stderr,
    133 				gettext("Missing script file\n"));
    134 					return (1);
    135 				}
    136 				*progfilep++ = argv[1];
    137 				--argc;
    138 				++argv;
    139 				continue;
    140 
    141 			case 'F':
    142 				if (ap[1] == '\0') {
    143 					if (argc < 2) {
    144 						(void) fprintf(stderr,
    145 				gettext("Missing field separator\n"));
    146 						return (1);
    147 					}
    148 					ap = mbstowcsdup(argv[1]);
    149 					--argc;
    150 					++argv;
    151 				} else
    152 					++ap;
    153 				strescape(ap);
    154 				strassign(varFS, linebuf, FALLOC,
    155 				    wcslen(linebuf));
    156 				break;
    157 
    158 			case 'v': {
    159 				wchar_t *vp;
    160 				wchar_t *arg;
    161 
    162 				if (argc < 2) {
    163 					(void) fprintf(stderr,
    164 		gettext("Missing variable assignment\n"));
    165 					return (1);
    166 				}
    167 				arg = mbconvert(argv[1]);
    168 				/*
    169 				 * Ensure the variable expression
    170 				 * is valid (correct form).
    171 				 */
    172 				if (((vp = wcschr(arg, '=')) != NULL) &&
    173 				    isclvar(arg)) {
    174 					*vp = '\0';
    175 					strescape(vp+1);
    176 					strassign(vlook(arg), linebuf,
    177 					    FALLOC|FSENSE,
    178 					    wcslen(linebuf));
    179 					*vp = '=';
    180 				} else {
    181 					(void) fprintf(stderr, gettext(
    182 					    "Invalid form for variable "
    183 					    "assignment: %S\n"), arg);
    184 					return (1);
    185 				}
    186 				--argc;
    187 				++argv;
    188 				continue;
    189 			}
    190 
    191 			default:
    192 				(void) fprintf(stderr,
    193 				gettext("Unknown option \"-%S\"\n"), ap);
    194 				return (usage());
    195 			}
    196 			break;
    197 		}
    198 		if (save_ptr)
    199 			free(save_ptr);
    200 	}
    201 	if (progfilep == &progfiles[0]) {
    202 		if (argc < 2)
    203 			return (usage());
    204 		filename = "[command line]";	/* BUG: NEEDS TRANSLATION */
    205 		progptr = mbstowcsdup(argv[1]);
    206 		proglen = wcslen(progptr);
    207 		--argc;
    208 		++argv;
    209 	}
    210 
    211 	argv[0] = cmd;
    212 
    213 	awkarginit(argc, argv);
    214 
    215 	/* running = 0; */
    216 	(void) yyparse();
    217 
    218 	lineno = 0;
    219 	/*
    220 	 * Ok, done parsing, so now activate the rest of the nls stuff, set
    221 	 * the radix character.
    222 	 */
    223 	(void) setlocale(LC_ALL, "");
    224 	radixpoint = *localeconv()->decimal_point;
    225 	awk();
    226 	/* NOTREACHED */
    227 	return (0);
    228 }
    229 
    230 /*
    231  * Do initial setup of buffers, etc.
    232  * This must be called before most processing
    233  * and especially before lexical analysis.
    234  * Variables initialised here will be overruled by command
    235  * line parameter initialisation.
    236  */
    237 static void
    238 awkvarinit()
    239 {
    240 	NODE *np;
    241 
    242 	(void) setvbuf(stderr, NULL, _IONBF, 0);
    243 
    244 	if ((NIOSTREAM = sysconf(_SC_OPEN_MAX) - 4) <= 0) {
    245 		(void) fprintf(stderr,
    246 	gettext("not enough available file descriptors"));
    247 		exit(1);
    248 	}
    249 	ofiles = (OFILE *)emalloc(sizeof (OFILE)*NIOSTREAM);
    250 #ifdef A_ZERO_POINTERS
    251 	(void) memset((wchar_t *)ofiles, 0, sizeof (OFILE) * NIOSTREAM);
    252 #else
    253 	{
    254 		/* initialize file descriptor table */
    255 		OFILE *fp;
    256 		for (fp = ofiles; fp < &ofiles[NIOSTREAM]; fp += 1) {
    257 			fp->f_fp = FNULL;
    258 					fp->f_mode = 0;
    259 					fp->f_name = (char *)0;
    260 		}
    261 	}
    262 #endif
    263 	constant = intnode((INT)0);
    264 
    265 	const0 = intnode((INT)0);
    266 	const1 = intnode((INT)1);
    267 	constundef = emptynode(CONSTANT, 0);
    268 	constundef->n_flags = FSTRING|FVINT;
    269 	constundef->n_string = _null;
    270 	constundef->n_strlen = 0;
    271 	inc_oper = emptynode(ADD, 0);
    272 	inc_oper->n_right = const1;
    273 	asn_oper = emptynode(ADD, 0);
    274 	field0 = node(FIELD, const0, NNULL);
    275 
    276 	{
    277 		RESFUNC near*rp;
    278 
    279 		for (rp = &resfuncs[0]; rp->rf_name != (LOCCHARP)NULL; ++rp) {
    280 			np = finstall(rp->rf_name, rp->rf_func, rp->rf_type);
    281 		}
    282 	}
    283 	{
    284 		RESERVED near*rp;
    285 
    286 		for (rp = &reserved[0]; rp->r_name != (LOCCHARP)NULL; ++rp) {
    287 			switch (rp->r_type) {
    288 			case SVAR:
    289 			case VAR:
    290 				running = 1;
    291 				np = vlook(rp->r_name);
    292 				if (rp->r_type == SVAR)
    293 					np->n_flags |= FSPECIAL;
    294 				if (rp->r_svalue != NULL)
    295 					strassign(np, rp->r_svalue, FSTATIC,
    296 					    (size_t)rp->r_ivalue);
    297 				else {
    298 					constant->n_int = rp->r_ivalue;
    299 					(void) assign(np, constant);
    300 				}
    301 				running = 0;
    302 				break;
    303 
    304 			case KEYWORD:
    305 				kinstall(rp->r_name, (int)rp->r_ivalue);
    306 				break;
    307 			}
    308 		}
    309 	}
    310 
    311 	varNR = vlook(s_NR);
    312 	varFNR = vlook(s_FNR);
    313 	varNF = vlook(s_NF);
    314 	varOFMT = vlook(s_OFMT);
    315 	varCONVFMT = vlook(s_CONVFMT);
    316 	varOFS = vlook(s_OFS);
    317 	varORS = vlook(s_ORS);
    318 	varRS = vlook(s_RS);
    319 	varFS = vlook(s_FS);
    320 	varARGC = vlook(s_ARGC);
    321 	varSUBSEP = vlook(s_SUBSEP);
    322 	varENVIRON = vlook(s_ENVIRON);
    323 	varFILENAME = vlook(s_FILENAME);
    324 	varSYMTAB = vlook(s_SYMTAB);
    325 	incNR = node(ASG, varNR, node(ADD, varNR, const1));
    326 	incFNR = node(ASG, varFNR, node(ADD, varFNR, const1));
    327 	clrFNR = node(ASG, varFNR, const0);
    328 }
    329 
    330 /*
    331  * Initialise awk ARGC, ARGV variables.
    332  */
    333 static void
    334 awkarginit(int ac, char **av)
    335 {
    336 	int i;
    337 	wchar_t *cp;
    338 
    339 	ARGVsubi = node(INDEX, vlook(s_ARGV), constant);
    340 	running = 1;
    341 	constant->n_int = ac;
    342 	(void) assign(varARGC, constant);
    343 	for (i = 0; i < ac; ++i) {
    344 		cp = mbstowcsdup(av[i]);
    345 		constant->n_int = i;
    346 		strassign(exprreduce(ARGVsubi), cp,
    347 		    FSTATIC|FSENSE, wcslen(cp));
    348 	}
    349 	running = 0;
    350 }
    351 
    352 /*
    353  * Clean up when done parsing a function.
    354  * All formal parameters, because of a deal (funparm) in
    355  * yylex, get put into the symbol table in front of any
    356  * global variable of the same name.  When the entire
    357  * function is parsed, remove these formal dummy nodes
    358  * from the symbol table but retain the nodes because
    359  * the generated tree points at them.
    360  */
    361 void
    362 uexit(NODE *np)
    363 {
    364 	NODE *formal;
    365 
    366 	while ((formal = getlist(&np)) != NNULL)
    367 		delsymtab(formal, 0);
    368 }
    369 
    370 /*
    371  * The lexical analyzer.
         *
         * Returns the next token of the awk program and, for identifiers,
         * numbers, strings and regular expressions, leaves the token value in
         * yylval (set by lexid/lexnumber/lexstring/lexregexp).
         *
         * Besides plain scanning this routine:
         *  - returns a token held back by the previous call (savetoken);
         *  - scans a regular expression when redelim holds its delimiter;
         *  - turns a newline into ';' or drops it, depending on the previous
         *    token (lexlast);
         *  - inserts ';' before '}' and CONCAT between adjacent terms, using
         *    catterm to remember that the previous token ended a term;
         *  - tracks (), {} and [] nesting and reports unbalanced closers;
         *  - finally maps single-character tokens to symbolic token values
         *    through ctosym[].
         *
         * When DEBUG is defined, yylex() is a wrapper that calls yyhex() (which
         * then owns the scanner body below) and prints each token value if
         * dflag is set.
    372  */
    373 int
    374 yylex()
    375 #ifdef	DEBUG
    376 {
    377 	int l;
    378 
    379 	l = yyhex();
    380 	if (dflag)
    381 		(void) printf("%d\n", l);
    382 	return (l);
    383 }
    384 yyhex()
    385 #endif
    386 {
    387 	wint_t c, c1;
    388 	int i;
        	/* token deferred to the next call (0 when there is none) */
    389 	static int savetoken = 0;
        	/* set to 2 when a CONSTANT follows '$'; counts down once per call */
    390 	static int wasfield;
        	/* set after DEFFUNC so the following VAR is not treated as a term */
    391 	static int isfuncdef;
        	/* current nesting depth of {}, () and [] */
    392 	static int nbrace, nparen, nbracket;
        	/* single-character tokens and the symbolic values they map to */
    393 	static struct ctosymstruct {
    394 		wint_t c, sym;
    395 	} ctosym[] = {
    396 		{ '|', BAR },		{ '^', CARAT },
    397 		{ '~', TILDE },		{ '<', LANGLE },
    398 		{ '>', RANGLE },	{ '+', PLUSC },
    399 		{ '-', HYPHEN },	{ '*', STAR },
    400 		{ '/', SLASH },		{ '%', PERCENT },
    401 		{ '!', EXCLAMATION },	{ '$', DOLLAR },
    402 		{ '[', LSQUARE },	{ ']', RSQUARE },
    403 		{ '(', LPAREN },	{ ')', RPAREN },
    404 		{ ';', SEMI },		{ '{', LBRACE },
    405 		{ '}', RBRACE },	{   0, 0 }
    406 	};
    407 
        	/* a deferred token is delivered without reading any input */
    408 	if (savetoken) {
    409 		c = savetoken;
    410 		savetoken = 0;
    411 	} else if (redelim != '\0') {
        		/*
        		 * A regular expression is expected; redelim is its
        		 * delimiter (set outside this file, presumably by the
        		 * grammar).  The delimiter itself is queued in savetoken
        		 * so it is returned by the next call.
        		 */
    412 		c = redelim;
    413 		redelim = 0;
    414 		catterm = 0;
    415 		savetoken = c;
    416 		return (lexlast = lexregexp(c));
    417 	} else while ((c = lexgetc()) != WEOF) {
    418 		if (iswalpha(c) || c == '_') {
    419 			c = lexid(c);
    420 		} else if (iswdigit(c) || c == '.') {
    421 			c = lexnumber(c);
    422 		} else if (isWblank(c)) {
    423 			continue;
    424 		} else switch (c) {
    425 #if DOS || OS2
    426 		case 032:		/* ^Z */
    427 			continue;
    428 #endif
    429 
    430 		case '"':
    431 			c = lexstring(c);
    432 			break;
    433 
        		/* comment: skip to end of line, leaving the newline unread */
    434 		case '#':
    435 			while ((c = lexgetc()) != '\n' && c != WEOF)
    436 				;
    437 			lexungetc(c);
    438 			continue;
    439 
    440 		case '+':
    441 			if ((c1 = lexgetc()) == '+')
    442 				c = INC;
    443 			else if (c1 == '=')
    444 				c = AADD;
    445 			else
    446 				lexungetc(c1);
    447 			break;
    448 
    449 		case '-':
    450 			if ((c1 = lexgetc()) == '-')
    451 				c = DEC;
    452 			else if (c1 == '=')
    453 				c = ASUB;
    454 			else
    455 				lexungetc(c1);
    456 			break;
    457 
        		/* "*", "*=", "**" (EXP) and "**=" (AEXP) */
    458 		case '*':
    459 			if ((c1 = lexgetc()) == '=')
    460 				c = AMUL;
    461 			else if (c1 == '*') {
    462 				if ((c1 = lexgetc()) == '=')
    463 					c = AEXP;
    464 				else {
    465 					c = EXP;
    466 					lexungetc(c1);
    467 				}
    468 			} else
    469 				lexungetc(c1);
    470 			break;
    471 
    472 		case '^':
    473 			if ((c1 = lexgetc()) == '=') {
    474 				c = AEXP;
    475 			} else {
    476 				c = EXP;
    477 				lexungetc(c1);
    478 			}
    479 			break;
    480 
        		/*
        		 * "/=" is the divide-assign operator only when the
        		 * previous token is none of those listed below; after
        		 * them the '/' is returned alone and the '=' is re-read.
        		 */
    481 		case '/':
    482 			if ((c1 = lexgetc()) == '=' &&
    483 			    lexlast != RE && lexlast != NRE &&
    484 			    lexlast != ';' && lexlast != '\n' &&
    485 			    lexlast != ',' && lexlast != '(')
    486 				c = ADIV;
    487 			else
    488 				lexungetc(c1);
    489 			break;
    490 
    491 		case '%':
    492 			if ((c1 = lexgetc()) == '=')
    493 				c = AREM;
    494 			else
    495 				lexungetc(c1);
    496 			break;
    497 
    498 		case '&':
    499 			if ((c1 = lexgetc()) == '&')
    500 				c = AND;
    501 			else
    502 				lexungetc(c1);
    503 			break;
    504 
        		/* a single '|' inside a print statement is PIPE */
    505 		case '|':
    506 			if ((c1 = lexgetc()) == '|')
    507 				c = OR;
    508 			else {
    509 				lexungetc(c1);
    510 				if (inprint)
    511 					c = PIPE;
    512 			}
    513 			break;
    514 
        		/*
        		 * a single '>' in a print statement, outside any
        		 * parentheses, is the WRITE redirection
        		 */
    515 		case '>':
    516 			if ((c1 = lexgetc()) == '=')
    517 				c = GE;
    518 			else if (c1 == '>')
    519 				c = APPEND;
    520 			else {
    521 				lexungetc(c1);
    522 				if (nparen == 0 && inprint)
    523 					c = WRITE;
    524 			}
    525 			break;
    526 
    527 		case '<':
    528 			if ((c1 = lexgetc()) == '=')
    529 				c = LE;
    530 			else
    531 				lexungetc(c1);
    532 			break;
    533 
    534 		case '!':
    535 			if ((c1 = lexgetc()) == '=')
    536 				c = NE;
    537 			else if (c1 == '~')
    538 				c = NRE;
    539 			else
    540 				lexungetc(c1);
    541 			break;
    542 
    543 		case '=':
    544 			if ((c1 = lexgetc()) == '=')
    545 				c = EQ;
    546 			else {
    547 				lexungetc(c1);
    548 				c = ASG;
    549 			}
    550 			break;
    551 
        		/*
        		 * A newline is dropped after the tokens listed below
        		 * and otherwise becomes a ';' statement terminator.
        		 */
    552 		case '\n':
    553 			switch (lexlast) {
    554 			case ')':
    555 				if (catterm || inprint) {
    556 					c = ';';
    557 					break;
    558 				}
    559 			/*FALLTHRU*/
    560 			case AND:
    561 			case OR:
    562 			case COMMA:
    563 			case '{':
    564 			case ELSE:
    565 			case ';':
    566 			case DO:
    567 				continue;
    568 
    569 			case '}':
    570 				if (nbrace != 0)
    571 					continue;
        				/* at brace depth 0 this falls into default */
    572 
    573 			default:
    574 				c = ';';
    575 				break;
    576 			}
    577 			break;
    578 
        		/*
        		 * NOTE(review): c is a raw input character in this
        		 * switch (keywords are handled by lexid above), so this
        		 * case can only match a character whose code equals the
        		 * ELSE token value -- confirm whether it is reachable.
        		 */
    579 		case ELSE:
    580 			if (lexlast != ';') {
    581 				savetoken = ELSE;
    582 				c = ';';
    583 			}
    584 			break;
    585 
    586 		case '(':
    587 			++nparen;
    588 			break;
    589 
    590 		case ')':
    591 			if (--nparen < 0)
    592 				awkerr(unbal, "()");
    593 			break;
    594 
    595 		case '{':
    596 			nbrace++;
    597 			break;
    598 
        		/*
        		 * '}' is preceded by an implicit ';' when the previous
        		 * token was not one; the brace itself is then deferred.
        		 */
    599 		case '}':
    600 			if (--nbrace < 0) {
    601 				char brk[3];
    602 
    603 				brk[0] = '{';
    604 				brk[1] = '}';
    605 				brk[2] = '\0';
    606 				awkerr(unbal, brk);
    607 			}
    608 			if (lexlast != ';') {
    609 				savetoken = c;
    610 				c = ';';
    611 			}
    612 			break;
    613 
    614 		case '[':
    615 			++nbracket;
    616 			break;
    617 
    618 		case ']':
    619 			if (--nbracket < 0) {
    620 				char brk[3];
    621 
    622 				brk[0] = '[';
    623 				brk[1] = ']';
    624 				brk[2] = '\0';
    625 				awkerr(unbal, brk);
    626 			}
    627 			break;
    628 
        		/* backslash-newline joins lines; a lone backslash is a token */
    629 		case '\\':
    630 			if ((c1 = lexgetc()) == '\n')
    631 				continue;
    632 			lexungetc(c1);
    633 			break;
    634 
    635 		case ',':
    636 			c = COMMA;
    637 			break;
    638 
    639 		case '?':
    640 			c = QUEST;
    641 			break;
    642 
    643 		case ':':
    644 			c = COLON;
    645 			break;
    646 
    647 		default:
    648 			if (!iswprint(c))
    649 				awkerr(
    650 				    gettext("invalid character \"%s\""),
    651 				    toprint(c));
    652 			break;
    653 		}
        		/* a token has been recognised: leave the input loop */
    654 		break;
    655 	}
    656 
        	/*
        	 * Post-processing.  catterm non-zero means the previous token
        	 * ended a term; if the token just read can begin a term it is
        	 * deferred in savetoken and CONCAT is returned in its place.
        	 */
    657 	switch (c) {
    658 	case ']':
    659 		++catterm;
    660 		break;
    661 
    662 	case VAR:
    663 		if (catterm) {
    664 			savetoken = c;
    665 			c = CONCAT;
    666 			catterm = 0;
    667 		} else if (!isfuncdef) {
        			/* a name directly followed by '(' does not end a term */
    668 			if ((c1 = lexgetc()) != '(')
    669 				++catterm;
    670 			lexungetc(c1);
    671 		}
    672 		isfuncdef = 0;
    673 		break;
    674 
    675 	case PARM:
    676 	case CONSTANT:
    677 		if (catterm) {
    678 			savetoken = c;
    679 			c = CONCAT;
    680 			catterm = 0;
    681 		} else {
    682 			if (lexlast == '$')
    683 				wasfield = 2;
    684 			++catterm;
    685 		}
    686 		break;
    687 
        	/*
        	 * INC/DEC are returned as they are unless they follow a
        	 * CONSTANT that was not a field number; in that case they go
        	 * through the CONCAT check below.
        	 */
    688 	case INC:
    689 	case DEC:
    690 		if (!catterm || lexlast != CONSTANT || wasfield)
    691 			break;
    692 
    693 	/*FALLTHRU*/
    694 	case UFUNC:
    695 	case FUNC:
    696 	case GETLINE:
    697 	case '!':
    698 	case '$':
    699 	case '(':
    700 		if (catterm) {
    701 			savetoken = c;
    702 			c = CONCAT;
    703 			catterm = 0;
    704 		}
    705 		break;
    706 
        	/* a '}' closing the outermost block is followed by an implicit ';' */
    707 	/* { */ case '}':
    708 		if (nbrace == 0)
    709 			savetoken = ';';
    710 	/*FALLTHRU*/
    711 	case ';':
    712 		inprint = 0;
    713 	/*FALLTHRU*/
    714 	default:
    715 		if (c == DEFFUNC)
    716 			isfuncdef = 1;
    717 		catterm = 0;
    718 	}
    719 	lexlast = c;
    720 	if (wasfield)
    721 		wasfield--;
    722 	/*
    723 	 * Map character constants to symbolic names.
    724 	 */
    725 	for (i = 0; ctosym[i].c != 0; i++)
    726 		if (c == ctosym[i].c) {
    727 			c = ctosym[i].sym;
    728 			break;
    729 		}
    730 	return ((int)c);
    731 }
    732 
    733 /*
    734  * Read a number for the lexical analyzer.
    735  * Input is the first character of the number.
    736  * Return value is the lexical type.
    737  */
    738 static int
    739 lexnumber(wint_t c)
    740 {
    741 	wchar_t *cp;
    742 	int dotfound = 0;
    743 	int efound = 0;
    744 	INT number;
    745 
    746 	cp = linebuf;
    747 	do {
    748 		if (iswdigit(c))
    749 			;
    750 		else if (c == '.') {
    751 			if (dotfound++)
    752 				break;
    753 		} else if (c == 'e' || c == 'E') {
    754 			if ((c = lexgetc()) != '-' && c != '+') {
    755 				lexungetc(c);
    756 				c = 'e';
    757 			} else
    758 				*cp++ = 'e';
    759 			if (efound++)
    760 				break;
    761 		} else
    762 			break;
    763 		*cp++ = c;
    764 	} while ((c = lexgetc()) != WEOF);
    765 	*cp = '\0';
    766 	if (dotfound && cp == linebuf+1)
    767 		return (DOT);
    768 	lexungetc(c);
    769 	errno = 0;
    770 	if (!dotfound && !efound &&
    771 	    ((number = wcstol(linebuf, (wchar_t **)0, 10)), errno != ERANGE))
    772 		yylval.node = intnode(number);
    773 	else
    774 		yylval.node = realnode((REAL)wcstod(linebuf, (wchar_t **)0));
    775 	return (CONSTANT);
    776 }
    777 
    778 /*
    779  * Read an identifier.
    780  * Input is first character of identifier.
    781  * Return VAR.
    782  */
    783 static int
    784 lexid(wint_t c)
    785 {
    786 	wchar_t *cp;
    787 	size_t i;
    788 	NODE *np;
    789 
    790 	cp = linebuf;
    791 	do {
    792 		*cp++ = c;
    793 		c = lexgetc();
    794 	} while (iswalpha(c) || iswdigit(c) || c == '_');
    795 	*cp = '\0';
    796 	lexungetc(c);
    797 	yylval.node = np = vlook(linebuf);
    798 
    799 	switch (np->n_type) {
    800 	case KEYWORD:
    801 		switch (np->n_keywtype) {
    802 		case PRINT:
    803 		case PRINTF:
    804 			++inprint;
    805 		default:
    806 			return ((int)np->n_keywtype);
    807 		}
    808 		/* NOTREACHED */
    809 
    810 	case ARRAY:
    811 	case VAR:
    812 		/*
    813 		 * If reading the argument list, create a dummy node
    814 		 * for the duration of that function. These variables
    815 		 * can be removed from the symbol table at function end
    816 		 * but they must still exist because the execution tree
    817 		 * knows about them.
    818 		 */
    819 		if (funparm) {
    820 do_funparm:
    821 			np = emptynode(PARM, i = (cp-linebuf));
    822 			np->n_flags = FSTRING;
    823 			np->n_string = _null;
    824 			np->n_strlen = 0;
    825 			(void) memcpy(np->n_name, linebuf,
    826 			    (i+1) * sizeof (wchar_t));
    827 			addsymtab(np);
    828 			yylval.node = np;
    829 		} else if (np == varNF || (np == varFS &&
    830 		    (!doing_begin || begin_getline))) {
    831 			/*
    832 			 * If the user program references NF or sets
    833 			 * FS either outside of a begin block or
    834 			 * in a begin block after a getline then the
    835 			 * input line will be split immediately upon read
    836 			 * rather than when a field is first referenced.
    837 			 */
    838 			needsplit = 1;
    839 		} else if (np == varENVIRON)
    840 			needenviron = 1;
    841 	/*FALLTHRU*/
    842 	case PARM:
    843 		return (VAR);
    844 
    845 	case UFUNC:
    846 		/*
    847 		 * It is ok to redefine functions as parameters
    848 		 */
    849 		if (funparm) goto do_funparm;
    850 	/*FALLTHRU*/
    851 	case FUNC:
    852 	case GETLINE:
    853 		/*
    854 		 * When a getline is encountered, clear the 'doing_begin' flag.
    855 		 * This will force the 'needsplit' flag to be set, even inside
    856 		 * a begin block, if FS is altered. (See VAR case above)
    857 		 */
    858 		if (doing_begin)
    859 			begin_getline = 1;
    860 		return (np->n_type);
    861 	}
    862 	/* NOTREACHED */
    863 	return (0);
    864 }
    865 
    866 /*
    867  * Read a string for the lexical analyzer.
    868  * `endc' terminates the string.
    869  */
    870 static int
    871 lexstring(wint_t endc)
    872 {
    873 	size_t length = lexescape(endc, 0, 0);
    874 
    875 	yylval.node = stringnode(linebuf, FALLOC, length);
    876 	return (CONSTANT);
    877 }
    878 
    879 /*
    880  * Read a regular expression.
    881  */
    882 static int
    883 lexregexp(wint_t endc)
    884 {
    885 	(void) lexescape(endc, 1, 0);
    886 	yylval.node = renode(linebuf);
    887 	return (URE);
    888 }
    889 
    890 /*
    891  * Process a string, converting the escape characters as required by
    892  * 1003.2. The processed string ends up in the global linebuf[]. This
    893  * routine also changes the value of 'progfd' - the program file
    894  * descriptor, so it should be used with some care. It is presently used to
    895  * process -v (awk1.c) and var=str type arguments (awk2.c, nextrecord()).
    896  */
    897 void
    898 strescape(wchar_t *str)
    899 {
    900 	progptr = str;
    901 	proglen = wcslen(str) + 1;	/* Include \0 */
    902 	(void) lexescape('\0', 0, 1);
    903 	progptr = NULL;
    904 }
    905 
    906 /*
    907  * Read a string or regular expression, terminated by ``endc'',
    908  * for lexical analyzer, processing escape sequences.
    909  * Return string length.
    910  */
    911 static size_t
    912 lexescape(wint_t endc, int regx, int cmd_line_operand)
    913 {
    914 	static char nlre[256];
    915 	static char nlstr[256];
    916 	static char eofre[256];
    917 	static char eofstr[256];
    918 	int first_time = 1;
    919 	wint_t c;
    920 	wchar_t *cp;
    921 	int n, max;
    922 
    923 	if (first_time == 1) {
    924 		(void) strcpy(nlre, gettext("Newline in regular expression\n"));
    925 		(void) strcpy(nlstr, gettext("Newline in string\n"));
    926 		(void) strcpy(eofre, gettext("EOF in regular expression\n"));
    927 		(void) strcpy(eofstr, gettext("EOF in string\n"));
    928 		first_time = 0;
    929 	}
    930 
    931 	cp = linebuf;
    932 	while ((c = lexgetc()) != endc) {
    933 		if (c == '\n')
    934 			awkerr(regx ? nlre : nlstr);
    935 		if (c == '\\') {
    936 			switch (c = lexgetc(), c) {
    937 			case '\\':
    938 				if (regx)
    939 					*cp++ = '\\';
    940 				break;
    941 
    942 			case '/':
    943 				c = '/';
    944 				break;
    945 
    946 			case 'n':
    947 				c = '\n';
    948 				break;
    949 
    950 			case 'b':
    951 				c = '\b';
    952 				break;
    953 
    954 			case 't':
    955 				c = '\t';
    956 				break;
    957 
    958 			case 'r':
    959 				c = '\r';
    960 				break;
    961 
    962 			case 'f':
    963 				c = '\f';
    964 				break;
    965 
    966 			case 'v':
    967 				c = '\v';
    968 				break;
    969 
    970 			case 'a':
    971 				c = (char)0x07;
    972 				break;
    973 
    974 			case 'x':
    975 				n = 0;
    976 				while (iswxdigit(c = lexgetc())) {
    977 					if (iswdigit(c))
    978 						c -= '0';
    979 					else if (iswupper(c))
    980 						c -= 'A'-10;
    981 					else
    982 						c -= 'a'-10;
    983 					n = (n<<4) + c;
    984 				}
    985 				lexungetc(c);
    986 				c = n;
    987 				break;
    988 
    989 			case '0':
    990 			case '1':
    991 			case '2':
    992 			case '3':
    993 			case '4':
    994 			case '5':
    995 			case '6':
    996 			case '7':
    997 #if 0
    998 /*
    999  * Posix.2 draft 10 disallows the use of back-referencing - it explicitly
   1000  * requires processing of the octal escapes both in strings and
   1001  * regular expressions. The following code is disabled instead of
   1002  * removed as back-referencing may be reintroduced in a future draft
   1003  * of the standard.
   1004  */
   1005 				/*
   1006 				 * For regular expressions, we disallow
   1007 				 * \ooo to mean octal character, in favour
   1008 				 * of back referencing.
   1009 				 */
   1010 				if (regx) {
   1011 					*cp++ = '\\';
   1012 					break;
   1013 				}
   1014 #endif
   1015 				max = 3;
   1016 				n = 0;
   1017 				do {
   1018 					n = (n<<3) + c-'0';
   1019 					if ((c = lexgetc()) > '7' || c < '0')
   1020 						break;
   1021 				} while (--max);
   1022 				lexungetc(c);
   1023 				/*
   1024 				 * an octal escape sequence must have at least
   1025 				 * 2 digits after the backslash, otherwise
   1026 				 * it gets passed straight thru for possible
   1027 				 * use in backreferencing.
   1028 				 */
   1029 				if (max == 3) {
   1030 					*cp++ = '\\';
   1031 					n += '0';
   1032 				}
   1033 				c = n;
   1034 				break;
   1035 
   1036 			case '\n':
   1037 				continue;
   1038 
   1039 			default:
   1040 				if (c != endc || cmd_line_operand) {
   1041 					*cp++ = '\\';
   1042 					if (c == endc)
   1043 						lexungetc(c);
   1044 				}
   1045 			}
   1046 		}
   1047 		if (c == WEOF)
   1048 			awkerr(regx ? eofre : eofstr);
   1049 		*cp++ = c;
   1050 	}
   1051 	*cp = '\0';
   1052 	return (cp - linebuf);
   1053 }
   1054 
   1055 /*
   1056  * Build a regular expression NODE.
   1057  * Argument is the string holding the expression.
   1058  */
   1059 NODE *
   1060 renode(wchar_t *s)
   1061 {
   1062 	NODE *np;
   1063 	int n;
   1064 
   1065 	np = emptynode(RE, 0);
   1066 	np->n_left = np->n_right = NNULL;
   1067 	if ((n = REGWCOMP(&np->n_regexp, s)) != REG_OK) {
   1068 		int m;
   1069 		char *p;
   1070 
   1071 		m = REGWERROR(n, np->n_regexp, NULL, 0);
   1072 		p = (char *)emalloc(m);
   1073 		REGWERROR(n, np->n_regexp, p, m);
   1074 		awkerr("/%S/: %s", s, p);
   1075 	}
   1076 	return (np);
   1077 }
   1078 /*
   1079  * Get a character for the lexical analyser routine.
   1080  */
   1081 static wint_t
   1082 lexgetc()
   1083 {
   1084 	wint_t c;
   1085 	static char **files = &progfiles[0];
   1086 
   1087 	if (progfp != FNULL && (c = fgetwc(progfp)) != WEOF)
   1088 		;
   1089 	else {
   1090 		if (progptr != NULL) {
   1091 			if (proglen-- <= 0)
   1092 				c = WEOF;
   1093 			else
   1094 				c = *progptr++;
   1095 		} else {
   1096 			if (progfp != FNULL)
   1097 				if (progfp != stdin)
   1098 					(void) fclose(progfp);
   1099 				else
   1100 					clearerr(progfp);
   1101 				progfp = FNULL;
   1102 			if (files < progfilep) {
   1103 				filename = *files++;
   1104 				lineno = 1;
   1105 				if (filename[0] == '-' && filename[1] == '\0')
   1106 					progfp = stdin;
   1107 				else if ((progfp = fopen(filename, r))
   1108 				    == FNULL) {
   1109 					(void) fprintf(stderr,
   1110 				gettext("script file \"%s\""), filename);
   1111 					exit(1);
   1112 				}
   1113 				c = fgetwc(progfp);
   1114 			}
   1115 		}
   1116 	}
   1117 	if (c == '\n')
   1118 		++lineno;
   1119 	if (conptr >= &context[NCONTEXT])
   1120 		conptr = &context[0];
   1121 	if (c != WEOF)
   1122 		*conptr++ = c;
   1123 	return (c);
   1124 }
   1125 
   1126 /*
   1127  * Return a character for lexical analyser.
   1128  * Only one returned character is (not enforced) legitimite.
   1129  */
   1130 static void
   1131 lexungetc(wint_t c)
   1132 {
   1133 	if (c == '\n')
   1134 		--lineno;
   1135 	if (c != WEOF) {
   1136 		if (conptr == &context[0])
   1137 			conptr = &context[NCONTEXT];
   1138 		*--conptr = '\0';
   1139 	}
   1140 	if (progfp != FNULL) {
   1141 		(void) ungetwc(c, progfp);
   1142 		return;
   1143 	}
   1144 	if (c == WEOF)
   1145 		return;
   1146 	*--progptr = c;
   1147 	proglen++;
   1148 }
   1149 
   1150 /*
   1151  * Syntax errors during parsing.
   1152  */
   1153 void
   1154 yyerror(char *s, ...)
   1155 {
   1156 	if (lexlast == FUNC || lexlast == GETLINE || lexlast == KEYWORD)
   1157 		if (lexlast == KEYWORD)
   1158 			awkerr(gettext("inadmissible use of reserved keyword"));
   1159 		else
   1160 			awkerr(gettext("attempt to redefine builtin function"));
   1161 	awkerr(s);
   1162 }
   1163 
   1164 /*
   1165  * Error routine for all awk errors.
   1166  */
   1167 /* ARGSUSED */
   1168 void
   1169 awkerr(char *fmt, ...)
   1170 {
   1171 	va_list args;
   1172 
   1173 	va_start(args, fmt);
   1174 	awkierr(0, fmt, args);
   1175 	va_end(args);
   1176 }
   1177 
   1178 /*
   1179  * Error routine like "awkerr" except that it prints out
   1180  * a message that includes an errno-specific indication.
   1181  */
   1182 /* ARGSUSED */
   1183 void
   1184 awkperr(char *fmt, ...)
   1185 {
   1186 	va_list args;
   1187 
   1188 	va_start(args, fmt);
   1189 	awkierr(1, fmt, args);
   1190 	va_end(args);
   1191 }
   1192 
   1193 /*
   1194  * Common internal routine for awkerr, awkperr
   1195  */
   1196 static void
   1197 awkierr(int perr, char *fmt, va_list ap)
   1198 {
   1199 	static char sep1[] = "\n>>>\t";
   1200 	static char sep2[] = "\t<<<";
   1201 	int saveerr = errno;
   1202 
   1203 	(void) fprintf(stderr, "%s: ", _cmdname);
   1204 	if (running) {
   1205 		(void) fprintf(stderr, gettext("line %u ("),
   1206 		    curnode == NNULL ? 0 : curnode->n_lineno);
   1207 		if (phase == 0)
   1208 			(void) fprintf(stderr, "NR=%lld): ",
   1209 			    (INT)exprint(varNR));
   1210 		else
   1211 			(void) fprintf(stderr, "%s): ",
   1212 			    phase == BEGIN ? s_BEGIN : s_END);
   1213 	} else if (lineno != 0) {
   1214 		(void) fprintf(stderr, gettext("file \"%s\": "), filename);
   1215 		(void) fprintf(stderr, gettext("line %u: "), lineno);
   1216 	}
   1217 	(void) vfprintf(stderr, gettext(fmt), ap);
   1218 	if (perr == 1)
   1219 		(void) fprintf(stderr, ": %s", strerror(saveerr));
   1220 	if (perr != 2 && !running) {
   1221 		wchar_t *cp;
   1222 		int n;
   1223 		int c;
   1224 
   1225 		(void) fprintf(stderr, gettext("  Context is:%s"), sep1);
   1226 		cp = conptr;
   1227 		n = NCONTEXT;
   1228 		do {
   1229 			if (cp >= &context[NCONTEXT])
   1230 				cp = &context[0];
   1231 			if ((c = *cp++) != '\0')
   1232 				(void) fputs(c == '\n' ? sep1 : toprint(c),
   1233 				    stderr);
   1234 		} while (--n != 0);
   1235 		(void) fputs(sep2, stderr);
   1236 	}
   1237 	(void) fprintf(stderr, "\n");
   1238 	exit(1);
   1239 }
   1240 
   1241 wchar_t *
   1242 emalloc(unsigned n)
   1243 {
   1244 	wchar_t *cp;
   1245 
   1246 	if ((cp = malloc(n)) == NULL)
   1247 		awkerr(nomem);
   1248 	return (cp);
   1249 }
   1250 
   1251 wchar_t *
   1252 erealloc(wchar_t *p, unsigned n)
   1253 {
   1254 	wchar_t *cp;
   1255 
   1256 	if ((cp = realloc(p, n)) == NULL)
   1257 		awkerr(nomem);
   1258 	return (cp);
   1259 }
   1260 
   1261 
   1262 /*
   1263  * usage message for awk
   1264  */
   1265 static int
   1266 usage()