source: trunk/minix/lib/other/v8regexp.c@ 10

Last change on this file since 10 was 9, checked in by Mattia Monga, 14 years ago

Minix 3.1.2a

File size: 27.4 KB
RevLine 
[9]1/* regcomp and regexec -- regsub and regerror are elsewhere
2 *
3 * Copyright (c) 1986 by University of Toronto.
4 * Written by Henry Spencer. Not derived from licensed software.
5 *
6 * Permission is granted to anyone to use this software for any
7 * purpose on any computer system, and to redistribute it freely,
8 * subject to the following restrictions:
9 *
10 * 1. The author is not responsible for the consequences of use of
11 * this software, no matter how awful, even if they arise
12 * from defects in it.
13 *
14 * 2. The origin of this software must not be misrepresented, either
15 * by explicit claim or by omission.
16 *
17 * 3. Altered versions must be plainly marked as such, and must not
18 * be misrepresented as being the original software.
19 *
20 * Beware that some of this code is subtly aware of the way operator
21 * precedence is structured in regular expressions. Serious changes in
22 * regular-expression syntax might require a total rethink.
23 *
24 * The third parameter to regexec was added by Martin C. Atkins.
25 * Andy Tanenbaum also made some changes.
26 */
27
28#include <stdlib.h>
29#include <string.h>
30#include <stdio.h>
31#define const /* avoid "const poisoning" */
32#include <regexp.h>
33#undef const
34
35/* The first byte of the regexp internal "program" is actually this magic
36 * number; the start node begins in the second byte.
37 */
38#define MAGIC 0234
39
40/* The "internal use only" fields in regexp.h are present to pass info from
41 * compile to execute that permits the execute phase to run lots faster on
42 * simple cases. They are:
43 *
44 * regstart char that must begin a match; '\0' if none obvious
45 * reganch is the match anchored (at beginning-of-line only)?
46 * regmust string (pointer into program) that match must include, or NULL
47 * regmlen length of regmust string
48 *
49 * Regstart and reganch permit very fast decisions on suitable starting points
50 * for a match, cutting down the work a lot. Regmust permits fast rejection
51 * of lines that cannot possibly match. The regmust tests are costly enough
52 * that regcomp() supplies a regmust only if the r.e. contains something
53 * potentially expensive (at present, the only such thing detected is * or +
54 * at the start of the r.e., which can involve a lot of backup). Regmlen is
55 * supplied because the test in regexec() needs it and regcomp() is computing
56 * it anyway.
57 */
58
59/* Structure for regexp "program". This is essentially a linear encoding
60 * of a nondeterministic finite-state machine (aka syntax charts or
61 * "railroad normal form" in parsing technology). Each node is an opcode
62 * plus a "next" pointer, possibly plus an operand. "Next" pointers of
63 * all nodes except BRANCH implement concatenation; a "next" pointer with
64 * a BRANCH on both ends of it is connecting two alternatives. (Here we
65 * have one of the subtle syntax dependencies: an individual BRANCH (as
66 * opposed to a collection of them) is never concatenated with anything
67 * because of operator precedence.) The operand of some types of node is
68 * a literal string; for others, it is a node leading into a sub-FSM. In
69 * particular, the operand of a BRANCH node is the first node of the branch.
70 * (NB this is *not* a tree structure: the tail of the branch connects
71 * to the thing following the set of BRANCHes.) The opcodes are:
72 */
73
74/* Definition number opnd? meaning */
75#define END 0 /* no End of program. */
76#define BOL 1 /* no Match "" at beginning of line. */
77#define EOL 2 /* no Match "" at end of line. */
78#define ANY 3 /* no Match any one character. */
79#define ANYOF 4 /* str Match any character in this string. */
80#define ANYBUT 5 /* str Match any character not in this
81 * string. */
82#define BRANCH 6 /* node Match this alternative, or the
83 * next... */
84#define BACK 7 /* no Match "", "next" ptr points backward. */
85#define EXACTLY 8 /* str Match this string. */
86#define NOTHING 9 /* no Match empty string. */
87#define STAR 10 /* node Match this (simple) thing 0 or more
88 * times. */
89#define PLUS 11 /* node Match this (simple) thing 1 or more
90 * times. */
91#define OPEN 20 /* no Mark this point in input as start of
92 * #n. */
93 /* OPEN+1 is number 1, etc. */
94#define CLOSE 30 /* no Analogous to OPEN. */
95
96/* Opcode notes:
97 *
98 * BRANCH The set of branches constituting a single choice are hooked
99 * together with their "next" pointers, since precedence prevents
100 * anything being concatenated to any individual branch. The
101 * "next" pointer of the last BRANCH in a choice points to the
102 * thing following the whole choice. This is also where the
103 * final "next" pointer of each individual branch points; each
104 * branch starts with the operand node of a BRANCH node.
105 *
106 * BACK Normal "next" pointers all implicitly point forward; BACK
107 * exists to make loop structures possible.
108 *
109 * STAR,PLUS '?', and complex '*' and '+', are implemented as circular
110 * BRANCH structures using BACK. Simple cases (one character
111 * per match) are implemented with STAR and PLUS for speed
112 * and to minimize recursive plunges.
113 *
114 * OPEN,CLOSE ...are numbered at compile time.
115 */
116
117/* A node is one char of opcode followed by two chars of "next" pointer.
118 * "Next" pointers are stored as two 8-bit pieces, high order first. The
119 * value is a positive offset from the opcode of the node containing it.
120 * An operand, if any, simply follows the node. (Note that much of the
121 * code generation knows about this implicit relationship.)
122 *
123 * Using two bytes for the "next" pointer is vast overkill for most things,
124 * but allows patterns to get big without disasters.
125 */
126#define OP(p) (*(p))
127#define NEXT(p) (((*((p)+1)&0377)<<8) + (*((p)+2)&0377))
128#define OPERAND(p) ((p) + 3)
129
130/* Utility definitions.
131 */
132#ifndef CHARBITS
133#define UCHARAT(p) ((int)*(unsigned char *)(p))
134#else
135#define UCHARAT(p) ((int)*(p)&CHARBITS)
136#endif
137
138#define CFAIL(m) { regerror(m); return((char *)NULL); }
139#define RFAIL(m) { regerror(m); return((regexp *)NULL); }
140#define ISMULT(c) ((c) == '*' || (c) == '+' || (c) == '?')
141#define META "^$.[()|?+*\\"
142
143/* Flags to be passed up and down.
144 */
145#define HASWIDTH 01 /* Known never to match null string. */
146#define SIMPLE 02 /* Simple enough to be STAR/PLUS operand. */
147#define SPSTART 04 /* Starts with * or +. */
148#define WORST 0 /* Worst case. */
149
150/* Global work variables for regcomp().
151 */
152static char *regparse; /* Input-scan pointer. */
153static int regnpar; /* () count. */
154static char regdummy;
155static char *regcode; /* Code-emit pointer; &regdummy = don't. */
156static long regsize; /* Code size. */
157
158/* Forward declarations for regcomp()'s friends.
159 */
160static char *reg(int paren, int *flagp);
161static char *regbranch(int *flagp);
162static char *regpiece(int *flagp);
163static char *regatom(int *flagp);
164static char *regnode(int op);
165static char *regnext(char *p);
166static void regc(int b);
167static void reginsert(int op, char *opnd);
168static void regtail(char *p, char *val);
169static void regoptail(char *p, char *val);
170
171/*
172 - regcomp - compile a regular expression into internal code
173 *
174 * We can't allocate space until we know how big the compiled form will be,
175 * but we can't compile it (and thus know how big it is) until we've got a
176 * place to put the code. So we cheat: we compile it twice, once with code
177 * generation turned off and size counting turned on, and once "for real".
178 * This also means that we don't allocate space until we are sure that the
179 * thing really will compile successfully, and we never have to move the
180 * code and thus invalidate pointers into it. (Note that it has to be in
181 * one piece because free() must be able to free it all.)
182 *
183 * Beware that the optimization-preparation code in here knows about some
184 * of the structure of the compiled regexp.
185 */
186regexp *regcomp(exp)
187char *exp;
188{
189 register regexp *r;
190 register char *scan;
191 register char *longest;
192 register int len;
193 int flags;
194
195 if (exp == (char *)NULL) RFAIL("NULL argument");
196
197 /* First pass: determine size, legality. */
198 regparse = exp;
199 regnpar = 1;
200 regsize = 0L;
201 regcode = &regdummy;
202 regc(MAGIC);
203 if (reg(0, &flags) == (char *)NULL) return((regexp *)NULL);
204
205 /* Small enough for pointer-storage convention? */
206 if (regsize >= 32767L) /* Probably could be 65535L. */
207 RFAIL("regexp too big");
208
209 /* Allocate space. */
210 r = (regexp *) malloc(sizeof(regexp) + (unsigned) regsize);
211 if (r == (regexp *)NULL) RFAIL("out of space");
212
213 /* Second pass: emit code. */
214 regparse = exp;
215 regnpar = 1;
216 regcode = r->program;
217 regc(MAGIC);
218 if (reg(0, &flags) == (char *)NULL) return((regexp *)NULL);
219
220 /* Dig out information for optimizations. */
221 r->regstart = '\0'; /* Worst-case defaults. */
222 r->reganch = 0;
223 r->regmust = (char *)NULL;
224 r->regmlen = 0;
225 scan = r->program + 1; /* First BRANCH. */
226 if (OP(regnext(scan)) == END) { /* Only one top-level choice. */
227 scan = OPERAND(scan);
228
229 /* Starting-point info. */
230 if (OP(scan) == EXACTLY)
231 r->regstart = *OPERAND(scan);
232 else if (OP(scan) == BOL)
233 r->reganch++;
234
235 /* If there's something expensive in the r.e., find the
236 * longest literal string that must appear and make it the
237 * regmust. Resolve ties in favor of later strings, since
238 * the regstart check works with the beginning of the r.e.
239 * and avoiding duplication strengthens checking. Not a
240 * strong reason, but sufficient in the absence of others. */
241 if (flags & SPSTART) {
242 longest = (char *)NULL;
243 len = 0;
244 for (; scan != (char *)NULL; scan = regnext(scan))
245 if (OP(scan) == EXACTLY && strlen(OPERAND(scan)) >= len) {
246 longest = OPERAND(scan);
247 len = strlen(OPERAND(scan));
248 }
249 r->regmust = longest;
250 r->regmlen = len;
251 }
252 }
253 return(r);
254}
255
256/*
257 - reg - regular expression, i.e. main body or parenthesized thing
258 *
259 * Caller must absorb opening parenthesis.
260 *
261 * Combining parenthesis handling with the base level of regular expression
262 * is a trifle forced, but the need to tie the tails of the branches to what
263 * follows makes it hard to avoid.
264 */
265static char *reg(paren, flagp)
266int paren; /* Parenthesized? */
267int *flagp;
268{
269 register char *ret;
270 register char *br;
271 register char *ender;
272 register int parno;
273 int flags;
274
275 *flagp = HASWIDTH; /* Tentatively. */
276
277 /* Make an OPEN node, if parenthesized. */
278 if (paren) {
279 if (regnpar >= NSUBEXP) CFAIL("too many ()");
280 parno = regnpar;
281 regnpar++;
282 ret = regnode(OPEN + parno);
283 } else {
284 parno = 0; /* not actually used, keep compiler quiet */
285 ret = (char *)NULL;
286 }
287
288 /* Pick up the branches, linking them together. */
289 br = regbranch(&flags);
290 if (br == (char *)NULL) return((char *)NULL);
291 if (ret != (char *)NULL)
292 regtail(ret, br); /* OPEN -> first. */
293 else
294 ret = br;
295 if (!(flags & HASWIDTH)) *flagp &= ~HASWIDTH;
296 *flagp |= flags & SPSTART;
297 while (*regparse == '|') {
298 regparse++;
299 br = regbranch(&flags);
300 if (br == (char *)NULL) return((char *)NULL);
301 regtail(ret, br); /* BRANCH -> BRANCH. */
302 if (!(flags & HASWIDTH)) *flagp &= ~HASWIDTH;
303 *flagp |= flags & SPSTART;
304 }
305
306 /* Make a closing node, and hook it on the end. */
307 ender = regnode((paren) ? CLOSE + parno : END);
308 regtail(ret, ender);
309
310 /* Hook the tails of the branches to the closing node. */
311 for (br = ret; br != (char *)NULL; br = regnext(br)) regoptail(br, ender);
312
313 /* Check for proper termination. */
314 if (paren && *regparse++ != ')') {
315 CFAIL("unmatched ()");
316 } else if (!paren && *regparse != '\0') {
317 if (*regparse == ')') {
318 CFAIL("unmatched ()");
319 } else
320 CFAIL("junk on end"); /* "Can't happen". */
321 /* NOTREACHED */
322 }
323 return(ret);
324}
325
326/*
327 - regbranch - one alternative of an | operator
328 *
329 * Implements the concatenation operator.
330 */
331static char *regbranch(flagp)
332int *flagp;
333{
334 register char *ret;
335 register char *chain;
336 register char *latest;
337 int flags;
338
339 *flagp = WORST; /* Tentatively. */
340
341 ret = regnode(BRANCH);
342 chain = (char *)NULL;
343 while (*regparse != '\0' && *regparse != '|' && *regparse != ')') {
344 latest = regpiece(&flags);
345 if (latest == (char *)NULL) return((char *)NULL);
346 *flagp |= flags & HASWIDTH;
347 if (chain == (char *)NULL) /* First piece. */
348 *flagp |= flags & SPSTART;
349 else
350 regtail(chain, latest);
351 chain = latest;
352 }
353 if (chain == (char *)NULL) /* Loop ran zero times. */
354 regnode(NOTHING);
355
356 return(ret);
357}
358
359/*
360 - regpiece - something followed by possible [*+?]
361 *
362 * Note that the branching code sequences used for ? and the general cases
363 * of * and + are somewhat optimized: they use the same NOTHING node as
364 * both the endmarker for their branch list and the body of the last branch.
365 * It might seem that this node could be dispensed with entirely, but the
366 * endmarker role is not redundant.
367 */
368static char *regpiece(flagp)
369int *flagp;
370{
371 register char *ret;
372 register char op;
373 register char *next;
374 int flags;
375
376 ret = regatom(&flags);
377 if (ret == (char *)NULL) return((char *)NULL);
378
379 op = *regparse;
380 if (!ISMULT(op)) {
381 *flagp = flags;
382 return(ret);
383 }
384 if (!(flags & HASWIDTH) && op != '?') CFAIL("*+ operand could be empty");
385 *flagp = (op != '+') ? (WORST | SPSTART) : (WORST | HASWIDTH);
386
387 if (op == '*' && (flags & SIMPLE))
388 reginsert(STAR, ret);
389 else if (op == '*') {
390 /* Emit x* as (x&|), where & means "self". */
391 reginsert(BRANCH, ret); /* Either x */
392 regoptail(ret, regnode(BACK)); /* and loop */
393 regoptail(ret, ret); /* back */
394 regtail(ret, regnode(BRANCH)); /* or */
395 regtail(ret, regnode(NOTHING)); /* null. */
396 } else if (op == '+' && (flags & SIMPLE))
397 reginsert(PLUS, ret);
398 else if (op == '+') {
399 /* Emit x+ as x(&|), where & means "self". */
400 next = regnode(BRANCH); /* Either */
401 regtail(ret, next);
402 regtail(regnode(BACK), ret); /* loop back */
403 regtail(next, regnode(BRANCH)); /* or */
404 regtail(ret, regnode(NOTHING)); /* null. */
405 } else if (op == '?') {
406 /* Emit x? as (x|) */
407 reginsert(BRANCH, ret); /* Either x */
408 regtail(ret, regnode(BRANCH)); /* or */
409 next = regnode(NOTHING);/* null. */
410 regtail(ret, next);
411 regoptail(ret, next);
412 }
413 regparse++;
414 if (ISMULT(*regparse)) CFAIL("nested *?+");
415
416 return(ret);
417}
418
419/*
420 - regatom - the lowest level
421 *
422 * Optimization: gobbles an entire sequence of ordinary characters so that
423 * it can turn them into a single node, which is smaller to store and
424 * faster to run. Backslashed characters are exceptions, each becoming a
425 * separate node; the code is simpler that way and it's not worth fixing.
426 */
427static char *regatom(flagp)
428int *flagp;
429{
430 register char *ret;
431 int flags;
432
433 *flagp = WORST; /* Tentatively. */
434
435 switch (*regparse++) {
436 case '^': ret = regnode(BOL); break;
437 case '$': ret = regnode(EOL); break;
438 case '.':
439 ret = regnode(ANY);
440 *flagp |= HASWIDTH | SIMPLE;
441 break;
442 case '[':{
443 register int class;
444 register int classend;
445
446 if (*regparse == '^') { /* Complement of range. */
447 ret = regnode(ANYBUT);
448 regparse++;
449 } else
450 ret = regnode(ANYOF);
451 if (*regparse == ']' || *regparse == '-') regc(*regparse++);
452 while (*regparse != '\0' && *regparse != ']') {
453 if (*regparse == '-') {
454 regparse++;
455 if (*regparse == ']' || *regparse == '\0')
456 regc('-');
457 else {
458 class = UCHARAT(regparse - 2) + 1;
459 classend = UCHARAT(regparse);
460 if (class > classend + 1)
461 CFAIL("invalid [] range");
462 for (; class <= classend; class++)
463 regc(class);
464 regparse++;
465 }
466 } else
467 regc(*regparse++);
468 }
469 regc('\0');
470 if (*regparse != ']') CFAIL("unmatched []");
471 regparse++;
472 *flagp |= HASWIDTH | SIMPLE;
473 }
474 break;
475 case '(':
476 ret = reg(1, &flags);
477 if (ret == (char *)NULL) return((char *)NULL);
478 *flagp |= flags & (HASWIDTH | SPSTART);
479 break;
480 case '\0':
481 case '|':
482 case ')':
483 CFAIL("internal urp"); /* Supposed to be caught earlier. */
484 break;
485 case '?':
486 case '+':
487 case '*': CFAIL("?+* follows nothing"); break;
488 case '\\':
489 if (*regparse == '\0') CFAIL("trailing \\");
490 ret = regnode(EXACTLY);
491 regc(*regparse++);
492 regc('\0');
493 *flagp |= HASWIDTH | SIMPLE;
494 break;
495 default:{
496 register int len;
497 register char ender;
498
499 regparse--;
500 len = strcspn(regparse, META);
501 if (len <= 0) CFAIL("internal disaster");
502 ender = *(regparse + len);
503 if (len > 1 && ISMULT(ender))
504 len--; /* Back off clear of ?+* operand. */
505 *flagp |= HASWIDTH;
506 if (len == 1) *flagp |= SIMPLE;
507 ret = regnode(EXACTLY);
508 while (len > 0) {
509 regc(*regparse++);
510 len--;
511 }
512 regc('\0');
513 }
514 break;
515 }
516
517 return(ret);
518}
519
520/*
521 - regnode - emit a node
522 */
523static char *regnode(op)
524char op;
525{
526 register char *ret;
527 register char *ptr;
528
529 ret = regcode;
530 if (ret == &regdummy) {
531 regsize += 3;
532 return(ret);
533 }
534 ptr = ret;
535 *ptr++ = op;
536 *ptr++ = '\0'; /* Null "next" pointer. */
537 *ptr++ = '\0';
538 regcode = ptr;
539
540 return(ret);
541}
542
543/*
544 - regc - emit (if appropriate) a byte of code
545 */
546static void regc(b)
547char b;
548{
549 if (regcode != &regdummy)
550 *regcode++ = b;
551 else
552 regsize++;
553}
554
555/*
556 - reginsert - insert an operator in front of already-emitted operand
557 *
558 * Means relocating the operand.
559 */
560static void reginsert(op, opnd)
561char op;
562char *opnd;
563{
564 register char *src;
565 register char *dst;
566 register char *place;
567
568 if (regcode == &regdummy) {
569 regsize += 3;
570 return;
571 }
572 src = regcode;
573 regcode += 3;
574 dst = regcode;
575 while (src > opnd) *--dst = *--src;
576
577 place = opnd; /* Op node, where operand used to be. */
578 *place++ = op;
579 *place++ = '\0';
580 *place++ = '\0';
581}
582
583/*
584 - regtail - set the next-pointer at the end of a node chain
585 */
586static void regtail(p, val)
587char *p;
588char *val;
589{
590 register char *scan;
591 register char *temp;
592 register int offset;
593
594 if (p == &regdummy) return;
595
596 /* Find last node. */
597 scan = p;
598 for (;;) {
599 temp = (char *)regnext(scan);
600 if (temp == (char *)NULL) break;
601 scan = temp;
602 }
603
604 if (OP(scan) == BACK)
605 offset = scan - val;
606 else
607 offset = val - scan;
608 *(scan + 1) = (offset >> 8) & 0377;
609 *(scan + 2) = offset & 0377;
610}
611
612/*
613 - regoptail - regtail on operand of first argument; nop if operandless
614 */
615static void regoptail(p, val)
616char *p;
617char *val;
618{
619 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
620 if (p == (char *)NULL || p == &regdummy || OP(p) != BRANCH) return;
621 regtail(OPERAND(p), val);
622}
623
624/* regexec and friends
625 */
626
627/* Global work variables for regexec().
628 */
629static char *reginput; /* String-input pointer. */
630static char *regbol; /* Beginning of input, for ^ check. */
631static char **regstartp; /* Pointer to startp array. */
632static char **regendp; /* Ditto for endp. */
633
634/* Forwards.
635 */
636static int regtry(regexp *prog, char *string);
637static int regmatch(char *prog);
638static int regrepeat(char *p);
639
640#ifdef DEBUG
641int regnarrate = 0;
642void regdump();
643static char *regprop(char *op);
644#endif
645
646/*
647 - regexec - match a regexp against a string
648 */
649int regexec(prog, string, bolflag)
650register regexp *prog;
651register char *string;
652int bolflag;
653{
654 register char *s;
655
656 /* Be paranoid... */
657 if (prog == (regexp *)NULL || string == (char *)NULL) {
658 regerror("NULL parameter");
659 return(0);
660 }
661
662 /* Check validity of program. */
663 if (UCHARAT(prog->program) != MAGIC) {
664 regerror("corrupted program");
665 return(0);
666 }
667
668 /* If there is a "must appear" string, look for it. */
669 if (prog->regmust != (char *)NULL) {
670 s = string;
671 while ((s = strchr(s, prog->regmust[0])) != (char *)NULL) {
672 if (strncmp(s, prog->regmust, prog->regmlen) == 0)
673 break; /* Found it. */
674 s++;
675 }
676 if (s == (char *)NULL) /* Not present. */
677 return(0);
678 }
679
680 /* Mark beginning of line for ^ . */
681 if (bolflag)
682 regbol = string;
683 else
684 regbol = (char *)NULL;
685
686 /* Simplest case: anchored match need be tried only once. */
687 if (prog->reganch) return(regtry(prog, string));
688
689 /* Messy cases: unanchored match. */
690 s = string;
691 if (prog->regstart != '\0') /* We know what char it must start with. */
692 while ((s = strchr(s, prog->regstart)) != (char *)NULL) {
693 if (regtry(prog, s)) return(1);
694 s++;
695 }
696 else
697 /* We don't -- general case. */
698 do {
699 if (regtry(prog, s)) return(1);
700 } while (*s++ != '\0');
701
702 /* Failure. */
703 return(0);
704}
705
706/*
707 - regtry - try match at specific point
708 */
709static int regtry(prog, string) /* 0 failure, 1 success */
710regexp *prog;
711char *string;
712{
713 register int i;
714 register char **sp;
715 register char **ep;
716
717 reginput = string;
718 regstartp = prog->startp;
719 regendp = prog->endp;
720
721 sp = prog->startp;
722 ep = prog->endp;
723 for (i = NSUBEXP; i > 0; i--) {
724 *sp++ = (char *)NULL;
725 *ep++ = (char *)NULL;
726 }
727 if (regmatch(prog->program + 1)) {
728 prog->startp[0] = string;
729 prog->endp[0] = reginput;
730 return(1);
731 } else
732 return(0);
733}
734
735/*
736 - regmatch - main matching routine
737 *
738 * Conceptually the strategy is simple: check to see whether the current
739 * node matches, call self recursively to see whether the rest matches,
740 * and then act accordingly. In practice we make some effort to avoid
741 * recursion, in particular by going through "ordinary" nodes (that don't
742 * need to know whether the rest of the match failed) by a loop instead of
743 * by recursion.
744 */
745static int regmatch(prog) /* 0 failure, 1 success */
746char *prog;
747{
748 register char *scan; /* Current node. */
749 char *next; /* Next node. */
750
751 scan = prog;
752#ifdef DEBUG
753 if (scan != (char *)NULL && regnarrate) fprintf(stderr, "%s(\n", regprop(scan));
754#endif
755 while (scan != (char *)NULL) {
756#ifdef DEBUG
757 if (regnarrate) fprintf(stderr, "%s...\n", regprop(scan));
758#endif
759 next = regnext(scan);
760
761 switch (OP(scan)) {
762 case BOL:
763 if (reginput != regbol) return(0);
764 break;
765 case EOL:
766 if (*reginput != '\0') return(0);
767 break;
768 case ANY:
769 if (*reginput == '\0') return(0);
770 reginput++;
771 break;
772 case EXACTLY:{
773 register int len;
774 register char *opnd;
775
776 opnd = OPERAND(scan);
777 /* Inline the first character, for speed. */
778 if (*opnd != *reginput) return(0);
779 len = strlen(opnd);
780 if (len > 1 && strncmp(opnd, reginput, len) != 0)
781 return(0);
782 reginput += len;
783 }
784 break;
785 case ANYOF:
786 if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) == (char *)NULL)
787 return(0);
788 reginput++;
789 break;
790 case ANYBUT:
791 if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) != (char *)NULL)
792 return(0);
793 reginput++;
794 break;
795 case NOTHING:
796 break;
797 case BACK:
798 break;
799 case OPEN + 1:
800 case OPEN + 2:
801 case OPEN + 3:
802 case OPEN + 4:
803 case OPEN + 5:
804 case OPEN + 6:
805 case OPEN + 7:
806 case OPEN + 8:
807 case OPEN + 9:{
808 register int no;
809 register char *save;
810
811 no = OP(scan) - OPEN;
812 save = reginput;
813
814 if (regmatch(next)) {
815 /* Don't set startp if some later
816 * invocation of the same parentheses
817 * already has. */
818 if (regstartp[no] == (char *)NULL)
819 regstartp[no] = save;
820 return(1);
821 } else
822 return(0);
823 }
824 break;
825 case CLOSE + 1:
826 case CLOSE + 2:
827 case CLOSE + 3:
828 case CLOSE + 4:
829 case CLOSE + 5:
830 case CLOSE + 6:
831 case CLOSE + 7:
832 case CLOSE + 8:
833 case CLOSE + 9:{
834 register int no;
835 register char *save;
836
837 no = OP(scan) - CLOSE;
838 save = reginput;
839
840 if (regmatch(next)) {
841 /* Don't set endp if some later
842 * invocation of the same parentheses
843 * already has. */
844 if (regendp[no] == (char *)NULL) regendp[no] = save;
845 return(1);
846 } else
847 return(0);
848 }
849 break;
850 case BRANCH:{
851 register char *save;
852
853 if (OP(next) != BRANCH) /* No choice. */
854 next = OPERAND(scan); /* Avoid recursion. */
855 else {
856 do {
857 save = reginput;
858 if (regmatch(OPERAND(scan)))
859 return(1);
860 reginput = save;
861 scan = regnext(scan);
862 } while (scan != (char *)NULL && OP(scan) == BRANCH);
863 return(0);
864 /* NOTREACHED */
865 }
866 }
867 break;
868 case STAR:
869 case PLUS:{
870 register char nextch;
871 register int no;
872 register char *save;
873 register int min;
874
875 /* Lookahead to avoid useless match attempts
876 * when we know what character comes next. */
877 nextch = '\0';
878 if (OP(next) == EXACTLY) nextch = *OPERAND(next);
879 min = (OP(scan) == STAR) ? 0 : 1;
880 save = reginput;
881 no = regrepeat(OPERAND(scan));
882 while (no >= min) {
883 /* If it could work, try it. */
884 if (nextch == '\0' || *reginput == nextch)
885 if (regmatch(next)) return(1);
886 /* Couldn't or didn't -- back up. */
887 no--;
888 reginput = save + no;
889 }
890 return(0);
891 }
892 break;
893 case END:
894 return(1); /* Success! */
895 break;
896 default:
897 regerror("memory corruption");
898 return(0);
899 break;
900 }
901
902 scan = next;
903 }
904
905 /* We get here only if there's trouble -- normally "case END" is the
906 * terminating point. */
907 regerror("corrupted pointers");
908 return(0);
909}
910
911/*
912 - regrepeat - repeatedly match something simple, report how many
913 */
914static int regrepeat(p)
915char *p;
916{
917 register int count = 0;
918 register char *scan;
919 register char *opnd;
920
921 scan = reginput;
922 opnd = OPERAND(p);
923 switch (OP(p)) {
924 case ANY:
925 count = strlen(scan);
926 scan += count;
927 break;
928 case EXACTLY:
929 while (*opnd == *scan) {
930 count++;
931 scan++;
932 }
933 break;
934 case ANYOF:
935 while (*scan != '\0' && strchr(opnd, *scan) != (char *)NULL) {
936 count++;
937 scan++;
938 }
939 break;
940 case ANYBUT:
941 while (*scan != '\0' && strchr(opnd, *scan) == (char *)NULL) {
942 count++;
943 scan++;
944 }
945 break;
946 default: /* Oh dear. Called inappropriately. */
947 regerror("internal foulup");
948 count = 0; /* Best compromise. */
949 break;
950 }
951 reginput = scan;
952
953 return(count);
954}
955
956/*
957 - regnext - dig the "next" pointer out of a node
958 */
959static char *regnext(p)
960register char *p;
961{
962 register int offset;
963
964 if (p == &regdummy) return((char *)NULL);
965
966 offset = NEXT(p);
967 if (offset == 0) return((char *)NULL);
968
969 if (OP(p) == BACK)
970 return(p - offset);
971 else
972 return(p + offset);
973}
974
975#ifdef DEBUG
976
977static char *regprop();
978
979/*
980 - regdump - dump a regexp onto stdout in vaguely comprehensible form
981 */
982void regdump(r)
983regexp *r;
984{
985 register char *s;
986 register char op = EXACTLY; /* Arbitrary non-END op. */
987 register char *next;
988
989 s = r->program + 1;
990 while (op != END) { /* While that wasn't END last time... */
991 op = OP(s);
992 printf("%2d%s", (int) (s - r->program), regprop(s)); /* Where, what. */
993 next = regnext(s);
994 if (next == (char *)NULL) /* Next ptr. */
995 printf("(0)");
996 else
997 printf("(%d)", (int) (s - r->program) + (int) (next - s));
998 s += 3;
999 if (op == ANYOF || op == ANYBUT || op == EXACTLY) {
1000 /* Literal string, where present. */
1001 while (*s != '\0') {
1002 putchar(*s);
1003 s++;
1004 }
1005 s++;
1006 }
1007 putchar('\n');
1008 }
1009
1010 /* Header fields of interest. */
1011 if (r->regstart != '\0') printf("start `%c' ", r->regstart);
1012 if (r->reganch) printf("anchored ");
1013 if (r->regmust != (char *)NULL) printf("must have \"%s\"", r->regmust);
1014 printf("\n");
1015}
1016
1017/*
1018 - regprop - printable representation of opcode
1019 */
1020static char *regprop(op)
1021char *op;
1022{
1023 register char *p;
1024 static char buf[50];
1025
1026 (void) strcpy(buf, ":");
1027
1028 switch (OP(op)) {
1029 case BOL: p = "BOL"; break;
1030 case EOL: p = "EOL"; break;
1031 case ANY: p = "ANY"; break;
1032 case ANYOF: p = "ANYOF"; break;
1033 case ANYBUT: p = "ANYBUT"; break;
1034 case BRANCH: p = "BRANCH"; break;
1035 case EXACTLY: p = "EXACTLY"; break;
1036 case NOTHING: p = "NOTHING"; break;
1037 case BACK: p = "BACK"; break;
1038 case END: p = "END"; break;
1039 case OPEN + 1:
1040 case OPEN + 2:
1041 case OPEN + 3:
1042 case OPEN + 4:
1043 case OPEN + 5:
1044 case OPEN + 6:
1045 case OPEN + 7:
1046 case OPEN + 8:
1047 case OPEN + 9:
1048 sprintf(buf + strlen(buf), "OPEN%d", OP(op) - OPEN);
1049 p = (char *)NULL;
1050 break;
1051 case CLOSE + 1:
1052 case CLOSE + 2:
1053 case CLOSE + 3:
1054 case CLOSE + 4:
1055 case CLOSE + 5:
1056 case CLOSE + 6:
1057 case CLOSE + 7:
1058 case CLOSE + 8:
1059 case CLOSE + 9:
1060 sprintf(buf + strlen(buf), "CLOSE%d", OP(op) - CLOSE);
1061 p = (char *)NULL;
1062 break;
1063 case STAR: p = "STAR"; break;
1064 case PLUS: p = "PLUS"; break;
1065 default: regerror("corrupted opcode"); p = (char *) NULL; break;
1066 }
1067 if (p != (char *)NULL) (void) strcat(buf, p);
1068 return(buf);
1069}
1070
1071#endif
1072
1073/*
1074 * $PchId: regexp.c,v 1.4 1996/02/22 09:03:07 philip Exp $
1075 */
Note: See TracBrowser for help on using the repository browser.