1 | /*
|
---|
2 | * Regular expression matching for expr(1). Bugs: The upper bound of
|
---|
3 | * a range specified by the \{ feature cannot be zero.
|
---|
4 | *
|
---|
5 | * Copyright (C) 1989 by Kenneth Almquist. All rights reserved.
|
---|
6 | * This file is part of ash, which is distributed under the terms specified
|
---|
7 | * by the Ash General Public License. See the file named LICENSE.
|
---|
8 | */
|
---|
9 |
|
---|
10 | #include "bltin.h"
|
---|
11 |
|
---|
12 |
|
---|
/*
 * Opcodes of the compiled regular expression produced by re_compile()
 * and interpreted by match().  Each opcode occupies one byte; some are
 * followed by operand bytes:
 *
 *	RE_LITERAL	the character to match
 *	RE_CCL/RE_NCCL	the class members, terminated by a NUL byte; a
 *			range is stored as the three bytes '-', low, high
 *	RE_LP/RE_RP	the group number
 *	RE_MATCHED	the group number being referred back to
 *	RE_STAR		nothing; the repeated element follows
 *	RE_RANGE	low bound, high bound; the repeated element follows
 */
enum {
	RE_END = 0,		/* end of regular expression */
	RE_LITERAL = 1,		/* normal character follows */
	RE_DOT = 2,		/* "." */
	RE_CCL = 3,		/* "[...]" */
	RE_NCCL = 4,		/* "[^...]" */
	RE_LP = 5,		/* "\(" */
	RE_RP = 6,		/* "\)" */
	RE_MATCHED = 7,		/* "\digit" */
	RE_EOS = 8,		/* "$" matches end of string */
	RE_STAR = 9,		/* "*" */
	RE_RANGE = 10		/* "\{num,num\}" */
};
24 |
|
---|
25 |
|
---|
26 |
|
---|
/*
 * Results of the most recent match.  Entry 0 describes the whole match:
 * re_match() stores the start of the subject string in match_begin[0]
 * and match() stores the matched length in match_length[0] when it
 * reaches RE_END.  Entries 1-9 describe the \( \) groups: match_begin[n]
 * is cleared by re_match() and set when group n is opened,
 * match_length[n] is set when group n is closed.
 */
char *match_begin[10];
short match_length[10];
/* Number of \( \) groups in the pattern last passed to re_compile(). */
short number_parens;
/* The matcher proper; defined at the end of this file. */
static int match();
31 |
|
---|
32 |
|
---|
33 |
|
---|
34 | char *
|
---|
35 | re_compile(pattern)
|
---|
36 | char *pattern;
|
---|
37 | {
|
---|
38 | register char *p;
|
---|
39 | register char c;
|
---|
40 | char *comp;
|
---|
41 | register char *q;
|
---|
42 | char *begin;
|
---|
43 | char *endp;
|
---|
44 | register int len;
|
---|
45 | int first;
|
---|
46 | int type;
|
---|
47 | char *stackp;
|
---|
48 | char stack[10];
|
---|
49 | int paren_num;
|
---|
50 | int i;
|
---|
51 | char *malloc();
|
---|
52 |
|
---|
53 | p = pattern;
|
---|
54 | if (*p == '^')
|
---|
55 | p++;
|
---|
56 | comp = q = malloc(2 * strlen(p) + 1);
|
---|
57 | begin = q;
|
---|
58 | stackp = stack;
|
---|
59 | paren_num = 0;
|
---|
60 | for (;;) {
|
---|
61 | switch (c = *p++) {
|
---|
62 | case '\0':
|
---|
63 | *q = '\0';
|
---|
64 | goto out;
|
---|
65 | case '.':
|
---|
66 | *q++ = RE_DOT;
|
---|
67 | len = 1;
|
---|
68 | break;
|
---|
69 | case '[':
|
---|
70 | begin = q;
|
---|
71 | *q = RE_CCL;
|
---|
72 | if (*p == '^') {
|
---|
73 | *q = RE_NCCL;
|
---|
74 | p++;
|
---|
75 | }
|
---|
76 | q++;
|
---|
77 | first = 1;
|
---|
78 | while (*p != ']' || first == 1) {
|
---|
79 | if (p[1] == '-' && p[2] != ']') {
|
---|
80 | *q++ = '-';
|
---|
81 | *q++ = p[0];
|
---|
82 | *q++ = p[2];
|
---|
83 | p += 3;
|
---|
84 | } else if (*p == '-') {
|
---|
85 | *q++ = '-';
|
---|
86 | *q++ = '-';
|
---|
87 | *q++ = '-';
|
---|
88 | p++;
|
---|
89 | } else {
|
---|
90 | *q++ = *p++;
|
---|
91 | }
|
---|
92 | first = 0;
|
---|
93 | }
|
---|
94 | p++;
|
---|
95 | *q++ = '\0';
|
---|
96 | len = q - begin;
|
---|
97 | break;
|
---|
98 | case '$':
|
---|
99 | if (*p != '\0')
|
---|
100 | goto dft;
|
---|
101 | *q++ = RE_EOS;
|
---|
102 | break;
|
---|
103 | case '*':
|
---|
104 | if (len == 0)
|
---|
105 | goto dft;
|
---|
106 | type = RE_STAR;
|
---|
107 | range:
|
---|
108 | i = (type == RE_RANGE)? 3 : 1;
|
---|
109 | endp = q + i;
|
---|
110 | begin = q - len;
|
---|
111 | do {
|
---|
112 | --q;
|
---|
113 | *(q + i) = *q;
|
---|
114 | } while (--len > 0);
|
---|
115 | q = begin;
|
---|
116 | *q++ = type;
|
---|
117 | if (type == RE_RANGE) {
|
---|
118 | i = 0;
|
---|
119 | while ((unsigned)(*p - '0') <= 9)
|
---|
120 | i = 10 * i + (*p++ - '0');
|
---|
121 | *q++ = i;
|
---|
122 | if (*p != ',') {
|
---|
123 | *q++ = i;
|
---|
124 | } else {
|
---|
125 | p++;
|
---|
126 | i = 0;
|
---|
127 | while ((unsigned)(*p - '0') <= 9)
|
---|
128 | i = 10 * i + (*p++ - '0');
|
---|
129 | *q++ = i;
|
---|
130 | }
|
---|
131 | if (*p != '\\' || *++p != '}')
|
---|
132 | error("RE error");
|
---|
133 | p++;
|
---|
134 | }
|
---|
135 | q = endp;
|
---|
136 | break;
|
---|
137 | case '\\':
|
---|
138 | if ((c = *p++) == '(') {
|
---|
139 | if (++paren_num > 9)
|
---|
140 | error("RE error");
|
---|
141 | *q++ = RE_LP;
|
---|
142 | *q++ = paren_num;
|
---|
143 | *stackp++ = paren_num;
|
---|
144 | len = 0;
|
---|
145 | } else if (c == ')') {
|
---|
146 | if (stackp == stack)
|
---|
147 | error("RE error");
|
---|
148 | *q++ = RE_RP;
|
---|
149 | *q++ = *--stackp;
|
---|
150 | len = 0;
|
---|
151 | } else if (c == '{') {
|
---|
152 | type = RE_RANGE;
|
---|
153 | goto range;
|
---|
154 | } else if ((unsigned)(c - '1') < 9) {
|
---|
155 | /* should check validity here */
|
---|
156 | *q++ = RE_MATCHED;
|
---|
157 | *q++ = c - '0';
|
---|
158 | len = 2;
|
---|
159 | } else {
|
---|
160 | goto dft;
|
---|
161 | }
|
---|
162 | break;
|
---|
163 | default:
|
---|
164 | dft: *q++ = RE_LITERAL;
|
---|
165 | *q++ = c;
|
---|
166 | len = 2;
|
---|
167 | break;
|
---|
168 | }
|
---|
169 | }
|
---|
170 | out:
|
---|
171 | if (stackp != stack)
|
---|
172 | error("RE error");
|
---|
173 | number_parens = paren_num;
|
---|
174 | return comp;
|
---|
175 | }
|
---|
176 |
|
---|
177 |
|
---|
178 |
|
---|
179 | re_match(pattern, string)
|
---|
180 | char *pattern;
|
---|
181 | char *string;
|
---|
182 | {
|
---|
183 | char **pp;
|
---|
184 |
|
---|
185 | match_begin[0] = string;
|
---|
186 | for (pp = &match_begin[1] ; pp <= &match_begin[9] ; pp++)
|
---|
187 | *pp = 0;
|
---|
188 | return match(pattern, string);
|
---|
189 | }
|
---|
190 |
|
---|
191 |
|
---|
192 |
|
---|
193 | static
|
---|
194 | match(pattern, string)
|
---|
195 | char *pattern;
|
---|
196 | char *string;
|
---|
197 | {
|
---|
198 | register char *p, *q;
|
---|
199 | int counting;
|
---|
200 | int low, high, count;
|
---|
201 | char *curpat;
|
---|
202 | char *start_count;
|
---|
203 | int negate;
|
---|
204 | int found;
|
---|
205 | char *r;
|
---|
206 | int len;
|
---|
207 | char c;
|
---|
208 |
|
---|
209 | p = pattern;
|
---|
210 | q = string;
|
---|
211 | counting = 0;
|
---|
212 | for (;;) {
|
---|
213 | if (counting) {
|
---|
214 | if (++count > high)
|
---|
215 | goto bad;
|
---|
216 | p = curpat;
|
---|
217 | }
|
---|
218 | switch (*p++) {
|
---|
219 | case RE_END:
|
---|
220 | match_length[0] = q - match_begin[0];
|
---|
221 | return 1;
|
---|
222 | case RE_LITERAL:
|
---|
223 | if (*q++ != *p++)
|
---|
224 | goto bad;
|
---|
225 | break;
|
---|
226 | case RE_DOT:
|
---|
227 | if (*q++ == '\0')
|
---|
228 | goto bad;
|
---|
229 | break;
|
---|
230 | case RE_CCL:
|
---|
231 | negate = 0;
|
---|
232 | goto ccl;
|
---|
233 | case RE_NCCL:
|
---|
234 | negate = 1;
|
---|
235 | ccl:
|
---|
236 | found = 0;
|
---|
237 | c = *q++;
|
---|
238 | while (*p) {
|
---|
239 | if (*p == '-') {
|
---|
240 | if (c >= *++p && c <= *++p)
|
---|
241 | found = 1;
|
---|
242 | } else {
|
---|
243 | if (c == *p)
|
---|
244 | found = 1;
|
---|
245 | }
|
---|
246 | p++;
|
---|
247 | }
|
---|
248 | p++;
|
---|
249 | if (found == negate || c == 0)
|
---|
250 | goto bad;
|
---|
251 | break;
|
---|
252 | case RE_LP:
|
---|
253 | match_begin[*p++] = q;
|
---|
254 | break;
|
---|
255 | case RE_RP:
|
---|
256 | match_length[*p] = q - match_begin[*p];
|
---|
257 | p++;
|
---|
258 | break;
|
---|
259 | case RE_MATCHED:
|
---|
260 | r = match_begin[*p];
|
---|
261 | len = match_length[*p++];
|
---|
262 | while (--len >= 0) {
|
---|
263 | if (*q++ != *r++)
|
---|
264 | goto bad;
|
---|
265 | }
|
---|
266 | break;
|
---|
267 | case RE_EOS:
|
---|
268 | if (*q != '\0')
|
---|
269 | goto bad;
|
---|
270 | break;
|
---|
271 | case RE_STAR:
|
---|
272 | low = 0;
|
---|
273 | high = 32767;
|
---|
274 | goto range;
|
---|
275 | case RE_RANGE:
|
---|
276 | low = *p++;
|
---|
277 | high = *p++;
|
---|
278 | if (high == 0)
|
---|
279 | high = 32767;
|
---|
280 | range:
|
---|
281 | curpat = p;
|
---|
282 | start_count = q;
|
---|
283 | count = 0;
|
---|
284 | counting++;
|
---|
285 | break;
|
---|
286 | }
|
---|
287 | }
|
---|
288 | bad:
|
---|
289 | if (! counting)
|
---|
290 | return 0;
|
---|
291 | len = 1;
|
---|
292 | if (*curpat == RE_MATCHED)
|
---|
293 | len = match_length[curpat[1]];
|
---|
294 | while (--count >= low) {
|
---|
295 | if (match(p, start_count + count * len))
|
---|
296 | return 1;
|
---|
297 | }
|
---|
298 | return 0;
|
---|
299 | }
|
---|