1 /*
2  * Copyright (C) 2013-2015  Internet Systems Consortium, Inc. ("ISC")
3  *
4  * Permission to use, copy, modify, and/or distribute this software for any
5  * purpose with or without fee is hereby granted, provided that the above
6  * copyright notice and this permission notice appear in all copies.
7  *
8  * THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH
9  * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
10  * AND FITNESS.  IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT,
11  * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
12  * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
13  * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
14  * PERFORMANCE OF THIS SOFTWARE.
15  */
16 
17 #include <config.h>
18 
19 #include <isc/file.h>
20 #include <isc/print.h>
21 #include <isc/regex.h>
22 #include <isc/string.h>
23 
/*
 * FAIL(x) abandons validation by jumping to the 'error' label of the
 * enclosing function.
 *
 * When VALREGEX_REPORT_REASON is non-zero the message 'x' is first stored
 * in the local variable 'reason' of the enclosing function.  Otherwise 'x'
 * is evaluated and discarded, so the argument is still checked by the
 * compiler in both configurations.
 *
 * Both variants are wrapped in do { } while (0) so that FAIL(x); behaves
 * as a single statement, including inside an unbraced if/else.
 */
#ifndef VALREGEX_REPORT_REASON
#define VALREGEX_REPORT_REASON 0
#endif

#if VALREGEX_REPORT_REASON
#define FAIL(x) do { reason = (x); goto error; } while (0)
#else
#define FAIL(x) do { (void)(x); goto error; } while (0)
#endif
29 
30 /*
31  * Validate the regular expression 'C' locale.
32  */
33 int
isc_regex_validate(const char * c)34 isc_regex_validate(const char *c) {
35 	enum {
36 		none, parse_bracket, parse_bound,
37 		parse_ce, parse_ec, parse_cc
38 	} state = none;
39 	/* Well known character classes. */
40 	const char *cc[] = {
41 		":alnum:", ":digit:", ":punct:", ":alpha:", ":graph:",
42 		":space:", ":blank:", ":lower:", ":upper:", ":cntrl:",
43 		":print:", ":xdigit:"
44 	};
45 	isc_boolean_t seen_comma = ISC_FALSE;
46 	isc_boolean_t seen_high = ISC_FALSE;
47 	isc_boolean_t seen_char = ISC_FALSE;
48 	isc_boolean_t seen_ec = ISC_FALSE;
49 	isc_boolean_t seen_ce = ISC_FALSE;
50 	isc_boolean_t have_atom = ISC_FALSE;
51 	int group = 0;
52 	int range = 0;
53 	int sub = 0;
54 	isc_boolean_t empty_ok = ISC_FALSE;
55 	isc_boolean_t neg = ISC_FALSE;
56 	isc_boolean_t was_multiple = ISC_FALSE;
57 	unsigned int low = 0;
58 	unsigned int high = 0;
59 	const char *ccname = NULL;
60 	int range_start = 0;
61 #if VALREGEX_REPORT_REASON
62 	const char *reason = "";
63 #endif
64 
65 	if (c == NULL || *c == 0)
66 		FAIL("empty string");
67 
68 	while (c != NULL && *c != 0) {
69 		switch (state) {
70 		case none:
71 			switch (*c) {
72 			case '\\':	/* make literal */
73 				++c;
74 				switch (*c) {
75 				case '1': case '2': case '3':
76 				case '4': case '5': case '6':
77 				case '7': case '8': case '9':
78 					if ((*c - '0') > sub)
79 						FAIL("bad back reference");
80 					have_atom = ISC_TRUE;
81 					was_multiple = ISC_FALSE;
82 					break;
83 				case 0:
84 					FAIL("escaped end-of-string");
85 				default:
86 					goto literal;
87 				}
88 				++c;
89 				break;
90 			case '[':	/* bracket start */
91 				++c;
92 				neg = ISC_FALSE;
93 				was_multiple = ISC_FALSE;
94 				seen_char = ISC_FALSE;
95 				state = parse_bracket;
96 				break;
97 			case '{': 	/* bound start */
98 				switch (c[1]) {
99 				case '0': case '1': case '2': case '3':
100 				case '4': case '5': case '6': case '7':
101 				case '8': case '9':
102 					if (!have_atom)
103 						FAIL("no atom");
104 					if (was_multiple)
105 						FAIL("was multiple");
106 					seen_comma = ISC_FALSE;
107 					seen_high = ISC_FALSE;
108 					low = high = 0;
109 					state = parse_bound;
110 					break;
111 				default:
112 					goto literal;
113 				}
114 				++c;
115 				have_atom = ISC_TRUE;
116 				was_multiple = ISC_TRUE;
117 				break;
118 			case '}':
119 				goto literal;
120 			case '(':	/* group start */
121 				have_atom = ISC_FALSE;
122 				was_multiple = ISC_FALSE;
123 				empty_ok = ISC_TRUE;
124 				++group;
125 				++sub;
126 				++c;
127 				break;
128 			case ')':	/* group end */
129 				if (group && !have_atom && !empty_ok)
130 					FAIL("empty alternative");
131 				have_atom = ISC_TRUE;
132 				was_multiple = ISC_FALSE;
133 				if (group != 0)
134 					--group;
135 				++c;
136 				break;
137 			case '|':	/* alternative seperator */
138 				if (!have_atom)
139 					FAIL("no atom");
140 				have_atom = ISC_FALSE;
141 				empty_ok = ISC_FALSE;
142 				was_multiple = ISC_FALSE;
143 				++c;
144 				break;
145 			case '^':
146 			case '$':
147 				have_atom = ISC_TRUE;
148 				was_multiple = ISC_TRUE;
149 				++c;
150 				break;
151 			case '+':
152 			case '*':
153 			case '?':
154 				if (was_multiple)
155 					FAIL("was multiple");
156 				if (!have_atom)
157 					FAIL("no atom");
158 				have_atom = ISC_TRUE;
159 				was_multiple = ISC_TRUE;
160 				++c;
161 				break;
162 			case '.':
163 			default:
164 			literal:
165 				have_atom = ISC_TRUE;
166 				was_multiple = ISC_FALSE;
167 				++c;
168 				break;
169 			}
170 			break;
171 		case parse_bound:
172 			switch (*c) {
173 			case '0': case '1': case '2': case '3': case '4':
174 			case '5': case '6': case '7': case '8': case '9':
175 				if (!seen_comma) {
176 					low = low * 10 + *c - '0';
177 					if (low > 255)
178 						FAIL("lower bound too big");
179 				} else {
180 					seen_high = ISC_TRUE;
181 					high = high * 10 + *c - '0';
182 					if (high > 255)
183 						FAIL("upper bound too big");
184 				}
185 				++c;
186 				break;
187 			case ',':
188 				if (seen_comma)
189 					FAIL("multiple commas");
190 				seen_comma = ISC_TRUE;
191 				++c;
192 				break;
193 			default:
194 			case '{':
195 				FAIL("non digit/comma");
196 			case '}':
197 				if (seen_high && low > high)
198 					FAIL("bad parse bound");
199 				seen_comma = ISC_FALSE;
200 				state = none;
201 				++c;
202 				break;
203 			}
204 			break;
205 		case parse_bracket:
206 			switch (*c) {
207 			case '^':
208 				if (seen_char || neg) goto inside;
209 				neg = ISC_TRUE;
210 				++c;
211 				break;
212 			case '-':
213 				if (range == 2) goto inside;
214 				if (!seen_char) goto inside;
215 				if (range == 1)
216 					FAIL("bad range");
217 				range = 2;
218 				++c;
219 				break;
220 			case '[':
221 				++c;
222 				switch (*c) {
223 				case '.':	/* collating element */
224 					if (range != 0) --range;
225 					++c;
226 					state = parse_ce;
227 					seen_ce = ISC_FALSE;
228 					break;
229 				case '=':	/* equivalence class */
230 					if (range == 2)
231 					    FAIL("equivalence class in range");
232 					++c;
233 					state = parse_ec;
234 					seen_ec = ISC_FALSE;
235 					break;
236 				case ':':	/* character class */
237 					if (range == 2)
238 					      FAIL("character class in range");
239 					ccname = c;
240 					++c;
241 					state = parse_cc;
242 					break;
243 				}
244 				seen_char = ISC_TRUE;
245 				break;
246 			case ']':
247 				if (!c[1] && !seen_char)
248 					FAIL("unfinished brace");
249 				if (!seen_char)
250 					goto inside;
251 				++c;
252 				range = 0;
253 				have_atom = ISC_TRUE;
254 				state = none;
255 				break;
256 			default:
257 			inside:
258 				seen_char = ISC_TRUE;
259 				if (range == 2 && (*c & 0xff) < range_start)
260 					FAIL("out of order range");
261 				if (range != 0)
262 					--range;
263 				range_start = *c & 0xff;
264 				++c;
265 				break;
266 			};
267 			break;
268 		case parse_ce:
269 			switch (*c) {
270 			case '.':
271 				++c;
272 				switch (*c) {
273 				case ']':
274 					if (!seen_ce)
275 						 FAIL("empty ce");
276 					++c;
277 					state = parse_bracket;
278 					break;
279 				default:
280 					if (seen_ce)
281 						range_start = 256;
282 					else
283 						range_start = '.';
284 					seen_ce = ISC_TRUE;
285 					break;
286 				}
287 				break;
288 			default:
289 				if (seen_ce)
290 					range_start = 256;
291 				else
292 					range_start = *c;
293 				seen_ce = ISC_TRUE;
294 				++c;
295 				break;
296 			}
297 			break;
298 		case parse_ec:
299 			switch (*c) {
300 			case '=':
301 				++c;
302 				switch (*c) {
303 				case ']':
304 					if (!seen_ec)
305 						FAIL("no ec");
306 					++c;
307 					state = parse_bracket;
308 					break;
309 				default:
310 					seen_ec = ISC_TRUE;
311 					break;
312 				}
313 				break;
314 			default:
315 				seen_ec = ISC_TRUE;
316 				++c;
317 				break;
318 			}
319 			break;
320 		case parse_cc:
321 			switch (*c) {
322 			case ':':
323 				++c;
324 				switch (*c) {
325 				case ']': {
326 					unsigned int i;
327 					isc_boolean_t found = ISC_FALSE;
328 					for (i = 0;
329 					     i < sizeof(cc)/sizeof(*cc);
330 					     i++)
331 					{
332 						unsigned int len;
333 						len = strlen(cc[i]);
334 						if (len !=
335 						    (unsigned int)(c - ccname))
336 							continue;
337 						if (strncmp(cc[i], ccname, len))
338 							continue;
339 						found = ISC_TRUE;
340 					}
341 					if (!found)
342 						FAIL("unknown cc");
343 					++c;
344 					state = parse_bracket;
345 					break;
346 					}
347 				default:
348 					break;
349 				}
350 				break;
351 			default:
352 				++c;
353 				break;
354 			}
355 			break;
356 		}
357 	}
358 	if (group != 0)
359 		FAIL("group open");
360 	if (state != none)
361 		FAIL("incomplete");
362 	if (!have_atom)
363 		FAIL("no atom");
364 	return (sub);
365 
366  error:
367 #if VALREGEX_REPORT_REASON
368 	fprintf(stderr, "%s\n", reason);
369 #endif
370 	return (-1);
371 }
372