1 /* $OpenBSD: str.c,v 1.9 2004/09/15 22:12:19 deraadt Exp $ */
2 /* $NetBSD: str.c,v 1.7 1995/08/31 22:13:47 jtc Exp $ */
3
4 /*-
5 * Copyright (c) 2007
6 * Thorsten Glaser <tg@mirbsd.de>
7 * Copyright (c) 1991, 1993
8 * The Regents of the University of California. All rights reserved.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. Neither the name of the University nor the names of its contributors
19 * may be used to endorse or promote products derived from this software
20 * without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 */
34
35 #include <sys/types.h>
36
37 #include <errno.h>
38 #include <err.h>
39 #include <stdbool.h>
40 #include <wctype.h>
41
42 #include "extern.h"
43
44 __SCCSID("@(#)str.c 8.2 (Berkeley) 4/28/95");
45 __RCSID("$MirOS: src/usr.bin/tr/str.c,v 1.4 2009/05/31 15:47:39 tg Exp $");
46
47 static wchar_t backslash(STR *);
48 static bool bracket(STR *);
49 static void genclass(STR *);
50 static void genequiv(STR *);
51 static bool genrange(STR *);
52 static void genseq(STR *);
53
54 bool
next(STR * s)55 next(STR *s)
56 {
57 wchar_t ch;
58
59 switch (s->state) {
60 case EOS:
61 return (false);
62 case INFINITE:
63 return (true);
64 case NORMAL:
65 switch (ch = *s->str) {
66 case '\0':
67 s->state = EOS;
68 return (false);
69 case '\\':
70 s->lastch = backslash(s);
71 break;
72 case '[':
73 if (bracket(s))
74 return (next(s));
75 /* FALLTHROUGH */
76 default:
77 ++s->str;
78 s->lastch = ch;
79 break;
80 }
81
82 /* We can start a range at any time. */
83 if (s->str[0] == L'-' && genrange(s))
84 return (next(s));
85 return (true);
86 case RANGE:
87 if (s->cnt-- == 0) {
88 s->state = NORMAL;
89 return (next(s));
90 }
91 ++s->lastch;
92 return (true);
93 case SEQUENCE:
94 if (s->cnt-- == 0) {
95 s->state = NORMAL;
96 return (next(s));
97 }
98 return (true);
99 case SET:
100 if ((s->lastch = s->set[s->cnt++]) == OOBCH) {
101 s->state = NORMAL;
102 return (next(s));
103 }
104 return (true);
105 case WCLASS:
106 while (s->cnext != OOBCH) {
107 if (iswctype(s->cnext, s->wclass))
108 break;
109 s->cnext++;
110 }
111 if (s->cnext == OOBCH) {
112 s->state = NORMAL;
113 return (next(s));
114 }
115 s->lastch = s->cnext++;
116 return (true);
117 case WTRANS:
118 while (s->cnext != OOBCH) {
119 if (iswctype(s->cnext, s->wclass) &&
120 (ch = towctrans(s->cnext, s->wtrans)) != s->cnext) {
121 s->lastch = s->which == STRING1 ? s->cnext : ch;
122 s->cnext++;
123 return (true);
124 }
125 s->cnext++;
126 }
127 s->state = NORMAL;
128 return (next(s));
129 default:
130 return (false);
131 }
132 /* NOTREACHED */
133 }
134
135 static bool
bracket(STR * s)136 bracket(STR *s)
137 {
138 wchar_t *p;
139
140 switch (s->str[1]) {
141 case L':': /* "[:class:]" */
142 if ((p = wcsstr(s->str + 2, L":]")) == NULL)
143 return (false);
144 *p = L'\0';
145 s->str += 2;
146 genclass(s);
147 s->str = p + 2;
148 return (true);
149 case L'=': /* "[=equiv=]" */
150 if ((p = wcsstr(s->str + 2, L"=]")) == NULL)
151 return (false);
152 s->str += 2;
153 genequiv(s);
154 return (true);
155 default: /* "[\###*n]" or "[#*n]" */
156 if ((p = wcspbrk(s->str + 2, L"*]")) == NULL)
157 return (false);
158 if (p[0] != L'*' || wcschr(p, L']') == NULL)
159 return (false);
160 s->str += 1;
161 genseq(s);
162 return (true);
163 }
164 /* NOTREACHED */
165 }
166
167 static void
genclass(STR * s)168 genclass(STR *s)
169 {
170 wctype_t tmp;
171 size_t len;
172 char *cp;
173 bool alower;
174
175 if (s->use_wctrans && ((alower = !wcscmp(s->str, L"lower")) ||
176 !wcscmp(s->str, L"upper"))) {
177 alower = (alower && s->which == STRING1) ||
178 (!alower && s->which == STRING2);
179 s->wclass = alower ? wctype("lower") : wctype("upper");
180 s->wtrans = alower ? wctrans("toupper") : wctrans("tolower");
181 s->state = WTRANS;
182 s->cnext = 0;
183 return;
184 }
185
186 len = wcstombs(NULL, s->str, 0);
187 if ((cp = malloc(len + 1)) == NULL)
188 err(1, "out of memory allocating %zu bytes", len);
189 wcstombs(cp, s->str, len + 1);
190
191 if ((tmp = wctype(cp)) == 0)
192 errx(1, "unknown class %s", cp);
193
194 free(cp);
195 s->state = WCLASS;
196 s->wclass = tmp;
197 s->cnext = 0;
198 }
199
200 /*
201 * English doesn't have any equivalence classes, so for now
202 * we just syntax check and grab the character.
203 */
204 static void
genequiv(STR * s)205 genequiv(STR *s)
206 {
207 if (*s->str == L'\\') {
208 s->equiv[0] = backslash(s);
209 if (*s->str != L'=')
210 errx(1, "misplaced equivalence equals sign");
211 } else {
212 s->equiv[0] = s->str[0];
213 if (s->str[1] != L'=')
214 errx(1, "misplaced equivalence equals sign");
215 }
216 s->str += 2;
217 s->cnt = 0;
218 s->state = SET;
219 s->set = s->equiv;
220 }
221
222 static bool
genrange(STR * s)223 genrange(STR *s)
224 {
225 wchar_t stopval, *savestart;
226
227 savestart = s->str;
228 stopval = *++s->str == L'\\' ? backslash(s) : *s->str++;
229 if (stopval < s->lastch) {
230 s->str = savestart;
231 return (false);
232 }
233 s->cnt = stopval - s->lastch + 1;
234 s->state = RANGE;
235 --s->lastch;
236 return (true);
237 }
238
239 static void
genseq(STR * s)240 genseq(STR *s)
241 {
242 wchar_t *ep;
243
244 if (s->which == STRING1)
245 errx(1, "sequences only valid in string2");
246
247 if (*s->str == L'\\')
248 s->lastch = backslash(s);
249 else
250 s->lastch = *s->str++;
251 if (*s->str != L'*')
252 errx(1, "misplaced sequence asterisk");
253
254 switch (*++s->str) {
255 case L'\\':
256 s->cnt = backslash(s);
257 break;
258 case L']':
259 s->cnt = 0;
260 ++s->str;
261 break;
262 default:
263 if (iswdigit(*s->str)) {
264 s->cnt = wcstol(s->str, &ep, 0);
265 if (*ep == L']') {
266 s->str = ep + 1;
267 break;
268 }
269 }
270 errx(1, "illegal sequence count");
271 /* NOTREACHED */
272 }
273
274 s->state = s->cnt ? SEQUENCE : INFINITE;
275 }
276
277 /*
278 * Translate \??? into a character. Up to 3 octal digits, if no digits either
279 * an escape code or a literal character. Spew out a MirOS OPTU-16 raw octet.
280 */
281 static wchar_t
backslash(STR * s)282 backslash(STR *s)
283 {
284 wchar_t ch, cnt, val;
285
286 for (cnt = val = 0;;) {
287 ch = *++s->str;
288 if (ch < L'0' || ch > L'7')
289 break;
290 val = val * 8 + ch - L'0';
291 if (++cnt == 3) {
292 ++s->str;
293 break;
294 }
295 }
296 if (cnt)
297 return ((val &= 0xFF) < 0x80 ? val : 0xEF00 | val);
298 if (ch != L'\0')
299 ++s->str;
300 switch (ch) {
301 case L'a': /* escape characters */
302 return (L'\7');
303 case L'b':
304 return (L'\b');
305 case L'f':
306 return (L'\f');
307 case L'n':
308 return (L'\n');
309 case L'r':
310 return (L'\r');
311 case L't':
312 return (L'\t');
313 case L'v':
314 return (L'\13');
315 case L'\0': /* \" -> \ */
316 s->state = EOS;
317 return (L'\\');
318 default: /* \x" -> x */
319 return (ch);
320 }
321 }
322