1 /*        $NetBSD: ascmagic.c,v 1.1.1.16 2023/08/18 18:36:49 christos Exp $     */
2 
3 /*
4  * Copyright (c) Ian F. Darwin 1986-1995.
5  * Software written by Ian F. Darwin and others;
6  * maintained 1995-present by Christos Zoulas and others.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice immediately at the beginning of the file, without modification,
13  *    this list of conditions, and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in the
16  *    documentation and/or other materials provided with the distribution.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
19  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
22  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28  * SUCH DAMAGE.
29  */
30 /*
31  * ASCII magic -- try to detect text encoding.
32  *
33  * Extensively modified by Eric Fischer <enf@pobox.com> in July, 2000,
34  * to handle character codes other than ASCII on a unified basis.
35  */
36 
37 #include "file.h"
38 
39 #ifndef   lint
40 #if 0
41 FILE_RCSID("@(#)$File: ascmagic.c,v 1.116 2023/05/21 16:08:50 christos Exp $")
42 #else
43 __RCSID("$NetBSD: ascmagic.c,v 1.1.1.16 2023/08/18 18:36:49 christos Exp $");
44 #endif
45 #endif    /* lint */
46 
47 #include "magic.h"
48 #include <string.h>
49 #include <ctype.h>
50 #include <stdlib.h>
51 #ifdef HAVE_UNISTD_H
52 #include <unistd.h>
53 #endif
54 
55 #define MAXLINELEN 300        /* longest sane line length */
56 #define ISSPC(x) ((x) == ' ' || (x) == '\t' || (x) == '\r' || (x) == '\n' \
57                       || (x) == 0x85 || (x) == '\f')
58 
59 file_private unsigned char *encode_utf8(unsigned char *, size_t, file_unichar_t *,
60     size_t);
61 file_private size_t trim_nuls(const unsigned char *, size_t);
62 
/*
 * Return the length of buf with its trailing NUL bytes discounted.
 *
 * This undoes the NUL-termination provided by process().  The result is
 * never reduced below one, so there is always a byte left to look at;
 * a length of 0 or 1 comes back unchanged.
 */
file_private size_t
trim_nuls(const unsigned char *buf, size_t nbytes)
{
	size_t keep;

	for (keep = nbytes; keep > 1; keep--) {
		if (buf[keep - 1] != '\0')
			break;
	}

	return keep;
}
75 
76 file_protected int
file_ascmagic(struct magic_set * ms,const struct buffer * b,int text)77 file_ascmagic(struct magic_set *ms, const struct buffer *b, int text)
78 {
79           file_unichar_t *ubuf = NULL;
80           size_t ulen = 0;
81           int rv = 1;
82           struct buffer bb;
83 
84           const char *code = NULL;
85           const char *code_mime = NULL;
86           const char *type = NULL;
87 
88           bb = *b;
89           bb.flen = trim_nuls(CAST(const unsigned char *, b->fbuf), b->flen);
90           /*
91            * Avoid trimming at an odd byte if the original buffer was evenly
92            * sized; this avoids losing the last character on UTF-16 LE text
93            */
94           if ((bb.flen & 1) && !(b->flen & 1))
95                     bb.flen++;
96 
97           /* If file doesn't look like any sort of text, give up. */
98           if (file_encoding(ms, &bb, &ubuf, &ulen, &code, &code_mime,
99               &type) == 0)
100                     rv = 0;
101         else
102                     rv = file_ascmagic_with_encoding(ms, &bb,
103                         ubuf, ulen, code, type, text);
104 
105           free(ubuf);
106 
107           return rv;
108 }
109 
110 file_protected int
file_ascmagic_with_encoding(struct magic_set * ms,const struct buffer * b,file_unichar_t * ubuf,size_t ulen,const char * code,const char * type,int text)111 file_ascmagic_with_encoding(struct magic_set *ms, const struct buffer *b,
112     file_unichar_t *ubuf, size_t ulen, const char *code, const char *type,
113     int text)
114 {
115           struct buffer bb;
116           const unsigned char *buf = CAST(const unsigned char *, b->fbuf);
117           size_t nbytes = b->flen;
118           unsigned char *utf8_buf = NULL, *utf8_end;
119           size_t mlen, i, len;
120           int rv = -1;
121           int mime = ms->flags & MAGIC_MIME;
122           int need_separator = 0;
123 
124           const char *subtype = NULL;
125 
126           int has_escapes = 0;
127           int has_backspace = 0;
128           int seen_cr = 0;
129 
130           size_t n_crlf = 0;
131           size_t n_lf = 0;
132           size_t n_cr = 0;
133           size_t n_nel = 0;
134           int executable = 0;
135 
136           size_t last_line_end = CAST(size_t, -1);
137           size_t has_long_lines = 0;
138 
139           nbytes = trim_nuls(buf, nbytes);
140 
141           /* If we have fewer than 2 bytes, give up. */
142           if (nbytes <= 1) {
143                     rv = 0;
144                     goto done;
145           }
146 
147           if (ulen > 0 && (ms->flags & MAGIC_NO_CHECK_SOFT) == 0) {
148                     /* Convert ubuf to UTF-8 and try text soft magic */
149                     /* malloc size is a conservative overestimate; could be
150                        improved, or at least realloced after conversion. */
151                     mlen = ulen * 6;
152                     if ((utf8_buf = CAST(unsigned char *, malloc(mlen))) == NULL) {
153                               file_oomem(ms, mlen);
154                               goto done;
155                     }
156                     if ((utf8_end = encode_utf8(utf8_buf, mlen, ubuf, ulen))
157                         == NULL) {
158                               rv = 0;
159                               goto done;
160                     }
161                     buffer_init(&bb, b->fd, &b->st, utf8_buf,
162                         CAST(size_t, utf8_end - utf8_buf));
163 
164                     if ((rv = file_softmagic(ms, &bb, NULL, NULL,
165                         TEXTTEST, text)) == 0)
166                               rv = -1;
167                     else
168                               need_separator = 1;
169                     buffer_fini(&bb);
170                     if ((ms->flags & (MAGIC_APPLE|MAGIC_EXTENSION))) {
171                               rv = rv == -1 ? 0 : 1;
172                               goto done;
173                     }
174           }
175 
176           if ((ms->flags & (MAGIC_APPLE|MAGIC_EXTENSION))) {
177                     rv = 0;
178                     goto done;
179           }
180 
181           /* Now try to discover other details about the file. */
182           for (i = 0; i < ulen; i++) {
183                     if (ubuf[i] == '\n') {
184                               if (seen_cr)
185                                         n_crlf++;
186                               else
187                                         n_lf++;
188                               last_line_end = i;
189                     } else if (seen_cr)
190                               n_cr++;
191 
192                     seen_cr = (ubuf[i] == '\r');
193                     if (seen_cr)
194                               last_line_end = i;
195 
196                     if (ubuf[i] == 0x85) { /* X3.64/ECMA-43 "next line" character */
197                               n_nel++;
198                               last_line_end = i;
199                     }
200 
201                     /* If this line is _longer_ than MAXLINELEN, remember it. */
202                     if (i > last_line_end + MAXLINELEN) {
203                               size_t ll = i - last_line_end;
204                               if (ll > has_long_lines)
205                                         has_long_lines = ll;
206                     }
207 
208                     if (ubuf[i] == '\033')
209                               has_escapes = 1;
210                     if (ubuf[i] == '\b')
211                               has_backspace = 1;
212           }
213 
214           if (strcmp(type, "binary") == 0) {
215                     rv = 0;
216                     goto done;
217           }
218           len = file_printedlen(ms);
219           if (mime) {
220                     if ((mime & MAGIC_MIME_TYPE) != 0) {
221                               if (len) {
222                                         /*
223                                          * Softmagic printed something, we
224                                          * are either done, or we need a separator
225                                          */
226                                         if ((ms->flags & MAGIC_CONTINUE) == 0) {
227                                                   rv = 1;
228                                                   goto done;
229                                         }
230                                         if (need_separator && file_separator(ms) == -1)
231                                                   goto done;
232                               }
233                               if (file_printf(ms, "text/plain") == -1)
234                                         goto done;
235                     }
236           } else {
237                     if (len) {
238                               switch (file_replace(ms, " text$", ", ")) {
239                               case 0:
240                                         switch (file_replace(ms, " text executable$",
241                                             ", ")) {
242                                         case 0:
243                                                   if (file_printf(ms, ", ") == -1)
244                                                             goto done;
245                                                   break;
246                                         case -1:
247                                                   goto done;
248                                         default:
249                                                   executable = 1;
250                                                   break;
251                                         }
252                                         break;
253                               case -1:
254                                         goto done;
255                               default:
256                                         break;
257                               }
258                     }
259 
260                     if (file_printf(ms, "%s", code) == -1)
261                               goto done;
262 
263                     if (subtype) {
264                               if (file_printf(ms, " %s", subtype) == -1)
265                                         goto done;
266                     }
267 
268                     if (file_printf(ms, " %s", type) == -1)
269                               goto done;
270 
271                     if (executable)
272                               if (file_printf(ms, " executable") == -1)
273                                         goto done;
274 
275                     if (has_long_lines)
276                               if (file_printf(ms, ", with very long lines (%"
277                                   SIZE_T_FORMAT "u)", has_long_lines) == -1)
278                                         goto done;
279 
280                     /*
281                      * Only report line terminators if we find one other than LF,
282                      * or if we find none at all.
283                      */
284                     if ((n_crlf == 0 && n_cr == 0 && n_nel == 0 && n_lf == 0) ||
285                         (n_crlf != 0 || n_cr != 0 || n_nel != 0)) {
286                               if (file_printf(ms, ", with") == -1)
287                                         goto done;
288 
289                               if (n_crlf == 0 && n_cr == 0 &&
290                                   n_nel == 0 && n_lf == 0) {
291                                         if (file_printf(ms, " no") == -1)
292                                                   goto done;
293                               } else {
294                                         if (n_crlf) {
295                                                   if (file_printf(ms, " CRLF") == -1)
296                                                             goto done;
297                                                   if (n_cr || n_lf || n_nel)
298                                                             if (file_printf(ms, ",") == -1)
299                                                                       goto done;
300                                         }
301                                         if (n_cr) {
302                                                   if (file_printf(ms, " CR") == -1)
303                                                             goto done;
304                                                   if (n_lf || n_nel)
305                                                             if (file_printf(ms, ",") == -1)
306                                                                       goto done;
307                                         }
308                                         if (n_lf) {
309                                                   if (file_printf(ms, " LF") == -1)
310                                                             goto done;
311                                                   if (n_nel)
312                                                             if (file_printf(ms, ",") == -1)
313                                                                       goto done;
314                                         }
315                                         if (n_nel)
316                                                   if (file_printf(ms, " NEL") == -1)
317                                                             goto done;
318                               }
319 
320                               if (file_printf(ms, " line terminators") == -1)
321                                         goto done;
322                     }
323 
324                     if (has_escapes)
325                               if (file_printf(ms, ", with escape sequences") == -1)
326                                         goto done;
327                     if (has_backspace)
328                               if (file_printf(ms, ", with overstriking") == -1)
329                                         goto done;
330           }
331           rv = 1;
332 done:
333           free(utf8_buf);
334 
335           return rv;
336 }
337 
/*
 * Encode the ulen characters in ubuf as UTF-8 into buf, which has room
 * for len bytes.
 *
 * Values up to 0x7fffffff are accepted and written with one to six
 * bytes each.  Returns a pointer to the byte after the last one
 * written, or NULL if a character is larger than 0x7fffffff or does
 * not fit in the space remaining.  No terminating NUL is written.
 */
file_private unsigned char *
encode_utf8(unsigned char *buf, size_t len, file_unichar_t *ubuf, size_t ulen)
{
	unsigned char *const limit = buf + len;
	unsigned char *out = buf;
	size_t idx;

	for (idx = 0; idx < ulen; idx++) {
		const file_unichar_t ch = ubuf[idx];
		unsigned int lead;	/* marker bits of the first byte */
		int tail;		/* number of continuation bytes */

		if (ch <= 0x7f) {
			lead = 0x00;
			tail = 0;
		} else if (ch <= 0x7ff) {
			lead = 0xc0;
			tail = 1;
		} else if (ch <= 0xffff) {
			lead = 0xe0;
			tail = 2;
		} else if (ch <= 0x1fffff) {
			lead = 0xf0;
			tail = 3;
		} else if (ch <= 0x3ffffff) {
			lead = 0xf8;
			tail = 4;
		} else if (ch <= 0x7fffffff) {
			lead = 0xfc;
			tail = 5;
		} else {
			/* Invalid character */
			return NULL;
		}

		/* The whole sequence must fit before any of it is written. */
		if (limit - out < tail + 1)
			return NULL;

		/* First byte: marker plus the most significant bits. */
		*out++ = CAST(unsigned char, (ch >> (6 * tail)) + lead);

		/* Then six bits per continuation byte, high bits first. */
		while (tail-- > 0)
			*out++ = CAST(unsigned char,
			    ((ch >> (6 * tail)) & 0x3f) + 0x80);
	}

	return out;
}
396