1 /*        $NetBSD: mime_header.c,v 1.9 2013/02/14 18:23:45 christos Exp $       */
2 
3 /*-
4  * Copyright (c) 2006 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Anon Ymous.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 
33 /*
34  * This module contains the core MIME header decoding routines.
35  * Please refer to RFC 2047 and RFC 2822.
36  */
37 
38 #ifdef MIME_SUPPORT
39 
40 #include <sys/cdefs.h>
41 #ifndef __lint__
42 __RCSID("$NetBSD: mime_header.c,v 1.9 2013/02/14 18:23:45 christos Exp $");
43 #endif /* not __lint__ */
44 
45 #include <assert.h>
46 #include <stdio.h>
47 #include <stdlib.h>
48 #include <string.h>
49 
50 #include "def.h"
51 #include "extern.h"
52 #include "mime.h"
53 #include "mime_header.h"
54 #include "mime_codecs.h"
55 
/*
 * Copy the charset name of an encoded word into from_cs.
 *
 * p points at the first character of the charset name (just past the
 * leading "=?").  Characters are copied up to, but not including, the
 * next '?' and the copy is NUL terminated.
 *
 * from_cs     - destination buffer for the charset name.
 * from_cs_len - size of from_cs in bytes, including room for the NUL.
 * p           - NUL terminated input positioned at the charset name.
 *
 * Returns a pointer to the character following the '?' on success.
 * Returns NULL if the input ends before a '?' is seen or if the name
 * does not fit in from_cs; in that case from_cs may hold a partial,
 * unterminated copy.
 */
static const char *
grab_charset(char *from_cs, size_t from_cs_len, const char *p)
{
	char *q, *qlast;

	/* A zero-sized buffer cannot even hold the terminating NUL. */
	if (from_cs_len == 0)
		return NULL;

	qlast = from_cs + from_cs_len - 1;	/* slot reserved for the NUL */
	q = from_cs;
	for (/*EMPTY*/; *p != '?'; p++) {
		if (*p == '\0' || q >= qlast)
			return NULL;
		*q++ = *p;
	}
	*q = '\0';
	return ++p;	/* if here, then we got the '?' */
}
69 
70 /*
71  * An encoded word is a string of at most 75 non-white space
72  * characters of the following form:
73  *
74  *  =?charset?X?encoding?=
75  *
76  * where:
77  *   'charset'      is the original character set of the unencoded string.
78  *
79  *   'X'  is the encoding type 'B' or 'Q' for "base64" or
80  *              "quoted-printable", respectively,
81  *   'encoding'     is the encoded string.
82  *
83  * Both 'charset' and 'X' are case independent and 'encoding' cannot
84  * contain any whitespace or '?' characters.  The 'encoding' must also
85  * be fully contained within the encoded words, i.e., it cannot be
86  * split between encoded words.
87  *
88  * Note: the 'B' encoding is a slightly modified "quoted-printable"
89  * encoding.  In particular, spaces (' ') may be encoded as '_' to
90  * improve undecoded readability.
91  */
92 static int
decode_word(const char ** ibuf,char ** obuf,char * oend,const char * to_cs)93 decode_word(const char **ibuf, char **obuf, char *oend, const char *to_cs)
94 {
95           ssize_t declen;
96           size_t enclen, dstlen;
97           char decword[LINESIZE];
98           char from_cs[LINESIZE];
99           const char *encword, *iend, *p;
100           char *dstend;
101           char enctype;
102 
103           p = *ibuf;
104           if (p[0] != '=' && p[1] != '?')
105                     return -1;
106           if (strlen(p) <  2 + 1 + 3 + 1 + 2)
107                     return -1;
108           p = grab_charset(from_cs, sizeof(from_cs), p + 2);
109           if (p == NULL)
110                     return -1;
111           enctype = *p++;
112           if (*p++ != '?')
113                     return -1;
114           encword = p;
115           p = strchr(p, '?');
116           if (p == NULL || p[1] != '=')
117                     return -1;
118           enclen = p - encword;         /* length of encoded substring */
119           iend = p + 2;
120           /* encoded words are at most 75 characters (RFC 2047, sec 2) */
121           if (iend > *ibuf + 75)
122                     return -1;
123 
124           if (oend < *obuf + 1) {
125                     assert(/*CONSTCOND*/ 0);      /* We have a coding error! */
126                     return -1;
127           }
128           dstend = to_cs ? decword : *obuf;
129           dstlen = (to_cs ? sizeof(decword) : (size_t)(oend - *obuf)) - 1;
130 
131           declen = mime_rfc2047_decode(enctype, dstend, dstlen, encword, enclen);
132           if (declen == -1)
133                     return -1;
134 
135           dstend += declen;
136 #ifdef CHARSET_SUPPORT
137           if (to_cs != NULL) {
138                     iconv_t cd;
139                     const char *src;
140                     size_t srclen;
141                     size_t cnt;
142 
143                     cd = iconv_open(to_cs, from_cs);
144                     if (cd == (iconv_t)-1)
145                               return -1;
146 
147                     src = decword;
148                     srclen = declen;
149                     dstend = *obuf;
150                     dstlen = oend - *obuf - 1;
151                     cnt = mime_iconv(cd, &src, &srclen, &dstend, &dstlen);
152 
153                     (void)iconv_close(cd);
154                     if (cnt == (size_t)-1)
155                               return -1;
156           }
157 #endif /* CHARSET_SUPPORT */
158           *dstend = '\0';
159           *ibuf = iend;
160           *obuf = dstend;
161           return 0;
162 }
163 
164 
/*
 * Folding White Space (FWS); see RFC 2822.
 *
 * RFC 2822 only allows '\r' and '\n' as part of a CRLF pair, but by
 * the time mail(1) sees a message every CRLF has already been turned
 * into a lone '\n'.  Hence FWS here is exactly: space, tab, newline.
 *
 * XXX - pull is_FWS() and skip_FWS() up to def.h?
 *
 * Returns 1 if c is a folding white space character, 0 otherwise.
 */
static inline int
is_FWS(int c)
{
	switch (c) {
	case ' ':
	case '\t':
	case '\n':
		return 1;
	default:
		return 0;
	}
}
180 
/*
 * Return a pointer to the first character at or after p that is not
 * folding white space.  A NUL is not FWS, so the scan stops at the
 * end of a NUL terminated string.
 */
static inline const char *
skip_FWS(const char *p)
{
	for (/*EMPTY*/; is_FWS(*p); p++)
		continue;
	return p;
}
188 
/*
 * Flush a pending run of skipped characters to the output.
 *
 * If *src is non-NULL, the bytes in [*src, srcend) are copied to *dst,
 * stopping early once dstend is reached.  *dst is advanced past what
 * was written and *src is reset to NULL so the same run is never
 * copied twice.  If *src is NULL nothing is touched.
 */
static inline void
copy_skipped_FWS(char **dst, char *dstend, const char **src, const char *srcend)
{
	const char *from;
	char *to;

	if (*src == NULL)
		return;		/* nothing pending */

	to = *dst;
	for (from = *src; from < srcend && to < dstend; from++)
		*to++ = *from;

	*dst = to;
	*src = NULL;
}
207 
208 /*
209  * Decode an unstructured field.
210  *
211  * See RFC 2822 Sec 2.2.1 and 3.6.5.
212  * Encoded words may occur anywhere in unstructured fields provided
213  * they are separated from any other text or encoded words by at least
214  * one linear-white-space character. (See RFC 2047 sec 5.1.)  If two
215  * encoded words occur sequentially (separated by only FWS) then the
216  * separating FWS is removed.
217  *
218  * NOTE: unstructured fields cannot contain 'quoted-pairs' (see
219  * RFC2822 sec 3.2.6 and RFC 2047), but that is no problem as a '\\'
220  * (or any non-whitespace character) immediately before an
221  * encoded-word will prevent it from being decoded.
222  *
223  * hstring should be a NULL terminated string.
224  * outbuf should be sufficiently large to hold the result.
225  */
226 static void
mime_decode_usfield(char * outbuf,size_t outsize,const char * hstring)227 mime_decode_usfield(char *outbuf, size_t outsize, const char *hstring)
228 {
229           const char *p, *p0;
230           char *q, *qend;
231           int lastc;
232           const char *charset;
233 
234           charset = value(ENAME_MIME_CHARSET);
235           qend = outbuf + outsize - 1; /* Make sure there is room for the trailing NULL! */
236           q = outbuf;
237           p = hstring;
238           p0 = NULL;
239           lastc = (unsigned char)' ';
240           while (*p && q < qend) {
241                     const char *p1;
242                     char *q1;
243                     if (is_FWS(lastc) && p[0] == '=' && p[1] == '?' &&
244                         decode_word((p1 = p, &p1), (q1 = q, &q1), qend, charset) == 0 &&
245                         (*p1 == '\0' || is_FWS(*p1))) {
246                               p0 = p1;  /* pointer to first character after encoded word */
247                               q = q1;
248                               p = skip_FWS(p1);
249                               lastc = (unsigned char)*p0;
250                     }
251                     else {
252                               copy_skipped_FWS(&q, qend, &p0, p);
253                               lastc = (unsigned char)*p;
254                               if (q < qend)
255                                         *q++ = *p++;
256                     }
257           }
258           copy_skipped_FWS(&q, qend, &p0, p);
259           *q = '\0';
260 }
261 
/*
 * Decode a field comment.
 *
 * Comments only occur in structured fields, may nest (RFC 2822
 * sec 3.2.3), and may contain 'encoded-words' and 'quoted-pairs'.
 * Apart from that they behave like unstructured text bounded by
 * '(' and ')'.
 *
 * On entry *ibuf points just past the opening '(' (which the caller
 * has already copied).  Decoding stops after the matching ')' has
 * been copied, or when either the input or the output is exhausted.
 *
 * obuf/oend - output cursor and end of the output buffer.
 * ibuf/iend - input cursor and end of the input.
 * charset   - target character set handed to decode_word().
 *
 * Returns 0 with *ibuf and *obuf advanced.  Returns -1, leaving both
 * untouched, if decoding a nested comment returns -1.
 */
static int
decode_comment(char **obuf, char *oend, const char **ibuf, const char *iend, const char *charset)
{
	const char *in = *ibuf;
	const char *pending = NULL;	/* FWS skipped after an encoded word */
	char *out = *obuf;
	int prev = ' ';

	while (in < iend && out < oend) {
		const char *wend = in;
		char *wout = out;

		if (is_FWS(prev) && in[0] == '=' && in[1] == '?' &&
		    decode_word(&wend, &wout, oend, charset) == 0 &&
		    (*wend == ')' || is_FWS(*wend))) {
			prev = (unsigned char)*wend;
			pending = wend;
			out = wout;
			in = skip_FWS(wend);
			/*
			 * XXX - this check should be unnecessary as *iend
			 * should be '\0' which will stop skip_FWS()
			 */
			if (in > iend)
				in = iend;
			continue;
		}

		copy_skipped_FWS(&out, oend, &pending, in);
		if (out >= oend)	/* XXX - out > oend cannot happen */
			break;

		if (*in == ')') {
			*out++ = *in++;		/* copy the closing ')' */
			break;			/* and we are done */
		}

		if (*in == '(') {
			*out++ = *in++;		/* copy the opening '(' */
			if (decode_comment(&out, oend, &in, iend, charset) == -1)
				return -1;	/* is this right or should we update? */
			prev = ')';
			continue;
		}

		if (*in == '\\' && in + 1 < iend) {	/* quoted-pair */
			/* keep the '\\' only where it is still required */
			if (in[1] == '(' || in[1] == ')' || in[1] == '\\')
				*out++ = *in;
			in++;
			prev = (unsigned char)*in;
			if (out < oend)
				*out++ = *in++;
			continue;
		}

		prev = (unsigned char)*in;
		*out++ = *in++;
	}
	*ibuf = in;
	*obuf = out;
	return 0;
}
335 
/*
 * Decode a quoted-string or no-fold-quote.
 *
 * These cannot contain encoded words.  They can contain quoted-pairs,
 * which makes '\\' special; there is no other structure.  See
 * RFC 2822 sec 3.2.5 and 3.6.4.
 *
 * On entry *ibuf points just past the opening '"'.  Text is copied up
 * to and including the closing '"'.  The '\\' of a quoted-pair is
 * kept only in front of '"' or '\\'; otherwise it is dropped and the
 * quoted character is copied by itself.  Copying stops early when the
 * input (iend) or the output (oend) runs out.  *ibuf and *obuf are
 * advanced past what was consumed and produced.
 */
static void
decode_quoted_string(char **obuf, char *oend, const char **ibuf, const char *iend)
{
	const char *in = *ibuf;
	char *out = *obuf;

	while (in < iend && out < oend) {
		const char c = *in;

		if (c == '"') {
			*out++ = *in++;		/* copy the closing '"' */
			break;
		}
		if (c == '\\' && in + 1 < iend) {	/* quoted-pair */
			const char quoted = in[1];

			if (quoted == '"' || quoted == '\\') {
				*out++ = c;
				if (out >= oend)
					break;
			}
			in++;
		}
		*out++ = *in++;
	}
	*ibuf = in;
	*obuf = out;
}
371 
/*
 * Decode a domain-literal or no-fold-literal.
 *
 * These cannot contain encoded words.  They can contain quoted-pairs
 * and are delimited by '[' and ']', which makes '\\', '[' and ']'
 * special; there is no other structure.  See RFC 2822 sec 3.4.1 and
 * 3.6.4.
 *
 * On entry *ibuf points just past the opening '['.  Text is copied up
 * to and including the closing ']'.  The '\\' of a quoted-pair is
 * kept only in front of '[', ']' or '\\'; otherwise it is dropped and
 * the quoted character is copied by itself.  Copying stops early when
 * the input (iend) or the output (oend) runs out.  *ibuf and *obuf
 * are advanced past what was consumed and produced.
 */
static void
decode_domain_literal(char **obuf, char *oend, const char **ibuf, const char *iend)
{
	const char *in = *ibuf;
	char *out = *obuf;

	while (in < iend && out < oend) {
		const char c = *in;

		if (c == ']') {
			*out++ = *in++;		/* copy the closing ']' */
			break;
		}
		if (c == '\\' && in + 1 < iend) {	/* quoted-pair */
			const char quoted = in[1];

			if (quoted == '[' || quoted == ']' || quoted == '\\') {
				*out++ = c;
				if (out >= oend)
					break;
			}
			in++;
		}
		*out++ = *in++;
	}
	*ibuf = in;
	*obuf = out;
}
407 
/*
 * Specials: see RFC 2822 sec 3.2.1.
 *
 * Returns 1 if c is one of the characters
 *
 *	( ) < > [ ] : ; @ \ , . "
 *
 * and 0 for everything else, including values outside 0..127.
 */
static inline int
is_specials(int c)
{
	switch (c) {
	case '(':
	case ')':
	case '<':
	case '>':
	case '[':
	case ']':
	case ':':
	case ';':
	case '@':
	case '\\':
	case ',':
	case '.':
	case '"':
		return 1;
	default:
		return 0;
	}
}
427 
428 /*
429  * Decode a structured field.
430  *
431  * At the top level, structured fields can only contain encoded-words
432  * via 'phrases' and 'comments'.  See RFC 2047 sec 5.
433  */
434 static void
mime_decode_sfield(char * linebuf,size_t bufsize,const char * hstring)435 mime_decode_sfield(char *linebuf, size_t bufsize, const char *hstring)
436 {
437           const char *p, *pend, *p0;
438           char *q, *qend;
439           const char *charset;
440           int lastc;
441 
442           charset = value(ENAME_MIME_CHARSET);
443 
444           p = hstring;
445           q = linebuf;
446           pend = hstring + strlen(hstring);
447           qend = linebuf + bufsize - 1; /* save room for the NULL terminator */
448           lastc = (unsigned char)' ';
449           p0 = NULL;
450           while (p < pend && q < qend) {
451                     const char *p1;
452                     char *q1;
453 
454                     if (*p != '=') {
455                               copy_skipped_FWS(&q, qend, &p0, p);
456                               if (q >= qend)
457                                         break;
458                     }
459 
460                     switch (*p) {
461                     case '(': /* start of comment */
462                               *q++ = *p++;        /* copy the opening '(' */
463                               (void)decode_comment(&q, qend, &p, pend, charset);
464                               lastc = (unsigned char)p[-1];
465                               break;
466 
467                     case '"': /* start of quoted-string or no-fold-quote */
468                               *q++ = *p++;        /* copy the opening '"' */
469                               decode_quoted_string(&q, qend, &p, pend);
470                               lastc = (unsigned char)p[-1];
471                               break;
472 
473                     case '[': /* start of domain-literal or no-fold-literal */
474                               *q++ = *p++;        /* copy the opening '[' */
475                               decode_domain_literal(&q, qend, &p, pend);
476                               lastc = (unsigned char)p[-1];
477                               break;
478 
479                     case '\\':          /* start of quoted-pair */
480                               if (p + 1 < pend) {           /* quoted pair */
481                                         if (is_specials(p[1])) {
482                                                   *q++ = *p;
483                                                   if (q >= qend)
484                                                             break;
485                                         }
486                                         p++;      /* skip the '\\' */
487                               }
488                               goto copy_char;
489 
490                     case '=':
491                               /*
492                                * At this level encoded words can appear via
493                                * 'phrases' (possibly delimited by ',' as in
494                                * 'keywords').  Thus we handle them as such.
495                                * Hopefully this is sufficient.
496                                */
497                               if ((lastc == ',' || is_FWS(lastc)) && p[1] == '?' &&
498                                   decode_word((p1 = p, &p1), (q1 = q, &q1), qend, charset) == 0 &&
499                                   (*p1 == '\0' || *p1 == ',' || is_FWS(*p1))) {
500                                         lastc = (unsigned char)*p1;
501                                         p0 = p1;
502                                         q = q1;
503                                         p = skip_FWS(p1);
504                                         /*
505                                          * XXX - this check should be
506                                          * unnecessary as *pend should be '\0'
507                                          * which will stop skip_FWS()
508                                          */
509                                         if (p > pend)
510                                                   p = pend;
511                                         break;
512                               }
513                               else {
514                                         copy_skipped_FWS(&q, qend, &p0, p);
515                                         if (q >= qend)
516                                                   break;
517                                         goto copy_char;
518                               }
519 
520                     case '<': /* start of angle-addr, msg-id, or path. */
521                               /*
522                                * A msg-id cannot contain encoded-pairs or
523                                * encoded-words, but angle-addr and path can.
524                                * Distinguishing between them seems to be
525                                * unnecessary, so let's be loose and just
526                                * decode them as if they were all the same.
527                                */
528                     default:
529           copy_char:
530                               lastc = (unsigned char)*p;
531                               *q++ = *p++;
532                               break;
533                     }
534           }
535           copy_skipped_FWS(&q, qend, &p0, p);
536           *q = '\0';          /* null terminate the result! */
537 }
538 
539 /*
540  * Returns the correct hfield decoder, or NULL if none.
541  * Info extracted from RFC 2822.
542  *
543  * name - pointer to field name of header line (with colon).
544  */
545 PUBLIC hfield_decoder_t
mime_hfield_decoder(const char * name)546 mime_hfield_decoder(const char *name)
547 {
548           static const struct field_decoder_tbl_s {
549                     const char *field_name;
550                     size_t field_len;
551                     hfield_decoder_t decoder;
552           } field_decoder_tbl[] = {
553 #define X(s)        s, sizeof(s) - 1
554                     { X("Received:"),                       NULL },
555 
556                     { X("Content-Type:"),                             NULL },
557                     { X("Content-Disposition:"),            NULL },
558                     { X("Content-Transfer-Encoding:"),      NULL },
559                     { X("Content-Description:"),            mime_decode_sfield },
560                     { X("Content-ID:"),                     mime_decode_sfield },
561                     { X("MIME-Version:"),                             mime_decode_sfield },
562 
563                     { X("Bcc:"),                                      mime_decode_sfield },
564                     { X("Cc:"),                                       mime_decode_sfield },
565                     { X("Date:"),                                     mime_decode_sfield },
566                     { X("From:"),                                     mime_decode_sfield },
567                     { X("In-Reply-To:"),                              mime_decode_sfield },
568                     { X("Keywords:"),                       mime_decode_sfield },
569                     { X("Message-ID:"),                     mime_decode_sfield },
570                     { X("References:"),                     mime_decode_sfield },
571                     { X("Reply-To:"),                       mime_decode_sfield },
572                     { X("Return-Path:"),                              mime_decode_sfield },
573                     { X("Sender:"),                                   mime_decode_sfield },
574                     { X("To:"),                                       mime_decode_sfield },
575                     { X("Subject:"),                        mime_decode_usfield },
576                     { X("Comments:"),                       mime_decode_usfield },
577                     { X("X-"),                                        mime_decode_usfield },
578                     { NULL, 0,                                        mime_decode_usfield },        /* optional-fields */
579 #undef X
580           };
581           const struct field_decoder_tbl_s *fp;
582 
583           /* XXX - this begs for a hash table! */
584           for (fp = field_decoder_tbl; fp->field_name; fp++)
585                     if (strncasecmp(name, fp->field_name, fp->field_len) == 0)
586                               break;
587           return fp->decoder;
588 }
589 
590 #endif /* MIME_SUPPORT */
591