/* mime_header.c - OpenGrok cross reference for /netbsd/src/usr.bin/mail/mime_header.c */

/*        $NetBSD: mime_header.c,v 1.9 2013/02/14 18:23:45 christos Exp $       */

/*-
 * Copyright (c) 2006 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Anon Ymous.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */


/*
 * This module contains the core MIME header decoding routines.
 * Please refer to RFC 2047 and RFC 2822.
 */

#ifdef MIME_SUPPORT

#include <sys/cdefs.h>
#ifndef __lint__
__RCSID("$NetBSD: mime_header.c,v 1.9 2013/02/14 18:23:45 christos Exp $");
#endif /* not __lint__ */

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "def.h"
#include "extern.h"
#include "mime.h"
#include "mime_header.h"
#include "mime_codecs.h"

/*
 * Copy the charset name of an encoded-word into from_cs.
 *
 * p points at the first character of the charset (just past the
 * leading "=?").  Characters are copied up to, but not including, the
 * next '?', and the copy is NUL-terminated.
 *
 * Returns a pointer to the character following that '?', or NULL if
 * the input ends before a '?' is seen or the name plus its terminator
 * does not fit in the from_cs_len bytes available at from_cs.
 */
static const char *
grab_charset(char *from_cs, size_t from_cs_len, const char *p)
{
          size_t n;

          /* A zero-length buffer cannot even hold the terminating NUL. */
          if (from_cs_len == 0)
                    return NULL;

          for (n = 0; p[n] != '?'; n++) {
                    /* end of input, or only the terminator's slot is left */
                    if (p[n] == '\0' || n >= from_cs_len - 1)
                              return NULL;
                    from_cs[n] = p[n];
          }
          from_cs[n] = '\0';
          return p + n + 1;   /* if here, then we got the '?' */
}

/*
 * An encoded word is a string of at most 75 non-white space
 * characters of the following form:
 *
 *  =?charset?X?encoding?=
 *
 * where:
 *   'charset'      is the original character set of the unencoded string.
 *
 *   'X'            is the encoding type 'B' or 'Q' for "base64" or
 *                  "quoted-printable", respectively,
 *   'encoding'     is the encoded string.
 *
 * Both 'charset' and 'X' are case independent and 'encoding' cannot
 * contain any whitespace or '?' characters.  The 'encoding' must also
 * be fully contained within the encoded word, i.e., it cannot be
 * split between encoded words.
 *
 * Note: the 'Q' encoding is a slightly modified "quoted-printable"
 * encoding.  In particular, spaces (' ') may be encoded as '_' to
 * improve undecoded readability.
 *
 * decode_word() decodes the encoded word starting at *ibuf.
 *
 * ibuf  - in/out: on entry points at the leading '='; on success it
 *         is advanced to the character following the closing "?=".
 * obuf  - in/out: on entry the next free output byte; on success it
 *         is advanced to the terminating NUL written after the
 *         decoded text.
 * oend  - end of the output area; at least one byte must be
 *         available at *obuf.
 * to_cs - if non-NULL (and CHARSET_SUPPORT is compiled in) the
 *         decoded text is converted with iconv from the word's
 *         charset to to_cs.
 *
 * Returns 0 on success and -1 on any failure.  On failure *ibuf and
 * *obuf are left unchanged.
 */
static int
decode_word(const char **ibuf, char **obuf, char *oend, const char *to_cs)
{
          ssize_t declen;
          size_t enclen, dstlen;
          char decword[LINESIZE];
          char from_cs[LINESIZE];
          const char *encword, *iend, *p;
          char *dstend;
          char enctype;

          p = *ibuf;
          /* Both lead-in characters are required, not just one of them. */
          if (p[0] != '=' || p[1] != '?')
                    return -1;
          /* shortest possible word: "=?" + charset + "?X?" + encoding + "?=" */
          if (strlen(p) <  2 + 1 + 3 + 1 + 2)
                    return -1;
          p = grab_charset(from_cs, sizeof(from_cs), p + 2);
          if (p == NULL)
                    return -1;
          /*
           * p is at the encoding type.  Reject end-of-string here, before
           * stepping over it, so that we never look beyond the NUL.
           */
          enctype = *p;
          if (enctype == '\0' || p[1] != '?')
                    return -1;
          p += 2;
          encword = p;
          p = strchr(p, '?');
          if (p == NULL || p[1] != '=')
                    return -1;
          enclen = p - encword;         /* length of encoded substring */
          iend = p + 2;
          /* encoded words are at most 75 characters (RFC 2047, sec 2) */
          if (iend > *ibuf + 75)
                    return -1;

          if (oend < *obuf + 1) {
                    assert(/*CONSTCOND*/ 0);      /* We have a coding error! */
                    return -1;
          }
#ifndef CHARSET_SUPPORT
          /*
           * Without iconv no conversion step follows, so decode straight
           * into the caller's buffer.  Decoding into the local decword[]
           * would leave *obuf pointing into this function's stack frame.
           */
          to_cs = NULL;
#endif /* not CHARSET_SUPPORT */
          /* with a target charset, decode to a scratch buffer first */
          dstend = to_cs ? decword : *obuf;
          dstlen = (to_cs ? sizeof(decword) : (size_t)(oend - *obuf)) - 1;

          declen = mime_rfc2047_decode(enctype, dstend, dstlen, encword, enclen);
          if (declen == -1)
                    return -1;

          dstend += declen;
#ifdef CHARSET_SUPPORT
          if (to_cs != NULL) {
                    iconv_t cd;
                    const char *src;
                    size_t srclen;
                    size_t cnt;

                    cd = iconv_open(to_cs, from_cs);
                    if (cd == (iconv_t)-1)
                              return -1;

                    /* convert from the scratch buffer into the caller's buffer */
                    src = decword;
                    srclen = declen;
                    dstend = *obuf;
                    dstlen = oend - *obuf - 1;
                    cnt = mime_iconv(cd, &src, &srclen, &dstend, &dstlen);

                    (void)iconv_close(cd);
                    if (cnt == (size_t)-1)
                              return -1;
          }
#endif /* CHARSET_SUPPORT */
          *dstend = '\0';
          *ibuf = iend;
          *obuf = dstend;
          return 0;
}


/*
 * Folding White Space (FWS).  See RFC 2822.
 *
 * RFC 2822 only allows '\r' and '\n' as part of a CRLF pair
 * ("\r\n"), but by the time mail(1) sees a message every CRLF has
 * already been turned into a single '\n', so that is what is tested
 * for here.
 *
 * XXX - pull is_FWS() and skip_FWS() up to def.h?
 */
static inline int
is_FWS(int c)
{
          switch (c) {
          case ' ':
          case '\t':
          case '\n':
                    return 1;
          default:
                    return 0;
          }
}

/*
 * Return a pointer to the first character at or after p that is not
 * folding white space.  The terminating NUL is not FWS, so the scan
 * stops at the end of the string at the latest.
 */
static inline const char *
skip_FWS(const char *p)
{
          for (/*EMPTY*/; is_FWS(*p); p++)
                    continue;
          return p;
}

/*
 * Flush pending (previously skipped) white space to the output.
 *
 * If *src is non-NULL, the bytes from *src up to, but not including,
 * srcend are appended at *dst, stopping early once dstend is reached.
 * *dst is then advanced past what was copied and *src is reset to
 * NULL.  If *src is NULL nothing is done.
 */
static inline void
copy_skipped_FWS(char **dst, char *dstend, const char **src, const char *srcend)
{
          const char *from;
          char *to;

          from = *src;
          if (from == NULL)
                    return;   /* nothing pending */

          for (to = *dst; from < srcend && to < dstend; from++, to++)
                    *to = *from;
          *dst = to;
          *src = NULL;
}

/*
 * Decode an unstructured field.
 *
 * See RFC 2822 Sec 2.2.1 and 3.6.5.
 * Encoded words may occur anywhere in an unstructured field as long
 * as at least one linear-white-space character separates them from
 * any other text or encoded word (RFC 2047 sec 5.1).  When two
 * encoded words follow each other with only FWS in between, that FWS
 * is dropped from the output.
 *
 * NOTE: unstructured fields cannot contain 'quoted-pairs' (see
 * RFC2822 sec 3.2.6 and RFC 2047), but that is no problem as a '\\'
 * (or any non-whitespace character) immediately before an
 * encoded-word will prevent it from being decoded.
 *
 * outbuf  - receives the decoded, NUL-terminated result.
 * outsize - size of outbuf in bytes; one byte is kept for the NUL.
 *           The result is truncated if it does not fit.
 * hstring - the NUL-terminated field body to decode.
 */
static void
mime_decode_usfield(char *outbuf, size_t outsize, const char *hstring)
{
          const char *to_cs, *in, *ws;
          char *out, *limit;
          int prev;

          to_cs = value(ENAME_MIME_CHARSET);
          in = hstring;
          out = outbuf;
          limit = outbuf + outsize - 1; /* keep one byte for the trailing NUL */
          ws = NULL;                    /* no skipped white space pending */
          prev = (unsigned char)' ';    /* a word may start the field */

          while (*in != '\0' && out < limit) {
                    const char *wend;
                    char *wout;
                    int decoded;

                    /*
                     * Try the text at 'in' as an encoded word.  It only
                     * counts if it follows FWS and is itself followed by
                     * FWS or the end of the field.
                     */
                    wend = in;
                    wout = out;
                    decoded = is_FWS(prev) && in[0] == '=' && in[1] == '?' &&
                        decode_word(&wend, &wout, limit, to_cs) == 0 &&
                        (*wend == '\0' || is_FWS(*wend));

                    if (!decoded) {
                              /* ordinary text: emit held-back FWS, then the character */
                              copy_skipped_FWS(&out, limit, &ws, in);
                              prev = (unsigned char)*in;
                              if (out < limit)
                                        *out++ = *in++;
                              continue;
                    }

                    /*
                     * Accept the word and hold back the FWS after it until
                     * we know whether another encoded word follows.
                     */
                    ws = wend;          /* first character after the encoded word */
                    out = wout;
                    in = skip_FWS(wend);
                    prev = (unsigned char)*ws;
          }
          copy_skipped_FWS(&out, limit, &ws, in);
          *out = '\0';
}

/*
 * Decode a field comment.
 *
 * Comments only occur in structured fields, can be nested (rfc 2822,
 * sec 3.2.3), and can contain 'encoded-words' and 'quoted-pairs'.
 * Otherwise, they can be regarded as unstructured fields that are
 * bounded by '(' and ')' characters.
 *
 * obuf    - in/out: next free output byte; advanced past what is written.
 * oend    - end of the output area; nothing is written at or beyond it
 *           by the copies done here.
 * ibuf    - in/out: next input character.  The callers in this file
 *           copy the opening '(' themselves, so *ibuf is just past it.
 *           On return it is past the matching ')' unless the input or
 *           output ran out first.
 * iend    - end of the input.
 * charset - handed to decode_word() as the target character set.
 *
 * Returns 0.  The -1 return only passes on a failure of the recursive
 * call, and as written no path in this function produces one.
 */
static int
decode_comment(char **obuf, char *oend, const char **ibuf, const char *iend, const char *charset)
{
          const char *p, *pend, *p0;
          char *q, *qend;
          int lastc;

          p = *ibuf;
          q = *obuf;
          pend = iend;
          qend = oend;
          lastc = ' ';        /* an encoded word may directly follow the '(' */
          p0 = NULL;          /* start of skipped FWS not yet copied, if any */
          while (p < pend && q < qend) {
                    const char *p1;
                    char *q1;

                    /*
                     * An encoded word is only accepted when preceded by FWS
                     * and followed by FWS or the closing ')'.  p1/q1 are
                     * scratch copies so that p/q are untouched on failure.
                     */
                    if (is_FWS(lastc) && p[0] == '=' && p[1] == '?' &&
                        decode_word((p1 = p, &p1), (q1 = q, &q1), qend, charset) == 0 &&
                        (*p1 == ')' || is_FWS(*p1))) {
                              lastc = (unsigned char)*p1;
                              p0 = p1;  /* hold back the FWS that follows the word */
                              q = q1;
                              p = skip_FWS(p1);
                              /*
                               * XXX - this check should be unnecessary as *pend should
                               * be '\0' which will stop skip_FWS()
                               */
                              if (p > pend)
                                        p = pend;
                    }
                    else {
                              /* not an encoded word: emit any held-back FWS first */
                              copy_skipped_FWS(&q, qend, &p0, p);
                              if (q >= qend)      /* XXX - q > qend cannot happen */
                                        break;

                              if (*p == ')') {
                                        *q++ = *p++;        /* copy the closing ')' */
                                        break;              /* and get out of here! */
                              }

                              if (*p == '(') {
                                        *q++ = *p++;        /* copy the opening '(' */
                                        /* nested comment: recurse until its matching ')' */
                                        if (decode_comment(&q, qend, &p, pend, charset) == -1)
                                                  return -1;          /* is this right or should we update? */
                                        lastc = ')';
                              }
                              else if (*p == '\\' && p + 1 < pend) {  /* quoted-pair */
                                        /* keep the backslash only where it is still needed */
                                        if (p[1] == '(' || p[1] == ')' || p[1] == '\\') /* need quoted-pair*/
                                                  *q++ = *p;
                                        p++;
                                        lastc = (unsigned char)*p;
                                        /* the backslash may have used the last free byte */
                                        if (q < qend)
                                                  *q++ = *p++;
                              }
                              else {
                                        /* ordinary character */
                                        lastc = (unsigned char)*p;
                                        *q++ = *p++;
                              }
                    }
          }
          /* report progress back to the caller */
          *ibuf = p;
          *obuf = q;
          return 0;
}

/*
 * Decode a quoted-string or no-fold-quote.
 *
 * No encoded words are recognized inside these.  The only structure
 * is the quoted-pair, which makes '\\' special.  See RFC 2822
 * sec 3.2.5 and 3.6.4.
 *
 * On entry *ibuf is just past the opening '"'.  Characters are copied
 * to *obuf up to and including the closing '"', or until iend or oend
 * is reached.  The backslash of a quoted-pair is kept only in front
 * of '"' and '\\'; otherwise just the quoted character is copied.
 * *ibuf and *obuf are advanced past what was consumed and produced.
 */
static void
decode_quoted_string(char **obuf, char *oend, const char **ibuf, const char *iend)
{
          const char *in = *ibuf;
          char *out = *obuf;

          while (in < iend && out < oend) {
                    if (*in == '"') {
                              *out++ = *in++;     /* the closing '"' ends the string */
                              break;
                    }
                    if (*in == '\\' && iend - in > 1) {     /* quoted-pair */
                              if (in[1] == '\\' || in[1] == '"') {
                                        *out++ = '\\';      /* still needed in the output */
                                        if (out >= oend)
                                                  break;
                              }
                              in++;     /* step onto the quoted character */
                    }
                    *out++ = *in++;
          }
          *obuf = out;
          *ibuf = in;
}

/*
 * Decode a domain-literal or no-fold-literal.
 *
 * No encoded words are recognized inside these.  They are delimited
 * by '[' and ']' and may contain quoted-pairs, so '\\', '[' and ']'
 * are special.  See RFC 2822 sec 3.4.1 and 3.6.4.
 *
 * On entry *ibuf is just past the opening '['.  Characters are copied
 * to *obuf up to and including the closing ']', or until iend or oend
 * is reached.  The backslash of a quoted-pair is kept only in front
 * of '[', ']' and '\\'; otherwise just the quoted character is
 * copied.  *ibuf and *obuf are advanced past what was consumed and
 * produced.
 */
static void
decode_domain_literal(char **obuf, char *oend, const char **ibuf, const char *iend)
{
          const char *in = *ibuf;
          char *out = *obuf;

          while (in < iend && out < oend) {
                    if (*in == ']') {
                              *out++ = *in++;     /* the closing ']' ends the literal */
                              break;
                    }
                    if (*in == '\\' && iend - in > 1) {     /* quoted-pair */
                              if (in[1] == '\\' || in[1] == '[' || in[1] == ']') {
                                        *out++ = '\\';      /* still needed in the output */
                                        if (out >= oend)
                                                  break;
                              }
                              in++;     /* step onto the quoted character */
                    }
                    *out++ = *in++;
          }
          *obuf = out;
          *ibuf = in;
}

/*
 * Specials: see RFC 2822 sec 3.2.1.
 *
 * Returns 1 if c is one of
 *        ( ) < > [ ] : ; @ \ , . "
 * and 0 for every other value, including values outside 0..127.
 */
static inline int
is_specials(int c)
{
          switch (c) {
          case '(': case ')':
          case '<': case '>':
          case '[': case ']':
          case ':': case ';':
          case '@': case '\\':
          case ',': case '.':
          case '"':
                    return 1;
          default:
                    return 0;
          }
}

/*
 * Decode a structured field.
 *
 * At the top level, structured fields can only contain encoded-words
 * via 'phrases' and 'comments'.  See RFC 2047 sec 5.
 *
 * linebuf - receives the decoded result; always NUL-terminated on return.
 * bufsize - size of linebuf in bytes; one byte is kept for the NUL and
 *           the result is truncated if it does not fit.
 * hstring - the NUL-terminated field body to decode.
 *
 * Comments, quoted-strings and domain-literals are handed to their
 * own decoders; encoded words are only decoded when they follow FWS
 * or ',' and are followed by FWS, ',' or the end of the string.
 */
static void
mime_decode_sfield(char *linebuf, size_t bufsize, const char *hstring)
{
          const char *p, *pend, *p0;
          char *q, *qend;
          const char *charset;
          int lastc;

          charset = value(ENAME_MIME_CHARSET);

          p = hstring;
          q = linebuf;
          pend = hstring + strlen(hstring);
          qend = linebuf + bufsize - 1; /* save room for the NULL terminator */
          lastc = (unsigned char)' ';   /* an encoded word may start the field */
          p0 = NULL;          /* start of skipped FWS not yet copied, if any */
          while (p < pend && q < qend) {
                    const char *p1;
                    char *q1;

                    /*
                     * Anything but a possible encoded word ends a run of
                     * held-back FWS, so emit it now.  For '=' the decision
                     * is left to the case below.
                     */
                    if (*p != '=') {
                              copy_skipped_FWS(&q, qend, &p0, p);
                              if (q >= qend)
                                        break;
                    }

                    switch (*p) {
                    case '(': /* start of comment */
                              *q++ = *p++;        /* copy the opening '(' */
                              (void)decode_comment(&q, qend, &p, pend, charset);
                              lastc = (unsigned char)p[-1];
                              break;

                    case '"': /* start of quoted-string or no-fold-quote */
                              *q++ = *p++;        /* copy the opening '"' */
                              decode_quoted_string(&q, qend, &p, pend);
                              lastc = (unsigned char)p[-1];
                              break;

                    case '[': /* start of domain-literal or no-fold-literal */
                              *q++ = *p++;        /* copy the opening '[' */
                              decode_domain_literal(&q, qend, &p, pend);
                              lastc = (unsigned char)p[-1];
                              break;

                    case '\\':          /* start of quoted-pair */
                              if (p + 1 < pend) {           /* quoted pair */
                                        /* keep the backslash only before a special */
                                        if (is_specials(p[1])) {
                                                  *q++ = *p;
                                                  if (q >= qend)
                                                            break;    /* leaves the switch; the loop test then ends the loop */
                                        }
                                        p++;      /* skip the '\\' */
                              }
                              goto copy_char;

                    case '=':
                              /*
                               * At this level encoded words can appear via
                               * 'phrases' (possibly delimited by ',' as in
                               * 'keywords').  Thus we handle them as such.
                               * Hopefully this is sufficient.
                               */
                              if ((lastc == ',' || is_FWS(lastc)) && p[1] == '?' &&
                                  decode_word((p1 = p, &p1), (q1 = q, &q1), qend, charset) == 0 &&
                                  (*p1 == '\0' || *p1 == ',' || is_FWS(*p1))) {
                                        lastc = (unsigned char)*p1;
                                        p0 = p1;  /* hold back the FWS that follows the word */
                                        q = q1;
                                        p = skip_FWS(p1);
                                        /*
                                         * XXX - this check should be
                                         * unnecessary as *pend should be '\0'
                                         * which will stop skip_FWS()
                                         */
                                        if (p > pend)
                                                  p = pend;
                                        break;
                              }
                              else {
                                        /* a plain '=': emit the FWS that was held back above */
                                        copy_skipped_FWS(&q, qend, &p0, p);
                                        if (q >= qend)
                                                  break;
                                        goto copy_char;
                              }

                    case '<': /* start of angle-addr, msg-id, or path. */
                              /*
                               * A msg-id cannot contain encoded-pairs or
                               * encoded-words, but angle-addr and path can.
                               * Distinguishing between them seems to be
                               * unnecessary, so let's be loose and just
                               * decode them as if they were all the same.
                               */
                    default:
          copy_char:
                              /* copy one character verbatim and remember it */
                              lastc = (unsigned char)*p;
                              *q++ = *p++;
                              break;
                    }
          }
          /* emit FWS still held back after a trailing encoded word */
          copy_skipped_FWS(&q, qend, &p0, p);
          *q = '\0';          /* null terminate the result! */
}

/*
 * Select the decoder for a header field.  The table below is derived
 * from RFC 2822.
 *
 * name - pointer to the field name of a header line, colon included.
 *
 * The name is compared case-insensitively against the start of each
 * table entry in turn and the first match wins.  Returns that entry's
 * decoder, which is NULL for fields that must not be decoded.  A name
 * that matches no entry gets the unstructured-field decoder.
 */
PUBLIC hfield_decoder_t
mime_hfield_decoder(const char *name)
{
          static const struct field_decoder_tbl_s {
                    const char *prefix;           /* field name to match, NULL ends the table */
                    size_t prefix_len;            /* strlen(prefix) */
                    hfield_decoder_t decoder;     /* decoder to use, or NULL for none */
          } field_decoder_tbl[] = {
#define FIELD(s)    s, sizeof(s) - 1
                    { FIELD("Received:"),                   NULL },

                    { FIELD("Content-Type:"),               NULL },
                    { FIELD("Content-Disposition:"),        NULL },
                    { FIELD("Content-Transfer-Encoding:"),  NULL },
                    { FIELD("Content-Description:"),        mime_decode_sfield },
                    { FIELD("Content-ID:"),                 mime_decode_sfield },
                    { FIELD("MIME-Version:"),               mime_decode_sfield },

                    { FIELD("Bcc:"),                        mime_decode_sfield },
                    { FIELD("Cc:"),                         mime_decode_sfield },
                    { FIELD("Date:"),                       mime_decode_sfield },
                    { FIELD("From:"),                       mime_decode_sfield },
                    { FIELD("In-Reply-To:"),                mime_decode_sfield },
                    { FIELD("Keywords:"),                   mime_decode_sfield },
                    { FIELD("Message-ID:"),                 mime_decode_sfield },
                    { FIELD("References:"),                 mime_decode_sfield },
                    { FIELD("Reply-To:"),                   mime_decode_sfield },
                    { FIELD("Return-Path:"),                mime_decode_sfield },
                    { FIELD("Sender:"),                     mime_decode_sfield },
                    { FIELD("To:"),                         mime_decode_sfield },
                    { FIELD("Subject:"),                    mime_decode_usfield },
                    { FIELD("Comments:"),                   mime_decode_usfield },
                    { FIELD("X-"),                          mime_decode_usfield },
                    { NULL, 0,                              mime_decode_usfield },  /* optional-fields */
#undef FIELD
          };
          const struct field_decoder_tbl_s *entry;

          /* XXX - this begs for a hash table! */
          entry = field_decoder_tbl;
          while (entry->prefix != NULL &&
              strncasecmp(name, entry->prefix, entry->prefix_len) != 0)
                    entry++;

          /* either the matching entry or the terminating default entry */
          return entry->decoder;
}

#endif /* MIME_SUPPORT */