notmuch/date.c

/* date.c - Date-parsing utility for the notmuch mail system.
 *
 *  Copyright © 2000-2009 Jeffrey Stedfast
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see http://www.gnu.org/licenses/
 */

/* This code was originally written by Jeffrey Stedfast
 * as part of his GMime library (http://spruce.sourceforge.net/gmime/)
 *
 * Carl Worth <cworth@cworth.org> imported it into notmuch and removed
 * some glib-isms.
 */

#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#define _GNU_SOURCE

#include <glib.h>

#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#ifdef HAVE_SYS_PARAM_H
#include <sys/param.h>      /* for MAXHOSTNAMELEN */
#else
#define MAXHOSTNAMELEN 64
#endif
#ifdef HAVE_UTSNAME_DOMAINNAME
#include <sys/utsname.h>    /* for uname() */
#endif
#include <sys/types.h>
#ifdef HAVE_UNISTD_H
#include <unistd.h>         /* Unix header for getpid() */
#endif
#ifdef G_OS_WIN32
#include <winsock2.h>
#include <ws2tcpip.h>
#include <process.h>
#define getpid() _getpid()
#endif
#ifdef HAVE_NETDB_H
#include <netdb.h>
#endif
#include <ctype.h>
#include <errno.h>

#include "gmime-utils.h"
#include "gmime-table-private.h"
#include "gmime-parse-utils.h"
#include "gmime-part.h"
#include "gmime-charset.h"
#include "gmime-iconv.h"
#include "gmime-iconv-utils.h"

/* w(x): expands to its argument only when ENABLE_WARNINGS is defined, so
 * statements wrapped in w() are compiled out otherwise. */
#ifdef ENABLE_WARNINGS
#define w(x) x
#else
#define w(x)
#endif /* ENABLE_WARNINGS */

/* d(x): debug-statement wrapper; defined empty here, so anything wrapped
 * in d() is dropped by the preprocessor. */
#define d(x)


/**
 * SECTION: gmime-utils
 * @title: gmime-utils
 * @short_description: MIME utility functions
 * @see_also:
 *
 * Utility functions to parse, encode and decode various MIME tokens
 * and encodings.
 **/

/* Declared here, defined in another translation unit. */
extern gboolean _g_mime_enable_rfc2047_workarounds (void);

/* Half of GMIME_FOLD_LEN (which is defined outside this file). */
#define GMIME_FOLD_PREENCODED  (GMIME_FOLD_LEN / 2)

/* date parser macros */

/* Character sets for each token class.  They are consumed by the
 * (compiled-out) gmime_datetok_table_init() generator, which derives
 * gmime_datetok_table[] from them. */
#define NUMERIC_CHARS          "1234567890"
#define WEEKDAY_CHARS          "SundayMondayTuesdayWednesdayThursdayFridaySaturday"
#define MONTH_CHARS            "JanuaryFebruaryMarchAprilMayJuneJulyAugustSeptemberOctoberNovemberDecember"
#define TIMEZONE_ALPHA_CHARS   "UTCGMTESTEDTCSTCDTMSTPSTPDTZAMNY()"
#define TIMEZONE_NUMERIC_CHARS "-+1234567890"
#define TIME_CHARS             "1234567890:"

/* Per-character mask bits stored in gmime_datetok_table[] and OR-ed
 * together over a whole token by datetok().  A NON_<class> bit means the
 * character is NOT in that class's character set; the HAS_* bits mean a
 * ':' or a '+'/'-' is present. */
#define DATE_TOKEN_NON_NUMERIC          (1 << 0)
#define DATE_TOKEN_NON_WEEKDAY          (1 << 1)
#define DATE_TOKEN_NON_MONTH            (1 << 2)
#define DATE_TOKEN_NON_TIME             (1 << 3)
#define DATE_TOKEN_HAS_COLON            (1 << 4)
#define DATE_TOKEN_NON_TIMEZONE_ALPHA   (1 << 5)
#define DATE_TOKEN_NON_TIMEZONE_NUMERIC (1 << 6)
#define DATE_TOKEN_HAS_SIGN             (1 << 7)

/* Maps a value in the range 0..15 to its upper-case hexadecimal digit. */
static unsigned char tohex[16] = {
	'0', '1', '2', '3', '4', '5', '6', '7',
	'8', '9', 'A', 'B', 'C', 'D', 'E', 'F'
};

/* Per-byte DATE_TOKEN_* mask, indexed by unsigned character value.
 * The contents match what the compiled-out gmime_datetok_table_init()
 * generator below prints; e.g. the digits are 38 (NON_WEEKDAY |
 * NON_MONTH | NON_TIMEZONE_ALPHA) and ':' is 119 (every bit except
 * NON_TIME and HAS_SIGN).  Do not edit by hand. */
static unsigned char gmime_datetok_table[256] = {
	128,111,111,111,111,111,111,111,111,111,111,111,111,111,111,111,
	111,111,111,111,111,111,111,111,111,111,111,111,111,111,111,111,
	111,111,111,111,111,111,111,111, 79, 79,111,175,111,175,111,111,
	 38, 38, 38, 38, 38, 38, 38, 38, 38, 38,119,111,111,111,111,111,
	111, 75,111, 79, 75, 79,105, 79,111,111,107,111,111, 73, 75,107,
	 79,111,111, 73, 77, 79,111,109,111, 79, 79,111,111,111,111,111,
	111,105,107,107,109,105,111,107,105,105,111,111,107,107,105,105,
	107,111,105,105,105,105,107,111,111,105,111,111,111,111,111,111,
	111,111,111,111,111,111,111,111,111,111,111,111,111,111,111,111,
	111,111,111,111,111,111,111,111,111,111,111,111,111,111,111,111,
	111,111,111,111,111,111,111,111,111,111,111,111,111,111,111,111,
	111,111,111,111,111,111,111,111,111,111,111,111,111,111,111,111,
	111,111,111,111,111,111,111,111,111,111,111,111,111,111,111,111,
	111,111,111,111,111,111,111,111,111,111,111,111,111,111,111,111,
	111,111,111,111,111,111,111,111,111,111,111,111,111,111,111,111,
	111,111,111,111,111,111,111,111,111,111,111,111,111,111,111,111,
};

/* Named timezone abbreviations recognised by get_tzone().
 *
 * Offsets are written as signed decimal HHMM (e.g. -500 means five
 * hours behind GMT); the parsers convert them to seconds with
 * (offset / 100) hours plus (offset % 100) minutes.
 *
 * NOTE: get_tzone() iterates over exactly 15 entries; keep the two in
 * sync when adding or removing rows. */
static struct {
	char *name;
	int offset;
} tz_offsets [] = {
	{ "UT", 0 },
	{ "GMT", 0 },
	{ "EST", -500 },	/* US timezones only */
	{ "EDT", -400 },
	{ "CST", -600 },
	{ "CDT", -500 },
	{ "MST", -700 },
	{ "MDT", -600 },
	{ "PST", -800 },
	{ "PDT", -700 },
	{ "Z", 0 },		/* single-letter zones */
	{ "A", -100 },
	{ "M", -1200 },
	{ "N", 100 },
	{ "Y", 1200 },
};

/* Three-letter English month names, indexed like struct tm's tm_mon (0 = Jan). */
static char *tm_months[] = {
	"Jan", "Feb", "Mar", "Apr", "May", "Jun",
	"Jul", "Aug", "Sep", "Oct", "Nov", "Dec"
};

/* Three-letter English weekday names, indexed like struct tm's tm_wday (0 = Sun). */
static char *tm_days[] = {
	"Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat"
};


/**
 * g_mime_utils_header_format_date:
 * @date: time_t date representation
 * @tz_offset: Timezone offset, as a signed decimal HHMM value
 *
 * Allocates a string buffer containing the rfc822 formatted date
 * string represented by @date and @tz_offset.
 *
 * Returns: a newly allocated string representation of the date.
 **/
char *
g_mime_utils_header_format_date (time_t date, int tz_offset)
{
	const int tz_hours = tz_offset / 100;
	const int tz_minutes = tz_offset % 100;
	struct tm broken_down;

	/* shift the timestamp by the zone offset so that the GMT
	 * breakdown below yields the wall-clock fields of that zone */
	date += (tz_hours * (60 * 60)) + tz_minutes * 60;

#if defined (HAVE_GMTIME_R)
	gmtime_r (&date, &broken_down);
#elif defined (HAVE_GMTIME_S)
	gmtime_s (&broken_down, &date);
#else
	broken_down = *gmtime (&date);
#endif

	return g_strdup_printf ("%s, %02d %s %04d %02d:%02d:%02d %+05d",
				tm_days[broken_down.tm_wday],
				broken_down.tm_mday,
				tm_months[broken_down.tm_mon],
				broken_down.tm_year + 1900,
				broken_down.tm_hour,
				broken_down.tm_min,
				broken_down.tm_sec,
				tz_offset);
}

/* This is where it gets ugly... */

/* One token of a date string, as produced by datetok().  Tokens do not
 * own their text: start/len refer into the string that was tokenised. */
typedef struct _date_token {
	struct _date_token *next;  /* next token in the singly-linked list, or NULL */
	unsigned char mask;        /* OR of gmime_datetok_table[] over the token's bytes */
	const char *start;         /* first byte of the token (not NUL-terminated) */
	size_t len;                /* token length in bytes */
} date_token;

/* Allocation helpers; tokens come from the GLib slice allocator. */
#define date_token_free(tok) g_slice_free (date_token, tok)
#define date_token_new() g_slice_new (date_token)

/* Splits @date into a linked list of date_token nodes.
 *
 * Leading spaces/tabs are skipped; a token then always contains its
 * first byte and extends up to (not including) the next delimiter from
 * "-/,\t\r\n ".  The delimiter that ended a token is consumed.  Each
 * token's mask is the OR of gmime_datetok_table[] over its bytes.
 *
 * Returns the head of the list (NULL when the string holds no tokens).
 * Tokens reference @date's storage and must be released with
 * date_token_free(). */
static date_token *
datetok (const char *date)
{
	date_token *tokens = NULL, *token;
	date_token **link = &tokens;	/* where the next token gets attached */
	const char *start, *end;
	unsigned char mask;

	start = date;
	while (*start) {
		/* kill leading whitespace */
		while (*start == ' ' || *start == '\t')
			start++;

		if (*start == '\0')
			break;

		mask = gmime_datetok_table[(unsigned char) *start];

		/* find the end of this token; the first byte always
		 * belongs to the token, even when it is a delimiter
		 * such as the '-' of a numeric timezone */
		end = start + 1;
		while (*end && !strchr ("-/,\t\r\n ", *end))
			mask |= gmime_datetok_table[(unsigned char) *end++];

		token = date_token_new ();
		token->next = NULL;
		token->start = start;
		token->len = end - start;
		token->mask = mask;

		*link = token;
		link = &token->next;

		if (*end == '\0')
			break;

		/* step over the delimiter */
		start = end + 1;
	}

	return tokens;
}

/* Parses an optionally signed decimal integer from the first @inlen
 * bytes of @in.
 *
 * Returns the parsed value, or -1 if a non-digit is encountered or the
 * value does not fit in an int.  A lone sign (or an empty span) yields
 * 0.  Note that -1 doubles as the error value, so a literal "-1" is
 * indistinguishable from a parse failure. */
static int
decode_int (const char *in, size_t inlen)
{
	const char *inptr = in;
	const char *inend = in + inlen;
	int sign = 1, val = 0;

	if (inptr < inend) {
		if (*inptr == '-') {
			sign = -1;
			inptr++;
		} else if (*inptr == '+') {
			inptr++;
		}
	}

	for ( ; inptr < inend; inptr++) {
		int digit;

		if (!(*inptr >= '0' && *inptr <= '9'))
			return -1;

		digit = *inptr - '0';

		/* refuse to overflow: header text is untrusted and signed
		 * overflow is undefined behaviour */
		if (val > (INT_MAX - digit) / 10)
			return -1;

		val = (val * 10) + digit;
	}

	return val * sign;
}

#if 0
/* Returns the number of days in @month (1 = January .. 12 = December)
 * of @year, or 0 when @month is out of range.  Currently compiled out. */
static int
get_days_in_month (int month, int year)
{
	static const int days[12] = {
		31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31
	};

	if (month < 1 || month > 12)
		return 0;

	if (month == 2 && g_date_is_leap_year (year))
		return 29;

	return days[month - 1];
}
#endif

/* Matches the first three bytes of @in, case-insensitively, against the
 * abbreviated weekday names.  Returns 0 (Sun) .. 6 (Sat), or -1 when
 * the token is shorter than three bytes or matches no weekday. */
static int
get_wday (const char *in, size_t inlen)
{
	int day = 0;

	g_return_val_if_fail (in != NULL, -1);

	if (inlen >= 3) {
		while (day < 7) {
			if (g_ascii_strncasecmp (in, tm_days[day], 3) == 0)
				return day;
			day++;
		}
	}

	return -1;  /* unknown week day */
}

/* Decodes a day-of-month from the @inlen bytes at @in.  Returns the
 * value when it lies in 0..31, otherwise -1. */
static int
get_mday (const char *in, size_t inlen)
{
	int value;

	g_return_val_if_fail (in != NULL, -1);

	value = decode_int (in, inlen);

	return (value >= 0 && value <= 31) ? value : -1;
}

/* Matches the first three bytes of @in, case-insensitively, against the
 * abbreviated month names.  Returns 0 (Jan) .. 11 (Dec), or -1 when the
 * token is shorter than three bytes or matches no month. */
static int
get_month (const char *in, size_t inlen)
{
	int month = 0;

	g_return_val_if_fail (in != NULL, -1);

	if (inlen >= 3) {
		while (month < 12) {
			if (g_ascii_strncasecmp (in, tm_months[month], 3) == 0)
				return month;
			month++;
		}
	}

	return -1;  /* unknown month */
}

/* Decodes a year from the @inlen bytes at @in.
 *
 * Values below 100 are treated as two-digit years: below 70 maps into
 * the 2000s, everything else into the 1900s.  Returns the full year, or
 * -1 when decoding fails or the result is earlier than 1969. */
static int
get_year (const char *in, size_t inlen)
{
	int value;

	g_return_val_if_fail (in != NULL, -1);

	value = decode_int (in, inlen);
	if (value == -1)
		return -1;

	if (value < 100) {
		if (value < 70)
			value += 2000;
		else
			value += 1900;
	}

	return (value >= 1969) ? value : -1;
}

/* Parses a clock token of the form H[:M[:S]] from the @inlen bytes at
 * @in into @hour, @min and @sec (all three are zeroed first).
 *
 * Each field is bounded as it is accumulated (hour <= 23, minute <= 59,
 * second <= 60 to admit a leap second).  This rejects nonsense such as
 * "99:99:99" and also keeps arbitrarily long digit runs from
 * overflowing the int accumulators.
 *
 * Returns TRUE on success; FALSE on a non-digit, a third ':' or an
 * out-of-range field (the outputs are then unspecified). */
static gboolean
get_time (const char *in, size_t inlen, int *hour, int *min, int *sec)
{
	const char *inptr, *inend;
	int *val, colons = 0;
	int max;

	*hour = *min = *sec = 0;

	inend = in + inlen;
	val = hour;
	max = 23;
	for (inptr = in; inptr < inend; inptr++) {
		if (*inptr == ':') {
			colons++;
			switch (colons) {
			case 1:
				val = min;
				max = 59;
				break;
			case 2:
				val = sec;
				max = 60;
				break;
			default:
				return FALSE;
			}
		} else if (!(*inptr >= '0' && *inptr <= '9')) {
			return FALSE;
		} else {
			*val = (*val * 10) + (*inptr - '0');
			if (*val > max)
				return FALSE;
		}
	}

	return TRUE;
}

/* Tries to read a timezone from *@token, looking at up to two
 * consecutive tokens and advancing *@token past each one that does not
 * match.
 *
 * A token starting with '+' or '-' is decoded numerically.  Otherwise
 * the token, with an optional leading '(' and trailing ')' stripped, is
 * compared against the names in tz_offsets[].
 *
 * Returns the offset as signed decimal HHMM, or -1 when nothing
 * matched. */
static int
get_tzone (date_token **token)
{
	int attempt;

	for (attempt = 0; *token != NULL && attempt < 2; attempt++, *token = (*token)->next) {
		const char *text = (*token)->start;
		size_t textlen = (*token)->len;
		size_t z;

		if (*text == '+' || *text == '-')
			return decode_int (text, textlen);

		if (*text == '(') {
			/* drop the parentheses of a "(ZONE)" comment */
			const char *last = text + textlen - 1;

			textlen -= (*last == ')') ? 2 : 1;
			text++;
		}

		for (z = 0; z < G_N_ELEMENTS (tz_offsets); z++) {
			const char *zone = tz_offsets[z].name;
			size_t zonelen = strlen (zone);

			if (zonelen == textlen && strncmp (text, zone, zonelen) == 0)
				return tz_offsets[z].offset;
		}
	}

	return -1;
}

/* Converts the broken-down time @tm to a time_t as if its fields were
 * expressed in UTC.
 *
 * mktime() interprets @tm as local time, so the local zone's offset
 * (tz, in seconds, obtained through whichever platform facility is
 * available) is subtracted from its result.  tm_isdst is set to -1
 * before the call so that mktime() decides whether DST applies; mktime()
 * may also normalise the other fields of @tm. */
static time_t
mktime_utc (struct tm *tm)
{
	time_t tt;
	long tz;

	tm->tm_isdst = -1;
	tt = mktime (tm);

#if defined (G_OS_WIN32)
	/* Windows CRT: base zone offset plus the DST bias when DST is in effect */
	_get_timezone (&tz);
	if (tm->tm_isdst > 0) {
		int dst;

		_get_dstbias (&dst);
		tz += dst;
	}
#elif defined (HAVE_TM_GMTOFF)
	/* tm_gmtoff is filled in by mktime(); negated to match the sign of the other branches */
	tz = -tm->tm_gmtoff;
#elif defined (HAVE_TIMEZONE)
	if (tm->tm_isdst > 0) {
#if defined (HAVE_ALTZONE)
		tz = altzone;
#else /* !defined (HAVE_ALTZONE) */
		/* no altzone: assume DST is exactly one hour ahead of standard time */
		tz = (timezone - 3600);
#endif
	} else {
		tz = timezone;
	}
#elif defined (HAVE__TIMEZONE)
	tz = _timezone;
#else
#error Neither HAVE_TIMEZONE nor HAVE_TM_GMTOFF defined. Rerun autoheader, autoconf, etc.
#endif

	return tt - tz;
}

/* Strict parser: expects the tokens in rfc822 order
 *   [weekday] mday month year time [timezone]
 *
 * Returns the corresponding GMT time_t, or (time_t) 0 as soon as a
 * mandatory field is missing or malformed.  A missing or unrecognised
 * timezone is treated as GMT.  On success the zone offset (signed
 * decimal HHMM) is stored in *@tzone when @tzone is non-NULL. */
static time_t
parse_rfc822_date (date_token *tokens, int *tzone)
{
	date_token *cur = tokens;
	int hh, mm, ss, value;
	int zone = 0;
	struct tm when;
	time_t stamp;

	g_return_val_if_fail (tokens != NULL, (time_t) 0);

	memset (&when, 0, sizeof (when));

	/* optional weekday */
	value = get_wday (cur->start, cur->len);
	if (value != -1) {
		when.tm_wday = value;
		cur = cur->next;
	}

	/* day of the month */
	if (cur == NULL)
		return (time_t) 0;
	value = get_mday (cur->start, cur->len);
	if (value == -1)
		return (time_t) 0;
	when.tm_mday = value;
	cur = cur->next;

	/* month */
	if (cur == NULL)
		return (time_t) 0;
	value = get_month (cur->start, cur->len);
	if (value == -1)
		return (time_t) 0;
	when.tm_mon = value;
	cur = cur->next;

	/* year */
	if (cur == NULL)
		return (time_t) 0;
	value = get_year (cur->start, cur->len);
	if (value == -1)
		return (time_t) 0;
	when.tm_year = value - 1900;
	cur = cur->next;

	/* hour/min/sec */
	if (cur == NULL)
		return (time_t) 0;
	if (!get_time (cur->start, cur->len, &hh, &mm, &ss))
		return (time_t) 0;
	when.tm_hour = hh;
	when.tm_min = mm;
	when.tm_sec = ss;
	cur = cur->next;

	/* timezone; fall back to GMT when absent or unknown */
	if (cur != NULL) {
		value = get_tzone (&cur);
		if (value != -1)
			zone = value;
	}

	/* interpret the fields as GMT, then remove the zone offset to
	 * obtain the real GMT instant */
	stamp = mktime_utc (&when);
	stamp -= ((zone / 100) * 60 * 60) + (zone % 100) * 60;

	if (tzone)
		*tzone = zone;

	return stamp;
}


/* Token classification helpers built on the mask computed by datetok().
 * A token belongs to a class when none of its bytes set that class's
 * NON_* bit; is_time additionally requires a ':' and is_tzone_numeric
 * a '+' or '-'. */
#define date_token_mask(t)  (((date_token *) t)->mask)
#define is_numeric(t)       ((date_token_mask (t) & DATE_TOKEN_NON_NUMERIC) == 0)
#define is_weekday(t)       ((date_token_mask (t) & DATE_TOKEN_NON_WEEKDAY) == 0)
#define is_month(t)         ((date_token_mask (t) & DATE_TOKEN_NON_MONTH) == 0)
#define is_time(t)          (((date_token_mask (t) & DATE_TOKEN_NON_TIME) == 0) && (date_token_mask (t) & DATE_TOKEN_HAS_COLON))
#define is_tzone_alpha(t)   ((date_token_mask (t) & DATE_TOKEN_NON_TIMEZONE_ALPHA) == 0)
#define is_tzone_numeric(t) (((date_token_mask (t) & DATE_TOKEN_NON_TIMEZONE_NUMERIC) == 0) && (date_token_mask (t) & DATE_TOKEN_HAS_SIGN))
#define is_tzone(t)         (is_tzone_alpha (t) || is_tzone_numeric (t))

/* Lenient fallback parser used when parse_rfc822_date() fails.
 *
 * Walks every token once and, using the token masks, assigns it to the
 * first still-unfilled field it can satisfy (weekday, month, time,
 * timezone, then the numeric year/month/mday heuristics).  Tokens that
 * fit nothing are ignored.  Fields never found stay zero.
 *
 * Returns the resulting GMT time_t and stores the zone offset (signed
 * decimal HHMM, 0 when none was found) in *@tzone when non-NULL. */
static time_t
parse_broken_date (date_token *tokens, int *tzone)
{
	gboolean got_wday, got_month, got_tzone;
	int hour, min, sec, offset, n;
	date_token *token;
	struct tm tm;
	time_t t;

	memset ((void *) &tm, 0, sizeof (struct tm));
	got_wday = got_month = got_tzone = FALSE;
	offset = 0;

	token = tokens;
	while (token) {
		if (is_weekday (token) && !got_wday) {
			if ((n = get_wday (token->start, token->len)) != -1) {
				d(printf ("weekday; "));
				got_wday = TRUE;
				tm.tm_wday = n;
				goto next;
			}
		}

		if (is_month (token) && !got_month) {
			if ((n = get_month (token->start, token->len)) != -1) {
				d(printf ("month; "));
				got_month = TRUE;
				tm.tm_mon = n;
				goto next;
			}
		}

		/* "not yet set" is inferred from all three fields being zero,
		 * so a parsed 00:00:00 does not block a later time token */
		if (is_time (token) && !tm.tm_hour && !tm.tm_min && !tm.tm_sec) {
			if (get_time (token->start, token->len, &hour, &min, &sec)) {
				d(printf ("time; "));
				tm.tm_hour = hour;
				tm.tm_min = min;
				tm.tm_sec = sec;
				goto next;
			}
		}

		if (is_tzone (token) && !got_tzone) {
			/* work on a copy: get_tzone() advances the pointer it is
			 * given, and the outer walk must not skip tokens.  (This
			 * local shadows the outer time_t t.) */
			date_token *t = token;

			if ((n = get_tzone (&t)) != -1) {
				d(printf ("tzone; "));
				got_tzone = TRUE;
				offset = n;
				goto next;
			}
		}

		if (is_numeric (token)) {
			if (token->len == 4 && !tm.tm_year) {
				if ((n = get_year (token->start, token->len)) != -1) {
					d(printf ("year; "));
					tm.tm_year = n - 1900;
					goto next;
				}
			} else {
				/* Note: assumes MM-DD-YY ordering if '0 < MM < 12' holds true */
				if (!got_month && token->next && is_numeric (token->next)) {
					if ((n = decode_int (token->start, token->len)) > 12) {
						/* too large for a month: store it as the day
						 * (n is not range-checked on this path) */
						goto mday;
					} else if (n > 0) {
						d(printf ("mon; "));
						got_month = TRUE;
						tm.tm_mon = n - 1;
					}
					goto next;
				} else if (!tm.tm_mday && (n = get_mday (token->start, token->len)) != -1) {
				mday:
					d(printf ("mday; "));
					tm.tm_mday = n;
					goto next;
				} else if (!tm.tm_year) {
					if ((n = get_year (token->start, token->len)) != -1) {
						d(printf ("2-digit year; "));
						tm.tm_year = n - 1900;
					}
					goto next;
				}
			}
		}

		d(printf ("???; "));

	next:

		token = token->next;
	}

	d(printf ("\n"));

	t = mktime_utc (&tm);

	/* t is now GMT of the time we want, but not offset by the timezone ... */

	/* this should convert the time to the GMT equiv time */
	t -= ((offset / 100) * 60 * 60) + (offset % 100) * 60;

	if (tzone)
		*tzone = offset;

	return t;
}

#if 0
/* Generator for gmime_datetok_table[]: recomputes every entry from the
 * *_CHARS character sets and prints the table as C source on stdout.
 * Currently compiled out. */
static void
gmime_datetok_table_init (void)
{
	int ch;

	for (ch = 0; ch < 256; ch++) {
		unsigned char bits = 0;

		if (strchr (NUMERIC_CHARS, ch) == NULL)
			bits |= DATE_TOKEN_NON_NUMERIC;
		if (strchr (WEEKDAY_CHARS, ch) == NULL)
			bits |= DATE_TOKEN_NON_WEEKDAY;
		if (strchr (MONTH_CHARS, ch) == NULL)
			bits |= DATE_TOKEN_NON_MONTH;
		if (strchr (TIME_CHARS, ch) == NULL)
			bits |= DATE_TOKEN_NON_TIME;
		if (strchr (TIMEZONE_ALPHA_CHARS, ch) == NULL)
			bits |= DATE_TOKEN_NON_TIMEZONE_ALPHA;
		if (strchr (TIMEZONE_NUMERIC_CHARS, ch) == NULL)
			bits |= DATE_TOKEN_NON_TIMEZONE_NUMERIC;
		if (((char) ch) == ':')
			bits |= DATE_TOKEN_HAS_COLON;
		if (strchr ("+-", ch) != NULL)
			bits |= DATE_TOKEN_HAS_SIGN;

		gmime_datetok_table[ch] = bits;
	}

	printf ("static unsigned char gmime_datetok_table[256] = {");
	for (ch = 0; ch < 256; ch++) {
		/* sixteen entries per output row */
		if (ch % 16 == 0)
			printf ("\n\t");
		printf ("%3d,", gmime_datetok_table[ch]);
	}
	printf ("\n};\n");
}
#endif


/**
 * g_mime_utils_header_decode_date:
 * @str: input date string
 * @tz_offset: timezone offset
 *
 * Decodes the rfc822 date string and saves the GMT offset into
 * @tz_offset if non-NULL. A strict rfc822 parse is attempted first;
 * if that yields nothing, a lenient parser is used as a fallback.
 *
 * Returns: the time_t representation of the date string specified by
 * @str or (time_t) %0 on error (including a %NULL or empty @str, in
 * which case @tz_offset is set to 0). If @tz_offset is non-NULL, the
 * value of the timezone offset will be stored.
 **/
time_t
g_mime_utils_header_decode_date (const char *str, int *tz_offset)
{
	date_token *tokens, *next;
	time_t date;

	/* a NULL string is treated like one that contains no tokens */
	tokens = str ? datetok (str) : NULL;
	if (!tokens) {
		if (tz_offset)
			*tz_offset = 0;

		return (time_t) 0;
	}

	if (!(date = parse_rfc822_date (tokens, tz_offset)))
		date = parse_broken_date (tokens, tz_offset);

	/* cleanup */
	for ( ; tokens != NULL; tokens = next) {
		next = tokens->next;
		date_token_free (tokens);
	}

	return date;
}


/**
 * g_mime_utils_generate_message_id:
 * @fqdn: Fully qualified domain name
 *
 * Generates a unique Message-Id of the form
 * "time.pid.counter@fqdn". If @fqdn is %NULL, the host's name is
 * looked up (falling back to "localhost.localdomain").
 *
 * Returns: a newly allocated unique string in an addr-spec format
 * suitable for use as a Message-Id.
 **/
char *
g_mime_utils_generate_message_id (const char *fqdn)
{
#ifdef G_THREADS_ENABLED
	static GStaticMutex mutex = G_STATIC_MUTEX_INIT;
#define MUTEX_LOCK()   g_static_mutex_lock (&mutex)
#define MUTEX_UNLOCK() g_static_mutex_unlock (&mutex)
#else
#define MUTEX_LOCK()
#define MUTEX_UNLOCK()
#endif
	static unsigned long int count = 0;
	/* The host name buffers live at function scope because fqdn may
	 * end up pointing into them and is used after the lookup block. */
#ifdef HAVE_UTSNAME_DOMAINNAME
	struct utsname unam;
#else
	char host[MAXHOSTNAMELEN + 1];
#endif
	const char *hostname = NULL;
	char *name = NULL;
	char *msgid;

	if (!fqdn) {
#ifdef HAVE_UTSNAME_DOMAINNAME
		if (uname (&unam) != 0) {
			/* leave no uninitialised data behind on failure */
			unam.nodename[0] = '\0';
			unam.domainname[0] = '\0';
		}

		hostname = unam.nodename;

		if (unam.domainname[0])
			name = g_strdup_printf ("%s.%s", hostname, unam.domainname);
#else /* ! HAVE_UTSNAME_DOMAINNAME */
		/* defined even when gethostname() is unavailable */
		host[0] = '\0';

#ifdef HAVE_GETHOSTNAME
		host[MAXHOSTNAMELEN] = '\0';
		if (gethostname (host, MAXHOSTNAMELEN) == 0) {
#ifdef HAVE_GETDOMAINNAME
			size_t domainlen = MAXHOSTNAMELEN;
			char *domain;
			int rv;

			domain = g_malloc (domainlen);

			/* grow the buffer until the domain name fits */
			while ((rv = getdomainname (domain, domainlen)) == -1 && errno == EINVAL) {
				domainlen += MAXHOSTNAMELEN;
				domain = g_realloc (domain, domainlen);
			}

			if (rv == 0 && domain[0]) {
				if (host[0]) {
					name = g_strdup_printf ("%s.%s", host, domain);
					g_free (domain);
				} else {
					/* ownership of domain moves to name */
					name = domain;
				}
			} else {
				/* lookup failed or returned nothing useful */
				g_free (domain);
			}
#endif /* HAVE_GETDOMAINNAME */
		} else {
			host[0] = '\0';
		}
#endif /* HAVE_GETHOSTNAME */
		hostname = host;
#endif /* HAVE_UTSNAME_DOMAINNAME */

#ifdef HAVE_GETADDRINFO
		if (!name && hostname[0]) {
			/* we weren't able to get a domain name */
			struct addrinfo hints, *res;

			memset (&hints, 0, sizeof (hints));
			hints.ai_flags = AI_CANONNAME;

			if (getaddrinfo (hostname, NULL, &hints, &res) == 0) {
				name = g_strdup (res->ai_canonname);
				freeaddrinfo (res);
			}
		}
#endif /* HAVE_GETADDRINFO */

		fqdn = name != NULL ? name : (hostname[0] ? hostname : "localhost.localdomain");
	}

	/* the counter is shared between threads */
	MUTEX_LOCK ();
	msgid = g_strdup_printf ("%lu.%lu.%lu@%s", (unsigned long int) time (NULL),
				 (unsigned long int) getpid (), count++, fqdn);
	MUTEX_UNLOCK ();

	g_free (name);

	return msgid;
}

/* Decodes an addr-spec ("local-part@domain") starting at *@in.
 *
 * The local-part is a word optionally followed by further '.'-separated
 * words; the domain is appended by decode_domain().
 *
 * Returns a newly allocated string and advances *@in past the
 * addr-spec on success.  Returns NULL on a parse error, in which case
 * *@in has only been advanced over leading whitespace. */
static char *
decode_addrspec (const char **in)
{
	const char *cursor, *atom;
	GString *buf;

	decode_lwsp (in);
	cursor = *in;

	atom = decode_word (&cursor);
	if (atom == NULL) {
		w(g_warning ("No local-part in addr-spec: %s", *in));
		return NULL;
	}

	buf = g_string_new ("");
	g_string_append_len (buf, atom, (size_t) (cursor - atom));

	/* get the rest of the local-part */
	decode_lwsp (&cursor);
	while (*cursor == '.') {
		g_string_append_c (buf, '.');
		cursor++;

		atom = decode_word (&cursor);
		if (atom == NULL) {
			w(g_warning ("Invalid local-part in addr-spec: %s", *in));
			g_string_free (buf, TRUE);
			return NULL;
		}

		g_string_append_len (buf, atom, (size_t) (cursor - atom));
		decode_lwsp (&cursor);
	}

	/* we should be at the '@' now... */
	if (*cursor++ != '@') {
		w(g_warning ("Invalid addr-spec; missing '@': %s", *in));
		g_string_free (buf, TRUE);
		return NULL;
	}

	g_string_append_c (buf, '@');
	if (!decode_domain (&cursor, buf)) {
		w(g_warning ("No domain in addr-spec: %s", *in));
		g_string_free (buf, TRUE);
		return NULL;
	}

	*in = cursor;

	/* hand the character data over to the caller */
	return g_string_free (buf, FALSE);
}

/* Decodes a msg-id ("<addr-spec>") starting at *@in and advances *@in
 * past what was consumed.  Missing angle brackets are tolerated.
 *
 * When no valid addr-spec can be decoded, the raw text up to (not
 * including) the next '>' or the end of the string is returned
 * instead, and *@in is left at that position.
 *
 * Returns a newly allocated string. */
static char *
decode_msgid (const char **in)
{
	const char *cursor = *in;
	const char *raw;
	char *id;

	decode_lwsp (&cursor);
	if (*cursor == '<') {
		cursor++;
	} else {
		w(g_warning ("Invalid msg-id; missing '<': %s", *in));
	}

	decode_lwsp (&cursor);
	id = decode_addrspec (&cursor);
	if (id != NULL) {
		decode_lwsp (&cursor);
		if (*cursor == '>') {
			cursor++;
		} else {
			w(g_warning ("Invalid msg-id; missing '>': %s", *in));
		}

		*in = cursor;
		return id;
	}

	w(g_warning ("Invalid msg-id; missing addr-spec: %s", *in));

	/* salvage whatever text precedes the closing bracket */
	raw = cursor;
	while (*cursor && *cursor != '>')
		cursor++;

	id = g_strndup (raw, (size_t) (cursor - raw));
	*in = cursor;

	return id;
}


/**
 * g_mime_utils_decode_message_id:
 * @message_id: string containing a message-id
 *
 * Decodes a msg-id as defined by rfc822.
 *
 * Returns: a newly allocated string holding the addr-spec portion of
 * the msg-id.
 **/
char *
g_mime_utils_decode_message_id (const char *message_id)
{
	const char *cursor;

	g_return_val_if_fail (message_id != NULL, NULL);

	cursor = message_id;

	return decode_msgid (&cursor);
}


/**
 * g_mime_references_decode:
 * @text: string containing a list of msg-ids
 *
 * Decodes a list of msg-ids as in the References and/or In-Reply-To
 * headers defined in rfc822. Words that are not msg-ids (phrases) are
 * skipped; decoding stops at the first element that cannot be parsed.
 *
 * Returns: a list of referenced msg-ids, in the order they appear in
 * @text, or %NULL if none were found.
 **/
GMimeReferences *
g_mime_references_decode (const char *text)
{
	GMimeReferences *refs = NULL, *ref;
	GMimeReferences **tail = &refs;	/* where the next node gets attached */
	const char *inptr = text;
	char *msgid;

	g_return_val_if_fail (text != NULL, NULL);

	while (*inptr) {
		decode_lwsp (&inptr);
		if (*inptr == '<') {
			/* looks like a msg-id */
			if ((msgid = decode_msgid (&inptr))) {
				ref = g_new (GMimeReferences, 1);
				ref->next = NULL;
				ref->msgid = msgid;
				*tail = ref;
				tail = &ref->next;
			} else {
				w(g_warning ("Invalid References header: %s", inptr));
				break;
			}
		} else if (*inptr) {
			/* looks like part of a phrase */
			if (!decode_word (&inptr)) {
				w(g_warning ("Invalid References header: %s", inptr));
				break;
			}
		}
	}

	return refs;
}


/**
 * g_mime_references_append:
 * @refs: the address of a #GMimeReferences list
 * @msgid: a message-id string
 *
 * Appends a reference to msgid to the end of the list of references.
 * @msgid is copied; *@refs may be %NULL (an empty list).
 **/
void
g_mime_references_append (GMimeReferences **refs, const char *msgid)
{
	GMimeReferences **link, *ref;

	g_return_if_fail (refs != NULL);
	g_return_if_fail (msgid != NULL);

	/* find the NULL link that terminates the list */
	link = refs;
	while (*link != NULL)
		link = &(*link)->next;

	ref = g_new (GMimeReferences, 1);
	ref->msgid = g_strdup (msgid);
	ref->next = NULL;

	*link = ref;
}


/**
 * g_mime_references_free:
 * @refs: a #GMimeReferences list
 *
 * Frees every node of the #GMimeReferences list together with the
 * msgid string it holds. Passing %NULL is a no-op.
 **/
void
g_mime_references_free (GMimeReferences *refs)
{
	while (refs != NULL) {
		GMimeReferences *following = refs->next;

		g_free (refs->msgid);
		g_free (refs);

		refs = following;
	}
}


/**
 * g_mime_references_clear:
 * @refs: address of a #GMimeReferences list
 *
 * Frees the #GMimeReferences list stored at @refs and resets the
 * stored pointer to %NULL.
 **/
void
g_mime_references_clear (GMimeReferences **refs)
{
	GMimeReferences *list;

	g_return_if_fail (refs != NULL);

	list = *refs;
	*refs = NULL;

	g_mime_references_free (list);
}


/**
 * g_mime_references_get_next:
 * @ref: a #GMimeReferences list
 *
 * Advances to the next reference node in the #GMimeReferences list.
 *
 * Returns: the node following @ref, or %NULL if @ref is %NULL or is
 * the last node.
 **/
const GMimeReferences *
g_mime_references_get_next (const GMimeReferences *ref)
{
	if (ref == NULL)
		return NULL;

	return ref->next;
}


/**
 * g_mime_references_get_message_id:
 * @ref: a #GMimeReferences list
 *
 * Gets the Message-Id reference from the #GMimeReferences node.
 *
 * Returns: the Message-Id string held by @ref, or %NULL if @ref is
 * %NULL.
 **/
const char *
g_mime_references_get_message_id (const GMimeReferences *ref)
{
	if (ref == NULL)
		return NULL;

	return ref->msgid;
}


/* Checks whether the @len bytes at @inptr have the outer shape of an
 * rfc2047 encoded-word: "=?" charset "?" (q|Q|b|B) "?" ... "?=".
 * Only the framing is verified, not the payload. */
static gboolean
is_rfc2047_token (const char *inptr, size_t len)
{
	const char *p;
	size_t left;

	if (len < 8)
		return FALSE;
	if (strncmp (inptr, "=?", 2) != 0)
		return FALSE;
	if (strncmp (inptr + len - 2, "?=", 2) != 0)
		return FALSE;

	/* skip past the charset */
	p = inptr + 2;
	left = len - 2;
	for ( ; *p != '?' && left > 0; p++, left--)
		;

	if (*p != '?' || left < 4)
		return FALSE;

	/* the encoding designator must be Q or B, in either case */
	switch (p[1]) {
	case 'q':
	case 'Q':
	case 'b':
	case 'B':
		break;
	default:
		return FALSE;
	}

	/* ... and must itself be terminated by a '?' */
	return (p[2] == '?') ? TRUE : FALSE;
}

/* Folds the header line @in so that output lines aim to stay within
 * GMIME_FOLD_LEN columns, breaking with "\n\t" (or by turning existing
 * whitespace into a break).
 *
 * Input no longer than GMIME_FOLD_LEN + 1 bytes is returned as an
 * unmodified copy.  Otherwise the field name (everything up to the
 * first ':', space, tab or newline) is copied verbatim and the rest is
 * processed word by word up to the first '\n'.  When @structured is
 * FALSE, over-long words that are not rfc2047 tokens are cut into
 * pieces; when TRUE, words are never split.
 *
 * Returns a newly allocated string. */
static char *
header_fold (const char *in, gboolean structured)
{
	gboolean last_was_lwsp = FALSE;
	register const char *inptr;
	size_t len, outlen, i;
	size_t fieldlen;
	GString *out;
	char *ret;

	inptr = in;
	len = strlen (in);
	if (len <= GMIME_FOLD_LEN + 1)
		return g_strdup (in);

	out = g_string_new ("");
	fieldlen = strcspn (inptr, ": \t\n");
	g_string_append_len (out, inptr, fieldlen);
	outlen = fieldlen;	/* outlen tracks the length of the current output line */
	inptr += fieldlen;

	while (*inptr && *inptr != '\n') {
		/* len is the length of the next word (0 when sitting on whitespace) */
		len = strcspn (inptr, " \t\n");

		if (len > 1 && outlen + len > GMIME_FOLD_LEN) {
			/* the word does not fit: fold first, unless we are still
			 * right behind the field name */
			if (outlen > 1 && out->len > fieldlen + 2) {
				if (last_was_lwsp) {
					/* reuse the whitespace just emitted as the fold's
					 * continuation character */
					if (structured)
						out->str[out->len - 1] = '\t';

					g_string_insert_c (out, out->len - 1, '\n');
				} else
					g_string_append (out, "\n\t");
				outlen = 1;
			}

			if (!structured && !is_rfc2047_token (inptr, len)) {
				/* check for very long words, just cut them up */
				/* NOTE(review): if outlen can exceed GMIME_FOLD_LEN here
				 * (the fold above is conditional), the size_t
				 * subtractions below would wrap -- TODO confirm.
				 * Any remainder shorter than a full line is left for
				 * the next iteration of the outer loop. */
				while (outlen + len > GMIME_FOLD_LEN) {
					for (i = 0; i < GMIME_FOLD_LEN - outlen; i++)
						g_string_append_c (out, inptr[i]);
					inptr += GMIME_FOLD_LEN - outlen;
					len -= GMIME_FOLD_LEN - outlen;
					g_string_append (out, "\n\t");
					outlen = 1;
				}
			} else {
				g_string_append_len (out, inptr, len);
				outlen += len;
				inptr += len;
			}
			last_was_lwsp = FALSE;
		} else if (len > 0) {
			/* the word fits on the current line */
			g_string_append_len (out, inptr, len);
			outlen += len;
			inptr += len;
			last_was_lwsp = FALSE;
		} else {
			last_was_lwsp = TRUE;
			if (*inptr == '\t') {
				/* tabs are a good place to fold, odds
				   are that this is where the previous
				   mailer folded it */
				g_string_append (out, "\n\t");
				outlen = 1;
				while (is_blank (*inptr))
					inptr++;
			} else {
				g_string_append_c (out, *inptr++);
				outlen++;
			}
		}
	}

	/* keep the terminating newline of the input, without doubling it */
	if (*inptr == '\n' && out->str[out->len - 1] != '\n')
		g_string_append_c (out, '\n');

	ret = out->str;
	g_string_free (out, FALSE);

	return ret;
}


/**
 * g_mime_utils_structured_header_fold:
 * @str: input string
 *
 * Folds a structured header according to the rules in rfc822.
 *
 * Returns: a newly allocated string containing the folded header.
 **/
char *
g_mime_utils_structured_header_fold (const char *str)
{
	char *folded;

	folded = header_fold (str, TRUE);

	return folded;
}


/**
 * g_mime_utils_unstructured_header_fold:
 * @str: input string
 *
 * Folds an unstructured header according to the rules in rfc822.
 *
 * Returns: a newly allocated string containing the folded header.
 **/
char *
g_mime_utils_unstructured_header_fold (const char *str)
{
	char *folded;

	folded = header_fold (str, FALSE);

	return folded;
}


/**
 * g_mime_utils_header_fold:
 * @str: input string
 *
 * Folds a structured header according to the rules in rfc822. This
 * behaves exactly like g_mime_utils_structured_header_fold().
 *
 * Returns: a newly allocated string containing the folded header.
 **/
char *
g_mime_utils_header_fold (const char *str)
{
	char *folded;

	folded = header_fold (str, TRUE);

	return folded;
}


/**
 * g_mime_utils_header_printf:
 * @format: string format
 * @Varargs: arguments
 *
 * Formats a header from @format and the @Varargs, printf-style, and
 * folds the result as a structured header.
 *
 * Returns: a newly allocated string containing the folded header
 * specified by @format and the following arguments.
 **/
char *
g_mime_utils_header_printf (const char *format, ...)
{
	char *unfolded, *folded;
	va_list args;

	va_start (args, format);
	unfolded = g_strdup_vprintf (format, args);
	va_end (args);

	folded = header_fold (unfolded, TRUE);

	/* only the folded copy is handed back */
	g_free (unfolded);

	return folded;
}

/* Reports whether @string contains, outside of any double-quoted
 * section, a tspecial character or a '.'.  A backslash hides the
 * character that follows it from the scan. */
static gboolean
need_quotes (const char *string)
{
	gboolean in_quotes = FALSE;
	const char *p;

	for (p = string; *p; p++) {
		switch (*p) {
		case '\\':
			/* skip the escaped character; a trailing backslash
			 * simply ends the scan */
			if (p[1] == '\0')
				return FALSE;
			p++;
			break;
		case '"':
			in_quotes = !in_quotes;
			break;
		default:
			if (!in_quotes && (is_tspecial (*p) || *p == '.'))
				return TRUE;
			break;
		}
	}

	return FALSE;
}

/**
 * g_mime_utils_quote_string:
 * @str: input string
 *
 * Quotes @str as needed according to the rules in rfc2045.
 *
 * Backslashes are always escaped with a backslash. If the string has
 * to be quoted, it is wrapped in double quotes and embedded double
 * quotes are escaped as well.
 *
 * Returns: an allocated string containing the escaped and quoted (if
 * needed to be) input string. The decision to quote the string is
 * based on whether or not the input string contains any 'tspecials'
 * as defined by rfc2045.
 **/
char *
g_mime_utils_quote_string (const char *str)
{
	GString *buf;
	gboolean wrap;
	const char *p;

	buf = g_string_new ("");
	wrap = need_quotes (str);

	if (wrap)
		g_string_append_c (buf, '"');

	for (p = str; *p != '\0'; p++) {
		if (*p == '\\' || (wrap && *p == '"'))
			g_string_append_c (buf, '\\');

		g_string_append_c (buf, *p);
	}

	if (wrap)
		g_string_append_c (buf, '"');

	/* release the GString wrapper but keep its character data */
	return g_string_free (buf, FALSE);
}


/**
 * g_mime_utils_unquote_string:
 * @str: input string
 *
 * Unquotes and unescapes a string in place: unescaped double quotes
 * are removed and each backslash escape is replaced by the character
 * it escapes. A trailing lone backslash is dropped. Does nothing if
 * @str is %NULL.
 **/
void
g_mime_utils_unquote_string (char *str)
{
	char *inptr = str;	/* read position */
	char *outptr = str;	/* write position; never ahead of inptr */
	int escaped = 0;	/* non-zero when the previous byte was an unescaped '\\' */

	if (!str)
		return;

	while (*inptr) {
		char c = *inptr++;

		if (escaped) {
			/* whatever follows a backslash is kept literally */
			*outptr++ = c;
			escaped = 0;
		} else if (c == '\\') {
			escaped = 1;
		} else if (c != '"') {
			*outptr++ = c;
		}
	}

	*outptr = '\0';
}


/**
 * g_mime_utils_text_is_8bit:
 * @text: text to check for 8bit chars
 * @len: text length
 *
 * Determines if @text contains 8bit characters within the first @len
 * bytes. Scanning also stops at the first nul byte, if one occurs
 * before @len bytes have been examined.
 *
 * Returns: %TRUE if the text contains 8bit characters or %FALSE
 * otherwise.
 **/
gboolean
g_mime_utils_text_is_8bit (const unsigned char *text, size_t len)
{
	const unsigned char *inptr, *inend;

	g_return_val_if_fail (text != NULL, FALSE);

	inend = text + len;

	/* test the bound before dereferencing so that no byte beyond
	 * text[len - 1] is ever read */
	for (inptr = text; inptr < inend && *inptr; inptr++) {
		if (*inptr > (unsigned char) 127)
			return TRUE;
	}

	return FALSE;
}


/**
 * g_mime_utils_best_encoding:
 * @text: text to encode
 * @len: text length
 *
 * Determines the best content encoding for the first @len bytes of
 * @text, based on the share of bytes with the high bit set: up to 17%
 * selects quoted-printable, anything above selects base64.
 *
 * Returns: a #GMimeContentEncoding that is determined to be the best
 * encoding type for the specified block of text. ("best" in this
 * particular case means smallest output size)
 **/
GMimeContentEncoding
g_mime_utils_best_encoding (const unsigned char *text, size_t len)
{
	const unsigned char *cursor = text;
	const unsigned char *limit = text + len;
	size_t high_bytes = 0;

	while (cursor < limit) {
		if (*cursor > (unsigned char) 127)
			high_bytes++;
		cursor++;
	}

	if ((float) high_bytes <= len * 0.17)
		return GMIME_CONTENT_ENCODING_QUOTEDPRINTABLE;

	return GMIME_CONTENT_ENCODING_BASE64;
}


/**
 * charset_convert:
 * @cd: iconv converter
 * @inbuf: input text buffer to convert
 * @inleft: length of the input buffer
 * @outp: pointer to output buffer
 * @outlenp: pointer to output buffer length
 * @ninval: the number of invalid bytes in @inbuf
 *
 * Converts the input buffer from one charset to another using the
 * @cd. On completion, @outp will point to the output buffer
 * containing the converted text (nul-terminated), @outlenp will be
 * the size of the @outp buffer (note: not the strlen() of @outp) and
 * @ninval will contain the number of bytes which could not be
 * converted.
 *
 * Bytes which cannot be converted from @inbuf will appear as '?'
 * characters in the output buffer. An incomplete multibyte sequence
 * at the end of the input is dropped and counted in @ninval.
 *
 * If *@outp is non-NULL, then it is assumed that it points to a
 * pre-allocated buffer of length *@outlenp (with room for one more
 * byte for the terminator, as buffers allocated here have). This is
 * done so that the same output buffer can be reused multiple times.
 * The buffer may be reallocated; always use the value stored back in
 * *@outp.
 *
 * Returns: the number of bytes written to the output buffer.
 * NOTE(review): this count includes the terminating nul, because the
 * output pointer is advanced past it before the subtraction; the
 * callers are not visible here, so the value is left unchanged.
 **/
static size_t
charset_convert (iconv_t cd, const char *inbuf, size_t inleft, char **outp, size_t *outlenp, size_t *ninval)
{
	size_t outlen, outleft, rc, n = 0;
	char *outbuf, *out;

	if (*outp == NULL) {
		outleft = outlen = (inleft * 2) + 16;
		outbuf = out = g_malloc (outlen + 1);
	} else {
		outleft = outlen = *outlenp;
		outbuf = out = *outp;
	}

	do {
		rc = iconv (cd, (char **) &inbuf, &inleft, &outbuf, &outleft);
		if (rc == (size_t) -1) {
			if (errno == EINVAL) {
				/* incomplete sequence at the end of the input buffer */
				n += inleft;
				break;
			}

#ifdef G_OS_WIN32
			/* seems that GnuWin32's libiconv 1.9 does not set errno in
			 * the E2BIG case, so we have to fake it */
			if (outleft <= inleft)
				errno = E2BIG;
#endif

			if (errno == E2BIG || outleft == 0) {
				/* need to grow the output buffer.  This is also done
				 * when an invalid sequence is hit with no room left:
				 * writing the '?' replacement below would otherwise
				 * wrap outleft around and let later writes run past
				 * the end of the buffer.  The conversion is simply
				 * retried with the larger buffer. */
				outlen += (inleft * 2) + 16;
				rc = (size_t) (outbuf - out);
				out = g_realloc (out, outlen + 1);
				outleft = outlen - rc;
				outbuf = out + rc;
			} else {
				/* invalid byte(-sequence) in the input buffer */
				*outbuf++ = '?';
				outleft--;
				inleft--;
				inbuf++;
				n++;
			}
		}
	} while (inleft > 0);

	/* flush any pending shift sequence and reset the converter state */
	iconv (cd, NULL, NULL, &outbuf, &outleft);
	*outbuf++ = '\0';

	*outlenp = outlen;
	*outp = out;
	*ninval = n;

	return (outbuf - out);
}


/* bit flags used by g_mime_utils_decode_8bit() to record which of the
 * implicit fallback charsets (UTF-8 and the locale charset) are
 * already covered and so do not need to be appended to the list of
 * candidate charsets */
#define USER_CHARSETS_INCLUDE_UTF8    (1 << 0)
#define USER_CHARSETS_INCLUDE_LOCALE  (1 << 1)


/**
 * g_mime_utils_decode_8bit:
 * @text: input text in unknown 8bit/multibyte character set
 * @len: input text length
 *
 * Attempts to convert text in an unknown 8bit/multibyte charset into
 * UTF-8 by finding the charset which will convert the most bytes into
 * valid UTF-8 characters as possible. If no exact match can be found,
 * it will choose the best match and convert invalid byte sequences
 * into question-marks (?) in the returned string buffer.
 *
 * The candidate charsets are the user-supplied charsets (see
 * g_mime_user_charsets()), followed by UTF-8 and the locale charset
 * when those are not already part of the user-supplied list.
 *
 * Returns: a newly allocated UTF-8 string representation of @text, or
 * %NULL if @text is %NULL.
 **/
char *
g_mime_utils_decode_8bit (const char *text, size_t len)
{
	const char **charsets, **user_charsets, *locale, *best;
	size_t outleft, outlen, min, ninval;
	unsigned int included = 0;
	iconv_t cd;
	char *out;
	int i = 0;

	g_return_val_if_fail (text != NULL, NULL);

	/* an unknown locale charset, or one that is UTF-8 anyway, never
	 * needs its own slot in the candidate list */
	locale = g_mime_locale_charset ();
	if (!locale || !g_ascii_strcasecmp (locale, "UTF-8"))
		included |= USER_CHARSETS_INCLUDE_LOCALE;

	if ((user_charsets = g_mime_user_charsets ())) {
		while (user_charsets[i])
			i++;
	}

	/* room for the user charsets + UTF-8 + locale + NULL terminator */
	charsets = g_alloca (sizeof (char *) * (i + 3));
	i = 0;

	if (user_charsets) {
		while (user_charsets[i]) {
			/* keep a record of whether or not the user-supplied
			 * charsets include UTF-8 and/or the default fallback
			 * charset so that we avoid doubling our efforts for
			 * these 2 charsets. We could have used a hash table
			 * to keep track of unique charsets, but we can
			 * (hopefully) assume that user_charsets is a unique
			 * list of charsets with no duplicates. */
			if (!g_ascii_strcasecmp (user_charsets[i], "UTF-8"))
				included |= USER_CHARSETS_INCLUDE_UTF8;

			if (locale && !g_ascii_strcasecmp (user_charsets[i], locale))
				included |= USER_CHARSETS_INCLUDE_LOCALE;

			charsets[i] = user_charsets[i];
			i++;
		}
	}

	if (!(included & USER_CHARSETS_INCLUDE_UTF8))
		charsets[i++] = "UTF-8";

	if (!(included & USER_CHARSETS_INCLUDE_LOCALE))
		charsets[i++] = locale;

	charsets[i] = NULL;

	min = len;
	best = charsets[0];

	/* this buffer is shared by every conversion attempt below;
	 * charset_convert() grows it as needed and keeps outleft in sync */
	outleft = (len * 2) + 16;
	out = g_malloc (outleft + 1);

	for (i = 0; charsets[i]; i++) {
		if ((cd = g_mime_iconv_open ("UTF-8", charsets[i])) == (iconv_t) -1)
			continue;

		outlen = charset_convert (cd, text, len, &out, &outleft, &ninval);

		g_mime_iconv_close (cd);

		/* a flawless conversion: no need to look any further */
		if (ninval == 0)
			return g_realloc (out, outlen + 1);

		if (ninval < min) {
			best = charsets[i];
			min = ninval;
		}
	}

	/* if we get here, then none of the charsets fit the 8bit text flawlessly...
	 * try to find the one that fit the best and use that to convert what we can,
	 * replacing any byte we can't convert with a '?' */

	if ((cd = g_mime_iconv_open ("UTF-8", best)) == (iconv_t) -1) {
		/* this shouldn't happen... but if we are here, then
		 * it did...  the only thing we can do at this point
		 * is replace the 8bit garbage and pray */
		register const char *inptr = text;
		const char *inend = inptr + len;
		char *outbuf = out;

		/* every input byte yields exactly one output byte; the input
		 * pointer must advance for replaced bytes too or this loop
		 * would never terminate and would run off the end of out */
		while (inptr < inend) {
			*outbuf++ = is_ascii (*inptr) ? *inptr : '?';
			inptr++;
		}

		*outbuf++ = '\0';

		return g_realloc (out, (size_t) (outbuf - out));
	}

	outlen = charset_convert (cd, text, len, &out, &outleft, &ninval);

	g_mime_iconv_close (cd);

	return g_realloc (out, outlen + 1);
}


/* Decodes the rfc2047 flavour of quoted-printable (the "Q" encoding):
 * "=XX" stands for the byte with hex value XX and '_' stands for a
 * space; everything else is copied through untouched.
 *
 * The decoded bytes are stored in @out, which never receives more
 * than @len bytes. Returns the number of bytes written, or -1 if the
 * input ends in the middle of an "=XX" escape. The two characters
 * following '=' are not checked for being hex digits. */
static ssize_t
quoted_decode (const unsigned char *in, size_t len, unsigned char *out)
{
	unsigned char *dest = out;
	size_t i = 0;

	while (i < len) {
		unsigned char ch = in[i++];
		unsigned char hi, lo;
		int hv, lv;

		switch (ch) {
		case '=':
			/* an escape needs two more characters */
			if (len - i < 2)
				return -1;

			hi = toupper (in[i++]);
			lo = toupper (in[i++]);

			hv = hi >= 'A' ? hi - 'A' + 10 : hi - '0';
			lv = lo >= 'A' ? lo - 'A' + 10 : lo - '0';

			*dest++ = ((hv & 0x0f) << 4) | (lv & 0x0f);
			break;
		case '_':
			/* rfc2047 shorthand for an encoded space */
			*dest++ = ' ';
			break;
		default:
			*dest++ = ch;
			break;
		}
	}

	return (ssize_t) (dest - out);
}

/* Cheap pre-check for an rfc2047 encoded-word: true if the @len bytes
 * at @atom are at least 7 bytes long, start with "=?" and end with
 * "?=". The inner charset/encoding/payload structure is not checked
 * here. Arguments are parenthesized so that expressions can be passed
 * safely (they are still evaluated more than once). */
#define is_rfc2047_encoded_word(atom, len) ((len) >= 7 && !strncmp ((atom), "=?", 2) && !strncmp ((atom) + (len) - 2, "?=", 2))

/* Decodes a single rfc2047 encoded-word token of the form
 *   "=?" charset ["*" language] "?" encoding "?" encoded-text "?="
 * into UTF-8.
 *
 * @in/@inlen must describe a token that already satisfies
 * is_rfc2047_encoded_word() (the callers in this file check that
 * before calling), i.e. it is at least 7 bytes long and is wrapped in
 * "=?" and "?=".
 *
 * Returns a newly allocated string, or NULL if the token is
 * malformed or uses an unknown encoding. If the declared charset is
 * empty or cannot be opened, the decoded bytes are handed to
 * g_mime_utils_decode_8bit() instead. */
static char *
rfc2047_decode_word (const char *in, size_t inlen)
{
	const unsigned char *instart = (const unsigned char *) in;
	const register unsigned char *inptr = instart + 2;
	const unsigned char *inend = instart + inlen - 2;
	unsigned char *decoded;
	const char *charset;
	size_t len, ninval;
	char *charenc, *p;
	guint32 save = 0;
	ssize_t declen;
	int state = 0;
	iconv_t cd;
	char *buf;

	/* skip over the charset */
	if (!(inptr = memchr (inptr, '?', inend - inptr)))
		return NULL;

	/* there must be room for "?<enc>?" in front of the closing "?=".
	 * Without this check a token such as "=?a?b?=" would have the '?'
	 * of its terminator mistaken for the encoding delimiter, putting
	 * the start of the payload past inend and making the payload
	 * length underflow. */
	if ((inend - inptr) < 3 || inptr[2] != '?')
		return NULL;

	inptr++;

	switch (*inptr) {
	case 'B':
	case 'b':
		inptr += 2;
		len = (size_t) (inend - inptr);
		decoded = g_alloca (len);
		declen = g_mime_encoding_base64_decode_step (inptr, len, decoded, &state, &save);

		if (declen == -1) {
			d(fprintf (stderr, "encountered broken 'B' encoding\n"));
			return NULL;
		}
		break;
	case 'Q':
	case 'q':
		inptr += 2;
		len = (size_t) (inend - inptr);
		decoded = g_alloca (len);
		declen = quoted_decode (inptr, len, decoded);

		if (declen == -1) {
			d(fprintf (stderr, "encountered broken 'Q' encoding\n"));
			return NULL;
		}
		break;
	default:
		d(fprintf (stderr, "unknown encoding\n"));
		return NULL;
	}

	/* inptr now points at the payload, so the charset spans from just
	 * after the leading "=?" up to the '?' that is 3 bytes back */
	len = (inptr - 3) - (instart + 2);
	charenc = g_alloca (len + 1);
	memcpy (charenc, in + 2, len);
	charenc[len] = '\0';
	charset = charenc;

	/* rfc2231 updates rfc2047 encoded words...
	 * The ABNF given in RFC 2047 for encoded-words is:
	 *   encoded-word := "=?" charset "?" encoding "?" encoded-text "?="
	 * This specification changes this ABNF to:
	 *   encoded-word := "=?" charset ["*" language] "?" encoding "?" encoded-text "?="
	 */

	/* trim off the 'language' part if it's there... */
	if ((p = strchr (charset, '*')))
		*p = '\0';

	/* slight optimization? the payload claims to be UTF-8 already, so
	 * it is returned as-is (note: it is not validated) */
	if (!g_ascii_strcasecmp (charset, "UTF-8"))
		return g_strndup ((char *) decoded, declen);

	if (!charset[0] || (cd = g_mime_iconv_open ("UTF-8", charset)) == (iconv_t) -1) {
		w(g_warning ("Cannot convert from %s to UTF-8, header display may "
			     "be corrupt: %s", charset[0] ? charset : "unspecified charset",
			     g_strerror (errno)));

		return g_mime_utils_decode_8bit ((char *) decoded, declen);
	}

	/* charset_convert() grows this buffer if the converted text
	 * turns out to be larger than the decoded payload */
	len = declen;
	buf = g_malloc (len + 1);

	charset_convert (cd, (char *) decoded, declen, &buf, &len, &ninval);

	g_mime_iconv_close (cd);

#if w(!)0
	if (ninval > 0) {
		g_warning ("Failed to completely convert \"%.*s\" to UTF-8, display may be "
			   "corrupt: %s", (int) declen, decoded, g_strerror (errno));
	}
#endif

	return buf;
}


/**
 * g_mime_utils_header_decode_text:
 * @text: header text to decode
 *
 * Decodes an rfc2047 encoded 'text' header.
 *
 * Note: See g_mime_set_user_charsets() for details on how charset
 * conversion is handled for unencoded 8bit text and/or wrongly
 * specified rfc2047 encoded-word tokens.
 *
 * Returns: a newly allocated UTF-8 string representing the decoded
 * header (an empty string if @text is %NULL).
 **/
char *
g_mime_utils_header_decode_text (const char *text)
{
	gboolean enable_rfc2047_workarounds = _g_mime_enable_rfc2047_workarounds ();
	register const char *inptr = text;
	gboolean encoded = FALSE;
	const char *lwsp, *word;
	size_t nlwsp, n;
	gboolean ascii;
	char *decoded;
	GString *out;

	if (text == NULL)
		return g_strdup ("");

	out = g_string_sized_new (strlen (text) + 1);

	while (*inptr != '\0') {
		/* measure the whitespace in front of the next word */
		lwsp = inptr;
		while (is_lwsp (*inptr))
			inptr++;

		nlwsp = (size_t) (inptr - lwsp);

		if (*inptr != '\0') {
			word = inptr;
			ascii = TRUE;

			if (enable_rfc2047_workarounds) {
				if (!strncmp (inptr, "=?", 2)) {
					inptr += 2;

					/* skip past the charset (if one is even declared, sigh) */
					while (*inptr && *inptr != '?') {
						ascii = ascii && is_ascii (*inptr);
						inptr++;
					}

					/* sanity check encoding type
					 *
					 * Note: strchr() also matches the terminating nul of
					 * its haystack, so a nul encoding byte has to be
					 * rejected explicitly; otherwise inptr[2] would be
					 * read from beyond the end of the string. */
					if (inptr[0] != '?' || inptr[1] == '\0' ||
					    !strchr ("BbQq", inptr[1]) || inptr[2] != '?')
						goto non_rfc2047;

					inptr += 3;

					/* find the end of the rfc2047 encoded word token */
					while (*inptr && strncmp (inptr, "?=", 2) != 0) {
						ascii = ascii && is_ascii (*inptr);
						inptr++;
					}

					if (!strncmp (inptr, "?=", 2))
						inptr += 2;
				} else {
				non_rfc2047:
					/* stop if we encounter a possible rfc2047 encoded
					 * token even if it's inside another word, sigh. */
					while (*inptr && !is_lwsp (*inptr) &&
					       strncmp (inptr, "=?", 2) != 0) {
						ascii = ascii && is_ascii (*inptr);
						inptr++;
					}
				}
			} else {
				/* strict mode: words are simply delimited by whitespace */
				while (*inptr && !is_lwsp (*inptr)) {
					ascii = ascii && is_ascii (*inptr);
					inptr++;
				}
			}

			n = (size_t) (inptr - word);
			if (is_rfc2047_encoded_word (word, n)) {
				if ((decoded = rfc2047_decode_word (word, n))) {
					/* rfc2047 states that you must ignore all
					 * whitespace between encoded words */
					if (!encoded)
						g_string_append_len (out, lwsp, nlwsp);

					g_string_append (out, decoded);
					g_free (decoded);

					encoded = TRUE;
				} else {
					/* append lwsp and invalid rfc2047 encoded-word token */
					g_string_append_len (out, lwsp, nlwsp + n);
					encoded = FALSE;
				}
			} else {
				/* append lwsp */
				g_string_append_len (out, lwsp, nlwsp);

				/* append word token */
				if (!ascii) {
					/* *sigh* I hate broken mailers... */
					decoded = g_mime_utils_decode_8bit (word, n);
					g_string_append (out, decoded);
					g_free (decoded);
				} else {
					g_string_append_len (out, word, n);
				}

				encoded = FALSE;
			}
		} else {
			/* appending trailing lwsp */
			g_string_append_len (out, lwsp, nlwsp);
			break;
		}
	}

	/* hand the character data over to the caller */
	return g_string_free (out, FALSE);
}


/**
 * g_mime_utils_header_decode_phrase:
 * @phrase: header to decode
 *
 * Decodes an rfc2047 encoded 'phrase' header.
 *
 * Note: See g_mime_set_user_charsets() for details on how charset
 * conversion is handled for unencoded 8bit text and/or wrongly
 * specified rfc2047 encoded-word tokens.
 *
 * Returns: a newly allocated UTF-8 string representing the decoded
 * header (an empty string if @phrase is %NULL).
 **/
char *
g_mime_utils_header_decode_phrase (const char *phrase)
{
	gboolean prev_encoded = FALSE;
	const char *cur, *ws, *token;
	size_t ws_len, token_len;
	gboolean seven_bit;
	GString *buf;
	char *word;

	if (phrase == NULL)
		return g_strdup ("");

	buf = g_string_sized_new (strlen (phrase) + 1);

	for (cur = phrase; *cur != '\0'; ) {
		/* measure the run of whitespace leading up to the next token */
		ws = cur;
		while (is_lwsp (*cur))
			cur++;

		ws_len = (size_t) (cur - ws);
		token = cur;

		if (!is_atom (*cur)) {
			/* not an atom: take everything up to the next
			 * whitespace and pass it through, repairing raw
			 * 8bit text along the way */
			g_string_append_len (buf, ws, ws_len);

			seven_bit = TRUE;
			for ( ; *cur && !is_lwsp (*cur); cur++) {
				if (!is_ascii (*cur))
					seven_bit = FALSE;
			}

			token_len = (size_t) (cur - token);

			if (seven_bit) {
				g_string_append_len (buf, token, token_len);
			} else {
				/* *sigh* I hate broken mailers... */
				word = g_mime_utils_decode_8bit (token, token_len);
				g_string_append (buf, word);
				g_free (word);
			}

			prev_encoded = FALSE;
			continue;
		}

		/* consume the whole atom */
		do {
			cur++;
		} while (is_atom (*cur));

		token_len = (size_t) (cur - token);

		word = NULL;
		if (is_rfc2047_encoded_word (token, token_len))
			word = rfc2047_decode_word (token, token_len);

		if (word == NULL) {
			/* either a plain atom or an encoded-word that failed
			 * to decode: copy the whitespace and token verbatim */
			g_string_append_len (buf, ws, ws_len + token_len);
			prev_encoded = FALSE;
			continue;
		}

		/* rfc2047 states that you must ignore all whitespace
		 * between encoded words */
		if (!prev_encoded)
			g_string_append_len (buf, ws, ws_len);

		g_string_append (buf, word);
		g_free (word);

		prev_encoded = TRUE;
	}

	return g_string_free (buf, FALSE);
}


/* Encodes @len bytes of @in with the rfc2047 flavour of
 * quoted-printable: a space becomes '_', a byte whose
 * gmime_special_table entry has a bit of @safemask set (other than
 * '_' itself) is copied as-is, and anything else is written as "=XX".
 *
 * Returns the number of bytes stored in @out; no nul terminator is
 * written. Each input byte produces at most 3 output bytes. */
static size_t
quoted_encode (const char *in, size_t len, unsigned char *out, gushort safemask)
{
	const unsigned char *src = (const unsigned char *) in;
	unsigned char *dest = out;
	size_t i;

	for (i = 0; i < len; i++) {
		unsigned char ch = src[i];

		if (ch == ' ') {
			*dest++ = '_';
			continue;
		}

		if (ch == '_' || !(gmime_special_table[ch] & safemask)) {
			/* needs to be escaped as a hex pair */
			*dest++ = '=';
			*dest++ = tohex[(ch >> 4) & 0xf];
			*dest++ = tohex[ch & 0xf];
			continue;
		}

		*dest++ = ch;
	}

	return (dest - out);
}

/* Appends @len bytes of the UTF-8 text @word to @string as a single
 * rfc2047 encoded-word ("=?charset?enc?payload?=").
 *
 * The text is first converted to @charset; if @charset is UTF-8, the
 * converter cannot be opened or the conversion yields nothing, the
 * original bytes are used and the word is labelled as UTF-8 instead.
 * The choice between the 'b' and 'q' encodings is delegated to
 * g_mime_utils_best_encoding(); @safemask is only used for 'q'. */
static void
rfc2047_encode_word (GString *string, const char *word, size_t len,
		     const char *charset, gushort safemask)
{
	unsigned char *payload;
	char *converted = NULL;
	char *src, *dst;
	guint32 save = 0;
	size_t maxlen, n;
	int state = 0;
	iconv_t cd;
	char method;

	if (g_ascii_strcasecmp (charset, "UTF-8") != 0) {
		cd = g_mime_iconv_open (charset, "UTF-8");
		if (cd != (iconv_t) -1) {
			converted = g_mime_iconv_strndup (cd, (char *) word, len);
			g_mime_iconv_close (cd);
		}
	}

	if (converted != NULL) {
		word = converted;
		len = strlen (converted);
	} else {
		/* nothing was converted, so the bytes are still UTF-8 */
		charset = "UTF-8";
	}

	switch (g_mime_utils_best_encoding ((const unsigned char *) word, len)) {
	case GMIME_CONTENT_ENCODING_QUOTEDPRINTABLE:
		maxlen = GMIME_QP_ENCODE_LEN (len);
		payload = g_alloca (maxlen + 1);
		method = 'q';

		n = quoted_encode (word, len, payload, safemask);
		payload[n] = '\0';
		break;
	case GMIME_CONTENT_ENCODING_BASE64:
		maxlen = GMIME_BASE64_ENCODE_LEN (len);
		payload = g_alloca (maxlen + 1);
		method = 'b';

		n = g_mime_encoding_base64_encode_close ((const unsigned char *) word, len, payload, &state, &save);
		payload[n] = '\0';

		/* remove \n chars as headers need to be wrapped differently:
		 * compact the string in place starting at the first newline */
		dst = strchr ((char *) payload, '\n');
		if (dst != NULL) {
			for (src = dst + 1; *src != '\0'; src++) {
				if (*src != '\n')
					*dst++ = *src;
			}

			*dst = '\0';
		}
		break;
	default:
		payload = NULL;
		method = '\0';
		g_assert_not_reached ();
	}

	g_free (converted);

	g_string_append_printf (string, "=?%s?%c?%s?=", charset, method, payload);
}


/* Kinds of word tokens produced while preparing a header for rfc2047
 * encoding. The numeric order matters: the tokenizer and the merging
 * code combine types with MAX(), so a later enumerator wins over an
 * earlier one. */
typedef enum {
	WORD_ATOM,     /* can be written out as-is */
	WORD_QSTRING,  /* written out as a quoted-string (phrases only) */
	WORD_2047      /* has to be written as an rfc2047 encoded-word */
} rfc822_word_t;

/* A token of the input string, kept in a singly-linked list. */
typedef struct _rfc822_word {
	/* next token in the list. This has to stay the first member:
	 * rfc2047_encode_get_rfc822_words() seeds its tail pointer by
	 * casting the address of the list head to an rfc822_word. */
	struct _rfc822_word *next;
	/* the token spans [start, end) within the input string */
	const char *start, *end;
	rfc822_word_t type;
	/* charset hint used by rfc2047_encode(): 0 = us-ascii,
	 * 1 = iso-8859-1, anything else = pick a charset based on the
	 * actual characters */
	int encoding;
} rfc822_word;

/* rfc822_word nodes are allocated with the GLib slice allocator */
#define rfc822_word_free(word) g_slice_free (rfc822_word, word)
#define rfc822_word_new() g_slice_new (rfc822_word)

/* okay, so 'unstructured text' fields don't actually contain 'word'
 * tokens, but we can group stuff similarly...
 *
 * Splits the UTF-8 string @in into a linked list of whitespace
 * delimited word tokens, classifying each one as an atom, a
 * quoted-string candidate (only when @phrase is set) or a word that
 * needs rfc2047 encoding. Words longer than GMIME_FOLD_PREENCODED
 * characters are broken up into several tokens.
 *
 * Bytes that do not form a valid UTF-8 character are stepped over one
 * at a time without being counted as part of a word.
 *
 * Returns the head of the list (to be released one node at a time
 * with rfc822_word_free()), or NULL if @in contains no words. */
static rfc822_word *
rfc2047_encode_get_rfc822_words (const char *in, gboolean phrase)
{
	rfc822_word *words, *tail, *word;
	rfc822_word_t type = WORD_ATOM;
	const char *inptr, *start, *last;
	int count = 0, encoding = 0;

	words = NULL;
	/* treat the list head itself as a node so that appending never
	 * needs a special case; relies on 'next' being the first member
	 * of rfc822_word */
	tail = (rfc822_word *) &words;

	last = start = inptr = in;
	while (inptr && *inptr) {
		const char *newinptr, *p;
		gunichar c;

		newinptr = g_utf8_next_char (inptr);

		/* g_utf8_next_char() only looks at the lead byte, so a
		 * multibyte sequence that is cut short by the end of the
		 * string would carry us past the nul terminator. Treat
		 * such a sequence as invalid before reading any of it. */
		for (p = inptr + 1; p < newinptr && *p != '\0'; p++)
			;

		if (p < newinptr) {
			w(g_warning ("Invalid UTF-8 sequence encountered"));
			inptr++;
			continue;
		}

		c = g_utf8_get_char (inptr);
		if (newinptr == NULL || !g_unichar_validate (c)) {
			w(g_warning ("Invalid UTF-8 sequence encountered"));
			inptr++;
			continue;
		}

		inptr = newinptr;

		if (c < 256 && is_lwsp (c)) {
			/* whitespace terminates the word collected so far */
			if (count > 0) {
				word = rfc822_word_new ();
				word->next = NULL;
				word->start = start;
				word->end = last;
				word->type = type;
				word->encoding = encoding;

				tail->next = word;
				tail = word;
				count = 0;
			}

			start = inptr;
			type = WORD_ATOM;
			encoding = 0;
		} else {
			count++;
			if (phrase && c < 128) {
				/* phrases can have qstring words */
				if (!is_atom (c))
					type = MAX (type, WORD_QSTRING);
			} else if (c > 127 && c < 256) {
				/* fits in iso-8859-1 */
				type = WORD_2047;
				encoding = MAX (encoding, 1);
			} else if (c >= 256) {
				/* needs some other charset */
				type = WORD_2047;
				encoding = 2;
			}

			if (count >= GMIME_FOLD_PREENCODED) {
				/* the word has gotten too long: break it here */
				word = rfc822_word_new ();
				word->next = NULL;
				word->start = start;
				word->end = inptr;
				word->type = type;
				word->encoding = encoding;

				tail->next = word;
				tail = word;
				count = 0;

				/* Note: don't reset 'type' as it
				 * needs to be preserved when breaking
				 * long words */
				start = inptr;
				encoding = 0;
			}
		}

		last = inptr;
	}

	/* flush the final word, if any */
	if (count > 0) {
		word = rfc822_word_new ();
		word->next = NULL;
		word->start = start;
		word->end = last;
		word->type = type;
		word->encoding = encoding;

		tail->next = word;
		tail = word;
	}

#if d(!)0
	printf ("rfc822 word tokens:\n");
	word = words;
	while (word) {
		printf ("\t'%.*s'; type=%d, encoding=%d\n",
			(int) (word->end - word->start), word->start,
			word->type, word->encoding);

		word = word->next;
	}
#endif

	return words;
}

/* true if a merged word of length @wlen is still short enough: words
 * of type WORD_2047 are held to GMIME_FOLD_PREENCODED, all other
 * types to GMIME_FOLD_LEN - 8 */
#define MERGED_WORD_LT_FOLDLEN(wlen, type) ((type) == WORD_2047 ? (wlen) < GMIME_FOLD_PREENCODED : (wlen) < (GMIME_FOLD_LEN - 8))

/* Decides whether @word should be merged with the token that follows
 * it (@next, which must not be NULL).
 *
 * Atoms and quoted-strings are never merged with a following encoded
 * word. An encoded word is only merged with a following run of atoms
 * when that run is at most 3 atoms long and is itself followed by
 * another token, and it is never merged when the token it ends up
 * being compared against is a quoted-string. In every remaining case
 * the answer depends on whether the merged span stays below the fold
 * length limit. */
static gboolean
should_merge_words (rfc822_word *word, rfc822_word *next)
{
	rfc822_word *scan;
	int natoms;

	if (word->type == WORD_ATOM) {
		if (next->type == WORD_2047)
			return FALSE;

		return (MERGED_WORD_LT_FOLDLEN (next->end - word->start, next->type));
	}

	if (word->type == WORD_QSTRING) {
		/* avoid merging with words that need to be rfc2047 encoded */
		if (next->type == WORD_2047)
			return FALSE;

		return (MERGED_WORD_LT_FOLDLEN (next->end - word->start, WORD_QSTRING));
	}

	if (word->type != WORD_2047)
		return FALSE;

	scan = next;
	if (scan->type == WORD_ATOM) {
		/* whether we merge or not is dependent upon:
		 * 1. the number of atoms in a row after 'word'
		 * 2. if there is another encword after the string of atoms.
		 */
		natoms = 0;
		do {
			scan = scan->next;
			natoms++;
		} while (scan && scan->type == WORD_ATOM);

		/* if all the words after the encword are atoms, don't merge */
		if (!scan || natoms > 3)
			return FALSE;
	}

	/* avoid merging with qstrings */
	if (scan->type == WORD_QSTRING)
		return FALSE;

	/* note: measured up to the end of the token the scan stopped on */
	return (MERGED_WORD_LT_FOLDLEN (scan->end - word->start, WORD_2047));
}

/* Coalesces neighbouring tokens of the list at *@wordsp in place, in
 * two passes: first runs of same-typed non-atom tokens, then
 * whatever should_merge_words() approves of. A merged token spans
 * from the start of the first token to the end of the second (so the
 * whitespace between them becomes part of it) and the absorbed node
 * is freed. */
static void
rfc2047_encode_merge_rfc822_words (rfc822_word **wordsp)
{
	rfc822_word *head = *wordsp;
	rfc822_word *cur, *victim;

	/* first pass: merge qstrings with adjacent qstrings and encwords with adjacent encwords */
	cur = head;
	while (cur != NULL && cur->next != NULL) {
		victim = cur->next;

		if (cur->type == WORD_ATOM || cur->type != victim->type ||
		    !MERGED_WORD_LT_FOLDLEN (victim->end - cur->start, cur->type)) {
			/* nothing to merge here, move along */
			cur = victim;
			continue;
		}

		/* absorb the neighbour and retry from the same token */
		cur->encoding = MAX (cur->encoding, victim->encoding);
		cur->end = victim->end;
		cur->next = victim->next;

		rfc822_word_free (victim);
	}

	/* second pass: now merge atoms with the other words */
	cur = head;
	while (cur != NULL && cur->next != NULL) {
		victim = cur->next;

		if (!should_merge_words (cur, victim)) {
			cur = victim;
			continue;
		}

		/* the resulting word type is the MAX of the 2 types */
		cur->type = MAX (cur->type, victim->type);
		cur->encoding = MAX (cur->encoding, victim->encoding);
		cur->end = victim->end;
		cur->next = victim->next;

		rfc822_word_free (victim);
	}

	*wordsp = head;
}

/* Appends @len bytes of @in to @out as a quoted-string: the text is
 * wrapped in double quotes and every '"' and '\' inside it is
 * preceded by a backslash. */
static void
g_string_append_len_quoted (GString *out, const char *in, size_t len)
{
	size_t i;

	g_string_append_c (out, '"');

	for (i = 0; i < len; i++) {
		char ch = in[i];

		/* these two need a backslash in front of them */
		if (ch == '\\' || ch == '"')
			g_string_append_c (out, '\\');

		g_string_append_c (out, ch);
	}

	g_string_append_c (out, '"');
}

/* Encodes the UTF-8 header value @in according to rfc2047.
 *
 * The input is tokenized and merged into words; atoms are copied
 * as-is, quoted-string words (phrases only) are quoted and the
 * remaining words are emitted as encoded-words. Whitespace between
 * two consecutive encoded-words is folded into the second one, all
 * other whitespace is copied verbatim.
 *
 * @safemask selects the set of characters that may appear literally
 * in 'q' encoded words; IS_PSAFE also switches on phrase handling.
 *
 * Returns a newly allocated string (a plain copy of @in if it
 * contains no words at all). */
static char *
rfc2047_encode (const char *in, gushort safemask)
{
	rfc822_word *list, *cur, *last = NULL;
	const char **user_charsets, *name;
	const char *from;
	GMimeCharset mask;
	gboolean joined;
	GString *buf;
	size_t n;
	int i;

	list = rfc2047_encode_get_rfc822_words (in, safemask & IS_PSAFE);
	if (list == NULL)
		return g_strdup (in);

	rfc2047_encode_merge_rfc822_words (&list);

	user_charsets = g_mime_user_charsets ();

	buf = g_string_new ("");

	/* output words now with spaces between them */
	for (cur = list; cur != NULL; cur = cur->next) {
		/* two encoded words in a row get special whitespace treatment */
		joined = last != NULL && last->type == WORD_2047 && cur->type == WORD_2047;

		if (last != NULL && !joined) {
			/* one or both of the words are not encoded so we write the spaces out untouched */
			n = cur->start - last->end;
			g_string_append_len (buf, last->end, n);
		}

		if (cur->type == WORD_ATOM) {
			g_string_append_len (buf, cur->start, (size_t) (cur->end - cur->start));
		} else if (cur->type == WORD_QSTRING) {
			g_assert (safemask & IS_PSAFE);
			g_string_append_len_quoted (buf, cur->start, (size_t) (cur->end - cur->start));
		} else if (cur->type == WORD_2047) {
			if (joined) {
				/* include the whitespace chars between these 2 words in the
				   resulting rfc2047 encoded word. */
				from = last->end;
				n = cur->end - last->end;

				/* encoded words need to be separated by linear whitespace */
				g_string_append_c (buf, ' ');
			} else {
				from = cur->start;
				n = cur->end - cur->start;
			}

			if (cur->encoding == 0) {
				name = "us-ascii";
			} else if (cur->encoding == 1) {
				name = "iso-8859-1";
			} else {
				/* prefer the first user charset that is able to
				 * represent the text, else let the charset mask
				 * pick one */
				name = NULL;
				g_mime_charset_init (&mask);
				g_mime_charset_step (&mask, from, n);

				for (i = 0; user_charsets && user_charsets[i]; i++) {
					if (g_mime_charset_can_encode (&mask, user_charsets[i], from, n)) {
						name = user_charsets[i];
						break;
					}
				}

				if (!name)
					name = g_mime_charset_best_name (&mask);
			}

			rfc2047_encode_word (buf, from, n, name, safemask);
		}

		/* the previous node is no longer needed once its successor is written */
		rfc822_word_free (last);
		last = cur;
	}

	rfc822_word_free (last);

	return g_string_free (buf, FALSE);
}


/**
 * g_mime_utils_header_encode_phrase:
 * @phrase: phrase to encode
 *
 * Encodes a 'phrase' header according to the rules in rfc2047.
 *
 * Returns: the newly allocated encoded 'phrase', or %NULL if @phrase
 * is %NULL. Useful for encoding internet addresses.
 **/
char *
g_mime_utils_header_encode_phrase (const char *phrase)
{
	return phrase != NULL ? rfc2047_encode (phrase, IS_PSAFE) : NULL;
}


/**
 * g_mime_utils_header_encode_text:
 * @text: text to encode
 *
 * Encodes a 'text' header according to the rules in rfc2047.
 *
 * Returns: the newly allocated encoded header, or %NULL if @text is
 * %NULL. Useful for encoding headers like "Subject".
 **/
char *
g_mime_utils_header_encode_text (const char *text)
{
	return text != NULL ? rfc2047_encode (text, IS_ESAFE) : NULL;
}