lib/database.cc: change how the parent of a message is calculated

Presently, the code which finds the parent of a message as it is being
added to the database assumes that the first Message-ID-like substring
of the In-Reply-To header is the parent Message ID.  Some mail clients,
however, put stuff other than the Message-ID of the parent in the
In-Reply-To header, such as the email address of the sender of the
parent.  This can fool notmuch.

The updated algorithm prefers the last Message ID in the References
header.  The References header lists messages oldest-first, so the last
Message ID is the parent (RFC2822, p. 24).  The References header is
also less likely to be in a non-standard
syntax (http://cr.yp.to/immhf/thread.html,
http://www.jwz.org/doc/threading.html).  In case the References header
is not to be found, fall back to the old behavior.

V2 of this patch, incorporating feedback from Jani and (indirectly)
Austin.
This commit is contained in:
Aaron Ecay 2013-03-05 22:31:49 -05:00 committed by David Bremner
parent 983d5e1df2
commit cf8aaafbad
2 changed files with 32 additions and 17 deletions

View file

@ -501,8 +501,10 @@ _parse_message_id (void *ctx, const char *message_id, const char **next)
* 'message_id' in the result (to avoid mass confusion when a single * 'message_id' in the result (to avoid mass confusion when a single
* message references itself cyclically---and yes, mail messages are * message references itself cyclically---and yes, mail messages are
* not infrequent in the wild that do this---don't ask me why). * not infrequent in the wild that do this---don't ask me why).
*/ *
static void * Return the last reference parsed, if it is not equal to message_id.
*/
static char *
parse_references (void *ctx, parse_references (void *ctx,
const char *message_id, const char *message_id,
GHashTable *hash, GHashTable *hash,
@ -511,7 +513,7 @@ parse_references (void *ctx,
char *ref; char *ref;
if (refs == NULL || *refs == '\0') if (refs == NULL || *refs == '\0')
return; return NULL;
while (*refs) { while (*refs) {
ref = _parse_message_id (ctx, refs, &refs); ref = _parse_message_id (ctx, refs, &refs);
@ -519,6 +521,17 @@ parse_references (void *ctx,
if (ref && strcmp (ref, message_id)) if (ref && strcmp (ref, message_id))
g_hash_table_insert (hash, ref, NULL); g_hash_table_insert (hash, ref, NULL);
} }
/* The return value of this function is used to add a parent
* reference to the database. We should avoid making a message
* its own parent, thus the following check.
*/
if (ref && strcmp(ref, message_id)) {
return ref;
} else {
return NULL;
}
} }
notmuch_status_t notmuch_status_t
@ -1510,28 +1523,33 @@ _notmuch_database_link_message_to_parents (notmuch_database_t *notmuch,
{ {
GHashTable *parents = NULL; GHashTable *parents = NULL;
const char *refs, *in_reply_to, *in_reply_to_message_id; const char *refs, *in_reply_to, *in_reply_to_message_id;
const char *last_ref_message_id, *this_message_id;
GList *l, *keys = NULL; GList *l, *keys = NULL;
notmuch_status_t ret = NOTMUCH_STATUS_SUCCESS; notmuch_status_t ret = NOTMUCH_STATUS_SUCCESS;
parents = g_hash_table_new_full (g_str_hash, g_str_equal, parents = g_hash_table_new_full (g_str_hash, g_str_equal,
_my_talloc_free_for_g_hash, NULL); _my_talloc_free_for_g_hash, NULL);
this_message_id = notmuch_message_get_message_id (message);
refs = notmuch_message_file_get_header (message_file, "references"); refs = notmuch_message_file_get_header (message_file, "references");
parse_references (message, notmuch_message_get_message_id (message), last_ref_message_id = parse_references (message,
parents, refs); this_message_id,
parents, refs);
in_reply_to = notmuch_message_file_get_header (message_file, "in-reply-to"); in_reply_to = notmuch_message_file_get_header (message_file, "in-reply-to");
parse_references (message, notmuch_message_get_message_id (message), in_reply_to_message_id = parse_references (message,
parents, in_reply_to); this_message_id,
parents, in_reply_to);
/* Carefully avoid adding any self-referential in-reply-to term. */ /* For the parent of this message, use the last message ID of the
in_reply_to_message_id = _parse_message_id (message, in_reply_to, NULL); * References header, if available. If not, fall back to the
if (in_reply_to_message_id && * first message ID in the In-Reply-To header. */
strcmp (in_reply_to_message_id, if (last_ref_message_id) {
notmuch_message_get_message_id (message))) _notmuch_message_add_term (message, "replyto",
{ last_ref_message_id);
} else if (in_reply_to_message_id) {
_notmuch_message_add_term (message, "replyto", _notmuch_message_add_term (message, "replyto",
_parse_message_id (message, in_reply_to, NULL)); in_reply_to_message_id);
} }
keys = g_hash_table_get_keys (parents); keys = g_hash_table_get_keys (parents);

View file

@ -12,7 +12,6 @@ test_description='test of proper handling of in-reply-to and references headers'
. ./test-lib.sh . ./test-lib.sh
test_begin_subtest "Use References when In-Reply-To is broken" test_begin_subtest "Use References when In-Reply-To is broken"
test_subtest_known_broken
add_message '[id]="foo@one.com"' \ add_message '[id]="foo@one.com"' \
'[subject]=one' '[subject]=one'
add_message '[in-reply-to]="mumble"' \ add_message '[in-reply-to]="mumble"' \
@ -47,7 +46,6 @@ expected=`echo "$expected" | notmuch_json_show_sanitize`
test_expect_equal_json "$output" "$expected" test_expect_equal_json "$output" "$expected"
test_begin_subtest "Prefer References to In-Reply-To" test_begin_subtest "Prefer References to In-Reply-To"
test_subtest_known_broken
add_message '[id]="foo@two.com"' \ add_message '[id]="foo@two.com"' \
'[subject]=two' '[subject]=two'
add_message '[in-reply-to]="<bar@baz.com>"' \ add_message '[in-reply-to]="<bar@baz.com>"' \
@ -104,7 +102,6 @@ expected=`echo "$expected" | notmuch_json_show_sanitize`
test_expect_equal_json "$output" "$expected" test_expect_equal_json "$output" "$expected"
test_begin_subtest "Use last Reference" test_begin_subtest "Use last Reference"
test_subtest_known_broken
add_message '[id]="foo@four.com"' \ add_message '[id]="foo@four.com"' \
'[subject]="four"' '[subject]="four"'
add_message '[id]="bar@four.com"' \ add_message '[id]="bar@four.com"' \