Change database to store only a single thread ID per message.

Instead of supporting multiple thread IDs, we now merge together
thread IDs if one message is ever found to belong to more than one
thread. This allows for constructing complete threads when, for
example, a child message doesn't include a complete list of References
headers back to the beginning of the thread.

It also simplifies dealing with mapping a message ID to a thread ID
which is now a simple get_thread_id just like get_message_id, (and no
longer an iterator-based thing like get_tags).
This commit is contained in:
Carl Worth 2009-10-25 14:54:13 -07:00
parent ec77f6b50c
commit a360670c03
4 changed files with 301 additions and 278 deletions

View file

@ -51,15 +51,18 @@ typedef struct {
* id: Unique ID of mail, (from Message-ID header or generated
* as "notmuch-sha1-<sha1_sum_of_entire_file>").
*
* thread: The ID of the thread to which the mail belongs
*
* Multiple terms of given prefix:
*
* ref: The message IDs from all In-Reply-To and References
* headers in the message.
* ref: All unresolved message IDs from In-Reply-To and
* References headers in the message. (Once a referenced
* message is added to the database and the thread IDs
* are linked the corresponding "ref" term is dropped
* from the message document.)
*
* tag: Any tags associated with this message by the user.
*
* thread: The thread ID of all threads to which the mail belongs
*
* A mail document also has two values:
*
* TIMESTAMP: The time_t value corresponding to the message's
@ -111,6 +114,20 @@ prefix_t BOOLEAN_PREFIX_EXTERNAL[] = {
{ "id", "Q" }
};
/* Report an internal (programming) error on stderr and terminate.
 *
 * 'format' and the arguments following it are interpreted as for
 * printf. The message is prefixed with "Internal error: " so that it
 * can be told apart from ordinary diagnostics.
 *
 * This function never returns to its caller: it always calls
 * exit (1). It is nevertheless declared as returning int (and ends
 * with a return statement) so that a call can appear where a value is
 * required, as the COERCE_STATUS macro does.
 */
int
_internal_error (const char *format, ...)
{
    va_list va_args;

    va_start (va_args, format);

    fprintf (stderr, "Internal error: ");
    vfprintf (stderr, format, va_args);

    va_end (va_args);

    exit (1);

    /* Not reached; present only to satisfy the int return type. */
    return 1;
}
const char *
_find_prefix (const char *name)
{
@ -240,37 +257,6 @@ find_unique_document (notmuch_database_t *notmuch,
return NOTMUCH_PRIVATE_STATUS_SUCCESS;
}
/* Add every thread ID stored on 'doc' to the 'thread_ids' hash table.
 *
 * Each thread ID is inserted as a freshly strdup'ed key (with a NULL
 * value), so the table is expected to own its keys.
 *
 * XXX: This should take a notmuch_message_t* rather than a
 * Xapian::Document, at which point it could simply call
 * notmuch_message_get_thread_ids instead of repeating that logic.
 */
static void
insert_thread_id (GHashTable *thread_ids, Xapian::Document doc)
{
    const char *thread_prefix = _find_prefix ("thread");
    Xapian::TermIterator term;

    /* The "+ 1" below relies on the prefix being a single character. */
    assert (strlen (thread_prefix) == 1);

    term = doc.termlist_begin ();
    term.skip_to (thread_prefix);

    /* Walk the run of terms that carry the thread prefix. */
    for ( ; term != doc.termlist_end (); term++) {
        string text = *term;

        if (text.empty () || text[0] != thread_prefix[0])
            break;

        g_hash_table_insert (thread_ids, strdup (text.c_str () + 1), NULL);
    }
}
notmuch_message_t *
notmuch_database_find_message (notmuch_database_t *notmuch,
const char *message_id)
@ -286,75 +272,6 @@ notmuch_database_find_message (notmuch_database_t *notmuch,
return _notmuch_message_create (notmuch, notmuch, doc_id, NULL);
}
/* Collect the thread IDs relevant to the message 'message_id'.
 *
 * Two sources are consulted: the threads of every message already in
 * the database that references 'message_id', and the threads of every
 * message listed in 'parents' that can be found in the database.
 *
 * Returns a GPtrArray of distinct, individually allocated strings
 * (possibly empty). The caller must free each string and then the
 * array itself (g_ptr_array_free).
 */
static GPtrArray *
find_thread_ids (notmuch_database_t *notmuch,
                 GPtrArray *parents,
                 const char *message_id)
{
    Xapian::PostingIterator referrer, referrers_end;
    Xapian::Document referrer_doc;
    GHashTable *seen;
    GPtrArray *ids_out;
    GList *key_list, *node;
    unsigned int n;

    /* The hash table is used as a set to drop duplicate thread IDs. */
    seen = g_hash_table_new_full (g_str_hash, g_str_equal, free, NULL);

    /* Threads of messages that reference this message. */
    find_doc_ids (notmuch, "ref", message_id, &referrer, &referrers_end);
    while (referrer != referrers_end) {
        referrer_doc = find_document_for_doc_id (notmuch, *referrer);
        insert_thread_id (seen, referrer_doc);
        referrer++;
    }

    /* Threads of the parents that are already in the database. */
    for (n = 0; n < parents->len; n++) {
        const char *parent_id = (char *) g_ptr_array_index (parents, n);
        notmuch_message_t *parent;
        notmuch_thread_ids_t *ids;

        parent = notmuch_database_find_message (notmuch, parent_id);
        if (parent != NULL) {
            ids = notmuch_message_get_thread_ids (parent);
            while (notmuch_thread_ids_has_more (ids)) {
                g_hash_table_insert (seen,
                                     strdup (notmuch_thread_ids_get (ids)),
                                     NULL);
                notmuch_thread_ids_advance (ids);
            }

            notmuch_message_destroy (parent);
        }
    }

    /* Copy the key pointers into the array handed back to the caller. */
    ids_out = g_ptr_array_new ();

    key_list = g_hash_table_get_keys (seen);
    for (node = key_list; node != NULL; node = node->next)
        g_ptr_array_add (ids_out, node->data);
    g_list_free (key_list);

    /* Ownership of the strings has moved to the array, so empty the
     * table without running its key destructor before releasing it. */
    g_hash_table_steal_all (seen);
    g_hash_table_unref (seen);

    return ids_out;
}
/* Advance 'str' past any whitespace or RFC 822 comments. A comment is
* a (potentially nested) parenthesized sequence with '\' used to
* escape any character (including parentheses).
@ -460,9 +377,9 @@ parse_message_id (const char *message_id, const char **next)
}
/* Parse a References header value, putting a copy of each referenced
* message-id into 'array'. */
* message-id into 'hash'. */
static void
parse_references (GPtrArray *array,
parse_references (GHashTable *hash,
const char *refs)
{
char *ref;
@ -474,7 +391,7 @@ parse_references (GPtrArray *array,
ref = parse_message_id (refs, &refs);
if (ref)
g_ptr_array_add (array, ref);
g_hash_table_insert (hash, ref, NULL);
}
}
@ -699,6 +616,171 @@ notmuch_database_get_timestamp (notmuch_database_t *notmuch, const char *key)
return ret;
}
/* Map 'message_id' to the ID of the thread containing that message.
 *
 * Returns NULL when the database holds no message with that message
 * ID.
 *
 * Otherwise the thread ID string obtained from the message is handed
 * over to 'ctx' with talloc_steal before the temporary message object
 * is destroyed, and that string is returned.
 */
const char *
_resolve_message_id_to_thread_id (notmuch_database_t *notmuch,
                                  void *ctx,
                                  const char *message_id)
{
    notmuch_message_t *found;
    const char *thread_id;

    found = notmuch_database_find_message (notmuch, message_id);
    if (found == NULL)
        return NULL;

    thread_id = talloc_steal (ctx, notmuch_message_get_thread_id (found));

    notmuch_message_destroy (found);

    return thread_id;
}
/* Move every message of thread 'loser_thread_id' into thread
 * 'winner_thread_id'.
 *
 * For each document carrying the loser's "thread" term, that term is
 * removed, the winner's term is added and the message is synced.
 *
 * Returns NOTMUCH_STATUS_SUCCESS, or the status produced by
 * COERCE_STATUS if a message object cannot be created for one of the
 * documents (in which case processing stops at that document).
 */
static notmuch_status_t
_merge_threads (notmuch_database_t *notmuch,
                const char *winner_thread_id,
                const char *loser_thread_id)
{
    Xapian::PostingIterator doc, docs_end;
    notmuch_private_status_t create_status;

    find_doc_ids (notmuch, "thread", loser_thread_id, &doc, &docs_end);

    while (doc != docs_end) {
        notmuch_message_t *member;

        member = _notmuch_message_create (notmuch, notmuch,
                                          *doc, &create_status);
        if (member == NULL)
            return COERCE_STATUS (create_status,
                                  "Cannot find document for doc_id from query");

        _notmuch_message_remove_term (member, "thread", loser_thread_id);
        _notmuch_message_add_term (member, "thread", winner_thread_id);
        _notmuch_message_sync (member);

        notmuch_message_destroy (member);

        doc++;
    }

    return NOTMUCH_STATUS_SUCCESS;
}
/* Link 'message' to the messages named in its References and
 * In-Reply-To headers (read from 'message_file').
 *
 * For each distinct referenced message ID:
 *
 *   - if no such message is in the database, a "ref" term for it is
 *     added to 'message';
 *
 *   - otherwise, if *thread_id is still NULL, 'message' adopts that
 *     parent's thread (a "thread" term is added and *thread_id set);
 *
 *   - otherwise, if the parent's thread differs from *thread_id, the
 *     parent's thread is merged into *thread_id.
 *
 * Returns NOTMUCH_STATUS_SUCCESS, or the first failing status from
 * _merge_threads (remaining references are then left unprocessed).
 */
static notmuch_status_t
_notmuch_database_link_message_to_parents (notmuch_database_t *notmuch,
                                           notmuch_message_t *message,
                                           notmuch_message_file_t *message_file,
                                           const char **thread_id)
{
    GHashTable *parent_ids;
    GList *id_list, *node;
    notmuch_status_t status = NOTMUCH_STATUS_SUCCESS;

    /* A hash table is used so that each parent is handled only once. */
    parent_ids = g_hash_table_new_full (g_str_hash, g_str_equal,
                                        free, NULL);

    parse_references (parent_ids,
                      notmuch_message_file_get_header (message_file,
                                                       "references"));
    parse_references (parent_ids,
                      notmuch_message_file_get_header (message_file,
                                                       "in-reply-to"));

    id_list = g_hash_table_get_keys (parent_ids);

    for (node = id_list; node != NULL; node = node->next) {
        const char *parent_id = (char *) node->data;
        const char *parent_thread;

        parent_thread = _resolve_message_id_to_thread_id (notmuch,
                                                          message,
                                                          parent_id);

        /* Parent not in the database: remember the reference. */
        if (parent_thread == NULL) {
            _notmuch_message_add_term (message, "ref", parent_id);
            continue;
        }

        /* First resolved parent: join its thread. */
        if (*thread_id == NULL) {
            *thread_id = talloc_strdup (message, parent_thread);
            _notmuch_message_add_term (message, "thread", *thread_id);
            continue;
        }

        /* A further parent in a different thread: merge the two. */
        if (strcmp (*thread_id, parent_thread) != 0) {
            status = _merge_threads (notmuch, *thread_id, parent_thread);
            if (status)
                break;
        }
    }

    g_list_free (id_list);
    g_hash_table_unref (parent_ids);

    return status;
}
/* Link 'message' to messages already in the database that hold a
 * "ref" term naming it.
 *
 * For each such child:
 *
 *   - if *thread_id is still NULL, 'message' adopts the child's
 *     thread (a "thread" term is added and *thread_id set);
 *
 *   - otherwise, if the child's thread differs from *thread_id, the
 *     child's "ref" term for this message is removed, the child is
 *     synced, and the child's thread is merged into *thread_id.
 *
 * Returns NOTMUCH_STATUS_SUCCESS, the status from COERCE_STATUS if a
 * child message object cannot be created, or the first failing
 * status from _merge_threads.
 */
static notmuch_status_t
_notmuch_database_link_message_to_children (notmuch_database_t *notmuch,
                                            notmuch_message_t *message,
                                            const char **thread_id)
{
    const char *message_id = notmuch_message_get_message_id (message);
    Xapian::PostingIterator child, children_end;
    notmuch_private_status_t create_status;

    find_doc_ids (notmuch, "ref", message_id, &child, &children_end);

    while (child != children_end) {
        notmuch_message_t *child_message;
        const char *child_thread;

        child_message = _notmuch_message_create (message, notmuch,
                                                 *child, &create_status);
        if (child_message == NULL)
            return COERCE_STATUS (create_status,
                                  "Cannot find document for doc_id from query");

        child_thread = notmuch_message_get_thread_id (child_message);

        if (*thread_id == NULL) {
            *thread_id = talloc_strdup (message, child_thread);
            _notmuch_message_add_term (message, "thread", *thread_id);
        } else if (strcmp (*thread_id, child_thread) != 0) {
            notmuch_status_t merge_status;

            _notmuch_message_remove_term (child_message, "ref",
                                          message_id);
            _notmuch_message_sync (child_message);

            merge_status = _merge_threads (notmuch, *thread_id,
                                           child_thread);
            if (merge_status) {
                notmuch_message_destroy (child_message);
                return merge_status;
            }
        }

        notmuch_message_destroy (child_message);

        child++;
    }

    return NOTMUCH_STATUS_SUCCESS;
}
/* Given a (mostly empty) 'message' and its corresponding
* 'message_file' link it to existing threads in the database.
*
@ -716,44 +798,20 @@ _notmuch_database_link_message (notmuch_database_t *notmuch,
notmuch_message_t *message,
notmuch_message_file_t *message_file)
{
GPtrArray *parents, *thread_ids;
const char *refs, *in_reply_to;
const char *message_id = notmuch_message_get_message_id (message);
unsigned int i;
notmuch_status_t ret = NOTMUCH_STATUS_SUCCESS;
const char *thread_id = NULL;
parents = g_ptr_array_new ();
_notmuch_database_link_message_to_parents (notmuch, message,
message_file,
&thread_id);
refs = notmuch_message_file_get_header (message_file, "references");
parse_references (parents, refs);
ret = _notmuch_database_link_message_to_children (notmuch, message,
&thread_id);
in_reply_to = notmuch_message_file_get_header (message_file, "in-reply-to");
parse_references (parents, in_reply_to);
for (i = 0; i < parents->len; i++)
_notmuch_message_add_term (message, "ref",
(char *) g_ptr_array_index (parents, i));
thread_ids = find_thread_ids (notmuch, parents, message_id);
for (i = 0; i < parents->len; i++)
g_free (g_ptr_array_index (parents, i));
g_ptr_array_free (parents, TRUE);
if (thread_ids->len) {
char *id;
for (i = 0; i < thread_ids->len; i++) {
id = (char *) thread_ids->pdata[i];
_notmuch_message_add_thread_id (message, id);
free (id);
}
} else {
if (thread_id == NULL)
_notmuch_message_ensure_thread_id (message);
}
g_ptr_array_free (thread_ids, TRUE);
return NOTMUCH_STATUS_SUCCESS;
return ret;
}
notmuch_status_t

View file

@ -27,6 +27,7 @@ struct _notmuch_message {
notmuch_database_t *notmuch;
Xapian::docid doc_id;
char *message_id;
char *thread_id;
char *filename;
Xapian::Document doc;
};
@ -41,10 +42,6 @@ struct _notmuch_tags {
notmuch_terms_t terms;
};
/* Iterator over the thread IDs of a message. It consists solely of a
 * generic notmuch_terms_t term iterator, to which the
 * notmuch_thread_ids_* functions delegate. */
struct _notmuch_thread_ids {
notmuch_terms_t terms;
};
/* "128 bits of thread-id ought to be enough for anybody" */
/* Size of a thread ID in bits. */
#define NOTMUCH_THREAD_ID_BITS 128
/* Number of digits in a thread ID at 4 bits per digit (presumably
 * hexadecimal digits -- confirm against thread_id_generate). */
#define NOTMUCH_THREAD_ID_DIGITS (NOTMUCH_THREAD_ID_BITS / 4)
@ -80,13 +77,15 @@ _notmuch_message_destructor (notmuch_message_t *message)
* caller *is* responsible for calling notmuch_message_destroy.
*
* If no document exists in the database with document ID of 'doc_id'
* then this function returns NULL and sets *status to
* then this function returns NULL and optionally sets *status to
* NOTMUCH_PRIVATE_STATUS_NO_DOCUMENT_FOUND.
*
* This function can also fail due to lack of available memory,
* returning NULL and optionally setting *status to
* NOTMUCH_PRIVATE_STATUS_OUT_OF_MEMORY. Caller can pass NULL
* for status if uninterested in distinguishing these two cases.
* NOTMUCH_PRIVATE_STATUS_OUT_OF_MEMORY.
*
* The caller can pass NULL for status if uninterested in
* distinguishing these two cases.
*/
notmuch_message_t *
_notmuch_message_create (const void *talloc_owner,
@ -109,6 +108,7 @@ _notmuch_message_create (const void *talloc_owner,
message->notmuch = notmuch;
message->doc_id = doc_id;
message->message_id = NULL; /* lazily created */
message->thread_id = NULL; /* lazily created */
message->filename = NULL; /* lazily created */
/* This is C++'s creepy "placement new", which is really just an
@ -195,12 +195,8 @@ _notmuch_message_create_for_message_id (const void *talloc_owner,
message = _notmuch_message_create (talloc_owner, notmuch,
doc_id, &private_status);
if (private_status >= (notmuch_private_status_t) NOTMUCH_STATUS_LAST_STATUS)
{
INTERNAL_ERROR ("Failed to find document immediately after adding it.\n");
}
*status = (notmuch_status_t) private_status;
*status = COERCE_STATUS (private_status,
"Failed to find dcocument after inserting it.");
return message;
}
@ -216,22 +212,63 @@ notmuch_message_get_message_id (notmuch_message_t *message)
i = message->doc.termlist_begin ();
i.skip_to (_find_prefix ("id"));
if (i == message->doc.termlist_end ()) {
if (i == message->doc.termlist_end ())
INTERNAL_ERROR ("Message with document ID of %d has no message ID.\n",
message->doc_id);
}
message->message_id = talloc_strdup (message, (*i).c_str () + 1);
#if DEBUG_DATABASE_SANITY
i++;
if (i != message->doc.termlist_end () &&
strncmp ((*i).c_str (), _find_prefix ("id"),
strlen (_find_prefix ("id"))) == 0)
{
INTERNAL_ERROR ("Mail (doc_id: %d) has duplicate message IDs",
message->doc_id);
}
#endif
return message->message_id;
}
/* Get the thread ID of 'message'.
 *
 * The ID is read from the message document's term list on first use
 * and cached in message->thread_id; later calls return the cached
 * string. The string is allocated with 'message' as its talloc
 * parent.
 *
 * A document without a thread term is an internal error. When
 * DEBUG_DATABASE_SANITY is enabled, a document with more than one
 * thread term is reported as an internal error as well.
 */
const char *
notmuch_message_get_thread_id (notmuch_message_t *message)
{
    const char *prefix;
    size_t prefix_len;
    Xapian::TermIterator i;

    if (message->thread_id)
        return message->thread_id;

    prefix = _find_prefix ("thread");
    prefix_len = strlen (prefix);

    i = message->doc.termlist_begin ();
    i.skip_to (prefix);

    /* skip_to stops at the first term that is not less than the
     * prefix, so when the document has no thread term at all the
     * iterator rests on some unrelated, later term rather than on the
     * end of the list. Check the prefix explicitly so that such a
     * term is never mistaken for a thread ID. */
    if (i == message->doc.termlist_end () ||
        strncmp ((*i).c_str (), prefix, prefix_len) != 0)
    {
        INTERNAL_ERROR ("Message with document ID of %d has no thread ID.\n",
                        message->doc_id);
    }

    /* The thread ID is the term with its prefix stripped. */
    message->thread_id = talloc_strdup (message, (*i).c_str () + prefix_len);

#if DEBUG_DATABASE_SANITY
    i++;

    if (i != message->doc.termlist_end () &&
        strncmp ((*i).c_str (), prefix, prefix_len) == 0)
    {
        INTERNAL_ERROR ("Message with document ID of %d has duplicate thread IDs.\n",
                        message->doc_id);
    }
#endif

    return message->thread_id;
}
/* Set the filename for 'message' to 'filename'.
*
* XXX: We should still figure out what we want to do for multiple
* files with identical message IDs. We will probably want to store a
* list of filenames here, (so that this will be "add_filename"
* instead of "set_filename"). Which would make this very similar to
* add_thread_ids.
* XXX: We should still figure out if we think it's important to store
* multiple filenames for email messages with identical message IDs.
*
* This change will not be reflected in the database until the next
* call to _notmuch_message_sync. */
@ -318,13 +355,6 @@ notmuch_message_get_tags (notmuch_message_t *message)
notmuch_tags_t);
}
/* Create a notmuch_thread_ids_t iterator over the "thread" terms of
 * the document belonging to 'message'. 'message' is passed as the
 * first argument of _notmuch_terms_create_type (presumably the talloc
 * owner of the new iterator -- confirm against that macro). */
notmuch_thread_ids_t *
notmuch_message_get_thread_ids (notmuch_message_t *message)
{
return _notmuch_terms_create_type (message, message->doc, "thread",
notmuch_thread_ids_t);
}
void
_notmuch_message_set_date (notmuch_message_t *message,
const char *date)
@ -337,13 +367,6 @@ _notmuch_message_set_date (notmuch_message_t *message,
Xapian::sortable_serialise (time_value));
}
/* Add 'thread_id' to 'message' as a "thread" term. This is a thin
 * wrapper around _notmuch_message_add_term. */
void
_notmuch_message_add_thread_id (notmuch_message_t *message,
const char *thread_id)
{
_notmuch_message_add_term (message, "thread", thread_id);
}
static void
thread_id_generate (thread_id_t *thread_id)
{
@ -559,27 +582,3 @@ notmuch_tags_destroy (notmuch_tags_t *tags)
{
return _notmuch_terms_destroy (&tags->terms);
}
/* Report whether 'thread_ids' has more entries, by delegating to
 * _notmuch_terms_has_more on the wrapped term iterator. */
notmuch_bool_t
notmuch_thread_ids_has_more (notmuch_thread_ids_t *thread_ids)
{
return _notmuch_terms_has_more (&thread_ids->terms);
}
/* Return the current thread ID of 'thread_ids', by delegating to
 * _notmuch_terms_get on the wrapped term iterator. */
const char *
notmuch_thread_ids_get (notmuch_thread_ids_t *thread_ids)
{
return _notmuch_terms_get (&thread_ids->terms);
}
/* Advance 'thread_ids' to its next entry, by delegating to
 * _notmuch_terms_advance on the wrapped term iterator. */
void
notmuch_thread_ids_advance (notmuch_thread_ids_t *thread_ids)
{
return _notmuch_terms_advance (&thread_ids->terms);
}
/* Destroy 'thread_ids', by delegating to _notmuch_terms_destroy on
 * the wrapped term iterator. */
void
notmuch_thread_ids_destroy (notmuch_thread_ids_t *thread_ids)
{
return _notmuch_terms_destroy (&thread_ids->terms);
}

View file

@ -46,8 +46,22 @@ NOTMUCH_BEGIN_DECLS
#include "xutil.h"
#ifdef DEBUG
# define DEBUG_DATABASE_SANITY 1
# define DEBUG_QUERY 1
#endif
/* Compile-time assertion: when 'pred' is false the array type has
 * size 1 - 2 = -1, which is a compilation error; when 'pred' is true
 * the size is 1 and the expression is just a discarded sizeof. */
#define COMPILE_TIME_ASSERT(pred) ((void)sizeof(char[1 - 2*!(pred)]))
/* There's no point in continuing when we've detected that we've done
* something wrong internally (as opposed to the user passing in a
* bogus value).
*
* Note that PRINTF_ATTRIBUTE comes from talloc.h
*/
int
_internal_error (const char *format, ...) PRINTF_ATTRIBUTE (1, 2);
/* There's no point in continuing when we've detected that we've done
* something wrong internally (as opposed to the user passing in a
* bogus value).
@ -55,12 +69,8 @@ NOTMUCH_BEGIN_DECLS
* Note that __location__ comes from talloc.h.
*/
#define INTERNAL_ERROR(format, ...) \
do { \
fprintf(stderr, \
"Internal error: " format " (%s)\n", \
##__VA_ARGS__, __location__); \
exit (1); \
} while (0)
_internal_error (format " (%s).\n", \
##__VA_ARGS__, __location__)
/* Thanks to Andrew Tridgell's (SAMBA's) talloc for this definition of
* unlikely. The talloc source code comes to us via the GNU LGPL v. 3.
@ -110,6 +120,21 @@ typedef enum _notmuch_private_status {
NOTMUCH_PRIVATE_STATUS_LAST_STATUS
} notmuch_private_status_t;
/* Coerce a notmuch_private_status_t value to a notmuch_status_t
 * value, generating an internal error if the private value is equal
 * to or greater than NOTMUCH_STATUS_LAST_STATUS. (The idea here is
 * that the caller has previously handled any expected
 * notmuch_private_status_t values.)
 *
 * 'format' must be a string literal (it is concatenated with a
 * suffix that reports __location__); any further arguments are
 * passed on to _internal_error along with it.
 *
 * The 'private_status' argument is parenthesized at each use so that
 * an arbitrary expression can be passed, but note that it is still
 * evaluated twice: avoid arguments with side effects.
 */
#define COERCE_STATUS(private_status, format, ...)			\
    (((private_status) >= (notmuch_private_status_t) NOTMUCH_STATUS_LAST_STATUS)\
     ?									\
     (notmuch_status_t) _internal_error (format " (%s).\n",		\
                                         ##__VA_ARGS__,			\
                                         __location__)			\
     :									\
     (notmuch_status_t) (private_status))
/* message.cc */
notmuch_message_t *

View file

@ -105,7 +105,6 @@ typedef struct _notmuch_query notmuch_query_t;
typedef struct _notmuch_results notmuch_results_t;
typedef struct _notmuch_message notmuch_message_t;
typedef struct _notmuch_tags notmuch_tags_t;
typedef struct _notmuch_thread_ids notmuch_thread_ids_t;
/* Lookup the default database path.
*
@ -417,6 +416,21 @@ notmuch_results_destroy (notmuch_results_t *results);
const char *
notmuch_message_get_message_id (notmuch_message_t *message);
/* Get the thread ID of 'message'.
*
* The returned string belongs to 'message' and as such, should not be
* modified by the caller and will only be valid for as long as the
* message is valid, (for example, until the user calls
* notmuch_message_destroy on 'message' or until a query from which it
* derived is destroyed).
*
* This function will not return NULL since Notmuch ensures that every
* message belongs to a single thread.
*/
const char *
notmuch_message_get_thread_id (notmuch_message_t *message);
/* Get the filename for the email corresponding to 'message'.
*
* The returned filename is relative to the base of the database from
@ -460,39 +474,6 @@ notmuch_message_get_filename (notmuch_message_t *message);
notmuch_tags_t *
notmuch_message_get_tags (notmuch_message_t *message);
/* Get the thread IDs for 'message', returning a notmuch_thread_ids_t
* object which can be used to iterate over all thread IDs.
*
* The thread_ids object is owned by the message and as such, will
* only be valid for as long as the message is valid, (which is until
* the query from which it derived is destroyed).
*
* Typical usage might be:
*
* notmuch_message_t *message;
* notmuch_thread_ids_t *thread_ids;
* const char *thread_id;
*
* message = notmuch_database_find_message (database, message_id);
*
* for (thread_ids = notmuch_message_get_thread_ids (message);
* notmuch_thread_ids_has_more (thread_ids);
* notmuch_thread_ids_advance (thread_ids))
* {
* thread_id = notmuch_thread_ids_get (thread_ids);
* ....
* }
*
* notmuch_message_destroy (message);
*
* Note that there's no explicit destructor needed for the
* notmuch_thread_ids_t object. (For consistency, we do provide a
* notmuch_thread_ids_destroy function, but there's no good reason to
* call it if the message is about to be destroyed).
*/
notmuch_thread_ids_t *
notmuch_message_get_thread_ids (notmuch_message_t *message);
/* The longest possible tag value. */
#define NOTMUCH_TAG_MAX 200
@ -575,46 +556,6 @@ notmuch_tags_advance (notmuch_tags_t *tags);
void
notmuch_tags_destroy (notmuch_tags_t *tags);
/* Does the given notmuch_thread_ids_t object contain any more thread IDs.
*
* When this function returns TRUE, notmuch_thread_ids_get will return a
* valid string. Whereas when this function returns FALSE,
* notmuch_thread_ids_get will return NULL.
*
* See the documentation of notmuch_message_get_thread_ids for example code
* showing how to iterate over a notmuch_thread_ids_t object.
*/
notmuch_bool_t
notmuch_thread_ids_has_more (notmuch_thread_ids_t *thread_ids);
/* Get the current thread ID from 'thread_ids' as a string.
*
* Note: The returned string belongs to 'thread_ids' and has a lifetime
* identical to it (and the query to which it ultimately belongs).
*
* See the documentation of notmuch_message_get_thread_ids for example code
* showing how to iterate over a notmuch_thread_ids_t object.
*/
const char *
notmuch_thread_ids_get (notmuch_thread_ids_t *thread_ids);
/* Advance the 'thread_ids' iterator to the next thread ID.
*
* See the documentation of notmuch_message_get_thread_ids for example code
* showing how to iterate over a notmuch_thread_ids_t object.
*/
void
notmuch_thread_ids_advance (notmuch_thread_ids_t *thread_ids);
/* Destroy a notmuch_thread_ids_t object.
*
* It's not strictly necessary to call this function. All memory from
* the notmuch_thread_ids_t object will be reclaimed when the containing
* message or query objects are destroyed.
*/
void
notmuch_thread_ids_destroy (notmuch_thread_ids_t *thread_ids);
NOTMUCH_END_DECLS
#endif