From ad784f38ce30d39b058325baf050eb784fb9a02e Mon Sep 17 00:00:00 2001 From: Carl Worth Date: Mon, 19 Oct 2009 23:08:49 -0700 Subject: [PATCH] notmuch: Ignore files that don't look like email messages. This is helpful for things like indexes that other mail programs may have left around. It also means we can make the initial instructions much easier, (the user need not worry about moving away auxiliary files from some other email program). --- database.cc | 21 ++++++++++++++++++--- message.c | 15 +++++++++++++-- notmuch.c | 17 ++++++++++++----- notmuch.h | 16 ++++++++++++++-- 4 files changed, 57 insertions(+), 12 deletions(-) diff --git a/database.cc b/database.cc index efc38762..70f80f9f 100644 --- a/database.cc +++ b/database.cc @@ -479,6 +479,7 @@ notmuch_database_add_message (notmuch_database_t *notmuch, GPtrArray *parents, *thread_ids; const char *refs, *in_reply_to, *date, *header; + const char *from, *to, *subject; char *message_id; time_t time_value; @@ -487,10 +488,12 @@ notmuch_database_add_message (notmuch_database_t *notmuch, message = notmuch_message_open (filename); notmuch_message_restrict_headers (message, - "references", + "date", + "from", "in-reply-to", "message-id", - "date", + "references", + "subject", (char *) NULL); try { @@ -567,7 +570,19 @@ notmuch_database_add_message (notmuch_database_t *notmuch, doc.add_value (NOTMUCH_VALUE_DATE, Xapian::sortable_serialise (time_value)); - db->add_document (doc); + from = notmuch_message_get_header (message, "from"); + subject = notmuch_message_get_header (message, "subject"); + to = notmuch_message_get_header (message, "to"); + + if (from == NULL && + subject == NULL && + to == NULL) + { + notmuch_message_close (message); + return NOTMUCH_STATUS_FILE_NOT_EMAIL; + } else { + db->add_document (doc); + } } catch (const Xapian::Error &error) { fprintf (stderr, "A Xapian exception occurred: %s.\n", error.get_msg().c_str()); diff --git a/message.c b/message.c index 03583c8d..1a5994ff 100644 --- a/message.c 
+++ b/message.c @@ -37,6 +37,8 @@ struct _notmuch_message { /* Header storage */ int restrict_headers; GHashTable *headers; + int broken_headers; + int good_headers; /* Parsing state */ char *line; @@ -234,12 +236,21 @@ notmuch_message_get_header (notmuch_message_t *message, colon = strchr (message->line, ':'); if (colon == NULL) { - fprintf (stderr, "Warning: Unexpected non-header line: %s\n", - message->line); + message->broken_headers++; + /* A simple heuristic for giving up on things that just + * don't look like mail messages. */ + if (message->broken_headers >= 10 && + message->good_headers < 5) + { + message->parsing_finished = 1; + continue; + } NEXT_HEADER_LINE (NULL); continue; } + message->good_headers++; + header = xstrndup (message->line, colon - message->line); if (message->restrict_headers && diff --git a/notmuch.c b/notmuch.c index 91ea3451..01000c2a 100644 --- a/notmuch.c +++ b/notmuch.c @@ -145,6 +145,7 @@ add_files (notmuch_database_t *notmuch, const char *path, int err; char *next; struct stat st; + notmuch_status_t status; dir = opendir (path); @@ -187,8 +188,13 @@ add_files (notmuch_database_t *notmuch, const char *path, stat (next, &st); if (S_ISREG (st.st_mode)) { - notmuch_database_add_message (notmuch, next); - state->count++; + status = notmuch_database_add_message (notmuch, next); + if (status == NOTMUCH_STATUS_FILE_NOT_EMAIL) { + fprintf (stderr, "Note: Ignoring non-mail file: %s\n", + next); + } else { + state->count++; + } if (state->count % 1000 == 0) add_files_print_progress (state); } else if (S_ISDIR (st.st_mode)) { @@ -293,9 +299,10 @@ setup_command (int argc, char *argv[]) printf ("Notmuch needs to know the top-level directory of your email archive,\n" "(where you already have mail stored and where messages will be delivered\n" "in the future). 
This directory can contain any number of sub-directories\n" - "but the only files it contains should be individual email messages.\n" - "Either maildir or mh format directories are fine, but you will want to\n" - "move away any auxiliary files maintained by other email programs.\n\n"); + "and primarily just files with individual email messages (eg. maildir or mh\n" + "archives are perfect). If there are other, non-email files (such as\n" + "indexes maintained by other email programs) then notmuch will do its\n" + "best to detect those and ignore them.\n\n"); printf ("Mail storage that uses mbox format, (where one mbox file contains many\n" "messages), will not work with notmuch. If that's how your mail is currently\n" diff --git a/notmuch.h b/notmuch.h index 873c88d2..e0b57db2 100644 --- a/notmuch.h +++ b/notmuch.h @@ -40,10 +40,14 @@ NOTMUCH_BEGIN_DECLS * NOTMUCH_STATUS_SUCCESS: No error occurred. * * NOTMUCH_STATUS_XAPIAN_EXCEPTION: A Xapian exception occurred + * + * NOTMUCH_STATUS_FILE_NOT_EMAIL: A file was presented that doesn't + * appear to be an email message. */ typedef enum _notmuch_status { NOTMUCH_STATUS_SUCCESS = 0, - NOTMUCH_STATUS_XAPIAN_EXCEPTION + NOTMUCH_STATUS_XAPIAN_EXCEPTION, + NOTMUCH_STATUS_FILE_NOT_EMAIL } notmuch_status_t; /* An opaque data structure representing a notmuch database. See @@ -116,7 +120,15 @@ notmuch_database_get_path (notmuch_database_t *database); * single mail message (not a multi-message mbox) that is expected to * remain at its current location, (since the notmuch database will * reference the filename, and will not copy the entire contents of - * the file. */ + * the file). + * + * Return value: + * + * NOTMUCH_STATUS_SUCCESS: Message successfully added to database. + * + * NOTMUCH_STATUS_FILE_NOT_EMAIL: the contents of filename don't look + * like an email message. Nothing added to the database. + */ notmuch_status_t notmuch_database_add_message (notmuch_database_t *database, const char *filename);