lib: index attachments with mime types matching index.as_text

Instead of skipping indexing all attachments, we check for a (user
configured) MIME type that is indexable as text.
This commit is contained in:
David Bremner 2023-01-05 20:02:06 -04:00
parent 3f5809bf28
commit a554690d6a
7 changed files with 136 additions and 11 deletions

View file

@ -122,6 +122,16 @@ paths are presumed relative to `$HOME` for items in section
Default tag prefix (filter) for :any:`notmuch-git`.
.. nmconfig:: index.as_text
List of regular expressions (without delimiters) for MIME types to
be indexed as text. Currently this applies only to attachments. By
default the regex matches anywhere in the content type; if the
user wants an anchored match, they should include anchors in their
regexes.
History: This configuration value was introduced in notmuch 0.38.
.. nmconfig:: index.decrypt
Policy for decrypting encrypted messages during indexing. Must be

View file

@ -1573,3 +1573,15 @@ notmuch_database_status_string (const notmuch_database_t *notmuch)
{
return notmuch->status_string;
}
/* Test mime_string against each compiled regular expression held in
 * notmuch->index_as_text (index_as_text_length entries).
 *
 * Returns true as soon as regexec() reports a match (return value 0)
 * for one of the patterns, and false when none match -- which includes
 * the case where index_as_text_length is 0.
 */
bool
_notmuch_database_indexable_as_text (notmuch_database_t *notmuch, const char *mime_string)
{
    const size_t count = notmuch->index_as_text_length;
    size_t idx = 0;
    bool matched = false;

    /* Stop scanning at the first pattern that matches. */
    while (! matched && idx < count) {
	matched = (regexec (&notmuch->index_as_text[idx], mime_string, 0, NULL, 0) == 0);
	idx++;
    }

    return matched;
}

View file

@ -380,6 +380,23 @@ _index_pkcs7_part (notmuch_message_t *message,
GMimeObject *part,
_notmuch_message_crypto_t *msg_crypto);
/* Decide whether the given MIME part may be indexed as text.
 *
 * The part's content type is rendered to a MIME type string and handed
 * to _notmuch_database_indexable_as_text() together with the database
 * that owns message. Returns false when the part has no content type
 * or no MIME type string can be obtained for it.
 */
static bool
_indexable_as_text (notmuch_message_t *message, GMimeObject *part)
{
    GMimeContentType *content_type = g_mime_object_get_content_type (part);
    notmuch_database_t *notmuch = notmuch_message_get_database (message);
    char *mime_string;
    bool indexable;

    if (! content_type) {
	return false;
    }

    mime_string = g_mime_content_type_get_mime_type (content_type);
    if (! mime_string) {
	return false;
    }

    indexable = _notmuch_database_indexable_as_text (notmuch, mime_string);

    /* The MIME type string is released here once it has been checked. */
    g_free (mime_string);

    return indexable;
}
/* Callback to generate terms for each mime part of a message. */
static void
_index_mime_part (notmuch_message_t *message,
@ -497,10 +514,12 @@ _index_mime_part (notmuch_message_t *message,
_notmuch_message_add_term (message, "tag", "attachment");
_notmuch_message_gen_terms (message, "attachment", filename);
if (! _indexable_as_text (message, part)) {
/* XXX: Would be nice to call out to something here to parse
* the attachment into text and then index that. */
goto DONE;
}
}
byte_array = g_byte_array_new ();

View file

@ -259,6 +259,10 @@ _notmuch_database_filename_to_direntry (void *ctx,
notmuch_find_flags_t flags,
char **direntry);
/* Return true if mime_string matches at least one of the regular
 * expressions stored in notmuch->index_as_text, false otherwise
 * (including when no expressions are stored).
 */
bool
_notmuch_database_indexable_as_text (notmuch_database_t *notmuch,
const char *mime_string);
/* directory.cc */
notmuch_directory_t *

View file

@ -470,12 +470,4 @@ Date: Fri, 17 Jun 2016 22:14:41 -0400
EOF
test_expect_equal_file EXPECTED OUTPUT
add_email_corpus indexing
test_begin_subtest "index text/* attachments"
test_subtest_known_broken
notmuch search id:20200930101213.2m2pt3jrspvcrxfx@localhost.localdomain > EXPECTED
notmuch search id:20200930101213.2m2pt3jrspvcrxfx@localhost.localdomain and ersatz > OUTPUT
test_expect_equal_file_nonempty EXPECTED OUTPUT
test_done

77
test/T760-as-text.sh Executable file
View file

@ -0,0 +1,77 @@
#!/usr/bin/env bash
# Tests for the index.as_text configuration option: attachments whose
# MIME type matches one of the configured regular expressions are
# indexed as text; all other attachments are skipped.
#
# Two messages from the "indexing" corpus are used:
#   20200930101213.2m2pt3jrspvcrxfx@localhost.localdomain
#       searched for the term "ersatz" (text/x-diff per the subtest names)
#   871qo9p4tf.fsf@tethera.net
#       application/pdf attachment whose decoded body is "this is not really PDF"
test_description='index attachments as text'
. $(dirname "$0")/test-lib.sh || exit 1

add_email_corpus indexing

# With no index.as_text configured, attachment content is not searchable.
test_begin_subtest "empty as_text; skip text/x-diff"
messages=$(notmuch count id:20200930101213.2m2pt3jrspvcrxfx@localhost.localdomain)
count=$(notmuch count id:20200930101213.2m2pt3jrspvcrxfx@localhost.localdomain and ersatz)
test_expect_equal "$messages,$count" "1,0"

notmuch config set index.as_text "^text/"
add_email_corpus indexing

test_begin_subtest "as_text is text/; find text/x-diff"
notmuch search id:20200930101213.2m2pt3jrspvcrxfx@localhost.localdomain > EXPECTED
notmuch search id:20200930101213.2m2pt3jrspvcrxfx@localhost.localdomain and ersatz > OUTPUT
test_expect_equal_file_nonempty EXPECTED OUTPUT

test_begin_subtest "reindex with empty as_text, skips text/x-diff"
notmuch config set index.as_text
notmuch reindex '*'
messages=$(notmuch count id:20200930101213.2m2pt3jrspvcrxfx@localhost.localdomain)
count=$(notmuch count id:20200930101213.2m2pt3jrspvcrxfx@localhost.localdomain and ersatz)
test_expect_equal "$messages,$count" "1,0"

test_begin_subtest "reindex with empty as_text; skips application/pdf"
notmuch config set index.as_text
notmuch reindex '*'
# Assign to "messages" (not a misspelled variable) so the assertion
# below checks this message's count rather than a stale earlier value.
messages=$(notmuch count id:871qo9p4tf.fsf@tethera.net)
count=$(notmuch count id:871qo9p4tf.fsf@tethera.net and body:not-really-PDF)
test_expect_equal "$messages,$count" "1,0"

test_begin_subtest "reindex with as_text as text/; finds text/x-diff"
notmuch config set index.as_text "^text/"
notmuch reindex '*'
notmuch search id:20200930101213.2m2pt3jrspvcrxfx@localhost.localdomain > EXPECTED
notmuch search id:20200930101213.2m2pt3jrspvcrxfx@localhost.localdomain and ersatz > OUTPUT
test_expect_equal_file_nonempty EXPECTED OUTPUT

# Keep "^text/" in effect for the reindex: the point of this subtest is
# that a text/ pattern must not cause application/pdf to be indexed.
test_begin_subtest "reindex with as_text as text/; skips application/pdf"
notmuch config set index.as_text "^text/"
notmuch reindex '*'
messages=$(notmuch count id:871qo9p4tf.fsf@tethera.net)
count=$(notmuch count id:871qo9p4tf.fsf@tethera.net and body:not-really-PDF)
test_expect_equal "$messages,$count" "1,0"

# The first pattern matches nothing; the second one should still apply.
test_begin_subtest "as_text has multiple regexes"
notmuch config set index.as_text "blahblah;^text/"
notmuch reindex '*'
notmuch search id:20200930101213.2m2pt3jrspvcrxfx@localhost.localdomain > EXPECTED
notmuch search id:20200930101213.2m2pt3jrspvcrxfx@localhost.localdomain and ersatz > OUTPUT
test_expect_equal_file_nonempty EXPECTED OUTPUT

# "e.t/" only matches in the middle of "text/...", so this passes only
# if patterns are not implicitly anchored.
test_begin_subtest "as_text is non-anchored regex"
notmuch config set index.as_text "e.t/"
notmuch reindex '*'
notmuch search id:20200930101213.2m2pt3jrspvcrxfx@localhost.localdomain > EXPECTED
notmuch search id:20200930101213.2m2pt3jrspvcrxfx@localhost.localdomain and ersatz > OUTPUT
test_expect_equal_file_nonempty EXPECTED OUTPUT

test_begin_subtest "as_text is 'application/pdf'"
notmuch config set index.as_text "^application/pdf$"
notmuch reindex '*'
notmuch search id:871qo9p4tf.fsf@tethera.net > EXPECTED
notmuch search id:871qo9p4tf.fsf@tethera.net and '"not really PDF"' > OUTPUT
test_expect_equal_file_nonempty EXPECTED OUTPUT

# An invalid regular expression must be reported with a clear message.
test_begin_subtest "as_text is bad regex"
notmuch config set index.as_text '['
notmuch reindex '*' >& OUTPUT
cat<<EOF > EXPECTED
Error in index.as_text: Invalid regular expression: [
EOF
test_expect_equal_file EXPECTED OUTPUT

test_done

View file

@ -0,0 +1,11 @@
From: David Bremner <david@tethera.net>
To: example@example.com
Subject: attachment content type
Date: Thu, 05 Jan 2023 08:02:36 -0400
Message-ID: <871qo9p4tf.fsf@tethera.net>
MIME-Version: 1.0
Content-Type: application/pdf
Content-Disposition: attachment; filename=fake.pdf
Content-Transfer-Encoding: base64
dGhpcyBpcyBub3QgcmVhbGx5IFBERgo=