mirror of
https://git.notmuchmail.org/git/notmuch
synced 2024-11-21 10:28:09 +01:00
lib: index attachments with mime types matching index.as_text
Instead of skipping indexing of all attachments, we check for a (user-configured) MIME type that is indexable as text.
This commit is contained in:
parent
3f5809bf28
commit
a554690d6a
7 changed files with 136 additions and 11 deletions
|
@ -122,6 +122,16 @@ paths are presumed relative to `$HOME` for items in section
|
|||
|
||||
Default tag prefix (filter) for :any:`notmuch-git`.
|
||||
|
||||
.. nmconfig:: index.as_text
|
||||
|
||||
List of regular expressions (without delimiters) for MIME types to
|
||||
be indexed as text. Currently this applies only to attachments. By
|
||||
default the regex matches anywhere in the content type; if the
|
||||
user wants an anchored match, they should include anchors in their
|
||||
regexes.
|
||||
|
||||
History: This configuration value was introduced in notmuch 0.38.
|
||||
|
||||
.. nmconfig:: index.decrypt
|
||||
|
||||
Policy for decrypting encrypted messages during indexing. Must be
|
||||
|
|
|
@ -1573,3 +1573,15 @@ notmuch_database_status_string (const notmuch_database_t *notmuch)
|
|||
{
|
||||
return notmuch->status_string;
|
||||
}
|
||||
|
||||
bool
|
||||
_notmuch_database_indexable_as_text (notmuch_database_t *notmuch, const char *mime_string)
|
||||
{
|
||||
for (size_t i = 0; i < notmuch->index_as_text_length; i++) {
|
||||
if (regexec (¬much->index_as_text[i], mime_string, 0, NULL, 0) == 0) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
|
19
lib/index.cc
19
lib/index.cc
|
@ -380,6 +380,23 @@ _index_pkcs7_part (notmuch_message_t *message,
|
|||
GMimeObject *part,
|
||||
_notmuch_message_crypto_t *msg_crypto);
|
||||
|
||||
static bool
|
||||
_indexable_as_text (notmuch_message_t *message, GMimeObject *part)
|
||||
{
|
||||
GMimeContentType *content_type = g_mime_object_get_content_type (part);
|
||||
notmuch_database_t *notmuch = notmuch_message_get_database (message);
|
||||
|
||||
if (content_type) {
|
||||
char *mime_string = g_mime_content_type_get_mime_type (content_type);
|
||||
if (mime_string) {
|
||||
bool ret = _notmuch_database_indexable_as_text (notmuch, mime_string);
|
||||
g_free (mime_string);
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/* Callback to generate terms for each mime part of a message. */
|
||||
static void
|
||||
_index_mime_part (notmuch_message_t *message,
|
||||
|
@ -497,10 +514,12 @@ _index_mime_part (notmuch_message_t *message,
|
|||
_notmuch_message_add_term (message, "tag", "attachment");
|
||||
_notmuch_message_gen_terms (message, "attachment", filename);
|
||||
|
||||
if (! _indexable_as_text (message, part)) {
|
||||
/* XXX: Would be nice to call out to something here to parse
|
||||
* the attachment into text and then index that. */
|
||||
goto DONE;
|
||||
}
|
||||
}
|
||||
|
||||
byte_array = g_byte_array_new ();
|
||||
|
||||
|
|
|
@ -259,6 +259,10 @@ _notmuch_database_filename_to_direntry (void *ctx,
|
|||
notmuch_find_flags_t flags,
|
||||
char **direntry);
|
||||
|
||||
/* Return true if mime_string matches at least one of the regular
 * expressions held in notmuch's index_as_text list, false otherwise. */
bool
|
||||
_notmuch_database_indexable_as_text (notmuch_database_t *notmuch,
|
||||
const char *mime_string);
|
||||
|
||||
/* directory.cc */
|
||||
|
||||
notmuch_directory_t *
|
||||
|
|
|
@ -470,12 +470,4 @@ Date: Fri, 17 Jun 2016 22:14:41 -0400
|
|||
EOF
|
||||
test_expect_equal_file EXPECTED OUTPUT
|
||||
|
||||
add_email_corpus indexing
|
||||
|
||||
test_begin_subtest "index text/* attachments"
|
||||
test_subtest_known_broken
|
||||
notmuch search id:20200930101213.2m2pt3jrspvcrxfx@localhost.localdomain > EXPECTED
|
||||
notmuch search id:20200930101213.2m2pt3jrspvcrxfx@localhost.localdomain and ersatz > OUTPUT
|
||||
test_expect_equal_file_nonempty EXPECTED OUTPUT
|
||||
|
||||
test_done
|
||||
|
|
77
test/T760-as-text.sh
Executable file
77
test/T760-as-text.sh
Executable file
|
@ -0,0 +1,77 @@
|
|||
#!/usr/bin/env bash
# Tests for the index.as_text configuration item: attachments are only
# indexed as text when their MIME type matches one of the configured
# regular expressions.
#
# Two messages from the "indexing" corpus are probed:
#   id:20200930101213.2m2pt3jrspvcrxfx@localhost.localdomain
#       searched for the term "ersatz" (text/x-diff per the subtest names)
#   id:871qo9p4tf.fsf@tethera.net
#       the fake-pdf message: an application/pdf attachment whose payload
#       is the plain text "this is not really PDF"
test_description='index attachments as text'
. $(dirname "$0")/test-lib.sh || exit 1

add_email_corpus indexing

test_begin_subtest "empty as_text; skip text/x-diff"
messages=$(notmuch count id:20200930101213.2m2pt3jrspvcrxfx@localhost.localdomain)
count=$(notmuch count id:20200930101213.2m2pt3jrspvcrxfx@localhost.localdomain and ersatz)
test_expect_equal "$messages,$count" "1,0"

# Re-add the corpus so the initial indexing runs with as_text set.
notmuch config set index.as_text "^text/"
add_email_corpus indexing

test_begin_subtest "as_text is text/; find text/x-diff"
notmuch search id:20200930101213.2m2pt3jrspvcrxfx@localhost.localdomain > EXPECTED
notmuch search id:20200930101213.2m2pt3jrspvcrxfx@localhost.localdomain and ersatz > OUTPUT
test_expect_equal_file_nonempty EXPECTED OUTPUT

test_begin_subtest "reindex with empty as_text, skips text/x-diff"
notmuch config set index.as_text
notmuch reindex '*'
messages=$(notmuch count id:20200930101213.2m2pt3jrspvcrxfx@localhost.localdomain)
count=$(notmuch count id:20200930101213.2m2pt3jrspvcrxfx@localhost.localdomain and ersatz)
test_expect_equal "$messages,$count" "1,0"

test_begin_subtest "reindex with empty as_text; skips application/pdf"
notmuch config set index.as_text
notmuch reindex '*'
# Assign to "messages" (not a misspelt variable) so the comparison below
# checks this subtest's count rather than a stale value.
messages=$(notmuch count id:871qo9p4tf.fsf@tethera.net)
count=$(notmuch count id:871qo9p4tf.fsf@tethera.net and body:not-really-PDF)
test_expect_equal "$messages,$count" "1,0"

test_begin_subtest "reindex with as_text as text/; finds text/x-diff"
notmuch config set index.as_text "^text/"
notmuch reindex '*'
notmuch search id:20200930101213.2m2pt3jrspvcrxfx@localhost.localdomain > EXPECTED
notmuch search id:20200930101213.2m2pt3jrspvcrxfx@localhost.localdomain and ersatz > OUTPUT
test_expect_equal_file_nonempty EXPECTED OUTPUT

test_begin_subtest "reindex with as_text as text/; skips application/pdf"
# Keep as_text set to "^text/" for the reindex: the point of this subtest
# is that application/pdf does not match that pattern.
notmuch config set index.as_text "^text/"
notmuch reindex '*'
messages=$(notmuch count id:871qo9p4tf.fsf@tethera.net)
count=$(notmuch count id:871qo9p4tf.fsf@tethera.net and body:not-really-PDF)
test_expect_equal "$messages,$count" "1,0"

test_begin_subtest "as_text has multiple regexes"
notmuch config set index.as_text "blahblah;^text/"
notmuch reindex '*'
notmuch search id:20200930101213.2m2pt3jrspvcrxfx@localhost.localdomain > EXPECTED
notmuch search id:20200930101213.2m2pt3jrspvcrxfx@localhost.localdomain and ersatz > OUTPUT
test_expect_equal_file_nonempty EXPECTED OUTPUT

test_begin_subtest "as_text is non-anchored regex"
notmuch config set index.as_text "e.t/"
notmuch reindex '*'
notmuch search id:20200930101213.2m2pt3jrspvcrxfx@localhost.localdomain > EXPECTED
notmuch search id:20200930101213.2m2pt3jrspvcrxfx@localhost.localdomain and ersatz > OUTPUT
test_expect_equal_file_nonempty EXPECTED OUTPUT

test_begin_subtest "as_text is 'application/pdf'"
notmuch config set index.as_text "^application/pdf$"
notmuch reindex '*'
notmuch search id:871qo9p4tf.fsf@tethera.net > EXPECTED
notmuch search id:871qo9p4tf.fsf@tethera.net and '"not really PDF"' > OUTPUT
test_expect_equal_file_nonempty EXPECTED OUTPUT

test_begin_subtest "as_text is bad regex"
notmuch config set index.as_text '['
notmuch reindex '*' >& OUTPUT
cat<<EOF > EXPECTED
Error in index.as_text: Invalid regular expression: [
EOF
test_expect_equal_file EXPECTED OUTPUT

test_done
|
11
test/corpora/indexing/fake-pdf:2,S
Normal file
11
test/corpora/indexing/fake-pdf:2,S
Normal file
|
@ -0,0 +1,11 @@
|
|||
From: David Bremner <david@tethera.net>
|
||||
To: example@example.com
|
||||
Subject: attachment content type
|
||||
Date: Thu, 05 Jan 2023 08:02:36 -0400
|
||||
Message-ID: <871qo9p4tf.fsf@tethera.net>
|
||||
MIME-Version: 1.0
|
||||
Content-Type: application/pdf
|
||||
Content-Disposition: attachment; filename=fake.pdf
|
||||
Content-Transfer-Encoding: base64
|
||||
|
||||
dGhpcyBpcyBub3QgcmVhbGx5IFBERgo=
|
Loading…
Reference in a new issue