mirror of
https://git.notmuchmail.org/git/notmuch
synced 2024-11-21 10:28:09 +01:00
lib: index attachments with mime types matching index.as_text
Instead of skipping indexing of all attachments, we check for a (user-configured) MIME type that is indexable as text.
This commit is contained in:
parent
3f5809bf28
commit
a554690d6a
7 changed files with 136 additions and 11 deletions
|
@ -122,6 +122,16 @@ paths are presumed relative to `$HOME` for items in section
|
||||||
|
|
||||||
Default tag prefix (filter) for :any:`notmuch-git`.
|
Default tag prefix (filter) for :any:`notmuch-git`.
|
||||||
|
|
||||||
|
.. nmconfig:: index.as_text
|
||||||
|
|
||||||
|
List of regular expressions (without delimiters) for MIME types to
|
||||||
|
be indexed as text. Currently this applies only to attachments. By
|
||||||
|
default the regex matches anywhere in the content type; if the
|
||||||
|
user wants an anchored match, they should include anchors in their
|
||||||
|
regexes.
|
||||||
|
|
||||||
|
History: This configuration value was introduced in notmuch 0.38.
|
||||||
|
|
||||||
.. nmconfig:: index.decrypt
|
.. nmconfig:: index.decrypt
|
||||||
|
|
||||||
Policy for decrypting encrypted messages during indexing. Must be
|
Policy for decrypting encrypted messages during indexing. Must be
|
||||||
|
|
|
@ -1573,3 +1573,15 @@ notmuch_database_status_string (const notmuch_database_t *notmuch)
|
||||||
{
|
{
|
||||||
return notmuch->status_string;
|
return notmuch->status_string;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool
|
||||||
|
_notmuch_database_indexable_as_text (notmuch_database_t *notmuch, const char *mime_string)
|
||||||
|
{
|
||||||
|
for (size_t i = 0; i < notmuch->index_as_text_length; i++) {
|
||||||
|
if (regexec (¬much->index_as_text[i], mime_string, 0, NULL, 0) == 0) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
25
lib/index.cc
25
lib/index.cc
|
@ -380,6 +380,23 @@ _index_pkcs7_part (notmuch_message_t *message,
|
||||||
GMimeObject *part,
|
GMimeObject *part,
|
||||||
_notmuch_message_crypto_t *msg_crypto);
|
_notmuch_message_crypto_t *msg_crypto);
|
||||||
|
|
||||||
|
static bool
|
||||||
|
_indexable_as_text (notmuch_message_t *message, GMimeObject *part)
|
||||||
|
{
|
||||||
|
GMimeContentType *content_type = g_mime_object_get_content_type (part);
|
||||||
|
notmuch_database_t *notmuch = notmuch_message_get_database (message);
|
||||||
|
|
||||||
|
if (content_type) {
|
||||||
|
char *mime_string = g_mime_content_type_get_mime_type (content_type);
|
||||||
|
if (mime_string) {
|
||||||
|
bool ret = _notmuch_database_indexable_as_text (notmuch, mime_string);
|
||||||
|
g_free (mime_string);
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
/* Callback to generate terms for each mime part of a message. */
|
/* Callback to generate terms for each mime part of a message. */
|
||||||
static void
|
static void
|
||||||
_index_mime_part (notmuch_message_t *message,
|
_index_mime_part (notmuch_message_t *message,
|
||||||
|
@ -497,9 +514,11 @@ _index_mime_part (notmuch_message_t *message,
|
||||||
_notmuch_message_add_term (message, "tag", "attachment");
|
_notmuch_message_add_term (message, "tag", "attachment");
|
||||||
_notmuch_message_gen_terms (message, "attachment", filename);
|
_notmuch_message_gen_terms (message, "attachment", filename);
|
||||||
|
|
||||||
/* XXX: Would be nice to call out to something here to parse
|
if (! _indexable_as_text (message, part)) {
|
||||||
* the attachment into text and then index that. */
|
/* XXX: Would be nice to call out to something here to parse
|
||||||
goto DONE;
|
* the attachment into text and then index that. */
|
||||||
|
goto DONE;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
byte_array = g_byte_array_new ();
|
byte_array = g_byte_array_new ();
|
||||||
|
|
|
@ -259,6 +259,10 @@ _notmuch_database_filename_to_direntry (void *ctx,
|
||||||
notmuch_find_flags_t flags,
|
notmuch_find_flags_t flags,
|
||||||
char **direntry);
|
char **direntry);
|
||||||
|
|
||||||
|
/* Return true if mime_string matches at least one of the regular
 * expressions in notmuch->index_as_text, false otherwise. */
bool
|
||||||
|
_notmuch_database_indexable_as_text (notmuch_database_t *notmuch,
|
||||||
|
const char *mime_string);
|
||||||
|
|
||||||
/* directory.cc */
|
/* directory.cc */
|
||||||
|
|
||||||
notmuch_directory_t *
|
notmuch_directory_t *
|
||||||
|
|
|
@ -470,12 +470,4 @@ Date: Fri, 17 Jun 2016 22:14:41 -0400
|
||||||
EOF
|
EOF
|
||||||
test_expect_equal_file EXPECTED OUTPUT
|
test_expect_equal_file EXPECTED OUTPUT
|
||||||
|
|
||||||
add_email_corpus indexing
|
|
||||||
|
|
||||||
test_begin_subtest "index text/* attachments"
|
|
||||||
test_subtest_known_broken
|
|
||||||
notmuch search id:20200930101213.2m2pt3jrspvcrxfx@localhost.localdomain > EXPECTED
|
|
||||||
notmuch search id:20200930101213.2m2pt3jrspvcrxfx@localhost.localdomain and ersatz > OUTPUT
|
|
||||||
test_expect_equal_file_nonempty EXPECTED OUTPUT
|
|
||||||
|
|
||||||
test_done
|
test_done
|
||||||
|
|
77
test/T760-as-text.sh
Executable file
77
test/T760-as-text.sh
Executable file
|
@ -0,0 +1,77 @@
|
||||||
|
#!/usr/bin/env bash
# Tests for the index.as_text configuration item: attachments whose MIME
# type matches one of the configured regular expressions get their
# content indexed; all other attachments are skipped.
test_description='index attachments as text'
. $(dirname "$0")/test-lib.sh || exit 1

add_email_corpus indexing
test_begin_subtest "empty as_text; skip text/x-diff"
messages=$(notmuch count id:20200930101213.2m2pt3jrspvcrxfx@localhost.localdomain)
count=$(notmuch count id:20200930101213.2m2pt3jrspvcrxfx@localhost.localdomain and ersatz)
test_expect_equal "$messages,$count" "1,0"

# Set as_text before adding the corpus again.
notmuch config set index.as_text "^text/"
add_email_corpus indexing

test_begin_subtest "as_text is text/; find text/x-diff"
notmuch search id:20200930101213.2m2pt3jrspvcrxfx@localhost.localdomain > EXPECTED
notmuch search id:20200930101213.2m2pt3jrspvcrxfx@localhost.localdomain and ersatz > OUTPUT
test_expect_equal_file_nonempty EXPECTED OUTPUT

test_begin_subtest "reindex with empty as_text, skips text/x-diff"
notmuch config set index.as_text
notmuch reindex '*'
messages=$(notmuch count id:20200930101213.2m2pt3jrspvcrxfx@localhost.localdomain)
count=$(notmuch count id:20200930101213.2m2pt3jrspvcrxfx@localhost.localdomain and ersatz)
test_expect_equal "$messages,$count" "1,0"

test_begin_subtest "reindex with empty as_text; skips application/pdf"
notmuch config set index.as_text
notmuch reindex '*'
# Assign to $messages (not a stray variable) so the assertion below checks
# this subtest's message rather than a value left over from an earlier one.
messages=$(notmuch count id:871qo9p4tf.fsf@tethera.net)
count=$(notmuch count id:871qo9p4tf.fsf@tethera.net and body:not-really-PDF)
test_expect_equal "$messages,$count" "1,0"

test_begin_subtest "reindex with as_text as text/; finds text/x-diff"
notmuch config set index.as_text "^text/"
notmuch reindex '*'
notmuch search id:20200930101213.2m2pt3jrspvcrxfx@localhost.localdomain > EXPECTED
notmuch search id:20200930101213.2m2pt3jrspvcrxfx@localhost.localdomain and ersatz > OUTPUT
test_expect_equal_file_nonempty EXPECTED OUTPUT

test_begin_subtest "reindex with as_text as text/; skips application/pdf"
# Keep "^text/" in effect for the reindex: the point is that a text/ pattern
# must not cause an application/pdf attachment to be indexed.
notmuch config set index.as_text "^text/"
notmuch reindex '*'
messages=$(notmuch count id:871qo9p4tf.fsf@tethera.net)
count=$(notmuch count id:871qo9p4tf.fsf@tethera.net and body:not-really-PDF)
test_expect_equal "$messages,$count" "1,0"

test_begin_subtest "as_text has multiple regexes"
notmuch config set index.as_text "blahblah;^text/"
notmuch reindex '*'
notmuch search id:20200930101213.2m2pt3jrspvcrxfx@localhost.localdomain > EXPECTED
notmuch search id:20200930101213.2m2pt3jrspvcrxfx@localhost.localdomain and ersatz > OUTPUT
test_expect_equal_file_nonempty EXPECTED OUTPUT

test_begin_subtest "as_text is non-anchored regex"
notmuch config set index.as_text "e.t/"
notmuch reindex '*'
notmuch search id:20200930101213.2m2pt3jrspvcrxfx@localhost.localdomain > EXPECTED
notmuch search id:20200930101213.2m2pt3jrspvcrxfx@localhost.localdomain and ersatz > OUTPUT
test_expect_equal_file_nonempty EXPECTED OUTPUT

test_begin_subtest "as_text is 'application/pdf'"
notmuch config set index.as_text "^application/pdf$"
notmuch reindex '*'
notmuch search id:871qo9p4tf.fsf@tethera.net > EXPECTED
notmuch search id:871qo9p4tf.fsf@tethera.net and '"not really PDF"' > OUTPUT
test_expect_equal_file_nonempty EXPECTED OUTPUT

test_begin_subtest "as_text is bad regex"
notmuch config set index.as_text '['
notmuch reindex '*' >& OUTPUT
cat<<EOF > EXPECTED
Error in index.as_text: Invalid regular expression: [
EOF
test_expect_equal_file EXPECTED OUTPUT

test_done
|
11
test/corpora/indexing/fake-pdf:2,S
Normal file
11
test/corpora/indexing/fake-pdf:2,S
Normal file
|
@ -0,0 +1,11 @@
|
||||||
|
From: David Bremner <david@tethera.net>
|
||||||
|
To: example@example.com
|
||||||
|
Subject: attachment content type
|
||||||
|
Date: Thu, 05 Jan 2023 08:02:36 -0400
|
||||||
|
Message-ID: <871qo9p4tf.fsf@tethera.net>
|
||||||
|
MIME-Version: 1.0
|
||||||
|
Content-Type: application/pdf
|
||||||
|
Content-Disposition: attachment; filename=fake.pdf
|
||||||
|
Content-Transfer-Encoding: base64
|
||||||
|
|
||||||
|
dGhpcyBpcyBub3QgcmVhbGx5IFBERgo=
|
Loading…
Reference in a new issue