diff --git a/doc/man7/notmuch-search-terms.rst b/doc/man7/notmuch-search-terms.rst index de93d733..47cab48d 100644 --- a/doc/man7/notmuch-search-terms.rst +++ b/doc/man7/notmuch-search-terms.rst @@ -34,10 +34,14 @@ indicate user-supplied values): - from: +- from:/<regex>/ + - to: - subject: +- subject:/<regex>/ + - attachment: - mimetype: @@ -71,6 +75,15 @@ subject of an email. Searching for a phrase in the subject is supported by including quotation marks around the phrase, immediately following **subject:**. +If notmuch is built with **Xapian Field Processors** (see below) the +**from:** and **subject:** prefixes can also be used to restrict the +results to those whose from/subject value matches a regular expression +(see **regex(7)**) delimited with //. + +:: + + notmuch search 'from:/bob@.*[.]example[.]com/' + The **attachment:** prefix can be used to search for specific filenames (or extensions) of attachments to email messages. @@ -220,13 +233,18 @@ Boolean and Probabilistic Prefixes ---------------------------------- Xapian (and hence notmuch) prefixes are either **boolean**, supporting -exact matches like "tag:inbox" or **probabilistic**, supporting a more flexible **term** based searching. The prefixes currently supported by notmuch are as follows. - +exact matches like "tag:inbox" or **probabilistic**, supporting a more +flexible **term** based searching. Certain **special** prefixes are +processed by notmuch in a way not strictly fitting either of Xapian's +built-in styles. The prefixes currently supported by notmuch are as +follows. Boolean **tag:**, **id:**, **thread:**, **folder:**, **path:**, **property:** Probabilistic - **from:**, **to:**, **subject:**, **attachment:**, **mimetype:** + **to:**, **attachment:**, **mimetype:** +Special + **from:**, **query:**, **subject:** Terms and phrases ----------------- @@ -396,6 +414,7 @@ Currently the following features require field processor support: - non-range date queries, e.g. "date:today" - named queries e.g.
"query:my_special_query" +- regular expression searches, e.g. "subject:/^\\[SPAM\\]/" SEE ALSO ======== diff --git a/lib/Makefile.local b/lib/Makefile.local index b77e5780..cd92fc79 100644 --- a/lib/Makefile.local +++ b/lib/Makefile.local @@ -52,6 +52,7 @@ libnotmuch_cxx_srcs = \ $(dir)/query.cc \ $(dir)/query-fp.cc \ $(dir)/config.cc \ + $(dir)/regexp-fields.cc \ $(dir)/thread.cc libnotmuch_modules := $(libnotmuch_c_srcs:.c=.o) $(libnotmuch_cxx_srcs:.cc=.o) diff --git a/lib/database.cc b/lib/database.cc index fa4c3116..573c9fe0 100644 --- a/lib/database.cc +++ b/lib/database.cc @@ -21,6 +21,7 @@ #include "database-private.h" #include "parse-time-vrp.h" #include "query-fp.h" +#include "regexp-fields.h" #include "string-util.h" #include @@ -277,7 +278,8 @@ prefix_t prefix_table[] = { NOTMUCH_FIELD_PROCESSOR }, #endif { "from", "XFROM", NOTMUCH_FIELD_EXTERNAL | - NOTMUCH_FIELD_PROBABILISTIC }, + NOTMUCH_FIELD_PROBABILISTIC | + NOTMUCH_FIELD_PROCESSOR }, { "to", "XTO", NOTMUCH_FIELD_EXTERNAL | NOTMUCH_FIELD_PROBABILISTIC }, { "attachment", "XATTACHMENT", NOTMUCH_FIELD_EXTERNAL | @@ -285,7 +287,8 @@ prefix_t prefix_table[] = { { "mimetype", "XMIMETYPE", NOTMUCH_FIELD_EXTERNAL | NOTMUCH_FIELD_PROBABILISTIC }, { "subject", "XSUBJECT", NOTMUCH_FIELD_EXTERNAL | - NOTMUCH_FIELD_PROBABILISTIC }, + NOTMUCH_FIELD_PROBABILISTIC | + NOTMUCH_FIELD_PROCESSOR}, }; static void @@ -309,7 +312,7 @@ _setup_query_field (const prefix_t *prefix, notmuch_database_t *notmuch) else if (STRNCMP_LITERAL(prefix->name, "query") == 0) fp = (new QueryFieldProcessor (*notmuch->query_parser, notmuch))->release (); else - INTERNAL_ERROR("unsupported field processor prefix: %s\n", prefix->name); + fp = (new RegexpFieldProcessor (prefix->name, *notmuch->query_parser, notmuch))->release (); /* we treat all field-processor fields as boolean in order to get the raw input */ notmuch->query_parser->add_boolean_prefix (prefix->name, fp); diff --git a/lib/regexp-fields.cc b/lib/regexp-fields.cc new file mode 
100644
index 00000000..9873af80
--- /dev/null
+++ b/lib/regexp-fields.cc
@@ -0,0 +1,179 @@
+/* regexp-fields.cc - field processor glue for regex supporting fields
+ *
+ * This file is part of notmuch.
+ *
+ * Copyright © 2015 Austin Clements
+ * Copyright © 2016 David Bremner
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see https://www.gnu.org/licenses/ .
+ *
+ * Author: Austin Clements
+ *         David Bremner
+ */
+
+#include "regexp-fields.h"
+#include "notmuch-private.h"
+#include "database-private.h"
+
+#if HAVE_XAPIAN_FIELD_PROCESSOR
+/* Compile the POSIX extended regular expression 'str' into 'regexp'.
+ *
+ * On failure, throws Xapian::QueryParserError whose message is the
+ * regerror(3) description of the problem. */
+static void
+compile_regex (regex_t &regexp, const char *str)
+{
+    int err = regcomp (&regexp, str, REG_EXTENDED | REG_NOSUB);
+
+    if (err != 0) {
+        /* a zero-sized call only reports the buffer size needed */
+        size_t len = regerror (err, &regexp, NULL, 0);
+        char *buffer = new char[len];
+        std::string msg;
+        (void) regerror (err, &regexp, buffer, len);
+        /* len includes the terminating NUL; keep it out of msg */
+        msg.assign (buffer, len > 0 ? len - 1 : 0);
+        /* buffer came from new[], so it needs the array delete */
+        delete[] buffer;
+
+        throw Xapian::QueryParserError (msg);
+    }
+}
+
+/* Create a posting source over value slot 'slot' restricted to the
+ * documents whose value matches 'regexp'. Throws
+ * Xapian::QueryParserError if 'regexp' does not compile. */
+RegexpPostingSource::RegexpPostingSource (Xapian::valueno slot, const std::string &regexp)
+    : slot_ (slot)
+{
+    compile_regex (regexp_, regexp.c_str ());
+}
+
+RegexpPostingSource::~RegexpPostingSource ()
+{
+    regfree (&regexp_);
+}
+
+/* Attach to 'db' and reset the iteration state. */
+void
+RegexpPostingSource::init (const Xapian::Database &db)
+{
+    db_ = db;
+    it_ = db_.valuestream_begin (slot_);
+    end_ = db_.valuestream_end (slot_);
+    started_ = false;
+
+    /* make sure we start on a matching value */
+    while (! at_end () && regexec (&regexp_, (*it_).c_str (), 0, NULL, 0) != 0) {
+        ++it_;
+    }
+}
+
+/* Lower bound: possibly nothing matches. */
+Xapian::doccount
+RegexpPostingSource::get_termfreq_min () const
+{
+    return 0;
+}
+
+/* Estimate: half of the upper bound. */
+Xapian::doccount
+RegexpPostingSource::get_termfreq_est () const
+{
+    return get_termfreq_max () / 2;
+}
+
+/* Upper bound: the slot's value frequency as reported by Xapian. */
+Xapian::doccount
+RegexpPostingSource::get_termfreq_max () const
+{
+    return db_.get_value_freq (slot_);
+}
+
+Xapian::docid
+RegexpPostingSource::get_docid () const
+{
+    return it_.get_docid ();
+}
+
+bool
+RegexpPostingSource::at_end () const
+{
+    return it_ == end_;
+}
+
+/* Move to the next document whose value matches. The first call
+ * after init () does not step, since init () already left the
+ * iterator on the first match (or at the end). */
+void
+RegexpPostingSource::next (unused (double min_wt))
+{
+    if (started_ && ! at_end ())
+        ++it_;
+    started_ = true;
+
+    for (; ! at_end (); ++it_) {
+        std::string value = *it_;
+        if (regexec (&regexp_, value.c_str (), 0, NULL, 0) == 0)
+            break;
+    }
+}
+
+/* Map a query prefix name to the value slot searched for it. Throws
+ * Xapian::QueryParserError for any prefix other than "from" or
+ * "subject". */
+static inline Xapian::valueno
+_find_slot (const std::string &prefix)
+{
+    if (prefix == "from")
+        return NOTMUCH_VALUE_FROM;
+    else if (prefix == "subject")
+        return NOTMUCH_VALUE_SUBJECT;
+    else
+        throw Xapian::QueryParserError ("unsupported regexp field '" + prefix + "'");
+}
+
+RegexpFieldProcessor::RegexpFieldProcessor (std::string prefix, Xapian::QueryParser &parser_, notmuch_database_t *notmuch_)
+    : slot (_find_slot (prefix)), term_prefix (_find_prefix (prefix.c_str ())),
+      parser (parser_), notmuch (notmuch_)
+{
+}
+
+/* Turn the text following the prefix into a query.
+ *
+ * Text of the form /regex/ becomes a RegexpPostingSource over this
+ * field's value slot. A leading '/' without a closing one throws
+ * Xapian::QueryParserError. Anything else, including the empty
+ * string, is wrapped in double quotes and given back to the query
+ * parser with this field's term prefix. */
+Xapian::Query
+RegexpFieldProcessor::operator() (const std::string & str)
+{
+    /* test for empty first: at (0) on "" throws std::out_of_range */
+    if (! str.empty () && str[0] == '/') {
+        /* a lone "/" has no closing delimiter */
+        if (str.size () > 1 && str[str.size () - 1] == '/') {
+            RegexpPostingSource *postings = new RegexpPostingSource (slot, str.substr (1, str.size () - 2));
+            return Xapian::Query (postings->release ());
+        } else {
+            throw Xapian::QueryParserError ("unmatched regex delimiter in '" + str + "'");
+        }
+    } else {
+        /* TODO replace this with a nicer API level triggering of
+         * phrase parsing, when possible */
+        std::string quoted = '"' + str + '"';
+        return parser.parse_query (quoted, NOTMUCH_QUERY_PARSER_FLAGS, term_prefix);
+    }
+}
+#endif
diff --git a/lib/regexp-fields.h b/lib/regexp-fields.h
new file mode
100644
index 00000000..bac11999
--- /dev/null
+++ b/lib/regexp-fields.h
@@ -0,0 +1,86 @@
+/* regexp-fields.h - xapian glue for semi-bruteforce regexp search
+ *
+ * This file is part of notmuch.
+ *
+ * Copyright © 2015 Austin Clements
+ * Copyright © 2016 David Bremner
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see https://www.gnu.org/licenses/ .
+ *
+ * Author: Austin Clements
+ *         David Bremner
+ */
+
+#ifndef NOTMUCH_REGEXP_FIELDS_H
+#define NOTMUCH_REGEXP_FIELDS_H
+#if HAVE_XAPIAN_FIELD_PROCESSOR
+#include <sys/types.h>
+#include <regex.h>
+#include "database-private.h"
+#include "notmuch-private.h"
+
+/* A posting source that returns documents where a value matches a
+ * regexp.
+ *
+ * The regexp is matched against the values stored in one value slot
+ * of the database passed to init ().
+ */
+class RegexpPostingSource : public Xapian::PostingSource
+{
+ protected:
+    /* value slot whose contents are matched */
+    const Xapian::valueno slot_;
+    /* compiled pattern, owned by this object */
+    regex_t regexp_;
+    Xapian::Database db_;
+    /* false until next () has been called after init () */
+    bool started_;
+    Xapian::ValueIterator it_, end_;
+
+/* No copying */
+    RegexpPostingSource (const RegexpPostingSource &);
+    RegexpPostingSource &operator= (const RegexpPostingSource &);
+
+ public:
+    RegexpPostingSource (Xapian::valueno slot, const std::string &regexp);
+    ~RegexpPostingSource ();
+    void init (const Xapian::Database &db);
+    Xapian::doccount get_termfreq_min () const;
+    Xapian::doccount get_termfreq_est () const;
+    Xapian::doccount get_termfreq_max () const;
+    Xapian::docid get_docid () const;
+    bool at_end () const;
+    void next (unused (double min_wt));
+};
+
+
+/* A field processor for prefixes that accept either a /regexp/ or
+ * ordinary search text.
+ */
+class RegexpFieldProcessor : public Xapian::FieldProcessor {
+ protected:
+    Xapian::valueno slot;
+    std::string term_prefix;
+    Xapian::QueryParser &parser;
+    notmuch_database_t *notmuch;
+
+ public:
+    RegexpFieldProcessor (std::string prefix, Xapian::QueryParser &parser_, notmuch_database_t *notmuch_);
+
+    ~RegexpFieldProcessor () { }
+
+    Xapian::Query operator()(const std::string & str);
+};
+#endif
+#endif /* NOTMUCH_REGEXP_FIELDS_H */
diff --git a/test/T650-regexp-query.sh b/test/T650-regexp-query.sh
new file mode 100755
index 00000000..a8039610
--- /dev/null
+++ b/test/T650-regexp-query.sh
@@ -0,0 +1,82 @@
+#!/usr/bin/env bash
+test_description='regular expression searches'
+.
./test-lib.sh || exit 1
+
+add_email_corpus
+
+
+if [ "${NOTMUCH_HAVE_XAPIAN_FIELD_PROCESSOR}" -eq 0 ]; then
+    test_done # regexp searches need field processor support
+fi
+
+notmuch search --output=messages from:cworth > cworth.msg-ids # reference result for the from:/.../ tests
+
+test_begin_subtest "regexp from search, case sensitive"
+notmuch search --output=messages from:/carl/ > OUTPUT
+test_expect_equal_file /dev/null OUTPUT
+
+test_begin_subtest "empty regexp or query"
+notmuch search --output=messages from:/carl/ or from:/cworth/ > OUTPUT
+test_expect_equal_file cworth.msg-ids OUTPUT
+
+test_begin_subtest "non-empty regexp and query"
+notmuch search from:/cworth@cworth.org/ and subject:patch | notmuch_search_sanitize > OUTPUT
+cat <<EOF > EXPECTED
+thread:XXX 2009-11-18 [1/2] Carl Worth| Alex Botero-Lowry; [notmuch] [PATCH] Error out if no query is supplied to search instead of going into an infinite loop (attachment inbox unread)
+thread:XXX 2009-11-18 [1/2] Carl Worth| Ingmar Vanhassel; [notmuch] [PATCH] Typsos (inbox unread)
+thread:XXX 2009-11-18 [1/2] Carl Worth| Jan Janak; [notmuch] [PATCH] Older versions of install do not support -C. (inbox unread)
+thread:XXX 2009-11-18 [1/2] Carl Worth| Keith Packard; [notmuch] [PATCH] Make notmuch-show 'X' (and 'x') commands remove inbox (and unread) tags (inbox unread)
+thread:XXX 2009-11-18 [2/5] Carl Worth| Mikhail Gusarov, Keith Packard; [notmuch] [PATCH 1/2] Close message file after parsing message headers (inbox unread)
+EOF
+test_expect_equal_file EXPECTED OUTPUT
+
+test_begin_subtest "regexp from search, duplicate term search"
+notmuch search --output=messages from:/cworth/ > OUTPUT
+test_expect_equal_file cworth.msg-ids OUTPUT
+
+test_begin_subtest "long enough regexp matches only desired senders"
+notmuch search --output=messages 'from:"/C.* Wo/"' > OUTPUT
+test_expect_equal_file cworth.msg-ids OUTPUT
+
+test_begin_subtest "shorter regexp matches one more sender"
+notmuch search --output=messages 'from:"/C.* W/"' > OUTPUT
+{ echo id:1258544095-16616-1-git-send-email-chris@chris-wilson.co.uk; cat cworth.msg-ids; } > EXPECTED
+test_expect_equal_file EXPECTED OUTPUT
+
+test_begin_subtest "regexp subject search, non-ASCII"
+notmuch search --output=messages subject:/accentué/ > OUTPUT
+echo id:877h1wv7mg.fsf@inf-8657.int-evry.fr > EXPECTED
+test_expect_equal_file EXPECTED OUTPUT
+
+test_begin_subtest "regexp subject search, punctuation"
+notmuch search subject:/\'X\'/ | notmuch_search_sanitize > OUTPUT
+cat <<EOF > EXPECTED
+thread:XXX 2009-11-18 [2/2] Keith Packard, Carl Worth; [notmuch] [PATCH] Make notmuch-show 'X' (and 'x') commands remove inbox (and unread) tags (inbox unread)
+EOF
+test_expect_equal_file EXPECTED OUTPUT
+
+test_begin_subtest "regexp subject search, no punctuation"
+notmuch search subject:/X/ | notmuch_search_sanitize > OUTPUT
+cat <<EOF > EXPECTED
+thread:XXX 2009-11-18 [2/2] Keith Packard, Carl Worth; [notmuch] [PATCH] Make notmuch-show 'X' (and 'x') commands remove inbox (and unread) tags (inbox unread)
+thread:XXX 2009-11-18 [4/4] Jjgod Jiang, Alexander Botero-Lowry; [notmuch] Mac OS X/Darwin compatibility issues (inbox unread)
+EOF
+test_expect_equal_file EXPECTED OUTPUT
+
+test_begin_subtest "combine regexp from and subject"
+notmuch search subject:/-C/ and from:/.an.k/ | notmuch_search_sanitize > OUTPUT
+cat <<EOF > EXPECTED
+thread:XXX 2009-11-17 [1/2] Jan Janak| Carl Worth; [notmuch] [PATCH] Older versions of install do not support -C. (inbox unread)
+EOF
+test_expect_equal_file EXPECTED OUTPUT
+
+test_begin_subtest "regexp error reporting"
+notmuch search 'from:/unbalanced[/' 1>OUTPUT 2>&1
+cat <<EOF > EXPECTED
+notmuch search: A Xapian exception occurred
+A Xapian exception occurred performing query: Invalid regular expression
+Query string was: from:/unbalanced[/
+EOF
+test_expect_equal_file EXPECTED OUTPUT
+
+test_done