From 90d9c2ad5c459624d17f92d0844e7a7fbb87d7a2 Mon Sep 17 00:00:00 2001 From: David Bremner Date: Tue, 24 Aug 2021 08:17:22 -0700 Subject: [PATCH] lib/parse-sexp: support phrase queries. Anything that is quoted or not purely word characters is considered a phrase. Phrases are not stemmed, because the stems do not have positional information in the database. It is less efficient to scan the term twice, but it avoids a second pass to add prefixes, so maybe it balances out. In any case, it seems unlikely query parsing is very often a bottleneck. --- doc/man7/notmuch-sexp-queries.rst | 32 ++++++++++++++++++---- lib/parse-sexp.cc | 45 +++++++++++++++++++++++++------ test/T081-sexpr-search.sh | 21 +++++++++++++-- 3 files changed, 83 insertions(+), 15 deletions(-) diff --git a/doc/man7/notmuch-sexp-queries.rst b/doc/man7/notmuch-sexp-queries.rst index 08e97cc3..b763876d 100644 --- a/doc/man7/notmuch-sexp-queries.rst +++ b/doc/man7/notmuch-sexp-queries.rst @@ -40,10 +40,12 @@ subqueries. Match all messages. *term* - Match all messages containing *term*, possibly after - stemming or phase splitting. For discussion of stemming in - notmuch see :any:`notmuch-search-terms(7)`. Stemming only applies - to unquoted terms (basic values) in s-expression queries. + + Match all messages containing *term*, possibly after stemming or + phrase splitting. For discussion of stemming in notmuch see + :any:`notmuch-search-terms(7)`. Stemming only applies to unquoted + terms (basic values) in s-expression queries. For information on + phrase splitting see :any:`fields`. ``(`` *field* |q1| |q2| ... |qn| ``)`` Restrict the queries |q1| to |qn| to *field*, and combine with *and* @@ -63,7 +65,7 @@ subqueries. FIELDS `````` -*Fields* (also called *prefixes* in notmuch documentation) +*Fields* [#aka-pref]_ correspond to attributes of mail messages. Some are inherent (and immutable) like ``subject``, while others ``tag`` and ``property`` are settable by the user. 
Each concrete field in @@ -72,6 +74,13 @@ is discussed further under "Search prefixes" in :any:`notmuch-search-terms(7)`. The row *user* refers to user defined fields, described in :any:`notmuch-config(1)`. +Most fields are either *phrase fields* [#aka-prob]_ (which match +sequences of words), or *term fields* [#aka-bool]_ (which match exact +strings). *Phrase splitting* breaks the term (basic value or quoted +string) into words, ignoring punctuation. Phrase splitting is applied to +terms in phrase (probabilistic) fields. Both phrase splitting and +stemming apply only in phrase fields. + .. _field-table: .. table:: Fields with supported modifiers @@ -138,10 +147,23 @@ EXAMPLES ``(not Bob Marley)`` Match messages containing neither "Bob" nor "Marley", nor their stems, +``"quick fox"`` ``quick-fox`` ``quick@fox`` + Match the *phrase* "quick" followed by "fox" in phrase fields (or + outside a field). Match the literal string in a term field. + ``(subject quick "brown fox")`` Match messages whose subject contains "quick" (anywhere, stemmed) and the phrase "brown fox". +NOTES +===== + +.. [#aka-pref] a.k.a. prefixes + +.. [#aka-prob] a.k.a. probabilistic prefixes + +.. [#aka-bool] a.k.a. boolean prefixes + .. |q1| replace:: :math:`q_1` .. |q2| replace:: :math:`q_2` .. |qn| replace:: :math:`q_n` diff --git a/lib/parse-sexp.cc b/lib/parse-sexp.cc index 25556058..0917f505 100644 --- a/lib/parse-sexp.cc +++ b/lib/parse-sexp.cc @@ -2,7 +2,7 @@ #if HAVE_SFSEXP #include "sexp.h" - +#include "unicode-util.h" /* _sexp is used for file scope symbols to avoid clashing with * definitions from sexp.h */ @@ -67,6 +67,36 @@ _sexp_combine_query (notmuch_database_t *notmuch, sx->next, output); } +static notmuch_status_t +_sexp_parse_phrase (std::string term_prefix, const char *phrase, Xapian::Query &output) +{ + Xapian::Utf8Iterator p (phrase); + Xapian::Utf8Iterator end; + std::vector<std::string> terms; + + while (p != end) { + Xapian::Utf8Iterator start; + while (p != end && ! 
Xapian::Unicode::is_wordchar (*p)) + p++; + + if (p == end) + break; + + start = p; + + while (p != end && Xapian::Unicode::is_wordchar (*p)) + p++; + + if (p != start) { + std::string word (start, p); + word = Xapian::Unicode::tolower (word); + terms.push_back (term_prefix + word); + } + } + output = Xapian::Query (Xapian::Query::OP_PHRASE, terms.begin (), terms.end ()); + return NOTMUCH_STATUS_SUCCESS; +} + /* Here we expect the s-expression to be a proper list, with first * element defining and operation, or as a special case the empty * list */ @@ -80,13 +110,12 @@ _sexp_to_xapian_query (notmuch_database_t *notmuch, const _sexp_prefix_t *parent std::string term = Xapian::Unicode::tolower (sx->val); Xapian::Stem stem = *(notmuch->stemmer); std::string term_prefix = parent ? _find_prefix (parent->name) : ""; - if (sx->aty == SEXP_BASIC) - term = "Z" + term_prefix + stem (term); - else - term = term_prefix + term; - - output = Xapian::Query (term); - return NOTMUCH_STATUS_SUCCESS; + if (sx->aty == SEXP_BASIC && unicode_word_utf8 (sx->val)) { + output = Xapian::Query ("Z" + term_prefix + stem (term)); + return NOTMUCH_STATUS_SUCCESS; + } else { + return _sexp_parse_phrase (term_prefix, sx->val, output); + } } /* Empty list */ diff --git a/test/T081-sexpr-search.sh b/test/T081-sexpr-search.sh index 90cef50c..4a051a50 100755 --- a/test/T081-sexpr-search.sh +++ b/test/T081-sexpr-search.sh @@ -102,15 +102,32 @@ EOF test_expect_equal_file EXPECTED OUTPUT test_begin_subtest "Search by 'subject' (utf-8, phrase-token):" -test_subtest_known_broken output=$(notmuch search --query=sexp '(subject utf8-sübjéct)' | notmuch_search_sanitize) test_expect_equal "$output" "thread:XXX 2000-01-01 [1/1] Notmuch Test Suite; utf8-sübjéct (inbox unread)" test_begin_subtest "Search by 'subject' (utf-8, quoted string):" -test_subtest_known_broken output=$(notmuch search --query=sexp '(subject "utf8 sübjéct")' | notmuch_search_sanitize) test_expect_equal "$output" "thread:XXX 2000-01-01 [1/1] 
Notmuch Test Suite; utf8-sübjéct (inbox unread)" +test_begin_subtest "Search by 'subject' (combine phrase, term):" +output=$(notmuch search --query=sexp '(subject Mac "compatibility issues")' | notmuch_search_sanitize) +test_expect_equal "$output" "thread:XXX 2009-11-18 [4/4] Jjgod Jiang, Alexander Botero-Lowry; [notmuch] Mac OS X/Darwin compatibility issues (inbox unread)" + +test_begin_subtest "Search by 'subject' (combine phrase, term 2):" +notmuch search --query=sexp '(subject (or utf8 "compatibility issues"))' | notmuch_search_sanitize > OUTPUT +cat <<EOF > EXPECTED +thread:XXX 2009-11-18 [4/4] Jjgod Jiang, Alexander Botero-Lowry; [notmuch] Mac OS X/Darwin compatibility issues (inbox unread) +thread:XXX 2000-01-01 [1/1] Notmuch Test Suite; utf8-sübjéct (inbox unread) +EOF +test_expect_equal_file EXPECTED OUTPUT + +test_begin_subtest "Search by 'subject' (combine phrase, term 3):" +notmuch search --query=sexp '(subject issues X/Darwin)' | notmuch_search_sanitize > OUTPUT +cat <<EOF > EXPECTED +thread:XXX 2009-11-18 [4/4] Jjgod Jiang, Alexander Botero-Lowry; [notmuch] Mac OS X/Darwin compatibility issues (inbox unread) +EOF +test_expect_equal_file EXPECTED OUTPUT + test_begin_subtest "Unbalanced parens" # A code 1 indicates the error was handled (a crash will return e.g. 139). test_expect_code 1 "notmuch search --query=sexp '('"