lib/parse-sexp: support phrase queries.

Anything that is quoted or not purely word characters is considered a phrase. Phrases are not stemmed, because the stems do not have positional information in the database. It is less efficient to scan the term twice, but it avoids a second pass to add prefixes, so maybe it balances out. In any case, it seems unlikely query parsing is very often a bottleneck.
2024-11-25 04:18:08 +01:00 · 2021-08-24 08:17:22 -07:00 · 2021-08-24 08:17:22 -07:00 · 90d9c2ad5c
commit 90d9c2ad5c
parent 48ad0e1ff3
3 changed files with 83 additions and 15 deletions
--- a/doc/man7/notmuch-sexp-queries.rst
+++ b/doc/man7/notmuch-sexp-queries.rst
@ -40,10 +40,12 @@ subqueries.
    Match all messages.
 *term*
-    Match all messages containing *term*, possibly after
+
-    stemming or phase splitting. For discussion of stemming in
+    Match all messages containing *term*, possibly after stemming or
-    notmuch see :any:`notmuch-search-terms(7)`. Stemming only applies
+    phrase splitting. For discussion of stemming in notmuch see
-    to unquoted terms (basic values) in s-expression queries.
+    :any:`notmuch-search-terms(7)`. Stemming only applies to unquoted
    terms (basic values) in s-expression queries.  For information on
    phrase splitting see :any:`fields`.
 ``(`` *field* |q1| |q2| ... |qn| ``)``
    Restrict the queries |q1| to |qn| to *field*, and combine with *and*
@ -63,7 +65,7 @@ subqueries.
 FIELDS
 ``````
-*Fields* (also called *prefixes* in notmuch documentation)
+*Fields* [#aka-pref]_
 correspond to attributes of mail messages. Some are inherent (and
 immutable) like ``subject``, while others (``tag`` and ``property``) are
 settable by the user.  Each concrete field in
@ -72,6 +74,13 @@ is discussed further under "Search prefixes" in
 :any:`notmuch-search-terms(7)`. The row *user* refers to user defined
 fields, described in :any:`notmuch-config(1)`.
 Most fields are either *phrase fields* [#aka-prob]_ (which match
 sequences of words), or *term fields* [#aka-bool]_ (which match exact
 strings). *Phrase splitting* breaks the term (basic value or quoted
 string) into words, ignoring punctuation. Phrase splitting is applied to
 terms in phrase (probabilistic) fields. Both phrase splitting and
 stemming apply only in phrase fields.
 .. _field-table:
 .. table:: Fields with supported modifiers
@ -138,10 +147,23 @@ EXAMPLES
 ``(not Bob Marley)``
    Match messages containing neither "Bob" nor "Marley", nor their stems.
 ``"quick fox"`` ``quick-fox`` ``quick@fox``
    Match the *phrase* "quick" followed by "fox" in phrase fields (or
    outside a field). Match the literal string in a term field.
 ``(subject quick "brown fox")``
    Match messages whose subject contains "quick" (anywhere, stemmed) and
    the phrase "brown fox".
 NOTES
 =====
 .. [#aka-pref] a.k.a. prefixes
 .. [#aka-prob] a.k.a. probabilistic prefixes
 .. [#aka-bool] a.k.a. boolean prefixes
 .. |q1| replace:: :math:`q_1`
 .. |q2| replace:: :math:`q_2`
 .. |qn| replace:: :math:`q_n`
--- a/lib/parse-sexp.cc
+++ b/lib/parse-sexp.cc
@ -2,7 +2,7 @@
 #if HAVE_SFSEXP
 #include "sexp.h"
-
+#include "unicode-util.h"
 /* _sexp is used for file scope symbols to avoid clashing with
 * definitions from sexp.h */
@ -67,6 +67,36 @@ _sexp_combine_query (notmuch_database_t *notmuch,
 				sx->next, output);
 }
 /* Split phrase into words and build an OP_PHRASE query from them.
  *
  * A word is a maximal run of characters for which
  * Xapian::Unicode::is_wordchar () is true; every other character acts
  * as a separator and is dropped.  Each word is lower-cased and
  * prefixed with term_prefix; words are not stemmed.
  *
  * The resulting query is stored in output.  Always returns
  * NOTMUCH_STATUS_SUCCESS.
  */
 static notmuch_status_t
 _sexp_parse_phrase (std::string term_prefix, const char *phrase, Xapian::Query &output)
 {
    Xapian::Utf8Iterator p (phrase);
    Xapian::Utf8Iterator end;
    std::vector<std::string> terms;
    while (p != end) {
 	Xapian::Utf8Iterator start;
 	/* Skip separators preceding the next word. */
 	while (p != end && ! Xapian::Unicode::is_wordchar (*p))
 	    p++;
 	if (p == end)
 	    break;
 	start = p;
 	/* Advance to the end of the word. */
 	while (p != end && Xapian::Unicode::is_wordchar (*p))
 	    p++;
 	if (p != start) {
 	    /* Copy the raw UTF-8 bytes of the word.  Building the string
 	     * from the iterator pair itself would narrow every decoded
 	     * code point to a single char, corrupting characters that do
 	     * not fit in one byte. */
 	    std::string word (start.raw (), p.raw ());
 	    word = Xapian::Unicode::tolower (word);
 	    terms.push_back (term_prefix + word);
 	}
    }
    output = Xapian::Query (Xapian::Query::OP_PHRASE, terms.begin (), terms.end ());
    return NOTMUCH_STATUS_SUCCESS;
 }
 /* Here we expect the s-expression to be a proper list, with first
 * element defining an operation, or as a special case the empty
 * list */
@ -80,13 +110,12 @@ _sexp_to_xapian_query (notmuch_database_t *notmuch, const _sexp_prefix_t *parent
 	std::string term = Xapian::Unicode::tolower (sx->val);
 	Xapian::Stem stem = *(notmuch->stemmer);
 	std::string term_prefix = parent ? _find_prefix (parent->name) : "";
-	if (sx->aty == SEXP_BASIC)
+	if (sx->aty == SEXP_BASIC && unicode_word_utf8 (sx->val)) {
-	    term = "Z" + term_prefix + stem (term);
+	    output = Xapian::Query ("Z" + term_prefix + stem (term));
 	else
 	    term = term_prefix + term;
 	output = Xapian::Query (term);
 	    return NOTMUCH_STATUS_SUCCESS;
 	} else {
 	    return _sexp_parse_phrase (term_prefix, sx->val, output);
 	}
    }
    /* Empty list */
--- a/test/T081-sexpr-search.sh
+++ b/test/T081-sexpr-search.sh
@ -102,15 +102,32 @@ EOF
 test_expect_equal_file EXPECTED OUTPUT
 test_begin_subtest "Search by 'subject' (utf-8, phrase-token):"
 test_subtest_known_broken
 output=$(notmuch search --query=sexp '(subject utf8-sübjéct)' | notmuch_search_sanitize)
 test_expect_equal "$output" "thread:XXX   2000-01-01 [1/1] Notmuch Test Suite; utf8-sübjéct (inbox unread)"
 test_begin_subtest "Search by 'subject' (utf-8, quoted string):"
 test_subtest_known_broken
 output=$(notmuch search --query=sexp '(subject "utf8 sübjéct")' | notmuch_search_sanitize)
 test_expect_equal "$output" "thread:XXX   2000-01-01 [1/1] Notmuch Test Suite; utf8-sübjéct (inbox unread)"
 test_begin_subtest "Search by 'subject' (combine phrase, term):"
 output=$(notmuch search --query=sexp '(subject Mac "compatibility issues")' | notmuch_search_sanitize)
 test_expect_equal "$output" "thread:XXX   2009-11-18 [4/4] Jjgod Jiang, Alexander Botero-Lowry; [notmuch] Mac OS X/Darwin compatibility issues (inbox unread)"
 test_begin_subtest "Search by 'subject' (combine phrase, term 2):"
 notmuch search --query=sexp '(subject (or utf8 "compatibility issues"))' | notmuch_search_sanitize > OUTPUT
 cat <<EOF > EXPECTED
 thread:XXX   2009-11-18 [4/4] Jjgod Jiang, Alexander Botero-Lowry; [notmuch] Mac OS X/Darwin compatibility issues (inbox unread)
 thread:XXX   2000-01-01 [1/1] Notmuch Test Suite; utf8-sübjéct (inbox unread)
 EOF
 test_expect_equal_file EXPECTED OUTPUT
 test_begin_subtest "Search by 'subject' (combine phrase, term 3):"
 notmuch search --query=sexp '(subject issues X/Darwin)' | notmuch_search_sanitize > OUTPUT
 cat <<EOF > EXPECTED
 thread:XXX   2009-11-18 [4/4] Jjgod Jiang, Alexander Botero-Lowry; [notmuch] Mac OS X/Darwin compatibility issues (inbox unread)
 EOF
 test_expect_equal_file EXPECTED OUTPUT
 test_begin_subtest "Unbalanced parens"
 # A code 1 indicates the error was handled (a crash will return e.g. 139).
 test_expect_code 1 "notmuch search --query=sexp '('"