mirror of
https://git.notmuchmail.org/git/notmuch
synced 2024-12-22 09:24:54 +01:00
lib/parse-sexp: support phrase queries.
Anything that is quoted or not purely word characters is considered a phrase. Phrases are not stemmed, because the stems do not have positional information in the database. It is less efficient to scan the term twice, but it avoids a second pass to add prefixes, so maybe it balances out. In any case, it seems unlikely query parsing is very often a bottleneck.
This commit is contained in:
parent
48ad0e1ff3
commit
90d9c2ad5c
3 changed files with 83 additions and 15 deletions
|
@ -40,10 +40,12 @@ subqueries.
|
|||
Match all messages.
|
||||
|
||||
*term*
|
||||
Match all messages containing *term*, possibly after
|
||||
stemming or phase splitting. For discussion of stemming in
|
||||
notmuch see :any:`notmuch-search-terms(7)`. Stemming only applies
|
||||
to unquoted terms (basic values) in s-expression queries.
|
||||
|
||||
Match all messages containing *term*, possibly after stemming or
|
||||
phrase splitting. For discussion of stemming in notmuch see
|
||||
:any:`notmuch-search-terms(7)`. Stemming only applies to unquoted
|
||||
terms (basic values) in s-expression queries. For information on
|
||||
phrase splitting see :any:`fields`.
|
||||
|
||||
``(`` *field* |q1| |q2| ... |qn| ``)``
|
||||
Restrict the queries |q1| to |qn| to *field*, and combine with *and*
|
||||
|
@ -63,7 +65,7 @@ subqueries.
|
|||
FIELDS
|
||||
``````
|
||||
|
||||
*Fields* (also called *prefixes* in notmuch documentation)
|
||||
*Fields* [#aka-pref]_
|
||||
correspond to attributes of mail messages. Some are inherent (and
|
||||
immutable) like ``subject``, while others, such as ``tag`` and ``property``, are
|
||||
settable by the user. Each concrete field in
|
||||
|
@ -72,6 +74,13 @@ is discussed further under "Search prefixes" in
|
|||
:any:`notmuch-search-terms(7)`. The row *user* refers to user defined
|
||||
fields, described in :any:`notmuch-config(1)`.
|
||||
|
||||
Most fields are either *phrase fields* [#aka-prob]_ (which match
|
||||
sequences of words), or *term fields* [#aka-bool]_ (which match exact
|
||||
strings). *Phrase splitting* breaks the term (basic value or quoted
|
||||
string) into words, ignoring punctuation. Phrase splitting is applied to
|
||||
terms in phrase (probabilistic) fields. Both phrase splitting and
|
||||
stemming apply only in phrase fields.
|
||||
|
||||
.. _field-table:
|
||||
|
||||
.. table:: Fields with supported modifiers
|
||||
|
@ -138,10 +147,23 @@ EXAMPLES
|
|||
``(not Bob Marley)``
|
||||
Match messages containing neither "Bob" nor "Marley", nor their stems,
|
||||
|
||||
``"quick fox"`` ``quick-fox`` ``quick@fox``
|
||||
Match the *phrase* "quick" followed by "fox" in phrase fields (or
|
||||
outside a field). Match the literal string in a term field.
|
||||
|
||||
``(subject quick "brown fox")``
|
||||
Match messages whose subject contains "quick" (anywhere, stemmed) and
|
||||
the phrase "brown fox".
|
||||
|
||||
NOTES
|
||||
=====
|
||||
|
||||
.. [#aka-pref] a.k.a. prefixes
|
||||
|
||||
.. [#aka-prob] a.k.a. probabilistic prefixes
|
||||
|
||||
.. [#aka-bool] a.k.a. boolean prefixes
|
||||
|
||||
.. |q1| replace:: :math:`q_1`
|
||||
.. |q2| replace:: :math:`q_2`
|
||||
.. |qn| replace:: :math:`q_n`
|
||||
|
|
|
@ -2,7 +2,7 @@
|
|||
|
||||
#if HAVE_SFSEXP
|
||||
#include "sexp.h"
|
||||
|
||||
#include "unicode-util.h"
|
||||
|
||||
/* _sexp is used for file scope symbols to avoid clashing with
|
||||
* definitions from sexp.h */
|
||||
|
@ -67,6 +67,36 @@ _sexp_combine_query (notmuch_database_t *notmuch,
|
|||
sx->next, output);
|
||||
}
|
||||
|
||||
/* Split PHRASE into words and build an OP_PHRASE query from them.
 *
 * A word is a maximal run of characters for which
 * Xapian::Unicode::is_wordchar is true; everything else is treated as
 * a separator and discarded. Each word is lowercased and prefixed
 * with TERM_PREFIX. Words are not stemmed.
 *
 * term_prefix: prefix prepended to every generated term (may be empty).
 * phrase:      NUL-terminated string to split, decoded as UTF-8.
 * output:      receives the resulting query. If PHRASE contains no
 *              word characters the query is built from an empty term
 *              list (per the Xapian documentation such a query
 *              matches nothing).
 *
 * Always returns NOTMUCH_STATUS_SUCCESS.
 */
static notmuch_status_t
_sexp_parse_phrase (std::string term_prefix, const char *phrase, Xapian::Query &output)
{
    Xapian::Utf8Iterator p (phrase);
    Xapian::Utf8Iterator end;
    std::vector<std::string> terms;

    while (p != end) {
        /* Skip any separators preceding the next word. */
        while (p != end && ! Xapian::Unicode::is_wordchar (*p))
            ++p;

        if (p == end)
            break;

        /* Remember where the word starts in the underlying byte
         * buffer. Dereferencing a Utf8Iterator yields a decoded code
         * point, so constructing a std::string from an iterator range
         * would narrow every non-ASCII code point to a single char
         * and corrupt the term. Copy the raw UTF-8 bytes instead. */
        const char *word_begin = p.raw ();

        /* p is on a word character here, so this loop advances at
         * least once and the word is never empty. */
        while (p != end && Xapian::Unicode::is_wordchar (*p))
            ++p;

        /* raw () yields the end of the buffer once p has reached the
         * end, so the length is valid for the last word as well. */
        std::string word (word_begin, p.raw () - word_begin);

        terms.push_back (term_prefix + Xapian::Unicode::tolower (word));
    }

    output = Xapian::Query (Xapian::Query::OP_PHRASE, terms.begin (), terms.end ());
    return NOTMUCH_STATUS_SUCCESS;
}
|
||||
|
||||
/* Here we expect the s-expression to be a proper list, with first
|
||||
* element defining an operation, or as a special case the empty
|
||||
* list */
|
||||
|
@ -80,13 +110,12 @@ _sexp_to_xapian_query (notmuch_database_t *notmuch, const _sexp_prefix_t *parent
|
|||
std::string term = Xapian::Unicode::tolower (sx->val);
|
||||
Xapian::Stem stem = *(notmuch->stemmer);
|
||||
std::string term_prefix = parent ? _find_prefix (parent->name) : "";
|
||||
if (sx->aty == SEXP_BASIC)
|
||||
term = "Z" + term_prefix + stem (term);
|
||||
else
|
||||
term = term_prefix + term;
|
||||
|
||||
output = Xapian::Query (term);
|
||||
return NOTMUCH_STATUS_SUCCESS;
|
||||
if (sx->aty == SEXP_BASIC && unicode_word_utf8 (sx->val)) {
|
||||
output = Xapian::Query ("Z" + term_prefix + stem (term));
|
||||
return NOTMUCH_STATUS_SUCCESS;
|
||||
} else {
|
||||
return _sexp_parse_phrase (term_prefix, sx->val, output);
|
||||
}
|
||||
}
|
||||
|
||||
/* Empty list */
|
||||
|
|
|
@ -102,15 +102,32 @@ EOF
|
|||
test_expect_equal_file EXPECTED OUTPUT
|
||||
|
||||
test_begin_subtest "Search by 'subject' (utf-8, phrase-token):"
|
||||
test_subtest_known_broken
|
||||
output=$(notmuch search --query=sexp '(subject utf8-sübjéct)' | notmuch_search_sanitize)
|
||||
test_expect_equal "$output" "thread:XXX 2000-01-01 [1/1] Notmuch Test Suite; utf8-sübjéct (inbox unread)"
|
||||
|
||||
test_begin_subtest "Search by 'subject' (utf-8, quoted string):"
|
||||
test_subtest_known_broken
|
||||
output=$(notmuch search --query=sexp '(subject "utf8 sübjéct")' | notmuch_search_sanitize)
|
||||
test_expect_equal "$output" "thread:XXX 2000-01-01 [1/1] Notmuch Test Suite; utf8-sübjéct (inbox unread)"
|
||||
|
||||
test_begin_subtest "Search by 'subject' (combine phrase, term):"
|
||||
output=$(notmuch search --query=sexp '(subject Mac "compatibility issues")' | notmuch_search_sanitize)
|
||||
test_expect_equal "$output" "thread:XXX 2009-11-18 [4/4] Jjgod Jiang, Alexander Botero-Lowry; [notmuch] Mac OS X/Darwin compatibility issues (inbox unread)"
|
||||
|
||||
test_begin_subtest "Search by 'subject' (combine phrase, term 2):"
|
||||
notmuch search --query=sexp '(subject (or utf8 "compatibility issues"))' | notmuch_search_sanitize > OUTPUT
|
||||
cat <<EOF > EXPECTED
|
||||
thread:XXX 2009-11-18 [4/4] Jjgod Jiang, Alexander Botero-Lowry; [notmuch] Mac OS X/Darwin compatibility issues (inbox unread)
|
||||
thread:XXX 2000-01-01 [1/1] Notmuch Test Suite; utf8-sübjéct (inbox unread)
|
||||
EOF
|
||||
test_expect_equal_file EXPECTED OUTPUT
|
||||
|
||||
test_begin_subtest "Search by 'subject' (combine phrase, term 3):"
|
||||
notmuch search --query=sexp '(subject issues X/Darwin)' | notmuch_search_sanitize > OUTPUT
|
||||
cat <<EOF > EXPECTED
|
||||
thread:XXX 2009-11-18 [4/4] Jjgod Jiang, Alexander Botero-Lowry; [notmuch] Mac OS X/Darwin compatibility issues (inbox unread)
|
||||
EOF
|
||||
test_expect_equal_file EXPECTED OUTPUT
|
||||
|
||||
test_begin_subtest "Unbalanced parens"
|
||||
# A code 1 indicates the error was handled (a crash will return e.g. 139).
|
||||
test_expect_code 1 "notmuch search --query=sexp '('"
|
||||
|
|
Loading…
Reference in a new issue