mirror of
https://git.notmuchmail.org/git/notmuch
synced 2024-11-21 18:38:08 +01:00
lib/parse-sexp: support phrase queries.
Anything that is quoted or not purely word characters is considered a phrase. Phrases are not stemmed, because the stems do not have positional information in the database. It is less efficient to scan the term twice, but it avoids a second pass to add prefixes, so maybe it balances out. In any case, it seems unlikely query parsing is very often a bottleneck.
This commit is contained in:
parent
48ad0e1ff3
commit
90d9c2ad5c
3 changed files with 83 additions and 15 deletions
|
@@ -40,10 +40,12 @@ subqueries.
|
||||||
Match all messages.
|
Match all messages.
|
||||||
|
|
||||||
*term*
|
*term*
|
||||||
Match all messages containing *term*, possibly after
|
|
||||||
stemming or phase splitting. For discussion of stemming in
|
Match all messages containing *term*, possibly after stemming or
|
||||||
notmuch see :any:`notmuch-search-terms(7)`. Stemming only applies
|
phrase splitting. For discussion of stemming in notmuch see
|
||||||
to unquoted terms (basic values) in s-expression queries.
|
:any:`notmuch-search-terms(7)`. Stemming only applies to unquoted
|
||||||
|
terms (basic values) in s-expression queries. For information on
|
||||||
|
phrase splitting see :any:`fields`.
|
||||||
|
|
||||||
``(`` *field* |q1| |q2| ... |qn| ``)``
|
``(`` *field* |q1| |q2| ... |qn| ``)``
|
||||||
Restrict the queries |q1| to |qn| to *field*, and combine with *and*
|
Restrict the queries |q1| to |qn| to *field*, and combine with *and*
|
||||||
|
@@ -63,7 +65,7 @@ subqueries.
|
||||||
FIELDS
|
FIELDS
|
||||||
``````
|
``````
|
||||||
|
|
||||||
*Fields* (also called *prefixes* in notmuch documentation)
|
*Fields* [#aka-pref]_
|
||||||
correspond to attributes of mail messages. Some are inherent (and
|
correspond to attributes of mail messages. Some are inherent (and
|
||||||
immutable) like ``subject``, while others, like ``tag`` and ``property``, are
|
immutable) like ``subject``, while others, like ``tag`` and ``property``, are
|
||||||
settable by the user. Each concrete field in
|
settable by the user. Each concrete field in
|
||||||
|
@@ -72,6 +74,13 @@ is discussed further under "Search prefixes" in
|
||||||
:any:`notmuch-search-terms(7)`. The row *user* refers to user defined
|
:any:`notmuch-search-terms(7)`. The row *user* refers to user defined
|
||||||
fields, described in :any:`notmuch-config(1)`.
|
fields, described in :any:`notmuch-config(1)`.
|
||||||
|
|
||||||
|
Most fields are either *phrase fields* [#aka-prob]_ (which match
|
||||||
|
sequences of words), or *term fields* [#aka-bool]_ (which match exact
|
||||||
|
strings). *Phrase splitting* breaks the term (basic value or quoted
|
||||||
|
string) into words, ignoring punctuation. Phrase splitting is applied to
|
||||||
|
terms in phrase (probabilistic) fields. Both phrase splitting and
|
||||||
|
stemming apply only in phrase fields.
|
||||||
|
|
||||||
.. _field-table:
|
.. _field-table:
|
||||||
|
|
||||||
.. table:: Fields with supported modifiers
|
.. table:: Fields with supported modifiers
|
||||||
|
@@ -138,10 +147,23 @@ EXAMPLES
|
||||||
``(not Bob Marley)``
|
``(not Bob Marley)``
|
||||||
Match messages containing neither "Bob" nor "Marley", nor their stems,
|
Match messages containing neither "Bob" nor "Marley", nor their stems,
|
||||||
|
|
||||||
|
``"quick fox"`` ``quick-fox`` ``quick@fox``
|
||||||
|
Match the *phrase* "quick" followed by "fox" in phrase fields (or
|
||||||
|
outside a field). Match the literal string in a term field.
|
||||||
|
|
||||||
``(subject quick "brown fox")``
|
``(subject quick "brown fox")``
|
||||||
Match messages whose subject contains "quick" (anywhere, stemmed) and
|
Match messages whose subject contains "quick" (anywhere, stemmed) and
|
||||||
the phrase "brown fox".
|
the phrase "brown fox".
|
||||||
|
|
||||||
|
NOTES
|
||||||
|
=====
|
||||||
|
|
||||||
|
.. [#aka-pref] a.k.a. prefixes
|
||||||
|
|
||||||
|
.. [#aka-prob] a.k.a. probabilistic prefixes
|
||||||
|
|
||||||
|
.. [#aka-bool] a.k.a. boolean prefixes
|
||||||
|
|
||||||
.. |q1| replace:: :math:`q_1`
|
.. |q1| replace:: :math:`q_1`
|
||||||
.. |q2| replace:: :math:`q_2`
|
.. |q2| replace:: :math:`q_2`
|
||||||
.. |qn| replace:: :math:`q_n`
|
.. |qn| replace:: :math:`q_n`
|
||||||
|
|
|
@@ -2,7 +2,7 @@
|
||||||
|
|
||||||
#if HAVE_SFSEXP
|
#if HAVE_SFSEXP
|
||||||
#include "sexp.h"
|
#include "sexp.h"
|
||||||
|
#include "unicode-util.h"
|
||||||
|
|
||||||
/* _sexp is used for file scope symbols to avoid clashing with
|
/* _sexp is used for file scope symbols to avoid clashing with
|
||||||
* definitions from sexp.h */
|
* definitions from sexp.h */
|
||||||
|
@@ -67,6 +67,36 @@ _sexp_combine_query (notmuch_database_t *notmuch,
|
||||||
sx->next, output);
|
sx->next, output);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Build a phrase query from 'phrase' and store it in 'output'.
 *
 * 'phrase' is scanned as UTF-8 and split into words, where a word is
 * a maximal run of Unicode word characters; everything else acts as
 * a separator and is dropped. Each word is lowercased and prefixed
 * with 'term_prefix', and the resulting terms are combined with
 * OP_PHRASE. Words are deliberately not stemmed here.
 *
 * Always returns NOTMUCH_STATUS_SUCCESS.
 *
 * NOTE(review): if 'phrase' contains no word characters, 'terms' is
 * empty when the query is constructed -- confirm the resulting query
 * is what callers expect. */
static notmuch_status_t
|
||||||
|
_sexp_parse_phrase (std::string term_prefix, const char *phrase, Xapian::Query &output)
|
||||||
|
{
|
||||||
|
Xapian::Utf8Iterator p (phrase);
|
||||||
|
Xapian::Utf8Iterator end;
|
||||||
|
std::vector<std::string> terms;
|
||||||
|
|
||||||
|
while (p != end) {
|
||||||
|
Xapian::Utf8Iterator start;
|
||||||
|
/* Skip separators (anything that is not a word character). */
while (p != end && ! Xapian::Unicode::is_wordchar (*p))
|
||||||
|
p++;
|
||||||
|
|
||||||
|
/* Only separators were left, so there is no further word. */
if (p == end)
|
||||||
|
break;
|
||||||
|
|
||||||
|
start = p;
|
||||||
|
|
||||||
|
/* Consume one word: advance to the first non-word character or the end. */
while (p != end && Xapian::Unicode::is_wordchar (*p))
|
||||||
|
p++;
|
||||||
|
|
||||||
|
if (p != start) {
|
||||||
|
std::string word (start, p);
|
||||||
|
word = Xapian::Unicode::tolower (word);
|
||||||
|
terms.push_back (term_prefix + word);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
output = Xapian::Query (Xapian::Query::OP_PHRASE, terms.begin (), terms.end ());
|
||||||
|
return NOTMUCH_STATUS_SUCCESS;
|
||||||
|
}
|
||||||
|
|
||||||
/* Here we expect the s-expression to be a proper list, with first
|
/* Here we expect the s-expression to be a proper list, with first
|
||||||
* element defining an operation, or as a special case the empty
|
* element defining an operation, or as a special case the empty
|
||||||
* list */
|
* list */
|
||||||
|
@@ -80,13 +110,12 @@ _sexp_to_xapian_query (notmuch_database_t *notmuch, const _sexp_prefix_t *parent
|
||||||
std::string term = Xapian::Unicode::tolower (sx->val);
|
std::string term = Xapian::Unicode::tolower (sx->val);
|
||||||
Xapian::Stem stem = *(notmuch->stemmer);
|
Xapian::Stem stem = *(notmuch->stemmer);
|
||||||
std::string term_prefix = parent ? _find_prefix (parent->name) : "";
|
std::string term_prefix = parent ? _find_prefix (parent->name) : "";
|
||||||
if (sx->aty == SEXP_BASIC)
|
if (sx->aty == SEXP_BASIC && unicode_word_utf8 (sx->val)) {
|
||||||
term = "Z" + term_prefix + stem (term);
|
output = Xapian::Query ("Z" + term_prefix + stem (term));
|
||||||
else
|
return NOTMUCH_STATUS_SUCCESS;
|
||||||
term = term_prefix + term;
|
} else {
|
||||||
|
return _sexp_parse_phrase (term_prefix, sx->val, output);
|
||||||
output = Xapian::Query (term);
|
}
|
||||||
return NOTMUCH_STATUS_SUCCESS;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Empty list */
|
/* Empty list */
|
||||||
|
|
|
@@ -102,15 +102,32 @@ EOF
|
||||||
test_expect_equal_file EXPECTED OUTPUT
|
test_expect_equal_file EXPECTED OUTPUT
|
||||||
|
|
||||||
test_begin_subtest "Search by 'subject' (utf-8, phrase-token):"
|
test_begin_subtest "Search by 'subject' (utf-8, phrase-token):"
|
||||||
test_subtest_known_broken
|
|
||||||
output=$(notmuch search --query=sexp '(subject utf8-sübjéct)' | notmuch_search_sanitize)
|
output=$(notmuch search --query=sexp '(subject utf8-sübjéct)' | notmuch_search_sanitize)
|
||||||
test_expect_equal "$output" "thread:XXX 2000-01-01 [1/1] Notmuch Test Suite; utf8-sübjéct (inbox unread)"
|
test_expect_equal "$output" "thread:XXX 2000-01-01 [1/1] Notmuch Test Suite; utf8-sübjéct (inbox unread)"
|
||||||
|
|
||||||
test_begin_subtest "Search by 'subject' (utf-8, quoted string):"
|
test_begin_subtest "Search by 'subject' (utf-8, quoted string):"
|
||||||
test_subtest_known_broken
|
|
||||||
output=$(notmuch search --query=sexp '(subject "utf8 sübjéct")' | notmuch_search_sanitize)
|
output=$(notmuch search --query=sexp '(subject "utf8 sübjéct")' | notmuch_search_sanitize)
|
||||||
test_expect_equal "$output" "thread:XXX 2000-01-01 [1/1] Notmuch Test Suite; utf8-sübjéct (inbox unread)"
|
test_expect_equal "$output" "thread:XXX 2000-01-01 [1/1] Notmuch Test Suite; utf8-sübjéct (inbox unread)"
|
||||||
|
|
||||||
|
test_begin_subtest "Search by 'subject' (combine phrase, term):"
|
||||||
|
output=$(notmuch search --query=sexp '(subject Mac "compatibility issues")' | notmuch_search_sanitize)
|
||||||
|
test_expect_equal "$output" "thread:XXX 2009-11-18 [4/4] Jjgod Jiang, Alexander Botero-Lowry; [notmuch] Mac OS X/Darwin compatibility issues (inbox unread)"
|
||||||
|
|
||||||
|
test_begin_subtest "Search by 'subject' (combine phrase, term 2):"
|
||||||
|
notmuch search --query=sexp '(subject (or utf8 "compatibility issues"))' | notmuch_search_sanitize > OUTPUT
|
||||||
|
cat <<EOF > EXPECTED
|
||||||
|
thread:XXX 2009-11-18 [4/4] Jjgod Jiang, Alexander Botero-Lowry; [notmuch] Mac OS X/Darwin compatibility issues (inbox unread)
|
||||||
|
thread:XXX 2000-01-01 [1/1] Notmuch Test Suite; utf8-sübjéct (inbox unread)
|
||||||
|
EOF
|
||||||
|
test_expect_equal_file EXPECTED OUTPUT
|
||||||
|
|
||||||
|
test_begin_subtest "Search by 'subject' (combine phrase, term 3):"
|
||||||
|
notmuch search --query=sexp '(subject issues X/Darwin)' | notmuch_search_sanitize > OUTPUT
|
||||||
|
cat <<EOF > EXPECTED
|
||||||
|
thread:XXX 2009-11-18 [4/4] Jjgod Jiang, Alexander Botero-Lowry; [notmuch] Mac OS X/Darwin compatibility issues (inbox unread)
|
||||||
|
EOF
|
||||||
|
test_expect_equal_file EXPECTED OUTPUT
|
||||||
|
|
||||||
test_begin_subtest "Unbalanced parens"
|
test_begin_subtest "Unbalanced parens"
|
||||||
# A code 1 indicates the error was handled (a crash will return e.g. 139).
|
# A code 1 indicates the error was handled (a crash will return e.g. 139).
|
||||||
test_expect_code 1 "notmuch search --query=sexp '('"
|
test_expect_code 1 "notmuch search --query=sexp '('"
|
||||||
|
|
Loading…
Reference in a new issue