lib/parse-sexp: support regular expressions

At least to the degree that the Xapian QueryParser based parser
also supports them. Support short alias 'rx' as it seems to make more
complex queries nicer to read.
This commit is contained in:
David Bremner 2021-08-24 08:17:29 -07:00
parent 5cb452c325
commit 1870b3ae4b
3 changed files with 124 additions and 10 deletions

View file

@ -144,6 +144,11 @@ MODIFIERS
*Modifiers* refer to any prefixes (first elements of compound queries)
that are neither operators nor fields.
``(regex`` *atom* ``)`` ``(rx`` *atom* ``)``
Interpret *atom* as a POSIX.2 regular expression (see
:manpage:`regex(7)`). This applies in term fields and a subset [#not-phrase]_ of
phrase fields (see :any:`field-table`).
``(starts-with`` *subword* ``)``
Matches any term starting with *subword*. This applies in either
phrase or term :any:`fields <fields>`, or outside of fields [#not-body]_. Note that
@ -205,6 +210,9 @@ NOTES
.. [#aka-bool] a.k.a. boolean prefixes
.. [#not-phrase] Due to the implementation of phrase fields in Xapian,
regex queries can only match individual words.
.. [#not-body] Due to the way ``body`` is implemented in notmuch,
this modifier is not supported in the ``body`` field.

View file

@ -13,6 +13,8 @@ typedef enum {
SEXP_FLAG_BOOLEAN = 1 << 1,
SEXP_FLAG_SINGLE = 1 << 2,
SEXP_FLAG_WILDCARD = 1 << 3,
SEXP_FLAG_REGEX = 1 << 4,
SEXP_FLAG_DO_REGEX = 1 << 5,
} _sexp_flag_t;
/*
@ -48,15 +50,15 @@ static _sexp_prefix_t prefixes[] =
{ "body", Xapian::Query::OP_AND, Xapian::Query::MatchAll,
SEXP_FLAG_FIELD },
{ "from", Xapian::Query::OP_AND, Xapian::Query::MatchAll,
SEXP_FLAG_FIELD | SEXP_FLAG_WILDCARD },
SEXP_FLAG_FIELD | SEXP_FLAG_WILDCARD | SEXP_FLAG_REGEX },
{ "folder", Xapian::Query::OP_OR, Xapian::Query::MatchNothing,
SEXP_FLAG_FIELD | SEXP_FLAG_BOOLEAN | SEXP_FLAG_WILDCARD },
SEXP_FLAG_FIELD | SEXP_FLAG_BOOLEAN | SEXP_FLAG_WILDCARD | SEXP_FLAG_REGEX },
{ "id", Xapian::Query::OP_OR, Xapian::Query::MatchNothing,
SEXP_FLAG_FIELD | SEXP_FLAG_BOOLEAN | SEXP_FLAG_WILDCARD },
SEXP_FLAG_FIELD | SEXP_FLAG_BOOLEAN | SEXP_FLAG_WILDCARD | SEXP_FLAG_REGEX },
{ "is", Xapian::Query::OP_AND, Xapian::Query::MatchAll,
SEXP_FLAG_FIELD | SEXP_FLAG_BOOLEAN | SEXP_FLAG_WILDCARD },
SEXP_FLAG_FIELD | SEXP_FLAG_BOOLEAN | SEXP_FLAG_WILDCARD | SEXP_FLAG_REGEX },
{ "mid", Xapian::Query::OP_OR, Xapian::Query::MatchNothing,
SEXP_FLAG_FIELD | SEXP_FLAG_BOOLEAN | SEXP_FLAG_WILDCARD },
SEXP_FLAG_FIELD | SEXP_FLAG_BOOLEAN | SEXP_FLAG_WILDCARD | SEXP_FLAG_REGEX },
{ "mimetype", Xapian::Query::OP_AND, Xapian::Query::MatchAll,
SEXP_FLAG_FIELD | SEXP_FLAG_WILDCARD },
{ "not", Xapian::Query::OP_AND_NOT, Xapian::Query::MatchAll,
@ -64,17 +66,21 @@ static _sexp_prefix_t prefixes[] =
{ "or", Xapian::Query::OP_OR, Xapian::Query::MatchNothing,
SEXP_FLAG_NONE },
{ "path", Xapian::Query::OP_OR, Xapian::Query::MatchNothing,
SEXP_FLAG_FIELD | SEXP_FLAG_BOOLEAN | SEXP_FLAG_WILDCARD },
SEXP_FLAG_FIELD | SEXP_FLAG_BOOLEAN | SEXP_FLAG_WILDCARD | SEXP_FLAG_REGEX },
{ "property", Xapian::Query::OP_AND, Xapian::Query::MatchAll,
SEXP_FLAG_FIELD | SEXP_FLAG_BOOLEAN | SEXP_FLAG_WILDCARD },
SEXP_FLAG_FIELD | SEXP_FLAG_BOOLEAN | SEXP_FLAG_WILDCARD | SEXP_FLAG_REGEX },
{ "regex", Xapian::Query::OP_INVALID, Xapian::Query::MatchAll,
SEXP_FLAG_SINGLE | SEXP_FLAG_DO_REGEX },
{ "rx", Xapian::Query::OP_INVALID, Xapian::Query::MatchAll,
SEXP_FLAG_SINGLE | SEXP_FLAG_DO_REGEX },
{ "starts-with", Xapian::Query::OP_WILDCARD, Xapian::Query::MatchAll,
SEXP_FLAG_SINGLE },
{ "subject", Xapian::Query::OP_AND, Xapian::Query::MatchAll,
SEXP_FLAG_FIELD | SEXP_FLAG_WILDCARD },
SEXP_FLAG_FIELD | SEXP_FLAG_WILDCARD | SEXP_FLAG_REGEX },
{ "tag", Xapian::Query::OP_AND, Xapian::Query::MatchAll,
SEXP_FLAG_FIELD | SEXP_FLAG_BOOLEAN | SEXP_FLAG_WILDCARD },
SEXP_FLAG_FIELD | SEXP_FLAG_BOOLEAN | SEXP_FLAG_WILDCARD | SEXP_FLAG_REGEX },
{ "thread", Xapian::Query::OP_OR, Xapian::Query::MatchNothing,
SEXP_FLAG_FIELD | SEXP_FLAG_BOOLEAN | SEXP_FLAG_WILDCARD },
SEXP_FLAG_FIELD | SEXP_FLAG_BOOLEAN | SEXP_FLAG_WILDCARD | SEXP_FLAG_REGEX },
{ "to", Xapian::Query::OP_AND, Xapian::Query::MatchAll,
SEXP_FLAG_FIELD | SEXP_FLAG_WILDCARD },
{ }
@ -180,6 +186,30 @@ _sexp_parse_one_term (notmuch_database_t *notmuch, std::string term_prefix, cons
}
}
/* Translate a '(regex ATOM)' / '(rx ATOM)' modifier into a Xapian query.
 *
 * prefix is the table entry for the modifier itself and is used only to
 * name it in diagnostics. parent is the table entry for the enclosing
 * field, or NULL when the modifier is not nested inside a field. val is
 * the regular expression text, and output receives the resulting query.
 *
 * Returns NOTMUCH_STATUS_BAD_QUERY_SYNTAX (after logging a message) when
 * there is no enclosing field or the enclosing field does not carry
 * SEXP_FLAG_REGEX; otherwise returns whatever _notmuch_regexp_to_query
 * returns.
 */
notmuch_status_t
_sexp_parse_regex (notmuch_database_t *notmuch,
                   const _sexp_prefix_t *prefix, const _sexp_prefix_t *parent,
                   std::string val, Xapian::Query &output)
{
    if (parent) {
        const bool regex_allowed = (parent->flags & SEXP_FLAG_REGEX) != 0;

        if (regex_allowed) {
            /* The helper reports a message through this string; it is
             * deliberately not consulted here. */
            std::string discarded_msg;

            return _notmuch_regexp_to_query (notmuch, Xapian::BAD_VALUENO,
                                             parent->name, val, output,
                                             discarded_msg);
        }

        /* Inside a field, but one whose table entry lacks SEXP_FLAG_REGEX. */
        _notmuch_database_log (notmuch, "'%s' not supported in field '%s'\n",
                               prefix->name, parent->name);
        return NOTMUCH_STATUS_BAD_QUERY_SYNTAX;
    }

    /* No enclosing field: there is no field name to match against. */
    _notmuch_database_log (notmuch, "illegal '%s' outside field\n",
                           prefix->name);
    return NOTMUCH_STATUS_BAD_QUERY_SYNTAX;
}
/* Here we expect the s-expression to be a proper list, with first
 * element defining an operation, or as a special case the empty
 * list */
@ -254,6 +284,10 @@ _sexp_to_xapian_query (notmuch_database_t *notmuch, const _sexp_prefix_t *parent
if (prefix->xapian_op == Xapian::Query::OP_WILDCARD)
return _sexp_parse_wildcard (notmuch, parent, sx->list->next->val, output);
if (prefix->flags & SEXP_FLAG_DO_REGEX) {
return _sexp_parse_regex (notmuch, prefix, parent, sx->list->next->val, output);
}
return _sexp_combine_query (notmuch, parent, prefix->xapian_op, prefix->initial,
sx->list->next, output);
}

View file

@ -565,4 +565,76 @@ output=$(notmuch search --query=sexp '(subject deleted)' | notmuch_search_saniti
test_expect_equal "$output" "thread:XXX 2001-01-05 [1/1] Notmuch Test Suite; Not deleted (inbox unread)
thread:XXX 2001-01-05 [2/2] Notmuch Test Suite; Deleted (deleted inbox unread)"
# Tests for the (regex ATOM) / (rx ATOM) modifier of the s-expression
# query syntax.
#
# Error case: a regex modifier that is not nested inside a field must be
# rejected. '>&' sends both stdout and stderr to OUTPUT so the error
# text can be compared.
test_begin_subtest "regex at top level"
notmuch search --query=sexp '(rx foo)' >& OUTPUT
cat <<EOF > EXPECTED
notmuch search: Syntax error in query
illegal 'rx' outside field
EOF
test_expect_equal_file EXPECTED OUTPUT
# Error case: the modifier is rejected inside 'body'; the expected
# message names both the modifier as written and the field.
test_begin_subtest "regex in illegal field"
notmuch search --query=sexp '(body (regex foo))' >& OUTPUT
cat <<EOF > EXPECTED
notmuch search: Syntax error in query
'regex' not supported in field 'body'
EOF
test_expect_equal_file EXPECTED OUTPUT
# Reference list of message ids, produced with the default (non-sexp)
# query syntax. It is reused as the expected output of the 'from' and
# 'id' subtests below.
notmuch search --output=messages from:cworth > cworth.msg-ids
test_begin_subtest "regexp 'from' search"
notmuch search --output=messages --query=sexp '(from (rx cworth))' > OUTPUT
test_expect_equal_file cworth.msg-ids OUTPUT
# From here on, most subtests build EXPECTED with a default-syntax query
# (often using the field:/regex/ form) and require the sexp query to
# produce identical output.
test_begin_subtest "regexp search for 'from' 2"
notmuch search from:/cworth@cworth.org/ and subject:patch | notmuch_search_sanitize > EXPECTED
notmuch search --query=sexp '(and (from (rx cworth@cworth.org)) (subject patch))' \
| notmuch_search_sanitize > OUTPUT
test_expect_equal_file EXPECTED OUTPUT
test_begin_subtest "regexp 'folder' search"
notmuch search 'folder:/^bar$/' | notmuch_search_sanitize > EXPECTED
notmuch search --query=sexp '(folder (rx ^bar$))' | notmuch_search_sanitize > OUTPUT
test_expect_equal_file EXPECTED OUTPUT
# Expects the message ids matching 'yoom' to be exactly the from:cworth
# list saved above -- presumably a property of the test corpus.
# NOTE(review): confirm against the corpus.
test_begin_subtest "regexp 'id' search"
notmuch search --output=messages --query=sexp '(id (rx yoom))' > OUTPUT
test_expect_equal_file cworth.msg-ids OUTPUT
# An unanchored 'i' is expected to select the same threads as the tags
# 'signed' and 'inbox' together.
test_begin_subtest "unanchored 'is' search"
notmuch search tag:signed or tag:inbox > EXPECTED
notmuch search --query=sexp '(is (rx i))' > OUTPUT
test_expect_equal_file EXPECTED OUTPUT
# Anchoring with '^si' is expected to narrow the match to 'signed' only.
test_begin_subtest "anchored 'is' search"
notmuch search tag:signed > EXPECTED
notmuch search --query=sexp '(is (rx ^si))' > OUTPUT
test_expect_equal_file EXPECTED OUTPUT
# Two regex modifiers in different fields combined under 'and'.
test_begin_subtest "combine regexp mid and subject"
notmuch search subject:/-C/ and mid:/y..m/ | notmuch_search_sanitize > EXPECTED
notmuch search --query=sexp '(and (subject (rx -C)) (mid (rx y..m)))' | notmuch_search_sanitize > OUTPUT
test_expect_equal_file EXPECTED OUTPUT
test_begin_subtest "regexp 'path' search"
notmuch search 'path:/^bar$/' | notmuch_search_sanitize > EXPECTED
notmuch search --query=sexp '(path (rx ^bar$))' | notmuch_search_sanitize > OUTPUT
test_expect_equal_file EXPECTED OUTPUT
# EXPECTED uses an exact property match; the regex 'foo=.*' is expected
# to select the same threads.
test_begin_subtest "regexp 'property' search"
notmuch search property:foo=bar > EXPECTED
notmuch search --query=sexp '(property (rx foo=.*))' > OUTPUT
test_expect_equal_file EXPECTED OUTPUT
# Same check as the anchored 'is' subtest, using the 'tag' field name.
test_begin_subtest "anchored 'tag' search"
notmuch search tag:signed > EXPECTED
notmuch search --query=sexp '(tag (rx ^si))' > OUTPUT
test_expect_equal_file EXPECTED OUTPUT
# EXPECTED is built by listing every thread and keeping, via grep, the
# output lines that end in '7'.
test_begin_subtest "regexp 'thread' search"
notmuch search --output=threads '*' | grep '7$' > EXPECTED
notmuch search --output=threads --query=sexp '(thread (rx 7$))' > OUTPUT
test_expect_equal_file EXPECTED OUTPUT
test_done