notmuch/lib/regexp-fields.cc
David Bremner c62c22c9fb lib: drop trailing slash for path and folder searches (infix)
This resolves an old bug reported by David Edmondson in 2014. The fix
is only needed for the "boolean" case, as probabilistic / phrase
searching already ignores punctuation.

This fix is only for the infix (xapian provided) query parser.

[1]: id:cunoasuolcv.fsf@gargravarr.hh.sledj.net
2022-01-27 07:48:27 -04:00

250 lines
6.4 KiB
C++

/* regexp-fields.cc - field processor glue for regex supporting fields
*
* This file is part of notmuch.
*
* Copyright © 2015 Austin Clements
* Copyright © 2016 David Bremner
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see https://www.gnu.org/licenses/ .
*
* Author: Austin Clements <aclements@csail.mit.edu>
* David Bremner <david@tethera.net>
*/
#include "regexp-fields.h"
#include "notmuch-private.h"
#include "database-private.h"
/*
 * Compile STR into REGEXP as a POSIX extended regular expression with
 * REG_NOSUB (match/no-match only, no sub-expression offsets).
 *
 * Returns NOTMUCH_STATUS_SUCCESS when regcomp succeeds.  On failure
 * MSG is set to "Regexp error: " followed by the text produced by
 * regerror, and NOTMUCH_STATUS_ILLEGAL_ARGUMENT is returned.  MSG is
 * left untouched on success.
 */
notmuch_status_t
compile_regex (regex_t &regexp, const char *str, std::string &msg)
{
    const int err = regcomp (&regexp, str, REG_EXTENDED | REG_NOSUB);

    if (err == 0)
	return NOTMUCH_STATUS_SUCCESS;

    /* Called with a zero-sized buffer, regerror only reports how much
     * space the message needs; that size counts the terminating NUL. */
    const size_t len = regerror (err, &regexp, NULL, 0);
    std::string buffer (len, '\0');

    if (len > 0)
	(void) regerror (err, &regexp, &buffer[0], len);

    msg = "Regexp error: ";
    /* Append as a C string so the terminating NUL written by regerror
     * does not end up embedded in msg. */
    msg.append (buffer.c_str ());

    return NOTMUCH_STATUS_ILLEGAL_ARGUMENT;
}
/*
 * Build a posting source over value slot SLOT, compiling REGEXP into
 * the regexp_ member via compile_regex.
 *
 * Throws Xapian::QueryParserError carrying compile_regex's message
 * when the pattern does not compile.
 */
RegexpPostingSource::RegexpPostingSource (Xapian::valueno slot, const std::string &regexp)
    : slot_ (slot)
{
    std::string error_text;

    if (compile_regex (regexp_, regexp.c_str (), error_text))
	throw Xapian::QueryParserError (error_text);
}
/* Release the compiled pattern held in regexp_. */
RegexpPostingSource::~RegexpPostingSource ()
{
    regex_t *compiled = &regexp_;

    regfree (compiled);
}
/*
 * Attach this source to DB: keep a copy of the database handle in
 * db_, record the begin/end iterators of the value stream for slot_,
 * and clear started_ so the next call to next() tests the first entry
 * instead of stepping past it.
 */
void
RegexpPostingSource::init (const Xapian::Database &db)
{
    started_ = false;
    db_ = db;
    end_ = db.valuestream_end (slot_);
    it_ = db_.valuestream_begin (slot_);
}
/* Lower bound on the number of matching documents: always 0. */
Xapian::doccount
RegexpPostingSource::get_termfreq_min () const
{
    const Xapian::doccount lower_bound = 0;

    return lower_bound;
}
/* Estimated number of matching documents: half of get_termfreq_max(). */
Xapian::doccount
RegexpPostingSource::get_termfreq_est () const
{
    const Xapian::doccount upper_bound = get_termfreq_max ();

    return upper_bound / 2;
}
/*
 * Upper bound on the number of matching documents: whatever the
 * database reports from get_value_freq for slot_.
 */
Xapian::doccount
RegexpPostingSource::get_termfreq_max () const
{
    const Xapian::doccount with_value = db_.get_value_freq (slot_);

    return with_value;
}
/* Document id at the current position of the value-stream iterator. */
Xapian::docid
RegexpPostingSource::get_docid () const
{
    const Xapian::docid current = it_.get_docid ();

    return current;
}
/* True once the value-stream iterator has reached the recorded end. */
bool
RegexpPostingSource::at_end () const
{
    const bool exhausted = (it_ == end_);

    return exhausted;
}
/*
 * Move to the next entry in the value stream whose value matches
 * regexp_, or to the end if there is none.
 *
 * On the first call after init() the current entry is tested as-is;
 * on later calls the iterator is stepped once before scanning so the
 * previously returned entry is not reported again.  The weight
 * argument is ignored.
 */
void
RegexpPostingSource::next (unused (double min_wt))
{
    const bool step_first = started_ && ! at_end ();

    started_ = true;
    if (step_first)
	++it_;

    while (! at_end ()) {
	const std::string candidate = *it_;

	if (regexec (&regexp_, candidate.c_str (), 0, NULL, 0) == 0)
	    return;
	++it_;
    }
}
/*
 * Reposition the value-stream iterator with skip_to (DID), then scan
 * forward from there to the first entry whose value matches regexp_
 * (or to the end if none does).  The weight argument is ignored.
 */
void
RegexpPostingSource::skip_to (Xapian::docid did, unused (double min_wt))
{
    started_ = true;
    it_.skip_to (did);

    while (! at_end ()) {
	const std::string candidate = *it_;

	if (regexec (&regexp_, candidate.c_str (), 0, NULL, 0) == 0)
	    return;
	++it_;
    }
}
/*
 * Test document DID without scanning ahead.
 *
 * Returns false when the iterator's own check (DID) fails or the
 * iterator is at the end afterwards; otherwise returns whether the
 * value at the iterator's position matches regexp_.  The weight
 * argument is ignored.
 */
bool
RegexpPostingSource::check (Xapian::docid did, unused (double min_wt))
{
    started_ = true;

    if (! it_.check (did))
	return false;
    if (at_end ())
	return false;

    const std::string candidate = *it_;

    return regexec (&regexp_, candidate.c_str (), 0, NULL, 0) == 0;
}
/*
 * Map a field name to the value slot used for it: "from", "subject"
 * and "mid" have dedicated slots; any other name yields
 * Xapian::BAD_VALUENO.
 */
static inline Xapian::valueno
_find_slot (std::string prefix)
{
    if (prefix == "from")
	return NOTMUCH_VALUE_FROM;

    if (prefix == "subject")
	return NOTMUCH_VALUE_SUBJECT;

    if (prefix == "mid")
	return NOTMUCH_VALUE_MESSAGE_ID;

    return Xapian::BAD_VALUENO;
}
/*
 * Set up a field processor for FIELD_.
 *
 * The value slot comes from _find_slot (Xapian::BAD_VALUENO when the
 * field has no dedicated slot) and the term prefix from _find_prefix.
 * The option flags, the query parser reference and the database
 * pointer are stored as given.
 */
RegexpFieldProcessor::RegexpFieldProcessor (std::string field_,
					    notmuch_field_flag_t options_,
					    Xapian::QueryParser &parser_,
					    notmuch_database_t *notmuch_)
    : slot (_find_slot (field_)),
      field (field_),
      term_prefix (_find_prefix (field_.c_str ())),
      options (options_),
      parser (parser_),
      notmuch (notmuch_)
{
}
/*
 * Build in OUTPUT a query matching REGEXP_STR against FIELD.
 *
 * REGEXP_STR is compiled first; if that fails the error is appended to
 * the database log, MSG holds the text from compile_regex, and the
 * failing status is returned with OUTPUT untouched.
 *
 * If SLOT is Xapian::BAD_VALUENO it is looked up from FIELD.  When
 * there is still no slot, every term in the database starting with
 * FIELD's term prefix is tested (prefix stripped) against the pattern
 * and the matching terms are OR-ed together.  Otherwise the query
 * wraps a RegexpPostingSource over the slot.
 *
 * Returns NOTMUCH_STATUS_SUCCESS on success.
 */
notmuch_status_t
_notmuch_regexp_to_query (notmuch_database_t *notmuch, Xapian::valueno slot, std::string field,
			  std::string regexp_str,
			  Xapian::Query &output, std::string &msg)
{
    /* Frees a successfully compiled regex_t on every exit path,
     * including exceptions thrown while iterating terms. */
    struct RegexFreer {
	explicit RegexFreer (regex_t *re) : re_ (re)
	{
	}
	~RegexFreer ()
	{
	    regfree (re_);
	}
    private:
	regex_t *re_;
	RegexFreer (const RegexFreer &);
	RegexFreer &operator= (const RegexFreer &);
    };

    regex_t regexp;
    notmuch_status_t status;

    status = compile_regex (regexp, regexp_str.c_str (), msg);
    if (status) {
	_notmuch_database_log_append (notmuch, "error compiling regex %s", msg.c_str ());
	return status;
    }

    /* Only install the guard after a successful compile: the contents
     * of regexp are not meaningful when regcomp fails. */
    RegexFreer regexp_guard (&regexp);

    if (slot == Xapian::BAD_VALUENO)
	slot = _find_slot (field);

    if (slot == Xapian::BAD_VALUENO) {
	const std::string term_prefix = _find_prefix (field.c_str ());
	const size_t prefix_len = term_prefix.size ();
	std::vector<std::string> terms;

	for (Xapian::TermIterator it = notmuch->xapian_db->allterms_begin (term_prefix);
	     it != notmuch->xapian_db->allterms_end (); ++it) {
	    /* Match against the term text that follows the prefix. */
	    if (regexec (&regexp, (*it).c_str () + prefix_len,
			 0, NULL, 0) == 0)
		terms.push_back (*it);
	}
	output = Xapian::Query (Xapian::Query::OP_OR, terms.begin (), terms.end ());
    } else {
	/* The posting source compiles its own copy of the pattern;
	 * the one compiled above only served to validate it. */
	RegexpPostingSource *postings = new RegexpPostingSource (slot, regexp_str);
	output = Xapian::Query (postings->release ());
    }
    return NOTMUCH_STATUS_SUCCESS;
}
/*
 * Turn the text following this field's prefix into a query.
 *
 *  - Empty text: for a probabilistic field, everything except
 *    documents having any term with the prefix; otherwise the bare
 *    term prefix.
 *  - Text starting with '/': must also end with '/' (and be longer
 *    than one character); the text between the delimiters is handed to
 *    _notmuch_regexp_to_query.  An unmatched delimiter or a regex
 *    error throws Xapian::QueryParserError.
 *  - Anything else on a probabilistic field: parsed by the query
 *    parser, quoted as a phrase unless it ends in '*' and contains no
 *    space.
 *  - Anything else on a boolean field: a single term made of the
 *    prefix plus the text, with one trailing '/' dropped when the text
 *    is longer than one character.
 */
Xapian::Query
RegexpFieldProcessor::operator() (const std::string & str)
{
    const bool probabilistic = (options & NOTMUCH_FIELD_PROBABILISTIC) != 0;

    if (str.empty ()) {
	if (! probabilistic)
	    return Xapian::Query (term_prefix);

	return Xapian::Query (Xapian::Query::OP_AND_NOT,
			      Xapian::Query::MatchAll,
			      Xapian::Query (Xapian::Query::OP_WILDCARD, term_prefix));
    }

    const bool trailing_slash = str.length () > 1 && str[str.size () - 1] == '/';

    if (str[0] == '/') {
	if (! trailing_slash)
	    throw Xapian::QueryParserError ("unmatched regex delimiter in '" + str + "'");

	/* Strip the leading and trailing delimiters. */
	const std::string pattern = str.substr (1, str.size () - 2);
	Xapian::Query result;
	std::string error_text;

	if (_notmuch_regexp_to_query (notmuch, slot, field, pattern, result, error_text))
	    throw Xapian::QueryParserError (error_text);

	return result;
    }

    if (probabilistic) {
	/* TODO replace this with a nicer API level triggering of
	 * phrase parsing, when possible */
	const bool bare_wildcard = *str.rbegin () == '*' &&
				   str.find (' ') == std::string::npos;
	const std::string phrase = bare_wildcard ? str : '"' + str + '"';

	return parser.parse_query (phrase, NOTMUCH_QUERY_PARSER_FLAGS, term_prefix);
    }

    /* Boolean prefix */
    const std::string value = trailing_slash ? str.substr (0, str.size () - 1) : str;

    return Xapian::Query (term_prefix + value);
}