From 781125c9e92a2b9a2b9fbe54adec28ddb60f35b1 Mon Sep 17 00:00:00 2001 From: David Bremner Date: Mon, 25 Mar 2019 23:07:24 -0300 Subject: [PATCH] util: add unicode_word_utf8 This originally used Xapian::Unicode::is_wordchar, but that forces clients to link directly to libxapian, which seems like it might be busywork if nothing else. --- util/Makefile.local | 3 ++- util/unicode-util.c | 43 +++++++++++++++++++++++++++++++++++++++++++ util/unicode-util.h | 12 ++++++++++++ 3 files changed, 57 insertions(+), 1 deletion(-) create mode 100644 util/unicode-util.c create mode 100644 util/unicode-util.h diff --git a/util/Makefile.local b/util/Makefile.local index ba03230e..46f8af3a 100644 --- a/util/Makefile.local +++ b/util/Makefile.local @@ -5,7 +5,8 @@ extra_cflags += -I$(srcdir)/$(dir) libnotmuch_util_c_srcs := $(dir)/xutil.c $(dir)/error_util.c $(dir)/hex-escape.c \ $(dir)/string-util.c $(dir)/talloc-extra.c $(dir)/zlib-extra.c \ - $(dir)/util.c $(dir)/gmime-extra.c $(dir)/crypto.c + $(dir)/util.c $(dir)/gmime-extra.c $(dir)/crypto.c \ + $(dir)/unicode-util.c libnotmuch_util_modules := $(libnotmuch_util_c_srcs:.c=.o) diff --git a/util/unicode-util.c b/util/unicode-util.c new file mode 100644 index 00000000..312e900f --- /dev/null +++ b/util/unicode-util.c @@ -0,0 +1,43 @@ +#include "unicode-util.h" + +/* Based on Xapian::Unicode::is_wordchar, to avoid forcing clients to link directly to libxapian. 
+*/
+/* True if ch's category is a letter, mark, number or connector punctuation. */
+static bool
+unicode_is_wordchar (notmuch_unichar ch)
+{
+    switch (g_unichar_type (ch)) {
+    case G_UNICODE_UPPERCASE_LETTER:
+    case G_UNICODE_LOWERCASE_LETTER:
+    case G_UNICODE_TITLECASE_LETTER:
+    case G_UNICODE_MODIFIER_LETTER:
+    case G_UNICODE_OTHER_LETTER:
+    case G_UNICODE_NON_SPACING_MARK:
+    case G_UNICODE_ENCLOSING_MARK:
+    case G_UNICODE_SPACING_MARK:
+    case G_UNICODE_DECIMAL_NUMBER:
+    case G_UNICODE_LETTER_NUMBER:
+    case G_UNICODE_OTHER_NUMBER:
+    case G_UNICODE_CONNECT_PUNCTUATION:
+        return true;
+    default:
+        return false;
+    }
+}
+
+/* True iff utf8_str is valid UTF-8 made only of word characters ("" counts). */
+bool
+unicode_word_utf8 (const char *utf8_str)
+{
+    const char *p;
+
+    /* Validate first: g_utf8_get_char assumes well-formed input. */
+    if (! utf8_str || ! g_utf8_validate (utf8_str, -1, NULL))
+        return false;
+    for (p = utf8_str; *p; p = g_utf8_next_char (p)) {
+        if (! unicode_is_wordchar (g_utf8_get_char (p)))
+            return false;
+    }
+    return true;
+}
diff --git a/util/unicode-util.h b/util/unicode-util.h
new file mode 100644
index 00000000..32d1e6ef
--- /dev/null
+++ b/util/unicode-util.h
@@ -0,0 +1,12 @@
+#ifndef UNICODE_UTIL_H
+#define UNICODE_UTIL_H
+
+#include <stdbool.h>
+#include <glib.h>
+
+/* True if the UTF-8 string str would tokenize as a single word, according
+ * to xapian; false for NULL or malformed UTF-8. */
+bool unicode_word_utf8 (const char *str);
+typedef gunichar notmuch_unichar;
+
+#endif