util: add unicode_word_utf8

This originally used Xapian::Unicode::is_wordchar, but that forces clients to link directly to libxapian, which seems like it might be busywork if nothing else.
2024-11-22 02:48:08 +01:00 · 2019-03-25 23:07:24 -03:00 · 2019-03-25 23:07:24 -03:00 · 781125c9e9
commit 781125c9e9
parent 46ab6013a2
3 changed files with 57 additions and 1 deletions
--- a/util/Makefile.local
+++ b/util/Makefile.local
@ -5,7 +5,8 @@ extra_cflags += -I$(srcdir)/$(dir)
 libnotmuch_util_c_srcs := $(dir)/xutil.c $(dir)/error_util.c $(dir)/hex-escape.c \
 		  $(dir)/string-util.c $(dir)/talloc-extra.c $(dir)/zlib-extra.c \
-		$(dir)/util.c $(dir)/gmime-extra.c $(dir)/crypto.c
+		$(dir)/util.c $(dir)/gmime-extra.c $(dir)/crypto.c \
 		$(dir)/unicode-util.c
 libnotmuch_util_modules := $(libnotmuch_util_c_srcs:.c=.o)
--- a/util/unicode-util.c
+++ b/util/unicode-util.c
@ -0,0 +1,43 @@
 #include "unicode-util.h"
 /* Decide whether a single code point counts as a word character.
  * Based on Xapian::Unicode::is_wordchar, reimplemented on top of
  * GLib's general-category lookup so that clients are not forced to
  * link directly to libxapian.  Letters, marks, numbers and connector
  * punctuation are word characters; every other category is not.
  */
 static bool
 unicode_is_wordchar (notmuch_unichar ch)
 {
    bool is_word = false;
    switch (g_unichar_type (ch)) {
    /* Letters */
    case G_UNICODE_LOWERCASE_LETTER:
    case G_UNICODE_UPPERCASE_LETTER:
    case G_UNICODE_TITLECASE_LETTER:
    case G_UNICODE_MODIFIER_LETTER:
    case G_UNICODE_OTHER_LETTER:
    /* Marks */
    case G_UNICODE_NON_SPACING_MARK:
    case G_UNICODE_SPACING_MARK:
    case G_UNICODE_ENCLOSING_MARK:
    /* Numbers */
    case G_UNICODE_DECIMAL_NUMBER:
    case G_UNICODE_LETTER_NUMBER:
    case G_UNICODE_OTHER_NUMBER:
    /* Connector punctuation, e.g. '_' */
    case G_UNICODE_CONNECT_PUNCTUATION:
 	is_word = true;
 	break;
    default:
 	break;
    }
    return is_word;
 }
 /* Return true if every character of the NUL-terminated UTF-8 string
  * utf8_str is a word character (see unicode_is_wordchar), i.e. the
  * string would tokenize as a single word.  An empty string yields
  * true.  A NULL pointer or a string that is not valid UTF-8 yields
  * false.
  */
 bool
 unicode_word_utf8 (const char *utf8_str)
 {
    const char *p;
    /* The unchecked GLib decoding helpers assume well-formed input; a
     * truncated multi-byte sequence at the end of the string could
     * otherwise make them step past the terminating NUL.  Validate
     * once up front so the walk below is safe. */
    if (utf8_str == NULL || ! g_utf8_validate (utf8_str, -1, NULL))
 	return false;
    /* Walk the string in place, one code point at a time; no
     * temporary UCS-4 copy is needed. */
    for (p = utf8_str; *p != '\0'; p = g_utf8_next_char (p)) {
 	if (! unicode_is_wordchar (g_utf8_get_char (p)))
 	    return false;
    }
    return true;
 }
--- a/util/unicode-util.h
+++ b/util/unicode-util.h
@ -0,0 +1,12 @@
 #ifndef UNICODE_UTIL_H
 #define UNICODE_UTIL_H
 #include <stdbool.h>
 #include <gmodule.h>
 /* Return true if the UTF-8 encoded string str would tokenize as a
  * single word according to Xapian, i.e. every character in it is a
  * word character (letter, mark, number or connector punctuation).
  * An empty string yields true. */
 bool unicode_word_utf8 (const char *str);
 /* Unicode code point type used by these utilities; an alias for
  * GLib's gunichar. */
 typedef gunichar notmuch_unichar;
 #endif