mirror of
https://git.notmuchmail.org/git/notmuch
synced 2024-11-25 04:18:08 +01:00
Remove test programs, xapian-dump and notmuch-index-message
These were just little tests while getting comfortable with GMime and xapian. I'll likely use pieces of these as notmuch continues, but for now let's not distract anyone looking at notmuch with these. And the code will live on in the history if I need to look at it.
This commit is contained in:
parent
2269106466
commit
fedef062ce
4 changed files with 1 additions and 1055 deletions
2
.gitignore
vendored
2
.gitignore
vendored
|
@ -1,4 +1,2 @@
|
|||
xapian-dump
|
||||
notmuch-index-message
|
||||
notmuch
|
||||
|
||||
|
|
8
Makefile
8
Makefile
|
@ -1,4 +1,4 @@
|
|||
PROGS=notmuch notmuch-index-message xapian-dump
|
||||
PROGS=notmuch
|
||||
|
||||
MYCFLAGS=-Wall -O0 -g `pkg-config --cflags glib-2.0`
|
||||
MYCXXFLAGS=$(MYCFLAGS) `xapian-config --cxxflags`
|
||||
|
@ -16,11 +16,5 @@ all: $(PROGS)
|
|||
notmuch: notmuch.o database.o date.o message.o xutil.o
|
||||
$(CC) $(MYLDFLAGS) $^ -o $@
|
||||
|
||||
notmuch-index-message: notmuch-index-message.cc
|
||||
$(CXX) $(CXXFLAGS) $(MYCXXFLAGS) notmuch-index-message.cc `pkg-config --cflags --libs gmime-2.4` `xapian-config --cxxflags --libs` -o notmuch-index-message
|
||||
|
||||
xapian-dump: xapian-dump.cc
|
||||
$(CXX) $(CXXFLAGS) $(MYCXXFLAGS) xapian-dump.cc `xapian-config --libs --cxxflags` -o xapian-dump
|
||||
|
||||
clean:
|
||||
rm -f $(PROGS) *.o
|
||||
|
|
|
@ -1,842 +0,0 @@
|
|||
/*
|
||||
* Copyright © 2009 Carl Worth
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see http://www.gnu.org/licenses/ .
|
||||
*
|
||||
* Author: Carl Worth <cworth@cworth.org>
|
||||
*/
|
||||
|
||||
/* This indexer creates a Xapian mail index that is remarkably similar
|
||||
* to that created by sup. The big difference, (and the thing that
|
||||
* will keep a notmuch index from being used by sup directly), is that
|
||||
* sup expects a serialized ruby data structure in the document's data
|
||||
* field, but notmuch just puts the mail's filename there (trusting
|
||||
* that the email client can get the data in needs from the filename).
|
||||
*
|
||||
* Note: One bug here is that sup actually merges together fields such
|
||||
* as To, CC, Bcc etc. when finding multiple emails with the same
|
||||
* message ID. To support something similar, notmuch should list
|
||||
* multiple files in the data field.
|
||||
*
|
||||
* Other differences between sup and notmuch-index identified so far:
|
||||
*
|
||||
* o sup supports encrypted mime parts by prompting for a passphrase
|
||||
* to decrypt the message. So far, notmuch doesn't support this,
|
||||
* both because I'm lazy to code it, and I also think doing so
|
||||
* would present a security leak.
|
||||
*
|
||||
* o sup and notmuch have different heuristics for identifying (and
|
||||
* thus ignoring) signatures. For example, sup considers a line
|
||||
* consisting of two hypens as a signature separator, while
|
||||
* notmuch expects those two hyphens to be followed by a space
|
||||
* character.
|
||||
*
|
||||
* o sup as been seen to split some numbers before indexing
|
||||
* them. For example, the number 1754 in an email message was
|
||||
* indexed by sup as separate terms 17 and 54. I couldn't find any
|
||||
* explanation for this behavior and did not try to replicate it
|
||||
* in notmuch.
|
||||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <errno.h>
|
||||
#include <time.h>
|
||||
#include <sys/time.h>
|
||||
|
||||
#include <iostream>
|
||||
|
||||
#include <gmime/gmime.h>
|
||||
|
||||
#include <xapian.h>
|
||||
|
||||
using namespace std;
|
||||
|
||||
#define ARRAY_SIZE(arr) (sizeof (arr) / sizeof (arr[0]))
|
||||
|
||||
/* Xapian complains if we provide a term longer than this. */
|
||||
#define NOTMUCH_MAX_TERM 245
|
||||
|
||||
/* These prefix values are specifically chosen to be compatible
|
||||
* with sup, (http://sup.rubyforge.org), written by
|
||||
* William Morgan <wmorgan-sup@masanjin.net>, and released
|
||||
* under the GNU GPL v2.
|
||||
*/
|
||||
|
||||
typedef struct {
|
||||
const char *name;
|
||||
const char *prefix;
|
||||
} prefix_t;
|
||||
|
||||
prefix_t NORMAL_PREFIX[] = {
|
||||
{ "subject", "S" },
|
||||
{ "body", "B" },
|
||||
{ "from_name", "FN" },
|
||||
{ "to_name", "TN" },
|
||||
{ "name", "N" },
|
||||
{ "attachment", "A" }
|
||||
};
|
||||
|
||||
prefix_t BOOLEAN_PREFIX[] = {
|
||||
{ "type", "K" },
|
||||
{ "from_email", "FE" },
|
||||
{ "to_email", "TE" },
|
||||
{ "email", "E" },
|
||||
{ "date", "D" },
|
||||
{ "label", "L" },
|
||||
{ "source_id", "I" },
|
||||
{ "attachment_extension", "O" },
|
||||
{ "msgid", "Q" },
|
||||
{ "thread", "H" },
|
||||
{ "ref", "R" }
|
||||
};
|
||||
|
||||
/* Similarly, these value numbers are also chosen to be sup
|
||||
* compatible. */
|
||||
|
||||
typedef enum {
|
||||
NOTMUCH_VALUE_MESSAGE_ID = 0,
|
||||
NOTMUCH_VALUE_THREAD = 1,
|
||||
NOTMUCH_VALUE_DATE = 2
|
||||
} notmuch_value_t;
|
||||
|
||||
static const char *
|
||||
find_prefix (const char *name)
|
||||
{
|
||||
unsigned int i;
|
||||
|
||||
for (i = 0; i < ARRAY_SIZE (NORMAL_PREFIX); i++)
|
||||
if (strcmp (name, NORMAL_PREFIX[i].name) == 0)
|
||||
return NORMAL_PREFIX[i].prefix;
|
||||
|
||||
for (i = 0; i < ARRAY_SIZE (BOOLEAN_PREFIX); i++)
|
||||
if (strcmp (name, BOOLEAN_PREFIX[i].name) == 0)
|
||||
return BOOLEAN_PREFIX[i].prefix;
|
||||
|
||||
return "";
|
||||
}
|
||||
|
||||
/* "128 bits of thread-id ought to be enough for anybody" */
|
||||
#define NOTMUCH_THREAD_ID_BITS 128
|
||||
#define NOTMUCH_THREAD_ID_DIGITS (NOTMUCH_THREAD_ID_BITS / 4)
|
||||
typedef struct _thread_id {
|
||||
char str[NOTMUCH_THREAD_ID_DIGITS + 1];
|
||||
} thread_id_t;
|
||||
|
||||
static void
|
||||
thread_id_generate (thread_id_t *thread_id)
|
||||
{
|
||||
FILE *urandom;
|
||||
uint32_t value;
|
||||
char *s;
|
||||
int i;
|
||||
|
||||
urandom = fopen ("/dev/urandom", "r");
|
||||
if (urandom == NULL) {
|
||||
fprintf (stderr, "Error opening /dev/urandom: %s\n",
|
||||
strerror (errno));
|
||||
fprintf (stderr, "Perhaps notmuch needs some portability fixes for your platform?\n");
|
||||
exit (1);
|
||||
}
|
||||
|
||||
s = thread_id->str;
|
||||
for (i = 0; i < NOTMUCH_THREAD_ID_DIGITS; i += 8) {
|
||||
fread ((void *) &value, sizeof (value), 1, urandom);
|
||||
sprintf (s, "%08x", value);
|
||||
s += 8;
|
||||
}
|
||||
|
||||
fclose (urandom);
|
||||
|
||||
printf ("Generated thread id: %s\n", thread_id->str);
|
||||
}
|
||||
|
||||
static void
|
||||
add_term (Xapian::Document doc,
|
||||
const char *prefix_name,
|
||||
const char *value)
|
||||
{
|
||||
const char *prefix;
|
||||
char *term;
|
||||
|
||||
if (value == NULL)
|
||||
return;
|
||||
|
||||
prefix = find_prefix (prefix_name);
|
||||
|
||||
term = g_strdup_printf ("%s%s", prefix, value);
|
||||
|
||||
if (strlen (term) <= NOTMUCH_MAX_TERM)
|
||||
doc.add_term (term);
|
||||
|
||||
g_free (term);
|
||||
}
|
||||
|
||||
static void
|
||||
gen_terms (Xapian::TermGenerator term_gen,
|
||||
const char *prefix_name,
|
||||
const char *text)
|
||||
{
|
||||
const char *prefix;
|
||||
|
||||
if (text == NULL)
|
||||
return;
|
||||
|
||||
prefix = find_prefix (prefix_name);
|
||||
|
||||
term_gen.index_text (text, 1, prefix);
|
||||
}
|
||||
|
||||
static void
|
||||
gen_terms_address_name (Xapian::TermGenerator term_gen,
|
||||
InternetAddress *address,
|
||||
const char *prefix_name)
|
||||
{
|
||||
if (INTERNET_ADDRESS_IS_MAILBOX(address)) {
|
||||
const char *name;
|
||||
int own_name = 0;
|
||||
|
||||
name = internet_address_get_name (address);
|
||||
|
||||
/* In the absence of a name, we'll strip the part before the @
|
||||
* from the address. */
|
||||
if (! name) {
|
||||
InternetAddressMailbox *mailbox = INTERNET_ADDRESS_MAILBOX (address);
|
||||
const char *addr = internet_address_mailbox_get_addr (mailbox);
|
||||
const char *at;
|
||||
|
||||
at = strchr (addr, '@');
|
||||
if (at) {
|
||||
name = strndup (addr, at - addr);
|
||||
own_name = 1;
|
||||
}
|
||||
}
|
||||
|
||||
if (name)
|
||||
gen_terms (term_gen, prefix_name, name);
|
||||
|
||||
if (own_name)
|
||||
free ((void *) name);
|
||||
} else if (INTERNET_ADDRESS_IS_GROUP (address)) {
|
||||
InternetAddressGroup *group = INTERNET_ADDRESS_GROUP (address);
|
||||
InternetAddressList *list = internet_address_group_get_members(group);
|
||||
if (list) {
|
||||
int length = internet_address_list_length(list);
|
||||
int i;
|
||||
|
||||
for (i = 0; i < length; i++)
|
||||
gen_terms_address_name(term_gen,
|
||||
internet_address_list_get_address(list, i),
|
||||
prefix_name);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
gen_terms_address_names (Xapian::TermGenerator term_gen,
|
||||
InternetAddressList *addresses,
|
||||
const char *address_type)
|
||||
{
|
||||
int i;
|
||||
InternetAddress *address;
|
||||
|
||||
if (addresses == NULL)
|
||||
return;
|
||||
|
||||
for (i = 0; i < internet_address_list_length (addresses); i++) {
|
||||
address = internet_address_list_get_address (addresses, i);
|
||||
gen_terms_address_name (term_gen, address, address_type);
|
||||
gen_terms_address_name (term_gen, address, "name");
|
||||
gen_terms_address_name (term_gen, address, "body");
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
add_term_address_addr (Xapian::Document doc,
|
||||
InternetAddress *address,
|
||||
const char *prefix_name)
|
||||
{
|
||||
if (INTERNET_ADDRESS_IS_MAILBOX(address)) {
|
||||
InternetAddressMailbox *mailbox = INTERNET_ADDRESS_MAILBOX (address);
|
||||
const char *addr;
|
||||
|
||||
addr = internet_address_mailbox_get_addr (mailbox);
|
||||
|
||||
if (addr)
|
||||
add_term (doc, prefix_name, addr);
|
||||
} else if (INTERNET_ADDRESS_IS_GROUP (address)) {
|
||||
InternetAddressGroup *group = INTERNET_ADDRESS_GROUP (address);
|
||||
InternetAddressList *list = internet_address_group_get_members(group);
|
||||
if (list) {
|
||||
int length = internet_address_list_length(list);
|
||||
int i;
|
||||
|
||||
for (i = 0; i < length; i++)
|
||||
add_term_address_addr(doc,
|
||||
internet_address_list_get_address(list, i),
|
||||
prefix_name);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
add_terms_address_addrs (Xapian::Document doc,
|
||||
InternetAddressList *addresses,
|
||||
const char *address_type)
|
||||
{
|
||||
int i;
|
||||
InternetAddress *address;
|
||||
|
||||
if (addresses == NULL)
|
||||
return;
|
||||
|
||||
for (i = 0; i < internet_address_list_length (addresses); i++) {
|
||||
address = internet_address_list_get_address (addresses, i);
|
||||
add_term_address_addr (doc, address, address_type);
|
||||
add_term_address_addr (doc, address, "email");
|
||||
}
|
||||
}
|
||||
|
||||
static const char *
|
||||
skip_re_in_subject (const char *subject)
|
||||
{
|
||||
const char *s = subject;
|
||||
|
||||
if (subject == NULL)
|
||||
return NULL;
|
||||
|
||||
while (*s) {
|
||||
while (*s && isspace (*s))
|
||||
s++;
|
||||
if (strncasecmp (s, "re:", 3) == 0)
|
||||
s += 3;
|
||||
else
|
||||
break;
|
||||
}
|
||||
|
||||
return s;
|
||||
}
|
||||
|
||||
static void
|
||||
find_messages_by_term (Xapian::Database db,
|
||||
const char *prefix_name,
|
||||
const char *value,
|
||||
Xapian::PostingIterator *begin,
|
||||
Xapian::PostingIterator *end)
|
||||
{
|
||||
Xapian::PostingIterator i;
|
||||
char *term;
|
||||
|
||||
term = g_strdup_printf ("%s%s", find_prefix (prefix_name), value);
|
||||
|
||||
*begin = db.postlist_begin (term);
|
||||
|
||||
if (end)
|
||||
*end = db.postlist_end (term);
|
||||
|
||||
free (term);
|
||||
}
|
||||
|
||||
Xapian::Document
|
||||
find_message_by_docid (Xapian::Database db, Xapian::docid docid)
|
||||
{
|
||||
return db.get_document (docid);
|
||||
}
|
||||
|
||||
Xapian::Document
|
||||
find_message_by_message_id (Xapian::Database db, const char *message_id)
|
||||
{
|
||||
Xapian::PostingIterator i, end;
|
||||
|
||||
find_messages_by_term (db, "msgid", message_id, &i, &end);
|
||||
|
||||
if (i != end)
|
||||
return find_message_by_docid (db, *i);
|
||||
else
|
||||
return Xapian::Document ();
|
||||
}
|
||||
|
||||
static void
|
||||
insert_thread_id (GHashTable *thread_ids, Xapian::Document doc)
|
||||
{
|
||||
string value_string;
|
||||
const char *value, *id, *comma;
|
||||
|
||||
value_string = doc.get_value (NOTMUCH_VALUE_THREAD);
|
||||
value = value_string.c_str();
|
||||
if (strlen (value)) {
|
||||
id = value;
|
||||
while (*id) {
|
||||
comma = strchr (id, ',');
|
||||
if (comma == NULL)
|
||||
comma = id + strlen (id);
|
||||
g_hash_table_insert (thread_ids,
|
||||
strndup (id, comma - id), NULL);
|
||||
id = comma;
|
||||
if (*id)
|
||||
id++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Return one or more thread_ids, (as a GPtrArray of strings), for the
|
||||
* given message based on looking into the database for any messages
|
||||
* referenced in parents, and also for any messages in the database
|
||||
* referencing message_id.
|
||||
*
|
||||
* Caller should free all strings in the array and the array itself,
|
||||
* (g_ptr_array_free) when done. */
|
||||
static GPtrArray *
|
||||
find_thread_ids (Xapian::Database db,
|
||||
GPtrArray *parents,
|
||||
const char *message_id)
|
||||
{
|
||||
Xapian::PostingIterator child, children_end;
|
||||
Xapian::Document doc;
|
||||
GHashTable *thread_ids;
|
||||
GList *keys, *l;
|
||||
unsigned int i;
|
||||
const char *parent_message_id;
|
||||
GPtrArray *result;
|
||||
|
||||
thread_ids = g_hash_table_new_full (g_str_hash, g_str_equal,
|
||||
free, NULL);
|
||||
|
||||
find_messages_by_term (db, "ref", message_id, &child, &children_end);
|
||||
for ( ; child != children_end; child++) {
|
||||
doc = find_message_by_docid (db, *child);
|
||||
insert_thread_id (thread_ids, doc);
|
||||
}
|
||||
|
||||
for (i = 0; i < parents->len; i++) {
|
||||
parent_message_id = (char *) g_ptr_array_index (parents, i);
|
||||
doc = find_message_by_message_id (db, parent_message_id);
|
||||
insert_thread_id (thread_ids, doc);
|
||||
}
|
||||
|
||||
result = g_ptr_array_new ();
|
||||
|
||||
keys = g_hash_table_get_keys (thread_ids);
|
||||
for (l = keys; l; l = l->next) {
|
||||
char *id = (char *) l->data;
|
||||
g_ptr_array_add (result, id);
|
||||
}
|
||||
g_list_free (keys);
|
||||
|
||||
/* We're done with the hash table, but we've taken the pointers to
|
||||
* the allocated strings and put them into our result array, so
|
||||
* tell the hash not to free them on its way out. */
|
||||
g_hash_table_steal_all (thread_ids);
|
||||
g_hash_table_unref (thread_ids);
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
/* Add a term for each message-id in the References header of the
|
||||
* message. */
|
||||
static void
|
||||
parse_references (GPtrArray *array,
|
||||
const char *refs_str)
|
||||
{
|
||||
GMimeReferences *refs, *r;
|
||||
const char *message_id;
|
||||
|
||||
if (refs_str == NULL)
|
||||
return;
|
||||
|
||||
refs = g_mime_references_decode (refs_str);
|
||||
|
||||
for (r = refs; r; r = r->next) {
|
||||
message_id = g_mime_references_get_message_id (r);
|
||||
g_ptr_array_add (array, g_strdup (message_id));
|
||||
}
|
||||
|
||||
g_mime_references_free (refs);
|
||||
}
|
||||
|
||||
/* Given a string representing the body of a message, generate terms
|
||||
* for it, (skipping quoted portions and signatures). */
|
||||
static void
|
||||
gen_terms_body_str (Xapian::TermGenerator term_gen,
|
||||
char *body)
|
||||
{
|
||||
char *line, *line_end, *next_line;
|
||||
|
||||
if (body == NULL)
|
||||
return;
|
||||
|
||||
next_line = body;
|
||||
|
||||
while (1) {
|
||||
line = next_line;
|
||||
if (*line == '\0')
|
||||
break;
|
||||
|
||||
next_line = strchr (line, '\n');
|
||||
if (next_line == NULL) {
|
||||
next_line = line + strlen (line);
|
||||
}
|
||||
line_end = next_line - 1;
|
||||
|
||||
/* Get to the next non-blank line. */
|
||||
while (*next_line == '\n')
|
||||
next_line++;
|
||||
|
||||
/* Skip blank lines. */
|
||||
if (line_end < line)
|
||||
continue;
|
||||
|
||||
/* Skip lines that are quotes. */
|
||||
if (*line == '>')
|
||||
continue;
|
||||
|
||||
/* Also skip lines introducing a quote on the next line. */
|
||||
if (*line_end == ':' && *next_line == '>')
|
||||
continue;
|
||||
|
||||
/* Finally, bail as soon as we see a signature. */
|
||||
/* XXX: Should only do this if "near" the end of the message. */
|
||||
if (strncmp (line, "-- ", 3) == 0 ||
|
||||
strncmp (line, "----------", 10) == 0 ||
|
||||
strncmp (line, "__________", 10) == 0)
|
||||
break;
|
||||
|
||||
*(line_end + 1) = '\0';
|
||||
gen_terms (term_gen, "body", line);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/* Callback to generate terms for each mime part of a message. */
|
||||
static void
|
||||
gen_terms_part (Xapian::TermGenerator term_gen,
|
||||
GMimeObject *part)
|
||||
{
|
||||
GMimeStream *stream;
|
||||
GMimeDataWrapper *wrapper;
|
||||
GByteArray *byte_array;
|
||||
GMimeContentDisposition *disposition;
|
||||
char *body;
|
||||
|
||||
if (GMIME_IS_MULTIPART (part)) {
|
||||
GMimeMultipart *multipart = GMIME_MULTIPART (part);
|
||||
int i;
|
||||
|
||||
for (i = 0; i < g_mime_multipart_get_count (multipart); i++) {
|
||||
if (GMIME_IS_MULTIPART_SIGNED (multipart)) {
|
||||
/* Don't index the signature. */
|
||||
if (i == 1)
|
||||
continue;
|
||||
if (i > 1)
|
||||
fprintf (stderr, "Warning: Unexpected extra parts of mutlipart/signed. Indexing anyway.\n");
|
||||
}
|
||||
gen_terms_part (term_gen,
|
||||
g_mime_multipart_get_part (multipart, i));
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
if (GMIME_IS_MESSAGE_PART (part)) {
|
||||
GMimeMessage *message;
|
||||
|
||||
message = g_mime_message_part_get_message (GMIME_MESSAGE_PART (part));
|
||||
|
||||
gen_terms_part (term_gen, g_mime_message_get_mime_part (message));
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
if (! (GMIME_IS_PART (part))) {
|
||||
fprintf (stderr, "Warning: Not indexing unknown mime part: %s.\n",
|
||||
g_type_name (G_OBJECT_TYPE (part)));
|
||||
return;
|
||||
}
|
||||
|
||||
disposition = g_mime_object_get_content_disposition (part);
|
||||
if (disposition &&
|
||||
strcmp (disposition->disposition, GMIME_DISPOSITION_ATTACHMENT) == 0)
|
||||
{
|
||||
const char *filename = g_mime_part_get_filename (GMIME_PART (part));
|
||||
const char *extension;
|
||||
|
||||
add_term (term_gen.get_document (), "label", "attachment");
|
||||
gen_terms (term_gen, "attachment", filename);
|
||||
|
||||
if (filename) {
|
||||
extension = strchr (filename, '.');
|
||||
if (extension) {
|
||||
add_term (term_gen.get_document (), "attachment_extension",
|
||||
extension + 1);
|
||||
}
|
||||
}
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
byte_array = g_byte_array_new ();
|
||||
|
||||
stream = g_mime_stream_mem_new_with_byte_array (byte_array);
|
||||
g_mime_stream_mem_set_owner (GMIME_STREAM_MEM (stream), FALSE);
|
||||
wrapper = g_mime_part_get_content_object (GMIME_PART (part));
|
||||
if (wrapper)
|
||||
g_mime_data_wrapper_write_to_stream (wrapper, stream);
|
||||
|
||||
g_object_unref (stream);
|
||||
|
||||
g_byte_array_append (byte_array, (guint8 *) "\0", 1);
|
||||
body = (char *) g_byte_array_free (byte_array, FALSE);
|
||||
|
||||
gen_terms_body_str (term_gen, body);
|
||||
|
||||
free (body);
|
||||
}
|
||||
|
||||
static void
|
||||
index_file (Xapian::WritableDatabase db,
|
||||
Xapian::TermGenerator term_gen,
|
||||
const char *filename)
|
||||
{
|
||||
Xapian::Document doc;
|
||||
|
||||
GMimeStream *stream;
|
||||
GMimeParser *parser;
|
||||
GMimeMessage *message;
|
||||
InternetAddressList *addresses;
|
||||
GPtrArray *parents, *thread_ids;
|
||||
|
||||
FILE *file;
|
||||
|
||||
const char *subject, *refs, *in_reply_to, *from;
|
||||
const char *message_id;
|
||||
|
||||
time_t time;
|
||||
struct tm gm_time_tm;
|
||||
char date_str[16]; /* YYYYMMDDHHMMSS + 1 for Y100k compatibility ;-) */
|
||||
unsigned int i;
|
||||
|
||||
file = fopen (filename, "r");
|
||||
if (! file) {
|
||||
fprintf (stderr, "Error opening %s: %s\n", filename, strerror (errno));
|
||||
exit (1);
|
||||
}
|
||||
|
||||
stream = g_mime_stream_file_new (file);
|
||||
|
||||
parser = g_mime_parser_new_with_stream (stream);
|
||||
|
||||
message = g_mime_parser_construct_message (parser);
|
||||
|
||||
doc = Xapian::Document ();
|
||||
|
||||
doc.set_data (filename);
|
||||
|
||||
term_gen.set_stemmer (Xapian::Stem ("english"));
|
||||
|
||||
term_gen.set_document (doc);
|
||||
|
||||
from = g_mime_message_get_sender (message);
|
||||
addresses = internet_address_list_parse_string (from);
|
||||
|
||||
gen_terms_address_names (term_gen, addresses, "from_name");
|
||||
|
||||
addresses = g_mime_message_get_all_recipients (message);
|
||||
gen_terms_address_names (term_gen, addresses, "to_name");
|
||||
|
||||
subject = g_mime_message_get_subject (message);
|
||||
subject = skip_re_in_subject (subject);
|
||||
gen_terms (term_gen, "subject", subject);
|
||||
gen_terms (term_gen, "body", subject);
|
||||
|
||||
gen_terms_part (term_gen, g_mime_message_get_mime_part (message));
|
||||
|
||||
parents = g_ptr_array_new ();
|
||||
|
||||
refs = g_mime_object_get_header (GMIME_OBJECT (message), "references");
|
||||
parse_references (parents, refs);
|
||||
|
||||
in_reply_to = g_mime_object_get_header (GMIME_OBJECT (message),
|
||||
"in-reply-to");
|
||||
parse_references (parents, in_reply_to);
|
||||
|
||||
for (i = 0; i < parents->len; i++)
|
||||
add_term (doc, "ref", (char *) g_ptr_array_index (parents, i));
|
||||
|
||||
message_id = g_mime_message_get_message_id (message);
|
||||
|
||||
thread_ids = find_thread_ids (db, parents, message_id);
|
||||
|
||||
for (i = 0; i < parents->len; i++)
|
||||
g_free (g_ptr_array_index (parents, i));
|
||||
g_ptr_array_free (parents, TRUE);
|
||||
|
||||
from = g_mime_message_get_sender (message);
|
||||
addresses = internet_address_list_parse_string (from);
|
||||
|
||||
add_terms_address_addrs (doc, addresses, "from_email");
|
||||
|
||||
add_terms_address_addrs (doc,
|
||||
g_mime_message_get_all_recipients (message),
|
||||
"to_email");
|
||||
|
||||
g_mime_message_get_date (message, &time, NULL);
|
||||
|
||||
gmtime_r (&time, &gm_time_tm);
|
||||
|
||||
if (strftime (date_str, sizeof (date_str),
|
||||
"%Y%m%d%H%M%S", &gm_time_tm) == 0) {
|
||||
fprintf (stderr, "Internal error formatting time\n");
|
||||
exit (1);
|
||||
}
|
||||
|
||||
add_term (doc, "date", date_str);
|
||||
|
||||
add_term (doc, "label", "inbox");
|
||||
add_term (doc, "label", "unread");
|
||||
add_term (doc, "type", "mail");
|
||||
add_term (doc, "source_id", "1");
|
||||
|
||||
if (message_id) {
|
||||
add_term (doc, "msgid", message_id);
|
||||
doc.add_value (NOTMUCH_VALUE_MESSAGE_ID, message_id);
|
||||
}
|
||||
|
||||
if (thread_ids->len) {
|
||||
unsigned int i;
|
||||
GString *thread_id;
|
||||
char *id;
|
||||
|
||||
for (i = 0; i < thread_ids->len; i++) {
|
||||
id = (char *) thread_ids->pdata[i];
|
||||
|
||||
add_term (doc, "thread", id);
|
||||
|
||||
if (i == 0)
|
||||
thread_id = g_string_new (id);
|
||||
else
|
||||
g_string_append_printf (thread_id, ",%s", id);
|
||||
|
||||
free (id);
|
||||
}
|
||||
g_ptr_array_free (thread_ids, TRUE);
|
||||
|
||||
doc.add_value (NOTMUCH_VALUE_THREAD, thread_id->str);
|
||||
|
||||
g_string_free (thread_id, TRUE);
|
||||
} else if (message_id) {
|
||||
/* If not part of any existing thread, generate a new thread_id. */
|
||||
thread_id_t thread_id;
|
||||
|
||||
thread_id_generate (&thread_id);
|
||||
|
||||
add_term (doc, "thread", thread_id.str);
|
||||
doc.add_value (NOTMUCH_VALUE_THREAD, thread_id.str);
|
||||
}
|
||||
|
||||
doc.add_value (NOTMUCH_VALUE_DATE, Xapian::sortable_serialise (time));
|
||||
|
||||
db.add_document (doc);
|
||||
|
||||
g_object_unref (message);
|
||||
g_object_unref (parser);
|
||||
g_object_unref (stream);
|
||||
}
|
||||
|
||||
static void
|
||||
usage (const char *argv0)
|
||||
{
|
||||
fprintf (stderr, "Usage: %s <path-to-xapian-database>\n", argv0);
|
||||
fprintf (stderr, "\n");
|
||||
fprintf (stderr, "Messages to be indexed are read from stdnin as absolute filenames\n");
|
||||
fprintf (stderr, "one file per line.");
|
||||
}
|
||||
|
||||
int
|
||||
main (int argc, char **argv)
|
||||
{
|
||||
const char *database_path;
|
||||
char *filename;
|
||||
GIOChannel *channel;
|
||||
GIOStatus gio_status;
|
||||
GError *error = NULL;
|
||||
int count;
|
||||
struct timeval tv_start, tv_last, tv_now;
|
||||
double elapsed;
|
||||
|
||||
if (argc < 2) {
|
||||
usage (argv[0]);
|
||||
exit (1);
|
||||
}
|
||||
|
||||
database_path = argv[1];
|
||||
|
||||
g_mime_init (0);
|
||||
|
||||
try {
|
||||
Xapian::WritableDatabase db;
|
||||
Xapian::TermGenerator term_gen;
|
||||
|
||||
db = Xapian::WritableDatabase (database_path,
|
||||
Xapian::DB_CREATE_OR_OPEN);
|
||||
|
||||
term_gen = Xapian::TermGenerator ();
|
||||
|
||||
channel = g_io_channel_unix_new (fileno (stdin));
|
||||
|
||||
count = 0;
|
||||
|
||||
gettimeofday (&tv_start, NULL);
|
||||
tv_last = tv_start;
|
||||
|
||||
while (1) {
|
||||
gio_status = g_io_channel_read_line (channel, &filename,
|
||||
NULL, NULL, &error);
|
||||
if (gio_status == G_IO_STATUS_EOF)
|
||||
break;
|
||||
if (gio_status != G_IO_STATUS_NORMAL) {
|
||||
fprintf (stderr, "An error occurred reading from stdin: %s\n",
|
||||
error->message);
|
||||
exit (1);
|
||||
}
|
||||
|
||||
g_strchomp (filename);
|
||||
index_file (db, term_gen, filename);
|
||||
|
||||
g_free (filename);
|
||||
|
||||
count++;
|
||||
if (count % 1000 == 0) {
|
||||
gettimeofday (&tv_now, NULL);
|
||||
printf ("Indexed %d messages (%g messages/second)\n",
|
||||
count, 1000 / ((tv_now.tv_sec - tv_last.tv_sec) +
|
||||
(tv_now.tv_usec - tv_last.tv_usec) / 1e6));
|
||||
tv_last = tv_now;
|
||||
}
|
||||
}
|
||||
|
||||
g_io_channel_unref (channel);
|
||||
|
||||
gettimeofday (&tv_now, NULL);
|
||||
elapsed = (tv_now.tv_sec - tv_start.tv_sec +
|
||||
(tv_now.tv_usec - tv_start.tv_usec) / 1e6);
|
||||
printf ("Completed indexing of %d messages in %g seconds (%g messages/second)\n",
|
||||
count, elapsed, count / elapsed);
|
||||
|
||||
} catch (const Xapian::Error &error) {
|
||||
cerr << "A Xapian exception occurred: " << error.get_msg () << endl;
|
||||
exit (1);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
204
xapian-dump.cc
204
xapian-dump.cc
|
@ -1,204 +0,0 @@
|
|||
/* xapian-dump: Create a textual dump of a Xapian database.
|
||||
*
|
||||
* Copyright © 2009 Carl Worth
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program. If not, see http://www.gnu.org/licenses/ .
|
||||
*
|
||||
* Author: Carl Worth <cworth@cworth.org>
|
||||
*/
|
||||
|
||||
/* Currently the dumped data includes:
|
||||
*
|
||||
* All document IDs
|
||||
*
|
||||
* And for each document ID:
|
||||
*
|
||||
* Document data
|
||||
* All document terms
|
||||
* All document values
|
||||
*/
|
||||
|
||||
#include <cstdlib>
|
||||
#include <iostream>
|
||||
#include <algorithm>
|
||||
|
||||
#include <xapian.h>
|
||||
|
||||
using namespace std;
|
||||
|
||||
vector<int> UNSERIALIZE;
|
||||
|
||||
unsigned int MAX_TERMS = 0;
|
||||
|
||||
static void
|
||||
print_escaped_string (const char *s)
|
||||
{
|
||||
printf ("\"");
|
||||
|
||||
while (*s) {
|
||||
if (*s == '"')
|
||||
printf ("\\");
|
||||
printf ("%c", *s);
|
||||
s++;
|
||||
}
|
||||
|
||||
printf ("\"");
|
||||
}
|
||||
|
||||
static void
|
||||
print_document_terms (Xapian::Document doc)
|
||||
{
|
||||
Xapian::TermIterator it;
|
||||
unsigned int i;
|
||||
|
||||
printf (" {\n");
|
||||
|
||||
for (it = doc.termlist_begin (), i = 0;
|
||||
it != doc.termlist_end ();
|
||||
it++, i++)
|
||||
{
|
||||
printf (" ");
|
||||
print_escaped_string ((*it).c_str());
|
||||
printf (",\n");
|
||||
}
|
||||
|
||||
for ( ; i < MAX_TERMS; i++)
|
||||
printf (" \"\",\n");
|
||||
|
||||
printf (" },\n");
|
||||
}
|
||||
|
||||
static int
|
||||
vector_int_contains (vector<int> v, int i)
|
||||
{
|
||||
vector<int>::iterator result;
|
||||
|
||||
result = find (v.begin(), v.end(), i);
|
||||
|
||||
return result != v.end();
|
||||
}
|
||||
|
||||
static void
|
||||
print_document_values (Xapian::Document doc)
|
||||
{
|
||||
Xapian::ValueIterator i;
|
||||
int value_no, value_int;
|
||||
double value_float;
|
||||
|
||||
for (i = doc.values_begin (); i != doc.values_end (); i++) {
|
||||
value_no = i.get_valueno();
|
||||
|
||||
printf (" ");
|
||||
|
||||
if (vector_int_contains (UNSERIALIZE, value_no)) {
|
||||
value_float = Xapian::sortable_unserialise (*i);
|
||||
value_int = value_float;
|
||||
if (value_int == value_float)
|
||||
printf ("%d", value_int);
|
||||
else
|
||||
printf ("\"%f\"", value_float);
|
||||
} else {
|
||||
print_escaped_string ((*i).c_str ());
|
||||
}
|
||||
|
||||
printf (",\n");
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
static void
|
||||
print_document (Xapian::Database db, Xapian::docid id)
|
||||
{
|
||||
Xapian::Document doc;
|
||||
|
||||
printf ("{\n");
|
||||
|
||||
doc = db.get_document (id);
|
||||
|
||||
printf (" \"%s\",\n", doc.get_data ().c_str());
|
||||
|
||||
print_document_terms (doc);
|
||||
|
||||
print_document_values (doc);
|
||||
|
||||
printf ("},\n");
|
||||
}
|
||||
|
||||
int
|
||||
main (int argc, char *argv[])
|
||||
{
|
||||
const char *database_path;
|
||||
int i;
|
||||
|
||||
if (argc < 2) {
|
||||
fprintf (stderr, "Usage: %s <path-to-xapian-database> [value_nos...]\n",
|
||||
argv[0]);
|
||||
fprintf (stderr, "Dumps data from the given database.\n");
|
||||
fprintf (stderr, "The values corresponding to any value numbers given on the command line\n");
|
||||
fprintf (stderr, "will be unserialized to an before being printed.\n");
|
||||
exit (1);
|
||||
}
|
||||
|
||||
database_path = argv[1];
|
||||
|
||||
UNSERIALIZE = vector<int> ();
|
||||
|
||||
for (i = 2; i < argc; i++)
|
||||
UNSERIALIZE.push_back (atoi (argv[i]));
|
||||
|
||||
try {
|
||||
Xapian::Database db;
|
||||
Xapian::PostingIterator i;
|
||||
Xapian::docid doc_id;
|
||||
|
||||
db = Xapian::Database (database_path);
|
||||
|
||||
for (i = db.postlist_begin (""); i != db.postlist_end (""); i++) {
|
||||
Xapian::Document doc;
|
||||
|
||||
doc_id = *i;
|
||||
|
||||
doc = db.get_document (doc_id);
|
||||
|
||||
if (doc.termlist_count () > MAX_TERMS)
|
||||
MAX_TERMS = doc.termlist_count ();
|
||||
}
|
||||
|
||||
printf ("#define MAX_TERMS %d\n\n", MAX_TERMS);
|
||||
|
||||
printf ("typedef struct {\n"
|
||||
" char data[255];\n"
|
||||
" char terms[MAX_TERMS][255];\n"
|
||||
" char message_id[255];\n"
|
||||
" char thread_id[4096];\n"
|
||||
" time_t time;\n"
|
||||
"} document_dump_t;\n\n");
|
||||
|
||||
printf ("document_dump_t dump[] = {\n");
|
||||
|
||||
for (i = db.postlist_begin (""); i != db.postlist_end (""); i++) {
|
||||
doc_id = *i;
|
||||
|
||||
print_document (db, doc_id);
|
||||
}
|
||||
|
||||
printf ("};\n");
|
||||
|
||||
} catch (const Xapian::Error &error) {
|
||||
cerr << "A Xapian exception occurred: " << error.get_msg () << endl;
|
||||
exit (1);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
Loading…
Reference in a new issue