CL/git: add format version 1

The original nmbug format (now called version 0) creates 1
subdirectory of 'tags/' per message. This causes problems for more
than (roughly) 100k messages.

Version 1 introduces 2 layers of hashed directories. This scheme was
chosen to balance the number of subdirectories with the number of extra
directories (and git objects) created via hashing.

This should be upward compatible in the sense that old repositories
will continue to work with the updated notmuch-git.
This commit is contained in:
David Bremner 2022-06-23 09:30:44 -03:00
parent b07e121923
commit 6219e7380a
4 changed files with 109 additions and 35 deletions

View file

@ -235,14 +235,46 @@ REPOSITORY CONTENTS
===================
The tags are stored in the git repo (and exported) as a set of empty
files. For a message with Message-Id *id*, for each tag *tag*, there
files. These empty files are contained within a directory named after
the message-id.
In what follows `encode()` represents a POSIX filesystem safe
encoding. The encoding preserves alphanumerics, and the characters
`+-_@=.,:`. All other octets are replaced with `%` followed by a two
digit hex number.
Currently :any:`notmuch-git` can read any format version, but can only
create (via :any:`init`) :ref:`version 1 <format_version_1>` repositories.
.. _format_version_0:
Version 0
---------
This is the legacy format created by the `nmbug` tool prior to release
0.37. For a message with Message-Id *id*, for each tag *tag*, there
is an empty file with path
tags/ `encode` (*id*) / `encode` (*tag*)
The encoding preserves alphanumerics, and the characters `+-_@=.,:`.
All other octets are replaced with `%` followed by a two digit hex
number.
.. _format_version_1:
Version 1
---------
In format version 1 and later, the format version is contained in a
top level file called FORMAT.
For a message with Message-Id *id*, for each tag *tag*, there
is an empty file with path
tags/ `hash1` (*id*) / `hash2` (*id*) / `encode` (*id*) / `encode` (*tag*)
The hash functions `hash1` and `hash2` are, respectively, the first and
second byte (two hex digits each) of a two byte `blake2b` hex digest of
`encode` (*id*).
Compared to :ref:`version 0 <format_version_0>`, this reduces the
number of subdirectories within each directory.
.. _repo_location:

View file

@ -46,10 +46,12 @@ _LOG.addHandler(_logging.StreamHandler())
NOTMUCH_GIT_DIR = None
TAG_PREFIX = None
FORMAT_VERSION = 0
_HEX_ESCAPE_REGEX = _re.compile('%[0-9A-F]{2}')
_TAG_DIRECTORY = 'tags/'
_TAG_FILE_REGEX = _re.compile(_TAG_DIRECTORY + '(?P<id>[^/]*)/(?P<tag>[^/]*)')
_TAG_FILE_REGEX = ( _re.compile(_TAG_DIRECTORY + '(?P<id>[^/]*)/(?P<tag>[^/]*)'),
_re.compile(_TAG_DIRECTORY + '([0-9a-f]{2}/){2}(?P<id>[^/]*)/(?P<tag>[^/]*)'))
# magic hash for Git (git hash-object -t blob /dev/null)
_EMPTYBLOB = 'e69de29bb2d1d6434b8b29ae775ad8c2e48c5391'
@ -265,7 +267,7 @@ def archive(treeish='HEAD', args=()):
Each tag $tag for message with Message-Id $id is written to
an empty file
tags/encode($id)/encode($tag)
tags/hash1(id)/hash2(id)/encode($id)/encode($tag)
The encoding preserves alphanumerics, and the characters
"+-_@=.:," (not the quotes). All other octets are replaced with
@ -469,9 +471,17 @@ def init(remote=None):
_git(args=['config', 'core.logallrefupdates', 'true'], wait=True)
# create an empty blob (e69de29bb2d1d6434b8b29ae775ad8c2e48c5391)
_git(args=['hash-object', '-w', '--stdin'], input='', wait=True)
# create a blob for the FORMAT file
(status, stdout, _) = _git(args=['hash-object', '-w', '--stdin'], stdout=_subprocess.PIPE,
input='1\n', wait=True)
verhash=stdout.rstrip()
_LOG.debug('hash of FORMAT blob = {:s}'.format(verhash))
# Add FORMAT to the index
_git(args=['update-index', '--add', '--cacheinfo', '100644,{:s},FORMAT'.format(verhash)], wait=True)
_git(
args=[
'commit', '--allow-empty', '-m', 'Start a new nmbug repository'
'commit', '-m', 'Start a new notmuch-git repository'
],
additional_env={'GIT_WORK_TREE': NOTMUCH_GIT_DIR},
wait=True)
@ -821,7 +831,7 @@ def _clear_tags_for_message(index, id):
Neither 'id' nor the tags in 'tags' should be encoded/escaped.
"""
dir = 'tags/{id}'.format(id=_hex_quote(string=id))
dir = _id_path(id)
with _git(
args=['ls-files', dir],
@ -838,6 +848,21 @@ def _read_database_lastmod():
(count,uuid,lastmod_str) = notmuch.stdout.readline().split()
return (count,uuid,int(lastmod_str))
def _id_path(id):
    """
    Return the repository directory that holds the tag files for a message.

    'id' is the unencoded Message-Id; it is passed through _hex_quote
    before being used as a path component.

    The layout depends on the module-level FORMAT_VERSION:

      0: tags/encode(id)
      1: tags/hash1/hash2/encode(id)

    where hash1 and hash2 are the first and second byte (two hex digits
    each) of a 2-byte blake2b digest of the *encoded* id.

    Any other FORMAT_VERSION is logged as an error and the process exits
    with status 1.
    """
    # Imported locally, as in the original; the module's top-level
    # imports are not visible from here.
    from hashlib import blake2b

    hid = _hex_quote(string=id)
    if FORMAT_VERSION == 0:
        return 'tags/{hid}'.format(hid=hid)
    if FORMAT_VERSION == 1:
        # digest_size=2 yields exactly four hex digits: two per directory level.
        idhash = blake2b(hid.encode('utf8'), digest_size=2).hexdigest()
        return 'tags/{dir1}/{dir2}/{hid}'.format(
            hid=hid, dir1=idhash[0:2], dir2=idhash[2:])
    # The placeholder is required: logging formats the record as msg % args,
    # so an extra argument with no placeholder makes formatting fail and the
    # message is never emitted properly.
    _LOG.error('Unknown format version %s', FORMAT_VERSION)
    _sys.exit(1)
def _index_tags_for_message(id, status, tags):
"""
Update the Git index to either create or delete an empty file.
@ -852,8 +877,7 @@ def _index_tags_for_message(id, status, tags):
hash = '0000000000000000000000000000000000000000'
for tag in tags:
path = 'tags/{id}/{tag}'.format(
id=_hex_quote(string=id), tag=_hex_quote(string=tag))
path = '{ipath}/{tag}'.format(ipath=_id_path(id),tag=_hex_quote(string=tag))
yield '{mode} {hash}\t{path}\n'.format(mode=mode, hash=hash, path=path)
@ -869,7 +893,7 @@ def _diff_refs(filter, a='HEAD', b='@{upstream}'):
def _unpack_diff_lines(stream):
"Iterate through (id, tag) tuples in a diff stream."
for line in stream:
match = _TAG_FILE_REGEX.match(line.strip())
match = _TAG_FILE_REGEX[FORMAT_VERSION].match(line.strip())
if not match:
message = 'non-tag line in diff: {!r}'.format(line.strip())
if line.startswith(_TAG_DIRECTORY):
@ -907,6 +931,17 @@ def _notmuch_config_get(key):
_sys.exit(1)
return stdout.rstrip()
def read_format_version():
    """
    Return the repository format version as an integer.

    The version is read from the blob 'master:FORMAT' via 'git cat-file'.
    If the _git call raises SubprocessError (presumably because the blob
    does not exist, as in repositories created before FORMAT was
    introduced), version 0 is assumed.
    """
    try:
        # _git returns a (status, stdout, stderr) triple; only stdout matters.
        (_, version_text, _) = _git(
            args=['cat-file', 'blob', 'master:FORMAT'],
            stdout=_subprocess.PIPE, stderr=_subprocess.PIPE, wait=True)
    except SubprocessError:
        _LOG.debug("failed to read FORMAT file from git, assuming format version 0")
        return 0
    else:
        return int(version_text)
# based on BaseDirectory.save_data_path from pyxdg (LGPL2+)
def xdg_data_path(profile):
resource = _os.path.join('notmuch',profile,'git')
@ -1104,6 +1139,9 @@ if __name__ == '__main__':
_LOG.debug('prefix = {:s}'.format(TAG_PREFIX))
_LOG.debug('repository = {:s}'.format(NOTMUCH_GIT_DIR))
FORMAT_VERSION = read_format_version()
_LOG.debug('FORMAT_VERSION={:d}'.format(FORMAT_VERSION))
if args.func == help:
arg_names = ['command']
else:

View file

@ -40,10 +40,10 @@ notmuch tag -new-prefix::foo id:20091117190054.GU3165@dottiness.seas.harvard.edu
test_begin_subtest "committing new prefix works with force"
notmuch tag +new-prefix::foo id:20091117190054.GU3165@dottiness.seas.harvard.edu
notmuch git -l debug -p 'new-prefix::' -C force-prefix.git commit --force
git -C force-prefix.git ls-tree -r --name-only HEAD | xargs dirname | sort -u | sed s,tags/,id:, > OUTPUT
git -C force-prefix.git ls-tree -r --name-only HEAD | notmuch_git_sanitize | xargs dirname | sort -u > OUTPUT
notmuch tag -new-prefix::foo id:20091117190054.GU3165@dottiness.seas.harvard.edu
cat <<EOF>EXPECTED
id:20091117190054.GU3165@dottiness.seas.harvard.edu
20091117190054.GU3165@dottiness.seas.harvard.edu
EOF
test_expect_equal_file_nonempty EXPECTED OUTPUT
@ -62,8 +62,8 @@ test_expect_equal_file_nonempty EXPECTED OUTPUT
test_begin_subtest "commit"
notmuch git -C tags.git commit --force
git -C tags.git ls-tree -r --name-only HEAD | xargs dirname | sort -u | sed s,tags/,id:, > OUTPUT
notmuch search --output=messages '*' | sort > EXPECTED
git -C tags.git ls-tree -r --name-only HEAD | notmuch_git_sanitize | xargs dirname | sort -u > OUTPUT
notmuch search --output=messages '*' | sed s/^id:// | sort > EXPECTED
test_expect_equal_file_nonempty EXPECTED OUTPUT
test_begin_subtest "commit --force succeeds"
@ -88,22 +88,22 @@ test_expect_equal_file_nonempty BEFORE AFTER
test_begin_subtest "commit (incremental)"
notmuch tag +test id:20091117190054.GU3165@dottiness.seas.harvard.edu
notmuch git -C tags.git commit
git -C tags.git ls-tree -r --name-only HEAD |
git -C tags.git ls-tree -r --name-only HEAD | notmuch_git_sanitize | \
grep 20091117190054 | sort > OUTPUT
echo "--------------------------------------------------" >> OUTPUT
notmuch tag -test id:20091117190054.GU3165@dottiness.seas.harvard.edu
notmuch git -C tags.git commit
git -C tags.git ls-tree -r --name-only HEAD |
git -C tags.git ls-tree -r --name-only HEAD | notmuch_git_sanitize | \
grep 20091117190054 | sort >> OUTPUT
cat <<EOF > EXPECTED
tags/20091117190054.GU3165@dottiness.seas.harvard.edu/inbox
tags/20091117190054.GU3165@dottiness.seas.harvard.edu/signed
tags/20091117190054.GU3165@dottiness.seas.harvard.edu/test
tags/20091117190054.GU3165@dottiness.seas.harvard.edu/unread
20091117190054.GU3165@dottiness.seas.harvard.edu/inbox
20091117190054.GU3165@dottiness.seas.harvard.edu/signed
20091117190054.GU3165@dottiness.seas.harvard.edu/test
20091117190054.GU3165@dottiness.seas.harvard.edu/unread
--------------------------------------------------
tags/20091117190054.GU3165@dottiness.seas.harvard.edu/inbox
tags/20091117190054.GU3165@dottiness.seas.harvard.edu/signed
tags/20091117190054.GU3165@dottiness.seas.harvard.edu/unread
20091117190054.GU3165@dottiness.seas.harvard.edu/inbox
20091117190054.GU3165@dottiness.seas.harvard.edu/signed
20091117190054.GU3165@dottiness.seas.harvard.edu/unread
EOF
test_expect_equal_file_nonempty EXPECTED OUTPUT
@ -111,18 +111,18 @@ test_begin_subtest "commit (change prefix)"
notmuch tag +test::one id:20091117190054.GU3165@dottiness.seas.harvard.edu
notmuch git -C tags.git -p 'test::' commit --force
git -C tags.git ls-tree -r --name-only HEAD |
grep 20091117190054 | sort > OUTPUT
grep 20091117190054 | notmuch_git_sanitize | sort > OUTPUT
echo "--------------------------------------------------" >> OUTPUT
notmuch tag -test::one id:20091117190054.GU3165@dottiness.seas.harvard.edu
notmuch git -C tags.git commit --force
git -C tags.git ls-tree -r --name-only HEAD |
git -C tags.git ls-tree -r --name-only HEAD | notmuch_git_sanitize | \
grep 20091117190054 | sort >> OUTPUT
cat <<EOF > EXPECTED
tags/20091117190054.GU3165@dottiness.seas.harvard.edu/one
20091117190054.GU3165@dottiness.seas.harvard.edu/one
--------------------------------------------------
tags/20091117190054.GU3165@dottiness.seas.harvard.edu/inbox
tags/20091117190054.GU3165@dottiness.seas.harvard.edu/signed
tags/20091117190054.GU3165@dottiness.seas.harvard.edu/unread
20091117190054.GU3165@dottiness.seas.harvard.edu/inbox
20091117190054.GU3165@dottiness.seas.harvard.edu/signed
20091117190054.GU3165@dottiness.seas.harvard.edu/unread
EOF
test_expect_equal_file_nonempty EXPECTED OUTPUT
@ -151,12 +151,12 @@ test_expect_equal_file_nonempty BEFORE AFTER
test_begin_subtest "archive"
notmuch git -C tags.git archive | tar tf - | \
grep 20091117190054.GU3165@dottiness.seas.harvard.edu | sort > OUTPUT
grep 20091117190054.GU3165@dottiness.seas.harvard.edu | notmuch_git_sanitize | sort > OUTPUT
cat <<EOF > EXPECTED
tags/20091117190054.GU3165@dottiness.seas.harvard.edu/
tags/20091117190054.GU3165@dottiness.seas.harvard.edu/inbox
tags/20091117190054.GU3165@dottiness.seas.harvard.edu/signed
tags/20091117190054.GU3165@dottiness.seas.harvard.edu/unread
20091117190054.GU3165@dottiness.seas.harvard.edu/
20091117190054.GU3165@dottiness.seas.harvard.edu/inbox
20091117190054.GU3165@dottiness.seas.harvard.edu/signed
20091117190054.GU3165@dottiness.seas.harvard.edu/unread
EOF
notmuch git -C tags.git checkout
test_expect_equal_file EXPECTED OUTPUT

View file

@ -559,6 +559,10 @@ notmuch_date_sanitize () {
-e 's/^Date: Fri, 05 Jan 2001 .*0000/Date: GENERATED_DATE/'
}
# remove redundant parts of notmuch-git internal paths
notmuch_git_sanitize () {
    # Filter stdin to stdout:
    #  1. strip the first "tags/" followed by two directory levels of
    #     two lowercase hex digits each (the hashed directories);
    #  2. drop every line that contains FORMAT.
    sed \
	-e 's,tags/\([0-9a-f]\{2\}/\)\{2\},,' \
	-e '/FORMAT/d'
}
notmuch_uuid_sanitize () {
sed 's/[0-9a-f]\{8\}-[0-9a-f]\{4\}-[0-9a-f]\{4\}-[0-9a-f]\{4\}-[0-9a-f]\{12\}/UUID/g'
}