CL/git: add format version 1

The original nmbug format (now called version 0) creates 1
subdirectory of 'tags/' per message. This causes problems for more
than (roughly) 100k messages.

Version 1 introduces 2 layers of hashed directories. This scheme was
chosen to balance the number of subdirectories with the number of extra
directories (and git objects) created via hashing.

This should be upward compatible in the sense that old repositories
will continue to work with the updated notmuch-git.
This commit is contained in:
David Bremner 2022-06-23 09:30:44 -03:00
parent b07e121923
commit 6219e7380a
4 changed files with 109 additions and 35 deletions

View file

@ -235,14 +235,46 @@ REPOSITORY CONTENTS
=================== ===================
The tags are stored in the git repo (and exported) as a set of empty The tags are stored in the git repo (and exported) as a set of empty
files. For a message with Message-Id *id*, for each tag *tag*, there files. These empty files are contained within a directory named after
the message-id.
In what follows `encode()` represents a POSIX filesystem safe
encoding. The encoding preserves alphanumerics, and the characters
`+-_@=.,:`. All other octets are replaced with `%` followed by a two
digit hex number.
Currently :any:`notmuch-git` can read any format version, but can only
create (via :any:`init`) :ref:`version 1 <format_version_1>` repositories.
.. _format_version_0:
Version 0
---------
This is the legacy format created by the `nmbug` tool prior to release
0.37. For a message with Message-Id *id*, for each tag *tag*, there
is an empty file with path is an empty file with path
tags/ `encode` (*id*) / `encode` (*tag*) tags/ `encode` (*id*) / `encode` (*tag*)
The encoding preserves alphanumerics, and the characters `+-_@=.,:`. .. _format_version_1:
All other octets are replaced with `%` followed by a two digit hex
number. Version 1
---------
In format version 1 and later, the format version is contained in a
top level file called FORMAT.
For a message with Message-Id *id*, for each tag *tag*, there
is an empty file with path
tags/ `hash1` (*id*) / `hash2` (*id*) / `encode` (*id*) / `encode` (*tag*)
The hash functions `hash1` and `hash2` are, respectively, the first and
second byte (two hex digits each) of a two byte `blake2b` hex digest of
`encode` (*id*).
Compared to :ref:`version 0 <format_version_0>`, this reduces the
number of subdirectories within each directory.
.. _repo_location: .. _repo_location:

View file

@ -46,10 +46,12 @@ _LOG.addHandler(_logging.StreamHandler())
NOTMUCH_GIT_DIR = None NOTMUCH_GIT_DIR = None
TAG_PREFIX = None TAG_PREFIX = None
FORMAT_VERSION = 0
_HEX_ESCAPE_REGEX = _re.compile('%[0-9A-F]{2}') _HEX_ESCAPE_REGEX = _re.compile('%[0-9A-F]{2}')
_TAG_DIRECTORY = 'tags/' _TAG_DIRECTORY = 'tags/'
_TAG_FILE_REGEX = _re.compile(_TAG_DIRECTORY + '(?P<id>[^/]*)/(?P<tag>[^/]*)') _TAG_FILE_REGEX = ( _re.compile(_TAG_DIRECTORY + '(?P<id>[^/]*)/(?P<tag>[^/]*)'),
_re.compile(_TAG_DIRECTORY + '([0-9a-f]{2}/){2}(?P<id>[^/]*)/(?P<tag>[^/]*)'))
# magic hash for Git (git hash-object -t blob /dev/null) # magic hash for Git (git hash-object -t blob /dev/null)
_EMPTYBLOB = 'e69de29bb2d1d6434b8b29ae775ad8c2e48c5391' _EMPTYBLOB = 'e69de29bb2d1d6434b8b29ae775ad8c2e48c5391'
@ -265,7 +267,7 @@ def archive(treeish='HEAD', args=()):
Each tag $tag for message with Message-Id $id is written to Each tag $tag for message with Message-Id $id is written to
an empty file an empty file
tags/encode($id)/encode($tag) tags/hash1(id)/hash2(id)/encode($id)/encode($tag)
The encoding preserves alphanumerics, and the characters The encoding preserves alphanumerics, and the characters
"+-_@=.:," (not the quotes). All other octets are replaced with "+-_@=.:," (not the quotes). All other octets are replaced with
@ -469,9 +471,17 @@ def init(remote=None):
_git(args=['config', 'core.logallrefupdates', 'true'], wait=True) _git(args=['config', 'core.logallrefupdates', 'true'], wait=True)
# create an empty blob (e69de29bb2d1d6434b8b29ae775ad8c2e48c5391) # create an empty blob (e69de29bb2d1d6434b8b29ae775ad8c2e48c5391)
_git(args=['hash-object', '-w', '--stdin'], input='', wait=True) _git(args=['hash-object', '-w', '--stdin'], input='', wait=True)
# create a blob for the FORMAT file
(status, stdout, _) = _git(args=['hash-object', '-w', '--stdin'], stdout=_subprocess.PIPE,
input='1\n', wait=True)
verhash=stdout.rstrip()
_LOG.debug('hash of FORMAT blob = {:s}'.format(verhash))
# Add FORMAT to the index
_git(args=['update-index', '--add', '--cacheinfo', '100644,{:s},FORMAT'.format(verhash)], wait=True)
_git( _git(
args=[ args=[
'commit', '--allow-empty', '-m', 'Start a new nmbug repository' 'commit', '-m', 'Start a new notmuch-git repository'
], ],
additional_env={'GIT_WORK_TREE': NOTMUCH_GIT_DIR}, additional_env={'GIT_WORK_TREE': NOTMUCH_GIT_DIR},
wait=True) wait=True)
@ -821,7 +831,7 @@ def _clear_tags_for_message(index, id):
Neither 'id' nor the tags in 'tags' should be encoded/escaped. Neither 'id' nor the tags in 'tags' should be encoded/escaped.
""" """
dir = 'tags/{id}'.format(id=_hex_quote(string=id)) dir = _id_path(id)
with _git( with _git(
args=['ls-files', dir], args=['ls-files', dir],
@ -838,6 +848,21 @@ def _read_database_lastmod():
(count,uuid,lastmod_str) = notmuch.stdout.readline().split() (count,uuid,lastmod_str) = notmuch.stdout.readline().split()
return (count,uuid,int(lastmod_str)) return (count,uuid,int(lastmod_str))
def _id_path(id):
    """
    Return the path of the directory holding the tag files of a message.

    'id' is the unencoded Message-Id.  The layout depends on the
    module-level FORMAT_VERSION:

      0: tags/encode(id)
      1: tags/hash1(id)/hash2(id)/encode(id)

    where hash1 and hash2 are the first and second byte (two hex digits
    each) of a two byte blake2b digest of the encoded id.

    For any other format version an error is logged and the process
    exits with status 1.
    """
    hid = _hex_quote(string=id)
    if FORMAT_VERSION == 0:
        return 'tags/{hid}'.format(hid=hid)
    if FORMAT_VERSION == 1:
        # Only the hashed layout needs hashlib, so import it here.
        from hashlib import blake2b
        # digest_size=2 yields exactly four hex digits: two per level.
        idhash = blake2b(hid.encode('utf8'), digest_size=2).hexdigest()
        return 'tags/{dir1}/{dir2}/{hid}'.format(
            hid=hid, dir1=idhash[0:2], dir2=idhash[2:])
    # Format the version into the message itself: passing it as an extra
    # positional argument without a placeholder makes logging fail to
    # render the record.
    _LOG.error('Unknown format version: {!r}'.format(FORMAT_VERSION))
    _sys.exit(1)
def _index_tags_for_message(id, status, tags): def _index_tags_for_message(id, status, tags):
""" """
Update the Git index to either create or delete an empty file. Update the Git index to either create or delete an empty file.
@ -852,8 +877,7 @@ def _index_tags_for_message(id, status, tags):
hash = '0000000000000000000000000000000000000000' hash = '0000000000000000000000000000000000000000'
for tag in tags: for tag in tags:
path = 'tags/{id}/{tag}'.format( path = '{ipath}/{tag}'.format(ipath=_id_path(id),tag=_hex_quote(string=tag))
id=_hex_quote(string=id), tag=_hex_quote(string=tag))
yield '{mode} {hash}\t{path}\n'.format(mode=mode, hash=hash, path=path) yield '{mode} {hash}\t{path}\n'.format(mode=mode, hash=hash, path=path)
@ -869,7 +893,7 @@ def _diff_refs(filter, a='HEAD', b='@{upstream}'):
def _unpack_diff_lines(stream): def _unpack_diff_lines(stream):
"Iterate through (id, tag) tuples in a diff stream." "Iterate through (id, tag) tuples in a diff stream."
for line in stream: for line in stream:
match = _TAG_FILE_REGEX.match(line.strip()) match = _TAG_FILE_REGEX[FORMAT_VERSION].match(line.strip())
if not match: if not match:
message = 'non-tag line in diff: {!r}'.format(line.strip()) message = 'non-tag line in diff: {!r}'.format(line.strip())
if line.startswith(_TAG_DIRECTORY): if line.startswith(_TAG_DIRECTORY):
@ -907,6 +931,17 @@ def _notmuch_config_get(key):
_sys.exit(1) _sys.exit(1)
return stdout.rstrip() return stdout.rstrip()
def read_format_version():
    """
    Return the format version of the repository as an integer.

    The version is read from the blob 'FORMAT' at the top level of the
    'master' branch.  If git cannot provide that blob, the repository is
    assumed to use the legacy layout and 0 is returned.

    If the blob exists but its content is not an integer, an error is
    logged and the process exits with status 1, since guessing a layout
    would risk writing tags to the wrong paths.
    """
    try:
        # stderr is captured as well as stdout; presumably so that git's
        # complaint about a missing FORMAT does not reach the user.
        (_, stdout, _) = _git(
            args=['cat-file', 'blob', 'master:FORMAT'],
            stdout=_subprocess.PIPE, stderr=_subprocess.PIPE, wait=True)
    except SubprocessError:
        _LOG.debug("failed to read FORMAT file from git, assuming format version 0")
        return 0
    try:
        return int(stdout)
    except ValueError:
        _LOG.error('invalid FORMAT file content: {!r}'.format(stdout))
        _sys.exit(1)
# based on BaseDirectory.save_data_path from pyxdg (LGPL2+) # based on BaseDirectory.save_data_path from pyxdg (LGPL2+)
def xdg_data_path(profile): def xdg_data_path(profile):
resource = _os.path.join('notmuch',profile,'git') resource = _os.path.join('notmuch',profile,'git')
@ -1104,6 +1139,9 @@ if __name__ == '__main__':
_LOG.debug('prefix = {:s}'.format(TAG_PREFIX)) _LOG.debug('prefix = {:s}'.format(TAG_PREFIX))
_LOG.debug('repository = {:s}'.format(NOTMUCH_GIT_DIR)) _LOG.debug('repository = {:s}'.format(NOTMUCH_GIT_DIR))
FORMAT_VERSION = read_format_version()
_LOG.debug('FORMAT_VERSION={:d}'.format(FORMAT_VERSION))
if args.func == help: if args.func == help:
arg_names = ['command'] arg_names = ['command']
else: else:

View file

@ -40,10 +40,10 @@ notmuch tag -new-prefix::foo id:20091117190054.GU3165@dottiness.seas.harvard.edu
test_begin_subtest "committing new prefix works with force" test_begin_subtest "committing new prefix works with force"
notmuch tag +new-prefix::foo id:20091117190054.GU3165@dottiness.seas.harvard.edu notmuch tag +new-prefix::foo id:20091117190054.GU3165@dottiness.seas.harvard.edu
notmuch git -l debug -p 'new-prefix::' -C force-prefix.git commit --force notmuch git -l debug -p 'new-prefix::' -C force-prefix.git commit --force
git -C force-prefix.git ls-tree -r --name-only HEAD | xargs dirname | sort -u | sed s,tags/,id:, > OUTPUT git -C force-prefix.git ls-tree -r --name-only HEAD | notmuch_git_sanitize | xargs dirname | sort -u > OUTPUT
notmuch tag -new-prefix::foo id:20091117190054.GU3165@dottiness.seas.harvard.edu notmuch tag -new-prefix::foo id:20091117190054.GU3165@dottiness.seas.harvard.edu
cat <<EOF>EXPECTED cat <<EOF>EXPECTED
id:20091117190054.GU3165@dottiness.seas.harvard.edu 20091117190054.GU3165@dottiness.seas.harvard.edu
EOF EOF
test_expect_equal_file_nonempty EXPECTED OUTPUT test_expect_equal_file_nonempty EXPECTED OUTPUT
@ -62,8 +62,8 @@ test_expect_equal_file_nonempty EXPECTED OUTPUT
test_begin_subtest "commit" test_begin_subtest "commit"
notmuch git -C tags.git commit --force notmuch git -C tags.git commit --force
git -C tags.git ls-tree -r --name-only HEAD | xargs dirname | sort -u | sed s,tags/,id:, > OUTPUT git -C tags.git ls-tree -r --name-only HEAD | notmuch_git_sanitize | xargs dirname | sort -u > OUTPUT
notmuch search --output=messages '*' | sort > EXPECTED notmuch search --output=messages '*' | sed s/^id:// | sort > EXPECTED
test_expect_equal_file_nonempty EXPECTED OUTPUT test_expect_equal_file_nonempty EXPECTED OUTPUT
test_begin_subtest "commit --force succeeds" test_begin_subtest "commit --force succeeds"
@ -88,22 +88,22 @@ test_expect_equal_file_nonempty BEFORE AFTER
test_begin_subtest "commit (incremental)" test_begin_subtest "commit (incremental)"
notmuch tag +test id:20091117190054.GU3165@dottiness.seas.harvard.edu notmuch tag +test id:20091117190054.GU3165@dottiness.seas.harvard.edu
notmuch git -C tags.git commit notmuch git -C tags.git commit
git -C tags.git ls-tree -r --name-only HEAD | git -C tags.git ls-tree -r --name-only HEAD | notmuch_git_sanitize | \
grep 20091117190054 | sort > OUTPUT grep 20091117190054 | sort > OUTPUT
echo "--------------------------------------------------" >> OUTPUT echo "--------------------------------------------------" >> OUTPUT
notmuch tag -test id:20091117190054.GU3165@dottiness.seas.harvard.edu notmuch tag -test id:20091117190054.GU3165@dottiness.seas.harvard.edu
notmuch git -C tags.git commit notmuch git -C tags.git commit
git -C tags.git ls-tree -r --name-only HEAD | git -C tags.git ls-tree -r --name-only HEAD | notmuch_git_sanitize | \
grep 20091117190054 | sort >> OUTPUT grep 20091117190054 | sort >> OUTPUT
cat <<EOF > EXPECTED cat <<EOF > EXPECTED
tags/20091117190054.GU3165@dottiness.seas.harvard.edu/inbox 20091117190054.GU3165@dottiness.seas.harvard.edu/inbox
tags/20091117190054.GU3165@dottiness.seas.harvard.edu/signed 20091117190054.GU3165@dottiness.seas.harvard.edu/signed
tags/20091117190054.GU3165@dottiness.seas.harvard.edu/test 20091117190054.GU3165@dottiness.seas.harvard.edu/test
tags/20091117190054.GU3165@dottiness.seas.harvard.edu/unread 20091117190054.GU3165@dottiness.seas.harvard.edu/unread
-------------------------------------------------- --------------------------------------------------
tags/20091117190054.GU3165@dottiness.seas.harvard.edu/inbox 20091117190054.GU3165@dottiness.seas.harvard.edu/inbox
tags/20091117190054.GU3165@dottiness.seas.harvard.edu/signed 20091117190054.GU3165@dottiness.seas.harvard.edu/signed
tags/20091117190054.GU3165@dottiness.seas.harvard.edu/unread 20091117190054.GU3165@dottiness.seas.harvard.edu/unread
EOF EOF
test_expect_equal_file_nonempty EXPECTED OUTPUT test_expect_equal_file_nonempty EXPECTED OUTPUT
@ -111,18 +111,18 @@ test_begin_subtest "commit (change prefix)"
notmuch tag +test::one id:20091117190054.GU3165@dottiness.seas.harvard.edu notmuch tag +test::one id:20091117190054.GU3165@dottiness.seas.harvard.edu
notmuch git -C tags.git -p 'test::' commit --force notmuch git -C tags.git -p 'test::' commit --force
git -C tags.git ls-tree -r --name-only HEAD | git -C tags.git ls-tree -r --name-only HEAD |
grep 20091117190054 | sort > OUTPUT grep 20091117190054 | notmuch_git_sanitize | sort > OUTPUT
echo "--------------------------------------------------" >> OUTPUT echo "--------------------------------------------------" >> OUTPUT
notmuch tag -test::one id:20091117190054.GU3165@dottiness.seas.harvard.edu notmuch tag -test::one id:20091117190054.GU3165@dottiness.seas.harvard.edu
notmuch git -C tags.git commit --force notmuch git -C tags.git commit --force
git -C tags.git ls-tree -r --name-only HEAD | git -C tags.git ls-tree -r --name-only HEAD | notmuch_git_sanitize | \
grep 20091117190054 | sort >> OUTPUT grep 20091117190054 | sort >> OUTPUT
cat <<EOF > EXPECTED cat <<EOF > EXPECTED
tags/20091117190054.GU3165@dottiness.seas.harvard.edu/one 20091117190054.GU3165@dottiness.seas.harvard.edu/one
-------------------------------------------------- --------------------------------------------------
tags/20091117190054.GU3165@dottiness.seas.harvard.edu/inbox 20091117190054.GU3165@dottiness.seas.harvard.edu/inbox
tags/20091117190054.GU3165@dottiness.seas.harvard.edu/signed 20091117190054.GU3165@dottiness.seas.harvard.edu/signed
tags/20091117190054.GU3165@dottiness.seas.harvard.edu/unread 20091117190054.GU3165@dottiness.seas.harvard.edu/unread
EOF EOF
test_expect_equal_file_nonempty EXPECTED OUTPUT test_expect_equal_file_nonempty EXPECTED OUTPUT
@ -151,12 +151,12 @@ test_expect_equal_file_nonempty BEFORE AFTER
test_begin_subtest "archive" test_begin_subtest "archive"
notmuch git -C tags.git archive | tar tf - | \ notmuch git -C tags.git archive | tar tf - | \
grep 20091117190054.GU3165@dottiness.seas.harvard.edu | sort > OUTPUT grep 20091117190054.GU3165@dottiness.seas.harvard.edu | notmuch_git_sanitize | sort > OUTPUT
cat <<EOF > EXPECTED cat <<EOF > EXPECTED
tags/20091117190054.GU3165@dottiness.seas.harvard.edu/ 20091117190054.GU3165@dottiness.seas.harvard.edu/
tags/20091117190054.GU3165@dottiness.seas.harvard.edu/inbox 20091117190054.GU3165@dottiness.seas.harvard.edu/inbox
tags/20091117190054.GU3165@dottiness.seas.harvard.edu/signed 20091117190054.GU3165@dottiness.seas.harvard.edu/signed
tags/20091117190054.GU3165@dottiness.seas.harvard.edu/unread 20091117190054.GU3165@dottiness.seas.harvard.edu/unread
EOF EOF
notmuch git -C tags.git checkout notmuch git -C tags.git checkout
test_expect_equal_file EXPECTED OUTPUT test_expect_equal_file EXPECTED OUTPUT

View file

@ -559,6 +559,10 @@ notmuch_date_sanitize () {
-e 's/^Date: Fri, 05 Jan 2001 .*0000/Date: GENERATED_DATE/' -e 's/^Date: Fri, 05 Jan 2001 .*0000/Date: GENERATED_DATE/'
} }
# remove redundant parts of notmuch-git internal paths
#
# Filters stdin to stdout.  On each line, the first occurrence of
# "tags/" followed by two directory levels of two lowercase hex digits
# each is removed, so a path of the form tags/xx/yy/ID/TAG becomes
# ID/TAG.  Every line containing "FORMAT" is dropped.
notmuch_git_sanitize () {
sed -e 's,tags/\([0-9a-f]\{2\}/\)\{2\},,' -e '/FORMAT/d'
}
notmuch_uuid_sanitize () { notmuch_uuid_sanitize () {
sed 's/[0-9a-f]\{8\}-[0-9a-f]\{4\}-[0-9a-f]\{4\}-[0-9a-f]\{4\}-[0-9a-f]\{12\}/UUID/g' sed 's/[0-9a-f]\{8\}-[0-9a-f]\{4\}-[0-9a-f]\{4\}-[0-9a-f]\{4\}-[0-9a-f]\{12\}/UUID/g'
} }