CLI/git: cache git indices

If the private index file matches a previously known revision of the
database, we can update the index incrementally using the recorded
lastmod counter. This is typically much faster than a full update,
although it could be slower in the case of large changes to the
database.

The "git-read-tree HEAD" is also a bottleneck, but unfortunately
sometimes is needed. Cache the index checksum and hash to reduce the
number of times the operation is run. The overall design is a
simplified version of the PrivateIndex class.
This commit is contained in:
David Bremner 2022-04-19 08:32:14 -03:00
parent 5ef56fe812
commit 66ccf420c2
2 changed files with 275 additions and 86 deletions

View file

@ -38,6 +38,7 @@ import tempfile as _tempfile
import textwrap as _textwrap import textwrap as _textwrap
from urllib.parse import quote as _quote from urllib.parse import quote as _quote
from urllib.parse import unquote as _unquote from urllib.parse import unquote as _unquote
import json as _json
_LOG = _logging.getLogger('nmbug') _LOG = _logging.getLogger('nmbug')
_LOG.setLevel(_logging.WARNING) _LOG.setLevel(_logging.WARNING)
@ -299,41 +300,98 @@ def _is_committed(status):
return len(status['added']) + len(status['deleted']) == 0 return len(status['added']) + len(status['deleted']) == 0
class CachedIndex:
def __init__(self, repo, treeish):
self.cache_path = _os.path.join(repo, 'notmuch', 'index_cache.json')
self.index_path = _os.path.join(repo, 'index')
self.current_treeish = treeish
# cached values
self.treeish = None
self.hash = None
self.index_checksum = None
self._load_cache_file()
def _load_cache_file(self):
try:
with open(self.cache_path) as f:
data = _json.load(f)
self.treeish = data['treeish']
self.hash = data['hash']
self.index_checksum = data['index_checksum']
except FileNotFoundError:
pass
except _json.JSONDecodeError:
_LOG.error("Error decoding cache")
_sys.exit(1)
def __enter__(self):
self.read_tree()
return self
def __exit__(self, type, value, traceback):
checksum = _read_index_checksum(self.index_path)
(_, hash, _) = _git(
args=['rev-parse', self.current_treeish],
stdout=_subprocess.PIPE,
wait=True)
with open(self.cache_path, "w") as f:
_json.dump({'treeish': self.current_treeish,
'hash': hash.rstrip(), 'index_checksum': checksum }, f)
@timed
def read_tree(self):
current_checksum = _read_index_checksum(self.index_path)
(_, hash, _) = _git(
args=['rev-parse', self.current_treeish],
stdout=_subprocess.PIPE,
wait=True)
current_hash = hash.rstrip()
if self.current_treeish == self.treeish and \
self.index_checksum and self.index_checksum == current_checksum and \
self.hash and self.hash == current_hash:
return
_git(args=['read-tree', self.current_treeish], wait=True)
def commit(treeish='HEAD', message=None): def commit(treeish='HEAD', message=None):
""" """
Commit prefix-matching tags from the notmuch database to Git. Commit prefix-matching tags from the notmuch database to Git.
""" """
status = get_status() status = get_status()
if _is_committed(status=status): if _is_committed(status=status):
_LOG.warning('Nothing to commit') _LOG.warning('Nothing to commit')
return return
_git(args=['read-tree', '--empty'], wait=True) with CachedIndex(NOTMUCH_GIT_DIR, treeish) as index:
_git(args=['read-tree', treeish], wait=True) try:
try: _update_index(status=status)
_update_index(status=status) (_, tree, _) = _git(
(_, tree, _) = _git( args=['write-tree'],
args=['write-tree'], stdout=_subprocess.PIPE,
stdout=_subprocess.PIPE, wait=True)
wait=True) (_, parent, _) = _git(
(_, parent, _) = _git( args=['rev-parse', treeish],
args=['rev-parse', treeish], stdout=_subprocess.PIPE,
stdout=_subprocess.PIPE, wait=True)
wait=True) (_, commit, _) = _git(
(_, commit, _) = _git( args=['commit-tree', tree.strip(), '-p', parent.strip()],
args=['commit-tree', tree.strip(), '-p', parent.strip()], input=message,
input=message, stdout=_subprocess.PIPE,
stdout=_subprocess.PIPE, wait=True)
wait=True) _git(
_git( args=['update-ref', treeish, commit.strip()],
args=['update-ref', treeish, commit.strip()], stdout=_subprocess.PIPE,
stdout=_subprocess.PIPE, wait=True)
wait=True) except Exception as e:
except Exception as e: _git(args=['read-tree', '--empty'], wait=True)
_git(args=['read-tree', '--empty'], wait=True) _git(args=['read-tree', treeish], wait=True)
_git(args=['read-tree', treeish], wait=True) raise
raise
@timed @timed
def _update_index(status): def _update_index(status):
@ -582,50 +640,160 @@ def get_status():
'deleted': {}, 'deleted': {},
'missing': {}, 'missing': {},
} }
index = _index_tags() with PrivateIndex(repo=NOTMUCH_GIT_DIR, prefix=TAG_PREFIX) as index:
maybe_deleted = _diff_index(index=index, filter='D') maybe_deleted = index.diff(filter='D')
for id, tags in maybe_deleted.items(): for id, tags in maybe_deleted.items():
(_, stdout, stderr) = _spawn( (_, stdout, stderr) = _spawn(
args=['notmuch', 'search', '--output=files', 'id:{0}'.format(id)], args=['notmuch', 'search', '--output=files', 'id:{0}'.format(id)],
stdout=_subprocess.PIPE, stdout=_subprocess.PIPE,
wait=True) wait=True)
if stdout: if stdout:
status['deleted'][id] = tags status['deleted'][id] = tags
else: else:
status['missing'][id] = tags status['missing'][id] = tags
status['added'] = _diff_index(index=index, filter='A') status['added'] = index.diff(filter='A')
_os.remove(index)
return status return status
@timed class PrivateIndex:
def _index_tags(): def __init__(self, repo, prefix):
"Write notmuch tags to the nmbug.index." try:
path = _os.path.join(NOTMUCH_GIT_DIR, 'nmbug.index') _os.makedirs(_os.path.join(repo, 'notmuch'))
prefix = '+{0}'.format(_ENCODED_TAG_PREFIX) except FileExistsError:
_git( pass
args=['read-tree', '--empty'],
additional_env={'GIT_INDEX_FILE': path}, wait=True)
with _spawn(
args=['notmuch', 'dump', '--format=batch-tag', '--query=sexp', '--', _tag_query()],
stdout=_subprocess.PIPE) as notmuch:
with _git(
args=['update-index', '--index-info'],
stdin=_subprocess.PIPE,
additional_env={'GIT_INDEX_FILE': path}) as git:
for line in notmuch.stdout:
if line.strip().startswith('#'):
continue
(tags_string, id) = [_.strip() for _ in line.split(' -- id:')]
tags = [
_unquote(tag[len(prefix):])
for tag in tags_string.split()
if tag.startswith(prefix)]
id = _xapian_unquote(string=id)
for line in _index_tags_for_message(
id=id, status='A', tags=tags):
git.stdin.write(line)
return path
file_name = 'notmuch/index'
self.index_path = _os.path.join(repo, file_name)
self.cache_path = _os.path.join(repo, 'notmuch', '{:s}.json'.format(_hex_quote(file_name)))
self.current_prefix = prefix
self.prefix = None
self.uuid = None
self.lastmod = None
self.checksum = None
self._load_cache_file()
self._index_tags()
def __enter__(self):
return self
def __exit__(self, type, value, traceback):
checksum = _read_index_checksum(self.index_path)
(count, uuid, lastmod) = _read_database_lastmod()
with open(self.cache_path, "w") as f:
_json.dump({'prefix': self.current_prefix, 'uuid': uuid, 'lastmod': lastmod, 'checksum': checksum }, f)
def _load_cache_file(self):
try:
with open(self.cache_path) as f:
data = _json.load(f)
self.prefix = data['prefix']
self.uuid = data['uuid']
self.lastmod = data['lastmod']
self.checksum = data['checksum']
except FileNotFoundError:
return None
except _json.JSONDecodeError:
_LOG.error("Error decoding cache")
_sys.exit(1)
@timed
def _index_tags(self):
"Write notmuch tags to private git index."
prefix = '+{0}'.format(_ENCODED_TAG_PREFIX)
current_checksum = _read_index_checksum(self.index_path)
if (self.prefix == None or self.prefix != self.current_prefix
or self.checksum == None or self.checksum != current_checksum):
_git(
args=['read-tree', '--empty'],
additional_env={'GIT_INDEX_FILE': self.index_path}, wait=True)
query = _tag_query()
clear_tags = False
(count,uuid,lastmod) = _read_database_lastmod()
if self.prefix == self.current_prefix and self.uuid \
and self.uuid == uuid and self.checksum == current_checksum:
query = '(and (infix "lastmod:{:d}..")) {:s})'.format(self.lastmod+1, query)
clear_tags = True
with _spawn(
args=['notmuch', 'dump', '--format=batch-tag', '--query=sexp', '--', query],
stdout=_subprocess.PIPE) as notmuch:
with _git(
args=['update-index', '--index-info'],
stdin=_subprocess.PIPE,
additional_env={'GIT_INDEX_FILE': self.index_path}) as git:
for line in notmuch.stdout:
if line.strip().startswith('#'):
continue
(tags_string, id) = [_.strip() for _ in line.split(' -- id:')]
tags = [
_unquote(tag[len(prefix):])
for tag in tags_string.split()
if tag.startswith(prefix)]
id = _xapian_unquote(string=id)
if clear_tags:
for line in _clear_tags_for_message(index=self.index_path, id=id):
git.stdin.write(line)
for line in _index_tags_for_message(
id=id, status='A', tags=tags):
git.stdin.write(line)
@timed
def diff(self, filter):
"""
Get an {id: {tag, ...}} dict for a given filter.
For example, use 'A' to find added tags, and 'D' to find deleted tags.
"""
s = _collections.defaultdict(set)
with _git(
args=[
'diff-index', '--cached', '--diff-filter', filter,
'--name-only', 'HEAD'],
additional_env={'GIT_INDEX_FILE': self.index_path},
stdout=_subprocess.PIPE) as p:
# Once we drop Python < 3.3, we can use 'yield from' here
for id, tag in _unpack_diff_lines(stream=p.stdout):
s[id].add(tag)
return s
def _read_index_checksum (index_path):
"""Read the index checksum, as defined by index-format.txt in the git source
WARNING: assumes SHA1 repo"""
import binascii
try:
with open(index_path, 'rb') as f:
size=_os.path.getsize(index_path)
f.seek(size-20);
return binascii.hexlify(f.read(20)).decode('ascii')
except FileNotFoundError:
return None
def _clear_tags_for_message(index, id):
"""
Clear any existing index entries for message 'id'
Neither 'id' nor the tags in 'tags' should be encoded/escaped.
"""
dir = 'tags/{id}'.format(id=_hex_quote(string=id))
with _git(
args=['ls-files', dir],
additional_env={'GIT_INDEX_FILE': index},
stdout=_subprocess.PIPE) as git:
for file in git.stdout:
line = '0 0000000000000000000000000000000000000000\t{:s}\n'.format(file.strip())
yield line
def _read_database_lastmod():
with _spawn(
args=['notmuch', 'count', '--lastmod', '*'],
stdout=_subprocess.PIPE) as notmuch:
(count,uuid,lastmod_str) = notmuch.stdout.readline().split()
return (count,uuid,int(lastmod_str))
def _index_tags_for_message(id, status, tags): def _index_tags_for_message(id, status, tags):
""" """
@ -646,26 +814,6 @@ def _index_tags_for_message(id, status, tags):
yield '{mode} {hash}\t{path}\n'.format(mode=mode, hash=hash, path=path) yield '{mode} {hash}\t{path}\n'.format(mode=mode, hash=hash, path=path)
@timed
def _diff_index(index, filter):
"""
Get an {id: {tag, ...}} dict for a given filter.
For example, use 'A' to find added tags, and 'D' to find deleted tags.
"""
s = _collections.defaultdict(set)
with _git(
args=[
'diff-index', '--cached', '--diff-filter', filter,
'--name-only', 'HEAD'],
additional_env={'GIT_INDEX_FILE': index},
stdout=_subprocess.PIPE) as p:
# Once we drop Python < 3.3, we can use 'yield from' here
for id, tag in _unpack_diff_lines(stream=p.stdout):
s[id].add(tag)
return s
def _diff_refs(filter, a='HEAD', b='@{upstream}'): def _diff_refs(filter, a='HEAD', b='@{upstream}'):
with _git( with _git(
args=['diff', '--diff-filter', filter, '--name-only', a, b], args=['diff', '--diff-filter', filter, '--name-only', a, b],

View file

@ -33,6 +33,47 @@ notmuch tag '-"quoted tag"' '*'
git -C clone2.git ls-tree -r --name-only HEAD | grep /inbox > AFTER git -C clone2.git ls-tree -r --name-only HEAD | grep /inbox > AFTER
test_expect_equal_file_nonempty BEFORE AFTER test_expect_equal_file_nonempty BEFORE AFTER
test_begin_subtest "commit (incremental)"
notmuch tag +test id:20091117190054.GU3165@dottiness.seas.harvard.edu
notmuch git -C tags.git -p '' commit
git -C tags.git ls-tree -r --name-only HEAD |
grep 20091117190054 | sort > OUTPUT
echo "--------------------------------------------------" >> OUTPUT
notmuch tag -test id:20091117190054.GU3165@dottiness.seas.harvard.edu
notmuch git -C tags.git -p '' commit
git -C tags.git ls-tree -r --name-only HEAD |
grep 20091117190054 | sort >> OUTPUT
cat <<EOF > EXPECTED
tags/20091117190054.GU3165@dottiness.seas.harvard.edu/inbox
tags/20091117190054.GU3165@dottiness.seas.harvard.edu/signed
tags/20091117190054.GU3165@dottiness.seas.harvard.edu/test
tags/20091117190054.GU3165@dottiness.seas.harvard.edu/unread
--------------------------------------------------
tags/20091117190054.GU3165@dottiness.seas.harvard.edu/inbox
tags/20091117190054.GU3165@dottiness.seas.harvard.edu/signed
tags/20091117190054.GU3165@dottiness.seas.harvard.edu/unread
EOF
test_expect_equal_file_nonempty EXPECTED OUTPUT
test_begin_subtest "commit (change prefix)"
notmuch tag +test::one id:20091117190054.GU3165@dottiness.seas.harvard.edu
notmuch git -C tags.git -p 'test::' commit
git -C tags.git ls-tree -r --name-only HEAD |
grep 20091117190054 | sort > OUTPUT
echo "--------------------------------------------------" >> OUTPUT
notmuch tag -test::one id:20091117190054.GU3165@dottiness.seas.harvard.edu
notmuch git -C tags.git -p '' commit
git -C tags.git ls-tree -r --name-only HEAD |
grep 20091117190054 | sort >> OUTPUT
cat <<EOF > EXPECTED
tags/20091117190054.GU3165@dottiness.seas.harvard.edu/one
--------------------------------------------------
tags/20091117190054.GU3165@dottiness.seas.harvard.edu/inbox
tags/20091117190054.GU3165@dottiness.seas.harvard.edu/signed
tags/20091117190054.GU3165@dottiness.seas.harvard.edu/unread
EOF
test_expect_equal_file_nonempty EXPECTED OUTPUT
test_begin_subtest "checkout" test_begin_subtest "checkout"
notmuch dump > BEFORE notmuch dump > BEFORE
notmuch tag -inbox '*' notmuch tag -inbox '*'