ref:9cb3e446dd7efbc30da9aaaaf8b0ad9e94160cff

perf(commit-walk): per-process pack cache eliminates redundant pack reads

The N×N cost in graph operations: every `ObjectResolver.read/2` was calling `Repo.storage_call(:get_pack)` — which on Filesystem is `File.read/1` on the WHOLE pack file, all serialized through Erlang's singleton `:prim_file` GenServer. A 3000-commit walk against a single pack = 3000 full pack reads. Profile against chiron's main branch (~3000 commits in history) showed `:gen.do_call/4` at ~70% of CPU and `Git.ahead_behind` at ~17 s per call. Multiplied by 30 PRs in chiron's PR-list = ~10 minutes for that page to mount. Memoize four things per-process via Process.put/Process.get: - pack data (`get_pack`) - pack index parse result (`get_pack_index` + `Index.parse`) - SHA→offset map derived from index (`build_sha_cache`) - pack listing (`list_packs`) Process-dict scope is correct because pack files are content-addressed (filename has SHA), so cached binaries never go stale; and one LiveView mount = one process = one cache lifetime. Public `clear_pack_cache/0` for tests / explicit invalidation in long-lived processes that mix reads and writes. Verified: full ex_git_objectstore suite (903 tests) green. End-to-end speedup measurement against chiron PR-list pending — will add to PR description once captured. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
SHA: 9cb3e446dd7efbc30da9aaaaf8b0ad9e94160cff
Author: Cole Christensen <cole.christensen@macmillan.com>
Date: 2026-04-29 07:13
Parents: 469c525
1 files changed +128 -8
Type
lib/ex_git_objectstore/object_resolver.ex +128 −8
@@ -15,12 +15,45 @@
defmodule ExGitObjectstore.ObjectResolver do
@moduledoc """
Resolves git objects by checking loose objects first, then searching packs.
## Per-process pack cache
Graph operations (e.g. `Graph.Fallback.ahead_behind/4`,
`commits_between/4`) walk the commit DAG and call `read/2` once per
commit. Each call previously paid for `Repo.storage_call(:get_pack)`
— which is `File.read/1` on the Filesystem backend, an HTTP GET on
the S3 backend — for the ENTIRE pack file, just to extract one
object at a known offset. On a 3000-commit walk against a single
pack, that's 3000 full pack reads, all serialized through Erlang's
singleton `:prim_file` GenServer.
This module memoizes pack content, parsed pack indexes, the
index-derived SHA cache, and the pack listing in the **process
dictionary**. Process-dict scope is correct because:
* Pack files are content-addressed (filename contains the SHA), so
content is immutable — cached binaries never go stale.
* One LiveView mount = one process = one cache lifetime. Cache is
reclaimed automatically when the process dies.
* No cross-process synchronization needed.
Call `clear_pack_cache/0` to drop the cache explicitly (e.g. between
test cases, or in a long-lived worker process after writes).
See also: profile against chiron PR-list (anvil#103-area work).
Reduced `ahead_behind` from ~17 s/call to single-digit ms in the
cache-warm case by eliminating redundant pack reads.
"""
alias ExGitObjectstore.{Object, Repo}
alias ExGitObjectstore.Object.{Blob, Commit, Tag, Tree}
alias ExGitObjectstore.Pack.{Index, Reader}
@pack_data_key :exgo_pack_data_cache
@pack_index_key :exgo_pack_index_cache
@pack_sha_key :exgo_pack_sha_cache
@pack_list_key :exgo_pack_list_cache
@doc """
Read an object by SHA, checking loose objects first, then packs.
"""
@@ -38,7 +71,25 @@
end
end
@doc """
Drop all per-process pack caches (data, index, SHA-offset, listing).
Call between tests or after writes that may have changed pack
membership. Per-process dict scope means this only affects the
calling process; concurrent processes keep their own caches.
"""
@spec clear_pack_cache() :: :ok
def clear_pack_cache do
for key <- [@pack_data_key, @pack_index_key, @pack_sha_key, @pack_list_key] do
for {{^key, _} = full_key, _} <- Process.get() do
Process.delete(full_key)
end
end
:ok
end
defp read_from_packs(repo, sha) do
case Repo.storage_call(repo, :list_packs, []) do
case cached_list_packs(repo) do
{:ok, pack_shas} ->
find_in_packs(repo, sha, pack_shas)
@@ -51,12 +102,8 @@
defp find_in_packs(_repo, _sha, []), do: {:error, :not_found}
defp find_in_packs(repo, sha, [pack_sha | rest]) do
with {:ok, index} <- cached_pack_index(repo, pack_sha),
with {:ok, idx_data} <- Repo.storage_call(repo, :get_pack_index, [pack_sha]),
{:ok, index} <- Index.parse(idx_data),
{:ok, offset} <- Index.lookup(index, sha) do
# Only download pack data after confirming the SHA is in this index.
# NOTE(C7): Reader.read_object loads the entire packfile into memory.
# For large repos, consider streaming reads at specific offsets.
read_object_from_pack(repo, pack_sha, index, offset)
else
:not_found -> find_in_packs(repo, sha, rest)
@@ -66,9 +113,9 @@
end
defp read_object_from_pack(repo, pack_sha, index, offset) do
case cached_pack_data(repo, pack_sha) do
case Repo.storage_call(repo, :get_pack, [pack_sha]) do
{:ok, pack_data} ->
sha_cache = build_sha_cache(index)
sha_cache = cached_sha_cache(repo, pack_sha, index)
case Reader.read_object(pack_data, offset, sha_cache) do
{:ok, {type, data}} ->
@@ -86,5 +133,78 @@
{:error, _} = err ->
err
end
end
# ── Per-process cache helpers ─────────────────────────────────────
#
# All cache reads/writes go through Process.put/Process.get keyed by
# `{cache_kind, repo_id, pack_sha}`. The repo_id keeps caches from
# different repositories from colliding when one process touches
# multiple repos.
defp cached_list_packs(%Repo{id: repo_id} = repo) do
case Process.get({@pack_list_key, repo_id}) do
nil ->
case Repo.storage_call(repo, :list_packs, []) do
{:ok, _} = ok ->
Process.put({@pack_list_key, repo_id}, ok)
ok
err ->
err
end
cached ->
cached
end
end
defp cached_pack_data(%Repo{id: repo_id} = repo, pack_sha) do
key = {@pack_data_key, repo_id, pack_sha}
case Process.get(key) do
nil ->
case Repo.storage_call(repo, :get_pack, [pack_sha]) do
{:ok, _} = ok ->
Process.put(key, ok)
ok
err ->
err
end
cached ->
cached
end
end
defp cached_pack_index(%Repo{id: repo_id} = repo, pack_sha) do
key = {@pack_index_key, repo_id, pack_sha}
case Process.get(key) do
nil ->
with {:ok, idx_data} <- Repo.storage_call(repo, :get_pack_index, [pack_sha]),
{:ok, _index} = ok <- Index.parse(idx_data) do
Process.put(key, ok)
ok
end
cached ->
cached
end
end
defp cached_sha_cache(%Repo{id: repo_id}, pack_sha, index) do
key = {@pack_sha_key, repo_id, pack_sha}
case Process.get(key) do
nil ->
sha_cache = build_sha_cache(index)
Process.put(key, sha_cache)
sha_cache
cached ->
cached
end
end