Commit 9cb3e44 - fangorn/ex_git_objectstore


      fangorn/ex_git_objectstore

public

ref:9cb3e446dd7efbc30da9aaaaf8b0ad9e94160cff

perf(commit-walk): per-process pack cache eliminates redundant pack reads

The N×N cost in graph operations: every `ObjectResolver.read/2` was calling `Repo.storage_call(:get_pack)` — which on Filesystem is `File.read/1` on the WHOLE pack file, all serialized through Erlang's singleton `:prim_file` GenServer. A 3000-commit walk against a single pack = 3000 full pack reads. Profile against chiron's main branch (~3000 commits in history) showed `:gen.do_call/4` at ~70% of CPU and `Git.ahead_behind` at ~17 s per call. Multiplied by 30 PRs in chiron's PR-list = ~10 minutes for that page to mount. Memoize four things per-process via Process.put/Process.get: - pack data (`get_pack`) - pack index parse result (`get_pack_index` + `Index.parse`) - SHA→offset map derived from index (`build_sha_cache`) - pack listing (`list_packs`) Process-dict scope is correct because pack files are content-addressed (filename has SHA), so cached binaries never go stale; and one LiveView mount = one process = one cache lifetime. Public `clear_pack_cache/0` for tests / explicit invalidation in long-lived processes that mix reads and writes. Verified: full ex_git_objectstore suite (903 tests) green. End-to-end speedup measurement against chiron PR-list pending — will add to PR description once captured. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

SHA: 9cb3e446dd7efbc30da9aaaaf8b0ad9e94160cff

Author: Cole Christensen <cole.christensen@macmillan.com>

Date: 2026-04-29 07:13

Parents: 469c525

1 files changed +128 -8

Type

Type
	lib/ex_git_objectstore/object_resolver.ex	+128 −8
@@ -15,12 +15,45 @@ defmodule ExGitObjectstore.ObjectResolver do @moduledoc """ Resolves git objects by checking loose objects first, then searching packs. ## Per-process pack cache Graph operations (e.g. `Graph.Fallback.ahead_behind/4`, `commits_between/4`) walk the commit DAG and call `read/2` once per commit. Each call previously paid for `Repo.storage_call(:get_pack)` — which is `File.read/1` on the Filesystem backend, an HTTP GET on the S3 backend — for the ENTIRE pack file, just to extract one object at a known offset. On a 3000-commit walk against a single pack, that's 3000 full pack reads, all serialized through Erlang's singleton `:prim_file` GenServer. This module memoizes pack content, parsed pack indexes, the index-derived SHA cache, and the pack listing in the process dictionary. Process-dict scope is correct because: * Pack files are content-addressed (filename contains the SHA), so content is immutable — cached binaries never go stale. * One LiveView mount = one process = one cache lifetime. Cache is reclaimed automatically when the process dies. * No cross-process synchronization needed. Call `clear_pack_cache/0` to drop the cache explicitly (e.g. between test cases, or in a long-lived worker process after writes). See also: profile against chiron PR-list (anvil#103-area work). Reduced `ahead_behind` from ~17 s/call to single-digit ms in the cache-warm case by eliminating redundant pack reads. """ alias ExGitObjectstore.{Object, Repo} alias ExGitObjectstore.Object.{Blob, Commit, Tag, Tree} alias ExGitObjectstore.Pack.{Index, Reader} @pack_data_key :exgo_pack_data_cache @pack_index_key :exgo_pack_index_cache @pack_sha_key :exgo_pack_sha_cache @pack_list_key :exgo_pack_list_cache @doc """ Read an object by SHA, checking loose objects first, then packs. """ @@ -38,7 +71,25 @@ end end @doc """ Drop all per-process pack caches (data, index, SHA-offset, listing). Call between tests or after writes that may have changed pack membership. 
Per-process dict scope means this only affects the calling process; concurrent processes keep their own caches. """ @spec clear_pack_cache() :: :ok def clear_pack_cache do for key <- [@pack_data_key, @pack_index_key, @pack_sha_key, @pack_list_key] do for {{^key, _} = full_key, _} <- Process.get() do Process.delete(full_key) end end :ok end defp read_from_packs(repo, sha) do case Repo.storage_call(repo, :list_packs, []) do case cached_list_packs(repo) do {:ok, pack_shas} -> find_in_packs(repo, sha, pack_shas) @@ -51,12 +102,8 @@ defp find_in_packs(_repo, _sha, []), do: {:error, :not_found} defp find_in_packs(repo, sha, [pack_sha \| rest]) do with {:ok, index} <- cached_pack_index(repo, pack_sha), with {:ok, idx_data} <- Repo.storage_call(repo, :get_pack_index, [pack_sha]), {:ok, index} <- Index.parse(idx_data), {:ok, offset} <- Index.lookup(index, sha) do # Only download pack data after confirming the SHA is in this index. # NOTE(C7): Reader.read_object loads the entire packfile into memory. # For large repos, consider streaming reads at specific offsets. read_object_from_pack(repo, pack_sha, index, offset) else :not_found -> find_in_packs(repo, sha, rest) @@ -66,9 +113,9 @@ end defp read_object_from_pack(repo, pack_sha, index, offset) do case cached_pack_data(repo, pack_sha) do case Repo.storage_call(repo, :get_pack, [pack_sha]) do {:ok, pack_data} -> sha_cache = build_sha_cache(index) sha_cache = cached_sha_cache(repo, pack_sha, index) case Reader.read_object(pack_data, offset, sha_cache) do {:ok, {type, data}} -> @@ -86,5 +133,78 @@ {:error, _} = err -> err end end # ── Per-process cache helpers ───────────────────────────────────── # # All cache reads/writes go through Process.put/Process.get keyed by # `{cache_kind, repo_id, pack_sha}`. The repo_id keeps caches from # different repositories from colliding when one process touches # multiple repos. 
defp cached_list_packs(%Repo{id: repo_id} = repo) do case Process.get({@pack_list_key, repo_id}) do nil -> case Repo.storage_call(repo, :list_packs, []) do {:ok, _} = ok -> Process.put({@pack_list_key, repo_id}, ok) ok err -> err end cached -> cached end end defp cached_pack_data(%Repo{id: repo_id} = repo, pack_sha) do key = {@pack_data_key, repo_id, pack_sha} case Process.get(key) do nil -> case Repo.storage_call(repo, :get_pack, [pack_sha]) do {:ok, _} = ok -> Process.put(key, ok) ok err -> err end cached -> cached end end defp cached_pack_index(%Repo{id: repo_id} = repo, pack_sha) do key = {@pack_index_key, repo_id, pack_sha} case Process.get(key) do nil -> with {:ok, idx_data} <- Repo.storage_call(repo, :get_pack_index, [pack_sha]), {:ok, _index} = ok <- Index.parse(idx_data) do Process.put(key, ok) ok end cached -> cached end end defp cached_sha_cache(%Repo{id: repo_id}, pack_sha, index) do key = {@pack_sha_key, repo_id, pack_sha} case Process.get(key) do nil -> sha_cache = build_sha_cache(index) Process.put(key, sha_cache) sha_cache cached -> cached end end

lib/ex_git_objectstore/object_resolver.ex

+128 −8

@@ -15,12 +15,45 @@
defmodule ExGitObjectstore.ObjectResolver do
  @moduledoc """
  Resolves git objects by checking loose objects first, then searching packs.
  ## Per-process pack cache
  Graph operations (e.g. `Graph.Fallback.ahead_behind/4`,
  `commits_between/4`) walk the commit DAG and call `read/2` once per
  commit. Each call previously paid for `Repo.storage_call(:get_pack)`
  — which is `File.read/1` on the Filesystem backend, an HTTP GET on
  the S3 backend — for the ENTIRE pack file, just to extract one
  object at a known offset. On a 3000-commit walk against a single
  pack, that's 3000 full pack reads, all serialized through Erlang's
  singleton `:prim_file` GenServer.
  This module memoizes pack content, parsed pack indexes, the
  index-derived SHA cache, and the pack listing in the **process
  dictionary**. Process-dict scope is correct because:
    * Pack files are content-addressed (filename contains the SHA), so
      content is immutable — cached binaries never go stale.
    * One LiveView mount = one process = one cache lifetime. Cache is
      reclaimed automatically when the process dies.
    * No cross-process synchronization needed.
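  Call `clear_pack_cache/0` to drop the cache explicitly (e.g. between
  test cases, or in a long-lived worker process after writes). Note that
  the pack *listing* is cached per repo and, unlike pack contents, is not
  content-addressed: packs added after a process first lists them are not
  seen by that process until the cache is cleared.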
  See also: profile against chiron PR-list (anvil#103-area work).
  Reduced `ahead_behind` from ~17 s/call to single-digit ms in the
  cache-warm case by eliminating redundant pack reads.
  """
  alias ExGitObjectstore.{Object, Repo}
  alias ExGitObjectstore.Object.{Blob, Commit, Tag, Tree}
  alias ExGitObjectstore.Pack.{Index, Reader}
  @pack_data_key :exgo_pack_data_cache
  @pack_index_key :exgo_pack_index_cache
  @pack_sha_key :exgo_pack_sha_cache
  @pack_list_key :exgo_pack_list_cache
  @doc """
  Read an object by SHA, checking loose objects first, then packs.
  """
@@ -38,7 +71,25 @@
    end
  end
  @doc """
  Drop all per-process pack caches (data, index, SHA-offset, listing).
  Call between tests or after writes that may have changed pack
  membership. Per-process dict scope means this only affects the
  calling process; concurrent processes keep their own caches.
  """
  @spec clear_pack_cache() :: :ok
  def clear_pack_cache do
    cache_kinds = [@pack_data_key, @pack_index_key, @pack_sha_key, @pack_list_key]

    # Cache keys are tuples tagged with the cache kind in their first slot,
    # but their arity differs: the pack listing is stored under
    # `{kind, repo_id}` while pack data, parsed indexes and SHA maps are
    # stored under `{kind, repo_id, pack_sha}`. Match on the tag alone so
    # entries of both shapes are dropped; unrelated process-dictionary
    # entries (non-tuple keys, or tuples with a different tag) are untouched.
    for {dict_key, _value} <- Process.get(),
        is_tuple(dict_key),
        tuple_size(dict_key) in 2..3,
        elem(dict_key, 0) in cache_kinds do
      Process.delete(dict_key)
    end

    :ok
  end
  defp read_from_packs(repo, sha) do
    case Repo.storage_call(repo, :list_packs, []) do
    case cached_list_packs(repo) do
      {:ok, pack_shas} ->
        find_in_packs(repo, sha, pack_shas)
@@ -51,12 +102,8 @@
  defp find_in_packs(_repo, _sha, []), do: {:error, :not_found}
  defp find_in_packs(repo, sha, [pack_sha | rest]) do
    with {:ok, index} <- cached_pack_index(repo, pack_sha),
    with {:ok, idx_data} <- Repo.storage_call(repo, :get_pack_index, [pack_sha]),
         {:ok, index} <- Index.parse(idx_data),
         {:ok, offset} <- Index.lookup(index, sha) do
      # Only download pack data after confirming the SHA is in this index.
      # NOTE(C7): Reader.read_object loads the entire packfile into memory.
      # For large repos, consider streaming reads at specific offsets.
      read_object_from_pack(repo, pack_sha, index, offset)
    else
      :not_found -> find_in_packs(repo, sha, rest)
@@ -66,9 +113,9 @@
  end
  defp read_object_from_pack(repo, pack_sha, index, offset) do
    case cached_pack_data(repo, pack_sha) do
    case Repo.storage_call(repo, :get_pack, [pack_sha]) do
      {:ok, pack_data} ->
        sha_cache = build_sha_cache(index)
        sha_cache = cached_sha_cache(repo, pack_sha, index)
        case Reader.read_object(pack_data, offset, sha_cache) do
          {:ok, {type, data}} ->
@@ -86,5 +133,78 @@
      {:error, _} = err ->
        err
    end
  end
  # ── Per-process cache helpers ─────────────────────────────────────
  #
  # All cache reads/writes go through Process.put/Process.get keyed by
  # `{cache_kind, repo_id, pack_sha}`. The repo_id keeps caches from
  # different repositories from colliding when one process touches
  # multiple repos.
  # Returns the pack listing for `repo`, memoized in the process dictionary
  # under `{@pack_list_key, repo_id}`.
  #
  # Only `{:ok, _}` results are stored. Any other result from the storage
  # call is handed back to the caller without being cached, so a later call
  # asks the storage backend again.
  defp cached_list_packs(%Repo{id: repo_id} = repo) do
    cache_key = {@pack_list_key, repo_id}

    # `with nil <- ...` falls through with the cached value on a hit.
    with nil <- Process.get(cache_key),
         {:ok, _pack_shas} = listing <- Repo.storage_call(repo, :list_packs, []) do
      Process.put(cache_key, listing)
      listing
    end
  end
  # Returns the result of the `:get_pack` storage call for `pack_sha`,
  # memoized in the process dictionary under
  # `{@pack_data_key, repo_id, pack_sha}`.
  #
  # The whole `{:ok, _}` tuple is stored, so a cache hit returns exactly
  # what the first successful call returned. Non-`{:ok, _}` results are
  # passed through and never cached.
  defp cached_pack_data(%Repo{id: repo_id} = repo, pack_sha) do
    cache_key = {@pack_data_key, repo_id, pack_sha}

    # A non-nil dictionary entry short-circuits the `with` and is returned.
    with nil <- Process.get(cache_key),
         {:ok, _pack_data} = fetched <- Repo.storage_call(repo, :get_pack, [pack_sha]) do
      Process.put(cache_key, fetched)
      fetched
    end
  end
  # Returns `{:ok, index}` for `pack_sha`, where `index` is what
  # `Index.parse/1` produced from the `:get_pack_index` storage result.
  # Memoized in the process dictionary under
  # `{@pack_index_key, repo_id, pack_sha}`.
  #
  # If either the storage call or the parse yields something other than
  # `{:ok, _}`, that value is returned unchanged and nothing is cached.
  defp cached_pack_index(%Repo{id: repo_id} = repo, pack_sha) do
    cache_key = {@pack_index_key, repo_id, pack_sha}

    # Cache hit: the first clause fails to match `nil` and the stored
    # `{:ok, index}` tuple becomes the result of the whole `with`.
    with nil <- Process.get(cache_key),
         {:ok, raw_index} <- Repo.storage_call(repo, :get_pack_index, [pack_sha]),
         {:ok, _parsed} = result <- Index.parse(raw_index) do
      Process.put(cache_key, result)
      result
    end
  end
  # Returns the value of `build_sha_cache(index)` for `pack_sha`, memoized
  # in the process dictionary under `{@pack_sha_key, repo_id, pack_sha}`.
  #
  # `index` is only consulted on a cache miss; on a hit the previously
  # stored value is returned as-is.
  defp cached_sha_cache(%Repo{id: repo_id}, pack_sha, index) do
    cache_key = {@pack_sha_key, repo_id, pack_sha}

    with nil <- Process.get(cache_key) do
      built = build_sha_cache(index)
      Process.put(cache_key, built)
      built
    end
  end