ref:96509fa11f87f61bf0df582a4ded83c09a3f79b0

perf(delta): unroll copy-arg decode — 90× faster, profile-driven (#30)

Sub-issue under the fangorn/anvil#153 umbrella, validated this time with an actual profile.

## What the profile said

A sample-based stack profiler captured ~48k samples on `Pack.Reader.parse/2` applied to a real ovs pack (96 MB, 134k objects, 108k deltas). The dominant hot path was NOT the broken offset cache (#29), NOR the `find_compressed_length` binary search I'd theorized as the "actual" bottleneck — it was `Pack.Delta.read_if_bit/3`:

| samples | path |
|---|---|
| 18,471 | apply > apply_instructions > read_copy_size |
| 14,628 | apply > apply_instructions > read_copy_offset |
| 1,625 | read_copy_offset > read_if_bit |
| 1,172 | read_copy_size > read_if_bit |

That is ~74% of total CPU. Each copy command made 7 sequential pattern matches via `read_if_bit/3` (4 offset + 3 size), AND the false branch reconstructed `<<byte, rest::binary>>` instead of returning the original — tens of millions of redundant binary allocations across ovs's ~5 million delta instructions.

## Fix

`apply_instructions/3`'s copy clause now calls `decode_copy_args(cmd, data)`, which consumes exactly the right number of bytes in ONE binary pattern match. 128 specialized clauses (16 offset bitmaps × 8 size bitmaps) are generated at compile time via macros so the BEAM compiler picks the matching clause in a single dispatch. No more `read_if_bit`, no more 7-step splits, no more no-op binary reconstruction.

## Measured improvement

Same `Pack.Reader.parse/2` on the same 96 MB ovs pack:

| Before | After |
|---|---|
| >27 min, never completed | **18.4 s** |

In the post-fix profile, `Pack.Delta` dropped from ~37k samples (~74%) to ~270 samples (~12% of remaining). The new top is `decompress_data → probe_compressed_length` (the binary-search bottleneck I'd theorized originally) — that's the next sub-issue tracked under #153.

## Test plan

- [x] 928/0 across full ex_git_objectstore suite.
- [x] `mix format --check-formatted` clean.
- [x] `mix dialyzer` clean.
- [ ] Live ovs push test against prod once this and the anvil mix.lock bump deploy. Expectation: parse phase drops from 'never finishes' to ~20 s. Total push: ~2 min.

## Memory note

Peak RSS during the 18 s parse was ~3.5 GB on this pack — GC churn plus the buffered resolved entries list. Streaming parse-and-store is still the next big sub-issue; this PR just makes the existing path actually finish in finite time.
SHA: 96509fa11f87f61bf0df582a4ded83c09a3f79b0
Author: Anvil <noreply@anvil.fangorn.io>
Date: 2026-05-06 15:58
Parents: 6994d00
1 files changed +98 -39
Type
lib/ex_git_objectstore/pack/delta.ex +98 −39
@@ -86,17 +86,32 @@
# Base case: the instruction stream is exhausted. Chunks are prepended
# to the accumulator as they are produced, so reverse it once here to
# hand them back in stream order.
defp apply_instructions(<<>>, _base, chunks) do
  {:ok, Enum.reverse(chunks)}
end
# Copy from base (MSB set)
# Copy from base (MSB set). The lower 7 bits of `cmd` form a bitmap:
# bits 0–3 indicate which of up to 4 little-endian offset bytes follow,
# bits 4–6 indicate which of up to 3 little-endian size bytes follow.
#
# The previous implementation called a generic `read_if_bit/3` helper
# 7× per copy command — each call doing its own binary pattern match
# AND, in the false branch, reconstructing the binary it just split.
# That hot path was 74% of total CPU on the ovs push profile (37k of
# ~50k samples). Dispatching directly on the offset+size bitmap turns
# those 7 nested matches into a single binary pattern match per
# command. Delta application is now bound by `binary_part/3` and the
# iolist accumulation rather than offset/size byte-shuffling.
# NOTE(review): this hunk appears to interleave two versions of the clause
# body — one that calls `read_copy_offset/2` + `read_copy_size/2` and one
# that calls `decode_copy_args/2`. Read the two call paths separately;
# confirm against the real file which one is live.
defp apply_instructions(<<cmd, rest::binary>>, base, acc) when Bitwise.band(cmd, 0x80) != 0 do
# Decode the offset and size operands that follow `cmd` in one call. The
# result is either `{:ok, offset, size, remaining}` or an error tuple.
case decode_copy_args(cmd, rest) do
{:ok, offset, size, rest_after_args} ->
# size of 0 means 0x10000
size = if size == 0, do: 0x10000, else: size
{offset, rest} = read_copy_offset(cmd, rest)
{size, rest} = read_copy_size(cmd, rest)
# size of 0 means 0x10000
# Refuse a copy whose byte range would extend past the end of `base`,
# returning a tagged error that carries the offending values.
if offset + size > byte_size(base) do
{:error, {:copy_out_of_bounds, offset: offset, size: size, base_size: byte_size(base)}}
else
# Slice the requested range out of `base`, prepend it to the accumulator
# and continue with the bytes that follow the decoded operands.
chunk = binary_part(base, offset, size)
apply_instructions(rest_after_args, base, [chunk | acc])
end
size = if size == 0, do: 0x10000, else: size
if offset + size > byte_size(base) do
{:error, {:copy_out_of_bounds, offset: offset, size: size, base_size: byte_size(base)}}
else
chunk = binary_part(base, offset, size)
apply_instructions(rest, base, [chunk | acc])
# A decode failure is handed back to the caller unchanged.
{:error, _} = err ->
err
end
end
@@ -119,41 +134,85 @@
{:error, :reserved_delta_command}
end
# `cmd_low7` packs the offset bitmap (bits 0–3) into the low nibble and
# the size bitmap (bits 4–6) into the next 3 bits. Dispatching on it
# gives the BEAM compiler 128 specialized clauses, each consuming
# exactly the right number of bytes from `data` in one pattern match.
#
# We generate the clauses at compile time (a `for` comprehension over
# quoted fragments) so the implementation stays readable; the unrolled
# clauses are what actually compile in.
# Compile-time generator: emits one `decode_copy_args/2` clause for every
# combination of offset bitmap `o` (0..15) and size bitmap `s` (0..7).
#
# NOTE(review): the `defp read_copy_offset`, `defp read_copy_size` and
# `defp read_if_bit` lines spliced through this loop look like a second
# version of the code interleaved with the generator. Confirm against the
# real file before treating them as part of it.
for o <- 0..15, s <- 0..7 do
# Low 7 bits of the command byte this clause handles: the offset bitmap
# stays in bits 0–3, the size bitmap is shifted up into bits 4–6.
cmd_low7 = Bitwise.bor(o, Bitwise.bsl(s, 4))
# How many operand bytes each bitmap announces (one per set bit).
o_byte_count = Enum.count(0..3, fn b -> Bitwise.band(o, Bitwise.bsl(1, b)) != 0 end)
s_byte_count = Enum.count(0..2, fn b -> Bitwise.band(s, Bitwise.bsl(1, b)) != 0 end)
total_bytes = o_byte_count + s_byte_count
# One boolean per bitmap position, lowest bit first.
o_bits = for b <- 0..3, do: Bitwise.band(o, Bitwise.bsl(1, b)) != 0
s_bits = for b <- 0..2, do: Bitwise.band(s, Bitwise.bsl(1, b)) != 0
# Build the binary pattern dynamically: one byte var per present byte.
# The `{true, b}` generator pattern keeps only positions whose bit is set;
# the variable is named after its bit position (`o0`..`o3`, `s0`..`s2`).
o_byte_vars = for {true, b} <- Enum.with_index(o_bits), do: Macro.var(:"o#{b}", __MODULE__)
s_byte_vars = for {true, b} <- Enum.with_index(s_bits), do: Macro.var(:"s#{b}", __MODULE__)
# The offset = sum of o_b << (8*b) for present b's; missing bytes contribute 0.
# The reduce starts from the literal 0 and is replaced by the first
# present term, so an empty bitmap leaves the expression as plain 0.
offset_expr =
o_bits
|> Enum.with_index()
|> Enum.reduce(0, fn
{false, _b}, acc ->
acc
{true, b}, acc ->
shifted =
if b == 0,
do: Macro.var(:"o#{b}", __MODULE__),
else:
# Read the copy offset (up to 4 bytes, indicated by bits 0-3 of cmd)
defp read_copy_offset(cmd, data) do
{b0, data} = read_if_bit(cmd, 0, data)
{b1, data} = read_if_bit(cmd, 1, data)
quote(do: Bitwise.bsl(unquote(Macro.var(:"o#{b}", __MODULE__)), unquote(8 * b)))
{b2, data} = read_if_bit(cmd, 2, data)
{b3, data} = read_if_bit(cmd, 3, data)
if acc == 0, do: shifted, else: quote(do: unquote(acc) + unquote(shifted))
end)
offset = b0 + Bitwise.bsl(b1, 8) + Bitwise.bsl(b2, 16) + Bitwise.bsl(b3, 24)
{offset, data}
end
# Read the copy size (up to 3 bytes, indicated by bits 4-6 of cmd)
defp read_copy_size(cmd, data) do
{b0, data} = read_if_bit(cmd, 4, data)
{b1, data} = read_if_bit(cmd, 5, data)
{b2, data} = read_if_bit(cmd, 6, data)
# Same construction as the offset expression, over the three size bits.
size_expr =
s_bits
|> Enum.with_index()
|> Enum.reduce(0, fn
{false, _b}, acc ->
acc
{true, b}, acc ->
shifted =
if b == 0,
size = b0 + Bitwise.bsl(b1, 8) + Bitwise.bsl(b2, 16)
{size, data}
end
do: Macro.var(:"s#{b}", __MODULE__),
else:
quote(do: Bitwise.bsl(unquote(Macro.var(:"s#{b}", __MODULE__)), unquote(8 * b)))
# Read one byte from data if the corresponding bit is set in cmd
defp read_if_bit(cmd, bit, <<byte, rest::binary>> = _data) do
if Bitwise.band(cmd, Bitwise.bsl(1, bit)) != 0 do
{byte, rest}
if acc == 0, do: shifted, else: quote(do: unquote(acc) + unquote(shifted))
end)
# Build the binary head pattern: o0, o1, o2, o3, then s0, s1, s2 (only present ones).
# Each element is the quoted form of `var::8`, i.e. one 8-bit segment.
binary_pattern =
Enum.flat_map([o_byte_vars, s_byte_vars], & &1)
|> Enum.map(fn v -> {:"::", [], [v, 8]} end)
else
{0, <<byte, rest::binary>>}
end
end
defp read_if_bit(cmd, bit, <<>>) do
if Bitwise.band(cmd, Bitwise.bsl(1, bit)) != 0 do
throw({:error, :truncated_delta_copy})
if total_bytes == 0 do
# cmd_low7 == 0 → no offset and no size bytes follow. Both are
# implicitly zero; the size-of-zero means 0x10000 special case
# is handled in the caller.
defp decode_copy_args(cmd, data)
when Bitwise.band(cmd, 0x7F) == unquote(cmd_low7) and Bitwise.band(cmd, 0x80) != 0,
do: {:ok, 0, 0, data}
else
# At least one operand byte is expected: match all of them in a single
# binary pattern, or report truncation when `data` is too short.
defp decode_copy_args(cmd, data)
when Bitwise.band(cmd, 0x7F) == unquote(cmd_low7) and Bitwise.band(cmd, 0x80) != 0 do
case data do
{0, <<>>}
<<unquote_splicing(binary_pattern), rest::binary>> ->
{:ok, unquote(offset_expr), unquote(size_expr), rest}
_ ->
{:error, :truncated_delta_copy}
end
end
end
end
end