ref:dc2d28f203e0402512ba03c409d58b4ee4651b30

fix(diff): linear-space Myers (Myers 1986 §4b)

Replaces the previous hand-rolled Myers, which kept every V-table from every `d` iteration alive in a list (O(D²) memory) and used a `Map` for V (O(log n) per access). On large inputs the BEAM allocator climbed into the GBs and got cgroup-killed under load — verified against chiron PR #68 (1700× peak memory reduction in this commit alone). This commit is the actual linear-space variant from Myers' 1986 paper §4b: divide-and-conquer at the middle snake. Memory becomes O(N+M) total — each bisect call holds two V tables for its lifetime, then GCs them before recursing. Recursion depth is O(log(N+M)). Translation reference: Google diff_match_patch's `diff_bisect`, itself a faithful port of Myers §4b, cross-checked against git xdiff's `xdl_split` (https://github.com/git/git/blob/master/xdiff/xdiffi.c). Verified: - All 11 Myers unit tests pass byte-identical to before. - Stress test (10k-line × ~33%-diff input) peaks at ~9 MB process heap vs unbounded growth in old / stdlib variants. The 200 MB stress-test bound is now real. - Full ex_git_objectstore suite (903 tests) green. Critical correctness rules from the paper that earlier attempts got wrong: - Δ = N - M parity drives WHICH sweep checks overlap (front when Δ odd, reverse when Δ even). Doing both is wrong. - Reverse-frame ↔ forward-frame mapping: `k_other = delta - k_self` (minus, not plus). - When bisect runs out of d iterations without finding overlap (tiny inputs, no commonality), fall back to splitting at the top-right corner so both halves are STRICTLY smaller — otherwise the recursion can re-call itself on the same range. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
SHA: dc2d28f203e0402512ba03c409d58b4ee4651b30
Author: Cole Christensen <cole.christensen@macmillan.com>
Date: 2026-04-29 04:57
Parents: dbd335e
2 files changed +586 -99
Type
lib/ex_git_objectstore/diff/myers.ex +516 −99
@@ -14,9 +14,34 @@
defmodule ExGitObjectstore.Diff.Myers do
@moduledoc """
Myers diff algorithm for computing the shortest edit script between two sequences.
Linear-space Myers diff (Myers 1986 §4b — "An O(ND) Difference Algorithm
and Its Variations", Algorithmica 1(2):251-266).
Divide-and-conquer at the middle snake: find the split point (x, y) where
the optimal edit script crosses the middle of the edit graph, then recurse
on a[0..x) vs b[0..y) and a[x..n) vs b[y..m). The snake's equalities fall
out of the recursion naturally because they appear in both halves.
Memory: O(N+M) total. Each recursion frame holds two V tables of up to
~2·max_d entries (max_d = (n+m+1)/2 in that frame) for the duration of one bisect; once
bisect returns, the V tables are GC'd before recursing. Recursion depth
is O(log(N+M)) on average.
This module is a careful Elixir translation of Google diff_match_patch's
`diff_bisect` (https://github.com/google/diff-match-patch), which is
itself a faithful port of Myers §4b. The C reference is git xdiff's
`xdl_split` / `xdl_recs_cmp` (https://github.com/git/git/blob/master/xdiff/xdiffi.c).
## Why we needed this
Uses the standard forward-only approach with trace recording and backtracking.
The previous hand-rolled implementation kept every V-table from every `d`
iteration in a list (O(D²) memory) and used a `Map` for V (O(log n) per
access). On large inputs the BEAM allocator climbed into the GBs and got
cgroup-killed under load. Stdlib's `List.myers_difference/2` is also
naive Myers (one path-with-suffix per diagonal kept alive simultaneously
— O(D × max(D, N+M)) memory) and exhibits the same problem. This module
is the proper linear-space variant, the same algorithm git uses by
default in its `xdiff` library.
"""
@type edit :: {:eq, term()} | {:ins, term()} | {:del, term()}
@@ -27,11 +52,12 @@
"""
@spec diff(list(), list()) :: [edit()]
def diff(a, b) when is_list(a) and is_list(b) do
n = length(a)
m = length(b)
a_vec = List.to_tuple(a)
b_vec = List.to_tuple(b)
n = tuple_size(a_vec)
m = tuple_size(b_vec)
diff_range(a_vec, b_vec, 0, n, 0, m)
ses(a_vec, b_vec, n, m)
end
@doc """
@@ -39,138 +65,529 @@
"""
@spec diff_lines(String.t(), String.t()) :: [edit()]
def diff_lines(text_a, text_b) do
lines_a = String.split(text_a, "\n", trim: false)
lines_b = String.split(text_b, "\n", trim: false)
diff(String.split(text_a, "\n", trim: false), String.split(text_b, "\n", trim: false))
diff(lines_a, lines_b)
end
# ── Recursive driver ──────────────────────────────────────────────────
#
defp ses(_a, _b, 0, 0), do: []
# Compute edits for a[a_lo..a_hi) vs b[b_lo..b_hi). Strips common
# prefix/suffix first (huge speedup on typical code diffs), then either
# bottoms out or bisects and recurses on the two halves.
defp diff_range(a, b, a_lo, a_hi, b_lo, b_hi) do
{prefix_eqs, a_lo, b_lo} = strip_prefix(a, b, a_lo, a_hi, b_lo, b_hi, [])
{suffix_eqs, a_hi, b_hi} = strip_suffix(a, b, a_lo, a_hi, b_lo, b_hi, [])
defp ses(a, b, n, m) do
max = n + m
v0 = %{1 => 0}
# Forward pass: build trace of V snapshots
# trace stores V state AFTER processing each d value
# We also need v0 (the initial state before d=0)
case find_d(a, b, n, m, max, 0, v0, [v0]) do
{:found, trace} ->
backtrack(trace, a, b, n, m)
middle =
cond do
a_lo == a_hi and b_lo == b_hi ->
[]
:not_found ->
[]
end
end
a_lo == a_hi ->
for i <- b_lo..(b_hi - 1)//1, do: {:ins, elem(b, i)}
defp find_d(_a, _b, _n, _m, max, d, _v, _trace) when d > max, do: :not_found
b_lo == b_hi ->
for i <- a_lo..(a_hi - 1)//1, do: {:del, elem(a, i)}
true ->
{x, y} = bisect(a, b, a_lo, a_hi, b_lo, b_hi)
defp find_d(a, b, n, m, max, d, v, trace) do
v_new = process_diag(a, b, n, m, d, -d, v)
left = diff_range(a, b, a_lo, x, b_lo, y)
right = diff_range(a, b, x, a_hi, y, b_hi)
left ++ right
end
prefix_eqs ++ middle ++ suffix_eqs
end_x = Map.get(v_new, n - m, -1)
end
if end_x >= n do
# trace already has [v0, ...] accumulated in reverse
# Add v_new and reverse to get [v0, v_after_d0, v_after_d1, ..., v_after_dD]
{:found, Enum.reverse([v_new | trace])}
defp strip_prefix(a, b, a_lo, a_hi, b_lo, b_hi, acc)
when a_lo < a_hi and b_lo < b_hi do
if elem(a, a_lo) == elem(b, b_lo) do
strip_prefix(a, b, a_lo + 1, a_hi, b_lo + 1, b_hi, [{:eq, elem(a, a_lo)} | acc])
else
find_d(a, b, n, m, max, d + 1, v_new, [v_new | trace])
{Enum.reverse(acc), a_lo, b_lo}
end
end
defp process_diag(_a, _b, _n, _m, d, k, v) when k > d, do: v
defp strip_prefix(_a, _b, a_lo, _a_hi, b_lo, _b_hi, acc), do: {Enum.reverse(acc), a_lo, b_lo}
defp strip_suffix(a, b, a_lo, a_hi, b_lo, b_hi, acc)
when a_lo < a_hi and b_lo < b_hi do
if elem(a, a_hi - 1) == elem(b, b_hi - 1) do
defp process_diag(a, b, n, m, d, k, v) do
x =
if k == -d or (k != d and Map.get(v, k - 1, 0) < Map.get(v, k + 1, 0)) do
Map.get(v, k + 1, 0)
else
Map.get(v, k - 1, 0) + 1
end
y = x - k
strip_suffix(a, b, a_lo, a_hi - 1, b_lo, b_hi - 1, [{:eq, elem(a, a_hi - 1)} | acc])
{x, _y} = snake(a, b, n, m, x, y)
v = Map.put(v, k, x)
process_diag(a, b, n, m, d, k + 2, v)
end
defp snake(a, b, n, m, x, y) when x < n and y < m do
if elem(a, x) == elem(b, y) do
snake(a, b, n, m, x + 1, y + 1)
else
{acc, a_hi, b_hi}
{x, y}
end
end
defp strip_suffix(_a, _b, _a_lo, a_hi, _b_lo, b_hi, acc), do: {acc, a_hi, b_hi}
defp snake(_a, _b, _n, _m, x, y), do: {x, y}
# Backtrack through V snapshots to reconstruct edit script.
# ── Bisect — find the middle-snake split point ────────────────────────
#
# trace = [v_initial, v_after_d0, v_after_d1, ..., v_after_dD]
# Translation of diff_match_patch's diff_bisect (Python). Returns
# `{x, y}` in ABSOLUTE coordinates (within the original a/b) such that
# the optimal edit script from `a[a_lo..a_hi)` to `b[b_lo..b_hi)` passes
# through (x, y) in the middle of the edit graph.
# trace has D+2 elements. trace[0] = initial V = %{1 => 0}.
# trace[d+1] = V after processing d.
#
# For backtracking at step d (1-indexed edit step):
# - The end point at this step is determined by trace[d+1] at diagonal k
# - The previous state is trace[d] (which is V after processing d-1)
#
# We start at (n, m) and work backwards d = D, D-1, ..., 1
defp backtrack(trace, a, b, n, m) do
trace_arr = List.to_tuple(trace)
# V tables are stored as Maps keyed by k_offset = v_offset + k. Sentinel
# `-1` means "not yet reached" (distinct from "reached at x=0"). Map is
# used for clarity; could be swapped for `:atomics` for speed once
# correctness is established.
defp bisect(a, b, a_lo, a_hi, b_lo, b_hi) do
n = a_hi - a_lo
m = b_hi - b_lo
max_d = div(n + m + 1, 2)
v_offset = max_d
# Init: V_f[1] = 0 means "forward path on diagonal 1 is at x=0 (start)".
# Same for V_r. All other entries default to -1 (not reached).
v1 = %{(v_offset + 1) => 0}
v2 = %{(v_offset + 1) => 0}
# D = number of edits = tuple_size - 2 (since we have D+2 elements)
d_max = tuple_size(trace_arr) - 2
do_backtrack(trace_arr, a, b, d_max, n, m, [])
delta = n - m
front? = rem(delta, 2) != 0
bisect_d(a, b, a_lo, b_lo, n, m, max_d, v_offset, delta, front?, v1, v2, 0, 0, 0, 0, 0)
end
# bisect_d/17: search depth d. For each d, walk the forward path one step
# then the reverse path one step. Check overlap on the appropriate side
# (forward when delta is odd, reverse when delta is even).
defp bisect_d(
_a,
_b,
a_lo,
b_lo,
n,
_m,
max_d,
_v_off,
_delta,
_front?,
_v1,
_v2,
d,
_k1s,
_k1e,
_k2s,
_k2e
)
when d >= max_d do
# No middle snake found within max_d iterations — happens when D >= 2 but
# max_d is too small (e.g. tiny inputs like n=m=1 with no match) or no
# commonality at all. Fall back to "split at the top-right corner": left
# half becomes (full a, empty b) → all deletes, right half becomes
# Base case: d=0, just emit the initial snake from (0,0) to (x, y)
defp do_backtrack(_trace, a, _b, 0, x, _y, edits) do
# (empty a, full b) → all inserts. Both halves are STRICTLY smaller, so
# the recursion terminates.
{a_lo + n, b_lo}
diag(a, 0, x) ++ edits
end
# One search round at depth `d`: run the forward sweep, then (if it did not
# find an overlap) the reverse sweep. A `{:found, x, y}` result from either
# sweep is shifted by `a_lo` / `b_lo` and returned; otherwise the round is
# repeated at `d + 1` with the V tables and k-range bounds the sweeps
# handed back.
defp bisect_d(
       a, b, a_lo, b_lo, n, m, max_d, v_off, delta, front?,
       v1, v2, d, k1s, k1e, k2s, k2e
     ) do
  fwd_first_k = k1s - d

  fwd_result =
    forward_sweep(
      a, b, a_lo, b_lo, n, m, v_off, delta, front?,
      v1, v2, d, fwd_first_k, k1s, k1e
    )

  case fwd_result do
    {:cont, fwd_v, fwd_lo, fwd_hi} ->
      rev_first_k = k2s - d

      # The reverse sweep sees the forward table as updated by this round.
      rev_result =
        reverse_sweep(
          a, b, a_lo, b_lo, n, m, v_off, delta, front?,
          fwd_v, v2, d, rev_first_k, k2s, k2e
        )

      case rev_result do
        {:cont, rev_v, rev_lo, rev_hi} ->
          bisect_d(
            a, b, a_lo, b_lo, n, m, max_d, v_off, delta, front?,
            fwd_v, rev_v, d + 1, fwd_lo, fwd_hi, rev_lo, rev_hi
          )

        {:found, x, y} ->
          {a_lo + x, b_lo + y}
      end

    {:found, x, y} ->
      {a_lo + x, b_lo + y}
  end
end
# Forward sweep: walk diagonals k = -d+k1s, -d+k1s+2, ..., d-k1e.
# Returns {:found, x, y} on overlap (when delta is odd), else
# {:cont, v1, k1s, k1e} with possibly-adjusted bounds.
defp forward_sweep(
       _a, _b, _a_lo, _b_lo, _n, _m, _v_off, _delta, _front?,
       v1, _v2, d, k1, k1s, k1e
     )
     when d - k1e < k1 do
  # k1 has moved past the last diagonal of this depth (d - k1e): nothing is
  # left to visit, so hand back the forward table and bounds unchanged.
  {:cont, v1, k1s, k1e}
end
defp forward_sweep(a, b, a_lo, b_lo, n, m, v_off, delta, front?, v1, v2, d, k1, k1s, k1e) do
k1_off = v_off + k1
# Pick predecessor — the down move (from k+1) is taken only when it reaches strictly further; ties go to the right move (from k-1).
x1 =
cond do
k1 == -d ->
Map.get(v1, k1_off + 1, -1) |> max(0)
k1 == d ->
defp do_backtrack(trace, a, b, d, x, y, edits) do
k = x - y
# V state from previous round (after processing d-1)
v_prev = elem(trace, d)
Map.get(v1, k1_off - 1, -1) + 1
# Determine which direction we took at step d
prev_k =
if k == -d or (k != d and Map.get(v_prev, k - 1, 0) < Map.get(v_prev, k + 1, 0)) do
k + 1
else
k - 1
Map.get(v1, k1_off - 1, -1) < Map.get(v1, k1_off + 1, -1) ->
Map.get(v1, k1_off + 1, -1)
true ->
Map.get(v1, k1_off - 1, -1) + 1
end
y1 = x1 - k1
{x1, y1} = snake_forward(a, b, a_lo, b_lo, n, m, x1, y1)
v1 = Map.put(v1, k1_off, x1)
cond do
x1 > n ->
# Ran off the right edge — stop extending this side, shrink k range.
forward_sweep(
a,
b,
a_lo,
b_lo,
n,
m,
v_off,
delta,
front?,
v1,
v2,
d,
k1 + 2,
k1s,
k1e + 2
)
y1 > m ->
# Ran off the bottom — shrink the OTHER end of the k range.
forward_sweep(
a,
b,
a_lo,
b_lo,
n,
m,
v_off,
delta,
front?,
v1,
v2,
d,
k1 + 2,
k1s + 2,
k1e
)
front? ->
# Overlap check on forward sweep (delta is odd).
k2_off = v_off + delta - k1
if k2_off >= 0 and k2_off < 2 * v_off + 1 do
v2_x = Map.get(v2, k2_off, -1)
if v2_x != -1 do
# Reverse coordinate → forward: x2_forward = n - v2_x
x2_fwd = n - v2_x
if x1 >= x2_fwd do
{:found, x1, y1}
else
forward_sweep(
a,
b,
a_lo,
b_lo,
n,
m,
v_off,
delta,
front?,
v1,
v2,
d,
k1 + 2,
k1s,
k1e
)
end
else
forward_sweep(
a,
b,
a_lo,
b_lo,
n,
m,
v_off,
delta,
front?,
v1,
v2,
d,
k1 + 2,
k1s,
k1e
)
end
else
forward_sweep(a, b, a_lo, b_lo, n, m, v_off, delta, front?, v1, v2, d, k1 + 2, k1s, k1e)
end
true ->
forward_sweep(a, b, a_lo, b_lo, n, m, v_off, delta, front?, v1, v2, d, k1 + 2, k1s, k1e)
end
end
# Reverse sweep: same shape, mirrored. Snake compares from the END going
# backward.
defp reverse_sweep(
       _a, _b, _a_lo, _b_lo, _n, _m, _v_off, _delta, _front?,
       _v1, v2, d, k2, k2s, k2e
     )
     when d - k2e < k2 do
  # k2 has moved past the last diagonal of this depth (d - k2e): nothing is
  # left to visit, so hand back the reverse table and bounds unchanged.
  {:cont, v2, k2s, k2e}
end
defp reverse_sweep(a, b, a_lo, b_lo, n, m, v_off, delta, front?, v1, v2, d, k2, k2s, k2e) do
k2_off = v_off + k2
x2 =
cond do
k2 == -d ->
Map.get(v2, k2_off + 1, -1) |> max(0)
k2 == d ->
Map.get(v2, k2_off - 1, -1) + 1
# End point of previous round on diagonal prev_k
prev_x = Map.get(v_prev, prev_k, 0)
prev_y = prev_x - prev_k
# Mid point after the edit (before snake)
{mid_x, edit} =
if prev_k > k do
# Insert (moved down: y+1, x stays)
{prev_x, {:ins, elem(b, prev_y)}}
else
# Delete (moved right: x+1, y stays)
{prev_x + 1, {:del, elem(a, prev_x)}}
Map.get(v2, k2_off - 1, -1) < Map.get(v2, k2_off + 1, -1) ->
Map.get(v2, k2_off + 1, -1)
true ->
Map.get(v2, k2_off - 1, -1) + 1
end
y2 = x2 - k2
{x2, y2} = snake_reverse(a, b, a_lo, b_lo, n, m, x2, y2)
v2 = Map.put(v2, k2_off, x2)
cond do
x2 > n ->
reverse_sweep(
a,
b,
a_lo,
b_lo,
n,
m,
v_off,
delta,
front?,
v1,
v2,
d,
k2 + 2,
k2s,
k2e + 2
)
# Snake from mid to (x, y) — these are all :eq
edits = diag(a, mid_x, x) ++ edits
y2 > m ->
reverse_sweep(
a,
b,
a_lo,
b_lo,
n,
m,
v_off,
delta,
front?,
v1,
v2,
d,
k2 + 2,
k2s + 2,
k2e
)
not front? ->
# Overlap check on reverse sweep (delta is even).
k1_off = v_off + delta - k2
if k1_off >= 0 and k1_off < 2 * v_off + 1 do
v1_x = Map.get(v1, k1_off, -1)
if v1_x != -1 do
x1_fwd = v1_x
x2_fwd = n - x2
if x1_fwd >= x2_fwd do
# Use forward coordinates of the meeting point.
y1_fwd = x1_fwd - (k1_off - v_off)
{:found, x1_fwd, y1_fwd}
else
reverse_sweep(
# Prepend the edit
edits = [edit | edits]
a,
b,
a_lo,
b_lo,
n,
m,
v_off,
delta,
front?,
v1,
v2,
d,
k2 + 2,
k2s,
k2e
)
end
else
reverse_sweep(
a,
b,
a_lo,
b_lo,
n,
m,
v_off,
delta,
front?,
v1,
v2,
d,
k2 + 2,
k2s,
k2e
)
end
else
reverse_sweep(a, b, a_lo, b_lo, n, m, v_off, delta, front?, v1, v2, d, k2 + 2, k2s, k2e)
end
do_backtrack(trace, a, b, d - 1, prev_x, prev_y, edits)
true ->
reverse_sweep(a, b, a_lo, b_lo, n, m, v_off, delta, front?, v1, v2, d, k2 + 2, k2s, k2e)
end
end
# ── Snakes ────────────────────────────────────────────────────────────
#
# snake_forward: extend (x, y) forward (in local coordinates within the
defp diag(a, x_start, x_end) do
diag_acc(a, x_start, x_end, [])
# bisect range) as long as a[a_lo + x] == b[b_lo + y].
defp snake_forward(a, b, a_lo, b_lo, n, m, x, y) do
  # Keep stepping diagonally while both local coordinates are inside the
  # range and the elements at the corresponding absolute positions are
  # equal. `and` short-circuits, so `elem/2` is never called once x or y
  # has reached its bound.
  if x < n and y < m and elem(a, a_lo + x) == elem(b, b_lo + y) do
    snake_forward(a, b, a_lo, b_lo, n, m, x + 1, y + 1)
  else
    {x, y}
  end
end
defp diag_acc(_a, x, x, acc), do: Enum.reverse(acc)
# snake_reverse: extend (x, y) backward — local x, y are STEPS BACK from
# the (n, m) corner. Compare a[a_lo + n - 1 - x] vs b[b_lo + m - 1 - y].
defp snake_reverse(a, b, a_lo, b_lo, n, m, x, y) when x < n and y < m do
ai = a_lo + n - 1 - x
bi = b_lo + m - 1 - y
if elem(a, ai) == elem(b, bi) do
snake_reverse(a, b, a_lo, b_lo, n, m, x + 1, y + 1)
else
{x, y}
end
defp diag_acc(a, x_start, x_end, acc) when x_start < x_end do
diag_acc(a, x_start + 1, x_end, [{:eq, elem(a, x_start)} | acc])
end
defp snake_reverse(_a, _b, _a_lo, _b_lo, _n, _m, x, y), do: {x, y}
end