Commit df0a437 - fangorn/ex_git_objectstore


      fangorn/ex_git_objectstore

public

ref:df0a4374cd49001f91e8dc0f8120d068143a5d15

fix: produce separate hunks for distant changes instead of showing entire file (#128)

The collect_change_groups function never split groups apart — context lines kept getting appended to the current group, so all edits ended up in one mega-hunk spanning the entire file. Replace it with a correct algorithm: (1) find the indices of all change (add/del) lines; (2) expand each index into a range covering `context` lines on each side, clamped to the bounds of the file; (3) merge ranges that overlap or are directly adjacent; (4) build one hunk per merged range. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

SHA: df0a4374cd49001f91e8dc0f8120d068143a5d15

Author: Cole Christensen <cole.christensen@macmillan.com>

Date: 2026-03-07 02:44

Parents: 802e450

2 files changed +173 -74

Type
	lib/ex_git_objectstore/diff.ex	+63 −74
@@ -76,9 +76,13 @@ old_content = load_blob_content(repo, old_sha) new_content = load_blob_content(repo, new_sha) edits = Myers.diff_lines(old_content, new_content) hunks = edits_to_hunks(edits, context) {:ok, hunks} if binary?(old_content) or binary?(new_content) do {:ok, :binary, %{old_size: byte_size(old_content), new_size: byte_size(new_content)}} else edits = Myers.diff_lines(old_content, new_content) hunks = edits_to_hunks(edits, context) {:ok, hunks} end end @doc """ @@ -92,8 +96,20 @@ {:ok, changes} <- diff_trees(repo, old_commit.tree, new_commit.tree) do file_diffs = Enum.map(changes, fn change -> case diff_blobs(repo, change.old_sha, change.new_sha, opts) do {:ok, :binary, sizes} -> %{ path: change.path, status: change.status, hunks: [], binary: true, old_size: sizes.old_size, new_size: sizes.new_size } {:ok, hunks} -> %{path: change.path, status: change.status, hunks: hunks} {:ok, hunks} = diff_blobs(repo, change.old_sha, change.new_sha, opts) %{path: change.path, status: change.status, hunks: hunks} end end) {:ok, file_diffs} @@ -177,6 +193,13 @@ end end defp binary?(content) when byte_size(content) == 0, do: false defp binary?(content) do chunk = binary_part(content, 0, min(byte_size(content), 8192)) :binary.match(chunk, <<0>>) != :nomatch end defp load_blob_content(_repo, nil), do: "" defp load_blob_content(repo, sha) do @@ -277,15 +300,45 @@ end defp edits_to_hunks(edits, context) do # Convert Myers edits to indexed lines indexed = index_edits(edits) indexed_vec = :array.from_list(indexed) total = :array.size(indexed_vec) # Find indices of all change (add/del) lines change_indices = indexed \|> Enum.with_index() # Group consecutive non-eq edits into change regions change_groups = group_changes(indexed, context) \|> Enum.filter(fn {{type, _, _, _}, _idx} -> type in [:add, :del] end) \|> Enum.map(fn {_, idx} -> idx end) if change_indices == [] do [] else # Expand each change index into a range with context, then merge overlapping ranges ranges = 
change_indices \|> Enum.map(fn idx -> {max(0, idx - context), min(total - 1, idx + context)} end) \|> merge_ranges() # Build a hunk from each merged range Enum.map(ranges, fn {start_idx, end_idx} -> lines = for i <- start_idx..end_idx, do: :array.get(i, indexed_vec) build_hunk(lines) end) end end defp merge_ranges([]), do: [] defp merge_ranges([first \| rest]) do Enum.reduce(rest, [first], fn {s, e}, [{prev_s, prev_e} \| acc] -> if s <= prev_e + 1 do [{prev_s, max(prev_e, e)} \| acc] else [{s, e}, {prev_s, prev_e} \| acc] Enum.map(change_groups, fn group -> build_hunk(group) end end) \|> Enum.reverse() end defp index_edits(edits) do @@ -302,70 +355,6 @@ end) Enum.reverse(result) end defp group_changes(indexed, context) do indexed_vec = :array.from_list(indexed) total = :array.size(indexed_vec) # Build index map for O(1) position lookups instead of O(n) find_index index_map = indexed \|> Enum.with_index() \|> Map.new(fn {item, idx} -> {item, idx} end) # Find change regions using prepend + reverse to avoid O(n^2) append all_groups = collect_change_groups(indexed) # Add context lines around each group using pre-computed indices all_groups \|> Enum.map(fn group -> add_context(group, indexed_vec, total, index_map, context) end) \|> Enum.reject(&Enum.empty?/1) end defp collect_change_groups(indexed) do {groups, current_group} = Enum.reduce(indexed, {[], []}, fn item, {groups, current} -> reduce_change_group(item, groups, current) end) if current_group != [] do [Enum.reverse(current_group) \| groups] else groups end \|> Enum.reverse() end defp reduce_change_group({:context, _, _, _} = item, groups, []) do {groups, [item]} end defp reduce_change_group({:context, _, _, _} = item, groups, current) do {groups, [item \| current]} end defp reduce_change_group(item, groups, current) do {groups, [item \| current]} end defp add_context([], _indexed_vec, _total, _index_map, _context), do: [] defp add_context(group, indexed_vec, total, index_map, context) do first = 
List.first(group) last = List.last(group) first_idx = Map.get(index_map, first) last_idx = Map.get(index_map, last) if first_idx == nil or last_idx == nil do group else start_idx = max(0, first_idx - context) end_idx = min(total - 1, last_idx + context) for i <- start_idx..end_idx, do: :array.get(i, indexed_vec) end end defp build_hunk(lines) do
	test/ex_git_objectstore/diff/diff_test.exs	+110 −0
@@ -145,6 +145,116 @@ {:ok, hunks} = Diff.diff_blobs(repo, old_sha, nil) assert hunks != [] end test "changes far apart produce separate hunks with context lines only" do repo = RepoHelper.memory_repo() # 20-line file with changes at line 2 and line 19 (far apart) old_lines = Enum.map(1..20, fn n -> "line#{n}" end) \|> Enum.join("\n") \|> Kernel.<>("\n") new_lines = Enum.map(1..20, fn n -> case n do 2 -> "CHANGED2" 19 -> "CHANGED19" _ -> "line#{n}" end end) \|> Enum.join("\n") \|> Kernel.<>("\n") old = Blob.from_content(old_lines) {:ok, old_sha} = Object.write(repo, old) new = Blob.from_content(new_lines) {:ok, new_sha} = Object.write(repo, new) # With default context=3, changes at line 2 and 19 should be separate hunks {:ok, hunks} = Diff.diff_blobs(repo, old_sha, new_sha, context: 3) assert length(hunks) == 2, "Expected 2 separate hunks, got #{length(hunks)}" [hunk1, hunk2] = hunks # First hunk: change at line 2, with context lines around it hunk1_types = Enum.map(hunk1.lines, &elem(&1, 0)) assert :del in hunk1_types assert :add in hunk1_types # Should NOT contain the entire file — max lines = change (2) + context (3+3) assert length(hunk1.lines) <= 8 # Second hunk: change at line 19, with context lines around it hunk2_types = Enum.map(hunk2.lines, &elem(&1, 0)) assert :del in hunk2_types assert :add in hunk2_types assert length(hunk2.lines) <= 8 end test "nearby changes merge into a single hunk" do repo = RepoHelper.memory_repo() # Changes at lines 3 and 7 — with context=3, their context windows overlap old_lines = Enum.map(1..10, fn n -> "line#{n}" end) \|> Enum.join("\n") \|> Kernel.<>("\n") new_lines = Enum.map(1..10, fn n -> case n do 3 -> "CHANGED3" 7 -> "CHANGED7" _ -> "line#{n}" end end) \|> Enum.join("\n") \|> Kernel.<>("\n") old = Blob.from_content(old_lines) {:ok, old_sha} = Object.write(repo, old) new = Blob.from_content(new_lines) {:ok, new_sha} = Object.write(repo, new) {:ok, hunks} = Diff.diff_blobs(repo, old_sha, new_sha, context: 3) # Lines 3 and 
7 are only 4 apart — context windows overlap, should merge assert length(hunks) == 1 end test "context option controls number of surrounding lines" do repo = RepoHelper.memory_repo() # Single change at line 10 in a 20-line file old_lines = Enum.map(1..20, fn n -> "line#{n}" end) \|> Enum.join("\n") \|> Kernel.<>("\n") new_lines = Enum.map(1..20, fn n -> if n == 10, do: "CHANGED", else: "line#{n}" end) \|> Enum.join("\n") \|> Kernel.<>("\n") old = Blob.from_content(old_lines) {:ok, old_sha} = Object.write(repo, old) new = Blob.from_content(new_lines) {:ok, new_sha} = Object.write(repo, new) # context=1: should show 1 line before + del + add + 1 line after = 4 lines {:ok, [hunk]} = Diff.diff_blobs(repo, old_sha, new_sha, context: 1) assert length(hunk.lines) == 4 # context=0: should show only del + add = 2 lines {:ok, [hunk]} = Diff.diff_blobs(repo, old_sha, new_sha, context: 0) assert length(hunk.lines) == 2 end end describe "format_unified" do