ref:3b5c89918f52b52d8e72eadfb0f5c964bd7083f6

Add diff engine: Myers algorithm + tree/blob/commit diff with hunk formatting

Phase 5 of ex_git_objectstore. Implements: - Myers diff algorithm for computing shortest edit scripts - Tree-level diff (recursive tree entry comparison) - Blob-level diff with configurable context lines and hunk grouping - Commit-level diff (tree diff + blob diffs per file) - Unified diff format output Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
SHA: 3b5c89918f52b52d8e72eadfb0f5c964bd7083f6
Author: Cole Christensen <cole.christensen@macmillan.com>
Date: 2026-02-10 07:51
Parents: c48d20e
4 files changed +817 -0
Type
lib/ex_git_objectstore/diff.ex +348 −0
@@ -1,0 +1,348 @@
defmodule ExGitObjectstore.Diff do
@moduledoc """
Tree-level diff and line-level diff with hunk formatting.
"""
alias ExGitObjectstore.{ObjectResolver, Repo}
alias ExGitObjectstore.Object.{Blob, Tree}
alias ExGitObjectstore.Diff.Myers
# One path-level change between two trees, as built by diff_tree_entries/4.
# For :added entries the old_* fields are nil; for :deleted entries the new_*
# fields are nil.
# NOTE(review): :renamed is declared here, but nothing in this module produces it.
@type file_change :: %{
path: String.t(),
status: :added | :deleted | :modified | :renamed,
old_sha: String.t() | nil,
new_sha: String.t() | nil,
old_mode: String.t() | nil,
new_mode: String.t() | nil
}
# One contiguous region of a line diff. The *_start fields are line numbers
# (index_edits/1 counts from 1), the *_count fields are how many lines of the
# hunk belong to each side, and `lines` holds the tagged lines in order.
@type hunk :: %{
old_start: non_neg_integer(),
old_count: non_neg_integer(),
new_start: non_neg_integer(),
new_count: non_neg_integer(),
lines: [{:context | :add | :del, String.t()}]
}
# Line-level diff of a single path, as returned by diff_commits/4.
@type file_diff :: %{
path: String.t(),
status: :added | :deleted | :modified,
hunks: [hunk()]
}
@doc """
Computes the tree-level diff between two tree SHAs.

Walks both trees recursively and returns `{:ok, changes}`, where `changes` is a
flat list of `t:file_change/0` maps ordered by entry name within each directory.

A `nil` SHA stands for an empty tree. A tree that cannot be read is also treated
as empty, so this function currently always returns an `{:ok, _}` tuple; the
`{:error, _}` branch of the spec is never produced here.
"""
@spec diff_trees(Repo.t(), String.t() | nil, String.t() | nil) ::
        {:ok, [file_change()]} | {:error, term()}
def diff_trees(repo, old_tree_sha, new_tree_sha) do
  old_entries = load_tree_entries(repo, old_tree_sha)
  new_entries = load_tree_entries(repo, new_tree_sha)

  # diff_tree_entries/4 builds its result with Enum.flat_map/2, so the list is
  # already flat and needs no further flattening.
  {:ok, diff_tree_entries(repo, old_entries, new_entries, "")}
end
@doc """
Computes the line-level diff between two blobs.

Returns `{:ok, hunks}`, each hunk being a `t:hunk/0` map.

## Options

  * `:context` - number of context lines requested around changes (default `3`).

A `nil` SHA, or a SHA that does not resolve to a blob, is diffed as empty content.
"""
@spec diff_blobs(Repo.t(), String.t() | nil, String.t() | nil, keyword()) ::
        {:ok, [hunk()]} | {:error, term()}
def diff_blobs(repo, old_sha, new_sha, opts \\ []) do
  context_lines = Keyword.get(opts, :context, 3)

  hunks =
    repo
    |> load_blob_content(old_sha)
    |> Myers.diff_lines(load_blob_content(repo, new_sha))
    |> edits_to_hunks(context_lines)

  {:ok, hunks}
end
@doc """
Computes the full diff between two commits: the tree-level changes plus a
line-level diff for every changed path.

Both commits are read through `ObjectResolver.read/2`; if either read (or the
tree diff) does not return an `{:ok, _}` tuple, that result is returned as is.
`opts` is forwarded unchanged to `diff_blobs/4`.
"""
@spec diff_commits(Repo.t(), String.t(), String.t(), keyword()) ::
        {:ok, [file_diff()]} | {:error, term()}
def diff_commits(repo, old_commit_sha, new_commit_sha, opts \\ []) do
  with {:ok, old_commit} <- ObjectResolver.read(repo, old_commit_sha),
       {:ok, new_commit} <- ObjectResolver.read(repo, new_commit_sha),
       {:ok, changes} <- diff_trees(repo, old_commit.tree, new_commit.tree) do
    {:ok, Enum.map(changes, &file_diff_for(repo, &1, opts))}
  end
end

# Builds the file_diff entry for one tree-level change. A failed blob diff
# yields an entry with no hunks rather than aborting the whole commit diff.
defp file_diff_for(repo, change, opts) do
  hunks =
    case diff_blobs(repo, change.old_sha, change.new_sha, opts) do
      {:ok, computed} -> computed
      {:error, _reason} -> []
    end

  %{path: change.path, status: change.status, hunks: hunks}
end
@doc """
Renders hunks as unified diff text for `path`.

The output starts with the `--- a/<path>` / `+++ b/<path>` header, followed by
one `@@ -old_start,old_count +new_start,new_count @@` line per hunk and the
hunk's lines prefixed with a space (context), `+` (added) or `-` (deleted).
"""
@spec format_unified(String.t(), [hunk()]) :: String.t()
def format_unified(path, hunks) do
  file_header = "--- a/#{path}\n+++ b/#{path}\n"

  rendered_hunks =
    for hunk <- hunks do
      [hunk_range_header(hunk) | Enum.map(hunk.lines, &unified_line/1)]
    end

  IO.iodata_to_binary([file_header | rendered_hunks])
end

# The "@@ ... @@" range line that opens a hunk.
defp hunk_range_header(hunk) do
  "@@ -#{hunk.old_start},#{hunk.old_count} +#{hunk.new_start},#{hunk.new_count} @@\n"
end

# A single diff line with its one-character origin marker.
defp unified_line({:context, text}), do: " #{text}\n"
defp unified_line({:add, text}), do: "+#{text}\n"
defp unified_line({:del, text}), do: "-#{text}\n"
# -- Private --
# Loads a tree and indexes its entries by name.
# A nil SHA, a failed read, or an object that is not a %Tree{} all yield an
# empty map, so callers see "no entries" instead of an error.
defp load_tree_entries(_repo, nil), do: %{}

defp load_tree_entries(repo, tree_sha) do
  with {:ok, %Tree{entries: entries}} <- ObjectResolver.read(repo, tree_sha) do
    Map.new(entries, &{&1.name, &1})
  else
    _unreadable -> %{}
  end
end
# Returns the content of the blob at `sha`.
# A nil SHA, a failed read, or an object that is not a %Blob{} all yield "".
defp load_blob_content(_repo, nil), do: ""

defp load_blob_content(repo, sha) do
  repo
  |> ObjectResolver.read(sha)
  |> blob_text()
end

# Extracts blob content from a read result, defaulting to the empty string.
defp blob_text({:ok, %Blob{content: content}}), do: content
defp blob_text(_other), do: ""
# Mode string this module uses to recognise sub-tree (directory) entries.
@tree_mode "40000"

# Compares two name => entry maps for one directory level and returns a flat
# list of file_change maps, recursing into sub-trees. `prefix` is the path of
# the directory being compared ("" at the root). Names are visited in sorted
# order so the output is deterministic.
defp diff_tree_entries(repo, old_entries, new_entries, prefix) do
  old_entries
  |> Map.merge(new_entries)
  |> Map.keys()
  |> Enum.sort()
  |> Enum.flat_map(fn name ->
    path = if prefix == "", do: name, else: "#{prefix}/#{name}"
    diff_entry(repo, Map.get(old_entries, name), Map.get(new_entries, name), path)
  end)
end

# Diffs a single name. `nil` on either side means the name is absent there.

# Added directory: every file below it is an addition.
defp diff_entry(repo, nil, %{mode: @tree_mode, sha: sha}, path),
  do: diff_tree_entries(repo, %{}, load_tree_entries(repo, sha), path)

# Added file.
defp diff_entry(_repo, nil, new, path), do: [change_record(path, :added, nil, new)]

# Deleted directory: every file below it is a deletion.
defp diff_entry(repo, %{mode: @tree_mode, sha: sha}, nil, path),
  do: diff_tree_entries(repo, load_tree_entries(repo, sha), %{}, path)

# Deleted file.
defp diff_entry(_repo, old, nil, path), do: [change_record(path, :deleted, old, nil)]

# Same object and same mode on both sides: unchanged.
defp diff_entry(_repo, %{sha: sha, mode: mode}, %{sha: sha, mode: mode}, _path), do: []

# Directory on both sides: recurse.
defp diff_entry(repo, %{mode: @tree_mode} = old, %{mode: @tree_mode} = new, path) do
  diff_tree_entries(
    repo,
    load_tree_entries(repo, old.sha),
    load_tree_entries(repo, new.sha),
    path
  )
end

# The path switched between directory and file. Reporting it as :modified
# would put a tree SHA where a blob SHA is expected, so it is expressed as a
# deletion of the old side followed by additions of the new side.
defp diff_entry(repo, %{mode: @tree_mode} = old, new, path),
  do: diff_entry(repo, old, nil, path) ++ diff_entry(repo, nil, new, path)

defp diff_entry(repo, old, %{mode: @tree_mode} = new, path),
  do: diff_entry(repo, old, nil, path) ++ diff_entry(repo, nil, new, path)

# File on both sides with a different SHA and/or mode.
defp diff_entry(_repo, old, new, path), do: [change_record(path, :modified, old, new)]

# Builds a file_change map; the fields of a missing side are nil.
defp change_record(path, status, old, new) do
  %{
    path: path,
    status: status,
    old_sha: old && old.sha,
    new_sha: new && new.sha,
    old_mode: old && old.mode,
    new_mode: new && new.mode
  }
end
# Turns a Myers edit script into hunks: number every line, group the changed
# regions together with their context, then build one hunk map per group.
defp edits_to_hunks(edits, context) do
  edits
  |> index_edits()
  |> group_changes(context)
  |> Enum.map(&build_hunk/1)
end
# Annotates each edit with the old/new line counters in effect when it is
# reached (both start at 1) and renames the tags: :eq -> :context, :ins -> :add.
# Equal lines advance both counters, deletions only the old one, insertions
# only the new one.
defp index_edits(edits) do
  {numbered, _next_positions} = Enum.map_reduce(edits, {1, 1}, &number_edit/2)
  numbered
end

# Returns {numbered_line, {next_old, next_new}} for a single edit.
defp number_edit({:eq, text}, {old_no, new_no}),
  do: {{:context, text, old_no, new_no}, {old_no + 1, new_no + 1}}

defp number_edit({:del, text}, {old_no, new_no}),
  do: {{:del, text, old_no, new_no}, {old_no + 1, new_no}}

defp number_edit({:ins, text}, {old_no, new_no}),
  do: {{:add, text, old_no, new_no}, {old_no, new_no + 1}}
# Splits the numbered lines into hunk-sized groups.
#
# Every non-context line is a change. Changes separated by at most
# 2 * context unchanged lines are merged into one run, because their context
# windows would touch or overlap. Each run is then widened by up to `context`
# lines on both sides, clamped to the bounds of the file. Returns a list of
# groups, each a list of numbered lines in file order; [] when nothing changed.
defp group_changes(indexed, context) do
  # A negative context makes no sense; treat it as "no context".
  context = max(context, 0)

  # Tuple gives O(1) positional access when cutting the groups out.
  lines = List.to_tuple(indexed)
  last_index = tuple_size(lines) - 1

  indexed
  |> Enum.with_index()
  |> Enum.reject(fn {line, _index} -> match?({:context, _, _, _}, line) end)
  |> Enum.map(fn {_line, index} -> index end)
  |> merge_change_runs(2 * context)
  |> Enum.map(fn {first, last} ->
    for index <- max(0, first - context)..min(last_index, last + context) do
      elem(lines, index)
    end
  end)
end

# Folds ascending change positions into {first, last} runs, starting a new run
# whenever more than `max_gap` unchanged lines separate two changes.
defp merge_change_runs(change_indices, max_gap) do
  change_indices
  |> Enum.reduce([], fn
    index, [{first, last} | rest] when index - last - 1 <= max_gap ->
      [{first, index} | rest]

    index, runs ->
      [{index, index} | runs]
  end)
  |> Enum.reverse()
end
# Builds a hunk map from a group of numbered lines.
#
# Trailing empty context lines are dropped first (the artifact left by
# splitting content that ends in a newline). The counts are the number of
# lines that exist on each side; the start positions come from the counters
# carried by the first line of the hunk.
defp build_hunk(lines) do
  lines = drop_trailing_empty_context(lines)

  {old_count, new_count, formatted} =
    Enum.reduce(lines, {0, 0, []}, fn
      {:context, text, _old, _new}, {old_c, new_c, acc} ->
        {old_c + 1, new_c + 1, [{:context, text} | acc]}

      {:del, text, _old, _new}, {old_c, new_c, acc} ->
        {old_c + 1, new_c, [{:del, text} | acc]}

      {:add, text, _old, _new}, {old_c, new_c, acc} ->
        {old_c, new_c + 1, [{:add, text} | acc]}
    end)

  # Every numbered line carries both counters, and a counter only advances on
  # lines that exist on its side. The first line's counters are therefore the
  # numbers of the first old/new line at or after the start of the hunk.
  {old_pos, new_pos} =
    case lines do
      [{_kind, _text, old, new} | _rest] -> {old, new}
      [] -> {1, 1}
    end

  %{
    old_start: hunk_start(old_pos, old_count),
    old_count: old_count,
    new_start: hunk_start(new_pos, new_count),
    new_count: new_count,
    lines: Enum.reverse(formatted)
  }
end

# Unified-format convention: a side with no lines in the hunk is reported at
# the line preceding the hunk (e.g. "-0,0" for a newly added file).
defp hunk_start(position, 0), do: position - 1
defp hunk_start(position, _count), do: position
# Removes every empty-string context line from the end of `lines`. Empty
# context lines elsewhere, and trailing lines of any other kind, are kept.
defp drop_trailing_empty_context(lines) do
  trailing_blank_count =
    lines
    |> Enum.reverse()
    |> Enum.take_while(&match?({:context, "", _, _}, &1))
    |> length()

  # A negative count makes Enum.drop/2 remove from the end of the list.
  Enum.drop(lines, -trailing_blank_count)
end
end