diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 5c63dec..d3cbab7 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -16,8 +16,8 @@ jobs:
 
     strategy:
       matrix:
-        elixir: ['1.18.4']
-        otp: ['27.3']
+        elixir: ['1.19.5']
+        otp: ['28.4']
 
     steps:
       - name: Checkout
diff --git a/.tool-versions b/.tool-versions
new file mode 100644
index 0000000..af669b2
--- /dev/null
+++ b/.tool-versions
@@ -0,0 +1,2 @@
+erlang 28.4
+elixir 1.19.5
diff --git a/lib/deep_eval_ex/metrics/tool_correctness.ex b/lib/deep_eval_ex/metrics/tool_correctness.ex
new file mode 100644
index 0000000..ed9fe3e
--- /dev/null
+++ b/lib/deep_eval_ex/metrics/tool_correctness.ex
@@ -0,0 +1,485 @@
+# Copyright 2025 Steven Holdsworth (@holsee)
+# SPDX-License-Identifier: Apache-2.0
+#
+# Ported from deepeval/metrics/tool_correctness/tool_correctness.py
+# Original: https://github.com/confident-ai/deepeval
+
+defmodule DeepEvalEx.Metrics.ToolCorrectness do
+  @moduledoc """
+  Metric for evaluating tool calling correctness and tool selection quality.
+
+  Compares the tools called by an LLM agent against a set of expected tools
+  using deterministic comparison. Optionally evaluates tool selection quality
+  via an LLM call when `available_tools` are provided.
+
+  Follows the same logic as the Python `deepeval` library's ToolCorrectnessMetric.
+
+  ## Usage
+
+      metric = DeepEvalEx.Metrics.ToolCorrectness
+
+      test_case = %DeepEvalEx.TestCase{
+        input: "Look up Kai Nakamura",
+        actual_output: "",
+        tools_called: [%ToolCall{name: "oa_find_applicants"}],
+        expected_tools: [%ToolCall{name: "oa_find_applicants"}]
+      }
+
+      {:ok, result} = metric.measure(test_case)
+      # => %DeepEvalEx.Result{score: 1.0, success: true, ...}
+
+  ## Options
+
+  - `:threshold` - Score threshold for pass/fail (default: 0.5)
+  - `:should_exact_match` - All-or-nothing positional matching; length mismatch → 0.0 (default: false)
+  - `:should_consider_ordering` - Use weighted LCS to enforce tool call order (default: false)
+  - `:evaluation_params` - Which additional ToolCall fields to compare beyond name:
+    `[:input_parameters]`, `[:output]`, or `[:input_parameters, :output]` (default: `[]`)
+  - `:available_tools` - List of available ToolCall structs. When provided, an LLM
+    evaluates whether the right tools were selected. Final score = min(calling, selection).
+  - `:strict_mode` - When true, threshold is forced to 1.0 and any score below is zeroed (default: false)
+  - `:include_reason` - Whether to include reason in results (default: true)
+  - `:adapter` - LLM adapter for tool selection scoring
+  - `:model` - Model name for tool selection scoring
+
+  ## Scoring Modes
+
+  - **Default mode:** For each expected tool, find the best matching called tool (greedy).
+    Score = total matches / |expected|. Extra tools do not penalise.
+  - **Exact match mode:** Lists must be the same length. Each position is compared.
+    Any mismatch → score 0.0. All match → score 1.0.
+  - **Ordering mode:** Uses weighted Longest Common Subsequence (LCS) DP algorithm.
+    Score = weighted LCS score / |expected|. Preserves relative order.
+  - When `:input_parameters` is in `evaluation_params`, parameter similarity is computed
+    via recursive dictionary comparison returning a fractional score (0.0–1.0).
+  """
+
+  use DeepEvalEx.Metrics.BaseMetric, default_threshold: 0.5
+
+  alias DeepEvalEx.LLM.Adapter
+  alias DeepEvalEx.Prompts.ToolCorrectness, as: Template
+  alias DeepEvalEx.Schemas.MetricOutputs.ToolCorrectness, as: Schema
+
+  @impl true
+  def metric_name, do: "Tool Correctness"
+
+  @impl true
+  def required_params, do: [:tools_called, :expected_tools]
+
+  @doc """
+  Checks that `tools_called` and `expected_tools` are present (non-nil).
+
+  Empty lists are accepted, so this overrides the default validation; per the
+  port notes, Python DeepEval scores such cases instead of rejecting them.
+  Returns `:ok` or `{:error, {:missing_params, params}}`.
+  """
+  def validate_test_case(test_case) do
+    # A field counts as missing only when it is nil; `[]` is a valid value.
+    absent? = fn field -> is_nil(Map.get(test_case, field)) end
+
+    case Enum.filter(required_params(), absent?) do
+      [] ->
+        :ok
+
+      absent ->
+        {:error, {:missing_params, absent}}
+    end
+  end
+
+  def do_measure(test_case, opts) do
+    cfg = parse_config(opts)
+    tools_called = test_case.tools_called
+    tools_expected = test_case.expected_tools
+
+    # Deterministic comparison of called vs expected tools, zeroed in strict
+    # mode when it falls short of the threshold.
+    raw_calling_score =
+      calculate_score(
+        tools_called,
+        tools_expected,
+        cfg.exact_match?,
+        cfg.consider_ordering?,
+        cfg.eval_params
+      )
+
+    calling_score = apply_strict_mode(raw_calling_score, cfg)
+
+    # LLM-judged selection quality; 1.0 when no available tools are configured.
+    {selection_score, selection_reason} =
+      evaluate_tool_selection(test_case.input, tools_called, cfg.available_tools, opts)
+
+    # The weaker of the two scores decides the final result.
+    final_score = apply_strict_mode(min(calling_score, selection_score), cfg)
+
+    reason =
+      build_combined_reason(tools_called, tools_expected, cfg, selection_reason)
+
+    result =
+      Result.new(
+        metric: metric_name(),
+        score: final_score,
+        threshold: cfg.threshold,
+        reason: reason,
+        success: final_score >= cfg.threshold,
+        metadata: %{
+          tool_calling_score: calling_score,
+          tool_selection_score: selection_score,
+          tool_selection_reason: selection_reason,
+          should_exact_match: cfg.exact_match?,
+          should_consider_ordering: cfg.consider_ordering?,
+          evaluation_params: cfg.eval_params,
+          strict_mode: cfg.strict_mode?
+        }
+      )
+
+    {:ok, result}
+  end
+
+  defp parse_config(opts) do
+    strict? = Keyword.get(opts, :strict_mode, false)
+    # Strict mode overrides any caller-supplied threshold with 1.0.
+    threshold = if strict?, do: 1.0, else: Keyword.get(opts, :threshold, default_threshold())
+    %{
+      strict_mode?: strict?,
+      threshold: threshold,
+      exact_match?: Keyword.get(opts, :should_exact_match, false),
+      consider_ordering?: Keyword.get(opts, :should_consider_ordering, false),
+      eval_params: Keyword.get(opts, :evaluation_params, []),
+      available_tools: Keyword.get(opts, :available_tools),
+      include_reason: Keyword.get(opts, :include_reason, true)
+    }
+  end
+
+  # Strict mode zeroes any score below the threshold; otherwise pass through.
+  defp apply_strict_mode(score, config) do
+    if match?(%{strict_mode?: true, threshold: t} when score < t, config), do: 0.0, else: score
+  end
+
+  # Without any available tools there is nothing for the LLM to judge, so the
+  # selection score defaults to a perfect 1.0 with an explanatory reason.
+  defp evaluate_tool_selection(_input, _called, available_tools, _opts)
+       when available_tools in [nil, []] do
+    {1.0, "No available tools were provided to assess tool selection criteria"}
+  end
+
+  # Otherwise delegate to the LLM-backed scorer.
+  defp evaluate_tool_selection(input, called, available_tools, opts) do
+    get_tool_selection_score(input, called, available_tools, opts)
+  end
+
+  # Reasons are skipped entirely when the caller sets `include_reason: false`.
+  defp build_combined_reason(_called, _expected, %{include_reason: false}, _selection_reason) do
+    nil
+  end
+
+  # Otherwise pair the deterministic tool-calling reason with the selection reason.
+  defp build_combined_reason(called, expected, config, selection_reason) do
+    exact? = config.exact_match?
+    ordered? = config.consider_ordering?
+    params = config.eval_params
+
+    called
+    |> generate_reason(expected, exact?, ordered?, params)
+    |> construct_final_reason(selection_reason)
+  end
+
+  # --- Score calculation dispatch (matches Python _calculate_score) ---
+
+  defp calculate_score(called, expected, exact_match?, consider_ordering?, eval_params) do
+    # Exact matching wins when both flags are set; ordering is consulted next.
+    mode = (exact_match? && :exact) || (consider_ordering? && :ordered) || :default
+
+    case mode do
+      :exact -> calculate_exact_match_score(called, expected, eval_params)
+      :default -> calculate_non_exact_match_score(called, expected, eval_params)
+
+      :ordered ->
+        {_tools, weight} = compute_weighted_lcs(called, expected, eval_params)
+        calculate_ordering_score(called, expected, weight)
+    end
+  end
+
+  # Nothing expected: perfect only if nothing was called either.
+  defp calculate_ordering_score([], [], _weighted_length), do: 1.0
+  defp calculate_ordering_score(_called, [], _weighted_length), do: 0.0
+
+  # Otherwise normalise the weighted LCS length by the number of expected tools.
+  defp calculate_ordering_score(_called, expected, weighted_length),
+    do: weighted_length / length(expected)
+
+  # --- Exact match score (matches Python _calculate_exact_match_score) ---
+
+  # Two empty lists match trivially.
+  defp calculate_exact_match_score([], [], _eval_params), do: 1.0
+
+  # Differing lengths can never be a positional match.
+  defp calculate_exact_match_score(called, expected, _eval_params)
+       when length(called) != length(expected),
+       do: 0.0
+
+  defp calculate_exact_match_score(called, expected, eval_params) do
+    # A pair agrees when names match and every opted-in field is equal.
+    agrees? = fn {c, e} ->
+      c.name == e.name and
+        (:input_parameters not in eval_params or c.input_parameters == e.input_parameters) and
+        (:output not in eval_params or c.output == e.output)
+    end
+
+    all_agree? = called |> Enum.zip(expected) |> Enum.all?(agrees?)
+    if all_agree?, do: 1.0, else: 0.0
+  end
+
+  # --- Non-exact match score (matches Python _calculate_non_exact_match_score) ---
+
+  defp calculate_non_exact_match_score(called, expected, eval_params) do
+    # Greedily pair each expected tool with its best unclaimed called tool;
+    # a called tool can satisfy at most one expected tool.
+    step = fn exp_tool, {sum, claimed} = acc ->
+      case find_best_match(called, exp_tool, eval_params, claimed) do
+        {best, idx} when best > 0 -> {sum + best, MapSet.put(claimed, idx)}
+        _no_match -> acc
+      end
+    end
+
+    {matched_total, _claimed} = Enum.reduce(expected, {0.0, MapSet.new()}, step)
+
+    cond do
+      not Enum.empty?(expected) -> matched_total / length(expected)
+      Enum.empty?(called) -> 1.0
+      true -> 0.0
+    end
+  end
+
+  defp find_best_match(called, expected_tool, eval_params, matched) do
+    # Candidates: unclaimed called tools sharing the expected tool's name.
+    for {tool, idx} <- Enum.with_index(called),
+        not MapSet.member?(matched, idx) and tool.name == expected_tool.name,
+        reduce: {0.0, nil} do
+      {top, top_idx} ->
+        score = compute_match_score(tool, expected_tool, eval_params)
+        if score > top, do: {score, idx}, else: {top, top_idx}
+    end
+  end
+
+  # --- Weighted LCS (matches Python _compute_weighted_lcs) ---
+  # Returns {lcs_tools, weighted_length} matching Python's return signature
+
+  defp compute_weighted_lcs(called, expected, eval_params) do
+    rows = length(expected)
+    cols = length(called)
+
+    # Erlang arrays are used for index-based lookups while filling the table.
+    expected_arr = :array.from_list(expected)
+    called_arr = :array.from_list(called)
+
+    # Sparse DP table keyed by {row, col}; absent cells (the borders) read as 0.0.
+    # The max/2 bounds keep the ranges ascending when a list is empty, and the
+    # filters then discard the placeholder index.
+    cell = fn table, key -> Map.get(table, key, 0.0) end
+
+    table =
+      for r <- 1..max(rows, 1), c <- 1..max(cols, 1), r <= rows, c <= cols, reduce: %{} do
+        acc ->
+          exp_tool = :array.get(r - 1, expected_arr)
+          called_tool = :array.get(c - 1, called_arr)
+          skip = max(cell.(acc, {r - 1, c}), cell.(acc, {r, c - 1}))
+
+          best =
+            if exp_tool.name == called_tool.name do
+              gain = compute_match_score(called_tool, exp_tool, eval_params)
+              # A zero-score pairing may not extend the subsequence.
+              take = if gain > 0, do: cell.(acc, {r - 1, c - 1}) + gain, else: 0.0
+              max(take, skip)
+            else
+              skip
+            end
+
+          Map.put(acc, {r, c}, best)
+      end
+
+    # Walk back from the bottom-right cell to recover the matched tools and
+    # their summed weight.
+    backtrack_lcs(table, expected_arr, rows, cols)
+  end
+
+  # Entry point: start at cell {m, n} with nothing matched yet.
+  defp backtrack_lcs(dp, expected_vec, m, n), do: backtrack_lcs(dp, expected_vec, m, n, [], 0.0)
+
+  # Reaching either border ends the walk.
+  defp backtrack_lcs(_dp, _expected_vec, i, j, lcs, total_score) when i === 0 or j === 0 do
+    {lcs, total_score}
+  end
+
+  defp backtrack_lcs(dp, expected_vec, i, j, lcs, total_score) do
+    here = Map.get(dp, {i, j}, 0.0)
+
+    cond do
+      # Same value above or to the left: move there without recording a match.
+      here == Map.get(dp, {i - 1, j}, 0.0) ->
+        backtrack_lcs(dp, expected_vec, i - 1, j, lcs, total_score)
+
+      here == Map.get(dp, {i, j - 1}, 0.0) ->
+        backtrack_lcs(dp, expected_vec, i, j - 1, lcs, total_score)
+
+      # Otherwise record the expected tool; its gain is the diagonal difference.
+      true ->
+        gain = here - Map.get(dp, {i - 1, j - 1}, 0.0)
+        matched = :array.get(i - 1, expected_vec)
+        backtrack_lcs(dp, expected_vec, i - 1, j - 1, [matched | lcs], total_score + gain)
+    end
+  end
+
+  # --- Match score computation (shared by non-exact and LCS) ---
+
+  defp compute_match_score(called, expected, eval_params) do
+    # Parameter similarity is fractional (0.0..1.0) and only counted on request.
+    param_score =
+      case :input_parameters in eval_params do
+        true -> 1.0 * compare_dicts(expected.input_parameters, called.input_parameters, false)
+        false -> 1.0
+      end
+
+    # An output mismatch, when outputs are compared, vetoes the match outright.
+    output_mismatch? = :output in eval_params and expected.output != called.output
+
+    case output_mismatch? do
+      true -> 0.0
+      false -> param_score
+    end
+  end
+
+  # --- Dictionary comparison (matches Python _compare_dicts) ---
+
+  # Parameter similarity in 0.0..1.0: equal terms score 1.0, two maps sum per-key credit
+  # over the union of their keys, anything else scores 0.0. Third argument is unused.
+  defp compare_dicts(dict1, dict2, _exact_match?) when dict1 == dict2, do: 1.0
+
+  defp compare_dicts(dict1, dict2, _exact_match?) when is_map(dict1) and is_map(dict2) do
+    keys1 = MapSet.new(Map.keys(dict1))
+    keys2 = MapSet.new(Map.keys(dict2))
+    # Non-empty here: two empty maps are equal and hit the clause above.
+    total = MapSet.size(MapSet.union(keys1, keys2))
+
+    keys1
+    |> MapSet.intersection(keys2)
+    |> Enum.reduce(0.0, fn key, acc ->
+      acc + compare_key_values(Map.fetch!(dict1, key), Map.fetch!(dict2, key), total)
+    end)
+  end
+
+  defp compare_dicts(_dict1, _dict2, _exact_match?), do: 0.0
+
+  # Credit for one shared key, as a share of `total`. Uses `==` like compare_dicts/3
+  # so numerically equal values (10 vs 10.0) match; nested maps earn partial credit.
+  defp compare_key_values(v1, v2, total) when v1 == v2, do: 1 / total
+
+  defp compare_key_values(v1, v2, total) when is_map(v1) and is_map(v2),
+    do: compare_dicts(v1, v2, false) / total
+
+  defp compare_key_values(_v1, _v2, _total), do: 0.0
+
+  # --- Tool Selection Score (LLM-based, matches Python _get_tool_selection_score) ---
+
+  defp get_tool_selection_score(user_input, tools_called, available_tools, opts) do
+    prompt =
+      Template.get_tool_selection_score(
+        user_input: user_input,
+        tools_called: Template.format_tools(tools_called),
+        available_tools: Template.format_tools(available_tools)
+      )
+
+    schema = Schema.tool_selection_score_schema()
+
+    # LLM or parse failures fall back to a perfect 1.0 plus an explanatory reason.
+    case Adapter.generate_with_schema(prompt, schema, opts) do
+      {:error, _} ->
+        {1.0, "Failed to generate tool selection score"}
+
+      {:ok, response} ->
+        case Schema.parse_tool_selection_score(response) do
+          {:error, _} -> {1.0, "Failed to parse tool selection score"}
+          {:ok, %{score: score, reason: reason}} -> {score, reason}
+        end
+    end
+  end
+
+  # --- Reason generation (matches Python _generate_reason) ---
+
+  defp generate_reason(called, expected, exact_match?, consider_ordering?, eval_params) do
+    # Same precedence as scoring: exact match first, then ordering, then the
+    # default order-insensitive explanation.
+    builder =
+      cond do
+        exact_match? -> &generate_exact_match_reason/5
+        consider_ordering? -> &generate_ordering_reason/5
+        true -> &generate_default_reason/5
+      end
+
+    called_names = Enum.map(called, fn tool -> tool.name end)
+    expected_names = Enum.map(expected, fn tool -> tool.name end)
+
+    builder.(called, expected, eval_params, called_names, expected_names)
+  end
+
+  defp generate_exact_match_reason(called, expected, eval_params, called_names, expected_names) do
+    exact? = calculate_exact_match_score(called, expected, eval_params) == 1.0
+    # The label is the only part that varies; both name lists are always echoed.
+    label = (exact? && "Exact match") || "Not an exact match"
+    "#{label}: expected #{inspect(expected_names)}, called #{inspect(called_names)}. See details above."
+  end
+
+  defp generate_ordering_reason(called, expected, eval_params, called_names, expected_names) do
+    {matched_tools, weight} = compute_weighted_lcs(called, expected, eval_params)
+    case calculate_ordering_score(called, expected, weight) do
+      1.0 ->
+        "Correct ordering: all expected tools #{inspect(expected_names)} were called in the correct order."
+
+      _below_perfect ->
+        format_ordering_issues(matched_tools, called_names, expected_names)
+    end
+  end
+
+  defp format_ordering_issues(lcs, called_names, expected_names) do
+    wanted = MapSet.new(expected_names)
+    # Missing: expected names that were never called.
+    # Out of order: expected names that are absent from the LCS.
+    never_called = MapSet.difference(wanted, MapSet.new(called_names))
+    not_in_lcs = MapSet.difference(wanted, MapSet.new(lcs, fn tool -> tool.name end))
+
+    issues =
+      [{never_called, "missing tools"}, {not_in_lcs, "out-of-order tools"}]
+      |> Enum.reduce([], fn {set, label}, acc -> maybe_add_issue(acc, set, label) end)
+
+    "Incorrect tool usage: #{Enum.join(issues, " and ")}; expected #{inspect(expected_names)}, called #{inspect(called_names)}. See more details above."
+  end
+
+  # Appends a labelled issue for `set` unless the set is empty.
+  defp maybe_add_issue(issues, set, label) do
+    case MapSet.size(set) do
+      0 -> issues
+      _ -> issues ++ ["#{label} #{inspect(MapSet.to_list(set))}"]
+    end
+  end
+
+  defp generate_default_reason(called, expected, eval_params, called_names, expected_names) do
+    case calculate_non_exact_match_score(called, expected, eval_params) do
+      1.0 ->
+        "All expected tools #{inspect(expected_names)} were called (order not considered)."
+
+      _partial ->
+        # The set difference compares whole ToolCall structs, not just names.
+        missing_list =
+          MapSet.new(expected)
+          |> MapSet.difference(MapSet.new(called))
+          |> Enum.map(fn tool -> tool.name end)
+
+        "Incomplete tool usage: missing tools #{inspect(missing_list)}; expected #{inspect(expected_names)}, called #{inspect(called_names)}. See more details above."
+    end
+  end
+
+  # --- Final reason construction (matches Python _construct_final_reason) ---
+
+  defp construct_final_reason(tool_calling_reason, tool_selection_reason) do
+    "[\n\t Tool Calling Reason: #{tool_calling_reason}\n\t Tool Selection Reason: #{tool_selection_reason}\n]\n"
+  end
+end
diff --git a/lib/deep_eval_ex/prompts/tool_correctness.ex b/lib/deep_eval_ex/prompts/tool_correctness.ex
new file mode 100644
index 0000000..366ffaa
--- /dev/null
+++ b/lib/deep_eval_ex/prompts/tool_correctness.ex
@@ -0,0 +1,132 @@
+# Copyright 2025 Steven Holdsworth (@holsee)
+# SPDX-License-Identifier: Apache-2.0
+#
+# Ported from deepeval/metrics/tool_correctness/template.py
+# Original: https://github.com/confident-ai/deepeval
+
+defmodule DeepEvalEx.Prompts.ToolCorrectness do
+  @moduledoc """
+  Prompt templates for the ToolCorrectness metric.
+
+  Contains the tool selection evaluation prompt used when
+  `available_tools` are provided.
+  """
+
+  @doc """
+  Generates a prompt for the LLM to evaluate tool selection quality.
+
+  ## Options
+
+  - `:user_input` - The user's input/task (required)
+  - `:tools_called` - Formatted string of tools called (required)
+  - `:available_tools` - Formatted string of available tools (required)
+  """
+  def get_tool_selection_score(opts) do
+    user_input = Keyword.fetch!(opts, :user_input)
+    tools_called = Keyword.fetch!(opts, :tools_called)
+    available_tools = Keyword.fetch!(opts, :available_tools)
+
+    """
+    You are an expert evaluator assessing the **Tool Selection** quality of an AI agent.
+
+    You are given:
+    - The **user input** that defines the user's goal / task.
+    - A list of **available tools**, each with a name and description.
+    - A list of **tool calls made** by the agent during execution, including tool name and parameters.
+
+    Your job is to assign a **Tool Selection score** from 0.0 to 1.0 based on how appropriate and well-matched the agent's chosen tools were to the task's requirements.
+
+    ---
+
+    DEFINITION:
+
+    Tool Selection evaluates how suitable the agent's tool choices were in addressing the task and sub-tasks.
+
+    This metric does **not** consider:
+    - How well the tools were used (execution quality)
+    - Whether the agent adhered to a plan
+    - Whether the output was correct or efficient
+
+    It only assesses whether the **right tools** were selected, based on their stated descriptions and the demands of the task.
+
+    ---
+
+    INSTRUCTIONS:
+
+    Step 1: Read the **user task** to understand what needed to be accomplished.
+
+    Step 2: Examine the **available tools** and their descriptions to understand the intended purpose of each.
+
+    Step 3: Review the **tool calls made by the agent**:
+    - Were the selected tools well-aligned with the task?
+    - Were any obviously better-suited tools ignored?
+    - Were any tools misapplied or used unnecessarily?
+
+    Step 4: Identify selection issues:
+    - **Correct Selection**: Tool(s) chosen directly and appropriately matched the subtask.
+    - **Over-selection**: More tools were selected than necessary, despite availability of a simpler or more direct option.
+    - **Under-selection**: Key tools that were well-suited were omitted.
+    - **Mis-selection**: Tools were chosen that were poorly matched to their purpose or the subtask.
+
+    ---
+
+    SCORING GUIDE:
+
+    - **1.0** → All selected tools were appropriate and necessary. No better-suited tools were omitted.
+    - **0.75** → Tool choices were mostly appropriate, with minor omissions or unnecessary use.
+    - **0.5** → Mixed tool selection. Some useful tools ignored or some inappropriate ones used.
+    - **0.25** → Poor tool selection. Better alternatives were available and ignored.
+    - **0.0** → Tool selection was clearly misaligned with task requirements.
+
+    ---
+
+    OUTPUT FORMAT:
+
+    Return a valid JSON object with this exact structure:
+    {
+        "score": float between 0.0 and 1.0,
+        "reason": "1-3 concise, factual sentences explaining the score. Reference specific tool names and descriptions when relevant."
+    }
+
+    Do not include any additional commentary or output outside the JSON object.
+
+    ---
+
+    USER INPUT:
+    #{user_input}
+
+    ALL AVAILABLE TOOLS:
+    #{available_tools}
+
+    TOOL CALLS MADE BY AGENT:
+    #{tools_called}
+
+    JSON:
+    """
+  end
+
+  @doc """
+  Renders a list of ToolCall structs as a JSON-like array string for prompt inclusion.
+  Each tool is pretty-printed, indented by two spaces, and separated by `,\\n`.
+  """
+  def format_tools(tools) when is_list(tools) do
+    formatted = Enum.map_join(tools, ",\n", fn tool -> tool |> encode_tool() |> indent(2) end)
+
+    "[\n#{formatted}\n]"
+  end
+
+  # Pretty-printed JSON for a single tool; fields whose value is nil are left out.
+  defp encode_tool(tool) do
+    tool
+    |> Map.from_struct()
+    |> Map.reject(fn {_field, value} -> is_nil(value) end)
+    |> Jason.encode!(pretty: true)
+  end
+
+  # Prefixes every line of `string` with `spaces` space characters.
+  defp indent(string, spaces) do
+    pad = String.duplicate(" ", spaces)
+
+    Enum.map_join(String.split(string, "\n"), "\n", &(pad <> &1))
+  end
+end
diff --git a/lib/deep_eval_ex/schemas/metric_outputs/tool_correctness.ex b/lib/deep_eval_ex/schemas/metric_outputs/tool_correctness.ex
new file mode 100644
index 0000000..835e48f
--- /dev/null
+++ b/lib/deep_eval_ex/schemas/metric_outputs/tool_correctness.ex
@@ -0,0 +1,44 @@
+# Copyright 2025 Steven Holdsworth (@holsee)
+# SPDX-License-Identifier: Apache-2.0
+#
+# Ported from deepeval/metrics/tool_correctness/schema.py
+# Original: https://github.com/confident-ai/deepeval
+
+defmodule DeepEvalEx.Schemas.MetricOutputs.ToolCorrectness do
+  @moduledoc """
+  JSON schemas for ToolCorrectness metric LLM responses.
+  """
+
+  @doc """
+  JSON schema for tool selection score evaluation.
+  """
+  def tool_selection_score_schema do
+    %{
+      "type" => "object",
+      "properties" => %{
+        "score" => %{
+          "type" => "number",
+          "description" => "Tool selection score from 0.0 to 1.0"
+        },
+        "reason" => %{
+          "type" => "string",
+          "description" => "1-3 concise sentences explaining the score"
+        }
+      },
+      "required" => ["score", "reason"],
+      "additionalProperties" => false
+    }
+  end
+
+  @doc """
+  Parses a tool selection response returned by the LLM.
+  Returns `{:ok, %{score: float, reason: binary}}`, or an error tuple for any other shape.
+  """
+  def parse_tool_selection_score(%{"score" => score, "reason" => reason})
+      when is_number(score) and is_binary(reason) do
+    # The JSON schema declares no bounds, so clamp out-of-range model output to 0.0..1.0.
+    {:ok, %{score: min(max(score / 1, 0.0), 1.0), reason: reason}}
+  end
+
+  def parse_tool_selection_score(other), do: {:error, {:invalid_tool_selection_response, other}}
+end
diff --git a/test/deep_eval_ex/metrics/tool_correctness_test.exs b/test/deep_eval_ex/metrics/tool_correctness_test.exs
new file mode 100644
index 0000000..bb23189
--- /dev/null
+++ b/test/deep_eval_ex/metrics/tool_correctness_test.exs
@@ -0,0 +1,569 @@
+defmodule DeepEvalEx.Metrics.ToolCorrectnessTest do
+  use ExUnit.Case, async: true
+
+  alias DeepEvalEx.LLM.Adapters.Mock
+  alias DeepEvalEx.Metrics.ToolCorrectness
+  alias DeepEvalEx.Schemas.ToolCall
+  alias DeepEvalEx.TestCase
+
+  defp tool(name, params \\ nil, output \\ nil) do
+    %ToolCall{name: name, input_parameters: params, output: output}
+  end
+
+  defp case_with(called, expected) do
+    %TestCase{
+      input: "test query",
+      actual_output: "",
+      tools_called: Enum.map(called, &tool/1),
+      expected_tools: Enum.map(expected, &tool/1)
+    }
+  end
+
+  describe "non-exact match (default mode)" do
+    test "returns score 1.0 for perfect match" do
+      tc = case_with(["tool_a", "tool_b"], ["tool_a", "tool_b"])
+
+      assert {:ok, result} = ToolCorrectness.measure(tc)
+      assert result.score == 1.0
+      assert result.success == true
+      assert result.metric == "Tool Correctness"
+    end
+
+    test "returns score 0.0 for no match" do
+      tc = case_with(["tool_c"], ["tool_a", "tool_b"])
+
+      assert {:ok, result} = ToolCorrectness.measure(tc)
+      assert result.score == 0.0
+      assert result.success == false
+    end
+
+    test "returns proportional score for partial match" do
+      tc = case_with(["tool_a", "tool_c"], ["tool_a", "tool_b"])
+
+      assert {:ok, result} = ToolCorrectness.measure(tc)
+      assert result.score == 0.5
+    end
+
+    test "extra tools do not reduce score" do
+      tc = case_with(["tool_a", "tool_b", "tool_extra"], ["tool_a", "tool_b"])
+
+      assert {:ok, result} = ToolCorrectness.measure(tc)
+      assert result.score == 1.0
+    end
+
+    test "order does not matter in default mode" do
+      tc = case_with(["tool_b", "tool_a"], ["tool_a", "tool_b"])
+
+      assert {:ok, result} = ToolCorrectness.measure(tc)
+      assert result.score == 1.0
+    end
+
+    test "greedy matching with duplicates" do
+      # Two expected tool_a, only one called → 0.5
+      tc = case_with(["tool_a"], ["tool_a", "tool_a"])
+
+      assert {:ok, result} = ToolCorrectness.measure(tc)
+      assert result.score == 0.5
+    end
+
+    test "both empty lists returns 1.0" do
+      tc = %TestCase{
+        input: "test",
+        actual_output: "",
+        tools_called: [],
+        expected_tools: []
+      }
+
+      assert {:ok, result} = ToolCorrectness.measure(tc)
+      assert result.score == 1.0
+    end
+
+    test "empty expected with called tools returns 0.0" do
+      tc = %TestCase{
+        input: "test",
+        actual_output: "",
+        tools_called: [tool("tool_a")],
+        expected_tools: []
+      }
+
+      assert {:ok, result} = ToolCorrectness.measure(tc)
+      assert result.score == 0.0
+    end
+
+    test "empty called with expected tools returns 0.0" do
+      tc = %TestCase{
+        input: "test",
+        actual_output: "",
+        tools_called: [],
+        expected_tools: [tool("tool_a")]
+      }
+
+      assert {:ok, result} = ToolCorrectness.measure(tc)
+      assert result.score == 0.0
+    end
+  end
+
+  describe "exact match mode" do
+    test "score 1.0 when lists are identical (positional)" do
+      tc = case_with(["tool_a", "tool_b"], ["tool_a", "tool_b"])
+
+      assert {:ok, result} = ToolCorrectness.measure(tc, should_exact_match: true)
+      assert result.score == 1.0
+    end
+
+    test "score 0.0 when lengths differ (extra tools)" do
+      tc = case_with(["tool_a", "tool_b", "tool_c"], ["tool_a", "tool_b"])
+
+      assert {:ok, result} = ToolCorrectness.measure(tc, should_exact_match: true)
+      assert result.score == 0.0
+    end
+
+    test "score 0.0 when lengths differ (missing tools)" do
+      tc = case_with(["tool_a"], ["tool_a", "tool_b"])
+
+      assert {:ok, result} = ToolCorrectness.measure(tc, should_exact_match: true)
+      assert result.score == 0.0
+    end
+
+    test "score 0.0 when positions mismatch" do
+      tc = case_with(["tool_b", "tool_a"], ["tool_a", "tool_b"])
+
+      assert {:ok, result} = ToolCorrectness.measure(tc, should_exact_match: true)
+      assert result.score == 0.0
+    end
+
+    test "both empty lists returns 1.0" do
+      tc = %TestCase{
+        input: "test",
+        actual_output: "",
+        tools_called: [],
+        expected_tools: []
+      }
+
+      assert {:ok, result} = ToolCorrectness.measure(tc, should_exact_match: true)
+      assert result.score == 1.0
+    end
+
+    test "exact match with input_parameters" do
+      tc = %TestCase{
+        input: "test",
+        actual_output: "",
+        tools_called: [tool("search", %{"q" => "paris"})],
+        expected_tools: [tool("search", %{"q" => "paris"})]
+      }
+
+      assert {:ok, result} =
+               ToolCorrectness.measure(tc,
+                 should_exact_match: true,
+                 evaluation_params: [:input_parameters]
+               )
+
+      assert result.score == 1.0
+    end
+
+    test "exact match fails with different input_parameters" do
+      tc = %TestCase{
+        input: "test",
+        actual_output: "",
+        tools_called: [tool("search", %{"q" => "london"})],
+        expected_tools: [tool("search", %{"q" => "paris"})]
+      }
+
+      assert {:ok, result} =
+               ToolCorrectness.measure(tc,
+                 should_exact_match: true,
+                 evaluation_params: [:input_parameters]
+               )
+
+      assert result.score == 0.0
+    end
+  end
+
+  describe "ordering mode (weighted LCS)" do
+    test "score 1.0 when order matches" do
+      tc = case_with(["tool_a", "tool_b", "tool_c"], ["tool_a", "tool_b", "tool_c"])
+
+      assert {:ok, result} = ToolCorrectness.measure(tc, should_consider_ordering: true)
+      assert result.score == 1.0
+    end
+
+    test "score 1.0 when expected subsequence is present in order" do
+      tc = case_with(["tool_x", "tool_a", "tool_y", "tool_b"], ["tool_a", "tool_b"])
+
+      assert {:ok, result} = ToolCorrectness.measure(tc, should_consider_ordering: true)
+      assert result.score == 1.0
+    end
+
+    test "partial score when order is disrupted" do
+      # Expected: a, b, c. Called: c, a, b.
+      # LCS of expected in called: a, b (2 of 3)
+      tc = case_with(["tool_c", "tool_a", "tool_b"], ["tool_a", "tool_b", "tool_c"])
+
+      assert {:ok, result} = ToolCorrectness.measure(tc, should_consider_ordering: true)
+      assert_in_delta result.score, 2 / 3, 0.001
+    end
+
+    test "score 0.0 when no expected tools are present" do
+      tc = case_with(["tool_x", "tool_y"], ["tool_a", "tool_b"])
+
+      assert {:ok, result} = ToolCorrectness.measure(tc, should_consider_ordering: true)
+      assert result.score == 0.0
+    end
+
+    test "both empty lists with ordering returns 1.0" do
+      tc = %TestCase{
+        input: "test",
+        actual_output: "",
+        tools_called: [],
+        expected_tools: []
+      }
+
+      assert {:ok, result} = ToolCorrectness.measure(tc, should_consider_ordering: true)
+      assert result.score == 1.0
+    end
+
+    test "empty expected with ordering returns 0.0" do
+      tc = %TestCase{
+        input: "test",
+        actual_output: "",
+        tools_called: [tool("tool_a")],
+        expected_tools: []
+      }
+
+      assert {:ok, result} = ToolCorrectness.measure(tc, should_consider_ordering: true)
+      assert result.score == 0.0
+    end
+  end
+
+  describe "exact match + ordering combined" do
+    # Both flags are set in every test; in Python should_exact_match takes precedence.
+    test "fails for different order" do
+      # Same two tools as expected, but in swapped positions, so positional
+      # exact matching is expected to reject the sequence outright.
+      called = ["tool_b", "tool_a"]
+      expected = ["tool_a", "tool_b"]
+      flags = [should_exact_match: true, should_consider_ordering: true]
+      test_case = case_with(called, expected)
+
+      assert {:ok, scored} = ToolCorrectness.measure(test_case, flags)
+      assert scored.score == 0.0
+    end
+
+    test "passes for identical sequence" do
+      # Called and expected lists hold the same names in the same positions,
+      # which should yield a full score with both flags enabled.
+      called = ["tool_a", "tool_b"]
+      expected = ["tool_a", "tool_b"]
+      flags = [should_exact_match: true, should_consider_ordering: true]
+      test_case = case_with(called, expected)
+
+      assert {:ok, scored} = ToolCorrectness.measure(test_case, flags)
+      assert scored.score == 1.0
+    end
+
+    test "fails when lengths differ" do
+      # The called list carries one extra trailing tool (tool_c), so the two
+      # sequences differ in length even though the shared prefix lines up.
+      called = ["tool_a", "tool_b", "tool_c"]
+      expected = ["tool_a", "tool_b"]
+      flags = [should_exact_match: true, should_consider_ordering: true]
+      test_case = case_with(called, expected)
+
+      assert {:ok, scored} = ToolCorrectness.measure(test_case, flags)
+      assert scored.score == 0.0
+    end
+  end
+
+  describe "evaluation_params with :input_parameters" do
+    test "matches when both name and params match" do
+      # Called and expected sides carry the same tool name with the same
+      # input parameters.
+      opts = [evaluation_params: [:input_parameters]]
+      called = tool("search", %{"q" => "paris"})
+      expected = tool("search", %{"q" => "paris"})
+
+      base = %TestCase{input: "test", actual_output: ""}
+      test_case = %{base | tools_called: [called], expected_tools: [expected]}
+
+      assert {:ok, scored} = ToolCorrectness.measure(test_case, opts)
+      assert scored.score == 1.0
+    end
+
+    test "fractional score for partial parameter match" do
+      # Both calls agree on "q" but disagree on "limit" (10 called vs 5
+      # expected): one of the two parameters matches, so the similarity is
+      # expected to come out as 0.5.
+      opts = [evaluation_params: [:input_parameters]]
+      called = tool("search", %{"q" => "paris", "limit" => 10})
+      expected = tool("search", %{"q" => "paris", "limit" => 5})
+
+      base = %TestCase{input: "test", actual_output: ""}
+      test_case = %{base | tools_called: [called], expected_tools: [expected]}
+
+      assert {:ok, scored} = ToolCorrectness.measure(test_case, opts)
+      assert scored.score == 0.5
+    end
+
+    test "score 0.0 when all params differ" do
+      # The only parameter, "q", holds a different value on each side
+      # ("london" was called, "paris" was expected).
+      opts = [evaluation_params: [:input_parameters]]
+      called = tool("search", %{"q" => "london"})
+      expected = tool("search", %{"q" => "paris"})
+
+      base = %TestCase{input: "test", actual_output: ""}
+      test_case = %{base | tools_called: [called], expected_tools: [expected]}
+
+      assert {:ok, scored} = ToolCorrectness.measure(test_case, opts)
+      assert scored.score == 0.0
+    end
+
+    test "recursive map comparison" do
+      # The parameters hold a nested map under "filters". Both sides use
+      # identical nested content, so the comparison has to descend into the
+      # inner map to arrive at a full score.
+      opts = [evaluation_params: [:input_parameters]]
+
+      filters = %{"country" => "JP", "type" => "school"}
+      params = %{"filters" => filters}
+      called = tool("search", params)
+      expected = tool("search", params)
+
+      base = %TestCase{input: "test", actual_output: ""}
+      test_case = %{base | tools_called: [called], expected_tools: [expected]}
+
+      assert {:ok, scored} = ToolCorrectness.measure(test_case, opts)
+      assert scored.score == 1.0
+    end
+
+    test "missing keys reduce score proportionally" do
+      # The called tool carries an "extra" key that the expected tool lacks.
+      # Of the two keys in the union only "q" is present and equal on both
+      # sides, hence 1/2 = 0.5.
+      opts = [evaluation_params: [:input_parameters]]
+      called = tool("search", %{"q" => "paris", "extra" => "val"})
+      expected = tool("search", %{"q" => "paris"})
+
+      base = %TestCase{input: "test", actual_output: ""}
+      test_case = %{base | tools_called: [called], expected_tools: [expected]}
+
+      assert {:ok, scored} = ToolCorrectness.measure(test_case, opts)
+      assert scored.score == 0.5
+    end
+  end
+
+  describe "evaluation_params with :output" do
+    test "matches when output is identical" do
+      # Both tools pass nil as the second argument and the same third
+      # argument (presumably the tool output); only :output is evaluated.
+      opts = [evaluation_params: [:output]]
+      called = tool("search", nil, "result")
+      expected = tool("search", nil, "result")
+
+      base = %TestCase{input: "test", actual_output: ""}
+      test_case = %{base | tools_called: [called], expected_tools: [expected]}
+
+      assert {:ok, scored} = ToolCorrectness.measure(test_case, opts)
+      assert scored.score == 1.0
+    end
+
+    test "score 0.0 when output differs" do
+      # Same tool name on both sides, but the third argument differs
+      # ("wrong" was called, "right" was expected).
+      opts = [evaluation_params: [:output]]
+      called = tool("search", nil, "wrong")
+      expected = tool("search", nil, "right")
+
+      base = %TestCase{input: "test", actual_output: ""}
+      test_case = %{base | tools_called: [called], expected_tools: [expected]}
+
+      assert {:ok, scored} = ToolCorrectness.measure(test_case, opts)
+      assert scored.score == 0.0
+    end
+  end
+
+  describe "strict_mode" do
+    test "zeroes score below threshold" do
+      # Only tool_a of the two expected tools is called. Without strict mode
+      # that is a 0.5 score passing the 0.5 threshold; strict mode forces the
+      # threshold to 1.0 and zeroes any score below it.
+      test_case = case_with(["tool_a", "tool_c"], ["tool_a", "tool_b"])
+      assert {:ok, strict} = ToolCorrectness.measure(test_case, strict_mode: true)
+      assert strict.score == 0.0
+      assert strict.threshold == 1.0
+      assert strict.success == false
+    end
+
+    test "passes with perfect score in strict mode" do
+      # The called tools equal the expected tools exactly.
+      names = ["tool_a", "tool_b"]
+      assert {:ok, strict} = ToolCorrectness.measure(case_with(names, names), strict_mode: true)
+      assert strict.score == 1.0
+      assert strict.threshold == 1.0
+      assert strict.success == true
+    end
+
+    test "strict mode metadata flag is set" do
+      # The strict_mode option should be echoed back in the result metadata.
+      test_case = case_with(["tool_a"], ["tool_a"])
+      assert {:ok, strict} = ToolCorrectness.measure(test_case, strict_mode: true)
+      assert strict.metadata.strict_mode == true
+    end
+  end
+
+  describe "include_reason" do
+    test "includes reason by default" do
+      # No options are passed; the reason must be a string naming both sections.
+      test_case = case_with(["tool_a"], ["tool_a"])
+      assert {:ok, %{reason: reason}} = ToolCorrectness.measure(test_case)
+      assert is_binary(reason)
+      assert reason =~ "Tool Calling Reason"
+      assert reason =~ "Tool Selection Reason"
+    end
+
+    test "omits reason when include_reason is false" do
+      # With include_reason disabled, the reason field is expected to be nil.
+      test_case = case_with(["tool_a"], ["tool_a"])
+      assert {:ok, outcome} = ToolCorrectness.measure(test_case, include_reason: false)
+      assert is_nil(outcome.reason)
+    end
+  end
+
+  describe "tool selection score (LLM-based)" do
+    # Clear the mock before each test (presumably so canned replies do not leak).
+    setup do
+      Mock.clear_responses()
+      :ok
+    end
+
+    test "calls LLM when available_tools provided" do
+      # Canned reply registered on the mock under ~r/Tool Selection/
+      # (presumably matched against the tool-selection prompt).
+      llm_reply = %{"score" => 0.75, "reason" => "Good tool selection with minor issues."}
+      Mock.set_schema_response(~r/Tool Selection/, llm_reply)
+
+      opts = [
+        available_tools: [tool("search"), tool("lookup"), tool("delete")],
+        adapter: :mock
+      ]
+
+      test_case = %TestCase{
+        input: "Find user info",
+        actual_output: "",
+        tools_called: [tool("search")],
+        expected_tools: [tool("search")]
+      }
+
+      assert {:ok, scored} = ToolCorrectness.measure(test_case, opts)
+
+      # The deterministic calling score is 1.0 and the mocked selection score
+      # is 0.75; the final score is the smaller of the two: min(1.0, 0.75).
+      assert scored.score == 0.75
+      assert scored.metadata.tool_calling_score == 1.0
+      assert scored.metadata.tool_selection_score == 0.75
+    end
+
+    test "defaults to 1.0 when no available_tools" do
+      # No :available_tools option is given and no mock reply is registered.
+      test_case = case_with(["tool_a"], ["tool_a"])
+      assert {:ok, %{metadata: metadata}} = ToolCorrectness.measure(test_case)
+      assert metadata.tool_selection_score == 1.0
+      assert metadata.tool_selection_reason =~ "No available tools"
+    end
+
+    test "final score is min of calling and selection scores" do
+      # The mock answers with a 0.5 selection score for this test.
+      llm_reply = %{"score" => 0.5, "reason" => "Mixed selection."}
+      Mock.set_schema_response(~r/Tool Selection/, llm_reply)
+
+      test_case = case_with(["tool_a", "tool_b"], ["tool_a", "tool_b"])
+
+      opts = [
+        available_tools: [tool("tool_a"), tool("tool_b"), tool("tool_c")],
+        adapter: :mock
+      ]
+
+      # Calling score is 1.0 (called tools equal the expected ones) and the
+      # mocked selection score is 0.5, so the lower value is the final score.
+      assert {:ok, scored} = ToolCorrectness.measure(test_case, opts)
+      assert scored.score == 0.5
+    end
+  end
+
+  describe "validation" do
+    test "returns error when tools_called and expected_tools are both nil" do
+      # Force nil by bypassing Ecto defaults; both fields must then be reported.
+      base = %TestCase{input: "test", actual_output: ""}
+      test_case = %{base | tools_called: nil, expected_tools: nil}
+
+      assert {:error, {:missing_params, params}} = ToolCorrectness.measure(test_case)
+      assert :tools_called in params
+      assert :expected_tools in params
+    end
+
+    test "returns error when tools_called is nil" do
+      # Only tools_called is nil here, so it alone should be reported as missing.
+      base = %TestCase{input: "test", actual_output: "", expected_tools: [tool("tool_a")]}
+      test_case = %{base | tools_called: nil}
+
+      assert {:error, {:missing_params, [:tools_called]}} = ToolCorrectness.measure(test_case)
+    end
+
+    test "returns error when expected_tools is nil" do
+      base = %TestCase{input: "test", actual_output: "", tools_called: [tool("tool_a")]}
+      test_case = %{base | expected_tools: nil}
+
+      assert {:error, {:missing_params, [:expected_tools]}} = ToolCorrectness.measure(test_case)
+    end
+
+    test "empty lists are valid (not errors)" do
+      # Unlike nil, an empty list counts as a provided value.
+      base = %TestCase{input: "test", actual_output: ""}
+      test_case = %{base | tools_called: [], expected_tools: []}
+
+      assert {:ok, scored} = ToolCorrectness.measure(test_case)
+      assert scored.score == 1.0
+    end
+  end
+
+  describe "metadata" do
+    test "metric_name/0 returns correct name" do
+      assert "Tool Correctness" = ToolCorrectness.metric_name()
+    end
+
+    test "required_params/0 returns required parameters" do
+      assert [:tools_called, :expected_tools] = ToolCorrectness.required_params()
+    end
+
+    test "default_threshold/0 returns 0.5" do
+      assert 0.5 = ToolCorrectness.default_threshold()
+    end
+  end
+
+  describe "result" do
+    test "includes latency_ms" do
+      # latency_ms is expected to be a non-negative integer.
+      test_case = case_with(["tool_a"], ["tool_a"])
+      assert {:ok, %{latency_ms: latency}} = ToolCorrectness.measure(test_case)
+      assert is_integer(latency)
+      assert latency >= 0
+    end
+
+    test "includes tool calling and selection metadata" do
+      # Only the types of the metadata entries are checked, not their values.
+      test_case = case_with(["tool_a"], ["tool_a"])
+      assert {:ok, %{metadata: metadata}} = ToolCorrectness.measure(test_case)
+      assert is_float(metadata.tool_calling_score)
+      assert is_float(metadata.tool_selection_score)
+      assert is_binary(metadata.tool_selection_reason)
+    end
+
+    test "reason includes both calling and selection reasons" do
+      # Both section labels, including the trailing colon, must appear.
+      test_case = case_with(["tool_a"], ["tool_a"])
+      assert {:ok, %{reason: reason}} = ToolCorrectness.measure(test_case)
+      assert reason =~ "Tool Calling Reason:"
+      assert reason =~ "Tool Selection Reason:"
+    end
+  end
+end