From 1e892c75046f8231bdd55f210614b8ed63197928 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Fri, 19 Jun 2026 12:42:44 +0000
Subject: [PATCH] fix: strip believed_free models from the pricing block on
 every update run
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Some providers (Cloudflare being the clearest case) have a non-zero
per-token price in the LiteLLM cost map because they publish a paid-tier
rate: their free plan is enforced as a usage quota, not published as a
zero per-token price. _merge_pricing() was writing those LiteLLM-sourced
prices into the pricing block on every run, causing compute_cost() to
attribute a cost to every free-tier request — which eventually lands the
model in cost_observed_free_tier and silently breaks free routing.

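Fix: after merging the LiteLLM baseline with per-source overrides,
strip any key that matches a (lower-cased) name in any provider's
believed_free list before writing to sidecar["pricing"]. The filter runs
on every update, so newly believed-free models are cleaned up
automatically. Models removed from believed_free re-appear in pricing on
the next run that successfully fetches the LiteLLM map or supplies a
per-source override for them; a run that falls back to the existing
pricing block cannot restore them, because they were already stripped
from it.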

Also remove the three Cloudflare entries already in providers.json that
were in both the pricing block and believed_free (llama-2-7b-chat-fp16,
llama-2-7b-chat-int8, mistral-7b-instruct-v0.1). The fourth entry
(@hf/thebloke/codellama-7b-instruct-awq) is not believed_free and is
left intact.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_01XLHnQxLYrMzpm5Ar83ihNC
---
 llmproxy/providers.json       | 12 ------------
 scripts/update_free_models.py | 12 ++++++++++++
 2 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/llmproxy/providers.json b/llmproxy/providers.json
index 743982f..27739fd 100644
--- a/llmproxy/providers.json
+++ b/llmproxy/providers.json
@@ -1668,18 +1668,6 @@
       "input_cost_per_token": 2.25e-06,
       "output_cost_per_token": 2.75e-06
     },
-    "cloudflare-workers/@cf/meta/llama-2-7b-chat-fp16": {
-      "input_cost_per_token": 1.923e-06,
-      "output_cost_per_token": 1.923e-06
-    },
-    "cloudflare-workers/@cf/meta/llama-2-7b-chat-int8": {
-      "input_cost_per_token": 1.923e-06,
-      "output_cost_per_token": 1.923e-06
-    },
-    "cloudflare-workers/@cf/mistral/mistral-7b-instruct-v0.1": {
-      "input_cost_per_token": 1.923e-06,
-      "output_cost_per_token": 1.923e-06
-    },
     "cloudflare-workers/@hf/thebloke/codellama-7b-instruct-awq": {
       "input_cost_per_token": 1.923e-06,
       "output_cost_per_token": 1.923e-06
diff --git a/scripts/update_free_models.py b/scripts/update_free_models.py
index ab79ef1..35a231e 100644
--- a/scripts/update_free_models.py
+++ b/scripts/update_free_models.py
@@ -399,6 +399,12 @@ def _merge_pricing(sidecar: dict, updates: dict, litellm_ran: bool) -> bool:
     could not be fetched this run, the existing block is kept as the baseline so a
     transient outage never wipes pricing. Returns True if the block changed.
     Network/parse failures are non-fatal — stale pricing beats aborting the run.
+
+    Believed-free models are always stripped from the final block. Some providers
+    (e.g. Cloudflare) have a paid-tier per-token price in the LiteLLM map even
+    though their free plan is quota-limited rather than zero-priced. Keeping those
+    entries causes compute_cost() to attribute a cost to every free-tier request,
+    which eventually lands the model in cost_observed_free_tier and breaks routing.
     """
     existing = sidecar.get("pricing")
     if not isinstance(existing, dict):
@@ -412,6 +418,12 @@ def _merge_pricing(sidecar: dict, updates: dict, litellm_ran: bool) -> bool:
             baseline = existing
     overrides = _collect_source_pricing(updates)
     merged = {**baseline, **overrides}
+    # Strip any model that is currently believed_free — their cost accounting
+    # belongs in the free-tier path, not the paid pricing block.
+    free_models: set[str] = set()
+    for prov_data in sidecar.get("providers", {}).values():
+        free_models.update(m.lower() for m in prov_data.get("believed_free", []))
+    merged = {k: v for k, v in merged.items() if k not in free_models}
     final = dict(sorted(merged.items()))
     if final == existing:
         return False