From 1e892c75046f8231bdd55f210614b8ed63197928 Mon Sep 17 00:00:00 2001 From: Claude Date: Fri, 19 Jun 2026 12:42:44 +0000 Subject: [PATCH] fix: strip believed_free models from the pricing block on every update run MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some providers (Cloudflare being the clearest case) have a non-zero per-token price in the LiteLLM cost map because they publish a paid-tier rate, even though their free plan is quota-limited rather than zero-priced. _merge_pricing() was writing those LiteLLM-sourced prices into the pricing block on every run, causing compute_cost() to attribute a cost to every free-tier request — which eventually lands the model in cost_observed_free_tier and silently breaks free routing. Fix: after merging the LiteLLM baseline with per-source overrides, strip any key that appears in any provider's believed_free list before writing to sidecar["pricing"]. The filter runs on every update so newly believed-free models are cleaned up automatically and models removed from believed_free re-appear in pricing on the next run. Also remove the three Cloudflare entries already in providers.json that were in both the pricing block and believed_free (llama-2-7b-chat-fp16, llama-2-7b-chat-int8, mistral-7b-instruct-v0.1). The fourth entry (@hf/thebloke/codellama-7b-instruct-awq) is not believed_free and is left intact. Co-Authored-By: Claude Sonnet 4.6 Claude-Session: https://claude.ai/code/session_01XLHnQxLYrMzpm5Ar83ihNC --- llmproxy/providers.json | 12 ------------ scripts/update_free_models.py | 12 ++++++++++++ 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/llmproxy/providers.json b/llmproxy/providers.json index 743982f..27739fd 100644 --- a/llmproxy/providers.json +++ b/llmproxy/providers.json @@ -1668,18 +1668,6 @@ "input_cost_per_token": 2.25e-06, "output_cost_per_token": 2.75e-06 }, - "cloudflare-workers/@cf/meta/llama-2-7b-chat-fp16": { - "input_cost_per_token": 1.923e-06, - "output_cost_per_token": 1.923e-06 - }, - "cloudflare-workers/@cf/meta/llama-2-7b-chat-int8": { - "input_cost_per_token": 1.923e-06, - "output_cost_per_token": 1.923e-06 - }, - "cloudflare-workers/@cf/mistral/mistral-7b-instruct-v0.1": { - "input_cost_per_token": 1.923e-06, - "output_cost_per_token": 1.923e-06 - }, "cloudflare-workers/@hf/thebloke/codellama-7b-instruct-awq": { "input_cost_per_token": 1.923e-06, "output_cost_per_token": 1.923e-06 diff --git a/scripts/update_free_models.py b/scripts/update_free_models.py index ab79ef1..35a231e 100644 --- a/scripts/update_free_models.py +++ b/scripts/update_free_models.py @@ -399,6 +399,12 @@ def _merge_pricing(sidecar: dict, updates: dict, litellm_ran: bool) -> bool: could not be fetched this run, the existing block is kept as the baseline so a transient outage never wipes pricing. Returns True if the block changed. Network/parse failures are non-fatal — stale pricing beats aborting the run. + + Believed-free models are always stripped from the final block. Some providers + (e.g. Cloudflare) have a paid-tier per-token price in the LiteLLM map even + though their free plan is quota-limited rather than zero-priced. Keeping those + entries causes compute_cost() to attribute a cost to every free-tier request, + which eventually lands the model in cost_observed_free_tier and breaks routing. """ existing = sidecar.get("pricing") if not isinstance(existing, dict): @@ -412,6 +418,12 @@ def _merge_pricing(sidecar: dict, updates: dict, litellm_ran: bool) -> bool: baseline = existing overrides = _collect_source_pricing(updates) merged = {**baseline, **overrides} + # Strip any model that is currently believed_free — their cost accounting + # belongs in the free-tier path, not the paid pricing block. + free_models: set[str] = set() + for prov_data in sidecar.get("providers", {}).values(): + free_models.update(m.lower() for m in prov_data.get("believed_free", [])) + merged = {k: v for k, v in merged.items() if k not in free_models} final = dict(sorted(merged.items())) if final == existing: return False