diff --git a/README.md b/README.md index cbffccc..b1773b2 100644 --- a/README.md +++ b/README.md @@ -184,7 +184,9 @@ When a request targets a (non-fusion) virtual model, llmproxy: reorderings may then run on top without ever dropping a candidate: the [request-fit triage](#request-fit-triage-every-free-and-local-virtual) for the `*/free` and `*/local` virtuals, and [capability ordering](#capability-aware-routing--failover) - when the request forces a capability. + when the request forces a capability. Finally, any models listed in + [`favorite_free_models`](#favorite_free_models) that are present in the pool + are promoted to the front in ranked order before cycling begins. 3. **Tries each candidate in order**, returning the first **usable** response. A candidate is considered to have **failed** — so llmproxy moves on to the next @@ -678,6 +680,10 @@ Config is stored at `~/.config/llmproxy/config.json` (or the path in "tokens_per_day": 500000 } }, + "favorite_free_models": [ + "google/gemini-2.5-flash", + "groq/llama-3.1-8b-instant" + ], "free_tier": { "sync_on_startup": true, @@ -864,6 +870,45 @@ process — so it is "as far as we can tell in the moment". Any field set to `nu is ignored; a provider with no `free_allowance` simply never gains free-in-the- moment status. + +### `favorite_free_models` — ranked priority list for free-tier routing + +`favorite_free_models` is an **optional** top-level array of model IDs listed in +preference order. When a `*/free` virtual endpoint (e.g. `llmproxy/free`, +`llmproxy/deep__free`) or the free tier of `llmproxy/loadbalanced` selects a +backend, models in this list are promoted to the front of the candidate pool +**in the order listed**, before the normal capacity/request-fit/capability +algorithm handles the rest. 
+ +```json +"favorite_free_models": [ + "google/gemini-2.5-flash", + "anthropic/claude-3-5-haiku-20251001", + "gpt-4o-mini" +] +``` + +Each entry is matched case-insensitively against the upstream model ID (bare, +e.g. `gpt-4o-mini`) or the fully-qualified proxy ID (e.g. +`openai/gpt-4o-mini`), ignoring any `:variant` suffix on the upstream ID (so `x/y` also matches `x/y:free`). A favorite is only promoted if it is **currently +believed-free** (present in `believed_free` and not flagged as cost-observed); +if it is absent from the virtual model's candidate pool it is silently skipped +and the remaining favorites and the normal algorithm continue unchanged. + +**Cost-observation persistence:** if a favorite is later removed from +`believed_free` because a cost was observed at runtime, it remains in +`favorite_free_models`. When a future sync restores it to the free pool (e.g. +the provider makes it free again), it is automatically re-promoted without any +manual config change. + +`favorite_free_models` has no effect on non-free virtual endpoints +(`llmproxy/deep`, `llmproxy/tools`, etc.) or on fusion virtuals. + +The admin UI's **Models & Categorizations** tab includes a **Favorite free +models** panel where you can add models from a grouped-by-provider picker, +reorder them with up/down buttons, and remove entries — changes are saved +immediately. 
+ ### Token + cost accounting — `GET /v1/usage` diff --git a/llmproxy/admin.py b/llmproxy/admin.py index fcdd5c1..f647d81 100644 --- a/llmproxy/admin.py +++ b/llmproxy/admin.py @@ -463,6 +463,7 @@ def api_get_config(): return jsonify({ "providers": providers, "believed_free": config.get("believed_free", []), + "favorite_free_models": config.get("favorite_free_models", []), "model_reasoning": config.get("model_reasoning", {}), "model_capabilities": config.get("model_capabilities", {}), "free_limits": config.get("free_limits", {}), @@ -794,6 +795,18 @@ def _put_section(key: str, validate): return jsonify({key: payload}) +@bp.route("/admin/api/favorite-free-models", methods=["GET", "PUT"]) +def api_favorite_free_models(): + if request.method == "GET": + return jsonify({"favorite_free_models": _load().get("favorite_free_models", [])}) + + def validate(p): + if not (isinstance(p, list) and all(isinstance(x, str) for x in p)): + return "favorite_free_models must be a list of strings." + return None + return _put_section("favorite_free_models", validate) + + @bp.route("/admin/api/believed-free", methods=["GET", "PUT"]) def api_believed_free(): if request.method == "GET": diff --git a/llmproxy/server.py b/llmproxy/server.py index aec3ea0..ba8b965 100755 --- a/llmproxy/server.py +++ b/llmproxy/server.py @@ -2943,6 +2943,37 @@ def _provider_exposes_to_virtual_models(provider_cfg: dict) -> bool: return provider_cfg.get("expose_to_virtual_models", True) is not False +def _apply_favorite_free_ordering( + candidates: list[tuple[str, dict, str]], + config: dict, +) -> list[tuple[str, dict, str]]: + """Promote favorite_free_models to the front in ranked order. + + Only candidates already present in the pool are promoted — favorites not in + the pool (e.g. cost-observed, not believed_free) are silently skipped. + Non-matching candidates retain their existing order after the favorites. + + Matching is case-insensitive and ignores :variant suffixes (e.g. 
:free, + :nitro) so that "x/y" matches both "x/y" and "x/y:free". + """ + favorites = config.get("favorite_free_models", []) + if not favorites: + return candidates + remaining = list(candidates) + front: list[tuple[str, dict, str]] = [] + for fav in favorites: + fav_lower = fav.lower() + for i, (pname, _pcfg, umodel) in enumerate(remaining): + umodel_lower = umodel.lower() + umodel_base = umodel_lower.split(":")[0] # strip :variant suffix + qualified = f"{pname}/{umodel}".lower() + qualified_base = f"{pname}/{umodel_base}" + if fav_lower in (umodel_lower, umodel_base, qualified, qualified_base): + front.append(remaining.pop(i)) + break + return front + remaining + + def _param_count(model_id: str) -> float: """Best-effort parameter count (in billions) parsed from a model id. @@ -3436,6 +3467,7 @@ def _price(c: tuple[str, dict, str]) -> float: continue if tier == _TIER_FREE: bucket = _quality_ordered_candidates(bucket, free_limits, reasoning_map) + bucket = _apply_favorite_free_ordering(bucket, config) elif tier == _TIER_LOCAL: # $0 like free — prefer the strongest local model (e.g. the larger # Ollama model) rather than rotating randomly. 
@@ -4253,6 +4285,8 @@ def on_success(pn: str, um: str, body=None) -> None: logger.info(" [%s] request-fit first-pick tier=%s", model_full, _target_reasoning_tier(payload)) if needed: ordered = _order_by_capability(ordered, needed, _model_capabilities(config)) + if is_free_virtual: + ordered = _apply_favorite_free_ordering(ordered, config) logger.info(" [%s] cycling through %d candidate(s)", model_full, len(ordered)) if is_streaming: timeout = server_cfg.get("stream_timeout", 300) diff --git a/llmproxy/static/admin/index.html b/llmproxy/static/admin/index.html index 9102e6a..8533fac 100644 --- a/llmproxy/static/admin/index.html +++ b/llmproxy/static/admin/index.html @@ -87,6 +87,8 @@ .vm code.id { font-size: 14px; font-weight: 700; } a { color: var(--accent); } .hidden { display: none !important; } + .fav-row { display: grid; grid-template-columns: auto 1fr auto auto auto; gap: 8px; align-items: center; padding: 7px 10px; border: 1px solid var(--border); border-radius: 8px; margin-bottom: 6px; } + .fav-rank { font-size: 11px; font-weight: 700; color: var(--muted); min-width: 18px; text-align: right; } @@ -164,6 +166,15 @@

Models & Categorizations

+
+

Favorite free models

+

These models are tried first (in ranked order) when routing through any */free virtual endpoint or the free tier of llmproxy/loadbalanced. A model is only selected if it is currently believed-free; it is skipped silently otherwise and the normal selection algorithm continues.

+
+
+ + +
+
@@ -424,7 +435,7 @@

${esc(name)} ${keyPill}

// ---- Models & categorizations ---- $("#discover").addEventListener("click", async () => { toast("Discovering…"); - try { const r = await api("/models"); DISCOVERED = r.models || []; renderModels(); toast(`Discovered ${DISCOVERED.length} models`); } + try { const r = await api("/models"); DISCOVERED = r.models || []; renderModels(); renderFavPicker(); toast(`Discovered ${DISCOVERED.length} models`); } catch (e) { toast(e.message, true); } }); $("#model-filter").addEventListener("input", renderModels); @@ -492,6 +503,73 @@

${esc(name)} ${keyPill}

} $("#refresh-vm").addEventListener("click", loadVirtual); +// ---- Favorite free models ---- +let FAVS = []; + +function renderFavPicker() { + const sel = $("#fav-pick"); + // Group discovered/known models by provider + const all = modelUniverse(); + const groups = {}; + for (const m of all) { + const slash = m.indexOf("/"); + const grp = slash >= 0 ? m.slice(0, slash) : "(bare)"; + (groups[grp] = groups[grp] || []).push(m); + } + sel.innerHTML = `` + + Object.keys(groups).sort().map(g => + `${groups[g].map(m => ``).join("")}` + ).join(""); +} + +function renderFavsList() { + const box = $("#fav-list"); + if (!FAVS.length) { box.innerHTML = `

No favorites yet.

`; return; } + box.innerHTML = FAVS.map((m, i) => ` +
+ ${i + 1} + ${esc(m)} + + + +
`).join(""); + box.querySelectorAll("[data-up]").forEach(b => b.addEventListener("click", () => moveFav(+b.dataset.up, -1))); + box.querySelectorAll("[data-dn]").forEach(b => b.addEventListener("click", () => moveFav(+b.dataset.dn, 1))); + box.querySelectorAll("[data-rm]").forEach(b => b.addEventListener("click", () => removeFav(+b.dataset.rm))); +} + +async function saveFavs() { + try { + await api("/favorite-free-models", { method: "PUT", body: JSON.stringify(FAVS) }); + toast("Favorite free models saved"); + } catch (e) { toast(e.message, true); } +} + +function moveFav(i, dir) { + const j = i + dir; + if (j < 0 || j >= FAVS.length) return; + [FAVS[i], FAVS[j]] = [FAVS[j], FAVS[i]]; + renderFavsList(); saveFavs(); +} + +function removeFav(i) { + FAVS.splice(i, 1); renderFavsList(); saveFavs(); +} + +$("#fav-add").addEventListener("click", () => { + const v = $("#fav-pick").value; if (!v) return; + if (FAVS.includes(v)) { toast("Already in favorites", true); return; } + FAVS.push(v); renderFavsList(); renderFavPicker(); saveFavs(); +}); + +async function loadFavs() { + try { + const r = await api("/favorite-free-models"); + FAVS = r.favorite_free_models || []; + renderFavsList(); renderFavPicker(); + } catch (e) { toast(e.message, true); } +} + // ---- Bootstrap ---- async function bootstrap() { try { @@ -515,6 +593,7 @@

${esc(name)} ${keyPill}

renderTemplateSelect(); await loadProviders(); renderModels(); + await loadFavs(); } catch (e) { setAuthState(false, "auth required"); toast(e.message, true); diff --git a/tests/test_admin_api.py b/tests/test_admin_api.py index 124dfa8..72e36b9 100644 --- a/tests/test_admin_api.py +++ b/tests/test_admin_api.py @@ -207,6 +207,47 @@ def test_put_believed_free_rejects_non_list(client): assert resp.status_code == 400 +# --------------------------------------------------------------------------- # +# Favorite free models + +def test_get_favorite_free_models_default_empty(client): + resp = client.get("/admin/api/favorite-free-models") + assert resp.status_code == 200 + assert resp.get_json()["favorite_free_models"] == [] + + +def test_put_favorite_free_models_valid(client, cfg_path): + favs = ["google/gemini-2.5-flash", "openai/gpt-4o-mini"] + resp = client.put("/admin/api/favorite-free-models", json=favs) + assert resp.status_code == 200 + assert resp.get_json()["favorite_free_models"] == favs + assert _read_config(cfg_path)["favorite_free_models"] == favs + + +def test_put_favorite_free_models_rejects_non_list(client): + resp = client.put("/admin/api/favorite-free-models", json={"model": "x"}) + assert resp.status_code == 400 + + +def test_put_favorite_free_models_rejects_non_string_entries(client): + resp = client.put("/admin/api/favorite-free-models", json=["ok", 42]) + assert resp.status_code == 400 + + +def test_favorite_free_models_in_config_get(client, cfg_path): + favs = ["google/gemini-flash"] + client.put("/admin/api/favorite-free-models", json=favs) + resp = client.get("/admin/api/config") + assert resp.status_code == 200 + assert resp.get_json()["favorite_free_models"] == favs + + +def test_get_favorite_free_models_round_trips_empty_list(client, cfg_path): + client.put("/admin/api/favorite-free-models", json=[]) + resp = client.get("/admin/api/favorite-free-models") + assert resp.get_json()["favorite_free_models"] == [] + + def 
test_put_model_reasoning_validates_level(client): resp = client.put("/admin/api/model-reasoning", json={"m": "ultra"}) assert resp.status_code == 400 diff --git a/tests/test_favorite_ordering.py b/tests/test_favorite_ordering.py new file mode 100644 index 0000000..47dbfea --- /dev/null +++ b/tests/test_favorite_ordering.py @@ -0,0 +1,138 @@ +"""Unit tests for _apply_favorite_free_ordering() in server.py.""" + +from __future__ import annotations + +import importlib +import json + +import pytest + + +@pytest.fixture(autouse=True) +def _reload_server(monkeypatch, tmp_path): + cfg = tmp_path / "config.json" + cfg.write_text(json.dumps({"providers": {}, "believed_free": []})) + monkeypatch.setenv("LLMPROXY_CONFIG", str(cfg)) + from llmproxy import config as config_mod + importlib.reload(config_mod) + from llmproxy import server as server_mod + importlib.reload(server_mod) + + +def _fn(): + from llmproxy import server as server_mod + return server_mod._apply_favorite_free_ordering + + +def _c(provider, model): + """Build a minimal (provider_name, provider_cfg, upstream_model) tuple.""" + return (provider, {}, model) + + +def test_empty_favorites_returns_unchanged(): + candidates = [_c("openai", "gpt-4o"), _c("google", "gemini-flash")] + result = _fn()(candidates, {"favorite_free_models": []}) + assert result == candidates + + +def test_no_favorites_key_returns_unchanged(): + candidates = [_c("openai", "gpt-4o"), _c("google", "gemini-flash")] + result = _fn()(candidates, {}) + assert result == candidates + + +def test_favorite_promoted_to_front(): + a = _c("openai", "gpt-4o") + b = _c("google", "gemini-flash") + candidates = [a, b] + result = _fn()(candidates, {"favorite_free_models": ["google/gemini-flash"]}) + assert result[0] == b + assert result[1] == a + + +def test_multiple_favorites_ranked_order(): + a = _c("openai", "gpt-4o-mini") + b = _c("google", "gemini-flash") + c = _c("anthropic", "haiku") + candidates = [a, b, c] + result = _fn()(candidates, 
{"favorite_free_models": ["anthropic/haiku", "google/gemini-flash"]}) + assert result[0] == c + assert result[1] == b + assert result[2] == a + + +def test_favorite_not_in_pool_silently_skipped(): + a = _c("openai", "gpt-4o") + candidates = [a] + # "paid-model" is not in the pool + result = _fn()(candidates, {"favorite_free_models": ["openai/paid-model", "openai/gpt-4o"]}) + assert result == [a] + + +def test_bare_id_match(): + a = _c("openai", "gpt-4o-mini") + b = _c("google", "gemini-flash") + candidates = [a, b] + # bare id (no provider prefix) matches by upstream_model + result = _fn()(candidates, {"favorite_free_models": ["gemini-flash"]}) + assert result[0] == b + assert result[1] == a + + +def test_qualified_id_match(): + a = _c("openai", "gpt-4o-mini") + b = _c("google", "gemini-flash") + candidates = [a, b] + result = _fn()(candidates, {"favorite_free_models": ["openai/gpt-4o-mini"]}) + assert result[0] == a + assert result[1] == b + + +def test_case_insensitive_match(): + a = _c("Google", "Gemini-Flash") + candidates = [a] + result = _fn()(candidates, {"favorite_free_models": ["google/gemini-flash"]}) + assert result == [a] + + +def test_non_favorite_order_preserved(): + a = _c("a", "model-a") + b = _c("b", "model-b") + c = _c("c", "model-c") + d = _c("d", "model-d") + candidates = [a, b, c, d] + # Only promote c; a, b, d should remain in their original relative order + result = _fn()(candidates, {"favorite_free_models": ["c/model-c"]}) + assert result == [c, a, b, d] + + +def test_empty_candidates_returns_empty(): + result = _fn()([], {"favorite_free_models": ["openai/gpt-4o"]}) + assert result == [] + + +def test_variant_suffix_stripped_bare(): + # "gemini-flash" matches upstream_model "gemini-flash:free" + a = _c("google", "gemini-flash:free") + b = _c("openai", "gpt-4o") + candidates = [b, a] + result = _fn()(candidates, {"favorite_free_models": ["gemini-flash"]}) + assert result[0] == a + + +def test_variant_suffix_stripped_qualified(): + # 
"google/gemini-flash" matches (provider="google", model="gemini-flash:free") + a = _c("google", "gemini-flash:free") + b = _c("openai", "gpt-4o") + candidates = [b, a] + result = _fn()(candidates, {"favorite_free_models": ["google/gemini-flash"]}) + assert result[0] == a + + +def test_exact_with_suffix_still_matches(): + # Specifying "google/gemini-flash:free" also works (exact match) + a = _c("google", "gemini-flash:free") + b = _c("openai", "gpt-4o") + candidates = [b, a] + result = _fn()(candidates, {"favorite_free_models": ["google/gemini-flash:free"]}) + assert result[0] == a