Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 46 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -184,7 +184,9 @@ When a request targets a (non-fusion) virtual model, llmproxy:
reorderings may then run on top without ever dropping a candidate: the
[request-fit triage](#request-fit-triage-every-free-and-local-virtual) for the
`*/free` and `*/local` virtuals, and [capability ordering](#capability-aware-routing--failover)
when the request forces a capability.
when the request forces a capability. Finally, any models listed in
[`favorite_free_models`](#favorite_free_models) that are present in the pool
are promoted to the front in ranked order before cycling begins.
3. **Tries each candidate in order**, returning the first **usable** response.

A candidate is considered to have **failed** — so llmproxy moves on to the next
Expand Down Expand Up @@ -678,6 +680,10 @@ Config is stored at `~/.config/llmproxy/config.json` (or the path in
"tokens_per_day": 500000
}
},
"favorite_free_models": [
"google/gemini-2.5-flash",
"groq/llama-3.1-8b-instant"
],

"free_tier": {
"sync_on_startup": true,
Expand Down Expand Up @@ -864,6 +870,45 @@ process — so it is "as far as we can tell in the moment". Any field set to `nu
is ignored; a provider with no `free_allowance` simply never gains free-in-the-
moment status.

<a name="favorite_free_models"></a>
### `favorite_free_models` — ranked priority list for free-tier routing

`favorite_free_models` is an **optional** top-level array of model IDs listed in
preference order. When a `*/free` virtual endpoint (e.g. `llmproxy/free`,
`llmproxy/deep__free`) or the free tier of `llmproxy/loadbalanced` selects a
backend, models in this list are promoted to the front of the candidate pool
**in the order listed**. The remaining candidates follow in the order produced
by the normal capacity/request-fit/capability algorithm.

```json
"favorite_free_models": [
"google/gemini-2.5-flash",
"anthropic/claude-3-5-haiku-20241022",
"gpt-4o-mini"
]
```

Each entry is matched case-insensitively against the upstream model ID (bare,
e.g. `gpt-4o-mini`) or the fully-qualified proxy ID (e.g.
`openai/gpt-4o-mini`). A `:variant` suffix on the upstream ID (e.g. `:free`) is
ignored when matching, so `x/y` also matches `x/y:free`. If several candidates
match the same entry, only the first one in the pool is promoted. A favorite is
only promoted if it is **currently believed-free** (present in `believed_free`
and not flagged as cost-observed). A favorite that is absent from the virtual
model's candidate pool is silently skipped; the remaining favorites and the
normal algorithm continue unchanged.

**Cost-observation persistence:** if a favorite is later removed from
`believed_free` because a cost was observed at runtime, it remains in
`favorite_free_models`. When a future sync restores it to the free pool (e.g.
the provider makes it free again), it is automatically re-promoted without any
manual config change.

`favorite_free_models` has no effect on non-free virtual endpoints
(`llmproxy/deep`, `llmproxy/tools`, etc.) or on fusion virtuals.

The admin UI's **Models & Categorizations** tab includes a **Favorite free
models** panel where you can add models from a grouped-by-provider picker,
reorder them with up/down buttons, and remove entries — changes are saved
immediately.

<a name="usage-accounting"></a>
### Token + cost accounting — `GET /v1/usage`

Expand Down
13 changes: 13 additions & 0 deletions llmproxy/admin.py
Original file line number Diff line number Diff line change
Expand Up @@ -463,6 +463,7 @@ def api_get_config():
return jsonify({
"providers": providers,
"believed_free": config.get("believed_free", []),
"favorite_free_models": config.get("favorite_free_models", []),
"model_reasoning": config.get("model_reasoning", {}),
"model_capabilities": config.get("model_capabilities", {}),
"free_limits": config.get("free_limits", {}),
Expand Down Expand Up @@ -794,6 +795,18 @@ def _put_section(key: str, validate):
return jsonify({key: payload})


@bp.route("/admin/api/favorite-free-models", methods=["GET", "PUT"])
def api_favorite_free_models():
    """Read or replace the ``favorite_free_models`` config section.

    A GET request returns the stored list (an empty list when the key is
    absent). Every other method that reaches this view is handed to
    ``_put_section`` together with the payload validator defined below.
    """

    def _check_payload(payload):
        # Error message for a rejected payload, ``None`` for an accepted one.
        if not isinstance(payload, list):
            return "favorite_free_models must be a list of strings."
        if any(not isinstance(entry, str) for entry in payload):
            return "favorite_free_models must be a list of strings."
        return None

    if request.method != "GET":
        return _put_section("favorite_free_models", _check_payload)

    stored = _load().get("favorite_free_models", [])
    return jsonify({"favorite_free_models": stored})


@bp.route("/admin/api/believed-free", methods=["GET", "PUT"])
def api_believed_free():
if request.method == "GET":
Expand Down
34 changes: 34 additions & 0 deletions llmproxy/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -2943,6 +2943,37 @@ def _provider_exposes_to_virtual_models(provider_cfg: dict) -> bool:
return provider_cfg.get("expose_to_virtual_models", True) is not False


def _apply_favorite_free_ordering(
candidates: list[tuple[str, dict, str]],
config: dict,
) -> list[tuple[str, dict, str]]:
"""Promote favorite_free_models to the front in ranked order.

Only candidates already present in the pool are promoted — favorites not in
the pool (e.g. cost-observed, not believed_free) are silently skipped.
Non-matching candidates retain their existing order after the favorites.

Matching is case-insensitive and ignores :variant suffixes (e.g. :free,
:nitro) so that "x/y" matches both "x/y" and "x/y:free".
"""
favorites = config.get("favorite_free_models", [])
if not favorites:
return candidates
remaining = list(candidates)
front: list[tuple[str, dict, str]] = []
for fav in favorites:
fav_lower = fav.lower()
for i, (pname, _pcfg, umodel) in enumerate(remaining):
umodel_lower = umodel.lower()
umodel_base = umodel_lower.split(":")[0] # strip :variant suffix
qualified = f"{pname}/{umodel}".lower()
qualified_base = f"{pname}/{umodel_base}"
if fav_lower in (umodel_lower, umodel_base, qualified, qualified_base):
front.append(remaining.pop(i))
break
return front + remaining


def _param_count(model_id: str) -> float:
"""Best-effort parameter count (in billions) parsed from a model id.

Expand Down Expand Up @@ -3436,6 +3467,7 @@ def _price(c: tuple[str, dict, str]) -> float:
continue
if tier == _TIER_FREE:
bucket = _quality_ordered_candidates(bucket, free_limits, reasoning_map)
bucket = _apply_favorite_free_ordering(bucket, config)
elif tier == _TIER_LOCAL:
# $0 like free — prefer the strongest local model (e.g. the larger
# Ollama model) rather than rotating randomly.
Expand Down Expand Up @@ -4253,6 +4285,8 @@ def on_success(pn: str, um: str, body=None) -> None:
logger.info(" [%s] request-fit first-pick tier=%s", model_full, _target_reasoning_tier(payload))
if needed:
ordered = _order_by_capability(ordered, needed, _model_capabilities(config))
if is_free_virtual:
ordered = _apply_favorite_free_ordering(ordered, config)
logger.info(" [%s] cycling through %d candidate(s)", model_full, len(ordered))
if is_streaming:
timeout = server_cfg.get("stream_timeout", 300)
Expand Down
81 changes: 80 additions & 1 deletion llmproxy/static/admin/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,8 @@
.vm code.id { font-size: 14px; font-weight: 700; }
a { color: var(--accent); }
.hidden { display: none !important; }
.fav-row { display: grid; grid-template-columns: auto 1fr auto auto auto; gap: 8px; align-items: center; padding: 7px 10px; border: 1px solid var(--border); border-radius: 8px; margin-bottom: 6px; }
.fav-rank { font-size: 11px; font-weight: 700; color: var(--muted); min-width: 18px; text-align: right; }
</style>
</head>
<body>
Expand Down Expand Up @@ -164,6 +166,15 @@ <h2>Models &amp; Categorizations</h2>
<div id="model-grid" class="grid-models" style="margin-top:12px"></div>
<div class="actions"><button class="btn primary" id="save-cats">Save categorizations</button></div>
</div>
<div class="card">
<h3>Favorite free models</h3>
<p class="muted">These models are tried first (in ranked order) when routing through any <code>*/free</code> virtual endpoint or the free tier of <code>llmproxy/loadbalanced</code>. A model is only selected if it is currently believed-free; it is skipped silently otherwise and the normal selection algorithm continues.</p>
<div id="fav-list" style="margin-bottom:12px"></div>
<div class="inline" style="gap:8px;flex-wrap:wrap">
<select id="fav-pick" style="flex:1;min-width:180px"><option value="">— pick a model to add —</option></select>
<button class="btn" id="fav-add">Add</button>
</div>
</div>
</section>

<!-- VIRTUAL -->
Expand Down Expand Up @@ -424,7 +435,7 @@ <h3>${esc(name)} ${keyPill}</h3>
// ---- Models & categorizations ----
$("#discover").addEventListener("click", async () => {
toast("Discovering…");
try { const r = await api("/models"); DISCOVERED = r.models || []; renderModels(); toast(`Discovered ${DISCOVERED.length} models`); }
try { const r = await api("/models"); DISCOVERED = r.models || []; renderModels(); renderFavPicker(); toast(`Discovered ${DISCOVERED.length} models`); }
catch (e) { toast(e.message, true); }
});
$("#model-filter").addEventListener("input", renderModels);
Expand Down Expand Up @@ -492,6 +503,73 @@ <h3>${esc(name)} ${keyPill}</h3>
}
$("#refresh-vm").addEventListener("click", loadVirtual);

// ---- Favorite free models ----
let FAVS = [];

function renderFavPicker() {
  // Rebuild the "add favorite" <select>: a placeholder option followed by one
  // <optgroup> per provider prefix (the text before the first "/"); ids with
  // no slash are grouped under "(bare)".
  const picker = $("#fav-pick");
  const byProvider = {};
  for (const modelId of modelUniverse()) {
    const cut = modelId.indexOf("/");
    const provider = cut >= 0 ? modelId.slice(0, cut) : "(bare)";
    byProvider[provider] = byProvider[provider] || [];
    byProvider[provider].push(modelId);
  }
  const optgroups = Object.keys(byProvider).sort().map(provider => {
    const options = byProvider[provider]
      .map(modelId => `<option value="${esc(modelId)}">${esc(modelId)}</option>`)
      .join("");
    return `<optgroup label="${esc(provider)}">${options}</optgroup>`;
  });
  picker.innerHTML = `<option value="">— pick a model to add —</option>` + optgroups.join("");
}

// Re-render the ranked favorites list into #fav-list from the FAVS array.
// Each row shows its 1-based rank, the escaped model id, and up / down /
// remove buttons.
function renderFavsList() {
const box = $("#fav-list");
// Empty state: show a placeholder and return before wiring any buttons.
if (!FAVS.length) { box.innerHTML = `<p class="muted">No favorites yet.</p>`; return; }
// "Move up" is disabled on the first row and "move down" on the last one.
box.innerHTML = FAVS.map((m, i) => `
<div class="fav-row">
<span class="fav-rank">${i + 1}</span>
<code style="font-size:13px;word-break:break-all">${esc(m)}</code>
<button class="btn" data-up="${i}" ${i === 0 ? "disabled" : ""} title="Move up">↑</button>
<button class="btn" data-dn="${i}" ${i === FAVS.length - 1 ? "disabled" : ""} title="Move down">↓</button>
<button class="btn danger" data-rm="${i}" title="Remove">✕</button>
</div>`).join("");
// Each button carries its row index in data-up / data-dn / data-rm. The markup
// was just replaced, so click listeners are attached afresh on every render.
box.querySelectorAll("[data-up]").forEach(b => b.addEventListener("click", () => moveFav(+b.dataset.up, -1)));
box.querySelectorAll("[data-dn]").forEach(b => b.addEventListener("click", () => moveFav(+b.dataset.dn, 1)));
box.querySelectorAll("[data-rm]").forEach(b => b.addEventListener("click", () => removeFav(+b.dataset.rm)));
}

async function saveFavs() {
  // PUT the whole FAVS array to the admin API and report the outcome via
  // toast(); a failure is surfaced to the user rather than rethrown.
  try {
    const body = JSON.stringify(FAVS);
    await api("/favorite-free-models", { method: "PUT", body });
    toast("Favorite free models saved");
  } catch (err) {
    toast(err.message, true);
  }
}

function moveFav(i, dir) {
  // Swap the favorite at index i with its neighbour at i + dir (the callers
  // in this section pass -1 for up and 1 for down), then re-render and save.
  const target = i + dir;
  if (target < 0 || target >= FAVS.length) return;
  const moved = FAVS[i];
  FAVS[i] = FAVS[target];
  FAVS[target] = moved;
  renderFavsList();
  saveFavs();
}

// Delete the favorite at index i, then re-render the list and save the change.
function removeFav(i) {
FAVS.splice(i, 1); renderFavsList(); saveFavs();
}

// "Add" button: append the picked model to FAVS unless nothing is selected or
// it is already a favorite, then refresh both widgets and save.
$("#fav-add").addEventListener("click", () => {
  const picked = $("#fav-pick").value;
  if (!picked) return;
  if (FAVS.includes(picked)) {
    toast("Already in favorites", true);
    return;
  }
  FAVS.push(picked);
  renderFavsList();
  renderFavPicker();
  saveFavs();
});

async function loadFavs() {
  // Fetch the stored favorites into FAVS (empty array when the field is
  // missing or falsy) and redraw the list and picker; errors go to toast().
  try {
    const reply = await api("/favorite-free-models");
    FAVS = reply.favorite_free_models || [];
    renderFavsList();
    renderFavPicker();
  } catch (err) {
    toast(err.message, true);
  }
}

// ---- Bootstrap ----
async function bootstrap() {
try {
Expand All @@ -515,6 +593,7 @@ <h3>${esc(name)} ${keyPill}</h3>
renderTemplateSelect();
await loadProviders();
renderModels();
await loadFavs();
} catch (e) {
setAuthState(false, "auth required");
toast(e.message, true);
Expand Down
41 changes: 41 additions & 0 deletions tests/test_admin_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -207,6 +207,47 @@ def test_put_believed_free_rejects_non_list(client):
assert resp.status_code == 400


# --------------------------------------------------------------------------- #
# Favorite free models

def test_get_favorite_free_models_default_empty(client):
    """GET returns 200 and an empty ``favorite_free_models`` list."""
    response = client.get("/admin/api/favorite-free-models")
    assert response.status_code == 200
    body = response.get_json()
    assert body["favorite_free_models"] == []


def test_put_favorite_free_models_valid(client, cfg_path):
    """A list of strings is accepted, echoed back, and written to the config."""
    expected = ["google/gemini-2.5-flash", "openai/gpt-4o-mini"]
    response = client.put("/admin/api/favorite-free-models", json=expected)
    assert response.status_code == 200
    assert response.get_json()["favorite_free_models"] == expected
    persisted = _read_config(cfg_path)
    assert persisted["favorite_free_models"] == expected


def test_put_favorite_free_models_rejects_non_list(client):
    """A JSON object instead of a list is answered with 400."""
    not_a_list = {"model": "x"}
    response = client.put("/admin/api/favorite-free-models", json=not_a_list)
    assert response.status_code == 400


def test_put_favorite_free_models_rejects_non_string_entries(client):
    """A list containing a non-string element is answered with 400."""
    mixed_entries = ["ok", 42]
    response = client.put("/admin/api/favorite-free-models", json=mixed_entries)
    assert response.status_code == 400


def test_favorite_free_models_in_config_get(client, cfg_path):
    """Favorites saved via PUT show up in the aggregate config response."""
    saved = ["google/gemini-flash"]
    client.put("/admin/api/favorite-free-models", json=saved)
    response = client.get("/admin/api/config")
    assert response.status_code == 200
    body = response.get_json()
    assert body["favorite_free_models"] == saved


def test_get_favorite_free_models_round_trips_empty_list(client, cfg_path):
    """Storing an empty list and reading it back yields an empty list."""
    client.put("/admin/api/favorite-free-models", json=[])
    response = client.get("/admin/api/favorite-free-models")
    body = response.get_json()
    assert body["favorite_free_models"] == []


def test_put_model_reasoning_validates_level(client):
resp = client.put("/admin/api/model-reasoning", json={"m": "ultra"})
assert resp.status_code == 400
Expand Down
Loading
Loading