UltraCode-Shim/proxy.py at main · OnlyTerp/UltraCode-Shim · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/env python3
"""
proxy.py -- Anthropic-API interceptor that gives Claude Code's
"UltraCode" behavior to ANY model and lets you pick those models from the
/model menu.

WHAT IT DOES
------------
Claude Code talks to ANTHROPIC_BASE_URL. Point that at this proxy and it:

  1. Forces the UltraCode envelope on every /v1/messages request:
        output_config.effort = "xhigh"
      + thinking            = {"type": "adaptive"}
      + max_tokens          >= UC_MAX_TOKENS (default 64000)
      + an injected "Ultracode is on" system reminder
     (Per the reverse-engineering in docs/HOW_IT_WORKS.md, that *is* what
     UltraCode is at the API boundary -- there is no secret model or field.)

  2. Serves GET /v1/models, merging the real Anthropic list with your own
     custom models (config.json "models") AND a built-in set of stock Anthropic
     models (real Claude -- Opus/Sonnet/Haiku). The stock set is always offered
     so real Claude never disappears from /model, even when the upstream
     /v1/models fetch can't run (no Anthropic credential to forward, offline,
     etc.). With Claude Code's gateway model discovery enabled
     (CLAUDE_CODE_ENABLE_GATEWAY_MODEL_DISCOVERY=1) those models appear in the
     /model picker. NOTE: Claude Code only keeps model ids matching
     /^(claude|anthropic)/i, so every custom id MUST start with "claude" or
     "anthropic".

  3. Routes each model id Claude Code sends to a real backend (config.json "routes"):
        - Anthropic passthrough  (real Claude, or any Anthropic endpoint)
        - openai_compat          (any OpenAI-compatible Chat Completions API,
                                   WITH full tool-calling translation)
        - codex_oauth            (GPT-5.5 via a ChatGPT/Codex login; needs the
                                   optional providers/codex_oauth.py)

It is dependency-light: Python 3 standard library only. No pip install.

ENV KNOBS
---------
  UC_LISTEN_HOST     default 127.0.0.1
  UC_LISTEN_PORT     default 8141
  UC_UPSTREAM        default https://api.anthropic.com
  UC_MAX_TOKENS      default 64000   (floor applied to max_tokens)
  UC_FORCE_EFFORT    default xhigh   (set empty to leave effort untouched)
  UC_FORCE_THINKING  default 1       (1 => force adaptive thinking)
  UC_INJECT_REMINDER default 1       (1 => inject the ultracode reminder)
  UC_INCLUDE_STOCK_MODELS default 1  (1 => always advertise stock Claude models
                     -- Opus/Sonnet/Haiku -- on /v1/models so real Claude never
                     drops out of the picker; 0 to advertise only your config)
  UC_STOCK_LEARN     default 1       (1 => learn the real Claude model ids from
                     any successful upstream /v1/models fetch and cache them to
                     disk, so a newly released Opus shows up with no code change;
                     0 to use only the built-in baseline)
  UC_STOCK_CACHE     optional path for the learned-stock cache (default: a
                     per-user state dir -- %LOCALAPPDATA%\\UltraCode-Shim or
                     $XDG_STATE_HOME/ultracode-shim)
  UC_STOCK_MODELS    optional JSON/CSV overriding the stock list entirely (wins
                     over both learned + built-in), e.g.
                     '["claude-opus-4-8","claude-sonnet-4-6"]' or a JSON array of
                     {"id","display_name"} objects
  UC_CONFIG          path to config.json (default: config.json beside proxy.py,
                     falling back to config.example.json)
  UC_MODEL_MAP       optional JSON, e.g. {"claude-opus-4-8":"my-model"}
  UC_LOG             optional log file path (default stderr)
  UC_VERBOSE         default 0
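  UC_BROWSER_UA      User-Agent for openai_compat upstreams (default: modern
                     Chrome UA). Fixes CF 403 "browser_signature_banned" on
                     providers like crof.ai. Override with env or per-route
                     "headers".
  UC_GUARD_LOCAL     default 1       (0 => disable the loopback Host-header
                     guard that blocks DNS-rebinding when bound to loopback)
  UC_MAX_BODY_BYTES  default 67108864 (64 MiB request-body cap; 0 disables)
  UC_MAX_CONNECTIONS default 128     (cap on concurrent in-flight requests;
                     0 disables)
  UC_SOCKET_TIMEOUT  default 660     (per-socket timeout, in seconds)
  UC_ROUTER, UC_ROUTER_TIMEOUT, UC_ROUTER_MAX_TOKENS, UC_ROUTER_LOG
                     Auto Router knobs (defaults 1 / 12 / 600 / 0); see
                     docs/AUTO_ROUTER.md
  UC_DIRECTIVES, UC_DIRECTIVES_NL, UC_DIRECTIVES_LOG
                     routing-directive ("pin") switches, each on only when set
                     to 1; see docs/DIRECTIVES.md
  UC_ADVERTISE_1M    default 1       (0 => never advertise the "[1m]" suffix)
  UC_1M_UPSTREAM     comma-separated upstream model ids treated as 1M-capable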

ROUTE SHAPE (config.json "routes" object)
-----------------------------------------
  {
    "claude-opus-4-8":   {"model": "claude-opus-4-8",
                          "upstream": "https://api.anthropic.com",
                          "auth": "passthrough"},
    "claude-mimo":       {"type": "openai_compat",
                          "model": "mimo-v2.5-pro",
                          "upstream": "https://token-plan-sgp.xiaomimimo.com/v1",
                          "auth": "Bearer ${MIMO_API_KEY}"},
    "claude-gpt-5.5":    {"type": "codex_oauth", "model": "gpt-5.5"}
  }

  type     omit for Anthropic passthrough; "openai_compat"; "codex_oauth"; or
           "auto" (the Auto Router -- a cheap classifier model scores the other
           backends per task and routes to the cheapest one that clears a
           quality bar; see the "router" section in config.json and
           docs/AUTO_ROUTER.md)
  model    backend model id sent upstream
  upstream backend base URL. openai_compat: the OpenAI base URL from the
           provider's docs (usually ends in /v1); the proxy appends
           /chat/completions. passthrough: a base the inbound path is appended to.
  auth     "passthrough" (keep Claude Code's own credential) OR a literal
           header value: "Bearer ${KEY}" / "x-api-key: ${KEY}". ${VARS} are
           expanded from the environment (export them, or use a gitignored
           ultracode.env that the launchers load).
  headers  optional dict of extra request headers (values support ${VARS}).
  max_output_tokens  optional completion cap for openai_compat (default 8192).
  body     optional dict of extra params merged into the openai_compat request
           body (values support ${VARS}). e.g. MiniMax-M3 needs
           {"reasoning_split": true} so its <think> chain-of-thought is kept out
           of the visible answer.
"""

import hashlib
import json
import os
import re
import sys
import threading
import time
import uuid
import urllib.request
import urllib.error
from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer

# --------------------------------------------------------------------------
# Config
# --------------------------------------------------------------------------

# All settings below are read once from the environment at import time.
# Two boolean conventions are in use:
#   `== "1"`  -> the flag is on only for the exact value "1"
#   `!= "0"`  -> the flag is on for anything except the exact value "0"
# Numeric settings go through int()/float(), so a non-numeric value raises
# ValueError while the module is being imported.
LISTEN_HOST = os.environ.get("UC_LISTEN_HOST", "127.0.0.1")
LISTEN_PORT = int(os.environ.get("UC_LISTEN_PORT", "8141"))
# Trailing slashes are stripped from the default upstream base URL.
UPSTREAM = os.environ.get("UC_UPSTREAM", "https://api.anthropic.com").rstrip("/")
MAX_TOKENS_FLOOR = int(os.environ.get("UC_MAX_TOKENS", "64000"))
FORCE_EFFORT = os.environ.get("UC_FORCE_EFFORT", "xhigh")
FORCE_THINKING = os.environ.get("UC_FORCE_THINKING", "1") == "1"
INJECT_REMINDER = os.environ.get("UC_INJECT_REMINDER", "1") == "1"
INCLUDE_STOCK_MODELS = os.environ.get("UC_INCLUDE_STOCK_MODELS", "1") != "0"
LEARN_STOCK_MODELS = os.environ.get("UC_STOCK_LEARN", "1") != "0"
VERBOSE = os.environ.get("UC_VERBOSE", "0") == "1"
# Empty string means "no log file"; log() then writes to stderr.
_LOG_PATH = os.environ.get("UC_LOG", "")

# --- Security & resource limits (issues #19, #22, #24) ------------------------
# Loopback Host-header guard: when bound to a loopback address (the default), the
# proxy serves only requests whose Host header is a loopback name. This blocks
# DNS-rebinding -- a malicious web page can't make a victim's browser read the
# backend config off /healthz or flip routing via /uc/select, because fetch()
# can't forge the Host header and the rebound hostname isn't loopback. Bind a
# non-loopback UC_LISTEN_HOST (LAN sharing) to opt out. (#19)
GUARD_LOCAL = os.environ.get("UC_GUARD_LOCAL", "1") != "0"
# Reject request bodies larger than this many bytes before allocating them
# (0 disables). Caps memory blowup from a single oversized upload. (#24)
# The default evaluates to 64 MiB (64 * 1024 * 1024 bytes).
MAX_BODY_BYTES = int(os.environ.get("UC_MAX_BODY_BYTES", str(64 * 1024 * 1024)))
# Cap concurrent in-flight requests so a connection flood can't spawn unbounded
# threads; excess connections wait for a slot (back-pressure). 0 disables. (#24)
MAX_CONNECTIONS = int(os.environ.get("UC_MAX_CONNECTIONS", "128"))
# Per-socket timeout (seconds) bounds idle / slowloris connections. Generous so
# it never trips an actively streaming response (each chunk resets it). (#24)
SOCKET_TIMEOUT = float(os.environ.get("UC_SOCKET_TIMEOUT", "660"))
# Inbound credentials never forwarded to a non-default, non-passthrough upstream
# (so a custom route can't exfiltrate Claude Code's own Anthropic key). (#22)
# Header names are listed in lower case.
_INBOUND_CRED_HEADERS = ("authorization", "x-api-key")


def _is_loopback_listen() -> bool:
    """Report whether the configured listen host (LISTEN_HOST) is a loopback bind.

    The value is trimmed and lower-cased, then accepted when it is one of the
    well-known loopback spellings or starts with "127.".
    """
    bind_host = LISTEN_HOST.strip().lower()
    if bind_host in ("localhost", "127.0.0.1", "::1"):
        return True
    return bind_host.startswith("127.")


def _request_host_is_local(host_header) -> bool:
    """Return True if the inbound Host header names a loopback address.

    Accepted forms (an optional ``:port`` is ignored):
      * ``localhost``
      * ``::1``, bare or bracketed (``[::1]:8141``)
      * a numeric IPv4 address in 127/8, e.g. ``127.0.0.1`` or ``127.1``

    Anything else -- including an empty or missing header -- returns False.

    The IPv4 check requires every dot-separated part after ``127`` to be numeric.
    A plain ``startswith("127.")`` test would also accept attacker-controlled DNS
    names such as ``127.evil.example.com``, which is exactly what the
    DNS-rebinding guard built on this function has to reject.
    """
    if not host_header:
        return False
    h = host_header.strip()
    if h.startswith("["):                       # [::1] or [::1]:port
        h = h[1:].split("]", 1)[0]
    elif h.count(":") == 1:                      # host:port (not bare IPv6)
        h = h.rsplit(":", 1)[0]
    h = h.strip().lower()
    if h in ("localhost", "::1"):
        return True
    # 127 followed by one to three purely numeric dotted parts (ASCII digits only).
    return re.fullmatch(r"127(?:\.[0-9]{1,3}){1,3}", h) is not None

# Auto Router knobs (see the "router" section in config.json + docs/AUTO_ROUTER.md).
# ROUTER_TIMEOUT is parsed with float() and ROUTER_MAX_TOKENS with int(); a
# non-numeric value raises ValueError at import time.
ROUTER_ENABLED_ENV = os.environ.get("UC_ROUTER", "1") != "0"
ROUTER_TIMEOUT = float(os.environ.get("UC_ROUTER_TIMEOUT", "12"))
ROUTER_MAX_TOKENS = int(os.environ.get("UC_ROUTER_MAX_TOKENS", "600"))
ROUTER_LOG = os.environ.get("UC_ROUTER_LOG", "0") == "1"

# Routing directives ("pins"): a prompt tag like [[route:codex]] / @codex forces a
# single request onto a specific backend, overriding orchestrator/worker selection
# AND the Auto Router. This is what lets an automated multi-agent workflow land each
# spawned sub-agent on the right model by role (plan->opus, code->composer, ...).
# OPT-IN: OFF unless turned on via "directives": {"enabled": true} in config.json
# (or UC_DIRECTIVES=1). Default => exact prior behavior, so this never disrupts an
# existing setup that hasn't asked for it. Final value is resolved in
# _configure_directives(); this is only the pre-config default. See docs/DIRECTIVES.md.
# Unlike its siblings, UC_DIRECTIVES has no default string: an unset variable
# yields None, which compares unequal to "1", so the flag starts False.
DIRECTIVES_ENABLED = os.environ.get("UC_DIRECTIVES") == "1"
DIRECTIVES_NL = os.environ.get("UC_DIRECTIVES_NL", "0") == "1"   # natural-language tier: opt-in (off by default)
DIRECTIVES_LOG = os.environ.get("UC_DIRECTIVES_LOG", "0") == "1"
DIRECTIVES = {"planner": None, "strip": True}   # filled from config in main()
_ROUTE_ALIASES = {}                              # normalized token -> concrete route id

# BROWSER_UA: browser UA for openai_compat (and classifier) calls.
# CF-protected providers (e.g. crof.ai) ban Python-urllib (error 1010
# "browser_signature_banned"). Matches droid/factory clients.
# Override: UC_BROWSER_UA=... or route "headers".
BROWSER_UA = os.environ.get(
    "UC_BROWSER_UA",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36",
)

# 1M context window: Claude Code sizes its context meter (and auto-compaction) to
# 1M only when the model id it holds carries a "[1m]" suffix. For a real-Claude
# passthrough route whose upstream model is 1M-capable, we ADVERTISE the picker id
# with that suffix on /v1/models + /healthz, so even an in-session /model switch
# (not just a launch-time pick) gets the 1M window. The suffix is a client-side
# convention, not an Anthropic model id: it is stripped before routing and
# normalized off the sticky orchestrator/worker selection, so internal ids stay
# clean. Disable with UC_ADVERTISE_1M=0. See docs/DIRECTIVES.md / PR #8 + #10.
_ONEM_SUFFIX = "[1m]"
ADVERTISE_1M = os.environ.get("UC_ADVERTISE_1M", "1") != "0"
# Upstream model ids treated as 1M-capable: UC_1M_UPSTREAM is split on commas,
# each token is whitespace-trimmed, and empty tokens are dropped.
_CONTEXT_1M_UPSTREAM = set(t.strip() for t in os.environ.get(
    "UC_1M_UPSTREAM",
    "claude-opus-4-8,claude-opus-4-7,claude-opus-4-6,claude-sonnet-4-6").split(",") if t.strip())


def _strip_1m(mid):
    """Return ``mid`` without a trailing "[1m]" window suffix.

    Non-string values, and strings that do not end with the suffix, are
    returned unchanged.
    """
    if not isinstance(mid, str):
        return mid
    if not mid.endswith(_ONEM_SUFFIX):
        return mid
    suffix_len = len(_ONEM_SUFFIX)
    return mid[:-suffix_len]


def _advertise_id(model_entry):
    """Pick the id to advertise for a configured model on /v1/models + /healthz.

    The "[1m]" suffix is appended only when all of the following hold:
      * ADVERTISE_1M is on and the entry is a dict with a string id,
      * the id does not already end with the suffix and is not a worker entry
        (does not start with WORKER_ID_PREFIX),
      * the id's slot in UC_SLOT_MAP is a dict whose "type" is absent or
        "anthropic" (a passthrough route), and
      * the slot's upstream model (or the id itself when the slot names no
        model) is listed in _CONTEXT_1M_UPSTREAM.

    In every other case the id is returned as-is (None when the entry is not a
    dict or has no id).
    """
    mid = model_entry.get("id") if isinstance(model_entry, dict) else None
    if not ADVERTISE_1M or not isinstance(mid, str):
        return mid
    already_tagged = mid.endswith(_ONEM_SUFFIX)
    if already_tagged or mid.startswith(WORKER_ID_PREFIX):
        return mid
    slot = UC_SLOT_MAP.get(mid)
    # Only passthrough (real Claude) routes are eligible for the suffix.
    is_passthrough = isinstance(slot, dict) and slot.get("type") in (None, "anthropic")
    if is_passthrough and (slot.get("model") or mid) in _CONTEXT_1M_UPSTREAM:
        return mid + _ONEM_SUFFIX
    return mid


def _display_name_for_id(mid):
    """Resolve a model id to its display name.

    Looks in UC_MODELS first and then in the stock models; the first entry
    whose "id" equals ``mid`` supplies the name (falling back to ``mid`` when
    that entry has no "display_name"). Returns None for a falsy ``mid`` and
    ``mid`` itself when no entry matches.
    """
    if not mid:
        return None
    # Catalogs are fetched lazily so the stock list is only built when the
    # configured models did not contain the id.
    for fetch_catalog in (lambda: UC_MODELS, _stock_models):
        for entry in fetch_catalog():
            if entry.get("id") == mid:
                return entry.get("display_name", mid)
    return mid


def _orchestrator_worker_status():
    """Build a status dict describing the current orchestrator/worker selection.

    A copy of _ACTIVE is taken under _SEL_LOCK, and everything else is computed
    from that snapshot outside the lock.
    """
    with _SEL_LOCK:
        snapshot = dict(_ACTIVE)
    orch_id = snapshot.get("orch")
    worker_id = snapshot.get("worker")
    status = {"enabled": ORCH_WORKER}
    status["orchestrator"] = {"id": orch_id,
                              "display_name": _display_name_for_id(orch_id)}
    status["worker"] = {"id": worker_id,
                        "display_name": _display_name_for_id(worker_id)}
    status["worker_explicit"] = snapshot.get("worker_explicit", False)
    # True only when both tiers are set and point at the same id.
    status["same_model"] = bool(orch_id and worker_id and orch_id == worker_id)
    return status


def _context_length_hint(detail):
    """Return a user-facing hint when an error detail looks context-length related.

    ``detail`` (None is treated as "") is lower-cased and searched for a fixed
    set of substrings. On the first hit a parenthesised hint with a leading
    space is returned; otherwise the empty string.
    """
    needles = ("context", "token", "maximum context",
               "too long", "too many tokens", "length exceeded")
    haystack = (detail or "").lower()
    for needle in needles:
        if needle in haystack:
            return (" (This backend rejected the full conversation history — the proxy "
                    "forwards the entire transcript with no trimming. Try compacting the "
                    "session, switching to a backend with a larger context window, or "
                    "starting a fresh session.)")
    return ""


# UC_MODEL_MAP: optional JSON object from the environment. An unset/empty
# variable parses as "{}"; a value that is not valid JSON, or that parses to
# something other than a dict, silently falls back to an empty mapping.
try:
    UC_MODEL_MAP = json.loads(os.environ.get("UC_MODEL_MAP", "") or "{}")
    if not isinstance(UC_MODEL_MAP, dict):
        UC_MODEL_MAP = {}
except Exception:
    UC_MODEL_MAP = {}

# Optional Codex/ChatGPT OAuth helper (only needed for "codex_oauth" routes).
# Tried first as providers.codex_oauth, then as a top-level codex_oauth module;
# if both imports fail for any reason the name is bound to None.
try:
    from providers import codex_oauth as _codex_oauth  # type: ignore
except Exception:
    try:
        import codex_oauth as _codex_oauth  # type: ignore
    except Exception:
        _codex_oauth = None

# Optional Cursor Composer helper (only needed for "cursor_agent" routes).
# Same two-step lookup as above; None when neither import succeeds.
try:
    from providers import cursor_agent as _cursor_agent  # type: ignore
except Exception:
    try:
        import cursor_agent as _cursor_agent  # type: ignore
    except Exception:
        _cursor_agent = None


_ENV_TOKEN = "${"


def _expand_env(value):
    """Substitute ``${VAR}`` references in a string with values from os.environ.

    Each reference runs from "${" to the next "}" and is replaced by the
    variable's value, or by the empty string when the variable is not set. A
    "${" with no closing brace is left as literal text. Values that are not
    strings, or that contain no "${" at all, are returned unchanged.
    """
    if not isinstance(value, str):
        return value
    if _ENV_TOKEN not in value:
        return value
    # [^}]* stops at the first closing brace, matching a left-to-right scan.
    return re.sub(r"\$\{([^}]*)\}",
                  lambda ref: os.environ.get(ref.group(1), ""),
                  value)


def _default_config_path():
    """Locate the default config file next to this module.

    Returns the first of config.json / config.example.json that exists as a
    file in this module's directory; when neither exists, the config.json path
    is returned anyway.
    """
    base_dir = os.path.dirname(os.path.abspath(__file__))
    candidates = [os.path.join(base_dir, fname)
                  for fname in ("config.json", "config.example.json")]
    existing = next((c for c in candidates if os.path.isfile(c)), None)
    if existing is not None:
        return existing
    return candidates[0]


def _strip_comments(obj):
    """Recursively remove dict keys whose string form starts with '_'.

    Such keys carry inline documentation in the config. Lists are processed
    element by element; any other value is returned as-is.
    """
    if isinstance(obj, list):
        return [_strip_comments(item) for item in obj]
    if not isinstance(obj, dict):
        return obj
    cleaned = {}
    for key, val in obj.items():
        if str(key).startswith("_"):
            continue
        cleaned[key] = _strip_comments(val)
    return cleaned


def load_config(path):
    """Read the JSON config at ``path`` (UTF-8) and return it with '_'-prefixed
    documentation keys removed by _strip_comments()."""
    with open(path, "r", encoding="utf-8") as fh:
        parsed = json.load(fh)
    return _strip_comments(parsed)


def _routes_to_slots(routes):
    """Convert the config's "routes" object into the slot map (UC_SLOT_MAP shape).

    Non-dict input yields {}, and route values that are not dicts are skipped.
    For each route only truthy fields are copied:
      * "model", "upstream" (trailing "/" stripped), a literal "auth", and the
        values of a "headers" dict have ${ENV} references expanded;
      * auth == "passthrough" is recorded as ``auth_passthrough: True`` instead
        of an "auth" value, so dispatch can tell the route should forward
        Claude Code's own credential (#22);
      * "type" and "max_output_tokens" are copied verbatim;
      * a "body" dict is carried raw (expanded at the use site).
    """
    if not isinstance(routes, dict):
        return {}
    slots = {}
    for route_id, spec in routes.items():
        if not isinstance(spec, dict):
            continue
        entry = {}

        model = spec.get("model")
        if model:
            entry["model"] = _expand_env(model)

        upstream = spec.get("upstream")
        if upstream:
            entry["upstream"] = _expand_env(upstream).rstrip("/")

        auth = spec.get("auth")
        if auth == "passthrough":
            entry["auth_passthrough"] = True
        elif auth:
            entry["auth"] = _expand_env(auth)

        # Copied as-is, no environment expansion.
        for plain_key in ("type", "max_output_tokens"):
            plain_val = spec.get(plain_key)
            if plain_val:
                entry[plain_key] = plain_val

        headers = spec.get("headers")
        if isinstance(headers, dict):
            entry["headers"] = {name: _expand_env(val) for name, val in headers.items()}

        body = spec.get("body")
        if isinstance(body, dict):
            entry["body"] = body

        slots[route_id] = entry
    return slots


def _models_from_config(models):
    """Turn the config's "models" list into /v1/models-style entries.

    Entries that are not dicts, or whose "id" is missing, empty, or not a
    string, are dropped. "display_name" defaults to the id and "created_at" to
    a fixed placeholder timestamp.
    """
    def _usable(entry):
        if not isinstance(entry, dict):
            return False
        mid = entry.get("id")
        return bool(mid) and isinstance(mid, str)

    return [
        {
            "type": "model",
            "id": entry["id"],
            "display_name": entry.get("display_name") or entry["id"],
            "created_at": entry.get("created_at") or "2025-01-01T00:00:00Z",
        }
        for entry in (models or [])
        if _usable(entry)
    ]


# Stock (real Claude) models. These are advertised on /v1/models in addition to
# whatever Anthropic's own /v1/models returns, so real Claude never disappears
# from the /model picker -- e.g. when there's no Anthropic credential to forward
# upstream, or the upstream fetch hiccups. They are NOT orchestrator/worker
# picker entries: stock ids must keep flowing through _select_target untouched
# so the dynamic-workflow background traffic (hardcoded to claude-opus-4-8) can
# still be remapped onto your pick instead of hijacking the selection.
#
# This is the built-in *baseline* (a floor, current at release time). At runtime
# the proxy also LEARNS the real Claude ids from any successful upstream
# /v1/models fetch and caches them to disk (see _learn_stock_from_upstream /
# UC_STOCK_LEARN), so a newly released Opus appears automatically with no code
# change. Precedence when building the advertised list: UC_STOCK_MODELS override
# (if set) wins outright; otherwise learned-from-upstream entries win over the
# baseline, and the baseline fills in anything not yet learned. Disable the whole
# thing with UC_INCLUDE_STOCK_MODELS=0; disable just learning with UC_STOCK_LEARN=0.
# Each entry carries only "id" and "display_name"; _stock_models() adds the
# "type" and "created_at" fields when it builds the advertised list.
STOCK_MODELS = [
    {"id": "claude-opus-4-8",   "display_name": "Claude Opus 4.8"},
    {"id": "claude-opus-4-7",   "display_name": "Claude Opus 4.7"},
    {"id": "claude-sonnet-4-6", "display_name": "Claude Sonnet 4.6"},
    {"id": "claude-haiku-4-5",  "display_name": "Claude Haiku 4.5"},
]

# Which upstream ids count as "real Claude" worth learning. Anthropic's
# /v1/models returns ids like "claude-opus-4-8" / "claude-haiku-4-5-20251001";
# we keep the dated and dateless forms but skip anything that isn't a Claude id.
# The pattern is case-insensitive and requires "claude" or "anthropic" at the
# start, immediately followed by "." or "-".
_STOCK_LEARN_RE = re.compile(r"^(claude|anthropic)[.-]", re.I)

# A trailing -YYYYMMDD / @YYYYMMDD snapshot suffix (pre-4.6 models ship dated;
# the dateless alias points at the same model). _model_family collapses the two
# so we never advertise both "claude-haiku-4-5" and "claude-haiku-4-5-20251001".
# The pattern is "-" or "@" followed by exactly eight digits at the end of the id.
_DATE_SUFFIX_RE = re.compile(r"[-@]\d{8}$")


def _model_family(mid):
    """Return the dedup key for a model id: the id with any trailing date
    snapshot suffix removed, lower-cased. A falsy id maps to "".

    Dated and dateless ids of one model share a key, so the stock list can
    avoid near-duplicate rows.
    """
    base = mid or ""
    undated = _DATE_SUFFIX_RE.sub("", base)
    return undated.lower()

# Learned stock cache (populated from disk at startup + refreshed on every
# successful upstream /v1/models fetch). Guarded by a lock for the threaded server.
_LEARNED_STOCK = []          # [{"id","display_name"}], most-recent upstream order
_LEARNED_STOCK_LOCK = threading.Lock()
# Flipped to True by _load_learned_stock() on its first attempt (whether or not
# the read succeeds), so the disk cache is only read once per process.
_LEARNED_STOCK_LOADED = False


def _stock_cache_path():
    """Return the path of the learned-stock cache file.

    A non-empty UC_STOCK_CACHE is returned verbatim. Otherwise the file lives
    in a per-user state directory: under LOCALAPPDATA (or the home directory)
    on Windows, and under XDG_STATE_HOME (or ~/.local/state) elsewhere.
    """
    explicit = os.environ.get("UC_STOCK_CACHE")
    if explicit:
        return explicit
    on_windows = os.name == "nt"
    if on_windows:
        root = os.environ.get("LOCALAPPDATA") or os.path.expanduser("~")
        return os.path.join(root, "UltraCode-Shim", "stock-models.json")
    root = os.environ.get("XDG_STATE_HOME")
    if not root:
        root = os.path.join(os.path.expanduser("~"), ".local", "state")
    return os.path.join(root, "ultracode-shim", "stock-models.json")


def _normalize_learned(items):
    """Reduce a list of model dicts to normalized, Claude-only entries.

    Keeps entries whose "id" is a string matching _STOCK_LEARN_RE. The first
    occurrence of each id wins and input order is preserved. Every result is
    {"id", "display_name"}, with the display name defaulting to the id. Items
    that are not dicts are ignored, and a falsy ``items`` yields [].
    """
    kept = {}   # id -> normalized entry; insertion order is the output order
    for entry in items or []:
        if not isinstance(entry, dict):
            continue
        mid = entry.get("id")
        if not isinstance(mid, str) or mid in kept:
            continue
        if not _STOCK_LEARN_RE.match(mid):
            continue
        kept[mid] = {"id": mid, "display_name": entry.get("display_name") or mid}
    return list(kept.values())


def _load_learned_stock():
    """Populate _LEARNED_STOCK from the on-disk cache, at most once per process.

    Does nothing when learning is disabled or a load was already attempted. The
    cache file may hold either a bare list of models or an object with a
    "models" list. A missing file is ignored silently; any other failure is
    reported through vlog() and otherwise swallowed.
    """
    global _LEARNED_STOCK, _LEARNED_STOCK_LOADED
    if _LEARNED_STOCK_LOADED or not LEARN_STOCK_MODELS:
        return
    # Mark first, so a failing read is not retried on every call.
    _LEARNED_STOCK_LOADED = True
    cache_file = _stock_cache_path()
    try:
        with open(cache_file, "r", encoding="utf-8") as fh:
            payload = json.load(fh)
        if isinstance(payload, dict):
            entries = payload.get("models")
        else:
            entries = payload
        restored = _normalize_learned(entries)
        if restored:
            with _LEARNED_STOCK_LOCK:
                _LEARNED_STOCK = restored
            vlog("loaded %d learned stock model(s) from %s" % (len(restored), cache_file))
    except FileNotFoundError:
        pass
    except Exception as err:
        vlog("could not read learned-stock cache %s: %s" % (cache_file, err))


def _learn_stock_from_upstream(upstream_data):
    """Learn the real Claude model ids from an upstream /v1/models payload.

    ``upstream_data`` is the "data" list of a successful upstream response. It
    is normalized with _normalize_learned(); when at least one entry survives,
    the result replaces the in-memory learned list. If the ordered list of ids
    differs from what was held before, the new list is also written to the
    cache file returned by _stock_cache_path() (via a temp file and
    os.replace).

    No-op when UC_STOCK_LEARN is disabled or nothing learnable is present.
    Failures while writing the cache are reported through vlog() and swallowed,
    so a persistence problem does not surface in the request path.
    """
    global _LEARNED_STOCK
    if not LEARN_STOCK_MODELS:
        return
    learned = _normalize_learned(upstream_data)
    if not learned:
        return
    with _LEARNED_STOCK_LOCK:
        # Only the id sequence is compared; a display-name-only change does not
        # trigger a rewrite of the cache file.
        changed = [m["id"] for m in learned] != [m["id"] for m in _LEARNED_STOCK]
        _LEARNED_STOCK = learned
    if not changed:
        return
    path = _stock_cache_path()
    try:
        # A bare filename (e.g. UC_STOCK_CACHE=stock-models.json) has an empty
        # dirname, and os.makedirs("") raises -- which would make every persist
        # attempt fail. Only create the parent directory when there is one.
        parent = os.path.dirname(path)
        if parent:
            os.makedirs(parent, exist_ok=True)
        tmp = path + ".tmp"
        with open(tmp, "w", encoding="utf-8") as f:
            json.dump({"fetched_at": int(time.time()), "upstream": UPSTREAM,
                       "models": learned}, f)
        os.replace(tmp, path)
        vlog("learned %d stock Claude model(s) from upstream -> %s"
             % (len(learned), path))
    except Exception as e:
        vlog("could not write learned-stock cache %s: %s" % (path, e))


def _parse_stock_override(raw):
    """Parse the UC_STOCK_MODELS override into [{"id", "display_name"}].

    ``raw`` may be a JSON array of ids, a JSON array of {"id","display_name"}
    objects (the two may be mixed), or a comma-separated list of ids.

    Returns:
      * None when the variable is unset/blank or is malformed JSON (the caller
        then uses the built-in/learned stock list; malformed JSON is logged);
      * otherwise the normalized list. An explicit empty list -- or JSON that
        is not an array -- yields [], which disables stock models.

    Only non-empty *string* ids are accepted, with surrounding whitespace
    trimmed. An object whose "id" is a number, list, etc. is skipped: letting it
    through would crash the id-rule regex applied later in _stock_models(). A
    "display_name" that is missing, empty, or not a string falls back to the id.
    """
    raw = (raw or "").strip()
    if not raw:
        return None
    if raw[0] in "[{":
        try:
            parsed = json.loads(raw)
        except Exception as e:
            log("UC_STOCK_MODELS is not valid JSON (%s); using the built-in stock list" % e)
            return None
    else:
        # CSV form: "claude-opus-4-8, claude-sonnet-4-6"
        parsed = [s.strip() for s in raw.split(",")]
    if not isinstance(parsed, list):
        return []
    out = []
    for item in parsed:
        if isinstance(item, str):
            mid, name = item.strip(), None
        elif isinstance(item, dict):
            mid, name = item.get("id"), item.get("display_name")
            mid = mid.strip() if isinstance(mid, str) else ""
        else:
            continue
        if not mid:
            continue
        if not (isinstance(name, str) and name):
            name = mid
        out.append({"id": mid, "display_name": name})
    return out


def _stock_source():
    """Assemble the stock model list as [{"id","display_name"}], before the
    discovery id-rule filter is applied.

    Precedence:
      1. A UC_STOCK_MODELS override, when set, is returned exactly as parsed
         (learning is ignored).
      2. Otherwise the learned-from-upstream entries come first, followed by
         the built-in baseline for anything not learned yet.

    Entries are deduplicated by model *family*, so a model's dated and dateless
    ids (e.g. claude-haiku-4-5-20251001 vs claude-haiku-4-5) collapse into a
    single row. Because learned entries are visited first, the upstream id wins.
    """
    override = _parse_stock_override(os.environ.get("UC_STOCK_MODELS"))
    if override is not None:
        return override
    _load_learned_stock()
    with _LEARNED_STOCK_LOCK:
        learned = list(_LEARNED_STOCK)
    merged = []
    families = set()
    # Identical ids always share a family, so tracking families alone also
    # removes exact-id duplicates.
    for entry in learned + STOCK_MODELS:
        mid = entry.get("id")
        if not mid:
            continue
        family = _model_family(mid)
        if family in families:
            continue
        families.add(family)
        merged.append(entry)
    return merged


def _stock_models():
    """Return the stock Claude models to advertise as /v1/models-style entries.

    Empty when UC_INCLUDE_STOCK_MODELS is off. Otherwise takes _stock_source()
    and keeps only ids that satisfy the /^(claude|anthropic)/i rule Claude Code
    enforces on discovery, giving each entry a fixed placeholder timestamp.
    """
    if not INCLUDE_STOCK_MODELS:
        return []
    id_rule = re.compile(r"^(claude|anthropic)", re.I)
    advertised = []
    for entry in _stock_source():
        mid = entry.get("id")
        if mid and id_rule.match(mid):
            advertised.append({
                "type": "model",
                "id": mid,
                "display_name": entry.get("display_name") or mid,
                "created_at": "2025-01-01T00:00:00Z",
            })
    return advertised


# System-reminder text for the UltraCode envelope (see UC_INJECT_REMINDER).
# \u2014 is an em dash.
ULTRACODE_REMINDER = (
    "Ultracode is on: optimize for the most exhaustive, correct answer \u2014 not "
    "the fastest or cheapest. Use the Workflow tool on every substantive task; "
    "token cost is not a constraint. See the Workflow tool's **Ultracode** "
    "section and quality patterns. Solo only on conversational/trivial turns."
)
# Leading text of ULTRACODE_REMINDER. Presumably used to detect a reminder that
# is already present before injecting another -- confirm at the use site.
_REMINDER_FINGERPRINT = "Ultracode is on:"

# Held by log() around each write.
_log_lock = threading.Lock()


def log(msg: str) -> None:
    """Emit one timestamped log line.

    The line goes to the UC_LOG file when one is configured and the append
    succeeds; otherwise (no file configured, or the write failed) it goes to
    stderr. Writes are serialized with _log_lock.
    """
    stamped = "[%s] %s" % (time.strftime("%H:%M:%S"), msg)
    with _log_lock:
        written = False
        if _LOG_PATH:
            try:
                with open(_LOG_PATH, "a", encoding="utf-8") as fh:
                    fh.write(stamped + "\n")
                written = True
            except Exception:
                # Best-effort: fall back to stderr below.
                written = False
        if not written:
            sys.stderr.write(stamped + "\n")
            sys.stderr.flush()


def vlog(msg: str) -> None:
    """Verbose-only logging: forward ``msg`` to log() when VERBOSE is truthy,
    otherwise do nothing."""
    if not VERBOSE:
        return
    log(msg)


# Copy of UPSTREAM as it stands when this line runs at import time.
# NOTE(review): presumably kept as the fallback upstream -- confirm at the use
# sites.
DEFAULT_UPSTREAM = UPSTREAM
# Picker/route id -> slot dict. Slot dicts carry a "type" key (a slot of type
# "auto" is never used as a directive pin target).
UC_SLOT_MAP = {}   # populated in main()
# Advertised model entries: dicts with "id", "display_name", "created_at".
UC_MODELS = []     # populated in main()

# Auto Router state (populated in main() from config.json "router").
ROUTER = {
    "enabled": False,
    "id": "claude-auto",      # the picker id that triggers smart routing
    "classifier": None,       # route id of the cheap model that scores candidates
    "threshold": 0.7,         # success-probability bar; cheapest candidate >= this wins
    "candidates": [],         # [{"id","cost","card","supports_images"}]
    "default": None,          # fallback candidate id when classification can't run
    "cache": True,            # reuse the decision across a task's tool-call round-trips
}


# --------------------------------------------------------------------------
# Orchestrator + Worker (two-model dynamic workflows)
# --------------------------------------------------------------------------
# Claude Code's /model picker is single-slot, and its dynamic-workflow machinery
# issues most of its background traffic as the stock model (claude-opus-4-8 etc.)
# regardless of your pick -- so the sub-agents/leaves that do the bulk of a
# workflow's work don't follow your selection. This proxy fixes that by holding a
# sticky two-tier selection and routing EVERY request by tier:
#   heavy (orchestrator: the main interactive loop -- carries an interactive-only
#          tool like AskUserQuestion/ExitPlanMode) -> the orchestrator model
#   fast  (worker: every Workflow/Task sub-agent + background call)
#          -> the worker model
# main() auto-adds a "Worker -> X" picker entry (id claude-worker-X) for each of
# your models, so you can pick an orchestrator AND a worker from /model. A plain
# pick sets BOTH tiers (one model everywhere); a "Worker -> X" pick sets only the
# worker tier. Stock opus/sonnet/haiku ids never change the selection -- they are
# remapped to it, so background workflow traffic follows your pick instead of
# silently billing the stock model. Disable with UC_ORCH_WORKER=0.
# Feature switch: on unless UC_ORCH_WORKER is set to something other than "1".
ORCH_WORKER = os.environ.get("UC_ORCH_WORKER", "1") == "1"
# Prefix of the synthesized "Worker -> X" picker ids.
WORKER_ID_PREFIX = "claude-worker-"
# Off unless UC_TIER_LOG is exactly "1". NOTE(review): presumably enables
# per-request tier logging -- confirm at the use site.
TIER_LOG = os.environ.get("UC_TIER_LOG", "0") == "1"
# Tools the harness hands ONLY to the main interactive loop (never to Workflow/
# Task sub-agents). Their presence marks the orchestrator ("heavy") -- a far more
# reliable structural signal than scraping the system prompt.
_INTERACTIVE_ONLY_TOOLS = frozenset({
    "AskUserQuestion", "ExitPlanMode", "EnterPlanMode",
})
# Guards every read/write of _ACTIVE.
_SEL_LOCK = threading.Lock()
# Sticky two-tier selection. "worker_explicit" is True once the worker tier was
# chosen on its own; while False, picking an orchestrator also sets the worker.
_ACTIVE = {"orch": None, "worker": None, "worker_explicit": False}
_ORCH_PICK_IDS = set()   # base orchestrator picker ids (filled in main())
_WORKER_MAP = {}         # claude-worker-<x> -> claude-<x>  (filled in main())
# Latch for _warn_no_selection_once(): flipped to True after the first warning.
_WARNED_NO_SELECTION = False


def _selection_cache_path() -> str:
    """Return the file path used to persist the sticky orchestrator/worker
    selection, or "" when persistence is disabled.

    The path comes from the UC_SELECTION_CACHE environment variable (set by the
    launchers) with surrounding whitespace removed. Persisting the pick lets it
    survive a proxy restart -- otherwise a restarted proxy forgets the
    selection and workflow sub-agents silently fall back to stock Claude.
    (issue #18)
    """
    configured = os.environ.get("UC_SELECTION_CACHE")
    if configured is None:
        return ""
    return configured.strip()


def _save_selection() -> None:
    """Persist the current orchestrator/worker selection to the cache file.

    No-op when _selection_cache_path() returns "". The selection is snapshotted
    under _SEL_LOCK, written as JSON to a scratch file next to the target, and
    moved into place with os.replace() so a reader never sees a half-written
    file. Never raises: any failure is reported through vlog() and the scratch
    file is removed so failed writes do not litter the cache directory.
    """
    path = _selection_cache_path()
    if not path:
        return
    tmp = None
    try:
        with _SEL_LOCK:
            data = {"orch": _ACTIVE["orch"], "worker": _ACTIVE["worker"],
                    "worker_explicit": _ACTIVE["worker_explicit"]}
        d = os.path.dirname(path)
        if d:
            os.makedirs(d, exist_ok=True)
        # pid + thread id: two threads of this process saving at the same time
        # must not write through the same scratch file.
        tmp = "%s.tmp.%d.%d" % (path, os.getpid(), threading.get_ident())
        with open(tmp, "w", encoding="utf-8") as f:
            json.dump(data, f)
        os.replace(tmp, path)
    except Exception as e:
        vlog("selection persist failed: %s" % e)
        if tmp:
            # Best-effort cleanup of a scratch file left by a failed
            # dump/replace; it may legitimately not exist.
            try:
                os.remove(tmp)
            except OSError:
                pass


def _load_selection() -> None:
    """Restore the orchestrator/worker selection from the cache file.

    No-op when persistence is disabled, the file is missing, it cannot be
    parsed (reported via vlog()), or its top level is not a JSON object.
    Only non-empty strings are accepted as ids: anything else (numbers, lists,
    objects from a corrupt or hand-edited file) is treated as "no selection"
    rather than being installed as a route id. Logs the restored pair when at
    least one tier was recovered.
    """
    path = _selection_cache_path()
    if not path or not os.path.isfile(path):
        return
    try:
        with open(path, encoding="utf-8") as f:
            data = json.load(f)
    except Exception as e:
        vlog("selection restore failed: %s" % e)
        return
    if not isinstance(data, dict):
        return

    def _valid_id(value):
        # Route ids are strings; reject every other JSON type.
        return value if isinstance(value, str) and value else None

    orch = _valid_id(data.get("orch"))
    worker = _valid_id(data.get("worker"))
    with _SEL_LOCK:
        _ACTIVE["orch"] = orch
        _ACTIVE["worker"] = worker
        _ACTIVE["worker_explicit"] = bool(data.get("worker_explicit"))
    # Log from the local values so _ACTIVE is never read outside the lock.
    if orch or worker:
        log("restored orchestrator/worker selection: orch=%s worker=%s"
            % (orch, worker))


def _has_active_selection() -> bool:
    """True when at least one of the orchestrator/worker tiers currently holds
    a selection (read under _SEL_LOCK)."""
    with _SEL_LOCK:
        orch_id, worker_id = _ACTIVE["orch"], _ACTIVE["worker"]
    return bool(orch_id or worker_id)


def _warn_no_selection_once(model) -> None:
    """Log, at most once per process, why traffic is going out as stock Claude.

    Orchestrator/worker routing is on but nothing has been selected yet, so a
    stock/unknown id passes through unchanged -- the most common cause of
    "workflows ignore my models". (issue #18)
    """
    global _WARNED_NO_SELECTION
    if not _WARNED_NO_SELECTION:
        # Flip the latch before logging so later calls stay silent.
        _WARNED_NO_SELECTION = True
        message = (
            "orchestrator/worker is ON but no model is selected yet; request for '%s' "
            "is passing through as stock Claude. Pick a model in the pre-launch "
            "selector or via /model so workflow sub-agents use it. (issue #18)"
        ) % model
        log(message)


def _request_tier(body: dict) -> str:
    """Classify a request as "heavy" or "fast".

    "heavy" is the main interactive loop: its ``tools`` list carries at least
    one entry named in _INTERACTIVE_ONLY_TOOLS. Everything else (Workflow/Task
    sub-agents, background calls) is "fast". With ORCH_WORKER off every request
    is "heavy".
    """
    if not ORCH_WORKER:
        return "heavy"
    offered = body.get("tools")
    if not isinstance(offered, list):
        return "fast"
    is_interactive = any(
        isinstance(tool, dict) and tool.get("name") in _INTERACTIVE_ONLY_TOOLS
        for tool in offered
    )
    return "heavy" if is_interactive else "fast"


def _set_selection(orch=None, worker=None):
    """Pre-set the sticky orchestrator/worker selection directly (used by the
    two-column pre-launch selector via POST /uc/select).

    Both ids are first passed through _strip_1m() so the selection stores clean
    ids. A tier whose cleaned id is None is left unchanged; a falsy non-None id
    clears that tier. Setting the worker also records whether it is an explicit
    choice. When only an orchestrator is given and no explicit worker is held,
    the worker follows the orchestrator. The result is persisted, and a copy of
    the resulting active selection dict is returned.
    """
    new_orch = _strip_1m(orch)
    new_worker = _strip_1m(worker)
    with _SEL_LOCK:
        if new_orch is not None:
            _ACTIVE["orch"] = new_orch if new_orch else None
        if new_worker is None:
            # No worker supplied: let it track the orchestrator unless the
            # user previously chose a worker explicitly.
            if new_orch and not _ACTIVE["worker_explicit"]:
                _ACTIVE["worker"] = new_orch
        else:
            _ACTIVE["worker"] = new_worker if new_worker else None
            _ACTIVE["worker_explicit"] = bool(new_worker)
        snapshot = dict(_ACTIVE)
    _save_selection()
    return snapshot


def _select_target(mid, tier: str):
    """Fold a deliberate picker choice into the sticky selection and return the
    picker id this request should route to for its tier.

    With ORCH_WORKER off, ``mid`` is returned untouched. Otherwise the id is
    cleaned with _strip_1m() (a [1m]-suffixed pick maps to its clean route id):
    a worker picker id sets the worker tier explicitly; a base orchestrator
    picker id sets the orchestrator and, unless a worker was chosen explicitly,
    the worker too. Any other id (stock opus/sonnet/haiku or unknown) is not a
    selection. "heavy" requests prefer the orchestrator and "fast" requests the
    worker, each falling back to the other tier and finally to the cleaned id
    when nothing is selected -- so fresh sessions behave exactly as before.
    """
    if not ORCH_WORKER:
        return mid
    clean_id = _strip_1m(mid)
    with _SEL_LOCK:
        worker_pick = clean_id in _WORKER_MAP
        orch_pick = (not worker_pick) and clean_id in _ORCH_PICK_IDS
        if worker_pick:
            _ACTIVE["worker"] = _WORKER_MAP[clean_id]
            _ACTIVE["worker_explicit"] = True
        elif orch_pick:
            _ACTIVE["orch"] = clean_id
            if not _ACTIVE["worker_explicit"]:
                _ACTIVE["worker"] = clean_id
        current_orch = _ACTIVE["orch"]
        current_worker = _ACTIVE["worker"]
    if worker_pick or orch_pick:
        # Persist deliberate /model picks across restarts (#18).
        _save_selection()
    if tier == "heavy":
        routed = current_orch or current_worker
    else:
        routed = current_worker or current_orch
    return routed or clean_id


def _wire_orchestrator_worker():
    """Populate the orchestrator-pick ids and the worker map from UC_MODELS, and
    append a synthesized "Worker -> X" picker entry for each advertised model.

    For every base model id ``claude-<x>`` (or ``<x>`` when there is no
    ``claude-`` prefix) this registers the id in _ORCH_PICK_IDS, maps
    ``claude-worker-<x>`` back to it in _WORKER_MAP, appends the worker entry
    to UC_MODELS, and copies the base model's slot in UC_SLOT_MAP so the worker
    id routes like its base model. Does nothing when ORCH_WORKER is off or
    UC_MODELS is empty. Idempotent; called from main() after models/slots load.
    """
    if not (ORCH_WORKER and UC_MODELS):
        return
    # Iterate a copy: worker entries are appended to UC_MODELS inside the loop.
    for m in list(UC_MODELS):
        mid = m.get("id")
        # Skip entries without an id and entries that are already worker ids.
        if not mid or mid in _WORKER_MAP or mid.startswith(WORKER_ID_PREFIX):
            continue
        _ORCH_PICK_IDS.add(mid)
        suffix = mid[len("claude-"):] if mid.startswith("claude-") else mid
        wid = WORKER_ID_PREFIX + suffix
        if wid in _WORKER_MAP:
            # Already wired (repeat call, or two base ids share a suffix).
            continue
        _WORKER_MAP[wid] = mid
        UC_MODELS.append({
            "type": "model", "id": wid,
            # `or mid` (not a .get default) so a present-but-null/empty
            # display_name still yields a readable label instead of
            # "Worker -> None".
            "display_name": "Worker \u2192 %s" % (m.get("display_name") or mid),
            "created_at": m.get("created_at") or "2025-01-01T00:00:00Z",
        })
        if mid in UC_SLOT_MAP and wid not in UC_SLOT_MAP:
            UC_SLOT_MAP[wid] = dict(UC_SLOT_MAP[mid])
    if _WORKER_MAP:
        log("orchestrator+worker enabled: %d model(s), worker ids: %s"
            % (len(_ORCH_PICK_IDS), ", ".join(sorted(_WORKER_MAP))))


# --------------------------------------------------------------------------
# Routing directives ("pins") -- force a request onto a specific backend
# --------------------------------------------------------------------------
# A workflow (or a human) can tag a request's prompt to pin it to ONE backend,
# overriding the orchestrator/worker selection AND the Auto Router. This is how an
# automated multi-agent workflow lands each spawned sub-agent on the right model by
# role -- e.g. plan->opus, code->composer, review->codex, fix->claude -- with no
# turn-by-turn driving: the workflow script bakes a role tag into each agent()
# prompt and the proxy hard-pins that request.
#
# Marker tiers (case-insensitive), most explicit first; a tier wins only if it
# resolves to EXACTLY ONE configured backend (naming two models is ambiguous ->
# ignored, normal routing decides):
#   1. [[route:codex]]                         sentinel  (stripped before forwarding)
#   2. @codex  use:codex  route:codex  model:codex   tag   (stripped)
#   3. "...have codex review...", "ask codex to ..."  natural language (UC_DIRECTIVES_NL)
#
# The token after a marker is resolved through an alias table auto-derived from
# your model ids + display names (plus router.aliases / directives.aliases
# overrides). A pin to an unconfigured or "auto" route is ignored so a request is
# never broken.
# Sentinel tier: [[route:X]], [[model:X]] or [[use:X]]; whitespace is allowed
# around the keyword, the colon and the token. Group 1 is the token X.
_DIRECTIVE_SENTINEL = re.compile(r"\[\[\s*(?:route|model|use)\s*:\s*([A-Za-z0-9._\-]+)\s*\]\]", re.I)
# Tag tier: @X, or route:X / model:X / use:X. The lookbehind only lets the tag
# start at the beginning of the text or right after whitespace or "(", so the
# "@" inside something like name@host is not a tag. Group 1 is the token X.
_DIRECTIVE_TAG = re.compile(r"(?<![^\s(])(?:@|(?:route|model|use)\s*:\s*)([A-Za-z0-9._\-]+)", re.I)
# Natural-language tier: one of the listed verbs/prepositions as a whole word,
# then whitespace, then the token (group 1).
_DIRECTIVE_NL = re.compile(r"\b(?:use|using|have|ask|let|route\s+to|via|with)\s+([A-Za-z0-9._\-]+)", re.I)


def _norm_alias(s):
    """Reduce a token to a matchable alias key.

    The value is stringified, lowercased, and everything except ASCII letters
    and digits is dropped, so 'GPT-5.5', 'gpt5.5' and 'gpt_5_5' all collapse to
    the same key.
    """
    keep = "abcdefghijklmnopqrstuvwxyz0123456789"
    lowered = str(s).lower()
    return "".join(ch for ch in lowered if ch in keep)


def _resolve_alias(token):
    """Look up ``token`` in the route alias table after normalizing it with
    _norm_alias(); returns the table's value, or None when there is no entry."""
    alias_key = _norm_alias(token)
    return _ROUTE_ALIASES.get(alias_key)


def _latest_user_turn(anth_body):
    """Find the newest user turn that carries real instruction text.

    Walks ``messages`` from last to first and returns ``(message_dict, text)``
    for the first user message whose text is non-blank, with the text stripped.
    Turns made up purely of tool_result blocks (tool round-trips) are skipped
    so a sub-agent's task tag stays sticky across its tool calls. Returns
    ``(None, "")`` when no such turn exists.
    """
    for turn in reversed(anth_body.get("messages") or []):
        is_user_turn = isinstance(turn, dict) and turn.get("role") == "user"
        if not is_user_turn:
            continue
        payload = turn.get("content")
        if isinstance(payload, str):
            raw = payload
        elif isinstance(payload, list):
            # Drop tool_result blocks; a turn with nothing else is a pure
            # tool round-trip.
            blocks = [blk for blk in payload
                      if not isinstance(blk, dict) or blk.get("type") != "tool_result"]
            if not blocks:
                continue
            raw = _text_from_anthropic_content(blocks)
        else:
            raw = _text_from_anthropic_content(payload)
        stripped = (raw or "").strip()
        if stripped:
            return turn, stripped
    return None, ""


def _detect_directive(text):
    """Scan ``text`` for routing markers, most explicit tier first.

    Returns ``(route_ids, spans, tier)`` for the first tier (sentinel, then
    tag, then -- only when DIRECTIVES_NL is on -- natural language) in which at
    least one marker resolves through _resolve_alias(). ``route_ids`` are
    unique and in order of first appearance; ``spans`` are the literal marker
    substrings to strip (always empty for the natural-language tier -- that's
    prose, left intact). Returns ``([], [], None)`` when nothing resolves.
    """
    def collect(pattern):
        routes, markers = [], []
        already = set()
        for hit in pattern.finditer(text):
            route = _resolve_alias(hit.group(1))
            if route:
                markers.append(hit.group(0))
                if route not in already:
                    already.add(route)
                    routes.append(route)
        return routes, markers

    explicit_tiers = (("sentinel", _DIRECTIVE_SENTINEL), ("tag", _DIRECTIVE_TAG))
    for tier_name, pattern in explicit_tiers:
        routes, markers = collect(pattern)
        if routes:
            return routes, markers, tier_name
    if DIRECTIVES_NL:
        routes, _ = collect(_DIRECTIVE_NL)
        if routes:
            return routes, [], "nl"
    return [], [], None


def _strip_spans_in_msg(msg, spans):
    """Remove matched marker substrings from a user turn's text in-place so the
    backend model never sees the routing tag.

    ``msg`` is a message dict whose "content" is either a string or a list of
    content blocks; only blocks of type "text" are rewritten. ``spans`` is the
    list of literal marker strings to delete. Does nothing when ``spans`` is
    empty or ``msg`` is not a dict.

    Stripping never leaves empty text behind, because an empty prompt or empty
    text block can make the forwarded request invalid: a text block that
    becomes empty is dropped from the list, and when removing the markers would
    leave no content at all (a marker-only prompt) the message is left
    untouched.
    """
    if not spans or not isinstance(msg, dict):
        return
    # Longest marker first, so a marker that is a prefix of another ("@gpt" vs
    # "@gpt5") cannot chop the longer one and leave residue behind.
    ordered = sorted(spans, key=len, reverse=True)

    def clean(s):
        # Remove the marker itself; do NOT globally collapse whitespace -- that
        # would flatten indentation in any code the prompt carries. Only tidy
        # trailing spaces left on a line and trim the ends.
        for sp in ordered:
            s = s.replace(sp, "")
        return re.sub(r"[ \t]+(\n|$)", r"\1", s).strip()

    def is_text_block(b):
        return (isinstance(b, dict) and b.get("type") == "text"
                and isinstance(b.get("text"), str))

    content = msg.get("content")
    if isinstance(content, str):
        cleaned = clean(content)
        if cleaned:
            msg["content"] = cleaned
    elif isinstance(content, list):
        # None marks a non-text block (kept as-is); "" marks a text block that
        # stripping emptied (dropped).
        cleaned_texts = [clean(b["text"]) if is_text_block(b) else None
                         for b in content]
        kept = [b for b, c in zip(content, cleaned_texts) if c is None or c]
        if not kept:
            return
        for b, c in zip(content, cleaned_texts):
            if c:
                b["text"] = c
        content[:] = kept


def _directive_pin(body):
    """Return the route id a prompt directive pins this request to, or None.

    The newest instruction-bearing user turn is scanned for a directive. A pin
    is honoured only when exactly one route is named and that route maps to a
    slot dict in UC_SLOT_MAP whose type is not "auto"; otherwise normal routing
    decides. When a pin is found and stripping is enabled, the marker text is
    removed from the message in-place. Never raises: any error is (optionally)
    logged and treated as "no pin".
    """
    if not DIRECTIVES_ENABLED:
        return None
    try:
        turn, prompt = _latest_user_turn(body)
        if not prompt:
            return None
        routes, markers, tier = _detect_directive(prompt)
        if len(routes) > 1:
            # Several backends named: ambiguous, so nothing is pinned.
            if DIRECTIVES_LOG:
                log("[directive] ambiguous (%s named); ignored" % ", ".join(routes))
            return None
        if not routes:
            return None
        route = routes[0]
        slot = UC_SLOT_MAP.get(route)
        usable = isinstance(slot, dict) and slot.get("type") != "auto"
        if not usable:
            if DIRECTIVES_LOG:
                log("[directive] '%s' (%s) not a usable backend; ignored" % (route, tier))
            return None
        if DIRECTIVES.get("strip", True) and markers:
            _strip_spans_in_msg(turn, markers)
        return route
    except Exception as err:
        if DIRECTIVES_LOG:
            log("[directive] error: %s" % err)
        return None


def _is_plan_mode(body):
    """True when the request is the interactive planning loop, detected by an
    ExitPlanMode entry in the request's ``tools`` (the harness offers that tool
    only while in plan mode). A missing or empty ``tools`` yields False."""
    offered = body.get("tools") or []
    return any(
        isinstance(tool, dict) and tool.get("name") == "ExitPlanMode"
        for tool in offered
    )


def _configure_directives(cfg):
    """Build the alias table for prompt routing directives from configured
    models/routes, plus optional overrides. Idempotent; called from main()."""
    global _ROUTE_ALIASES, DIRECTIVES_ENABLED
    if not isinstance(cfg, dict):
        cfg = {}
    aliases = {}
    STOP = {"the", "real", "auto", "smart", "routing", "router", "worker", "experimental",
            "cursor", "oauth", "fast", "flash", "pro", "plus", "max", "mini", "via", "pay",
            "you", "model", "plan", "code", "chat", "api", "beta", "preview"}

    def add(token, rid):
        key = _norm_alias(token)