diff --git a/.agents/rules/ssh-keys-register-once-across-scopes.md b/.agents/rules/ssh-keys-register-once-across-scopes.md new file mode 100644 index 0000000..600c35c --- /dev/null +++ b/.agents/rules/ssh-keys-register-once-across-scopes.md @@ -0,0 +1,63 @@ +# Env-scoped SSH keys must register once across all scopes + +`program.Run` builds one `registry.BuildRegistry` — and therefore one +`hetzner.HetznerCompute` — per realization scope (the region-less global slice plus one per +region), all sharing the same Pulumi `ctx`. Hetzner SSH keys are **account-global** and named with no +region slug (`wardnet--key-user` / `wardnet--key-deploy`, via `naming.GlobalResource`). If +each scope's compute provider registers them independently, Pulumi fails the whole preview/up with +`Duplicate resource URN '…hcloud:index/sshKey:SshKey::wardnet--key-user'`. + +This is the sibling of `registry-provider-names-are-region-scoped.md`. There the fix was to make each +provider URN unique per scope (every scope legitimately needs its own provider). Here it is the +opposite: the SSH keys are a single env-scoped resource pair that must be created **exactly once +total**, not once per scope. + +So the SSH key cache must be **shared across every `HetznerCompute` of one program run**: +`program.Run` creates one `registry.NewSSHKeyCache()` and threads it through both `BuildRegistry` +calls → `registry.Compute` → `hetzner.NewCompute`. `ensureSshKeys` dedups against that shared +`*hetzner.SSHKeyCache`, so the first scope to provision a server registers the keys and every other +scope reuses the same key objects. + +### Owning provider must be scope-order-independent + +Which scope wins the dedup race depends on realization order (global slice first when it provisions a +server, else the sorted-first region) — so the keys must **not** be registered under that winning +scope's region-scoped provider (`hcloud-`). If they were, adding a region that sorts earlier, +removing the first region, or adding a compute-bearing global slice later would move the keys' provider +reference between runs, and Pulumi can replace an account-global resource every server depends on when +its provider changes. Instead `ensureSshKeys` registers them under a **dedicated, fixed-name** provider +(`hcloud-ssh-keys`, `SSHKeyCache.provider`), created once on the cache. Because the cache already +dedups key creation to a single caller, this provider is registered exactly once and never collides on +URN the way a per-scope `hcloud-` provider would — it is the deliberate exception to +`registry-provider-names-are-region-scoped.md`, valid **only** because it is account-global and +single-registration. + +This assumes **one Hetzner account per env**: the keys' env-scoped names carry no account dimension, +and the dedicated provider is created with a single token (the winning scope's). `regions.yaml` can +technically set a different `hetzner.apiToken` per region, but that is unsupported — the key would be +created in only one account and other-account servers would fail to reference it by name. + +## Applies to + +- `providers/hetzner/compute.go` — `SSHKeyCache` (incl. its dedicated `provider`), `NewSSHKeyCache`, + `HetznerCompute.sshKeys`, `ensureSshKeys`, `sshKeyProvider`. The keys carry no region-slug or + container label (`tags.HetznerLabels(..., "", "", ...)`) since they are env-scoped, so the single + shared key is labelled identically regardless of which scope creates it. Per-instance maps + (`firewalls`, `instanceCounters`) stay per-instance: firewalls are region-scoped (their names embed + the slug) so they never collide across scopes. +- `internal/registry/registry.go` — `BuildRegistry`'s `sshKeys` parameter and the `NewSSHKeyCache` + wrapper, threaded into `Compute` → `hetzner.NewCompute`. +- `program/program.go` — the single `registry.NewSSHKeyCache()` passed to every `BuildRegistry`. + +When adding any new account-global (region-less, env-scoped) Hetzner resource, give it the same +treatment: a shared cache so it registers once, a fixed (non-region-scoped) name, and a +scope-order-independent owning provider. Do **not** give it a region-scoped name like a regional +resource. + +## Why + +A single-scope config (one region, or global-only) never exercised the collision — one +`HetznerCompute`, one registration. The first config to provision a server in **both** the global +slice and a region — or in two regions — registered the env-scoped SSH keys twice under one URN and +failed at preview before any resource was touched. This was hit by the first real cross-scope +deployment (`wardnet-infrastructure` PR #31). diff --git a/internal/registry/registry.go b/internal/registry/registry.go index 011716f..7e5b241 100644 --- a/internal/registry/registry.go +++ b/internal/registry/registry.go @@ -45,6 +45,9 @@ type registry struct { dns *regions.DnsAuthority ssh types.SSHConfig regionTable regions.Table + // sshKeys is the cross-scope SSH key cache shared by every registry of one + // program run so env-scoped Hetzner SSH keys register under a single URN. + sshKeys *hetzner.SSHKeyCache hetznerProviderOnce sync.Once hetznerProvider *hcloud.Provider @@ -76,8 +79,10 @@ type registry struct { // to label cloud resources; eph carries the ADR-0028 ephemeral-env labels (zero // value for a static env). ctx is stored and used lazily when provider objects // are first constructed — it must be the context passed to the Pulumi program's -// run function. -func BuildRegistry(ctx *pulumi.Context, config map[string]map[string]any, dns *regions.DnsAuthority, ssh types.SSHConfig, regionTable regions.Table, project, env, region string, eph tags.Ephemeral) ProviderRegistry { +// run function. sshKeys is the cross-scope SSH key cache shared by every +// registry of one program run (see NewSSHKeyCache); pass nil for a standalone +// registry (e.g. tests) and hetzner.NewCompute allocates a private cache. +func BuildRegistry(ctx *pulumi.Context, config map[string]map[string]any, dns *regions.DnsAuthority, ssh types.SSHConfig, regionTable regions.Table, project, env, region string, eph tags.Ephemeral, sshKeys *hetzner.SSHKeyCache) ProviderRegistry { slug, _ := regionTable.Slug(region) // already validated by loader return ®istry{ ctx: ctx, @@ -90,9 +95,18 @@ func BuildRegistry(ctx *pulumi.Context, config map[string]map[string]any, dns *r dns: dns, ssh: ssh, regionTable: regionTable, + sshKeys: sshKeys, } } +// NewSSHKeyCache returns the shared SSH key cache program.Run threads into every +// BuildRegistry call so env-scoped Hetzner SSH keys register exactly once across +// all realization scopes. It wraps hetzner.NewSSHKeyCache so callers need not +// import the provider package directly. +func NewSSHKeyCache() *hetzner.SSHKeyCache { + return hetzner.NewSSHKeyCache() +} + // hetznerProv lazily creates the shared hcloud.Provider for the Hetzner // provider implementations. Subsequent calls return the cached instance. func (r *registry) hetznerProv() *hcloud.Provider { @@ -141,6 +155,7 @@ func (r *registry) Compute(name string) (types.ComputeProvider, error) { r.slug, r.eph, realizations, + r.sshKeys, ) }) return r.hetznerComp, nil diff --git a/internal/registry/registry_test.go b/internal/registry/registry_test.go index 99175f9..e27fbdf 100644 --- a/internal/registry/registry_test.go +++ b/internal/registry/registry_test.go @@ -66,7 +66,7 @@ func TestProviderResourceNamesAreScopePerRegistry(t *testing.T) { // Mirror program.Run: a global-slice registry and a regional registry over // the same context. Distinct region strings → distinct scopes. for _, region := range []string{"global", "us-east-1"} { - reg := BuildRegistry(ctx, config, ®ions.DnsAuthority{Zone: "z"}, types.SSHConfig{}, regions.Table{}, "proj", "prd", region, tags.Ephemeral{}) + reg := BuildRegistry(ctx, config, ®ions.DnsAuthority{Zone: "z"}, types.SSHConfig{}, regions.Table{}, "proj", "prd", region, tags.Ephemeral{}, nil) // Compute("hetzner") forces hcloud.NewProvider; DNS("cloudflare") forces // cf.NewProvider. Both are the singleton provider resources that collide. if _, cerr := reg.Compute("hetzner"); cerr != nil { @@ -90,6 +90,133 @@ func TestProviderResourceNamesAreScopePerRegistry(t *testing.T) { } } +// sshKeyMocks records, for every SSH key resource registered, its name and the +// provider reference it was registered under (args.Provider, whose URN embeds the +// provider resource name). NewResource also returns inline the IDs/outputs the +// compute Create path needs (numeric firewall ID, server IPv4), but those are not +// retained. +type sshKeyMocks struct { + mu sync.Mutex + sshNames []string + sshProviders []string +} + +func (m *sshKeyMocks) Call(pulumi.MockCallArgs) (resource.PropertyMap, error) { + return resource.PropertyMap{}, nil +} + +func (m *sshKeyMocks) NewResource(args pulumi.MockResourceArgs) (string, resource.PropertyMap, error) { + switch args.TypeToken { + case "hcloud:index/sshKey:SshKey": + m.mu.Lock() + m.sshNames = append(m.sshNames, args.Name) + m.sshProviders = append(m.sshProviders, args.Provider) + m.mu.Unlock() + return args.Name + "-id", resource.PropertyMap{ + "name": resource.NewStringProperty(args.Name), + "publicKey": resource.NewStringProperty("ssh-ed25519 AAAA test"), + }, nil + case "hcloud:index/firewall:Firewall": + // Firewall ID is parsed as an int in compute.Create. + return "42", resource.PropertyMap{"name": resource.NewStringProperty(args.Name)}, nil + case "hcloud:index/server:Server": + return args.Name + "-id", resource.PropertyMap{ + "name": resource.NewStringProperty(args.Name), + "ipv4Address": resource.NewStringProperty("1.2.3.4"), + }, nil + } + return args.Name + "-id", resource.PropertyMap{"name": resource.NewStringProperty(args.Name)}, nil +} + +// TestSshKeysRegisterOncePerEnvAcrossScopes guards the sibling of the +// provider-URN regression (TestProviderResourceNamesAreScopePerRegistry): Hetzner +// SSH keys are account-global and named with no region slug +// (`wardnet--key-{user,deploy}`, via naming.GlobalResource). program.Run +// builds one registry — and one HetznerCompute — per scope (the region-less global +// slice plus each region) over the same Pulumi context. If each scope's compute +// provider registered the keys independently, the second registration would reuse +// the URN and the real engine would fail with +// `Duplicate resource URN '…SshKey::wardnet--key-user'` — exactly the failure +// the first real cross-scope deployment hit. +// +// program.Run threads ONE shared SSH key cache into every BuildRegistry call. This +// drives two registries over one context through the real Compute().Create() path +// and asserts each env-scoped key is registered exactly once. (The mock monitor +// does not reject duplicate URNs, so we count registrations rather than rely on +// RunErr returning an error — same approach as the provider-name test above.) +// +// Reverting to a per-instance cache makes each scope register the keys again, so +// each name's count becomes 2 and this test fails. +func TestSshKeysRegisterOncePerEnvAcrossScopes(t *testing.T) { + config := map[string]map[string]any{ + "hetzner": { + "apiToken": "t", + "location": "ash", + "network_zone": "us-east", + "serverTypes": map[string]any{"SMALL": "cx23"}, + "images": map[string]any{"ubuntu-24.04": "ubuntu-24.04"}, + }, + } + // "global" is absent from the table → empty slug (region-less global slice), + // exactly as program.Run resolves it; "us-east-1" → "use1". + regionTable := regions.Table{"us-east-1": {Slug: "use1"}} + mocks := &sshKeyMocks{} + + net := types.NetworkOutputs{ + NetworkID: pulumi.String("99").ToStringOutput(), + SubnetID: pulumi.String("12345").ToStringOutput(), + } + spec := types.ComputeSpec{ + Name: "bridge", + Kind: "vm", + Container: "vpc", + Provider: "hetzner", + Network: "vpc", + Size: "SMALL", + Image: "ubuntu-24.04", + InstanceCount: 1, + } + + err := pulumi.RunErr(func(ctx *pulumi.Context) error { + // One shared cache, threaded into every registry exactly as program.Run does. + cache := NewSSHKeyCache() + for _, region := range []string{"global", "us-east-1"} { + reg := BuildRegistry(ctx, config, nil, types.SSHConfig{}, regionTable, "proj", "prd", region, tags.Ephemeral{}, cache) + comp, cerr := reg.Compute("hetzner") + if cerr != nil { + return cerr + } + // abstractRegion == the scope key, so ExtractRegionConfigs/ResolveRegion + // resolve the realization under that key. Distinct slugs keep the server + // and firewall names unique; only the env-scoped SSH keys collide. + if _, err := comp.Create(ctx, spec, net, "prd", region, "bridge.example.com", "", types.FirewallPorts{}); err != nil { + return err + } + } + return nil + }, pulumi.WithMocks("proj", "stack", mocks)) + require.NoError(t, err) + + counts := map[string]int{} + for _, n := range mocks.sshNames { + counts[n]++ + } + assert.Equal(t, 1, counts["wardnet-prd-key-user"], + "user SSH key must register once across scopes, got %v (duplicate-URN regression)", mocks.sshNames) + assert.Equal(t, 1, counts["wardnet-prd-key-deploy"], + "deploy SSH key must register once across scopes, got %v (duplicate-URN regression)", mocks.sshNames) + + // Each key must be registered under the dedicated, scope-independent provider + // (hcloud-ssh-keys) — NOT the winning scope's region-scoped provider + // (hcloud-global / hcloud-us-east-1). Pinning to a fixed provider name keeps + // the account-global key's owning provider stable when the scope set or + // realization order changes, avoiding a provider-change replacement. + for _, p := range mocks.sshProviders { + assert.Contains(t, p, "hcloud-ssh-keys", + "SSH keys must register under the dedicated hcloud-ssh-keys provider, got %q", p) + } +} + func dedupe(in []string) []string { seen := map[string]struct{}{} var out []string @@ -105,7 +232,7 @@ func dedupe(in []string) []string { func TestRegistryUnknownProvider(t *testing.T) { // nil ctx + nil regionTable: providers are built lazily, so construction is // not triggered during this test. Only the "unknown provider" paths are hit. - r := BuildRegistry(nil, map[string]map[string]any{"hetzner": {"apiToken": "x"}}, nil, types.SSHConfig{}, nil, "test-project", "test", "us-east-1", tags.Ephemeral{}) + r := BuildRegistry(nil, map[string]map[string]any{"hetzner": {"apiToken": "x"}}, nil, types.SSHConfig{}, nil, "test-project", "test", "us-east-1", tags.Ephemeral{}, nil) // "hetzner" is now a known network provider — must succeed. np, err := r.Network("hetzner") diff --git a/program/gate_test.go b/program/gate_test.go index 5f5797d..97a0b35 100644 --- a/program/gate_test.go +++ b/program/gate_test.go @@ -168,7 +168,7 @@ func TestTLSAndServiceShareOneGate(t *testing.T) { err := pulumi.RunErr(func(ctx *pulumi.Context) error { reg := registry.BuildRegistry(ctx, map[string]map[string]any{"hetzner": {"apiToken": "x"}}, - nil, types.SSHConfig{DeployPrivateKey: "priv"}, nil, "proj", "prd", "us-east-1", tags.Ephemeral{}) + nil, types.SSHConfig{DeployPrivateKey: "priv"}, nil, "proj", "prd", "us-east-1", tags.Ephemeral{}, nil) res := types.Resources{ Compute: []types.ComputeSpec{ {Name: "bridge", Provider: "hetzner", InstanceCount: 1, DeployUser: &types.DeployUserSpec{Name: "deploy"}}, diff --git a/program/global_test.go b/program/global_test.go index a69ffbb..fa265a8 100644 --- a/program/global_test.go +++ b/program/global_test.go @@ -86,7 +86,7 @@ func TestCreateInfraGlobalScope(t *testing.T) { }, "neon": {"apiKey": "k", "region": "aws-us-east-2"}, } - reg := registry.BuildRegistry(ctx, config, nil, types.SSHConfig{}, regions.Table{}, "proj", "prd", globalScope, tags.Ephemeral{}) + reg := registry.BuildRegistry(ctx, config, nil, types.SSHConfig{}, regions.Table{}, "proj", "prd", globalScope, tags.Ephemeral{}, nil) res := types.Resources{ Network: []types.NetworkSpec{{ @@ -174,7 +174,7 @@ func TestGlobalServiceRealizesRegionLess(t *testing.T) { reg := registry.BuildRegistry(ctx, map[string]map[string]any{"hetzner": {"apiToken": "x"}}, ®ions.DnsAuthority{Provider: "cloudflare", Zone: "z"}, - types.SSHConfig{DeployPrivateKey: "priv"}, nil, "proj", "prd", globalScope, tags.Ephemeral{}) + types.SSHConfig{DeployPrivateKey: "priv"}, nil, "proj", "prd", globalScope, tags.Ephemeral{}, nil) res := types.Resources{ Compute: []types.ComputeSpec{ {Name: "tenants", Provider: "hetzner", InstanceCount: 1, DeployUser: &types.DeployUserSpec{Name: "deploy"}}, diff --git a/program/program.go b/program/program.go index a54da39..1764c87 100644 --- a/program/program.go +++ b/program/program.go @@ -155,9 +155,15 @@ func Run(ctx *pulumi.Context) error { } ctx.Export("appDeployDescriptor", pulumi.Any(appDesc)) + // Env-scoped Hetzner SSH keys (wardnet--key-{user,deploy}) carry no + // region slug, so every scope's compute provider would register the same URN. + // Share one cache across all registries this run builds so the keys register + // exactly once (see .agents/rules/ssh-keys-register-once-across-scopes.md). + sshKeyCache := registry.NewSSHKeyCache() + registries := make(map[string]registry.ProviderRegistry, len(regionNames)) for _, region := range regionNames { - registries[region] = registry.BuildRegistry(ctx, regionTable[region].Providers, regionTable[region].Dns, vars.SSH, regionTable, ctx.Project(), env, region, eph) + registries[region] = registry.BuildRegistry(ctx, regionTable[region].Providers, regionTable[region].Dns, vars.SSH, regionTable, ctx.Project(), env, region, eph, sshKeyCache) } // networkOutputs: region → specName+"/"+subnetName → NetworkOutputs. The @@ -202,7 +208,7 @@ func Run(ctx *pulumi.Context) error { if authority == nil && len(derivedRecords(globalRes, env, "", vars.BaseDomain, ephemeralSlug)) > 0 { return fmt.Errorf("global slice realizes DNS records but its placementRegion %q has no dns: authority — declare a dns: block on that region in regions.yaml", globalBlock.PlacementRegion) } - globalReg := registry.BuildRegistry(ctx, globalBlock.Providers, authority, vars.SSH, regionTable, ctx.Project(), env, globalScope, eph) + globalReg := registry.BuildRegistry(ctx, globalBlock.Providers, authority, vars.SSH, regionTable, ctx.Project(), env, globalScope, eph, sshKeyCache) scopes = append(scopes, scope{key: globalScope, slug: "", reg: globalReg, authority: authority, res: globalRes}) } for _, region := range regionNames { diff --git a/providers/hetzner/compute.go b/providers/hetzner/compute.go index 76c55d1..cc39500 100644 --- a/providers/hetzner/compute.go +++ b/providers/hetzner/compute.go @@ -17,9 +17,41 @@ import ( "github.com/wardnet/inforge/internal/types" ) +// SSHKeyCache deduplicates env-scoped SSH key creation across every +// HetznerCompute instance a single program run builds. program.Run builds one +// registry — and therefore one HetznerCompute — per realization scope (the +// region-less global slice plus one per region). SSH keys are Hetzner-account- +// global resources named only `wardnet--key-{user,deploy}` (no region +// slug), so each scope would otherwise register the same URN and fail the +// preview with "Duplicate resource URN". Sharing one cache across all instances +// makes ensureSshKeys create each key exactly once, under a single dedicated +// provider. See .agents/rules/ssh-keys-register-once-across-scopes.md. +type SSHKeyCache struct { + mu sync.Mutex + // keys is keyed by env since SSH keys are env-scoped (not region-scoped). + keys map[string][]*hcloud.SshKey + // provider is the dedicated hcloud provider the account-global SSH keys are + // registered under, created once on first use with a fixed (scope-independent) + // resource name. Pinning the keys to a stable provider name — rather than the + // first realizing scope's region-scoped provider — keeps their owning provider + // stable across runs even if the scope set or realization order changes; + // otherwise Pulumi could see the provider reference move between runs and + // replace an account-global resource every server depends on. It assumes one + // Hetzner account per env (the keys' env-scoped names carry no account + // dimension). nil in tests, where the instance has no provider. + provider *hcloud.Provider +} + +// NewSSHKeyCache returns an empty SSH key cache to thread through every +// BuildRegistry/NewCompute call of one program run. +func NewSSHKeyCache() *SSHKeyCache { + return &SSHKeyCache{keys: map[string][]*hcloud.SshKey{}} +} + // HetznerCompute implements types.ComputeProvider for Hetzner Cloud. One -// instance is shared per registry (i.e. per region). Firewalls and SSH keys -// are deduplicated across multiple Create calls via mutex-protected maps. +// instance is shared per registry (i.e. per region/scope). Firewalls are +// deduplicated per instance via a mutex-protected map; env-scoped SSH keys are +// deduplicated across instances via a shared SSHKeyCache. type HetznerCompute struct { sshAuthorizedKeys string deployPublicKey string @@ -30,8 +62,9 @@ type HetznerCompute struct { eph tags.Ephemeral mu sync.Mutex firewalls map[string]*hcloud.Firewall - // sshKeys is keyed by env since SSH keys are env-scoped (not region-scoped). - sshKeys map[string][]*hcloud.SshKey + // sshKeys is shared across every scope's HetznerCompute so env-scoped SSH + // keys register once across the whole program run (see SSHKeyCache). + sshKeys *SSHKeyCache // instanceCounters tracks how many servers have been created for each // spec name so that Create assigns each call its unique specKey suffix. instanceCounters map[string]int @@ -43,11 +76,16 @@ type HetznerCompute struct { // naming. eph carries the ADR-0028 ephemeral-env labels (zero value for a static // env). regionOverrides is the output of ExtractRegionConfigs (the per-region // realizations) and may be nil — Create then fails closed for any region that -// has no realization. -func NewCompute(sshAuthorizedKeys, deployPublicKey, apiToken string, provider *hcloud.Provider, project, slug string, eph tags.Ephemeral, regionOverrides map[string]RegionConfig) *HetznerCompute { +// has no realization. sshKeys is the cross-scope SSH key cache shared by every +// HetznerCompute of one program run; pass nil for a standalone instance (e.g. +// tests) and a private cache is allocated. +func NewCompute(sshAuthorizedKeys, deployPublicKey, apiToken string, provider *hcloud.Provider, project, slug string, eph tags.Ephemeral, regionOverrides map[string]RegionConfig, sshKeys *SSHKeyCache) *HetznerCompute { if regionOverrides == nil { regionOverrides = map[string]RegionConfig{} } + if sshKeys == nil { + sshKeys = NewSSHKeyCache() + } return &HetznerCompute{ sshAuthorizedKeys: sshAuthorizedKeys, deployPublicKey: deployPublicKey, @@ -57,7 +95,7 @@ func NewCompute(sshAuthorizedKeys, deployPublicKey, apiToken string, provider *h slug: slug, eph: eph, firewalls: map[string]*hcloud.Firewall{}, - sshKeys: map[string][]*hcloud.SshKey{}, + sshKeys: sshKeys, instanceCounters: map[string]int{}, regions: regionOverrides, } @@ -292,43 +330,81 @@ func (h *HetznerCompute) ensureFirewall(ctx *pulumi.Context, spec types.ComputeS } // ensureSshKeys returns the [user, deploy] SSH key pair for env, creating them -// if they do not yet exist. SSH keys are env-scoped (not region-scoped). It is -// safe to call concurrently. Index 0 is the user authorized-keys key; index 1 -// is the deploy public key. +// if they do not yet exist, under the cache's dedicated provider so their owning +// provider is independent of which scope realizes first. SSH keys are env-scoped +// (not region-scoped). It is safe to call concurrently. Index 0 is the user +// authorized-keys key; index 1 is the deploy public key. func (h *HetznerCompute) ensureSshKeys(ctx *pulumi.Context, env string) ([]*hcloud.SshKey, error) { - h.mu.Lock() - defer h.mu.Unlock() + h.sshKeys.mu.Lock() + defer h.sshKeys.mu.Unlock() - if keys, ok := h.sshKeys[env]; ok { + if keys, ok := h.sshKeys.keys[env]; ok { return keys, nil } - // SSH keys are env-scoped, not container-scoped: omit container label. - keyLabels := toStringMap(tags.HetznerLabels(h.project, env, h.slug, "", h.eph)) + prov, err := h.sshKeyProvider(ctx) + if err != nil { + return nil, fmt.Errorf("create ssh key provider: %w", err) + } + + // SSH keys are env-scoped, not region- or container-scoped: omit both the + // region-slug and container labels so the single shared key carries no + // scope-specific label regardless of which scope creates it. + keyLabels := toStringMap(tags.HetznerLabels(h.project, env, "", "", h.eph)) userKeyName := naming.GlobalResource(env, "key", "user") - userKey, err := h.newOrImportSshKey(ctx, userKeyName, h.sshAuthorizedKeys, keyLabels) + userKey, err := h.newOrImportSshKey(ctx, prov, userKeyName, h.sshAuthorizedKeys, keyLabels) if err != nil { return nil, fmt.Errorf("create user ssh key %s: %w", userKeyName, err) } deployKeyName := naming.GlobalResource(env, "key", "deploy") - deployKey, err := h.newOrImportSshKey(ctx, deployKeyName, h.deployPublicKey, keyLabels) + deployKey, err := h.newOrImportSshKey(ctx, prov, deployKeyName, h.deployPublicKey, keyLabels) if err != nil { return nil, fmt.Errorf("create deploy ssh key %s: %w", deployKeyName, err) } keys := []*hcloud.SshKey{userKey, deployKey} - h.sshKeys[env] = keys + h.sshKeys.keys[env] = keys return keys, nil } -// newOrImportSshKey creates an SSH key in Hetzner, importing the existing one -// if a key with the same name is already present (adopt-or-create, idempotent). -// It uses a direct hcloud API call rather than Pulumi's LookupSshKey invoke, -// which logs engine-level errors on 404 and fails the stack even when caught. -func (h *HetznerCompute) newOrImportSshKey(ctx *pulumi.Context, name, publicKey string, labels pulumi.StringMap) (*hcloud.SshKey, error) { - opts := h.providerOpts() +// sshKeyProvider returns the dedicated, stably-named hcloud provider the +// account-global SSH keys register under, creating it once (memoised on the +// shared cache). Its resource name is fixed (not region-scoped) so the keys' +// owning provider does not move when the scope set or realization order changes. +// Because the cache dedups key creation to a single caller, this provider is +// registered exactly once — it never collides on URN the way a per-scope provider +// would (see .agents/rules/registry-provider-names-are-region-scoped.md). Returns +// nil when the instance has no provider (tests), so keys register provider-less +// as before. Callers must hold h.sshKeys.mu. +func (h *HetznerCompute) sshKeyProvider(ctx *pulumi.Context) (*hcloud.Provider, error) { + if h.provider == nil { + return nil, nil + } + if h.sshKeys.provider != nil { + return h.sshKeys.provider, nil + } + p, err := hcloud.NewProvider(ctx, "hcloud-ssh-keys", &hcloud.ProviderArgs{ + Token: pulumi.String(h.apiToken), + }) + if err != nil { + return nil, err + } + h.sshKeys.provider = p + return p, nil +} + +// newOrImportSshKey creates an SSH key in Hetzner under prov, importing the +// existing one if a key with the same name is already present (adopt-or-create, +// idempotent). It uses a direct hcloud API call rather than Pulumi's +// LookupSshKey invoke, which logs engine-level errors on 404 and fails the stack +// even when caught. +func (h *HetznerCompute) newOrImportSshKey(ctx *pulumi.Context, prov *hcloud.Provider, name, publicKey string, labels pulumi.StringMap) (*hcloud.SshKey, error) { + var opts []pulumi.ResourceOption + if prov != nil { + opts = append(opts, pulumi.Provider(prov)) + } if id, err := h.lookupSshKeyID(name); err == nil && id != 0 { opts = append(opts, pulumi.Import(pulumi.ID(strconv.Itoa(id)))) } diff --git a/providers/hetzner/compute_test.go b/providers/hetzner/compute_test.go index 4ec26d1..2a4a76a 100644 --- a/providers/hetzner/compute_test.go +++ b/providers/hetzner/compute_test.go @@ -56,7 +56,7 @@ func (m *computeMocks) NewResource(args pulumi.MockResourceArgs) (string, resour func TestEnsureFirewallIdempotency(t *testing.T) { err := pulumi.RunErr(func(ctx *pulumi.Context) error { - h := NewCompute("ssh-ed25519 user", "ssh-ed25519 deploy", "", nil, "test-project", "use1", tags.Ephemeral{}, nil) + h := NewCompute("ssh-ed25519 user", "ssh-ed25519 deploy", "", nil, "test-project", "use1", tags.Ephemeral{}, nil, nil) bridgeSpec := types.ComputeSpec{Name: "bridge", Container: "vpc", Provider: "hetzner"} fw1, err := h.ensureFirewall(ctx, bridgeSpec, "prod", types.FirewallPorts{}) @@ -110,7 +110,7 @@ func (m *fwCaptureMocks) NewResource(args pulumi.MockResourceArgs) (string, reso func TestEnsureFirewallSourceScoping(t *testing.T) { mocks := &fwCaptureMocks{} err := pulumi.RunErr(func(ctx *pulumi.Context) error { - h := NewCompute("ssh-ed25519 user", "ssh-ed25519 deploy", "", nil, "test-project", "use1", tags.Ephemeral{}, nil) + h := NewCompute("ssh-ed25519 user", "ssh-ed25519 deploy", "", nil, "test-project", "use1", tags.Ephemeral{}, nil, nil) spec := types.ComputeSpec{Name: "back", Container: "vpc", Provider: "hetzner"} _, err := h.ensureFirewall(ctx, spec, "prod", types.FirewallPorts{ Public: []int{443}, @@ -148,7 +148,7 @@ func TestEnsureFirewallSourceScoping(t *testing.T) { func TestEnsureFirewallExposedPorts(t *testing.T) { mocks := &fwCaptureMocks{} err := pulumi.RunErr(func(ctx *pulumi.Context) error { - h := NewCompute("ssh-ed25519 user", "ssh-ed25519 deploy", "", nil, "test-project", "use1", tags.Ephemeral{}, nil) + h := NewCompute("ssh-ed25519 user", "ssh-ed25519 deploy", "", nil, "test-project", "use1", tags.Ephemeral{}, nil, nil) spec := types.ComputeSpec{Name: "edge", Container: "vpc", Provider: "hetzner"} _, err := h.ensureFirewall(ctx, spec, "prod", types.FirewallPorts{ PrivateExposed: []types.ExposedPort{ @@ -187,7 +187,7 @@ func TestEnsureFirewallExposedPorts(t *testing.T) { func TestEnsureFirewallCustomRules(t *testing.T) { err := pulumi.RunErr(func(ctx *pulumi.Context) error { - h := NewCompute("ssh-ed25519 user", "ssh-ed25519 deploy", "", nil, "test-project", "use1", tags.Ephemeral{}, nil) + h := NewCompute("ssh-ed25519 user", "ssh-ed25519 deploy", "", nil, "test-project", "use1", tags.Ephemeral{}, nil, nil) spec := types.ComputeSpec{ Name: "bridge", @@ -215,7 +215,7 @@ func TestEnsureFirewallCustomRules(t *testing.T) { func TestComputeCreateWithCustomFirewall(t *testing.T) { err := pulumi.RunErr(func(ctx *pulumi.Context) error { - h := NewCompute("ssh-ed25519 user", "ssh-ed25519 deploy", "", nil, "test-project", "use1", tags.Ephemeral{}, useEast1()) + h := NewCompute("ssh-ed25519 user", "ssh-ed25519 deploy", "", nil, "test-project", "use1", tags.Ephemeral{}, useEast1(), nil) net := types.NetworkOutputs{ NetworkID: pulumi.String("99").ToStringOutput(), @@ -253,7 +253,7 @@ func TestComputeCreateWithCustomFirewall(t *testing.T) { func TestEnsureSshKeysIdempotency(t *testing.T) { err := pulumi.RunErr(func(ctx *pulumi.Context) error { - h := NewCompute("ssh-ed25519 user", "ssh-ed25519 deploy", "", nil, "test-project", "use1", tags.Ephemeral{}, nil) + h := NewCompute("ssh-ed25519 user", "ssh-ed25519 deploy", "", nil, "test-project", "use1", tags.Ephemeral{}, nil, nil) // SSH keys are env-scoped: same env must return the same keys. keys1, err := h.ensureSshKeys(ctx, "prod") @@ -282,11 +282,42 @@ func TestEnsureSshKeysIdempotency(t *testing.T) { require.NoError(t, err) } +// TestEnsureSshKeysSharedAcrossInstances reproduces the multi-scope bug: a +// program run builds one HetznerCompute per scope (the global slice plus each +// region), all over the same Pulumi context. Env-scoped SSH keys carry no region +// slug, so two instances registering them independently collide on the same URN. +// Sharing one SSHKeyCache must make the keys register exactly once — both +// instances return the identical key objects (a second NewSshKey under the same +// URN would otherwise fail the run). +func TestEnsureSshKeysSharedAcrossInstances(t *testing.T) { + err := pulumi.RunErr(func(ctx *pulumi.Context) error { + cache := NewSSHKeyCache() + // Distinct slugs model the global slice ("") and a region ("use1"). + global := NewCompute("ssh-ed25519 user", "ssh-ed25519 deploy", "", nil, "test-project", "", tags.Ephemeral{}, nil, cache) + regional := NewCompute("ssh-ed25519 user", "ssh-ed25519 deploy", "", nil, "test-project", "use1", tags.Ephemeral{}, nil, cache) + + keysGlobal, err := global.ensureSshKeys(ctx, "prod") + if err != nil { + return err + } + keysRegional, err := regional.ensureSshKeys(ctx, "prod") + if err != nil { + return err + } + if keysGlobal[0] != keysRegional[0] || keysGlobal[1] != keysRegional[1] { + t.Error("shared SSHKeyCache must return the same key objects across instances (keys registered twice)") + } + return nil + }, pulumi.WithMocks("inforge", "test", &computeMocks{})) + + require.NoError(t, err) +} + // ---- Create smoke test ------------------------------------------------------- func TestComputeCreateReturnsPublicIP(t *testing.T) { err := pulumi.RunErr(func(ctx *pulumi.Context) error { - h := NewCompute("ssh-ed25519 user", "ssh-ed25519 deploy", "", nil, "test-project", "use1", tags.Ephemeral{}, useEast1()) + h := NewCompute("ssh-ed25519 user", "ssh-ed25519 deploy", "", nil, "test-project", "use1", tags.Ephemeral{}, useEast1(), nil) // Synthesise a NetworkOutputs with a known subnet ID. subnetID := pulumi.String("12345").ToStringOutput() @@ -319,7 +350,7 @@ func TestComputeCreateReturnsPublicIP(t *testing.T) { func TestComputeCreateInstanceCounterIncrement(t *testing.T) { err := pulumi.RunErr(func(ctx *pulumi.Context) error { - h := NewCompute("ssh-ed25519 user", "ssh-ed25519 deploy", "", nil, "test-project", "use1", tags.Ephemeral{}, useEast1()) + h := NewCompute("ssh-ed25519 user", "ssh-ed25519 deploy", "", nil, "test-project", "use1", tags.Ephemeral{}, useEast1(), nil) net := types.NetworkOutputs{ NetworkID: pulumi.String("99").ToStringOutput(), @@ -390,7 +421,7 @@ func (m *capturingMocks) NewResource(args pulumi.MockResourceArgs) (string, reso func TestComputeCreateUsesRealization(t *testing.T) { mocks := &capturingMocks{} err := pulumi.RunErr(func(ctx *pulumi.Context) error { - h := NewCompute("ssh-ed25519 user", "ssh-ed25519 deploy", "", nil, "test-project", "use1", tags.Ephemeral{}, useEast1()) + h := NewCompute("ssh-ed25519 user", "ssh-ed25519 deploy", "", nil, "test-project", "use1", tags.Ephemeral{}, useEast1(), nil) net := types.NetworkOutputs{ NetworkID: pulumi.String("99").ToStringOutput(), SubnetID: pulumi.String("12345").ToStringOutput(), @@ -409,7 +440,7 @@ func TestComputeCreateUsesRealization(t *testing.T) { func TestComputeCreateUnknownRegionReturnsError(t *testing.T) { err := pulumi.RunErr(func(ctx *pulumi.Context) error { - h := NewCompute("ssh-ed25519 user", "ssh-ed25519 deploy", "", nil, "test-project", "use1", tags.Ephemeral{}, useEast1()) + h := NewCompute("ssh-ed25519 user", "ssh-ed25519 deploy", "", nil, "test-project", "use1", tags.Ephemeral{}, useEast1(), nil) net := types.NetworkOutputs{ NetworkID: pulumi.String("99").ToStringOutput(), SubnetID: pulumi.String("12345").ToStringOutput(), @@ -432,7 +463,7 @@ func TestComputeCreateUnknownRegionReturnsError(t *testing.T) { func TestComputeCreateUnknownSizeReturnsError(t *testing.T) { err := pulumi.RunErr(func(ctx *pulumi.Context) error { - h := NewCompute("ssh-ed25519 user", "ssh-ed25519 deploy", "", nil, "test-project", "use1", tags.Ephemeral{}, useEast1()) + h := NewCompute("ssh-ed25519 user", "ssh-ed25519 deploy", "", nil, "test-project", "use1", tags.Ephemeral{}, useEast1(), nil) net := types.NetworkOutputs{ NetworkID: pulumi.String("99").ToStringOutput(), SubnetID: pulumi.String("12345").ToStringOutput(), @@ -455,7 +486,7 @@ func TestComputeCreateUnknownSizeReturnsError(t *testing.T) { func TestComputeCreateUnknownImageReturnsError(t *testing.T) { err := pulumi.RunErr(func(ctx *pulumi.Context) error { - h := NewCompute("ssh-ed25519 user", "ssh-ed25519 deploy", "", nil, "test-project", "use1", tags.Ephemeral{}, useEast1()) + h := NewCompute("ssh-ed25519 user", "ssh-ed25519 deploy", "", nil, "test-project", "use1", tags.Ephemeral{}, useEast1(), nil) net := types.NetworkOutputs{ NetworkID: pulumi.String("99").ToStringOutput(), SubnetID: pulumi.String("12345").ToStringOutput(),