diff --git a/cmd/stepsecurity-dev-machine-guard/main.go b/cmd/stepsecurity-dev-machine-guard/main.go index 3816133..d999db3 100644 --- a/cmd/stepsecurity-dev-machine-guard/main.go +++ b/cmd/stepsecurity-dev-machine-guard/main.go @@ -92,6 +92,11 @@ func main() { if cfg.EnableNPMScan == nil && config.EnableNPMScan != nil { cfg.EnableNPMScan = config.EnableNPMScan } + // --legacy-node-scan / --disk-node-scan override the config-file value + // (which config.Load already applied to config.UseLegacyNodeScan). + if cfg.UseLegacyNodeScan != nil { + config.UseLegacyNodeScan = *cfg.UseLegacyNodeScan + } if cfg.EnableBrewScan == nil && config.EnableBrewScan != nil { cfg.EnableBrewScan = config.EnableBrewScan } diff --git a/internal/cli/cli.go b/internal/cli/cli.go index ac06880..8e01324 100644 --- a/internal/cli/cli.go +++ b/internal/cli/cli.go @@ -27,6 +27,7 @@ type Config struct { InstallDir string // --install-dir=DIR base install directory; all non-bootstrap files (logs, hook errors, binary placement) live under this dir. "" w/ InstallDirSet=true means "explicitly disabled" (no file logging). 
InstallDirSet bool // true if --install-dir was passed (empty value = disable file logging for this run) EnableNPMScan *bool // nil=auto, true/false=explicit + UseLegacyNodeScan *bool // nil=auto (disk scan); true=command path, false=disk path EnableBrewScan *bool // nil=auto, true/false=explicit EnablePythonScan *bool // nil=auto, true/false=explicit IncludeBundledPlugins bool // --include-bundled-plugins: include bundled/platform plugins in output @@ -170,6 +171,12 @@ func Parse(args []string) (*Config, error) { case arg == "--disable-npm-scan": v := false cfg.EnableNPMScan = &v + case arg == "--legacy-node-scan": + v := true + cfg.UseLegacyNodeScan = &v + case arg == "--disk-node-scan": + v := false + cfg.UseLegacyNodeScan = &v case arg == "--enable-brew-scan": v := true cfg.EnableBrewScan = &v diff --git a/internal/config/config.go b/internal/config/config.go index 6f3aacf..9256bde 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -34,6 +34,17 @@ var ( // STEPSEC_ENABLE_SCAN_STATE=1) to opt back in. STEPSEC_DISABLE_SCAN_STATE=1 // always forces legacy. UseLegacyPackageScan = true + + // UseLegacyNodeScan, when true, reverts Node.js package discovery to the + // command-based path (`npm ls` / `yarn list` / `pnpm ls` / `bun pm ls`, + // shipped as raw output for the backend to parse). Defaults to false: Node + // packages are read from on-disk lockfiles (and node_modules as a fallback) + // with no package-manager subprocess. Set use_legacy_node_scan=true in + // config.json (or --legacy-node-scan) to opt back into the command path. + // + // Independent of UseLegacyPackageScan above (which gates the delta-upload + // optimization, not the disk-vs-command source). 
+ UseLegacyNodeScan = false ) // MaxExecutionDuration is the whole-process execution-watchdog limit @@ -64,6 +75,7 @@ type ConfigFile struct { InstallDir string `json:"install_dir,omitempty"` MaxExecutionDuration string `json:"max_execution_duration,omitempty"` UseLegacyPackageScan *bool `json:"use_legacy_package_scan,omitempty"` + UseLegacyNodeScan *bool `json:"use_legacy_node_scan,omitempty"` } // userConfigDir returns ~/.stepsecurity — the per-user config location. @@ -199,6 +211,9 @@ func Load() { if cfg.UseLegacyPackageScan != nil { UseLegacyPackageScan = *cfg.UseLegacyPackageScan } + if cfg.UseLegacyNodeScan != nil { + UseLegacyNodeScan = *cfg.UseLegacyNodeScan + } } // IsEnterpriseMode returns true if valid enterprise credentials are configured. @@ -513,6 +528,7 @@ func ShowConfigure() { fmt.Printf(" %-24s %s\n", "Max Execution Duration:", displayMaxExecution(cfg.MaxExecutionDuration)) fmt.Printf(" %-24s %s\n", "Search Directories:", displayDirs(cfg.SearchDirs)) fmt.Printf(" %-24s %s\n", "Enable NPM Scan:", displayBoolScan(cfg.EnableNPMScan)) + fmt.Printf(" %-24s %s\n", "Legacy Node Scan:", displayBoolScan(cfg.UseLegacyNodeScan)) fmt.Printf(" %-24s %s\n", "Enable Brew Scan:", displayBoolScan(cfg.EnableBrewScan)) fmt.Printf(" %-24s %s\n", "Enable Python Scan:", displayBoolScan(cfg.EnablePythonScan)) fmt.Printf(" %-24s %s\n", "Scan TCC-Protected Dirs:", displayTCC(cfg.IncludeTCCProtected)) diff --git a/internal/detector/nodedist.go b/internal/detector/nodedist.go new file mode 100644 index 0000000..82f60d3 --- /dev/null +++ b/internal/detector/nodedist.go @@ -0,0 +1,204 @@ +// Disk-based Node.js package discovery. +// +// NodeDistDetector inventories installed Node packages by parsing on-disk +// lockfiles (package-lock.json / npm-shrinkwrap.json, pnpm-lock.yaml, +// yarn.lock, bun.lock) and, as a fallback, node_modules/**/package.json — +// instead of running `npm ls` / `yarn list` / `pnpm ls` / `bun pm ls`. 
+// +// Why parse instead of exec: +// - Robust: no dependency on a working PM binary, correct PATH under +// launchd/systemd, network access, or a non-broken interpreter. A +// package manager that errors or hangs can't drop a project to zero. +// - Complete: lockfiles carry the FULL resolved graph; the command path +// truncated transitive deps at --depth=3. +// - Read-only: never executes project code (no postinstall, no PnP loader). +// +// Output is intentionally minimal — {name, version, is_direct} — matching +// exactly what the backend persists (DeviceNPMPackageUsageInfo). The lockfile +// is the source of truth for the resolved set and for direct-vs-transitive; +// callers gate on an actual install (node_modules / PnP present) so a project +// that was never installed isn't reported. +// +// Security context: all reads go through the Executor (so the user-aware +// executor and test mocks both apply) and are size-bounded via maxLockfileSize +// before the bytes are pulled into memory. The node_modules walk uses +// filepath.WalkDir directly (matching nodeproject.go) and never follows +// directory symlinks, so a symlinked dependency can't redirect the walk out of +// the project tree. +package detector + +import ( + "path/filepath" + "sort" + + "github.com/step-security/dev-machine-guard/internal/executor" + "github.com/step-security/dev-machine-guard/internal/model" + "github.com/step-security/dev-machine-guard/internal/progress" + "github.com/step-security/dev-machine-guard/internal/tcc" +) + +// maxLockfileSize bounds a single lockfile / package.json read. Real lockfiles +// for large monorepos run into the low tens of MiB; this ceiling only guards +// against a pathological or hostile file exhausting memory. +const maxLockfileSize = 64 << 20 // 64 MiB + +// NodeDistDetector parses installed Node packages from disk, with no +// package-manager subprocess. 
+type NodeDistDetector struct { + exec executor.Executor + log *progress.Logger + skipper *tcc.Skipper + maxFileSize int64 +} + +func NewNodeDistDetector(exec executor.Executor) *NodeDistDetector { + return &NodeDistDetector{exec: exec, log: progress.NewNoop(), maxFileSize: maxLockfileSize} +} + +// WithSkipper attaches a TCC skipper so the node_modules fallback walk skips +// macOS-protected directories. A nil skipper is a no-op. Returns the detector +// for chaining. +func (d *NodeDistDetector) WithSkipper(s *tcc.Skipper) *NodeDistDetector { + d.skipper = s + return d +} + +// WithLogger attaches a progress logger. A nil logger falls back to the no-op +// default. Returns the detector for chaining. +func (d *NodeDistDetector) WithLogger(log *progress.Logger) *NodeDistDetector { + if log != nil { + d.log = log + } + return d +} + +// ScanProject returns the packages installed for a project, parsed from disk. +// +// pm is the package manager already detected for the project (see +// DetectProjectPM). It selects which lockfile parser to try first; if that +// lockfile is absent or unparseable, ScanProject falls back to walking +// node_modules. The result is de-duplicated by (name, version) and sorted by +// name then version for stable output. +func (d *NodeDistDetector) ScanProject(projectDir, pm string) []model.NodePackage { + var pkgs []model.NodePackage + + switch pm { + case "bun": + pkgs = d.parseFirstPresent(projectDir, d.parseBunLock, "bun.lock") + case "pnpm": + pkgs = d.parseFirstPresent(projectDir, d.parsePnpmLock, "pnpm-lock.yaml") + case "yarn", "yarn-berry": + pkgs = d.parseFirstPresent(projectDir, d.parseYarnLock, "yarn.lock") + default: // "npm" and anything unrecognised + pkgs = d.parseFirstPresent(projectDir, d.parsePackageLock, "package-lock.json", "npm-shrinkwrap.json") + } + + // Fallback: no parseable lockfile (e.g. bun.lockb binary format, a yarn + // PnP project whose yarn.lock we couldn't read, or a tree installed + // without a lockfile). 
Read whatever is actually on disk in node_modules. + if len(pkgs) == 0 { + pkgs = d.walkNodeModules(projectDir) + } + + return dedupSortPackages(pkgs) +} + +// lockfileParser parses one lockfile's bytes into packages. directNames carries +// the project's declared direct dependencies (from package.json), used by +// parsers whose lockfile format does not itself encode directness. +type lockfileParser func(data []byte, directNames map[string]struct{}) []model.NodePackage + +// parseFirstPresent reads direct-dependency names once, then tries each +// candidate lockfile name in priority order, returning the first that parses +// to a non-empty package set. +func (d *NodeDistDetector) parseFirstPresent(projectDir string, parse lockfileParser, candidates ...string) []model.NodePackage { + directNames := d.directDepNames(projectDir) + for _, name := range candidates { + data, ok := d.readBounded(filepath.Join(projectDir, name)) + if !ok { + continue + } + if pkgs := parse(data, directNames); len(pkgs) > 0 { + return pkgs + } + } + return nil +} + +// readBounded reads path through the executor, rejecting files larger than the +// size cap. The size is checked via Stat before the read so a pathological +// file is never pulled into memory; the post-read length check is a race-safety +// fallback (the file can grow between Stat and ReadFile). Returns ok=false for +// a missing, oversized, or unreadable file. 
+func (d *NodeDistDetector) readBounded(path string) (data []byte, ok bool) { + if d.maxFileSize > 0 { + if info, err := d.exec.Stat(path); err == nil && info.Size() > d.maxFileSize { + d.log.Debug("node disk scan: %s exceeds %d bytes — skipping", path, d.maxFileSize) + return nil, false + } + } + b, err := d.exec.ReadFile(path) + if err != nil { + return nil, false + } + if d.maxFileSize > 0 && int64(len(b)) > d.maxFileSize { + d.log.Debug("node disk scan: %s exceeds %d bytes — skipping", path, d.maxFileSize) + return nil, false + } + return b, true +} + +// directDepNames returns the set of dependency names declared directly in the +// project's package.json (dependencies + devDependencies + optional + peer). +// Used to mark is_direct for lockfile formats (yarn, bun) and the node_modules +// walk that don't otherwise distinguish direct from transitive. Returns an +// empty (non-nil) set when package.json is missing or unparseable, so callers +// can treat "not found" as "not direct" without nil checks. +func (d *NodeDistDetector) directDepNames(projectDir string) map[string]struct{} { + out := make(map[string]struct{}) + data, ok := d.readBounded(filepath.Join(projectDir, "package.json")) + if !ok { + return out + } + for _, m := range parsePackageJSONDepMaps(data) { + for name := range m { + out[name] = struct{}{} + } + } + return out +} + +// dedupSortPackages collapses duplicate (name, version) pairs — the same +// package can be reachable via multiple paths in a lockfile — and sorts by name +// then version for deterministic output. When duplicates disagree on +// directness, direct wins (a package that is a direct dependency anywhere is +// reported as direct). 
+func dedupSortPackages(pkgs []model.NodePackage) []model.NodePackage { + if len(pkgs) == 0 { + return nil + } + type key struct{ name, version string } + idx := make(map[key]int, len(pkgs)) + out := make([]model.NodePackage, 0, len(pkgs)) + for _, p := range pkgs { + if p.Name == "" || p.Version == "" { + continue + } + k := key{p.Name, p.Version} + if i, ok := idx[k]; ok { + if p.IsDirect { + out[i].IsDirect = true + } + continue + } + idx[k] = len(out) + out = append(out, p) + } + sort.Slice(out, func(i, j int) bool { + if out[i].Name == out[j].Name { + return out[i].Version < out[j].Version + } + return out[i].Name < out[j].Name + }) + return out +} diff --git a/internal/detector/nodedist_bun.go b/internal/detector/nodedist_bun.go new file mode 100644 index 0000000..f7b31de --- /dev/null +++ b/internal/detector/nodedist_bun.go @@ -0,0 +1,133 @@ +package detector + +import ( + "encoding/json" + "strings" + + "github.com/step-security/dev-machine-guard/internal/model" +) + +// parseBunLock extracts installed packages from a text bun.lock. +// +// bun.lock is JSONC (JSON with // and /* */ comments and trailing commas), so +// it is sanitised to strict JSON before unmarshalling. The `packages` object +// maps a dependency name to an entry whose first array element is the resolved +// "name@version" string (older entries may instead be an object with a +// `version` field). The binary bun.lockb is NOT handled here — the caller +// falls back to the node_modules walk for those. Directness comes from the +// project's declared deps (directNames). 
+func (d *NodeDistDetector) parseBunLock(data []byte, directNames map[string]struct{}) []model.NodePackage { + var lf struct { + Packages map[string]json.RawMessage `json:"packages"` + } + if err := json.Unmarshal(stripJSONC(data), &lf); err != nil { + d.log.Debug("node disk scan: bun.lock parse failed: %v", err) + return nil + } + out := make([]model.NodePackage, 0, len(lf.Packages)) + for key, raw := range lf.Packages { + name, version := decodeBunEntry(key, raw) + if name == "" || version == "" { + continue + } + _, direct := directNames[name] + out = append(out, model.NodePackage{Name: name, Version: version, IsDirect: direct}) + } + return out +} + +// decodeBunEntry resolves one packages-map entry to name and version. The +// canonical shape is an array whose first element is "name@version"; a legacy +// object shape carries {"version": "..."} and falls back to the map key for +// the name. +func decodeBunEntry(key string, raw json.RawMessage) (name, version string) { + var arr []json.RawMessage + if err := json.Unmarshal(raw, &arr); err == nil && len(arr) > 0 { + var spec string + if json.Unmarshal(arr[0], &spec) == nil { + if n, v := splitAtVersion(spec); v != "" { + return n, v + } + } + } + var obj struct { + Version string `json:"version"` + } + if json.Unmarshal(raw, &obj) == nil && obj.Version != "" { + return key, obj.Version + } + return "", "" +} + +// splitAtVersion splits a "name@version" spec, treating the first '@' after any +// @scope as the boundary ("@scope/pkg@1.2.3" -> "@scope/pkg", "1.2.3"). +func splitAtVersion(s string) (name, version string) { + searchFrom := 0 + if strings.HasPrefix(s, "@") { + searchFrom = 1 + } + if at := strings.IndexByte(s[searchFrom:], '@'); at >= 0 { + pos := searchFrom + at + return s[:pos], s[pos+1:] + } + return s, "" +} + +// stripJSONC removes // line comments, /* */ block comments, and trailing +// commas from JSONC, yielding strict JSON. 
// Comment markers inside string literals are preserved (string state is
// tracked, honouring backslash escapes). Each removed comment is replaced by a
// single separator byte (a newline for a line comment, a space for a block
// comment) so the tokens on either side never fuse. A comma is treated as
// trailing — and dropped — when only whitespace and comments separate it from
// a closing } or ]. This is a minimal sanitiser sufficient for bun.lock, not a
// general JSONC implementation.
func stripJSONC(in []byte) []byte {
	out := make([]byte, 0, len(in))
	inString, escaped := false, false
	for i := 0; i < len(in); i++ {
		c := in[i]
		if inString {
			// Copy string contents verbatim; only track where the string ends.
			out = append(out, c)
			switch {
			case escaped:
				escaped = false
			case c == '\\':
				escaped = true
			case c == '"':
				inString = false
			}
			continue
		}
		switch {
		case c == '"':
			inString = true
			out = append(out, c)
		case c == '/' && i+1 < len(in) && (in[i+1] == '/' || in[i+1] == '*'):
			end, sep := skipJSONCComment(in, i)
			out = append(out, sep)
			i = end - 1 // the loop's i++ resumes right after the comment
		case c == ',':
			// Look past whitespace AND comments: "1, // note\n}" has a
			// trailing comma just as much as "1,\n}" does.
			if next := skipJSONCBlank(in, i+1); next < len(in) && (in[next] == '}' || in[next] == ']') {
				continue // omit the trailing comma
			}
			out = append(out, c)
		default:
			out = append(out, c)
		}
	}
	return out
}

// skipJSONCComment consumes the comment that starts at in[start] (the caller
// guarantees in[start:start+2] is "//" or "/*"). It returns the index of the
// first byte after the comment and the byte that should stand in for it. An
// unterminated comment runs to the end of the input.
func skipJSONCComment(in []byte, start int) (end int, sep byte) {
	if in[start+1] == '/' {
		for end = start + 2; end < len(in); end++ {
			if in[end] == '\n' {
				return end + 1, '\n'
			}
		}
		return len(in), '\n'
	}
	for end = start + 2; end+1 < len(in); end++ {
		if in[end] == '*' && in[end+1] == '/' {
			return end + 2, ' '
		}
	}
	return len(in), ' '
}

// skipJSONCBlank returns the index of the first byte at or after from that is
// neither JSON whitespace nor part of a comment (len(in) if there is none).
func skipJSONCBlank(in []byte, from int) int {
	for from < len(in) {
		switch c := in[from]; {
		case c == ' ' || c == '\t' || c == '\n' || c == '\r':
			from++
		case c == '/' && from+1 < len(in) && (in[from+1] == '/' || in[from+1] == '*'):
			from, _ = skipJSONCComment(in, from)
		default:
			return from
		}
	}
	return from
}

// ---- diff file boundary (preserved verbatim) ----
// diff --git a/internal/detector/nodedist_global.go b/internal/detector/nodedist_global.go
// new file mode 100644
// index 0000000..bf8853d
// --- /dev/null
// +++ b/internal/detector/nodedist_global.go
// @@ -0,0 +1,114 @@
// +package detector
// +
// +import (
// +	"path/filepath"
// +
// +	"github.com/step-security/dev-machine-guard/internal/executor"
// +	"github.com/step-security/dev-machine-guard/internal/model"
// +)
// +

// nodeGlobalRoot is a global node_modules directory paired with the package
// manager that owns it.
+type nodeGlobalRoot struct { + pm string + dir string +} + +// NodeGlobalRoots enumerates global node_modules directories on the host, +// grouped by package manager, with no PM invocation (replaces `npm config get +// prefix` / `yarn global dir` / `pnpm root -g`). Only directories that exist +// are returned. +// +// Globals are scattered across version managers and install prefixes, so this +// is a best-effort sweep of the well-known locations rather than an exhaustive +// resolution of the user's active prefix — the same trade-off as the Python +// global-roots scan. Where a manager (nvm/fnm/volta) keeps per-version trees, +// every installed version's global dir is included. +func NodeGlobalRoots(exec executor.Executor) []nodeGlobalRoot { + var roots []nodeGlobalRoot + add := func(pm, dir string) { + if dir != "" && exec.DirExists(dir) { + roots = append(roots, nodeGlobalRoot{pm: pm, dir: dir}) + } + } + addGlob := func(pm, pattern string) { + if matches, err := exec.Glob(pattern); err == nil { + for _, m := range matches { + add(pm, m) + } + } + } + home := nodeHomeDir(exec) + + // --- npm: /lib/node_modules (POSIX) or /node_modules (Windows). 
--- + switch exec.GOOS() { + case model.PlatformDarwin, model.PlatformLinux: + add("npm", "/usr/local/lib/node_modules") + add("npm", "/usr/lib/node_modules") + add("npm", "/opt/homebrew/lib/node_modules") + if home != "" { + add("npm", filepath.Join(home, ".npm-global", "lib", "node_modules")) + addGlob("npm", filepath.Join(home, ".nvm", "versions", "node", "*", "lib", "node_modules")) + addGlob("npm", filepath.Join(home, ".volta", "tools", "image", "node", "*", "lib", "node_modules")) + addGlob("npm", filepath.Join(home, ".local", "share", "fnm", "node-versions", "*", "installation", "lib", "node_modules")) + } + case model.PlatformWindows: + if appData := exec.Getenv("APPDATA"); appData != "" { + add("npm", filepath.Join(appData, "npm", "node_modules")) + } + } + // Honor an explicit prefix override regardless of OS. + for _, env := range []string{"npm_config_prefix", "PREFIX"} { + if p := exec.Getenv(env); p != "" { + add("npm", filepath.Join(p, "lib", "node_modules")) // POSIX layout + add("npm", filepath.Join(p, "node_modules")) // Windows layout + } + } + + // --- pnpm: /global//node_modules (the is a store-format id). --- + for _, pnpmHome := range pnpmGlobalHomes(exec, home) { + addGlob("pnpm", filepath.Join(pnpmHome, "global", "*", "node_modules")) + } + + // --- yarn classic: ~/.config/yarn/global/node_modules. --- + if home != "" { + add("yarn", filepath.Join(home, ".config", "yarn", "global", "node_modules")) + } + + return roots +} + +// pnpmGlobalHomes returns candidate pnpm home directories (PNPM_HOME plus the +// OS default), under which global installs live at global//node_modules. 
+func pnpmGlobalHomes(exec executor.Executor, home string) []string { + var homes []string + if h := exec.Getenv("PNPM_HOME"); h != "" { + homes = append(homes, h) + } + switch exec.GOOS() { + case model.PlatformDarwin: + if home != "" { + homes = append(homes, filepath.Join(home, "Library", "pnpm")) + } + case model.PlatformLinux: + if home != "" { + homes = append(homes, filepath.Join(home, ".local", "share", "pnpm")) + } + case model.PlatformWindows: + if localAppData := exec.Getenv("LOCALAPPDATA"); localAppData != "" { + homes = append(homes, filepath.Join(localAppData, "pnpm")) + } + } + return homes +} + +// nodeHomeDir returns the user's home directory via the platform-appropriate +// environment variable. Uses the env rather than user.Current so that, under a +// root daemon delegating to a logged-in user, callers that pre-set HOME resolve +// the user's tree. +func nodeHomeDir(exec executor.Executor) string { + if exec.GOOS() == model.PlatformWindows { + return exec.Getenv("USERPROFILE") + } + return exec.Getenv("HOME") +} diff --git a/internal/detector/nodedist_global_test.go b/internal/detector/nodedist_global_test.go new file mode 100644 index 0000000..b69793f --- /dev/null +++ b/internal/detector/nodedist_global_test.go @@ -0,0 +1,81 @@ +package detector + +import ( + "context" + "path/filepath" + "testing" + + "github.com/step-security/dev-machine-guard/internal/executor" + "github.com/step-security/dev-machine-guard/internal/progress" +) + +// ScanGlobalModules marks immediate children of the global root as direct +// (globally-installed) and anything below a nested node_modules as transitive. +func TestScanGlobalModules(t *testing.T) { + // Real global roots end in "node_modules"; the package-path rule keys on + // that segment, so the fixture must too. 
+ root := filepath.Join(t.TempDir(), "lib", "node_modules") + mustWrite(t, filepath.Join(root, "typescript", "package.json"), `{"name":"typescript","version":"5.4.0"}`) + mustWrite(t, filepath.Join(root, "@scope", "cli", "package.json"), `{"name":"@scope/cli","version":"1.0.0"}`) + mustWrite(t, filepath.Join(root, "typescript", "node_modules", "dep", "package.json"), `{"name":"dep","version":"2.0.0"}`) + + got := newDistDetector().ScanGlobalModules(root) + assertPkgs(t, got, "typescript@5.4.0+direct", "@scope/cli@1.0.0+direct", "dep@2.0.0") +} + +// An explicit npm prefix override is resolved to /lib/node_modules. +func TestNodeGlobalRoots_PrefixOverride(t *testing.T) { + prefix := t.TempDir() + nm := filepath.Join(prefix, "lib", "node_modules") + mustWrite(t, filepath.Join(nm, "typescript", "package.json"), `{"name":"typescript","version":"5.4.0"}`) + t.Setenv("npm_config_prefix", prefix) + + found := false + for _, r := range NodeGlobalRoots(executor.NewReal()) { + if r.pm == "npm" && filepath.Clean(r.dir) == filepath.Clean(nm) { + found = true + } + } + if !found { + t.Fatalf("expected npm global root %q from prefix override", nm) + } +} + +// Enterprise disk mode: ScanProjects emits structured packages with no raw +// output and no package-manager invocation. +func TestNodeScanner_DiskMode_Project(t *testing.T) { + root := t.TempDir() + proj := filepath.Join(root, "app") + mustWrite(t, filepath.Join(proj, "package.json"), `{"name":"app","dependencies":{"lodash":"^4"}}`) + mustWrite(t, filepath.Join(proj, "package-lock.json"), `{ + "lockfileVersion": 3, + "packages": { + "": {"name":"app"}, + "node_modules/lodash": {"version":"4.17.21"}, + "node_modules/dep": {"version":"1.0.0"} + } + }`) + // Isolate the scan cache to a temp file and bypass it for a deterministic run. 
+ t.Setenv("STEPSEC_NODE_SCAN_CACHE", filepath.Join(t.TempDir(), "cache.json")) + t.Setenv("STEPSEC_NODE_SCAN_CACHE_BYPASS", "1") + + exec := executor.NewReal() + scanner := NewNodeScanner(exec, progress.NewNoop(), ""). + WithDiskScan(NewNodeDistDetector(exec)) + + results, discovered := scanner.ScanProjects(context.Background(), []string{root}, nil) + if len(results) != 1 { + t.Fatalf("got %d results, want 1: %+v", len(results), results) + } + r := results[0] + if r.RawStdoutBase64 != "" { + t.Errorf("disk mode must not emit raw stdout, got %d bytes", len(r.RawStdoutBase64)) + } + if r.PackagesCount != 2 || len(r.Packages) != 2 { + t.Fatalf("want 2 packages, got count=%d slice=%v", r.PackagesCount, r.Packages) + } + assertPkgs(t, r.Packages, "lodash@4.17.21+direct", "dep@1.0.0") + if len(discovered) != 1 { + t.Errorf("want 1 discovered project, got %d", len(discovered)) + } +} diff --git a/internal/detector/nodedist_modules.go b/internal/detector/nodedist_modules.go new file mode 100644 index 0000000..dc826f8 --- /dev/null +++ b/internal/detector/nodedist_modules.go @@ -0,0 +1,143 @@ +package detector + +import ( + "encoding/json" + "os" + "path/filepath" + "strings" + + "github.com/step-security/dev-machine-guard/internal/model" +) + +// walkNodeModules is the fallback when no parseable lockfile exists: it reads +// every installed package's own package.json under /node_modules +// and reports the actual installed name@version. A package is marked direct +// when its name appears in the project's declared dependencies. +// +// This covers npm / yarn-classic (real nested node_modules) and pnpm (whose +// real package dirs live under node_modules/.pnpm//node_modules/); +// the "last node_modules segment" rule in isNodeModulesPackagePath matches all +// of them. yarn-berry PnP installs have no node_modules and rely on the +// yarn.lock parser instead. 
+func (d *NodeDistDetector) walkNodeModules(projectDir string) []model.NodePackage { + directNames := d.directDepNames(projectDir) + return d.scanModulesTree(filepath.Join(projectDir, "node_modules"), + func(name, _ string) bool { _, ok := directNames[name]; return ok }) +} + +// scanModulesTree reads every installed package's package.json under a +// node_modules root and reports name@version. isDirect decides directness per +// package from its name and full path (project scans key on declared deps; +// global scans key on top-level position — see ScanGlobalModules). The walk +// never follows directory symlinks (filepath.WalkDir does not), so pnpm's +// symlink farm is read via the real .pnpm store dirs, not by chasing links out +// of the tree. +func (d *NodeDistDetector) scanModulesTree(root string, isDirect func(name, path string) bool) []model.NodePackage { + if !d.exec.DirExists(root) { + return nil + } + var pkgs []model.NodePackage + _ = filepath.WalkDir(root, func(path string, entry os.DirEntry, err error) error { + if err != nil { + return nil + } + if entry.IsDir() { + if d.skipper.ShouldSkip(path, root) { + return filepath.SkipDir + } + // .bin holds symlinked CLI shims; .cache is npm's build cache. + // Neither contains installed-package metadata. + if name := entry.Name(); name == ".bin" || name == ".cache" { + return filepath.SkipDir + } + return nil + } + if entry.Name() != "package.json" || !isNodeModulesPackagePath(path) { + return nil + } + data, ok := d.readBounded(path) + if !ok { + return nil + } + name, version := packageJSONNameVersion(data) + if name == "" || version == "" { + return nil + } + pkgs = append(pkgs, model.NodePackage{Name: name, Version: version, IsDirect: isDirect(name, path)}) + return nil + }) + return pkgs +} + +// ScanGlobalModules reads a global node_modules directory (e.g. an npm prefix's +// lib/node_modules, or pnpm's global store), reporting installed packages. 
A +// package is direct when it sits immediately under the global root (a package +// the user installed with `-g`); anything below a further node_modules is a +// transitive dependency of one of those. +func (d *NodeDistDetector) ScanGlobalModules(nmRoot string) []model.NodePackage { + clean := filepath.Clean(nmRoot) + return d.scanModulesTree(clean, func(_, path string) bool { + rel := strings.TrimPrefix(filepath.ToSlash(path), filepath.ToSlash(clean)) + return !strings.Contains(rel, "node_modules/") + }) +} + +// isNodeModulesPackagePath reports whether a package.json path is a package +// root inside node_modules — i.e. the path tail after the LAST "node_modules" +// segment is exactly /package.json or @scope//package.json. This +// rejects package.json files nested deeper inside a package's own source/test +// fixtures (which are not installed dependencies), while accepting both +// hoisted and pnpm-store layouts. Using the last segment handles nesting like +// node_modules/a/node_modules/b and node_modules/.pnpm/x@1/node_modules/x. +func isNodeModulesPackagePath(path string) bool { + parts := strings.Split(filepath.ToSlash(path), "/") + nmIdx := -1 + for i := len(parts) - 1; i >= 0; i-- { + if parts[i] == "node_modules" { + nmIdx = i + break + } + } + if nmIdx == -1 { + return false + } + tail := parts[nmIdx+1:] // segments after the last node_modules, incl. "package.json" + switch len(tail) { + case 2: // /package.json — must NOT be a scope dir + return tail[1] == "package.json" && !strings.HasPrefix(tail[0], "@") + case 3: // @scope//package.json + return tail[2] == "package.json" && strings.HasPrefix(tail[0], "@") + default: + return false + } +} + +// packageJSONNameVersion extracts the name and version from a package.json. +// Both are required for a usable inventory record; a private/workspace +// package.json missing either is treated as not-a-package by the caller. 
func packageJSONNameVersion(data []byte) (name, version string) {
	var pj struct {
		Name    string `json:"name"`
		Version string `json:"version"`
	}
	if err := json.Unmarshal(trimUTF8BOM(data), &pj); err != nil {
		return "", ""
	}
	return strings.TrimSpace(pj.Name), strings.TrimSpace(pj.Version)
}

// parsePackageJSONDepMaps returns the four dependency maps a package.json can
// declare, in the order dependencies, devDependencies, optionalDependencies,
// peerDependencies. Order is irrelevant to callers — they only use the union
// of key names to decide directness.
//
// Each section is decoded on its own, so one malformed section (e.g. an array
// where an object is expected) yields a nil map for that section only instead
// of discarding every declared dependency. A dependency whose spec is not a
// JSON string keeps its name with an empty spec. Returns nil when the document
// as a whole is not a JSON object.
func parsePackageJSONDepMaps(data []byte) []map[string]string {
	var pj struct {
		Dependencies         json.RawMessage `json:"dependencies"`
		DevDependencies      json.RawMessage `json:"devDependencies"`
		OptionalDependencies json.RawMessage `json:"optionalDependencies"`
		PeerDependencies     json.RawMessage `json:"peerDependencies"`
	}
	if err := json.Unmarshal(trimUTF8BOM(data), &pj); err != nil {
		return nil
	}
	sections := []json.RawMessage{pj.Dependencies, pj.DevDependencies, pj.OptionalDependencies, pj.PeerDependencies}
	out := make([]map[string]string, 0, len(sections))
	for _, raw := range sections {
		out = append(out, decodeDepSection(raw))
	}
	return out
}

// decodeDepSection decodes one dependency section into name -> spec. It
// returns nil for an absent, null, or non-object section.
func decodeDepSection(raw json.RawMessage) map[string]string {
	if len(raw) == 0 {
		return nil
	}
	var entries map[string]json.RawMessage
	if err := json.Unmarshal(raw, &entries); err != nil || entries == nil {
		return nil
	}
	out := make(map[string]string, len(entries))
	for name, value := range entries {
		var spec string
		// A non-string spec is deliberately tolerated: the error is ignored
		// so the name still counts as declared, with an empty spec.
		_ = json.Unmarshal(value, &spec)
		out[name] = spec
	}
	return out
}

// trimUTF8BOM drops a leading UTF-8 byte-order mark, which encoding/json
// rejects as an invalid character.
func trimUTF8BOM(data []byte) []byte {
	if len(data) >= 3 && data[0] == 0xEF && data[1] == 0xBB && data[2] == 0xBF {
		return data[3:]
	}
	return data
}

// ---- diff file boundary (preserved verbatim) ----
// diff --git a/internal/detector/nodedist_npm.go b/internal/detector/nodedist_npm.go
// new file mode 100644
// index 0000000..16e60b3
// --- /dev/null
// +++ b/internal/detector/nodedist_npm.go
// @@ -0,0 +1,119 @@
// +package detector
// +
// +import (
// +	"encoding/json"
// +	"strings"
// +
// +	"github.com/step-security/dev-machine-guard/internal/model"
// +)
// +

// npmLock is the union of the two package-lock.json / npm-shrinkwrap.json
// shapes. lockfileVersion 1 uses the nested `dependencies` tree; versions 2
// and 3 use the flat `packages` map keyed by install path. A v2 lockfile
// carries both for backwards compatibility, so `packages` is preferred when
// present (it is authoritative and avoids double-counting the tree).
+type npmLock struct { + Packages map[string]npmLockPkg `json:"packages"` // v2/v3 + Dependencies map[string]npmLockDepV1 `json:"dependencies"` // v1 +} + +type npmLockPkg struct { + Version string `json:"version"` + Name string `json:"name"` // set for aliased installs; key is authoritative otherwise + Link bool `json:"link"` // true for workspace symlinks — not an installed version +} + +type npmLockDepV1 struct { + Version string `json:"version"` + Dependencies map[string]npmLockDepV1 `json:"dependencies"` +} + +// parsePackageLock parses an npm lockfile into installed packages. +// +// Directness is taken from directNames (the project's declared deps), NOT from +// lockfile structure: npm hoists transitive packages to the top of +// node_modules, so install-path depth (and the v1 tree's top level) marks +// hoisted transitives as direct. Matching declared deps mirrors the tree +// top-level that `npm ls` — the command path we are replacing — reports as +// direct. +func (d *NodeDistDetector) parsePackageLock(data []byte, directNames map[string]struct{}) []model.NodePackage { + var lf npmLock + if err := json.Unmarshal(data, &lf); err != nil { + d.log.Debug("node disk scan: package-lock parse failed: %v", err) + return nil + } + if len(lf.Packages) > 0 { + return npmPackagesFromV2(lf.Packages, directNames) + } + if len(lf.Dependencies) > 0 { + var out []model.NodePackage + collectNpmV1(lf.Dependencies, directNames, &out) + return out + } + return nil +} + +// npmPackagesFromV2 flattens the v2/v3 `packages` map. 
Keys are install paths: +// - "" → the project root (skipped) +// - "node_modules/foo" → a hoisted/top-level install +// - "node_modules/a/node_modules/b" → a nested install +// - "packages/ui" → a workspace member (no node_modules +// segment; skipped — it's first-party, not an installed dependency) +func npmPackagesFromV2(packages map[string]npmLockPkg, directNames map[string]struct{}) []model.NodePackage { + out := make([]model.NodePackage, 0, len(packages)) + for key, p := range packages { + if key == "" || p.Link || !strings.Contains(key, "node_modules/") { + continue + } + name := nameFromPackagesKey(key, p.Name) + if name == "" || p.Version == "" { + continue + } + _, direct := directNames[name] + out = append(out, model.NodePackage{Name: name, Version: p.Version, IsDirect: direct}) + } + return out +} + +// nameFromPackagesKey extracts the package name from a v2/v3 install-path key, +// preferring the explicit `name` field (set for aliased installs). For the key +// it takes the segment(s) after the LAST node_modules, preserving an @scope. +func nameFromPackagesKey(key, explicit string) string { + if explicit != "" { + return explicit + } + idx := strings.LastIndex(key, "node_modules/") + if idx == -1 { + return "" + } + tail := key[idx+len("node_modules/"):] + if strings.HasPrefix(tail, "@") { + // Scoped: keep "@scope/name", drop any deeper path. + segs := strings.SplitN(tail, "/", 3) + if len(segs) < 2 { + return "" + } + return segs[0] + "/" + segs[1] + } + if i := strings.IndexByte(tail, '/'); i >= 0 { + return tail[:i] + } + return tail +} + +// collectNpmV1 walks the lockfileVersion-1 nested dependency tree, emitting one +// record per node. Directness is by declared-dep membership (see +// parsePackageLock), not tree position — the v1 tree mirrors the hoisted +// node_modules layout, so a top-level node can still be a transitive package. +// An entry without a concrete version is skipped but still recursed into. 
+func collectNpmV1(deps map[string]npmLockDepV1, directNames map[string]struct{}, out *[]model.NodePackage) { + for name, dep := range deps { + if dep.Version != "" { + _, direct := directNames[name] + *out = append(*out, model.NodePackage{Name: name, Version: dep.Version, IsDirect: direct}) + } + if len(dep.Dependencies) > 0 { + collectNpmV1(dep.Dependencies, directNames, out) + } + } +} diff --git a/internal/detector/nodedist_pnpm.go b/internal/detector/nodedist_pnpm.go new file mode 100644 index 0000000..0480512 --- /dev/null +++ b/internal/detector/nodedist_pnpm.go @@ -0,0 +1,83 @@ +package detector + +import ( + "strings" + + "github.com/step-security/dev-machine-guard/internal/model" +) + +// parsePnpmLock extracts installed packages from a pnpm-lock.yaml. +// +// It is a deliberate line scanner, not a YAML unmarshal: pnpm lockfiles are +// large and we only need the top-level `packages:` block's keys. Each key +// encodes name@version across pnpm's lockfile generations: +// +// v9: foo@1.2.3: @scope/foo@1.2.3: +// v6: /foo@1.2.3: /foo@1.2.3(react@18.0.0): /@scope/foo@1.2.3: +// v5: /foo/1.2.3: /@scope/foo/1.2.3: +// +// Only keys at exactly two-space indent inside `packages:` are entries; nested +// fields (resolution, engines, …) sit at four+ spaces and are ignored, and +// sibling top-level blocks (importers:, snapshots:, settings:) end the scan of +// the packages block. Directness is taken from the project's declared deps +// (directNames), since the resolved `packages:` block does not mark it. 
+func (d *NodeDistDetector) parsePnpmLock(data []byte, directNames map[string]struct{}) []model.NodePackage { + var out []model.NodePackage + inPackages := false + for _, raw := range strings.Split(string(data), "\n") { + line := strings.TrimRight(raw, "\r") + if strings.TrimSpace(line) == "" { + continue + } + indent := len(line) - len(strings.TrimLeft(line, " ")) + body := line[indent:] + + if indent == 0 { + // A new top-level block: we're in `packages:` only while this is it. + inPackages = strings.HasPrefix(body, "packages:") + continue + } + if !inPackages || indent != 2 || !strings.HasSuffix(body, ":") { + continue + } + key := strings.Trim(strings.TrimSuffix(body, ":"), `'"`) + name, version := parsePnpmPackageKey(key) + if name == "" || version == "" { + continue + } + _, direct := directNames[name] + out = append(out, model.NodePackage{Name: name, Version: version, IsDirect: direct}) + } + return out +} + +// parsePnpmPackageKey splits a pnpm packages-block key into name and version, +// handling the leading slash (v5/v6), the @scope prefix, peer-dependency +// suffixes ("(react@18.0.0)" in v6/v9, "_react@18.0.0" in v5), and the legacy +// slash separator ("/foo/1.2.3"). The first '@' after any scope is the +// name/version boundary; absent one, a trailing "/version" is assumed. 
func parsePnpmPackageKey(key string) (name, version string) {
	// Accepted key shapes, by lockfile generation:
	//
	//	v9: foo@1.2.3          @scope/foo@1.2.3
	//	v6: /foo@1.2.3(peer@x) /@scope/foo@1.2.3
	//	v5: /foo/1.2.3_peer@x  /@scope/foo/1.2.3
	//
	// Returns ("", "") when no name/version separator can be found.
	key = strings.TrimPrefix(key, "/")
	if i := strings.IndexByte(key, '('); i >= 0 { // strip "(peer@x)" suffix (v6/v9)
		key = key[:i]
	}
	if key == "" {
		return "", ""
	}

	// Skip the "@scope/" prefix so neither its '@' nor its '/' is taken for
	// the name/version separator.
	start := 0
	if key[0] == '@' {
		start = 1
		if i := strings.IndexByte(key, '/'); i >= 0 {
			start = i + 1
		}
	}

	// Whichever separator comes first after the scope decides the form. A v5
	// key can carry an '@' inside its "_peer@x" suffix, so seeing an '@'
	// somewhere in the key does not by itself make it a name@version key.
	rest := key[start:]
	at := strings.IndexByte(rest, '@')
	slash := strings.IndexByte(rest, '/')
	switch {
	case at >= 0 && (slash < 0 || at < slash):
		name, version = key[:start+at], key[start+at+1:]
	case slash >= 0:
		// Legacy v5 "name/version": v5 peer suffixes encode '/' as '+', so
		// the last slash is the name/version boundary.
		sl := strings.LastIndexByte(key, '/')
		name, version = key[:sl], key[sl+1:]
	default:
		return "", ""
	}

	// Strip "_peer@x" (v5). Applied to the version only: package names may
	// legitimately contain '_' (e.g. string_decoder).
	if i := strings.IndexByte(version, '_'); i >= 0 {
		version = version[:i]
	}
	return name, version
}

// pkgSet collapses a result to name@version[+direct] strings for order-free
// assertions.
+func pkgSet(pkgs []model.NodePackage) map[string]bool { + out := make(map[string]bool, len(pkgs)) + for _, p := range pkgs { + s := p.Name + "@" + p.Version + if p.IsDirect { + s += "+direct" + } + out[s] = true + } + return out +} + +func assertPkgs(t *testing.T, got []model.NodePackage, want ...string) { + t.Helper() + set := pkgSet(got) + if len(set) != len(want) { + t.Fatalf("got %d packages %v, want %d %v", len(set), got, len(want), want) + } + for _, w := range want { + if !set[w] { + t.Errorf("missing %q in %v", w, got) + } + } +} + +func TestParsePackageLock_V3(t *testing.T) { + data := []byte(`{ + "lockfileVersion": 3, + "packages": { + "": {"name": "root", "version": "1.0.0"}, + "node_modules/lodash": {"version": "4.17.21"}, + "node_modules/@scope/util": {"version": "2.0.0"}, + "node_modules/lodash/node_modules/nested-dep": {"version": "1.5.0"}, + "node_modules/linkpkg": {"link": true}, + "packages/ui": {"version": "0.0.0"} + } + }`) + direct := map[string]struct{}{"lodash": {}, "@scope/util": {}} + got := newDistDetector().parsePackageLock(data, direct) + // root/link/workspace skipped; nested-dep is hoisted to top-level but is + // not a declared dep, so it is transitive. 
+ assertPkgs(t, got, "lodash@4.17.21+direct", "@scope/util@2.0.0+direct", "nested-dep@1.5.0") +} + +func TestParsePackageLock_V1(t *testing.T) { + data := []byte(`{ + "lockfileVersion": 1, + "dependencies": { + "lodash": {"version": "4.17.21"}, + "chalk": {"version": "5.0.0", "dependencies": {"ansi": {"version": "6.0.0"}}} + } + }`) + direct := map[string]struct{}{"lodash": {}, "chalk": {}} + got := newDistDetector().parsePackageLock(data, direct) + assertPkgs(t, got, "lodash@4.17.21+direct", "chalk@5.0.0+direct", "ansi@6.0.0") +} + +func TestParsePnpmLock_V9(t *testing.T) { + data := []byte(`lockfileVersion: '9.0' +importers: + .: + dependencies: + lodash: + specifier: ^4.17.0 + version: 4.17.21 +packages: + lodash@4.17.21: + resolution: {integrity: sha512-x} + '@scope/util@2.0.0': + resolution: {integrity: sha512-y} + is-odd@1.0.0(react@18.0.0): + resolution: {integrity: sha512-z} +snapshots: + lodash@4.17.21: {} +`) + got := newDistDetector().parsePnpmLock(data, map[string]struct{}{"lodash": {}}) + // peer suffix stripped on is-odd; snapshots block ignored; only lodash direct. 
+ assertPkgs(t, got, "lodash@4.17.21+direct", "@scope/util@2.0.0", "is-odd@1.0.0") +} + +func TestParsePnpmLock_V6AndV5Keys(t *testing.T) { + v6 := []byte("packages:\n /foo@1.2.3:\n resolution: {}\n /@scope/bar@2.0.0(peer@1.0.0):\n resolution: {}\n") + assertPkgs(t, newDistDetector().parsePnpmLock(v6, nil), "foo@1.2.3", "@scope/bar@2.0.0") + + v5 := []byte("packages:\n /foo/1.2.3:\n resolution: {}\n /@scope/bar/2.0.0:\n resolution: {}\n") + assertPkgs(t, newDistDetector().parsePnpmLock(v5, nil), "foo@1.2.3", "@scope/bar@2.0.0") +} + +func TestParseYarnLock_ClassicAndBerry(t *testing.T) { + classic := []byte(`# yarn lockfile v1 +lodash@^4.17.0: + version "4.17.21" + resolved "https://registry.example/lodash" + +"@scope/util@^2.0.0", "@scope/util@^2.1.0": + version "2.1.0" +`) + assertPkgs(t, newDistDetector().parseYarnLock(classic, map[string]struct{}{"lodash": {}}), + "lodash@4.17.21+direct", "@scope/util@2.1.0") + + berry := []byte(`__metadata: + version: 6 +"lodash@npm:^4.17.0": + version: 4.17.21 +"root@workspace:.": + version: 0.0.0-use.local +`) + // __metadata and the local workspace marker are skipped. 
+ assertPkgs(t, newDistDetector().parseYarnLock(berry, nil), "lodash@4.17.21") +} + +func TestParseBunLock(t *testing.T) { + data := []byte(`{ + "lockfileVersion": 1, + // installed packages + "packages": { + "lodash": ["lodash@4.17.21", {}, "sha512-x"], + "@scope/util": ["@scope/util@2.0.0", {}], + "legacy": {"version": "1.0.0"}, + }, + }`) + got := newDistDetector().parseBunLock(data, map[string]struct{}{"lodash": {}}) + assertPkgs(t, got, "lodash@4.17.21+direct", "@scope/util@2.0.0", "legacy@1.0.0") +} + +func TestStripJSONC(t *testing.T) { + in := []byte("{\n // line\n \"a\": \"http://x\", /* keep // inside string */\n \"b\": [1, 2,],\n}") + var v struct { + A string `json:"a"` + B []int `json:"b"` + } + if err := json.Unmarshal(stripJSONC(in), &v); err != nil { + t.Fatalf("stripped JSONC did not parse: %v", err) + } + if v.A != "http://x" || len(v.B) != 2 { + t.Fatalf("got %+v, want a=http://x b=[1 2]", v) + } +} + +func TestParsePnpmPackageKey(t *testing.T) { + cases := map[string][2]string{ + "foo@1.2.3": {"foo", "1.2.3"}, + "@scope/foo@1.2.3": {"@scope/foo", "1.2.3"}, + "/foo@1.2.3": {"foo", "1.2.3"}, + "/@scope/foo@1.2.3(react@18)": {"@scope/foo", "1.2.3"}, + "/foo/1.2.3": {"foo", "1.2.3"}, + "/@scope/foo/1.2.3": {"@scope/foo", "1.2.3"}, + "foo@1.2.3_react@18.0.0": {"foo", "1.2.3"}, + } + for in, want := range cases { + n, v := parsePnpmPackageKey(in) + if n != want[0] || v != want[1] { + t.Errorf("parsePnpmPackageKey(%q) = (%q,%q), want (%q,%q)", in, n, v, want[0], want[1]) + } + } +} + +func TestIsNodeModulesPackagePath(t *testing.T) { + yes := []string{ + "/p/node_modules/foo/package.json", + "/p/node_modules/@scope/foo/package.json", + "/p/node_modules/a/node_modules/b/package.json", + "/p/node_modules/.pnpm/foo@1/node_modules/foo/package.json", + } + no := []string{ + "/p/package.json", // not under node_modules + "/p/node_modules/foo/src/package.json", // nested fixture, not a pkg root + "/p/node_modules/@scope/package.json", // scope dir, no pkg + } 
+ for _, p := range yes { + if !isNodeModulesPackagePath(p) { + t.Errorf("expected %q to be a package root", p) + } + } + for _, p := range no { + if isNodeModulesPackagePath(p) { + t.Errorf("expected %q NOT to be a package root", p) + } + } +} + +// ScanProject dispatches to the lockfile parser and de-duplicates. +func TestScanProject_LockfileDispatch(t *testing.T) { + dir := t.TempDir() + mustWrite(t, filepath.Join(dir, "package.json"), `{"name":"root","dependencies":{"lodash":"^4"}}`) + mustWrite(t, filepath.Join(dir, "package-lock.json"), `{ + "lockfileVersion": 3, + "packages": { + "": {"name":"root"}, + "node_modules/lodash": {"version":"4.17.21"}, + "node_modules/dep": {"version":"1.0.0"} + } + }`) + got := newDistDetector().ScanProject(dir, "npm") + assertPkgs(t, got, "lodash@4.17.21+direct", "dep@1.0.0") +} + +// With no parseable lockfile, ScanProject falls back to node_modules. +func TestScanProject_NodeModulesFallback(t *testing.T) { + dir := t.TempDir() + mustWrite(t, filepath.Join(dir, "package.json"), `{"name":"root","dependencies":{"lodash":"^4"}}`) + mustWrite(t, filepath.Join(dir, "node_modules", "lodash", "package.json"), `{"name":"lodash","version":"4.17.21"}`) + mustWrite(t, filepath.Join(dir, "node_modules", "@scope", "util", "package.json"), `{"name":"@scope/util","version":"2.0.0"}`) + mustWrite(t, filepath.Join(dir, "node_modules", "lodash", "node_modules", "tdep", "package.json"), `{"name":"tdep","version":"1.0.0"}`) + // A nested fixture package.json that is NOT an installed package root: + mustWrite(t, filepath.Join(dir, "node_modules", "lodash", "src", "package.json"), `{"name":"evil","version":"9.9.9"}`) + + got := newDistDetector().ScanProject(dir, "npm") + assertPkgs(t, got, "lodash@4.17.21+direct", "@scope/util@2.0.0", "tdep@1.0.0") +} + +func TestDedupSortPackages(t *testing.T) { + in := []model.NodePackage{ + {Name: "b", Version: "1.0.0"}, + {Name: "a", Version: "2.0.0", IsDirect: true}, + {Name: "a", Version: "2.0.0"}, // dup of 
the direct one + {Name: "a", Version: "1.0.0"}, // distinct version + {Name: "", Version: "1.0.0"}, // dropped (no name) + } + got := dedupSortPackages(in) + if len(got) != 3 { + t.Fatalf("got %d, want 3: %v", len(got), got) + } + // sorted: a@1.0.0, a@2.0.0, b@1.0.0 + if got[0].Name != "a" || got[0].Version != "1.0.0" { + t.Errorf("got[0]=%+v", got[0]) + } + if got[1].Version != "2.0.0" || !got[1].IsDirect { // direct wins on merge + t.Errorf("got[1]=%+v, want a@2.0.0 direct", got[1]) + } + if got[2].Name != "b" { + t.Errorf("got[2]=%+v", got[2]) + } +} + +func mustWrite(t *testing.T, path, content string) { + t.Helper() + if err := os.MkdirAll(filepath.Dir(path), 0o755); err != nil { + t.Fatal(err) + } + if err := os.WriteFile(path, []byte(content), 0o644); err != nil { + t.Fatal(err) + } +} diff --git a/internal/detector/nodedist_yarn.go b/internal/detector/nodedist_yarn.go new file mode 100644 index 0000000..198d6a7 --- /dev/null +++ b/internal/detector/nodedist_yarn.go @@ -0,0 +1,93 @@ +package detector + +import ( + "strings" + + "github.com/step-security/dev-machine-guard/internal/model" +) + +// parseYarnLock extracts installed packages from a yarn.lock (Yarn Classic v1 +// and Yarn Berry v2+). The format is a custom indented block list: +// +// "lodash@^4.17.0": # header: one or more "name@range" descriptors +// version "4.17.21" # classic: version is a quoted field +// "@scope/pkg@npm:^1.0.0": # berry: protocol-qualified descriptor +// version: 1.2.3 # berry: version is a bare YAML scalar +// +// A header sits at column 0 and ends with ':'; its body is indented. We take +// the package name from the (first) descriptor and pair it with the entry's +// version. Directness comes from the project's declared deps (directNames), +// since yarn.lock itself does not record it. 
+func (d *NodeDistDetector) parseYarnLock(data []byte, directNames map[string]struct{}) []model.NodePackage { + var out []model.NodePackage + curName := "" + for _, raw := range strings.Split(string(data), "\n") { + line := strings.TrimRight(raw, "\r") + if line == "" || strings.HasPrefix(strings.TrimSpace(line), "#") { + continue + } + if line[0] != ' ' && line[0] != '\t' { // header (column 0) + curName = "" + if strings.HasSuffix(line, ":") { + curName = yarnNameFromHeader(strings.TrimSuffix(line, ":")) + } + continue + } + // Body line. __metadata is a berry bookkeeping block, not a package. + if curName == "" || curName == "__metadata" { + continue + } + version, ok := yarnVersionField(strings.TrimSpace(line)) + if !ok { + continue + } + // Skip the local workspace marker berry emits for first-party packages. + if version == "0.0.0-use.local" { + curName = "" + continue + } + _, direct := directNames[curName] + out = append(out, model.NodePackage{Name: curName, Version: version, IsDirect: direct}) + curName = "" // one version per entry; ignore later fields in the block + } + return out +} + +// yarnNameFromHeader extracts the package name from a yarn.lock entry header. +// Headers may list several comma-separated descriptors and may be quoted; the +// name is everything before the version specifier, where the boundary is the +// first '@' after any @scope. 
Examples: +// +// "lodash@^4.17.0, lodash@^4.17.21" -> lodash +// "@scope/pkg@npm:^1.0.0" -> @scope/pkg +func yarnNameFromHeader(header string) string { + h := strings.Trim(strings.TrimSpace(header), `"`) + if i := strings.Index(h, ", "); i >= 0 { // first of multiple descriptors + h = h[:i] + } + h = strings.Trim(strings.TrimSpace(h), `"`) + searchFrom := 0 + if strings.HasPrefix(h, "@") { + searchFrom = 1 + } + if at := strings.IndexByte(h[searchFrom:], '@'); at >= 0 { + return h[:searchFrom+at] + } + return h +} + +// yarnVersionField parses a body line's version field, accepting both the +// classic quoted form (`version "4.17.21"`) and the berry bare form +// (`version: 4.17.21`). Returns ok=false for any other body line. +func yarnVersionField(t string) (string, bool) { + if !strings.HasPrefix(t, "version") { + return "", false + } + rest := strings.TrimSpace(t[len("version"):]) + rest = strings.TrimSpace(strings.TrimPrefix(rest, ":")) + rest = strings.Trim(rest, `"`) + if rest == "" { + return "", false + } + return rest, true +} diff --git a/internal/detector/nodeproject.go b/internal/detector/nodeproject.go index 1e1936a..176f4ec 100644 --- a/internal/detector/nodeproject.go +++ b/internal/detector/nodeproject.go @@ -18,6 +18,10 @@ const maxNodeProjects = 1000 type NodeProjectDetector struct { exec executor.Executor skipper *tcc.Skipper + // dist, when non-nil, makes per-project package listing read resolved + // versions from on-disk lockfiles instead of the declared ranges in + // package.json. Attached by the caller based on config.UseLegacyNodeScan. + dist *NodeDistDetector } func NewNodeProjectDetector(exec executor.Executor) *NodeProjectDetector { @@ -31,6 +35,15 @@ func (d *NodeProjectDetector) WithSkipper(s *tcc.Skipper) *NodeProjectDetector { return d } +// WithDiskScan switches per-project package listing to disk parsing (resolved +// name@version from the lockfile / node_modules) via the supplied detector. 
A +// nil detector leaves the legacy package.json-range path in place. Returns the +// detector for chaining. +func (d *NodeProjectDetector) WithDiskScan(dist *NodeDistDetector) *NodeProjectDetector { + d.dist = dist + return d +} + // CountProjects counts the number of Node.js projects found under the given directories. func (d *NodeProjectDetector) CountProjects(_ context.Context, searchDirs []string) int { return len(d.ListProjects(searchDirs)) @@ -76,7 +89,7 @@ func (d *NodeProjectDetector) listInDir(dir string) []model.ProjectInfo { if !hasNodeModules && !isYarnBerryPnP { return nil } - pkgs := d.readPackageJSONDeps(path) + pkgs := d.projectPackages(projectDir, pm, path) projects = append(projects, model.ProjectInfo{ Path: projectDir, PackageManager: pm, @@ -91,6 +104,31 @@ func (d *NodeProjectDetector) listInDir(dir string) []model.ProjectInfo { return projects } +// projectPackages returns a project's packages, preferring disk parsing +// (resolved name@version, full transitive set) when a dist detector is +// attached, and falling back to the package.json declared ranges otherwise. +// is_direct is not carried in the community ProjectInfo shape, so it is dropped +// here; it survives in the enterprise path. +func (d *NodeProjectDetector) projectPackages(projectDir, pm, packageJSONPath string) []model.PackageDetail { + if d.dist != nil { + return nodePackagesToDetails(d.dist.ScanProject(projectDir, pm)) + } + return d.readPackageJSONDeps(packageJSONPath) +} + +// nodePackagesToDetails projects the disk-parse result down to the community +// {name, version} shape. +func nodePackagesToDetails(pkgs []model.NodePackage) []model.PackageDetail { + if len(pkgs) == 0 { + return nil + } + out := make([]model.PackageDetail, len(pkgs)) + for i, p := range pkgs { + out[i] = model.PackageDetail{Name: p.Name, Version: p.Version} + } + return out +} + // readPackageJSONDeps reads dependencies + devDependencies from a package.json file. 
func (d *NodeProjectDetector) readPackageJSONDeps(packageJSONPath string) []model.PackageDetail { data, err := d.exec.ReadFile(packageJSONPath) diff --git a/internal/detector/nodescan.go b/internal/detector/nodescan.go index 397f678..bc0d7e6 100644 --- a/internal/detector/nodescan.go +++ b/internal/detector/nodescan.go @@ -54,6 +54,12 @@ type NodeScanner struct { // map from multiple goroutines. pmAvailability map[string]error pmAvailabilityMu sync.Mutex + // dist, when non-nil, makes both the per-project and global scans read + // packages from disk (lockfiles / node_modules) instead of invoking the + // package manager. Attached by the caller based on config.UseLegacyNodeScan. + // The cache, ordering, size cap, and concurrency around the scan are + // unchanged — only the per-project package source differs. + dist *NodeDistDetector } func NewNodeScanner(exec executor.Executor, log *progress.Logger, loggedInUser string) *NodeScanner { @@ -65,6 +71,14 @@ func NewNodeScanner(exec executor.Executor, log *progress.Logger, loggedInUser s } } +// WithDiskScan switches package discovery to on-disk parsing via the supplied +// detector (no `npm ls` / `yarn` / `pnpm` / `bun` subprocess). A nil detector +// leaves the legacy command path in place. Returns the scanner for chaining. +func (s *NodeScanner) WithDiskScan(dist *NodeDistDetector) *NodeScanner { + s.dist = dist + return s +} + // binaryAvailable returns the cached checkPath result for a package-manager // binary, populating the cache on first call. Wraps checkPath so callers in // the per-project loop don't pay a LookPath per project on devices that @@ -176,8 +190,14 @@ func (s *NodeScanner) checkPath(ctx context.Context, name string) error { return err } -// ScanGlobalPackages runs npm/yarn/pnpm list -g and returns raw base64-encoded results. +// ScanGlobalPackages returns globally-installed packages, one NodeScanResult +// per package manager. 
In disk mode it parses each PM's global node_modules; +// otherwise it runs npm/yarn/pnpm list -g and returns raw base64 output. func (s *NodeScanner) ScanGlobalPackages(ctx context.Context) []model.NodeScanResult { + if s.dist != nil { + return s.scanGlobalPackagesFromDisk() + } + var results []model.NodeScanResult s.emitProgress("global: npm") @@ -635,6 +655,10 @@ func orderScanProjects(projects []projectEntry, knownLastVerified map[string]tim // DirExists checks per project and to keep the detected value consistent // with what the caller logged. func (s *NodeScanner) scanProject(ctx context.Context, projectDir, pm string) (model.NodeScanResult, bool) { + if s.dist != nil { + return s.scanProjectFromDisk(projectDir, pm) + } + var cmd string var args []string switch pm { @@ -719,6 +743,58 @@ func (s *NodeScanner) scanProject(ctx context.Context, projectDir, pm string) (m }, true } +// scanProjectFromDisk produces a project's NodeScanResult by parsing on-disk +// lockfiles / node_modules instead of running the package manager. Unlike the +// command path it does not require the PM binary to be installed, so a project +// whose toolchain is absent is still inventoried. RawStdout/Stderr stay empty +// (the backend reads Packages directly), and PMVersion is omitted — resolving +// it would mean running the binary we are deliberately not invoking. +func (s *NodeScanner) scanProjectFromDisk(projectDir, pm string) (model.NodeScanResult, bool) { + pkgs := s.dist.ScanProject(projectDir, pm) + return model.NodeScanResult{ + ProjectPath: projectDir, + PackageManager: pm, + WorkingDirectory: projectDir, + Packages: pkgs, + PackagesCount: len(pkgs), + ExitCode: 0, + }, true +} + +// scanGlobalPackagesFromDisk inventories globally-installed packages from each +// package manager's global node_modules on disk, returning one NodeScanResult +// per PM (the delta layer reconciles globals keyed by package manager). Roots +// for the same PM are merged and de-duplicated. 
Returns nil when no global +// roots exist on the host. +func (s *NodeScanner) scanGlobalPackagesFromDisk() []model.NodeScanResult { + roots := NodeGlobalRoots(s.exec) + if len(roots) == 0 { + s.log.Debug("node global disk scan: no global node_modules roots found") + return nil + } + byPM := make(map[string][]model.NodePackage) + var order []string + for _, r := range roots { + s.emitProgress("global: " + r.pm) + if _, seen := byPM[r.pm]; !seen { + order = append(order, r.pm) + } + byPM[r.pm] = append(byPM[r.pm], s.dist.ScanGlobalModules(r.dir)...) + } + results := make([]model.NodeScanResult, 0, len(order)) + for _, pm := range order { + pkgs := dedupSortPackages(byPM[pm]) + s.log.Debug("node global disk scan: %s -> %d packages", pm, len(pkgs)) + results = append(results, model.NodeScanResult{ + PackageManager: pm, + Packages: pkgs, + PackagesCount: len(pkgs), + ExitCode: 0, + }) + } + return results +} + func (s *NodeScanner) getVersion(ctx context.Context, binary, flag string) string { stdout, _, _, err := s.runCmd(ctx, 10*time.Second, binary, flag) if err != nil { diff --git a/internal/model/model.go b/internal/model/model.go index ec6bd57..82d61f7 100644 --- a/internal/model/model.go +++ b/internal/model/model.go @@ -147,18 +147,49 @@ type UnchangedGlobalRef struct { LastUploadedExecutionID string `json:"last_uploaded_execution_id,omitempty"` } -// NodeScanResult holds raw scan output for enterprise telemetry. -// Used for both global packages and per-project scans. +// NodeScanResult holds one project's (or one global root's) scan output for +// enterprise telemetry. Used for both global packages and per-project scans. +// +// Two mutually-exclusive shapes flow through this struct depending on how the +// scan was produced: +// - Legacy (command) path: RawStdoutBase64 carries the raw `npm ls`/`yarn`/ +// `pnpm`/`bun` output; the backend parses it into packages on ingest. 
+// - Disk-parse path: Packages is populated directly and RawStdoutBase64 is +// left empty. The backend's ParseNodeProjects passes a project through +// untouched when RawStdoutBase64 == "", so pre-parsed packages reach +// storage with no backend change. Never set both: a non-empty +// RawStdoutBase64 makes the backend re-parse and overwrite Packages. +// +// JSON tags match agent-api's ddbmodels.NodeProject so the payload +// deserializes server-side without a schema change. type NodeScanResult struct { - ProjectPath string `json:"project_path"` - PackageManager string `json:"package_manager"` - PMVersion string `json:"package_manager_version"` - WorkingDirectory string `json:"working_directory"` - RawStdoutBase64 string `json:"raw_stdout_base64"` - RawStderrBase64 string `json:"raw_stderr_base64"` - Error string `json:"error"` - ExitCode int `json:"exit_code"` - ScanDurationMs int64 `json:"scan_duration_ms"` + ProjectPath string `json:"project_path"` + PackageManager string `json:"package_manager"` + PMVersion string `json:"package_manager_version"` + WorkingDirectory string `json:"working_directory"` + RawStdoutBase64 string `json:"raw_stdout_base64,omitempty"` + RawStderrBase64 string `json:"raw_stderr_base64,omitempty"` + Packages []NodePackage `json:"packages,omitempty"` + PackagesCount int `json:"packages_count"` + Error string `json:"error"` + ExitCode int `json:"exit_code"` + ScanDurationMs int64 `json:"scan_duration_ms"` +} + +// NodePackage is one installed Node package discovered by disk parsing. +// +// Fields are intentionally limited to what the backend persists today +// (name, version, direct-vs-transitive) — see DeviceNPMPackageUsageInfo. The +// agent-api NodePackage additionally declares InstallPath and Dependencies, +// but both are parsed-then-discarded server-side and are omitted here on +// purpose. JSON tags match ddbmodels.NodePackage. 
+type NodePackage struct { + Name string `json:"name"` + Version string `json:"version"` + // IsDirect marks a top-level dependency (declared in the project's + // package.json) versus a transitive one pulled in by another package. + // The lockfile parsers set it from declared deps, not from lockfile structure. + IsDirect bool `json:"is_direct,omitempty"` +} // PackageDetail represents a single package name and version. diff --git a/internal/scan/scanner.go b/internal/scan/scanner.go index 48553e0..881b5e1 100644 --- a/internal/scan/scanner.go +++ b/internal/scan/scanner.go @@ -7,6 +7,7 @@ import ( "github.com/step-security/dev-machine-guard/internal/buildinfo" "github.com/step-security/dev-machine-guard/internal/cli" + "github.com/step-security/dev-machine-guard/internal/config" "github.com/step-security/dev-machine-guard/internal/detector" "github.com/step-security/dev-machine-guard/internal/detector/configaudit" "github.com/step-security/dev-machine-guard/internal/device" @@ -122,6 +123,10 @@ func Run(exec executor.Executor, log *progress.Logger, cfg *cli.Config) error { log.StepStart("Scanning Node.js projects") start = time.Now() projectDetector := detector.NewNodeProjectDetector(exec).WithSkipper(tccSkipper) + if !config.UseLegacyNodeScan { + projectDetector = projectDetector.WithDiskScan( + detector.NewNodeDistDetector(exec).WithSkipper(tccSkipper).WithLogger(log)) + } nodeProjects = projectDetector.ListProjects(searchDirs) log.StepDone(time.Since(start)) } else { diff --git a/internal/telemetry/delta.go b/internal/telemetry/delta.go index af81d03..abd003d 100644 --- a/internal/telemetry/delta.go +++ b/internal/telemetry/delta.go @@ -81,6 +81,16 @@ func npmRecordsFromResults(results []model.NodeScanResult) []state.ScanRecord { if r.ProjectPath == "" { continue } + // Disk-parse results carry structured Packages and an empty raw body; + // hash the parsed packages so the delta change-detector reflects the + // actual inventory (hashing an empty raw body would
collapse every + // project to the same hash). The command path keeps hashing raw stdout. + if r.RawStdoutBase64 == "" { + out = append(out, state.ScanRecordFromValue( + r.ProjectPath, r.PackageManager, r.PMVersion, r.Packages, r.ExitCode, + )) + continue + } out = append(out, state.ScanRecordFromBase64( r.ProjectPath, r.PackageManager, r.PMVersion, r.RawStdoutBase64, r.ExitCode, )) @@ -111,7 +121,15 @@ func globalRecordsFromNode(results []model.NodeScanResult) []state.GlobalRecord if r.PackageManager == "" { continue } - hash, _ := state.CanonicalHashJSON(decodeBase64OrRaw(r.RawStdoutBase64)) + var hash string + if r.RawStdoutBase64 == "" { + // Disk-parse globals: hash the parsed packages (see + // npmRecordsFromResults). ScanRecordFromValue gives the same + // canonical hash used everywhere else for structured values. + hash = state.ScanRecordFromValue("", r.PackageManager, "", r.Packages, r.ExitCode).Hash + } else { + hash, _ = state.CanonicalHashJSON(decodeBase64OrRaw(r.RawStdoutBase64)) + } out = append(out, state.GlobalRecord{PM: r.PackageManager, Hash: hash, ExitCode: r.ExitCode}) } return out diff --git a/internal/telemetry/telemetry.go b/internal/telemetry/telemetry.go index 4291612..4280910 100644 --- a/internal/telemetry/telemetry.go +++ b/internal/telemetry/telemetry.go @@ -873,6 +873,10 @@ func Run(exec executor.Executor, log *progress.Logger, cfg *cli.Config) (err err log.Progress("Scanning globally installed packages...") nodeScanner := detector.NewNodeScanner(exec, log, loggedInUsername).WithSkipper(tccSkipper) + if !config.UseLegacyNodeScan { + nodeScanner = nodeScanner.WithDiskScan( + detector.NewNodeDistDetector(exec).WithSkipper(tccSkipper).WithLogger(log)) + } // Stream sub-progress so heartbeats show "project 12 of 47" / // "global: yarn" during the long-running node phase. Both // ScanGlobalPackages and ScanProjects share this hook.