From 7d54df71de1c20780c0edc41db4657230acf461f Mon Sep 17 00:00:00 2001 From: matthewpeterkort Date: Tue, 5 May 2026 13:57:03 -0700 Subject: [PATCH 1/7] fix add error passing --- cmd/remote/add/gen3.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cmd/remote/add/gen3.go b/cmd/remote/add/gen3.go index 9f9ceed7..33151ebc 100644 --- a/cmd/remote/add/gen3.go +++ b/cmd/remote/add/gen3.go @@ -99,13 +99,13 @@ func gen3Init(remoteName, credFile, fenceToken, project, organization, bucket st default: existing, err := configure.Load(remoteName) - if err == nil { + if err != nil { + return fmt.Errorf("failed to load %s config: %w", remoteName, err) + } else { accessToken = existing.AccessToken apiKey = existing.APIKey keyID = existing.KeyID apiEndpoint = existing.APIEndpoint - } else { - return fmt.Errorf("must provide either --cred or --token (or have existing profile %s)", remoteName) } } From 254a25f0b04c0b31074a1b0315b77e5490fc1426 Mon Sep 17 00:00:00 2001 From: Brian Date: Wed, 6 May 2026 15:28:50 -0700 Subject: [PATCH 2/7] dont delete repo;force push instead (#227) Co-authored-by: Matthew Peterkort <33436238+matthewpeterkort@users.noreply.github.com> --- tests/monorepos/e2e-monorepo-remote.sh | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/tests/monorepos/e2e-monorepo-remote.sh b/tests/monorepos/e2e-monorepo-remote.sh index 7c40376b..a53dc5a2 100755 --- a/tests/monorepos/e2e-monorepo-remote.sh +++ b/tests/monorepos/e2e-monorepo-remote.sh @@ -504,7 +504,7 @@ validate_config() { configure_remote_auth() { MONO_REMOTE_URL_AUTH="$MONO_REMOTE_URL" if [[ -n "$TEST_GITHUB_TOKEN" && "$MONO_REMOTE_URL" =~ ^https://github.com/ ]]; then - MONO_REMOTE_URL_AUTH="${MONO_REMOTE_URL/https:\/\/github.com\//https:\/\/x-access-token:${TEST_GITHUB_TOKEN}@github.com/}" + MONO_REMOTE_URL_AUTH="${MONO_REMOTE_URL/https:\/\/github.com\//https://x-access-token:${TEST_GITHUB_TOKEN}@github.com/}" fi } @@ -571,11 +571,12 @@ 
delete_github_repo_if_requested() { require_cmd gh if GH_TOKEN="$TEST_GITHUB_TOKEN" gh api "/repos/${GITHUB_OWNER_REPO}" >/dev/null 2>&1; then - log "Deleting existing GitHub repo ${GITHUB_OWNER_REPO} for clean test run" - GH_TOKEN="$TEST_GITHUB_TOKEN" gh api -X DELETE "/repos/${GITHUB_OWNER_REPO}" >/dev/null - DELETED_REMOTE_REPO_AT_START=true - # Small wait to avoid eventual-consistency race with immediate recreation. - sleep 2 + # log "Deleting existing GitHub repo ${GITHUB_OWNER_REPO} for clean test run" + # GH_TOKEN="$TEST_GITHUB_TOKEN" gh api -X DELETE "/repos/${GITHUB_OWNER_REPO}" >/dev/null + # DELETED_REMOTE_REPO_AT_START=true + # # Small wait to avoid eventual-consistency race with immediate recreation. + # sleep 2 + log "Skipping deletion of existing GitHub repo ${GITHUB_OWNER_REPO}; using push -f instead" fi } @@ -683,7 +684,7 @@ push_dataset() { git add .gitattributes git commit -m "Initialize LFS tracking" || true # Ensure origin/main is established as upstream for subsequent git-drs pushes. 
- git push --set-upstream "$MONO_REMOTE_NAME" "$MONO_GIT_BRANCH" + git push -f --set-upstream "$MONO_REMOTE_NAME" "$MONO_GIT_BRANCH" if [[ "$MONO_RUN_MULTIPART_SMOKE" == "true" ]]; then mkdir -p fixtures/multipart-smoke From 1fda5621ef9cbda55a353deb50f785729ec11dc2 Mon Sep 17 00:00:00 2001 From: Brian Date: Thu, 7 May 2026 09:14:39 -0700 Subject: [PATCH 3/7] update docs for latest (#226) * update docs for latest * document new,non-lfs paths * extract issues * refactor lfs out of docs * update docs --------- Co-authored-by: matthewpeterkort Co-authored-by: Matthew Peterkort <33436238+matthewpeterkort@users.noreply.github.com> --- README.md | 174 ++--- ...sue-add-include-pattern-to-git-drs-pull.md | 51 ++ docs/adding-s3-files.md | 26 +- ...-drs-endpoints-and-transfer-concurrency.md | 216 ++++++ docs/commands.md | 639 +++++------------- docs/developer-guide.md | 21 +- docs/drs-registerfile-upsert.md | 4 +- docs/e2e-modes-and-local-setup.md | 6 +- docs/getting-started.md | 421 ++++++------ docs/installation.md | 48 +- docs/precommit-cache-addurl-prepush.md | 10 +- docs/precommit.md | 41 +- docs/troubleshooting.md | 581 ++++++---------- 13 files changed, 971 insertions(+), 1267 deletions(-) create mode 100644 attic/issue-add-include-pattern-to-git-drs-pull.md create mode 100644 docs/architecture-drs-endpoints-and-transfer-concurrency.md diff --git a/README.md b/README.md index 9fa44367..81ea2f6c 100644 --- a/README.md +++ b/README.md @@ -3,143 +3,113 @@ --- # NOTICE -git-drs is not yet fully compliant with DRS. It currently works against Gen3 DRS server. Full GA4GH DRS support is expected once v1.6 of the specification has been published. +`git-drs` is not a pure GA4GH DRS client. It targets Syfon/Gen3-style DRS workflows and uses extensions where repo-scale behavior requires them. 
--- [![Tests](https://github.com/calypr/git-drs/actions/workflows/test.yaml/badge.svg)](https://github.com/calypr/git-drs/actions/workflows/test.yaml) -**Git/DRS orchestration with optional Git LFS compatibility** +**Git/DRS orchestration with Git-compatible pointer workflows** -Git DRS manages Git-facing DRS workflows: local metadata, Git hooks, filter behavior, lookup/register/push/pull orchestration, and optional Git LFS compatibility. Provider-specific transfer, signed URL behavior, and direct cloud inspection live in client code outside this repo. +`git-drs` manages: + +- remote Gen3/Syfon configuration +- local DRS metadata +- pointer-aware push/pull orchestration +- bucket-scoped object reference workflows ## Key Features -- **Unified Workflow**: Manage both code and large data files using standard Git commands -- **DRS Integration**: Built-in support for Gen3 DRS servers -- **Multi-Remote Support**: Work with development, staging, and production servers in one repository -- **Automatic Processing**: Files are processed automatically during commits and pushes -- **Flexible Tracking**: Track individual files, patterns, or entire directories +- unified Git/data workflow around DRS-backed pointers +- Gen3/Syfon integration +- multiple remotes in one repository +- explicit file tracking and hydration +- metadata-only reference support for existing bucket objects ## How It Works -Git DRS works alongside Git LFS when you want LFS-compatible pointers and storage, while still supporting DRS-centric workflows: +At a high level: -1. **Initialization**: Set up repository and DRS server configuration -2. **Automatic Commits**: Create DRS objects during pre-commit hooks -3. **Automatic Pushes**: Register files with DRS servers and upload to configured storage -4. **On-Demand Downloads**: Pull specific files or patterns as needed +1. initialize the repository with `git drs init` +2. configure a remote for one `organization/project` +3. 
track file patterns with `git drs track` +4. add/commit/push normally +5. hydrate pointer files later with `git drs pull` ## Quick Start -### Installation - ```bash -# Install Git LFS first -brew install git-lfs # macOS -git lfs install --skip-smudge - -# Install Git DRS -/bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/calypr/git-drs/refs/heads/main/install.sh)" -- $GIT_DRS_VERSION - -# Install global Git filter configuration for git-drs git drs install -``` - -### Basic Usage - -```bash -# Initialize repository (one-time Git repo setup) git drs init - -# Add DRS remote -git drs remote add gen3 production \ - --cred /path/to/credentials.json \ - --url https://calypr-public.ohsu.edu \ - --organization my-program \ - --project my-project \ - --bucket my-bucket - -# Required prerequisite (usually steward/admin setup): -# create bucket credentials, then map org/project to full storage roots before users run push/pull -git drs bucket add production \ - --bucket my-bucket \ - --region us-east-1 \ - --access-key "$AWS_ACCESS_KEY_ID" \ - --secret-key "$AWS_SECRET_ACCESS_KEY" \ - --s3-endpoint https://s3.amazonaws.com -git drs bucket add-organization production \ - --organization my-program \ - --path s3://my-bucket/my-program -git drs bucket add-project production \ - --organization my-program \ - --project my-project \ - --path s3://my-bucket/my-program/my-project - -# Track files -git lfs track "*.bam" +git drs remote add gen3 production HTAN_INT/BForePC --cred /path/to/credentials.json +git drs track "*.bam" git add .gitattributes - -# Add and commit files -git add my-file.bam -git commit -m "Add data file" +git add sample.bam +git commit -m "Add sample" git push - -# Download files -git lfs pull -I "*.bam" +git drs ls-files +git drs pull -I "*.bam" ``` -## Documentation +## Current CLI Shape -For detailed setup and usage information: +The cleaned CLI intentionally removed legacy commands: -- **[Getting Started](docs/getting-started.md)** - Repository setup 
and basic workflows -- **[Commands Reference](docs/commands.md)** - Complete command documentation -- **[Installation Guide](docs/installation.md)** - Platform-specific installation -- **[Troubleshooting](docs/troubleshooting.md)** - Common issues and solutions -- **[E2E Modes + Local Setup](docs/e2e-modes-and-local-setup.md)** - Local vs remote mode, server config, and end-to-end runbooks -- **[Cloud/Object Integration](docs/adding-s3-files.md)** - Adding files from provider URLs or configured bucket object keys -- **[Developer Guide](docs/developer-guide.md)** - Internals and development +- removed: + - `git drs fetch` + - `git drs list` + - `git drs upload` + - `git drs download` +- `git drs pull` is hydration-only +- `git drs ls-files` is the local file inventory command +- `git drs remote add gen3` takes scope as `organization/project` -## Supported Servers +Example: -- **Gen3 Data Commons** (e.g., CALYPR) +```bash +git drs remote add gen3 production HTAN_INT/BForePC --cred /path/to/credentials.json +``` -## Supported Environments +## Bucket Mapping Model -- **Local Development** environments -- **HPC Systems** (e.g., ARC) +End users should not need to know the bucket name. -## Commands Overview +Push and pull depend on server-side bucket mapping for the requested scope. That mapping is normally provisioned once by a steward/admin using the bucket commands. 
-| Command | Description | -| ---------------------- | ------------------------------------- | -| `git drs install` | Install global git-drs filter config | -| `git drs init` | Initialize repository | -| `git drs remote add` | Add a DRS remote server | -| `git drs remote list` | List configured remotes | -| `git drs remote set` | Set default remote | -| `git drs add-url` | Add files via provider URLs or configured bucket object keys | -| `git lfs track` | Track file patterns with LFS | -| `git lfs ls-files` | List tracked files | -| `git lfs pull` | Download tracked files | -| `git drs fetch` | Fetch metadata from DRS server | -| `git drs push` | Push objects to DRS server | +## Common Commands -Use `--help` with any command for details. See [Commands Reference](docs/commands.md) for complete documentation. +| Command | Description | +| --- | --- | +| `git drs install` | Install global `git-drs` filter config | +| `git drs init` | Initialize repository-local `git-drs` state | +| `git drs remote add gen3 [remote] ` | Add or refresh a Gen3/Syfon remote | +| `git drs remote list` | List configured remotes | +| `git drs remote set ` | Set the default remote | +| `git drs track ` | Track files or globs | +| `git drs untrack ` | Stop tracking files or globs | +| `git drs ls-files` | List tracked files and localization state | +| `git drs pull` | Hydrate pointer files in the current checkout | +| `git drs push` | Register/upload objects and push metadata workflow | +| `git drs add-url` | Add an existing provider object by URL or scoped key | +| `git drs add-ref` | Add a local reference to an existing DRS object | +| `git drs query` | Query a DRS object by ID | +| `git drs copy-records` | Copy Syfon records between remotes for one scope | -## Requirements +## Documentation -- Git LFS installed and configured -- Access credentials for your DRS server -- Go 1.24+ (for building from source) +- [Getting Started](docs/getting-started.md) +- [Commands 
Reference](docs/commands.md) +- [Troubleshooting](docs/troubleshooting.md) +- [Developer Guide](docs/developer-guide.md) +- [GA4GH DRS Scalability Gaps](docs/ga4gh-drs-scalability-gaps.md) -## Support +## Requirements -- **Issues**: [GitHub Issues](https://github.com/calypr/git-drs/issues) -- **Releases**: [GitHub Releases](https://github.com/calypr/git-drs/releases) -- **Documentation**: See `docs/` folder for detailed guides +- Git +- access credentials for the target Gen3/Syfon deployment +- Go 1.26.2+ for local builds -## License +## Support -This project is part of the CALYPR data commons ecosystem. +- [GitHub Issues](https://github.com/calypr/git-drs/issues) +- [GitHub Releases](https://github.com/calypr/git-drs/releases) diff --git a/attic/issue-add-include-pattern-to-git-drs-pull.md b/attic/issue-add-include-pattern-to-git-drs-pull.md new file mode 100644 index 00000000..4217ab3b --- /dev/null +++ b/attic/issue-add-include-pattern-to-git-drs-pull.md @@ -0,0 +1,51 @@ +# Add `-I "pattern"` include filter support to `git drs pull` + +## Summary +Add include-pattern filtering to `git drs pull`, similar to legacy `git lfs pull -I "pattern"` workflows. + +## Motivation +Current `git drs pull` behavior pulls based on repository resolution without a user-facing path pattern filter. Users migrating from `git lfs pull -I` expect selective hydration of files by glob/path. + +## Proposed UX +Support: + +```bash +git drs pull -I "results/*.txt" +git drs pull -I "*.bam" -I "data/**" +git drs pull --include "path/to/file" +``` + +Optional: +- `--exclude` parity (if desired in same change or follow-up) + +## Proposed behavior +1. Parse one or more include patterns (`-I`, `--include`). +2. Resolve candidate pointers as usual. +3. Filter by repo-relative path match before download. +4. Download only matched objects; skip others with clear logging. +5. If no pattern supplied, preserve current default behavior. 
+ +## Scope +- `cmd/pull/main.go` CLI flags and pull selection pipeline +- pointer/path inventory layer (where path<->OID candidates are produced) +- docs: `docs/commands.md`, `docs/getting-started.md`, `docs/troubleshooting.md` +- tests for include filtering semantics + +## Acceptance criteria +- [ ] `git drs pull -I ""` works for a single pattern. +- [ ] Repeated `-I` flags are supported. +- [ ] Include matching is against repo-relative paths. +- [ ] Default `git drs pull` behavior unchanged when no `-I` is passed. +- [ ] Help text documents pattern syntax and examples. +- [ ] Unit/integration tests cover positive and negative matches. + +## Testing matrix +- Single file exact path include. +- Wildcard include (`*.bam`, `data/**`). +- Multiple `-I` values. +- No matches (should no-op cleanly and return success unless policy says otherwise). +- Mixed matched/unmatched objects in same pull run. + +## Notes +This closes a usability gap for users transitioning from `git lfs` CLI habits to `git drs` commands while keeping pull behavior explicit and predictable. + diff --git a/docs/adding-s3-files.md b/docs/adding-s3-files.md index cb233826..f8a83a6b 100644 --- a/docs/adding-s3-files.md +++ b/docs/adding-s3-files.md @@ -1,6 +1,6 @@ # Adding Provider Objects with `git drs add-url` -`git drs add-url` prepares a Git LFS pointer plus local DRS metadata for an object that already exists in provider storage. +`git drs add-url` prepares a Git pointer plus local DRS metadata for an object that already exists in provider storage. Important behavior: @@ -26,7 +26,7 @@ The inspector also accepts other go-cloud styles (`gs://`, `azblob://`, `file:// If your remote org/project already has a bucket mapping, pass an object key relative to that configured bucket scope and set `--scheme`. 
```bash -git lfs track "data/*.bin" +git drs track "data/*.bin" git add .gitattributes git drs add-url path/to/object.bin data/from-bucket.bin \ @@ -54,7 +54,7 @@ git drs add-url s3://my-bucket/path/to/object.bin data/from-bucket.bin \ If you know the authoritative SHA256, pass `--sha256`. ```bash -git lfs track "data/*.bin" +git drs track "data/*.bin" git add .gitattributes git drs add-url path/to/object.bin data/from-bucket.bin \ @@ -66,25 +66,25 @@ git commit -m "add known-sha object" git drs push ``` -## Unknown SHA256 (experimental sentinel mode) +## Unknown SHA256 If SHA256 is unknown, omit `--sha256`. Behavior: 1. `add-url` performs object metadata lookup (HEAD/attributes). -2. Synthetic OID is derived from ETag (`sha256(etag)`). -3. A local sentinel object is written into `.git/lfs/objects/...`. -4. `git drs push` performs metadata-only registration. +2. A deterministic placeholder OID is derived from remote object metadata. +3. A pointer file and local DRS metadata are written. +4. `git drs push` performs metadata registration. ```bash -git lfs track "data/*.bin" +git drs track "data/*.bin" git add .gitattributes git drs add-url path/to/object.bin data/from-bucket.bin --scheme s3 git add data/from-bucket.bin -git commit -m "add unknown-sha object (sentinel mode)" +git commit -m "add unknown-sha object" git drs push ``` @@ -103,7 +103,7 @@ For e2e/dev harnesses, `TEST_BUCKET_*` variables are also supported by command-l ## Prerequisites -- File path must be LFS-tracked (via `.gitattributes`). +- File path must be tracked (via `.gitattributes`). - Remote configuration must point to the intended org/project scope. - The bucket credential and org/project storage scope must exist on drs-server, for example via `git drs bucket add`, then `git drs bucket add-organization` or `git drs bucket add-project --path s3://bucket/prefix`. @@ -118,13 +118,13 @@ Usually region/endpoint mismatch for S3-compatible storage. 
### `no local payload available; skipping upload and keeping metadata-only registration` -Expected for add-url pointer/sentinel flows where local payload bytes are intentionally absent. +Expected for add-url pointer/metadata-only flows where local payload bytes are intentionally absent. -### `file is not tracked by LFS` +### `file is not tracked` Track the path pattern and re-add: ```bash -git lfs track "data/*.bin" +git drs track "data/*.bin" git add .gitattributes ``` diff --git a/docs/architecture-drs-endpoints-and-transfer-concurrency.md b/docs/architecture-drs-endpoints-and-transfer-concurrency.md new file mode 100644 index 00000000..7f5d258a --- /dev/null +++ b/docs/architecture-drs-endpoints-and-transfer-concurrency.md @@ -0,0 +1,216 @@ +# Architecture: DRS Endpoint Flows, Transfer Concurrency, and `add-url`/`add-ref` + +This document explains three implementation areas in `git-drs`: + +1. How user-issued Git/Git-DRS commands map to GA4GH DRS endpoint calls. +2. How transfer concurrency works for upload and download. +3. How `add-url` and `add-ref` work, including when and where SHA existence is checked on the DRS server. + +--- + +## 1) Command to Endpoint Trace (User command -> Code path -> DRS API) + +## 1.1 High-level command routing + +- User-facing commands are registered in `cmd/root.go`. +- Relevant command entrypoints: + - `git drs push` -> `cmd/push/main.go` + - `git drs pull` -> `cmd/pull/main.go` + - `git drs ls-files` -> `cmd/lsfiles/main.go` + - `git drs query` -> `cmd/query/main.go` + - `git drs add-ref` -> `cmd/addref/add-ref.go` + - `git drs add-url` -> `cmd/addurl/service.go` + +`git-drs` obtains a remote-specific API client via `config.LoadConfig()` + `cfg.GetRemoteClient(...)` (see `internal/config/remote.go`). + +## 1.2 Endpoint mapping matrix + +The table below maps command behavior to DRS client calls and the corresponding DRS API intent. 
+ +| User command | Main call path | DRS client method(s) | DRS endpoint intent | +| --- | --- | --- | --- | +| `git drs query ` | `cmd/query/main.go` | `DRS().GetObject(drs_id)` | Get object by DRS ID (`/ga4gh/drs/v1/objects/{id}` style) | +| `git drs query --checksum ` | `cmd/query/main.go` -> `drsremote.ObjectsByHashForScope` | `DRS().BatchGetObjectsByHash([]checksum)` | Lookup objects by checksum (`/ga4gh/drs/v1/objects/checksum/{checksum}` style; asserted in tests) | +| `git drs ls-files --drs` | `cmd/lsfiles/main.go` | `DRS().BatchGetObjectsByHash([]checksum)` | Check DRS registration status for local tracked files | +| `git drs pull` | `cmd/pull/main.go` -> `drsremote.DownloadToCachePath` | `DRS().BatchGetObjectsByHash`, `DRS().GetAccessURL`; optional bulk via `DRSAPI().GetBulkAccessURLWithResponse` | Resolve missing OIDs to DRS records and access URLs, then hydrate content into the current checkout | +| `git drs push [remote]` | `cmd/push/main.go` -> `pushsync.BatchSyncForPush` | `DRS().BatchGetObjectsByHash`, `DRS().RegisterObjects`, `DRS().GetAccessURL` | Check checksum presence, register missing records, probe/downloadability before upload | +| `git drs add-ref ` | `cmd/addref/add-ref.go` | `DRS().GetObject(drs_uri)` | Resolve existing DRS object and write pointer | + +Notes: + +- `internal/drsremote/remote_test.go` explicitly verifies some concrete paths: + - checksum lookup path `/ga4gh/drs/v1/objects/checksum/{sha}` + - bulk access path `/ga4gh/drs/v1/objects/access` + - access URL path `/ga4gh/drs/v1/objects/{id}/access/{type}` +- `git drs pre-push-prepare` also calls a non-GA4GH metadata staging endpoint: + - `POST {remote}/info/drs/objects/metadata` (`cmd/prepush/main.go`) + - This is optional capability and not part of GA4GH DRS. 
+ +## 1.3 Trace from standard Git commands + +`git-drs` participates in both explicit `git drs ...` commands and standard Git workflows after `git drs init`: + +- `git drs init` installs hooks (`cmd/initialize/main.go`): + - pre-commit: `git drs precommit` +- During a normal `git push`, pre-push metadata can be staged via `/info/drs/objects/metadata` before transfer. +- The explicit `git drs push` command runs the register/upload workflow, then runs `git push --no-verify` by default (`cmd/push/main.go`). + +--- + +## 2) Transfer Concurrency Model (Upload and Download) + +### Concurrency mechanism: in-process goroutines only + +All transfer concurrency in `git-drs` is **in-process**, implemented with **Go goroutines and channels**. There is no use of OS-level multi-processing (no `fork`/`exec` of worker processes) for data movement. + +- Upload object fan-out uses `golang.org/x/sync/errgroup` — goroutines with a shared context and bounded by `errgroup.SetLimit(n)`. +- Download chunk parallelism uses the `sydownload` library, which internally uses goroutines to issue concurrent HTTP range requests. +- Sub-process calls (`exec.Command("git", ...)`) appear only for Git metadata operations (for example `git checkout`, `git ls-files`, `git check-attr`), never for data-transfer concurrency. + +## 2.1 Upload concurrency (`git drs push`) + +Upload tuning originates from Git config and is carried in `config.GitContext`: + +- `lfs.concurrenttransfers` -> `UploadConcurrency` (Git config key) +- `drs.multipart-threshold` (MB) -> `MultiPartThreshold` + +See `internal/config/remote.go` (`newGitContext`) and `cmd/initialize/main.go` (`initGitConfig`). + +### Upload execution strategy + +In `internal/pushsync/batch_sync.go`: + +1. Build upload candidates. +2. Split candidates into: + - small files: `size < MultiPartThreshold` + - large files: `size >= MultiPartThreshold` +3. 
Small files upload in parallel using `errgroup.WithContext` + `eg.SetLimit(UploadConcurrency)` + `eg.Go(goroutine)` — **in-process goroutine fan-out**. +4. Large files upload sequentially (single goroutine, no additional concurrency). + +Key implementation points: + +- `executeUploadPlan(...)` controls fan-out and limits. +- Actual upload call is `syupload.UploadObjectFile(...)` in `internal/pushsync/register.go`. +- `forceMultipart` is computed per file (`fileSize >= threshold`) and passed to upload. + +Operationally, this gives bounded goroutine parallelism for many small objects while reducing resource contention for very large uploads. + +## 2.2 Download concurrency (`git drs pull`) + +Download concurrency is set via `sydownload.DownloadOptions`: + +- `MultipartThreshold: 5 MiB` +- `Concurrency: 2` +- `ChunkSize: 64 MiB` + +These values are currently hardcoded in `internal/drsremote/remote.go` (`downloadResolved`) and apply to the pull/hydration workflow. + +### Intra-object chunk concurrency + +The `sydownload` library implements **goroutine-based HTTP range-request concurrency** within a single object download: + +- `resolvedSource.GetRangeReader(ctx, guid, offset, length)` issues an HTTP range (`Range: bytes=offset-end`) request. +- `sydownload.DownloadToPathWithOptions` coordinates up to `Concurrency` (2) goroutines issuing simultaneous range requests per object. +- This is purely in-process; no subprocess is spawned. + +### Object-level iteration in pull + +- In `cmd/pull/main.go`, missing OIDs are processed in a **sequential** `for` loop — one object at a time. +- Each object download can still be internally chunk-concurrent (up to `Concurrency=2` goroutines) via `sydownload`. +- So pull concurrency is **intra-object** (goroutine-based chunk/range concurrency), not broad object fan-out. +- Bulk metadata prefetch (DRS objects + bulk access URLs) is performed **before** the sequential download loop to amortize API round-trips. 
+ +## 2.3 Git metadata subprocesses + +Some flows still call Git commands directly for repository state inspection. + +- These are **subprocess** calls (`exec.Command("git", ...)`), not goroutine fan-out. +- Examples include tracked-file discovery and attribute inspection used by `ls-files` and `pull`. +- This is distinct from the goroutine-based `git drs push` upload fan-out and `sydownload` chunk concurrency. + +--- + +## 3) `add-url` and `add-ref`: Implementation and SHA existence checks + +## 3.1 `add-url` implementation + +Main logic lives in `cmd/addurl/service.go`. + +Workflow: + +1. Parse CLI input (`cmd/addurl/params.go`). +2. Resolve remote scope (org/project/bucket/prefix) (`cmd/addurl/scope.go`). +3. Resolve source object URL (full URL mode or key+`--scheme` mode). +4. Inspect object using cloud client (`sycloud.InspectObject`). +5. Ensure object identity: + - If `--sha256` provided: trust it as OID. + - Otherwise: derive a deterministic placeholder OID from remote object metadata. +6. Write pointer file to worktree. +7. Best-effort update of pre-commit cache (`updatePrecommitCache`). +8. Ensure file is tracked if needed. +9. Write/update local DRS metadata object under `.git/drs/lfs/objects` (`writeAddURLDrsObject`). + +### Does `add-url` query DRS server for SHA existence? + +Not immediately. `add-url` is local-preparation oriented: + +- It inspects provider object metadata. +- It writes local pointer + local DRS metadata. +- Server checksum existence is checked later during push (see section 3.3). + +## 3.2 `add-ref` implementation + +Main logic is in `cmd/addref/add-ref.go`. + +Workflow: + +1. Resolve remote client. +2. Call `DRS().GetObject(drs_uri)`. +3. Create parent directory if needed. +4. Write pointer from returned DRS object checksums (`lfs.CreateLfsPointer`). + +### Does `add-ref` query DRS server for SHA existence? + +It does not perform a checksum lookup endpoint call. 
It verifies existence by object ID (`GetObject`) and consumes checksum from that object payload. + +## 3.3 Where SHA existence check against DRS actually happens + +Checksum existence checks are performed during `git drs push` in `internal/pushsync/batch_sync.go`: + +1. `lookupMetadata()` iterates OIDs and calls: + - `drsremote.ObjectsByHash(...)` -> `DRS().BatchGetObjectsByHash(...)` +2. If no records exist for an OID, object candidate is included for bulk registration: + - `DRS().RegisterObjects(...)` +3. Upload decision is then based on registration status + downloadability probe. + +So for both `add-url` and `add-ref`, the checksum-existence gate is primarily deferred to push-time synchronization logic. + +--- + +## 4) End-to-end sequence summaries + +## 4.1 `git drs add-url ...` then `git drs push` + +1. `add-url`: local pointer + local DRS object prepared. +2. `push`: checksum lookup (`BatchGetObjectsByHash`). +3. Missing checksum -> `RegisterObjects`. +4. If payload required and available -> upload via syfon transfer. +5. Git refs pushed. + +## 4.2 `git drs add-ref ` then `git drs pull` + +1. `add-ref`: `GetObject(drs_id)` and write pointer. +2. `pull`: detect unresolved pointers. +3. For each OID, resolve scoped object by checksum and access URL. +4. Download to local object cache and hydrate the tracked file in the worktree. + +--- + +## 5) Practical implications for operators and developers + +- If you need immediate server-side checksum validation during `add-url`, that behavior does not exist today; validation happens at push time. +- All transfer concurrency is in-process (goroutines); no subprocess workers are used for data movement. +- Upload concurrency is configurable through Git config (`lfs.concurrenttransfers` key) and is implemented as a goroutine pool bounded by `errgroup.SetLimit`. 
+- Download concurrency is fixed (not configurable at runtime): `Concurrency=2` goroutines per object for HTTP range requests, currently hardcoded in `internal/drsremote/remote.go`. +- Object-level download iteration in `git drs pull` is sequential; only intra-object chunk downloads are concurrent. +- Git metadata discovery still uses subprocess calls, but those are repository inspection details, not data-transfer concurrency. + +--- diff --git a/docs/commands.md b/docs/commands.md index e1a7329f..43ac6d5a 100644 --- a/docs/commands.md +++ b/docs/commands.md @@ -1,617 +1,288 @@ # Commands Reference -Complete reference for Git DRS and related Git LFS commands. +Complete reference for the `git-drs` CLI as used on the `fix/cli` line. -Git DRS owns Git/DRS orchestration and local metadata. Direct provider access, signed URL behavior, and cloud inspection are client-side responsibilities reached through `syfon/client`. +Git DRS owns Git/DRS orchestration and local metadata. Provider access, signed URL behavior, and cloud inspection are handled through Syfon and client code behind these commands. -> **Navigation:** [Getting Started](getting-started.md) → **Commands Reference** → [Troubleshooting](troubleshooting.md) +> **Navigation:** [Getting Started](getting-started.md) -> **Commands Reference** -> [Troubleshooting](troubleshooting.md) -## Git DRS Commands +## Command Model -### `git drs install` - -Install global Git filter configuration for git-drs. This is equivalent in purpose to running `git-lfs install` for the git-drs filter. - -**Usage:** +`git-drs` is intentionally smaller now. 
-```bash -git drs install -``` +- Removed legacy commands: + - `git drs fetch` + - `git drs list` + - `git drs upload` + - `git drs download` +- `git drs pull` now mirrors `git lfs pull` semantics: + - it hydrates tracked pointer files in the current checkout + - it does not run `git pull` +- `git drs ls-files` is the `git lfs ls-files` analog: + - local-first inventory + - optional DRS registration checks +- `git drs remote add gen3` now takes scope as a positional `organization/project` -**What it does:** +## Core Setup -- Sets global Git config for `filter.drs.clean` -- Sets global Git config for `filter.drs.smudge` -- Sets global Git config for `filter.drs.process` -- Sets global Git config for `filter.drs.required` +### `git drs install` -**Resulting `~/.gitconfig` entries:** +Install global Git filter configuration for `git-drs`. -```ini -[filter "drs"] - clean = git-drs clean -- %f - smudge = git-drs smudge -- %f - process = git-drs filter - required = true +```bash +git drs install ``` -**When to run:** - -- **Once per machine/user** after installing `git-drs` -- Re-run any time you want to reset these global filter values +This sets the global `filter.drs.*` entries used by Git clean/smudge/filter operations. ### `git drs init` -Initialize Git DRS in a repository. Sets up Git DRS hooks and creates a `.git/drs/` directory that Git ignores automatically. - -**Usage:** +Initialize `git-drs` in the current repository. 
```bash git drs init [flags] ``` -**Options:** - -- `--transfers `: Number of concurrent transfers (default: 4) - -**Example:** - -```bash -git drs init -``` - -**What it does:** - -- Creates `.git/drs/` directory structure -- Configures Git/LFS settings for git-drs managed push/pull -- Installs Git hooks for DRS workflows - -**When to run:** - -- **Once** after cloning a Git repository -- **Once** after creating a new Git repository -- **Never** needed for subsequent work sessions - -**You do NOT need to run `git drs init` again:** - -- When starting a new work session -- After refreshing credentials -- After pulling new changes +Common flags: -**Note:** Run this before adding remotes. +- `--transfers <n>`: concurrent transfers +- `--upsert`: enable upsert behavior for push/register flows +- `--multipart-threshold <mb>`: multipart threshold in MB +- `--enable-data-client-logs`: enable lower-level client logging -### `git drs remote` +Run this once per repository. -Manage DRS remote server configurations. Git DRS supports multiple remotes for working with development, staging, and production servers. +## Remote Configuration -#### `git drs remote add gen3 ` +### `git drs remote add gen3 [remote-name] <organization/project>` -Add a Gen3 DRS server configuration. - -**Usage:** +Add or refresh a Gen3-backed Syfon remote. 
```bash -git drs remote add gen3 \ - --url \ - --cred \ - --organization \ - --project \ - [--bucket ] +git drs remote add gen3 [remote-name] <organization/project> [--cred <file> | --token <token>] ``` -**Options:** - -- `--url `: Gen3 server endpoint (required) -- `--cred `: Path to credentials JSON file (required) -- `--token `: Token for temporary access (alternative to --cred) -- `--organization `: Program/organization scope used for bucket mapping -- `--project `: Project ID (required) -- `--bucket `: Bucket name fallback when no org/project mapping is configured - -**Examples:** +Examples: ```bash -# Add production remote -git drs remote add gen3 production \ - --url https://calypr-public.ohsu.edu \ - --cred /path/to/credentials.json \ - --organization my-program \ - --project my-project - -# Add staging remote -git drs remote add gen3 staging \ - --url https://staging.calypr.ohsu.edu \ - --cred /path/to/staging-credentials.json \ - --organization staging-program \ - --project staging-project +git drs remote add gen3 production HTAN_INT/BForePC --cred /path/to/credentials.json +git drs remote add gen3 staging HTAN_INT/BForePC --token "$GEN3_TOKEN" ``` -**Note:** The first remote you add automatically becomes the default remote. -**Important:** A bucket mapping for the target `organization/project` must already exist, typically created once by a steward/admin with `git drs bucket add`, then `git drs bucket add-organization` or `git drs bucket add-project --path :///`. Without that mapping, push/pull operations will fail. +Notes: -#### `git drs remote list` +- `remote-name` is optional; if omitted, the default remote name is used. +- scope is always one positional argument: `organization/project` +- `--cred` imports a Gen3 credential file +- `--token` uses a temporary bearer token +- bucket resolution is scope-driven; users do not need to provide `--bucket` +- endpoint resolution comes from the credential/token path; users do not need to provide `--url` -List all configured DRS remotes. 
+Prerequisite: -**Usage:** - -```bash -git drs remote list -``` - -**Example Output:** - -``` -* production gen3 https://calypr-public.ohsu.edu - staging gen3 https://staging.calypr.ohsu.edu - development gen3 https://dev.calypr.ohsu.edu -``` +- the target `organization/project` must already be mapped to a bucket on the server +- if no local repo mapping exists, `git-drs` can resolve the visible bucket from the server -The `*` indicates the default remote used by all commands unless specified otherwise. +### `git drs remote list` -#### `git drs remote set ` - -Set the default DRS remote for all operations. - -**Usage:** - -```bash -git drs remote set -``` - -**Examples:** +List configured DRS remotes. ```bash -# Switch to staging for testing -git drs remote set staging - -# Switch back to production -git drs remote set production - -# Verify change git drs remote list ``` -### `git drs fetch [remote-name]` - -Fetch DRS object metadata from remote server. Downloads metadata only, not actual files. +### `git drs remote set <remote-name>` -**Usage:** +Set the default DRS remote. ```bash -# Fetch from default remote -git drs fetch - -# Fetch from specific remote -git drs fetch staging -git drs fetch production -``` - -**Note:** `fetch` and `push` are commonly used together for cross-remote workflows. See `git drs push` below. - -**What it does:** - -- Identifies remote and project from configuration -- Transfers all DRS records for a given project from the server to the local `.git/drs/lfs/objects/` directory - -### `git drs add-url [path]` - -Prepare a pointer plus local DRS metadata for an object that already exists in provider storage. 
- -**Usage:** - -```bash -# Preferred: object key resolved against configured bucket scope -git drs add-url path/to/object.bin data/from-bucket.bin --scheme s3 - -# Compatibility: explicit provider URL -git drs add-url s3://my-bucket/path/to/object.bin data/from-bucket.bin -``` - -**Options:** - -- `--scheme `: Required for object-key mode because local bucket mappings persist bucket/prefix, not provider scheme -- `--sha256 `: Expected SHA256 checksum when known - -**What it does:** - -- Resolves the effective org/project bucket scope for the current remote -- Inspects the provider object through client-owned cloud code -- Writes a Git LFS pointer into the worktree -- Stores local DRS metadata for later registration during `git drs push` - -### `git drs push [remote-name]` - -Push local DRS objects to server. Uploads new files and registers metadata. - -**Usage:** - -```bash -# Push to default remote -git drs push - -# Push to specific remote -git drs push staging -git drs push production -``` - -**What it does:** - -- Checks local `.git/drs/lfs/objects/` for DRS metadata -- For each object, uploads file to bucket if file exists locally -- If file doesn't exist locally (metadata only), registers metadata without upload -- This enables cross-remote promotion workflows - -**Cross-Remote Promotion:** - -Transfer DRS records from one remote to another (eg staging to production) without re-uploading files: - -```bash -# Fetch metadata from staging -git drs fetch staging - -# Push metadata to production (no file upload since files don't exist locally) -git drs push production -``` - -This is useful when files are already in the production bucket with matching SHA256 hashes. It can also be used to reupload files given that the files are pulled to the repo first. - -**Note:** `fetch` and `push` are commonly used together. `fetch` pulls metadata from one remote, `push` registers it to another. - -### `git drs query` - -Query a DRS object by its DRS ID or SHA256 checksum. 
- -**Usage:** - -```bash -# Query by DRS ID (default behavior) -git drs query - -# Query by SHA256 checksum -git drs query --checksum +git drs remote set production ``` -**Options:** +## Bucket Mapping -- `--checksum`, `-c`: Treat the argument as a SHA256 checksum instead of a DRS ID. -- `--pretty`, `-p`: Output indented JSON for easier reading. -- `--remote`, `-r`: Target a specific remote (default: default_remote). +These commands are typically steward/admin setup, not day-to-day end-user commands. -**Examples:** +### `git drs bucket add` -```bash -# Query by checksum and pretty-print the result -git drs query --checksum 9f2c2db77f0a3e2b47e4b44b8ce8d4c8c3c4c0b5f4c5a2d2f9b1d0bfb0a1c2d3 --pretty +Declare bucket credentials for a remote. -# Query by DRS ID against a specific remote -git drs query did:example:12345 --remote staging -``` - -### `git drs add-url` - -Prepare a file reference via cloud object URL for DRS registration. +### `git drs bucket add-organization` -**Usage:** +Map an organization to a bucket path. ```bash -# Stage local pointer + DRS metadata -git drs add-url [path] [--sha256 ] -# Register/push prepared records -git drs push +git drs bucket add-organization production \ + --organization HTAN_INT \ + --path s3://cbds/htan-int ``` -**Examples:** +### `git drs bucket add-project` -```bash -# Known SHA path -git drs add-url s3://bucket/path/file.bin data/file.bin --sha256 - -# Unknown SHA path (experimental sentinel mode) -git drs add-url s3://bucket/path/file.bin data/file.bin -``` - -**Options:** - -- `--sha256 `: Optional SHA256 hash of the source object. - If omitted, add-url uses experimental ETag-derived sentinel mode and registers a synthetic OID. - -**Notes:** - -- `add-url` no longer accepts per-command AWS credential flags. -- S3 connection hints are resolved from environment/runtime config when needed (for example `AWS_REGION`, `AWS_ENDPOINT_URL`, `AWS_ACCESS_KEY_ID`, `AWS_SECRET_ACCESS_KEY`). 
-- Registration happens on `git drs push`, not at `add-url` time. - -### `git drs version` - -Display Git DRS version information. +Map an organization/project to a bucket path. ```bash -git drs version +git drs bucket add-project production \ + --organization HTAN_INT \ + --project BForePC \ + --path s3://cbds/htan-int/bforepc ``` -### `git drs track [pattern ...]` - -Manage Git LFS tracking patterns from Git DRS. - -**View tracked patterns:** +## File Tracking and Hydration -```bash -git drs track -``` +### `git drs track` -**Track one or more patterns:** +Track files or patterns with Git-compatible pointer behavior. ```bash git drs track "*.bam" -git drs track "*.bam" "data/**" +git drs track "data/**" ``` -**Options:** - -- `--verbose`: Show detailed Git LFS output -- `--dry-run`: Show what would change without writing `.gitattributes` +Stage `.gitattributes` after changing tracked patterns. -### `git drs untrack [pattern ...]` +### `git drs untrack` -Remove one or more Git LFS tracking patterns. +Stop tracking patterns. ```bash git drs untrack "*.bam" -git drs untrack "*.bam" "data/**" -``` - -**Options:** - -- `--verbose`: Show detailed Git LFS output -- `--dry-run`: Show what would change without writing `.gitattributes` - -### Internal Commands - -These commands are called automatically by Git hooks: - -- `git drs precommit`: Process staged files during commit -- `git drs pre-push-prepare`: Stage DRS metadata before push -- `git lfs pre-push`: Optional Git LFS compatibility push flow (invoked by the pre-push hook when enabled) - -## Git LFS Commands - -### `git lfs track` - -Manage file tracking patterns. - -**View Tracked Patterns:** - -```bash -git lfs track -``` - -**Track New Pattern:** - -```bash -git lfs track "*.bam" -git lfs track "data/**" -git lfs track "specific-file.txt" -``` - -**Untrack Pattern:** - -```bash -git lfs untrack "*.bam" ``` -### `git lfs ls-files` - -List LFS-tracked files in the repository. 
+### `git drs ls-files [pathspec...]` -**All Files:** +List tracked LFS-style files in the current checkout. ```bash -git lfs ls-files +git drs ls-files +git drs ls-files data/** +git drs ls-files -I "*.bam" +git drs ls-files --drs +git drs ls-files -l --drs +git drs ls-files -n results/** ``` -**Specific Pattern:** - -```bash -git lfs ls-files -I "*.bam" -git lfs ls-files -I "data/**" -``` +Important behavior: -**Output Format:** +- default mode is local-first and cheap +- `*` means localized/hydrated in the worktree +- `-` means the worktree still contains a pointer +- `--drs` adds DRS registration checks -- `*` prefix: File is localized (downloaded) -- `-` prefix: File is not localized -- No prefix: File status unknown +Common flags: -### `git lfs pull` +- `-I, --include `: include filter; may be repeated +- `-l, --long`: long output +- `-n, --name-only`: path-only output +- `--json`: structured output +- `--drs`: check DRS registration status -Download LFS-tracked files. +### `git drs pull` -**All Files:** +Hydrate tracked pointer files in the current checkout. ```bash -git lfs pull +git drs pull +git drs pull -I "*.bam" +git drs pull -I "data/**" -I "results/*.txt" +git drs pull --dry-run -I "results/**" ``` -**Specific Files:** +Important behavior: -```bash -git lfs pull -I "*.bam" -git lfs pull -I "data/important.txt" -git lfs pull -I "results/**" -``` - -**Multiple Patterns:** +- `git drs pull` does not run `git pull` +- it only hydrates tracked pointer files already present in the current checkout +- include matching is against repo-relative paths -```bash -git lfs pull -I "*.bam" -I "*.vcf" -``` +Common flags: -### `git lfs install` +- `-I, --include `: include filter; may be repeated +- `--dry-run`: show what would be hydrated without downloading -Configure Git LFS for the system or repository. 
+## Object Registration and Push -**System-wide:** - -```bash -git lfs install --skip-smudge -``` +### `git drs push [remote-name]` -**Repository-only:** +Register and upload tracked objects, then rely on normal Git push for refs. ```bash -git lfs install --local --skip-smudge +git drs push +git drs push production ``` -The `--skip-smudge` option prevents automatic downloading of all LFS files during clone/checkout. - -## Standard Git Commands - -Git DRS integrates with standard Git commands: - -### `git add` - -Stage files for commit. LFS-tracked files are automatically processed. +What it does: -```bash -git add myfile.bam -git add data/ -git add . -``` +- resolves local pointer/object metadata +- uploads local bytes when needed +- registers object metadata with the target Syfon instance -### `git commit` +### `git drs add-url [path]` -Commit changes. Git DRS pre-commit hook runs automatically. +Create a pointer and local metadata for an object that already exists in provider storage. ```bash -git commit -m "Add new data files" +git drs add-url path/to/object.bin data/from-bucket.bin --scheme s3 +git drs add-url s3://my-bucket/path/to/object.bin data/from-bucket.bin +git drs add-url s3://my-bucket/path/to/object.bin data/from-bucket.bin --sha256 ``` -### `git push` +Notes: -Push commits to remote. Git DRS automatically uploads new files to DRS server. +- object-key mode resolves against the configured bucket scope +- explicit provider URL mode remains supported +- `--scheme` is required for object-key mode -```bash -git push -git push origin main -``` - -### `git clone` +### `git drs add-ref ` -Clone repository. Use with Git DRS initialization: +Add a local pointer file for an existing DRS object. ```bash -git clone -cd -git drs init -git drs remote add gen3 production --cred /path/to/credentials.json --url ... --organization ... --project ... 
+git drs add-ref drs://example/object-id data/object.bin ``` -## Workflow Examples +### `git drs query <drs-id>` -### Complete File Addition Workflow +Query a DRS object by ID. ```bash -# 1. Ensure file type is tracked -git lfs track "*.bam" -git add .gitattributes - -# 2. Add your file -git add mydata.bam - -# 3. Verify tracking -git lfs ls-files -I "mydata.bam" - -# 4. Commit (creates DRS record) -git commit -m "Add analysis results" - -# 5. Push (uploads to default DRS server) -git push +git drs query drs://example/object-id ``` -### Selective File Download - -```bash -# Check what's available -git lfs ls-files +## Metadata Copy -# Download specific files -git lfs pull -I "results/*.txt" -git lfs pull -I "important-dataset.bam" - -# Verify download -git lfs ls-files -I "results/*.txt" -``` +### `git drs copy-records [source-remote] <target-remote> <organization/project>` -### Repository Setup from Scratch +Copy Syfon metadata records from one remote to another for a single project scope. ```bash -# 1. Create and clone repo -git clone -cd - -# 2. Initialize Git DRS -git drs init - -# 3. Add DRS remote -git drs remote add gen3 production \ - --url https://calypr-public.ohsu.edu \ - --cred /path/to/credentials.json \ - --organization my-program \ - --project my-project - -# 4. Set up file tracking -git lfs track "*.bam" -git lfs track "*.vcf.gz" -git lfs track "data/**" -git add .gitattributes -git commit -m "Configure LFS tracking" -git push - -# 5. Add data files -git add data/sample1.bam -git commit -m "Add sample data" -git push +git drs copy-records prod HTAN_INT/BForePC +git drs copy-records dev prod HTAN_INT/BForePC ``` -### Cross-Remote Promotion Workflow +Behavior: -```bash -# 1. 
Add multiple remotes -git drs remote add gen3 staging \ - --url https://staging.calypr.ohsu.edu \ - --cred /path/to/staging-credentials.json \ - --organization staging-program \ - --project staging-project - -git drs remote add gen3 production \ - --url https://calypr-public.ohsu.edu \ - --cred /path/to/prod-credentials.json \ - --organization prod-program \ - --project prod-project - -# 2. Fetch metadata from staging -git drs fetch staging - -# 3. Push metadata to production (no re-upload) -git drs push production -``` +- with one remote arg: + - source defaults to the configured default remote + - arg is treated as the target remote +- with two remote args: + - first is source + - second is target +- copies metadata only, not object bytes -## Environment Variables +Merge behavior for existing target records: -Git DRS respects these environment variables: +- match by DID +- union `controlled_access` +- union `access_methods` +- preserve existing target metadata otherwise -- `AWS_ACCESS_KEY_ID`: AWS access key (for S3 operations) -- `AWS_SECRET_ACCESS_KEY`: AWS secret key (for S3 operations) +## Removed Legacy Commands -## Help and Documentation +These commands are gone from the cleaned CLI: -Use `--help` with any command for detailed usage: +- `git drs fetch` +- `git drs list` +- `git drs upload` +- `git drs download` -```bash -git-drs --help -git-drs init --help -git-drs add-url --help -git lfs --help -git lfs track --help -``` +If older docs or notes mention them, treat those references as stale. 
diff --git a/docs/developer-guide.md b/docs/developer-guide.md index e9751130..df82388c 100644 --- a/docs/developer-guide.md +++ b/docs/developer-guide.md @@ -10,7 +10,7 @@ Git DRS integrates with Git through several mechanisms: **Pre-commit Hook**: `git drs precommit` - Triggered automatically before each commit -- Processes all staged LFS files +- Processes all staged files - Creates DRS records for new files - Only processes files that don't already exist on the DRS server - Prepares metadata for later upload during push @@ -34,7 +34,7 @@ Git DRS integrates with Git through several mechanisms: - Stores in .git/drs/ directory 4. Developer: git push 5. Git Hook: git drs pre-push-prepare - - Stages pending metadata for LFS verify + - Stages pending metadata for DRS verify 6. Git DRS: - `git drs push` runs register/upload directly - `git drs pull` runs download directly @@ -44,8 +44,8 @@ Git DRS integrates with Git through several mechanisms: Git DRS no longer uses a custom transfer agent. -- Upload path (primary): `git drs push` discovers local LFS pointers, bulk-registers missing objects, checks validity, and uploads missing bits. -- Download path (primary): `git drs pull` resolves object records and downloads into local LFS object storage. +- Upload path (primary): `git drs push` discovers local pointers, bulk-registers missing objects, checks validity, and uploads missing bits. +- Download path (primary): `git drs pull` resolves object records and downloads into local object storage. 
## Repository Structure @@ -73,12 +73,12 @@ drs/ # DRS object utilities ├── object.go # DRS object structures └── util.go # Utility functions -lfs/ # Git LFS integration -└── lfs.go # LFS pointer/discovery helpers +lfs/ # Pointer utilities +└── lfs.go # Pointer/discovery helpers utils/ # Shared utilities ├── common.go # Common functions -├── lfs-track.go # LFS tracking utilities +├── lfs-track.go # Tracking utilities └── util.go # General utilities ``` @@ -97,14 +97,13 @@ servers: ### DRS Object Management -Objects are stored in `.git/drs/lfs/objects/` during pre-commit and referenced during push/pull workflows. +Objects are stored in `.git/drs/objects/` during pre-commit and referenced during push/pull workflows. ## Development Setup ### Prerequisites -- Go 1.24+ -- Git LFS installed +- Go 1.26.2+ - Access to a DRS server for testing ### Building from Source @@ -152,7 +151,7 @@ export PATH=$PATH:$(pwd) ```bash # Test specific functionality -go test ./utils -run TestLFSTrack +go test ./utils -run TestTrack ``` ### Integration Tests diff --git a/docs/drs-registerfile-upsert.md b/docs/drs-registerfile-upsert.md index 72f162b5..03d1cff4 100644 --- a/docs/drs-registerfile-upsert.md +++ b/docs/drs-registerfile-upsert.md @@ -1,4 +1,4 @@ -# ADR 0001: Configure RegisterFile upsert/bucket checks via git LFS config +# ADR 0001: Configure RegisterFile upsert/bucket checks via git config ## Status Accepted @@ -8,7 +8,7 @@ The DRS `RegisterFile` flow needs toggles for: - whether to upsert DRS records (create when no matching project record exists, or replace by deleting and re-registering when a ma - whether to check bucket existence before uploading (Unimplemented, currently always checks and skips upload if already present) -These toggles must be controlled per-repository using git LFS configuration (`git config` entries under `drs.*`). This keeps behavior in repo-local configuration and avoids coupling to remote YAML configuration. 
+These toggles must be controlled per-repository using git config (`git config` entries under `drs.*`). This keeps behavior in repo-local configuration and avoids coupling to remote YAML configuration. ## Decision Read `drs.upsert` from git config during DRS client initialization. Missing values default to `false`. Invalid values fail initialization with a clear error. diff --git a/docs/e2e-modes-and-local-setup.md b/docs/e2e-modes-and-local-setup.md index 20820cf2..eb0de218 100644 --- a/docs/e2e-modes-and-local-setup.md +++ b/docs/e2e-modes-and-local-setup.md @@ -81,7 +81,7 @@ TEST_STRICT_CLEANUP=true - HTTP basic auth via: - `TEST_LOCAL_USERNAME` + `TEST_LOCAL_PASSWORD`, or - `TEST_ADMIN_AUTH_HEADER="Authorization: Basic "` -- `git drs remote add local ... --username ... --password ...` stores local basic auth in repo config for helper/LFS flows. +- `git drs remote add local ... --username ... --password ...` stores local basic auth in repo config for credential-helper flows. ## How wrapper scripts map to the main suites @@ -138,7 +138,7 @@ What it covers: - `git drs push` metadata register + upload - multipart/resume behavior -- `git drs pull` and `git lfs pull` compatibility checks +- `git drs pull` download and compatibility checks - cleanup by DID resolution ## Local add-url E2E: runbook @@ -152,7 +152,7 @@ bash tests/e2e-local-addurl.sh What it covers: - known-sha add-url path (`--sha256 `) -- unknown-sha add-url path (sentinel pointer OID) +- unknown-sha add-url path (placeholder pointer OID) - push/register + pull hydration checks ## Monorepo E2E (remote and local) diff --git a/docs/getting-started.md b/docs/getting-started.md index 92a2b636..cfef54f3 100644 --- a/docs/getting-started.md +++ b/docs/getting-started.md @@ -1,25 +1,32 @@ # Getting Started -This guide walks you through setting up Git DRS and performing common workflows. +This guide walks through the current `git-drs` workflow on the cleaned CLI path. 
-> **Navigation:** [Installation](installation.md) → **Getting Started** → [Commands Reference](commands.md) → [Troubleshooting](troubleshooting.md) +> **Navigation:** [Installation](installation.md) -> **Getting Started** -> [Commands Reference](commands.md) -> [Troubleshooting](troubleshooting.md) -## Repository Initialization +## What `git-drs` Does -Every Git repository using Git DRS requires configuration, whether you're creating a new repo or cloning an existing one. +`git-drs` manages: -### Cloning Existing Repository (Gen3) +- Git-compatible pointer files +- local DRS metadata +- remote Syfon/Gen3 configuration +- pointer hydration and object registration workflows -1. **Clone the Repository** +It no longer tries to be a mixed bag of Git, Git LFS, and DRS transport wrappers. + +## Cloning an Existing Repository + +1. Clone the repository: ```bash git clone .git cd ``` -2. **Configure SSH** (if using SSH URLs) +2. If you use SSH remotes, make sure your SSH setup is already working for that host. - If using SSH URLs like `git@github.com:user/repo.git`, add to `~/.ssh/config`: + A typical keepalive configuration looks like: ``` Host github.com @@ -27,347 +34,361 @@ Every Git repository using Git DRS requires configuration, whether you're creati ServerAliveInterval 30 ``` -3. **Get Credentials** - - - Log in to your data commons (e.g., https://calypr-public.ohsu.edu/) - - Profile → Create API Key → Download JSON - - **Note**: Credentials expire after 30 days - -4. **Initialize Repository** +3. Initialize `git-drs` in the repo: ```bash git drs init ``` -5. **Verify Configuration** +4. Hydrate tracked files if needed: ```bash - git drs remote list + git drs pull ``` - Output: - ``` - * production gen3 https://calypr-public.ohsu.edu/ - ``` +This is the normal onboarding flow for an existing repo. `git drs pull` hydrates pointer files already present in the checkout. It does not replace `git pull`. - The `*` indicates this is the default remote. 
+## One-Time Machine Setup -### New Repository Setup (Gen3) +Install `git-drs` and the global Git filter configuration: -1. **Create and Clone Repository** +```bash +git drs install +``` - ```bash - git clone .git - cd - ``` +## One-Time Repository Setup + +After cloning or creating a repository: + +```bash +git drs init +``` + +That sets up repository-local `git-drs` state and hooks. + +## Add a Gen3 Remote + +The current shape is: + +```bash +git drs remote add gen3 [remote-name] <organization/project> [--cred <file> | --token <token>] +``` + +Example: + +```bash +git drs remote add gen3 production HTAN_INT/BForePC --cred /path/to/credentials.json +``` -2. **Configure SSH** (if needed - same as above) +Notes: -3. **Get Credentials** (same as above) +- scope is one positional argument: `organization/project` +- users do not provide `--bucket` +- users do not provide `--url` +- bucket resolution is scope-based and server-backed -4. **Get Project Details** +Verify: - Contact your data coordinator for: - - DRS server URL - - Organization name - - Project ID - - Bucket name - - Confirmation that bucket mapping exists for your organization/project +```bash +git drs remote list +``` -5. **Initialize Git DRS** +## New Repository Setup + +For a new repository or a repository that has not yet been configured with `git-drs`: + +1. Initialize the repository: ```bash git drs init ``` -6. **Add Remote Configuration** +2. Add the target remote: ```bash - git drs remote add gen3 production \ - --cred /path/to/credentials.json \ - --url https://calypr-public.ohsu.edu \ - --project my-project \ - --bucket my-bucket + git drs remote add gen3 production HTAN_INT/BForePC --cred /path/to/credentials.json ``` - **Note:** Since this is your first remote, it automatically becomes the default. No need to run `git drs remote set`. - -7. **Verify Configuration** +3. 
Verify the configuration: ```bash git drs remote list ``` - Output: - ``` - * production gen3 https://calypr-public.ohsu.edu - ``` +## Steward/Admin Prerequisite - **Important:** `git drs remote add` alone is not enough. Push/pull requires an existing bucket mapping for your `organization/project` (usually provisioned once by a steward/admin). +Push and pull depend on server-side bucket mapping for the target scope. -**Managing Additional Remotes** - -You can add more remotes later for multi-environment workflows (development, staging, production): +That usually means a steward/admin has already done something like: ```bash -# Add staging remote -git drs remote add gen3 staging \ - --cred /path/to/staging-credentials.json \ - --url https://staging.calypr.ohsu.edu \ - --project staging-project \ - --bucket staging-bucket - -# View all remotes -git drs remote list +git drs bucket add production \ + --bucket cbds \ + --region us-east-1 \ + --access-key "$AWS_ACCESS_KEY_ID" \ + --secret-key "$AWS_SECRET_ACCESS_KEY" + +git drs bucket add-organization production \ + --organization HTAN_INT \ + --path s3://cbds/htan-int + +git drs bucket add-project production \ + --organization HTAN_INT \ + --project BForePC \ + --path s3://cbds/htan-int/bforepc +``` -# Switch default remote -git drs remote set staging +End users generally should not need to know the bucket name. -# Or use specific remote for one command -git drs push production -git drs fetch staging -``` +## Credentials -## File Tracking +For Gen3-backed deployments: -Git DRS can use Git LFS-compatible pointers and local object storage. You must explicitly track file patterns before adding LFS-managed files. +- obtain a credential JSON or token from the target data commons +- the common path is: log in -> profile -> create API key -> download JSON +- refresh it when it expires +- re-run `git drs remote add gen3 ... 
--cred ...` when you need to refresh the stored profile -### View Current Tracking +Example: ```bash -git lfs track +git drs remote add gen3 production HTAN_INT/BForePC --cred /path/to/new-credentials.json ``` -### Track Files +## Managing Additional Remotes -**Single File** +You can add multiple remotes for multi-environment workflows. ```bash -git lfs track path/to/specific-file.txt -git add .gitattributes +git drs remote add gen3 staging HTAN_INT/BForePC --cred /path/to/staging-credentials.json +git drs remote list +git drs remote set staging ``` -**File Pattern** +Or target a non-default remote for a single command: ```bash -git lfs track "*.bam" -git add .gitattributes +git drs push production +git drs copy-records staging production HTAN_INT/BForePC ``` -**Directory** +## Track Files + +Track file types or paths you want managed by `git-drs`: ```bash -git lfs track "data/**" +git drs track "*.bam" git add .gitattributes ``` -### Untrack Files +You can also track explicit paths or path globs: ```bash -# View tracked patterns -git lfs track - -# Remove pattern -git lfs untrack "*.bam" - -# Stage changes +git drs track "data/**" git add .gitattributes ``` -## Basic Workflows +View current tracking: + +```bash +git drs track +``` -### Adding and Pushing Files +Stop tracking patterns: ```bash -# Track file type (if not already tracked) -git lfs track "*.bam" +git drs untrack "*.bam" git add .gitattributes +``` -# Add your file -git add myfile.bam - -# Verify LFS is tracking it -git lfs ls-files +## Add, Commit, and Push -# Commit and push -git commit -m "Add new data file" +```bash +git add sample.bam +git commit -m "Add sample" git push ``` -> **Note**: Git DRS automatically creates DRS records during commit and uploads files to the default remote during push. +`git-drs` handles pointer/object registration behavior around the Git workflow. 
-### Downloading Files +## Inspect Tracked Files -**Single File** +Use `ls-files` as the local inventory command: ```bash -git lfs pull -I path/to/file.bam +git drs ls-files +git drs ls-files -l +git drs ls-files --drs +git drs ls-files -I "*.bam" ``` -**Pattern** +Interpretation: -```bash -git lfs pull -I "*.bam" -``` - -**All Files** +- `*` means localized/hydrated in the worktree +- `-` means the worktree still contains a pointer -```bash -git lfs pull -``` +## Hydrate Files -**Directory** +Use `git drs pull` only for hydration. ```bash -git lfs pull -I "data/**" +git drs pull +git drs pull -I "*.bam" +git drs pull -I "results/**" -I "*.txt" ``` -### Checking File Status - -```bash -# List all LFS-tracked files -git lfs ls-files +Important: -# Check specific pattern -git lfs ls-files -I "*.bam" +- `git drs pull` does not run `git pull` +- run plain `git pull` yourself when you want new commits/trees +- then run `git drs pull` if you need to hydrate pointer files in the checkout -# View localization status -# (-) = not localized, (*) = localized -git lfs ls-files -``` +## Add Existing Bucket Objects -## Working with Cloud Object URLs - -You can add references to existing bucket objects without copying them: +If the object already exists in provider storage, use `add-url`: ```bash # Track the file pattern first -git lfs track "myfile.txt" +git drs track "myfile.txt" git add .gitattributes # Add object reference (known sha256 path) -git drs add-url s3://bucket/path/to/file \ +git drs add-url s3://bucket/path/to/file myfile.txt \ --sha256 -# Or use unknown-sha (experimental sentinel mode) -git drs add-url s3://bucket/path/to/file +# Or use unknown-sha +git drs add-url s3://bucket/path/to/file myfile.txt # Commit and push +git add myfile.txt git commit -m "Add S3 file reference" git push ``` -See [Cloud URL Integration Guide](adding-s3-files.md) for detailed examples. 
- -## Configuration Management - -### View Configuration +Scoped bucket-key mode also works: ```bash -git drs remote list +git drs add-url path/to/object.bin data/from-bucket.bin --scheme s3 +git commit -m "Add bucket-backed object reference" +git push ``` -### Update Configuration +Explicit provider URL mode also works: ```bash -# Refresh credentials - re-add remote with new credentials -git drs remote add gen3 production \ - --cred /path/to/new-credentials.json \ - --url https://calypr-public.ohsu.edu \ - --project my-project \ - --bucket my-bucket - -# Switch default remote -git drs remote set staging +git drs add-url s3://my-bucket/path/to/object.bin data/from-bucket.bin ``` -### View Logs +## Session Workflow -- Logs location: `.git/drs/` directory +> **Note:** You do not need to run `git drs init` again. Initialization is a one-time setup per local repository clone. -## Command Summary +For a normal work session: -| Action | Commands | -| ------------------ | ------------------------------------------- | -| **Initialize** | `git drs init` | -| **Add remote** | `git drs remote add gen3 --cred...` | -| **View remotes** | `git drs remote list` | -| **Set default** | `git drs remote set ` | -| **Track files** | `git lfs track "pattern"` | -| **Check tracked** | `git lfs ls-files` | -| **Add files** | `git add file.ext` | -| **Commit** | `git commit -m "message"` | -| **Push** | `git push` | -| **Download** | `git lfs pull -I "pattern"` | +1. Refresh credentials if needed -## Session Workflow + ```bash + git drs remote add gen3 production HTAN_INT/BForePC --cred /path/to/new-credentials.json + ``` + +2. Update Git history if needed + + ```bash + git pull + ``` -> **Note**: You do NOT need to run `git drs init` again. Initialization is a one-time setup per Git repository clone. +3. Hydrate tracked files if needed -For each work session: + ```bash + git drs pull + ``` -1. **Refresh credentials** (if expired - credentials expire after 30 days) +4. 
Work with files normally ```bash - git drs remote add gen3 production \ - --cred /path/to/new-credentials.json \ - --url https://calypr-public.ohsu.edu \ - --project my-project \ - --bucket my-bucket + git add ... + git commit -m "..." + git push ``` -2. **Work with files** (track, add, commit, push) +## Configuration Management + +View current remote configuration: + +```bash +git drs remote list +``` + +Refresh or update credentials by re-adding the remote: + +```bash +git drs remote add gen3 production HTAN_INT/BForePC --cred /path/to/new-credentials.json +``` ## Local DRS Server Setup -Use this flow when developing against a local `drs-server` instead of hosted Gen3. +Use this flow when developing against a local Syfon/DRS server instead of a hosted Gen3 deployment. -1. **Initialize repo** +1. Initialize the repo: ```bash git drs init ``` -2. **Add local remote** +2. Add the local remote: ```bash - git drs remote add local origin http://localhost:8080 \ - --organization calypr \ - --project end_to_end_test \ - --bucket cbds \ - --username drs-user \ - --password drs-pass + git drs remote add local origin http://localhost:8080 ``` - If your local server has no basic auth, omit `--username/--password`. + If your local server requires basic auth, include the local auth flags supported by that command. -3. **Track and push** +3. Track and push: ```bash - git lfs track "*.bin" + git drs track "*.bin" git add .gitattributes data/example.bin git commit -m "Add local DRS test file" git drs push ``` -4. **Verify pull** +4. Verify hydration: ```bash git drs pull - # or the Git LFS compatibility path - git lfs pull ``` -For complete local/remote mode behavior and e2e runbooks, see [E2E Modes + Local Setup](e2e-modes-and-local-setup.md). +For full local/remote runbooks, see [E2E Modes + Local Setup](e2e-modes-and-local-setup.md). -3. 
**Download files as needed** +## Copy Metadata Between Remotes - ```bash - git lfs pull -I "required-files*" - ``` +Use `copy-records` to copy Syfon metadata records between remotes for a single scope: -## Next Steps +```bash +git drs copy-records dev prod HTAN_INT/BForePC +``` + +Or let the default remote be the source: + +```bash +git drs copy-records prod HTAN_INT/BForePC +``` + +This copies metadata only. It does not copy object bytes between buckets. + +## Common Flow Summary + +```bash +git drs install +git drs init +git drs remote add gen3 production HTAN_INT/BForePC --cred /path/to/credentials.json +git drs track "*.bam" +git add .gitattributes +git add sample.bam +git commit -m "Add sample" +git push +git drs ls-files +git drs pull -I "*.bam" +``` -- [Commands Reference](commands.md) - Complete command documentation -- [Troubleshooting](troubleshooting.md) - Common issues and solutions -- [Developer Guide](developer-guide.md) - Advanced usage and internals +For command details, see [commands.md](commands.md). diff --git a/docs/installation.md b/docs/installation.md index 723303d7..0f111459 100644 --- a/docs/installation.md +++ b/docs/installation.md @@ -4,21 +4,7 @@ This guide covers installation of Git DRS across different environments and targ ## Prerequisites -All installations require [Git LFS](https://git-lfs.com/) to be installed first: - -```bash -# macOS -brew install git-lfs - -# Linux (download binary) -wget https://github.com/git-lfs/git-lfs/releases/download/v3.7.0/git-lfs-linux-amd64-v3.7.0.tar.gz -tar -xvf git-lfs-linux-amd64-v3.7.0.tar.gz -export PREFIX=$HOME -./git-lfs-v3.7.0/install.sh - -# Configure LFS -git lfs install --skip-smudge -``` +Git DRS requires Git to be installed. Install Git DRS using the steps below, then run `git drs install` to configure Git filters. ## Local Installation (Gen3 Server) @@ -33,9 +19,9 @@ git lfs install --skip-smudge 2. 
**Update PATH** ```bash - # Add to ~/.bash_profile or ~/.zshrc + # Add to your shell startup file (for example ~/.zshrc, ~/.bashrc, or ~/.profile) export PATH="$PATH:$HOME/.local/bin" - source ~/.bash_profile # or source ~/.zshrc + source ~/.zshrc # or source your shell startup file ``` 3. **Verify Installation** @@ -61,27 +47,7 @@ git lfs install --skip-smudge ### Steps -1. **Install Git LFS on HPC** - ```bash - # Download and install Git LFS - wget https://github.com/git-lfs/git-lfs/releases/download/v3.7.1/git-lfs-linux-amd64-v3.7.1.tar.gz - tar -xvf git-lfs-linux-amd64-v3.7.1.tar.gz - export PREFIX=$HOME - ./git-lfs-3.7.1/install.sh - - # Make permanent - echo 'export PATH="$HOME/bin:$PATH"' >> ~/.bash_profile - source ~/.bash_profile - - # Configure - git lfs install --skip-smudge - - # Cleanup - rm git-lfs-linux-amd64-v3.7.0.tar.gz - rm -r git-lfs-3.7.0/ - ``` - -2. **Configure Git/SSH (if needed)** +1. **Configure Git/SSH (if needed)** ```bash # Generate SSH key ssh-keygen -t ed25519 -C "your_email@example.com" @@ -94,7 +60,7 @@ git lfs install --skip-smudge cat ~/.ssh/id_ed25519.pub ``` -3. **Install Git DRS** +2. **Install Git DRS** ```bash /bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/calypr/git-drs/refs/heads/main/install.sh)" @@ -103,7 +69,7 @@ git lfs install --skip-smudge source ~/.bash_profile ``` -4. **Verify Installation** +3. 
**Verify Installation** ```bash git-drs version git drs install @@ -133,8 +99,6 @@ After installation, verify your setup: # Check Git DRS version git-drs version -# Check Git LFS -git lfs version # View configured remotes (after setup) git drs remote list diff --git a/docs/precommit-cache-addurl-prepush.md b/docs/precommit-cache-addurl-prepush.md index fe77ec08..924050c3 100644 --- a/docs/precommit-cache-addurl-prepush.md +++ b/docs/precommit-cache-addurl-prepush.md @@ -5,10 +5,10 @@ Proposed ## Context `cmd/precommit` now maintains a local cache under `.git/drs/pre-commit/v1` that records: -- path → LFS OID in `paths/.json` +- path → OID in `paths/.json` - OID → paths + URL hint in `oids/.json` -`precommit_cache` provides read helpers for this cache and is intended to let the pre-push hook validate against authoritative sources while using cached hints to avoid re-scanning worktrees. `cmd/addurl` currently writes the LFS pointer and DRS files but does not update the pre-commit cache. `cmd/prepush` currently computes updates without consulting the cache. This means: +`precommit_cache` provides read helpers for this cache and is intended to let the pre-push hook validate against authoritative sources while using cached hints to avoid re-scanning worktrees. `cmd/addurl` currently writes the pointer and DRS files but does not update the pre-commit cache. `cmd/prepush` currently computes updates without consulting the cache. This means: - `add-url`-created objects are invisible to cache-aware workflows unless a pre-commit hook runs later. - `pre-push` cannot leverage cached OID/path/url hints or detect mismatches early. @@ -16,7 +16,7 @@ Proposed Update `cmd/addurl` and `cmd/prepush` to integrate with the pre-commit cache, while preserving the current fallback behavior when the cache is missing or stale. ### Changes required in `cmd/addurl` -1. **Write cache entries after LFS pointer creation** +1. 
**Write cache entries after pointer creation** - Create/update the path entry (`paths/.json`) using the same encoding as `cmd/precommit` (`base64.RawURLEncoding` of the repo-relative path). - Create/update the OID entry (`oids/.json`) using the same OID hashing (`sha256(oid string)`), ensuring the `paths` list includes the new path. 2. **Persist the external URL hint** @@ -30,13 +30,13 @@ Update `cmd/addurl` and `cmd/prepush` to integrate with the pre-commit cache, wh ### Changes required in `cmd/prepush` 1. **Use `precommit_cache` to seed work** - - Open the cache early and, when available, use it to map pushed paths/branches to their LFS OIDs and cached URL hints. + - Open the cache early and, when available, use it to map pushed paths/branches to their OIDs and cached URL hints. - If the cache is missing or entries are stale, fall back to current discovery/update logic. 2. **Validate cached URL hints** - When `updateDrsObjects` resolves authoritative URLs, compare them to cached hints via `precommit_cache.CheckExternalURLMismatch`. - Warn (or fail, depending on policy) on mismatches to surface potentially stale or incorrect metadata before pushing. 3. **Prefer cache data for DRS updates** - - Use cached OIDs/paths to reduce redundant file scans for LFS pointers. + - Use cached OIDs/paths to reduce redundant file scans for pointers. - Carry cached `external_url` into DRS metadata when authoritative sources are unavailable, while still treating it as non-authoritative. ## Consequences diff --git a/docs/precommit.md b/docs/precommit.md index 91682b6c..89bb318d 100644 --- a/docs/precommit.md +++ b/docs/precommit.md @@ -16,11 +16,11 @@ This repository uses a **local, non-versioned cache** under: .git/drs/pre-commit/ ``` -to support fast, offline-friendly workflows for **Git LFS–tracked files**. +to support fast, offline-friendly workflows for **Git DRS–tracked files**. 
The cache is: -* **LFS-only** +* **pointer-only** * **non-authoritative** * **local to a working copy** * **never committed to Git** @@ -44,7 +44,7 @@ Its sole purpose is to bridge the gap between: * Updates `.git/drs/pre-commit` cache * Never performs network I/O * Never queries DRS or DRS -* Ignores all non-LFS files +* Ignores all non-tracked files ### `precommit_cache` (helper library) @@ -61,7 +61,7 @@ Its sole purpose is to bridge the gap between: ## Cache Scope (Important) -Only files whose **staged content** is a valid Git LFS pointer are in scope: +Only files whose **staged content** is a valid Git DRS pointer are in scope: ``` version https://git-lfs.github.com/spec/v1 @@ -112,7 +112,7 @@ The cache models **three non-authoritative relationships**: 3. **OID → External URL (hint)** All are **hints only**. -The authoritative source of truth lives on the server (DRS / DRS). +The authoritative source of truth lives on the server (DRS). --- @@ -122,7 +122,7 @@ The authoritative source of truth lives on the server (DRS / DRS). `v1/paths/.json` -Represents the **currently staged** LFS object at a given working-tree path. +Represents the **currently staged** DRS object at a given working-tree path. ```json { @@ -135,7 +135,7 @@ Represents the **currently staged** LFS object at a given working-tree path. Notes: * `path` is repo-relative -* `lfs_oid` comes from the staged LFS pointer +* `lfs_oid` comes from the staged DRS pointer * Updated on: * add @@ -149,7 +149,7 @@ Notes: `v1/oids/.json` -Represents **advisory information** about an LFS object. +Represents **advisory information** about a DRS object. ```json { @@ -187,16 +187,16 @@ Used to record deleted paths for potential GC or debugging. 
## Pre-Commit Behavior (What Happens Automatically) -### Add / Modify LFS File +### Add / Modify Tracked File -* Extracts LFS OID from staged pointer +* Extracts OID from staged pointer * Updates: * `paths/.json` * `oids/.json` * Preserves any existing `external_url` hint -### Rename / Move LFS File +### Rename / Move Tracked File * Moves `paths/.json` → `paths/.json` * Updates OID entry paths list @@ -291,7 +291,7 @@ url, ok, err := cache.LookupExternalURLByOID(oid) * Hint only * May be stale or missing -* Must be validated against DRS / DRS +* Must be validated against DRS --- @@ -322,7 +322,7 @@ Used by pre-push to compare local hints with server truth. ## Intended Pre-Push Usage Pattern 1. Determine commit range from pre-push stdin -2. Enumerate **LFS OIDs** referenced by pushed commits +2. Enumerate **OIDs** referenced by pushed commits 3. For each OID: * Optionally read local hints from `precommit_cache` @@ -377,9 +377,7 @@ sequenceDiagram participant PC as pre-commit hook (cmd/precommit) participant Cache as .git/drs/pre-commit (local cache) participant PP as pre-push hook - participant LFS as git-lfs participant IDX as DRS (authoritative) - participant DRS as DRS (authoritative) Dev->>Git: git add Dev->>Git: git commit @@ -387,10 +385,10 @@ sequenceDiagram Git->>PC: invoke pre-commit (no stdin) PC->>Git: git diff --cached --name-status -M PC->>Git: git show : (staged pointer) - alt staged file is LFS pointer + alt staged file is DRS pointer PC->>Cache: write paths/.json (path -> oid) PC->>Cache: upsert oids/.json (oid -> paths[] + external_url hint) - else non-LFS file + else non-tracked file PC-->>Git: ignore (out of scope) end PC-->>Git: exit 0 (commit proceeds) @@ -398,13 +396,10 @@ sequenceDiagram Dev->>Git: git push Git->>PP: invoke pre-push (stdin: ref updates) PP->>PP: compute commit ranges from stdin - PP->>LFS: enumerate LFS OIDs referenced by pushed commits + PP->>IDX: enumerate OIDs referenced by pushed commits loop for each required OID 
PP->>Cache: lookup external_url hint (optional) PP->>IDX: resolve by sha256 (OID) -> object_id + urls[] - opt DRS resolution - PP->>DRS: resolve by object_id -> access_methods[] - end alt OID not resolvable PP-->>Git: fail push (exit non-zero) else resolvable @@ -418,7 +413,7 @@ sequenceDiagram ## Summary -> `.git/drs/pre-commit` is a **local, LFS-only, non-authoritative cache** that tracks +> `.git/drs/pre-commit` is a **local, pointer-only, non-authoritative cache** that tracks > **path ↔ OID ↔ external URL hints** to support rename, undo, and offline workflows. > > `precommit_cache` provides safe, read-only access to this cache for enforcement at pre-push. @@ -426,5 +421,5 @@ sequenceDiagram If you want, I can also: * add **inline Go doc comments** suitable for `pkg.go.dev` -* generate a **sequence diagram** (commit → cache → push → DRS/DRS) +* generate a **sequence diagram** (commit → cache → push → DRS) * or write a **pre-push reference implementation** that uses these helpers end-to-end diff --git a/docs/troubleshooting.md b/docs/troubleshooting.md index 01ff0f3b..c0b77c7b 100644 --- a/docs/troubleshooting.md +++ b/docs/troubleshooting.md @@ -1,247 +1,166 @@ # Troubleshooting -Common issues and solutions when working with Git DRS. +Common issues and solutions for the cleaned `git-drs` CLI. -> **Navigation:** [Getting Started](getting-started.md) → [Commands Reference](commands.md) → **Troubleshooting** +> **Navigation:** [Getting Started](getting-started.md) -> [Commands Reference](commands.md) -> **Troubleshooting** ## Frequently Asked Questions ### Do I need to run `git drs init` each time? -**No.** `git drs init` is set up once per Git repo. +No. -**Run it once when:** +`git drs init` is repository setup. Run it once per local clone unless you are deliberately reinitializing the repo. 
-- You first clone a repository -- You create a new repository +Run it when: -**Don't run it again:** +- you first clone a repository and need local `git-drs` setup +- you create a new repository and want to enable `git-drs` -- At the start of each work session -- After refreshing credentials -- After pulling updates +Do not run it every session: -**What it does:** +- not at the start of normal daily work +- not after refreshing credentials +- not after `git pull` -- Sets up `.drs/` directory structure -- Configures Git LFS hooks -- Updates `.gitignore` +What it changes: -These changes persist in your local repository. For subsequent sessions, you only need to refresh credentials if they've expired (every 30 days). +- creates `.git/drs/` repository-local state +- sets up `git-drs` repository configuration and hooks +- prepares the repo for managed pointer/register/hydration behavior -### What to do if you run `git drs init` again +### What if I run `git drs init` again? -Running `git drs init` a second time is usually harmless but unnecessary. It may re-create the `\`.git/drs/\`` directory, re-install hooks, or modify `\`.gitattributes\`` and `\`.gitignore\``. If you ran it accidentally, follow these steps: +Usually nothing catastrophic, but it is unnecessary. -1. Inspect what changed - - `git status` - - `git diff` (or `git diff -- ` for a specific file, e.g. `\`.gitignore\``) +If you did it accidentally: -2. If changes are fine - - No action required; commit the intended changes or leave them uncommitted. +1. inspect what changed -3. If you want to discard uncommitted changes - - Restore specific files: `git restore --staged \`.gitignore\`` && `git restore \`.gitignore\`` - - Restore all working-tree changes: `git restore .` - - Or (destructive) reset everything: `git reset --hard` \- use with caution. - -4. 
If you already committed the unintended changes - - Undo the last commit but keep changes staged: `git reset --soft HEAD~1` - - Or remove the commit and working changes: `git reset --hard HEAD~1` \- use with caution. - - See the "Undo Last Commit" section above for alternatives. - -5. Hooks or credentials issues - - If hooks were replaced or credentials need refresh, run `git drs init` with the correct `--cred`/`--profile` options, or re-add the remote with `git drs remote add`. - -Summary: inspect with `git status`/`git diff`, then either accept, manually edit, or revert the changes using standard `git restore` / `git reset` commands. - - -## When to Use Which Tool - -Understanding when to use Git, Git LFS, or Git DRS commands: - -### Git DRS Commands - -**Use for**: Repository and remote configuration - -- `git drs init` - Initialize Git LFS hooks -- `git drs remote add` - Configure DRS server connections -- `git drs remote list` - View configured remotes -- `git drs add-url` - Add cloud object references - -**When**: - -- Setting up a new repository -- Adding/managing DRS remotes -- Refreshing expired credentials -- Adding external file references - -### Git LFS Commands - -**Use for**: File tracking and management - -- `git lfs track` - Define which files to track -- `git lfs ls-files` - See tracked files and status -- `git lfs pull` - Download specific files -- `git lfs untrack` - Stop tracking file patterns - -**When**: - -- Managing which files are stored externally -- Downloading specific files -- Checking file localization status + ```bash + git status + git diff + ``` -### Standard Git Commands +2. if the changes are harmless, leave them alone or commit what you intended -**Use for**: Version control operations +3. 
if you want to discard the uncommitted changes, use normal Git restore/reset flow carefully -- `git add` - Stage files for commit -- `git commit` - Create commits -- `git push` - Upload commits and trigger file uploads -- `git pull` - Get latest commits +4. if hooks or repo-local state were repaired intentionally, keep the changes -**When**: +The right default is: inspect first, then decide whether anything actually needs to be reverted. -- Normal development workflow -- Git DRS runs automatically in the background +### What does `git drs init` actually change? -## Common Error Messages +It prepares repository-local `git-drs` state: -## Git LFS-Oriented Troubleshooting Guide (Commit/Push/Clone/Pull) +- `.git/drs/` metadata/state +- hook/config wiring for `git-drs` workflows +- the repo-local setup needed for pointer/register/hydration behavior -The checks below prioritize Git LFS guidance and documentation because Git DRS relies on Git LFS for large-file handling. If you run into issues, start with the Git LFS troubleshooting docs and logs, then move to Git DRS-specific configuration checks. Primary references: the Git LFS troubleshooting guide and the Git LFS documentation for installation, tracking, and environment variables: +Those changes persist in the clone. They are not something you redo per session. -- Git LFS troubleshooting: https://github.com/git-lfs/git-lfs/wiki/Troubleshooting -- Git LFS docs: https://github.com/git-lfs/git-lfs/tree/main/docs +## When to Use Which Tool -### Failed Commit (Git LFS hooks or pointer issues) +### Use `git-drs` for -1. **Confirm Git LFS is installed and hooks are active** - - Run: `git lfs version` and `git lfs env` - - If `git lfs env` reports `git lfs install` is needed, run `git lfs install` to re-install hooks. - - This is the most common cause of commits failing to convert large files into LFS pointers. 
+- repository-local `git-drs` setup +- remote configuration +- tracking rules +- object hydration +- DRS/Syfon metadata-oriented workflows -2. **Check whether the file was tracked before the commit** - - Run: `git lfs track` and confirm the file pattern is listed. - - If not tracked, add it (`git lfs track "*.bam"`) and stage `.gitattributes`. +Examples: -3. **Verify the file is staged as an LFS pointer** - - Run: `git lfs ls-files` to confirm the file is listed. - - If a large file was added to Git history directly, remove it from the index and re-add it after tracking. +- `git drs init` +- `git drs remote add gen3 ...` +- `git drs track` +- `git drs ls-files` +- `git drs pull` +- `git drs add-url` +- `git drs copy-records` -4. **Review Git LFS logs for hook errors** - - Run: `git lfs logs last` to inspect hook failures. - - Common errors include missing filters or file locking issues. +### Use normal Git for -### Failed Push (LFS uploads, auth, or bandwidth issues) +- branch and commit movement +- staging and committing +- ordinary ref push/pull operations -1. **Check Git LFS authentication and endpoint configuration** - - Run: `git lfs env` and confirm `Endpoint` values are correct. - - If tokens are expired, refresh credentials and re-run the push. +Examples: -2. **Retry with LFS verbose logging** - - Run: `GIT_TRACE=1 GIT_CURL_VERBOSE=1 git lfs push --all` - - Use this output to identify `403/401` auth issues or proxy errors. +- `git add` +- `git commit` +- `git push` +- `git pull` -3. **Confirm the LFS objects exist locally** - - Run: `git lfs ls-files` and ensure your large files are listed. - - Missing objects indicate a tracking or filter issue before the push. +## First Principles -4. **Validate the remote supports Git LFS** - - Run: `git lfs env` to confirm the remote endpoint. - - Some Git servers require explicit LFS enablement or URL configuration. 
+Before debugging behavior, keep the command split straight: -### Failed Clone (LFS objects missing or blocked) +- `git pull` + - updates commits, branches, and checkout state +- `git drs pull` + - hydrates tracked pointer files already present in the current checkout +- `git drs ls-files` + - shows tracked files and localization state -1. **Confirm LFS objects were fetched** - - After clone, run: `git lfs pull` to fetch large files. - - If the repo only has LFS pointers, you will see pointer files until you pull. +If you blur those together, the failure modes get confusing. -2. **Check LFS smudge/clean filters** - - Run: `git lfs env` and verify `git-lfs` filters are enabled. - - If not, run `git lfs install` and re-run `git lfs pull`. +## Common Error Patterns -3. **Validate access and authentication** - - `git lfs env` will show which endpoint is used; 401/403 errors point to invalid credentials. +### Failed commit or pointer conversion issues -4. **Inspect LFS logs for download errors** - - Run: `git lfs logs last` for the most recent transfer errors. +Check these in order: -### Failed Pull (LFS fetch/checkout issues) +1. confirm the file pattern was tracked before the add/commit flow -1. **Run `git lfs pull` separately** - - This isolates LFS download errors from Git merge errors. + ```bash + git drs track + ``` -2. **Check LFS file locking or concurrent transfers** - - If your Git host uses LFS file locking, verify the file is not locked by another user. +2. confirm `.gitattributes` was staged after changing tracking rules -3. **Review filters and tracking** - - Run: `git lfs track` to ensure required patterns are present. - - If a file type is newly tracked, re-run `git add .gitattributes` and commit. + ```bash + git status + ``` -4. **Check for storage or bandwidth limits** - - Some Git LFS hosts enforce quotas; errors will show in `git lfs logs last`. +3. 
confirm the file shows up in the tracked inventory -### Authentication Errors + ```bash + git drs ls-files + ``` -**Error**: `Upload error: 403 Forbidden` or `401 Unauthorized` +4. inspect `.git/drs/` logs if the hook path failed -**Cause**: Expired or invalid credentials +### Failed push: upload, register, or auth -**Solution**: +Check: ```bash -# Download new credentials from your data commons -# Then refresh them by re-adding the remote -git drs remote add gen3 production \ - --cred /path/to/new-credentials.json \ - --url https://calypr-public.ohsu.edu \ - --project my-project \ - --bucket my-bucket +git drs remote list +git drs ls-files --drs ``` -**Prevention**: +Then retry with higher Git/HTTP verbosity if needed: -- Credentials expire after 30 days -- Set a reminder to refresh them regularly - ---- - -**Error**: `Upload error: 503 Service Unavailable` - -**Cause**: DRS server is temporarily unavailable or credentials expired - -**Solutions**: - -1. Wait and retry the operation -2. Refresh credentials: - ```bash - git drs remote add gen3 production \ - --cred /path/to/credentials.json \ - --url https://calypr-public.ohsu.edu \ - --project my-project \ - --bucket my-bucket - ``` -3. If persistent, download new credentials from the data commons - -### Network Errors - -**Error**: `net/http: TLS handshake timeout` - -**Cause**: Network connectivity issues +```bash +GIT_TRACE=1 GIT_CURL_VERBOSE=1 git push +``` -**Solution**: +### Failed clone or fresh checkout still has pointer files -- Simply retry the command -- These are usually temporary network issues +That usually just means hydration has not happened yet. 
---- +Run: -**Error**: Git push timeout during large file uploads +```bash +git drs init +git drs pull +``` -**Cause**: Long-running operations timing out +### Network timeout during push or download -**Solution**: Add to `~/.ssh/config`: +If you use SSH remotes, keepalives help: ``` Host github.com @@ -249,297 +168,195 @@ Host github.com ServerAliveInterval 30 ``` -### File Tracking Issues +## Common Problems -**Error**: Files not being tracked by LFS +### `git drs pull` did not update my branch -**Symptoms**: +That is expected. -- Large files committed directly to Git -- `git lfs ls-files` doesn't show your files +`git drs pull` no longer runs `git pull`. -**Solution**: +Use: ```bash -# Check what's currently tracked -git lfs track - -# Track your file type -git lfs track "*.bam" -git add .gitattributes - -# Remove from Git and re-add -git rm --cached large-file.bam -git add large-file.bam -git commit -m "Track large file with LFS" +git pull +git drs pull ``` ---- +### `git drs ls-files` does not show my file -**Error**: `[404] Object does not exist on the server` +Check these in order: -**Symptoms**: - -- After clone, git pull fails - -**Solution**: +1. is the path actually tracked? ```bash -# confirm repo has complete configuration -git drs list-config - -# init your git drs project -git drs init --cred /path/to/cred/file --profile - -# attempt git pull again -git lfs pull -I path/to/file +git drs track ``` ---- - -**Error**: `git lfs ls-files` shows files but they won't download - -**Cause**: Files may not have been properly uploaded or DRS records missing - -**Solution**: +2. did you stage `.gitattributes` after adding the pattern? 
```bash -# Check repository status -git drs list-config - -# Try pulling with verbose output -git lfs pull -I "problematic-file*" --verbose - -# Check logs -cat .git/drs/*.log +git add .gitattributes ``` -### Configuration Issues - -**Error**: `git drs remote list` shows empty or incomplete configuration - -**Cause**: Repository not properly initialized or no remotes configured - -**Solution**: +3. is the file part of the current checkout? ```bash -# Initialize repository if needed -git drs init +git ls-files -- path/to/file +``` -# Add Gen3 remote -git drs remote add gen3 production \ - --cred /path/to/credentials.json \ - --url https://calypr-public.ohsu.edu \ - --project my-project \ - --bucket my-bucket +4. inspect the local view: -# Verify configuration -git drs remote list +```bash +git drs ls-files -l ``` ---- +### `git drs pull` does nothing -**Error**: Configuration exists but commands fail +That usually means one of these: -**Cause**: Mismatched configuration between global and local settings, or expired credentials +- the current checkout already has localized bytes +- there are no tracked pointer files matching your include filters +- the file is not tracked by `git-drs` -**Solution**: +Check: ```bash -# Check configuration -git drs remote list - -# Refresh credentials by re-adding the remote -git drs remote add gen3 production \ - --cred /path/to/new-credentials.json \ - --url https://calypr-public.ohsu.edu \ - --project my-project \ - --bucket my-bucket +git drs ls-files +git drs ls-files -I "*.bam" +git drs pull --dry-run -I "*.bam" ``` -### Remote Configuration Issues - -**Error**: `no default remote configured` +### `git drs pull` still leaves pointer files -**Cause**: Repository initialized but no remotes added yet - -**Solution**: +Check DRS registration status: ```bash -# Add your first remote (automatically becomes default) -git drs remote add gen3 production \ - --cred /path/to/credentials.json \ - --url https://calypr-public.ohsu.edu \ - 
--project my-project \ - --bucket my-bucket +git drs ls-files --drs ``` ---- - -**Error**: `default remote 'X' not found` +If the object is not registered or not resolvable from the configured remote, hydration cannot succeed. -**Cause**: Default remote was deleted or configuration is corrupted - -**Solution**: +Also confirm the remote configuration: ```bash -# List available remotes git drs remote list - -# Set a different remote as default -git drs remote set staging - -# Or add a new remote -git drs remote add gen3 production \ - --cred /path/to/credentials.json \ - --url https://calypr-public.ohsu.edu \ - --project my-project \ - --bucket my-bucket ``` ---- +If needed, inspect the detailed logs: -**Error**: Commands using wrong remote +```bash +ls -la .git/drs/ +``` -**Cause**: Default remote is not the one you want to use +### `git drs remote add gen3` fails on bucket mapping -**Solution**: +Current shape: ```bash -# Check current default -git drs remote list - -# Option 1: Change default remote -git drs remote set production - -# Option 2: Specify remote for single command -git drs push staging -git drs fetch production +git drs remote add gen3 [remote-name] <organization/project> [--cred <file> | --token <token>] ``` -## Undoing Changes - -### Untrack LFS Files +If this fails, the likely cause is missing bucket mapping for that scope. -If you accidentally tracked the wrong files: +That mapping is usually steward/admin setup, not something the end user invents ad hoc.
-```bash -# See current tracking -git lfs track +### My credentials expired -# Remove incorrect pattern -git lfs untrack "wrong-dir/**" +Refresh by re-adding the remote with a new credential file or token: -# Add correct pattern -git lfs track "correct-dir/**" - -# Stage the changes -git add .gitattributes -git commit -m "Fix LFS tracking patterns" +```bash +git drs remote add gen3 production HTAN_INT/BForePC --cred /path/to/new-credentials.json ``` -### Undo Git Add +### `git push` fails with upload or register errors -Remove files from staging area: +Check: ```bash -# Check what's staged -git status - -# Unstage specific files -git restore --staged file1.bam file2.bam - -# Unstage all files -git restore --staged . +git drs remote list +git drs ls-files --drs ``` -### Undo Last Commit +Typical root causes: -To retry a commit with different files: +- expired credentials +- wrong remote selected +- missing server-side bucket mapping +- object registration or upload permissions missing for the target scope -```bash -# Undo last commit, keep files in working directory -git reset --soft HEAD~1 +### Files are not being tracked -# Or undo and unstage files -git reset HEAD~1 +Symptoms: -# Or completely undo commit and changes (BE CAREFUL!) -git reset --hard HEAD~1 -``` - -### Remove Files from LFS History +- large files were committed directly to Git +- `git drs ls-files` does not show the file -If you committed large files directly to Git by mistake: +Recovery: ```bash -# Remove from Git history (use carefully!) 
-git filter-branch --tree-filter 'rm -f large-file.dat' HEAD - -# Then track properly with LFS -git lfs track "*.dat" +git drs track "*.bam" git add .gitattributes -git add large-file.dat -git commit -m "Track large file with LFS" +git rm --cached large-file.bam +git add large-file.bam +git commit -m "Track large file with git-drs" ``` -## Diagnostic Commands - -### Check System Status +### Cloned repo only has pointer files -```bash -# Git DRS version and help -git-drs version -git-drs --help +That is normal. -# Configuration -git drs remote list +After cloning: -# Repository status -git status -git lfs ls-files +```bash +git drs init +git drs pull ``` -### View Logs +Or hydrate only what you need: ```bash -# Git DRS logs (in repository) -ls -la .git/drs/ -cat .git/drs/*.log +git drs pull -I "*.bam" ``` -### Test Connectivity +## Debugging Workflow -```bash -# Test basic Git operations -git lfs pull --dry-run +When behavior is unclear, use this sequence: -# Test DRS configuration +```bash git drs remote list +git drs track +git drs ls-files -l +git drs ls-files --drs +git drs pull --dry-run ``` -## Getting Help +That usually tells you whether the problem is: -### Log Analysis +- tracking +- hydration state +- DRS registration +- remote configuration -When reporting issues, include: +## Log and State Inspection -```bash -# System information -git-drs version -git lfs version -git --version +Useful checks: -# Configuration +```bash git drs remote list - -# Recent logs -tail -50 .git/drs/*.log +git drs track +git drs ls-files -l +git drs ls-files --drs +ls -la .git/drs/ ``` -## Prevention Best Practices +## Removed Commands + +If you see old notes mentioning these, ignore them: + +- `git drs fetch` +- `git drs list` +- `git drs upload` +- `git drs download` -1. **Test in small batches** - Don't commit hundreds of files at once -2. **Verify tracking** - Always check `git lfs ls-files` after adding files -3. 
**Use .gitignore** - Prevent accidental commits of temporary files -4. **Monitor repository size** - Keep an eye on `.git` directory size +Those were removed from the cleaned CLI surface. From 002ab10c389398dc81454012b33d9f2e0baa02c0 Mon Sep 17 00:00:00 2001 From: Matthew Peterkort <33436238+matthewpeterkort@users.noreply.github.com> Date: Mon, 11 May 2026 15:29:20 -0700 Subject: [PATCH 4/7] Mirror git-drs CLI to be like LFS (#232) * update git-drs cli to be LFS like * remove sentinel, address issues * add design doc * add new command for migrating records, fix go.mod * Fix ls-files inventory for hydrated tracked LFS files * fix cli * fix tests * integrate git drs init into remote add * address a bajillion outstanding issues * fix tests * fix test * bump deps * fix progress bar output styling * add business logic for handling git-drs rm command * fix tests * bump deps * clean up * fix tests * fix go mod * fix tests --- .github/workflows/build.yaml | 1 - .github/workflows/pr-checks.yaml | 1 - .github/workflows/syfon-backend-e2e.yaml | 1 - .github/workflows/test.yaml | 1 - README.md | 14 +- cmd/addurl/main_test.go | 36 +- cmd/addurl/service.go | 34 +- cmd/copyrecords/main.go | 350 ++++++++++++++ cmd/copyrecords/main_test.go | 139 ++++++ cmd/download/main.go | 100 ---- cmd/fetch/fetch_test.go | 37 -- cmd/fetch/main.go | 66 --- cmd/initialize/main.go | 152 ++++-- cmd/initialize/main_test.go | 24 + cmd/list/main.go | 59 --- cmd/lsfiles/main.go | 199 ++++++-- cmd/lsfiles/main_test.go | 217 +++++++-- cmd/ping/main.go | 137 ++++++ cmd/ping/main_test.go | 132 +++++ cmd/precommit/main.go | 115 ++++- cmd/precommit/main_test.go | 79 +++ cmd/prepush/io_helpers.go | 27 ++ cmd/prepush/main.go | 139 +----- cmd/prepush/main_test.go | 13 +- cmd/prepush/pushed_refs.go | 76 +++ cmd/pull/main.go | 142 +++--- cmd/pull/pull_test.go | 85 +++- cmd/push/main.go | 40 +- cmd/push/main_test.go | 58 +++ cmd/push/progress.go | 189 ++++++++ cmd/push/progress_test.go | 73 +++ 
cmd/remote/add/add_test.go | 91 +++- cmd/remote/add/gen3.go | 185 ++++++-- cmd/remote/add/init.go | 15 +- cmd/remote/add/local.go | 88 +++- cmd/remote/add/local_test.go | 75 ++- cmd/remote/list.go | 22 +- cmd/remote/remote_test.go | 108 +++++ cmd/remote/remove.go | 59 +++ cmd/remote/root.go | 1 + cmd/rm/main.go | 58 +++ cmd/rm/main_test.go | 54 +++ cmd/root.go | 17 +- cmd/upload/main.go | 99 ---- ...-drs-endpoints-and-transfer-concurrency.md | 2 +- docs/TODO/git-drs-rm-semantics.md | 155 ++++++ docs/adding-s3-files.md | 2 +- docs/commands.md | 233 ++++++++- docs/drs-uri-canonical-identity.md | 212 +++++++++ docs/ga4gh-drs-scalability-gaps.md | 195 ++++++++ docs/getting-started.md | 89 ++-- docs/troubleshooting.md | 59 ++- go.mod | 12 +- go.sum | 16 +- internal/config/config.go | 59 +++ internal/config/config_test.go | 30 ++ internal/config/remote.go | 14 +- internal/drsdelete/git_history.go | 98 ++++ internal/drsdelete/live_refs.go | 42 ++ internal/drsdelete/reconcile.go | 115 +++++ internal/drsdelete/reconcile_test.go | 167 +++++++ internal/drsdelete/test_helpers_test.go | 103 ++++ internal/drsmap/drs_map.go | 36 +- internal/drsmap/drs_map_test.go | 58 +-- internal/drsobject/object.go | 26 +- internal/drsremote/remote.go | 72 +++ internal/drsremote/remote_test.go | 112 ++++- internal/drsremote/scope.go | 13 +- internal/gitrepo/bucket_scope.go | 5 +- internal/lfs/inventory.go | 265 +++++++++-- internal/lfs/inventory_test.go | 120 ++++- internal/lfs/sentinel.go | 65 --- internal/lfs/sentinel_test.go | 66 --- internal/pathspec/match.go | 62 +++ internal/pathspec/match_test.go | 22 + internal/precommit_cache/helpers.go | 134 ++++++ internal/pushsync/batch_sync.go | 155 +++++- internal/pushsync/batch_sync_test.go | 449 ++++++++++++++++++ internal/pushsync/progress.go | 34 ++ internal/pushsync/register.go | 87 +++- internal/pushsync/register_test.go | 53 ++- internal/testutils/config.go | 6 + tests/README.md | 2 +- tests/coverage-test.sh | 4 +- 
tests/e2e-gen3-remote-addurl.sh | 22 +- tests/e2e-gen3-remote-full.sh | 12 +- .../docker_syfon_e2e_assertions_test.go | 46 +- .../docker_syfon_e2e_helpers_test.go | 5 +- .../docker_syfon/docker_syfon_e2e_test.go | 11 + tests/monorepos/e2e-monorepo-remote.sh | 32 +- tests/monorepos/run-test.sh | 4 +- 91 files changed, 5956 insertions(+), 1233 deletions(-) create mode 100644 cmd/copyrecords/main.go create mode 100644 cmd/copyrecords/main_test.go delete mode 100644 cmd/download/main.go delete mode 100644 cmd/fetch/fetch_test.go delete mode 100644 cmd/fetch/main.go delete mode 100644 cmd/list/main.go create mode 100644 cmd/ping/main.go create mode 100644 cmd/ping/main_test.go create mode 100644 cmd/prepush/io_helpers.go create mode 100644 cmd/prepush/pushed_refs.go create mode 100644 cmd/push/main_test.go create mode 100644 cmd/push/progress.go create mode 100644 cmd/push/progress_test.go create mode 100644 cmd/remote/remove.go create mode 100644 cmd/rm/main.go create mode 100644 cmd/rm/main_test.go delete mode 100644 cmd/upload/main.go rename docs/{ => TODO}/architecture-drs-endpoints-and-transfer-concurrency.md (98%) create mode 100644 docs/TODO/git-drs-rm-semantics.md create mode 100644 docs/drs-uri-canonical-identity.md create mode 100644 docs/ga4gh-drs-scalability-gaps.md create mode 100644 internal/drsdelete/git_history.go create mode 100644 internal/drsdelete/live_refs.go create mode 100644 internal/drsdelete/reconcile.go create mode 100644 internal/drsdelete/reconcile_test.go create mode 100644 internal/drsdelete/test_helpers_test.go delete mode 100644 internal/lfs/sentinel.go delete mode 100644 internal/lfs/sentinel_test.go create mode 100644 internal/pathspec/match.go create mode 100644 internal/pathspec/match_test.go create mode 100644 internal/pushsync/batch_sync_test.go create mode 100644 internal/pushsync/progress.go diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index de23db13..76e94efc 100644 --- a/.github/workflows/build.yaml 
+++ b/.github/workflows/build.yaml @@ -4,7 +4,6 @@ on: push: branches: [ main, master, develop ] pull_request: - branches: [ main, master, develop ] workflow_dispatch: concurrency: diff --git a/.github/workflows/pr-checks.yaml b/.github/workflows/pr-checks.yaml index 4069d45f..6557835b 100644 --- a/.github/workflows/pr-checks.yaml +++ b/.github/workflows/pr-checks.yaml @@ -2,7 +2,6 @@ name: PR Checks on: pull_request: - branches: [ main, master, develop ] concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} diff --git a/.github/workflows/syfon-backend-e2e.yaml b/.github/workflows/syfon-backend-e2e.yaml index 62bb839b..cfc4474d 100644 --- a/.github/workflows/syfon-backend-e2e.yaml +++ b/.github/workflows/syfon-backend-e2e.yaml @@ -2,7 +2,6 @@ name: Syfon Backend E2E on: pull_request: - branches: [ main, master, develop ] workflow_dispatch: concurrency: diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 82d0ec2c..38b2fc29 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -4,7 +4,6 @@ on: push: branches: [ main, master, develop ] pull_request: - branches: [ main, master, develop ] workflow_dispatch: concurrency: diff --git a/README.md b/README.md index 81ea2f6c..48dc57d9 100644 --- a/README.md +++ b/README.md @@ -30,23 +30,23 @@ At a high level: -1. initialize the repository with `git drs init` -2. configure a remote for one `organization/project` +1. configure a remote for one `organization/project` +2. let `remote add` bootstrap repo-local `git-drs` state if needed 3. track file patterns with `git drs track` 4. add/commit/push normally +5. remove tracked pointers with `git drs rm` when you want repository deletion to reconcile with remote DRS state 5. 
hydrate pointer files later with `git drs pull` ## Quick Start ```bash git drs install -git drs init git drs remote add gen3 production HTAN_INT/BForePC --cred /path/to/credentials.json git drs track "*.bam" git add .gitattributes git add sample.bam git commit -m "Add sample" -git push +git drs push git drs ls-files git drs pull -I "*.bam" ``` @@ -81,15 +81,17 @@ Push and pull depend on server-side bucket mapping for the requested scope. That | Command | Description | | --- | --- | | `git drs install` | Install global `git-drs` filter config | -| `git drs init` | Initialize repository-local `git-drs` state | +| `git drs init` | Explicitly initialize or repair repository-local `git-drs` state | | `git drs remote add gen3 [remote] ` | Add or refresh a Gen3/Syfon remote | | `git drs remote list` | List configured remotes | +| `git drs remote remove ` | Remove a configured DRS remote | | `git drs remote set ` | Set the default remote | | `git drs track ` | Track files or globs | | `git drs untrack ` | Stop tracking files or globs | +| `git drs rm ...` | Remove tracked DRS/LFS files from Git | | `git drs ls-files` | List tracked files and localization state | | `git drs pull` | Hydrate pointer files in the current checkout | -| `git drs push` | Register/upload objects and push metadata workflow | +| `git drs push` | Register/upload objects, reconcile committed deletes, and push refs | | `git drs add-url` | Add an existing provider object by URL or scoped key | | `git drs add-ref` | Add a local reference to an existing DRS object | | `git drs query` | Query a DRS object by ID | diff --git a/cmd/addurl/main_test.go b/cmd/addurl/main_test.go index 26a6456a..060dea16 100644 --- a/cmd/addurl/main_test.go +++ b/cmd/addurl/main_test.go @@ -19,7 +19,6 @@ import ( "github.com/calypr/git-drs/internal/config" "github.com/calypr/git-drs/internal/drsobject" "github.com/calypr/git-drs/internal/gitrepo" - "github.com/calypr/git-drs/internal/lfs" 
"github.com/calypr/git-drs/internal/precommit_cache" sycloud "github.com/calypr/syfon/client/cloud" ) @@ -100,9 +99,9 @@ func TestRunAddURL_WritesPointerAndLFSObject(t *testing.T) { t.Fatalf("service.Run error: %v", err) } - oid, err := lfs.SyntheticOIDFromETag("abcd1234") + oid, err := placeholderOIDForUnknownSHA("abcd1234", "s3://bucket/path/to/file.bin") if err != nil { - t.Fatalf("SyntheticOIDFromETag: %v", err) + t.Fatalf("placeholderOIDForUnknownSHA: %v", err) } pointerPath := filepath.Join(tempDir, "path/to/file.bin") @@ -120,15 +119,8 @@ func TestRunAddURL_WritesPointerAndLFSObject(t *testing.T) { } lfsObject := filepath.Join(lfsRoot, "objects", oid[0:2], oid[2:4], oid) - if _, err := os.Stat(lfsObject); err != nil { - t.Fatalf("expected LFS object at %s: %v", lfsObject, err) - } - sentinel, err := os.ReadFile(lfsObject) - if err != nil { - t.Fatalf("read sentinel: %v", err) - } - if !lfs.IsAddURLSentinelBytes(sentinel) { - t.Fatalf("expected add-url sentinel payload, got: %q", string(sentinel)) + if _, err := os.Stat(lfsObject); !os.IsNotExist(err) { + t.Fatalf("expected no local LFS object payload at %s, got err=%v", lfsObject, err) } drsObject, err := drsobject.ReadObject(common.DRS_OBJS_PATH, oid) @@ -143,6 +135,26 @@ func TestRunAddURL_WritesPointerAndLFSObject(t *testing.T) { } } +func TestPlaceholderOIDForUnknownSHA(t *testing.T) { + oid1, err := placeholderOIDForUnknownSHA("etag-abc", "s3://bucket/key") + if err != nil { + t.Fatalf("placeholderOIDForUnknownSHA: %v", err) + } + oid2, err := placeholderOIDForUnknownSHA(`"etag-abc"`, "s3://bucket/key") + if err != nil { + t.Fatalf("placeholderOIDForUnknownSHA quoted: %v", err) + } + if oid1 != oid2 { + t.Fatalf("expected trimmed etag handling to be stable: %s vs %s", oid1, oid2) + } + if len(oid1) != 64 { + t.Fatalf("expected 64-char oid, got %q", oid1) + } + if _, err := placeholderOIDForUnknownSHA("", "s3://bucket/key"); err == nil { + t.Fatal("expected empty etag error") + } +} + func 
TestParseAddURLInput_DoesNotRequireAWSFlags(t *testing.T) { cmd := NewCommand() in, err := parseAddURLInput(cmd, []string{"gs://bucket/path/to/file.bin"}) diff --git a/cmd/addurl/service.go b/cmd/addurl/service.go index 79ad6195..e50ca8c1 100644 --- a/cmd/addurl/service.go +++ b/cmd/addurl/service.go @@ -2,9 +2,10 @@ package addurl import ( "context" + "crypto/sha256" "fmt" "log/slog" - "os" + "strings" "github.com/calypr/git-drs/internal/common" "github.com/calypr/git-drs/internal/config" @@ -186,26 +187,29 @@ func writeAddURLDrsObject(builder drsobject.Builder, file addURLDrsFile, objectP return drsObj, nil } -// ensureLFSObject ensures the LFS object identified by objectInfo exists in the -// repository's LFS storage. If SHA256 is provided, it is trusted and returned. -// Otherwise we create a sentinel object and synthetic OID derived from ETag, -// deferring true checksum validation to first real data use. +// ensureLFSObject returns the LFS pointer OID to use for the add-url target. +// If SHA256 is provided, it is trusted and returned. Otherwise we derive a +// deterministic placeholder OID from provider identity without writing any +// local LFS object payload. 
func (s *AddURLService) ensureLFSObject(ctx context.Context, objectInfo *sycloud.ObjectInfo, input addURLInput, lfsRoot string) (string, error) { _ = ctx + _ = lfsRoot if input.sha256 != "" { return input.sha256, nil } - oid, err := lfs.SyntheticOIDFromETag(objectInfo.ETag) - if err != nil { - return "", err - } - objPath, err := lfs.WriteAddURLSentinelObject(lfsRoot, oid, objectInfo.ETag, input.objectURL) - if err != nil { - return "", err + return placeholderOIDForUnknownSHA(objectInfo.ETag, input.objectURL) +} + +func placeholderOIDForUnknownSHA(etag string, sourceURL string) (string, error) { + e := strings.TrimSpace(strings.Trim(etag, `"`)) + src := strings.TrimSpace(sourceURL) + if e == "" { + return "", fmt.Errorf("etag is required for placeholder oid") } - if _, err := fmt.Fprintf(os.Stderr, "Added add-url sentinel object at %s\n", objPath); err != nil { - return "", fmt.Errorf("stderr write: %w", err) + if src == "" { + return "", fmt.Errorf("source URL is required for placeholder oid") } - return oid, nil + sum := sha256.Sum256([]byte("git-drs-add-url-placeholder:v2\netag=" + e + "\nsource=" + src + "\n")) + return fmt.Sprintf("%x", sum[:]), nil } diff --git a/cmd/copyrecords/main.go b/cmd/copyrecords/main.go new file mode 100644 index 00000000..c494190c --- /dev/null +++ b/cmd/copyrecords/main.go @@ -0,0 +1,350 @@ +package copyrecords + +import ( + "context" + "encoding/json" + "fmt" + "log/slog" + "strings" + + "github.com/calypr/git-drs/internal/config" + "github.com/calypr/git-drs/internal/drslog" + drsapi "github.com/calypr/syfon/apigen/client/drs" + internalapi "github.com/calypr/syfon/apigen/client/internalapi" + syservices "github.com/calypr/syfon/client/services" + "github.com/spf13/cobra" +) + +var ( + batchSize int +) + +type copyStats struct { + SourceSeen int + Created int + Updated int + Unchanged int + Written int +} + +type indexAPI interface { + List(ctx context.Context, opts syservices.ListRecordsOptions) 
(internalapi.ListRecordsResponse, error) + BulkDocuments(ctx context.Context, dids []string) ([]internalapi.InternalRecordResponse, error) + CreateBulk(ctx context.Context, req internalapi.BulkCreateRequest) (internalapi.ListRecordsResponse, error) +} + +var Cmd = &cobra.Command{ + Use: "copy-records [source-remote] ", + Short: "Copy Syfon records between remotes for one organization/project scope", + Long: "Read all Syfon records for a source organization/project scope and bulk load them into a target Syfon instance, only merging controlled_access and access_methods for records that already exist on the target.", + Args: cobra.RangeArgs(2, 3), + RunE: func(cmd *cobra.Command, args []string) error { + logger := drslog.GetLogger() + cfg, err := config.LoadConfig() + if err != nil { + return fmt.Errorf("error loading config: %w", err) + } + + sourceRemote := "" + targetRemote := "" + scopeArg := "" + if len(args) == 2 { + targetRemote = args[0] + scopeArg = args[1] + } else { + sourceRemote = args[0] + targetRemote = args[1] + scopeArg = args[2] + } + + srcRemoteName, err := cfg.GetRemoteOrDefault(sourceRemote) + if err != nil { + return fmt.Errorf("error resolving source remote: %w", err) + } + if strings.TrimSpace(targetRemote) == "" { + return fmt.Errorf("target remote is required") + } + dstRemoteName := config.Remote(targetRemote) + if srcRemoteName == dstRemoteName { + return fmt.Errorf("source and target remotes must be different") + } + + srcCfg := cfg.GetRemote(srcRemoteName) + if srcCfg == nil { + return fmt.Errorf("source remote %q not found", srcRemoteName) + } + + org, proj, err := parseScopeArg(scopeArg) + if err != nil { + return err + } + + srcCtx, err := cfg.GetRemoteClient(srcRemoteName, logger) + if err != nil { + return fmt.Errorf("error creating source client: %w", err) + } + dstCtx, err := cfg.GetRemoteClient(dstRemoteName, logger) + if err != nil { + return fmt.Errorf("error creating target client: %w", err) + } + + stats, err := 
copyProjectRecords(cmd.Context(), logger, srcCtx.Client.Index(), dstCtx.Client.Index(), org, proj, batchSize) + if err != nil { + return err + } + + logger.Info("copy-records complete", + "source_remote", srcRemoteName, + "target_remote", dstRemoteName, + "organization", org, + "project", proj, + "source_seen", stats.SourceSeen, + "created", stats.Created, + "updated", stats.Updated, + "unchanged", stats.Unchanged, + "written", stats.Written, + ) + return nil + }, +} + +func init() { + Cmd.Flags().IntVar(&batchSize, "batch-size", 250, "records per source page and target bulk write") +} + +func parseScopeArg(raw string) (string, string, error) { + raw = strings.TrimSpace(raw) + if raw == "" { + return "", "", fmt.Errorf("scope is required and must be in organization/project form") + } + parts := strings.Split(raw, "/") + if len(parts) != 2 { + return "", "", fmt.Errorf("invalid scope %q: expected organization/project", raw) + } + org := strings.TrimSpace(parts[0]) + project := strings.TrimSpace(parts[1]) + if org == "" || project == "" { + return "", "", fmt.Errorf("invalid scope %q: expected organization/project", raw) + } + return org, project, nil +} + +func copyProjectRecords(ctx context.Context, logger *slog.Logger, src indexAPI, dst indexAPI, org, project string, batchSize int) (copyStats, error) { + if batchSize <= 0 { + batchSize = 250 + } + + stats := copyStats{} + page := 1 + for { + listResp, err := src.List(ctx, syservices.ListRecordsOptions{ + Organization: org, + ProjectID: project, + Limit: batchSize, + Page: page, + }) + if err != nil { + return stats, fmt.Errorf("source list failed for %s/%s page %d: %w", org, project, page, err) + } + records := []internalapi.InternalRecord{} + if listResp.Records != nil { + records = *listResp.Records + } + if len(records) == 0 { + break + } + stats.SourceSeen += len(records) + + toWrite, batchStats, err := buildMergedBatch(ctx, dst, records) + if err != nil { + return stats, err + } + stats.Created += 
batchStats.Created + stats.Updated += batchStats.Updated + stats.Unchanged += batchStats.Unchanged + + if len(toWrite) > 0 { + resp, err := dst.CreateBulk(ctx, internalapi.BulkCreateRequest{Records: toWrite}) + if err != nil { + return stats, fmt.Errorf("target bulk create failed on page %d: %w", page, err) + } + if resp.Records != nil { + stats.Written += len(*resp.Records) + } else { + stats.Written += len(toWrite) + } + } + + if logger != nil { + logger.Info("copy-records batch complete", + "organization", org, + "project", project, + "page", page, + "source_records", len(records), + "created", batchStats.Created, + "updated", batchStats.Updated, + "unchanged", batchStats.Unchanged, + "written", len(toWrite), + ) + } + + if len(records) < batchSize { + break + } + page++ + } + + return stats, nil +} + +func buildMergedBatch(ctx context.Context, dst indexAPI, source []internalapi.InternalRecord) ([]internalapi.InternalRecord, copyStats, error) { + stats := copyStats{} + if len(source) == 0 { + return nil, stats, nil + } + + dids := make([]string, 0, len(source)) + for _, rec := range source { + did := strings.TrimSpace(rec.Did) + if did == "" { + continue + } + dids = append(dids, did) + } + + existing, err := dst.BulkDocuments(ctx, dids) + if err != nil { + return nil, stats, fmt.Errorf("target bulk documents failed: %w", err) + } + existingByDID := make(map[string]internalapi.InternalRecord, len(existing)) + for _, rec := range existing { + existingByDID[strings.TrimSpace(rec.Did)] = recordResponseToRecord(rec) + } + + out := make([]internalapi.InternalRecord, 0, len(source)) + for _, src := range source { + did := strings.TrimSpace(src.Did) + if did == "" { + continue + } + if dstRec, ok := existingByDID[did]; ok { + merged, changed := mergeExistingRecord(dstRec, src) + if changed { + out = append(out, merged) + stats.Updated++ + } else { + stats.Unchanged++ + } + continue + } + out = append(out, src) + stats.Created++ + } + + return out, stats, nil +} + +func 
mergeExistingRecord(dst, src internalapi.InternalRecord) (internalapi.InternalRecord, bool) { + merged := dst + changed := false + + controlledAccess := mergeStringLists(dst.ControlledAccess, src.ControlledAccess) + if !equalStringPointers(merged.ControlledAccess, controlledAccess) { + merged.ControlledAccess = controlledAccess + changed = true + } + + accessMethods := mergeAccessMethods(dst.AccessMethods, src.AccessMethods) + if !equalAccessMethodPointers(merged.AccessMethods, accessMethods) { + merged.AccessMethods = accessMethods + changed = true + } + + return merged, changed +} + +func recordResponseToRecord(in internalapi.InternalRecordResponse) internalapi.InternalRecord { + return internalapi.InternalRecord{ + Did: in.Did, + AccessMethods: in.AccessMethods, + ControlledAccess: in.ControlledAccess, + CreatedTime: in.CreatedTime, + Description: in.Description, + FileName: in.FileName, + Hashes: in.Hashes, + Organization: in.Organization, + Project: in.Project, + Size: in.Size, + UpdatedTime: in.UpdatedTime, + Version: in.Version, + } +} + +func mergeStringLists(left, right *[]string) *[]string { + seen := map[string]struct{}{} + out := make([]string, 0) + for _, list := range []*[]string{left, right} { + if list == nil { + continue + } + for _, raw := range *list { + val := strings.TrimSpace(raw) + if val == "" { + continue + } + if _, ok := seen[val]; ok { + continue + } + seen[val] = struct{}{} + out = append(out, val) + } + } + if len(out) == 0 { + return nil + } + return &out +} + +func mergeAccessMethods(left, right *[]drsapi.AccessMethod) *[]drsapi.AccessMethod { + seen := map[string]struct{}{} + out := make([]drsapi.AccessMethod, 0) + for _, list := range []*[]drsapi.AccessMethod{left, right} { + if list == nil { + continue + } + for _, method := range *list { + key := canonicalAccessMethod(method) + if _, ok := seen[key]; ok { + continue + } + seen[key] = struct{}{} + out = append(out, method) + } + } + if len(out) == 0 { + return nil + } + return 
&out +} + +func canonicalAccessMethod(method drsapi.AccessMethod) string { + b, err := json.Marshal(method) + if err != nil { + return fmt.Sprintf("%s|%v", method.Type, method.AccessId) + } + return string(b) +} + +func equalStringPointers(a, b *[]string) bool { + return equalJSON(a, b) +} + +func equalAccessMethodPointers(a, b *[]drsapi.AccessMethod) bool { + return equalJSON(a, b) +} + +func equalJSON(a, b any) bool { + ab, _ := json.Marshal(a) + bb, _ := json.Marshal(b) + return string(ab) == string(bb) +} diff --git a/cmd/copyrecords/main_test.go b/cmd/copyrecords/main_test.go new file mode 100644 index 00000000..6e1e4528 --- /dev/null +++ b/cmd/copyrecords/main_test.go @@ -0,0 +1,139 @@ +package copyrecords + +import ( + "context" + "testing" + + drsapi "github.com/calypr/syfon/apigen/client/drs" + internalapi "github.com/calypr/syfon/apigen/client/internalapi" + syservices "github.com/calypr/syfon/client/services" +) + +type fakeIndexAPI struct { + listResp internalapi.ListRecordsResponse + bulkDocsResp []internalapi.InternalRecordResponse + createBulkReq []internalapi.BulkCreateRequest +} + +func (f *fakeIndexAPI) List(ctx context.Context, opts syservices.ListRecordsOptions) (internalapi.ListRecordsResponse, error) { + return f.listResp, nil +} + +func (f *fakeIndexAPI) BulkDocuments(ctx context.Context, dids []string) ([]internalapi.InternalRecordResponse, error) { + return f.bulkDocsResp, nil +} + +func (f *fakeIndexAPI) CreateBulk(ctx context.Context, req internalapi.BulkCreateRequest) (internalapi.ListRecordsResponse, error) { + f.createBulkReq = append(f.createBulkReq, req) + return internalapi.ListRecordsResponse{Records: &req.Records}, nil +} + +func TestMergeExistingRecord_UnionsControlledAccessAndAccessMethodsOnly(t *testing.T) { + dstName := "target.bin" + srcName := "source.bin" + desc := "keep target description" + leftCA := []string{"/organization/A/project/P1"} + rightCA := []string{"/organization/A/project/P1", "/organization/A/project/P2"} + 
leftMethods := []drsapi.AccessMethod{{ + Type: drsapi.AccessMethodTypeS3, + AccessUrl: &struct { + Headers *[]string `json:"headers,omitempty"` + Url string `json:"url"` + }{Url: "s3://bucket/one"}, + }} + rightMethods := []drsapi.AccessMethod{ + leftMethods[0], + { + Type: drsapi.AccessMethodTypeHttps, + AccessUrl: &struct { + Headers *[]string `json:"headers,omitempty"` + Url string `json:"url"` + }{Url: "https://example.org/two"}, + }, + } + + merged, changed := mergeExistingRecord( + internalapi.InternalRecord{ + Did: "did-1", + FileName: &dstName, + Description: &desc, + ControlledAccess: &leftCA, + AccessMethods: &leftMethods, + }, + internalapi.InternalRecord{ + Did: "did-1", + FileName: &srcName, + ControlledAccess: &rightCA, + AccessMethods: &rightMethods, + }, + ) + + if !changed { + t.Fatalf("expected merge to report a change") + } + if merged.FileName == nil || *merged.FileName != dstName { + t.Fatalf("expected target metadata to be preserved, got %+v", merged.FileName) + } + if merged.Description == nil || *merged.Description != desc { + t.Fatalf("expected target description to be preserved") + } + if merged.ControlledAccess == nil || len(*merged.ControlledAccess) != 2 { + t.Fatalf("expected merged controlled access union, got %+v", merged.ControlledAccess) + } + if merged.AccessMethods == nil || len(*merged.AccessMethods) != 2 { + t.Fatalf("expected merged access method union, got %+v", merged.AccessMethods) + } +} + +func TestBuildMergedBatch_CreatesNewAndUpdatesExisting(t *testing.T) { + srcCA := []string{"/organization/A/project/P1"} + newCA := []string{"/organization/A/project/P2"} + srcMethods := []drsapi.AccessMethod{{ + Type: drsapi.AccessMethodTypeS3, + AccessUrl: &struct { + Headers *[]string `json:"headers,omitempty"` + Url string `json:"url"` + }{Url: "s3://bucket/a"}, + }} + newMethods := []drsapi.AccessMethod{{ + Type: drsapi.AccessMethodTypeHttps, + AccessUrl: &struct { + Headers *[]string `json:"headers,omitempty"` + Url string 
`json:"url"` + }{Url: "https://example.org/b"}, + }} + + target := &fakeIndexAPI{ + bulkDocsResp: []internalapi.InternalRecordResponse{ + { + Did: "did-existing", + ControlledAccess: &srcCA, + AccessMethods: &srcMethods, + }, + }, + } + + source := []internalapi.InternalRecord{ + { + Did: "did-existing", + ControlledAccess: &newCA, + AccessMethods: &newMethods, + }, + { + Did: "did-new", + ControlledAccess: &srcCA, + AccessMethods: &srcMethods, + }, + } + + out, stats, err := buildMergedBatch(context.Background(), target, source) + if err != nil { + t.Fatalf("buildMergedBatch error: %v", err) + } + if len(out) != 2 { + t.Fatalf("expected 2 output records, got %d", len(out)) + } + if stats.Created != 1 || stats.Updated != 1 || stats.Unchanged != 0 { + t.Fatalf("unexpected stats: %+v", stats) + } +} diff --git a/cmd/download/main.go b/cmd/download/main.go deleted file mode 100644 index 599cd2e0..00000000 --- a/cmd/download/main.go +++ /dev/null @@ -1,100 +0,0 @@ -package download - -import ( - "context" - "fmt" - "path/filepath" - "strings" - - "github.com/calypr/git-drs/internal/common" - "github.com/calypr/git-drs/internal/config" - "github.com/calypr/git-drs/internal/drslog" - "github.com/calypr/git-drs/internal/drsremote" - drsapi "github.com/calypr/syfon/apigen/client/drs" - sydownload "github.com/calypr/syfon/client/transfer/download" - "github.com/spf13/cobra" -) - -var remote string -var outdir string - -// Cmd line declaration -var Cmd = &cobra.Command{ - Use: "download ", - Short: "Download a file from a DRS server", - Long: "Download a file from a DRS server, without creating an LFS pointer", - Args: cobra.MinimumNArgs(1), - RunE: func(cmd *cobra.Command, args []string) error { - - logger := drslog.GetLogger() - - config, err := config.LoadConfig() - if err != nil { - return err - } - - remoteName, err := config.GetRemoteOrDefault(remote) - if err != nil { - logger.Error(fmt.Sprintf("Error getting remote: %v", err)) - return err - } - - client, err := 
config.GetRemoteClient(remoteName, logger) - if err != nil { - return err - } - for _, src := range args { - obj, err := client.Client.DRS().GetObject(context.Background(), src) - if err != nil { - logger.Error(fmt.Sprintf("Error downloading object %s: %v", src, err)) - } else { - common.PrintDRSObject(obj, false) - dstName := src - if obj.Name != nil && *obj.Name != "" { - dstName = filepath.Base(*obj.Name) - } - dstPath := filepath.Join(outdir, dstName) - logger.Info(fmt.Sprintf("Downloading object %s to path %s", src, dstPath)) - accessURL, err := resolveAccessURL(cmd.Context(), client, obj) - if err != nil { - logger.Error(fmt.Sprintf("Error resolving access URL for object %s: %v", src, err)) - continue - } - if err := drsremote.DownloadResolvedToPath(cmd.Context(), client, obj.Id, dstPath, &obj, accessURL, sydownload.DownloadOptions{ - MultipartThreshold: 5 * 1024 * 1024, - Concurrency: 2, - ChunkSize: 64 * 1024 * 1024, - }); err != nil { - logger.Error(fmt.Sprintf("Error downloading object %s to path %s: %v", src, dstPath, err)) - } else { - logger.Info(fmt.Sprintf("Successfully downloaded object %s to path %s", src, dstPath)) - } - } - } - - return nil - }, -} - -func init() { - Cmd.Flags().StringVarP(&remote, "remote", "r", "", "target remote DRS server (default: default_remote)") - Cmd.Flags().StringVarP(&outdir, "outdir", "o", ".", "output directory for downloaded files") -} - -func resolveAccessURL(ctx context.Context, remote *config.GitContext, obj drsapi.DrsObject) (*drsapi.AccessURL, error) { - if remote == nil || remote.Client == nil { - return nil, fmt.Errorf("DRS client unavailable") - } - if obj.AccessMethods == nil || len(*obj.AccessMethods) == 0 { - return nil, fmt.Errorf("no access methods available for DRS object %s", obj.Id) - } - accessType := strings.TrimSpace(string((*obj.AccessMethods)[0].Type)) - if accessType == "" { - return nil, fmt.Errorf("no access type found in access method for DRS object %s", obj.Id) - } - accessURL, err := 
remote.Client.DRS().GetAccessURL(ctx, obj.Id, accessType) - if err != nil { - return nil, err - } - return &accessURL, nil -} diff --git a/cmd/fetch/fetch_test.go b/cmd/fetch/fetch_test.go deleted file mode 100644 index 37766718..00000000 --- a/cmd/fetch/fetch_test.go +++ /dev/null @@ -1,37 +0,0 @@ -package fetch - -import ( - "testing" - - "github.com/calypr/git-drs/internal/testutils" - "github.com/stretchr/testify/assert" -) - -func TestFetchCmdArgs(t *testing.T) { - // Test with no arguments (valid) - err := Cmd.Args(Cmd, []string{}) - assert.NoError(t, err) - - // Test with 1 argument (valid) - err = Cmd.Args(Cmd, []string{"origin"}) - assert.NoError(t, err) - - // Test with multiple arguments (invalid) - err = Cmd.Args(Cmd, []string{"origin", "extra"}) - assert.Error(t, err) -} - -func TestFetchRun_Error(t *testing.T) { - _ = testutils.SetupTestGitRepo(t) - // No config, should error - err := Cmd.RunE(Cmd, []string{}) - assert.Error(t, err) -} - -func TestFetchRun_InvalidRemote(t *testing.T) { - tmpDir := testutils.SetupTestGitRepo(t) - testutils.CreateDefaultTestConfig(t, tmpDir) - // Fetch from non-existent remote - err := Cmd.RunE(Cmd, []string{"no-remote"}) - assert.Error(t, err) -} diff --git a/cmd/fetch/main.go b/cmd/fetch/main.go deleted file mode 100644 index 0acf089a..00000000 --- a/cmd/fetch/main.go +++ /dev/null @@ -1,66 +0,0 @@ -package fetch - -import ( - "fmt" - "os/exec" - "strings" - - "github.com/calypr/git-drs/internal/config" - "github.com/calypr/git-drs/internal/drslog" - "github.com/spf13/cobra" -) - -var runCommand = func(name string, args ...string) ([]byte, error) { - cmd := exec.Command(name, args...) 
- return cmd.CombinedOutput() -} - -// Cmd line declaration -var Cmd = &cobra.Command{ - Use: "fetch [remote-name]", - Short: "Fetch LFS objects from remote via standard git-lfs", - Args: func(cmd *cobra.Command, args []string) error { - if len(args) > 1 { - cmd.SilenceUsage = false - return fmt.Errorf("error: accepts at most 1 argument (remote name), received %d\n\nUsage: %s\n\nSee 'git drs fetch --help' for more details", len(args), cmd.UseLine()) - } - return nil - }, - RunE: func(cmd *cobra.Command, args []string) error { - logger := drslog.GetLogger() - - cfg, err := config.LoadConfig() - if err != nil { - return fmt.Errorf("error loading config: %v", err) - } - - var remote config.Remote - if len(args) > 0 { - remote = config.Remote(args[0]) - } else { - remote, err = cfg.GetDefaultRemote() - if err != nil { - logger.Error(fmt.Sprintf("Error getting remote: %v", err)) - return err - } - } - - drsClient, err := cfg.GetRemoteClient(remote, logger) - if err != nil { - logger.Error(fmt.Sprintf("\nerror creating DRS client: %s", err)) - return err - } - _ = drsClient // Remote validation only. - - out, err := runCommand("git", "lfs", "pull", string(remote)) - if err != nil { - msg := strings.TrimSpace(string(out)) - if msg == "" { - msg = err.Error() - } - return fmt.Errorf("git lfs pull failed for remote %q: %s", remote, msg) - } - - return nil - }, -} diff --git a/cmd/initialize/main.go b/cmd/initialize/main.go index 95fcdf65..26a0738b 100644 --- a/cmd/initialize/main.go +++ b/cmd/initialize/main.go @@ -39,57 +39,123 @@ var Cmd = &cobra.Command{ }, RunE: func(cmd *cobra.Command, args []string) error { logg := drslog.GetLogger() - - // check if .git dir exists to ensure you're in a git repository - _, err := gitrepo.GitTopLevel() - if err != nil { - return fmt.Errorf("error: not in a git repository. 
Please run this command in the root of your git repository") + if err := InitializeRepo(logg); err != nil { + return err } + logg.Debug(fmt.Sprintf("Using %d concurrent transfers", transfers)) + return nil + }, +} - // create config file if it doesn't exist - err = config.CreateEmptyConfig() - if err != nil { - return fmt.Errorf("error: unable to create config file: %v", err) - } +// InitializeRepo applies git-drs repository-local setup to the current git repository. +// It is safe to call repeatedly. +func InitializeRepo(logg *slog.Logger) error { + // check if .git dir exists to ensure you're in a git repository + _, err := gitrepo.GitTopLevel() + if err != nil { + return fmt.Errorf("error: not in a git repository. Please run this command in the root of your git repository") + } - // load the config - _, err = config.LoadConfig() - if err != nil { - logg.Debug(fmt.Sprintf("We should probably fix this: %v", err)) - return fmt.Errorf("error: unable to load config file: %v", err) - } + // create config file if it doesn't exist + err = config.CreateEmptyConfig() + if err != nil { + return fmt.Errorf("error: unable to create config file: %v", err) + } - // create drs directories - drsDir := common.DRS_DIR - drsLfsObjsDir := common.DRS_OBJS_PATH - if err := os.MkdirAll(drsDir, 0755); err != nil { - return fmt.Errorf("error: unable to create drs directory: %v", err) - } - if err := os.MkdirAll(drsLfsObjsDir, 0755); err != nil { - return fmt.Errorf("error: unable to create drs lfs objects directory: %v", err) - } + // load the config + _, err = config.LoadConfig() + if err != nil { + logg.Debug(fmt.Sprintf("We should probably fix this: %v", err)) + return fmt.Errorf("error: unable to load config file: %v", err) + } - err = initGitConfig() - if err != nil { - return fmt.Errorf("error initializing git-drs repository config: %v", err) - } + // create drs directories + drsDir := common.DRS_DIR + drsLfsObjsDir := common.DRS_OBJS_PATH + if err := os.MkdirAll(drsDir, 0755); err 
!= nil { + return fmt.Errorf("error: unable to create drs directory: %v", err) + } + if err := os.MkdirAll(drsLfsObjsDir, 0755); err != nil { + return fmt.Errorf("error: unable to create drs lfs objects directory: %v", err) + } - // install pre-push hook - err = installPrePushHook(logg) - if err != nil { - return fmt.Errorf("error installing pre-push hook: %v", err) - } - // install pre-commit hook - err = installPreCommitHook(logg) - if err != nil { - return fmt.Errorf("error installing pre-commit hook: %v", err) - } + err = initGitConfig() + if err != nil { + return fmt.Errorf("error initializing git-drs repository config: %v", err) + } - // final logs - logg.Debug("Git DRS initialized") - logg.Debug(fmt.Sprintf("Using %d concurrent transfers", transfers)) + // install pre-push hook + err = installPrePushHook(logg) + if err != nil { + return fmt.Errorf("error installing pre-push hook: %v", err) + } + // install pre-commit hook + err = installPreCommitHook(logg) + if err != nil { + return fmt.Errorf("error installing pre-commit hook: %v", err) + } + + logg.Debug("Git DRS initialized") + return nil +} + +// EnsureInitialized applies initialization only when the repository does not +// already appear to have git-drs local setup installed. +func EnsureInitialized(logg *slog.Logger) error { + initialized, err := isInitialized() + if err != nil { + return err + } + if initialized { return nil - }, + } + return InitializeRepo(logg) +} + +func isInitialized() (bool, error) { + if _, err := gitrepo.GitTopLevel(); err != nil { + return false, fmt.Errorf("error: not in a git repository. 
Please run this command in the root of your git repository") + } + + if _, err := os.Stat(common.DRS_DIR); err != nil { + if os.IsNotExist(err) { + return false, nil + } + return false, fmt.Errorf("error checking git-drs directory: %v", err) + } + + if val, err := gitrepo.GetGitConfigString("filter.drs.process"); err != nil || strings.TrimSpace(val) != "git-drs filter" { + return false, err + } + + preCommitInstalled, err := hookContains("pre-commit", "git drs precommit") + if err != nil { + return false, err + } + if !preCommitInstalled { + return false, nil + } + + prePushInstalled, err := hookContains("pre-push", "git drs pre-push-prepare") + if err != nil { + return false, err + } + return prePushInstalled, nil +} + +func hookContains(name, marker string) (bool, error) { + hooksDir, err := gitrepo.GetGitHooksDir() + if err != nil { + return false, fmt.Errorf("unable to get hooks directory: %w", err) + } + content, err := os.ReadFile(filepath.Join(hooksDir, name)) + if err != nil { + if os.IsNotExist(err) { + return false, nil + } + return false, err + } + return strings.Contains(string(content), marker), nil } func initGitConfig() error { diff --git a/cmd/initialize/main_test.go b/cmd/initialize/main_test.go index 1126a2dd..04b02369 100644 --- a/cmd/initialize/main_test.go +++ b/cmd/initialize/main_test.go @@ -6,6 +6,7 @@ import ( "strings" "testing" + "github.com/calypr/git-drs/internal/common" "github.com/calypr/git-drs/internal/drslog" "github.com/calypr/git-drs/internal/gitrepo" "github.com/calypr/git-drs/internal/testutils" @@ -106,3 +107,26 @@ func TestInitConfigValues(t *testing.T) { check("lfs.concurrenttransfers", "8") check("lfs.allowincompletepush", "false") } + +func TestEnsureInitialized(t *testing.T) { + testutils.SetupTestGitRepo(t) + logger := drslog.NewNoOpLogger() + + if err := EnsureInitialized(logger); err != nil { + t.Fatalf("EnsureInitialized error: %v", err) + } + if err := EnsureInitialized(logger); err != nil { + 
t.Fatalf("EnsureInitialized second call error: %v", err) + } + + if _, err := os.Stat(common.DRS_DIR); err != nil { + t.Fatalf("expected %s to exist: %v", common.DRS_DIR, err) + } + filterProcess, err := gitrepo.GetGitConfigString("filter.drs.process") + if err != nil { + t.Fatalf("GetGitConfigString(filter.drs.process): %v", err) + } + if filterProcess != "git-drs filter" { + t.Fatalf("unexpected filter.drs.process: %q", filterProcess) + } +} diff --git a/cmd/list/main.go b/cmd/list/main.go deleted file mode 100644 index dfcf7a7c..00000000 --- a/cmd/list/main.go +++ /dev/null @@ -1,59 +0,0 @@ -package list - -import ( - "context" - "fmt" - - "github.com/calypr/git-drs/internal/common" - "github.com/calypr/git-drs/internal/config" - "github.com/calypr/git-drs/internal/drslog" - "github.com/spf13/cobra" -) - -var remote string -var pretty = false - -// Cmd line declaration -var Cmd = &cobra.Command{ - Use: "list", - Short: "List DRS objects in a DRS server", - Long: "List DRS objects in a DRS server", - RunE: func(cmd *cobra.Command, args []string) error { - - logger := drslog.GetLogger() - - config, err := config.LoadConfig() - if err != nil { - return err - } - - remoteName, err := config.GetRemoteOrDefault(remote) - if err != nil { - logger.Error(fmt.Sprintf("Error getting remote: %v", err)) - return err - } - - client, err := config.GetRemoteClient(remoteName, logger) - if err != nil { - return err - } - - objs, err := client.Client.DRS().ListObjects(context.Background(), 1000, 1) - if err != nil { - return err - } - - for _, drsObj := range objs.DrsObjects { - if err := common.PrintDRSObject(drsObj, pretty); err != nil { - return err - } - } - - return nil - }, -} - -func init() { - Cmd.Flags().StringVarP(&remote, "remote", "r", "", "target remote DRS server (default: default_remote)") - Cmd.Flags().BoolVarP(&pretty, "pretty", "p", false, "pretty print JSON output") -} diff --git a/cmd/lsfiles/main.go b/cmd/lsfiles/main.go index 96c3dfd1..5da58516 100644 --- 
a/cmd/lsfiles/main.go +++ b/cmd/lsfiles/main.go @@ -1,8 +1,10 @@ package lsfiles import ( + "encoding/json" "fmt" "log/slog" + "os" "sort" "strings" @@ -10,11 +12,18 @@ import ( "github.com/calypr/git-drs/internal/drslog" "github.com/calypr/git-drs/internal/drsremote" "github.com/calypr/git-drs/internal/lfs" + "github.com/calypr/git-drs/internal/pathspec" + drsapi "github.com/calypr/syfon/apigen/client/drs" "github.com/spf13/cobra" ) var gitRemote string var drsRemote string +var includePatterns []string +var showLong bool +var nameOnly bool +var jsonOutput bool +var drsStatus bool var ( loadConfig = config.LoadConfig @@ -22,34 +31,46 @@ var ( newRemoteClient = func(cfg *config.Config, remote config.Remote, logger *slog.Logger) (*config.GitContext, error) { return cfg.GetRemoteClient(remote, logger) } - loadLFSInventory = lfs.GetAllLfsFiles - lookupScopedObjects = drsremote.ObjectsByHashForScope + loadLFSInventory = func(gitRemoteName, gitRemoteLocation string, branches []string, logger *slog.Logger) (map[string]lfs.LfsFileInfo, error) { + if len(branches) == 0 { + return lfs.GetTrackedLfsFiles(logger) + } + return lfs.GetAllLfsFiles(gitRemoteName, gitRemoteLocation, branches, logger) + } + lookupScopedObjectsBatch = drsremote.ObjectsByHashesForScope ) type fileRow struct { - OID string - Status string - Path string - Detail string + OID string `json:"oid"` + ShortOID string `json:"short_oid"` + Status string `json:"status"` + Path string `json:"path"` + Localized bool `json:"localized"` + Registered bool `json:"registered,omitempty"` + DRSIDs []string `json:"drs_ids,omitempty"` + Detail string `json:"detail,omitempty"` } -func collectRows(cmd *cobra.Command, gitRemoteName, drsRemoteName string) ([]fileRow, error) { +func collectRows(cmd *cobra.Command, gitRemoteName, drsRemoteName string, patterns []string, resolveDRS bool) ([]fileRow, error) { logger := drslog.GetLogger() - cfg, err := loadConfig() - if err != nil { - return nil, err - } + var client 
*config.GitContext + if resolveDRS { + cfg, err := loadConfig() + if err != nil { + return nil, err + } - remoteName, err := resolveRemote(cfg, drsRemoteName) - if err != nil { - logger.Error(fmt.Sprintf("Error getting remote: %v", err)) - return nil, err - } + remoteName, err := resolveRemote(cfg, drsRemoteName) + if err != nil { + logger.Error(fmt.Sprintf("Error getting remote: %v", err)) + return nil, err + } - client, err := newRemoteClient(cfg, remoteName, logger) - if err != nil { - return nil, err + client, err = newRemoteClient(cfg, remoteName, logger) + if err != nil { + return nil, err + } } lfsFiles, err := loadLFSInventory(gitRemoteName, drsRemoteName, []string{}, logger) @@ -64,28 +85,60 @@ func collectRows(cmd *cobra.Command, gitRemoteName, drsRemoteName string) ([]fil sort.Strings(keys) rows := make([]fileRow, 0, len(keys)) + var drsResults map[string][]drsapi.DrsObject + var drsLookupErr error + if resolveDRS { + oids := make([]string, 0, len(keys)) + seenOIDs := make(map[string]struct{}, len(keys)) + for _, path := range keys { + if !pathspec.MatchesAny(path, patterns) { + continue + } + oid := lfsFiles[path].Oid + if oid == "" { + continue + } + if _, exists := seenOIDs[oid]; exists { + continue + } + seenOIDs[oid] = struct{}{} + oids = append(oids, oid) + } + drsResults, drsLookupErr = lookupScopedObjectsBatch(cmd.Context(), client, oids) + } for _, path := range keys { + if !pathspec.MatchesAny(path, patterns) { + continue + } info := lfsFiles[path] row := fileRow{ - OID: info.Oid, - Path: path, + OID: info.Oid, + ShortOID: shortOID(info.Oid), + Path: path, + Localized: isLocalized(path), + } + row.Status = "-" + if row.Localized { + row.Status = "*" } - results, err := lookupScopedObjects(cmd.Context(), client, info.Oid) - switch { - case err != nil: - row.Status = "error" - row.Detail = err.Error() - case len(results) == 0: - row.Status = "missing" - row.Detail = "-" - default: - row.Status = "present" - ids := make([]string, 0, len(results)) 
- for _, res := range results { - ids = append(ids, "drs://"+res.Id) + if resolveDRS { + switch { + case drsLookupErr != nil: + row.Detail = drsLookupErr.Error() + default: + results := drsResults[info.Oid] + if len(results) == 0 { + row.Registered = false + break + } + row.Registered = true + row.DRSIDs = make([]string, 0, len(results)) + for _, res := range results { + row.DRSIDs = append(row.DRSIDs, "drs://"+res.Id) + } + row.Detail = strings.Join(row.DRSIDs, ",") } - row.Detail = strings.Join(ids, ",") } rows = append(rows, row) @@ -95,23 +148,80 @@ func collectRows(cmd *cobra.Command, gitRemoteName, drsRemoteName string) ([]fil } func printRows(cmd *cobra.Command, rows []fileRow) error { - if _, err := fmt.Fprintf(cmd.OutOrStdout(), "OID\tSTATUS\tPATH\tDETAIL\n"); err != nil { - return err + if jsonOutput { + enc := json.NewEncoder(cmd.OutOrStdout()) + enc.SetIndent("", " ") + return enc.Encode(rows) } for _, row := range rows { - if _, err := fmt.Fprintf(cmd.OutOrStdout(), "%s\t%s\t%s\t%s\n", row.OID, row.Status, row.Path, row.Detail); err != nil { - return err + switch { + case nameOnly: + if _, err := fmt.Fprintln(cmd.OutOrStdout(), row.Path); err != nil { + return err + } + case drsStatus: + oid := row.ShortOID + if showLong { + oid = row.OID + } + detail := row.Detail + if detail == "" { + detail = "-" + } + if _, err := fmt.Fprintf(cmd.OutOrStdout(), "%s %s %s\t%s\n", oid, row.Status, row.Path, detail); err != nil { + return err + } + default: + oid := row.ShortOID + if showLong { + oid = row.OID + } + if _, err := fmt.Fprintf(cmd.OutOrStdout(), "%s %s %s\n", oid, row.Status, row.Path); err != nil { + return err + } } } return nil } +func shortOID(oid string) string { + if len(oid) <= 10 { + return oid + } + return oid[:10] +} + +func isLocalized(path string) bool { + payload, err := os.ReadFile(path) + if err != nil { + return false + } + _, _, ok := lfs.ParseLFSPointer(payload) + return !ok +} + +func validateOutputFlags() error { + if nameOnly && 
jsonOutput { + return fmt.Errorf("--name-only and --json are mutually exclusive") + } + if showLong && nameOnly { + return fmt.Errorf("--long and --name-only are mutually exclusive") + } + return nil +} + // Cmd line declaration var Cmd = &cobra.Command{ - Use: "ls-files", - Short: "List local LFS-tracked files and their DRS registration status", + Use: "ls-files [pathspec...]", + Short: "List tracked DRS/LFS pointer files in the repository", + Long: "List tracked DRS/Git-LFS pointer files in the repository. By default this behaves like a local file inventory. Use --drs to also resolve DRS registration status.", RunE: func(cmd *cobra.Command, args []string) error { - rows, err := collectRows(cmd, gitRemote, drsRemote) + if err := validateOutputFlags(); err != nil { + return err + } + patterns := append([]string{}, includePatterns...) + patterns = append(patterns, args...) + rows, err := collectRows(cmd, gitRemote, drsRemote, patterns, drsStatus) if err != nil { return err } @@ -122,4 +232,9 @@ var Cmd = &cobra.Command{ func init() { Cmd.Flags().StringVarP(&gitRemote, "git-remote", "r", "", "target remote Git server (default: origin)") Cmd.Flags().StringVarP(&drsRemote, "drs-remote", "d", "", "target remote DRS server (default: origin)") + Cmd.Flags().StringArrayVarP(&includePatterns, "include", "I", nil, "include pathspec/glob pattern(s)") + Cmd.Flags().BoolVarP(&showLong, "long", "l", false, "show full object IDs") + Cmd.Flags().BoolVarP(&nameOnly, "name-only", "n", false, "show only file paths") + Cmd.Flags().BoolVar(&jsonOutput, "json", false, "emit JSON output") + Cmd.Flags().BoolVar(&drsStatus, "drs", false, "include DRS registration lookup details") } diff --git a/cmd/lsfiles/main_test.go b/cmd/lsfiles/main_test.go index 492b0b4f..1daab41b 100644 --- a/cmd/lsfiles/main_test.go +++ b/cmd/lsfiles/main_test.go @@ -5,6 +5,8 @@ import ( "context" "errors" "log/slog" + "os" + "path/filepath" "strings" "testing" @@ -14,18 +16,108 @@ import ( "github.com/spf13/cobra" 
) -func TestCollectRowsAndPrintRows(t *testing.T) { +func resetFlagsForTest() { + gitRemote = "" + drsRemote = "" + includePatterns = nil + showLong = false + nameOnly = false + jsonOutput = false + drsStatus = false +} + +func TestCollectRowsLocalDefault(t *testing.T) { + resetFlagsForTest() + + oldLoadLFSInventory := loadLFSInventory + oldLookupScopedObjectsBatch := lookupScopedObjectsBatch + t.Cleanup(func() { + loadLFSInventory = oldLoadLFSInventory + lookupScopedObjectsBatch = oldLookupScopedObjectsBatch + }) + + tmpDir := t.TempDir() + oldWD, err := os.Getwd() + if err != nil { + t.Fatalf("getwd: %v", err) + } + if err := os.Chdir(tmpDir); err != nil { + t.Fatalf("chdir tempdir: %v", err) + } + t.Cleanup(func() { + _ = os.Chdir(oldWD) + }) + + localizedPath := filepath.Join("a", "localized.bin") + pointerPath := filepath.Join("b", "pointer.bin") + if err := os.MkdirAll(filepath.Dir(localizedPath), 0o755); err != nil { + t.Fatalf("mkdir localized dir: %v", err) + } + if err := os.MkdirAll(filepath.Dir(pointerPath), 0o755); err != nil { + t.Fatalf("mkdir pointer dir: %v", err) + } + if err := os.WriteFile(localizedPath, []byte("hydrated-bytes"), 0o644); err != nil { + t.Fatalf("write localized file: %v", err) + } + pointerContent := "version https://git-lfs.github.com/spec/v1\noid sha256:" + strings.Repeat("b", 64) + "\nsize 12\n" + if err := os.WriteFile(pointerPath, []byte(pointerContent), 0o644); err != nil { + t.Fatalf("write pointer file: %v", err) + } + + loadLFSInventory = func(gitRemoteName, gitRemoteLocation string, branches []string, logger *slog.Logger) (map[string]lfs.LfsFileInfo, error) { + return map[string]lfs.LfsFileInfo{ + localizedPath: {Name: localizedPath, Oid: strings.Repeat("a", 64)}, + pointerPath: {Name: pointerPath, Oid: strings.Repeat("b", 64)}, + }, nil + } + lookupScopedObjectsBatch = func(ctx context.Context, drsCtx *config.GitContext, checksums []string) (map[string][]drsapi.DrsObject, error) { + t.Fatalf("unexpected remote lookup 
for checksums %v", checksums) + return nil, nil + } + + cmd := &cobra.Command{} + rows, err := collectRows(cmd, "", "", nil, false) + if err != nil { + t.Fatalf("collectRows returned error: %v", err) + } + if len(rows) != 2 { + t.Fatalf("expected 2 rows, got %d", len(rows)) + } + if rows[0].Path != localizedPath || rows[0].Status != "*" || !rows[0].Localized { + t.Fatalf("unexpected localized row: %+v", rows[0]) + } + if rows[1].Path != pointerPath || rows[1].Status != "-" || rows[1].Localized { + t.Fatalf("unexpected pointer row: %+v", rows[1]) + } + + var out bytes.Buffer + cmd.SetOut(&out) + if err := printRows(cmd, rows); err != nil { + t.Fatalf("printRows returned error: %v", err) + } + got := out.String() + if !strings.Contains(got, rows[0].ShortOID+" * "+localizedPath+"\n") { + t.Fatalf("missing localized row: %q", got) + } + if !strings.Contains(got, rows[1].ShortOID+" - "+pointerPath+"\n") { + t.Fatalf("missing pointer row: %q", got) + } +} + +func TestCollectRowsWithDRSLookupAndFilters(t *testing.T) { + resetFlagsForTest() + oldLoadConfig := loadConfig oldResolveRemote := resolveRemote oldNewRemoteClient := newRemoteClient oldLoadLFSInventory := loadLFSInventory - oldLookupScopedObjects := lookupScopedObjects + oldLookupScopedObjectsBatch := lookupScopedObjectsBatch t.Cleanup(func() { loadConfig = oldLoadConfig resolveRemote = oldResolveRemote newRemoteClient = oldNewRemoteClient loadLFSInventory = oldLoadLFSInventory - lookupScopedObjects = oldLookupScopedObjects + lookupScopedObjectsBatch = oldLookupScopedObjectsBatch }) loadConfig = func() (*config.Config, error) { @@ -37,58 +129,119 @@ func TestCollectRowsAndPrintRows(t *testing.T) { newRemoteClient = func(cfg *config.Config, remote config.Remote, logger *slog.Logger) (*config.GitContext, error) { return &config.GitContext{}, nil } + loadLFSInventory = func(gitRemoteName, gitRemoteLocation string, branches []string, logger *slog.Logger) (map[string]lfs.LfsFileInfo, error) { return 
map[string]lfs.LfsFileInfo{ - "b/file2.bin": {Name: "b/file2.bin", Oid: strings.Repeat("b", 64)}, - "a/file1.bin": {Name: "a/file1.bin", Oid: strings.Repeat("a", 64)}, - "c/file3.bin": {Name: "c/file3.bin", Oid: strings.Repeat("c", 64)}, + "a/file1.bin": {Name: "a/file1.bin", Oid: strings.Repeat("a", 64)}, + "data/file2.bam": {Name: "data/file2.bam", Oid: strings.Repeat("b", 64)}, + "data/file3.txt": {Name: "data/file3.txt", Oid: strings.Repeat("c", 64)}, }, nil } - lookupScopedObjects = func(ctx context.Context, drsCtx *config.GitContext, checksum string) ([]drsapi.DrsObject, error) { - switch checksum { - case strings.Repeat("a", 64): - return []drsapi.DrsObject{{Id: "did-1"}}, nil - case strings.Repeat("b", 64): - return nil, nil - default: - return nil, errors.New("lookup failed") + lookupScopedObjectsBatch = func(ctx context.Context, drsCtx *config.GitContext, checksums []string) (map[string][]drsapi.DrsObject, error) { + got := map[string][]drsapi.DrsObject{} + for _, checksum := range checksums { + switch checksum { + case strings.Repeat("b", 64): + got[checksum] = []drsapi.DrsObject{{Id: "did-1"}} + default: + got[checksum] = nil + } } + return got, nil } cmd := &cobra.Command{} - rows, err := collectRows(cmd, "", "") + rows, err := collectRows(cmd, "", "", []string{"data/**"}, true) if err != nil { t.Fatalf("collectRows returned error: %v", err) } - if len(rows) != 3 { - t.Fatalf("expected 3 rows, got %d", len(rows)) + if len(rows) != 2 { + t.Fatalf("expected 2 rows, got %d", len(rows)) } - if rows[0].Path != "a/file1.bin" || rows[0].Status != "present" || rows[0].Detail != "drs://did-1" { - t.Fatalf("unexpected first row: %+v", rows[0]) + if rows[0].Path != "data/file2.bam" || !rows[0].Registered || rows[0].Detail != "drs://did-1" { + t.Fatalf("unexpected registered row: %+v", rows[0]) } - if rows[1].Path != "b/file2.bin" || rows[1].Status != "missing" || rows[1].Detail != "-" { - t.Fatalf("unexpected second row: %+v", rows[1]) - } - if rows[2].Path != 
"c/file3.bin" || rows[2].Status != "error" || rows[2].Detail != "lookup failed" { - t.Fatalf("unexpected third row: %+v", rows[2]) + if rows[1].Path != "data/file3.txt" || rows[1].Registered || rows[1].Detail != "" { + t.Fatalf("unexpected unregistered row: %+v", rows[1]) } + drsStatus = true + showLong = true var out bytes.Buffer cmd.SetOut(&out) if err := printRows(cmd, rows); err != nil { t.Fatalf("printRows returned error: %v", err) } got := out.String() - if !strings.Contains(got, "OID\tSTATUS\tPATH\tDETAIL\n") { - t.Fatalf("missing header in output: %q", got) + if !strings.Contains(got, rows[0].OID+" - data/file2.bam\tdrs://did-1\n") { + t.Fatalf("missing registered row: %q", got) } - if !strings.Contains(got, rows[0].OID+"\tpresent\ta/file1.bin\tdrs://did-1\n") { - t.Fatalf("missing present row: %q", got) + if !strings.Contains(got, rows[1].OID+" - data/file3.txt\t-\n") { + t.Fatalf("missing unregistered row: %q", got) } - if !strings.Contains(got, rows[1].OID+"\tmissing\tb/file2.bin\t-\n") { - t.Fatalf("missing missing row: %q", got) +} + +func TestCollectRowsWithDRSLookupBatchError(t *testing.T) { + resetFlagsForTest() + + oldLoadConfig := loadConfig + oldResolveRemote := resolveRemote + oldNewRemoteClient := newRemoteClient + oldLoadLFSInventory := loadLFSInventory + oldLookupScopedObjectsBatch := lookupScopedObjectsBatch + t.Cleanup(func() { + loadConfig = oldLoadConfig + resolveRemote = oldResolveRemote + newRemoteClient = oldNewRemoteClient + loadLFSInventory = oldLoadLFSInventory + lookupScopedObjectsBatch = oldLookupScopedObjectsBatch + }) + + loadConfig = func() (*config.Config, error) { return &config.Config{}, nil } + resolveRemote = func(cfg *config.Config, name string) (config.Remote, error) { + return config.Remote("origin"), nil } - if !strings.Contains(got, rows[2].OID+"\terror\tc/file3.bin\tlookup failed\n") { - t.Fatalf("missing error row: %q", got) + newRemoteClient = func(cfg *config.Config, remote config.Remote, logger *slog.Logger) 
(*config.GitContext, error) { + return &config.GitContext{}, nil + } + loadLFSInventory = func(gitRemoteName, gitRemoteLocation string, branches []string, logger *slog.Logger) (map[string]lfs.LfsFileInfo, error) { + return map[string]lfs.LfsFileInfo{ + "data/file2.bam": {Name: "data/file2.bam", Oid: strings.Repeat("b", 64)}, + "data/file3.txt": {Name: "data/file3.txt", Oid: strings.Repeat("c", 64)}, + }, nil + } + lookupScopedObjectsBatch = func(ctx context.Context, drsCtx *config.GitContext, checksums []string) (map[string][]drsapi.DrsObject, error) { + return nil, errors.New("lookup failed") + } + + cmd := &cobra.Command{} + rows, err := collectRows(cmd, "", "", []string{"data/**"}, true) + if err != nil { + t.Fatalf("collectRows returned error: %v", err) + } + if len(rows) != 2 { + t.Fatalf("expected 2 rows, got %d", len(rows)) + } + for _, row := range rows { + if row.Detail != "lookup failed" { + t.Fatalf("expected shared batch lookup error, got row=%+v", row) + } + } +} + +func TestValidateOutputFlags(t *testing.T) { + resetFlagsForTest() + + nameOnly = true + jsonOutput = true + if err := validateOutputFlags(); err == nil { + t.Fatal("expected name-only/json conflict") + } + + resetFlagsForTest() + nameOnly = true + showLong = true + if err := validateOutputFlags(); err == nil { + t.Fatal("expected long/name-only conflict") } } diff --git a/cmd/ping/main.go b/cmd/ping/main.go new file mode 100644 index 00000000..e02d070f --- /dev/null +++ b/cmd/ping/main.go @@ -0,0 +1,137 @@ +package ping + +import ( + "context" + "fmt" + "log/slog" + "strings" + + "github.com/calypr/git-drs/internal/config" + "github.com/calypr/git-drs/internal/drslog" + "github.com/spf13/cobra" +) + +type statusInfo struct { + Remote config.Remote + IsDefault bool + RemoteType string + Endpoint string + Organization string + Project string + Bucket string + StoragePrefix string + AuthMode string +} + +var pingHealth = func(ctx context.Context, gc *config.GitContext) error { + return 
gc.Client.Health().Ping(ctx) +} + +var Cmd = &cobra.Command{ + Use: "ping [remote-name]", + Short: "Show effective remote setup and verify the remote responds", + Args: func(cmd *cobra.Command, args []string) error { + if len(args) > 1 { + cmd.SilenceUsage = false + return fmt.Errorf("error: accepts at most 1 argument (remote name), received %d\n\nUsage: %s\n\nSee 'git drs ping --help' for more details", len(args), cmd.UseLine()) + } + return nil + }, + RunE: func(cmd *cobra.Command, args []string) error { + logger := drslog.GetLogger() + status, gc, err := resolveStatus(args, logger) + if err != nil { + return err + } + printStatus(status) + + if err := pingHealth(cmd.Context(), gc); err != nil { + return fmt.Errorf("remote health check failed for %q (%s): %w", status.Remote, status.Endpoint, err) + } + fmt.Println("health: ok") + return nil + }, +} + +func resolveStatus(args []string, logger *slog.Logger) (statusInfo, *config.GitContext, error) { + cfg, err := config.LoadConfig() + if err != nil { + return statusInfo{}, nil, err + } + + var remoteArg string + if len(args) == 1 { + remoteArg = args[0] + } + remoteName, err := cfg.GetRemoteOrDefault(remoteArg) + if err != nil { + return statusInfo{}, nil, err + } + + remoteCfg := cfg.GetRemote(remoteName) + if remoteCfg == nil { + return statusInfo{}, nil, fmt.Errorf("no remote configuration found for %q", remoteName) + } + + gc, err := cfg.GetRemoteClient(remoteName, logger) + if err != nil { + return statusInfo{}, nil, err + } + + status := statusInfo{ + Remote: remoteName, + IsDefault: remoteName == cfg.DefaultRemote, + Endpoint: remoteCfg.GetEndpoint(), + Organization: remoteCfg.GetOrganization(), + Project: remoteCfg.GetProjectId(), + Bucket: gc.BucketName, + StoragePrefix: gc.StoragePrefix, + AuthMode: authMode(gc), + } + switch remoteCfg.(type) { + case *config.Gen3Remote: + status.RemoteType = string(config.Gen3ServerType) + case *config.LocalRemote: + status.RemoteType = string(config.LocalServerType) + 
// blankIfEmpty strips surrounding whitespace from v and returns the result,
// substituting the placeholder "-" when nothing but whitespace was supplied.
func blankIfEmpty(v string) string {
	if trimmed := strings.TrimSpace(v); trimmed != "" {
		return trimmed
	}
	return "-"
}
testutils.CreateTestConfig(t, tmpDir, &config.Config{ + DefaultRemote: config.Remote(config.ORIGIN), + Remotes: map[config.Remote]config.RemoteSelect{ + config.Remote(config.ORIGIN): { + Local: &config.LocalRemote{ + BaseURL: "http://127.0.0.1:8080", + ProjectID: "end_to_end_test", + Bucket: "cbds", + Organization: "calypr", + BasicUsername: "drs-user", + BasicPassword: "drs-pass", + }, + }, + }, + }) + if err := gitrepo.SetBucketMapping("calypr", "end_to_end_test", "cbds", "prefix"); err != nil { + t.Fatalf("SetBucketMapping failed: %v", err) + } + + status, _, err := resolveStatus(nil, drslog.NewNoOpLogger()) + if err != nil { + t.Fatalf("resolveStatus returned error: %v", err) + } + if status.Remote != "origin" || !status.IsDefault { + t.Fatalf("unexpected remote selection: %+v", status) + } + if status.RemoteType != "local" || status.Endpoint != "http://127.0.0.1:8080" { + t.Fatalf("unexpected remote type/endpoint: %+v", status) + } + if status.Organization != "calypr" || status.Project != "end_to_end_test" { + t.Fatalf("unexpected scope: %+v", status) + } + if status.Bucket != "cbds" || status.StoragePrefix != "prefix" { + t.Fatalf("unexpected bucket scope: %+v", status) + } + if status.AuthMode != "none" { + t.Fatalf("expected auth mode none from client credential shape, got %+v", status) + } +} + +func TestPingRunEPrintsStatusAndHealth(t *testing.T) { + tmpDir := testutils.SetupTestGitRepo(t) + testutils.CreateTestConfig(t, tmpDir, &config.Config{ + DefaultRemote: config.Remote(config.ORIGIN), + Remotes: map[config.Remote]config.RemoteSelect{ + config.Remote(config.ORIGIN): { + Local: &config.LocalRemote{ + BaseURL: "http://127.0.0.1:8080", + ProjectID: "end_to_end_test", + Bucket: "cbds", + Organization: "calypr", + }, + }, + }, + }) + if err := gitrepo.SetBucketMapping("calypr", "end_to_end_test", "cbds", "prefix"); err != nil { + t.Fatalf("SetBucketMapping failed: %v", err) + } + + oldHealth := pingHealth + pingHealth = func(ctx context.Context, gc 
*config.GitContext) error { + if gc == nil || gc.ProjectId != "end_to_end_test" { + t.Fatalf("unexpected git context: %+v", gc) + } + return nil + } + t.Cleanup(func() { pingHealth = oldHealth }) + + oldStdout := os.Stdout + r, w, err := os.Pipe() + if err != nil { + t.Fatalf("pipe: %v", err) + } + os.Stdout = w + t.Cleanup(func() { os.Stdout = oldStdout }) + + runErr := Cmd.RunE(Cmd, nil) + _ = w.Close() + if runErr != nil { + t.Fatalf("Cmd.RunE returned error: %v", runErr) + } + + var buf bytes.Buffer + if _, err := io.Copy(&buf, r); err != nil { + t.Fatalf("read stdout: %v", err) + } + got := buf.String() + for _, want := range []string{ + "remote: origin (default)", + "type: local", + "endpoint: http://127.0.0.1:8080", + "organization: calypr", + "project: end_to_end_test", + "bucket: cbds", + "storage_prefix: prefix", + "health: ok", + } { + if !strings.Contains(got, want) { + t.Fatalf("expected output to contain %q, got %q", want, got) + } + } +} diff --git a/cmd/precommit/main.go b/cmd/precommit/main.go index ac2f3047..53ca5a12 100644 --- a/cmd/precommit/main.go +++ b/cmd/precommit/main.go @@ -26,6 +26,7 @@ import ( "os/exec" "path/filepath" "sort" + "strconv" "strings" "time" @@ -33,8 +34,14 @@ import ( ) const ( - cacheVersionDir = "drs/pre-commit/v1" - lfsSpecLine = "version https://git-lfs.github.com/spec/v1" + cacheVersionDir = "drs/pre-commit/v1" + lfsSpecLine = "version https://git-lfs.github.com/spec/v1" + defaultDirectCommitWarningThreshold = int64(10 * 1024 * 1024) +) + +var ( + directCommitWarningThresholdBytes = defaultDirectCommitWarningThreshold + confirmOversizedDirectGitCommit = promptOversizedDirectGitCommit ) type PathEntry struct { @@ -67,6 +74,11 @@ type Change struct { Status string // raw status, e.g. 
"A", "M", "D", "R100" } +type OversizedStagedFile struct { + Path string + Size int64 +} + // Cmd line declaration var Cmd = &cobra.Command{ Use: "precommit", @@ -114,6 +126,19 @@ func run(ctx context.Context) error { if len(changes) == 0 { return nil } + oversized, err := collectOversizedPlainGitStagedFiles(ctx, changes, directCommitWarningThresholdBytes) + if err != nil { + return err + } + if len(oversized) > 0 { + allowed, err := confirmOversizedDirectGitCommit(oversized) + if err != nil { + return err + } + if !allowed { + return fmt.Errorf("commit aborted so you can track large files before committing them directly to Git") + } + } now := time.Now().UTC().Format(time.RFC3339) @@ -349,6 +374,92 @@ func stagedLFSOID(ctx context.Context, path string) (string, bool, error) { return "", false, nil } +func stagedBlobSize(ctx context.Context, path string) (int64, error) { + out, err := git(ctx, "cat-file", "-s", ":"+path) + if err != nil { + return 0, err + } + size, err := strconv.ParseInt(strings.TrimSpace(string(out)), 10, 64) + if err != nil { + return 0, fmt.Errorf("parse staged blob size for %s: %w", path, err) + } + return size, nil +} + +func collectOversizedPlainGitStagedFiles(ctx context.Context, changes []Change, thresholdBytes int64) ([]OversizedStagedFile, error) { + if thresholdBytes <= 0 { + return nil, nil + } + var oversized []OversizedStagedFile + seen := make(map[string]struct{}) + for _, ch := range changes { + if ch.Kind != KindAdd && ch.Kind != KindModify && ch.Kind != KindRename { + continue + } + path := ch.NewPath + if path == "" { + continue + } + if _, ok := seen[path]; ok { + continue + } + seen[path] = struct{}{} + + _, isLFS, err := stagedLFSOID(ctx, path) + if err != nil { + continue + } + if isLFS { + continue + } + + size, err := stagedBlobSize(ctx, path) + if err != nil { + return nil, err + } + if size <= thresholdBytes { + continue + } + oversized = append(oversized, OversizedStagedFile{Path: path, Size: size}) + } + 
// humanBytes renders a byte count using binary (1024-based) units with one
// decimal place, e.g. "512 B", "1.5 KiB", "10.0 MiB".
//
// Values below 1024 (including negative values) are printed as a plain integer
// number of bytes. For larger values the unit is chosen so that the printed
// mantissa is always below 1024: a size just under a unit boundary such as
// 1048575 bytes is shown as "1.0 MiB" rather than the misleading "1024.0 KiB"
// that results from picking the unit before "%.1f" rounds the value up.
func humanBytes(n int64) string {
	const unit = 1024
	if n < unit {
		return fmt.Sprintf("%d B", n)
	}

	const prefixes = "KMGTPE"
	value := float64(n) / unit
	exp := 0
	// 1023.95 is the smallest mantissa that "%.1f" rounds up to "1024.0", so
	// anything at or above it belongs to the next larger unit. The bound on exp
	// keeps the index inside prefixes (int64 tops out in the EiB range anyway).
	for value >= 1023.95 && exp < len(prefixes)-1 {
		value /= unit
		exp++
	}
	return fmt.Sprintf("%.1f %ciB", value, prefixes[exp])
}
oversized payload"), 0o644); err != nil { + t.Fatalf("write plain file: %v", err) + } + gitCmd(t, repo, "add", "data/large.bin") + + pointerPath := filepath.Join(repo, "data", "pointer.bin") + lfsPointer := strings.Join([]string{ + "version https://git-lfs.github.com/spec/v1", + "oid sha256:deadbeef", + "size 999", + "", + }, "\n") + if err := os.WriteFile(pointerPath, []byte(lfsPointer), 0o644); err != nil { + t.Fatalf("write pointer file: %v", err) + } + gitCmd(t, repo, "add", "data/pointer.bin") + + changes, err := stagedChanges(context.Background()) + if err != nil { + t.Fatalf("stagedChanges: %v", err) + } + files, err := collectOversizedPlainGitStagedFiles(context.Background(), changes, 1) + if err != nil { + t.Fatalf("collectOversizedPlainGitStagedFiles: %v", err) + } + if len(files) != 1 { + t.Fatalf("expected 1 oversized plain file, got %d: %+v", len(files), files) + } + if files[0].Path != "data/large.bin" { + t.Fatalf("unexpected oversized file path: %+v", files[0]) + } +} + +func TestRunAbortsWhenOversizedPlainGitCommitIsRejected(t *testing.T) { + repo := setupGitRepo(t) + oldwd := mustChdir(t, repo) + t.Cleanup(func() { _ = os.Chdir(oldwd) }) + + path := filepath.Join(repo, "data", "large.bin") + if err := os.MkdirAll(filepath.Dir(path), 0o755); err != nil { + t.Fatalf("mkdir: %v", err) + } + if err := os.WriteFile(path, []byte("plain oversized payload"), 0o644); err != nil { + t.Fatalf("write file: %v", err) + } + gitCmd(t, repo, "add", "data/large.bin") + + oldThreshold := directCommitWarningThresholdBytes + oldPrompt := confirmOversizedDirectGitCommit + t.Cleanup(func() { + directCommitWarningThresholdBytes = oldThreshold + confirmOversizedDirectGitCommit = oldPrompt + }) + directCommitWarningThresholdBytes = 1 + confirmOversizedDirectGitCommit = func(files []OversizedStagedFile) (bool, error) { + if len(files) != 1 || files[0].Path != "data/large.bin" { + t.Fatalf("unexpected prompt files: %+v", files) + } + return false, nil + } + + err := 
// bufferStdin drains stdin into a temporary file obtained from createTempFile
// (called with an empty dir and the pattern "prepush-stdin-*") and returns that
// file, still open and positioned at offset zero, so the input can be read
// again.
//
// If copying or rewinding fails, the temporary file is closed and removed
// before the wrapped error is returned. On success the file is left on disk;
// this function does not delete it.
func bufferStdin(stdin io.Reader, createTempFile func(dir, pattern string) (*os.File, error)) (*os.File, error) {
	spool, err := createTempFile("", "prepush-stdin-*")
	if err != nil {
		return nil, fmt.Errorf("error creating temp file for stdin: %w", err)
	}

	// discard is best-effort cleanup for the failure paths; its own errors are
	// deliberately ignored because the original failure is what gets reported.
	discard := func() {
		_ = spool.Close()
		_ = os.Remove(spool.Name())
	}

	_, err = io.Copy(spool, stdin)
	if err != nil {
		discard()
		return nil, fmt.Errorf("error buffering stdin: %w", err)
	}

	_, err = spool.Seek(0, io.SeekStart)
	if err != nil {
		discard()
		return nil, fmt.Errorf("error seeking temp stdin: %w", err)
	}
	return spool, nil
}
myLogger.Debug("Warning. Skipping DRS preparation. Error getting remote configuration.") return nil } + drsClient, err := cfg.GetRemoteClient(remote, myLogger) + if err != nil { + return err + } scope, err := gitrepo.ResolveBucketScope( remoteConfig.GetOrganization(), @@ -117,6 +120,10 @@ func (s *PrePushService) Run(args []string, stdin io.Reader) error { myLogger.Error(fmt.Sprintf("error reading pushed refs: %v", err)) return err } + if _, err := drsdelete.ReconcileCommittedDeletes(ctx, drsClient, drsDeleteRefs(refs), myLogger); err != nil { + myLogger.Error(fmt.Sprintf("delete reconciliation failed: %v", err)) + return err + } branches := branchesFromRefs(refs) cache, cacheReady := openCache(ctx, myLogger) @@ -244,9 +251,6 @@ func toMetadataCandidate(c drsapi.DrsObjectCandidate) metadataCandidate { URL: accURL, }, } - if authzMap := syfoncommon.AuthzMapFromAccessMethodAuthorizations(am.Authorizations); len(authzMap) > 0 { - m.Authorizations = authzMap - } out.AccessMethods = append(out.AccessMethods, m) } } @@ -288,7 +292,7 @@ func submitPendingLFSMeta(ctx context.Context, remote config.Remote, endpoint st if err != nil { return fmt.Errorf("failed to create pending metadata request: %w", err) } - httpReq.Header.Set("Content-Type", "application/json") + httpReq.Header.Set("Content-Type", "application/vnd.git-lfs+json") httpReq.Header.Set("Accept", "application/vnd.git-lfs+json") if authHeader, ok := resolveRemoteAuthHeader(string(remote)); ok { httpReq.Header.Set("Authorization", authHeader) @@ -350,87 +354,6 @@ func parseRemoteArgs(args []string) (string, string) { return gitRemoteName, gitRemoteLocation } -type pushedRef struct { - LocalRef string - LocalSHA string - RemoteRef string - RemoteSHA string -} - -func bufferStdin(stdin io.Reader, createTempFile func(dir, pattern string) (*os.File, error)) (*os.File, error) { - tmp, err := createTempFile("", "prepush-stdin-*") - if err != nil { - return nil, fmt.Errorf("error creating temp file for stdin: %w", err) - 
} - - if _, err := io.Copy(tmp, stdin); err != nil { - _ = tmp.Close() - _ = os.Remove(tmp.Name()) - return nil, fmt.Errorf("error buffering stdin: %w", err) - } - - if _, err := tmp.Seek(0, 0); err != nil { - _ = tmp.Close() - _ = os.Remove(tmp.Name()) - return nil, fmt.Errorf("error seeking temp stdin: %w", err) - } - return tmp, nil -} - -// readPushedBranches reads git push lines from the provided temp file, -// extracts unique local branch names for refs under `refs/heads/` and -// returns them sorted. The file is rewound to the start before returning. -func readPushedRefs(f io.ReadSeeker) ([]pushedRef, error) { - // Ensure we read from start - // example: - // refs/heads/main 67890abcdef1234567890abcdef1234567890abcd refs/heads/main 12345abcdef67890abcdef1234567890abcdef12 - if _, err := f.Seek(0, 0); err != nil { - return nil, err - } - scanner := bufio.NewScanner(f) - refs := make([]pushedRef, 0) - for scanner.Scan() { - line := scanner.Text() - fields := strings.Fields(line) - if len(fields) < 4 { - continue - } - refs = append(refs, pushedRef{ - LocalRef: fields[0], - LocalSHA: fields[1], - RemoteRef: fields[2], - RemoteSHA: fields[3], - }) - } - if err := scanner.Err(); err != nil { - return nil, err - } - // Rewind so caller can reuse the file - if _, err := f.Seek(0, 0); err != nil { - return nil, err - } - return refs, nil -} - -func branchesFromRefs(refs []pushedRef) []string { - const prefix = "refs/heads/" - set := make(map[string]struct{}) - for _, ref := range refs { - if strings.HasPrefix(ref.LocalRef, prefix) { - branch := strings.TrimPrefix(ref.LocalRef, prefix) - if branch != "" { - set[branch] = struct{}{} - } - } - } - branches := make([]string, 0, len(set)) - for b := range set { - branches = append(branches, b) - } - sort.Strings(branches) - return branches -} - func openCache(ctx context.Context, logger *slog.Logger) (*precommit_cache.Cache, bool) { cache, err := precommit_cache.Open(ctx) if err != nil { @@ -561,45 +484,3 @@ func 
gitOutput(ctx context.Context, args ...string) (string, error) { } return string(out), nil } - -// readPushedBranches reads git push lines from the provided temp file, -// extracts unique local branch names for refs under `refs/heads/` and -// returns them sorted. The file is rewound to the start before returning. -func readPushedBranches(f *os.File) ([]string, error) { - // Ensure we read from start - // example: - // refs/heads/main 67890abcdef1234567890abcdef1234567890abcd refs/heads/main 12345abcdef67890abcdef1234567890abcdef12 - if _, err := f.Seek(0, 0); err != nil { - return nil, err - } - scanner := bufio.NewScanner(f) - set := make(map[string]struct{}) - for scanner.Scan() { - line := scanner.Text() - fields := strings.Fields(line) - if len(fields) < 1 { - continue - } - localRef := fields[0] - const prefix = "refs/heads/" - if strings.HasPrefix(localRef, prefix) { - branch := strings.TrimPrefix(localRef, prefix) - if branch != "" { - set[branch] = struct{}{} - } - } - } - if err := scanner.Err(); err != nil { - return nil, err - } - branches := make([]string, 0, len(set)) - for b := range set { - branches = append(branches, b) - } - sort.Strings(branches) - // Rewind so caller can reuse the file - if _, err := f.Seek(0, 0); err != nil { - return nil, err - } - return branches, nil -} diff --git a/cmd/prepush/main_test.go b/cmd/prepush/main_test.go index 5fad785b..99b0a164 100644 --- a/cmd/prepush/main_test.go +++ b/cmd/prepush/main_test.go @@ -100,7 +100,7 @@ func TestLfsFilesFromCache(t *testing.T) { } } -func TestReadPushedBranches(t *testing.T) { +func TestReadPushedRefsAndBranchesFromRefs(t *testing.T) { tests := []struct { name string input string @@ -145,12 +145,11 @@ func TestReadPushedBranches(t *testing.T) { t.Fatalf("write temp: %v", err) } - // readPushedBranches seeks to 0 itself, but we pass the *os.File - // which must be valid. 
- branches, err := readPushedBranches(tmp) + refs, err := readPushedRefs(tmp) if err != nil { - t.Fatalf("readPushedBranches error: %v", err) + t.Fatalf("readPushedRefs error: %v", err) } + branches := branchesFromRefs(refs) if len(branches) != len(tt.expected) { t.Errorf("expected %d branches, got %d: %v", len(tt.expected), len(branches), branches) @@ -363,8 +362,8 @@ func TestSubmitPendingLFSMetaRequestWiring(t *testing.T) { if gotAuth != "Bearer test-token" { t.Fatalf("expected auth header, got %q", gotAuth) } - if gotContentType != "application/json" { - t.Fatalf("expected content-type application/json, got %q", gotContentType) + if gotContentType != "application/vnd.git-lfs+json" { + t.Fatalf("expected content-type application/vnd.git-lfs+json, got %q", gotContentType) } if gotAccept != "application/vnd.git-lfs+json" { t.Fatalf("expected accept header application/vnd.git-lfs+json, got %q", gotAccept) diff --git a/cmd/prepush/pushed_refs.go b/cmd/prepush/pushed_refs.go new file mode 100644 index 00000000..6db5298e --- /dev/null +++ b/cmd/prepush/pushed_refs.go @@ -0,0 +1,76 @@ +package prepush + +import ( + "bufio" + "io" + "sort" + "strings" + + "github.com/calypr/git-drs/internal/drsdelete" +) + +type pushedRef struct { + LocalRef string + LocalSHA string + RemoteRef string + RemoteSHA string +} + +// readPushedRefs parses git's pre-push stdin format and rewinds the reader +// before returning so callers can reuse the buffered input. 
+func readPushedRefs(f io.ReadSeeker) ([]pushedRef, error) { + if _, err := f.Seek(0, 0); err != nil { + return nil, err + } + scanner := bufio.NewScanner(f) + refs := make([]pushedRef, 0) + for scanner.Scan() { + fields := strings.Fields(scanner.Text()) + if len(fields) < 4 { + continue + } + refs = append(refs, pushedRef{ + LocalRef: fields[0], + LocalSHA: fields[1], + RemoteRef: fields[2], + RemoteSHA: fields[3], + }) + } + if err := scanner.Err(); err != nil { + return nil, err + } + if _, err := f.Seek(0, 0); err != nil { + return nil, err + } + return refs, nil +} + +func branchesFromRefs(refs []pushedRef) []string { + const prefix = "refs/heads/" + set := make(map[string]struct{}) + for _, ref := range refs { + if strings.HasPrefix(ref.LocalRef, prefix) { + branch := strings.TrimPrefix(ref.LocalRef, prefix) + if branch != "" { + set[branch] = struct{}{} + } + } + } + branches := make([]string, 0, len(set)) + for branch := range set { + branches = append(branches, branch) + } + sort.Strings(branches) + return branches +} + +func drsDeleteRefs(refs []pushedRef) []drsdelete.RefUpdate { + out := make([]drsdelete.RefUpdate, 0, len(refs)) + for _, ref := range refs { + out = append(out, drsdelete.RefUpdate{ + OldSHA: strings.TrimSpace(ref.RemoteSHA), + NewSHA: strings.TrimSpace(ref.LocalSHA), + }) + } + return out +} diff --git a/cmd/pull/main.go b/cmd/pull/main.go index cf352d73..75e1d75d 100644 --- a/cmd/pull/main.go +++ b/cmd/pull/main.go @@ -3,30 +3,38 @@ package pull import ( "context" "fmt" + "log/slog" "net/url" "os" - "os/exec" + "sort" "strings" - "github.com/bytedance/sonic" "github.com/calypr/git-drs/internal/common" "github.com/calypr/git-drs/internal/config" "github.com/calypr/git-drs/internal/drslog" "github.com/calypr/git-drs/internal/drsremote" "github.com/calypr/git-drs/internal/lfs" + "github.com/calypr/git-drs/internal/pathspec" drsapi "github.com/calypr/syfon/apigen/client/drs" "github.com/spf13/cobra" ) -var runCommand = func(name string, args 
...string) ([]byte, error) { - cmd := exec.Command(name, args...) - return cmd.CombinedOutput() -} +var includePatterns []string +var dryRun bool + +var ( + loadCfg = config.LoadConfig + resolveRemote = func(cfg *config.Config, name string) (config.Remote, error) { return cfg.GetRemoteOrDefault(name) } + newRemoteClient = func(cfg *config.Config, remote config.Remote, logger *slog.Logger) (*config.GitContext, error) { + return cfg.GetRemoteClient(remote, logger) + } + loadWorktreeInventory = lfs.GetWorktreeLfsFiles +) var Cmd = &cobra.Command{ Use: "pull [remote-name]", - Short: "Pull using the standard Git + Git LFS flow", - Long: "Pull using the standard Git + Git LFS flow (git pull, git lfs pull, git lfs checkout).", + Short: "Download DRS pointer file content into the current checkout", + Long: "Hydrate DRS/Git-LFS pointer files in the current checkout. By default this mirrors git lfs pull semantics for the worktree rather than running git pull.", Args: func(cmd *cobra.Command, args []string) error { if len(args) > 1 { cmd.SilenceUsage = false @@ -37,7 +45,7 @@ var Cmd = &cobra.Command{ RunE: func(cmd *cobra.Command, args []string) error { logg := drslog.GetLogger() - cfg, err := config.LoadConfig() + cfg, err := loadCfg() if err != nil { return fmt.Errorf("error loading config: %v", err) } @@ -46,55 +54,50 @@ var Cmd = &cobra.Command{ if len(args) > 0 { remote = config.Remote(args[0]) } else { - remote, err = cfg.GetDefaultRemote() + remote, err = resolveRemote(cfg, "") if err != nil { logg.Error(fmt.Sprintf("Error getting remote: %v", err)) return err } } - drsCtx, err := cfg.GetRemoteClient(remote, logg) + drsCtx, err := newRemoteClient(cfg, remote, logg) if err != nil { logg.Error(fmt.Sprintf("error creating DRS client: %s", err)) return err } - _ = drsCtx // Remote validation only. 
- if out, err := runCommand("git", "pull", string(remote)); err != nil { - msg := strings.TrimSpace(string(out)) - if msg == "" { - msg = err.Error() - } - return fmt.Errorf("git pull failed for remote %q: %s", remote, msg) + inventory, err := loadWorktreeInventory(logg) + if err != nil { + return fmt.Errorf("failed to discover pointer files in worktree: %w", err) } - - var parsed struct { - Files []lfs.LfsFileInfo `json:"files"` + pointers := collectPointerFiles(inventory, includePatterns) + if len(pointers) == 0 { + logg.Debug("no matching pointer files to hydrate") + return nil } - out, err := runCommand("git", "lfs", "ls-files", "--json") - if err != nil { - msg := commandMessage(out, err) - if !isMissingGitLFS(msg) { - return fmt.Errorf("git lfs ls-files failed: %s", msg) - } - lfsFiles, inventoryErr := lfs.GetAllLfsFiles(string(remote), "", []string{"HEAD"}, logg) - if inventoryErr != nil { - return fmt.Errorf("git lfs ls-files failed: %s; fallback inventory failed: %w", msg, inventoryErr) - } - parsed.Files = make([]lfs.LfsFileInfo, 0, len(lfsFiles)) - for _, f := range lfsFiles { - parsed.Files = append(parsed.Files, f) + + if dryRun { + for _, f := range pointers { + if _, err := fmt.Fprintln(cmd.OutOrStdout(), f.Name); err != nil { + return err + } } - } else if err := lfsjsonUnmarshal(out, &parsed); err != nil { - return fmt.Errorf("failed to parse git lfs ls-files output: %w", err) + return nil } ctx := context.Background() - missingOIDs := make([]string, 0, len(parsed.Files)) - seenMissing := make(map[string]struct{}, len(parsed.Files)) - for _, f := range parsed.Files { - if f.Downloaded { + missingOIDs := make([]string, 0, len(pointers)) + seenMissing := make(map[string]struct{}, len(pointers)) + for _, f := range pointers { + cachePath, err := lfs.ObjectPath(common.LFS_OBJS_PATH, f.Oid) + if err != nil { + return fmt.Errorf("failed to resolve LFS object path for %s: %w", f.Oid, err) + } + if _, err := os.Stat(cachePath); err == nil { continue + } 
else if !os.IsNotExist(err) { + return fmt.Errorf("failed to stat cached object for %s: %w", f.Oid, err) } if _, seen := seenMissing[f.Oid]; seen { continue @@ -131,14 +134,16 @@ var Cmd = &cobra.Command{ logg.Debug(fmt.Sprintf("bulk access prefetch failed; continuing per-object: %v", err)) } } - for _, f := range parsed.Files { - if f.Downloaded { - continue - } + for _, f := range pointers { dstPath, err := lfs.ObjectPath(common.LFS_OBJS_PATH, f.Oid) if err != nil { return fmt.Errorf("failed to resolve LFS object path for %s: %w", f.Oid, err) } + if _, err := os.Stat(dstPath); err == nil { + continue + } else if !os.IsNotExist(err) { + return fmt.Errorf("failed to stat cache path %s: %w", dstPath, err) + } if obj, ok := prefetched[f.Oid]; ok { if accessURL, ok := prefetchedAccess[obj.Id]; ok { objCopy := obj @@ -155,16 +160,10 @@ var Cmd = &cobra.Command{ } } } else { - logg.Debug("no missing LFS objects to download") + logg.Debug("no missing pointer objects to download") } - if out, err := runCommand("git", "lfs", "checkout"); err != nil { - msg := commandMessage(out, err) - if !isMissingGitLFS(msg) { - return fmt.Errorf("git lfs checkout failed: %s", msg) - } - } - if err := checkoutDownloadedFiles(parsed.Files); err != nil { + if err := checkoutDownloadedFiles(pointers); err != nil { return err } @@ -172,19 +171,31 @@ var Cmd = &cobra.Command{ }, } -func commandMessage(out []byte, err error) string { - msg := strings.TrimSpace(string(out)) - if msg == "" && err != nil { - msg = err.Error() - } - return msg +type pointerFile struct { + Name string + Oid string + Size int64 } -func isMissingGitLFS(msg string) bool { - return strings.Contains(msg, "git: 'lfs' is not a git command") +func collectPointerFiles(inventory map[string]lfs.LfsFileInfo, patterns []string) []pointerFile { + keys := make([]string, 0, len(inventory)) + for path := range inventory { + if !pathspec.MatchesAny(path, patterns) { + continue + } + keys = append(keys, path) + } + sort.Strings(keys) 
+ + files := make([]pointerFile, 0, len(keys)) + for _, path := range keys { + info := inventory[path] + files = append(files, pointerFile{Name: path, Oid: info.Oid, Size: info.Size}) + } + return files } -func checkoutDownloadedFiles(files []lfs.LfsFileInfo) error { +func checkoutDownloadedFiles(files []pointerFile) error { for _, f := range files { if strings.TrimSpace(f.Name) == "" || strings.TrimSpace(f.Oid) == "" { continue @@ -204,10 +215,6 @@ func checkoutDownloadedFiles(files []lfs.LfsFileInfo) error { return nil } -var lfsjsonUnmarshal = func(data []byte, v any) error { - return sonic.ConfigFastest.Unmarshal(data, v) -} - func buildPullDownloadDebugContext(ctx context.Context, drsCtx *config.GitContext, oid string) string { recs, err := drsremote.ObjectsByHashForScope(ctx, drsCtx, oid) if err != nil { @@ -242,3 +249,8 @@ func buildPullDownloadDebugContext(ctx context.Context, drsCtx *config.GitContex } return fmt.Sprintf("oid=%s did=%s size=%d access_methods=%s", oid, strings.TrimSpace(match.Id), match.Size, strings.Join(methods, ", ")) } + +func init() { + Cmd.Flags().StringArrayVarP(&includePatterns, "include", "I", nil, "include pathspec/glob pattern(s)") + Cmd.Flags().BoolVar(&dryRun, "dry-run", false, "list matching pointer files without downloading them") +} diff --git a/cmd/pull/pull_test.go b/cmd/pull/pull_test.go index 41c999ac..7e8e6c74 100644 --- a/cmd/pull/pull_test.go +++ b/cmd/pull/pull_test.go @@ -1,34 +1,81 @@ package pull import ( + "bytes" + "log/slog" "testing" "github.com/calypr/git-drs/internal/config" - "github.com/calypr/git-drs/internal/testutils" - "github.com/stretchr/testify/assert" + "github.com/calypr/git-drs/internal/lfs" ) -func TestPullCmdArgs(t *testing.T) { - err := Cmd.Args(Cmd, []string{}) - assert.NoError(t, err) +func resetPullFlagsForTest() { + includePatterns = nil + dryRun = false +} - err = Cmd.Args(Cmd, []string{"origin"}) - assert.NoError(t, err) +func TestCollectPointerFilesFiltersAndSorts(t *testing.T) { + 
resetPullFlagsForTest() - err = Cmd.Args(Cmd, []string{"origin", "extra"}) - assert.Error(t, err) -} + inventory := map[string]lfs.LfsFileInfo{ + "data/b.bin": {Name: "data/b.bin", Oid: "bbbb", Size: 2}, + "data/a.bin": {Name: "data/a.bin", Oid: "aaaa", Size: 1}, + "misc/c.bin": {Name: "misc/c.bin", Oid: "cccc", Size: 3}, + } -func TestPullRun_LoadConfigError(t *testing.T) { - _ = testutils.SetupTestGitRepo(t) - err := Cmd.RunE(Cmd, []string{}) - assert.Error(t, err) + files := collectPointerFiles(inventory, []string{"data/**"}) + if len(files) != 2 { + t.Fatalf("expected 2 files, got %d", len(files)) + } + if files[0].Name != "data/a.bin" || files[1].Name != "data/b.bin" { + t.Fatalf("unexpected file order: %+v", files) + } } -func TestPullRun_DefaultRemoteError(t *testing.T) { - tmpDir := testutils.SetupTestGitRepo(t) - testutils.CreateTestConfig(t, tmpDir, &config.Config{}) +func TestPullDryRunListsMatchingPaths(t *testing.T) { + resetPullFlagsForTest() + + oldLoadCfg := loadCfg + oldResolveRemote := resolveRemote + oldNewRemoteClient := newRemoteClient + oldInventory := loadWorktreeInventory + t.Cleanup(func() { + loadCfg = oldLoadCfg + resolveRemote = oldResolveRemote + newRemoteClient = oldNewRemoteClient + loadWorktreeInventory = oldInventory + }) + + loadCfg = func() (*config.Config, error) { return &config.Config{}, nil } + resolveRemote = func(cfg *config.Config, name string) (config.Remote, error) { return config.Remote("origin"), nil } + newRemoteClient = func(cfg *config.Config, remote config.Remote, logger *slog.Logger) (*config.GitContext, error) { + return &config.GitContext{}, nil + } + loadWorktreeInventory = func(_ *slog.Logger) (map[string]lfs.LfsFileInfo, error) { + return map[string]lfs.LfsFileInfo{ + "data/a.bin": {Name: "data/a.bin", Oid: "aaaa", Size: 1}, + "misc/b.bin": {Name: "misc/b.bin", Oid: "bbbb", Size: 2}, + }, nil + } + + includePatterns = []string{"data/**"} + dryRun = true + + var out bytes.Buffer + Cmd.SetOut(&out) + 
Cmd.SetErr(&out) + Cmd.SetArgs([]string{"--dry-run"}) + t.Cleanup(func() { + Cmd.SetOut(nil) + Cmd.SetErr(nil) + Cmd.SetArgs(nil) + resetPullFlagsForTest() + }) - err := Cmd.RunE(Cmd, []string{}) - assert.Error(t, err) + if err := Cmd.RunE(Cmd, []string{}); err != nil { + t.Fatalf("RunE returned error: %v", err) + } + if got := out.String(); got != "data/a.bin\n" { + t.Fatalf("unexpected dry-run output: %q", got) + } } diff --git a/cmd/push/main.go b/cmd/push/main.go index 4d0445ca..2e04d189 100644 --- a/cmd/push/main.go +++ b/cmd/push/main.go @@ -3,10 +3,12 @@ package push import ( "context" "fmt" + "os" "os/exec" "strings" "github.com/calypr/git-drs/internal/config" + "github.com/calypr/git-drs/internal/drsdelete" "github.com/calypr/git-drs/internal/drslog" "github.com/calypr/git-drs/internal/lfs" "github.com/calypr/git-drs/internal/pushsync" @@ -20,6 +22,8 @@ var runCommand = func(name string, args ...string) ([]byte, error) { return cmd.CombinedOutput() } +var gitOutputFn = gitOutput + var Cmd = &cobra.Command{ Use: "push [remote-name]", Short: "Upload/register DRS objects and push Git refs", @@ -61,9 +65,19 @@ var Cmd = &cobra.Command{ } ctx := context.Background() - if err := pushsync.BatchSyncForPush(drsClient, ctx, lfsFiles); err != nil { + deleteRefs, err := currentDeleteRefUpdates(ctx) + if err != nil { + return fmt.Errorf("failed to resolve delete reconciliation base: %w", err) + } + if _, err := drsdelete.ReconcileCommittedDeletes(ctx, drsClient, deleteRefs, myLogger); err != nil { + return fmt.Errorf("failed to reconcile deletes: %w", err) + } + progress := newUploadProgressRenderer(os.Stderr) + if err := pushsync.BatchSyncForPush(drsClient, ctx, lfsFiles, progress); err != nil { + progress.Finish() return fmt.Errorf("failed batch register/upload workflow: %w", err) } + progress.Finish() pushArgs := []string{"push"} if !pushWithHooks { @@ -85,3 +99,27 @@ var Cmd = &cobra.Command{ func init() { Cmd.Flags().BoolVar(&pushWithHooks, "with-hooks", false, 
"Run git push with local hooks enabled (invokes pre-push)") } + +func currentDeleteRefUpdates(ctx context.Context) ([]drsdelete.RefUpdate, error) { + head, err := gitOutputFn(ctx, "rev-parse", "HEAD") + if err != nil { + return nil, err + } + upstream, err := gitOutputFn(ctx, "rev-parse", "--verify", "@{upstream}") + if err != nil { + return nil, nil + } + return []drsdelete.RefUpdate{{ + OldSHA: upstream, + NewSHA: head, + }}, nil +} + +func gitOutput(ctx context.Context, args ...string) (string, error) { + cmd := exec.CommandContext(ctx, "git", args...) + out, err := cmd.CombinedOutput() + if err != nil { + return "", fmt.Errorf("git %s: %s", strings.Join(args, " "), strings.TrimSpace(string(out))) + } + return strings.TrimSpace(string(out)), nil +} diff --git a/cmd/push/main_test.go b/cmd/push/main_test.go new file mode 100644 index 00000000..b18b8a87 --- /dev/null +++ b/cmd/push/main_test.go @@ -0,0 +1,58 @@ +package push + +import ( + "context" + "fmt" + "testing" + + "github.com/calypr/git-drs/internal/drsdelete" +) + +func TestCurrentDeleteRefUpdatesUsesUpstreamWhenConfigured(t *testing.T) { + oldFn := gitOutputFn + gitOutputFn = func(ctx context.Context, args ...string) (string, error) { + switch fmt.Sprint(args) { + case "[rev-parse HEAD]": + return "head-sha", nil + case "[rev-parse --verify @{upstream}]": + return "upstream-sha", nil + default: + t.Fatalf("unexpected git args: %v", args) + return "", nil + } + } + t.Cleanup(func() { gitOutputFn = oldFn }) + + got, err := currentDeleteRefUpdates(context.Background()) + if err != nil { + t.Fatalf("currentDeleteRefUpdates returned error: %v", err) + } + want := []drsdelete.RefUpdate{{OldSHA: "upstream-sha", NewSHA: "head-sha"}} + if len(got) != len(want) || got[0] != want[0] { + t.Fatalf("unexpected delete refs: got %+v want %+v", got, want) + } +} + +func TestCurrentDeleteRefUpdatesSkipsWhenUpstreamMissing(t *testing.T) { + oldFn := gitOutputFn + gitOutputFn = func(ctx context.Context, args ...string) 
(string, error) { + switch fmt.Sprint(args) { + case "[rev-parse HEAD]": + return "head-sha", nil + case "[rev-parse --verify @{upstream}]": + return "", fmt.Errorf("git rev-parse --verify @{upstream}: fatal: no upstream configured") + default: + t.Fatalf("unexpected git args: %v", args) + return "", nil + } + } + t.Cleanup(func() { gitOutputFn = oldFn }) + + got, err := currentDeleteRefUpdates(context.Background()) + if err != nil { + t.Fatalf("currentDeleteRefUpdates returned error: %v", err) + } + if got != nil { + t.Fatalf("expected nil delete refs when upstream is missing, got %+v", got) + } +} diff --git a/cmd/push/progress.go b/cmd/push/progress.go new file mode 100644 index 00000000..2c6dd6fe --- /dev/null +++ b/cmd/push/progress.go @@ -0,0 +1,189 @@ +package push + +import ( + "fmt" + "io" + "strings" + "sync" + "time" + + "github.com/calypr/git-drs/internal/pushsync" + "github.com/mattn/go-isatty" +) + +const nonTTYProgressInterval = 2 * time.Second + +type uploadFileProgress struct { + path string + total int64 + uploaded int64 + completed bool +} + +type uploadProgressRenderer struct { + out io.Writer + isTTY bool + now func() time.Time + lastRender time.Time + mu sync.Mutex + planned bool + plan pushsync.UploadPlanSummary + files map[string]*uploadFileProgress + totalBytes int64 + doneBytes int64 + doneFiles int + currentLabel string +} + +func newUploadProgressRenderer(out io.Writer) *uploadProgressRenderer { + return &uploadProgressRenderer{ + out: out, + isTTY: isWriterTTY(out), + now: time.Now, + files: make(map[string]*uploadFileProgress), + } +} + +func isWriterTTY(w io.Writer) bool { + type fdWriter interface{ Fd() uintptr } + f, ok := w.(fdWriter) + if !ok { + return false + } + fd := f.Fd() + return isatty.IsTerminal(fd) || isatty.IsCygwinTerminal(fd) +} + +func (r *uploadProgressRenderer) OnUploadPlan(plan pushsync.UploadPlanSummary) { + r.mu.Lock() + defer r.mu.Unlock() + r.plan = plan + r.planned = plan.TotalFiles > 0 + r.totalBytes = 0 + 
r.doneBytes = 0 + r.doneFiles = 0 + r.currentLabel = "" + r.files = make(map[string]*uploadFileProgress, len(plan.Files)) + for _, file := range plan.Files { + r.files[file.OID] = &uploadFileProgress{ + path: file.Path, + total: file.Bytes, + } + r.totalBytes += file.Bytes + } + if r.planned { + r.renderLocked(true) + } +} + +func (r *uploadProgressRenderer) OnUploadProgress(ev pushsync.UploadProgressEvent) { + r.mu.Lock() + defer r.mu.Unlock() + if !r.planned { + return + } + file, ok := r.files[ev.OID] + if !ok { + return + } + if ev.Path != "" { + file.path = ev.Path + } + if ev.TotalBytes > 0 { + file.total = ev.TotalBytes + } + if file.total > 0 && ev.BytesSoFar > file.total { + ev.BytesSoFar = file.total + } + if ev.BytesSoFar > file.uploaded { + r.doneBytes += ev.BytesSoFar - file.uploaded + file.uploaded = ev.BytesSoFar + } + if ev.Path != "" { + r.currentLabel = ev.Path + } else if file.path != "" { + r.currentLabel = file.path + } + if ev.Phase == pushsync.UploadProgressCompleted && !file.completed { + file.completed = true + r.doneFiles++ + if file.total > 0 && file.uploaded < file.total { + r.doneBytes += file.total - file.uploaded + file.uploaded = file.total + } + } + r.renderLocked(false) +} + +func (r *uploadProgressRenderer) Finish() { + r.mu.Lock() + defer r.mu.Unlock() + if !r.planned { + return + } + r.renderLocked(true) + if r.isTTY { + _, _ = fmt.Fprintln(r.out) + } + r.planned = false +} + +func (r *uploadProgressRenderer) renderLocked(force bool) { + now := r.now() + if !force && !r.isTTY && !r.lastRender.IsZero() && now.Sub(r.lastRender) < nonTTYProgressInterval { + return + } + r.lastRender = now + totalBytes := r.totalBytes + doneBytes := r.doneBytes + doneFiles := r.doneFiles + totalFiles := r.plan.TotalFiles + percent := 0.0 + if totalBytes > 0 { + percent = (float64(doneBytes) / float64(totalBytes)) * 100 + } + current := r.currentLabel + if current == "" { + current = "preparing uploads" + } + + if r.isTTY { + barWidth := 28 + filled 
:= 0 + if totalBytes > 0 { + filled = int((float64(doneBytes) / float64(totalBytes)) * float64(barWidth)) + } + if filled > barWidth { + filled = barWidth + } + bar := strings.Repeat("=", filled) + strings.Repeat(" ", barWidth-filled) + line := fmt.Sprintf("\rUploading %d/%d files [%s] %5.1f%% %s/%s current: %s", + doneFiles, totalFiles, bar, percent, humanBytes(doneBytes), humanBytes(totalBytes), trimProgressLabel(current, 48)) + _, _ = fmt.Fprint(r.out, line) + return + } + + line := fmt.Sprintf("Uploading %d/%d files (%.1f%%) %s/%s current=%s\n", + doneFiles, totalFiles, percent, humanBytes(doneBytes), humanBytes(totalBytes), current) + _, _ = fmt.Fprint(r.out, line) +} + +func trimProgressLabel(s string, max int) string { + if max <= 3 || len(s) <= max { + return s + } + return "..." + s[len(s)-(max-3):] +} + +func humanBytes(n int64) string { + const unit = 1024 + if n < unit { + return fmt.Sprintf("%d B", n) + } + div, exp := int64(unit), 0 + for v := n / unit; v >= unit; v /= unit { + div *= unit + exp++ + } + return fmt.Sprintf("%.1f %ciB", float64(n)/float64(div), "KMGTPE"[exp]) +} diff --git a/cmd/push/progress_test.go b/cmd/push/progress_test.go new file mode 100644 index 00000000..a740822c --- /dev/null +++ b/cmd/push/progress_test.go @@ -0,0 +1,73 @@ +package push + +import ( + "bytes" + "strings" + "testing" + "time" + + "github.com/calypr/git-drs/internal/pushsync" +) + +func TestUploadProgressRendererTTY(t *testing.T) { + var out bytes.Buffer + r := newUploadProgressRenderer(&out) + r.isTTY = true + + r.OnUploadPlan(pushsync.UploadPlanSummary{ + Files: []pushsync.UploadPlanFile{ + {OID: "oid-1", Path: "a.bin", Bytes: 100}, + {OID: "oid-2", Path: "b.bin", Bytes: 100}, + }, + TotalFiles: 2, + TotalBytes: 200, + }) + r.OnUploadProgress(pushsync.UploadProgressEvent{OID: "oid-1", Path: "a.bin", BytesSoFar: 50, BytesSinceLast: 50, TotalBytes: 100, Phase: pushsync.UploadProgressUploading}) + r.OnUploadProgress(pushsync.UploadProgressEvent{OID: "oid-1", 
Path: "a.bin", BytesSoFar: 100, TotalBytes: 100, Phase: pushsync.UploadProgressCompleted}) + r.OnUploadProgress(pushsync.UploadProgressEvent{OID: "oid-2", Path: "b.bin", BytesSoFar: 100, TotalBytes: 100, Phase: pushsync.UploadProgressCompleted}) + r.Finish() + + got := out.String() + if !strings.Contains(got, "Uploading 2/2 files") { + t.Fatalf("expected final tty summary, got %q", got) + } + if !strings.Contains(got, "100.0%") { + t.Fatalf("expected 100%% completion, got %q", got) + } + if !strings.HasSuffix(got, "\n") { + t.Fatalf("expected trailing newline, got %q", got) + } +} + +func TestUploadProgressRendererNonTTYThrottles(t *testing.T) { + var out bytes.Buffer + r := newUploadProgressRenderer(&out) + r.isTTY = false + now := time.Unix(0, 0) + r.now = func() time.Time { return now } + + r.OnUploadPlan(pushsync.UploadPlanSummary{ + Files: []pushsync.UploadPlanFile{{OID: "oid-1", Path: "a.bin", Bytes: 100}}, + TotalFiles: 1, + TotalBytes: 100, + }) + first := out.String() + if first == "" { + t.Fatal("expected initial non-tty progress line") + } + + r.OnUploadProgress(pushsync.UploadProgressEvent{OID: "oid-1", Path: "a.bin", BytesSoFar: 10, BytesSinceLast: 10, TotalBytes: 100, Phase: pushsync.UploadProgressUploading}) + if out.String() != first { + t.Fatalf("expected throttled output to remain unchanged, got %q", out.String()) + } + + now = now.Add(3 * time.Second) + r.OnUploadProgress(pushsync.UploadProgressEvent{OID: "oid-1", Path: "a.bin", BytesSoFar: 100, TotalBytes: 100, Phase: pushsync.UploadProgressCompleted}) + got := out.String() + if strings.Count(got, "\n") < 2 { + t.Fatalf("expected throttled summary updates, got %q", got) + } + if !strings.Contains(got, "Uploading 1/1 files") { + t.Fatalf("expected non-tty progress summary, got %q", got) + } +} diff --git a/cmd/remote/add/add_test.go b/cmd/remote/add/add_test.go index 2923b5d8..e71c3745 100644 --- a/cmd/remote/add/add_test.go +++ b/cmd/remote/add/add_test.go @@ -1,8 +1,13 @@ package add import ( + 
"context" + "encoding/json" + "net/http" + "net/http/httptest" "testing" + bucketapi "github.com/calypr/syfon/apigen/client/bucketapi" "github.com/stretchr/testify/assert" ) @@ -12,5 +17,89 @@ func TestAddCmd(t *testing.T) { } func TestGen3Cmd(t *testing.T) { - assert.Equal(t, "gen3 [remote-name]", Gen3Cmd.Use) + assert.Equal(t, "gen3 [remote-name] ", Gen3Cmd.Use) +} + +func TestParseScopeArg(t *testing.T) { + t.Run("splits org and project on slash", func(t *testing.T) { + org, project, err := parseScopeArg("HTAN_INT/BForePC") + if err != nil { + t.Fatalf("parseScopeArg returned error: %v", err) + } + if org != "HTAN_INT" || project != "BForePC" { + t.Fatalf("unexpected scope parse result: %q/%q", org, project) + } + }) + + t.Run("rejects legacy single token input", func(t *testing.T) { + _, _, err := parseScopeArg("BForePC") + if err == nil { + t.Fatal("expected invalid scope error") + } + }) + + t.Run("rejects empty org or project", func(t *testing.T) { + for _, raw := range []string{"/BForePC", "HTAN_INT/", "HTAN_INT//BForePC"} { + _, _, err := parseScopeArg(raw) + if err == nil { + t.Fatalf("expected invalid scope error for %q", raw) + } + } + }) +} + +func TestResolveBucketScopeFromServer(t *testing.T) { + t.Run("matches project resource", func(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if r.URL.Path != "/data/buckets" { + t.Fatalf("unexpected path: %s", r.URL.Path) + } + if got := r.Header.Get("Authorization"); got != "Bearer test-token" { + t.Fatalf("unexpected auth header: %q", got) + } + _, _ = w.Write([]byte(`{"S3_BUCKETS":{"cbds":{"programs":["/organization/HTAN_INT/project/BForePC"]}}}`)) + })) + defer srv.Close() + + scope, err := resolveBucketScopeFromServer(context.Background(), srv.URL, "test-token", "HTAN_INT", "BForePC") + if err != nil { + t.Fatalf("resolveBucketScopeFromServer returned error: %v", err) + } + if scope.Bucket != "cbds" { + t.Fatalf("unexpected bucket: %+v", scope) 
+ } + }) + + t.Run("falls back to org resource", func(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + _, _ = w.Write([]byte(`{"S3_BUCKETS":{"cbds":{"programs":["/organization/HTAN_INT"]}}}`)) + })) + defer srv.Close() + + scope, err := resolveBucketScopeFromServer(context.Background(), srv.URL, "test-token", "HTAN_INT", "BForePC") + if err != nil { + t.Fatalf("resolveBucketScopeFromServer returned error: %v", err) + } + if scope.Bucket != "cbds" { + t.Fatalf("unexpected bucket: %+v", scope) + } + }) + + t.Run("no match", func(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + resp := bucketapi.BucketsResponse{S3BUCKETS: map[string]bucketapi.BucketMetadata{ + "cbds": {}, + }} + w.Header().Set("Content-Type", "application/json") + if err := json.NewEncoder(w).Encode(resp); err != nil { + t.Fatalf("encode response: %v", err) + } + })) + defer srv.Close() + + _, err := resolveBucketScopeFromServer(context.Background(), srv.URL, "test-token", "HTAN_INT", "BForePC") + if err == nil { + t.Fatal("expected error when no matching bucket is visible") + } + }) } diff --git a/cmd/remote/add/gen3.go b/cmd/remote/add/gen3.go index 33151ebc..2028bfa3 100644 --- a/cmd/remote/add/gen3.go +++ b/cmd/remote/add/gen3.go @@ -2,42 +2,46 @@ package add import ( "context" + "encoding/json" "fmt" "log/slog" + "net/http" "strings" "github.com/calypr/data-client/credentials" + "github.com/calypr/git-drs/cmd/initialize" "github.com/calypr/git-drs/internal/common" "github.com/calypr/git-drs/internal/config" "github.com/calypr/git-drs/internal/drslog" "github.com/calypr/git-drs/internal/gitrepo" + bucketapi "github.com/calypr/syfon/apigen/client/bucketapi" conf "github.com/calypr/syfon/client/config" + syfoncommon "github.com/calypr/syfon/common" "github.com/spf13/cobra" ) var Gen3Cmd = &cobra.Command{ - Use: "gen3 [remote-name]", + Use: "gen3 [remote-name] ", Args: func(cmd 
*cobra.Command, args []string) error { - if len(args) > 1 { + if len(args) < 1 || len(args) > 2 { cmd.SilenceUsage = false - return fmt.Errorf("error: accepts at most 1 argument (remote name), received %d\n\nUsage: %s\n\nSee 'git drs remote add gen3 --help' for more details", len(args), cmd.UseLine()) + return fmt.Errorf("error: expected [remote-name] , received %d arguments\n\nUsage: %s\n\nSee 'git drs remote add gen3 --help' for more details", len(args), cmd.UseLine()) } return nil }, RunE: func(cmd *cobra.Command, args []string) error { logg := drslog.GetLogger() - // make sure at least one of the credentials params is provided - if credFile == "" && fenceToken == "" && len(args) == 0 { - return fmt.Errorf("error: Gen3 requires a credentials file or accessToken to setup project locally. Please provide either a --cred or --token flag. See 'git drs remote add gen3 --help' for more details") - } - remoteName := config.ORIGIN - if len(args) > 0 { + scopeArg := "" + if len(args) == 1 { + scopeArg = args[0] + } else { remoteName = args[0] + scopeArg = args[1] } - err := gen3Init(remoteName, credFile, fenceToken, project, organization, bucket, logg) + err := gen3Init(remoteName, credFile, fenceToken, scopeArg, logg) if err != nil { return fmt.Errorf("error configuring gen3 server: %v", err) } @@ -45,31 +49,16 @@ var Gen3Cmd = &cobra.Command{ }, } -func gen3Init(remoteName, credFile, fenceToken, project, organization, bucket string, logg *slog.Logger) error { +func gen3Init(remoteName, credFile, fenceToken, scopeArg string, logg *slog.Logger) error { if remoteName == "" { return fmt.Errorf("remote name is required") } - if project == "" { - return fmt.Errorf("project is required for Gen3 remote") + if err := initialize.EnsureInitialized(logg); err != nil { + return fmt.Errorf("failed to initialize repository: %w", err) } - - resolvedBucket := strings.TrimSpace(bucket) - resolvedStoragePrefix := "" - if strings.TrimSpace(organization) != "" { - scope, err := 
gitrepo.ResolveBucketScope(organization, project, resolvedBucket, "") - if err != nil { - return fmt.Errorf("failed resolving bucket mapping for organization=%q project=%q: %w", organization, project, err) - } - resolvedBucket = strings.TrimSpace(scope.Bucket) - resolvedStoragePrefix = strings.TrimSpace(scope.Prefix) - } - if resolvedBucket == "" { - if strings.TrimSpace(organization) == "" { - return fmt.Errorf("bucket is required when organization is empty") - } - if strings.TrimSpace(resolvedBucket) == "" { - return fmt.Errorf("bucket is required (or configure mapping first with `git drs bucket add-project --organization %s --project %s --path :///`)", organization, project) - } + organization, project, err := parseScopeArg(scopeArg) + if err != nil { + return err } var accessToken, apiKey, keyID, apiEndpoint string @@ -113,6 +102,33 @@ func gen3Init(remoteName, credFile, fenceToken, project, organization, bucket st return fmt.Errorf("could not determine Gen3 API endpoint") } + cred := &conf.Credential{ + Profile: remoteName, + APIEndpoint: apiEndpoint, + APIKey: apiKey, + KeyID: keyID, + AccessToken: accessToken, // may be stale + UseShepherd: "false", + MinShepherdVersion: "", + } + + if err := credentials.EnsureValidCredential(context.Background(), cred, logg); err != nil { + return fmt.Errorf("failed to verify/refresh Gen3 credential: %w", config.WrapCredentialValidationError(remoteName, err)) + } + + scope, err := gitrepo.ResolveBucketScope(organization, project, "", "") + if err != nil { + scope, err = resolveBucketScopeFromServer(context.Background(), apiEndpoint, strings.TrimSpace(cred.AccessToken), organization, project) + if err != nil { + return fmt.Errorf("failed resolving bucket mapping for organization=%q project=%q: %w", organization, project, err) + } + } + resolvedBucket := strings.TrimSpace(scope.Bucket) + resolvedStoragePrefix := strings.TrimSpace(scope.Prefix) + if resolvedBucket == "" { + return fmt.Errorf("no bucket mapping found for 
organization=%q project=%q", organization, project) + } + remoteGen3 := config.RemoteSelect{ Gen3: &config.Gen3Remote{ Endpoint: apiEndpoint, @@ -129,21 +145,6 @@ func gen3Init(remoteName, credFile, fenceToken, project, organization, bucket st } logg.Debug(fmt.Sprintf("Remote added/updated: %s → %s (project: %s, bucket: %s, storage_prefix: %s)", remoteName, apiEndpoint, project, resolvedBucket, resolvedStoragePrefix)) - // Step 3: Ensure credential profile is up-to-date (refreshes token if needed) - cred := &conf.Credential{ - Profile: remoteName, - APIEndpoint: apiEndpoint, - APIKey: apiKey, - KeyID: keyID, - AccessToken: accessToken, // may be stale - UseShepherd: "false", // or preserve from existing? - MinShepherdVersion: "", - } - - if err := credentials.EnsureValidCredential(context.Background(), cred, logg); err != nil { - return fmt.Errorf("failed to verify/refresh Gen3 credential: %w", err) - } - if err := configure.Save(cred); err != nil { return fmt.Errorf("failed to configure/update Gen3 profile: %w", err) } @@ -163,3 +164,95 @@ func gen3Init(remoteName, credFile, fenceToken, project, organization, bucket st logg.Debug(fmt.Sprintf("Gen3 profile '%s' configured and token refreshed successfully", remoteName)) return nil } + +func parseScopeArg(raw string) (string, string, error) { + raw = strings.TrimSpace(raw) + if raw == "" { + return "", "", fmt.Errorf("organization/project scope is required") + } + + parts := strings.Split(raw, "/") + if len(parts) != 2 { + return "", "", fmt.Errorf("invalid scope %q: expected organization/project", raw) + } + organization := strings.TrimSpace(parts[0]) + project := strings.TrimSpace(parts[1]) + if organization == "" || project == "" { + return "", "", fmt.Errorf("invalid scope %q: expected organization/project", raw) + } + return organization, project, nil +} + +func resolveBucketScopeFromServer(ctx context.Context, endpoint, token, organization, project string) (gitrepo.ResolvedBucketScope, error) { + if 
strings.TrimSpace(endpoint) == "" { + return gitrepo.ResolvedBucketScope{}, fmt.Errorf("missing API endpoint for server bucket lookup") + } + if strings.TrimSpace(token) == "" { + return gitrepo.ResolvedBucketScope{}, fmt.Errorf("missing access token for server bucket lookup") + } + + req, err := http.NewRequestWithContext(ctx, http.MethodGet, strings.TrimRight(endpoint, "/")+"/data/buckets", nil) + if err != nil { + return gitrepo.ResolvedBucketScope{}, fmt.Errorf("build bucket list request: %w", err) + } + req.Header.Set("Authorization", "Bearer "+strings.TrimSpace(token)) + + resp, err := http.DefaultClient.Do(req) + if err != nil { + return gitrepo.ResolvedBucketScope{}, fmt.Errorf("request bucket list: %w", err) + } + defer resp.Body.Close() + if resp.StatusCode != http.StatusOK { + return gitrepo.ResolvedBucketScope{}, fmt.Errorf("bucket list failed with status %d", resp.StatusCode) + } + + var payload bucketapi.BucketsResponse + if err := json.NewDecoder(resp.Body).Decode(&payload); err != nil { + return gitrepo.ResolvedBucketScope{}, fmt.Errorf("decode bucket list response: %w", err) + } + + projectResource, err := syfoncommon.ResourcePath(organization, project) + if err != nil { + return gitrepo.ResolvedBucketScope{}, err + } + orgResource, err := syfoncommon.ResourcePath(organization, "") + if err != nil { + return gitrepo.ResolvedBucketScope{}, err + } + + if bucket, ok := findBucketByResource(payload, projectResource); ok { + return gitrepo.ResolvedBucketScope{Bucket: bucket}, nil + } + if bucket, ok := findBucketByResource(payload, orgResource); ok { + return gitrepo.ResolvedBucketScope{Bucket: bucket}, nil + } + + return gitrepo.ResolvedBucketScope{}, fmt.Errorf("no visible server bucket matched organization=%q project=%q", organization, project) +} + +func findBucketByResource(payload bucketapi.BucketsResponse, resource string) (string, bool) { + resource = syfoncommon.NormalizeAccessResource(resource) + if resource == "" { + return "", false + } + 
var match string + for bucket, meta := range payload.S3BUCKETS { + if meta.Programs == nil { + continue + } + for _, candidate := range *meta.Programs { + if syfoncommon.NormalizeAccessResource(candidate) != resource { + continue + } + if match != "" && match != bucket { + return "", false + } + match = bucket + break + } + } + if match == "" { + return "", false + } + return match, true +} diff --git a/cmd/remote/add/init.go b/cmd/remote/add/init.go index 55f848f2..e156d376 100644 --- a/cmd/remote/add/init.go +++ b/cmd/remote/add/init.go @@ -3,14 +3,10 @@ package add import "github.com/spf13/cobra" var ( - apiEndpoint string - bucket string credFile string fenceToken string localPassword string localUsername string - project string - organization string ) // Cmd line declaration @@ -20,17 +16,10 @@ var Cmd = &cobra.Command{ } func init() { - Gen3Cmd.Flags().StringVar(&apiEndpoint, "url", "", "[gen3] Specify the API endpoint of the data commons") - Gen3Cmd.Flags().StringVar(&bucket, "bucket", "", "[gen3] Specify the bucket name") - Gen3Cmd.Flags().StringVar(&credFile, "cred", "", "[gen3] Specify the gen3 credential file that you want to use") - Gen3Cmd.Flags().StringVar(&fenceToken, "token", "", "[gen3] Specify the token to be used as a replacement for a credential file for temporary access") - Gen3Cmd.Flags().StringVar(&project, "project", "", "[gen3] Specify the gen3 project ID in the format -") - Gen3Cmd.Flags().StringVar(&organization, "organization", "", "[gen3] Optional organization/program scope (use with --project as project id)") + Gen3Cmd.Flags().StringVar(&credFile, "cred", "", "[gen3] Import a Gen3 credential file into this profile") + Gen3Cmd.Flags().StringVar(&fenceToken, "token", "", "[gen3] Use a temporary bearer token issued from fence") Cmd.AddCommand(Gen3Cmd) - LocalCmd.Flags().StringVarP(&project, "project", "p", "", "Project ID") - LocalCmd.Flags().StringVar(&bucket, "bucket", "", "Bucket Name") - LocalCmd.Flags().StringVar(&organization, 
"organization", "", "Organization Name") LocalCmd.Flags().StringVar(&localUsername, "username", "", "Username for local DRS HTTP basic auth") LocalCmd.Flags().StringVar(&localPassword, "password", "", "Password for local DRS HTTP basic auth") Cmd.AddCommand(LocalCmd) diff --git a/cmd/remote/add/local.go b/cmd/remote/add/local.go index c0d61b29..8a572d9d 100644 --- a/cmd/remote/add/local.go +++ b/cmd/remote/add/local.go @@ -1,33 +1,61 @@ package add import ( + "context" + "encoding/json" "fmt" + "net/http" "strings" + "github.com/calypr/git-drs/cmd/initialize" "github.com/calypr/git-drs/internal/config" + "github.com/calypr/git-drs/internal/drslog" "github.com/calypr/git-drs/internal/gitrepo" + bucketapi "github.com/calypr/syfon/apigen/client/bucketapi" + syfoncommon "github.com/calypr/syfon/common" "github.com/spf13/cobra" ) var LocalCmd = &cobra.Command{ - Use: "local ", + Use: "local ", Short: "Add a local DRS server", - Long: "Add a local DRS server by specifying its base URL, e.g., http://localhost:8000. Optional --username/--password configures basic auth for git-lfs and helper flows.", - Args: cobra.ExactArgs(2), + Long: "Add a local DRS server by specifying its base URL and scope. 
Optional --username/--password configures basic auth for helper flows.", + Args: cobra.ExactArgs(3), RunE: func(cmd *cobra.Command, args []string) error { remoteName := args[0] url := args[1] + scopeArg := args[2] + if err := initialize.EnsureInitialized(drslog.GetLogger()); err != nil { + return fmt.Errorf("failed to initialize repository: %w", err) + } if url == "" { return fmt.Errorf("URL cannot be empty") } + organization, project, err := parseScopeArg(scopeArg) + if err != nil { + return err + } + scope, err := gitrepo.ResolveBucketScope(organization, project, "", "") + if err != nil { + scope, err = resolveBucketScopeFromLocalServer(context.Background(), url, strings.TrimSpace(localUsername), strings.TrimSpace(localPassword), organization, project) + if err != nil { + return fmt.Errorf("failed resolving bucket mapping for organization=%q project=%q: %w", organization, project, err) + } + } + resolvedBucket := strings.TrimSpace(scope.Bucket) + resolvedStoragePrefix := strings.TrimSpace(scope.Prefix) + if resolvedBucket == "" { + return fmt.Errorf("no bucket mapping found for organization=%q project=%q", organization, project) + } remoteSelect := config.RemoteSelect{ Local: &config.LocalRemote{ - BaseURL: url, - ProjectID: project, - Bucket: bucket, - Organization: organization, + BaseURL: url, + ProjectID: project, + Bucket: resolvedBucket, + Organization: organization, + StoragePrefix: resolvedStoragePrefix, }, } @@ -54,3 +82,49 @@ var LocalCmd = &cobra.Command{ return nil }, } + +func resolveBucketScopeFromLocalServer(ctx context.Context, endpoint, username, password, organization, project string) (gitrepo.ResolvedBucketScope, error) { + if strings.TrimSpace(endpoint) == "" { + return gitrepo.ResolvedBucketScope{}, fmt.Errorf("missing API endpoint for server bucket lookup") + } + + req, err := http.NewRequestWithContext(ctx, http.MethodGet, strings.TrimRight(endpoint, "/")+"/data/buckets", nil) + if err != nil { + return gitrepo.ResolvedBucketScope{}, 
fmt.Errorf("build bucket list request: %w", err) + } + if username != "" || password != "" { + req.SetBasicAuth(username, password) + } + + resp, err := http.DefaultClient.Do(req) + if err != nil { + return gitrepo.ResolvedBucketScope{}, fmt.Errorf("request bucket list: %w", err) + } + defer resp.Body.Close() + if resp.StatusCode != http.StatusOK { + return gitrepo.ResolvedBucketScope{}, fmt.Errorf("bucket list failed with status %d", resp.StatusCode) + } + + var payload bucketapi.BucketsResponse + if err := json.NewDecoder(resp.Body).Decode(&payload); err != nil { + return gitrepo.ResolvedBucketScope{}, fmt.Errorf("decode bucket list response: %w", err) + } + + projectResource, err := syfoncommon.ResourcePath(organization, project) + if err != nil { + return gitrepo.ResolvedBucketScope{}, err + } + orgResource, err := syfoncommon.ResourcePath(organization, "") + if err != nil { + return gitrepo.ResolvedBucketScope{}, err + } + + if bucket, ok := findBucketByResource(payload, projectResource); ok { + return gitrepo.ResolvedBucketScope{Bucket: bucket}, nil + } + if bucket, ok := findBucketByResource(payload, orgResource); ok { + return gitrepo.ResolvedBucketScope{Bucket: bucket}, nil + } + + return gitrepo.ResolvedBucketScope{}, fmt.Errorf("no visible server bucket matched organization=%q project=%q", organization, project) +} diff --git a/cmd/remote/add/local_test.go b/cmd/remote/add/local_test.go index 80e908a6..67ba39e1 100644 --- a/cmd/remote/add/local_test.go +++ b/cmd/remote/add/local_test.go @@ -1,14 +1,87 @@ package add import ( + "context" + "net/http" + "net/http/httptest" + "os" + "path/filepath" "testing" + "github.com/calypr/git-drs/internal/common" + "github.com/calypr/git-drs/internal/gitrepo" + "github.com/calypr/git-drs/internal/testutils" "github.com/stretchr/testify/assert" ) func TestAddLocalRemote(t *testing.T) { assert.NotNil(t, LocalCmd) - assert.Equal(t, "local ", LocalCmd.Use) + assert.Equal(t, "local ", LocalCmd.Use) assert.NotNil(t, 
LocalCmd.Flag("username")) assert.NotNil(t, LocalCmd.Flag("password")) + assert.Nil(t, LocalCmd.Flag("organization")) + assert.Nil(t, LocalCmd.Flag("project")) + assert.Nil(t, LocalCmd.Flag("bucket")) +} + +func TestResolveBucketScopeFromLocalServer(t *testing.T) { + t.Run("matches project resource", func(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if r.URL.Path != "/data/buckets" { + t.Fatalf("unexpected path: %s", r.URL.Path) + } + user, pass, ok := r.BasicAuth() + if !ok || user != "drs-user" || pass != "drs-pass" { + t.Fatalf("unexpected basic auth: ok=%v user=%q pass=%q", ok, user, pass) + } + _, _ = w.Write([]byte(`{"S3_BUCKETS":{"cbds":{"programs":["/organization/calypr/project/end_to_end_test"]}}}`)) + })) + defer srv.Close() + + scope, err := resolveBucketScopeFromLocalServer(context.Background(), srv.URL, "drs-user", "drs-pass", "calypr", "end_to_end_test") + if err != nil { + t.Fatalf("resolveBucketScopeFromLocalServer returned error: %v", err) + } + if scope.Bucket != "cbds" { + t.Fatalf("unexpected bucket: %+v", scope) + } + }) +} + +func TestLocalRemoteAddEnsuresInitialization(t *testing.T) { + testutils.SetupTestGitRepo(t) + localUsername = "" + localPassword = "" + + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if r.URL.Path != "/data/buckets" { + t.Fatalf("unexpected path: %s", r.URL.Path) + } + _, _ = w.Write([]byte(`{"S3_BUCKETS":{"cbds":{"programs":["/organization/calypr/project/end_to_end_test"]}}}`)) + })) + defer srv.Close() + + if err := LocalCmd.RunE(LocalCmd, []string{"origin", srv.URL, "calypr/end_to_end_test"}); err != nil { + t.Fatalf("LocalCmd.RunE returned error: %v", err) + } + + if _, err := os.Stat(common.DRS_DIR); err != nil { + t.Fatalf("expected %s to exist: %v", common.DRS_DIR, err) + } + + filterProcess, err := gitrepo.GetGitConfigString("filter.drs.process") + if err != nil { + 
t.Fatalf("GetGitConfigString(filter.drs.process): %v", err) + } + if filterProcess != "git-drs filter" { + t.Fatalf("unexpected filter.drs.process: %q", filterProcess) + } + + preCommit, err := os.ReadFile(filepath.Join(".git", "hooks", "pre-commit")) + if err != nil { + t.Fatalf("read pre-commit hook: %v", err) + } + if string(preCommit) == "" { + t.Fatalf("expected pre-commit hook to be installed") + } } diff --git a/cmd/remote/list.go b/cmd/remote/list.go index 8723bc97..86a231c5 100644 --- a/cmd/remote/list.go +++ b/cmd/remote/list.go @@ -3,11 +3,21 @@ package remote import ( "fmt" + "github.com/calypr/data-client/credentials" "github.com/calypr/git-drs/internal/config" "github.com/calypr/git-drs/internal/drslog" + syconf "github.com/calypr/syfon/client/config" "github.com/spf13/cobra" ) +var ( + loadConfig = config.LoadConfig + loadProfileCredential = func(profile string) (*syconf.Credential, error) { + return syconf.NewConfigure(drslog.GetLogger()).Load(profile) + } + ensureValidCredential = credentials.EnsureValidCredential +) + var ListCmd = &cobra.Command{ Use: "list", Short: "List DRS repos", @@ -20,7 +30,7 @@ var ListCmd = &cobra.Command{ }, RunE: func(cmd *cobra.Command, args []string) error { logg := drslog.GetLogger() - cfg, err := config.LoadConfig() + cfg, err := loadConfig() if err != nil { logg.Debug(fmt.Sprintf("Error loading config: %s", err)) return err @@ -53,6 +63,16 @@ var ListCmd = &cobra.Command{ } fmt.Printf("%s %-10s %-8s %s\n", marker, name, remoteType, endpoint) + if remoteSelect.Gen3 != nil { + cred, err := loadProfileCredential(string(name)) + if err != nil { + logg.Warn(fmt.Sprintf("remote %s credential check skipped: %v", name, err)) + continue + } + if err := ensureValidCredential(cmd.Context(), cred, logg); err != nil { + logg.Warn(config.WrapCredentialValidationError(string(name), err).Error()) + } + } } return nil }, diff --git a/cmd/remote/remote_test.go b/cmd/remote/remote_test.go index b03b3121..c1d33aab 100644 --- 
a/cmd/remote/remote_test.go +++ b/cmd/remote/remote_test.go @@ -1,9 +1,14 @@ package remote import ( + "context" + "log/slog" + "os/exec" "testing" + "github.com/calypr/git-drs/internal/config" "github.com/calypr/git-drs/internal/testutils" + syconf "github.com/calypr/syfon/client/config" "github.com/stretchr/testify/assert" ) @@ -21,6 +26,22 @@ func TestRemoteListRun(t *testing.T) { tmpDir := testutils.SetupTestGitRepo(t) testutils.CreateDefaultTestConfig(t, tmpDir) + oldLoadProfileCredential := loadProfileCredential + oldEnsureValidCredential := ensureValidCredential + t.Cleanup(func() { + loadProfileCredential = oldLoadProfileCredential + ensureValidCredential = oldEnsureValidCredential + }) + + loadProfileCredential = func(profile string) (*syconf.Credential, error) { + return &syconf.Credential{Profile: profile, AccessToken: "token", APIEndpoint: "https://example.test"}, nil + } + called := false + ensureValidCredential = func(ctx context.Context, cred *syconf.Credential, _ *slog.Logger) error { + called = true + return nil + } + // Capture stdout output := testutils.CaptureStdout(t, func() { err := ListCmd.RunE(ListCmd, []string{}) @@ -29,6 +50,7 @@ func TestRemoteListRun(t *testing.T) { assert.Contains(t, output, "origin") assert.Contains(t, output, "gen3") + assert.True(t, called, "expected remote list to validate the configured credential") } func TestRemoteSetArgs(t *testing.T) { @@ -44,3 +66,89 @@ func TestRemoteSetArgs(t *testing.T) { err = SetCmd.Args(SetCmd, []string{"origin", "extra"}) assert.Error(t, err) } + +func TestRemoteRemoveArgs(t *testing.T) { + err := RemoveCmd.Args(RemoveCmd, []string{"origin"}) + assert.NoError(t, err) + + err = RemoveCmd.Args(RemoveCmd, []string{}) + assert.Error(t, err) + + err = RemoveCmd.Args(RemoveCmd, []string{"origin", "extra"}) + assert.Error(t, err) +} + +func TestRemoteRemoveRunReassignsDefaultAndCleansKeys(t *testing.T) { + tmpDir := testutils.SetupTestGitRepo(t) + testutils.CreateTestConfig(t, tmpDir, 
&config.Config{ + DefaultRemote: "origin", + Remotes: map[config.Remote]config.RemoteSelect{ + "origin": { + Gen3: &config.Gen3Remote{ + Endpoint: "https://origin.example", + ProjectID: "origin-proj", + Bucket: "origin-bucket", + }, + }, + "backup": { + Gen3: &config.Gen3Remote{ + Endpoint: "https://backup.example", + ProjectID: "backup-proj", + Bucket: "backup-bucket", + }, + }, + }, + }) + + for _, args := range [][]string{ + {"config", "drs.remote.origin.token", "token"}, + {"config", "drs.remote.origin.username", "alice"}, + {"config", "drs.remote.origin.password", "secret"}, + {"config", "remote.origin.lfsurl", "https://origin.example/info/lfs"}, + } { + cmd := exec.Command("git", args...) + cmd.Dir = tmpDir + err := cmd.Run() + assert.NoError(t, err) + } + + err := RemoveCmd.RunE(RemoveCmd, []string{"origin"}) + assert.NoError(t, err) + + cfg, err := config.LoadConfig() + assert.NoError(t, err) + assert.NotContains(t, cfg.Remotes, config.Remote("origin")) + assert.Equal(t, config.Remote("backup"), cfg.DefaultRemote) + + for _, key := range []string{ + "drs.remote.origin.type", + "drs.remote.origin.endpoint", + "drs.remote.origin.project", + "drs.remote.origin.bucket", + "drs.remote.origin.token", + "drs.remote.origin.username", + "drs.remote.origin.password", + "remote.origin.lfsurl", + } { + val, err := exec.Command("git", "config", "--get", key).CombinedOutput() + assert.Empty(t, string(val)) + assert.Error(t, err) + } +} + +func TestRemoteRemoveRunClearsDefaultWhenLastRemoteRemoved(t *testing.T) { + tmpDir := testutils.SetupTestGitRepo(t) + testutils.CreateDefaultTestConfig(t, tmpDir) + + err := RemoveCmd.RunE(RemoveCmd, []string{"origin"}) + assert.NoError(t, err) + + cfg, err := config.LoadConfig() + assert.NoError(t, err) + assert.Empty(t, cfg.Remotes) + assert.Equal(t, config.Remote(""), cfg.DefaultRemote) + + val, err := exec.Command("git", "config", "--get", "drs.default-remote").CombinedOutput() + assert.Empty(t, string(val)) + assert.Error(t, err) 
+} diff --git a/cmd/remote/remove.go b/cmd/remote/remove.go new file mode 100644 index 00000000..a5f5dbdc --- /dev/null +++ b/cmd/remote/remove.go @@ -0,0 +1,59 @@ +package remote + +import ( + "fmt" + "sort" + + "github.com/calypr/git-drs/internal/config" + "github.com/calypr/git-drs/internal/drslog" + "github.com/spf13/cobra" +) + +var RemoveCmd = &cobra.Command{ + Use: "remove ", + Aliases: []string{"rm"}, + Short: "Remove a DRS remote", + Long: "Remove a configured DRS remote and repair the default remote if needed.", + Args: func(cmd *cobra.Command, args []string) error { + if len(args) != 1 { + cmd.SilenceUsage = false + return fmt.Errorf("error: requires exactly 1 argument (remote name), received %d\n\nUsage: %s\n\nRun 'git drs remote list' to see available remotes or 'git drs remote remove --help' for more details", len(args), cmd.UseLine()) + } + return nil + }, + RunE: func(cmd *cobra.Command, args []string) error { + remoteName := config.Remote(args[0]) + logger := drslog.GetLogger() + + cfg, err := config.LoadConfig() + if err != nil { + return fmt.Errorf("failed to load config: %w", err) + } + + if _, ok := cfg.Remotes[remoteName]; !ok { + availableRemotes := make([]string, 0, len(cfg.Remotes)) + for name := range cfg.Remotes { + availableRemotes = append(availableRemotes, string(name)) + } + sort.Strings(availableRemotes) + return fmt.Errorf( + "remote '%s' not found.\nAvailable remotes: %v", + remoteName, + availableRemotes, + ) + } + + updated, err := config.RemoveRemote(remoteName) + if err != nil { + return fmt.Errorf("failed to remove remote: %w", err) + } + + if updated.DefaultRemote == "" { + logger.Debug(fmt.Sprintf("Removed remote %s; no default remote remains", remoteName)) + return nil + } + + logger.Debug(fmt.Sprintf("Removed remote %s; default remote is now %s", remoteName, updated.DefaultRemote)) + return nil + }, +} diff --git a/cmd/remote/root.go b/cmd/remote/root.go index 7d865720..45a1963d 100644 --- a/cmd/remote/root.go +++ 
b/cmd/remote/root.go
@@ -14,5 +14,6 @@ var Cmd = &cobra.Command{
 func init() {
 	Cmd.AddCommand(add.Cmd)
 	Cmd.AddCommand(ListCmd)
+	Cmd.AddCommand(RemoveCmd)
 	Cmd.AddCommand(SetCmd)
 }
diff --git a/cmd/rm/main.go b/cmd/rm/main.go
new file mode 100644
index 00000000..a58124d1
--- /dev/null
+++ b/cmd/rm/main.go
@@ -0,0 +1,65 @@
+package rm
+
+import (
+	"context"
+	"fmt"
+	"os/exec"
+	"path/filepath"
+	"strings"
+
+	"github.com/calypr/git-drs/internal/drslog"
+	"github.com/calypr/git-drs/internal/lfs"
+	"github.com/spf13/cobra"
+)
+
+// runCommand executes an external command. It is kept as a package-level
+// variable (presumably so tests can stub it). The command's combined output is
+// captured and, on failure, appended to the returned error so callers see the
+// tool's own diagnostic instead of a bare "exit status 1".
+var runCommand = func(name string, args ...string) error {
+	out, err := exec.Command(name, args...).CombinedOutput()
+	if err != nil {
+		if msg := strings.TrimSpace(string(out)); msg != "" {
+			return fmt.Errorf("%s %s: %w: %s", name, strings.Join(args, " "), err, msg)
+		}
+		return fmt.Errorf("%s %s: %w", name, strings.Join(args, " "), err)
+	}
+	return nil
+}
+
+// Cmd is the `git drs rm` command. It requires at least one path argument and
+// delegates to run.
+var Cmd = &cobra.Command{
+	Use:   "rm ...",
+	Short: "Remove tracked git-drs files",
+	Args:  cobra.MinimumNArgs(1),
+	RunE: func(cmd *cobra.Command, args []string) error {
+		return run(cmd.Context(), args)
+	},
+}
+
+// run checks that every argument names a tracked git-drs/LFS file and then
+// removes all of them with a single `git rm -- <paths>` invocation. Every path
+// is validated before git is invoked, so one bad argument returns an error
+// without running git at all. ctx is accepted for signature stability and is
+// currently unused.
+func run(ctx context.Context, args []string) error {
+	tracked, err := lfs.GetTrackedLfsFiles(drslog.GetLogger())
+	if err != nil {
+		return err
+	}
+
+	gitArgs := make([]string, 0, len(args)+2)
+	gitArgs = append(gitArgs, "rm", "--")
+	for _, raw := range args {
+		// The tracked-file map is queried with the cleaned, slash-separated path.
+		path := filepath.ToSlash(filepath.Clean(raw))
+		info, ok := tracked[path]
+		if !ok || strings.TrimSpace(info.Oid) == "" {
+			return fmt.Errorf("%s is not a tracked git-drs/LFS file", raw)
+		}
+		gitArgs = append(gitArgs, path)
+	}
+
+	return runCommand("git", gitArgs...)
+}
diff --git a/cmd/rm/main_test.go b/cmd/rm/main_test.go
new file mode 100644
index 00000000..16c51f28
--- /dev/null
+++ b/cmd/rm/main_test.go
@@ -0,0 +1,54 @@
+package rm
+
+import (
+	"context"
+	"os"
+	"os/exec"
+	"path/filepath"
+	"testing"
+)
+
+func TestRunRemovesTrackedFile(t *testing.T) {
+	repo := t.TempDir()
+
runGitCmd(t, repo, "init") + runGitCmd(t, repo, "config", "user.email", "test@example.com") + runGitCmd(t, repo, "config", "user.name", "Test User") + runGitCmd(t, repo, "config", "filter.drs.clean", "cat") + runGitCmd(t, repo, "config", "filter.drs.smudge", "cat") + runGitCmd(t, repo, "config", "filter.drs.process", "cat") + runGitCmd(t, repo, "config", "filter.drs.required", "false") + + if err := os.WriteFile(filepath.Join(repo, ".gitattributes"), []byte("*.dat filter=drs diff=drs merge=drs -text\n"), 0o644); err != nil { + t.Fatalf("write .gitattributes: %v", err) + } + path := filepath.Join(repo, "data.dat") + if err := os.WriteFile(path, []byte("version https://git-lfs.github.com/spec/v1\noid sha256:aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\nsize 12\n"), 0o644); err != nil { + t.Fatalf("write pointer file: %v", err) + } + runGitCmd(t, repo, "add", ".") + runGitCmd(t, repo, "commit", "-m", "add pointer") + + oldWD, _ := os.Getwd() + if err := os.Chdir(repo); err != nil { + t.Fatalf("chdir repo: %v", err) + } + t.Cleanup(func() { _ = os.Chdir(oldWD) }) + + if err := run(context.Background(), []string{"data.dat"}); err != nil { + t.Fatalf("run returned error: %v", err) + } + + if _, err := os.Stat(path); !os.IsNotExist(err) { + t.Fatalf("expected file removed from worktree, stat err=%v", err) + } +} + +func runGitCmd(t *testing.T, dir string, args ...string) { + t.Helper() + cmd := exec.Command("git", args...) 
+ cmd.Dir = dir + out, err := cmd.CombinedOutput() + if err != nil { + t.Fatalf("git %v failed: %v\n%s", args, err, string(out)) + } +} diff --git a/cmd/root.go b/cmd/root.go index ddfc95ac..4365f5c3 100644 --- a/cmd/root.go +++ b/cmd/root.go @@ -5,28 +5,24 @@ import ( "github.com/calypr/git-drs/cmd/addurl" "github.com/calypr/git-drs/cmd/bucket" "github.com/calypr/git-drs/cmd/clean" + "github.com/calypr/git-drs/cmd/copyrecords" deleteCmd "github.com/calypr/git-drs/cmd/delete" "github.com/calypr/git-drs/cmd/deleteproject" - - "github.com/calypr/git-drs/cmd/download" - "github.com/calypr/git-drs/cmd/fetch" "github.com/calypr/git-drs/cmd/filter" "github.com/calypr/git-drs/cmd/initialize" "github.com/calypr/git-drs/cmd/install" - - "github.com/calypr/git-drs/cmd/list" "github.com/calypr/git-drs/cmd/lsfiles" + "github.com/calypr/git-drs/cmd/ping" "github.com/calypr/git-drs/cmd/precommit" "github.com/calypr/git-drs/cmd/prepush" "github.com/calypr/git-drs/cmd/pull" "github.com/calypr/git-drs/cmd/push" "github.com/calypr/git-drs/cmd/query" "github.com/calypr/git-drs/cmd/remote" + "github.com/calypr/git-drs/cmd/rm" "github.com/calypr/git-drs/cmd/smudge" "github.com/calypr/git-drs/cmd/track" "github.com/calypr/git-drs/cmd/untrack" - - "github.com/calypr/git-drs/cmd/upload" "github.com/calypr/git-drs/cmd/version" "github.com/spf13/cobra" ) @@ -46,11 +42,13 @@ func init() { RootCmd.AddCommand(initialize.Cmd) RootCmd.AddCommand(version.Cmd) + RootCmd.AddCommand(ping.Cmd) RootCmd.AddCommand(filter.Cmd) RootCmd.AddCommand(clean.Cmd) + RootCmd.AddCommand(copyrecords.Cmd) RootCmd.AddCommand(smudge.Cmd) RootCmd.AddCommand(remote.Cmd) - RootCmd.AddCommand(fetch.Cmd) + RootCmd.AddCommand(rm.Cmd) RootCmd.AddCommand(pull.Cmd) RootCmd.AddCommand(push.Cmd) RootCmd.AddCommand(precommit.Cmd) @@ -63,10 +61,7 @@ func init() { RootCmd.AddCommand(bucket.Cmd) RootCmd.AddCommand(track.Cmd) RootCmd.AddCommand(untrack.Cmd) - RootCmd.AddCommand(list.Cmd) RootCmd.AddCommand(lsfiles.Cmd) - 
RootCmd.AddCommand(upload.Cmd) - RootCmd.AddCommand(download.Cmd) RootCmd.AddCommand(install.Cmd) RootCmd.CompletionOptions.HiddenDefaultCmd = true diff --git a/cmd/upload/main.go b/cmd/upload/main.go deleted file mode 100644 index 992b8f52..00000000 --- a/cmd/upload/main.go +++ /dev/null @@ -1,99 +0,0 @@ -package upload - -import ( - "fmt" - "os" - "path/filepath" - "strings" - - "github.com/calypr/git-drs/internal/common" - "github.com/calypr/git-drs/internal/config" - "github.com/calypr/git-drs/internal/drslog" - "github.com/calypr/git-drs/internal/drsobject" - "github.com/calypr/git-drs/internal/drsremote" - syupload "github.com/calypr/syfon/client/transfer/upload" - "github.com/spf13/cobra" -) - -var remote string - -// Cmd line declaration -var Cmd = &cobra.Command{ - Use: "upload ", - Short: "Upload a file to a DRS server", - Long: "Upload a file to a DRS server, without creating an LFS pointer", - Args: cobra.MinimumNArgs(1), - RunE: func(cmd *cobra.Command, args []string) error { - - logger := drslog.GetLogger() - - config, err := config.LoadConfig() - if err != nil { - return err - } - - remoteName, err := config.GetRemoteOrDefault(remote) - if err != nil { - logger.Error(fmt.Sprintf("Error getting remote: %v", err)) - return err - } - - client, err := config.GetRemoteClient(remoteName, logger) - if err != nil { - return err - } - - remoteConfig := config.GetRemote(remoteName) - organization := "" - project := "" - storagePrefix := "" - bucketName := "" - if remoteConfig != nil { - organization = remoteConfig.GetOrganization() - project = remoteConfig.GetProjectId() - storagePrefix = remoteConfig.GetStoragePrefix() - bucketName = remoteConfig.GetBucketName() - } - - for _, src := range args { - if s, err := os.Stat(src); err != nil { - logger.Error(fmt.Sprintf("Error stating file %s: %v", src, err)) - return err - } else if s.IsDir() { - logger.Error(fmt.Sprintf("Skipping directory %s", src)) - continue - } else { - sha256, err := 
common.CalculateFileSHA256(src) - if err != nil { - logger.Error(fmt.Sprintf("Error calculating SHA256 for file %s: %v", src, err)) - return err - } - - objs, err := drsremote.ObjectsByHashForScope(cmd.Context(), client, sha256) - if err != nil || len(objs) == 0 { - did := sha256 - name := filepath.Base(src) - drsObj, err := drsobject.BuildWithPrefix(name, sha256, s.Size(), did, bucketName, organization, project, storagePrefix) - if err != nil { - return fmt.Errorf("build DRS object for %s: %w", src, err) - } - registered, err := syupload.RegisterFile(cmd.Context(), client.Client.Data(), client.Client.DRS(), drsObj, src, bucketName) - if err != nil { - return fmt.Errorf("error uploading %s: %v", src, err) - } - if registered != nil { - logger.Info(fmt.Sprintf("Successfully uploaded %s to server with DRS ID %s", src, registered.Id)) - } - } else { - logger.Info(fmt.Sprintf("File %s already exists on server with DRS ID %s, skipping upload", src, strings.TrimSpace(objs[0].Id))) - } - } - } - - return nil - }, -} - -func init() { - Cmd.Flags().StringVarP(&remote, "remote", "r", "", "target remote DRS server (default: default_remote)") -} diff --git a/docs/architecture-drs-endpoints-and-transfer-concurrency.md b/docs/TODO/architecture-drs-endpoints-and-transfer-concurrency.md similarity index 98% rename from docs/architecture-drs-endpoints-and-transfer-concurrency.md rename to docs/TODO/architecture-drs-endpoints-and-transfer-concurrency.md index 7f5d258a..39e7bc6c 100644 --- a/docs/architecture-drs-endpoints-and-transfer-concurrency.md +++ b/docs/TODO/architecture-drs-endpoints-and-transfer-concurrency.md @@ -48,7 +48,7 @@ Notes: ## 1.3 Trace from standard Git commands -`git-drs` participates in both explicit `git drs ...` commands and standard Git workflows after `git drs init`: +`git-drs` participates in both explicit `git drs ...` commands and standard Git workflows after repository-local setup is installed. 
That setup can happen either through explicit `git drs init` or automatically during `git drs remote add ...`: - `git drs init` installs hooks (`cmd/initialize/main.go`): - pre-commit: `git drs precommit` diff --git a/docs/TODO/git-drs-rm-semantics.md b/docs/TODO/git-drs-rm-semantics.md new file mode 100644 index 00000000..6b7beab0 --- /dev/null +++ b/docs/TODO/git-drs-rm-semantics.md @@ -0,0 +1,155 @@ +# ADR 0004: `git drs rm` as scoped Git-plus-remote delete workflow + +## Status +Implemented + +## Context + +`git-drs` currently has a gap in normal file lifecycle handling: + +- users can add, register, and push tracked data +- users can remove pointer files from Git with `git rm` +- but removing the file from the repository does not provide a coherent remote deletion workflow + +There are older hidden delete-oriented commands, but they are not a good user-facing model: + +- they are backend-centric +- they are not path-oriented +- they are easy to use unsafely +- they do not fit the normal repository workflow + +At the same time, automatically deleting remote objects as soon as a tracked file disappears from Git is too aggressive: + +- one content object may be referenced by multiple DRS records +- one DRS record may exist in more than one project or instance +- deleting a DRS record is not the same as deleting the underlying bucket object +- destructive behavior during `push` must be explicit and scoped + +## Decision + +Adopt `git drs rm` as the user-facing delete workflow, with three distinct layers of behavior: + +1. remove the tracked pointer from the Git repository +2. remove matching remote DRS record state for the configured remote scope during `git drs push` +3. 
only delete underlying object bytes when explicitly requested by a stronger policy + +The canonical behavior is: + +- `git drs rm ` removes the tracked file from the worktree and index +- it does not write sidecar delete state +- `git drs push` and the managed `pre-push` hook derive deletions from pushed Git ref deltas +- default remote action is **record deletion only**, scoped to the configured organization/project +- bucket object deletion is **not** the default behavior + +## Rationale + +This model matches the way users already think: + +- `git rm` removes a file from the repository +- `git drs push` synchronizes repository state with remote DRS state + +It also avoids the worst failure mode: + +- silent deletion of shared underlying object bytes + +Separating record deletion from object-byte deletion keeps the default behavior aligned with least surprise and least destruction. + +It also avoids extra small-file local I/O under `.git/...`, which is a poor fit for HPC-style environments. 
+ +## Detailed semantics + +### Local command behavior + +`git drs rm ...` should: + +- validate that each path is tracked as a git-drs/LFS pointer +- remove the file from the index and worktree +- fail clearly for plain Git files + +### Push behavior + +When `git drs push` or the managed `pre-push` hook reconciles deletes, it should: + +- resolve the current remote and configured organization/project scope +- compute deleted paths from the pushed Git ref delta +- read the deleted pointer blob from the old tree +- parse the tracked object identity from that pointer +- resolve the tracked object identity to matching remote DRS records in that scope +- delete matching record state when the result is unambiguous +- warn and require explicit follow-up when the result is ambiguous + +Examples of ambiguity: + +- no matching record exists in the configured scope +- more than one matching record exists in the configured scope +- the object is shared with another project or instance and the server cannot prove safe purge semantics + +### Default delete policy + +Default policy for `git drs rm` + `git drs push`: + +- remove the Git pointer locally +- if the scoped record has one `controlled_access` entry, delete the whole record +- if the scoped record has multiple `controlled_access` entries, remove only the current scope resource +- do not delete bucket bytes automatically + +### Optional stronger policy + +A future explicit mode may support purging underlying bytes, for example: + +- `git drs rm --purge-object ` +- repo config enabling scoped auto-purge + +That mode must remain opt-in and should only proceed when the server can prove the delete is safe or the user explicitly forces it. 
+ +## Non-goals + +This ADR does not define: + +- cross-project garbage collection +- global deduplication ownership policy +- automatic deletion of all records sharing the same checksum across an instance +- silent purge of shared bucket content + +## Implementation direction + +### Phase 1: user-facing command + +Add `git drs rm` as a first-class command: + +- path-oriented +- repo-aware +- explicit about remote consequences + +### Phase 2: push-time sync + +Teach `git drs push` to: + +- derive deleted tracked pointers from pushed refs +- apply scoped record deletion +- print a concise summary of deleted records, skipped records, and ambiguous cases + +### Phase 3: optional purge policy + +Add a stricter opt-in mode for underlying object-byte deletion with strong safeguards. + +## Consequences + +### Positive + +- delete workflow becomes part of the normal `git-drs` lifecycle +- users no longer need to manually reconcile Git state and remote DRS state +- destructive bucket deletion is kept behind explicit policy + +### Negative + +- push flow must inspect Git history carefully +- delete semantics require server-side and client-side ambiguity handling +- `git drs push` needs an explicit compare base when it is not running under the `pre-push` hook + +## Current notes + +- `git drs rm` wraps `git rm` directly after validating tracked LFS/git-drs paths. +- Plain `git push` uses the managed `pre-push` hook, which receives authoritative old/new SHAs from Git. +- `git drs push` derives deletes from `HEAD` vs `@{upstream}` when an upstream exists. +- Ambiguous remote matches warn and remain untouched. 
diff --git a/docs/adding-s3-files.md b/docs/adding-s3-files.md index f8a83a6b..fe651263 100644 --- a/docs/adding-s3-files.md +++ b/docs/adding-s3-files.md @@ -17,7 +17,7 @@ Primary support today is S3-style URLs: - `https://bucket.s3.amazonaws.com/key` - Path-style S3-compatible HTTPS URLs -The inspector also accepts other go-cloud styles (`gs://`, `azblob://`, `file://`), but the main production path in current e2e coverage is S3/Gen3 bucket-backed workflows. +The inspector also accepts other cloud styles (`gs://`, `azblob://`), but the main production path in current e2e coverage is S3/Gen3 bucket-backed workflows. ## Two Add-URL Input Modes diff --git a/docs/commands.md b/docs/commands.md index 43ac6d5a..7e5dc492 100644 --- a/docs/commands.md +++ b/docs/commands.md @@ -50,7 +50,9 @@ Common flags: - `--multipart-threshold `: multipart threshold in MB - `--enable-data-client-logs`: enable lower-level client logging -Run this once per repository. +Use this when you want to initialize the repo explicitly, or to repair repo-local hooks/config. + +For normal onboarding, `git drs remote add ...` now auto-initializes the repository if that setup is missing. ## Remote Configuration @@ -59,14 +61,26 @@ Run this once per repository. Add or refresh a Gen3-backed Syfon remote. 
```bash -git drs remote add gen3 [remote-name] [--cred | --token ] +git drs remote add gen3 [remote-name] \ + --cred ``` -Examples: +**Options:** + +- `--cred `: Path to credentials JSON file (required unless `--token` is provided) +- `--token `: Token for temporary access (alternative to --cred) +- ``: Required scope argument, for example `HTAN_INT/BForePC` + +**Examples:** ```bash -git drs remote add gen3 production HTAN_INT/BForePC --cred /path/to/credentials.json -git drs remote add gen3 staging HTAN_INT/BForePC --token "$GEN3_TOKEN" +# Add production remote +git drs remote add gen3 production my-program/my-project \ + --cred /path/to/credentials.json + +# Add staging remote +git drs remote add gen3 staging staging-program/staging-project \ + --cred /path/to/staging-credentials.json ``` Notes: @@ -75,6 +89,7 @@ Notes: - scope is always one positional argument: `organization/project` - `--cred` imports a Gen3 credential file - `--token` uses a temporary bearer token +- if the repo has not been initialized yet, this command bootstraps the local `git-drs` hooks/config first - bucket resolution is scope-driven; users do not need to provide `--bucket` - endpoint resolution comes from the credential/token path; users do not need to provide `--url` @@ -91,12 +106,184 @@ List configured DRS remotes. git drs remote list ``` -### `git drs remote set ` +### `git drs remote remove ` + +Remove a configured DRS remote. + +```bash +git drs remote remove +git drs remote rm +``` + +Notes: + +- this removes `git-drs` remote config, not normal Git remotes +- `git remote remove ` does not manage `git-drs` remote config +- if the removed remote was the default and other `git-drs` remotes remain, one remaining remote becomes the new default +- if the removed remote was the last one, `git-drs` clears the default remote + +### `git drs add-url [path]` + +Prepare a pointer plus local DRS metadata for an object that already exists in provider storage.
+ +**Usage:** + +```bash +# Preferred: object key resolved against configured bucket scope +git drs add-url path/to/object.bin data/from-bucket.bin --scheme s3 + +# Compatibility: explicit provider URL +git drs add-url s3://my-bucket/path/to/object.bin data/from-bucket.bin +``` + +**Options:** + +- `--scheme `: Required for object-key mode because local bucket mappings persist bucket/prefix, not provider scheme +- `--sha256 `: Expected SHA256 checksum when known + +**What it does:** + +- Resolves the effective org/project bucket scope for the current remote +- Inspects the provider object through client-owned cloud code +- Writes a Git LFS pointer into the worktree +- Stores local DRS metadata for later registration during `git drs push` + +### `git drs push [remote-name]` + +Push local DRS objects to the server. Uploads new files and registers metadata. + +**Usage:** + +```bash +# Push to default remote +git drs push + +# Push to specific remote +git drs push staging +git drs push production +``` + +**What it does:** + +- Checks local `.git/drs/lfs/objects/` for DRS metadata +- For each object, uploads the file to the bucket if it exists locally +- If the file doesn't exist locally (metadata only), registers metadata without upload +- Reconciles committed tracked-file deletions against the pushed Git ref delta +- This enables cross-remote promotion workflows + +**Cross-Remote Promotion:** + +Transfer DRS records from one remote to another (e.g., staging to production) without re-uploading files: + +```bash +# Fetch metadata from staging +git drs fetch staging + +# Push metadata to production (no file upload since files don't exist locally) +git drs push production +``` + +This is useful when files are already in the production bucket with matching SHA256 hashes. It can also be used to re-upload files, provided they are pulled into the repo first. + +**Note:** `fetch` and `push` are commonly used together. `fetch` pulls metadata from one remote, `push` registers it to another. If `git drs fetch` is not available in your `git-drs` build, use `git drs copy-records` (documented below) to promote records between remotes.
+ +### `git drs rm ...` + +Remove tracked DRS/LFS files from the worktree and index. -Set the default DRS remote. +**Usage:** ```bash -git drs remote set production +git drs rm data/sample.bam +git drs rm data/sample1.bam data/sample2.bam +``` + +**What it does:** + +- Validates that each path is tracked as a Git LFS / git-drs file +- Runs `git rm` for those paths +- Does not mutate remote DRS state immediately + +**Remote behavior on push:** + +When the deletion is committed and pushed: + +- `git drs push` and the managed `pre-push` hook derive deleted pointers from the pushed Git commit delta +- if the scoped record has exactly one `controlled_access` entry, the whole DRS record is deleted +- if the scoped record has multiple `controlled_access` entries, only the current `organization/project` resource is removed +- underlying object bytes are not deleted by default + +### `git drs copy-records [source-remote] ` + +Copy Syfon records for one `organization/project` scope from one configured remote to another. + +**Usage:** + +```bash +git drs copy-records \ + \ + \ + +``` + +Or, to copy from the configured default remote: + +```bash +git drs copy-records \ + \ + +``` + +**Options:** + +- ``: Source remote. Optional. Defaults to the configured default remote. +- ``: Target remote. Required. +- ``: Required scope argument, for example `HTAN_INT/BForePC`. +- `--batch-size `: Source page size and target bulk write size. Default: `250`. 
+ +**What it does:** + +- Reads all source records for the requested `organization/project` using Syfon's internal bulk/list APIs +- Looks up matching DIDs on the target in batches +- Creates records that do not already exist on the target +- For existing DIDs, preserves the target record and only merges: + - `controlled_access` + - `access_methods` + +**Merge semantics for existing target records:** + +- Existing target metadata is preserved +- `controlled_access` becomes the union of source and target values +- `access_methods` becomes the union of source and target values +- Records with no effective change are skipped + +**Example:** + +```bash +git drs copy-records \ + dev \ + prod \ + HTAN_INT/BForePC +``` + +**When to use it:** + +- Promote DRS metadata between Syfon instances +- Backfill `controlled_access` and `access_methods` onto an existing target instance +- Copy project-scoped records without re-uploading object bytes + +### `git drs query` + +Query a DRS object by its DRS ID or SHA256 checksum. + +**Usage:** + +```bash +# Query by DRS ID (default behavior) +git drs query + +# Query by SHA256 checksum +git drs query --checksum ``` ## Bucket Mapping @@ -119,7 +306,28 @@ git drs bucket add-organization production \ ### `git drs bucket add-project` -Map an organization/project to a bucket path. +```bash +# Known SHA path +git drs add-url s3://bucket/path/file.bin data/file.bin --sha256 + +# Unknown SHA path +git drs add-url s3://bucket/path/file.bin data/file.bin +``` + +**Options:** + +- `--sha256 `: Optional SHA256 hash of the source object. + If omitted, add-url uses an ETag+source-derived placeholder OID and registers metadata without a local payload blob. + +**Notes:** + +- `add-url` no longer accepts per-command AWS credential flags. +- S3 connection hints are resolved from environment/runtime config when needed (for example `AWS_REGION`, `AWS_ENDPOINT_URL`, `AWS_ACCESS_KEY_ID`, `AWS_SECRET_ACCESS_KEY`). 
+- Registration happens on `git drs push`, not at `add-url` time. + +### `git drs version` + +Display Git DRS version information. ```bash git drs bucket add-project production \ @@ -215,6 +423,13 @@ What it does: - resolves local pointer/object metadata - uploads local bytes when needed - registers object metadata with the target Syfon instance +- reconciles committed deletes derived from the pushed ref delta + +Notes: + +- delete reconciliation is Git-history-derived; there is no local delete-intent sidecar state +- `git drs push` uses the current branch upstream as the delete diff base when one exists +- plain `git push` uses the managed `pre-push` hook, which receives authoritative old/new SHAs from Git ### `git drs add-url [path]` diff --git a/docs/drs-uri-canonical-identity.md b/docs/drs-uri-canonical-identity.md new file mode 100644 index 00000000..c810d357 --- /dev/null +++ b/docs/drs-uri-canonical-identity.md @@ -0,0 +1,212 @@ +# ADR 0003: Use DRS URI as canonical remote identity and a derived compact OID for local pointer/cache identity + +## Status +Proposed + +## Context + +`git-drs` no longer needs to treat the Git LFS-style `oid` as a literal content SHA256 in every workflow. 
+ +This is most obvious in `add-url` and other metadata-first flows: + +- the authoritative remote object identity is the DRS object itself +- the authoritative retrieval source is the DRS metadata (`access_methods`, checksums, scoped metadata) +- local pointer files and local cache paths still need a compact identifier + +Today the code still assumes the local pointer/cache identifier is shaped like a Git LFS SHA256: + +- pointers are parsed only when they contain `oid sha256:<64-hex>` +- local object fanout paths assume a 64-character hex identifier +- some code still conflates "pointer OID" with "content SHA256" + +At the same time, raw DRS URIs are not a good direct substitute for the pointer `oid` slot: + +- they are long and punctuation-heavy +- they are awkward as local cache keys and filesystem fanout paths +- they couple local identity directly to remote URI syntax +- they reduce cache/dedupe reuse if the same bytes exist under multiple DRS records + +So we need to separate two concepts that are currently blurred together: + +1. **Canonical remote identity** + - the DRS URI / DRS object ID that identifies the remote record +2. **Local pointer/cache identity** + - a compact, filesystem-safe identifier used in pointer files, cache keys, and worktree inventory + +## Decision + +Adopt this identity model: + +1. **DRS URI is the canonical remote identity** + - it identifies the remote record + - it is the primary stable reference for metadata-first workflows +2. **A derived compact OID is the local pointer/cache identity** + - it is derived deterministically from the canonical remote identity + - it is used in pointer files, local fanout paths, and cache indexing +3. **Do not use the raw DRS URI as the literal pointer OID** + - the pointer/cache slot remains compact and derived +4. 
**Do not require the derived compact OID to equal the content SHA256** + - content SHA256 remains useful metadata when known + - but it is not required to be the primary local identity in metadata-first flows + +## Recommended derived OID format + +The derived local OID should be: + +- deterministic +- compact +- filesystem-safe +- independent of transient access URLs + +Recommended derivation: + +```text +derived_oid = sha256("git-drs-object:v1\ndrs_uri=<canonical-drs-uri>\n") +``` + +This gives: + +- a stable 64-hex identifier +- compatibility with existing local fanout assumptions +- independence from raw URL punctuation/length +- a clean boundary between remote identity and local cache identity + +## Why not use the raw DRS URI as the pointer OID? + +Because the pointer OID slot behaves like a local key, not just a human-readable identifier. + +Using raw DRS URI directly would: + +- make pointer parsing and cache fanout messy +- force local storage assumptions to depend on URI syntax +- complicate path encoding, diagnostics, and inventory code +- make future URI normalization/versioning harder + +The DRS URI should be stored as first-class metadata, not jammed directly into the pointer key slot. + +## Before + +The current model is effectively: + +- pointer OID is assumed to be `sha256:<64hex>` +- local cache fanout is based on that value +- some workflows use placeholders, but still pretend the OID slot is "the SHA256" + +This causes confusion: + +- "OID" and "content checksum" are treated as the same thing even when they are not +- metadata-first objects need placeholder semantics +- design discussions keep returning to "can we use md5, etag, or path instead?"
+ +## After + +The identity model becomes explicit: + +- **DRS URI** = canonical remote identity +- **derived OID** = compact local pointer/cache identity +- **checksums** = content metadata, not always the primary local key + +In practical terms: + +- pointer files keep a compact OID +- local fanout code keeps a compact OID +- remote record matching and canonical reference logic prefer DRS URI +- content SHA256 remains available when known, but no longer defines the whole identity model + +## Implementation guidance + +### Phase 1: Introduce explicit identity types + +Add an internal identity structure such as: + +```text +ObjectIdentity + - DRSURI + - OID + - Checksums +``` + +This is the critical first step. It prevents more code from assuming: + +```text +OID == content SHA256 +``` + +### Phase 2: Keep pointer/cache format compact + +Retain the existing compact local OID shape for compatibility: + +- `oid sha256:<64hex>` in pointer files +- existing local fanout layout + +But redefine the meaning: + +- in metadata-first flows, the 64-hex value is the derived local identity +- not necessarily the file content SHA256 + +This avoids a broad pointer-format migration up front. 
+ +### Phase 3: Store DRS URI as authoritative metadata + +Wherever local DRS metadata is written or read: + +- persist the DRS URI explicitly +- treat it as the canonical remote reference + +This applies especially to: + +- `add-url` +- `add-ref` +- local DRS object files +- copy/query/register reconciliation logic + +### Phase 4: Remove code that equates OID and checksum + +Audit and reduce assumptions in: + +- pointer parsing/writing helpers +- inventory +- push/pull resolution +- registration/update logic +- cache helpers + +The main rule: + +- if code wants a checksum, ask for a checksum +- if code wants a pointer/cache key, ask for the derived OID +- if code wants remote identity, ask for the DRS URI + +## Consequences + +### Positive + +- clarifies identity semantics +- makes metadata-first workflows more coherent +- avoids raw DRS URI leakage into local cache/path mechanics +- preserves compact local storage behavior +- reduces pressure to misuse ETag/MD5/path directly as pointer OIDs + +### Negative + +- requires a modest identity refactor +- some current code still assumes pointer OID is content SHA256 +- compatibility language around `oid sha256:` becomes semantically looser until or unless the external pointer format is generalized later + +## Explicit non-goals + +This ADR does **not** require: + +- raw DRS URI to become the literal pointer OID +- multi-algorithm pointer syntax (`oid md5:...`, `oid etag:...`, etc.) immediately +- full removal of the current compact SHA256-shaped local OID format + +Those may be considered later, but they are not required to fix the architectural confusion now. + +## Summary + +The correct split is: + +- **DRS URI** for canonical remote identity +- **derived compact OID** for local pointer/cache identity + +That gives `git-drs` a cleaner architecture without forcing local storage and pointer behavior to inherit all the messiness of raw remote identifiers. 
diff --git a/docs/ga4gh-drs-scalability-gaps.md b/docs/ga4gh-drs-scalability-gaps.md new file mode 100644 index 00000000..a3395ad5 --- /dev/null +++ b/docs/ga4gh-drs-scalability-gaps.md @@ -0,0 +1,195 @@ +# GA4GH DRS Scalability Gaps for `git-drs` + +## Summary + +GA4GH DRS is a reasonable read/access protocol for individual object resolution, but it is not a good standalone fit for high-volume `git-drs` workflows when the client must translate one logical operation into many REST calls. + +The problem is not that DRS is incorrect. The problem is that the base API surface is too narrow for the operational patterns `git-drs` actually needs: + +- checksum-first lookup +- bulk existence checks +- bulk access URL resolution +- bulk registration +- upload orchestration +- scoped delete/update flows + +If every one of those has to be decomposed into multiple per-object HTTP calls, the result is not operationally scalable. + +## The core issue + +For `git-drs`, many user-facing operations are logically single operations: + +- "do these 500 OIDs already exist?" +- "give me access URLs for these 200 objects" +- "register these 150 objects" +- "delete the objects that match this checksum in this scope" + +Base GA4GH DRS mostly gives the client: + +- `GET /ga4gh/drs/v1/objects/{object_id}` +- `GET /ga4gh/drs/v1/objects/{object_id}/access/{access_id}` +- `GET /ga4gh/drs/v1/service-info` + +That works for one object at a time. It does not work well for batch-oriented Git and LFS workflows. + +## Why this is not scalable + +### 1. 
One logical operation turns into N network round-trips + +Examples: + +- checksum lookup by many OIDs: + - no native bulk checksum query in base DRS + - client must fan out one request per checksum +- access resolution for many objects: + - no native bulk access URL resolution in base DRS + - client must fan out one request per object/access method +- delete by hash: + - client must first resolve matching objects + - then issue one delete per object + +This is acceptable for a handful of objects. It is poor for large repos, monorepos, or pre-push/pull batch flows. + +### 2. Latency compounds even when payload size is small + +Most of these calls are metadata calls, not large transfers. The bottleneck is round-trip count: + +- HTTP connection overhead +- auth middleware and token processing +- authz lookup +- request routing +- JSON encode/decode +- repeated server-side DB lookups + +A workflow that should feel like "one batch metadata operation" becomes "hundreds of tiny RPCs over HTTP". + +### 3. Capability gaps force client-side orchestration complexity + +Without batch primitives, the client has to own: + +- fan-out scheduling +- retry policy +- partial failure handling +- de-duplication +- concurrency limits +- fallback behavior when some objects resolve and some do not + +That complexity is not free. It moves real system cost from the server contract into every client. + +### 4. It weakens read-path ergonomics for Git/LFS-shaped workflows + +`git-drs` is not a generic browser for one object at a time. It is operating on working trees, pointer inventories, checksum sets, and batch synchronization. + +Those workflows are naturally set-oriented, not object-at-a-time. + +Trying to force them through only: + +- `GetObject(id)` +- `GetAccessURL(id, access_id)` + +produces a client that is formally "more DRS-pure" but operationally worse. 
+ +## Concrete examples + +### Bulk checksum validity + +Logical operation: + +- "tell me which of these 1000 SHA256 values already exist" + +Base DRS shape: + +- 1000 repeated checksum/object lookups + +What the client actually needs: + +- one bulk validity response: + - `sha256 -> exists` + +This is why `/index/bulk/sha256/validity` or an equivalent bulk DRS extension exists. + +### Bulk access resolution + +Logical operation: + +- "hydrate all unresolved pointer files in this checkout" + +Base DRS shape: + +- resolve object for each OID +- resolve access URL for each object + +That means at least: + +- one object-resolution request per object +- one access-resolution request per object + +For a checkout with many files, this is an avoidable round-trip explosion. + +### Delete by checksum or scoped cleanup + +Logical operation: + +- "delete matching records for this checksum in this repo scope" + +Base DRS shape: + +1. resolve records +2. iterate matching IDs +3. delete one by one + +That is technically possible. It is not a good contract for batch cleanup. + +## Position + +`git-drs` should not be forced into a "base DRS only" model when that model degrades correctness, simplicity, or scale. + +The better architecture is: + +- use base GA4GH DRS where it fits naturally + - single-object read + - access resolution + - service-info probing +- keep explicit extension APIs where batch or write semantics are required + - bulk checksum validity + - bulk checksum lookup + - bulk access URL resolution + - bulk registration + - upload negotiation + - batch delete/update helpers + +## Recommended design rule + +Do not collapse one logical client operation into multiple mandatory REST calls unless there is no practical alternative. 
+ +More concretely: + +- if a client operation is inherently batch-oriented, prefer a batch endpoint +- if a workflow is write-oriented, do not pretend it is standard DRS when it is not +- if an optimization is required for acceptable repo-scale performance, keep it as an explicit extension instead of hiding it behind repeated single-object DRS calls + +## Recommended client split + +`git-drs` should continue to distinguish: + +- **DRS read contract** + - `GetObject` + - `GetAccessURL` + - `GetServiceInfo` + +- **DRS extension contract** + - bulk checksum lookup / validity + - object registration + - upload request negotiation + - bulk access resolution + - delete/update helpers + +That is a more honest and more scalable model than pretending all useful client operations can be reduced to base DRS primitives without cost. + +## Bottom line + +GA4GH DRS is a solid object access protocol. + +It is not, by itself, a scalable batch control-plane for `git-drs`. + +Where one custom operation would otherwise require two or more mandatory REST operations per object, a dedicated extension is justified. The scalability cost is real, and the client should not absorb it just to preserve a cleaner-looking but weaker protocol story. diff --git a/docs/getting-started.md b/docs/getting-started.md index cfef54f3..4151b627 100644 --- a/docs/getting-started.md +++ b/docs/getting-started.md @@ -34,11 +34,7 @@ It no longer tries to be a mixed bag of Git, Git LFS, and DRS transport wrappers ServerAliveInterval 30 ``` -3. Initialize `git-drs` in the repo: - - ```bash - git drs init - ``` +3. If the repo does not yet have a `git-drs` remote configured, add one now. `remote add` will bootstrap the repo-local hooks/config automatically if needed. 4. 
Hydrate tracked files if needed: @@ -58,13 +54,15 @@ git drs install ## One-Time Repository Setup -After cloning or creating a repository: +After cloning or creating a repository, the normal first step is adding a remote: ```bash -git drs init +git drs remote add gen3 production HTAN_INT/BForePC --cred /path/to/credentials.json ``` -That sets up repository-local `git-drs` state and hooks. +That command now sets up repository-local `git-drs` state and hooks automatically if they are missing. + +You can still run `git drs init` directly when you want to initialize the repo explicitly before configuring any remote, or when you want to repair the hook/config wiring. ## Add a Gen3 Remote @@ -83,6 +81,7 @@ git drs remote add gen3 production HTAN_INT/BForePC --cred /path/to/credentials. Notes: - scope is one positional argument: `organization/project` +- if repo-local `git-drs` setup is missing, this command initializes it first - users do not provide `--bucket` - users do not provide `--url` - bucket resolution is scope-based and server-backed @@ -99,17 +98,13 @@ For a new repository or a repository that has not yet been configured with `git- 1. Initialize the repository: - ```bash - git drs init - ``` - -2. Add the target remote: - ```bash git drs remote add gen3 production HTAN_INT/BForePC --cred /path/to/credentials.json ``` -3. Verify the configuration: + This bootstraps the repo-local `git-drs` hooks/config if needed. + +2. Verify the configuration: ```bash git drs remote list @@ -155,6 +150,30 @@ Example: git drs remote add gen3 production HTAN_INT/BForePC --cred /path/to/new-credentials.json ``` +### When a key expires or is replaced + +The supported recovery path is: + +```bash +git drs remote add gen3 production HTAN_INT/BForePC --cred /path/to/new-credentials.json +``` + +Practical answers to the common questions: + +- do you need to run `git drs init` again? + - no +- do you need to run `git drs remote add gen3` again? 
+ - yes, if the API key itself was replaced or you want to import a new credential file/token +- does `git-drs` detect token expiry automatically? + - partially + - if the stored access token is expired but the stored API key is still valid, `git-drs` will try to refresh the access token automatically + - if the API key itself is expired, revoked, or replaced, rerun `git drs remote add gen3 ...` +- how do you check what remote/profile is in use? + - `git drs remote list` shows the configured remotes + - the Gen3 profile data lives in `~/.gen3/gen3_client_config.ini` + +As a rule, if credentials changed and you want a predictable fix, re-run `git drs remote add gen3 ...` for that remote. That updates the stored profile and repo token plumbing without requiring repo re-initialization. + ## Managing Additional Remotes You can add multiple remotes for multi-environment workflows. @@ -206,11 +225,27 @@ git add .gitattributes ```bash git add sample.bam git commit -m "Add sample" -git push +git drs push ``` `git-drs` handles pointer/object registration behavior around the Git workflow. +## Remove a Tracked File + +Use `git drs rm` for tracked DRS/LFS files: + +```bash +git drs rm sample.bam +git commit -m "Remove sample" +git drs push +``` + +This removes the pointer from Git immediately. The remote DRS mutation happens only when that deletion is committed and pushed: + +- if the scoped record has one `controlled_access` entry, the record is deleted +- if it has multiple `controlled_access` entries, only the current `organization/project` resource is removed +- underlying object bytes are not deleted by default + ## Inspect Tracked Files Use `ls-files` as the local inventory command: @@ -281,7 +316,7 @@ git drs add-url s3://my-bucket/path/to/object.bin data/from-bucket.bin ## Session Workflow -> **Note:** You do not need to run `git drs init` again. Initialization is a one-time setup per local repository clone. +> **Note:** You do not need to run `git drs init` again. 
Remote configuration bootstraps repo-local setup when needed, and explicit `git drs init` is mainly for manual setup or repair. For a normal work session: @@ -291,6 +326,8 @@ For a normal work session: git drs remote add gen3 production HTAN_INT/BForePC --cred /path/to/new-credentials.json ``` + You do not need to run `git drs init` again for this. Refreshing credentials is a `remote add` operation, not a repo reinitialization step. + 2. Update Git history if needed ```bash @@ -329,21 +366,20 @@ git drs remote add gen3 production HTAN_INT/BForePC --cred /path/to/new-credenti Use this flow when developing against a local Syfon/DRS server instead of a hosted Gen3 deployment. -1. Initialize the repo: +1. Add the local remote: ```bash - git drs init + git drs remote add local origin http://localhost:8080 \ + calypr/end_to_end_test \ + --username drs-user \ + --password drs-pass ``` -2. Add the local remote: - - ```bash - git drs remote add local origin http://localhost:8080 - ``` + This bootstraps the repo-local `git-drs` hooks/config if needed. If your local server requires basic auth, include the local auth flags supported by that command. -3. Track and push: +2. Track and push: ```bash git drs track "*.bin" @@ -352,7 +388,7 @@ Use this flow when developing against a local Syfon/DRS server instead of a host git drs push ``` -4. Verify hydration: +3. Verify hydration: ```bash git drs pull @@ -380,7 +416,6 @@ This copies metadata only. It does not copy object bytes between buckets. ```bash git drs install -git drs init git drs remote add gen3 production HTAN_INT/BForePC --cred /path/to/credentials.json git drs track "*.bam" git add .gitattributes diff --git a/docs/troubleshooting.md b/docs/troubleshooting.md index c0b77c7b..0158fac0 100644 --- a/docs/troubleshooting.md +++ b/docs/troubleshooting.md @@ -10,12 +10,12 @@ Common issues and solutions for the cleaned `git-drs` CLI. No. -`git drs init` is repository setup. 
Run it once per local clone unless you are deliberately reinitializing the repo. +`git drs init` is repository setup. In most cases you do not need to run it manually at all, because `git drs remote add ...` now bootstraps that setup automatically when it is missing. Run it when: -- you first clone a repository and need local `git-drs` setup -- you create a new repository and want to enable `git-drs` +- you want to initialize repo-local `git-drs` state before adding any remote +- you want to repair hooks/config wiring explicitly Do not run it every session: @@ -72,8 +72,9 @@ Those changes persist in the clone. They are not something you redo per session. Examples: -- `git drs init` - `git drs remote add gen3 ...` +- `git drs remote remove ...` +- `git drs init` - `git drs track` - `git drs ls-files` - `git drs pull` @@ -154,10 +155,12 @@ That usually just means hydration has not happened yet. Run: ```bash -git drs init +git drs remote list git drs pull ``` +If the repo has never had a `git-drs` remote configured, run `git drs remote add ...` first. That command will also install the repo-local hooks/config. + ### Network timeout during push or download If you use SSH remotes, keepalives help: @@ -211,6 +214,25 @@ git ls-files -- path/to/file git drs ls-files -l ``` +### `git remote remove` did not remove my `git-drs` remote + +That is expected. + +Git remotes and `git-drs` remotes live in different config domains. + +Use: + +```bash +git drs remote list +git drs remote remove <name> +``` + +or: + +```bash +git drs remote rm <name> +``` + ### `git drs pull` does nothing That usually means one of these: @@ -269,6 +291,32 @@ Refresh by re-adding the remote with a new credential file or token: git drs remote add gen3 production HTAN_INT/BForePC --cred /path/to/new-credentials.json ``` +You do not need to run `git drs init` again.
+ +What `git-drs` does automatically: + +- if the stored access token is expired but the stored API key is still valid, `git-drs` will attempt to refresh the access token +- if the API key itself is expired, revoked, or replaced, you need to re-run `git drs remote add gen3 ...` + +How to think about recovery: + +- token expired, key still valid: + - often automatic +- key expired or replaced: + - rerun `git drs remote add gen3 ... --cred ...` or `--token ...` + +How to check what is in use: + +```bash +git drs remote list +``` + +And for the underlying Gen3 profile data: + +- inspect `~/.gen3/gen3_client_config.ini` + +If you want the least surprising fix, just re-run `git drs remote add gen3 ...` with the current credential file. That updates the stored profile and repo token plumbing in one step. + ### `git push` fails with upload or register errors Check: @@ -309,7 +357,6 @@ That is normal. After cloning: ```bash -git drs init git drs pull ``` diff --git a/go.mod b/go.mod index 8765febe..3096b9d6 100644 --- a/go.mod +++ b/go.mod @@ -1,16 +1,17 @@ module github.com/calypr/git-drs -go 1.26.2 +go 1.26.3 require ( github.com/bytedance/sonic v1.15.0 - github.com/calypr/data-client v0.0.0-20260504172902-8e9b714aa299 - github.com/calypr/syfon v0.2.8-0.20260503003649-cda722e27216 - github.com/calypr/syfon/apigen v0.2.6-0.20260503003649-cda722e27216 + github.com/calypr/data-client v0.0.0-20260506231822-6a4689d4201f + github.com/calypr/syfon v0.2.9-0.20260511213931-ff4d5a467b3e + github.com/calypr/syfon/apigen v0.2.7-0.20260511213931-ff4d5a467b3e github.com/git-lfs/pktline v0.0.0-20230103162542-ca444d533ef1 github.com/go-git/go-git/v5 v5.13.0 github.com/golang-jwt/jwt/v5 v5.3.1 github.com/google/uuid v1.6.0 + github.com/mattn/go-isatty v0.0.21 github.com/spf13/cobra v1.10.2 github.com/stretchr/testify v1.11.1 golang.org/x/sync v0.20.0 @@ -92,7 +93,6 @@ require ( github.com/klauspost/cpuid/v2 v2.3.0 // indirect github.com/kylelemons/godebug v1.1.0 // indirect 
github.com/mattn/go-colorable v0.1.14 // indirect - github.com/mattn/go-isatty v0.0.21 // indirect github.com/mattn/go-runewidth v0.0.23 // indirect github.com/oapi-codegen/runtime v1.4.0 // indirect github.com/philhofer/fwd v1.2.0 // indirect @@ -141,7 +141,7 @@ require ( github.com/aws/aws-sdk-go-v2/config v1.32.14 github.com/aws/aws-sdk-go-v2/credentials v1.19.14 github.com/aws/aws-sdk-go-v2/service/s3 v1.99.0 - github.com/calypr/syfon/client v0.2.7-0.20260503003649-cda722e27216 + github.com/calypr/syfon/client v0.2.8-0.20260511213931-ff4d5a467b3e github.com/hashicorp/go-version v1.9.0 // indirect github.com/inconshreveable/mousetrap v1.1.0 // indirect github.com/spf13/pflag v1.0.10 // indirect diff --git a/go.sum b/go.sum index 514e9f90..d823b2d1 100644 --- a/go.sum +++ b/go.sum @@ -115,14 +115,14 @@ github.com/bytedance/sonic v1.15.0 h1:/PXeWFaR5ElNcVE84U0dOHjiMHQOwNIx3K4ymzh/uS github.com/bytedance/sonic v1.15.0/go.mod h1:tFkWrPz0/CUCLEF4ri4UkHekCIcdnkqXw9VduqpJh0k= github.com/bytedance/sonic/loader v0.5.0 h1:gXH3KVnatgY7loH5/TkeVyXPfESoqSBSBEiDd5VjlgE= github.com/bytedance/sonic/loader v0.5.0/go.mod h1:AR4NYCk5DdzZizZ5djGqQ92eEhCCcdf5x77udYiSJRo= -github.com/calypr/data-client v0.0.0-20260504172902-8e9b714aa299 h1:q5clCvC1DVNgswtek9iXa0AW31Pz0zAVEAryN/Ertdo= -github.com/calypr/data-client v0.0.0-20260504172902-8e9b714aa299/go.mod h1:cAKvEGQogXFM4Hz22/JNOH+l2bRaz+pjT3N2H5cC8D4= -github.com/calypr/syfon v0.2.8-0.20260503003649-cda722e27216 h1:f/fr9r4N3yKARlX+RPiZmg6fJG3xuSLpeTvk6as6AxM= -github.com/calypr/syfon v0.2.8-0.20260503003649-cda722e27216/go.mod h1:m2jd8Snb+Gjc32AcOTdioZVjLmhsAXw7F4uzHQTGBtg= -github.com/calypr/syfon/apigen v0.2.6-0.20260503003649-cda722e27216 h1:MuPHiYQXGX7frvN3EQZ4ZM5RjRZ1eGVY0D4iXCNAj0s= -github.com/calypr/syfon/apigen v0.2.6-0.20260503003649-cda722e27216/go.mod h1:9JNwTgR57yKJlWqZpdqP+/l4zCNzH1EIFrW+e20PyMQ= -github.com/calypr/syfon/client v0.2.7-0.20260503003649-cda722e27216 h1:1LFo3dMc4ZQHml9zQsBE5/k2ZEcHT4EpqxL+Hr/ROjo= 
-github.com/calypr/syfon/client v0.2.7-0.20260503003649-cda722e27216/go.mod h1:DQSqNkxl9V3w08BiMconcJh3xtc+/Je7Xeo7qRH7wto= +github.com/calypr/data-client v0.0.0-20260506231822-6a4689d4201f h1:+jRjTLBjCjxbWvcbYIi9Oe+XBGlwLJnnk8mk1wHEamY= +github.com/calypr/data-client v0.0.0-20260506231822-6a4689d4201f/go.mod h1:cAKvEGQogXFM4Hz22/JNOH+l2bRaz+pjT3N2H5cC8D4= +github.com/calypr/syfon v0.2.9-0.20260511213931-ff4d5a467b3e h1:VSCX3ZwQcOWCjrZ7WFP4TW9vGi9WPYZuuZsjEoRluwA= +github.com/calypr/syfon v0.2.9-0.20260511213931-ff4d5a467b3e/go.mod h1:FMYmSy6rbUGbFcuNTlKtxIhWSlIRPbZfpauKMO0k1V4= +github.com/calypr/syfon/apigen v0.2.7-0.20260511213931-ff4d5a467b3e h1:PtLxUIloatJGqZx/UkvG7wT9z8vh7R3N4CyDnc139Zk= +github.com/calypr/syfon/apigen v0.2.7-0.20260511213931-ff4d5a467b3e/go.mod h1:VrRZ2A17YV91Zsm7CF/u1/Z+DfcZAk8Q4Pk1xklb5xU= +github.com/calypr/syfon/client v0.2.8-0.20260511213931-ff4d5a467b3e h1:ycB0RN7nRdbU9gRynm8JC1pGsBSc4VmOxLkKGNPCLtg= +github.com/calypr/syfon/client v0.2.8-0.20260511213931-ff4d5a467b3e/go.mod h1:9MTLDQ5clwDHcDuKCEuVaV7bebsCIUtUQxTegW3RDVo= github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= github.com/clipperhouse/uax29/v2 v2.7.0 h1:+gs4oBZ2gPfVrKPthwbMzWZDaAFPGYK72F0NJv2v7Vk= diff --git a/internal/config/config.go b/internal/config/config.go index 58a94fa2..e45a0d43 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -5,6 +5,7 @@ import ( "fmt" "log/slog" "path/filepath" + "sort" "strings" "github.com/calypr/git-drs/internal/common" @@ -318,6 +319,64 @@ func SaveConfig(cfg *Config) error { return repo.Storer.SetConfig(conf) } +func RemoveRemote(name Remote) (*Config, error) { + cfg, err := LoadConfig() + if err != nil { + return nil, err + } + + if _, ok := cfg.Remotes[name]; !ok { + return nil, fmt.Errorf("remote '%s' not found", name) + } + + delete(cfg.Remotes, name) + + keys := []string{ + 
fmt.Sprintf("drs.remote.%s.type", name), + fmt.Sprintf("drs.remote.%s.endpoint", name), + fmt.Sprintf("drs.remote.%s.project", name), + fmt.Sprintf("drs.remote.%s.bucket", name), + fmt.Sprintf("drs.remote.%s.organization", name), + fmt.Sprintf("drs.remote.%s.storage_prefix", name), + fmt.Sprintf("drs.remote.%s.token", name), + fmt.Sprintf("drs.remote.%s.username", name), + fmt.Sprintf("drs.remote.%s.password", name), + fmt.Sprintf("remote.%s.lfsurl", name), + } + if err := gitrepo.UnsetGitConfigOptions(keys); err != nil { + return nil, err + } + + if cfg.DefaultRemote == name { + cfg.DefaultRemote = firstRemote(cfg) + } + + if err := SaveConfig(cfg); err != nil { + return nil, err + } + + if cfg.DefaultRemote == "" { + if err := gitrepo.UnsetGitConfigOptions([]string{"drs.default-remote"}); err != nil { + return nil, err + } + } + + return LoadConfig() +} + +func firstRemote(cfg *Config) Remote { + if cfg == nil || len(cfg.Remotes) == 0 { + return "" + } + + names := make([]string, 0, len(cfg.Remotes)) + for name := range cfg.Remotes { + names = append(names, string(name)) + } + sort.Strings(names) + return Remote(names[0]) +} + // GetGitConfigInt reads an integer value from git config // getGitConfigValue retrieves a value from git config by key func getConfigPath() (string, error) { diff --git a/internal/config/config_test.go b/internal/config/config_test.go index e555b340..cdf0c98e 100644 --- a/internal/config/config_test.go +++ b/internal/config/config_test.go @@ -7,6 +7,7 @@ import ( "github.com/calypr/git-drs/internal/drslog" "github.com/calypr/git-drs/internal/gitrepo" + syconf "github.com/calypr/syfon/client/config" ) func setupTestRepo(t *testing.T) string { @@ -383,3 +384,32 @@ func TestLocalRemoteGetClientResolvesBucketScopeMappings(t *testing.T) { t.Fatalf("StoragePrefix = %q, want program-root/project-subpath", gitCtx.StoragePrefix) } } + +func TestNewGitContextReadsLFSConcurrentTransfers(t *testing.T) { + setupTestRepo(t) + + if err := 
gitrepo.SetGitConfigOptions(map[string]string{ + "lfs.concurrenttransfers": "7", + }); err != nil { + t.Fatalf("SetGitConfigOptions failed: %v", err) + } + + cred := syconf.Credential{ + APIEndpoint: "https://example.test", + AccessToken: "token", + } + remote := Gen3Remote{ + Endpoint: "https://example.test", + Organization: "org1", + ProjectID: "proj1", + Bucket: "bucket1", + } + + gitCtx, err := newGitContext(cred, remote, drslog.GetLogger()) + if err != nil { + t.Fatalf("newGitContext failed: %v", err) + } + if gitCtx.UploadConcurrency != 7 { + t.Fatalf("UploadConcurrency = %d, want 7", gitCtx.UploadConcurrency) + } +} diff --git a/internal/config/remote.go b/internal/config/remote.go index 8bb0b0bc..3245d372 100644 --- a/internal/config/remote.go +++ b/internal/config/remote.go @@ -13,6 +13,8 @@ import ( syconf "github.com/calypr/syfon/client/config" ) +const credentialHelpSuffix = "Refresh credentials with `git drs remote add gen3 --cred ` or `--token `. See docs/getting-started.md." + type DRSRemote interface { GetProjectId() string GetOrganization() string @@ -61,7 +63,7 @@ func (s Gen3Remote) GetClient(remoteName string, logger *slog.Logger) (*GitConte return nil, err } if err := credentials.EnsureValidCredential(context.Background(), cred, logger); err != nil { - return nil, err + return nil, WrapCredentialValidationError(remoteName, err) } return newGitContext(*cred, s, logger) } @@ -194,3 +196,13 @@ func localRemoteFromGen3(gen3 *Gen3Remote, username string, password string) *Lo BasicPassword: strings.TrimSpace(password), } } + +func WrapCredentialValidationError(remoteName string, err error) error { + if err == nil { + return nil + } + if strings.TrimSpace(remoteName) == "" { + return fmt.Errorf("%w. %s", err, credentialHelpSuffix) + } + return fmt.Errorf("%w. Remote %q requires refreshed credentials. 
%s", err, remoteName, credentialHelpSuffix) +} diff --git a/internal/drsdelete/git_history.go b/internal/drsdelete/git_history.go new file mode 100644 index 00000000..acc39351 --- /dev/null +++ b/internal/drsdelete/git_history.go @@ -0,0 +1,98 @@ +package drsdelete + +import ( + "context" + "fmt" + "os/exec" + "sort" + "strings" + + "github.com/calypr/git-drs/internal/lfs" +) + +type deletedPointer struct { + Path string + OID string +} + +func collectDeletedPointers(ctx context.Context, refs []RefUpdate) (map[string][]deletedPointer, error) { + grouped := make(map[string][]deletedPointer) + seen := make(map[string]struct{}) + for _, ref := range refs { + oldSHA := strings.TrimSpace(ref.OldSHA) + newSHA := strings.TrimSpace(ref.NewSHA) + if oldSHA == "" || newSHA == "" || isZeroSHA(oldSHA) || isZeroSHA(newSHA) { + continue + } + paths, err := gitDeletedPaths(ctx, oldSHA, newSHA) + if err != nil { + return nil, err + } + for _, path := range paths { + key := oldSHA + "\x00" + path + if _, ok := seen[key]; ok { + continue + } + seen[key] = struct{}{} + + oid, ok, err := gitPointerOID(ctx, oldSHA, path) + if err != nil { + return nil, err + } + if !ok { + continue + } + grouped[oid] = append(grouped[oid], deletedPointer{Path: path, OID: oid}) + } + } + return grouped, nil +} + +func deletedPaths(items []deletedPointer) []string { + out := make([]string, 0, len(items)) + for _, item := range items { + out = append(out, item.Path) + } + sort.Strings(out) + return out +} + +func gitDeletedPaths(ctx context.Context, oldSHA, newSHA string) ([]string, error) { + cmd := exec.CommandContext(ctx, "git", "diff", "--name-status", "--diff-filter=D", "-M", oldSHA, newSHA) + out, err := cmd.CombinedOutput() + if err != nil { + return nil, fmt.Errorf("git diff deleted paths %s..%s: %s", oldSHA, newSHA, strings.TrimSpace(string(out))) + } + lines := strings.Split(strings.TrimSpace(string(out)), "\n") + paths := make([]string, 0, len(lines)) + for _, line := range lines { + line = 
strings.TrimSpace(line) + if line == "" { + continue + } + parts := strings.Split(line, "\t") + if len(parts) < 2 || parts[0] != "D" { + continue + } + paths = append(paths, parts[1]) + } + return paths, nil +} + +func gitPointerOID(ctx context.Context, ref, path string) (string, bool, error) { + spec := ref + ":" + path + cmd := exec.CommandContext(ctx, "git", "show", spec) + out, err := cmd.CombinedOutput() + if err != nil { + return "", false, fmt.Errorf("git show %s: %s", spec, strings.TrimSpace(string(out))) + } + oid, _, ok := lfs.ParseLFSPointer(out) + if !ok { + return "", false, nil + } + return "sha256:" + strings.TrimPrefix(strings.TrimSpace(oid), "sha256:"), true, nil +} + +func isZeroSHA(sha string) bool { + return strings.TrimSpace(sha) == "0000000000000000000000000000000000000000" +} diff --git a/internal/drsdelete/live_refs.go b/internal/drsdelete/live_refs.go new file mode 100644 index 00000000..7198c32e --- /dev/null +++ b/internal/drsdelete/live_refs.go @@ -0,0 +1,42 @@ +package drsdelete + +import ( + "log/slog" + "sort" + "strings" + + "github.com/calypr/git-drs/internal/lfs" +) + +func collectLivePathsByOID(refs []RefUpdate, logger *slog.Logger) (map[string][]string, error) { + targets := make([]string, 0, len(refs)) + seen := make(map[string]struct{}, len(refs)) + for _, ref := range refs { + newSHA := strings.TrimSpace(ref.NewSHA) + if newSHA == "" || isZeroSHA(newSHA) { + continue + } + if _, ok := seen[newSHA]; ok { + continue + } + seen[newSHA] = struct{}{} + targets = append(targets, newSHA) + } + if len(targets) == 0 { + return map[string][]string{}, nil + } + + files, err := lfs.GetLfsFilesForRefs(targets, logger) + if err != nil { + return nil, err + } + liveByOID := make(map[string][]string) + for path, info := range files { + oid := "sha256:" + strings.TrimPrefix(strings.TrimSpace(info.Oid), "sha256:") + liveByOID[oid] = append(liveByOID[oid], path) + } + for oid := range liveByOID { + sort.Strings(liveByOID[oid]) + } + return 
liveByOID, nil +} diff --git a/internal/drsdelete/reconcile.go b/internal/drsdelete/reconcile.go new file mode 100644 index 00000000..7e413e5e --- /dev/null +++ b/internal/drsdelete/reconcile.go @@ -0,0 +1,115 @@ +package drsdelete + +import ( + "context" + "fmt" + "io" + "log/slog" + + "github.com/calypr/git-drs/internal/config" + "github.com/calypr/git-drs/internal/drsremote" + sycommon "github.com/calypr/syfon/common" +) + +type RefUpdate struct { + OldSHA string + NewSHA string +} + +type Summary struct { + DeletedRecords int + RemovedResources int + ClearedLocalOnly int + PendingMissing int + PendingAmbiguous int +} + +func ReconcileCommittedDeletes(ctx context.Context, drsCtx *config.GitContext, refs []RefUpdate, logger *slog.Logger) (Summary, error) { + if drsCtx == nil || drsCtx.Client == nil { + return Summary{}, fmt.Errorf("DRS client unavailable") + } + if logger == nil { + logger = slog.New(slog.NewTextHandler(io.Discard, nil)) + } + if len(refs) == 0 { + return Summary{}, nil + } + + deletedByOID, err := collectDeletedPointers(ctx, refs) + if err != nil { + return Summary{}, err + } + if len(deletedByOID) == 0 { + return Summary{}, nil + } + + liveByOID, err := collectLivePathsByOID(refs, logger) + if err != nil { + return Summary{}, err + } + + resource, err := sycommon.ResourcePath(drsCtx.Organization, drsCtx.ProjectId) + if err != nil { + return Summary{}, err + } + + summary := Summary{} + for oid, deletions := range deletedByOID { + if livePaths := liveByOID[oid]; len(livePaths) > 0 { + summary.ClearedLocalOnly += len(deletions) + continue + } + + records, err := drsremote.ObjectsByHashForScope(ctx, drsCtx, oid) + if err != nil { + return summary, err + } + switch len(records) { + case 0: + summary.PendingMissing += len(deletions) + if logger != nil { + logger.Warn("deleted pointer has no scoped DRS match", "oid", oid, "paths", deletedPaths(deletions)) + } + continue + case 1: + default: + summary.PendingAmbiguous += len(deletions) + if logger != 
nil { + logger.Warn("deleted pointer matched multiple scoped DRS records", "oid", oid, "count", len(records), "paths", deletedPaths(deletions)) + } + continue + } + + record := records[0] + controlled := []string(nil) + if record.ControlledAccess != nil { + controlled = sycommon.NormalizeAccessResources(*record.ControlledAccess) + } + if len(controlled) <= 1 { + if err := drsCtx.Client.DRS().DeleteObject(ctx, record.Id, true); err != nil { + return summary, err + } + summary.DeletedRecords++ + continue + } + + var out map[string]any + if err := drsCtx.Client.Requestor().Do(ctx, "POST", "/index/"+record.Id+"/controlled-access/remove", map[string]string{ + "resource": resource, + }, &out); err != nil { + return summary, err + } + summary.RemovedResources++ + } + + if logger != nil && (summary.DeletedRecords > 0 || summary.RemovedResources > 0 || summary.ClearedLocalOnly > 0 || summary.PendingMissing > 0 || summary.PendingAmbiguous > 0) { + logger.Info("delete reconciliation complete", + "deleted_records", summary.DeletedRecords, + "removed_resources", summary.RemovedResources, + "cleared_local_only", summary.ClearedLocalOnly, + "pending_missing", summary.PendingMissing, + "pending_ambiguous", summary.PendingAmbiguous, + ) + } + return summary, nil +} diff --git a/internal/drsdelete/reconcile_test.go b/internal/drsdelete/reconcile_test.go new file mode 100644 index 00000000..73c900c5 --- /dev/null +++ b/internal/drsdelete/reconcile_test.go @@ -0,0 +1,167 @@ +package drsdelete + +import ( + "context" + "encoding/json" + "net/http" + "net/http/httptest" + "os" + "path/filepath" + "strings" + "testing" + + drsapi "github.com/calypr/syfon/apigen/client/drs" +) + +func TestReconcileCommittedDeletes_RemovesControlledAccess(t *testing.T) { + repo := initRepoWithDelete(t, []pointerSpec{{Path: "data.dat", OID: strings.Repeat("a", 64)}}) + + oldWD, _ := os.Getwd() + if err := os.Chdir(repo); err != nil { + t.Fatalf("chdir repo: %v", err) + } + t.Cleanup(func() { _ = 
os.Chdir(oldWD) }) + + oldSHA := gitRevParse(t, repo, "HEAD~1") + newSHA := gitRevParse(t, repo, "HEAD") + + var removedResource string + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + switch { + case r.Method == http.MethodGet && r.URL.Path == "/ga4gh/drs/v1/objects/checksum/"+strings.Repeat("a", 64): + obj := drsapi.DrsObject{ + Id: "did-1", + ControlledAccess: &[]string{"/organization/org/project/proj", "/organization/other/project/x"}, + Checksums: []drsapi.Checksum{{Type: "sha256", Checksum: strings.Repeat("a", 64)}}, + } + records := []drsapi.DrsObject{obj} + writeJSON(t, w, http.StatusOK, drsapi.N200OkDrsObjects{ResolvedDrsObject: &records}) + case r.Method == http.MethodPost && r.URL.Path == "/index/did-1/controlled-access/remove": + var req struct { + Resource string `json:"resource"` + } + if err := json.NewDecoder(r.Body).Decode(&req); err != nil { + t.Fatalf("decode remove controlled access: %v", err) + } + removedResource = req.Resource + writeJSON(t, w, http.StatusOK, map[string]any{"did": "did-1"}) + default: + t.Fatalf("unexpected request: %s %s", r.Method, r.URL.Path) + } + })) + defer server.Close() + + drsCtx := newGitContext(t, server.URL) + summary, err := ReconcileCommittedDeletes(context.Background(), drsCtx, []RefUpdate{{OldSHA: oldSHA, NewSHA: newSHA}}, nil) + if err != nil { + t.Fatalf("reconcile returned error: %v", err) + } + if summary.RemovedResources != 1 { + t.Fatalf("expected one removed resource, got %+v", summary) + } + if removedResource != "/organization/org/project/proj" { + t.Fatalf("unexpected removed resource: %s", removedResource) + } +} + +func TestReconcileCommittedDeletes_DeletesWholeRecord(t *testing.T) { + repo := initRepoWithDelete(t, []pointerSpec{{Path: "other.dat", OID: strings.Repeat("b", 64)}}) + + oldWD, _ := os.Getwd() + if err := os.Chdir(repo); err != nil { + t.Fatalf("chdir repo: %v", err) + } + t.Cleanup(func() { _ = os.Chdir(oldWD) }) + + oldSHA := 
gitRevParse(t, repo, "HEAD~1") + newSHA := gitRevParse(t, repo, "HEAD") + + deleted := false + var deleteReq struct { + DeleteObjectMetadata bool `json:"delete_object_metadata"` + DeleteStorageData bool `json:"delete_storage_data"` + } + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + switch { + case r.Method == http.MethodGet && r.URL.Path == "/ga4gh/drs/v1/objects/checksum/"+strings.Repeat("b", 64): + obj := drsapi.DrsObject{ + Id: "did-2", + ControlledAccess: &[]string{"/organization/org/project/proj"}, + Checksums: []drsapi.Checksum{{Type: "sha256", Checksum: strings.Repeat("b", 64)}}, + } + records := []drsapi.DrsObject{obj} + writeJSON(t, w, http.StatusOK, drsapi.N200OkDrsObjects{ResolvedDrsObject: &records}) + case r.Method == http.MethodPut && r.URL.Path == "/ga4gh/drs/v1/objects/did-2/delete": + if err := json.NewDecoder(r.Body).Decode(&deleteReq); err != nil { + t.Fatalf("decode delete request: %v", err) + } + deleted = true + w.WriteHeader(http.StatusOK) + default: + t.Fatalf("unexpected request: %s %s", r.Method, r.URL.Path) + } + })) + defer server.Close() + + drsCtx := newGitContext(t, server.URL) + summary, err := ReconcileCommittedDeletes(context.Background(), drsCtx, []RefUpdate{{OldSHA: oldSHA, NewSHA: newSHA}}, nil) + if err != nil { + t.Fatalf("reconcile returned error: %v", err) + } + if summary.DeletedRecords != 1 || !deleted { + t.Fatalf("expected full delete, deleted=%v summary=%+v", deleted, summary) + } + if !deleteReq.DeleteObjectMetadata || !deleteReq.DeleteStorageData { + t.Fatalf("expected delete request to purge metadata and storage, got %+v", deleteReq) + } +} + +func TestReconcileCommittedDeletes_SkipsWhenOIDStillLive(t *testing.T) { + oid := strings.Repeat("c", 64) + repo := t.TempDir() + runGit(t, repo, "init") + runGit(t, repo, "config", "user.email", "test@example.com") + runGit(t, repo, "config", "user.name", "Test User") + runGit(t, repo, "config", "filter.lfs.clean", "cat") + 
runGit(t, repo, "config", "filter.lfs.smudge", "cat") + runGit(t, repo, "config", "filter.lfs.process", "cat") + runGit(t, repo, "config", "filter.lfs.required", "false") + runGit(t, repo, "checkout", "-b", "main") + if err := os.WriteFile(filepath.Join(repo, ".gitattributes"), []byte("*.dat filter=lfs diff=lfs merge=lfs -text\n"), 0o644); err != nil { + t.Fatalf("write .gitattributes: %v", err) + } + writePointerFile(t, filepath.Join(repo, "data.dat"), oid, "12") + writePointerFile(t, filepath.Join(repo, "copy.dat"), oid, "12") + runGit(t, repo, "add", ".") + runGit(t, repo, "commit", "-m", "add two pointers") + runGit(t, repo, "rm", "--", "data.dat") + runGit(t, repo, "commit", "-m", "delete one pointer") + + oldWD, _ := os.Getwd() + if err := os.Chdir(repo); err != nil { + t.Fatalf("chdir repo: %v", err) + } + t.Cleanup(func() { _ = os.Chdir(oldWD) }) + + oldSHA := gitRevParse(t, repo, "HEAD~1") + newSHA := gitRevParse(t, repo, "HEAD") + + called := false + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + called = true + t.Fatalf("unexpected remote mutation request: %s %s", r.Method, r.URL.Path) + })) + defer server.Close() + + drsCtx := newGitContext(t, server.URL) + summary, err := ReconcileCommittedDeletes(context.Background(), drsCtx, []RefUpdate{{OldSHA: oldSHA, NewSHA: newSHA}}, nil) + if err != nil { + t.Fatalf("reconcile returned error: %v", err) + } + if called { + t.Fatalf("expected no remote call when oid still live") + } + if summary.ClearedLocalOnly != 1 { + t.Fatalf("expected local-only clear, got %+v", summary) + } +} diff --git a/internal/drsdelete/test_helpers_test.go b/internal/drsdelete/test_helpers_test.go new file mode 100644 index 00000000..147a1867 --- /dev/null +++ b/internal/drsdelete/test_helpers_test.go @@ -0,0 +1,103 @@ +package drsdelete + +import ( + "encoding/json" + "net/http" + "os" + "os/exec" + "path/filepath" + "strings" + "testing" + + "github.com/calypr/git-drs/internal/config" + 
syclient "github.com/calypr/syfon/client" +) + +type pointerSpec struct { + Path string + OID string +} + +func initRepoWithDelete(t *testing.T, specs []pointerSpec) string { + t.Helper() + repo := t.TempDir() + runGit(t, repo, "init") + runGit(t, repo, "config", "user.email", "test@example.com") + runGit(t, repo, "config", "user.name", "Test User") + runGit(t, repo, "config", "filter.lfs.clean", "cat") + runGit(t, repo, "config", "filter.lfs.smudge", "cat") + runGit(t, repo, "config", "filter.lfs.process", "cat") + runGit(t, repo, "config", "filter.lfs.required", "false") + runGit(t, repo, "checkout", "-b", "main") + + if err := os.WriteFile(filepath.Join(repo, ".gitattributes"), []byte("*.dat filter=lfs diff=lfs merge=lfs -text\n"), 0o644); err != nil { + t.Fatalf("write .gitattributes: %v", err) + } + for _, spec := range specs { + writePointerFile(t, filepath.Join(repo, spec.Path), spec.OID, "12") + } + runGit(t, repo, "add", ".") + runGit(t, repo, "commit", "-m", "add pointers") + for _, spec := range specs { + runGit(t, repo, "rm", "--", spec.Path) + } + runGit(t, repo, "commit", "-m", "delete pointers") + return repo +} + +func newGitContext(t *testing.T, serverURL string) *config.GitContext { + t.Helper() + rawClient, err := syclient.New(serverURL) + if err != nil { + t.Fatalf("new client: %v", err) + } + client := rawClient.(*syclient.Client) + return &config.GitContext{ + Client: client, + Organization: "org", + ProjectId: "proj", + } +} + +func gitRevParse(t *testing.T, dir, ref string) string { + t.Helper() + cmd := exec.Command("git", "rev-parse", ref) + cmd.Dir = dir + out, err := cmd.CombinedOutput() + if err != nil { + t.Fatalf("git rev-parse %s failed: %v\n%s", ref, err, string(out)) + } + return strings.TrimSpace(string(out)) +} + +func runGit(t *testing.T, dir string, args ...string) { + t.Helper() + cmd := exec.Command("git", args...) 
+ cmd.Dir = dir + out, err := cmd.CombinedOutput() + if err != nil { + t.Fatalf("git %s failed: %v\n%s", strings.Join(args, " "), err, string(out)) + } +} + +func writeJSON(t *testing.T, w http.ResponseWriter, status int, v any) { + t.Helper() + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(status) + if err := json.NewEncoder(w).Encode(v); err != nil { + t.Fatalf("encode json: %v", err) + } +} + +func writePointerFile(t *testing.T, path, oid, size string) { + t.Helper() + if err := os.MkdirAll(filepath.Dir(path), 0o755); err != nil { + t.Fatalf("mkdir pointer dir: %v", err) + } + content := "version https://git-lfs.github.com/spec/v1\n" + + "oid sha256:" + oid + "\n" + + "size " + size + "\n" + if err := os.WriteFile(path, []byte(content), 0o644); err != nil { + t.Fatalf("write pointer file: %v", err) + } +} diff --git a/internal/drsmap/drs_map.go b/internal/drsmap/drs_map.go index 2d88734a..6029ecbf 100644 --- a/internal/drsmap/drs_map.go +++ b/internal/drsmap/drs_map.go @@ -40,9 +40,7 @@ func WriteObjectsForLFSFiles(builder drsobject.Builder, lfsFiles map[string]lfs. name := file.Name authoritativeObj.Name = &name authoritativeObj.Size = file.Size - - authzMap := syfoncommon.AuthzMapFromScope(builder.Organization, builder.Project) - authoritativeObj, _ = syfoncommon.EnsureAccessMethodAuthorizations(authoritativeObj, authzMap) + ensureControlledAccess(authoritativeObj, builder.Organization, builder.Project) } else { drsID := uuid.NewSHA1(drsobject.UUIDNamespace, []byte(fmt.Sprintf("%s:%s", builder.Project, drsobject.NormalizeOid(file.Oid)))).String() authoritativeObj, err = builder.Build(file.Name, file.Oid, file.Size, drsID) @@ -74,16 +72,12 @@ func WriteObjectsForLFSFiles(builder drsobject.Builder, lfsFiles map[string]lfs. 
} if opts.PreferCacheURL && hint != "" { - cacheAuthzMap := syfoncommon.AuthzMapFromScope(builder.Organization, builder.Project) if authoritativeObj.AccessMethods != nil && len(*authoritativeObj.AccessMethods) > 0 { am := &(*authoritativeObj.AccessMethods)[0] am.AccessUrl = &struct { Headers *[]string `json:"headers,omitempty"` Url string `json:"url"` }{Url: hint} - if cacheAuthzMap != nil { - am.Authorizations = syfoncommon.AccessMethodAuthorizationsFromAuthzMap(cacheAuthzMap) - } } else { newAm := drsapi.AccessMethod{ Type: drsapi.AccessMethodTypeS3, @@ -92,11 +86,9 @@ func WriteObjectsForLFSFiles(builder drsobject.Builder, lfsFiles map[string]lfs. Url string `json:"url"` }{Url: hint}, } - if cacheAuthzMap != nil { - newAm.Authorizations = syfoncommon.AccessMethodAuthorizationsFromAuthzMap(cacheAuthzMap) - } authoritativeObj.AccessMethods = &[]drsapi.AccessMethod{newAm} } + ensureControlledAccess(authoritativeObj, builder.Organization, builder.Project) } if err := drsobject.WriteObject(common.DRS_OBJS_PATH, authoritativeObj, file.Oid); err != nil { @@ -108,3 +100,27 @@ func WriteObjectsForLFSFiles(builder drsobject.Builder, lfsFiles map[string]lfs. return nil } + +func ensureControlledAccess(obj *drsapi.DrsObject, org, project string) { + if obj == nil { + return + } + authzMap := syfoncommon.AuthzMapFromScope(org, project) + if len(authzMap) == 0 { + return + } + next := append([]string(nil), derefStringSlice(obj.ControlledAccess)...) + next = append(next, syfoncommon.AuthzMapToControlledAccess(authzMap)...) + normalized := syfoncommon.NormalizeAccessResources(next) + if len(normalized) == 0 { + return + } + obj.ControlledAccess = &normalized +} + +func derefStringSlice(ptr *[]string) []string { + if ptr == nil { + return nil + } + return append([]string(nil), (*ptr)...) 
+} diff --git a/internal/drsmap/drs_map_test.go b/internal/drsmap/drs_map_test.go index 00fd3b70..99bad3e8 100644 --- a/internal/drsmap/drs_map_test.go +++ b/internal/drsmap/drs_map_test.go @@ -17,7 +17,6 @@ import ( "github.com/calypr/git-drs/internal/lfs" "github.com/calypr/git-drs/internal/precommit_cache" drsapi "github.com/calypr/syfon/apigen/client/drs" - syfoncommon "github.com/calypr/syfon/common" ) func setupTestRepo(t *testing.T) { @@ -37,7 +36,7 @@ func setupTestRepo(t *testing.T) { t.Cleanup(func() { _ = os.Chdir(cwd) }) } -func TestWriteObjectsForLFSFilesBackfillsMissingAuthzWithoutOverwritingURL(t *testing.T) { +func TestWriteObjectsForLFSFilesBackfillsMissingControlledAccessWithoutOverwritingURL(t *testing.T) { setupTestRepo(t) oid := "dddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddd" @@ -73,23 +72,24 @@ func TestWriteObjectsForLFSFilesBackfillsMissingAuthzWithoutOverwritingURL(t *te if method.AccessUrl == nil || method.AccessUrl.Url != explicitURL { t.Fatalf("access url overwritten: %+v", method.AccessUrl) } - authz := syfoncommon.AuthzMapFromAccessMethodAuthorizations(method.Authorizations) - want := map[string][]string{"org": {"proj"}} - if !equalAuthzMaps(authz, want) { - t.Fatalf("unexpected authz: got=%v want=%v", authz, want) + if method.Authorizations != nil { + t.Fatalf("did not expect access method authorizations: %+v", method.Authorizations) + } + if !equalStringSlices(derefStringSlice(got.ControlledAccess), []string{"/organization/org/project/proj"}) { + t.Fatalf("unexpected controlled_access: %+v", derefStringSlice(got.ControlledAccess)) } } -func TestWriteObjectsForLFSFilesPreservesExistingAuthz(t *testing.T) { +func TestWriteObjectsForLFSFilesUnionsExistingControlledAccess(t *testing.T) { setupTestRepo(t) oid := "eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee" - existingAuthz := syfoncommon.AccessMethodAuthorizationsFromAuthzMap(map[string][]string{"keep": {"me"}}) + existingControlled := 
[]string{"/organization/keep/project/me"} if err := drsobject.WriteObject(common.DRS_OBJS_PATH, &drsapi.DrsObject{ - Id: "did-2", + Id: "did-2", + ControlledAccess: &existingControlled, AccessMethods: &[]drsapi.AccessMethod{{ - Type: drsapi.AccessMethodTypeS3, - Authorizations: existingAuthz, + Type: drsapi.AccessMethodTypeS3, }}, Checksums: []drsapi.Checksum{{Type: "sha256", Checksum: oid}}, }, oid); err != nil { @@ -109,14 +109,16 @@ func TestWriteObjectsForLFSFilesPreservesExistingAuthz(t *testing.T) { if err != nil { t.Fatalf("ReadObject error: %v", err) } - authz := syfoncommon.AuthzMapFromAccessMethodAuthorizations((*got.AccessMethods)[0].Authorizations) - want := map[string][]string{"keep": {"me"}} - if !equalAuthzMaps(authz, want) { - t.Fatalf("existing authz overwritten: got=%v want=%v", authz, want) + if (*got.AccessMethods)[0].Authorizations != nil { + t.Fatalf("did not expect access method authorizations: %+v", (*got.AccessMethods)[0].Authorizations) + } + want := []string{"/organization/keep/project/me", "/organization/org/project/proj"} + if !equalStringSlices(derefStringSlice(got.ControlledAccess), want) { + t.Fatalf("unexpected controlled_access: got=%v want=%v", derefStringSlice(got.ControlledAccess), want) } } -func TestWriteObjectsForLFSFilesPreferCacheURLPreservesAuthz(t *testing.T) { +func TestWriteObjectsForLFSFilesPreferCacheURLSetsControlledAccess(t *testing.T) { setupTestRepo(t) oid := "ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff" @@ -143,10 +145,11 @@ func TestWriteObjectsForLFSFilesPreferCacheURLPreservesAuthz(t *testing.T) { if method.AccessUrl == nil || method.AccessUrl.Url != "s3://cache/object" { t.Fatalf("expected cache URL, got %+v", method.AccessUrl) } - authz := syfoncommon.AuthzMapFromAccessMethodAuthorizations(method.Authorizations) - want := map[string][]string{"org": {"proj"}} - if !equalAuthzMaps(authz, want) { - t.Fatalf("unexpected authz after cache URL preference: got=%v want=%v", authz, want) + if 
method.Authorizations != nil { + t.Fatalf("did not expect access method authorizations: %+v", method.Authorizations) + } + if !equalStringSlices(derefStringSlice(got.ControlledAccess), []string{"/organization/org/project/proj"}) { + t.Fatalf("unexpected controlled_access after cache URL preference: %+v", derefStringSlice(got.ControlledAccess)) } } @@ -157,23 +160,14 @@ func testLogger(t *testing.T) *slog.Logger { return slog.New(slog.NewTextHandler(io.Discard, nil)) } -func equalAuthzMaps(got, want map[string][]string) bool { +func equalStringSlices(got, want []string) bool { if len(got) != len(want) { return false } - for org, wantProjects := range want { - gotProjects, ok := got[org] - if !ok { + for i := range want { + if got[i] != want[i] { return false } - if len(gotProjects) != len(wantProjects) { - return false - } - for i := range wantProjects { - if gotProjects[i] != wantProjects[i] { - return false - } - } } return true } diff --git a/internal/drsobject/object.go b/internal/drsobject/object.go index fd85bd38..1d295a6f 100644 --- a/internal/drsobject/object.go +++ b/internal/drsobject/object.go @@ -58,15 +58,16 @@ func ConvertToCandidate(obj *drsapi.DrsObject) drsapi.DrsObjectCandidate { return drsapi.DrsObjectCandidate{} } return drsapi.DrsObjectCandidate{ - AccessMethods: obj.AccessMethods, - Aliases: obj.Aliases, - Checksums: obj.Checksums, - Contents: obj.Contents, - Description: obj.Description, - MimeType: obj.MimeType, - Name: obj.Name, - Size: obj.Size, - Version: obj.Version, + AccessMethods: obj.AccessMethods, + Aliases: obj.Aliases, + Checksums: obj.Checksums, + Contents: obj.Contents, + ControlledAccess: obj.ControlledAccess, + Description: obj.Description, + MimeType: obj.MimeType, + Name: obj.Name, + Size: obj.Size, + Version: obj.Version, } } @@ -113,11 +114,12 @@ func BuildWithOptions(fileName string, checksum string, size int64, drsID string Url string `json:"url"` }{Url: accessURL}, } - if authzMap := 
syfoncommon.AuthzMapFromScope(opts.Organization, opts.Project); authzMap != nil { - am.Authorizations = syfoncommon.AccessMethodAuthorizationsFromAuthzMap(authzMap) - } ams := []drsapi.AccessMethod{am} obj.AccessMethods = &ams + if authzMap := syfoncommon.AuthzMapFromScope(opts.Organization, opts.Project); authzMap != nil { + controlled := syfoncommon.AuthzMapToControlledAccess(authzMap) + obj.ControlledAccess = &controlled + } return obj, nil } diff --git a/internal/drsremote/remote.go b/internal/drsremote/remote.go index 4d7c2a01..9d0336a5 100644 --- a/internal/drsremote/remote.go +++ b/internal/drsremote/remote.go @@ -33,6 +33,60 @@ func ObjectsByHash(ctx context.Context, drsCtx *config.GitContext, checksum stri return page.DrsObjects, nil } +func ObjectsByHashes(ctx context.Context, drsCtx *config.GitContext, checksums []string) (map[string][]drsapi.DrsObject, error) { + if drsCtx == nil || drsCtx.Client == nil { + return nil, fmt.Errorf("DRS client unavailable") + } + normalizedToOriginal := make(map[string]string, len(checksums)) + queryChecksums := make([]string, 0, len(checksums)) + for _, checksum := range checksums { + normalized := drsobject.NormalizeChecksum(checksum) + if normalized == "" { + continue + } + if _, exists := normalizedToOriginal[normalized]; exists { + continue + } + normalizedToOriginal[normalized] = checksum + queryChecksums = append(queryChecksums, normalized) + } + if len(queryChecksums) == 0 { + return map[string][]drsapi.DrsObject{}, nil + } + + page, err := drsCtx.Client.DRS().BatchGetObjectsByHash(ctx, queryChecksums) + if err != nil { + return nil, err + } + + results := make(map[string][]drsapi.DrsObject, len(normalizedToOriginal)) + for normalized, original := range normalizedToOriginal { + results[original] = nil + results[normalized] = nil + } + for _, obj := range page.DrsObjects { + for _, checksum := range obj.Checksums { + if checksum.Type == "" || checksum.Checksum == "" { + continue + } + normalized := 
drsobject.NormalizeChecksum(fmt.Sprintf("%s:%s", checksum.Type, checksum.Checksum)) + if normalized == "" { + continue + } + original, ok := normalizedToOriginal[normalized] + if !ok { + continue + } + results[original] = append(results[original], obj) + if original != normalized { + results[normalized] = append(results[normalized], obj) + } + } + } + + return results, nil +} + func ObjectsByHashForScope(ctx context.Context, drsCtx *config.GitContext, checksum string) ([]drsapi.DrsObject, error) { objects, err := ObjectsByHash(ctx, drsCtx, checksum) if err != nil { @@ -47,6 +101,24 @@ func ObjectsByHashForScope(ctx context.Context, drsCtx *config.GitContext, check return result, nil } +func ObjectsByHashesForScope(ctx context.Context, drsCtx *config.GitContext, checksums []string) (map[string][]drsapi.DrsObject, error) { + objectsByChecksum, err := ObjectsByHashes(ctx, drsCtx, checksums) + if err != nil { + return nil, err + } + results := make(map[string][]drsapi.DrsObject, len(objectsByChecksum)) + for checksum, objects := range objectsByChecksum { + filtered := make([]drsapi.DrsObject, 0, len(objects)) + for _, obj := range objects { + if MatchesScope(&obj, drsCtx.Organization, drsCtx.ProjectId) { + filtered = append(filtered, obj) + } + } + results[checksum] = filtered + } + return results, nil +} + func AccessURLForHashScope(ctx context.Context, drsCtx *config.GitContext, checksum string) (*drsapi.AccessURL, *drsapi.DrsObject, error) { records, err := ObjectsByHashForScope(ctx, drsCtx, checksum) if err != nil { diff --git a/internal/drsremote/remote_test.go b/internal/drsremote/remote_test.go index fadc40f3..4a9a6a4f 100644 --- a/internal/drsremote/remote_test.go +++ b/internal/drsremote/remote_test.go @@ -14,7 +14,6 @@ import ( drsapi "github.com/calypr/syfon/apigen/client/drs" syclient "github.com/calypr/syfon/client" sydownload "github.com/calypr/syfon/client/transfer/download" - syfoncommon "github.com/calypr/syfon/common" ) type roundTripFunc 
func(*http.Request) (*http.Response, error) @@ -76,23 +75,19 @@ func TestFindMatchingRecord_EmptyList(t *testing.T) { } } -func makeAuthzRecord(id, org, project string) drsapi.DrsObject { - authzMap := map[string][]string{org: {project}} - accessMethods := []drsapi.AccessMethod{{ - Type: "s3", - Authorizations: syfoncommon.AccessMethodAuthorizationsFromAuthzMap(authzMap), - }} +func makeScopedRecord(id, resource string) drsapi.DrsObject { + controlledAccess := []string{resource} return drsapi.DrsObject{ - Id: id, - AccessMethods: &accessMethods, - Checksums: []drsapi.Checksum{{Type: "sha256", Checksum: "sha256"}}, + Id: id, + ControlledAccess: &controlledAccess, + Checksums: []drsapi.Checksum{{Type: "sha256", Checksum: "sha256"}}, } } func TestFindMatchingRecord_MatchFound(t *testing.T) { records := []drsapi.DrsObject{ - makeAuthzRecord("no-match", "OTHER", "resource"), - makeAuthzRecord("match", "PROG", "PROJ"), + makeScopedRecord("no-match", "/organization/OTHER/project/resource"), + makeScopedRecord("match", "/organization/PROG/project/PROJ"), } result, err := FindMatchingRecord(records, "", "PROG-PROJ") @@ -100,20 +95,20 @@ func TestFindMatchingRecord_MatchFound(t *testing.T) { t.Fatalf("unexpected error: %v", err) } if result == nil || result.Id != "match" { - t.Fatalf("expected record id match, got %#v", result) + t.Fatalf("expected controlled_access record match, got %#v", result) } } -func TestFindMatchingRecord_NoAuthzMatchReturnsNil(t *testing.T) { +func TestFindMatchingRecord_NoControlledAccessMatchReturnsNil(t *testing.T) { records := []drsapi.DrsObject{ - makeAuthzRecord("no-match", "OTHER", "resource"), + makeScopedRecord("no-match", "/organization/OTHER/project/resource"), } result, err := FindMatchingRecord(records, "", "PROG-PROJ") if err != nil { t.Fatalf("unexpected error: %v", err) } if result != nil { - t.Fatalf("expected nil when no authz matches, got id=%q", result.Id) + t.Fatalf("expected nil when no controlled_access matches, got id=%q", 
result.Id) } } @@ -129,18 +124,18 @@ func TestAccessURLForHashScope_FiltersByScope(t *testing.T) { projectAccessID := "s3-project" orgAccessID := "s3-org" projectMethods := []drsapi.AccessMethod{{ - Type: drsapi.AccessMethodTypeS3, - AccessId: &projectAccessID, - Authorizations: syfoncommon.AccessMethodAuthorizationsFromAuthzMap(map[string][]string{"org1": {"proj1"}}), + Type: drsapi.AccessMethodTypeS3, + AccessId: &projectAccessID, }} orgMethods := []drsapi.AccessMethod{{ - Type: drsapi.AccessMethodTypeS3, - AccessId: &orgAccessID, - Authorizations: syfoncommon.AccessMethodAuthorizationsFromAuthzMap(map[string][]string{"org1": {}}), + Type: drsapi.AccessMethodTypeS3, + AccessId: &orgAccessID, }} + projectControlled := []string{"/organization/org1/project/proj1"} + orgControlled := []string{"/organization/org1"} checksumResponse := drsapi.N200OkDrsObjects{ResolvedDrsObject: &[]drsapi.DrsObject{ - {Id: "obj-project", Checksums: []drsapi.Checksum{{Type: "sha256", Checksum: "abc"}}, AccessMethods: &projectMethods}, - {Id: "obj-org", Checksums: []drsapi.Checksum{{Type: "sha256", Checksum: "abc"}}, AccessMethods: &orgMethods}, + {Id: "obj-project", ControlledAccess: &projectControlled, Checksums: []drsapi.Checksum{{Type: "sha256", Checksum: "abc"}}, AccessMethods: &projectMethods}, + {Id: "obj-org", ControlledAccess: &orgControlled, Checksums: []drsapi.Checksum{{Type: "sha256", Checksum: "abc"}}, AccessMethods: &orgMethods}, }} checksumBody, err := json.Marshal(checksumResponse) if err != nil { @@ -195,6 +190,75 @@ func TestAccessURLForHashScope_FiltersByScope(t *testing.T) { } } +func TestObjectsByHashesForScope_FiltersByScope(t *testing.T) { + t.Parallel() + + projectAccessID := "s3-project" + orgAccessID := "s3-org" + projectMethods := []drsapi.AccessMethod{{ + Type: drsapi.AccessMethodTypeS3, + AccessId: &projectAccessID, + }} + orgMethods := []drsapi.AccessMethod{{ + Type: drsapi.AccessMethodTypeS3, + AccessId: &orgAccessID, + }} + otherMethods := 
[]drsapi.AccessMethod{{ + Type: drsapi.AccessMethodTypeS3, + }} + projectControlled := []string{"/organization/org1/project/proj1"} + orgControlled := []string{"/organization/org1"} + otherControlled := []string{"/organization/other/project/proj"} + checksumResponse := drsapi.N200OkDrsObjects{ResolvedDrsObject: &[]drsapi.DrsObject{ + {Id: "obj-project", ControlledAccess: &projectControlled, Checksums: []drsapi.Checksum{{Type: "sha256", Checksum: "abc"}}, AccessMethods: &projectMethods}, + {Id: "obj-org", ControlledAccess: &orgControlled, Checksums: []drsapi.Checksum{{Type: "sha256", Checksum: "abc"}}, AccessMethods: &orgMethods}, + {Id: "obj-other", ControlledAccess: &otherControlled, Checksums: []drsapi.Checksum{{Type: "sha256", Checksum: "def"}}, AccessMethods: &otherMethods}, + }} + checksumBody, err := json.Marshal(checksumResponse) + if err != nil { + t.Fatalf("marshal checksum response: %v", err) + } + + httpClient := &http.Client{Transport: roundTripFunc(func(r *http.Request) (*http.Response, error) { + switch { + case r.Method == http.MethodGet && r.URL.Path == "/ga4gh/drs/v1/objects/checksum/abc": + return &http.Response{ + StatusCode: http.StatusOK, + Body: io.NopCloser(strings.NewReader(string(checksumBody))), + Header: http.Header{"Content-Type": []string{"application/json"}}, + Request: r, + }, nil + case r.Method == http.MethodGet && r.URL.Path == "/ga4gh/drs/v1/objects/checksum/def": + return &http.Response{ + StatusCode: http.StatusOK, + Body: io.NopCloser(strings.NewReader(`{"resolved_drs_object":[]}`)), + Header: http.Header{"Content-Type": []string{"application/json"}}, + Request: r, + }, nil + default: + return nil, io.EOF + } + })} + + raw, err := syclient.New("http://example.test", syclient.WithHTTPClient(httpClient)) + if err != nil { + t.Fatalf("syclient.New: %v", err) + } + client := raw.(*syclient.Client) + ctx := &config.GitContext{Client: client, Organization: "org1", ProjectId: "proj1"} + + got, err := 
ObjectsByHashesForScope(context.Background(), ctx, []string{"sha256:abc", "sha256:def"}) + if err != nil { + t.Fatalf("ObjectsByHashesForScope returned error: %v", err) + } + if len(got["sha256:abc"]) != 2 { + t.Fatalf("expected project and org-wide matches for abc, got %+v", got["sha256:abc"]) + } + if len(got["sha256:def"]) != 0 { + t.Fatalf("expected non-matching scope to be filtered, got %+v", got["sha256:def"]) + } +} + func TestDownloadResolvedToPath_RangeIgnoredRestartsDownload(t *testing.T) { t.Parallel() diff --git a/internal/drsremote/scope.go b/internal/drsremote/scope.go index 4692f8f3..91aa92ba 100644 --- a/internal/drsremote/scope.go +++ b/internal/drsremote/scope.go @@ -24,17 +24,8 @@ func FindMatchingRecord(records []drsapi.DrsObject, organization, projectID stri } for _, record := range records { - if record.AccessMethods == nil { - continue - } - for _, access := range *record.AccessMethods { - authzMap := syfoncommon.AuthzMapFromAccessMethodAuthorizations(access.Authorizations) - if len(authzMap) == 0 { - continue - } - if syfoncommon.AuthzMapMatchesScope(authzMap, org, project) { - return &record, nil - } + if MatchesScope(&record, org, project) { + return &record, nil } } return nil, nil diff --git a/internal/gitrepo/bucket_scope.go b/internal/gitrepo/bucket_scope.go index 643aeee3..7ad45d55 100644 --- a/internal/gitrepo/bucket_scope.go +++ b/internal/gitrepo/bucket_scope.go @@ -66,7 +66,10 @@ func ResolveBucketScope(organization, project, configuredBucket, configuredPrefi } if configuredBucket == "" { - return ResolvedBucketScope{}, fmt.Errorf("bucket is required (or configure mapping with `git drs bucket add-organization --organization %s --path :///`)", organization) + if project != "" { + return ResolvedBucketScope{}, fmt.Errorf("no bucket mapping found for organization=%q project=%q; configure one first with `git drs bucket add-organization --organization %s --path :///` or `git drs bucket add-project --organization %s --project %s --path 
:///`", organization, project, organization, organization, project) + } + return ResolvedBucketScope{}, fmt.Errorf("no bucket mapping found for organization=%q; configure one first with `git drs bucket add-organization --organization %s --path :///`", organization, organization) } return ResolvedBucketScope{ Bucket: configuredBucket, diff --git a/internal/lfs/inventory.go b/internal/lfs/inventory.go index 69b490d6..897ddf16 100644 --- a/internal/lfs/inventory.go +++ b/internal/lfs/inventory.go @@ -3,6 +3,7 @@ package lfs import ( "bytes" "context" + "errors" "fmt" "log/slog" "os" @@ -45,7 +46,7 @@ func IsLFSTracked(path string) (bool, error) { if len(fields) < 3 { return false, nil } - return strings.TrimSpace(fields[2]) == "lfs", nil + return isTrackedFilter(strings.TrimSpace(fields[2])), nil } func GetAllLfsFiles(gitRemoteName, gitRemoteLocation string, branches []string, logger *slog.Logger) (map[string]LfsFileInfo, error) { @@ -80,28 +81,119 @@ func GetAllLfsFiles(gitRemoteName, gitRemoteLocation string, branches []string, return lfsFileMap, nil } -func addFilesFromRef(ctx context.Context, repoDir, ref string, logger *slog.Logger, lfsFileMap map[string]LfsFileInfo) error { - out, err := runGitCommand(ctx, repoDir, "ls-tree", "-r", "-z", "--long", ref) +// GetLfsFilesForRefs scans arbitrary refs or SHAs and returns the LFS pointer +// files present in those trees. 
+func GetLfsFilesForRefs(refs []string, logger *slog.Logger) (map[string]LfsFileInfo, error) { + if logger == nil { + return nil, fmt.Errorf("logger is required") + } + repoDir, err := os.Getwd() if err != nil { - return fmt.Errorf("git ls-tree failed for %s: %w", ref, err) + return nil, err } - entries := strings.Split(out, "\x00") - for _, entry := range entries { - entry = strings.TrimSpace(entry) - if entry == "" { + ctx := context.Background() + lfsFileMap := make(map[string]LfsFileInfo) + seen := make(map[string]struct{}, len(refs)) + for _, ref := range refs { + ref = strings.TrimSpace(ref) + if ref == "" { + continue + } + if _, ok := seen[ref]; ok { continue } + seen[ref] = struct{}{} + if err := addFilesFromRef(ctx, repoDir, ref, logger, lfsFileMap); err != nil { + return nil, err + } + } + return lfsFileMap, nil +} - oid, path, err := parseLsTreeEntry(entry) +// GetWorktreeLfsFiles scans the current checkout and returns tracked files whose +// worktree content is currently a valid Git LFS pointer. This is the fast path +// for interactive commands like `git-drs ls-files`. 
+func GetWorktreeLfsFiles(logger *slog.Logger) (map[string]LfsFileInfo, error) { + if logger == nil { + return nil, fmt.Errorf("logger is required") + } + repoDir, err := os.Getwd() + if err != nil { + return nil, err + } + logger.Debug("Scanning current worktree for LFS pointer files") + ctx := context.Background() + paths, err := listTrackedWorktreeFiles(ctx, repoDir) + if err != nil { + return nil, err + } + files := make(map[string]LfsFileInfo) + for _, path := range paths { + payload, err := os.ReadFile(filepath.Join(repoDir, filepath.FromSlash(path))) if err != nil { - logger.Debug(fmt.Sprintf("skipping unparseable ls-tree entry for %s: %q", ref, entry)) continue } + pointer, ok := parseLFSPointer(string(payload)) + if !ok { + continue + } + files[path] = LfsFileInfo{ + Name: path, + Size: pointer.Size, + IsPointer: true, + OidType: pointer.OidType, + Oid: pointer.Oid, + Version: pointer.Version, + } + } + return files, nil +} + +// GetTrackedLfsFiles scans the current checkout and returns files that are LFS +// tracked according to Git attributes. Pointer metadata is taken from the +// worktree when still present, or from the index when the worktree has already +// been hydrated. 
+func GetTrackedLfsFiles(logger *slog.Logger) (map[string]LfsFileInfo, error) { + if logger == nil { + return nil, fmt.Errorf("logger is required") + } + repoDir, err := os.Getwd() + if err != nil { + return nil, err + } + logger.Debug("Scanning current worktree for LFS-tracked files") + ctx := context.Background() + paths, err := listTrackedWorktreeFiles(ctx, repoDir) + if err != nil { + return nil, err + } + tracked, err := filterLfsTrackedPaths(ctx, repoDir, paths) + if err != nil { + return nil, err + } + files := make(map[string]LfsFileInfo, len(tracked)) + for _, path := range tracked { + if info, ok := readWorktreePointerInfo(repoDir, path); ok { + files[path] = info + continue + } + if info, ok := readIndexPointerInfo(ctx, repoDir, path); ok { + files[path] = info + } + } + return files, nil +} - blob, err := runGitCommand(ctx, repoDir, "cat-file", "-p", oid) +func addFilesFromRef(ctx context.Context, repoDir, ref string, logger *slog.Logger, lfsFileMap map[string]LfsFileInfo) error { + paths, err := grepPointerPaths(ctx, repoDir, ref) + if err != nil { + return fmt.Errorf("git grep failed for %s: %w", ref, err) + } + for _, path := range paths { + blob, err := runGitCommand(ctx, repoDir, "show", fmt.Sprintf("%s:%s", ref, path)) if err != nil { - logger.Debug(fmt.Sprintf("skipping path %s in %s: unable to read blob %s", path, ref, oid)) + logger.Debug(fmt.Sprintf("skipping path %s in %s: unable to read blob", path, ref)) continue } @@ -123,9 +215,31 @@ func addFilesFromRef(ctx context.Context, repoDir, ref string, logger *slog.Logg return nil } -func runGitCommand(ctx context.Context, repoDir string, args ...string) (string, error) { - cmd := exec.CommandContext(ctx, "git", args...) 
+func listTrackedWorktreeFiles(ctx context.Context, repoDir string) ([]string, error) { + out, err := runGitCommand(ctx, repoDir, "ls-files", "-z") + if err != nil { + return nil, fmt.Errorf("git ls-files failed: %w", err) + } + raw := strings.Split(out, "\x00") + paths := make([]string, 0, len(raw)) + for _, entry := range raw { + entry = strings.TrimSpace(entry) + if entry == "" { + continue + } + paths = append(paths, entry) + } + return paths, nil +} + +func filterLfsTrackedPaths(ctx context.Context, repoDir string, paths []string) ([]string, error) { + if len(paths) == 0 { + return nil, nil + } + + cmd := exec.CommandContext(ctx, "git", "check-attr", "-z", "--stdin", "filter") cmd.Dir = repoDir + cmd.Stdin = strings.NewReader(strings.Join(paths, "\x00") + "\x00") var stdout, stderr bytes.Buffer cmd.Stdout = &stdout cmd.Stderr = &stderr @@ -134,31 +248,122 @@ func runGitCommand(ctx context.Context, repoDir string, args ...string) (string, if msg == "" { msg = err.Error() } - return "", fmt.Errorf("%s", msg) + return nil, fmt.Errorf("git check-attr failed: %s", msg) } - return stdout.String(), nil + + raw := strings.Split(stdout.String(), "\x00") + filtered := make([]string, 0, len(paths)) + for i := 0; i+2 < len(raw); i += 3 { + path := strings.TrimSpace(raw[i]) + attr := strings.TrimSpace(raw[i+1]) + value := strings.TrimSpace(raw[i+2]) + if path == "" || attr != "filter" { + continue + } + if isTrackedFilter(value) { + filtered = append(filtered, path) + } + } + return filtered, nil } -func parseLsTreeEntry(entry string) (string, string, error) { - tab := strings.Index(entry, "\t") - if tab < 0 { - return "", "", fmt.Errorf("missing tab separator") +func isTrackedFilter(value string) bool { + switch strings.TrimSpace(value) { + case "lfs", "drs": + return true + default: + return false } +} - meta := strings.Fields(entry[:tab]) - if len(meta) < 3 { - return "", "", fmt.Errorf("invalid ls-tree metadata") +func readWorktreePointerInfo(repoDir, path string) 
(LfsFileInfo, bool) { + payload, err := os.ReadFile(filepath.Join(repoDir, filepath.FromSlash(path))) + if err != nil { + return LfsFileInfo{}, false } - if meta[1] != "blob" { - return "", "", fmt.Errorf("not a blob entry") + pointer, ok := parseLFSPointer(string(payload)) + if !ok { + return LfsFileInfo{}, false } + return LfsFileInfo{ + Name: path, + Size: pointer.Size, + IsPointer: true, + OidType: pointer.OidType, + Oid: pointer.Oid, + Version: pointer.Version, + }, true +} - oid := strings.TrimSpace(meta[2]) - path := strings.TrimSpace(entry[tab+1:]) - if oid == "" || path == "" { - return "", "", fmt.Errorf("missing oid or path") +func readIndexPointerInfo(ctx context.Context, repoDir, path string) (LfsFileInfo, bool) { + blob, err := runGitCommand(ctx, repoDir, "show", ":"+path) + if err != nil { + return LfsFileInfo{}, false + } + pointer, ok := parseLFSPointer(blob) + if !ok { + return LfsFileInfo{}, false } - return oid, path, nil + return LfsFileInfo{ + Name: path, + Size: pointer.Size, + IsPointer: false, + OidType: pointer.OidType, + Oid: pointer.Oid, + Version: pointer.Version, + }, true +} + +func grepPointerPaths(ctx context.Context, repoDir, ref string) ([]string, error) { + cmd := exec.CommandContext(ctx, "git", "grep", "-z", "-l", "https://git-lfs.github.com/spec/v1", ref, "--") + cmd.Dir = repoDir + var stdout, stderr bytes.Buffer + cmd.Stdout = &stdout + cmd.Stderr = &stderr + err := cmd.Run() + if err != nil { + var exitErr *exec.ExitError + if errors.As(err, &exitErr) && exitErr.ExitCode() == 1 { + return nil, nil + } + msg := strings.TrimSpace(stderr.String()) + if msg == "" { + msg = err.Error() + } + return nil, fmt.Errorf("%s", msg) + } + + raw := strings.Split(stdout.String(), "\x00") + paths := make([]string, 0, len(raw)) + prefix := ref + ":" + for _, entry := range raw { + entry = strings.TrimSpace(entry) + if entry == "" { + continue + } + path := entry + if strings.HasPrefix(path, prefix) { + path = strings.TrimPrefix(path, prefix) 
+ } + paths = append(paths, path) + } + return paths, nil +} + +func runGitCommand(ctx context.Context, repoDir string, args ...string) (string, error) { + cmd := exec.CommandContext(ctx, "git", args...) + cmd.Dir = repoDir + var stdout, stderr bytes.Buffer + cmd.Stdout = &stdout + cmd.Stderr = &stderr + if err := cmd.Run(); err != nil { + msg := strings.TrimSpace(stderr.String()) + if msg == "" { + msg = err.Error() + } + return "", fmt.Errorf("%s", msg) + } + return stdout.String(), nil } type lfsPointer struct { diff --git a/internal/lfs/inventory_test.go b/internal/lfs/inventory_test.go index 9b876760..ba7b3c4a 100644 --- a/internal/lfs/inventory_test.go +++ b/internal/lfs/inventory_test.go @@ -14,6 +14,10 @@ func TestGetAllLfsFilesFromGitRefsWithoutLfsCli(t *testing.T) { runGitCmdTest(t, repo, "init") runGitCmdTest(t, repo, "config", "user.email", "test@example.com") runGitCmdTest(t, repo, "config", "user.name", "Test User") + runGitCmdTest(t, repo, "config", "filter.lfs.clean", "cat") + runGitCmdTest(t, repo, "config", "filter.lfs.smudge", "cat") + runGitCmdTest(t, repo, "config", "filter.lfs.process", "cat") + runGitCmdTest(t, repo, "config", "filter.lfs.required", "false") runGitCmdTest(t, repo, "checkout", "-b", "main") oidMain := "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa" @@ -88,6 +92,120 @@ func TestGetAllLfsFilesFromGitRefsWithoutLfsCli(t *testing.T) { } } +func TestGetWorktreeLfsFiles(t *testing.T) { + repo := t.TempDir() + runGitCmdTest(t, repo, "init") + runGitCmdTest(t, repo, "config", "user.email", "test@example.com") + runGitCmdTest(t, repo, "config", "user.name", "Test User") + + oid := "cccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc" + pointerPath := filepath.Join(repo, "data", "pointer.dat") + writePointerFile(t, pointerPath, oid, "789") + + localizedPath := filepath.Join(repo, "data", "localized.bin") + if err := os.WriteFile(localizedPath, []byte("hydrated"), 0o644); err != nil { + t.Fatalf("write 
localized file: %v", err) + } + + runGitCmdTest(t, repo, "add", ".") + runGitCmdTest(t, repo, "commit", "-m", "commit pointer") + + if err := os.WriteFile(pointerPath, []byte("hydrated pointer replacement"), 0o644); err != nil { + t.Fatalf("replace pointer with hydrated content: %v", err) + } + + oldWD, err := os.Getwd() + if err != nil { + t.Fatalf("getwd: %v", err) + } + if err := os.Chdir(repo); err != nil { + t.Fatalf("chdir repo: %v", err) + } + t.Cleanup(func() { + _ = os.Chdir(oldWD) + }) + + logger := drslog.NewNoOpLogger() + files, err := GetWorktreeLfsFiles(logger) + if err != nil { + t.Fatalf("GetWorktreeLfsFiles error: %v", err) + } + if _, exists := files["data/pointer.dat"]; exists { + t.Fatalf("hydrated file should not still appear as a pointer") + } + + if err := os.WriteFile(pointerPath, []byte("version https://git-lfs.github.com/spec/v1\noid sha256:"+oid+"\nsize 789\n"), 0o644); err != nil { + t.Fatalf("restore pointer file: %v", err) + } + + files, err = GetWorktreeLfsFiles(logger) + if err != nil { + t.Fatalf("GetWorktreeLfsFiles error after restore: %v", err) + } + info, ok := files["data/pointer.dat"] + if !ok { + t.Fatalf("expected pointer in worktree inventory") + } + if info.Oid != oid || info.Size != 789 { + t.Fatalf("unexpected pointer info: %+v", info) + } +} + +func TestGetTrackedLfsFiles_IncludesHydratedTrackedFileUsingIndexPointer(t *testing.T) { + repo := t.TempDir() + runGitCmdTest(t, repo, "init") + runGitCmdTest(t, repo, "config", "user.email", "test@example.com") + runGitCmdTest(t, repo, "config", "user.name", "Test User") + runGitCmdTest(t, repo, "config", "filter.drs.clean", "cat") + runGitCmdTest(t, repo, "config", "filter.drs.smudge", "cat") + runGitCmdTest(t, repo, "config", "filter.drs.process", "cat") + runGitCmdTest(t, repo, "config", "filter.drs.required", "false") + + attrPath := filepath.Join(repo, ".gitattributes") + if err := os.WriteFile(attrPath, []byte("*.dat filter=drs diff=drs merge=drs -text\n"), 0o644); err != 
nil { + t.Fatalf("write .gitattributes: %v", err) + } + + oid := "dddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddddd" + pointerPath := filepath.Join(repo, "data", "hydrated.dat") + writePointerFile(t, pointerPath, oid, "321") + + runGitCmdTest(t, repo, "add", ".") + runGitCmdTest(t, repo, "commit", "-m", "commit tracked pointer") + + if err := os.WriteFile(pointerPath, []byte("localized payload"), 0o644); err != nil { + t.Fatalf("hydrate tracked file: %v", err) + } + + oldWD, err := os.Getwd() + if err != nil { + t.Fatalf("getwd: %v", err) + } + if err := os.Chdir(repo); err != nil { + t.Fatalf("chdir repo: %v", err) + } + t.Cleanup(func() { + _ = os.Chdir(oldWD) + }) + + logger := drslog.NewNoOpLogger() + files, err := GetTrackedLfsFiles(logger) + if err != nil { + t.Fatalf("GetTrackedLfsFiles error: %v", err) + } + + info, ok := files["data/hydrated.dat"] + if !ok { + t.Fatalf("expected hydrated tracked file in inventory") + } + if info.Oid != oid || info.Size != 321 { + t.Fatalf("unexpected hydrated tracked info: %+v", info) + } + if info.IsPointer { + t.Fatalf("expected hydrated tracked file to be marked non-pointer in worktree inventory") + } +} + func writePointerFile(t *testing.T, path, oid, size string) { t.Helper() if err := os.MkdirAll(filepath.Dir(path), 0o755); err != nil { @@ -118,7 +236,7 @@ func TestIsLFSTracked(t *testing.T) { repo := t.TempDir() mustRun(t, repo, "git", "init") - attr := []byte("*.dat filter=lfs diff=lfs merge=lfs -text\n") + attr := []byte("*.dat filter=drs diff=drs merge=drs -text\n") if err := os.WriteFile(filepath.Join(repo, ".gitattributes"), attr, 0o644); err != nil { t.Fatalf("write .gitattributes: %v", err) } diff --git a/internal/lfs/sentinel.go b/internal/lfs/sentinel.go deleted file mode 100644 index a47beab5..00000000 --- a/internal/lfs/sentinel.go +++ /dev/null @@ -1,65 +0,0 @@ -package lfs - -import ( - "crypto/sha256" - "fmt" - "os" - "path/filepath" - "strings" -) - -const addURLSentinelHeader = 
"git-drs-add-url-sentinel:v1\n" - -func SyntheticOIDFromETag(etag string) (string, error) { - e := strings.TrimSpace(strings.Trim(etag, `"`)) - if e == "" { - return "", fmt.Errorf("etag is required for synthetic oid") - } - sum := sha256.Sum256([]byte(e)) - return fmt.Sprintf("%x", sum[:]), nil -} - -func buildAddURLSentinel(etag string, sourceURL string) ([]byte, error) { - e := strings.TrimSpace(strings.Trim(etag, `"`)) - if e == "" { - return nil, fmt.Errorf("etag is required for sentinel") - } - return []byte(addURLSentinelHeader + "etag=" + e + "\nsource=" + strings.TrimSpace(sourceURL) + "\n"), nil -} - -func IsAddURLSentinelBytes(data []byte) bool { - return strings.HasPrefix(string(data), addURLSentinelHeader) -} - -func IsAddURLSentinelObject(path string) (bool, error) { - f, err := os.Open(path) - if err != nil { - return false, err - } - defer f.Close() - - buf := make([]byte, len(addURLSentinelHeader)) - n, err := f.Read(buf) - if err != nil && n == 0 { - return false, err - } - return IsAddURLSentinelBytes(buf[:n]), nil -} - -func WriteAddURLSentinelObject(lfsRoot string, oid string, etag string, sourceURL string) (string, error) { - objPath, err := ObjectPath(filepath.Join(lfsRoot, "objects"), oid) - if err != nil { - return "", err - } - if err := os.MkdirAll(filepath.Dir(objPath), 0o755); err != nil { - return "", fmt.Errorf("mkdir %s: %w", filepath.Dir(objPath), err) - } - payload, err := buildAddURLSentinel(etag, sourceURL) - if err != nil { - return "", err - } - if err := os.WriteFile(objPath, payload, 0o644); err != nil { - return "", fmt.Errorf("write sentinel %s: %w", objPath, err) - } - return objPath, nil -} diff --git a/internal/lfs/sentinel_test.go b/internal/lfs/sentinel_test.go deleted file mode 100644 index 74d5b9e5..00000000 --- a/internal/lfs/sentinel_test.go +++ /dev/null @@ -1,66 +0,0 @@ -package lfs - -import ( - "os" - "path/filepath" - "testing" -) - -func TestSyntheticOIDFromETag(t *testing.T) { - oid, err := 
SyntheticOIDFromETag("abcd1234") - if err != nil { - t.Fatalf("unexpected error: %v", err) - } - if len(oid) != 64 { - t.Fatalf("expected 64-char oid, got %q", oid) - } -} - -func TestWriteAndDetectAddURLSentinelObject(t *testing.T) { - root := t.TempDir() - oid, err := SyntheticOIDFromETag("etag-abc") - if err != nil { - t.Fatalf("unexpected error: %v", err) - } - path, err := WriteAddURLSentinelObject(root, oid, "etag-abc", "s3://bucket/key") - if err != nil { - t.Fatalf("write sentinel: %v", err) - } - if _, err := os.Stat(path); err != nil { - t.Fatalf("expected sentinel file: %v", err) - } - ok, err := IsAddURLSentinelObject(path) - if err != nil { - t.Fatalf("IsAddURLSentinelObject error: %v", err) - } - if !ok { - t.Fatalf("expected sentinel detection true") - } -} - -func TestIsAddURLSentinelBytes(t *testing.T) { - payload, err := buildAddURLSentinel("etag", "s3://bucket/key") - if err != nil { - t.Fatalf("build sentinel: %v", err) - } - if !IsAddURLSentinelBytes(payload) { - t.Fatalf("expected sentinel bytes to be detected") - } - non := []byte("not-a-sentinel") - if IsAddURLSentinelBytes(non) { - t.Fatalf("did not expect non-sentinel to match") - } -} - -func TestWriteAddURLSentinelObjectCreatesDirectories(t *testing.T) { - root := t.TempDir() - oid := "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa" - path, err := WriteAddURLSentinelObject(root, oid, "etag", "s3://bucket/key") - if err != nil { - t.Fatalf("unexpected error: %v", err) - } - expected := filepath.Join(root, "objects", oid[:2], oid[2:4], oid) - if path != expected { - t.Fatalf("expected %s, got %s", expected, path) - } -} diff --git a/internal/pathspec/match.go b/internal/pathspec/match.go new file mode 100644 index 00000000..ba034cd8 --- /dev/null +++ b/internal/pathspec/match.go @@ -0,0 +1,62 @@ +package pathspec + +import ( + "path/filepath" + "regexp" + "strings" +) + +func MatchesAny(path string, patterns []string) bool { + if len(patterns) == 0 { + return true + } + 
normalized := filepath.ToSlash(filepath.Clean(path)) + for _, pattern := range patterns { + pattern = strings.TrimSpace(pattern) + if pattern == "" { + continue + } + if Matches(normalized, pattern) { + return true + } + } + return false +} + +func Matches(path, pattern string) bool { + pattern = filepath.ToSlash(filepath.Clean(pattern)) + if !strings.ContainsAny(pattern, "*?[") { + return path == pattern + } + re, err := regexp.Compile(globToRegexp(pattern)) + if err != nil { + return false + } + return re.MatchString(path) +} + +func globToRegexp(pattern string) string { + var b strings.Builder + b.WriteString("^") + for i := 0; i < len(pattern); i++ { + ch := pattern[i] + switch ch { + case '*': + if i+1 < len(pattern) && pattern[i+1] == '*' { + b.WriteString(".*") + i++ + continue + } + b.WriteString(`[^/]*`) + case '?': + b.WriteString(`[^/]`) + case '.', '+', '(', ')', '|', '^', '$', '{', '}', '[', ']', '\\': + b.WriteByte('\\') + b.WriteByte(ch) + default: + b.WriteByte(ch) + } + } + b.WriteString("$") + return b.String() +} diff --git a/internal/pathspec/match_test.go b/internal/pathspec/match_test.go new file mode 100644 index 00000000..f8ac42b5 --- /dev/null +++ b/internal/pathspec/match_test.go @@ -0,0 +1,22 @@ +package pathspec + +import "testing" + +func TestMatches(t *testing.T) { + cases := []struct { + pattern string + path string + want bool + }{ + {pattern: "data/**", path: "data/a/b/file.bin", want: true}, + {pattern: "*.bam", path: "data/file.bam", want: false}, + {pattern: "data/*.bam", path: "data/file.bam", want: true}, + {pattern: "a/file.txt", path: "a/file.txt", want: true}, + {pattern: "a/file.txt", path: "a/other.txt", want: false}, + } + for _, tc := range cases { + if got := Matches(tc.path, tc.pattern); got != tc.want { + t.Fatalf("Matches(%q, %q) = %v, want %v", tc.path, tc.pattern, got, tc.want) + } + } +} diff --git a/internal/precommit_cache/helpers.go b/internal/precommit_cache/helpers.go index 0a178621..cf4bae7e 100644 --- 
a/internal/precommit_cache/helpers.go +++ b/internal/precommit_cache/helpers.go @@ -169,6 +169,46 @@ func (c *Cache) ReadOIDEntry(oid string) (*OIDEntry, bool, error) { return &oe, true, nil } +func (c *Cache) EnsureLayout() error { + for _, dir := range []string{c.Root, c.PathsDir, c.OIDsDir} { + if err := os.MkdirAll(dir, 0o755); err != nil { + return err + } + } + return nil +} + +func (c *Cache) UpsertPathEntry(entry PathEntry) error { + if err := c.EnsureLayout(); err != nil { + return err + } + return writeJSONAtomic(c.pathEntryFile(entry.Path), entry) +} + +func (c *Cache) AddOrReplaceOIDPath(oid, oldPath, newPath, now string, contentChanged bool) error { + if err := c.EnsureLayout(); err != nil { + return err + } + return oidAddOrReplacePath(c.OIDsDir, oid, oldPath, newPath, now, contentChanged) +} + +func (c *Cache) RemovePathFromOID(oid, path, now string) error { + if err := c.EnsureLayout(); err != nil { + return err + } + return oidRemovePath(c.OIDsDir, oid, path, now) +} + +func (c *Cache) DeletePathEntry(path string) error { + if err := c.EnsureLayout(); err != nil { + return err + } + if err := os.Remove(c.pathEntryFile(path)); err != nil && !errors.Is(err, fs.ErrNotExist) { + return err + } + return nil +} + // // Validation helpers (optional) // @@ -276,3 +316,97 @@ func git(ctx context.Context, args ...string) ([]byte, error) { } return out, nil } + +func writeJSONAtomic(path string, v any) error { + dir := filepath.Dir(path) + if err := os.MkdirAll(dir, 0o755); err != nil { + return err + } + tmp, err := os.CreateTemp(dir, ".tmp-*.json") + if err != nil { + return err + } + tmpName := tmp.Name() + enc := json.NewEncoder(tmp) + enc.SetIndent("", " ") + if err := enc.Encode(v); err != nil { + _ = tmp.Close() + _ = os.Remove(tmpName) + return err + } + if err := tmp.Close(); err != nil { + _ = os.Remove(tmpName) + return err + } + return os.Rename(tmpName, path) +} + +func oidAddOrReplacePath(oidsDir, oid, oldPath, newPath, now string, 
contentChanged bool) error { + f := oidEntryFilePath(oidsDir, oid) + var oe OIDEntry + if b, err := os.ReadFile(f); err == nil { + _ = json.Unmarshal(b, &oe) + } + if oe.LFSOID == "" { + oe.LFSOID = oid + } + pathsSet := make(map[string]struct{}, len(oe.Paths)+1) + for _, p := range oe.Paths { + p = strings.TrimSpace(p) + if p == "" || p == oldPath { + continue + } + pathsSet[p] = struct{}{} + } + if strings.TrimSpace(newPath) != "" { + pathsSet[newPath] = struct{}{} + } + oe.Paths = oe.Paths[:0] + for p := range pathsSet { + oe.Paths = append(oe.Paths, p) + } + sort.Strings(oe.Paths) + oe.UpdatedAt = now + oe.ContentChange = contentChanged + return writeJSONAtomic(f, oe) +} + +func oidRemovePath(oidsDir, oid, path, now string) error { + if strings.TrimSpace(oid) == "" || strings.TrimSpace(path) == "" { + return nil + } + f := oidEntryFilePath(oidsDir, oid) + b, err := os.ReadFile(f) + if err != nil { + if errors.Is(err, fs.ErrNotExist) { + return nil + } + return err + } + var oe OIDEntry + if err := json.Unmarshal(b, &oe); err != nil { + return err + } + filtered := oe.Paths[:0] + for _, existing := range oe.Paths { + if strings.TrimSpace(existing) == "" || existing == path { + continue + } + filtered = append(filtered, existing) + } + oe.Paths = filtered + oe.UpdatedAt = now + if len(oe.Paths) == 0 { + if err := os.Remove(f); err != nil && !errors.Is(err, fs.ErrNotExist) { + return err + } + return nil + } + sort.Strings(oe.Paths) + return writeJSONAtomic(f, oe) +} + +func oidEntryFilePath(oidsDir, oid string) string { + sum := sha256.Sum256([]byte(oid)) + return filepath.Join(oidsDir, fmt.Sprintf("%x.json", sum[:])) +} diff --git a/internal/pushsync/batch_sync.go b/internal/pushsync/batch_sync.go index 427b9444..8a783cf7 100644 --- a/internal/pushsync/batch_sync.go +++ b/internal/pushsync/batch_sync.go @@ -3,9 +3,11 @@ package pushsync import ( "context" "fmt" + "net/url" "os" "path/filepath" "sort" + "strings" localcommon 
"github.com/calypr/git-drs/internal/common" "github.com/calypr/git-drs/internal/config" @@ -13,6 +15,7 @@ import ( "github.com/calypr/git-drs/internal/drsremote" "github.com/calypr/git-drs/internal/lfs" drsapi "github.com/calypr/syfon/apigen/client/drs" + sycommon "github.com/calypr/syfon/client/common" "github.com/calypr/syfon/client/hash" "github.com/google/uuid" "golang.org/x/sync/errgroup" @@ -21,6 +24,7 @@ import ( type batchSyncSession struct { ctx context.Context rt *pushRuntime + reporter UploadProgressReporter filesByOID map[string]lfs.LfsFileInfo oids []string drsObjByOID map[string]*drsapi.DrsObject @@ -37,10 +41,11 @@ type uploadCandidate struct { } // BatchSyncForPush performs checksum-first push preparation. -func BatchSyncForPush(cl *config.GitContext, ctx context.Context, files map[string]lfs.LfsFileInfo) error { +func BatchSyncForPush(cl *config.GitContext, ctx context.Context, files map[string]lfs.LfsFileInfo, reporter UploadProgressReporter) error { session := &batchSyncSession{ ctx: ctx, rt: newPushRuntime(cl), + reporter: reporter, drsObjByOID: make(map[string]*drsapi.DrsObject), existingByHash: make(map[string][]drsapi.DrsObject), registeredOids: make(map[string]bool), @@ -196,9 +201,10 @@ func scopedDRSObjectForPush(rt *pushRuntime, oid string, path string, size int64 return obj, nil } - if existing.AccessMethods != nil && len(*existing.AccessMethods) > 0 { + if shouldPreserveExistingAccessMethodsForPush(existing, obj, oid) { obj.AccessMethods = existing.AccessMethods } + obj.Aliases = existing.Aliases obj.Contents = existing.Contents obj.Description = existing.Description @@ -215,10 +221,69 @@ func scopedDRSObjectForPush(rt *pushRuntime, oid string, path string, size int64 return obj, nil } +func shouldPreserveExistingAccessMethodsForPush(existing *drsapi.DrsObject, generated *drsapi.DrsObject, oid string) bool { + existingURL := firstAccessURL(existing) + if existingURL == "" { + return false + } + generatedURL := firstAccessURL(generated) 
+ if existingURL == generatedURL { + return true + } + existingBucket, existingKey, existingOK := parseStorageURL(existingURL) + if !existingOK { + return true + } + generatedBucket, generatedKey, generatedOK := parseStorageURL(generatedURL) + normalizedOID := strings.Trim(strings.TrimPrefix(strings.TrimSpace(oid), "sha256:"), "/") + existingKey = strings.Trim(existingKey, "/") + if strings.EqualFold(existingBucket, "objects") { + return false + } + if generatedOK && existingKey == "" { + return false + } + if generatedOK && !strings.EqualFold(existingBucket, generatedBucket) && existingKey == normalizedOID { + return false + } + if generatedOK && existingKey == generatedKey && strings.EqualFold(existingBucket, generatedBucket) { + return true + } + return true +} + +func firstAccessURL(obj *drsapi.DrsObject) string { + if obj == nil || obj.AccessMethods == nil || len(*obj.AccessMethods) == 0 { + return "" + } + am := (*obj.AccessMethods)[0] + if am.AccessUrl == nil { + return "" + } + return strings.TrimSpace(am.AccessUrl.Url) +} + +func parseStorageURL(raw string) (bucket string, key string, ok bool) { + u, err := url.Parse(strings.TrimSpace(raw)) + if err != nil || u.Scheme == "" || u.Host == "" { + return "", "", false + } + switch strings.ToLower(u.Scheme) { + case "s3", "gs", "azblob": + return u.Host, strings.Trim(u.Path, "/"), true + default: + return "", "", false + } +} + func (s *batchSyncSession) identifyUploadCandidates() ([]uploadCandidate, error) { candidates := make([]uploadCandidate, 0) for _, oid := range s.oids { - if !s.needsUpload(oid) { + needsUpload, err := s.needsUpload(oid) + if err != nil { + return nil, err + } + if !needsUpload { continue } @@ -248,17 +313,22 @@ func (s *batchSyncSession) identifyUploadCandidates() ([]uploadCandidate, error) return candidates, nil } -func (s *batchSyncSession) needsUpload(oid string) bool { +func (s *batchSyncSession) needsUpload(oid string) (bool, error) { if s.registeredOids[oid] { - return true + 
return true, nil } if len(s.existingByHash[oid]) == 0 { - return true + return true, nil } - if downloadable, err := isFileDownloadable(s.rt, s.ctx, s.drsObjByOID[oid]); err != nil || !downloadable { - return true + obj := s.drsObjByOID[oid] + if obj == nil { + return false, nil + } + downloadable, err := isFileDownloadable(s.rt, s.ctx, obj) + if err != nil { + return false, fmt.Errorf("failed to check remote object availability for oid %s: %w", oid, err) } - return false + return !downloadable, nil } func (s *batchSyncSession) executeUploadPlan(candidates []uploadCandidate) error { @@ -273,6 +343,9 @@ func (s *batchSyncSession) executeUploadPlan(candidates []uploadCandidate) error small, large := splitCandidatesByThreshold(candidates, threshold) s.rt.Logger.InfoContext(s.ctx, "upload plan prepared", "total", len(candidates), "parallel_small", len(small), "sequential_large", len(large)) + if s.reporter != nil { + s.reporter.OnUploadPlan(buildUploadPlanSummary(candidates)) + } if len(small) > 0 { eg, egCtx := errgroup.WithContext(s.ctx) @@ -280,7 +353,12 @@ func (s *batchSyncSession) executeUploadPlan(candidates []uploadCandidate) error for _, c := range small { c := c eg.Go(func() error { - return uploadFileForObject(s.rt, egCtx, c.obj, c.src, false) + uploadCtx := s.progressContextForCandidate(egCtx, c) + if err := uploadFileForObject(s.rt, uploadCtx, c.obj, c.src, false); err != nil { + return err + } + s.reportUploadCompleted(c) + return nil }) } if err := eg.Wait(); err != nil { @@ -289,13 +367,68 @@ func (s *batchSyncSession) executeUploadPlan(candidates []uploadCandidate) error } for _, c := range large { - if err := uploadFileForObject(s.rt, s.ctx, c.obj, c.src, false); err != nil { + uploadCtx := s.progressContextForCandidate(s.ctx, c) + if err := uploadFileForObject(s.rt, uploadCtx, c.obj, c.src, false); err != nil { return err } + s.reportUploadCompleted(c) } return nil } +func buildUploadPlanSummary(candidates []uploadCandidate) UploadPlanSummary { + 
files := make([]UploadPlanFile, 0, len(candidates)) + var totalBytes int64 + for _, c := range candidates { + files = append(files, UploadPlanFile{ + OID: c.oid, + Path: c.file.Name, + Bytes: c.size, + }) + totalBytes += c.size + } + return UploadPlanSummary{ + Files: files, + TotalFiles: len(files), + TotalBytes: totalBytes, + } +} + +func (s *batchSyncSession) progressContextForCandidate(ctx context.Context, c uploadCandidate) context.Context { + if s.reporter == nil { + return ctx + } + ctx = sycommon.WithOid(ctx, c.oid) + return sycommon.WithProgress(ctx, func(ev sycommon.ProgressEvent) error { + if ev.Event != "progress" { + return nil + } + s.reporter.OnUploadProgress(UploadProgressEvent{ + OID: c.oid, + Path: c.file.Name, + BytesSoFar: ev.BytesSoFar, + BytesSinceLast: ev.BytesSinceLast, + TotalBytes: c.size, + Phase: UploadProgressUploading, + }) + return nil + }) +} + +func (s *batchSyncSession) reportUploadCompleted(c uploadCandidate) { + if s.reporter == nil { + return + } + s.reporter.OnUploadProgress(UploadProgressEvent{ + OID: c.oid, + Path: c.file.Name, + BytesSoFar: c.size, + BytesSinceLast: 0, + TotalBytes: c.size, + Phase: UploadProgressCompleted, + }) +} + func splitCandidatesByThreshold(candidates []uploadCandidate, threshold int64) (small, large []uploadCandidate) { for _, c := range candidates { if c.size < threshold { diff --git a/internal/pushsync/batch_sync_test.go b/internal/pushsync/batch_sync_test.go new file mode 100644 index 00000000..96fbdb21 --- /dev/null +++ b/internal/pushsync/batch_sync_test.go @@ -0,0 +1,449 @@ +package pushsync + +import ( + "context" + "io" + "os" + "path/filepath" + "strings" + "sync" + "sync/atomic" + "testing" + + "github.com/calypr/git-drs/internal/config" + "github.com/calypr/git-drs/internal/drslog" + "github.com/calypr/git-drs/internal/lfs" + drsapi "github.com/calypr/syfon/apigen/client/drs" + sycommon "github.com/calypr/syfon/client/common" + "github.com/calypr/syfon/client/transfer" +) + +type 
recordingReporter struct { + plan UploadPlanSummary + events []UploadProgressEvent +} + +func (r *recordingReporter) OnUploadPlan(plan UploadPlanSummary) { + r.plan = plan +} + +func (r *recordingReporter) OnUploadProgress(ev UploadProgressEvent) { + r.events = append(r.events, ev) +} + +type pushUploadBackendStub struct { + mu sync.Mutex + + resolveFunc func(context.Context, string, string, sycommon.FileMetadata, string) (string, error) + uploadFunc func(context.Context, string, io.Reader, int64) error + + lastResolve struct { + guid string + filename string + metadata sycommon.FileMetadata + bucket string + } + lastUpload struct { + url string + size int64 + body string + } +} + +func setTestPushScope(rt *pushRuntime) { + rt.Scope = pushScope{ + Organization: "syfon", + Project: "e2e", + Bucket: "syfon-e2e-bucket", + } +} + +func (b *pushUploadBackendStub) Name() string { return "push-upload-backend-stub" } + +func (b *pushUploadBackendStub) Logger() transfer.TransferLogger { return transfer.NoOpLogger{} } + +func (b *pushUploadBackendStub) Upload(ctx context.Context, url string, body io.Reader, size int64) error { + var bodyText string + if body != nil { + data, _ := io.ReadAll(body) + bodyText = string(data) + } + + b.mu.Lock() + b.lastUpload.url = url + b.lastUpload.size = size + b.lastUpload.body = bodyText + b.mu.Unlock() + + if b.uploadFunc != nil { + return b.uploadFunc(ctx, url, strings.NewReader(bodyText), size) + } + return nil +} + +func (b *pushUploadBackendStub) ResolveUploadURL(ctx context.Context, guid string, filename string, metadata sycommon.FileMetadata, bucket string) (string, error) { + b.mu.Lock() + b.lastResolve.guid = guid + b.lastResolve.filename = filename + b.lastResolve.metadata = metadata + b.lastResolve.bucket = bucket + b.mu.Unlock() + + if b.resolveFunc != nil { + return b.resolveFunc(ctx, guid, filename, metadata, bucket) + } + return "https://upload.example/" + filename, nil +} + +func (b *pushUploadBackendStub) 
MultipartInit(context.Context, string) (string, error) { + return "upload-id", nil +} + +func (b *pushUploadBackendStub) MultipartPart(context.Context, string, string, int, io.Reader) (string, error) { + return "etag", nil +} + +func (b *pushUploadBackendStub) MultipartComplete(context.Context, string, string, []transfer.MultipartPart) error { + return nil +} + +func TestExecuteUploadPlanReportsProgress(t *testing.T) { + tmp := t.TempDir() + filePath := filepath.Join(tmp, "a.bin") + if err := os.WriteFile(filePath, []byte("hello world"), 0o644); err != nil { + t.Fatalf("write temp file: %v", err) + } + + reporter := &recordingReporter{} + rt := newPushRuntime(nil) + setTestPushScope(rt) + rt.Logger = drslog.NewNoOpLogger() + rt.Tuning.MultiPartThreshold = 1024 + rt.Tuning.UploadConcurrency = 2 + + backend := &pushUploadBackendStub{ + uploadFunc: func(ctx context.Context, _ string, _ io.Reader, _ int64) error { + cb := sycommon.GetProgress(ctx) + if cb == nil { + t.Fatal("expected progress callback in upload context") + } + _ = cb(sycommon.ProgressEvent{Event: "progress", Oid: sycommon.GetOid(ctx), BytesSoFar: 5, BytesSinceLast: 5}) + _ = cb(sycommon.ProgressEvent{Event: "progress", Oid: sycommon.GetOid(ctx), BytesSoFar: 11, BytesSinceLast: 6}) + return nil + }, + } + oldBackend := uploadBackendForRuntime + uploadBackendForRuntime = func(*pushRuntime) transfer.MultipartBackend { return backend } + t.Cleanup(func() { uploadBackendForRuntime = oldBackend }) + + session := &batchSyncSession{ + ctx: context.Background(), + rt: rt, + reporter: reporter, + } + candidates := []uploadCandidate{{ + oid: "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", + obj: &drsapi.DrsObject{Checksums: []drsapi.Checksum{{Type: "sha256", Checksum: "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa"}}}, + file: lfs.LfsFileInfo{Name: filePath}, + size: 11, + src: filePath, + }} + + if err := session.executeUploadPlan(candidates); err != nil { + 
t.Fatalf("executeUploadPlan returned error: %v", err) + } + if reporter.plan.TotalFiles != 1 || reporter.plan.TotalBytes != 11 { + t.Fatalf("unexpected plan summary: %+v", reporter.plan) + } + if len(reporter.events) < 3 { + t.Fatalf("expected progress + completed events, got %+v", reporter.events) + } + last := reporter.events[len(reporter.events)-1] + if last.Phase != UploadProgressCompleted || last.BytesSoFar != 11 { + t.Fatalf("unexpected final progress event: %+v", last) + } +} + +func TestExecuteUploadPlanHonorsUploadConcurrency(t *testing.T) { + tmp := t.TempDir() + rt := newPushRuntime(nil) + setTestPushScope(rt) + rt.Logger = drslog.NewNoOpLogger() + rt.Tuning.MultiPartThreshold = 1024 + rt.Tuning.UploadConcurrency = 2 + + var active int32 + var maxActive int32 + var mu sync.Mutex + releaseChans := make([]chan struct{}, 0, 3) + + backend := &pushUploadBackendStub{ + uploadFunc: func(context.Context, string, io.Reader, int64) error { + cur := atomic.AddInt32(&active, 1) + for { + max := atomic.LoadInt32(&maxActive) + if cur <= max || atomic.CompareAndSwapInt32(&maxActive, max, cur) { + break + } + } + + release := make(chan struct{}) + mu.Lock() + releaseChans = append(releaseChans, release) + mu.Unlock() + + <-release + atomic.AddInt32(&active, -1) + return nil + }, + } + oldBackend := uploadBackendForRuntime + uploadBackendForRuntime = func(*pushRuntime) transfer.MultipartBackend { return backend } + t.Cleanup(func() { uploadBackendForRuntime = oldBackend }) + + makeCandidate := func(name string) uploadCandidate { + path := filepath.Join(tmp, name) + if err := os.WriteFile(path, []byte("hello"), 0o644); err != nil { + t.Fatalf("write temp file %s: %v", name, err) + } + return uploadCandidate{ + oid: name + "-oid", + obj: &drsapi.DrsObject{Checksums: []drsapi.Checksum{{Type: "sha256", Checksum: name + "-oid"}}}, + file: lfs.LfsFileInfo{Name: path}, + size: 5, + src: path, + } + } + + session := &batchSyncSession{ + ctx: context.Background(), + rt: rt, + } 
+ candidates := []uploadCandidate{ + makeCandidate("a.bin"), + makeCandidate("b.bin"), + makeCandidate("c.bin"), + } + + done := make(chan error, 1) + go func() { + done <- session.executeUploadPlan(candidates) + }() + + for { + mu.Lock() + count := len(releaseChans) + mu.Unlock() + if count >= 2 { + break + } + } + + if got := atomic.LoadInt32(&maxActive); got != 2 { + t.Fatalf("max active uploads = %d, want 2", got) + } + + mu.Lock() + firstBatch := append([]chan struct{}(nil), releaseChans[:2]...) + mu.Unlock() + for _, ch := range firstBatch { + close(ch) + } + + for { + mu.Lock() + count := len(releaseChans) + mu.Unlock() + if count >= 3 { + break + } + } + + mu.Lock() + last := releaseChans[2] + mu.Unlock() + close(last) + + if err := <-done; err != nil { + t.Fatalf("executeUploadPlan returned error: %v", err) + } + if got := atomic.LoadInt32(&maxActive); got != 2 { + t.Fatalf("max active uploads after completion = %d, want 2", got) + } +} + +func TestScopedDRSObjectForPushRebuildsAccessMethodsFromCurrentScope(t *testing.T) { + assertScopedDRSObjectForPushRebuildsAccessMethod(t, "s3://objects/existing-did") + assertScopedDRSObjectForPushRebuildsAccessMethod(t, "s3://7b9de5b9-19b2-536f-abcc-fe2a146c4eb5") +} + +func TestUploadFileForObjectUsesScopedKeyForMalformedRegisteredAccessURL(t *testing.T) { + tmp := t.TempDir() + filePath := filepath.Join(tmp, "project-subpath.bin") + if err := os.WriteFile(filePath, []byte("project subpath payload"), 0o644); err != nil { + t.Fatalf("write temp file: %v", err) + } + + rt := &pushRuntime{ + Logger: drslog.NewNoOpLogger(), + Scope: pushScope{ + Organization: "syfon", + Project: "e2e", + Bucket: "syfon-e2e-bucket", + StoragePref: "program-root/project-subpath", + }, + Tuning: pushTuning{MultiPartThreshold: 1024}, + } + + oid := "412f8568bfb0e62937ee40c6fcdeaa1cf55910c558c0152250340356c8829a47" + obj := &drsapi.DrsObject{ + Id: "f781273b-52eb-5ac2-a484-775235eef303", + Name: ptrString("project-subpath.bin"), + Size: 23, + 
Checksums: []drsapi.Checksum{{ + Type: "sha256", + Checksum: oid, + }}, + ControlledAccess: &[]string{"/organization/syfon/project/e2e"}, + AccessMethods: &[]drsapi.AccessMethod{{ + Type: drsapi.AccessMethodTypeS3, + AccessUrl: &struct { + Headers *[]string `json:"headers,omitempty"` + Url string `json:"url"` + }{Url: "s3://f781273b-52eb-5ac2-a484-775235eef303"}, + }}, + } + + backend := &pushUploadBackendStub{} + oldBackend := uploadBackendForRuntime + uploadBackendForRuntime = func(*pushRuntime) transfer.MultipartBackend { return backend } + t.Cleanup(func() { uploadBackendForRuntime = oldBackend }) + + if err := uploadFileForObject(rt, context.Background(), obj, filePath, false); err != nil { + t.Fatalf("uploadFileForObject returned error: %v", err) + } + if backend.lastUpload.body != "project subpath payload" { + t.Fatalf("uploaded body = %q, want project subpath payload", backend.lastUpload.body) + } + if backend.lastResolve.guid != obj.Id { + t.Fatalf("upload guid = %q, want DID %q", backend.lastResolve.guid, obj.Id) + } + if backend.lastResolve.bucket != "" { + t.Fatalf("upload bucket hint = %q, want empty scoped upload hint", backend.lastResolve.bucket) + } + if got := backend.lastResolve.metadata.Authorizations["syfon"]; len(got) != 1 || got[0] != "e2e" { + t.Fatalf("upload scope metadata = %+v, want syfon/e2e", backend.lastResolve.metadata.Authorizations) + } + wantKey := "program-root/project-subpath/" + oid + if backend.lastResolve.filename != wantKey { + t.Fatalf("upload object key = %q, want %q", backend.lastResolve.filename, wantKey) + } + if backend.lastUpload.url != "https://upload.example/"+wantKey { + t.Fatalf("upload URL = %q, want signed URL for %q", backend.lastUpload.url, wantKey) + } +} + +func assertScopedDRSObjectForPushRebuildsAccessMethod(t *testing.T, existingURL string) { + t.Helper() + tmp := t.TempDir() + filePath := filepath.Join(tmp, "program-root.bin") + if err := os.WriteFile(filePath, []byte("payload"), 0o644); err != nil { + 
t.Fatalf("write temp file: %v", err) + } + + rt := &pushRuntime{ + API: &config.GitContext{ + Organization: "syfon", + ProjectId: "e2e", + BucketName: "syfon-e2e-bucket", + StoragePrefix: "program-root", + }, + Scope: pushScope{ + Organization: "syfon", + Project: "e2e", + Bucket: "syfon-e2e-bucket", + StoragePref: "program-root", + }, + } + + existing := &drsapi.DrsObject{ + Id: "existing-did", + Name: ptrString("program-root.bin"), + Checksums: []drsapi.Checksum{{ + Type: "sha256", + Checksum: "3d71f043937a09b77826109db4f2b47c46f19923ef823f6a777a15fde0b2c9c7", + }}, + AccessMethods: &[]drsapi.AccessMethod{{ + Type: drsapi.AccessMethodTypeS3, + AccessUrl: &struct { + Headers *[]string `json:"headers,omitempty"` + Url string `json:"url"` + }{Url: existingURL}, + }}, + } + + obj, err := scopedDRSObjectForPush(rt, "3d71f043937a09b77826109db4f2b47c46f19923ef823f6a777a15fde0b2c9c7", filePath, 7, existing) + if err != nil { + t.Fatalf("scopedDRSObjectForPush returned error: %v", err) + } + if obj.AccessMethods == nil || len(*obj.AccessMethods) != 1 { + t.Fatalf("expected rebuilt access method, got %+v", obj.AccessMethods) + } + got := (*obj.AccessMethods)[0].AccessUrl.Url + want := "s3://syfon-e2e-bucket/program-root/3d71f043937a09b77826109db4f2b47c46f19923ef823f6a777a15fde0b2c9c7" + if got != want { + t.Fatalf("access url = %q, want %q", got, want) + } +} + +func TestScopedDRSObjectForPushPreservesExplicitAddURLAccessMethod(t *testing.T) { + tmp := t.TempDir() + filePath := filepath.Join(tmp, "from-bucket.bin") + if err := os.WriteFile(filePath, []byte("payload"), 0o644); err != nil { + t.Fatalf("write temp file: %v", err) + } + + rt := &pushRuntime{ + API: &config.GitContext{ + Organization: "syfon", + ProjectId: "e2e", + BucketName: "syfon-e2e-bucket", + }, + Scope: pushScope{ + Organization: "syfon", + Project: "e2e", + Bucket: "syfon-e2e-bucket", + }, + } + + oid := "95d536cc8df0a8e265832c6bd0422d69593f564d5ff0518e77535c45bc10bfde" + explicitURL := 
"s3://syfon-e2e-bucket/syfon/e2e/addurl/" + oid + existing := &drsapi.DrsObject{ + Id: "existing-did", + Name: ptrString("from-bucket.bin"), + Checksums: []drsapi.Checksum{{ + Type: "sha256", + Checksum: oid, + }}, + AccessMethods: &[]drsapi.AccessMethod{{ + Type: drsapi.AccessMethodTypeS3, + AccessUrl: &struct { + Headers *[]string `json:"headers,omitempty"` + Url string `json:"url"` + }{Url: explicitURL}, + }}, + } + + obj, err := scopedDRSObjectForPush(rt, oid, filePath, 7, existing) + if err != nil { + t.Fatalf("scopedDRSObjectForPush returned error: %v", err) + } + if obj.AccessMethods == nil || len(*obj.AccessMethods) != 1 || (*obj.AccessMethods)[0].AccessUrl == nil { + t.Fatalf("expected access method, got %+v", obj.AccessMethods) + } + if got := (*obj.AccessMethods)[0].AccessUrl.Url; got != explicitURL { + t.Fatalf("access url = %q, want explicit add-url %q", got, explicitURL) + } +} + +func ptrString(s string) *string { return &s } diff --git a/internal/pushsync/progress.go b/internal/pushsync/progress.go new file mode 100644 index 00000000..996e5480 --- /dev/null +++ b/internal/pushsync/progress.go @@ -0,0 +1,34 @@ +package pushsync + +type UploadProgressPhase string + +const ( + UploadProgressUploading UploadProgressPhase = "uploading" + UploadProgressCompleted UploadProgressPhase = "completed" +) + +type UploadPlanFile struct { + OID string + Path string + Bytes int64 +} + +type UploadPlanSummary struct { + Files []UploadPlanFile + TotalFiles int + TotalBytes int64 +} + +type UploadProgressEvent struct { + OID string + Path string + BytesSoFar int64 + BytesSinceLast int64 + TotalBytes int64 + Phase UploadProgressPhase +} + +type UploadProgressReporter interface { + OnUploadPlan(UploadPlanSummary) + OnUploadProgress(UploadProgressEvent) +} diff --git a/internal/pushsync/register.go b/internal/pushsync/register.go index de7a2bd3..60c75d9c 100644 --- a/internal/pushsync/register.go +++ b/internal/pushsync/register.go @@ -14,11 +14,22 @@ import ( 
localdrsobject "github.com/calypr/git-drs/internal/drsobject" "github.com/calypr/git-drs/internal/lfs" drsapi "github.com/calypr/syfon/apigen/client/drs" + internalapi "github.com/calypr/syfon/apigen/client/internalapi" + sycommon "github.com/calypr/syfon/client/common" conf "github.com/calypr/syfon/client/config" "github.com/calypr/syfon/client/hash" + syrequest "github.com/calypr/syfon/client/request" + "github.com/calypr/syfon/client/transfer" syupload "github.com/calypr/syfon/client/transfer/upload" ) +var uploadBackendForRuntime = func(rt *pushRuntime) transfer.MultipartBackend { + if rt == nil || rt.API == nil || rt.API.Client == nil { + return nil + } + return rt.API.Client.Data() +} + type pushScope struct { Organization string Project string @@ -127,11 +138,6 @@ func resolveUploadSourcePath(oid string, worktreePath string, isPointer bool) (s lfsObjPath, err := lfs.ObjectPath(localcommon.LFS_OBJS_PATH, oid) if err == nil { if st, statErr := os.Stat(lfsObjPath); statErr == nil && !st.IsDir() && st.Size() > 0 { - if isPointer { - if sentinel, sentinelErr := lfs.IsAddURLSentinelObject(lfsObjPath); sentinelErr == nil && sentinel { - return "", false, nil - } - } return lfsObjPath, true, nil } } @@ -153,7 +159,7 @@ func resolveUploadSourcePath(oid string, worktreePath string, isPointer bool) (s func uploadFileForObject(rt *pushRuntime, ctx context.Context, drsObject *drsapi.DrsObject, filePath string, skipIfDownloadable bool) error { hInfo := hash.ConvertDrsChecksumsToHashInfo(drsObject.Checksums) if skipIfDownloadable { - rt.Logger.InfoContext(ctx, fmt.Sprintf("checking if oid %s is already downloadable", hInfo.SHA256)) + rt.Logger.DebugContext(ctx, fmt.Sprintf("checking if oid %s is already downloadable", hInfo.SHA256)) downloadable, err := isFileDownloadable(rt, ctx, drsObject) if err != nil { return fmt.Errorf("error checking if file is downloadable: oid %s %v", hInfo.SHA256, err) @@ -164,7 +170,7 @@ func uploadFileForObject(rt *pushRuntime, ctx 
context.Context, drsObject *drsapi } } - rt.Logger.InfoContext(ctx, fmt.Sprintf("file %s is not downloadable, proceeding to upload", hInfo.SHA256)) + rt.Logger.DebugContext(ctx, fmt.Sprintf("file %s is not downloadable, proceeding to upload", hInfo.SHA256)) multiPartThreshold := int64(5 * 1024 * 1024 * 1024) if rt.Tuning.MultiPartThreshold > 0 { multiPartThreshold = rt.Tuning.MultiPartThreshold @@ -197,12 +203,74 @@ func uploadFileForObject(rt *pushRuntime, ctx context.Context, drsObject *drsapi "threshold", multiPartThreshold, "forceMultipart", forceMultipart, ) - if err := syupload.UploadObjectFile(ctx, rt.API.Client.Data(), filePath, objectKey, drsObject.Id, rt.Scope.Bucket, forceMultipart); err != nil { + backend := uploadBackendForRuntime(rt) + if backend == nil { + return fmt.Errorf("upload backend is required") + } + if forceMultipart { + if err := syupload.UploadObjectFile(ctx, backend, filePath, objectKey, drsObject.Id, rt.Scope.Bucket, true); err != nil { + return fmt.Errorf("upload error: %w", err) + } + return nil + } + + signedURL, err := resolveScopedUploadURL(rt, ctx, backend, drsObject.Id, objectKey) + if err != nil { + return fmt.Errorf("upload error: failed to get upload URL: %w", err) + } + + file, err := os.Open(filePath) + if err != nil { + return fmt.Errorf("upload error: open source: %w", err) + } + defer file.Close() + if err := backend.Upload(ctx, signedURL, file, fileSize); err != nil { return fmt.Errorf("upload error: %w", err) } + if cb := sycommon.GetProgress(ctx); cb != nil { + if err := cb(sycommon.ProgressEvent{Event: "progress", Oid: sycommon.GetOid(ctx), BytesSoFar: fileSize, BytesSinceLast: fileSize}); err != nil { + return fmt.Errorf("upload progress callback failed: %w", err) + } + } return nil } +func resolveScopedUploadURL(rt *pushRuntime, ctx context.Context, backend transfer.MultipartBackend, did, objectKey string) (string, error) { + organization := strings.TrimSpace(rt.Scope.Organization) + project := 
strings.TrimSpace(rt.Scope.Project) + if organization == "" || project == "" { + return "", fmt.Errorf("upload scope organization/project is required") + } + + if rt.API != nil && rt.API.Client != nil && rt.API.Client.Requestor() != nil { + query := url.Values{} + query.Set("organization", organization) + query.Set("project", project) + query.Set("file_name", objectKey) + var out internalapi.InternalSignedURL + if err := rt.API.Client.Requestor().Do(ctx, http.MethodGet, "/data/upload/"+url.PathEscape(did), nil, &out, syrequest.WithQueryValues(query)); err != nil { + return "", err + } + if out.Url == nil || strings.TrimSpace(*out.Url) == "" { + return "", fmt.Errorf("response missing URL") + } + return *out.Url, nil + } + + resolver, ok := backend.(interface { + ResolveUploadURL(context.Context, string, string, sycommon.FileMetadata, string) (string, error) + }) + if !ok { + return "", fmt.Errorf("upload backend cannot resolve upload URLs") + } + metadata := sycommon.FileMetadata{ + Authorizations: map[string][]string{ + organization: {project}, + }, + } + return resolver.ResolveUploadURL(ctx, did, objectKey, metadata, "") +} + func newDownloadProbe(cl *config.GitContext) func(context.Context, string) error { httpClient := http.DefaultClient if cl != nil && cl.Client != nil && cl.Client.HTTPClient() != nil { @@ -214,10 +282,11 @@ func newDownloadProbe(cl *config.GitContext) func(context.Context, string) error } func probeDownloadURL(ctx context.Context, httpClient *http.Client, rawURL string) error { - req, err := http.NewRequestWithContext(ctx, http.MethodHead, rawURL, nil) + req, err := http.NewRequestWithContext(ctx, http.MethodGet, rawURL, nil) if err != nil { return err } + req.Header.Set("Range", "bytes=0-0") if httpClient == nil { httpClient = http.DefaultClient } diff --git a/internal/pushsync/register_test.go b/internal/pushsync/register_test.go index 36e57074..5eed7c2e 100644 --- a/internal/pushsync/register_test.go +++ 
b/internal/pushsync/register_test.go @@ -1,33 +1,40 @@ package pushsync import ( - "context" - "io" - "net/http" - "strings" + "os" + "path/filepath" "testing" -) -func TestProbeDownloadURLUsesProvidedHTTPClient(t *testing.T) { - t.Parallel() + localcommon "github.com/calypr/git-drs/internal/common" +) - client := &http.Client{Transport: roundTripFunc(func(r *http.Request) (*http.Response, error) { - if r.Method != http.MethodHead { - t.Fatalf("expected HEAD request, got %s", r.Method) - } - return &http.Response{ - StatusCode: http.StatusOK, - Body: io.NopCloser(strings.NewReader("")), - Header: make(http.Header), - Request: r, - }, nil - })} +func TestResolveUploadSourcePath_NoSentinelObjectForPointer(t *testing.T) { + repo := t.TempDir() + oldWD, err := os.Getwd() + if err != nil { + t.Fatalf("getwd: %v", err) + } + if err := os.Chdir(repo); err != nil { + t.Fatalf("chdir: %v", err) + } + t.Cleanup(func() { _ = os.Chdir(oldWD) }) - if err := probeDownloadURL(context.Background(), client, "https://signed.example/object.bin"); err != nil { - t.Fatalf("probeDownloadURL returned error: %v", err) + oid := "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa" + worktreePath := filepath.Join(repo, "data.bin") + if err := os.WriteFile(worktreePath, []byte("version https://git-lfs.github.com/spec/v1\noid sha256:"+oid+"\nsize 1\n"), 0o644); err != nil { + t.Fatalf("write pointer: %v", err) } -} -type roundTripFunc func(*http.Request) (*http.Response, error) + // Ensure the implicit cache root matches command behavior under the cwd. 
+ if _, err := os.Stat(localcommon.LFS_OBJS_PATH); err == nil { + t.Fatalf("expected no local object cache at %s", localcommon.LFS_OBJS_PATH) + } -func (f roundTripFunc) RoundTrip(r *http.Request) (*http.Response, error) { return f(r) } + src, ok, err := resolveUploadSourcePath(oid, worktreePath, true) + if err != nil { + t.Fatalf("resolveUploadSourcePath: %v", err) + } + if ok || src != "" { + t.Fatalf("expected pointer without local payload to skip upload source, got src=%q ok=%v", src, ok) + } +} diff --git a/internal/testutils/config.go b/internal/testutils/config.go index bc012cde..e8ab73bc 100644 --- a/internal/testutils/config.go +++ b/internal/testutils/config.go @@ -67,11 +67,17 @@ func CreateTestConfig(t *testing.T, tmpDir string, cfg *config.Config) { setConfig(prefix+".endpoint", remote.Gen3.Endpoint) setConfig(prefix+".project", remote.Gen3.ProjectID) setConfig(prefix+".bucket", remote.Gen3.Bucket) + if remote.Gen3.Organization != "" { + setConfig(prefix+".organization", remote.Gen3.Organization) + } } else if remote.Local != nil { setConfig(prefix+".type", "local") setConfig(prefix+".endpoint", remote.Local.BaseURL) setConfig(prefix+".project", remote.Local.ProjectID) setConfig(prefix+".bucket", remote.Local.Bucket) + if remote.Local.Organization != "" { + setConfig(prefix+".organization", remote.Local.Organization) + } } } } diff --git a/tests/README.md b/tests/README.md index 5a2c02ff..daf48d6d 100644 --- a/tests/README.md +++ b/tests/README.md @@ -66,7 +66,7 @@ TEST_STRICT_CLEANUP=true - multipart/resume checks - LFS compatibility path - `tests/e2e-gen3-remote-addurl.sh` - - add-url with known SHA and unknown SHA sentinel path + - add-url with known SHA and unknown SHA placeholder path - `tests/e2e-local-full.sh` - wrapper for full suite in `local` server mode - `tests/e2e-local-addurl.sh` diff --git a/tests/coverage-test.sh b/tests/coverage-test.sh index b2c57572..0bc980d1 100755 --- a/tests/coverage-test.sh +++ b/tests/coverage-test.sh @@ -276,7 
+276,7 @@ export AWS_SECRET_ACCESS_KEY="$SECRET_KEY" export AWS_ENDPOINT_URL_S3="$ENDPOINT" export AWS_REGION="${AWS_REGION:-us-east-1}" -# use add-url without --sha256 (experimental sentinel mode) +# use add-url without --sha256 git drs add-url s3://$PREFIX$BUCKET/simple_test_file.txt data/simple_test_file.txt # set the .gitattributes to track the file @@ -305,7 +305,7 @@ if [ -z "$original_add_url_oid" ]; then exit 1 fi if [ "$original_add_url_oid" = "$sha256" ]; then - err "expected sentinel/synthetic oid for unknown-sha add-url, but found real sha256" + err "expected placeholder oid for unknown-sha add-url, but found real sha256" exit 1 fi diff --git a/tests/e2e-gen3-remote-addurl.sh b/tests/e2e-gen3-remote-addurl.sh index 7976a10c..b4b155e0 100755 --- a/tests/e2e-gen3-remote-addurl.sh +++ b/tests/e2e-gen3-remote-addurl.sh @@ -650,19 +650,15 @@ main() { if [[ "$SERVER_MODE" == "remote" ]]; then log "Configuring gen3 remote" run_cmd_with_timeout "$TEST_CMD_TIMEOUT_SECONDS" "git drs remote add gen3 (source repo)" \ - git drs remote add gen3 "$REMOTE_NAME" \ - --token "$GEN3_TOKEN" \ - --bucket "$TEST_BUCKET_NAME" \ - --organization "$ORGANIZATION" \ - --project "$PROJECT_ID" + git drs remote add gen3 "$REMOTE_NAME" "$ORGANIZATION/$PROJECT_ID" \ + --token "$GEN3_TOKEN" else log "Configuring local remote" local -a local_add_args local_add_args=( git drs remote add local "$REMOTE_NAME" "$DRS_URL" + "$ORGANIZATION/$PROJECT_ID" --bucket "$TEST_BUCKET_NAME" - --organization "$ORGANIZATION" - --project "$PROJECT_ID" ) if [[ -n "$LOCAL_USERNAME" && -n "$LOCAL_PASSWORD" ]]; then local_add_args+=(--username "$LOCAL_USERNAME" --password "$LOCAL_PASSWORD") @@ -717,7 +713,7 @@ main() { exit 1 fi if [[ "$unknown_pointer_oid" == "$unknown_real_oid" ]]; then - echo "error: unknown-sha add-url unexpectedly used real sha256 (expected synthetic/sentinel oid)" >&2 + echo "error: unknown-sha add-url unexpectedly used real sha256 (expected placeholder oid)" >&2 exit 1 fi 
ALL_OIDS+=("$unknown_pointer_oid") @@ -755,18 +751,14 @@ main() { configure_lfs_endpoint_for_repo "$REMOTE_NAME" if [[ "$SERVER_MODE" == "remote" ]]; then run_cmd_with_timeout "$TEST_CMD_TIMEOUT_SECONDS" "git drs remote add gen3 (clone repo)" \ - git drs remote add gen3 "$REMOTE_NAME" \ - --token "$GEN3_TOKEN" \ - --bucket "$TEST_BUCKET_NAME" \ - --organization "$ORGANIZATION" \ - --project "$PROJECT_ID" + git drs remote add gen3 "$REMOTE_NAME" "$ORGANIZATION/$PROJECT_ID" \ + --token "$GEN3_TOKEN" else local -a local_add_args_clone local_add_args_clone=( git drs remote add local "$REMOTE_NAME" "$DRS_URL" + "$ORGANIZATION/$PROJECT_ID" --bucket "$TEST_BUCKET_NAME" - --organization "$ORGANIZATION" - --project "$PROJECT_ID" ) if [[ -n "$LOCAL_USERNAME" && -n "$LOCAL_PASSWORD" ]]; then local_add_args_clone+=(--username "$LOCAL_USERNAME" --password "$LOCAL_PASSWORD") diff --git a/tests/e2e-gen3-remote-full.sh b/tests/e2e-gen3-remote-full.sh index dea0572c..d1142f91 100755 --- a/tests/e2e-gen3-remote-full.sh +++ b/tests/e2e-gen3-remote-full.sh @@ -1416,10 +1416,10 @@ main() { configure_local_credential_helper git config --local lfs.basictransfersonly true if [[ "$SERVER_MODE" == "remote" ]]; then - git drs remote add gen3 "$REMOTE_NAME" --token "$GEN3_TOKEN" --bucket "$active_bucket" --organization "$ORGANIZATION" --project "$PROJECT_ID" + git drs remote add gen3 "$REMOTE_NAME" "$ORGANIZATION/$PROJECT_ID" --token "$GEN3_TOKEN" else local -a local_add_args - local_add_args=(git drs remote add local "$REMOTE_NAME" "$DRS_URL" --bucket "$active_bucket" --organization "$ORGANIZATION" --project "$PROJECT_ID") + local_add_args=(git drs remote add local "$REMOTE_NAME" "$DRS_URL" "$ORGANIZATION/$PROJECT_ID") if [[ -n "$LOCAL_USERNAME" && -n "$LOCAL_PASSWORD" ]]; then local_add_args+=(--username "$LOCAL_USERNAME" --password "$LOCAL_PASSWORD") fi @@ -1544,10 +1544,10 @@ main() { configure_local_credential_helper git config --local lfs.basictransfersonly true if [[ "$SERVER_MODE" == 
"remote" ]]; then - git drs remote add gen3 "$REMOTE_NAME" --token "$GEN3_TOKEN" --bucket "$active_bucket" --organization "$ORGANIZATION" --project "$PROJECT_ID" + git drs remote add gen3 "$REMOTE_NAME" "$ORGANIZATION/$PROJECT_ID" --token "$GEN3_TOKEN" else local -a local_add_args_clone - local_add_args_clone=(git drs remote add local "$REMOTE_NAME" "$DRS_URL" --bucket "$active_bucket" --organization "$ORGANIZATION" --project "$PROJECT_ID") + local_add_args_clone=(git drs remote add local "$REMOTE_NAME" "$DRS_URL" "$ORGANIZATION/$PROJECT_ID") if [[ -n "$LOCAL_USERNAME" && -n "$LOCAL_PASSWORD" ]]; then local_add_args_clone+=(--username "$LOCAL_USERNAME" --password "$LOCAL_PASSWORD") fi @@ -1608,10 +1608,10 @@ main() { configure_local_credential_helper git config --local lfs.basictransfersonly true if [[ "$SERVER_MODE" == "remote" ]]; then - git drs remote add gen3 "$REMOTE_NAME" --token "$GEN3_TOKEN" --bucket "$active_bucket" --organization "$ORGANIZATION" --project "$PROJECT_ID" + git drs remote add gen3 "$REMOTE_NAME" "$ORGANIZATION/$PROJECT_ID" --token "$GEN3_TOKEN" else local -a local_add_args_lfs - local_add_args_lfs=(git drs remote add local "$REMOTE_NAME" "$DRS_URL" --bucket "$active_bucket" --organization "$ORGANIZATION" --project "$PROJECT_ID") + local_add_args_lfs=(git drs remote add local "$REMOTE_NAME" "$DRS_URL" "$ORGANIZATION/$PROJECT_ID") if [[ -n "$LOCAL_USERNAME" && -n "$LOCAL_PASSWORD" ]]; then local_add_args_lfs+=(--username "$LOCAL_USERNAME" --password "$LOCAL_PASSWORD") fi diff --git a/tests/integration/docker_syfon/docker_syfon_e2e_assertions_test.go b/tests/integration/docker_syfon/docker_syfon_e2e_assertions_test.go index 58984c1a..7ca29110 100644 --- a/tests/integration/docker_syfon/docker_syfon_e2e_assertions_test.go +++ b/tests/integration/docker_syfon/docker_syfon_e2e_assertions_test.go @@ -91,22 +91,14 @@ func normalizeDockerBucketMapKeyPart(v string) string { func upsertSyfonBucketScope(t *testing.T, serverURL string, minioEnv 
*minioContainer, org, project, path string) { t.Helper() body, err := json.Marshal(map[string]string{ - "bucket": minioEnv.bucket, - "provider": "s3", - "region": minioEnv.region, - "access_key": minioEnv.accessKey, - "secret_key": minioEnv.secretKey, - "endpoint": minioEnv.endpoint, - "billing_log_bucket": minioEnv.bucket, - "billing_log_prefix": dockerE2EProviderLogPrefix, - "organization": org, - "project_id": project, - "path": path, + "organization": org, + "project_id": project, + "path": path, }) if err != nil { t.Fatalf("marshal bucket scope request: %v", err) } - req, err := http.NewRequest(http.MethodPut, strings.TrimRight(serverURL, "/")+"/data/buckets", bytes.NewReader(body)) + req, err := http.NewRequest(http.MethodPost, strings.TrimRight(serverURL, "/")+"/data/buckets/"+minioEnv.bucket+"/scopes", bytes.NewReader(body)) if err != nil { t.Fatalf("build bucket scope request: %v", err) } @@ -254,3 +246,33 @@ func assertMinIOObjectExists(t *testing.T, client *s3.Client, bucket, key string t.Fatalf("expected MinIO object %s/%s to exist: %v", bucket, key, err) } } + +func assertMinIOObjectMissing(t *testing.T, client *s3.Client, bucket, key string) { + t.Helper() + _, err := client.HeadObject(context.Background(), &s3.HeadObjectInput{ + Bucket: aws.String(bucket), + Key: aws.String(key), + }) + if err == nil { + t.Fatalf("expected MinIO object %s/%s to be deleted", bucket, key) + } +} + +func assertDRSRecordMissing(t *testing.T, serverURL, did string) { + t.Helper() + target := strings.TrimRight(serverURL, "/") + "/ga4gh/drs/v1/objects/" + did + req, err := http.NewRequest(http.MethodGet, target, nil) + if err != nil { + t.Fatalf("build GET %s: %v", target, err) + } + req.SetBasicAuth(dockerE2ELocalUser, dockerE2ELocalPassword) + resp, err := http.DefaultClient.Do(req) + if err != nil { + t.Fatalf("GET %s: %v", target, err) + } + defer resp.Body.Close() + body, _ := io.ReadAll(resp.Body) + if resp.StatusCode != http.StatusNotFound { + t.Fatalf("expected DRS 
record %s to be deleted, got status=%d body=%s", did, resp.StatusCode, string(body)) + } +} diff --git a/tests/integration/docker_syfon/docker_syfon_e2e_helpers_test.go b/tests/integration/docker_syfon/docker_syfon_e2e_helpers_test.go index a77582e4..456ffca8 100644 --- a/tests/integration/docker_syfon/docker_syfon_e2e_helpers_test.go +++ b/tests/integration/docker_syfon/docker_syfon_e2e_helpers_test.go @@ -149,10 +149,9 @@ func configureLocalRepo(t *testing.T, dir, credentialStore string) { func configureGitDrsRemote(t *testing.T, repoDir, serverURL string, minioEnv *minioContainer) { t.Helper() t.Logf("configuring git-drs remote: repo=%s server=%s bucket=%s org=%s project=%s", repoDir, serverURL, minioEnv.bucket, dockerE2EOrganization, dockerE2EProjectID) + setLocalBucketMapping(t, repoDir, dockerE2EOrganization, dockerE2EProjectID, minioEnv.bucket, "") runCommand(t, repoDir, nil, "git", "drs", "remote", "add", "local", "origin", serverURL, - "--bucket", minioEnv.bucket, - "--organization", dockerE2EOrganization, - "--project", dockerE2EProjectID, + dockerE2EOrganization+"/"+dockerE2EProjectID, "--username", dockerE2ELocalUser, "--password", dockerE2ELocalPassword, ) diff --git a/tests/integration/docker_syfon/docker_syfon_e2e_test.go b/tests/integration/docker_syfon/docker_syfon_e2e_test.go index 4d90ce32..40f604ef 100644 --- a/tests/integration/docker_syfon/docker_syfon_e2e_test.go +++ b/tests/integration/docker_syfon/docker_syfon_e2e_test.go @@ -65,6 +65,7 @@ func TestGitDrsDockerMinIOE2E(t *testing.T) { runCommand(t, repoDir, nil, "git", "remote", "add", "origin", gogsEnv.repoCloneURL) runCommand(t, repoDir, nil, "git", "drs", "init") configureGitDrsRemote(t, repoDir, server.url, minioEnv) + upsertSyfonBucketScope(t, server.url, minioEnv, dockerE2EOrganization, dockerE2EProjectID, "s3://"+minioEnv.bucket) logRepoSnapshot(t, repoDir, "post-init") t.Logf("STEP 5: Uploading tracked files through git-drs push...") @@ -105,6 +106,7 @@ func 
TestGitDrsDockerMinIOE2E(t *testing.T) { t.Fatalf("restore git credential store: %v", err) } runCommand(t, repoDir, nil, "git", "drs", "push", "origin") + runCommand(t, repoDir, nil, "git", "branch", "--set-upstream-to=origin/main", "main") logRepoSnapshot(t, repoDir, "post-push") querySmall := runCommand(t, repoDir, nil, "git", "drs", "query", "--remote", "origin", "--pretty", smallDid) if !strings.Contains(querySmall, smallDid) { @@ -169,6 +171,14 @@ func TestGitDrsDockerMinIOE2E(t *testing.T) { t.Fatalf("multipart file checksum lookup mismatch: expected DID %s and hash %s in output %q", largeDid, largeSumHex, largeHashOut) } t.Logf("hash verification complete for source.txt=%s multipart.bin=%s", smallSumHex, largeSumHex) + + t.Logf("STEP 7: Deleting a tracked file via git drs rm and verifying remote record + bucket removal...") + runCommand(t, repoDir, nil, "git", "drs", "rm", "data/source.txt") + runCommand(t, repoDir, nil, "git", "commit", "-m", "remove source.txt through git drs rm") + runCommand(t, repoDir, nil, "git", "drs", "push", "origin") + logRepoSnapshot(t, repoDir, "post-delete-push") + assertMinIOObjectMissing(t, minioEnv.s3Client, minioEnv.bucket, smallDid) + assertDRSRecordMissing(t, server.url, smallDid) } func TestGitDrsDockerAddURLE2E(t *testing.T) { @@ -355,6 +365,7 @@ func TestGitDrsDockerBucketScopePathsE2E(t *testing.T) { runCommand(t, repoDir, nil, "git", "drs", "track", "*.bin") t.Logf("STEP 5: Adding managed uploads for each bucket scope path case...") + upsertSyfonBucketScope(t, server.url, minioEnv, dockerE2EOrganization, dockerE2EProjectID, "s3://"+minioEnv.bucket) defaultOID := addTrackedPayloadCommit(t, repoDir, "data/default-root.bin", []byte("default bucket root payload"), ".gitattributes") defaultKey := defaultOID runCommand(t, repoDir, nil, "git", "drs", "push", "origin") diff --git a/tests/monorepos/e2e-monorepo-remote.sh b/tests/monorepos/e2e-monorepo-remote.sh index a53dc5a2..d02bc19a 100755 --- 
a/tests/monorepos/e2e-monorepo-remote.sh +++ b/tests/monorepos/e2e-monorepo-remote.sh @@ -14,9 +14,10 @@ fi DRS_URL="${TEST_DRS_URL:-${DRS_URL:-https://caliper-training.ohsu.edu}}" SERVER_MODE="${TEST_SERVER_MODE:-${SERVER_MODE:-remote}}" -GEN3_TOKEN="${TEST_GEN3_TOKEN:-${GEN3_TOKEN:-}}" -GEN3_TOKEN_SOURCE="${TEST_GEN3_TOKEN:+env:TEST_GEN3_TOKEN}" +GEN3_TOKEN="" +GEN3_TOKEN_SOURCE="" GEN3_PROFILE="${TEST_GEN3_PROFILE:-${GEN3_PROFILE:-}}" +USER_SUPPLIED_GEN3_TOKEN="${TEST_GEN3_TOKEN:-${GEN3_TOKEN:-}}" GEN3_CONFIG_PATH="${TEST_GEN3_CONFIG_PATH:-${GEN3_CONFIG_PATH:-$HOME/.gen3/gen3_client_config.ini}}" ADMIN_AUTH_HEADER="${TEST_ADMIN_AUTH_HEADER:-${ADMIN_AUTH_HEADER:-}}" LOCAL_USERNAME="${TEST_LOCAL_USERNAME:-${LOCAL_USERNAME:-${DRS_BASIC_AUTH_USER:-}}}" @@ -259,15 +260,14 @@ resolve_auth_from_profile_if_needed() { if [[ "$SERVER_MODE" == "local" ]]; then return fi - if [[ -n "$GEN3_TOKEN" ]]; then - GEN3_TOKEN_SOURCE="${GEN3_TOKEN_SOURCE:-env:TEST_GEN3_TOKEN}" - return - fi require_env GEN3_PROFILE "$GEN3_PROFILE" if [[ ! 
-f "$GEN3_CONFIG_PATH" ]]; then echo "error: GEN3 profile config file not found at $GEN3_CONFIG_PATH" >&2 exit 1 fi + if [[ -n "$USER_SUPPLIED_GEN3_TOKEN" ]]; then + echo "warning: TEST_GEN3_TOKEN/GEN3_TOKEN is ignored by this script; using GEN3_PROFILE='$GEN3_PROFILE' from $GEN3_CONFIG_PATH" >&2 + fi local profile_token profile_endpoint profile_api_key profile_token="$(load_profile_field "$GEN3_PROFILE" "access_token" "$GEN3_CONFIG_PATH")" @@ -478,8 +478,8 @@ validate_config() { ;; esac - if [[ "$SERVER_MODE" == "remote" && -z "$GEN3_TOKEN" && -z "$GEN3_PROFILE" ]]; then - echo "error: remote mode requires TEST_GEN3_TOKEN or GEN3_PROFILE/TEST_GEN3_PROFILE" >&2 + if [[ "$SERVER_MODE" == "remote" && -z "$GEN3_PROFILE" ]]; then + echo "error: remote mode requires GEN3_PROFILE/TEST_GEN3_PROFILE" >&2 exit 1 fi if [[ "$SERVER_MODE" == "local" ]]; then @@ -651,18 +651,14 @@ setup_repo() { configure_local_credential_helper if [[ "$SERVER_MODE" == "remote" ]]; then git drs remote add gen3 "$MONO_REMOTE_NAME" \ - --token "$GEN3_TOKEN" \ - --bucket "$ACTIVE_BUCKET" \ - --organization "$TEST_ORGANIZATION" \ - --project "$TEST_PROJECT_ID" + "$TEST_ORGANIZATION/$TEST_PROJECT_ID" ensure_repo_remote_token "$MONO_REMOTE_NAME" else local -a local_add_args local_add_args=( git drs remote add local "$MONO_REMOTE_NAME" "$DRS_URL" + "$TEST_ORGANIZATION/$TEST_PROJECT_ID" --bucket "$ACTIVE_BUCKET" - --organization "$TEST_ORGANIZATION" - --project "$TEST_PROJECT_ID" ) if [[ -n "$LOCAL_USERNAME" && -n "$LOCAL_PASSWORD" ]]; then local_add_args+=(--username "$LOCAL_USERNAME" --password "$LOCAL_PASSWORD") @@ -730,18 +726,14 @@ clone_and_verify() { configure_local_credential_helper if [[ "$SERVER_MODE" == "remote" ]]; then git drs remote add gen3 "$MONO_REMOTE_NAME" \ - --token "$GEN3_TOKEN" \ - --bucket "$ACTIVE_BUCKET" \ - --organization "$TEST_ORGANIZATION" \ - --project "$TEST_PROJECT_ID" + "$TEST_ORGANIZATION/$TEST_PROJECT_ID" ensure_repo_remote_token "$MONO_REMOTE_NAME" else local -a 
local_add_args_clone local_add_args_clone=( git drs remote add local "$MONO_REMOTE_NAME" "$DRS_URL" + "$TEST_ORGANIZATION/$TEST_PROJECT_ID" --bucket "$ACTIVE_BUCKET" - --organization "$TEST_ORGANIZATION" - --project "$TEST_PROJECT_ID" ) if [[ -n "$LOCAL_USERNAME" && -n "$LOCAL_PASSWORD" ]]; then local_add_args_clone+=(--username "$LOCAL_USERNAME" --password "$LOCAL_PASSWORD") diff --git a/tests/monorepos/run-test.sh b/tests/monorepos/run-test.sh index 47a5835e..263d0993 100755 --- a/tests/monorepos/run-test.sh +++ b/tests/monorepos/run-test.sh @@ -211,7 +211,7 @@ if [ "$CLONE" = "true" ]; then fi echo "Pulling LFS objects from remote" >&2 git drs init - git drs remote add gen3 "$PROFILE" --cred "$CREDENTIALS_PATH" --bucket $BUCKET --project "$PROGRAM-$PROJECT" --url https://calypr-dev.ohsu.edu + git drs remote add gen3 "$PROFILE" "$PROGRAM/$PROJECT" --cred "$CREDENTIALS_PATH" git lfs pull origin main if grep -q 'https://git-lfs.github.com/spec/v1' ./TARGET-ALL-P2/sub-directory-1/*file-0001.dat; then echo "error: LFS pointer resolved and data in `TARGET-ALL-P2/sub-directory-1/file-0001.dat`" >&2 @@ -235,7 +235,7 @@ else # Initialize drs configuration for this repo git drs init -t 16 - git drs remote add gen3 "$PROFILE" --cred "$CREDENTIALS_PATH" --bucket $BUCKET --project "$PROGRAM-$PROJECT" --url https://calypr-dev.ohsu.edu + git drs remote add gen3 "$PROFILE" "$PROGRAM/$PROJECT" --cred "$CREDENTIALS_PATH" # Set multipart-threshold to 10 (MB) for testing purposes # Using a smaller threshold to force a multipart upload for testing # default is 500 (MB) From 2784d9a69a45f340a386deb0dc12f23bc4c5ba70 Mon Sep 17 00:00:00 2001 From: matthewpeterkort Date: Mon, 11 May 2026 15:53:04 -0700 Subject: [PATCH 5/7] pin versions to latest syfon release --- go.mod | 6 +++--- go.sum | 12 ++++++------ 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/go.mod b/go.mod index 3096b9d6..b513958d 100644 --- a/go.mod +++ b/go.mod @@ -5,8 +5,8 @@ go 1.26.3 require ( 
github.com/bytedance/sonic v1.15.0 github.com/calypr/data-client v0.0.0-20260506231822-6a4689d4201f - github.com/calypr/syfon v0.2.9-0.20260511213931-ff4d5a467b3e - github.com/calypr/syfon/apigen v0.2.7-0.20260511213931-ff4d5a467b3e + github.com/calypr/syfon v0.2.9 + github.com/calypr/syfon/apigen v0.2.7 github.com/git-lfs/pktline v0.0.0-20230103162542-ca444d533ef1 github.com/go-git/go-git/v5 v5.13.0 github.com/golang-jwt/jwt/v5 v5.3.1 @@ -141,7 +141,7 @@ require ( github.com/aws/aws-sdk-go-v2/config v1.32.14 github.com/aws/aws-sdk-go-v2/credentials v1.19.14 github.com/aws/aws-sdk-go-v2/service/s3 v1.99.0 - github.com/calypr/syfon/client v0.2.8-0.20260511213931-ff4d5a467b3e + github.com/calypr/syfon/client v0.2.8 github.com/hashicorp/go-version v1.9.0 // indirect github.com/inconshreveable/mousetrap v1.1.0 // indirect github.com/spf13/pflag v1.0.10 // indirect diff --git a/go.sum b/go.sum index d823b2d1..73ae11d9 100644 --- a/go.sum +++ b/go.sum @@ -117,12 +117,12 @@ github.com/bytedance/sonic/loader v0.5.0 h1:gXH3KVnatgY7loH5/TkeVyXPfESoqSBSBEiD github.com/bytedance/sonic/loader v0.5.0/go.mod h1:AR4NYCk5DdzZizZ5djGqQ92eEhCCcdf5x77udYiSJRo= github.com/calypr/data-client v0.0.0-20260506231822-6a4689d4201f h1:+jRjTLBjCjxbWvcbYIi9Oe+XBGlwLJnnk8mk1wHEamY= github.com/calypr/data-client v0.0.0-20260506231822-6a4689d4201f/go.mod h1:cAKvEGQogXFM4Hz22/JNOH+l2bRaz+pjT3N2H5cC8D4= -github.com/calypr/syfon v0.2.9-0.20260511213931-ff4d5a467b3e h1:VSCX3ZwQcOWCjrZ7WFP4TW9vGi9WPYZuuZsjEoRluwA= -github.com/calypr/syfon v0.2.9-0.20260511213931-ff4d5a467b3e/go.mod h1:FMYmSy6rbUGbFcuNTlKtxIhWSlIRPbZfpauKMO0k1V4= -github.com/calypr/syfon/apigen v0.2.7-0.20260511213931-ff4d5a467b3e h1:PtLxUIloatJGqZx/UkvG7wT9z8vh7R3N4CyDnc139Zk= -github.com/calypr/syfon/apigen v0.2.7-0.20260511213931-ff4d5a467b3e/go.mod h1:VrRZ2A17YV91Zsm7CF/u1/Z+DfcZAk8Q4Pk1xklb5xU= -github.com/calypr/syfon/client v0.2.8-0.20260511213931-ff4d5a467b3e h1:ycB0RN7nRdbU9gRynm8JC1pGsBSc4VmOxLkKGNPCLtg= 
-github.com/calypr/syfon/client v0.2.8-0.20260511213931-ff4d5a467b3e/go.mod h1:9MTLDQ5clwDHcDuKCEuVaV7bebsCIUtUQxTegW3RDVo= +github.com/calypr/syfon v0.2.9 h1:ZpcXCtlD6QxWTQKn5S1ulGEMh/tbnHv6kAIum1kQbCQ= +github.com/calypr/syfon v0.2.9/go.mod h1:FMYmSy6rbUGbFcuNTlKtxIhWSlIRPbZfpauKMO0k1V4= +github.com/calypr/syfon/apigen v0.2.7 h1:h5ZcxoLFPuLXt+8EUWICbKQgE/MMu2jym4oUshV3m8k= +github.com/calypr/syfon/apigen v0.2.7/go.mod h1:VrRZ2A17YV91Zsm7CF/u1/Z+DfcZAk8Q4Pk1xklb5xU= +github.com/calypr/syfon/client v0.2.8 h1:xMq5pZNnvkY9UnfrQVvJXe8Ic/Y0D2OLyreMaqNVP5U= +github.com/calypr/syfon/client v0.2.8/go.mod h1:9MTLDQ5clwDHcDuKCEuVaV7bebsCIUtUQxTegW3RDVo= github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= github.com/clipperhouse/uax29/v2 v2.7.0 h1:+gs4oBZ2gPfVrKPthwbMzWZDaAFPGYK72F0NJv2v7Vk= From 21e4dcd050577705091282e1970a2388b15b7f7d Mon Sep 17 00:00:00 2001 From: Brian Date: Tue, 12 May 2026 07:56:55 -0700 Subject: [PATCH 6/7] fix-unnecessary-introductory-text (#234) Removed unnecessary introductory text from developer documentation. Co-authored-by: Matthew Peterkort <33436238+matthewpeterkort@users.noreply.github.com> --- docs/precommit.md | 6 ------ 1 file changed, 6 deletions(-) diff --git a/docs/precommit.md b/docs/precommit.md index 89bb318d..d99b4add 100644 --- a/docs/precommit.md +++ b/docs/precommit.md @@ -1,9 +1,3 @@ -Below is **developer-facing documentation** you can drop into your repo (e.g. `docs/precommit-cache.md` or `README-precommit.md`). 
-It assumes: - -* `cmd/precommit` installs the **pre-commit hook** -* `precommit_cache` is the **read-only helper library** used by pre-push (and other tooling) - --- # Developer Documentation: `.git/drs/pre-commit` Cache & Helpers From 17880ebadcf5ee0ac4dcd819ffe3503bb85c5721 Mon Sep 17 00:00:00 2001 From: Matthew Peterkort <33436238+matthewpeterkort@users.noreply.github.com> Date: Tue, 12 May 2026 17:53:09 -0700 Subject: [PATCH 7/7] Fix/misc issues (#235) * clean up progress upload * add progress for git drs pull downloads, update docs * fix tests * fix coverage test * fix bot issues * bump library deps --- cmd/initialize/main.go | 14 +- cmd/initialize/main_test.go | 11 + cmd/lsfiles/main.go | 101 +- cmd/lsfiles/main_test.go | 126 +- cmd/pull/main.go | 54 +- cmd/pull/progress.go | 185 + cmd/pull/progress_test.go | 85 + cmd/push/main.go | 9 + cmd/push/progress.go | 174 +- cmd/push/progress_test.go | 120 +- cmd/remote/add/local_test.go | 14 + coverage/combined.html | 14839 +++++++++------- coverage/combined.out | 4567 +++-- coverage/summary.txt | 720 +- docs/TODO/bug-checksum-match-upload-skip.md | 17 + docs/bucket-mapping.md | 210 + docs/commands.md | 19 +- docs/getting-started.md | 426 +- docs/git-lfs.md | 37 + docs/installation.md | 142 +- docs/quickstart.md | 121 + docs/remove-files.md | 59 + go.mod | 50 +- go.sum | 116 +- internal/config/remote.go | 1 + internal/progressui/renderer.go | 228 + internal/pushsync/batch_sync.go | 88 +- internal/pushsync/batch_sync_test.go | 164 +- internal/pushsync/register.go | 47 +- internal/pushsync/register_test.go | 197 + .../docker_syfon_e2e_setup_test.go | 38 + .../docker_syfon_e2e_syfon_test.go | 32 +- 32 files changed, 13656 insertions(+), 9355 deletions(-) create mode 100644 cmd/pull/progress.go create mode 100644 cmd/pull/progress_test.go create mode 100644 docs/TODO/bug-checksum-match-upload-skip.md create mode 100644 docs/bucket-mapping.md create mode 100644 docs/git-lfs.md create mode 100644 docs/quickstart.md create 
mode 100644 docs/remove-files.md create mode 100644 internal/progressui/renderer.go diff --git a/cmd/initialize/main.go b/cmd/initialize/main.go index 26a0738b..dd9d6fca 100644 --- a/cmd/initialize/main.go +++ b/cmd/initialize/main.go @@ -127,6 +127,15 @@ func isInitialized() (bool, error) { if val, err := gitrepo.GetGitConfigString("filter.drs.process"); err != nil || strings.TrimSpace(val) != "git-drs filter" { return false, err } + if val, err := gitrepo.GetGitConfigString("filter.drs.clean"); err != nil || strings.TrimSpace(val) != "git-drs clean -- %f" { + return false, err + } + if val, err := gitrepo.GetGitConfigString("filter.drs.smudge"); err != nil || strings.TrimSpace(val) != "git-drs smudge -- %f" { + return false, err + } + if val, err := gitrepo.GetGitConfigString("filter.drs.required"); err != nil || strings.TrimSpace(val) != "true" { + return false, err + } preCommitInstalled, err := hookContains("pre-commit", "git drs precommit") if err != nil { @@ -165,7 +174,10 @@ func initGitConfig() error { // Use git-drs as the long-running filter-process handler. // This replaces the default git-lfs smudge/clean per-invocation commands // with a single persistent process that calls the DRS transfer stack directly. - "filter.drs.process": "git-drs filter", + "filter.drs.clean": "git-drs clean -- %f", + "filter.drs.smudge": "git-drs smudge -- %f", + "filter.drs.process": "git-drs filter", + "filter.drs.required": "true", // Canonical git-drs config keys consumed by clients. 
"drs.upsert": strconv.FormatBool(upsert), "drs.multipart-threshold": strconv.Itoa(multiPartThreshold), diff --git a/cmd/initialize/main_test.go b/cmd/initialize/main_test.go index 04b02369..0c2beab3 100644 --- a/cmd/initialize/main_test.go +++ b/cmd/initialize/main_test.go @@ -106,6 +106,10 @@ func TestInitConfigValues(t *testing.T) { check("lfs.concurrenttransfers", "8") check("lfs.allowincompletepush", "false") + check("filter.drs.clean", "git-drs clean -- %f") + check("filter.drs.smudge", "git-drs smudge -- %f") + check("filter.drs.process", "git-drs filter") + check("filter.drs.required", "true") } func TestEnsureInitialized(t *testing.T) { @@ -129,4 +133,11 @@ func TestEnsureInitialized(t *testing.T) { if filterProcess != "git-drs filter" { t.Fatalf("unexpected filter.drs.process: %q", filterProcess) } + filterClean, err := gitrepo.GetGitConfigString("filter.drs.clean") + if err != nil { + t.Fatalf("GetGitConfigString(filter.drs.clean): %v", err) + } + if filterClean != "git-drs clean -- %f" { + t.Fatalf("unexpected filter.drs.clean: %q", filterClean) + } } diff --git a/cmd/lsfiles/main.go b/cmd/lsfiles/main.go index 5da58516..d403a33e 100644 --- a/cmd/lsfiles/main.go +++ b/cmd/lsfiles/main.go @@ -5,6 +5,7 @@ import ( "fmt" "log/slog" "os" + "os/exec" "sort" "strings" @@ -35,8 +36,11 @@ var ( if len(branches) == 0 { return lfs.GetTrackedLfsFiles(logger) } - return lfs.GetAllLfsFiles(gitRemoteName, gitRemoteLocation, branches, logger) + return lfs.GetLfsFilesForRefs(branches, logger) } + listRemoteRefs = defaultListRemoteRefs + listGitRemotes = defaultListGitRemotes + resolveDefaultRemote = defaultResolveDefaultRemote lookupScopedObjectsBatch = drsremote.ObjectsByHashesForScope ) @@ -51,6 +55,73 @@ type fileRow struct { Detail string `json:"detail,omitempty"` } +func defaultListRemoteRefs(gitRemoteName string) ([]string, error) { + if strings.TrimSpace(gitRemoteName) == "" { + return nil, nil + } + + cmd := exec.Command("git", "for-each-ref", 
"--format=%(refname)", "refs/remotes/"+gitRemoteName) + out, err := cmd.Output() + if err != nil { + return nil, fmt.Errorf("list refs for remote %s: %w", gitRemoteName, err) + } + + lines := strings.Split(string(out), "\n") + refs := make([]string, 0, len(lines)) + for _, line := range lines { + ref := strings.TrimSpace(line) + if ref == "" || strings.HasSuffix(ref, "/HEAD") { + continue + } + refs = append(refs, ref) + } + sort.Strings(refs) + return refs, nil +} + +func defaultListGitRemotes() ([]string, error) { + cmd := exec.Command("git", "remote") + out, err := cmd.Output() + if err != nil { + return nil, fmt.Errorf("list git remotes: %w", err) + } + + lines := strings.Split(string(out), "\n") + remotes := make([]string, 0, len(lines)) + for _, line := range lines { + name := strings.TrimSpace(line) + if name == "" { + continue + } + remotes = append(remotes, name) + } + sort.Strings(remotes) + return remotes, nil +} + +func defaultResolveDefaultRemote() string { + cfg, err := loadConfig() + if err == nil && cfg != nil { + if remote, err := cfg.GetRemoteOrDefault(""); err == nil { + return strings.TrimSpace(string(remote)) + } + } + + remotes, err := listGitRemotes() + if err != nil || len(remotes) == 0 { + return "" + } + for _, remote := range remotes { + if remote == config.ORIGIN { + return remote + } + } + if len(remotes) == 1 { + return remotes[0] + } + return "" +} + func collectRows(cmd *cobra.Command, gitRemoteName, drsRemoteName string, patterns []string, resolveDRS bool) ([]fileRow, error) { logger := drslog.GetLogger() @@ -73,10 +144,36 @@ func collectRows(cmd *cobra.Command, gitRemoteName, drsRemoteName string, patter } } - lfsFiles, err := loadLFSInventory(gitRemoteName, drsRemoteName, []string{}, logger) + var ( + refs []string + err error + ) + if strings.TrimSpace(gitRemoteName) != "" { + refs, err = listRemoteRefs(gitRemoteName) + if err != nil { + return nil, err + } + } + + lfsFiles, err := loadLFSInventory(gitRemoteName, drsRemoteName, 
refs, logger) if err != nil { return nil, err } + if len(lfsFiles) == 0 && strings.TrimSpace(gitRemoteName) == "" { + fallbackRemote := resolveDefaultRemote() + if fallbackRemote != "" { + refs, err = listRemoteRefs(fallbackRemote) + if err != nil { + return nil, err + } + if len(refs) > 0 { + lfsFiles, err = loadLFSInventory(fallbackRemote, drsRemoteName, refs, logger) + if err != nil { + return nil, err + } + } + } + } keys := make([]string, 0, len(lfsFiles)) for path := range lfsFiles { diff --git a/cmd/lsfiles/main_test.go b/cmd/lsfiles/main_test.go index 1daab41b..d4adf1a6 100644 --- a/cmd/lsfiles/main_test.go +++ b/cmd/lsfiles/main_test.go @@ -31,9 +31,11 @@ func TestCollectRowsLocalDefault(t *testing.T) { oldLoadLFSInventory := loadLFSInventory oldLookupScopedObjectsBatch := lookupScopedObjectsBatch + oldResolveDefaultRemote := resolveDefaultRemote t.Cleanup(func() { loadLFSInventory = oldLoadLFSInventory lookupScopedObjectsBatch = oldLookupScopedObjectsBatch + resolveDefaultRemote = oldResolveDefaultRemote }) tmpDir := t.TempDir() @@ -74,6 +76,7 @@ func TestCollectRowsLocalDefault(t *testing.T) { t.Fatalf("unexpected remote lookup for checksums %v", checksums) return nil, nil } + resolveDefaultRemote = func() string { return "" } cmd := &cobra.Command{} rows, err := collectRows(cmd, "", "", nil, false) @@ -111,13 +114,17 @@ func TestCollectRowsWithDRSLookupAndFilters(t *testing.T) { oldResolveRemote := resolveRemote oldNewRemoteClient := newRemoteClient oldLoadLFSInventory := loadLFSInventory + oldListRemoteRefs := listRemoteRefs oldLookupScopedObjectsBatch := lookupScopedObjectsBatch + oldResolveDefaultRemote := resolveDefaultRemote t.Cleanup(func() { loadConfig = oldLoadConfig resolveRemote = oldResolveRemote newRemoteClient = oldNewRemoteClient loadLFSInventory = oldLoadLFSInventory + listRemoteRefs = oldListRemoteRefs lookupScopedObjectsBatch = oldLookupScopedObjectsBatch + resolveDefaultRemote = oldResolveDefaultRemote }) loadConfig = func() 
(*config.Config, error) { @@ -137,6 +144,12 @@ func TestCollectRowsWithDRSLookupAndFilters(t *testing.T) { "data/file3.txt": {Name: "data/file3.txt", Oid: strings.Repeat("c", 64)}, }, nil } + listRemoteRefs = func(remote string) ([]string, error) { + if remote == "" { + return nil, nil + } + return []string{"refs/remotes/dev/main"}, nil + } lookupScopedObjectsBatch = func(ctx context.Context, drsCtx *config.GitContext, checksums []string) (map[string][]drsapi.DrsObject, error) { got := map[string][]drsapi.DrsObject{} for _, checksum := range checksums { @@ -149,9 +162,10 @@ func TestCollectRowsWithDRSLookupAndFilters(t *testing.T) { } return got, nil } + resolveDefaultRemote = func() string { return "" } cmd := &cobra.Command{} - rows, err := collectRows(cmd, "", "", []string{"data/**"}, true) + rows, err := collectRows(cmd, "dev", "", []string{"data/**"}, true) if err != nil { t.Fatalf("collectRows returned error: %v", err) } @@ -188,13 +202,17 @@ func TestCollectRowsWithDRSLookupBatchError(t *testing.T) { oldResolveRemote := resolveRemote oldNewRemoteClient := newRemoteClient oldLoadLFSInventory := loadLFSInventory + oldListRemoteRefs := listRemoteRefs oldLookupScopedObjectsBatch := lookupScopedObjectsBatch + oldResolveDefaultRemote := resolveDefaultRemote t.Cleanup(func() { loadConfig = oldLoadConfig resolveRemote = oldResolveRemote newRemoteClient = oldNewRemoteClient loadLFSInventory = oldLoadLFSInventory + listRemoteRefs = oldListRemoteRefs lookupScopedObjectsBatch = oldLookupScopedObjectsBatch + resolveDefaultRemote = oldResolveDefaultRemote }) loadConfig = func() (*config.Config, error) { return &config.Config{}, nil } @@ -210,12 +228,19 @@ func TestCollectRowsWithDRSLookupBatchError(t *testing.T) { "data/file3.txt": {Name: "data/file3.txt", Oid: strings.Repeat("c", 64)}, }, nil } + listRemoteRefs = func(remote string) ([]string, error) { + if remote == "" { + return nil, nil + } + return []string{"refs/remotes/dev/main"}, nil + } lookupScopedObjectsBatch = 
func(ctx context.Context, drsCtx *config.GitContext, checksums []string) (map[string][]drsapi.DrsObject, error) { return nil, errors.New("lookup failed") } + resolveDefaultRemote = func() string { return "" } cmd := &cobra.Command{} - rows, err := collectRows(cmd, "", "", []string{"data/**"}, true) + rows, err := collectRows(cmd, "dev", "", []string{"data/**"}, true) if err != nil { t.Fatalf("collectRows returned error: %v", err) } @@ -229,6 +254,103 @@ func TestCollectRowsWithDRSLookupBatchError(t *testing.T) { } } +func TestCollectRowsUsesRemoteRefsWhenGitRemoteProvided(t *testing.T) { + resetFlagsForTest() + + oldLoadLFSInventory := loadLFSInventory + oldListRemoteRefs := listRemoteRefs + oldResolveDefaultRemote := resolveDefaultRemote + t.Cleanup(func() { + loadLFSInventory = oldLoadLFSInventory + listRemoteRefs = oldListRemoteRefs + resolveDefaultRemote = oldResolveDefaultRemote + }) + + listRemoteRefs = func(remote string) ([]string, error) { + if remote != "dev" { + t.Fatalf("unexpected remote %q", remote) + } + return []string{"refs/remotes/dev/main", "refs/remotes/dev/release"}, nil + } + + loadLFSInventory = func(gitRemoteName, gitRemoteLocation string, branches []string, logger *slog.Logger) (map[string]lfs.LfsFileInfo, error) { + if gitRemoteName != "dev" { + t.Fatalf("unexpected git remote name %q", gitRemoteName) + } + if len(branches) != 2 || branches[0] != "refs/remotes/dev/main" || branches[1] != "refs/remotes/dev/release" { + t.Fatalf("unexpected refs %v", branches) + } + return map[string]lfs.LfsFileInfo{ + "data/file.bam": {Name: "data/file.bam", Oid: strings.Repeat("a", 64)}, + }, nil + } + resolveDefaultRemote = func() string { + t.Fatal("default remote fallback should not be used when --git-remote is set") + return "" + } + + cmd := &cobra.Command{} + rows, err := collectRows(cmd, "dev", "", nil, false) + if err != nil { + t.Fatalf("collectRows returned error: %v", err) + } + if len(rows) != 1 || rows[0].Path != "data/file.bam" { + 
t.Fatalf("unexpected rows %+v", rows) + } +} + +func TestCollectRowsFallsBackToDefaultRemoteWhenLocalInventoryEmpty(t *testing.T) { + resetFlagsForTest() + + oldLoadLFSInventory := loadLFSInventory + oldListRemoteRefs := listRemoteRefs + oldResolveDefaultRemote := resolveDefaultRemote + t.Cleanup(func() { + loadLFSInventory = oldLoadLFSInventory + listRemoteRefs = oldListRemoteRefs + resolveDefaultRemote = oldResolveDefaultRemote + }) + + callCount := 0 + loadLFSInventory = func(gitRemoteName, gitRemoteLocation string, branches []string, logger *slog.Logger) (map[string]lfs.LfsFileInfo, error) { + callCount++ + if callCount == 1 { + if gitRemoteName != "" || len(branches) != 0 { + t.Fatalf("first inventory call should be local-only, got remote=%q refs=%v", gitRemoteName, branches) + } + return map[string]lfs.LfsFileInfo{}, nil + } + if gitRemoteName != "dev" { + t.Fatalf("expected fallback remote dev, got %q", gitRemoteName) + } + if len(branches) != 1 || branches[0] != "refs/remotes/dev/main" { + t.Fatalf("unexpected fallback refs: %v", branches) + } + return map[string]lfs.LfsFileInfo{ + "data/file2.bam": {Name: "data/file2.bam", Oid: strings.Repeat("b", 64)}, + }, nil + } + resolveDefaultRemote = func() string { return "dev" } + listRemoteRefs = func(remote string) ([]string, error) { + if remote != "dev" { + t.Fatalf("expected fallback remote query for dev, got %q", remote) + } + return []string{"refs/remotes/dev/main"}, nil + } + + cmd := &cobra.Command{} + rows, err := collectRows(cmd, "", "", nil, false) + if err != nil { + t.Fatalf("collectRows returned error: %v", err) + } + if len(rows) != 1 || rows[0].Path != "data/file2.bam" { + t.Fatalf("unexpected rows: %+v", rows) + } + if callCount != 2 { + t.Fatalf("expected 2 inventory calls, got %d", callCount) + } +} + func TestValidateOutputFlags(t *testing.T) { resetFlagsForTest() diff --git a/cmd/pull/main.go b/cmd/pull/main.go index 75e1d75d..2019515c 100644 --- a/cmd/pull/main.go +++ b/cmd/pull/main.go @@ 
-3,9 +3,11 @@ package pull import ( "context" "fmt" + "io" "log/slog" "net/url" "os" + "path/filepath" "sort" "strings" @@ -16,6 +18,7 @@ import ( "github.com/calypr/git-drs/internal/lfs" "github.com/calypr/git-drs/internal/pathspec" drsapi "github.com/calypr/syfon/apigen/client/drs" + sycommon "github.com/calypr/syfon/client/common" "github.com/spf13/cobra" ) @@ -77,6 +80,10 @@ var Cmd = &cobra.Command{ return nil } + progress := newPullProgressRenderer(os.Stderr) + progress.OnPlan(pointers) + defer progress.Finish() + if dryRun { for _, f := range pointers { if _, err := fmt.Fprintln(cmd.OutOrStdout(), f.Name); err != nil { @@ -144,17 +151,19 @@ var Cmd = &cobra.Command{ } else if !os.IsNotExist(err) { return fmt.Errorf("failed to stat cache path %s: %w", dstPath, err) } + progress.OnDownloadStart(f) + downloadCtx := progressContextForPointer(ctx, progress, f) if obj, ok := prefetched[f.Oid]; ok { if accessURL, ok := prefetchedAccess[obj.Id]; ok { objCopy := obj - if err := drsremote.DownloadResolvedToCachePath(ctx, drsCtx, f.Oid, dstPath, &objCopy, &accessURL); err != nil { + if err := drsremote.DownloadResolvedToCachePath(downloadCtx, drsCtx, f.Oid, dstPath, &objCopy, &accessURL); err != nil { debugCtx := buildPullDownloadDebugContext(ctx, drsCtx, f.Oid) return fmt.Errorf("failed to download oid %s to %s: %w\npull-debug: %s", f.Oid, dstPath, err, debugCtx) } continue } } - if err := drsremote.DownloadToCachePath(ctx, drsCtx, logg, f.Oid, dstPath); err != nil { + if err := drsremote.DownloadToCachePath(downloadCtx, drsCtx, logg, f.Oid, dstPath); err != nil { debugCtx := buildPullDownloadDebugContext(ctx, drsCtx, f.Oid) return fmt.Errorf("failed to download oid %s to %s: %w\npull-debug: %s", f.Oid, dstPath, err, debugCtx) } @@ -163,7 +172,7 @@ var Cmd = &cobra.Command{ logg.Debug("no missing pointer objects to download") } - if err := checkoutDownloadedFiles(pointers); err != nil { + if err := checkoutDownloadedFiles(pointers, progress); err != nil { return err } 
@@ -195,7 +204,18 @@ func collectPointerFiles(inventory map[string]lfs.LfsFileInfo, patterns []string return files } -func checkoutDownloadedFiles(files []pointerFile) error { +func progressContextForPointer(ctx context.Context, progress *pullProgressRenderer, file pointerFile) context.Context { + ctx = sycommon.WithOid(ctx, file.Name) + return sycommon.WithProgress(ctx, func(ev sycommon.ProgressEvent) error { + if ev.Event != "progress" { + return nil + } + progress.OnDownloadProgress(file.Name, ev.BytesSoFar, file.Size) + return nil + }) +} + +func checkoutDownloadedFiles(files []pointerFile, progress *pullProgressRenderer) error { for _, f := range files { if strings.TrimSpace(f.Name) == "" || strings.TrimSpace(f.Oid) == "" { continue @@ -204,13 +224,35 @@ func checkoutDownloadedFiles(files []pointerFile) error { if err != nil { return fmt.Errorf("failed to resolve cached object for %s: %w", f.Oid, err) } - payload, err := os.ReadFile(srcPath) + src, err := os.Open(srcPath) if err != nil { return fmt.Errorf("failed to read cached object %s: %w", srcPath, err) } - if err := os.WriteFile(f.Name, payload, 0o644); err != nil { + progress.OnCheckoutStart(f) + if dir := filepath.Dir(f.Name); dir != "." 
{ + if err := os.MkdirAll(dir, 0o755); err != nil { + src.Close() + return fmt.Errorf("failed to create directory for %s: %w", f.Name, err) + } + } + dst, err := os.OpenFile(f.Name, os.O_CREATE|os.O_WRONLY|os.O_TRUNC, 0o644) + if err != nil { + src.Close() + return fmt.Errorf("failed to checkout %s: %w", f.Name, err) + } + if _, err := io.Copy(dst, src); err != nil { + dst.Close() + src.Close() return fmt.Errorf("failed to checkout %s: %w", f.Name, err) } + if err := dst.Close(); err != nil { + src.Close() + return fmt.Errorf("failed to finalize checkout for %s: %w", f.Name, err) + } + if err := src.Close(); err != nil { + return fmt.Errorf("failed to close cached object %s: %w", srcPath, err) + } + progress.OnCompleted(f) } return nil } diff --git a/cmd/pull/progress.go b/cmd/pull/progress.go new file mode 100644 index 00000000..f1a6d345 --- /dev/null +++ b/cmd/pull/progress.go @@ -0,0 +1,185 @@ +package pull + +import ( + "fmt" + "io" + + "github.com/calypr/git-drs/internal/progressui" +) + +const pullNonTTYProgressInterval = progressui.NonTTYProgressInterval + +type pullProgressPhase string + +const ( + pullProgressPending pullProgressPhase = "pending" + pullProgressDownloading pullProgressPhase = "downloading" + pullProgressCheckingOut pullProgressPhase = "checking_out" + pullProgressCompleted pullProgressPhase = "completed" +) + +type pullFileProgress struct { + path string + total int64 + current int64 + phase pullProgressPhase +} + +type pullProgressRenderer struct { + base *progressui.Renderer + planned bool + files map[string]*pullFileProgress + fileOrder []string +} + +func newPullProgressRenderer(out io.Writer) *pullProgressRenderer { + return &pullProgressRenderer{ + base: progressui.NewRenderer(out), + files: make(map[string]*pullFileProgress), + } +} + +func isPullWriterTTY(w io.Writer) bool { + return progressui.IsWriterTTY(w) +} + +func (r *pullProgressRenderer) render(force bool) { + lines := make([]string, 0, len(r.fileOrder)) + for _, id := range 
r.fileOrder { + item := r.files[id] + if item == nil { + continue + } + lines = append(lines, r.renderLine(item)) + } + r.base.Render(force, lines) +} + +func (r *pullProgressRenderer) OnPlan(files []pointerFile) { + r.planned = len(files) > 0 + r.files = make(map[string]*pullFileProgress, len(files)) + r.fileOrder = r.fileOrder[:0] + for _, file := range files { + r.files[file.Name] = &pullFileProgress{ + path: file.Name, + total: file.Size, + phase: pullProgressPending, + } + r.fileOrder = append(r.fileOrder, file.Name) + } + if r.planned { + r.render(true) + } +} + +func (r *pullProgressRenderer) OnDownloadStart(file pointerFile) { + if !r.planned { + return + } + item, ok := r.files[file.Name] + if !ok { + return + } + item.path = file.Name + if file.Size > 0 { + item.total = file.Size + } + item.phase = pullProgressDownloading + r.render(false) +} + +func (r *pullProgressRenderer) OnDownloadProgress(id string, bytesSoFar int64, total int64) { + if !r.planned { + return + } + item, ok := r.files[id] + if !ok { + return + } + if total > 0 { + item.total = total + } + if bytesSoFar > item.current { + item.current = bytesSoFar + } + item.phase = pullProgressDownloading + r.render(false) +} + +func (r *pullProgressRenderer) OnCheckoutStart(file pointerFile) { + if !r.planned { + return + } + item, ok := r.files[file.Name] + if !ok { + return + } + item.phase = pullProgressCheckingOut + if item.total == 0 && file.Size > 0 { + item.total = file.Size + } + r.render(false) +} + +func (r *pullProgressRenderer) OnCompleted(file pointerFile) { + if !r.planned { + return + } + item, ok := r.files[file.Name] + if !ok { + return + } + if item.total == 0 && file.Size > 0 { + item.total = file.Size + } + if item.total > 0 { + item.current = item.total + } + item.phase = pullProgressCompleted + r.render(false) +} + +func (r *pullProgressRenderer) Finish() { + if !r.planned { + return + } + lines := make([]string, 0, len(r.fileOrder)) + for _, id := range r.fileOrder { + item := 
r.files[id] + if item == nil { + continue + } + lines = append(lines, r.renderLine(item)) + } + r.base.Finish(lines) + r.planned = false +} + +func (r *pullProgressRenderer) renderLine(file *pullFileProgress) string { + label := "preparing pull" + if file != nil && file.path != "" { + label = progressui.TrimLabel(file.path, 48) + } + + prefix := "" + if file != nil { + switch file.phase { + case pullProgressDownloading, pullProgressCheckingOut: + if !(file.total > 0 && file.current >= file.total) { + prefix = r.base.Spinner() + " " + } + } + } + + current := int64(0) + total := int64(0) + if file != nil { + current = file.current + total = file.total + } + bar := progressui.RenderProgressBar(current, total, 24) + pct := progressui.RenderPercent(current, total) + bytesLabel := progressui.RenderByteProgress(current, total, current >= total) + + return fmt.Sprintf("%s%s %s %s %s", prefix, label, bar, pct, bytesLabel) +} diff --git a/cmd/pull/progress_test.go b/cmd/pull/progress_test.go new file mode 100644 index 00000000..77682c71 --- /dev/null +++ b/cmd/pull/progress_test.go @@ -0,0 +1,85 @@ +package pull + +import ( + "bytes" + "strings" + "testing" + "time" +) + +func TestPullProgressRendererTTY(t *testing.T) { + var out bytes.Buffer + r := newPullProgressRenderer(&out) + r.base.SetTTY(true) + r.base.SetClock(func() time.Time { return time.Unix(0, 0) }) + + files := []pointerFile{ + {Name: "a.bin", Oid: "oid-1", Size: 100}, + {Name: "b.bin", Oid: "oid-2", Size: 100}, + } + r.OnPlan(files) + r.OnDownloadStart(files[0]) + r.OnDownloadProgress("a.bin", 50, 100) + r.OnCheckoutStart(files[0]) + r.OnCompleted(files[0]) + r.OnCompleted(files[1]) + + got := out.String() + if !strings.Contains(got, "a.bin [============") { + t.Fatalf("expected progress bar output for a.bin, got %q", got) + } + if !strings.Contains(got, "100.0% 100 B/100 B") { + t.Fatalf("expected completed byte summary, got %q", got) + } +} + +func TestPullProgressRendererNonTTYThrottles(t *testing.T) { + 
var out bytes.Buffer + now := time.Unix(0, 0) + r := newPullProgressRenderer(&out) + r.base.SetTTY(false) + r.base.SetClock(func() time.Time { return now }) + + file := pointerFile{Name: "a.bin", Oid: "oid-1", Size: 100} + r.OnPlan([]pointerFile{file}) + initial := out.String() + if !strings.Contains(initial, "a.bin") { + t.Fatalf("expected initial non-tty progress line, got %q", initial) + } + + r.OnDownloadStart(file) + r.OnDownloadProgress("a.bin", 10, 100) + if got := out.String(); got != initial { + t.Fatalf("expected throttled output before interval, got %q", got) + } + + now = now.Add(pullNonTTYProgressInterval) + r.OnCompleted(file) + got := out.String() + if !strings.Contains(got, "100.0% 100 B/100 B") { + t.Fatalf("expected rendered completion after interval, got %q", got) + } +} + +func TestPullProgressRendererNoSpinnerAtFullDownloadedBytes(t *testing.T) { + var out bytes.Buffer + r := newPullProgressRenderer(&out) + r.base.SetTTY(true) + r.base.SetClock(func() time.Time { return time.Unix(0, 0) }) + + file := pointerFile{Name: "a.bin", Oid: "oid-1", Size: 100} + r.OnPlan([]pointerFile{file}) + r.OnDownloadStart(file) + r.OnDownloadProgress("a.bin", 100, 100) + + got := out.String() + if strings.Contains(got, "/ a.bin [========================] 100.0% 100 B/100 B") || + strings.Contains(got, "| a.bin [========================] 100.0% 100 B/100 B") || + strings.Contains(got, "- a.bin [========================] 100.0% 100 B/100 B") || + strings.Contains(got, "\\ a.bin [========================] 100.0% 100 B/100 B") { + t.Fatalf("expected no spinner prefix on fully downloaded file, got %q", got) + } + if !strings.Contains(got, "a.bin [========================] 100.0% 100 B/100 B") { + t.Fatalf("expected completed byte line without spinner, got %q", got) + } +} diff --git a/cmd/push/main.go b/cmd/push/main.go index 2e04d189..e9877f3b 100644 --- a/cmd/push/main.go +++ b/cmd/push/main.go @@ -16,6 +16,7 @@ import ( ) var pushWithHooks bool +var pushForceUpload 
bool var runCommand = func(name string, args ...string) ([]byte, error) { cmd := exec.Command(name, args...) @@ -59,6 +60,7 @@ var Cmd = &cobra.Command{ myLogger.Debug(fmt.Sprintf("Error creating DRS client: %s", err)) return err } + drsClient.ForceUpload = pushForceUpload lfsFiles, err := lfs.GetAllLfsFiles(string(remote), "", []string{"HEAD"}, myLogger) if err != nil { return fmt.Errorf("failed to discover LFS files to push: %w", err) @@ -78,6 +80,12 @@ var Cmd = &cobra.Command{ return fmt.Errorf("failed batch register/upload workflow: %w", err) } progress.Finish() + switch { + case len(lfsFiles) == 0: + fmt.Fprintln(os.Stdout, "No git-drs tracked files found; pushing Git refs only.") + case !progress.HadUploads(): + fmt.Fprintln(os.Stdout, "No DRS payload uploads needed; all tracked objects are already available remotely.") + } pushArgs := []string{"push"} if !pushWithHooks { @@ -98,6 +106,7 @@ var Cmd = &cobra.Command{ func init() { Cmd.Flags().BoolVar(&pushWithHooks, "with-hooks", false, "Run git push with local hooks enabled (invokes pre-push)") + Cmd.Flags().BoolVar(&pushForceUpload, "force-upload", false, "Upload payload bytes even when a matching downloadable object already exists remotely") } func currentDeleteRefUpdates(ctx context.Context) ([]drsdelete.RefUpdate, error) { diff --git a/cmd/push/progress.go b/cmd/push/progress.go index 2c6dd6fe..a22156ec 100644 --- a/cmd/push/progress.go +++ b/cmd/push/progress.go @@ -3,73 +3,62 @@ package push import ( "fmt" "io" - "strings" "sync" - "time" + "github.com/calypr/git-drs/internal/progressui" "github.com/calypr/git-drs/internal/pushsync" - "github.com/mattn/go-isatty" ) -const nonTTYProgressInterval = 2 * time.Second - type uploadFileProgress struct { path string total int64 - uploaded int64 + current int64 + started bool completed bool } type uploadProgressRenderer struct { - out io.Writer - isTTY bool - now func() time.Time - lastRender time.Time - mu sync.Mutex - planned bool - plan 
pushsync.UploadPlanSummary - files map[string]*uploadFileProgress - totalBytes int64 - doneBytes int64 - doneFiles int - currentLabel string + mu sync.Mutex + base *progressui.Renderer + planned bool + plan pushsync.UploadPlanSummary + files map[string]*uploadFileProgress + fileOrder []string } func newUploadProgressRenderer(out io.Writer) *uploadProgressRenderer { return &uploadProgressRenderer{ - out: out, - isTTY: isWriterTTY(out), - now: time.Now, + base: progressui.NewRenderer(out), files: make(map[string]*uploadFileProgress), } } -func isWriterTTY(w io.Writer) bool { - type fdWriter interface{ Fd() uintptr } - f, ok := w.(fdWriter) - if !ok { - return false +func (r *uploadProgressRenderer) renderLocked(force bool) { + lines := make([]string, 0, len(r.fileOrder)) + for idx, oid := range r.fileOrder { + file := r.files[oid] + if file == nil { + continue + } + lines = append(lines, r.renderLine(idx, len(r.fileOrder), file)) } - fd := f.Fd() - return isatty.IsTerminal(fd) || isatty.IsCygwinTerminal(fd) + r.base.Render(force, lines) } func (r *uploadProgressRenderer) OnUploadPlan(plan pushsync.UploadPlanSummary) { r.mu.Lock() defer r.mu.Unlock() + r.plan = plan r.planned = plan.TotalFiles > 0 - r.totalBytes = 0 - r.doneBytes = 0 - r.doneFiles = 0 - r.currentLabel = "" r.files = make(map[string]*uploadFileProgress, len(plan.Files)) + r.fileOrder = r.fileOrder[:0] for _, file := range plan.Files { r.files[file.OID] = &uploadFileProgress{ path: file.Path, total: file.Bytes, } - r.totalBytes += file.Bytes + r.fileOrder = append(r.fileOrder, file.OID) } if r.planned { r.renderLocked(true) @@ -79,6 +68,7 @@ func (r *uploadProgressRenderer) OnUploadPlan(plan pushsync.UploadPlanSummary) { func (r *uploadProgressRenderer) OnUploadProgress(ev pushsync.UploadProgressEvent) { r.mu.Lock() defer r.mu.Unlock() + if !r.planned { return } @@ -92,24 +82,17 @@ func (r *uploadProgressRenderer) OnUploadProgress(ev pushsync.UploadProgressEven if ev.TotalBytes > 0 { file.total = 
ev.TotalBytes } - if file.total > 0 && ev.BytesSoFar > file.total { - ev.BytesSoFar = file.total + if ev.BytesSoFar > file.current { + file.current = ev.BytesSoFar } - if ev.BytesSoFar > file.uploaded { - r.doneBytes += ev.BytesSoFar - file.uploaded - file.uploaded = ev.BytesSoFar - } - if ev.Path != "" { - r.currentLabel = ev.Path - } else if file.path != "" { - r.currentLabel = file.path + if ev.Phase == pushsync.UploadProgressUploading { + file.started = true } if ev.Phase == pushsync.UploadProgressCompleted && !file.completed { + file.started = true file.completed = true - r.doneFiles++ - if file.total > 0 && file.uploaded < file.total { - r.doneBytes += file.total - file.uploaded - file.uploaded = file.total + if file.total > 0 { + file.current = file.total } } r.renderLocked(false) @@ -118,72 +101,55 @@ func (r *uploadProgressRenderer) OnUploadProgress(ev pushsync.UploadProgressEven func (r *uploadProgressRenderer) Finish() { r.mu.Lock() defer r.mu.Unlock() + if !r.planned { return } - r.renderLocked(true) - if r.isTTY { - _, _ = fmt.Fprintln(r.out) + lines := make([]string, 0, len(r.fileOrder)) + for idx, oid := range r.fileOrder { + file := r.files[oid] + if file == nil { + continue + } + lines = append(lines, r.renderLine(idx, len(r.fileOrder), file)) } + r.base.Finish(lines) r.planned = false } -func (r *uploadProgressRenderer) renderLocked(force bool) { - now := r.now() - if !force && !r.isTTY && !r.lastRender.IsZero() && now.Sub(r.lastRender) < nonTTYProgressInterval { - return - } - r.lastRender = now - totalBytes := r.totalBytes - doneBytes := r.doneBytes - doneFiles := r.doneFiles - totalFiles := r.plan.TotalFiles - percent := 0.0 - if totalBytes > 0 { - percent = (float64(doneBytes) / float64(totalBytes)) * 100 - } - current := r.currentLabel - if current == "" { - current = "preparing uploads" - } - - if r.isTTY { - barWidth := 28 - filled := 0 - if totalBytes > 0 { - filled = int((float64(doneBytes) / float64(totalBytes)) * float64(barWidth)) - } 
- if filled > barWidth { - filled = barWidth - } - bar := strings.Repeat("=", filled) + strings.Repeat(" ", barWidth-filled) - line := fmt.Sprintf("\rUploading %d/%d files [%s] %5.1f%% %s/%s current: %s", - doneFiles, totalFiles, bar, percent, humanBytes(doneBytes), humanBytes(totalBytes), trimProgressLabel(current, 48)) - _, _ = fmt.Fprint(r.out, line) - return - } - - line := fmt.Sprintf("Uploading %d/%d files (%.1f%%) %s/%s current=%s\n", - doneFiles, totalFiles, percent, humanBytes(doneBytes), humanBytes(totalBytes), current) - _, _ = fmt.Fprint(r.out, line) +func (r *uploadProgressRenderer) HadUploads() bool { + r.mu.Lock() + defer r.mu.Unlock() + return r != nil && r.planned } -func trimProgressLabel(s string, max int) string { - if max <= 3 || len(s) <= max { - return s +func (r *uploadProgressRenderer) renderLine(idx int, total int, file *uploadFileProgress) string { + label := "preparing upload" + if file != nil && file.path != "" { + label = progressui.TrimLabel(file.path, 48) + } + prefix := "" + if file != nil { + switch { + case file.started && !file.completed: + prefix = r.base.Spinner() + " " + } } - return "..." 
+ s[len(s)-(max-3):] -} -func humanBytes(n int64) string { - const unit = 1024 - if n < unit { - return fmt.Sprintf("%d B", n) - } - div, exp := int64(unit), 0 - for v := n / unit; v >= unit; v /= unit { - div *= unit - exp++ - } - return fmt.Sprintf("%.1f %ciB", float64(n)/float64(div), "KMGTPE"[exp]) + current := int64(0) + totalBytes := int64(0) + completed := false + if file != nil { + current = file.current + totalBytes = file.total + completed = file.completed + } + displayCurrent := progressui.VisibleProgressBytes(current, totalBytes, completed) + bar := progressui.RenderProgressBar(displayCurrent, totalBytes, 24) + pct := progressui.RenderPercentCapped(displayCurrent, totalBytes, completed) + bytesLabel := progressui.RenderByteProgress(displayCurrent, totalBytes, completed) + + _ = idx + _ = total + return fmt.Sprintf("%s%s %s %s %s", prefix, label, bar, pct, bytesLabel) } diff --git a/cmd/push/progress_test.go b/cmd/push/progress_test.go index a740822c..b9c66cb6 100644 --- a/cmd/push/progress_test.go +++ b/cmd/push/progress_test.go @@ -3,6 +3,7 @@ package push import ( "bytes" "strings" + "sync" "testing" "time" @@ -12,7 +13,7 @@ import ( func TestUploadProgressRendererTTY(t *testing.T) { var out bytes.Buffer r := newUploadProgressRenderer(&out) - r.isTTY = true + r.base.SetTTY(true) r.OnUploadPlan(pushsync.UploadPlanSummary{ Files: []pushsync.UploadPlanFile{ @@ -22,17 +23,25 @@ func TestUploadProgressRendererTTY(t *testing.T) { TotalFiles: 2, TotalBytes: 200, }) + r.OnUploadProgress(pushsync.UploadProgressEvent{OID: "oid-1", Path: "a.bin", BytesSoFar: 0, TotalBytes: 100, Phase: pushsync.UploadProgressUploading}) r.OnUploadProgress(pushsync.UploadProgressEvent{OID: "oid-1", Path: "a.bin", BytesSoFar: 50, BytesSinceLast: 50, TotalBytes: 100, Phase: pushsync.UploadProgressUploading}) r.OnUploadProgress(pushsync.UploadProgressEvent{OID: "oid-1", Path: "a.bin", BytesSoFar: 100, TotalBytes: 100, Phase: pushsync.UploadProgressCompleted}) + 
r.OnUploadProgress(pushsync.UploadProgressEvent{OID: "oid-2", Path: "b.bin", BytesSoFar: 0, TotalBytes: 100, Phase: pushsync.UploadProgressUploading}) r.OnUploadProgress(pushsync.UploadProgressEvent{OID: "oid-2", Path: "b.bin", BytesSoFar: 100, TotalBytes: 100, Phase: pushsync.UploadProgressCompleted}) r.Finish() got := out.String() - if !strings.Contains(got, "Uploading 2/2 files") { - t.Fatalf("expected final tty summary, got %q", got) + if !strings.Contains(got, "a.bin [============ ] 50.0% 50 B/100 B") { + t.Fatalf("expected first file uploading line, got %q", got) } - if !strings.Contains(got, "100.0%") { - t.Fatalf("expected 100%% completion, got %q", got) + if !strings.Contains(got, "b.bin [ ] 0.0% 0 B/100 B") { + t.Fatalf("expected second file pending line, got %q", got) + } + if !strings.Contains(got, "b.bin [========================] 100.0% 100 B/100 B") { + t.Fatalf("expected completed second file line, got %q", got) + } + if strings.Contains(got, "(uploading)") || strings.Contains(got, "(pending)") || strings.Contains(got, "(complete)") { + t.Fatalf("did not expect parenthesized state text, got %q", got) } if !strings.HasSuffix(got, "\n") { t.Fatalf("expected trailing newline, got %q", got) @@ -42,9 +51,9 @@ func TestUploadProgressRendererTTY(t *testing.T) { func TestUploadProgressRendererNonTTYThrottles(t *testing.T) { var out bytes.Buffer r := newUploadProgressRenderer(&out) - r.isTTY = false + r.base.SetTTY(false) now := time.Unix(0, 0) - r.now = func() time.Time { return now } + r.base.SetClock(func() time.Time { return now }) r.OnUploadPlan(pushsync.UploadPlanSummary{ Files: []pushsync.UploadPlanFile{{OID: "oid-1", Path: "a.bin", Bytes: 100}}, @@ -67,7 +76,102 @@ func TestUploadProgressRendererNonTTYThrottles(t *testing.T) { if strings.Count(got, "\n") < 2 { t.Fatalf("expected throttled summary updates, got %q", got) } - if !strings.Contains(got, "Uploading 1/1 files") { + if !strings.Contains(got, "a.bin [========================] 100.0% 100 B/100 
B") { t.Fatalf("expected non-tty progress summary, got %q", got) } + if strings.Contains(got, "1/1") || strings.Contains(got, "[*]") { + t.Fatalf("did not expect positional or completion prefix clutter, got %q", got) + } +} + +func TestUploadProgressRendererDoesNotShowFullCompletionBeforeCompleteEvent(t *testing.T) { + var out bytes.Buffer + r := newUploadProgressRenderer(&out) + r.base.SetTTY(true) + + total := int64(500 * 1024 * 1024) + r.OnUploadPlan(pushsync.UploadPlanSummary{ + Files: []pushsync.UploadPlanFile{{OID: "oid-1", Path: "large.bin", Bytes: total}}, + TotalFiles: 1, + TotalBytes: total, + }) + r.OnUploadProgress(pushsync.UploadProgressEvent{ + OID: "oid-1", + Path: "large.bin", + BytesSoFar: total, + TotalBytes: total, + Phase: pushsync.UploadProgressUploading, + }) + + got := out.String() + if !strings.Contains(got, "99.9%") { + t.Fatalf("expected in-flight upload to stay below 100%%, got %q", got) + } + if !strings.Contains(got, "<500.0 MiB/500.0 MiB") { + t.Fatalf("expected in-flight byte label to avoid full equality, got %q", got) + } + if strings.Contains(got, "100.0%") { + t.Fatalf("did not expect in-flight upload to render as 100%%, got %q", got) + } +} + +func TestUploadProgressRendererHadUploads(t *testing.T) { + var out bytes.Buffer + r := newUploadProgressRenderer(&out) + if r.HadUploads() { + t.Fatal("expected fresh renderer to report no uploads") + } + + r.OnUploadPlan(pushsync.UploadPlanSummary{ + Files: []pushsync.UploadPlanFile{{OID: "oid-1", Path: "a.bin", Bytes: 1}}, + TotalFiles: 1, + TotalBytes: 1, + }) + if !r.HadUploads() { + t.Fatal("expected renderer to report uploads after a non-empty plan") + } + + r.Finish() + if r.HadUploads() { + t.Fatal("expected renderer to reset after finish") + } +} + +func TestUploadProgressRendererConcurrentProgress(t *testing.T) { + var out bytes.Buffer + r := newUploadProgressRenderer(&out) + r.base.SetTTY(false) + + r.OnUploadPlan(pushsync.UploadPlanSummary{ + Files: []pushsync.UploadPlanFile{ + 
{OID: "oid-1", Path: "a.bin", Bytes: 100}, + {OID: "oid-2", Path: "b.bin", Bytes: 100}, + }, + TotalFiles: 2, + TotalBytes: 200, + }) + + events := []pushsync.UploadProgressEvent{ + {OID: "oid-1", Path: "a.bin", BytesSoFar: 10, BytesSinceLast: 10, TotalBytes: 100, Phase: pushsync.UploadProgressUploading}, + {OID: "oid-2", Path: "b.bin", BytesSoFar: 20, BytesSinceLast: 20, TotalBytes: 100, Phase: pushsync.UploadProgressUploading}, + {OID: "oid-1", Path: "a.bin", BytesSoFar: 100, TotalBytes: 100, Phase: pushsync.UploadProgressCompleted}, + {OID: "oid-2", Path: "b.bin", BytesSoFar: 100, TotalBytes: 100, Phase: pushsync.UploadProgressCompleted}, + } + + var wg sync.WaitGroup + wg.Add(len(events)) + for _, ev := range events { + ev := ev + go func() { + defer wg.Done() + r.OnUploadProgress(ev) + }() + } + wg.Wait() + r.Finish() + + got := out.String() + if !strings.Contains(got, "a.bin") || !strings.Contains(got, "b.bin") { + t.Fatalf("expected both files in concurrent progress output, got %q", got) + } } diff --git a/cmd/remote/add/local_test.go b/cmd/remote/add/local_test.go index 67ba39e1..1f5c42a8 100644 --- a/cmd/remote/add/local_test.go +++ b/cmd/remote/add/local_test.go @@ -76,6 +76,20 @@ func TestLocalRemoteAddEnsuresInitialization(t *testing.T) { if filterProcess != "git-drs filter" { t.Fatalf("unexpected filter.drs.process: %q", filterProcess) } + filterClean, err := gitrepo.GetGitConfigString("filter.drs.clean") + if err != nil { + t.Fatalf("GetGitConfigString(filter.drs.clean): %v", err) + } + if filterClean != "git-drs clean -- %f" { + t.Fatalf("unexpected filter.drs.clean: %q", filterClean) + } + filterSmudge, err := gitrepo.GetGitConfigString("filter.drs.smudge") + if err != nil { + t.Fatalf("GetGitConfigString(filter.drs.smudge): %v", err) + } + if filterSmudge != "git-drs smudge -- %f" { + t.Fatalf("unexpected filter.drs.smudge: %q", filterSmudge) + } preCommit, err := os.ReadFile(filepath.Join(".git", "hooks", "pre-commit")) if err != nil { diff --git 
a/coverage/combined.html b/coverage/combined.html index f00491d3..bcd8b776 100644 --- a/coverage/combined.html +++ b/coverage/combined.html @@ -61,109 +61,135 @@ - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -526,7 +552,7 @@ "os" "path/filepath" - "github.com/calypr/git-drs/internal/cloud" + sycloud "github.com/calypr/syfon/client/cloud" "github.com/spf13/cobra" ) @@ -577,7 +603,7 @@ // printResolvedInfo writes a human-readable summary of resolved Git/LFS and // cloud object information to the command's stdout for user confirmation. -func printResolvedInfo(cmd *cobra.Command, gitCommonDir, lfsRoot string, objectInfo *cloud.ObjectInfo, pathArg string, isTracked bool, sha256 string) error { +func printResolvedInfo(cmd *cobra.Command, gitCommonDir, lfsRoot string, objectInfo *sycloud.ObjectInfo, pathArg string, isTracked bool, sha256 string) error { if _, err := fmt.Fprintf(cmd.OutOrStdout(), ` Resolved Git LFS Object Info ---------------------------- @@ -651,29 +677,34 @@ // NewCommand constructs the Cobra command for the `add-url` subcommand, // wiring usage, argument validation and the RunE handler. -func NewCommand() *cobra.Command { +func NewCommand() *cobra.Command { cmd := &cobra.Command{ - Use: "add-url <cloud-url> [path]", - Short: "Add a file to the Git DRS repo using a cloud object URL", + Use: "add-url <object-url-or-key> [path]", + Short: "Add a file from a provider URL or configured bucket object key", Args: func(cmd *cobra.Command, args []string) error { if len(args) < 1 || len(args) > 2 { - return errors.New("usage: add-url <cloud-url> [path]") + return errors.New("usage: add-url <object-url-or-key> [path]") } return nil }, RunE: runAddURL, } - addFlags(cmd) + addFlags(cmd) return cmd } // addFlags registers optional expected SHA256 checksum. 
-func addFlags(cmd *cobra.Command) { +func addFlags(cmd *cobra.Command) { cmd.Flags().String( "sha256", "", "Expected SHA256 checksum (optional)", ) + cmd.Flags().String( + "scheme", + "", + "Storage scheme for object-key mode (for example: s3 or gs)", + ) } // runAddURL is the Cobra RunE wrapper that delegates execution to the service. @@ -688,72 +719,132 @@ "fmt" "net/url" "os" + "path" "strings" - "github.com/calypr/git-drs/internal/cloud" + "github.com/calypr/git-drs/internal/gitrepo" + sycloud "github.com/calypr/syfon/client/cloud" "github.com/spf13/cobra" ) // addURLInput holds the parsed CLI state for the add-url command. type addURLInput struct { - objectURL string - path string - sha256 string - objectParams cloud.ObjectParameters + sourceArg string + objectURL string + path string + sha256 string + scheme string } -// parseAddURLInput parses CLI args and flags into an addURLInput and constructs -// cloud.ObjectParameters for metadata inspection. -func parseAddURLInput(cmd *cobra.Command, args []string) (addURLInput, error) { - objectURL := args[0] +// parseAddURLInput parses CLI args and flags into an addURLInput. 
+func parseAddURLInput(cmd *cobra.Command, args []string) (addURLInput, error) { + sourceArg := strings.TrimSpace(args[0]) - pathArg, err := resolvePathArg(objectURL, args) + pathArg, err := resolvePathArg(sourceArg, args) if err != nil { return addURLInput{}, err } - sha256Param, err := cmd.Flags().GetString("sha256") + sha256Param, err := cmd.Flags().GetString("sha256") if err != nil { return addURLInput{}, fmt.Errorf("read flag sha256: %w", err) } + scheme, err := cmd.Flags().GetString("scheme") + if err != nil { + return addURLInput{}, fmt.Errorf("read flag scheme: %w", err) + } - return addURLInput{ - objectURL: objectURL, + return addURLInput{ + sourceArg: sourceArg, path: pathArg, sha256: sha256Param, - objectParams: cloud.ObjectParameters{ - ObjectURL: objectURL, - S3Region: firstNonEmpty(os.Getenv("AWS_REGION"), os.Getenv("AWS_DEFAULT_REGION"), os.Getenv("TEST_BUCKET_REGION")), - S3Endpoint: firstNonEmpty(os.Getenv("AWS_ENDPOINT_URL_S3"), os.Getenv("AWS_ENDPOINT_URL"), os.Getenv("TEST_BUCKET_ENDPOINT")), - S3AccessKey: firstNonEmpty(os.Getenv("AWS_ACCESS_KEY_ID"), os.Getenv("TEST_BUCKET_ACCESS_KEY")), - S3SecretKey: firstNonEmpty(os.Getenv("AWS_SECRET_ACCESS_KEY"), os.Getenv("TEST_BUCKET_SECRET_KEY")), - SHA256: sha256Param, - DestinationPath: pathArg, - }, + scheme: strings.ToLower(strings.TrimSpace(scheme)), }, nil } // resolvePathArg returns the explicit destination path argument when provided, -// otherwise derives the worktree path from the given cloud URL path component. -func resolvePathArg(objectURL string, args []string) (string, error) { +// otherwise derives the worktree path from the given cloud URL or object key. 
+func resolvePathArg(sourceArg string, args []string) (string, error) { if len(args) == 2 { return args[1], nil } - u, err := url.Parse(objectURL) + if looksLikeCloudURL(sourceArg) { + u, err := url.Parse(sourceArg) + if err != nil { + return "", err + } + return strings.TrimPrefix(u.Path, "/"), nil + } + return strings.Trim(strings.TrimSpace(sourceArg), "/"), nil +} + +func buildObjectParameters(objectURL, pathArg, sha256 string) sycloud.ObjectParameters { + return sycloud.ObjectParameters{ + ObjectURL: objectURL, + S3Region: firstNonEmpty(os.Getenv("AWS_REGION"), os.Getenv("AWS_DEFAULT_REGION"), os.Getenv("TEST_BUCKET_REGION")), + S3Endpoint: firstNonEmpty(os.Getenv("AWS_ENDPOINT_URL_S3"), os.Getenv("AWS_ENDPOINT_URL"), os.Getenv("TEST_BUCKET_ENDPOINT")), + S3AccessKey: firstNonEmpty(os.Getenv("AWS_ACCESS_KEY_ID"), os.Getenv("TEST_BUCKET_ACCESS_KEY")), + S3SecretKey: firstNonEmpty(os.Getenv("AWS_SECRET_ACCESS_KEY"), os.Getenv("TEST_BUCKET_SECRET_KEY")), + SHA256: sha256, + DestinationPath: pathArg, + } +} + +func looksLikeCloudURL(raw string) bool { + u, err := url.Parse(strings.TrimSpace(raw)) if err != nil { - return "", err + return false + } + if strings.TrimSpace(u.Scheme) == "" { + return false + } + switch strings.ToLower(strings.TrimSpace(u.Scheme)) { + case "s3", "gs", "gcs", "azblob", "http", "https": + return strings.TrimSpace(u.Host) != "" + default: + return false + } +} + +func resolveObjectURL(input addURLInput, scope gitrepo.ResolvedBucketScope) (string, error) { + if looksLikeCloudURL(input.sourceArg) { + return input.sourceArg, nil } - return strings.TrimPrefix(u.Path, "/"), nil + if input.scheme == "" { + return "", fmt.Errorf("object key mode requires --scheme because local bucket mappings store bucket/prefix but not provider scheme") + } + key := joinObjectKey(scope.Prefix, input.sourceArg) + switch input.scheme { + case "s3": + return fmt.Sprintf("s3://%s/%s", scope.Bucket, key), nil + case "gs", "gcs": + return fmt.Sprintf("gs://%s/%s", 
scope.Bucket, key), nil + case "azblob", "az": + return "", fmt.Errorf("object key mode for Azure requires a full azblob:// URL because the local mapping does not store account_name") + default: + return "", fmt.Errorf("unsupported --scheme %q (expected s3 or gs, or pass a full object URL)", input.scheme) + } +} + +func joinObjectKey(prefix, key string) string { + parts := make([]string, 0, 2) + if p := strings.Trim(strings.TrimSpace(prefix), "/"); p != "" { + parts = append(parts, p) + } + if k := strings.Trim(strings.TrimSpace(key), "/"); k != "" { + parts = append(parts, k) + } + return path.Join(parts...) } -func firstNonEmpty(values ...string) string { - for _, v := range values { +func firstNonEmpty(values ...string) string { + for _, v := range values { v = strings.TrimSpace(v) - if v != "" { + if v != "" { return v } } - return "" + return "" } @@ -790,16 +881,20 @@ import ( "context" + "crypto/sha256" "fmt" "log/slog" - "os" + "strings" - "github.com/calypr/git-drs/internal/cloud" "github.com/calypr/git-drs/internal/common" "github.com/calypr/git-drs/internal/config" "github.com/calypr/git-drs/internal/drslog" - "github.com/calypr/git-drs/internal/drsmap" + "github.com/calypr/git-drs/internal/drsobject" + "github.com/calypr/git-drs/internal/drstrack" "github.com/calypr/git-drs/internal/lfs" + drsapi "github.com/calypr/syfon/apigen/client/drs" + sycloud "github.com/calypr/syfon/client/cloud" + "github.com/google/uuid" "github.com/spf13/cobra" ) @@ -807,7 +902,7 @@ // behavior (logger factory, object inspection, LFS helpers, config loader, etc.). 
type AddURLService struct { newLogger func(string, bool) (*slog.Logger, error) - inspectObject func(ctx context.Context, input cloud.ObjectParameters) (*cloud.ObjectInfo, error) + inspectObject func(ctx context.Context, input sycloud.ObjectParameters) (*sycloud.ObjectInfo, error) isLFSTracked func(path string) (bool, error) getGitRoots func(ctx context.Context) (string, string, error) gitLFSTrack func(ctx context.Context, path string) (bool, error) @@ -816,131 +911,186 @@ // NewAddURLService constructs an AddURLService populated with production // implementations of its dependencies. -func NewAddURLService() *AddURLService { +func NewAddURLService() *AddURLService { return &AddURLService{ newLogger: drslog.NewLogger, - inspectObject: cloud.InspectObjectForLFS, + inspectObject: sycloud.InspectObject, isLFSTracked: lfs.IsLFSTracked, getGitRoots: lfs.GetGitRootDirectories, - gitLFSTrack: lfs.GitLFSTrackReadOnly, + gitLFSTrack: drstrack.TrackReadOnly, loadConfig: config.LoadConfig, } } -// Run executes the add-url workflow: parse CLI input, inspect the cloud object, +// Run executes the add-url workflow: parse CLI input, resolve the target bucket +// scope, inspect the provider object through the client-owned cloud package, // ensure the LFS object exists in local storage, write a pointer file, update // the pre-commit cache (best-effort), optionally add a tracking entry, and // record the DRS mapping. 
-func (s *AddURLService) Run(cmd *cobra.Command, args []string) error { +func (s *AddURLService) Run(cmd *cobra.Command, args []string) error { ctx := cmd.Context() - if ctx == nil { + if ctx == nil { ctx = context.Background() } - logger, err := s.newLogger("", false) + logger, err := s.newLogger("", false) if err != nil { return fmt.Errorf("error creating logger: %v", err) } - input, err := parseAddURLInput(cmd, args) + input, err := parseAddURLInput(cmd, args) if err != nil { return err } - objectInfo, err := s.inspectObject(ctx, input.objectParams) + cfg, err := s.loadConfig() if err != nil { - return err + return fmt.Errorf("error getting config: %v", err) } - isTracked, err := s.isLFSTracked(input.path) + remote, err := cfg.GetDefaultRemote() if err != nil { - return fmt.Errorf("check LFS tracking for %s: %w", input.path, err) + return err } - gitCommonDir, lfsRoot, err := s.getGitRoots(ctx) - if err != nil { - return fmt.Errorf("get git root directories: %w", err) + remoteConfig := cfg.GetRemote(remote) + if remoteConfig == nil { + return fmt.Errorf("error getting remote configuration for %s", remote) } - if err := printResolvedInfo(cmd, gitCommonDir, lfsRoot, objectInfo, input.path, isTracked, input.sha256); err != nil { + org, project, scope, err := resolveTargetScope(remoteConfig) + if err != nil { return err } - oid, err := s.ensureLFSObject(ctx, objectInfo, input, lfsRoot) + input.objectURL, err = resolveObjectURL(input, scope) if err != nil { return err } - if err := writePointerFile(input.path, oid, objectInfo.SizeBytes); err != nil { + objectInfo, err := s.inspectObject(ctx, buildObjectParameters(input.objectURL, input.path, input.sha256)) + if err != nil { return err } - if err := updatePrecommitCache(ctx, logger, input.path, oid, input.objectURL); err != nil { - logger.Warn("pre-commit cache update skipped", "error", err) + isTracked, err := s.isLFSTracked(input.path) + if err != nil { + return fmt.Errorf("check LFS tracking for %s: %w", 
input.path, err) + } + + gitCommonDir, lfsRoot, err := s.getGitRoots(ctx) + if err != nil { + return fmt.Errorf("get git root directories: %w", err) } - if err := maybeTrackLFS(ctx, s.gitLFSTrack, input.path, isTracked); err != nil { + if err := printResolvedInfo(cmd, gitCommonDir, lfsRoot, objectInfo, input.path, isTracked, input.sha256); err != nil { return err } - cfg, err := s.loadConfig() + oid, err := s.ensureLFSObject(ctx, objectInfo, input, lfsRoot) if err != nil { - return fmt.Errorf("error getting config: %v", err) + return err } - remote, err := cfg.GetDefaultRemote() - if err != nil { + if err := writePointerFile(input.path, oid, objectInfo.SizeBytes); err != nil { return err } - remoteConfig := cfg.GetRemote(remote) - if remoteConfig == nil { - return fmt.Errorf("error getting remote configuration for %s", remote) + if err := updatePrecommitCache(ctx, logger, input.path, oid, input.objectURL); err != nil { + logger.Warn("pre-commit cache update skipped", "error", err) } - org, project, scope, err := resolveTargetScope(remoteConfig) - if err != nil { + if err := maybeTrackLFS(ctx, s.gitLFSTrack, input.path, isTracked); err != nil { return err } - builder := common.NewObjectBuilder(scope.Bucket, project) + builder := drsobject.NewBuilder(scope.Bucket, project) builder.Organization = org builder.StoragePrefix = scope.Prefix - file := lfs.LfsFileInfo{ + file := addURLDrsFile{ Name: input.path, Size: objectInfo.SizeBytes, Oid: oid, } - if _, err := drsmap.WriteDrsFile(builder, file, &input.objectURL); err != nil { - return fmt.Errorf("error WriteDrsFile: %v", err) + if _, err := writeAddURLDrsObject(builder, file, input.objectURL); err != nil { + return fmt.Errorf("write local DRS object: %w", err) } - return nil + return nil +} + +type addURLDrsFile struct { + Name string + Size int64 + Oid string +} + +func writeAddURLDrsObject(builder drsobject.Builder, file addURLDrsFile, objectPath string) (*drsapi.DrsObject, error) { + existing, err := 
drsobject.ReadObject(common.DRS_OBJS_PATH, file.Oid) + var drsObj *drsapi.DrsObject + if err == nil && existing != nil { + drsObj = existing + name := file.Name + drsObj.Name = &name + drsObj.Size = file.Size + } else { + drsID := uuid.NewSHA1(drsobject.UUIDNamespace, []byte(fmt.Sprintf("%s:%s", builder.Project, drsobject.NormalizeOid(file.Oid)))).String() + drsObj, err = builder.Build(file.Name, file.Oid, file.Size, drsID) + if err != nil { + return nil, fmt.Errorf("error building DRS object for oid %s: %w", file.Oid, err) + } + } + + if objectPath != "" { + if drsObj.AccessMethods != nil && len(*drsObj.AccessMethods) > 0 { + am := &(*drsObj.AccessMethods)[0] + am.AccessUrl = &struct { + Headers *[]string `json:"headers,omitempty"` + Url string `json:"url"` + }{Url: objectPath} + } else { + drsObj.AccessMethods = &[]drsapi.AccessMethod{{ + Type: drsapi.AccessMethodTypeS3, + AccessUrl: &struct { + Headers *[]string `json:"headers,omitempty"` + Url string `json:"url"` + }{Url: objectPath}, + }} + } + } + + if err := drsobject.WriteObject(common.DRS_OBJS_PATH, drsObj, file.Oid); err != nil { + return nil, fmt.Errorf("error writing DRS object for oid %s: %w", file.Oid, err) + } + return drsObj, nil } -// ensureLFSObject ensures the LFS object identified by objectInfo exists in the -// repository's LFS storage. If SHA256 is provided, it is trusted and returned. -// Otherwise we create a sentinel object and synthetic OID derived from ETag, -// deferring true checksum validation to first real data use. -func (s *AddURLService) ensureLFSObject(ctx context.Context, objectInfo *cloud.ObjectInfo, input addURLInput, lfsRoot string) (string, error) { +// ensureLFSObject returns the LFS pointer OID to use for the add-url target. +// If SHA256 is provided, it is trusted and returned. Otherwise we derive a +// deterministic placeholder OID from provider identity without writing any +// local LFS object payload. 
+func (s *AddURLService) ensureLFSObject(ctx context.Context, objectInfo *sycloud.ObjectInfo, input addURLInput, lfsRoot string) (string, error) { _ = ctx + _ = lfsRoot if input.sha256 != "" { return input.sha256, nil } - oid, err := lfs.SyntheticOIDFromETag(objectInfo.ETag) - if err != nil { - return "", err - } - objPath, err := lfs.WriteAddURLSentinelObject(lfsRoot, oid, objectInfo.ETag, input.objectURL) - if err != nil { - return "", err + return placeholderOIDForUnknownSHA(objectInfo.ETag, input.objectURL) +} + +func placeholderOIDForUnknownSHA(etag string, sourceURL string) (string, error) { + e := strings.TrimSpace(strings.Trim(etag, `"`)) + src := strings.TrimSpace(sourceURL) + if e == "" { + return "", fmt.Errorf("etag is required for placeholder oid") } - if _, err := fmt.Fprintf(os.Stderr, "Added add-url sentinel object at %s\n", objPath); err != nil { - return "", fmt.Errorf("stderr write: %w", err) + if src == "" { + return "", fmt.Errorf("source URL is required for placeholder oid") } - return oid, nil + sum := sha256.Sum256([]byte("git-drs-add-url-placeholder:v2\netag=" + e + "\nsource=" + src + "\n")) + return fmt.Sprintf("%x", sum[:]), nil } @@ -957,11 +1107,11 @@ "strings" "time" - gitauth "github.com/calypr/git-drs/internal/auth" + "github.com/calypr/data-client/credentials" "github.com/calypr/git-drs/internal/common" "github.com/calypr/git-drs/internal/drslog" "github.com/calypr/git-drs/internal/gitrepo" - "github.com/calypr/syfon/client/conf" + conf "github.com/calypr/syfon/client/config" "github.com/spf13/cobra" ) @@ -1195,7 +1345,7 @@ if prof, err := configure.Load(remoteName); err == nil { token = strings.TrimSpace(prof.AccessToken) if token == "" { - if ensureErr := gitauth.EnsureValidCredential(context.Background(), prof, drslog.GetLogger()); ensureErr == nil { + if ensureErr := credentials.EnsureValidCredential(context.Background(), prof, drslog.GetLogger()); ensureErr == nil { _ = configure.Save(prof) token = 
strings.TrimSpace(prof.AccessToken) } @@ -1326,135 +1476,487 @@ } -