Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
70 changes: 69 additions & 1 deletion .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -70,11 +70,79 @@ jobs:
uses: prefix-dev/setup-pixi@v0.8.3

- name: Run tests via pixi tasks
  # pixi.toml defines both a `default` and an `s3` environment, so name the
  # environment explicitly: this step runs the regular (non-S3) test task.
  run: pixi run -e default test

- name: Build conda package
run: pixi build

- name: Test R package
working-directory: R/package
run: pixi run test

# ─── S3 integration tests (MinIO) ──────────────────────────────────────────
# Uses MinIO to emulate S3 — no real AWS account or license required.
# Runs on every push/PR automatically.
test-s3:
name: S3 integration tests (MinIO)
runs-on: ubuntu-latest

env:
AWS_ENDPOINT_URL: http://localhost:9000
AWS_DEFAULT_REGION: us-east-1
AWS_ACCESS_KEY_ID: minioadmin
AWS_SECRET_ACCESS_KEY: minioadmin
# example.16bits.bgen has 199 variants, 500 samples
BGEN_S3_EXPECTED_VARIANTS: "199"
BGEN_S3_EXPECTED_SAMPLES: "500"
BGEN_S3_TEST_BUCKET: bgen-test

steps:
- uses: actions/checkout@v4

- name: Start MinIO
  run: |
    docker run -d --name minio \
      -p 9000:9000 \
      -e MINIO_ROOT_USER=minioadmin \
      -e MINIO_ROOT_PASSWORD=minioadmin \
      quay.io/minio/minio:latest server /data
    # Poll the liveness endpoint for up to 60s (30 attempts, 2s apart).
    ready=""
    for attempt in $(seq 1 30); do
      if curl -sf http://localhost:9000/minio/health/live; then
        ready=1
        echo "MinIO ready"
        break
      fi
      sleep 2
    done
    # Fail this step explicitly on timeout instead of letting later steps
    # fail against a server that never came up; dump the container log to
    # make the cause visible.
    if [ -z "$ready" ]; then
      echo "MinIO did not become healthy within 60s" >&2
      docker logs minio >&2 || true
      exit 1
    fi

- name: Set up pixi
uses: prefix-dev/setup-pixi@v0.8.3

# Create two buckets:
# bgen-test — authenticated access (default credentials)
# bgen-public — anonymous access (bucket policy allows public reads)
- name: Create bucket and upload example BGEN files
run: |
# awscli is pre-installed on ubuntu-latest runners
aws s3 mb "s3://${BGEN_S3_TEST_BUCKET}" --region us-east-1

KEY="ci-test/example.16bits.bgen"
aws s3 cp example/example.16bits.bgen "s3://${BGEN_S3_TEST_BUCKET}/${KEY}"

# Public-access bucket: set a bucket policy allowing anonymous GetObject/HeadObject.
# MinIO (and S3) honour this without requiring per-object ACLs.
aws s3 mb "s3://bgen-public" --region us-east-1
PUBLIC_KEY="example.16bits.bgen"
aws s3 cp example/example.16bits.bgen "s3://bgen-public/${PUBLIC_KEY}"
aws s3api put-bucket-policy --bucket bgen-public --policy '{
"Version": "2012-10-17",
"Statement": [{
"Effect": "Allow",
"Principal": "*",
"Action": ["s3:GetObject"],
"Resource": ["arn:aws:s3:::bgen-public/*"]
}]
}'

echo "BGEN_S3_TEST_URI=s3://${BGEN_S3_TEST_BUCKET}/${KEY}" >> "$GITHUB_ENV"
echo "BGEN_S3_PUBLIC_TEST_URI=s3://bgen-public/${PUBLIC_KEY}" >> "$GITHUB_ENV"

- name: Build S3-enabled bgen and run integration tests
run: pixi run -e s3 --frozen test-s3
17 changes: 16 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ project(bgen VERSION 1.1.7 LANGUAGES C CXX)
option(BGEN_BUILD_EXAMPLES "Build example programs" OFF)
option(BGEN_BUILD_TESTS "Build tests" ON)
option(BGEN_BUILD_TOOLS "Build command-line tools (bgenix, cat-bgen, edit-bgen)" ON)
option(BGEN_WITH_S3 "Enable AWS S3 remote file support" OFF)
option(BUILD_SHARED_LIBS "Build shared libraries" OFF)

# ─── Language standards ──────────────────────────────────────────────────────
Expand Down Expand Up @@ -34,6 +35,11 @@ find_package(Boost REQUIRED COMPONENTS
chrono
)

# ─── Optional: AWS S3 support ────────────────────────────────────────────────
# The AWS SDK for C++ (s3 component) is looked up, and is then mandatory,
# only when the BGEN_WITH_S3 option is switched on.
if(BGEN_WITH_S3)
  find_package(AWSSDK REQUIRED COMPONENTS s3)
endif()

# ─── Generated version header ───────────────────────────────────────────────
configure_file(
"${CMAKE_CURRENT_SOURCE_DIR}/cmake/bgen_revision.hpp.in"
Expand All @@ -46,13 +52,18 @@ add_subdirectory(db)
add_subdirectory(appcontext)

# ─── Core bgen library ───────────────────────────────────────────────────────
# Sources that are always part of the library are listed explicitly.
add_library(bgen
  src/bgen.cpp
  src/IndexQuery.cpp
  src/MissingValue.cpp
  src/View.cpp
  src/zlib.cpp
)
# The S3 stream buffer is compiled in only for S3-enabled builds. It is
# attached with target_sources() so no single-use source-list variable is
# needed.
if(BGEN_WITH_S3)
  target_sources(bgen PRIVATE src/S3StreamBuf.cpp)
endif()

# Namespaced alias so in-tree consumers can link bgen::bgen.
add_library(bgen::bgen ALIAS bgen)

target_include_directories(bgen
Expand All @@ -63,6 +74,9 @@ target_include_directories(bgen
${CMAKE_CURRENT_BINARY_DIR}
)
target_compile_features(bgen PUBLIC cxx_std_17)
# Define BGEN_WITH_S3=1 for bgen and, because it is PUBLIC, for everything
# that links against it. The generator expression expands to nothing when the
# option is off, so no definition is added in that case.
target_compile_definitions(bgen PUBLIC
  $<$<BOOL:${BGEN_WITH_S3}>:BGEN_WITH_S3=1>
)
target_link_libraries(bgen
PUBLIC
ZLIB::ZLIB
Expand All @@ -72,6 +86,7 @@ target_link_libraries(bgen
Boost::filesystem
PRIVATE
bgen::db
# Quoted on purpose: AWSSDK_LINK_LIBRARIES can hold several ';'-separated
# libraries, and an unquoted generator expression would be split into
# separate (broken) arguments at each ';'.
"$<$<BOOL:${BGEN_WITH_S3}>:${AWSSDK_LINK_LIBRARIES}>"
)

# ─── Command-line tools ──────────────────────────────────────────────────────
Expand Down
84 changes: 82 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
| `std::random_shuffle` | used in tests | replaced with `std::shuffle` (C++17) |
| CMake package config | none | `find_package(bgen)` works for downstream projects |
| R package | bundled in build dir | self-contained in `R/package/` with its own `pixi.toml` |
| Remote files | local filesystem only | **AWS S3** support via range requests (`s3://…`) |

---

Expand All @@ -31,6 +32,7 @@
- **[edit-bgen](https://bitbucket.org/gavinband/bgen/wiki/edit-bgen)** — edit BGEN file metadata
- **[rbgen](R/package/)** — R package (separate pixi environment, not bundled in the conda package)
- **[Example programs](example/)** — `bgen_to_vcf`, `count_alleles`, etc.
- **[AWS S3 support](#aws-s3-support)** — read BGEN files directly from S3 using `s3://bucket/key` URIs

---

Expand Down Expand Up @@ -62,6 +64,12 @@ cmake --build build --parallel
ctest --test-dir build --output-on-failure
```

To also enable S3 support, add `-DBGEN_WITH_S3=ON` and make sure CMake can find the AWS SDK for C++ (for example via `CMAKE_PREFIX_PATH`):

```bash
cmake -S . -B build -G Ninja -DCMAKE_BUILD_TYPE=Release -DBGEN_WITH_S3=ON
```

Install to a prefix:

```bash
Expand All @@ -78,6 +86,77 @@ target_link_libraries(my_target PRIVATE bgen::bgen)

---

## AWS S3 support

The library can read BGEN files directly from AWS S3 without downloading them first.
It uses [HTTP range requests](https://developer.mozilla.org/en-US/docs/Web/HTTP/Range_requests) via the
[AWS SDK for C++](https://github.com/aws/aws-sdk-cpp), so only the blocks actually needed are fetched.

### Enabling

S3 support is opt-in. Pass `-DBGEN_WITH_S3=ON` to CMake and make sure the AWS SDK is on your `CMAKE_PREFIX_PATH`:

```bash
# With pixi (installs aws-sdk-cpp automatically):
pixi run -e s3 configure
pixi run -e s3 build

# Or with CMake directly (requires aws-sdk-cpp on the prefix path):
cmake -S . -B build -G Ninja \
-DCMAKE_BUILD_TYPE=Release \
-DBGEN_WITH_S3=ON
cmake --build build --parallel
```

### Usage

Pass an `s3://bucket/key` URI anywhere a filename is accepted:

```cpp
// C++ API
auto view = genfile::bgen::View::create("s3://my-bucket/cohort.bgen");
while (view->read_variant(&snpid, &rsid, &chr, &pos, &alleles)) {
view->read_genotype_data_block(setter);
}
```

```bash
# Command-line tools
bgenix -g s3://my-bucket/cohort.bgen -list
cat-bgen -g s3://my-bucket/part1.bgen s3://my-bucket/part2.bgen -og merged.bgen
```

### Authentication

Credentials are resolved by the AWS SDK's default provider chain in this order:

1. Environment variables — `AWS_ACCESS_KEY_ID` / `AWS_SECRET_ACCESS_KEY` / `AWS_SESSION_TOKEN`
2. `~/.aws/credentials` and `~/.aws/config`
3. EC2/ECS/EKS instance metadata

The AWS region is picked up from `AWS_DEFAULT_REGION` or `~/.aws/config`.
You can also set it programmatically when constructing a stream directly:

```cpp
#include "genfile/bgen/S3StreamBuf.hpp"
auto stream = genfile::bgen::make_s3_istream("s3://my-bucket/cohort.bgen", "eu-west-1");
```

### Tuning

The default read block size is **1 MiB** (1024 × 1024 bytes). For high-latency connections or very large genotype blocks,
construct an `S3StreamBuf` directly with a larger block size:

```cpp
auto buf = std::make_unique<genfile::bgen::S3StreamBuf>(
"my-bucket", "cohort.bgen",
/* region = */ "us-east-1",
/* block_size = */ 8 * 1024 * 1024 // 8 MB
);
```

---

## R package

The R package lives in [`R/package/`](R/package/) and has its own pixi environment:
Expand All @@ -103,5 +182,6 @@ If you use this library, its tools, or example programs, please cite the origina
Released under the [Boost Software License v1.0](LICENSE_1_0.txt) — a permissive open-source license compatible with many others.

This repository also uses [SQLite](https://www.sqlite.org/copyright.html) (public domain),
[Boost](https://www.boost.org/users/license.html) (Boost Software License),
[zstandard](https://github.com/facebook/zstd/blob/dev/LICENSE) (BSD), and optionally the
[AWS SDK for C++](https://github.com/aws/aws-sdk-cpp/blob/main/LICENSE) (Apache 2.0, only when built with `-DBGEN_WITH_S3=ON`).
4 changes: 4 additions & 0 deletions cmake/bgenConfig.cmake.in
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,10 @@ endif()
cmake_policy(SET CMP0167 NEW)
find_dependency(Boost COMPONENTS filesystem thread date_time timer chrono)

# @BGEN_WITH_S3@ is replaced with the build's option value when this template
# is configured, so consumers are asked for the AWS SDK only when bgen itself
# was built with S3 support.
if(@BGEN_WITH_S3@)
  find_dependency(AWSSDK COMPONENTS s3)
endif()

include("${CMAKE_CURRENT_LIST_DIR}/bgenTargets.cmake")

check_required_components(bgen)
78 changes: 78 additions & 0 deletions genfile/include/genfile/bgen/S3StreamBuf.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
// Copyright Julianus Pfeuffer 2024.
// Distributed under the Boost Software License, Version 1.0.
// (See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt)

#ifndef GENFILE_BGEN_S3STREAMBUF_HPP
#define GENFILE_BGEN_S3STREAMBUF_HPP

#include <streambuf>
#include <istream>
#include <memory>
#include <string>
#include <vector>
#include <cstdint>

namespace genfile {
namespace bgen {

/// Parse an S3 URI of the form s3://bucket/key into its components.
/// @param uri     The URI string to parse.
/// @param bucket  Output parameter that receives the bucket component.
/// @param key     Output parameter that receives the key component.
/// @return true if the URI was successfully parsed.
/// NOTE(review): what bucket/key contain after a failed parse is not stated
/// in this header — check the definition before relying on them.
bool parse_s3_uri(std::string const& uri, std::string& bucket, std::string& key);

/// Returns true if the given filename looks like an S3 URI (starts with "s3://").
bool is_s3_uri(std::string const& filename);

/// A std::streambuf implementation that reads from AWS S3 using range requests.
/// Supports seeking (seekg) and buffered block-based reads for efficiency.
///
/// Only standard-library headers are included by this header; Impl is merely
/// forward-declared and owned through m_impl.
/// NOTE(review): presumably Impl wraps the AWS SDK client state — confirm in
/// the implementation file.
class S3StreamBuf : public std::streambuf {
public:
/// Construct an S3StreamBuf for the given bucket and key.
/// block_size controls the granularity of range GET requests (default 1 MB).
/// @param bucket      Name of the S3 bucket.
/// @param key         Key of the object within the bucket.
/// @param region      AWS region; defaults to the empty string.
///                    NOTE(review): presumably an empty region means "use the
///                    SDK's own region resolution" — confirm in the definition.
/// @param block_size  Range-request granularity in bytes (default 1024 * 1024).
S3StreamBuf(std::string const& bucket, std::string const& key,
std::string const& region = "",
std::size_t block_size = 1024 * 1024);

// Declared here and defined out of line: std::unique_ptr<Impl> requires Impl
// to be a complete type at the point where the destructor is defined.
~S3StreamBuf() override;

/// Get the total size of the S3 object in bytes.
std::int64_t object_size() const;

protected:
// std::streambuf overrides
// underflow: hook std::streambuf calls when the get area has no characters left.
int_type underflow() override;
// seekoff / seekpos: hooks behind pubseekoff() / pubseekpos(), which
// std::istream::seekg and tellg use for relative and absolute positioning.
pos_type seekoff(off_type off, std::ios_base::seekdir dir,
std::ios_base::openmode which = std::ios_base::in) override;
pos_type seekpos(pos_type pos,
std::ios_base::openmode which = std::ios_base::in) override;
// showmanyc: hook behind in_avail().
std::streamsize showmanyc() override;

private:
// NOTE(review): presumably loads block number block_index into m_buffer and
// records it in m_buffer_block_index — confirm in the definition.
void fetch_block(std::int64_t block_index);
// NOTE(review): presumably the logical read offset within the object — confirm.
std::int64_t current_position() const;

// Forward-declared implementation type; defined outside this header.
struct Impl;
std::unique_ptr<Impl> m_impl;

std::string m_bucket;
std::string m_key;
std::size_t m_block_size;
std::int64_t m_object_size;
std::int64_t m_position; // logical position in the stream

// Buffer for the currently loaded block
std::vector<char> m_buffer;
std::int64_t m_buffer_block_index; // which block is loaded (-1 = none)
};

/// Create an std::istream that reads from an S3 URI.
/// The URI must be of the form s3://bucket/key.
/// Optionally specify the AWS region (otherwise uses SDK defaults/env).
/// @param s3_uri  URI of the object to read, in the form s3://bucket/key.
/// @param region  AWS region; defaults to the empty string.
/// @return A uniquely-owned std::istream for the object.
/// NOTE(review): how a malformed URI or a failed request is reported
/// (exception vs. null pointer) is not visible in this header — confirm.
std::unique_ptr<std::istream> make_s3_istream(
std::string const& s3_uri,
std::string const& region = "");

} // namespace bgen
} // namespace genfile

#endif
2 changes: 1 addition & 1 deletion genfile/include/genfile/zlib.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ namespace genfile {
void zstd_uncompress( byte_t const* begin, byte_t const* const end, std::vector< T >* dest ) {
std::size_t const source_size = ( end - begin ) ;
std::size_t const dest_size = dest->size() * sizeof( T ) ;
std::size_t const uncompressed_size = ZSTD_getDecompressedSize( reinterpret_cast< void const* >( begin ), source_size ) ;
std::size_t const uncompressed_size = ZSTD_getFrameContentSize( reinterpret_cast< void const* >( begin ), source_size ) ;
std::size_t const result = ZSTD_decompress(
reinterpret_cast< void* >( &dest->operator[]( 0 ) ),
dest_size,
Expand Down
11 changes: 11 additions & 0 deletions pixi.toml
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,17 @@ configure = "cmake -S . -B build -G Ninja -DCMAKE_BUILD_TYPE=Release -DCMAKE_PRE
build = { cmd = "cmake --build build --parallel", depends-on = ["configure"] }
test = { cmd = "ctest --test-dir build --output-on-failure -V", depends-on = ["build"] }

# Optional "s3" feature: adds the AWS SDK for C++ as a dependency (any version).
[feature.s3.dependencies]
aws-sdk-cpp = "*"

# Tasks provided by the s3 feature.
[feature.s3.tasks]
# Configure into ./build with tests and S3 support switched on
# (-DBGEN_BUILD_TESTS=ON -DBGEN_WITH_S3=ON).
configure = "cmake -S . -B build -G Ninja -DCMAKE_BUILD_TYPE=Release -DCMAKE_PREFIX_PATH=$CONDA_PREFIX -DBGEN_BUILD_TESTS=ON -DBGEN_WITH_S3=ON"
# Run only the CTest tests whose names match "test_s3_bgen" (-R), after the build task.
test-s3 = { cmd = "ctest --test-dir build --output-on-failure -V -R test_s3_bgen", depends-on = ["build"] }

# Two environments: "default" without extra features and "s3" with the s3
# feature. Both are assigned to the same solve group, "default".
[environments]
default = { solve-group = "default" }
s3 = { features = ["s3"], solve-group = "default" }

[package]
name = "bgen"
version = "1.1.7"
Expand Down
Loading
Loading