Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
79 changes: 79 additions & 0 deletions src/VariantIdentifier.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
<?php declare(strict_types=1);

namespace MLL\Utils;

use function Safe\preg_match;

/**
 * Identifies a genomic variant by its genomic position plus reference and alternate sequences.
 *
 * Textual layouts produced by toString():
 * - DASH:      "{chromosome}-{position}-{ref}-{alt}"
 * - CANONICAL: "{chromosome}-{position}-{ref}/{alt}"
 * - TAB:       "{chromosome}\t{position}\t{ref}\t{alt}"
 *
 * parse() understands the DASH and CANONICAL layouts only.
 */
class VariantIdentifier
{
    /**
     * The "D" modifier (PCRE_DOLLAR_ENDONLY) makes "$" match only at the very end of the subject.
     * Without it, an identifier followed by a trailing newline would be accepted.
     */
    private const CANONICAL_PATTERN = '/^(.+)-(\d+)-([ATGC]+)\/([ATGC]+)$/iD';

    /** Same structure as CANONICAL_PATTERN, with "-" between reference and alternate. */
    private const DASH_PATTERN = '/^(.+)-(\d+)-([ATGC]+)-([ATGC]+)$/iD';

    public GenomicPosition $genomicPosition;

    public DnaSequence $reference;

    public DnaSequence $alternate;

    public function __construct(GenomicPosition $genomicPosition, DnaSequence $reference, DnaSequence $alternate)
    {
        $this->genomicPosition = $genomicPosition;
        $this->reference = $reference;
        $this->alternate = $alternate;
    }

    /**
     * Parses a variant identifier given in DASH or CANONICAL layout.
     *
     * Any value containing a "/" is treated as CANONICAL, everything else as DASH.
     *
     * @throws \InvalidArgumentException when the value matches neither layout
     */
    public static function parse(string $value): self
    {
        if (strpos($value, '/') !== false) {
            return self::parseCanonical($value);
        }

        return self::parseDash($value);
    }

    /**
     * Renders the identifier in the requested layout, naming the chromosome per the given convention.
     *
     * @throws \InvalidArgumentException when no rendering is implemented for the given format
     */
    public function toString(VariantIdentifierFormat $format, NamingConvention $namingConvention): string
    {
        $chromosome = $this->genomicPosition->chromosome->toString($namingConvention);
        $position = $this->genomicPosition->position;
        $ref = $this->reference->toString();
        $alt = $this->alternate->toString();

        switch ($format->value) {
            case VariantIdentifierFormat::DASH:
                return "{$chromosome}-{$position}-{$ref}-{$alt}";
            case VariantIdentifierFormat::CANONICAL:
                return "{$chromosome}-{$position}-{$ref}/{$alt}";
            case VariantIdentifierFormat::TAB:
                return "{$chromosome}\t{$position}\t{$ref}\t{$alt}";
            default:
                throw new \InvalidArgumentException("No toString logic implemented for format: {$format->value}.");
        }
    }

    /**
     * Parses "{chromosome}-{position}-{ref}/{alt}".
     *
     * @throws \InvalidArgumentException when the value does not have that shape
     */
    private static function parseCanonical(string $value): self
    {
        return self::fromPattern(
            self::CANONICAL_PATTERN,
            $value,
            "Invalid canonical variant identifier format: {$value}."
        );
    }

    /**
     * Parses "{chromosome}-{position}-{ref}-{alt}".
     *
     * @throws \InvalidArgumentException when the value does not have that shape
     */
    private static function parseDash(string $value): self
    {
        return self::fromPattern(
            self::DASH_PATTERN,
            $value,
            "Invalid VCF variant identifier format: {$value}."
        );
    }

    /**
     * Matches the value against a pattern with four capture groups
     * (chromosome, position, reference, alternate) and builds the identifier from them.
     *
     * The captured position is handed to NucleotidePosition::fromOneBased().
     *
     * @throws \InvalidArgumentException with the given message when the pattern does not match
     */
    private static function fromPattern(string $pattern, string $value, string $errorMessage): self
    {
        if (preg_match($pattern, $value, $matches) === 0) {
            throw new \InvalidArgumentException($errorMessage);
        }

        // A successful match always fills all four groups; the assert states that for static analysis.
        assert(isset($matches[1], $matches[2], $matches[3], $matches[4]));

        return new self(
            new GenomicPosition(new Chromosome($matches[1]), NucleotidePosition::fromOneBased(SafeCast::toInt($matches[2]))),
            new DnaSequence($matches[3]),
            new DnaSequence($matches[4])
        );
    }
}
25 changes: 25 additions & 0 deletions src/VariantIdentifierFormat.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
<?php declare(strict_types=1);

namespace MLL\Utils;

/**
 * Value object naming one of the supported textual layouts of a variant identifier.
 */
class VariantIdentifierFormat
{
    public const DASH = 'DASH';
    public const CANONICAL = 'CANONICAL';
    public const TAB = 'TAB';

    /** One of the class constants above; checked by the constructor. */
    public string $value;

    /**
     * @throws \InvalidArgumentException when $value is not one of the class constants
     */
    public function __construct(string $value)
    {
        $supported = [self::DASH, self::CANONICAL, self::TAB];

        if (! in_array($value, $supported, true)) {
            throw new \InvalidArgumentException("Invalid variant identifier format: {$value}.");
        }

        $this->value = $value;
    }
}
139 changes: 139 additions & 0 deletions tests/VariantIdentifierTest.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
<?php declare(strict_types=1);

namespace MLL\Utils\Tests;

use MLL\Utils\Chromosome;
use MLL\Utils\DnaSequence;
use MLL\Utils\GenomicPosition;
use MLL\Utils\NamingConvention;
use MLL\Utils\NucleotidePosition;
use MLL\Utils\VariantIdentifier;
use MLL\Utils\VariantIdentifierFormat;
use PHPUnit\Framework\Attributes\DataProvider;
use PHPUnit\Framework\TestCase;

/**
 * Covers construction, rendering in every VariantIdentifierFormat, parsing and round-tripping
 * of VariantIdentifier.
 */
final class VariantIdentifierTest extends TestCase
{
    public function testConstructAndToStringVCF(): void
    {
        $variant = new VariantIdentifier(
            new GenomicPosition(new Chromosome('chr1'), NucleotidePosition::fromOneBased(12345)),
            new DnaSequence('A'),
            new DnaSequence('T')
        );

        self::assertSame(
            'chr1-12345-A-T',
            $variant->toString(new VariantIdentifierFormat(VariantIdentifierFormat::DASH), new NamingConvention(NamingConvention::UCSC))
        );
    }

    public function testConstructAndToStringCanonical(): void
    {
        $variant = new VariantIdentifier(
            new GenomicPosition(new Chromosome('chr1'), NucleotidePosition::fromOneBased(12345)),
            new DnaSequence('A'),
            new DnaSequence('ATT')
        );

        self::assertSame(
            'chr1-12345-A/ATT',
            $variant->toString(new VariantIdentifierFormat(VariantIdentifierFormat::CANONICAL), new NamingConvention(NamingConvention::UCSC))
        );
    }

    /** The TAB layout separates all four fields with a tab character. */
    public function testToStringTab(): void
    {
        $variant = new VariantIdentifier(
            new GenomicPosition(new Chromosome('chr1'), NucleotidePosition::fromOneBased(12345)),
            new DnaSequence('A'),
            new DnaSequence('T')
        );

        self::assertSame(
            "chr1\t12345\tA\tT",
            $variant->toString(new VariantIdentifierFormat(VariantIdentifierFormat::TAB), new NamingConvention(NamingConvention::UCSC))
        );
    }

    public function testToStringEnsembl(): void
    {
        $variant = new VariantIdentifier(
            new GenomicPosition(new Chromosome('chrX'), NucleotidePosition::fromOneBased(99999)),
            new DnaSequence('GC'),
            new DnaSequence('A')
        );

        self::assertSame(
            'X-99999-GC-A',
            $variant->toString(new VariantIdentifierFormat(VariantIdentifierFormat::DASH), new NamingConvention(NamingConvention::ENSEMBL))
        );
        self::assertSame(
            'X-99999-GC/A',
            $variant->toString(new VariantIdentifierFormat(VariantIdentifierFormat::CANONICAL), new NamingConvention(NamingConvention::ENSEMBL))
        );
    }

    /** @dataProvider parseVCFProvider */
    #[DataProvider('parseVCFProvider')]
    public function testParseVCF(string $input, string $expectedChromosome, int $expectedPosition, string $expectedRef, string $expectedAlt): void
    {
        self::assertParsedVariant(
            VariantIdentifier::parse($input),
            $expectedChromosome,
            $expectedPosition,
            $expectedRef,
            $expectedAlt
        );
    }

    /** @return iterable<string, array{string, string, int, string, string}> */
    public static function parseVCFProvider(): iterable
    {
        yield 'SNV with chr prefix' => ['chr1-12345-A-T', '1', 12345, 'A', 'T'];
        yield 'SNV without chr prefix' => ['1-12345-A-T', '1', 12345, 'A', 'T'];
        yield 'deletion' => ['chr7-5000-ATG-A', '7', 5000, 'ATG', 'A'];
        yield 'insertion' => ['chrX-99999-A-ATCG', 'X', 99999, 'A', 'ATCG'];
        yield 'chromosome 22' => ['chr22-100-G-C', '22', 100, 'G', 'C'];
        yield 'mitochondrial' => ['chrM-8000-T-C', 'M', 8000, 'T', 'C'];
    }

    /** @dataProvider parseCanonicalProvider */
    #[DataProvider('parseCanonicalProvider')]
    public function testParseCanonical(string $input, string $expectedChromosome, int $expectedPosition, string $expectedRef, string $expectedAlt): void
    {
        self::assertParsedVariant(
            VariantIdentifier::parse($input),
            $expectedChromosome,
            $expectedPosition,
            $expectedRef,
            $expectedAlt
        );
    }

    /** @return iterable<string, array{string, string, int, string, string}> */
    public static function parseCanonicalProvider(): iterable
    {
        yield 'SNV with chr prefix' => ['chr1-12345-A/T', '1', 12345, 'A', 'T'];
        yield 'multi-base alternate' => ['chr1-12345-A/ATT', '1', 12345, 'A', 'ATT'];
        yield 'without chr prefix' => ['12-500-G/CA', '12', 500, 'G', 'CA'];
    }

    public function testParseInvalidVCFThrows(): void
    {
        $this->expectException(\InvalidArgumentException::class);
        VariantIdentifier::parse('invalid');
    }

    public function testParseInvalidCanonicalThrows(): void
    {
        $this->expectException(\InvalidArgumentException::class);
        VariantIdentifier::parse('invalid/stuff');
    }

    public function testRoundtripVCF(): void
    {
        $input = 'chr7-140753336-A-T';
        $variant = VariantIdentifier::parse($input);

        self::assertSame(
            $input,
            $variant->toString(new VariantIdentifierFormat(VariantIdentifierFormat::DASH), new NamingConvention(NamingConvention::UCSC))
        );
    }

    public function testRoundtripCanonical(): void
    {
        $input = 'chr7-140753336-A/T';
        $variant = VariantIdentifier::parse($input);

        self::assertSame(
            $input,
            $variant->toString(new VariantIdentifierFormat(VariantIdentifierFormat::CANONICAL), new NamingConvention(NamingConvention::UCSC))
        );
    }

    /** Checks all four components of a parsed variant against the expected values. */
    private static function assertParsedVariant(
        VariantIdentifier $variant,
        string $expectedChromosome,
        int $expectedPosition,
        string $expectedRef,
        string $expectedAlt
    ): void {
        self::assertSame($expectedChromosome, $variant->genomicPosition->chromosome->value());
        self::assertSame($expectedPosition, $variant->genomicPosition->position);
        self::assertSame($expectedRef, $variant->reference->toString());
        self::assertSame($expectedAlt, $variant->alternate->toString());
    }
}
Loading