diff --git a/src/VariantIdentifier.php b/src/VariantIdentifier.php
new file mode 100644
index 0000000..5136b49
--- /dev/null
+++ b/src/VariantIdentifier.php
@@ -0,0 +1,79 @@
+genomicPosition = $genomicPosition;
+        $this->reference = $reference;
+        $this->alternate = $alternate;
+    }
+
+    /**
+     * Parses a variant identifier, choosing the grammar from the separators used.
+     *
+     * A value containing "/" is read as canonical ("chromosome-position-ref/alt");
+     * any other value is read as dash-separated ("chromosome-position-ref-alt").
+     *
+     * @throws \InvalidArgumentException if the value does not match the selected grammar
+     */
+    public static function parse(string $value): self
+    {
+        if (strpos($value, '/') !== false) {
+            return self::parseCanonical($value);
+        }
+
+        return self::parseDash($value);
+    }
+
+    /**
+     * Renders the variant in the requested identifier format.
+     *
+     * The chromosome is rendered through Chromosome::toString() using the given
+     * naming convention; position, reference and alternate are emitted as stored.
+     *
+     * @throws \InvalidArgumentException if $format carries a value this method has no case for
+     */
+    public function toString(VariantIdentifierFormat $format, NamingConvention $namingConvention): string
+    {
+        $chromosome = $this->genomicPosition->chromosome->toString($namingConvention);
+        $position = $this->genomicPosition->position;
+        $ref = $this->reference->toString();
+        $alt = $this->alternate->toString();
+
+        switch ($format->value) {
+            case VariantIdentifierFormat::DASH:
+                return "{$chromosome}-{$position}-{$ref}-{$alt}";
+            case VariantIdentifierFormat::CANONICAL:
+                return "{$chromosome}-{$position}-{$ref}/{$alt}";
+            case VariantIdentifierFormat::TAB:
+                return "{$chromosome}\t{$position}\t{$ref}\t{$alt}";
+            default:
+                throw new \InvalidArgumentException("No toString logic implemented for format: {$format->value}.");
+        }
+    }
+
+    /**
+     * Parses the canonical form "chromosome-position-ref/alt".
+     *
+     * @throws \InvalidArgumentException if the value does not match that form
+     */
+    private static function parseCanonical(string $value): self
+    {
+        // Compare against 1 rather than 0: preg_match() returns false on a PCRE
+        // error, and that case must be rejected as well instead of falling through
+        // with $matches unpopulated.
+        if (preg_match('/^(.+)-(\d+)-([ATGC]+)\/([ATGC]+)$/i', $value, $matches) !== 1) {
+            throw new \InvalidArgumentException("Invalid canonical variant identifier format: {$value}.");
+        }
+
+        return self::fromMatches($matches);
+    }
+
+    /**
+     * Parses the dash-separated form "chromosome-position-ref-alt".
+     *
+     * @throws \InvalidArgumentException if the value does not match that form
+     */
+    private static function parseDash(string $value): self
+    {
+        // See parseCanonical(): anything other than exactly one match is a failure,
+        // including the false returned on a PCRE error.
+        if (preg_match('/^(.+)-(\d+)-([ATGC]+)-([ATGC]+)$/i', $value, $matches) !== 1) {
+            throw new \InvalidArgumentException("Invalid VCF variant identifier format: {$value}.");
+        }
+
+        return self::fromMatches($matches);
+    }
+
+    /**
+     * Builds a variant from the capture groups shared by both grammars:
+     * 1 = chromosome, 2 = one-based position, 3 = reference, 4 = alternate.
+     *
+     * @param array<int, string> $matches capture groups of a successful preg_match()
+     */
+    private static function fromMatches(array $matches): self
+    {
+        // All four groups are mandatory in both patterns; the assertion documents
+        // that invariant for static analysis.
+        assert(isset($matches[1], $matches[2], $matches[3], $matches[4]));
+
+        return new self(
+            new GenomicPosition(
+                new Chromosome($matches[1]),
+                NucleotidePosition::fromOneBased(SafeCast::toInt($matches[2]))
+            ),
+            new DnaSequence($matches[3]),
+            new DnaSequence($matches[4])
+        );
+    }
+}
diff --git a/src/VariantIdentifierFormat.php b/src/VariantIdentifierFormat.php
new file mode 100644
index 0000000..dffaab8
--- /dev/null
+++ b/src/VariantIdentifierFormat.php
@@ -0,0 +1,25 @@
+value = $value;
+                break;
+            default:
+                throw new \InvalidArgumentException("Invalid variant identifier format: {$value}.");
+        }
+    }
+}
diff --git a/tests/VariantIdentifierTest.php b/tests/VariantIdentifierTest.php
new file mode 100644
index 0000000..c6e2236
--- /dev/null
+++ b/tests/VariantIdentifierTest.php
@@ -0,0 +1,139 @@
+toString(new VariantIdentifierFormat(VariantIdentifierFormat::DASH), new NamingConvention(NamingConvention::UCSC))
+        );
+    }
+
+    public function testConstructAndToStringCanonical(): void
+    {
+        $variant = new VariantIdentifier(
+            new GenomicPosition(new Chromosome('chr1'), NucleotidePosition::fromOneBased(12345)),
+            new DnaSequence('A'),
+            new DnaSequence('ATT')
+        );
+
+        self::assertSame(
+            'chr1-12345-A/ATT',
+            $variant->toString(new VariantIdentifierFormat(VariantIdentifierFormat::CANONICAL), new NamingConvention(NamingConvention::UCSC))
+        );
+    }
+
+    /** Covers the tab-separated branch of toString(). */
+    public function testConstructAndToStringTab(): void
+    {
+        $variant = new VariantIdentifier(
+            new GenomicPosition(new Chromosome('chr1'), NucleotidePosition::fromOneBased(12345)),
+            new DnaSequence('A'),
+            new DnaSequence('ATT')
+        );
+
+        self::assertSame(
+            "chr1\t12345\tA\tATT",
+            $variant->toString(new VariantIdentifierFormat(VariantIdentifierFormat::TAB), new NamingConvention(NamingConvention::UCSC))
+        );
+    }
+
+    public function testToStringEnsembl(): void
+    {
+        $variant = new VariantIdentifier(
+            new GenomicPosition(new Chromosome('chrX'), NucleotidePosition::fromOneBased(99999)),
+            new DnaSequence('GC'),
+            new DnaSequence('A')
+        );
+
+        self::assertSame(
+            'X-99999-GC-A',
+            $variant->toString(new VariantIdentifierFormat(VariantIdentifierFormat::DASH), new NamingConvention(NamingConvention::ENSEMBL))
+        );
+        self::assertSame(
+            'X-99999-GC/A',
+            $variant->toString(new VariantIdentifierFormat(VariantIdentifierFormat::CANONICAL), new NamingConvention(NamingConvention::ENSEMBL))
+        );
+    }
+
+    /** @dataProvider parseVCFProvider */
+    #[DataProvider('parseVCFProvider')]
+    public function testParseVCF(string $input, string $expectedChromosome, int $expectedPosition, string $expectedRef, string $expectedAlt): void
+    {
+        $variant = VariantIdentifier::parse($input);
+
+        self::assertSame($expectedChromosome, $variant->genomicPosition->chromosome->value());
+        self::assertSame($expectedPosition, $variant->genomicPosition->position);
+        self::assertSame($expectedRef, $variant->reference->toString());
+        self::assertSame($expectedAlt, $variant->alternate->toString());
+    }
+
+    /**
+     * Dash-separated inputs with the chromosome, position, reference and alternate expected after parsing.
+     *
+     * @return iterable<string, array{string, string, int, string, string}>
+     */
+    public static function parseVCFProvider(): iterable
+    {
+        yield 'SNV with chr prefix' => ['chr1-12345-A-T', '1', 12345, 'A', 'T'];
+        yield 'SNV without chr prefix' => ['1-12345-A-T', '1', 12345, 'A', 'T'];
+        yield 'deletion' => ['chr7-5000-ATG-A', '7', 5000, 'ATG', 'A'];
+        yield 'insertion' => ['chrX-99999-A-ATCG', 'X', 99999, 'A', 'ATCG'];
+        yield 'chromosome 22' => ['chr22-100-G-C', '22', 100, 'G', 'C'];
+        yield 'mitochondrial' => ['chrM-8000-T-C', 'M', 8000, 'T', 'C'];
+    }
+
+    /** @dataProvider parseCanonicalProvider */
+    #[DataProvider('parseCanonicalProvider')]
+    public function testParseCanonical(string $input, string $expectedChromosome, int $expectedPosition, string $expectedRef, string $expectedAlt): void
+    {
+        $variant = VariantIdentifier::parse($input);
+
+        self::assertSame($expectedChromosome, $variant->genomicPosition->chromosome->value());
+        self::assertSame($expectedPosition, $variant->genomicPosition->position);
+        self::assertSame($expectedRef, $variant->reference->toString());
+        self::assertSame($expectedAlt, $variant->alternate->toString());
+    }
+
+    /**
+     * Canonical inputs with the chromosome, position, reference and alternate expected after parsing.
+     *
+     * @return iterable<string, array{string, string, int, string, string}>
+     */
+    public static function parseCanonicalProvider(): iterable
+    {
+        yield 'SNV with chr prefix' => ['chr1-12345-A/T', '1', 12345, 'A', 'T'];
+        yield 'multi-base alternate' => ['chr1-12345-A/ATT', '1', 12345, 'A', 'ATT'];
+        yield 'without chr prefix' => ['12-500-G/CA', '12', 500, 'G', 'CA'];
+    }
+
+    public function testParseInvalidVCFThrows(): void
+    {
+        $this->expectException(\InvalidArgumentException::class);
+        VariantIdentifier::parse('invalid');
+    }
+
+    public function testParseInvalidCanonicalThrows(): void
+    {
+        $this->expectException(\InvalidArgumentException::class);
+        VariantIdentifier::parse('invalid/stuff');
+    }
+
+    public function testRoundtripVCF(): void
+    {
+        $input = 'chr7-140753336-A-T';
+        $variant = VariantIdentifier::parse($input);
+
+        self::assertSame(
+            $input,
+            $variant->toString(new VariantIdentifierFormat(VariantIdentifierFormat::DASH), new NamingConvention(NamingConvention::UCSC))
+        );
+    }
+
+    public function testRoundtripCanonical(): void
+    {
+        $input = 'chr7-140753336-A/T';
+        $variant = VariantIdentifier::parse($input);
+
+        self::assertSame(
+            $input,
+            $variant->toString(new VariantIdentifierFormat(VariantIdentifierFormat::CANONICAL), new NamingConvention(NamingConvention::UCSC))
+        );
+    }
+}