From 08c50ecf0be26092102bed4092c60ef7a61d1661 Mon Sep 17 00:00:00 2001 From: CNuge Date: Fri, 25 Apr 2025 08:52:53 -0400 Subject: [PATCH 01/43] drafting the ibd and penetrance funcs --- R/encoding.R | 4 +- R/pedigree.r | 157 +++++++++++++++++++++ inst/extdata/example_pedigree_encoding.tsv | 15 ++ tests/testthat/test_pedigree_eval.R | 11 ++ 4 files changed, 185 insertions(+), 2 deletions(-) create mode 100644 R/pedigree.r create mode 100644 inst/extdata/example_pedigree_encoding.tsv create mode 100644 tests/testthat/test_pedigree_eval.R diff --git a/R/encoding.R b/R/encoding.R index f8fe84f..659d958 100644 --- a/R/encoding.R +++ b/R/encoding.R @@ -26,7 +26,7 @@ assign.status <- function(status, variant, theoretical.max=FALSE){ if(theoretical.max){ return("A_c") } - else if(variant == "0/1" || variant == "1/1" || variant == "1" || variant == "0|1" || variant == "1|0"|| variant == "1|1" ){ + else if(variant == "0/1" || variant == "1/1" || variant == "1" || variant == "0|1" || variant == "1|0" || variant == "1|1" ){ return("A_c") }else if (variant == "0/0" || variant == "0" || variant == "0|0" ){ return("A_i") @@ -36,7 +36,7 @@ assign.status <- function(status, variant, theoretical.max=FALSE){ }else if (status == "U"){ if(theoretical.max){ return("U_c") - } else if(variant == "0/1" || variant == "1/1" || variant == "1" || variant == "0|1" || variant == "1|0"|| variant == "1|1" ){ + } else if(variant == "0/1" || variant == "1/1" || variant == "1" || variant == "0|1" || variant == "1|0" || variant == "1|1" ){ return("U_i") }else if (variant == "0/0" || variant == "0" || variant == "0|0" ){ return("U_c") diff --git a/R/pedigree.r b/R/pedigree.r new file mode 100644 index 0000000..d571466 --- /dev/null +++ b/R/pedigree.r @@ -0,0 +1,157 @@ + +#' Title +#' +#' @param K The estimate of penetrance rate. +#' @param a Count of affected individuals +#' @param b Count of obligate carriers +#' @param c Count of children of either affecteds or carriers, with no children of their own +#' @param d Count of Trees of unaffected individuals - specifically, two sequential generations (i.e. a parent and their offspring) +#' @param n Count of the number of second generation progeny in a given tree. +#' +#' @return +#' @export +#' +#' @examples +penetrance <- function(K, a, b, c, d, n) { + a*log(K) + b*log(1-K) + c*log(2-K) + sum(d*log(2^n + (1-K)*(2-K)^n) - d*(n+1)*log(2)) +} + + + +#' Calculation of Identity by descent (IBD). +#' +#' Use the relationships of information from the pedigree to calculate and +#' estimate of the amount of the genome they have inherited it from a +#' common ancestor without recombination. +#' +#' @param a +#' @param b +#' @param c +#' @param d +#' @param n +#' @param K The estimate of penetrance rate. +#' @param theoretical Boolean indicating if the calculation should be +#' theoretical IBD calculation (using only d and k), or if the calculation +#' should use the provided n value. +#' +#' +#' @return +#' @export +#' +#' @examples +ibd <- function(a, b, c, d, n, K, theoretical=TRUE) { + if (theoretical) { + x <- sum(d) * 2^K + } else { + x <- sum(d*(1 - (2^n-1)/2^(n+1))) * 2^K + } + (a + b + c * 2^K + x) - 1 +} + + +#TODO - CHECK with BARI ?The number of non-overlapping paths between all pedigree members. +# for IBD is this the correct param? +#' @param n The number of non-overlapping paths between all pedigree members. + +#' Title +#' +#' @param pihat +#' @param K +#' +#' @return +#' @export +#' +#' @examples +rank <- function(pihat, K) { + log(2^pihat) + #log(2^pihat*K) +} + + +#' Read in the encoded pedigree data file. +#' TODO - can we build this encoded matrix from a more standard file format? i.e. ped-like? +#' +#' @param filename name of the file with the data. +#' +#' @return A data frame containing the encoded pedigree information. +#' @export +#' +#' @examples +read.pedigree <- function(filename){ + h <- read.table(filename, header=TRUE, sep="\t", check.names=FALSE, colClasses=c("Family"="character")) + return(h) +} + + +count the number of non-overlapping paths between all pedigree members +That number becomes n in (1/2)^n. We can do this twice - once for the total potential relatedness in a pedigree, and again for the actual relatedness across collected samples. This method avoids the issue of using IBD estimates relative to the proband, which is that the relatedness estimates change based on who you are in the pedigree, + +#' Take the encoded information about the pedigrees and calculate penetrance. +#' +#' Determine a value rank of families by comparing their relationship structure. +#' More distant relationships between affecteds (e.g. affected cousins) +#' is more valuable that close relationships (e.g. affected siblings) +#' as there is less IBD and therefore a smaller search space. +#' +#'Simplifying assumptions: +#' - Autosomal dominant +#' - No ambiguous statuses +#' - No more than two sequential generations of unknown carrier status +#' (non-obligate carrier vs. non-carrier). +#' Generalized support of arbitrary tree structures gets a lot more complicated, especially for the likelihood function. +#' - Exclude big giant trees of unaffecteds - related to above. +#' Will slightly bias the result toward higher penetrance. +#' - Exclude subjects younger than age of onset +#' +#' @param h A data frame containing the encoded pedigree information +#' +#' @return A data frame containing the theoretical ranking of the power of a +#' family assuming you were able to collect everyone on the simplified pedigree, +#' as well as a current ranking, examining only those for whom you currently have DNA. +#' @export +#' +#' @examples +cal.penetrance <- function(h){ + + family_vec <- c() + penetrance_vec <- c() + max_pihat_vec <- c() + max_rank_vec <- c() + current_pihat_vec <- c() + current_rank_vec <- c() + for (i in seq_len(nrow(h))) { + family <- h[i,"Family"] + max_a <- h[i, "max_a"] + max_b <- h[i, "max_b"] + max_c <- h[i, "max_c"] + max_d <- h[i, "max_d"] + max_n <- h[i, "max_n"] + a_actual <- h[i, "a"] + b_actual <- h[i, "b"] + c_actual <- h[i, "c"] + d_actual <- h[i, "d"] + n_actual <- h[i, "n"] + + max_d <- as.numeric(strsplit(as.character(max_d), ",")[[1]]) + max_n <- as.numeric(strsplit(as.character(max_n), ",")[[1]]) + + K <- optimize(penetrance, c(0,1), max_a, max_b, max_c, max_d, max_n, maximum=TRUE)$max + max_pihat <- ibd(max_a, max_b, max_c, max_d, max_n, K) + max_rank <- rank(max_pihat, K) + current_pihat <- ibd(a_actual, b_actual, c_actual, d_actual, n_actual, K, FALSE) + current_rank <- rank(current_pihat, K) + + family_vec <- c(family_vec, family) + penetrance_vec <- c(penetrance_vec, K) + max_pihat_vec <- c(max_pihat_vec, max_pihat) + max_rank_vec <- c(max_rank_vec, max_rank) + current_pihat_vec <- c(current_pihat_vec, current_pihat) + current_rank_vec <- c(current_rank_vec,current_rank) + } + + df <- data.frame(family_vec, penetrance_vec, max_pihat_vec, max_rank_vec, current_pihat_vec, current_rank_vec) + colnames(df) <- c("family", "penetrance", "max_pi-hat", "max_rank", "current_pi-hat", "current_rank") + df$pct_of_max <- df$current_rank / df$max_rank * 100 + df[, -1] <- round(df[, -1], 2) + return(df) +} diff --git a/inst/extdata/example_pedigree_encoding.tsv b/inst/extdata/example_pedigree_encoding.tsv new file mode 100644 index 0000000..747efee --- /dev/null +++ b/inst/extdata/example_pedigree_encoding.tsv @@ -0,0 +1,15 @@ +Family max_a a max_b b max_c c max_d d max_n n +1111 4 4 0 0 0 0 0 0 0 0 +1122 1 1 0 0 0 0 1 1 2 2 +1133 1 1 1 1 0 0 1 1 1 1 +1144 1 1 0 0 1 1 1 1 1 1 +1155 1 1 0 0 1 1 0 0 0 0 +1166 1 1 0 0 0 0 1 1 0 0 +2222 4 4 0 0 0 0 1 1 2 2 +0347 6 2 6 3 17 1 7,3,1,1 1 2,3,1,8 1 +5031 5 2 8 4 2 4 5 2 0 0 +6325 7 2 5 0 4 0 1,2 0 1,2 0 +7003 5 4 4 1 4 0 1,2 1 1,3 1 +4974 7 4 2 0 7 4 1 1 5 1 +4006 2 2 1 0 4 1 0 0 0 0 +3734 3 0 2 0 3 0 1,3,1 1 1,3,5 0 diff --git a/tests/testthat/test_pedigree_eval.R b/tests/testthat/test_pedigree_eval.R new file mode 100644 index 0000000..e5161d4 --- /dev/null +++ b/tests/testthat/test_pedigree_eval.R @@ -0,0 +1,11 @@ +test_that("Data are read from files correctly", { + + ################################## + # test 1 + example.pedigree.file <-system.file('extdata/example_pedigree_encoding.tsv', package = 'seqbio.variant.scoring') + + + example.pedigree.df <- read.pedigree(example.pedigree.file) + + + penetrance.df <- cal.penetrance(example.pedigree.df) From 87ab21a59dc00f6ab8549daf412ef1e6c87ea8a4 Mon Sep 17 00:00:00 2001 From: CNuge Date: Fri, 25 Apr 2025 09:36:51 -0400 Subject: [PATCH 02/43] check all pushed --- NAMESPACE | 5 +++ R/pedigree.r | 58 +++++++++++++++++------------ man/cal.penetrance.Rd | 42 +++++++++++++++++++++ man/penetrance.Rd | 29 +++++++++++++++ man/rank.Rd | 18 +++++++++ man/read.pedigree.Rd | 23 ++++++++++++ man/score.fam.Rd | 7 +++- man/subset.mat.Rd | 11 ++++++ tests/testthat/test_pedigree_eval.R | 1 + 9 files changed, 169 insertions(+), 25 deletions(-) create mode 100644 man/cal.penetrance.Rd create mode 100644 man/penetrance.Rd create mode 100644 man/rank.Rd create mode 100644 man/read.pedigree.Rd create mode 100644 man/subset.mat.Rd diff --git a/NAMESPACE b/NAMESPACE index c893e29..9ffcbfe 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -3,9 +3,14 @@ S3method(sum,fam.scores) export(assign.status) export(build.relation.dict) +export(cal.penetrance) export(calc.rv.score) export(encode.rows) +export(ibd) +export(penetrance) +export(rank) export(read.indiv) +export(read.pedigree) export(read.relation.mat) export(read.var.table) export(score.fam) diff --git a/R/pedigree.r b/R/pedigree.r index d571466..2cece69 100644 --- a/R/pedigree.r +++ b/R/pedigree.r @@ -1,14 +1,15 @@ -#' Title +#' Likelihood function for calculation of Pedigree-based autosomal dominant penetrance value. +#' Formula deployed via optimize so as to determine the optimal value. #' -#' @param K The estimate of penetrance rate. +#' @param K Seed value for the estimate of penetrance rate. #' @param a Count of affected individuals #' @param b Count of obligate carriers #' @param c Count of children of either affecteds or carriers, with no children of their own #' @param d Count of Trees of unaffected individuals - specifically, two sequential generations (i.e. a parent and their offspring) #' @param n Count of the number of second generation progeny in a given tree. #' -#' @return +#' @return K Pedigree-based estimation of autosomal dominant penetrance rate. #' @export #' #' @examples @@ -17,6 +18,10 @@ penetrance <- function(K, a, b, c, d, n) { } +#TODO - CHECK with BARI +# for IBD, am I wrong and is this the correct param description? +#' @param n The number of non-overlapping paths between all pedigree members. + #' Calculation of Identity by descent (IBD). #' @@ -24,18 +29,21 @@ penetrance <- function(K, a, b, c, d, n) { #' estimate of the amount of the genome they have inherited it from a #' common ancestor without recombination. #' -#' @param a -#' @param b -#' @param c -#' @param d -#' @param n +#' Can do this for the total potential relatedness in a pedigree (theoretical=TRUE), +#' or for the actual relatedness across collected samples (theoretical=FALSE) +#' +#' @param a Count of affected individuals +#' @param b Count of obligate carriers +#' @param c Count of children of either affecteds or carriers, with no children of their own +#' @param d Count of Trees of unaffected individuals - specifically, two sequential generations (i.e. a parent and their offspring) +#' @param n Count of the number of second generation progeny in a given tree. #' @param K The estimate of penetrance rate. #' @param theoretical Boolean indicating if the calculation should be #' theoretical IBD calculation (using only d and k), or if the calculation #' should use the provided n value. #' #' -#' @return +#' @return pi-hat value. The proportion of genome shared between individuals. #' @export #' #' @examples @@ -45,26 +53,27 @@ ibd <- function(a, b, c, d, n, K, theoretical=TRUE) { } else { x <- sum(d*(1 - (2^n-1)/2^(n+1))) * 2^K } - (a + b + c * 2^K + x) - 1 + pihat<-(a + b + c * 2^K + x) - 1 + return(pihat) } -#TODO - CHECK with BARI ?The number of non-overlapping paths between all pedigree members. -# for IBD is this the correct param? -#' @param n The number of non-overlapping paths between all pedigree members. -#' Title +#' Rank the pedigrees using the pihat values. #' -#' @param pihat -#' @param K +#' @param pihat Estimated proportion of genome shared between individuals, from function: ibd. +#' @param K Estimated penetrance value, from function: penetrance. +#' @param rank.by.K Should the ranking of families be based on ibd (FALSE), or by IBD and K (TRUE). Default is FALSE #' #' @return #' @export #' #' @examples -rank <- function(pihat, K) { +rank <- function(pihat, K=-1,rank.by.K=FALSE) { + if(rank.by.K==TRUE){ + log(2^pihat*K) + } log(2^pihat) - #log(2^pihat*K) } @@ -77,14 +86,14 @@ rank <- function(pihat, K) { #' @export #' #' @examples +#' example.pedigree.file <-system.file('extdata/example_pedigree_encoding.tsv', package = 'seqbio.variant.scoring') +#' example.pedigree.df <- read.pedigree(example.pedigree.file) read.pedigree <- function(filename){ h <- read.table(filename, header=TRUE, sep="\t", check.names=FALSE, colClasses=c("Family"="character")) return(h) } -count the number of non-overlapping paths between all pedigree members -That number becomes n in (1/2)^n. We can do this twice - once for the total potential relatedness in a pedigree, and again for the actual relatedness across collected samples. This method avoids the issue of using IBD estimates relative to the proband, which is that the relatedness estimates change based on who you are in the pedigree, #' Take the encoded information about the pedigrees and calculate penetrance. #' @@ -104,14 +113,17 @@ That number becomes n in (1/2)^n. We can do this twice - once for the total pot #' - Exclude subjects younger than age of onset #' #' @param h A data frame containing the encoded pedigree information -#' +#' @param rank.by.K Should the ranking of families be based on ibd (FALSE), or by IBD and K (TRUE). Default is FALSE #' @return A data frame containing the theoretical ranking of the power of a #' family assuming you were able to collect everyone on the simplified pedigree, #' as well as a current ranking, examining only those for whom you currently have DNA. #' @export #' #' @examples -cal.penetrance <- function(h){ +#' example.pedigree.file <-system.file('extdata/example_pedigree_encoding.tsv', package = 'seqbio.variant.scoring') +#' example.pedigree.df <- read.pedigree(example.pedigree.file) +#' penetrance.df <- cal.penetrance(example.pedigree.df) +cal.penetrance <- function(h, rank.by.K=FALSE){ family_vec <- c() penetrance_vec <- c() @@ -137,7 +149,7 @@ cal.penetrance <- function(h){ K <- optimize(penetrance, c(0,1), max_a, max_b, max_c, max_d, max_n, maximum=TRUE)$max max_pihat <- ibd(max_a, max_b, max_c, max_d, max_n, K) - max_rank <- rank(max_pihat, K) + max_rank <- rank(max_pihat, K, rank.by.K = rank.by.K ) current_pihat <- ibd(a_actual, b_actual, c_actual, d_actual, n_actual, K, FALSE) current_rank <- rank(current_pihat, K) diff --git a/man/cal.penetrance.Rd b/man/cal.penetrance.Rd new file mode 100644 index 0000000..4e01532 --- /dev/null +++ b/man/cal.penetrance.Rd @@ -0,0 +1,42 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/pedigree.r +\name{cal.penetrance} +\alias{cal.penetrance} +\title{Take the encoded information about the pedigrees and calculate penetrance.} +\usage{ +cal.penetrance(h, rank.by.K = FALSE) +} +\arguments{ +\item{h}{A data frame containing the encoded pedigree information} + +\item{rank.by.K}{Should the ranking of families be based on ibd (FALSE), or by IBD and K (TRUE). Default is FALSE} +} +\value{ +A data frame containing the theoretical ranking of the power of a +family assuming you were able to collect everyone on the simplified pedigree, +as well as a current ranking, examining only those for whom you currently have DNA. +} +\description{ +Determine a value rank of families by comparing their relationship structure. +More distant relationships between affecteds (e.g. affected cousins) +is more valuable that close relationships (e.g. affected siblings) +as there is less IBD and therefore a smaller search space. +} +\details{ +Simplifying assumptions: +\itemize{ +\item Autosomal dominant +\item No ambiguous statuses +\item No more than two sequential generations of unknown carrier status +(non-obligate carrier vs. non-carrier). +Generalized support of arbitrary tree structures gets a lot more complicated, especially for the likelihood function. +\item Exclude big giant trees of unaffecteds - related to above. +Will slightly bias the result toward higher penetrance. +\item Exclude subjects younger than age of onset +} +} +\examples{ +example.pedigree.file <-system.file('extdata/example_pedigree_encoding.tsv', package = 'seqbio.variant.scoring') +example.pedigree.df <- read.pedigree(example.pedigree.file) +penetrance.df <- cal.penetrance(example.pedigree.df) +} diff --git a/man/penetrance.Rd b/man/penetrance.Rd new file mode 100644 index 0000000..12e3045 --- /dev/null +++ b/man/penetrance.Rd @@ -0,0 +1,29 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/pedigree.r +\name{penetrance} +\alias{penetrance} +\title{Likelihood function for calculation of Pedigree-based autosomal dominant penetrance value. +Formula deployed via optimize so as to determine the optimal value.} +\usage{ +penetrance(K, a, b, c, d, n) +} +\arguments{ +\item{K}{Seed value for the estimate of penetrance rate.} + +\item{a}{Count of affected individuals} + +\item{b}{Count of obligate carriers} + +\item{c}{Count of children of either affecteds or carriers, with no children of their own} + +\item{d}{Count of Trees of unaffected individuals - specifically, two sequential generations (i.e. a parent and their offspring)} + +\item{n}{Count of the number of second generation progeny in a given tree.} +} +\value{ +K Pedigree-based estimation of autosomal dominant penetrance rate. +} +\description{ +Likelihood function for calculation of Pedigree-based autosomal dominant penetrance value. +Formula deployed via optimize so as to determine the optimal value. +} diff --git a/man/rank.Rd b/man/rank.Rd new file mode 100644 index 0000000..dc5b01a --- /dev/null +++ b/man/rank.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/pedigree.r +\name{rank} +\alias{rank} +\title{Rank the pedigrees using the pihat values.} +\usage{ +rank(pihat, rank.by.K = FALSE) +} +\arguments{ +\item{pihat}{Estimated proportion of genome shared between individuals, from function: ibd.} + +\item{rank.by.K}{Should the ranking of families be based on ibd (FALSE), or by IBD and K (TRUE). Default is FALSE} + +\item{K}{Estimated penetrance value, from function: penetrance.} +} +\description{ +Rank the pedigrees using the pihat values. +} diff --git a/man/read.pedigree.Rd b/man/read.pedigree.Rd new file mode 100644 index 0000000..c8a5ff3 --- /dev/null +++ b/man/read.pedigree.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/pedigree.r +\name{read.pedigree} +\alias{read.pedigree} +\title{Read in the encoded pedigree data file. +TODO - can we build this encoded matrix from a more standard file format? i.e. ped-like?} +\usage{ +read.pedigree(filename) +} +\arguments{ +\item{filename}{name of the file with the data.} +} +\value{ +A data frame containing the encoded pedigree information. +} +\description{ +Read in the encoded pedigree data file. +TODO - can we build this encoded matrix from a more standard file format? i.e. ped-like? +} +\examples{ +example.pedigree.file <-system.file('extdata/example_pedigree_encoding.tsv', package = 'seqbio.variant.scoring') +example.pedigree.df <- read.pedigree(example.pedigree.file) +} diff --git a/man/score.fam.Rd b/man/score.fam.Rd index 0042991..9ca98d3 100644 --- a/man/score.fam.Rd +++ b/man/score.fam.Rd @@ -13,7 +13,8 @@ score.fam( return.sums = FALSE, return.means = TRUE, affected.only = TRUE, - max.err = 4 + max.err = 4, + subset.fam = TRUE ) } \arguments{ @@ -42,7 +43,9 @@ A labelled vector with names: score, score.for, score.against By default all individuals are treated as the reference 'proband' and the given variant's score is calculated based on relationships to all other individuals. e.g. for each row in the relationship matrix. calc.rv.score is run, with the row name indiciating the -reference individual that the calcualtion is relative to. +reference individual that the calculation is relative to. +Note that the relation.mat can include more individuals than are present within the status.df, but the matrix will +be subset to include only those individuals that have status information provided. } \details{ There are several return options possible. diff --git a/man/subset.mat.Rd b/man/subset.mat.Rd new file mode 100644 index 0000000..8131b2c --- /dev/null +++ b/man/subset.mat.Rd @@ -0,0 +1,11 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/relatedness.r +\name{subset.mat} +\alias{subset.mat} +\title{Take the matrix and subset out only the encoded individuals that are present in the status dataframe.} +\usage{ +\method{subset}{mat}(mat.df, status.df) +} +\description{ +Take the matrix and subset out only the encoded individuals that are present in the status dataframe. +} diff --git a/tests/testthat/test_pedigree_eval.R b/tests/testthat/test_pedigree_eval.R index e5161d4..b3bffb7 100644 --- a/tests/testthat/test_pedigree_eval.R +++ b/tests/testthat/test_pedigree_eval.R @@ -9,3 +9,4 @@ test_that("Data are read from files correctly", { penetrance.df <- cal.penetrance(example.pedigree.df) +}) From 4dffba9083085e426a15d1742a5621a7b74d2b87 Mon Sep 17 00:00:00 2001 From: CNuge Date: Mon, 28 Apr 2025 13:59:44 -0400 Subject: [PATCH 03/43] vignette setup --- vignettes/seqbio.variant.scoring-vignette.Rmd | 20 +++++++++++++++++++ 1 file changed, 20 insertions(+) create mode 100644 vignettes/seqbio.variant.scoring-vignette.Rmd diff --git a/vignettes/seqbio.variant.scoring-vignette.Rmd b/vignettes/seqbio.variant.scoring-vignette.Rmd new file mode 100644 index 0000000..3032011 --- /dev/null +++ b/vignettes/seqbio.variant.scoring-vignette.Rmd @@ -0,0 +1,20 @@ +--- +title: "seqbio.variant.scoring - pedigree-informed rare variant association and penetrance scoring" +author: "Cameron M. Nugent" +data: "`r Sys.Date()`" +output: pdf_document #rmarkdown::html_vignette # +vignette: > + %\VignetteIndexEntry{seqbio.variant.scoring-vignette} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +## Introduction + +seqbio.variant.scoring, an R package meant to aid in comparative evaluation of candidate in family based rare-variant association studies. The package considers evidence of rare variants' association with disease status in a family, based on relationships of individuals in a pedigree and can thereby assess the relative merit of candidate variants. + +The program leverages Wright’s coefficient of relatedness to score families based on the disease status, genotypes and relationship of individuals. A candidate is considered strongest when it is shared by an affected individual and other distantly-related affected family members and not shared by closely related unaffected family members. The theory behind this is that more distantly related individuals have a smaller proportion of their genome that is IBD, so the chance of a false positive (shared variant through chance, not associated with disease) is lower. Conversely, when a closely related relative is unaffected, and the affected and unaffected individuals have different genotypes for the candidate variant, this provides evidence that the disease is not associated with the large portion of the genome shared by chance alone by the two individuals and thereby gives evidence in favour of the candidate. + +```{r loadlib, echo = TRUE, results = 'hide', message=FALSE, warning=FALSE} +library(seqbio.variant.scoring) +``` From 24fa56fa987b3b28f015fb5feca41a168000c8f4 Mon Sep 17 00:00:00 2001 From: CNuge Date: Mon, 28 Apr 2025 15:38:36 -0300 Subject: [PATCH 04/43] progress on vignette --- README.md | 11 +- vignettes/seqbio.variant.scoring-vignette.Rmd | 110 +++++++++++++++++- 2 files changed, 114 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 4023386..a956dfe 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,11 @@ -# seqbio-variant-scoring -## A method for scoring variants based on relationships of individuals. +# seqbio.variant.scoring +## An R package method for scoring variants based on relationships of individuals. *WORK IN PROGRESS - currently a minimal viable product.* +## Introduction +When looking at shared rare variants across families, not all sets of affected and unaffected individuals are equal. + This R package is designed to score rare variants, assigning values based on the disease status of individuals, the presence or absence of a rare variant in those individuals, and their pairwise coefficients of relatedness. The package uses a custom formula to assign value to a variant that gives more weight to shared variants common @@ -19,10 +22,6 @@ of unaffected evidence can be customized. - penetrance related notes: https://sequencebio.atlassian.net/wiki/spaces/SEQUENCEPE/pages/1443364866/Quantifying+power+of+a+family+for+discovery -## Introduction -When looking at shared rare variants across families, not all sets of affected and unaffected individuals are equal. -This package is a method for scoring the evidence of a series of individuals in a way that takes into account the relatedness of the individuals as well as their disease status and genotype. - ## TODO - add a description of the formulas and scoring system here diff --git a/vignettes/seqbio.variant.scoring-vignette.Rmd b/vignettes/seqbio.variant.scoring-vignette.Rmd index 3032011..416d87b 100644 --- a/vignettes/seqbio.variant.scoring-vignette.Rmd +++ b/vignettes/seqbio.variant.scoring-vignette.Rmd @@ -11,10 +11,118 @@ vignette: > ## Introduction -seqbio.variant.scoring, an R package meant to aid in comparative evaluation of candidate in family based rare-variant association studies. The package considers evidence of rare variants' association with disease status in a family, based on relationships of individuals in a pedigree and can thereby assess the relative merit of candidate variants. +When looking at shared rare variants across families, not all sets of affected and unaffected individuals are equal. seqbio.variant.scoring is an R package meant to aid in comparative evaluation of candidate in family based rare-variant association studies. The package considers evidence of rare variants' association with disease status in a family, based on relationships of individuals in a pedigree and can thereby assess the relative merit of candidate variants. The program leverages Wright’s coefficient of relatedness to score families based on the disease status, genotypes and relationship of individuals. A candidate is considered strongest when it is shared by an affected individual and other distantly-related affected family members and not shared by closely related unaffected family members. The theory behind this is that more distantly related individuals have a smaller proportion of their genome that is IBD, so the chance of a false positive (shared variant through chance, not associated with disease) is lower. Conversely, when a closely related relative is unaffected, and the affected and unaffected individuals have different genotypes for the candidate variant, this provides evidence that the disease is not associated with the large portion of the genome shared by chance alone by the two individuals and thereby gives evidence in favour of the candidate. ```{r loadlib, echo = TRUE, results = 'hide', message=FALSE, warning=FALSE} library(seqbio.variant.scoring) ``` + + +## Input data + +### The relation matrix + +The function `read.relation.mat` +```{r} +mat.name1<-system.file('extdata/7003_notch3.mat', package = 'seqbio.variant.scoring') +rel.mat <- read.relation.mat(mat.name1) +rel.mat +``` + + +### The status file + +This file include the disease and variant status for all individuals + + +```{r} + +tsv.name1<-system.file('extdata/7003_notch3.tsv', package = 'seqbio.variant.scoring') +status.df <- read.indiv(tsv.name1) + +status.df +``` + +The disease-genotype scoring can then be encoded using the `score.variant.status` function. + +```{r} + +full.df.status <- score.variant.status(status.df) +full.df.status +``` + + +## How it works +### A Minimal example: Scoring a variant from perspective of a single individual. + +Calculate a relatedness-weighted score for a given rare variant can be performed for + + +## Scoring a family + +For most real-world applications you will likely want to score an entire family in conjunction with one another and take the mean score. This can be accomplished with `score.fam`, which takes in the matrix of relationships and the table with encoded `statvar.cat` of all individuals. + +```{r} + +ex_score_default <- score.fam(rel.mat, full.df.status) +ex_score_default +``` + + +By default `score.fam` returns: +- Nhe scores considering only the Affected individuals as the starting points (skipping the rows for MS-7003-1002, MS-7003-1006, and MS-7003-6001) in the previous example. +- The of the calculated score for each starting individuals. + +Note that if an individual is present in the relationship matrix and not in the status file, it is assumed there is no genetic information for this individual and they are ignored when calculating the variant score. + +The scoring can be changed to summing across all combinations as opposed to the mean by passing the following options. Note using the program in this way will return higher scores for more dense pedigrees. +```{r} + +ex_score_sum <- score.fam(rel.mat, full.df.status, return.sums = TRUE, return.means = FALSE) +ex_score_sum +``` + + +To obtain a long form table with the scores for variants expressed relative to each individual, set both `return.sums` and `return.means` to `FALSE`. +```{r} + +ex_score_table <- score.fam(rel.mat, full.df.status, return.sums = FALSE, return.means = FALSE) +ex_score_table +``` + + + + +```{r} + + +``` + + + + +```{r} + + +``` + + + +```{r} + + +``` + + +```{r} + + +``` + + +```{r} + + +``` From add56a43c85238b60335362635efdc1c98132ea3 Mon Sep 17 00:00:00 2001 From: CNuge Date: Tue, 29 Apr 2025 08:40:13 -0400 Subject: [PATCH 05/43] adding to vignette --- R/relatedness.r | 6 +- vignettes/seqbio.variant.scoring-vignette.Rmd | 83 +++++++++++++++++-- 2 files changed, 79 insertions(+), 10 deletions(-) diff --git a/R/relatedness.r b/R/relatedness.r index c7aed86..367d210 100644 --- a/R/relatedness.r +++ b/R/relatedness.r @@ -4,7 +4,7 @@ #' #' These scores can be used to compare variants of interest within a family. #' -#' For each individual, a relationship-informed weight is applied to their sharing +#' For each encoded relationship, a relationship-informed weight is applied to their sharing #' or not sharing of a variant. #' The score for affected status is: #' (1 / coefficient_of_relatedness) * status_weight @@ -12,7 +12,7 @@ #' (1/0.125) * affected weight #' 8 * 1 #' = 8 points in favour of the variant. -#' Whereas for unaffected unaffected individuals, scores decay the further a person is in +#' Whereas for unaffected individuals, scores decay the further a person is in #' relation to the proband based on the formula: #' ((unaffected.max*2) * coefficient_of_relatedness ) * unaffected_weight #' For example, with the default unaffected.max of 8. The sister that does not have a variant would get a score of @@ -22,7 +22,7 @@ #' If these were the only two relatives considered we could sum the points #' and get a score in favour of the variant of #' 8 + 4 = 12 -#' Ig there is evidence against a variant, this is factored into the score as: +#' If there is evidence against a variant, this is factored into the score as: #' total_score = evidence_for - evidence_against #' For example, if there were also an affected sibling without the variant we would have the score against of: #' (1/0.5) * 1 = 2 diff --git a/vignettes/seqbio.variant.scoring-vignette.Rmd b/vignettes/seqbio.variant.scoring-vignette.Rmd index 416d87b..9c15e86 100644 --- a/vignettes/seqbio.variant.scoring-vignette.Rmd +++ b/vignettes/seqbio.variant.scoring-vignette.Rmd @@ -54,11 +54,6 @@ full.df.status ``` -## How it works -### A Minimal example: Scoring a variant from perspective of a single individual. - -Calculate a relatedness-weighted score for a given rare variant can be performed for - ## Scoring a family @@ -91,28 +86,102 @@ To obtain a long form table with the scores for variants expressed relative to e ex_score_table <- score.fam(rel.mat, full.df.status, return.sums = FALSE, return.means = FALSE) ex_score_table ``` +## How it works. A Minimal example +### Scoring a variant from perspective of a single individual. +This section is meant to demonstrate how the variant scoring is accomplished on a finer scale. A user does not need to interact with the package on this level of granularity. This section is for explanatory purposes only. +Under the hood, the `score.fam` function is running the scoring method once for each affected individual in the status dataframe (or for each individual regardless of status if `affected.only = FALSE`). To do this, for each individual, the program takes corresponding row of the relationship matrix to determine the relations to all other individuals in the pedigree. +For example, the degrees of relationships of all other members of the example family relative to the proband are show in the following subset of the matrix: ```{r} +rel.mat.proband <- rel.mat["MS-7003-1001",] +rel.mat.proband +``` +This is taken along with a list of encoded statuses of all individuals for the given variant. Obove `score.variant.status` used the disease status and a genetic variant are used to determine which category the combo falls in. Within `score.fam`, a labelled list of encodings for all individuals is generated. Where: + - A_c = Affected individual with ALT variant + - A_i = Affected individual without ALT variant + - U_c = Unaffected individual without ALT variant + - U_i = Unaffected individual with ALT variant +```{r} +name.stat.dict <- full.df.status$statvar.cat +names(name.stat.dict) <- full.df.status$name +name.stat.dict +``` +`build.relation.dict` is used summarize the collate the status and relationship information and evidence affected and unaffected relations giving evidence for and evidence against a variant. +```{r} +rel.dict<-build.relation.dict(rel.mat.proband, name.stat.dict) +rel.dict +``` +In this example, the proband, two first degree relations, and a third degree relations are all affected and share the candadte variant. For the affected correct (`A_c`) category we therefore see the following encoded: +```{r} +rel.dict$A_c ``` +Since one first degree unaffected relative has the variant, they are categorized as "unaffected incorrect"(`U_i`) and we see: +```{r} +rel.dict$U_i +``` +Scoring a relatedness-weighted score for the variant from the perspective of the given individual is then performed by `calc.rv.score` +For each encoded relationship, a relationship-informed weight is applied to their sharing or not sharing of a variant. -```{r} +The score for affected status increase as individuals become more distantly related. The formula is: +``` + (1 / coefficient_of_relatedness) * affected.weight +``` +For example, an affected cousin (encoded as a 3) would get a score of: +``` + (1/0.125) * affected.weight + 8 * 1 + = 8 points in favour of the variant. +``` +for unaffected individuals, scores decay the further a person is in relation to the query individual based on the formula: +``` +((unaffected.max*2) * coefficient_of_relatedness ) * unaffected.weight +``` +For example, with the default `unaffected.weight = 0.5` and unaffected sister that does not have a variant would get a score of +``` + ((8*2) 0.5) * unaffected_weight + (16 * 0.5) * 0.5 + = 4 points for the variant. -``` +``` +If these were the only two relatives considered we could sum the points and get a score in favour of the variant of +``` + 8 + 4 = 12 +``` +If there is evidence against a variant, this is factored into the score as: +``` + total_score = evidence_for - evidence_against +``` +For example, if there were also an affected sibling without the variant we would have the score against of: +``` + (1/0.5) * 1 = 2 +``` +The final score for the variant would then be: +``` + for - against = total + 12 - 2 = 10 +``` +Giving a final score of 10 for the variant. +This is all accomplished by the function `calc.rv.score`. ```{r} +calc.rv.score(rel.dict) +``` +The weights of the scoring can be adjusted, for example if we wanted to consider only `affected`-based evidence, we could turn off the unaffected part of the calculation by setting the unaffected weighting to 0. This can be useful for incompletely penetrant variants, where disease status and genotype of unaffected indiviudals are more likely to have imperfect concordance. +```{r} +calc.rv.score(rel.dict, unaffected.weight=0) ``` From 37b1d836e013398ec9a26dff545413aeff198f48 Mon Sep 17 00:00:00 2001 From: CNuge Date: Tue, 29 Apr 2025 09:05:00 -0400 Subject: [PATCH 06/43] adding notes --- vignettes/seqbio.variant.scoring-vignette.Rmd | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/vignettes/seqbio.variant.scoring-vignette.Rmd b/vignettes/seqbio.variant.scoring-vignette.Rmd index 9c15e86..a0b5c45 100644 --- a/vignettes/seqbio.variant.scoring-vignette.Rmd +++ b/vignettes/seqbio.variant.scoring-vignette.Rmd @@ -86,8 +86,8 @@ To obtain a long form table with the scores for variants expressed relative to e ex_score_table <- score.fam(rel.mat, full.df.status, return.sums = FALSE, return.means = FALSE) ex_score_table ``` -## How it works. A Minimal example -### Scoring a variant from perspective of a single individual. +## How scoring works +### A Minimal example, scoring a variant from perspective of a single individual. This section is meant to demonstrate how the variant scoring is accomplished on a finer scale. A user does not need to interact with the package on this level of granularity. This section is for explanatory purposes only. @@ -184,6 +184,8 @@ The weights of the scoring can be adjusted, for example if we wanted to consider calc.rv.score(rel.dict, unaffected.weight=0) ``` +The `score.fam` function automatically walks through this process from all specified perspectives in the pedigree and by default returns the average score. The use of the averages and different perspectives is meant to eliminate pedigree-associated bias, such as for instances when a proband is distantly related to all other members in a family (considering the relaitonships from only the perspective of the proband in this case would give an inflated score for the variant's value). + ```{r} From 9f94b498a4ae45e1f2564e86d72e28a47bab9b47 Mon Sep 17 00:00:00 2001 From: CNuge Date: Tue, 29 Apr 2025 12:35:33 -0400 Subject: [PATCH 07/43] notes on the vignette --- R/relatedness.r | 2 +- vignettes/seqbio.variant.scoring-vignette.Rmd | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/R/relatedness.r b/R/relatedness.r index 367d210..53ffa04 100644 --- a/R/relatedness.r +++ b/R/relatedness.r @@ -27,7 +27,7 @@ #' For example, if there were also an affected sibling without the variant we would have the score against of: #' (1/0.5) * 1 = 2 #' The final score for the variant would then be -#'. for - against = total +#' for - against = total #' 12 - 2 = 10 #' Giving a final score of 10 for the variant. Comparing values across variants can be used #' to rank them based on pedigree-informed levels of variant sharing across affected diff --git a/vignettes/seqbio.variant.scoring-vignette.Rmd b/vignettes/seqbio.variant.scoring-vignette.Rmd index a0b5c45..c411590 100644 --- a/vignettes/seqbio.variant.scoring-vignette.Rmd +++ b/vignettes/seqbio.variant.scoring-vignette.Rmd @@ -11,9 +11,9 @@ vignette: > ## Introduction -When looking at shared rare variants across families, not all sets of affected and unaffected individuals are equal. seqbio.variant.scoring is an R package meant to aid in comparative evaluation of candidate in family based rare-variant association studies. The package considers evidence of rare variants' association with disease status in a family, based on relationships of individuals in a pedigree and can thereby assess the relative merit of candidate variants. +When looking at shared rare variants across families, not all affected and unaffected individuals are equal. seqbio.variant.scoring is an R package meant to aid in comparative evaluation of candidate in family-based rare-variant association studies; the package considers evidence of rare variants' association with disease status in a family and score the variants based on relationships of individuals in a pedigree. Comparingscores across candidates can thereby help in assessing their relative merit. -The program leverages Wright’s coefficient of relatedness to score families based on the disease status, genotypes and relationship of individuals. A candidate is considered strongest when it is shared by an affected individual and other distantly-related affected family members and not shared by closely related unaffected family members. The theory behind this is that more distantly related individuals have a smaller proportion of their genome that is IBD, so the chance of a false positive (shared variant through chance, not associated with disease) is lower. Conversely, when a closely related relative is unaffected, and the affected and unaffected individuals have different genotypes for the candidate variant, this provides evidence that the disease is not associated with the large portion of the genome shared by chance alone by the two individuals and thereby gives evidence in favour of the candidate. +The program leverages Wright’s coefficient of relatedness to score families based on the relationship of individuals as well as their disease status and genotypes for a given variant. A candidate variant is considered strongest when it is shared by an affected individual and other distantly-related affected family members and not shared by closely related unaffected family members. The theory behind this is that more distantly related individuals have a smaller proportion of their genome that is IBD, so the chance of a false positive (shared variant through chance, not due to association with disease) is lower. Conversely, when a closely related relative is unaffected, and the affected and unaffected individuals have different genotypes for the candidate variant, this provides evidence that the disease is not associated with the large IBD portion of the genome shared by the two individuals and thereby gives evidence in favour of the candidate. ```{r loadlib, echo = TRUE, results = 'hide', message=FALSE, warning=FALSE} library(seqbio.variant.scoring) @@ -184,7 +184,7 @@ The weights of the scoring can be adjusted, for example if we wanted to consider calc.rv.score(rel.dict, unaffected.weight=0) ``` -The `score.fam` function automatically walks through this process from all specified perspectives in the pedigree and by default returns the average score. The use of the averages and different perspectives is meant to eliminate pedigree-associated bias, such as for instances when a proband is distantly related to all other members in a family (considering the relaitonships from only the perspective of the proband in this case would give an inflated score for the variant's value). +The `score.fam` function automatically walks through this process from all specified perspectives in the pedigree and by default returns the average score. The use of the averages and different perspectives is meant to eliminate pedigree-associated bias, such as for instances when a proband is distantly related to all other members in a family (considering the relationships from only the perspective of the proband in this case would give an inflated score for the variant's value). ```{r} From ef6de5a8886133f3dc8b6e330edf9cd384cab969 Mon Sep 17 00:00:00 2001 From: CNuge Date: Tue, 29 Apr 2025 13:05:54 -0400 Subject: [PATCH 08/43] anon examples --- R/encoding.R | 2 +- R/io.R | 12 ++++++------ R/relatedness.r | 4 ++-- inst/extdata/1234_ex2.mat | 9 +++++++++ inst/extdata/1234_ex2.tsv | 9 +++++++++ inst/extdata/5432_ex1.mat | 5 +++++ inst/extdata/5432_ex1.tsv | 5 +++++ inst/extdata/5678_ex1.mat | 6 ++++++ inst/extdata/5678_ex1.tsv | 6 ++++++ inst/extdata/9876_ex1.mat | 10 ++++++++++ inst/extdata/9876_ex1.tsv | 10 ++++++++++ inst/extdata/example_pedigree_encoding.tsv | 4 ++-- inst/extdata/example_vcf_extract_4107.tsv | 2 -- inst/extdata/example_vcf_extract_4974.tsv | 2 -- inst/extdata/example_vcf_extract_5678.tsv | 2 ++ inst/extdata/example_vcf_extract_9876.tsv | 2 ++ man/calc.rv.score.Rd | 8 ++++---- man/rank.Rd | 6 +++--- man/read.indiv.Rd | 4 ++-- man/read.relation.mat.Rd | 2 +- man/read.var.table.Rd | 6 +++--- man/score.fam.Rd | 4 ++-- man/score.variant.status.Rd | 2 +- vignettes/seqbio.variant.scoring-vignette.Rmd | 8 ++++---- 24 files changed, 95 insertions(+), 35 deletions(-) create mode 100644 inst/extdata/1234_ex2.mat create mode 100644 inst/extdata/1234_ex2.tsv create mode 100644 inst/extdata/5432_ex1.mat create mode 100644 inst/extdata/5432_ex1.tsv create mode 100644 inst/extdata/5678_ex1.mat create mode 100644 inst/extdata/5678_ex1.tsv create mode 100644 inst/extdata/9876_ex1.mat create mode 100644 inst/extdata/9876_ex1.tsv delete mode 100644 inst/extdata/example_vcf_extract_4107.tsv delete mode 100644 inst/extdata/example_vcf_extract_4974.tsv create mode 100644 inst/extdata/example_vcf_extract_5678.tsv create mode 100644 inst/extdata/example_vcf_extract_9876.tsv diff --git a/R/encoding.R b/R/encoding.R index 659d958..3d80fb9 100644 --- a/R/encoding.R +++ b/R/encoding.R @@ -61,7 +61,7 @@ assign.status <- function(status, variant, theoretical.max=FALSE){ #' TODO - switch to numbers 1-4 and -1? #' @param indiv.df A dataframe with the format: #' name status variant -#' MS-4107-1001 A 0/1 +#' MS-5678-1001 A 0/1 #' @param theoretical.max Should the theoretical maxima be returned instead of the observed values? #' When true, the scoring assumes correct variant-status pair for each individual. #' Default is FALSE. diff --git a/R/io.R b/R/io.R index b29b019..6b64245 100644 --- a/R/io.R +++ b/R/io.R @@ -4,10 +4,10 @@ #' #' @param fname A file name, expected format of contents is: #' name status variant -#' MS-4107-1001 A 0/1 +#' MS-5678-1001 A 0/1 #' @return A data frame. #' @examples -#' tsv.name1 <-system.file('extdata/7003_notch3.tsv', package = 'seqbio.variant.scoring') +#' tsv.name1 <-system.file('extdata/1234_ex2.tsv', package = 'seqbio.variant.scoring') #' id.df1 <- read.indiv(tsv.name1) #' @export read.indiv <- function(fname){ @@ -24,7 +24,7 @@ read.indiv <- function(fname){ #' @param fname The file with the relationship matrix information. #' @return A matrix with the relationships and individual ids as rownames and colnames. #' @examples -#' mat.name1 <-system.file('extdata/7003_notch3.mat', package = 'seqbio.variant.scoring') +#' mat.name1 <-system.file('extdata/1234_ex2.mat', package = 'seqbio.variant.scoring') #' mat1 <- read.relation.mat(mat.name1) #' @export read.relation.mat <- function(fname){ @@ -49,14 +49,14 @@ read.relation.mat <- function(fname){ #' #' #' @param fname A file name, expected format of contents is: -#' #CHROM POS REF ALT MS-4107-1001_A MS-4107-1002_U ... +#' #CHROM POS REF ALT MS-5678-1001_A MS-5678-1002_U ... #' chr3 46203838 G A 0/1 0/0 ... #' @return A dataframe. #' Data will be worked into a data frame with format. #' name status variant -#' MS-4107-1001 A 0/1 +#' MS-5678-1001 A 0/1 #' @examples -#' ex.infile <-system.file('extdata/example_vcf_extract_4107.tsv', +#' ex.infile <-system.file('extdata/example_vcf_extract_5678.tsv', #' package = 'seqbio.variant.scoring') #' read.var.table(ex.infile) #' @export diff --git a/R/relatedness.r b/R/relatedness.r index 53ffa04..7337ae4 100644 --- a/R/relatedness.r +++ b/R/relatedness.r @@ -156,8 +156,8 @@ subset.mat <- function(mat.df, status.df){ #' of points for or against. This simplifies scoring and allows for fast filtering of poor quality variants. Default is 4. #' @return A labelled vector with names: score, score.for, score.against #' @examples -#' mat.name1<-system.file('extdata/7003_notch3.mat', package = 'seqbio.variant.scoring') -#' tsv.name1<-system.file('extdata/7003_notch3.tsv', package = 'seqbio.variant.scoring') +#' mat.name1<-system.file('extdata/1234_ex2.mat', package = 'seqbio.variant.scoring') +#' tsv.name1<-system.file('extdata/1234_ex2.tsv', package = 'seqbio.variant.scoring') #' mat.df <- read.relation.mat(mat.name1) #' ind.df <- read.indiv(tsv.name1) #' ind.df.status <- score.variant.status(ind.df) diff --git a/inst/extdata/1234_ex2.mat b/inst/extdata/1234_ex2.mat new file mode 100644 index 0000000..f53b7cd --- /dev/null +++ b/inst/extdata/1234_ex2.mat @@ -0,0 +1,9 @@ + MS-1234-1001 MS-1234-1002 MS-1234-1003 MS-1234-1004 MS-1234-1005 MS-1234-1006 MS-1234-5001 MS-1234-6001 +MS-1234-1001 0 1 1 3 3 1 1 2 +MS-1234-1002 1 0 1 3 3 1 2 1 +MS-1234-1003 1 1 0 3 3 1 2 2 +MS-1234-1004 3 3 3 0 -1 3 4 4 +MS-1234-1005 3 3 3 -1 0 3 4 4 +MS-1234-1006 1 1 1 3 3 0 2 2 +MS-1234-5001 2 4 4 2 2 1 0 3 +MS-1234-6001 3 2 4 4 2 1 2 0 diff --git a/inst/extdata/1234_ex2.tsv b/inst/extdata/1234_ex2.tsv new file mode 100644 index 0000000..117e4db --- /dev/null +++ b/inst/extdata/1234_ex2.tsv @@ -0,0 +1,9 @@ +name status variant +MS-1234-1001 A 0/1 +MS-1234-1002 U 0/1 +MS-1234-1003 A 0/1 +MS-1234-1004 A 0/1 +MS-1234-1005 A 0/0 +MS-1234-1006 U 0/0 +MS-1234-5001 A 0/1 +MS-1234-6001 U 0/0 diff --git a/inst/extdata/5432_ex1.mat b/inst/extdata/5432_ex1.mat new file mode 100644 index 0000000..a27bed4 --- /dev/null +++ b/inst/extdata/5432_ex1.mat @@ -0,0 +1,5 @@ + MS-5432-1001 MS-5432-1002 MS-5432-6001 MS-5432-6002 +MS-5432-1001 0 1 2 3 +MS-5432-1002 1 0 1 2 +MS-5432-6001 2 1 0 1 +MS-5432-6002 3 2 1 0 diff --git a/inst/extdata/5432_ex1.tsv b/inst/extdata/5432_ex1.tsv new file mode 100644 index 0000000..cfcce59 --- /dev/null +++ b/inst/extdata/5432_ex1.tsv @@ -0,0 +1,5 @@ +name status variant +MS-5432-1001 A 0/1 +MS-5432-1002 U 0/0 +MS-5432-6001 A 0/0 +MS-5432-6002 U 0/0 diff --git a/inst/extdata/5678_ex1.mat b/inst/extdata/5678_ex1.mat new file mode 100644 index 0000000..632b3b2 --- /dev/null +++ b/inst/extdata/5678_ex1.mat @@ -0,0 +1,6 @@ + MS-5678-1001 MS-5678-1002 MS-5678-1003 MS-5678-1004 MS-5678-6001 +MS-5678-1001 0 1 1 1 2 +MS-5678-1002 1 0 1 1 2 +MS-5678-1003 1 1 0 1 2 +MS-5678-1004 1 1 1 0 1 +MS-5678-6001 2 2 2 1 0 diff --git a/inst/extdata/5678_ex1.tsv b/inst/extdata/5678_ex1.tsv new file mode 100644 index 0000000..f3751b4 --- /dev/null +++ b/inst/extdata/5678_ex1.tsv @@ -0,0 +1,6 @@ +name status variant +MS-5678-1001 A 0/1 +MS-5678-1002 U 0/1 +MS-5678-1003 A 0/1 +MS-5678-1004 U 0/0 +MS-5678-6001 U 0/0 diff --git a/inst/extdata/9876_ex1.mat b/inst/extdata/9876_ex1.mat new file mode 100644 index 0000000..50858bc --- /dev/null +++ b/inst/extdata/9876_ex1.mat @@ -0,0 +1,10 @@ + MS-9876-1001 MS-9876-1002 MS-9876-1003 MS-9876-1004 MS-9876-1005 MS-9876-1006 MS-9876-1007 MS-9876-1008 MS-9876-1009 +MS-9876-1001 0 3 4 3 4 4 4 4 3 +MS-9876-1002 3 0 1 1 2 4 4 4 3 +MS-9876-1003 4 1 0 2 3 5 5 5 4 +MS-9876-1004 3 1 2 0 1 4 4 4 3 +MS-9876-1005 4 2 3 1 0 6 6 6 5 +MS-9876-1006 4 4 5 4 6 0 1 1 2 +MS-9876-1007 4 4 5 4 6 1 0 1 2 +MS-9876-1008 4 4 5 4 6 1 1 0 2 +MS-9876-1009 3 3 4 3 5 2 2 2 0 diff --git a/inst/extdata/9876_ex1.tsv b/inst/extdata/9876_ex1.tsv new file mode 100644 index 0000000..3838a4e --- /dev/null +++ b/inst/extdata/9876_ex1.tsv @@ -0,0 +1,10 @@ +name status variant +MS-9876-1001 A 0/1 +MS-9876-1002 U 0/0 +MS-9876-1003 U 0/0 +MS-9876-1004 A 0/1 +MS-9876-1005 A 0/1 +MS-9876-1006 A 0/0 +MS-9876-1007 U 0/0 +MS-9876-1008 U 0/0 +MS-9876-1009 U 0/0 diff --git a/inst/extdata/example_pedigree_encoding.tsv b/inst/extdata/example_pedigree_encoding.tsv index 747efee..c2eef67 100644 --- a/inst/extdata/example_pedigree_encoding.tsv +++ b/inst/extdata/example_pedigree_encoding.tsv @@ -9,7 +9,7 @@ Family max_a a max_b b max_c c max_d d max_n n 0347 6 2 6 3 17 1 7,3,1,1 1 2,3,1,8 1 5031 5 2 8 4 2 4 5 2 0 0 6325 7 2 5 0 4 0 1,2 0 1,2 0 -7003 5 4 4 1 4 0 1,2 1 1,3 1 -4974 7 4 2 0 7 4 1 1 5 1 +1234 5 4 4 1 4 0 1,2 1 1,3 1 +9876 7 4 2 0 7 4 1 1 5 1 4006 2 2 1 0 4 1 0 0 0 0 3734 3 0 2 0 3 0 1,3,1 1 1,3,5 0 diff --git a/inst/extdata/example_vcf_extract_4107.tsv b/inst/extdata/example_vcf_extract_4107.tsv deleted file mode 100644 index c63da40..0000000 --- a/inst/extdata/example_vcf_extract_4107.tsv +++ /dev/null @@ -1,2 +0,0 @@ -#CHROM POS REF ALT MS-4107-1001_A MS-4107-1002_U MS-4107-1004_U MS-4107-6001_U MS-4107-1003_A -chr3 46203838 G A 0/1 0/0 0/0 0/0 0/1 diff --git a/inst/extdata/example_vcf_extract_4974.tsv b/inst/extdata/example_vcf_extract_4974.tsv deleted file mode 100644 index fd08b2c..0000000 --- a/inst/extdata/example_vcf_extract_4974.tsv +++ /dev/null @@ -1,2 +0,0 @@ -#CHROM POS REF ALT MS-4974-1002_U MS-4974-1009_U MS-4974-1006_A MS-4974-1004_A MS-4974-1007_U MS-4974-1003_U MS-4974-1001_A MS-4974-1008_U MS-4974-1005_U -chr12 120155305 C T 0|0 0|0 0|0 0|1 0|0 0|0 0|1 0|0 0|1 diff --git a/inst/extdata/example_vcf_extract_5678.tsv b/inst/extdata/example_vcf_extract_5678.tsv new file mode 100644 index 0000000..c5894d2 --- /dev/null +++ b/inst/extdata/example_vcf_extract_5678.tsv @@ -0,0 +1,2 @@ +#CHROM POS REF ALT MS-5678-1001_A MS-5678-1002_U MS-5678-1004_U MS-5678-6001_U MS-5678-1003_A +chr3 46203838 G A 0/1 0/0 0/0 0/0 0/1 diff --git a/inst/extdata/example_vcf_extract_9876.tsv b/inst/extdata/example_vcf_extract_9876.tsv new file mode 100644 index 0000000..d3063fe --- /dev/null +++ b/inst/extdata/example_vcf_extract_9876.tsv @@ -0,0 +1,2 @@ +#CHROM POS REF ALT MS-9876-1002_U MS-9876-1009_U MS-9876-1006_A MS-9876-1004_A MS-9876-1007_U MS-9876-1003_U MS-9876-1001_A MS-9876-1008_U MS-9876-1005_U +chr12 120155305 C T 0|0 0|0 0|0 0|1 0|0 0|0 0|1 0|0 0|1 diff --git a/man/calc.rv.score.Rd b/man/calc.rv.score.Rd index d5f42bc..065a096 100644 --- a/man/calc.rv.score.Rd +++ b/man/calc.rv.score.Rd @@ -42,7 +42,7 @@ A list with three components: score, score.for, score.against. These scores can be used to compare variants of interest within a family. } \details{ -For each individual, a relationship-informed weight is applied to their sharing +For each encoded relationship, a relationship-informed weight is applied to their sharing or not sharing of a variant. The score for affected status is: (1 / coefficient_of_relatedness) * status_weight @@ -50,7 +50,7 @@ For example, an affected cousin (encoded as a 3) would get a score of: (1/0.125) * affected weight 8 * 1 = 8 points in favour of the variant. -Whereas for unaffected unaffected individuals, scores decay the further a person is in +Whereas for unaffected individuals, scores decay the further a person is in relation to the proband based on the formula: ((unaffected.max\emph{2) * coefficient_of_relatedness ) * unaffected_weight For example, with the default unaffected.max of 8. The sister that does not have a variant would get a score of @@ -60,12 +60,12 @@ For example, with the default unaffected.max of 8. The sister that does not have If these were the only two relatives considered we could sum the points and get a score in favour of the variant of 8 + 4 = 12 -Ig there is evidence against a variant, this is factored into the score as: +If there is evidence against a variant, this is factored into the score as: total_score = evidence_for - evidence_against For example, if there were also an affected sibling without the variant we would have the score against of: (1/0.5) * 1 = 2 The final score for the variant would then be -. for - against = total +for - against = total 12 - 2 = 10 Giving a final score of 10 for the variant. Comparing values across variants can be used to rank them based on pedigree-informed levels of variant sharing across affected diff --git a/man/rank.Rd b/man/rank.Rd index dc5b01a..fae0da3 100644 --- a/man/rank.Rd +++ b/man/rank.Rd @@ -4,14 +4,14 @@ \alias{rank} \title{Rank the pedigrees using the pihat values.} \usage{ -rank(pihat, rank.by.K = FALSE) +rank(pihat, K = -1, rank.by.K = FALSE) } \arguments{ \item{pihat}{Estimated proportion of genome shared between individuals, from function: ibd.} -\item{rank.by.K}{Should the ranking of families be based on ibd (FALSE), or by IBD and K (TRUE). Default is FALSE} - \item{K}{Estimated penetrance value, from function: penetrance.} + +\item{rank.by.K}{Should the ranking of families be based on ibd (FALSE), or by IBD and K (TRUE). Default is FALSE} } \description{ Rank the pedigrees using the pihat values. diff --git a/man/read.indiv.Rd b/man/read.indiv.Rd index b963459..ec59145 100644 --- a/man/read.indiv.Rd +++ b/man/read.indiv.Rd @@ -9,7 +9,7 @@ read.indiv(fname) \arguments{ \item{fname}{A file name, expected format of contents is: name status variant -MS-4107-1001 A 0/1} +MS-5678-1001 A 0/1} } \value{ A data frame. @@ -18,6 +18,6 @@ A data frame. Read in variant and status info for individuals. } \examples{ -tsv.name1 <-system.file('extdata/7003_notch3.tsv', package = 'seqbio.variant.scoring') +tsv.name1 <-system.file('extdata/1234_ex2.tsv', package = 'seqbio.variant.scoring') id.df1 <- read.indiv(tsv.name1) } diff --git a/man/read.relation.mat.Rd b/man/read.relation.mat.Rd index a611a59..cdf64e7 100644 --- a/man/read.relation.mat.Rd +++ b/man/read.relation.mat.Rd @@ -18,6 +18,6 @@ Row/column intersections give the degree of relationship for the two individuals. 0 = self, -1 = unrelated. } \examples{ -mat.name1 <-system.file('extdata/7003_notch3.mat', package = 'seqbio.variant.scoring') +mat.name1 <-system.file('extdata/1234_ex2.mat', package = 'seqbio.variant.scoring') mat1 <- read.relation.mat(mat.name1) } diff --git a/man/read.var.table.Rd b/man/read.var.table.Rd index 0ed188b..ee7f129 100644 --- a/man/read.var.table.Rd +++ b/man/read.var.table.Rd @@ -10,14 +10,14 @@ read.var.table(fname) } \arguments{ \item{fname}{A file name, expected format of contents is: -#CHROM POS REF ALT MS-4107-1001_A MS-4107-1002_U ... +#CHROM POS REF ALT MS-5678-1001_A MS-5678-1002_U ... chr3 46203838 G A 0/1 0/0 ...} } \value{ A dataframe. Data will be worked into a data frame with format. name status variant -MS-4107-1001 A 0/1 +MS-5678-1001 A 0/1 } \description{ Note - ensure the status in the names match your desired encoding! @@ -25,7 +25,7 @@ There are individuals with ambigious statues, that you may require to be encoded in a specific fashion for you current purposes. } \examples{ -ex.infile <-system.file('extdata/example_vcf_extract_4107.tsv', +ex.infile <-system.file('extdata/example_vcf_extract_5678.tsv', package = 'seqbio.variant.scoring') read.var.table(ex.infile) } diff --git a/man/score.fam.Rd b/man/score.fam.Rd index 9ca98d3..1f32802 100644 --- a/man/score.fam.Rd +++ b/man/score.fam.Rd @@ -58,8 +58,8 @@ NOTE: if affected.only = True, the averages and sums are calculated using only t } } \examples{ -mat.name1<-system.file('extdata/7003_notch3.mat', package = 'seqbio.variant.scoring') -tsv.name1<-system.file('extdata/7003_notch3.tsv', package = 'seqbio.variant.scoring') +mat.name1<-system.file('extdata/1234_ex2.mat', package = 'seqbio.variant.scoring') +tsv.name1<-system.file('extdata/1234_ex2.tsv', package = 'seqbio.variant.scoring') mat.df <- read.relation.mat(mat.name1) ind.df <- read.indiv(tsv.name1) ind.df.status <- score.variant.status(ind.df) diff --git a/man/score.variant.status.Rd b/man/score.variant.status.Rd index 526ce52..e41317d 100644 --- a/man/score.variant.status.Rd +++ b/man/score.variant.status.Rd @@ -11,7 +11,7 @@ score.variant.status(indiv.df, theoretical.max = FALSE) \arguments{ \item{indiv.df}{A dataframe with the format: name status variant -MS-4107-1001 A 0/1} +MS-5678-1001 A 0/1} \item{theoretical.max}{Should the theoretical maxima be returned instead of the observed values? When true, the scoring assumes correct variant-status pair for each individual. diff --git a/vignettes/seqbio.variant.scoring-vignette.Rmd b/vignettes/seqbio.variant.scoring-vignette.Rmd index c411590..436669b 100644 --- a/vignettes/seqbio.variant.scoring-vignette.Rmd +++ b/vignettes/seqbio.variant.scoring-vignette.Rmd @@ -26,7 +26,7 @@ library(seqbio.variant.scoring) The function `read.relation.mat` ```{r} -mat.name1<-system.file('extdata/7003_notch3.mat', package = 'seqbio.variant.scoring') +mat.name1<-system.file('extdata/1234_ex2.mat', package = 'seqbio.variant.scoring') rel.mat <- read.relation.mat(mat.name1) rel.mat ``` @@ -39,7 +39,7 @@ This file include the disease and variant status for all individuals ```{r} -tsv.name1<-system.file('extdata/7003_notch3.tsv', package = 'seqbio.variant.scoring') +tsv.name1<-system.file('extdata/1234_ex2.tsv', package = 'seqbio.variant.scoring') status.df <- read.indiv(tsv.name1) status.df @@ -67,7 +67,7 @@ ex_score_default By default `score.fam` returns: -- Nhe scores considering only the Affected individuals as the starting points (skipping the rows for MS-7003-1002, MS-7003-1006, and MS-7003-6001) in the previous example. +- Nhe scores considering only the Affected individuals as the starting points (skipping the rows for MS-1234-1002, MS-1234-1006, and MS-1234-6001) in the previous example. - The of the calculated score for each starting individuals. Note that if an individual is present in the relationship matrix and not in the status file, it is assumed there is no genetic information for this individual and they are ignored when calculating the variant score. @@ -96,7 +96,7 @@ Under the hood, the `score.fam` function is running the scoring method once for For example, the degrees of relationships of all other members of the example family relative to the proband are show in the following subset of the matrix: ```{r} -rel.mat.proband <- rel.mat["MS-7003-1001",] +rel.mat.proband <- rel.mat["MS-1234-1001",] rel.mat.proband ``` This is taken along with a list of encoded statuses of all individuals for the given variant. Obove `score.variant.status` used the disease status and a genetic variant are used to determine which category the combo falls in. Within `score.fam`, a labelled list of encodings for all individuals is generated. Where: From a483348fe7b8e307f0e52e503896e361486d7483 Mon Sep 17 00:00:00 2001 From: CNuge Date: Wed, 30 Apr 2025 08:16:05 -0400 Subject: [PATCH 09/43] draft of relatedness vignette --- vignettes/seqbio.variant.scoring-vignette.Rmd | 57 +++++++++---------- 1 file changed, 27 insertions(+), 30 deletions(-) diff --git a/vignettes/seqbio.variant.scoring-vignette.Rmd b/vignettes/seqbio.variant.scoring-vignette.Rmd index 436669b..2edd606 100644 --- a/vignettes/seqbio.variant.scoring-vignette.Rmd +++ b/vignettes/seqbio.variant.scoring-vignette.Rmd @@ -11,7 +11,7 @@ vignette: > ## Introduction -When looking at shared rare variants across families, not all affected and unaffected individuals are equal. seqbio.variant.scoring is an R package meant to aid in comparative evaluation of candidate in family-based rare-variant association studies; the package considers evidence of rare variants' association with disease status in a family and score the variants based on relationships of individuals in a pedigree. Comparingscores across candidates can thereby help in assessing their relative merit. +When looking at shared rare variants across families, not all affected and unaffected individuals are of equal importance. `seqbio.variant.scoring` is an R package meant to aid in comparative evaluation of information in family-based rare-variant association studies; the package considers evidence of rare variant's association with disease status in a family and scores the variant based on relationships of individuals in a pedigree. Comparing scores across candidates can thereby help in assessing their relative merit. The program leverages Wright’s coefficient of relatedness to score families based on the relationship of individuals as well as their disease status and genotypes for a given variant. A candidate variant is considered strongest when it is shared by an affected individual and other distantly-related affected family members and not shared by closely related unaffected family members. The theory behind this is that more distantly related individuals have a smaller proportion of their genome that is IBD, so the chance of a false positive (shared variant through chance, not due to association with disease) is lower. Conversely, when a closely related relative is unaffected, and the affected and unaffected individuals have different genotypes for the candidate variant, this provides evidence that the disease is not associated with the large IBD portion of the genome shared by the two individuals and thereby gives evidence in favour of the candidate. @@ -24,6 +24,8 @@ library(seqbio.variant.scoring) ### The relation matrix +This input is a matrix containing all the pairwise relationships of individuals in a family. The row and column names are the individual IDs, and the intersecting value denotes the degree of relationship between the individuals (self = 0, 1st degree relations. = 1, etc. Unrelated individuals are given a value of -1). + The function `read.relation.mat` ```{r} mat.name1<-system.file('extdata/1234_ex2.mat', package = 'seqbio.variant.scoring') @@ -34,7 +36,7 @@ rel.mat ### The status file -This file include the disease and variant status for all individuals +This file includes the same individual IDs used in the relationship matrix as well as the disease and variant status for all individuals. ```{r} @@ -45,7 +47,7 @@ status.df <- read.indiv(tsv.name1) status.df ``` -The disease-genotype scoring can then be encoded using the `score.variant.status` function. +The disease-genotype scoring can then be encoded using the `score.variant.status` function to produce the status-variant category for all individuals (new output column named: `statvar.cat`). ```{r} @@ -67,8 +69,8 @@ ex_score_default By default `score.fam` returns: -- Nhe scores considering only the Affected individuals as the starting points (skipping the rows for MS-1234-1002, MS-1234-1006, and MS-1234-6001) in the previous example. -- The of the calculated score for each starting individuals. +- The scores considering only the Affected individuals as the reference individual (skipping the rows for the U individuals: MS-1234-1002, MS-1234-1006, and MS-1234-6001, in the previous example). +- The mean of the calculated score from each reference individuals. Note that if an individual is present in the relationship matrix and not in the status file, it is assumed there is no genetic information for this individual and they are ignored when calculating the variant score. @@ -80,7 +82,7 @@ ex_score_sum ``` -To obtain a long form table with the scores for variants expressed relative to each individual, set both `return.sums` and `return.means` to `FALSE`. +To obtain a long form table with the scores for variants expressed relative to each individual, set both `return.sums` and `return.means` to `FALSE`. This output can aid in identifying which individuals are carrying the most weight in a family's score. ```{r} ex_score_table <- score.fam(rel.mat, full.df.status, return.sums = FALSE, return.means = FALSE) @@ -89,21 +91,21 @@ ex_score_table ## How scoring works ### A Minimal example, scoring a variant from perspective of a single individual. -This section is meant to demonstrate how the variant scoring is accomplished on a finer scale. A user does not need to interact with the package on this level of granularity. This section is for explanatory purposes only. +This section is meant to demonstrate how the variant scoring is accomplished on a finer scale. A user does not need to interact with the package on this level of granularity. This section is for explanatory purposes only, demonstrating how the `score.fam` function operated "under the hood". -Under the hood, the `score.fam` function is running the scoring method once for each affected individual in the status dataframe (or for each individual regardless of status if `affected.only = FALSE`). To do this, for each individual, the program takes corresponding row of the relationship matrix to determine the relations to all other individuals in the pedigree. +The `score.fam` function runs the scoring method once for each affected individual in the status dataframe (or for each individual regardless of status if `affected.only = FALSE`). To do this, for each individual, the program takes corresponding row of the relationship matrix to determine the relations to all other individuals in the pedigree. -For example, the degrees of relationships of all other members of the example family relative to the proband are show in the following subset of the matrix: +For example, the degrees of relationships of all other members of the example family relative to the reference individual `"MS-1234-1001"` are show in the following subset of the matrix: ```{r} rel.mat.proband <- rel.mat["MS-1234-1001",] rel.mat.proband ``` -This is taken along with a list of encoded statuses of all individuals for the given variant. Obove `score.variant.status` used the disease status and a genetic variant are used to determine which category the combo falls in. Within `score.fam`, a labelled list of encodings for all individuals is generated. Where: - - A_c = Affected individual with ALT variant - - A_i = Affected individual without ALT variant - - U_c = Unaffected individual without ALT variant - - U_i = Unaffected individual with ALT variant +This is taken along with a list of encoded statuses of all individuals for the given variant. Above `score.variant.status` used the disease status and a genetic variant are used to determine which category the combo falls in. Within `score.fam`, a labelled list of encodings for all individuals is generated. Where: + - A_c = Affected individual with variant + - A_i = Affected individual without variant + - U_c = Unaffected individual without variant + - U_i = Unaffected individual with variant ```{r} name.stat.dict <- full.df.status$statvar.cat names(name.stat.dict) <- full.df.status$name @@ -115,7 +117,7 @@ name.stat.dict rel.dict<-build.relation.dict(rel.mat.proband, name.stat.dict) rel.dict ``` -In this example, the proband, two first degree relations, and a third degree relations are all affected and share the candadte variant. For the affected correct (`A_c`) category we therefore see the following encoded: +In this example, the proband, two first degree relations, and a third degree relations are all affected and share the candidate variant. For the affected correct (`A_c`) category we therefore see the following encoded: ```{r} rel.dict$A_c @@ -126,22 +128,27 @@ Since one first degree unaffected relative has the variant, they are categorized rel.dict$U_i ``` -Scoring a relatedness-weighted score for the variant from the perspective of the given individual is then performed by `calc.rv.score` +Deriving a relatedness-weighted score for the variant from the perspective of the given individual is then performed by `calc.rv.score` -For each encoded relationship, a relationship-informed weight is applied to their sharing or not sharing of a variant. +For each degree-encoded relationship, the coefficient of relatedness is used to weight the evidence for or against a variant. The coefficients for different degress of relationship are: +```{r} +for(i in 0:7){ + print(paste0("Degree of relatedness: ", i, " coefficient of relatedness: ", 1 / (2 ** (i)))) +} +``` The score for affected status increase as individuals become more distantly related. The formula is: ``` (1 / coefficient_of_relatedness) * affected.weight ``` -For example, an affected cousin (encoded as a 3) would get a score of: +For example, an affected cousin (encoded as a 3 and having a coefficient of relatedness of 0.125) would get a score of: ``` (1/0.125) * affected.weight 8 * 1 = 8 points in favour of the variant. ``` -for unaffected individuals, scores decay the further a person is in relation to the query individual based on the formula: +For unaffected individuals, scores decay the further a person is in relation to the query individual based on the formula: ``` ((unaffected.max*2) * coefficient_of_relatedness ) * unaffected.weight ``` @@ -178,7 +185,7 @@ This is all accomplished by the function `calc.rv.score`. calc.rv.score(rel.dict) ``` -The weights of the scoring can be adjusted, for example if we wanted to consider only `affected`-based evidence, we could turn off the unaffected part of the calculation by setting the unaffected weighting to 0. This can be useful for incompletely penetrant variants, where disease status and genotype of unaffected indiviudals are more likely to have imperfect concordance. +The weights of the scoring can be adjusted, for example if we wanted to consider only `affected`-based evidence, we could turn off the unaffected part of the calculation by setting the unaffected weighting to 0. This can be useful for incompletely penetrant variants, where disease status and genotype of unaffected individuals are more likely to have imperfect concordance. ```{r} calc.rv.score(rel.dict, unaffected.weight=0) @@ -187,13 +194,3 @@ calc.rv.score(rel.dict, unaffected.weight=0) The `score.fam` function automatically walks through this process from all specified perspectives in the pedigree and by default returns the average score. The use of the averages and different perspectives is meant to eliminate pedigree-associated bias, such as for instances when a proband is distantly related to all other members in a family (considering the relationships from only the perspective of the proband in this case would give an inflated score for the variant's value). -```{r} - - -``` - - -```{r} - - -``` From 486e4e9ccb9dd4e2b4eafb0af23cbc18f676854b Mon Sep 17 00:00:00 2001 From: CNuge Date: Fri, 2 May 2025 14:54:52 -0300 Subject: [PATCH 10/43] working on draft of second vignette --- ...qbio.variant.scoring-penetrance_and_ibd.Rmd | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) create mode 100644 vignettes/seqbio.variant.scoring-penetrance_and_ibd.Rmd diff --git a/vignettes/seqbio.variant.scoring-penetrance_and_ibd.Rmd b/vignettes/seqbio.variant.scoring-penetrance_and_ibd.Rmd new file mode 100644 index 0000000..b418eac --- /dev/null +++ b/vignettes/seqbio.variant.scoring-penetrance_and_ibd.Rmd @@ -0,0 +1,18 @@ +--- +title: "seqbio.variant.scoring - penetrance and idb informed scoring of families" +author: "Cameron M. Nugent" +data: "`r Sys.Date()`" +output: pdf_document #rmarkdown::html_vignette # +vignette: > + %\VignetteIndexEntry{seqbio.variant.scoring-vignette} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +#TODO - pull origin dev once both the penetrance and vignette PRs are approved and merged in. + +## Introduction + +The `cal.penetrance` and associated `penetrance` and `ibd` functions facilitate comparison of the relative "power" of families ina study. This can be accomplished upstream of sequencing (i.e. in the project planning stage) and is therefore only dependent on relationship structure and reported affected status of individuals in a given family or set of families + + From f871511917bbdcb50928843d5b7f134f024af275 Mon Sep 17 00:00:00 2001 From: CNuge Date: Fri, 2 May 2025 14:57:16 -0300 Subject: [PATCH 11/43] simple outline of proposed sections: --- ...eqbio.variant.scoring-penetrance_and_ibd.Rmd | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/vignettes/seqbio.variant.scoring-penetrance_and_ibd.Rmd b/vignettes/seqbio.variant.scoring-penetrance_and_ibd.Rmd index b418eac..db2c634 100644 --- a/vignettes/seqbio.variant.scoring-penetrance_and_ibd.Rmd +++ b/vignettes/seqbio.variant.scoring-penetrance_and_ibd.Rmd @@ -10,9 +10,26 @@ vignette: > --- #TODO - pull origin dev once both the penetrance and vignette PRs are approved and merged in. +#TODO - round out the unit tests as you develop and work through the minimalist examples. + + ## Introduction The `cal.penetrance` and associated `penetrance` and `ibd` functions facilitate comparison of the relative "power" of families ina study. This can be accomplished upstream of sequencing (i.e. in the project planning stage) and is therefore only dependent on relationship structure and reported affected status of individuals in a given family or set of families +## The input data + + +## Calculating IBD + + +## Calculating penetrance + + +## Combining IBD and penetrance into a pedigree score + + +## Comparison of current and maximum pedigree score + From bf241e1775d73ecc8a2da0e1dd388c61ba19520a Mon Sep 17 00:00:00 2001 From: CNuge Date: Mon, 5 May 2025 07:53:31 -0300 Subject: [PATCH 12/43] adding description to the vignette --- ...bio.variant.scoring-penetrance_and_ibd.Rmd | 80 ++++++++++++++++++- 1 file changed, 79 insertions(+), 1 deletion(-) diff --git a/vignettes/seqbio.variant.scoring-penetrance_and_ibd.Rmd b/vignettes/seqbio.variant.scoring-penetrance_and_ibd.Rmd index db2c634..744835d 100644 --- a/vignettes/seqbio.variant.scoring-penetrance_and_ibd.Rmd +++ b/vignettes/seqbio.variant.scoring-penetrance_and_ibd.Rmd @@ -16,11 +16,89 @@ vignette: > ## Introduction -The `cal.penetrance` and associated `penetrance` and `ibd` functions facilitate comparison of the relative "power" of families ina study. This can be accomplished upstream of sequencing (i.e. in the project planning stage) and is therefore only dependent on relationship structure and reported affected status of individuals in a given family or set of families +The `cal.penetrance` and associated `penetrance` and `ibd` functions facilitate comparison of the relative "power" of families ina study. This can be accomplished upstream of sequencing (i.e. in the project planning stage) and is therefore only dependent on relationship structure and reported affected status of individuals in a given family or set of families. + +The `cal.penetrance` function generates both a theoretical ranking of the power of a family assuming you were able to collect everyone on the simplified pedigree, as well as a current ranking, examining only those for whom you currently have DNA. This allows evaluation of the impact on the ranking if certain other family members are enrolled in the study. + ## The input data +# Estimating power of families + +This tool facilitates comparison of the relative "power" of families as they come into our studies, e.g. to prioritize followup. This is upstream of sequencing, so is entirely dependent on relationship structure and reported affected status. + +Essentially, more distant relationships between affecteds (e.g. affected cousins) are more valuable than close relationships (e.g. affected siblings), as there is less IBD and therefore a smaller search space. On the other hand, if the line of relatives between those affected individuals contains any unaffected subjects, then we have detected the presence of obligate carriers; we can use this information to penalize the ranking as it indicates that there is a chance any unaffected actually carries the pertinent variant. + + +## Simplifying pedigrees + +First, you'll need to prune down pedigrees to informative allele transfers. For +the purposes of this tool, we are excluding young generations (under 20-ish) and +large (more than two sequential generations) trees of exclusively unaffected family +members. There are some judgment calls happening here. See [this](https://sequencebio.atlassian.net/wiki/spaces/SEQUENCEPE/pages/1443364866/Quantifying+power+of+a+family+for+discovery) page for an example. + +Simplifying assumptions: +- Condition is autosomal dominant +- There are no ambiguous statuses +- No more than two sequential generations of unknown carrier status (non-obligate carrier vs. non-carrier). Generalized support of arbitrary tree structures gets a lot more complicated, especially for the likelihood function. +- Exclude big giant trees of unaffecteds - related to above. Will slightly bias the result toward higher penetrance. +- Exclude subjects younger than age of onset + + +## Categories of relationships + +|Category|Description| +|---|---| +|A|Affected individuals| +|B|Obligate carriers| +|C|Children of either affecteds or carriers, with no children of their own| +|D|Trees of unaffected individuals - specifically, two sequential generations (i.e. a parent and their offspring)| + +We have made the editorial decision to exclude trees of unaffecteds that are larger than those described in D. + +## Encoding into categories + +Encoding of families is input via a tab-separated manifest with a header, e.g.: + +``` +Family max_a a max_b b max_c c max_d d max_n n +1111 2 1 0 0 0 0 0 0 0 0 +``` + +All columns with the prefix `max_` are meant to count the total number of each category in the pedigree, while +the columns without this prefix are the number of each category for whom we have samples. + +The categories correspond to A, B, and C as defined above. + +Category D is represented by two numbers, d and n. n is the number of offspring in a tree +of unaffecteds; d is the number of those types of trees across the pedigree. Multiple types of trees are encoded with +commas separating the values. For example, the following represents a family with three total trees of unaffecteds. +One tree (d=1) has three offspring (n=3); two trees (d=2) each have one offspring (n=1). + +``` +d n +1,2 3,1 +``` + +## Theoretical vs. current rankings + +This function generates both a theoretical ranking of the power of a family assuming +you were able to collect everyone on the simplified pedigree, as well as a current +ranking, examining only those for whom you currently have DNA. This allows evaluation +of the impact on the ranking if certain other family members are enrolled in the study. + +### Special cases + +In trees of unaffecteds, there are two special cases for current ranking: +1. You have collected the parent (regardless of collection status of the child). In +this case, the child cannot provide any additional information beyond the parent, so +we only count the parent. (d=1, n=0; equivalently, c=1) +2. You have collected one or more children, but not the parent. In this case, +each of the children contribute a portion of what the parent would have contributed +to our understanding. (d=1, n>0) + + ## Calculating IBD From 957557d91318aa08a3342270955d229e622d1c65 Mon Sep 17 00:00:00 2001 From: CNuge Date: Mon, 5 May 2025 08:22:13 -0300 Subject: [PATCH 13/43] incorporating megs suggestions from review --- README.md | 17 +++++++++++++++++ vignettes/seqbio.variant.scoring-vignette.Rmd | 9 ++++++--- 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index a956dfe..dae0c80 100644 --- a/README.md +++ b/README.md @@ -15,6 +15,23 @@ Since variants can be incompletely penetrant, the scoring can be based solely on of unaffected evidence can be customized. +## How it works + +For a walk through of the seqbio.variant.scoring functions for scoring the value of *families* based on penetrance and IBD, see the corresponging vignette: + +For a walk through of the seqbio.variant.scoring functions for scoring the value of *variants* within families, see the corresponging vignette: + + + +### The relationship matrix + +This input is a matrix containing all the pairwise relationships of individuals in a family. The row and column names are the individual IDs, and the intersecting value denotes the degree of relationship between the individuals (self = 0, 1st degree relations. = 1, etc. Unrelated individuals are given a value of -1). As of version `0.1.0` the relation matrix is a manually created file, where relationship values are assigned via manual inspection of the family pedigree. + +### The status file + +This file includes the same individual IDs used in the relationship matrix as well as the disease and variant status for all individuals. + + ## related info - slides on concept: https://docs.google.com/presentation/d/1yWlj400fbsrS1CCd4wpy7ve9Sov56H82lZh7hoW8vyg/edit#slide=id.g34c18bf0cf2_0_112 diff --git a/vignettes/seqbio.variant.scoring-vignette.Rmd b/vignettes/seqbio.variant.scoring-vignette.Rmd index 2edd606..e2f986c 100644 --- a/vignettes/seqbio.variant.scoring-vignette.Rmd +++ b/vignettes/seqbio.variant.scoring-vignette.Rmd @@ -11,7 +11,7 @@ vignette: > ## Introduction -When looking at shared rare variants across families, not all affected and unaffected individuals are of equal importance. `seqbio.variant.scoring` is an R package meant to aid in comparative evaluation of information in family-based rare-variant association studies; the package considers evidence of rare variant's association with disease status in a family and scores the variant based on relationships of individuals in a pedigree. Comparing scores across candidates can thereby help in assessing their relative merit. +When looking at rare variants within a family, not all affected and unaffected individuals are of equal importance. `seqbio.variant.scoring` is an R package meant to aid in comparative evaluation of information in family-based rare-variant association studies; the package considers evidence of rare variant's association with disease status in a family and scores the variant based on relationships of individuals in a pedigree. Comparing scores across candidates can thereby help in assessing their relative merit. The program leverages Wright’s coefficient of relatedness to score families based on the relationship of individuals as well as their disease status and genotypes for a given variant. A candidate variant is considered strongest when it is shared by an affected individual and other distantly-related affected family members and not shared by closely related unaffected family members. The theory behind this is that more distantly related individuals have a smaller proportion of their genome that is IBD, so the chance of a false positive (shared variant through chance, not due to association with disease) is lower. Conversely, when a closely related relative is unaffected, and the affected and unaffected individuals have different genotypes for the candidate variant, this provides evidence that the disease is not associated with the large IBD portion of the genome shared by the two individuals and thereby gives evidence in favour of the candidate. @@ -24,7 +24,7 @@ library(seqbio.variant.scoring) ### The relation matrix -This input is a matrix containing all the pairwise relationships of individuals in a family. The row and column names are the individual IDs, and the intersecting value denotes the degree of relationship between the individuals (self = 0, 1st degree relations. = 1, etc. Unrelated individuals are given a value of -1). +This input is a matrix containing all the pairwise relationships of individuals in a family. The row and column names are the individual IDs, and the intersecting value denotes the degree of relationship between the individuals (self = 0, 1st degree relations. = 1, etc. Unrelated individuals are given a value of -1). As of version `0.1.0` the relation matrix is a manually created file, where relationship values are assigned via manual inspection of the family pedigree. The function `read.relation.mat` ```{r} @@ -101,11 +101,12 @@ For example, the degrees of relationships of all other members of the example fa rel.mat.proband <- rel.mat["MS-1234-1001",] rel.mat.proband ``` -This is taken along with a list of encoded statuses of all individuals for the given variant. Above `score.variant.status` used the disease status and a genetic variant are used to determine which category the combo falls in. Within `score.fam`, a labelled list of encodings for all individuals is generated. Where: +This is taken along with a list of encoded statuses of all individuals for the given variant. Above `score.variant.status` used the disease status and a genetic variant of each individual to determine which of the following categories they fall in to: - A_c = Affected individual with variant - A_i = Affected individual without variant - U_c = Unaffected individual without variant - U_i = Unaffected individual with variant +Within `score.fam`, a labelled list of encodings for all individuals is generated. ```{r} name.stat.dict <- full.df.status$statvar.cat names(name.stat.dict) <- full.df.status$name @@ -187,6 +188,8 @@ calc.rv.score(rel.dict) The weights of the scoring can be adjusted, for example if we wanted to consider only `affected`-based evidence, we could turn off the unaffected part of the calculation by setting the unaffected weighting to 0. This can be useful for incompletely penetrant variants, where disease status and genotype of unaffected individuals are more likely to have imperfect concordance. +Additionally, families with low numbers of affected individuals sequenced and high number of unaffected individuals may haved inflated variant scores and potentially be misleading, focusing the scoring algorithm on the affected individuals only can overcome this bias. + ```{r} calc.rv.score(rel.dict, unaffected.weight=0) ``` From 58b0fe741677c8da83fc9b3a4cb48705d1a359e9 Mon Sep 17 00:00:00 2001 From: CNuge Date: Mon, 5 May 2025 08:57:15 -0300 Subject: [PATCH 14/43] addressing more commentS --- vignettes/seqbio.variant.scoring-vignette.Rmd | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vignettes/seqbio.variant.scoring-vignette.Rmd b/vignettes/seqbio.variant.scoring-vignette.Rmd index e2f986c..f1c32be 100644 --- a/vignettes/seqbio.variant.scoring-vignette.Rmd +++ b/vignettes/seqbio.variant.scoring-vignette.Rmd @@ -38,6 +38,7 @@ rel.mat This file includes the same individual IDs used in the relationship matrix as well as the disease and variant status for all individuals. +Note that the order of individuals can be different between the two files. All individual in the status file must be present within the relationship matrix, but you can have individuals in the realtionship matrix that are not present in the status file. The program assumes there is no genotypes for individuals without a status indicated and they are ignored in the variant scoring. ```{r} @@ -59,7 +60,7 @@ full.df.status ## Scoring a family -For most real-world applications you will likely want to score an entire family in conjunction with one another and take the mean score. This can be accomplished with `score.fam`, which takes in the matrix of relationships and the table with encoded `statvar.cat` of all individuals. +For most real-world applications, you will likely want to score family members in conjunction with one another, and take the mean score for an entire family. This can be accomplished with `score.fam`, which takes in the matrix of relationships and the table with encoded `statvar.cat` of all individuals. ```{r} From 1903a4fe5703525f53d56fa3d0869a7a7f8a7889 Mon Sep 17 00:00:00 2001 From: CNuge Date: Mon, 5 May 2025 09:00:32 -0300 Subject: [PATCH 15/43] all comments addressed --- vignettes/seqbio.variant.scoring-vignette.Rmd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vignettes/seqbio.variant.scoring-vignette.Rmd b/vignettes/seqbio.variant.scoring-vignette.Rmd index f1c32be..525855f 100644 --- a/vignettes/seqbio.variant.scoring-vignette.Rmd +++ b/vignettes/seqbio.variant.scoring-vignette.Rmd @@ -73,7 +73,7 @@ By default `score.fam` returns: - The scores considering only the Affected individuals as the reference individual (skipping the rows for the U individuals: MS-1234-1002, MS-1234-1006, and MS-1234-6001, in the previous example). - The mean of the calculated score from each reference individuals. -Note that if an individual is present in the relationship matrix and not in the status file, it is assumed there is no genetic information for this individual and they are ignored when calculating the variant score. +As previously noted, if an individual is present in the relationship matrix and not in the status file, it is assumed there is no genetic information for this individual and they are ignored when calculating the variant score. The scoring can be changed to summing across all combinations as opposed to the mean by passing the following options. Note using the program in this way will return higher scores for more dense pedigrees. ```{r} From 4ec2c6078f0c0507f8e21b4f6627b84d34e31cdb Mon Sep 17 00:00:00 2001 From: CNuge Date: Mon, 5 May 2025 09:07:08 -0300 Subject: [PATCH 16/43] note to readme on the vignette in R --- README.md | 19 ++++++++++++++----- ...eqbio.variant.scoring-variant_scoring.Rmd} | 0 2 files changed, 14 insertions(+), 5 deletions(-) rename vignettes/{seqbio.variant.scoring-vignette.Rmd => seqbio.variant.scoring-variant_scoring.Rmd} (100%) diff --git a/README.md b/README.md index dae0c80..27e3a2d 100644 --- a/README.md +++ b/README.md @@ -17,11 +17,20 @@ of unaffected evidence can be customized. ## How it works -For a walk through of the seqbio.variant.scoring functions for scoring the value of *families* based on penetrance and IBD, see the corresponging vignette: - -For a walk through of the seqbio.variant.scoring functions for scoring the value of *variants* within families, see the corresponging vignette: - - +For a walk through of the seqbio.variant.scoring functions for scoring the value of *families* based on penetrance and IBD, see the corresponging vignette file: +`vignettes/seqbio.variant.scoring-penetrance_and_ibd.Rmd` +or within R, run: +``` +vignette('seqbio.variant.scoring-penetrance_and_ibd') +``` + +For a walk through of the seqbio.variant.scoring functions for scoring the value of *variants* within families, see the corresponging vignette file: +`vignettes/seqbio.variant.scoring-variant_scoring.Rmd` + +or within R, run: +``` +vignette('seqbio.variant.scoring-variant_scoring') +``` ### The relationship matrix diff --git a/vignettes/seqbio.variant.scoring-vignette.Rmd b/vignettes/seqbio.variant.scoring-variant_scoring.Rmd similarity index 100% rename from vignettes/seqbio.variant.scoring-vignette.Rmd rename to vignettes/seqbio.variant.scoring-variant_scoring.Rmd From 3c4529c33971a9f8f7012001ed3962646b28001b Mon Sep 17 00:00:00 2001 From: CNuge Date: Mon, 5 May 2025 09:07:48 -0300 Subject: [PATCH 17/43] removing double license --- LICENSE | 2 -- LICENSE.md | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) delete mode 100644 LICENSE diff --git a/LICENSE b/LICENSE deleted file mode 100644 index 63e3ef0..0000000 --- a/LICENSE +++ /dev/null @@ -1,2 +0,0 @@ -YEAR: 2025 -COPYRIGHT HOLDER: seqbio.variant.scoring authors diff --git a/LICENSE.md b/LICENSE.md index f491bd0..9cd26d1 100644 --- a/LICENSE.md +++ b/LICENSE.md @@ -1,6 +1,6 @@ # MIT License -Copyright (c) 2025 seqbio.variant.scoring authors +Copyright (c) 2025 seqbio.variant.scoring Sequence Bioinformatics Inc. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal From c88b146f056d2e1cb6eb8bb7614f72b03894cb83 Mon Sep 17 00:00:00 2001 From: CNuge Date: Mon, 5 May 2025 10:03:27 -0300 Subject: [PATCH 18/43] improving descriptions of inputs and outputs in the vignette --- R/pedigree.r | 38 ++++----- ...bio.variant.scoring-penetrance_and_ibd.Rmd | 83 ++++++++++--------- 2 files changed, 65 insertions(+), 56 deletions(-) diff --git a/R/pedigree.r b/R/pedigree.r index 2cece69..fe8819c 100644 --- a/R/pedigree.r +++ b/R/pedigree.r @@ -6,7 +6,7 @@ #' @param a Count of affected individuals #' @param b Count of obligate carriers #' @param c Count of children of either affecteds or carriers, with no children of their own -#' @param d Count of Trees of unaffected individuals - specifically, two sequential generations (i.e. a parent and their offspring) +#' @param d Count of trees of unaffected individuals - specifically, two sequential generations (i.e. a parent and their offspring) #' @param n Count of the number of second generation progeny in a given tree. #' #' @return K Pedigree-based estimation of autosomal dominant penetrance rate. @@ -59,18 +59,18 @@ ibd <- function(a, b, c, d, n, K, theoretical=TRUE) { -#' Rank the pedigrees using the pihat values. +#' Score the pedigrees using the pihat values. #' #' @param pihat Estimated proportion of genome shared between individuals, from function: ibd. #' @param K Estimated penetrance value, from function: penetrance. -#' @param rank.by.K Should the ranking of families be based on ibd (FALSE), or by IBD and K (TRUE). Default is FALSE +#' @param score.by.K Should the scoreing of families be based on ibd (FALSE), or by IBD and K (TRUE). Default is FALSE #' #' @return #' @export #' #' @examples -rank <- function(pihat, K=-1,rank.by.K=FALSE) { - if(rank.by.K==TRUE){ +score <- function(pihat, K=-1,score.by.K=FALSE) { + if(score.by.K==TRUE){ log(2^pihat*K) } log(2^pihat) @@ -97,7 +97,7 @@ read.pedigree <- function(filename){ #' Take the encoded information about the pedigrees and calculate penetrance. #' -#' Determine a value rank of families by comparing their relationship structure. +#' Determine a value score of families by comparing their relationship structure. #' More distant relationships between affecteds (e.g. affected cousins) #' is more valuable that close relationships (e.g. affected siblings) #' as there is less IBD and therefore a smaller search space. @@ -113,24 +113,24 @@ read.pedigree <- function(filename){ #' - Exclude subjects younger than age of onset #' #' @param h A data frame containing the encoded pedigree information -#' @param rank.by.K Should the ranking of families be based on ibd (FALSE), or by IBD and K (TRUE). Default is FALSE -#' @return A data frame containing the theoretical ranking of the power of a +#' @param score.by.K Should the scoreing of families be based on ibd (FALSE), or by IBD and K (TRUE). Default is FALSE +#' @return A data frame containing the theoretical scoreing of the power of a #' family assuming you were able to collect everyone on the simplified pedigree, -#' as well as a current ranking, examining only those for whom you currently have DNA. +#' as well as a current scoreing, examining only those for whom you currently have DNA. #' @export #' #' @examples #' example.pedigree.file <-system.file('extdata/example_pedigree_encoding.tsv', package = 'seqbio.variant.scoring') #' example.pedigree.df <- read.pedigree(example.pedigree.file) #' penetrance.df <- cal.penetrance(example.pedigree.df) -cal.penetrance <- function(h, rank.by.K=FALSE){ +cal.penetrance <- function(h, score.by.K=FALSE){ family_vec <- c() penetrance_vec <- c() max_pihat_vec <- c() - max_rank_vec <- c() + max_score_vec <- c() current_pihat_vec <- c() - current_rank_vec <- c() + current_score_vec <- c() for (i in seq_len(nrow(h))) { family <- h[i,"Family"] max_a <- h[i, "max_a"] @@ -149,21 +149,21 @@ cal.penetrance <- function(h, rank.by.K=FALSE){ K <- optimize(penetrance, c(0,1), max_a, max_b, max_c, max_d, max_n, maximum=TRUE)$max max_pihat <- ibd(max_a, max_b, max_c, max_d, max_n, K) - max_rank <- rank(max_pihat, K, rank.by.K = rank.by.K ) + max_score <- score(max_pihat, K, score.by.K = score.by.K ) current_pihat <- ibd(a_actual, b_actual, c_actual, d_actual, n_actual, K, FALSE) - current_rank <- rank(current_pihat, K) + current_score <- score(current_pihat, K) family_vec <- c(family_vec, family) penetrance_vec <- c(penetrance_vec, K) max_pihat_vec <- c(max_pihat_vec, max_pihat) - max_rank_vec <- c(max_rank_vec, max_rank) + max_score_vec <- c(max_score_vec, max_score) current_pihat_vec <- c(current_pihat_vec, current_pihat) - current_rank_vec <- c(current_rank_vec,current_rank) + current_score_vec <- c(current_score_vec,current_score) } - df <- data.frame(family_vec, penetrance_vec, max_pihat_vec, max_rank_vec, current_pihat_vec, current_rank_vec) - colnames(df) <- c("family", "penetrance", "max_pi-hat", "max_rank", "current_pi-hat", "current_rank") - df$pct_of_max <- df$current_rank / df$max_rank * 100 + df <- data.frame(family_vec, penetrance_vec, max_pihat_vec, max_score_vec, current_pihat_vec, current_score_vec) + colnames(df) <- c("family", "penetrance", "max_pi-hat", "max_score", "current_pi-hat", "current_score") + df$pct_of_max <- df$current_score / df$max_score * 100 df[, -1] <- round(df[, -1], 2) return(df) } diff --git a/vignettes/seqbio.variant.scoring-penetrance_and_ibd.Rmd b/vignettes/seqbio.variant.scoring-penetrance_and_ibd.Rmd index 744835d..3179eeb 100644 --- a/vignettes/seqbio.variant.scoring-penetrance_and_ibd.Rmd +++ b/vignettes/seqbio.variant.scoring-penetrance_and_ibd.Rmd @@ -9,57 +9,56 @@ vignette: > %\VignetteEncoding{UTF-8} --- -#TODO - pull origin dev once both the penetrance and vignette PRs are approved and merged in. -#TODO - round out the unit tests as you develop and work through the minimalist examples. - +```{r loadlib, echo = TRUE, results = 'hide', message=FALSE, warning=FALSE} +library(seqbio.variant.scoring) +``` ## Introduction The `cal.penetrance` and associated `penetrance` and `ibd` functions facilitate comparison of the relative "power" of families ina study. This can be accomplished upstream of sequencing (i.e. in the project planning stage) and is therefore only dependent on relationship structure and reported affected status of individuals in a given family or set of families. +## Estimating power of families + The `cal.penetrance` function generates both a theoretical ranking of the power of a family assuming you were able to collect everyone on the simplified pedigree, as well as a current ranking, examining only those for whom you currently have DNA. This allows evaluation of the impact on the ranking if certain other family members are enrolled in the study. ## The input data -# Estimating power of families +The family power calculations depend on a single tab-delimited input file, where each row represents a family. The input file is read inusing the `read.pedigree` function. -This tool facilitates comparison of the relative "power" of families as they come into our studies, e.g. to prioritize followup. This is upstream of sequencing, so is entirely dependent on relationship structure and reported affected status. +```{r} +example.pedigree.file <-system.file('extdata/example_pedigree_encoding.tsv', package = 'seqbio.variant.scoring') -Essentially, more distant relationships between affecteds (e.g. affected cousins) are more valuable than close relationships (e.g. affected siblings), as there is less IBD and therefore a smaller search space. On the other hand, if the line of relatives between those affected individuals contains any unaffected subjects, then we have detected the presence of obligate carriers; we can use this information to penalize the ranking as it indicates that there is a chance any unaffected actually carries the pertinent variant. +example.pedigree.df <- read.pedigree(example.pedigree.file) +head(example.pedigree.df) +``` +The input file is expected to have the following 11 columns (with a header). +```{r} +colnames(example.pedigree.df) +``` -## Simplifying pedigrees +### Simplified summary of pedigrees -First, you'll need to prune down pedigrees to informative allele transfers. For -the purposes of this tool, we are excluding young generations (under 20-ish) and -large (more than two sequential generations) trees of exclusively unaffected family -members. There are some judgment calls happening here. See [this](https://sequencebio.atlassian.net/wiki/spaces/SEQUENCEPE/pages/1443364866/Quantifying+power+of+a+family+for+discovery) page for an example. +For now this file should be be constructed through careful manual inspection of the predigrees. To encode the rows for each family, you should first prune down pedigrees to informative allele transfers. For +the purposes of this tool, we exclude young generations (non-adults, younger than age of onset) and large (more than two sequential generations) trees of exclusively unaffected family members. Additionally all individuals require a binary A/U status, there should be no ambigious individuals. There will be some judgment calls required here. -Simplifying assumptions: -- Condition is autosomal dominant -- There are no ambiguous statuses -- No more than two sequential generations of unknown carrier status (non-obligate carrier vs. non-carrier). Generalized support of arbitrary tree structures gets a lot more complicated, especially for the likelihood function. -- Exclude big giant trees of unaffecteds - related to above. Will slightly bias the result toward higher penetrance. -- Exclude subjects younger than age of onset +### Encoding categories of relationships -## Categories of relationships +From the simplified pedigrees, the individuals are assigned to the following categories. |Category|Description| |---|---| -|A|Affected individuals| -|B|Obligate carriers| -|C|Children of either affecteds or carriers, with no children of their own| -|D|Trees of unaffected individuals - specifically, two sequential generations (i.e. a parent and their offspring)| - -We have made the editorial decision to exclude trees of unaffecteds that are larger than those described in D. +|a|Affected individuals| +|b|Obligate carriers| +|c|Children of either affecteds or carriers, with no children of their own| +|d|Trees of unaffected individuals - specifically, two sequential generations (i.e. a parent and their offspring; trees of unaffecteds that are larger than this are omitted.)| -## Encoding into categories -Encoding of families is input via a tab-separated manifest with a header, e.g.: +The counts of individuals assgined to these categories are then added to the tab-delimited input file: ``` Family max_a a max_b b max_c c max_d d max_n n @@ -67,14 +66,11 @@ Family max_a a max_b b max_c c max_d d max_n n ``` All columns with the prefix `max_` are meant to count the total number of each category in the pedigree, while -the columns without this prefix are the number of each category for whom we have samples. +the columns without this prefix are the number of each category for whom samples have been collected. The categories correspond to A, B, and C as defined above. -Category D is represented by two numbers, d and n. n is the number of offspring in a tree -of unaffecteds; d is the number of those types of trees across the pedigree. Multiple types of trees are encoded with -commas separating the values. For example, the following represents a family with three total trees of unaffecteds. -One tree (d=1) has three offspring (n=3); two trees (d=2) each have one offspring (n=1). +Category D is represented by two numbers, d and n. n is the number of offspring in a tree of unaffecteds; d is the number of those types of trees across the pedigree. Multiple types of trees are encoded with commas separating the values. For example, the following represents a family with three total trees of unaffecteds. One tree (d=1) has three offspring (n=3); two trees (d=2) each have one offspring (n=1). ``` d n @@ -83,12 +79,26 @@ d n ## Theoretical vs. current rankings -This function generates both a theoretical ranking of the power of a family assuming -you were able to collect everyone on the simplified pedigree, as well as a current -ranking, examining only those for whom you currently have DNA. This allows evaluation -of the impact on the ranking if certain other family members are enrolled in the study. +With the encoded data loaded, the function `cal.penetrance` will generate both a theoretical ranking of the power of a family assuming you were able to collect everyone on the simplified pedigree, as well as a current ranking, examining only those for whom you currently have DNA. This allows evaluation of the impact on the ranking if certain other family members are enrolled in the study. -### Special cases +```{r} +penetrance.df <- cal.penetrance(example.pedigree.df) +penetrance.df +``` + +The output includes seven columns that with the following information: +|Output Column|Description| +|---|---| +|family|The family id.| +|penetrance| Estimated penetrance rate (K) for the family.| +|max_pi-hat| The estimated proportion of the genome that is shared between all individuals *that could be sampled* in the family (IBD).| +|max_score| The theoretical maximum score for the given family *that could* be achieved if all individuals were sampled.| +|current_pi-hat|The estimated proportion of the genome that is shared between all individuals *that have been sampled* in the family (IBD).| +|current_score| The score for the given family *that has* been achieved through the sampled individuals| +|pct_of_max| The percentage of the theoretical maximum score that has been realized in the family sampling.| + + +### Note on a few special cases In trees of unaffecteds, there are two special cases for current ranking: 1. You have collected the parent (regardless of collection status of the child). In @@ -99,7 +109,6 @@ each of the children contribute a portion of what the parent would have contribu to our understanding. (d=1, n>0) - ## Calculating IBD From 18b55d85aa0437ef144a587c07e504e1cb666881 Mon Sep 17 00:00:00 2001 From: CNuge Date: Mon, 5 May 2025 10:37:38 -0300 Subject: [PATCH 19/43] notes on ranking --- README.md | 21 ++++-------- ...bio.variant.scoring-penetrance_and_ibd.Rmd | 34 +++++++++++-------- 2 files changed, 27 insertions(+), 28 deletions(-) diff --git a/README.md b/README.md index 27e3a2d..721ec02 100644 --- a/README.md +++ b/README.md @@ -32,27 +32,20 @@ or within R, run: vignette('seqbio.variant.scoring-variant_scoring') ``` -### The relationship matrix +## Scoring families -This input is a matrix containing all the pairwise relationships of individuals in a family. The row and column names are the individual IDs, and the intersecting value denotes the degree of relationship between the individuals (self = 0, 1st degree relations. = 1, etc. Unrelated individuals are given a value of -1). As of version `0.1.0` the relation matrix is a manually created file, where relationship values are assigned via manual inspection of the family pedigree. +### Then encoding file -### The status file - -This file includes the same individual IDs used in the relationship matrix as well as the disease and variant status for all individuals. +## Scoring Variants -## related info +### The relationship matrix -- slides on concept: https://docs.google.com/presentation/d/1yWlj400fbsrS1CCd4wpy7ve9Sov56H82lZh7hoW8vyg/edit#slide=id.g34c18bf0cf2_0_112 -- penetrance related code: https://github.com/SequenceBio/seqbio-pedigree-ranking -- penetrance related notes: https://sequencebio.atlassian.net/wiki/spaces/SEQUENCEPE/pages/1443364866/Quantifying+power+of+a+family+for+discovery +This input is a matrix containing all the pairwise relationships of individuals in a family. The row and column names are the individual IDs, and the intersecting value denotes the degree of relationship between the individuals (self = 0, 1st degree relations. = 1, etc. Unrelated individuals are given a value of -1). As of version `0.1.0` the relation matrix is a manually created file, where relationship values are assigned via manual inspection of the family pedigree. +### The status file +This file includes the same individual IDs used in the relationship matrix as well as the disease and variant status for all individuals. -## TODO -- add a description of the formulas and scoring system here -- outline how to deploy the code -- a means of building the relationship matricies easily? -- generate scores for a variety of situations. Do they make sense? Can they be summed across families? diff --git a/vignettes/seqbio.variant.scoring-penetrance_and_ibd.Rmd b/vignettes/seqbio.variant.scoring-penetrance_and_ibd.Rmd index 3179eeb..7312e73 100644 --- a/vignettes/seqbio.variant.scoring-penetrance_and_ibd.Rmd +++ b/vignettes/seqbio.variant.scoring-penetrance_and_ibd.Rmd @@ -58,11 +58,10 @@ From the simplified pedigrees, the individuals are assigned to the following cat |d|Trees of unaffected individuals - specifically, two sequential generations (i.e. a parent and their offspring; trees of unaffecteds that are larger than this are omitted.)| -The counts of individuals assgined to these categories are then added to the tab-delimited input file: +The counts of individuals assigned to these categories are then added to the tab-delimited input file: -``` -Family max_a a max_b b max_c c max_d d max_n n -1111 2 1 0 0 0 0 0 0 0 0 +```{r} +head(example.pedigree.df) ``` All columns with the prefix `max_` are meant to count the total number of each category in the pedigree, while @@ -98,6 +97,23 @@ The output includes seven columns that with the following information: |pct_of_max| The percentage of the theoretical maximum score that has been realized in the family sampling.| +With the scoring completed, the values can be queried to learn more about the realized and potential value of the families relative to one another. + +Sorting on `current_score` shows which family has the most detection power based on collected samples. + +```{r} +penetrance.df[order(penetrance.df$current_score, decreasing = TRUE),] +``` +If we had to work with only what we have, then family 5031 gives the most detection power. + +Sorting on `max_score` shows which family could have the most detection power, if all individuals were sampled. This can be useful in targeting future sampling efforts as it shows where more samples would give the most value. + +```{r} +penetrance.df[order(penetrance.df$max_score, decreasing = TRUE),] +``` +Here we can see that family `0347` has a maximum score of 30.84, but a realized score of only 4.17. Given the high potential detection power for this family, it is an ideal target for future sampling efforts as current samples reveal on 13.5% of the family's potential value. + + ### Note on a few special cases In trees of unaffecteds, there are two special cases for current ranking: @@ -109,14 +125,4 @@ each of the children contribute a portion of what the parent would have contribu to our understanding. (d=1, n>0) -## Calculating IBD - - -## Calculating penetrance - - -## Combining IBD and penetrance into a pedigree score - - -## Comparison of current and maximum pedigree score From 6ff3fb0337a9aae8129d5db1f0155700dd20285a Mon Sep 17 00:00:00 2001 From: CNuge Date: Wed, 7 May 2025 11:33:04 -0300 Subject: [PATCH 20/43] updates to notes --- R/pedigree.r | 2 +- vignettes/seqbio.variant.scoring-penetrance_and_ibd.Rmd | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/R/pedigree.r b/R/pedigree.r index fe8819c..3a2721a 100644 --- a/R/pedigree.r +++ b/R/pedigree.r @@ -69,7 +69,7 @@ ibd <- function(a, b, c, d, n, K, theoretical=TRUE) { #' @export #' #' @examples -score <- function(pihat, K=-1,score.by.K=FALSE) { +score <- function(pihat, K=-1, score.by.K=FALSE) { if(score.by.K==TRUE){ log(2^pihat*K) } diff --git a/vignettes/seqbio.variant.scoring-penetrance_and_ibd.Rmd b/vignettes/seqbio.variant.scoring-penetrance_and_ibd.Rmd index 7312e73..0912531 100644 --- a/vignettes/seqbio.variant.scoring-penetrance_and_ibd.Rmd +++ b/vignettes/seqbio.variant.scoring-penetrance_and_ibd.Rmd @@ -16,14 +16,13 @@ library(seqbio.variant.scoring) ## Introduction -The `cal.penetrance` and associated `penetrance` and `ibd` functions facilitate comparison of the relative "power" of families ina study. This can be accomplished upstream of sequencing (i.e. in the project planning stage) and is therefore only dependent on relationship structure and reported affected status of individuals in a given family or set of families. +The `cal.penetrance` and associated `penetrance` and `ibd` functions facilitate comparison of the relative "power" of families in a study. This can be accomplished upstream of sequencing (i.e. in the project planning stage) and is therefore only dependent on relationship structure and reported affected status of individuals in a given family or set of families. ## Estimating power of families The `cal.penetrance` function generates both a theoretical ranking of the power of a family assuming you were able to collect everyone on the simplified pedigree, as well as a current ranking, examining only those for whom you currently have DNA. This allows evaluation of the impact on the ranking if certain other family members are enrolled in the study. - ## The input data The family power calculations depend on a single tab-delimited input file, where each row represents a family. The input file is read inusing the `read.pedigree` function. From f5fcca2787768b15a2713f37d4ba0807a5cb0a3a Mon Sep 17 00:00:00 2001 From: CNuge Date: Wed, 7 May 2025 11:40:40 -0300 Subject: [PATCH 21/43] changes to docs based on feedback --- NAMESPACE | 2 +- R/pedigree.r | 34 +++++++++++++++-------------- man/cal.penetrance.Rd | 8 +++---- tests/testthat/test_pedigree_eval.R | 2 +- 4 files changed, 24 insertions(+), 22 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 9ffcbfe..3caed85 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -3,7 +3,7 @@ S3method(sum,fam.scores) export(assign.status) export(build.relation.dict) -export(cal.penetrance) +export(score.pedigree) export(calc.rv.score) export(encode.rows) export(ibd) diff --git a/R/pedigree.r b/R/pedigree.r index 2cece69..f5f7ea6 100644 --- a/R/pedigree.r +++ b/R/pedigree.r @@ -18,19 +18,26 @@ penetrance <- function(K, a, b, c, d, n) { } -#TODO - CHECK with BARI -# for IBD, am I wrong and is this the correct param description? -#' @param n The number of non-overlapping paths between all pedigree members. - - #' Calculation of Identity by descent (IBD). #' -#' Use the relationships of information from the pedigree to calculate and +#' Use the relationship informationfrom the pedigree to #' estimate of the amount of the genome they have inherited it from a #' common ancestor without recombination. #' #' Can do this for the total potential relatedness in a pedigree (theoretical=TRUE), -#' or for the actual relatedness across collected samples (theoretical=FALSE) +#' or for the actual relatedness across collected samples (theoretical=FALSE). +#' For the theoretical=TRUE case, in the unaffected trees, if we have a sample from the parent, +#' then the offspring do not provide any additional information for a max IBD calculation. +#' This means that K does not scale with n. +#' +#' For theoretical=FALSE, sometimes we don’t have the healthy parent in an unaffected tree, +#' and only have a child. In this case, the IBD contribution from the child is 1/4, +#' and since they’re unaffected and therefore are a counter-filter, +#' they would contribute 1-1/4 = 3/4 to the total relatedness. +#' Either the parent is a non-obligate carrier, or is a non-carrier. +#' The probability of the children depends on which of those is true, +#' so we have the additional set of terms in the theoretical=FALSE logic. +#' #' #' @param a Count of affected individuals #' @param b Count of obligate carriers @@ -63,16 +70,12 @@ ibd <- function(a, b, c, d, n, K, theoretical=TRUE) { #' #' @param pihat Estimated proportion of genome shared between individuals, from function: ibd. #' @param K Estimated penetrance value, from function: penetrance. -#' @param rank.by.K Should the ranking of families be based on ibd (FALSE), or by IBD and K (TRUE). Default is FALSE #' #' @return #' @export #' #' @examples -rank <- function(pihat, K=-1,rank.by.K=FALSE) { - if(rank.by.K==TRUE){ - log(2^pihat*K) - } +rank <- function(pihat, K=-1) { log(2^pihat) } @@ -113,7 +116,6 @@ read.pedigree <- function(filename){ #' - Exclude subjects younger than age of onset #' #' @param h A data frame containing the encoded pedigree information -#' @param rank.by.K Should the ranking of families be based on ibd (FALSE), or by IBD and K (TRUE). Default is FALSE #' @return A data frame containing the theoretical ranking of the power of a #' family assuming you were able to collect everyone on the simplified pedigree, #' as well as a current ranking, examining only those for whom you currently have DNA. @@ -122,8 +124,8 @@ read.pedigree <- function(filename){ #' @examples #' example.pedigree.file <-system.file('extdata/example_pedigree_encoding.tsv', package = 'seqbio.variant.scoring') #' example.pedigree.df <- read.pedigree(example.pedigree.file) -#' penetrance.df <- cal.penetrance(example.pedigree.df) -cal.penetrance <- function(h, rank.by.K=FALSE){ +#' penetrance.df <- score.pedigree(example.pedigree.df) +score.pedigree <- function(h){ family_vec <- c() penetrance_vec <- c() @@ -149,7 +151,7 @@ cal.penetrance <- function(h, rank.by.K=FALSE){ K <- optimize(penetrance, c(0,1), max_a, max_b, max_c, max_d, max_n, maximum=TRUE)$max max_pihat <- ibd(max_a, max_b, max_c, max_d, max_n, K) - max_rank <- rank(max_pihat, K, rank.by.K = rank.by.K ) + max_rank <- rank(max_pihat, K) current_pihat <- ibd(a_actual, b_actual, c_actual, d_actual, n_actual, K, FALSE) current_rank <- rank(current_pihat, K) diff --git a/man/cal.penetrance.Rd b/man/cal.penetrance.Rd index 4e01532..23a4905 100644 --- a/man/cal.penetrance.Rd +++ b/man/cal.penetrance.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/pedigree.r -\name{cal.penetrance} -\alias{cal.penetrance} +\name{score.pedigree} +\alias{score.pedigree} \title{Take the encoded information about the pedigrees and calculate penetrance.} \usage{ -cal.penetrance(h, rank.by.K = FALSE) +score.pedigree(h, rank.by.K = FALSE) } \arguments{ \item{h}{A data frame containing the encoded pedigree information} @@ -38,5 +38,5 @@ Will slightly bias the result toward higher penetrance. \examples{ example.pedigree.file <-system.file('extdata/example_pedigree_encoding.tsv', package = 'seqbio.variant.scoring') example.pedigree.df <- read.pedigree(example.pedigree.file) -penetrance.df <- cal.penetrance(example.pedigree.df) +penetrance.df <- score.pedigree(example.pedigree.df) } diff --git a/tests/testthat/test_pedigree_eval.R b/tests/testthat/test_pedigree_eval.R index b3bffb7..4c8b857 100644 --- a/tests/testthat/test_pedigree_eval.R +++ b/tests/testthat/test_pedigree_eval.R @@ -8,5 +8,5 @@ test_that("Data are read from files correctly", { example.pedigree.df <- read.pedigree(example.pedigree.file) - penetrance.df <- cal.penetrance(example.pedigree.df) + penetrance.df <- score.pedigree(example.pedigree.df) }) From de3fda5a74e7418c9c14dd40f1d71f987eda9808 Mon Sep 17 00:00:00 2001 From: CNuge Date: Wed, 7 May 2025 14:33:58 -0300 Subject: [PATCH 22/43] standardizing onto periods not underscores --- R/encoding.R | 40 +++++------ R/pedigree.r | 70 +++++++++---------- R/relatedness.r | 64 ++++++++--------- man/assign.status.Rd | 28 ++++---- man/calc.rv.score.Rd | 20 +++--- man/score.fam.Rd | 6 +- man/score.variant.status.Rd | 2 +- tests/testthat/test_relatedness.R | 32 ++++----- vignettes/seqbio.variant.scoring-vignette.Rmd | 36 +++++----- 9 files changed, 149 insertions(+), 149 deletions(-) diff --git a/R/encoding.R b/R/encoding.R index 3d80fb9..a618634 100644 --- a/R/encoding.R +++ b/R/encoding.R @@ -1,12 +1,12 @@ #' Take a disease status and a genetic variant and determine which category the combo falls in. -#' A_c = Affected individual with ALT variant -#' A_i = Affected individual without ALT variant -#' U_c = Unaffected individual without ALT variant -#' U_i = Unaffected individual with ALT variant +#' A.c = Affected individual with ALT variant +#' A.i = Affected individual without ALT variant +#' U.c = Unaffected individual without ALT variant +#' U.i = Unaffected individual with ALT variant #' If theoretical.max = TRUE the true variant statuses are ignored and all -#' affected/unaffected are assigned A_c and U_c respectively. +#' affected/unaffected are assigned A.c and U.c respectively. #' These encoding can then be used show what a family's max score would be. #' #' @param status Disease status of an individual. A = affected, U = unaffected. @@ -16,30 +16,30 @@ #' Default is FALSE. #' @return a string #' @examples -#' assign.status("A", "0/1") == "A_c" -#' assign.status("A", "0|0") == "A_i" -#' assign.status("U", 1) == "U_i" -#' assign.status("U", "0|0") =="U_c" +#' assign.status("A", "0/1") == "A.c" +#' assign.status("A", "0|0") == "A.i" +#' assign.status("U", 1) == "U.i" +#' assign.status("U", "0|0") =="U.c" #' @export assign.status <- function(status, variant, theoretical.max=FALSE){ if(status == "A"){ if(theoretical.max){ - return("A_c") + return("A.c") } else if(variant == "0/1" || variant == "1/1" || variant == "1" || variant == "0|1" || variant == "1|0" || variant == "1|1" ){ - return("A_c") + return("A.c") }else if (variant == "0/0" || variant == "0" || variant == "0|0" ){ - return("A_i") + return("A.i") }else{ return("unk") } }else if (status == "U"){ if(theoretical.max){ - return("U_c") + return("U.c") } else if(variant == "0/1" || variant == "1/1" || variant == "1" || variant == "0|1" || variant == "1|0" || variant == "1|1" ){ - return("U_i") + return("U.i") }else if (variant == "0/0" || variant == "0" || variant == "0|0" ){ - return("U_c") + return("U.c") }else{ return("unk") } @@ -50,7 +50,7 @@ assign.status <- function(status, variant, theoretical.max=FALSE){ #' Take the dataframe with variants and status and determine which indivudals #' are scored correctly and which are scored incorrectly. -#' Assign an A_c, A_i, U_c, U_i, unk +#' Assign an A.c, A.i, U.c, U.i, unk #' #' Variants can be encoded as binary (0 or 1, genotypes 0/0 or 0/1, or phased genotypes 0|0 0|1). #' Note the program assumes alt is the disease allele. homozygous alts are allowed. @@ -100,10 +100,10 @@ return(indiv.df) #' @export build.relation.dict <- function( mat.row, name.stat.dict, drop.unrelated=TRUE){ indiv.rels = list( - "A_c" = c(), - "A_i" = c(), - "U_c" = c(), - "U_i" = c() + "A.c" = c(), + "A.i" = c(), + "U.c" = c(), + "U.i" = c() ) for(i in seq_along(mat.row)){ diff --git a/R/pedigree.r b/R/pedigree.r index f5f7ea6..e4c5622 100644 --- a/R/pedigree.r +++ b/R/pedigree.r @@ -127,45 +127,45 @@ read.pedigree <- function(filename){ #' penetrance.df <- score.pedigree(example.pedigree.df) score.pedigree <- function(h){ - family_vec <- c() - penetrance_vec <- c() - max_pihat_vec <- c() - max_rank_vec <- c() - current_pihat_vec <- c() - current_rank_vec <- c() + family.vec <- c() + penetrance.vec <- c() + max.pihat.vec <- c() + max.rank.vec <- c() + current.pihat.vec <- c() + current.rank.vec <- c() for (i in seq_len(nrow(h))) { family <- h[i,"Family"] - max_a <- h[i, "max_a"] - max_b <- h[i, "max_b"] - max_c <- h[i, "max_c"] - max_d <- h[i, "max_d"] - max_n <- h[i, "max_n"] - a_actual <- h[i, "a"] - b_actual <- h[i, "b"] - c_actual <- h[i, "c"] - d_actual <- h[i, "d"] - n_actual <- h[i, "n"] - - max_d <- as.numeric(strsplit(as.character(max_d), ",")[[1]]) - max_n <- as.numeric(strsplit(as.character(max_n), ",")[[1]]) - - K <- optimize(penetrance, c(0,1), max_a, max_b, max_c, max_d, max_n, maximum=TRUE)$max - max_pihat <- ibd(max_a, max_b, max_c, max_d, max_n, K) - max_rank <- rank(max_pihat, K) - current_pihat <- ibd(a_actual, b_actual, c_actual, d_actual, n_actual, K, FALSE) - current_rank <- rank(current_pihat, K) - - family_vec <- c(family_vec, family) - penetrance_vec <- c(penetrance_vec, K) - max_pihat_vec <- c(max_pihat_vec, max_pihat) - max_rank_vec <- c(max_rank_vec, max_rank) - current_pihat_vec <- c(current_pihat_vec, current_pihat) - current_rank_vec <- c(current_rank_vec,current_rank) + max.a <- h[i, "max_a"] + max.b <- h[i, "max_b"] + max.c <- h[i, "max_c"] + max.d <- h[i, "max_d"] + max.n <- h[i, "max_n"] + a.actual <- h[i, "a"] + b.actual <- h[i, "b"] + c.actual <- h[i, "c"] + d.actual <- h[i, "d"] + n.actual <- h[i, "n"] + + max.d <- as.numeric(strsplit(as.character(max.d), ",")[[1]]) + max.n <- as.numeric(strsplit(as.character(max.n), ",")[[1]]) + + K <- optimize(penetrance, c(0,1), max.a, max.b, max.c, max.d, max.n, maximum=TRUE)$max + max.pihat <- ibd(max.a, max.b, max.c, max.d, max.n, K) + max.rank <- rank(max.pihat, K) + current.pihat <- ibd(a.actual, b.actual, c.actual, d.actual, n.actual, K, FALSE) + current.rank <- rank(current.pihat, K) + + family.vec <- c(family.vec, family) + penetrance.vec <- c(penetrance.vec, K) + max.pihat.vec <- c(max.pihat.vec, max.pihat) + max.rank.vec <- c(max.rank.vec, max.rank) + current.pihat.vec <- c(current.pihat.vec, current.pihat) + current.rank.vec <- c(current.rank.vec,current.rank) } - df <- data.frame(family_vec, penetrance_vec, max_pihat_vec, max_rank_vec, current_pihat_vec, current_rank_vec) - colnames(df) <- c("family", "penetrance", "max_pi-hat", "max_rank", "current_pi-hat", "current_rank") - df$pct_of_max <- df$current_rank / df$max_rank * 100 + df <- data.frame(family.vec, penetrance.vec, max.pihat.vec, max.rank.vec, current.pihat.vec, current.rank.vec) + colnames(df) <- c("family", "penetrance", "max.pi-hat", "max.rank", "current.pi-hat", "current.rank") + df$pct.of.max <- df$current.rank / df$max.rank * 100 df[, -1] <- round(df[, -1], 2) return(df) } diff --git a/R/relatedness.r b/R/relatedness.r index 7337ae4..b8f6dfe 100644 --- a/R/relatedness.r +++ b/R/relatedness.r @@ -7,23 +7,23 @@ #' For each encoded relationship, a relationship-informed weight is applied to their sharing #' or not sharing of a variant. #' The score for affected status is: -#' (1 / coefficient_of_relatedness) * status_weight +#' (1 / coefficient.of.relatedness) * status.weight #' For example, an affected cousin (encoded as a 3) would get a score of: #' (1/0.125) * affected weight #' 8 * 1 #' = 8 points in favour of the variant. #' Whereas for unaffected individuals, scores decay the further a person is in #' relation to the proband based on the formula: -#' ((unaffected.max*2) * coefficient_of_relatedness ) * unaffected_weight +#' ((unaffected.max*2) * coefficient.of.relatedness ) * unaffected.weight #' For example, with the default unaffected.max of 8. The sister that does not have a variant would get a score of -#' ((8*2) 0.5) * unaffected_weight +#' ((8*2) 0.5) * unaffected.weight #' (16 * 0.5) * 0.5 #' = 4 points for the variant. #' If these were the only two relatives considered we could sum the points #' and get a score in favour of the variant of #' 8 + 4 = 12 #' If there is evidence against a variant, this is factored into the score as: -#' total_score = evidence_for - evidence_against +#' total.score = evidence.for - evidence.against #' For example, if there were also an affected sibling without the variant we would have the score against of: #' (1/0.5) * 1 = 2 #' The final score for the variant would then be @@ -40,8 +40,8 @@ #' #' Input: #' -#' @param fam_list -#' - A list with the names: ['A_c', 'A_i', 'U_c', 'U_i'] +#' @param fam.list +#' - A list with the names: ['A.c', 'A.i', 'U.c', 'U.i'] #' respectively containing the affected correct, affected incorrect, #' unaffected correct and unaffected incorrect. #' - This can be generated with the function: score.variant.status @@ -49,8 +49,8 @@ #' reference's relatives as encoded based on their degree of relatedness to the #' reference (reference = 0, sibling/parent/offspring = 1, uncle/grandparent = 2, #' cousin = 3, etc.) -#' @param affected.weight A coefficient to multiply the calculated A_c and A_i relatedness values by. -#' @param unaffected.weight A coefficient to multiply the U_c and U_i relatedness values by. +#' @param affected.weight A coefficient to multiply the calculated A.c and A.i relatedness values by. +#' @param unaffected.weight A coefficient to multiply the U.c and U.i relatedness values by. #' @param unaffected.max This param controls the score given to a first degree unaffected relatives #' scores decay from this specified maximum by half for each subsequent relationship degree. #' @param max.err A heuristic cap of the number of incorrect assignments allowed when scoring. When the total number @@ -59,10 +59,10 @@ #' @return A list with three components: score, score.for, score.against. #' #' @examples -#' relations<-list("A_c" = c(0, 1, 3, 1),"A_i" = c(3),"U_c" = c(1, 2),"U_i" = c(1)) +#' relations<-list("A.c" = c(0, 1, 3, 1),"A.i" = c(3),"U.c" = c(1, 2),"U.i" = c(1)) #' rv.scores <- calc.rv.score(relations) #' @export -calc.rv.score <- function(fam_list, affected.weight=1, unaffected.weight=0.5, unaffected.max=8, max.err=4){ +calc.rv.score <- function(fam.list, affected.weight=1, unaffected.weight=0.5, unaffected.max=8, max.err=4){ relatedness = list() @@ -70,43 +70,43 @@ calc.rv.score <- function(fam_list, affected.weight=1, unaffected.weight=0.5, un relatedness[i+1] = 1 / (2 ** (i)) } - score_dict = list() + score.dict = list() - score_dict[["A_c"]] - score_dict[["A_i"]] - score_dict[["U_c"]] - score_dict[["U_i"]] + score.dict[["A.c"]] + score.dict[["A.i"]] + score.dict[["U.c"]] + score.dict[["U.i"]] #TODO - change this to apply the different formula for the unaffecteds - for (n in names(fam_list)){ + for (n in names(fam.list)){ scores = c() - if(n == "A_c" || n == "A_i"){ - for (x in fam_list[[n]]){ + if(n == "A.c" || n == "A.i"){ + for (x in fam.list[[n]]){ #for affected, importance score gets higher the further from reference individual you get. scores = c(scores, 1/relatedness[[x+1]]) } - }else if ( n=="U_c" || n == "U_i"){ - for (x in fam_list[[n]]){ + }else if ( n=="U.c" || n == "U.i"){ + for (x in fam.list[[n]]){ #for unaffected, importance score gets lower the further from reference individual you get. scores = c(scores, (unaffected.max*2) * relatedness[[x+1]]) } } - score_dict[[n]] = scores + score.dict[[n]] = scores } - weighted_for = sum(score_dict[["A_c"]]*affected.weight ) + - sum(score_dict[["U_c"]]*unaffected.weight) + weighted.for = sum(score.dict[["A.c"]]*affected.weight ) + + sum(score.dict[["U.c"]]*unaffected.weight) - weighted_against = sum(score_dict[["A_i"]]*affected.weight ) + - sum(score_dict[["U_i"]]*unaffected.weight) + weighted.against = sum(score.dict[["A.i"]]*affected.weight ) + + sum(score.dict[["U.i"]]*unaffected.weight) - out.list <- list("score" = weighted_for - weighted_against, - "score.for" = weighted_for, - "score.against" = weighted_against ) + out.list <- list("score" = weighted.for - weighted.against, + "score.for" = weighted.for, + "score.against" = weighted.against ) - n.incor <- length(score_dict[["A_i"]]) + length(score_dict[["U_i"]]) + n.incor <- length(score.dict[["A.i"]]) + length(score.dict[["U.i"]]) if(n.incor > max.err){ out.list[["score"]] <- 0 } @@ -146,8 +146,8 @@ subset.mat <- function(mat.df, status.df){ #' #' @param relation.mat A relationship matrix for the family. #' @param status.df A dataframe with the encoded variant/disease status of each individual -#' @param affected.weight A coefficient to multiply the calculated A_c and A_i relatedness values by. -#' @param unaffected.weight A coefficient to multiply the U_c and U_i relatedness values by. +#' @param affected.weight A coefficient to multiply the calculated A.c and A.i relatedness values by. +#' @param unaffected.weight A coefficient to multiply the U.c and U.i relatedness values by. #' @param return.sums Boolean indicating if sum of family variant scores should be returned (default = FALSE). #' @param return.means Boolean indicating if mean of all family variant scores should be returned (default = TRUE). #' @param affected.only Boolean indicating if the family score should be calculated using only the affected individuals? (default = TRUE). @@ -161,7 +161,7 @@ subset.mat <- function(mat.df, status.df){ #' mat.df <- read.relation.mat(mat.name1) #' ind.df <- read.indiv(tsv.name1) #' ind.df.status <- score.variant.status(ind.df) -#' score_default <- score.fam(mat.df, ind.df.status) +#' score.default <- score.fam(mat.df, ind.df.status) #' @export score.fam <- function(relation.mat, status.df, affected.weight=1, unaffected.weight=0.5, return.sums = FALSE, return.means = TRUE, diff --git a/man/assign.status.Rd b/man/assign.status.Rd index 6c722b9..b53a95a 100644 --- a/man/assign.status.Rd +++ b/man/assign.status.Rd @@ -3,12 +3,12 @@ \name{assign.status} \alias{assign.status} \title{Take a disease status and a genetic variant and determine which category the combo falls in. -A_c = Affected individual with ALT variant -A_i = Affected individual without ALT variant -U_c = Unaffected individual without ALT variant -U_i = Unaffected individual with ALT variant +A.c = Affected individual with ALT variant +A.i = Affected individual without ALT variant +U.c = Unaffected individual without ALT variant +U.i = Unaffected individual with ALT variant If theoretical.max = TRUE the true variant statuses are ignored and all -affected/unaffected are assigned A_c and U_c respectively. +affected/unaffected are assigned A.c and U.c respectively. These encoding can then be used show what a family's max score would be.} \usage{ assign.status(status, variant, theoretical.max = FALSE) @@ -27,17 +27,17 @@ a string } \description{ Take a disease status and a genetic variant and determine which category the combo falls in. -A_c = Affected individual with ALT variant -A_i = Affected individual without ALT variant -U_c = Unaffected individual without ALT variant -U_i = Unaffected individual with ALT variant +A.c = Affected individual with ALT variant +A.i = Affected individual without ALT variant +U.c = Unaffected individual without ALT variant +U.i = Unaffected individual with ALT variant If theoretical.max = TRUE the true variant statuses are ignored and all -affected/unaffected are assigned A_c and U_c respectively. +affected/unaffected are assigned A.c and U.c respectively. These encoding can then be used show what a family's max score would be. } \examples{ -assign.status("A", "0/1") == "A_c" -assign.status("A", "0|0") == "A_i" -assign.status("U", 1) == "U_i" -assign.status("U", "0|0") =="U_c" +assign.status("A", "0/1") == "A.c" +assign.status("A", "0|0") == "A.i" +assign.status("U", 1) == "U.i" +assign.status("U", "0|0") =="U.c" } diff --git a/man/calc.rv.score.Rd b/man/calc.rv.score.Rd index 065a096..00d5abd 100644 --- a/man/calc.rv.score.Rd +++ b/man/calc.rv.score.Rd @@ -5,7 +5,7 @@ \title{Calculate a relatedness-weighted score for a given rare variant.} \usage{ calc.rv.score( - fam_list, + fam.list, affected.weight = 1, unaffected.weight = 0.5, unaffected.max = 8, @@ -13,8 +13,8 @@ calc.rv.score( ) } \arguments{ -\item{fam_list}{\itemize{ -\item A list with the names: \link{'A_c', 'A_i', 'U_c', 'U_i'} +\item{fam.list}{\itemize{ +\item A list with the names: \link{'A.c', 'A.i', 'U.c', 'U.i'} respectively containing the affected correct, affected incorrect, unaffected correct and unaffected incorrect. - This can be generated with the function: score.variant.status @@ -24,9 +24,9 @@ reference (reference = 0, sibling/parent/offspring = 1, uncle/grandparent = 2, cousin = 3, etc.) }} -\item{affected.weight}{A coefficient to multiply the calculated A_c and A_i relatedness values by.} +\item{affected.weight}{A coefficient to multiply the calculated A.c and A.i relatedness values by.} -\item{unaffected.weight}{A coefficient to multiply the U_c and U_i relatedness values by.} +\item{unaffected.weight}{A coefficient to multiply the U.c and U.i relatedness values by.} \item{unaffected.max}{This param controls the score given to a first degree unaffected relatives scores decay from this specified maximum by half for each subsequent relationship degree.} @@ -45,23 +45,23 @@ These scores can be used to compare variants of interest within a family. For each encoded relationship, a relationship-informed weight is applied to their sharing or not sharing of a variant. The score for affected status is: -(1 / coefficient_of_relatedness) * status_weight +(1 / coefficient.of.relatedness) * status.weight For example, an affected cousin (encoded as a 3) would get a score of: (1/0.125) * affected weight 8 * 1 = 8 points in favour of the variant. Whereas for unaffected individuals, scores decay the further a person is in relation to the proband based on the formula: -((unaffected.max\emph{2) * coefficient_of_relatedness ) * unaffected_weight +((unaffected.max\emph{2) * coefficient.of.relatedness ) * unaffected.weight For example, with the default unaffected.max of 8. The sister that does not have a variant would get a score of -((8}2) 0.5) * unaffected_weight +((8}2) 0.5) * unaffected.weight (16 * 0.5) * 0.5 = 4 points for the variant. If these were the only two relatives considered we could sum the points and get a score in favour of the variant of 8 + 4 = 12 If there is evidence against a variant, this is factored into the score as: -total_score = evidence_for - evidence_against +total.score = evidence.for - evidence.against For example, if there were also an affected sibling without the variant we would have the score against of: (1/0.5) * 1 = 2 The final score for the variant would then be @@ -79,6 +79,6 @@ of unaffected individuals that have a variant rather than affected individuals t Input: } \examples{ -relations<-list("A_c" = c(0, 1, 3, 1),"A_i" = c(3),"U_c" = c(1, 2),"U_i" = c(1)) +relations<-list("A.c" = c(0, 1, 3, 1),"A.i" = c(3),"U.c" = c(1, 2),"U.i" = c(1)) rv.scores <- calc.rv.score(relations) } diff --git a/man/score.fam.Rd b/man/score.fam.Rd index 1f32802..16c01c0 100644 --- a/man/score.fam.Rd +++ b/man/score.fam.Rd @@ -22,9 +22,9 @@ score.fam( \item{status.df}{A dataframe with the encoded variant/disease status of each individual} -\item{affected.weight}{A coefficient to multiply the calculated A_c and A_i relatedness values by.} +\item{affected.weight}{A coefficient to multiply the calculated A_c and A.i relatedness values by.} -\item{unaffected.weight}{A coefficient to multiply the U_c and U_i relatedness values by.} +\item{unaffected.weight}{A coefficient to multiply the U.c and U.i relatedness values by.} \item{return.sums}{Boolean indicating if sum of family variant scores should be returned (default = FALSE).} @@ -63,5 +63,5 @@ tsv.name1<-system.file('extdata/1234_ex2.tsv', package = 'seqbio.variant.scoring mat.df <- read.relation.mat(mat.name1) ind.df <- read.indiv(tsv.name1) ind.df.status <- score.variant.status(ind.df) -score_default <- score.fam(mat.df, ind.df.status) +score.default <- score.fam(mat.df, ind.df.status) } diff --git a/man/score.variant.status.Rd b/man/score.variant.status.Rd index e41317d..6d2bf1a 100644 --- a/man/score.variant.status.Rd +++ b/man/score.variant.status.Rd @@ -4,7 +4,7 @@ \alias{score.variant.status} \title{Take the dataframe with variants and status and determine which indivudals are scored correctly and which are scored incorrectly. -Assign an A_c, A_i, U_c, U_i, unk} +Assign an A.c, A.i, U.c, U.i, unk} \usage{ score.variant.status(indiv.df, theoretical.max = FALSE) } diff --git a/tests/testthat/test_relatedness.R b/tests/testthat/test_relatedness.R index a2a1dd7..67c59b6 100644 --- a/tests/testthat/test_relatedness.R +++ b/tests/testthat/test_relatedness.R @@ -3,10 +3,10 @@ test_that("Scoring proceeds as expected in all cases", { print("test per encoding score calculation for several combos") print("1. simple proband-sib") - encoded.dat1 <- list("A_c" = c(0, 1), - "A_i" = c(), - "U_c" = c(), - "U_i" = c()) + encoded.dat1 <- list("A.c" = c(0, 1), + "A.i" = c(), + "U.c" = c(), + "U.i" = c()) expected.score1<- list("score" = 3, "score.for" = 3, @@ -15,10 +15,10 @@ test_that("Scoring proceeds as expected in all cases", { expect_equal(expected.score1, score1) print("2. correct A proband and cousin, incorrect unaffected sib") - encoded.dat2 <- list("A_c" = c(0, 3), - "A_i" = c(), - "U_c" = c(), - "U_i" = c(1)) + encoded.dat2 <- list("A.c" = c(0, 3), + "A.i" = c(), + "U.c" = c(), + "U.i" = c(1)) expected.score2<- list("score" = 5, "score.for" = 9, @@ -35,10 +35,10 @@ test_that("Scoring proceeds as expected in all cases", { print("3. bad score proband and incorrect unaffected sib") - encoded.dat3 <- list("A_c" = c(0), - "A_i" = c(), - "U_c" = c(), - "U_i" = c(1)) + encoded.dat3 <- list("A.c" = c(0), + "A.i" = c(), + "U.c" = c(), + "U.i" = c(1)) expected.score3<- list("score" = -3, "score.for" = 1, @@ -48,10 +48,10 @@ test_that("Scoring proceeds as expected in all cases", { print("3. complex score, all situations covered in one.") - encoded.dat4 <- list("A_c" = c(0, 1, 3, 1), - "A_i" = c(3), - "U_c" = c(1, 2), - "U_i" = c(1)) + encoded.dat4 <- list("A.c" = c(0, 1, 3, 1), + "A.i" = c(3), + "U.c" = c(1, 2), + "U.i" = c(1)) expected.score4<- list("score" = 7, "score.for" = 19, diff --git a/vignettes/seqbio.variant.scoring-vignette.Rmd b/vignettes/seqbio.variant.scoring-vignette.Rmd index 525855f..0cf9a5f 100644 --- a/vignettes/seqbio.variant.scoring-vignette.Rmd +++ b/vignettes/seqbio.variant.scoring-vignette.Rmd @@ -64,8 +64,8 @@ For most real-world applications, you will likely want to score family members i ```{r} -ex_score_default <- score.fam(rel.mat, full.df.status) -ex_score_default +ex.score.default <- score.fam(rel.mat, full.df.status) +ex.score.default ``` @@ -78,16 +78,16 @@ As previously noted, if an individual is present in the relationship matrix and The scoring can be changed to summing across all combinations as opposed to the mean by passing the following options. Note using the program in this way will return higher scores for more dense pedigrees. ```{r} -ex_score_sum <- score.fam(rel.mat, full.df.status, return.sums = TRUE, return.means = FALSE) -ex_score_sum +ex.score.sum <- score.fam(rel.mat, full.df.status, return.sums = TRUE, return.means = FALSE) +ex.score.sum ``` To obtain a long form table with the scores for variants expressed relative to each individual, set both `return.sums` and `return.means` to `FALSE`. This output can aid in identifying which individuals are carrying the most weight in a family's score. ```{r} -ex_score_table <- score.fam(rel.mat, full.df.status, return.sums = FALSE, return.means = FALSE) -ex_score_table +ex.score.table <- score.fam(rel.mat, full.df.status, return.sums = FALSE, return.means = FALSE) +ex.score.table ``` ## How scoring works ### A Minimal example, scoring a variant from perspective of a single individual. @@ -103,10 +103,10 @@ rel.mat.proband <- rel.mat["MS-1234-1001",] rel.mat.proband ``` This is taken along with a list of encoded statuses of all individuals for the given variant. Above `score.variant.status` used the disease status and a genetic variant of each individual to determine which of the following categories they fall in to: - - A_c = Affected individual with variant - - A_i = Affected individual without variant - - U_c = Unaffected individual without variant - - U_i = Unaffected individual with variant + - A.c = Affected individual with variant + - A.i = Affected individual without variant + - U.c = Unaffected individual without variant + - U.i = Unaffected individual with variant Within `score.fam`, a labelled list of encodings for all individuals is generated. ```{r} name.stat.dict <- full.df.status$statvar.cat @@ -119,15 +119,15 @@ name.stat.dict rel.dict<-build.relation.dict(rel.mat.proband, name.stat.dict) rel.dict ``` -In this example, the proband, two first degree relations, and a third degree relations are all affected and share the candidate variant. For the affected correct (`A_c`) category we therefore see the following encoded: +In this example, the proband, two first degree relations, and a third degree relations are all affected and share the candidate variant. For the affected correct (`A.c`) category we therefore see the following encoded: ```{r} -rel.dict$A_c +rel.dict$A.c ``` -Since one first degree unaffected relative has the variant, they are categorized as "unaffected incorrect"(`U_i`) and we see: +Since one first degree unaffected relative has the variant, they are categorized as "unaffected incorrect"(`U.i`) and we see: ```{r} -rel.dict$U_i +rel.dict$U.i ``` Deriving a relatedness-weighted score for the variant from the perspective of the given individual is then performed by `calc.rv.score` @@ -141,7 +141,7 @@ for(i in 0:7){ The score for affected status increase as individuals become more distantly related. The formula is: ``` - (1 / coefficient_of_relatedness) * affected.weight + (1 / coefficient.of.relatedness) * affected.weight ``` For example, an affected cousin (encoded as a 3 and having a coefficient of relatedness of 0.125) would get a score of: ``` @@ -152,11 +152,11 @@ For example, an affected cousin (encoded as a 3 and having a coefficient of rela For unaffected individuals, scores decay the further a person is in relation to the query individual based on the formula: ``` -((unaffected.max*2) * coefficient_of_relatedness ) * unaffected.weight +((unaffected.max*2) * coefficient.of.relatedness ) * unaffected.weight ``` For example, with the default `unaffected.weight = 0.5` and unaffected sister that does not have a variant would get a score of ``` - ((8*2) 0.5) * unaffected_weight + ((8*2) 0.5) * unaffected.weight (16 * 0.5) * 0.5 = 4 points for the variant. @@ -168,7 +168,7 @@ If these were the only two relatives considered we could sum the points and get ``` If there is evidence against a variant, this is factored into the score as: ``` - total_score = evidence_for - evidence_against + total.score = evidence.for - evidence.against ``` For example, if there were also an affected sibling without the variant we would have the score against of: ``` From 3e57d7640987c27deb397b62d5345652607fb388 Mon Sep 17 00:00:00 2001 From: CNuge Date: Wed, 7 May 2025 14:45:55 -0300 Subject: [PATCH 23/43] changes to docs based on feedback --- R/encoding.R | 1 + R/pedigree.r | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/R/encoding.R b/R/encoding.R index a618634..6f91176 100644 --- a/R/encoding.R +++ b/R/encoding.R @@ -26,6 +26,7 @@ assign.status <- function(status, variant, theoretical.max=FALSE){ if(theoretical.max){ return("A.c") } + #NOTE - Once in a while 1/0 genotypes crop up; also 0/2 etc. if derived from multi-allelics. This edge case not covered at present. else if(variant == "0/1" || variant == "1/1" || variant == "1" || variant == "0|1" || variant == "1|0" || variant == "1|1" ){ return("A.c") }else if (variant == "0/0" || variant == "0" || variant == "0|0" ){ diff --git a/R/pedigree.r b/R/pedigree.r index e4c5622..c9d6834 100644 --- a/R/pedigree.r +++ b/R/pedigree.r @@ -2,7 +2,7 @@ #' Likelihood function for calculation of Pedigree-based autosomal dominant penetrance value. #' Formula deployed via optimize so as to determine the optimal value. #' -#' @param K Seed value for the estimate of penetrance rate. +#' @param K The range of penetrance values to be explored by the optimization function. #' @param a Count of affected individuals #' @param b Count of obligate carriers #' @param c Count of children of either affecteds or carriers, with no children of their own From 0fbdf14b00eedef1923537aefe95da883c7d5e9b Mon Sep 17 00:00:00 2001 From: CNuge Date: Wed, 7 May 2025 14:53:02 -0300 Subject: [PATCH 24/43] bringing in new dev changed --- R/pedigree.r | 52 ++-------------------------------------------------- 1 file changed, 2 insertions(+), 50 deletions(-) diff --git a/R/pedigree.r b/R/pedigree.r index 8f66294..4c554a9 100644 --- a/R/pedigree.r +++ b/R/pedigree.r @@ -70,23 +70,12 @@ ibd <- function(a, b, c, d, n, K, theoretical=TRUE) { #' #' @param pihat Estimated proportion of genome shared between individuals, from function: ibd. #' @param K Estimated penetrance value, from function: penetrance. -<<<<<<< HEAD -#' @param score.by.K Should the scoreing of families be based on ibd (FALSE), or by IBD and K (TRUE). Default is FALSE -======= ->>>>>>> 3c7b0707afec488ed6daceabeb17626cd7b33712 #' #' @return #' @export #' #' @examples -<<<<<<< HEAD -score <- function(pihat, K=-1, score.by.K=FALSE) { - if(score.by.K==TRUE){ - log(2^pihat*K) - } -======= -rank <- function(pihat, K=-1) { ->>>>>>> 3c7b0707afec488ed6daceabeb17626cd7b33712 +score <- function(pihat, K=-1) { log(2^pihat) } @@ -127,12 +116,7 @@ read.pedigree <- function(filename){ #' - Exclude subjects younger than age of onset #' #' @param h A data frame containing the encoded pedigree information -<<<<<<< HEAD -#' @param score.by.K Should the scoreing of families be based on ibd (FALSE), or by IBD and K (TRUE). Default is FALSE -#' @return A data frame containing the theoretical scoreing of the power of a -======= -#' @return A data frame containing the theoretical ranking of the power of a ->>>>>>> 3c7b0707afec488ed6daceabeb17626cd7b33712 +#' @return A data frame containing the theoretical scoring of the power of a #' family assuming you were able to collect everyone on the simplified pedigree, #' as well as a current scoreing, examining only those for whom you currently have DNA. #' @export @@ -140,17 +124,6 @@ read.pedigree <- function(filename){ #' @examples #' example.pedigree.file <-system.file('extdata/example_pedigree_encoding.tsv', package = 'seqbio.variant.scoring') #' example.pedigree.df <- read.pedigree(example.pedigree.file) -<<<<<<< HEAD -#' penetrance.df <- cal.penetrance(example.pedigree.df) -cal.penetrance <- function(h, score.by.K=FALSE){ - - family_vec <- c() - penetrance_vec <- c() - max_pihat_vec <- c() - max_score_vec <- c() - current_pihat_vec <- c() - current_score_vec <- c() -======= #' penetrance.df <- score.pedigree(example.pedigree.df) score.pedigree <- function(h){ @@ -160,7 +133,6 @@ score.pedigree <- function(h){ max.rank.vec <- c() current.pihat.vec <- c() current.rank.vec <- c() ->>>>>>> 3c7b0707afec488ed6daceabeb17626cd7b33712 for (i in seq_len(nrow(h))) { family <- h[i,"Family"] max.a <- h[i, "max_a"] @@ -177,25 +149,6 @@ score.pedigree <- function(h){ max.d <- as.numeric(strsplit(as.character(max.d), ",")[[1]]) max.n <- as.numeric(strsplit(as.character(max.n), ",")[[1]]) -<<<<<<< HEAD - K <- optimize(penetrance, c(0,1), max_a, max_b, max_c, max_d, max_n, maximum=TRUE)$max - max_pihat <- ibd(max_a, max_b, max_c, max_d, max_n, K) - max_score <- score(max_pihat, K, score.by.K = score.by.K ) - current_pihat <- ibd(a_actual, b_actual, c_actual, d_actual, n_actual, K, FALSE) - current_score <- score(current_pihat, K) - - family_vec <- c(family_vec, family) - penetrance_vec <- c(penetrance_vec, K) - max_pihat_vec <- c(max_pihat_vec, max_pihat) - max_score_vec <- c(max_score_vec, max_score) - current_pihat_vec <- c(current_pihat_vec, current_pihat) - current_score_vec <- c(current_score_vec,current_score) - } - - df <- data.frame(family_vec, penetrance_vec, max_pihat_vec, max_score_vec, current_pihat_vec, current_score_vec) - colnames(df) <- c("family", "penetrance", "max_pi-hat", "max_score", "current_pi-hat", "current_score") - df$pct_of_max <- df$current_score / df$max_score * 100 -======= K <- optimize(penetrance, c(0,1), max.a, max.b, max.c, max.d, max.n, maximum=TRUE)$max max.pihat <- ibd(max.a, max.b, max.c, max.d, max.n, K) max.rank <- rank(max.pihat, K) @@ -213,7 +166,6 @@ score.pedigree <- function(h){ df <- data.frame(family.vec, penetrance.vec, max.pihat.vec, max.rank.vec, current.pihat.vec, current.rank.vec) colnames(df) <- c("family", "penetrance", "max.pi-hat", "max.rank", "current.pi-hat", "current.rank") df$pct.of.max <- df$current.rank / df$max.rank * 100 ->>>>>>> 3c7b0707afec488ed6daceabeb17626cd7b33712 df[, -1] <- round(df[, -1], 2) return(df) } From 6093888625cef38ae845f7cbcdef389cf6872c8c Mon Sep 17 00:00:00 2001 From: CNuge Date: Thu, 22 May 2025 13:10:10 -0300 Subject: [PATCH 25/43] bugfix for the processing of the actual d,n --- NAMESPACE | 4 +- R/pedigree.r | 4 ++ inst/extdata/example_pedigree_encoding2.tsv | 5 ++ man/ibd.Rd | 48 ++++++++++++++++++++ man/penetrance.Rd | 4 +- man/{rank.Rd => score.Rd} | 12 ++--- man/score.fam.Rd | 2 +- man/{cal.penetrance.Rd => score.pedigree.Rd} | 10 ++-- tests/testthat/test_pedigree_eval.R | 10 +++- 9 files changed, 79 insertions(+), 20 deletions(-) create mode 100644 inst/extdata/example_pedigree_encoding2.tsv create mode 100644 man/ibd.Rd rename man/{rank.Rd => score.Rd} (51%) rename man/{cal.penetrance.Rd => score.pedigree.Rd} (78%) diff --git a/NAMESPACE b/NAMESPACE index 3caed85..0fe8b7e 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -3,15 +3,15 @@ S3method(sum,fam.scores) export(assign.status) export(build.relation.dict) -export(score.pedigree) export(calc.rv.score) export(encode.rows) export(ibd) export(penetrance) -export(rank) export(read.indiv) export(read.pedigree) export(read.relation.mat) export(read.var.table) +export(score) export(score.fam) +export(score.pedigree) export(score.variant.status) diff --git a/R/pedigree.r b/R/pedigree.r index 4c554a9..2b43d20 100644 --- a/R/pedigree.r +++ b/R/pedigree.r @@ -136,6 +136,7 @@ score.pedigree <- function(h){ for (i in seq_len(nrow(h))) { family <- h[i,"Family"] max.a <- h[i, "max_a"] + #Yeezy yeezy whats good its ya boy max.b <- h[i, "max_b"] max.c <- h[i, "max_c"] max.d <- h[i, "max_d"] @@ -149,6 +150,9 @@ score.pedigree <- function(h){ max.d <- as.numeric(strsplit(as.character(max.d), ",")[[1]]) max.n <- as.numeric(strsplit(as.character(max.n), ",")[[1]]) + d.actual <- as.numeric(strsplit(as.character(d.actual), ",")[[1]]) + n.actual <- as.numeric(strsplit(as.character(n.actual), ",")[[1]]) + K <- optimize(penetrance, c(0,1), max.a, max.b, max.c, max.d, max.n, maximum=TRUE)$max max.pihat <- ibd(max.a, max.b, max.c, max.d, max.n, K) max.rank <- rank(max.pihat, K) diff --git a/inst/extdata/example_pedigree_encoding2.tsv b/inst/extdata/example_pedigree_encoding2.tsv new file mode 100644 index 0000000..7272d15 --- /dev/null +++ b/inst/extdata/example_pedigree_encoding2.tsv @@ -0,0 +1,5 @@ +Family max_a a max_b b max_c c max_d d max_n n +Fam1 2 2 1 1 5 0 1 1 1 1 +Fam2 3 2 1 0 4 1 0 0 0 0 +Fam3 3 3 3 0 2 1 0 0 0 0 +Fam4 2 2 0 0 3 0 1,1 1,1 2,4 2,4 diff --git a/man/ibd.Rd b/man/ibd.Rd new file mode 100644 index 0000000..0e4f4b4 --- /dev/null +++ b/man/ibd.Rd @@ -0,0 +1,48 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/pedigree.r +\name{ibd} +\alias{ibd} +\title{Calculation of Identity by descent (IBD).} +\usage{ +ibd(a, b, c, d, n, K, theoretical = TRUE) +} +\arguments{ +\item{a}{Count of affected individuals} + +\item{b}{Count of obligate carriers} + +\item{c}{Count of children of either affecteds or carriers, with no children of their own} + +\item{d}{Count of Trees of unaffected individuals - specifically, two sequential generations (i.e. a parent and their offspring)} + +\item{n}{Count of the number of second generation progeny in a given tree.} + +\item{K}{The estimate of penetrance rate.} + +\item{theoretical}{Boolean indicating if the calculation should be +theoretical IBD calculation (using only d and k), or if the calculation +should use the provided n value.} +} +\value{ +pi-hat value. The proportion of genome shared between individuals. +} +\description{ +Use the relationship informationfrom the pedigree to +estimate of the amount of the genome they have inherited it from a +common ancestor without recombination. +} +\details{ +Can do this for the total potential relatedness in a pedigree (theoretical=TRUE), +or for the actual relatedness across collected samples (theoretical=FALSE). +For the theoretical=TRUE case, in the unaffected trees, if we have a sample from the parent, +then the offspring do not provide any additional information for a max IBD calculation. +This means that K does not scale with n. + +For theoretical=FALSE, sometimes we don’t have the healthy parent in an unaffected tree, +and only have a child. In this case, the IBD contribution from the child is 1/4, +and since they’re unaffected and therefore are a counter-filter, +they would contribute 1-1/4 = 3/4 to the total relatedness. +Either the parent is a non-obligate carrier, or is a non-carrier. +The probability of the children depends on which of those is true, +so we have the additional set of terms in the theoretical=FALSE logic. +} diff --git a/man/penetrance.Rd b/man/penetrance.Rd index 12e3045..50a2dc5 100644 --- a/man/penetrance.Rd +++ b/man/penetrance.Rd @@ -8,7 +8,7 @@ Formula deployed via optimize so as to determine the optimal value.} penetrance(K, a, b, c, d, n) } \arguments{ -\item{K}{Seed value for the estimate of penetrance rate.} +\item{K}{The range of penetrance values to be explored by the optimization function.} \item{a}{Count of affected individuals} @@ -16,7 +16,7 @@ penetrance(K, a, b, c, d, n) \item{c}{Count of children of either affecteds or carriers, with no children of their own} -\item{d}{Count of Trees of unaffected individuals - specifically, two sequential generations (i.e. a parent and their offspring)} +\item{d}{Count of trees of unaffected individuals - specifically, two sequential generations (i.e. a parent and their offspring)} \item{n}{Count of the number of second generation progeny in a given tree.} } diff --git a/man/rank.Rd b/man/score.Rd similarity index 51% rename from man/rank.Rd rename to man/score.Rd index fae0da3..a37d271 100644 --- a/man/rank.Rd +++ b/man/score.Rd @@ -1,18 +1,16 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/pedigree.r -\name{rank} -\alias{rank} -\title{Rank the pedigrees using the pihat values.} +\name{score} +\alias{score} +\title{Score the pedigrees using the pihat values.} \usage{ -rank(pihat, K = -1, rank.by.K = FALSE) +score(pihat, K = -1) } \arguments{ \item{pihat}{Estimated proportion of genome shared between individuals, from function: ibd.} \item{K}{Estimated penetrance value, from function: penetrance.} - -\item{rank.by.K}{Should the ranking of families be based on ibd (FALSE), or by IBD and K (TRUE). Default is FALSE} } \description{ -Rank the pedigrees using the pihat values. +Score the pedigrees using the pihat values. } diff --git a/man/score.fam.Rd b/man/score.fam.Rd index 16c01c0..a8860a1 100644 --- a/man/score.fam.Rd +++ b/man/score.fam.Rd @@ -22,7 +22,7 @@ score.fam( \item{status.df}{A dataframe with the encoded variant/disease status of each individual} -\item{affected.weight}{A coefficient to multiply the calculated A_c and A.i relatedness values by.} +\item{affected.weight}{A coefficient to multiply the calculated A.c and A.i relatedness values by.} \item{unaffected.weight}{A coefficient to multiply the U.c and U.i relatedness values by.} diff --git a/man/cal.penetrance.Rd b/man/score.pedigree.Rd similarity index 78% rename from man/cal.penetrance.Rd rename to man/score.pedigree.Rd index 23a4905..f6b0ab7 100644 --- a/man/cal.penetrance.Rd +++ b/man/score.pedigree.Rd @@ -4,20 +4,18 @@ \alias{score.pedigree} \title{Take the encoded information about the pedigrees and calculate penetrance.} \usage{ -score.pedigree(h, rank.by.K = FALSE) +score.pedigree(h) } \arguments{ \item{h}{A data frame containing the encoded pedigree information} - -\item{rank.by.K}{Should the ranking of families be based on ibd (FALSE), or by IBD and K (TRUE). Default is FALSE} } \value{ -A data frame containing the theoretical ranking of the power of a +A data frame containing the theoretical scoring of the power of a family assuming you were able to collect everyone on the simplified pedigree, -as well as a current ranking, examining only those for whom you currently have DNA. +as well as a current scoreing, examining only those for whom you currently have DNA. } \description{ -Determine a value rank of families by comparing their relationship structure. +Determine a value score of families by comparing their relationship structure. More distant relationships between affecteds (e.g. affected cousins) is more valuable that close relationships (e.g. affected siblings) as there is less IBD and therefore a smaller search space. diff --git a/tests/testthat/test_pedigree_eval.R b/tests/testthat/test_pedigree_eval.R index 4c8b857..a877adf 100644 --- a/tests/testthat/test_pedigree_eval.R +++ b/tests/testthat/test_pedigree_eval.R @@ -3,10 +3,16 @@ test_that("Data are read from files correctly", { ################################## # test 1 example.pedigree.file <-system.file('extdata/example_pedigree_encoding.tsv', package = 'seqbio.variant.scoring') + example.pedigree.df <- read.pedigree(example.pedigree.file) + penetrance.df <- score.pedigree(example.pedigree.df) + ################################## + # test 2 + example.pedigree.file2 <-system.file('extdata/example_pedigree_encoding2.tsv', package = 'seqbio.variant.scoring') + example.pedigree.file2 <- "inst/extdata/example_pedigree_encoding2.tsv" + example.pedigree.df2 <- read.pedigree(example.pedigree.file2) + penetrance.df2 <- score.pedigree(example.pedigree.df2) - example.pedigree.df <- read.pedigree(example.pedigree.file) - penetrance.df <- score.pedigree(example.pedigree.df) }) From b60a7316e5204bc86e6d4e1dfe4faa8f5297d068 Mon Sep 17 00:00:00 2001 From: CNuge Date: Thu, 22 May 2025 14:26:30 -0300 Subject: [PATCH 26/43] docs and testing --- inst/extdata/example_pedigree_encoding3.tsv | 5 +++++ tests/testthat/test_pedigree_eval.R | 8 +++++++- 2 files changed, 12 insertions(+), 1 deletion(-) create mode 100644 inst/extdata/example_pedigree_encoding3.tsv diff --git a/inst/extdata/example_pedigree_encoding3.tsv b/inst/extdata/example_pedigree_encoding3.tsv new file mode 100644 index 0000000..cee5499 --- /dev/null +++ b/inst/extdata/example_pedigree_encoding3.tsv @@ -0,0 +1,5 @@ +Family max_a a max_b b max_c c max_d d max_n n +Fam1 2 2 1 1 5 0 1 1 1 1 +Fam2 3 2 1 0 4 1 0 0 0 0 +Fam3 3 3 3 0 2 1 0 0 0 0 +Fam4 2 2 0 0 3 0 2 2 0 0 diff --git a/tests/testthat/test_pedigree_eval.R b/tests/testthat/test_pedigree_eval.R index a877adf..b9fe8d9 100644 --- a/tests/testthat/test_pedigree_eval.R +++ b/tests/testthat/test_pedigree_eval.R @@ -9,10 +9,16 @@ test_that("Data are read from files correctly", { ################################## # test 2 example.pedigree.file2 <-system.file('extdata/example_pedigree_encoding2.tsv', package = 'seqbio.variant.scoring') - example.pedigree.file2 <- "inst/extdata/example_pedigree_encoding2.tsv" + #example.pedigree.file2 <- "inst/extdata/example_pedigree_encoding2.tsv" example.pedigree.df2 <- read.pedigree(example.pedigree.file2) penetrance.df2 <- score.pedigree(example.pedigree.df2) + ################################## + # test 3 + example.pedigree.file3 <-system.file('extdata/example_pedigree_encoding3.tsv', package = 'seqbio.variant.scoring') + #example.pedigree.file3 <- "inst/extdata/example_pedigree_encoding3.tsv" + example.pedigree.df3 <- read.pedigree(example.pedigree.file3) + penetrance.df3 <- score.pedigree(example.pedigree.df3) }) From 6fd78a6c4335de78fdce2721fefa657b87538190 Mon Sep 17 00:00:00 2001 From: CNuge Date: Thu, 22 May 2025 15:01:53 -0300 Subject: [PATCH 27/43] adding in unit test for the scoring --- R/pedigree.r | 18 +++++++++--------- tests/testthat/test_pedigree_eval.R | 12 ++++++++++++ 2 files changed, 21 insertions(+), 9 deletions(-) diff --git a/R/pedigree.r b/R/pedigree.r index 2b43d20..f45494f 100644 --- a/R/pedigree.r +++ b/R/pedigree.r @@ -130,9 +130,9 @@ score.pedigree <- function(h){ family.vec <- c() penetrance.vec <- c() max.pihat.vec <- c() - max.rank.vec <- c() + max.score.vec <- c() current.pihat.vec <- c() - current.rank.vec <- c() + current.score.vec <- c() for (i in seq_len(nrow(h))) { family <- h[i,"Family"] max.a <- h[i, "max_a"] @@ -155,21 +155,21 @@ score.pedigree <- function(h){ K <- optimize(penetrance, c(0,1), max.a, max.b, max.c, max.d, max.n, maximum=TRUE)$max max.pihat <- ibd(max.a, max.b, max.c, max.d, max.n, K) - max.rank <- rank(max.pihat, K) + max.score <- score(max.pihat, K) current.pihat <- ibd(a.actual, b.actual, c.actual, d.actual, n.actual, K, FALSE) - current.rank <- rank(current.pihat, K) + current.score <- score(current.pihat, K) family.vec <- c(family.vec, family) penetrance.vec <- c(penetrance.vec, K) max.pihat.vec <- c(max.pihat.vec, max.pihat) - max.rank.vec <- c(max.rank.vec, max.rank) + max.score.vec <- c(max.score.vec, max.score) current.pihat.vec <- c(current.pihat.vec, current.pihat) - current.rank.vec <- c(current.rank.vec,current.rank) + current.score.vec <- c(current.score.vec,current.score) } - df <- data.frame(family.vec, penetrance.vec, max.pihat.vec, max.rank.vec, current.pihat.vec, current.rank.vec) - colnames(df) <- c("family", "penetrance", "max.pi-hat", "max.rank", "current.pi-hat", "current.rank") - df$pct.of.max <- df$current.rank / df$max.rank * 100 + df <- data.frame(family.vec, penetrance.vec, max.pihat.vec, max.score.vec, current.pihat.vec, current.score.vec) + colnames(df) <- c("family", "penetrance", "max.pi-hat", "max.score", "current.pi-hat", "current.score") + df$pct.of.max <- df$current.score / df$max.score * 100 df[, -1] <- round(df[, -1], 2) return(df) } diff --git a/tests/testthat/test_pedigree_eval.R b/tests/testthat/test_pedigree_eval.R index b9fe8d9..3dddf04 100644 --- a/tests/testthat/test_pedigree_eval.R +++ b/tests/testthat/test_pedigree_eval.R @@ -20,5 +20,17 @@ test_that("Data are read from files correctly", { example.pedigree.df3 <- read.pedigree(example.pedigree.file3) penetrance.df3 <- score.pedigree(example.pedigree.df3) + expected.penetrance.df3 <- data.frame( + "family" = c("Fam1", "Fam2", "Fam3", "Fam4"), + "penetrance" = c(0.37,0.58,0.45,0.57), + "max.pi-hat" = c(9.76,8.97,7.73,8.43), + "max.score" = c(6.76,6.22,5.36,5.84), + "current.pi-hat" = c(2.97,2.49,3.36,3.97), + "current.score" = c(2.06,1.73,2.33,2.75), + "pct.of.max" = c(30.44,27.79,43.53,47.12) + ) + colnames(expected.penetrance.df3) <- c("family", "penetrance", "max.pi-hat", "max.score", "current.pi-hat", "current.score", "pct.of.max") + + expect_equal(all.equal(penetrance.df3, expected.penetrance.df3), TRUE) }) From 294a5e05414e82840480904a394c7bea7cb1c9d9 Mon Sep 17 00:00:00 2001 From: CNuge Date: Fri, 23 May 2025 09:10:11 -0300 Subject: [PATCH 28/43] encoding error graceful handling --- R/encoding.R | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/R/encoding.R b/R/encoding.R index 6f91176..a5208f3 100644 --- a/R/encoding.R +++ b/R/encoding.R @@ -22,6 +22,8 @@ #' assign.status("U", "0|0") =="U.c" #' @export assign.status <- function(status, variant, theoretical.max=FALSE){ + + var.err<-"Incompatible variant value! Supported encodings are: '0' '1' '0/0' '0/1' '0|0' '0|1'" if(status == "A"){ if(theoretical.max){ return("A.c") @@ -32,7 +34,7 @@ assign.status <- function(status, variant, theoretical.max=FALSE){ }else if (variant == "0/0" || variant == "0" || variant == "0|0" ){ return("A.i") }else{ - return("unk") + stop(var.err) } }else if (status == "U"){ if(theoretical.max){ @@ -42,10 +44,10 @@ assign.status <- function(status, variant, theoretical.max=FALSE){ }else if (variant == "0/0" || variant == "0" || variant == "0|0" ){ return("U.c") }else{ - return("unk") + stop(var.err) } }else{ - return("unk") + stop("Status must be one of: U or A") } } From 72cdc3c4206c96eefe5de2e716efd53768f4528e Mon Sep 17 00:00:00 2001 From: CNuge Date: Fri, 23 May 2025 15:40:57 -0300 Subject: [PATCH 29/43] draft of readme --- README.md | 60 +++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 50 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 721ec02..bf8d8c8 100644 --- a/README.md +++ b/README.md @@ -1,22 +1,29 @@ # seqbio.variant.scoring ## An R package method for scoring variants based on relationships of individuals. -*WORK IN PROGRESS - currently a minimal viable product.* ## Introduction -When looking at shared rare variants across families, not all sets of affected and unaffected individuals are equal. -This R package is designed to score rare variants, assigning values based on the disease status of individuals, -the presence or absence of a rare variant in those individuals, and their pairwise coefficients of relatedness. -The package uses a custom formula to assign value to a variant that gives more weight to shared variants common -to distantly related affected individuals. The variant status for unaffected individuals can optionally be considered -as well, with the highest scoring values being given to closely related individuals that *do not* share a variant of interst. -Since variants can be incompletely penetrant, the scoring can be based solely on the affected individuals, or the weight -of unaffected evidence can be customized. + +## Installation + + +The development version of `debar` can be installed directly from GitHub. You'll need to have the R package `devtools` installed and loaded. Also note if the build_vignettes option is set to true, you will need to have the R package `knitr` installed. + +``` +#install.packages("devtools") +#install.packages("knitr") #required if build_vignettes = TRUE +#library(devtools) +devtools::install_github("SequenceBio/seqbio.variant.scoring", build_vignettes = TRUE) +library(seqbio.variant.scoring) +``` + ## How it works +The package's vignette contains detailed explanations of the functions and parameters. + For a walk through of the seqbio.variant.scoring functions for scoring the value of *families* based on penetrance and IBD, see the corresponging vignette file: `vignettes/seqbio.variant.scoring-penetrance_and_ibd.Rmd` or within R, run: @@ -34,18 +41,51 @@ vignette('seqbio.variant.scoring-variant_scoring') ## Scoring families -### Then encoding file +### The encoding file +See the included example data, which encodes 14 families. See the accompanying vignette for more information on encoding pedigrees: +``` + example.pedigree.file <-system.file('extdata/example_pedigree_encoding.tsv', package = 'seqbio.variant.scoring') +``` +The data can be loaded with the following function: +``` + example.pedigree.df <- read.pedigree(example.pedigree.file) +``` +and scoring then performed: +``` + scoring.df <- score.pedigree(example.pedigree.df) +``` ## Scoring Variants +When looking at shared rare variants across families, not all sets of affected and unaffected individuals are equal. This R package is designed to score rare variants, assigning values based on the disease status of individuals, the presence or absence of a rare variant in those individuals, and their pairwise coefficients of relatedness. The package uses a custom formula to assign value to a variant that gives more weight to shared variants common to distantly related affected individuals. The variant status for unaffected individuals can optionally be considered as well, with the highest scoring values being given to closely related individuals that *do not* share a variant of interst. Since variants can be incompletely penetrant, the scoring can be based solely on the affected individuals, or the weight of unaffected evidence can be customized. + + ### The relationship matrix This input is a matrix containing all the pairwise relationships of individuals in a family. The row and column names are the individual IDs, and the intersecting value denotes the degree of relationship between the individuals (self = 0, 1st degree relations. = 1, etc. Unrelated individuals are given a value of -1). As of version `0.1.0` the relation matrix is a manually created file, where relationship values are assigned via manual inspection of the family pedigree. +``` +mat.name<-system.file('extdata/1234_ex2.mat', package = 'seqbio.variant.scoring') + +rel.mat <- read.relation.mat(mat.name) +``` + ### The status file This file includes the same individual IDs used in the relationship matrix as well as the disease and variant status for all individuals. +``` +tsv.name<-system.file('extdata/1234_ex2.tsv', package = 'seqbio.variant.scoring') +ind.df <- read.indiv(tsv.name) + +ind.df.status <- score.variant.status(notch1234.df) +``` + +### Scoring variants +The two streams of information can then be combined to score a variant based off the relationships of individuals. +``` +score.example <- score.fam(rel.mat, ind.df.status) +``` \ No newline at end of file From c76892c063ba8bb322140202cf241c2f9849293e Mon Sep 17 00:00:00 2001 From: CNuge Date: Mon, 26 May 2025 13:49:17 -0300 Subject: [PATCH 30/43] adding preamble --- README.md | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index bf8d8c8..d3e5591 100644 --- a/README.md +++ b/README.md @@ -1,16 +1,17 @@ # seqbio.variant.scoring -## An R package method for scoring variants based on relationships of individuals. +## An R package for relationship-informed pedigree and variant scoring ## Introduction +Family-based genetic studies are effective for identifying rare variants underlying heritable diseases, yet are often challenged by issues such as incomplete penetrance and the difficulty of prioritizing numerous candidate variants. The proportion of the genome with identity-by-descent (IBD) and estimates of penetrance are metrics that can show the value of different pedigrees in family-based studies. Additionally, IBD and the genotypes of individuals are combined by seqbio.variant.scoringto to score the value of candidate variants. ## Installation -The development version of `debar` can be installed directly from GitHub. You'll need to have the R package `devtools` installed and loaded. Also note if the build_vignettes option is set to true, you will need to have the R package `knitr` installed. +The development version of `seqbio.variant.scoring` can be installed directly from GitHub. You'll need to have the R package `devtools` installed and loaded. Also note if the build_vignettes option is set to true, you will need to have the R package `knitr` installed. ``` #install.packages("devtools") @@ -24,14 +25,14 @@ library(seqbio.variant.scoring) The package's vignette contains detailed explanations of the functions and parameters. -For a walk through of the seqbio.variant.scoring functions for scoring the value of *families* based on penetrance and IBD, see the corresponging vignette file: +For a walk through of the `seqbio.variant.scoring` functions for scoring the value of *families* based on penetrance and IBD, see the corresponging vignette file: `vignettes/seqbio.variant.scoring-penetrance_and_ibd.Rmd` or within R, run: ``` vignette('seqbio.variant.scoring-penetrance_and_ibd') ``` -For a walk through of the seqbio.variant.scoring functions for scoring the value of *variants* within families, see the corresponging vignette file: +For a walk through of the `seqbio.variant.scoring` functions for scoring the value of *variants* within families, see the corresponging vignette file: `vignettes/seqbio.variant.scoring-variant_scoring.Rmd` or within R, run: From b0fd92ec358a5c5a9d64ab3efd33ee46c1290ffa Mon Sep 17 00:00:00 2001 From: CNuge Date: Mon, 26 May 2025 13:50:18 -0300 Subject: [PATCH 31/43] expanding preamble --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index d3e5591..d6f6409 100644 --- a/README.md +++ b/README.md @@ -6,11 +6,11 @@ Family-based genetic studies are effective for identifying rare variants underlying heritable diseases, yet are often challenged by issues such as incomplete penetrance and the difficulty of prioritizing numerous candidate variants. The proportion of the genome with identity-by-descent (IBD) and estimates of penetrance are metrics that can show the value of different pedigrees in family-based studies. Additionally, IBD and the genotypes of individuals are combined by seqbio.variant.scoringto to score the value of candidate variants. +The `seqbio.variant.scoring` R package is meant to aid in comparative evaluation of families and candidate variants in rare-variant association studies. The package can be used for two methodologically overlapping but distinct purposes: 1) prior to any genetic/genomic evaluation, evaluation of relative detection power of pedigrees, can direct recruitment efforts by showing which unsampled individuals would be the most meaningful additions to a study, and 2) after sequencing and analysis, variants based on association with disease status and familial relationships of individuals, aids in variant prioritization ## Installation - The development version of `seqbio.variant.scoring` can be installed directly from GitHub. You'll need to have the R package `devtools` installed and loaded. Also note if the build_vignettes option is set to true, you will need to have the R package `knitr` installed. ``` From 9b8759946856aeec8305bdf700c5d78b01618db3 Mon Sep 17 00:00:00 2001 From: CNuge Date: Mon, 26 May 2025 13:55:54 -0300 Subject: [PATCH 32/43] generic variable names --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index d6f6409..40544e9 100644 --- a/README.md +++ b/README.md @@ -80,7 +80,7 @@ This file includes the same individual IDs used in the relationship matrix as we tsv.name<-system.file('extdata/1234_ex2.tsv', package = 'seqbio.variant.scoring') ind.df <- read.indiv(tsv.name) -ind.df.status <- score.variant.status(notch1234.df) +ind.df.status <- score.variant.status(ex1234.df) ``` From 6aefe617a8164000a6d096ca8f53d2f5d69bab33 Mon Sep 17 00:00:00 2001 From: CNuge Date: Mon, 26 May 2025 14:14:03 -0300 Subject: [PATCH 33/43] adding dependencies --- .github/ISSUE_TEMPLATE/bug_report.md | 34 +++++++++++++ .github/ISSUE_TEMPLATE/feature-addition.md | 26 ++++++++++ .github/pull_request_template.md | 25 ++++++++++ .github/workflows/main.yaml | 58 ++++++++++++++++++++++ DESCRIPTION | 3 ++ 5 files changed, 146 insertions(+) create mode 100644 .github/ISSUE_TEMPLATE/bug_report.md create mode 100644 .github/ISSUE_TEMPLATE/feature-addition.md create mode 100644 .github/pull_request_template.md create mode 100644 .github/workflows/main.yaml diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md new file mode 100644 index 0000000..605bb8e --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -0,0 +1,34 @@ +--- +name: Bug Report +about: Create a report about something not working as expected. +title: '' +labels: '' +assignees: '' + +--- + +## **Describe the bug** +A clear and concise description of what the bug is. + +## **To reproduce the bug** +Steps to reproduce the observed issue: +1. Go to '...' +2. Click on '....' +3. Scroll down to '....' +4. See error + +## **Expected behaviour** +A clear and concise description of what you expected to happen. + +## **Screenshots and data** +If applicable, add input/output data (files or cloud location) or screenshots to help explain your problem. + +## **Additional context** +Add any other context about the problem here. + +## **Proposed troubleshooting steps** +Describe how you're going to try to fix it! +- [ ] This can be a task list + +## **Help requests** +Describe any specific aspects of the problem/solution where help from a teammate is required. Tag them so they're aware. diff --git a/.github/ISSUE_TEMPLATE/feature-addition.md b/.github/ISSUE_TEMPLATE/feature-addition.md new file mode 100644 index 0000000..b824490 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature-addition.md @@ -0,0 +1,26 @@ +--- +name: Feature Addition +about: Suggest an idea for this project. +title: '' +labels: '' +assignees: '' + +--- + +## **Is your feature request related to a problem? Please describe.** +A clear and concise description of what the problem is. Ex. Currently we cannot test ___, the way ___ is done is disorganized [...] + +## **Describe the solution you'd like** +A clear and concise description of what you want to happen, in plain english. + +## **Describe the plan** +A clear and concise description of the path forward as you see it. Describe any uncertainties or alternative solutions that need to be compared. + +### **TODO** +- [ ] Break the plan down into specifics + +## **Additional context** +Add any other context or screenshots about the feature request here. + +## **Help requests** +Describe any specific aspects of the problem/solution where help from a teammate is required. Tag them so they're aware. diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md new file mode 100644 index 0000000..6b4941a --- /dev/null +++ b/.github/pull_request_template.md @@ -0,0 +1,25 @@ +# Overview of Pull Request +A brief description of _why_ this change is necessary (couple of sentences/short paragraph). + +## Release notes +A brief description of what this change is in plain english. What does this change add, fix, or address? (couple of sentences/short paragraph) + +## Closes the following issues + - _[Links to any relevant GitHub Issues]_ + +## Pre-review checklist +Make sure all these boxes are checked before tagging a reviewer! + +- [ ] The PR title is a concise, present-tense summary of the change +- [ ] The PR is linked to any relevant GitHub Issues, if they exist +- [ ] The PR is against the intended branch, and is mergeable +- [ ] The code is sufficiently tested (unit tests and real world experiments, where applicable) +- [ ] The code is linted and meets style guidelines +- [ ] The changelog has been updated +- [ ] I reviewed the PR myself before requesting a review from others + +## Pre-release notes +* External dependencies affected + - [ ] _List any dependencies that may be affected by this change or need to be updated to align with this change_ +* Post-release tasks + - [ ] _List any rake tasks or other TODOs that need to be sorted after this change is released_ diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml new file mode 100644 index 0000000..3d2295e --- /dev/null +++ b/.github/workflows/main.yaml @@ -0,0 +1,58 @@ +name: ContiniousIntegrationChecks +run-name: ${{ github.actor }} triggered CI run with event ${{ github.event_name }} for branch ${{ github.ref }} +on: + push: + branches: [ main, ci_add] + pull_request: + branches_ignore: [] +permissions: + id-token: write + contents: read # This is required for actions/checkout +jobs: + fullCI: + defaults: + run: + shell: bash -l {0} + name: CI test + runs-on: ubuntu-latest + steps: + - run: echo "The job was automatically triggered by a ${{ github.event_name }} event." + - run: echo "The name of your branch is ${{ github.ref }} and your repository is ${{ github.repository }}." + - run: echo "The job's status is ${{ job.status }}." + - name: Git clone the repository + uses: actions/checkout@v4 + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.x' + - name: Configure AWS Credentials + uses: aws-actions/configure-aws-credentials@v4 + with: + audience: sts.amazonaws.com + aws-region: ca-central-1 + role-to-assume: arn:aws:iam::188162314097:role/github-action-s3-package-access + - name: Set up conda + uses: conda-incubator/setup-miniconda@v3 + with: + auto-activate-base: false + python-version: 3.12 + channels: bioconda, conda-forge, defaults + - name: Conda create env and install dependencies + shell: bash -l {0} + run: | + conda deactivate + conda install boto3 + conda config --add channels s3://package-mirror-sbi-shared/seqbio-pkgs/ + conda create --name=sfvf_test python numpy pytest pytest-cov pandas pyyaml seqbiopy + - name: Conda activate env and run unit tests + shell: bash -l {0} + run: | + conda activate /usr/share/miniconda/envs/sfvf_test + conda install bcftools>=1.21 + python -m pytest --cov workflow/scripts +#commands above ensure that the channel is available, that the CLI tool is available, and +#that unit tests using the package pass +# - name: Run PreCommit +# uses: pre-commit/action@v3.0.1 +#test syntax with: action-validator .github/workflows/main.yaml +#from: https://github.com/mpalmer/action-validator diff --git a/DESCRIPTION b/DESCRIPTION index ac26179..4ccb218 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -20,3 +20,6 @@ License: MIT + file LICENSE Encoding: UTF-8 Roxygen: list(markdown = TRUE) RoxygenNote: 7.3.2 +Suggests: + devtools, + testthat From 306549fc30d238dedf2cdaac21ccee647a340980 Mon Sep 17 00:00:00 2001 From: CNuge Date: Mon, 26 May 2025 14:24:05 -0300 Subject: [PATCH 34/43] test version --- .github/workflows/main.yaml | 16 ++++------------ DESCRIPTION | 18 +++++++----------- 2 files changed, 11 insertions(+), 23 deletions(-) diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml index 3d2295e..beffa22 100644 --- a/.github/workflows/main.yaml +++ b/.github/workflows/main.yaml @@ -21,16 +21,8 @@ jobs: - run: echo "The job's status is ${{ job.status }}." - name: Git clone the repository uses: actions/checkout@v4 - - name: Set up Python - uses: actions/setup-python@v4 - with: - python-version: '3.x' - - name: Configure AWS Credentials - uses: aws-actions/configure-aws-credentials@v4 - with: - audience: sts.amazonaws.com - aws-region: ca-central-1 - role-to-assume: arn:aws:iam::188162314097:role/github-action-s3-package-access + - name: Set up R env + uses: r-lib/actions/setup-r@v2 - name: Set up conda uses: conda-incubator/setup-miniconda@v3 with: @@ -40,8 +32,8 @@ jobs: - name: Conda create env and install dependencies shell: bash -l {0} run: | - conda deactivate - conda install boto3 + conda install r-devtools + conda install r-testthat conda config --add channels s3://package-mirror-sbi-shared/seqbio-pkgs/ conda create --name=sfvf_test python numpy pytest pytest-cov pandas pyyaml seqbiopy - name: Conda activate env and run unit tests diff --git a/DESCRIPTION b/DESCRIPTION index 4ccb218..4303fb1 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -5,17 +5,13 @@ Authors@R: person("Cameron", "Nugent", , "cam.nugent@sequencebio.com", role = c("aut", "cre"), comment = c(ORCID = "0000-0002-1135-2605")) Description: - This R package is designed to score rare variants, assigning values based on - the disease status of individuals, the presence or absence of a rare variant - in those individuals, and their pairwise coefficients of relatedness. - The package uses a custom formula to assign value to a variant that gives - more weight to shared variants common to distantly related affected - individuals. The variant status for unaffected individuals can optionally - be considered as well, with the highest scoring values being given to - closely related individuals that do not share a variant of interst. - Since variants can be incompletely penetrant, the scoring can be based - solely on the affected individuals, or the weight of unaffected evidence - can be customized. + The seqbio.variant.scoring R package is meant to aid in comparative evaluation of families + and candidate variants in rare-variant association studies. The package can be used for + two methodologically overlapping but distinct purposes: First, the prior to any genetic/genomic + evaluation, evaluation of relative detection power of pedigrees, can direct recruitment + efforts by showing which unsampled individuals would be the most meaningful additions to a study. + Second, after sequencing and analysis, variants based on association with disease status + and familial relationships of individuals, aids in variant prioritization License: MIT + file LICENSE Encoding: UTF-8 Roxygen: list(markdown = TRUE) From 6b822d6fe94eba05951a7a7f0b06e966f03f4e9d Mon Sep 17 00:00:00 2001 From: CNuge Date: Mon, 26 May 2025 14:31:15 -0300 Subject: [PATCH 35/43] CI checks --- .github/workflows/main.yaml | 19 +++---------------- 1 file changed, 3 insertions(+), 16 deletions(-) diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml index beffa22..6648884 100644 --- a/.github/workflows/main.yaml +++ b/.github/workflows/main.yaml @@ -32,19 +32,6 @@ jobs: - name: Conda create env and install dependencies shell: bash -l {0} run: | - conda install r-devtools - conda install r-testthat - conda config --add channels s3://package-mirror-sbi-shared/seqbio-pkgs/ - conda create --name=sfvf_test python numpy pytest pytest-cov pandas pyyaml seqbiopy - - name: Conda activate env and run unit tests - shell: bash -l {0} - run: | - conda activate /usr/share/miniconda/envs/sfvf_test - conda install bcftools>=1.21 - python -m pytest --cov workflow/scripts -#commands above ensure that the channel is available, that the CLI tool is available, and -#that unit tests using the package pass -# - name: Run PreCommit -# uses: pre-commit/action@v3.0.1 -#test syntax with: action-validator .github/workflows/main.yaml -#from: https://github.com/mpalmer/action-validator + conda create -n test_r r-base r-devtools r-testthat + conda activate test_r + Rscript -e "testthat::test_local()" From 71970110b8ff17b9050a84b5b3654c1fbc3ee1f4 Mon Sep 17 00:00:00 2001 From: CNuge Date: Tue, 27 May 2025 15:48:15 -0300 Subject: [PATCH 36/43] all CRAN errors and warnings addressed --- DESCRIPTION | 12 +++-- NAMESPACE | 2 +- R/encoding.R | 5 +- R/pedigree.r | 54 ++++++++++++------- R/relatedness.r | 12 +++-- man/{sum.fam.scores.Rd => add.fam.scores.Rd} | 8 +-- man/build.relation.dict.Rd | 3 -- man/calc.rv.score.Rd | 2 +- man/encode.rows.Rd | 5 +- man/ibd.Rd | 12 +++-- man/penetrance.Rd | 4 ++ man/read.pedigree.Rd | 7 ++- man/score.Rd | 13 +++-- man/score.fam.Rd | 3 +- man/score.pedigree.Rd | 9 ++-- man/subset.mat.Rd | 8 +++ tests/testthat/test_relatedness.R | 2 +- ...bio.variant.scoring-penetrance_and_ibd.Rmd | 6 +-- 18 files changed, 101 insertions(+), 66 deletions(-) rename man/{sum.fam.scores.Rd => add.fam.scores.Rd} (86%) diff --git a/DESCRIPTION b/DESCRIPTION index 4303fb1..806177a 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,5 +1,5 @@ Package: seqbio.variant.scoring -Title: Score evidence of rare variants' association with disease status in a family, based on relationships of individuals +Title: Relationship-informed pedigree and variant scoring Version: 0.1.0 Authors@R: person("Cameron", "Nugent", , "cam.nugent@sequencebio.com", role = c("aut", "cre"), @@ -7,15 +7,17 @@ Authors@R: Description: The seqbio.variant.scoring R package is meant to aid in comparative evaluation of families and candidate variants in rare-variant association studies. The package can be used for - two methodologically overlapping but distinct purposes: First, the prior to any genetic/genomic + two methodologically overlapping but distinct purposes. First, the prior to any genetic or genomic evaluation, evaluation of relative detection power of pedigrees, can direct recruitment efforts by showing which unsampled individuals would be the most meaningful additions to a study. Second, after sequencing and analysis, variants based on association with disease status - and familial relationships of individuals, aids in variant prioritization -License: MIT + file LICENSE + and familial relationships of individuals, aids in variant prioritization. +License: MIT Encoding: UTF-8 Roxygen: list(markdown = TRUE) RoxygenNote: 7.3.2 +VignetteBuilder: knitr Suggests: devtools, - testthat + testthat, + knitr diff --git a/NAMESPACE b/NAMESPACE index 0fe8b7e..c1ff121 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,6 +1,6 @@ # Generated by roxygen2: do not edit by hand -S3method(sum,fam.scores) +export(add.fam.scores) export(assign.status) export(build.relation.dict) export(calc.rv.score) diff --git a/R/encoding.R b/R/encoding.R index a5208f3..cf3f6d2 100644 --- a/R/encoding.R +++ b/R/encoding.R @@ -98,8 +98,6 @@ return(indiv.df) #' @param drop.unrelated Should unrelated (-1) relationships be dropped? Default = TRUE. #' #' @return A list with the categorized relationship/variant information. -#' @examples -#' #TODO - add #' @export build.relation.dict <- function( mat.row, name.stat.dict, drop.unrelated=TRUE){ indiv.rels = list( @@ -127,10 +125,9 @@ build.relation.dict <- function( mat.row, name.stat.dict, drop.unrelated=TRUE){ #' For each row, generate the encoded data for scoring. #' @param relation.mat The relationship matrix for all pairwise combinations of individuals. #' @param status.df The ID, status, and genotypes for each individual. +#' @param ... Additional arguments to be passed between methods. #' @return A dictionary with the per-individual relationship lists. #' One value for each row of the matrix. -#' @examples -#' #TODO - add #' @export encode.rows <- function(relation.mat, status.df, ...){ diff --git a/R/pedigree.r b/R/pedigree.r index f45494f..c81fc41 100644 --- a/R/pedigree.r +++ b/R/pedigree.r @@ -13,6 +13,8 @@ #' @export #' #' @examples +#' # the penetrance function is a function where values is found through optimization +#' K <- optimize(penetrance, c(0,1), 3, 1, 5, 2, 1, maximum=TRUE)$max penetrance <- function(K, a, b, c, d, n) { a*log(K) + b*log(1-K) + c*log(2-K) + sum(d*log(2^n + (1-K)*(2-K)^n) - d*(n+1)*log(2)) } @@ -24,10 +26,13 @@ penetrance <- function(K, a, b, c, d, n) { #' estimate of the amount of the genome they have inherited it from a #' common ancestor without recombination. #' -#' Can do this for the total potential relatedness in a pedigree (theoretical=TRUE), +#' Can do this for the total potential relatedness +#' in a pedigree (theoretical=TRUE), #' or for the actual relatedness across collected samples (theoretical=FALSE). -#' For the theoretical=TRUE case, in the unaffected trees, if we have a sample from the parent, -#' then the offspring do not provide any additional information for a max IBD calculation. +#' For the theoretical=TRUE case, in the unaffected trees, +#' if we have a sample from the parent, +#' then the offspring do not provide any additional information +#' for a max IBD calculation. #' This means that K does not scale with n. #' #' For theoretical=FALSE, sometimes we don’t have the healthy parent in an unaffected tree, @@ -54,6 +59,7 @@ penetrance <- function(K, a, b, c, d, n) { #' @export #' #' @examples +#' ibd <- ibd(3, 1, 5, 2, 1, 0.4576484) ibd <- function(a, b, c, d, n, K, theoretical=TRUE) { if (theoretical) { x <- sum(d) * 2^K @@ -68,20 +74,19 @@ ibd <- function(a, b, c, d, n, K, theoretical=TRUE) { #' Score the pedigrees using the pihat values. #' -#' @param pihat Estimated proportion of genome shared between individuals, from function: ibd. -#' @param K Estimated penetrance value, from function: penetrance. +#' @param pihat Estimated proportion of genome shared between individuals, +#' from function: ibd. #' -#' @return +#' @return The score value. #' @export -#' #' @examples -score <- function(pihat, K=-1) { +#' s.val <- score(12.61) +score <- function(pihat) { log(2^pihat) } #' Read in the encoded pedigree data file. -#' TODO - can we build this encoded matrix from a more standard file format? i.e. ped-like? #' #' @param filename name of the file with the data. #' @@ -89,10 +94,12 @@ score <- function(pihat, K=-1) { #' @export #' #' @examples -#' example.pedigree.file <-system.file('extdata/example_pedigree_encoding.tsv', package = 'seqbio.variant.scoring') +#' example.pedigree.file <- system.file('extdata/example_pedigree_encoding.tsv', +#' package = 'seqbio.variant.scoring') #' example.pedigree.df <- read.pedigree(example.pedigree.file) read.pedigree <- function(filename){ - h <- read.table(filename, header=TRUE, sep="\t", check.names=FALSE, colClasses=c("Family"="character")) + h <- read.table(filename, header=TRUE, sep="\t", + check.names=FALSE, colClasses=c("Family"="character")) return(h) } @@ -110,7 +117,8 @@ read.pedigree <- function(filename){ #' - No ambiguous statuses #' - No more than two sequential generations of unknown carrier status #' (non-obligate carrier vs. non-carrier). -#' Generalized support of arbitrary tree structures gets a lot more complicated, especially for the likelihood function. +#' Generalized support of arbitrary tree structures gets a lot more +#' complicated, especially for the likelihood function. #' - Exclude big giant trees of unaffecteds - related to above. #' Will slightly bias the result toward higher penetrance. #' - Exclude subjects younger than age of onset @@ -118,11 +126,13 @@ read.pedigree <- function(filename){ #' @param h A data frame containing the encoded pedigree information #' @return A data frame containing the theoretical scoring of the power of a #' family assuming you were able to collect everyone on the simplified pedigree, -#' as well as a current scoreing, examining only those for whom you currently have DNA. +#' as well as a current scoreing, examining only those for whom you currently +#' have DNA. #' @export #' #' @examples -#' example.pedigree.file <-system.file('extdata/example_pedigree_encoding.tsv', package = 'seqbio.variant.scoring') +#' example.pedigree.file <-system.file('extdata/example_pedigree_encoding.tsv', +#' package = 'seqbio.variant.scoring') #' example.pedigree.df <- read.pedigree(example.pedigree.file) #' penetrance.df <- score.pedigree(example.pedigree.df) score.pedigree <- function(h){ @@ -153,11 +163,13 @@ score.pedigree <- function(h){ d.actual <- as.numeric(strsplit(as.character(d.actual), ",")[[1]]) n.actual <- as.numeric(strsplit(as.character(n.actual), ",")[[1]]) - K <- optimize(penetrance, c(0,1), max.a, max.b, max.c, max.d, max.n, maximum=TRUE)$max + K <- optimize(penetrance, c(0,1), max.a, max.b, max.c, + max.d, max.n, maximum=TRUE)$max max.pihat <- ibd(max.a, max.b, max.c, max.d, max.n, K) - max.score <- score(max.pihat, K) - current.pihat <- ibd(a.actual, b.actual, c.actual, d.actual, n.actual, K, FALSE) - current.score <- score(current.pihat, K) + max.score <- score(max.pihat) + current.pihat <- ibd(a.actual, b.actual, c.actual, + d.actual, n.actual, K, FALSE) + current.score <- score(current.pihat) family.vec <- c(family.vec, family) penetrance.vec <- c(penetrance.vec, K) @@ -167,8 +179,10 @@ score.pedigree <- function(h){ current.score.vec <- c(current.score.vec,current.score) } - df <- data.frame(family.vec, penetrance.vec, max.pihat.vec, max.score.vec, current.pihat.vec, current.score.vec) - colnames(df) <- c("family", "penetrance", "max.pi-hat", "max.score", "current.pi-hat", "current.score") + df <- data.frame(family.vec, penetrance.vec, max.pihat.vec, max.score.vec, + current.pihat.vec, current.score.vec) + colnames(df) <- c("family", "penetrance", "max.pi-hat", "max.score", + "current.pi-hat", "current.score") df$pct.of.max <- df$current.score / df$max.score * 100 df[, -1] <- round(df[, -1], 2) return(df) diff --git a/R/relatedness.r b/R/relatedness.r index b8f6dfe..edc065b 100644 --- a/R/relatedness.r +++ b/R/relatedness.r @@ -41,7 +41,7 @@ #' Input: #' #' @param fam.list -#' - A list with the names: ['A.c', 'A.i', 'U.c', 'U.i'] +#' - A list with the names: 'A.c', 'A.i', 'U.c', 'U.i' #' respectively containing the affected correct, affected incorrect, #' unaffected correct and unaffected incorrect. #' - This can be generated with the function: score.variant.status @@ -118,6 +118,9 @@ calc.rv.score <- function(fam.list, affected.weight=1, unaffected.weight=0.5, un #' Take the matrix and subset out only the encoded individuals that are present in the status dataframe. +#' @param mat.df The full matrix file to subset +#' @param status.df The list of sampled individuals, matrix is subset to only these individuals. +#' @return A subset of the input matrix. subset.mat <- function(mat.df, status.df){ sub.ids <- rownames(mat.df)[rownames(mat.df) %in% status.df$name] sub.mat <- mat.df[sub.ids, sub.ids] @@ -165,8 +168,7 @@ subset.mat <- function(mat.df, status.df){ #' @export score.fam <- function(relation.mat, status.df, affected.weight=1, unaffected.weight=0.5, return.sums = FALSE, return.means = TRUE, - affected.only = TRUE, max.err=4, - subset.fam = TRUE){ + affected.only = TRUE, max.err=4){ #The family encoding matrix needs to be subset to include only the individuals in the status dataframe sub.relation.mat <- subset.mat(relation.mat, status.df) @@ -200,10 +202,10 @@ score.fam <- function(relation.mat, status.df, affected.weight=1, unaffected.wei #' @examples #' score.fam1 <- c("score" = 1.0, "score.for" = 2.0, "score.against" = 1.0) #' score.fam2 <- c("score" = 1.0, "score.for" = 3.0, "score.against" = 2.0) -#' # out <- sum.fam.scores(c(score.fam1, score.fam2)) +#' # out <- add.fam.scores(c(score.fam1, score.fam2)) #' #returns: c("score" = 2.0,"score.for" = 5.0, "score.against" = 3.0) #' @export -sum.fam.scores <- function(score.vec){ +add.fam.scores <- function(score.vec){ outvec<-tapply(score.vec, names(score.vec), sum) sorted.out<-c("score" = outvec[["score"]], diff --git a/man/sum.fam.scores.Rd b/man/add.fam.scores.Rd similarity index 86% rename from man/sum.fam.scores.Rd rename to man/add.fam.scores.Rd index c50f0df..847583e 100644 --- a/man/sum.fam.scores.Rd +++ b/man/add.fam.scores.Rd @@ -1,11 +1,11 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/relatedness.r -\name{sum.fam.scores} -\alias{sum.fam.scores} +\name{add.fam.scores} +\alias{add.fam.scores} \title{Sum all the given scores and return a single vector with cumulative "score", "for" and "against" vals. For use in instances where one wishes to combine scores from multiple families.} \usage{ -\method{sum}{fam.scores}(score.vec) +add.fam.scores(score.vec) } \arguments{ \item{score.vec}{A vector will all of the per family score outputs.} @@ -20,6 +20,6 @@ For use in instances where one wishes to combine scores from multiple families. \examples{ score.fam1 <- c("score" = 1.0, "score.for" = 2.0, "score.against" = 1.0) score.fam2 <- c("score" = 1.0, "score.for" = 3.0, "score.against" = 2.0) -# out <- sum.fam.scores(c(score.fam1, score.fam2)) +# out <- add.fam.scores(c(score.fam1, score.fam2)) #returns: c("score" = 2.0,"score.for" = 5.0, "score.against" = 3.0) } diff --git a/man/build.relation.dict.Rd b/man/build.relation.dict.Rd index 7aaa3d1..a964e8e 100644 --- a/man/build.relation.dict.Rd +++ b/man/build.relation.dict.Rd @@ -19,6 +19,3 @@ A list with the categorized relationship/variant information. \description{ Build dictionary with the relationships falling in the different categories for the query row. } -\examples{ -#TODO - add -} diff --git a/man/calc.rv.score.Rd b/man/calc.rv.score.Rd index 00d5abd..5352054 100644 --- a/man/calc.rv.score.Rd +++ b/man/calc.rv.score.Rd @@ -14,7 +14,7 @@ calc.rv.score( } \arguments{ \item{fam.list}{\itemize{ -\item A list with the names: \link{'A.c', 'A.i', 'U.c', 'U.i'} +\item A list with the names: 'A.c', 'A.i', 'U.c', 'U.i' respectively containing the affected correct, affected incorrect, unaffected correct and unaffected incorrect. - This can be generated with the function: score.variant.status diff --git a/man/encode.rows.Rd b/man/encode.rows.Rd index 43d78c4..44eac3a 100644 --- a/man/encode.rows.Rd +++ b/man/encode.rows.Rd @@ -11,6 +11,8 @@ encode.rows(relation.mat, status.df, ...) \item{relation.mat}{The relationship matrix for all pairwise combinations of individuals.} \item{status.df}{The ID, status, and genotypes for each individual.} + +\item{...}{Additional arguments to be passed between methods.} } \value{ A dictionary with the per-individual relationship lists. @@ -20,6 +22,3 @@ One value for each row of the matrix. Take the relationship matrix and the encoded statuses of info. For each row, generate the encoded data for scoring. } -\examples{ -#TODO - add -} diff --git a/man/ibd.Rd b/man/ibd.Rd index 0e4f4b4..16173ed 100644 --- a/man/ibd.Rd +++ b/man/ibd.Rd @@ -32,10 +32,13 @@ estimate of the amount of the genome they have inherited it from a common ancestor without recombination. } \details{ -Can do this for the total potential relatedness in a pedigree (theoretical=TRUE), +Can do this for the total potential relatedness +in a pedigree (theoretical=TRUE), or for the actual relatedness across collected samples (theoretical=FALSE). -For the theoretical=TRUE case, in the unaffected trees, if we have a sample from the parent, -then the offspring do not provide any additional information for a max IBD calculation. +For the theoretical=TRUE case, in the unaffected trees, +if we have a sample from the parent, +then the offspring do not provide any additional information +for a max IBD calculation. This means that K does not scale with n. For theoretical=FALSE, sometimes we don’t have the healthy parent in an unaffected tree, @@ -46,3 +49,6 @@ Either the parent is a non-obligate carrier, or is a non-carrier. The probability of the children depends on which of those is true, so we have the additional set of terms in the theoretical=FALSE logic. } +\examples{ +ibd <- ibd(3, 1, 5, 2, 1, 0.4576484) +} diff --git a/man/penetrance.Rd b/man/penetrance.Rd index 50a2dc5..7a0f98e 100644 --- a/man/penetrance.Rd +++ b/man/penetrance.Rd @@ -27,3 +27,7 @@ K Pedigree-based estimation of autosomal dominant penetrance rate. Likelihood function for calculation of Pedigree-based autosomal dominant penetrance value. Formula deployed via optimize so as to determine the optimal value. } +\examples{ +# the penetrance function is a function where values is found through optimization +K <- optimize(penetrance, c(0,1), 3, 1, 5, 2, 1, maximum=TRUE)$max +} diff --git a/man/read.pedigree.Rd b/man/read.pedigree.Rd index c8a5ff3..79d4210 100644 --- a/man/read.pedigree.Rd +++ b/man/read.pedigree.Rd @@ -2,8 +2,7 @@ % Please edit documentation in R/pedigree.r \name{read.pedigree} \alias{read.pedigree} -\title{Read in the encoded pedigree data file. -TODO - can we build this encoded matrix from a more standard file format? i.e. ped-like?} +\title{Read in the encoded pedigree data file.} \usage{ read.pedigree(filename) } @@ -15,9 +14,9 @@ A data frame containing the encoded pedigree information. } \description{ Read in the encoded pedigree data file. -TODO - can we build this encoded matrix from a more standard file format? i.e. ped-like? } \examples{ -example.pedigree.file <-system.file('extdata/example_pedigree_encoding.tsv', package = 'seqbio.variant.scoring') +example.pedigree.file <- system.file('extdata/example_pedigree_encoding.tsv', +package = 'seqbio.variant.scoring') example.pedigree.df <- read.pedigree(example.pedigree.file) } diff --git a/man/score.Rd b/man/score.Rd index a37d271..d5e9ee5 100644 --- a/man/score.Rd +++ b/man/score.Rd @@ -4,13 +4,18 @@ \alias{score} \title{Score the pedigrees using the pihat values.} \usage{ -score(pihat, K = -1) +score(pihat) } \arguments{ -\item{pihat}{Estimated proportion of genome shared between individuals, from function: ibd.} - -\item{K}{Estimated penetrance value, from function: penetrance.} +\item{pihat}{Estimated proportion of genome shared between individuals, +from function: ibd.} +} +\value{ +The score value. } \description{ Score the pedigrees using the pihat values. } +\examples{ +s.val <- score(12.61) +} diff --git a/man/score.fam.Rd b/man/score.fam.Rd index a8860a1..50dd576 100644 --- a/man/score.fam.Rd +++ b/man/score.fam.Rd @@ -13,8 +13,7 @@ score.fam( return.sums = FALSE, return.means = TRUE, affected.only = TRUE, - max.err = 4, - subset.fam = TRUE + max.err = 4 ) } \arguments{ diff --git a/man/score.pedigree.Rd b/man/score.pedigree.Rd index f6b0ab7..0a381ee 100644 --- a/man/score.pedigree.Rd +++ b/man/score.pedigree.Rd @@ -12,7 +12,8 @@ score.pedigree(h) \value{ A data frame containing the theoretical scoring of the power of a family assuming you were able to collect everyone on the simplified pedigree, -as well as a current scoreing, examining only those for whom you currently have DNA. +as well as a current scoreing, examining only those for whom you currently +have DNA. } \description{ Determine a value score of families by comparing their relationship structure. @@ -27,14 +28,16 @@ Simplifying assumptions: \item No ambiguous statuses \item No more than two sequential generations of unknown carrier status (non-obligate carrier vs. non-carrier). -Generalized support of arbitrary tree structures gets a lot more complicated, especially for the likelihood function. +Generalized support of arbitrary tree structures gets a lot more +complicated, especially for the likelihood function. \item Exclude big giant trees of unaffecteds - related to above. Will slightly bias the result toward higher penetrance. \item Exclude subjects younger than age of onset } } \examples{ -example.pedigree.file <-system.file('extdata/example_pedigree_encoding.tsv', package = 'seqbio.variant.scoring') +example.pedigree.file <-system.file('extdata/example_pedigree_encoding.tsv', +package = 'seqbio.variant.scoring') example.pedigree.df <- read.pedigree(example.pedigree.file) penetrance.df <- score.pedigree(example.pedigree.df) } diff --git a/man/subset.mat.Rd b/man/subset.mat.Rd index 8131b2c..e9ba75c 100644 --- a/man/subset.mat.Rd +++ b/man/subset.mat.Rd @@ -6,6 +6,14 @@ \usage{ \method{subset}{mat}(mat.df, status.df) } +\arguments{ +\item{mat.df}{The full matrix file to subset} + +\item{status.df}{The list of sampled individuals, matrix is subset to only these individuals.} +} +\value{ +A subset of the input matrix. +} \description{ Take the matrix and subset out only the encoded individuals that are present in the status dataframe. } diff --git a/tests/testthat/test_relatedness.R b/tests/testthat/test_relatedness.R index 67c59b6..03a4f5d 100644 --- a/tests/testthat/test_relatedness.R +++ b/tests/testthat/test_relatedness.R @@ -73,7 +73,7 @@ test_that("Scoring proceeds as expected in all cases", { b <- c("score"=20, "score.for"=20, "score.against"=0) c <- c("score"=30, "score.for"=40, "score.against"=10) - abc<- sum.fam.scores(c(a,b,c)) + abc<- add.fam.scores(c(a,b,c)) expected.abc <- c("score"=60, "score.for"=70, "score.against"=10) expect_equal(all.equal(names(abc), names(expected.abc)), TRUE) for(i in length(abc)){ diff --git a/vignettes/seqbio.variant.scoring-penetrance_and_ibd.Rmd b/vignettes/seqbio.variant.scoring-penetrance_and_ibd.Rmd index 0912531..5e79aad 100644 --- a/vignettes/seqbio.variant.scoring-penetrance_and_ibd.Rmd +++ b/vignettes/seqbio.variant.scoring-penetrance_and_ibd.Rmd @@ -80,7 +80,7 @@ d n With the encoded data loaded, the function `cal.penetrance` will generate both a theoretical ranking of the power of a family assuming you were able to collect everyone on the simplified pedigree, as well as a current ranking, examining only those for whom you currently have DNA. This allows evaluation of the impact on the ranking if certain other family members are enrolled in the study. ```{r} -penetrance.df <- cal.penetrance(example.pedigree.df) +penetrance.df <- score.pedigree(example.pedigree.df) penetrance.df ``` @@ -101,14 +101,14 @@ With the scoring completed, the values can be queried to learn more about the re Sorting on `current_score` shows which family has the most detection power based on collected samples. ```{r} -penetrance.df[order(penetrance.df$current_score, decreasing = TRUE),] +penetrance.df[order(penetrance.df$current.score, decreasing = TRUE),] ``` If we had to work with only what we have, then family 5031 gives the most detection power. Sorting on `max_score` shows which family could have the most detection power, if all individuals were sampled. This can be useful in targeting future sampling efforts as it shows where more samples would give the most value. ```{r} -penetrance.df[order(penetrance.df$max_score, decreasing = TRUE),] +penetrance.df[order(penetrance.df$max.score, decreasing = TRUE),] ``` Here we can see that family `0347` has a maximum score of 30.84, but a realized score of only 4.17. Given the high potential detection power for this family, it is an ideal target for future sampling efforts as current samples reveal on 13.5% of the family's potential value. From 23e03e5d18c0cbec8a95693e5bb9d913a6195864 Mon Sep 17 00:00:00 2001 From: CNuge Date: Mon, 2 Jun 2025 08:42:41 -0300 Subject: [PATCH 37/43] re-add the unit tests --- tests/testthat/test_encoding.R | 46 +++++++++++++ tests/testthat/test_io.r | 50 ++++++++++++++ tests/testthat/test_multi_fam_full_workflow.r | 33 +++++++++ .../testthat/test_single_fam_full_workflow.r | 68 +++++++++++++++++++ 4 files changed, 197 insertions(+) create mode 100644 tests/testthat/test_encoding.R create mode 100644 tests/testthat/test_io.r create mode 100644 tests/testthat/test_multi_fam_full_workflow.r create mode 100644 tests/testthat/test_single_fam_full_workflow.r diff --git a/tests/testthat/test_encoding.R b/tests/testthat/test_encoding.R new file mode 100644 index 0000000..c38fa5d --- /dev/null +++ b/tests/testthat/test_encoding.R @@ -0,0 +1,46 @@ +test_that("Families are correctly encoded.", { + + # Test status assignments + print("assign.status checks") + expect_equal(assign.status("A", "0/1"), "A.c") + expect_equal(assign.status("A", "0|1"), "A.c") + expect_equal(assign.status("A", "1|0"), "A.c") + expect_equal(assign.status("A", 1), "A.c") + + expect_equal(assign.status("A", 0), "A.i") + expect_equal(assign.status("A", "0/0"), "A.i") + expect_equal(assign.status("A", "0|0"), "A.i") + + expect_equal(assign.status("U", 1), "U.i") + expect_equal(assign.status("U", "0/1"), "U.i") + expect_equal(assign.status("U", "0|1"), "U.i") + expect_equal(assign.status("U", "1|0"), "U.i") + + expect_equal(assign.status("U", 0), "U.c") + expect_equal(assign.status("U", "0/0"), "U.c") + expect_equal(assign.status("U", "0|0"), "U.c") + + # Test score.variant.status assignments + #TODO - add the ability to encode a theoretical max + expect_error(assign.status("BAD", 0), "Status must be one of: U or A") + + expect_error(assign.status("A", "./."), "Incompatible variant value! Supported encodings are: '0' '1' '0/0' '0/1' '0|0' '0|1'") + expect_error(assign.status("U", "0|."), "Incompatible variant value! Supported encodings are: '0' '1' '0/0' '0/1' '0|0' '0|1'") + + #tsv.name1<-"Data/1234_ex2.tsv" + tsv.name1<-system.file('extdata/1234_ex2.tsv', package = 'seqbio.variant.scoring') + indiv.df <- read.indiv(tsv.name1) + + print("score values for a real family") + scores<-score.variant.status(indiv.df) + expected.scores<-c("A.c", "U.i", "A.c", "A.c", "A.i", "U.c", "A.c", "U.c") + expect_equal(scores$statvar.cat, expected.scores) + + print("theoretical.max high score values for a family") + ther.scores <- score.variant.status(indiv.df, theoretical.max=TRUE) + + expected.thermax.scores <- c("A.c","U.c","A.c","A.c","A.c" ,"U.c", "A.c", "U.c") + expect_equal(ther.scores$statvar.cat, expected.thermax.scores) + + +}) diff --git a/tests/testthat/test_io.r b/tests/testthat/test_io.r new file mode 100644 index 0000000..f1b1711 --- /dev/null +++ b/tests/testthat/test_io.r @@ -0,0 +1,50 @@ +test_that("Data are read from files correctly", { + + ################################## + # test 1 + + #tsv.name1<-"extdata/1234_ex2.tsv" + # TODO - switch all to this style + tsv.name1 <-system.file('extdata/1234_ex2.tsv', package = 'seqbio.variant.scoring') + ex1234.df <- read.indiv(tsv.name1) + #mat.name1<-"extdata/1234_ex2.mat" + mat.name1 <-system.file('extdata/1234_ex2.mat', package = 'seqbio.variant.scoring') + ex1234.mat <- read.relation.mat(mat.name1) + # TODO - could make an s3 class with the two data structures, have single func to + # wrap this and read in both files. + + #TODO - hard code expected values, run matrix comparisons. + + + ################################## + # test 2 - horizontal tabular vcf-like output + + expected.test2 = data.frame("name" = c("MS-5678-1001", "MS-5678-1002", "MS-5678-1004", + "MS-5678-6001", "MS-5678-1003"), + "status" = c("A", "U", "U", "U", "A"), + "variant" = c("0/1", "0/0", "0/0", "0/0", "0/1")) + infile.test2 <-system.file('extdata/example_vcf_extract_5678.tsv', + package = 'seqbio.variant.scoring') + + test2.df <- read.var.table(infile.test2) + expect_equal(test2.df, expected.test2) + + + ################################## + # test 3 - horizontal tabular vcf-like output with phased + expected.test3 = data.frame("name" = c("MS-9876-1002", "MS-9876-1009", "MS-9876-1006", "MS-9876-1004", + "MS-9876-1007", "MS-9876-1003", "MS-9876-1001", "MS-9876-1008", "MS-9876-1005"), + "status" = c("U","U","A","A","U", "U","A","U","U"), + "variant" = c("0|0","0|0","0|0","0|1","0|0","0|0","0|1","0|0","0|1")) + infile.test3 <-system.file('extdata/example_vcf_extract_9876.tsv', + package = 'seqbio.variant.scoring') + + test3.df <- read.var.table(infile.test3) + expect_equal(test3.df, expected.test3) + + ################################## + # test 4 - TODO run a test where there are more people in the + # matrix than there are in the status df, make sure that subsetting is performed + # correctly. + +}) diff --git a/tests/testthat/test_multi_fam_full_workflow.r b/tests/testthat/test_multi_fam_full_workflow.r new file mode 100644 index 0000000..612a416 --- /dev/null +++ b/tests/testthat/test_multi_fam_full_workflow.r @@ -0,0 +1,33 @@ +test_that("Scoring of variant in family multiple families proceeds as expected", { + + ################################## + # test 1 + + mat.name1<-system.file('extdata/5678_ex1.mat', package = 'seqbio.variant.scoring') + tsv.name1<-system.file('extdata/5678_ex1.tsv', package = 'seqbio.variant.scoring') + + mat.name2<-system.file('extdata/9876_ex1.mat', package = 'seqbio.variant.scoring') + tsv.name2<-system.file('extdata/9876_ex1.tsv', package = 'seqbio.variant.scoring') + + mat.name3<-system.file('extdata/5432_ex1.mat', package = 'seqbio.variant.scoring') + tsv.name3<-system.file('extdata/5432_ex1.tsv', package = 'seqbio.variant.scoring') + + mat.fnames <- c(mat.name1, mat.name2, mat.name3) + tsv.names <- c(tsv.name1, tsv.name2, tsv.name3) + + mat.df <- lapply(mat.fnames, read.relation.mat) + indv.df <- lapply(tsv.names, read.indiv) + + status.df <- lapply(indv.df, score.variant.status) + + bulk.scores <- mapply(score.fam, mat.df, status.df, affected.weight=1, unaffected.weight=0.5, return.sums = TRUE) + + #TODO - formalize this for better interface with bulk processing + summed.scores<-add.fam.scores(c(bulk.scores[,1], bulk.scores[,2], bulk.scores[,3 ])) + + expected.summed.scores <- c("score" = 102.75,"score.for" = 212.75, "score.against" = 110.0) + expect_equal(all.equal(expected.summed.scores, summed.scores), TRUE) + + #TODO -possibly refactor above into a "score families" bulk function + +}) diff --git a/tests/testthat/test_single_fam_full_workflow.r b/tests/testthat/test_single_fam_full_workflow.r new file mode 100644 index 0000000..478edf1 --- /dev/null +++ b/tests/testthat/test_single_fam_full_workflow.r @@ -0,0 +1,68 @@ +#library(seqbio.variant.scoring) +test_that("Scoring of variant in family single family proceeds as expected", { + + ################################## + # test 1 + + mat.name1<-system.file('extdata/1234_ex2.mat', package = 'seqbio.variant.scoring') + ex1234.mat <- read.relation.mat(mat.name1) + tsv.name1<-system.file('extdata/1234_ex2.tsv', package = 'seqbio.variant.scoring') + ex1234.df <- read.indiv(tsv.name1) + + ex1234.df.status <- score.variant.status(ex1234.df) + + ex2.1234.score.default <- score.fam(ex1234.mat, ex1234.df.status) + expected.ex2.score.custom <- c("score"=21.3, "score.for"=27.6, "score.against"=6.3) + expect_equal(ex2.1234.score.default, expected.ex2.score.custom) + + #should give a single list with the family totals + ex2.1234.score.custom <- score.fam(ex1234.mat, ex1234.df.status, + affected.weight=1, unaffected.weight=0.5, + return.sums = TRUE, affected.only = FALSE) + + expected.ex2.score.custom <- c("score"=178.5, "score.for"=244, "score.against"=65.5) + expect_equal(ex2.1234.score.custom,expected.ex2.score.custom) + + #should give the row-by-row + ex2.1234.score.indiv.a.only <- score.fam(ex1234.mat, ex1234.df.status, affected.weight=1, unaffected.weight=0.5, + return.means = FALSE) + + expected.ex2.1234.score.indiv.a.only <- data.frame("score" = c(7., 9., 33.5, 31.5, 25.5), + "score.for" = c(19., 21., 34.5, 33.5, 30.), + "score.against" = c(12., 12., 1., 2., 4.5)) + rownames(expected.ex2.1234.score.indiv.a.only) <- c("MS-1234-1001", "MS-1234-1003", "MS-1234-1004", + "MS-1234-1005", "MS-1234-5001") + + expect_equal(all.equal(expected.ex2.1234.score.indiv.a.only, ex2.1234.score.indiv.a.only), TRUE) + + #affected only calculation + ex2.1234.score.affected.only <- score.fam(ex1234.mat, ex1234.df.status, affected.weight=1, unaffected.weight=0.0) + expected.ex2.score.affected.only <- c("score"=19.4, "score.for"=23.6, "score.against"=4.2) + expect_equal(ex2.1234.score.affected.only,expected.ex2.score.affected.only) + + + #affected only -per indiv + ex2.1234.score.indiv.no.u <- score.fam(ex1234.mat, ex1234.df.status, + affected.weight=1, unaffected.weight=0.0, + return.means = FALSE, + affected.only = TRUE) + + expected.ex2.1234.score.indiv.no.u <- data.frame("score" = c(5, 7, 33, 31, 21), + "score.for" = c(13, 15, 33, 32, 25), + "score.against" = c(8, 8, 0, 1, 4)) + rownames(expected.ex2.1234.score.indiv.no.u) <- c("MS-1234-1001", "MS-1234-1003", "MS-1234-1004", + "MS-1234-1005", "MS-1234-5001") + + expect_equal(all.equal(ex2.1234.score.indiv.no.u, expected.ex2.1234.score.indiv.no.u), TRUE) + + + + + # possibly - mean of scores? Account for not inflating pedigrees with lots of people, or is that a feature not a bug. + + # TODO - try dropping individuals from the matrix, see if it still works. (should be OK) + + # TODO - option to wrap the read/status/score and do it all in one step. basically make it a one liner. + + +}) From 3126af62404bfad83252bd21f362ea5ca00914b2 Mon Sep 17 00:00:00 2001 From: CNuge Date: Mon, 2 Jun 2025 08:46:56 -0300 Subject: [PATCH 38/43] renames to address notes --- DESCRIPTION | 2 +- LICENSE.md => LICENSE | 0 vignettes/seqbio.variant.scoring-penetrance_and_ibd.Rmd | 2 +- vignettes/seqbio.variant.scoring-variant_scoring.Rmd | 2 +- 4 files changed, 3 insertions(+), 3 deletions(-) rename LICENSE.md => LICENSE (100%) diff --git a/DESCRIPTION b/DESCRIPTION index 806177a..64fcd65 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -12,7 +12,7 @@ Description: efforts by showing which unsampled individuals would be the most meaningful additions to a study. Second, after sequencing and analysis, variants based on association with disease status and familial relationships of individuals, aids in variant prioritization. -License: MIT +License: MIT + file LICENSE Encoding: UTF-8 Roxygen: list(markdown = TRUE) RoxygenNote: 7.3.2 diff --git a/LICENSE.md b/LICENSE similarity index 100% rename from LICENSE.md rename to LICENSE diff --git a/vignettes/seqbio.variant.scoring-penetrance_and_ibd.Rmd b/vignettes/seqbio.variant.scoring-penetrance_and_ibd.Rmd index 5e79aad..dcca8f7 100644 --- a/vignettes/seqbio.variant.scoring-penetrance_and_ibd.Rmd +++ b/vignettes/seqbio.variant.scoring-penetrance_and_ibd.Rmd @@ -4,7 +4,7 @@ author: "Cameron M. Nugent" data: "`r Sys.Date()`" output: pdf_document #rmarkdown::html_vignette # vignette: > - %\VignetteIndexEntry{seqbio.variant.scoring-vignette} + %\VignetteIndexEntry{seqbio.variant.scoring-penetrance_and_ibd} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- diff --git a/vignettes/seqbio.variant.scoring-variant_scoring.Rmd b/vignettes/seqbio.variant.scoring-variant_scoring.Rmd index 0cf9a5f..d6978ff 100644 --- a/vignettes/seqbio.variant.scoring-variant_scoring.Rmd +++ b/vignettes/seqbio.variant.scoring-variant_scoring.Rmd @@ -4,7 +4,7 @@ author: "Cameron M. Nugent" data: "`r Sys.Date()`" output: pdf_document #rmarkdown::html_vignette # vignette: > - %\VignetteIndexEntry{seqbio.variant.scoring-vignette} + %\VignetteIndexEntry{seqbio.variant.scoring-variant_scoring} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- From 6128c0a6410c7767737810b0b69aa856a21a624a Mon Sep 17 00:00:00 2001 From: CNuge Date: Fri, 6 Jun 2025 13:42:19 -0300 Subject: [PATCH 39/43] rename throughout --- DESCRIPTION | 4 +-- ...io.variant.scoring.Rproj => KinformR.Rproj | 0 LICENSE | 2 +- R/io.R | 6 ++-- R/pedigree.r | 4 +-- R/relatedness.r | 4 +-- README.md | 32 +++++++++---------- man/read.indiv.Rd | 2 +- man/read.pedigree.Rd | 2 +- man/read.relation.mat.Rd | 2 +- man/read.var.table.Rd | 2 +- man/score.fam.Rd | 4 +-- man/score.pedigree.Rd | 2 +- tests/testthat.R | 4 +-- tests/testthat/test_encoding.R | 2 +- tests/testthat/test_io.r | 8 ++--- tests/testthat/test_multi_fam_full_workflow.r | 12 +++---- tests/testthat/test_pedigree_eval.R | 6 ++-- .../testthat/test_single_fam_full_workflow.r | 6 ++-- ...bd.Rmd => KinformR-penetrance_and_ibd.Rmd} | 8 ++--- ...oring.Rmd => KinformR-variant_scoring.Rmd} | 12 +++---- 21 files changed, 62 insertions(+), 62 deletions(-) rename seqbio.variant.scoring.Rproj => KinformR.Rproj (100%) rename vignettes/{seqbio.variant.scoring-penetrance_and_ibd.Rmd => KinformR-penetrance_and_ibd.Rmd} (96%) rename vignettes/{seqbio.variant.scoring-variant_scoring.Rmd => KinformR-variant_scoring.Rmd} (92%) diff --git a/DESCRIPTION b/DESCRIPTION index 64fcd65..9e59962 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,11 +1,11 @@ -Package: seqbio.variant.scoring +Package: KinformR Title: Relationship-informed pedigree and variant scoring Version: 0.1.0 Authors@R: person("Cameron", "Nugent", , "cam.nugent@sequencebio.com", role = c("aut", "cre"), comment = c(ORCID = "0000-0002-1135-2605")) Description: - The seqbio.variant.scoring R package is meant to aid in comparative evaluation of families + The KinformR R package is meant to aid in comparative evaluation of families and candidate variants in rare-variant association studies. The package can be used for two methodologically overlapping but distinct purposes. First, the prior to any genetic or genomic evaluation, evaluation of relative detection power of pedigrees, can direct recruitment diff --git a/seqbio.variant.scoring.Rproj b/KinformR.Rproj similarity index 100% rename from seqbio.variant.scoring.Rproj rename to KinformR.Rproj diff --git a/LICENSE b/LICENSE index 9cd26d1..d30f53e 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ # MIT License -Copyright (c) 2025 seqbio.variant.scoring Sequence Bioinformatics Inc. +Copyright (c) 2025 KinformR Sequence Bioinformatics Inc. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/R/io.R b/R/io.R index 6b64245..cc7da22 100644 --- a/R/io.R +++ b/R/io.R @@ -7,7 +7,7 @@ #' MS-5678-1001 A 0/1 #' @return A data frame. #' @examples -#' tsv.name1 <-system.file('extdata/1234_ex2.tsv', package = 'seqbio.variant.scoring') +#' tsv.name1 <-system.file('extdata/1234_ex2.tsv', package = 'KinformR') #' id.df1 <- read.indiv(tsv.name1) #' @export read.indiv <- function(fname){ @@ -24,7 +24,7 @@ read.indiv <- function(fname){ #' @param fname The file with the relationship matrix information. #' @return A matrix with the relationships and individual ids as rownames and colnames. #' @examples -#' mat.name1 <-system.file('extdata/1234_ex2.mat', package = 'seqbio.variant.scoring') +#' mat.name1 <-system.file('extdata/1234_ex2.mat', package = 'KinformR') #' mat1 <- read.relation.mat(mat.name1) #' @export read.relation.mat <- function(fname){ @@ -57,7 +57,7 @@ read.relation.mat <- function(fname){ #' MS-5678-1001 A 0/1 #' @examples #' ex.infile <-system.file('extdata/example_vcf_extract_5678.tsv', -#' package = 'seqbio.variant.scoring') +#' package = 'KinformR') #' read.var.table(ex.infile) #' @export read.var.table <- function(fname){ diff --git a/R/pedigree.r b/R/pedigree.r index c81fc41..cf80331 100644 --- a/R/pedigree.r +++ b/R/pedigree.r @@ -95,7 +95,7 @@ score <- function(pihat) { #' #' @examples #' example.pedigree.file <- system.file('extdata/example_pedigree_encoding.tsv', -#' package = 'seqbio.variant.scoring') +#' package = 'KinformR') #' example.pedigree.df <- read.pedigree(example.pedigree.file) read.pedigree <- function(filename){ h <- read.table(filename, header=TRUE, sep="\t", @@ -132,7 +132,7 @@ read.pedigree <- function(filename){ #' #' @examples #' example.pedigree.file <-system.file('extdata/example_pedigree_encoding.tsv', -#' package = 'seqbio.variant.scoring') +#' package = 'KinformR') #' example.pedigree.df <- read.pedigree(example.pedigree.file) #' penetrance.df <- score.pedigree(example.pedigree.df) score.pedigree <- function(h){ diff --git a/R/relatedness.r b/R/relatedness.r index edc065b..b12974e 100644 --- a/R/relatedness.r +++ b/R/relatedness.r @@ -159,8 +159,8 @@ subset.mat <- function(mat.df, status.df){ #' of points for or against. This simplifies scoring and allows for fast filtering of poor quality variants. Default is 4. #' @return A labelled vector with names: score, score.for, score.against #' @examples -#' mat.name1<-system.file('extdata/1234_ex2.mat', package = 'seqbio.variant.scoring') -#' tsv.name1<-system.file('extdata/1234_ex2.tsv', package = 'seqbio.variant.scoring') +#' mat.name1<-system.file('extdata/1234_ex2.mat', package = 'KinformR') +#' tsv.name1<-system.file('extdata/1234_ex2.tsv', package = 'KinformR') #' mat.df <- read.relation.mat(mat.name1) #' ind.df <- read.indiv(tsv.name1) #' ind.df.status <- score.variant.status(ind.df) diff --git a/README.md b/README.md index 40544e9..c2310e1 100644 --- a/README.md +++ b/README.md @@ -1,43 +1,43 @@ -# seqbio.variant.scoring -## An R package for relationship-informed pedigree and variant scoring +# KinformR +## The Kinship Informer: An R package for relationship-informed pedigree and variant scoring ## Introduction -Family-based genetic studies are effective for identifying rare variants underlying heritable diseases, yet are often challenged by issues such as incomplete penetrance and the difficulty of prioritizing numerous candidate variants. The proportion of the genome with identity-by-descent (IBD) and estimates of penetrance are metrics that can show the value of different pedigrees in family-based studies. Additionally, IBD and the genotypes of individuals are combined by seqbio.variant.scoringto to score the value of candidate variants. +Family-based genetic studies are effective for identifying rare variants underlying heritable diseases, yet are often challenged by issues such as incomplete penetrance and the difficulty of prioritizing numerous candidate variants. The proportion of the genome with identity-by-descent (IBD) and estimates of penetrance are metrics that can show the value of different pedigrees in family-based studies. Additionally, IBD and the genotypes of individuals are combined by KinformRto to score the value of candidate variants. -The `seqbio.variant.scoring` R package is meant to aid in comparative evaluation of families and candidate variants in rare-variant association studies. The package can be used for two methodologically overlapping but distinct purposes: 1) prior to any genetic/genomic evaluation, evaluation of relative detection power of pedigrees, can direct recruitment efforts by showing which unsampled individuals would be the most meaningful additions to a study, and 2) after sequencing and analysis, variants based on association with disease status and familial relationships of individuals, aids in variant prioritization +The `KinformR` R package is meant to aid in comparative evaluation of families and candidate variants in rare-variant association studies. The package can be used for two methodologically overlapping but distinct purposes: 1) prior to any genetic/genomic evaluation, evaluation of relative detection power of pedigrees, can direct recruitment efforts by showing which unsampled individuals would be the most meaningful additions to a study, and 2) after sequencing and analysis, variants based on association with disease status and familial relationships of individuals, aids in variant prioritization ## Installation -The development version of `seqbio.variant.scoring` can be installed directly from GitHub. You'll need to have the R package `devtools` installed and loaded. Also note if the build_vignettes option is set to true, you will need to have the R package `knitr` installed. +The development version of `KinformR` can be installed directly from GitHub. You'll need to have the R package `devtools` installed and loaded. Also note if the build_vignettes option is set to true, you will need to have the R package `knitr` installed. ``` #install.packages("devtools") #install.packages("knitr") #required if build_vignettes = TRUE #library(devtools) -devtools::install_github("SequenceBio/seqbio.variant.scoring", build_vignettes = TRUE) -library(seqbio.variant.scoring) +devtools::install_github("SequenceBio/KinformR", build_vignettes = TRUE) +library(KinformR) ``` ## How it works The package's vignette contains detailed explanations of the functions and parameters. -For a walk through of the `seqbio.variant.scoring` functions for scoring the value of *families* based on penetrance and IBD, see the corresponging vignette file: -`vignettes/seqbio.variant.scoring-penetrance_and_ibd.Rmd` +For a walk through of the `KinformR` functions for scoring the value of *families* based on penetrance and IBD, see the corresponging vignette file: +`vignettes/KinformR-penetrance_and_ibd.Rmd` or within R, run: ``` -vignette('seqbio.variant.scoring-penetrance_and_ibd') +vignette('KinformR-penetrance_and_ibd') ``` -For a walk through of the `seqbio.variant.scoring` functions for scoring the value of *variants* within families, see the corresponging vignette file: -`vignettes/seqbio.variant.scoring-variant_scoring.Rmd` +For a walk through of the `KinformR` functions for scoring the value of *variants* within families, see the corresponging vignette file: +`vignettes/KinformR-variant_scoring.Rmd` or within R, run: ``` -vignette('seqbio.variant.scoring-variant_scoring') +vignette('KinformR-variant_scoring') ``` ## Scoring families @@ -46,7 +46,7 @@ vignette('seqbio.variant.scoring-variant_scoring') See the included example data, which encodes 14 families. See the accompanying vignette for more information on encoding pedigrees: ``` - example.pedigree.file <-system.file('extdata/example_pedigree_encoding.tsv', package = 'seqbio.variant.scoring') + example.pedigree.file <-system.file('extdata/example_pedigree_encoding.tsv', package = 'KinformR') ``` The data can be loaded with the following function: ``` @@ -67,7 +67,7 @@ When looking at shared rare variants across families, not all sets of affected a This input is a matrix containing all the pairwise relationships of individuals in a family. The row and column names are the individual IDs, and the intersecting value denotes the degree of relationship between the individuals (self = 0, 1st degree relations. = 1, etc. Unrelated individuals are given a value of -1). As of version `0.1.0` the relation matrix is a manually created file, where relationship values are assigned via manual inspection of the family pedigree. ``` -mat.name<-system.file('extdata/1234_ex2.mat', package = 'seqbio.variant.scoring') +mat.name<-system.file('extdata/1234_ex2.mat', package = 'KinformR') rel.mat <- read.relation.mat(mat.name) ``` @@ -77,7 +77,7 @@ rel.mat <- read.relation.mat(mat.name) This file includes the same individual IDs used in the relationship matrix as well as the disease and variant status for all individuals. ``` -tsv.name<-system.file('extdata/1234_ex2.tsv', package = 'seqbio.variant.scoring') +tsv.name<-system.file('extdata/1234_ex2.tsv', package = 'KinformR') ind.df <- read.indiv(tsv.name) ind.df.status <- score.variant.status(ex1234.df) diff --git a/man/read.indiv.Rd b/man/read.indiv.Rd index ec59145..d51ac5f 100644 --- a/man/read.indiv.Rd +++ b/man/read.indiv.Rd @@ -18,6 +18,6 @@ A data frame. Read in variant and status info for individuals. } \examples{ -tsv.name1 <-system.file('extdata/1234_ex2.tsv', package = 'seqbio.variant.scoring') +tsv.name1 <-system.file('extdata/1234_ex2.tsv', package = 'KinformR') id.df1 <- read.indiv(tsv.name1) } diff --git a/man/read.pedigree.Rd b/man/read.pedigree.Rd index 79d4210..b9e4ecf 100644 --- a/man/read.pedigree.Rd +++ b/man/read.pedigree.Rd @@ -17,6 +17,6 @@ Read in the encoded pedigree data file. } \examples{ example.pedigree.file <- system.file('extdata/example_pedigree_encoding.tsv', -package = 'seqbio.variant.scoring') +package = 'KinformR') example.pedigree.df <- read.pedigree(example.pedigree.file) } diff --git a/man/read.relation.mat.Rd b/man/read.relation.mat.Rd index cdf64e7..65592bb 100644 --- a/man/read.relation.mat.Rd +++ b/man/read.relation.mat.Rd @@ -18,6 +18,6 @@ Row/column intersections give the degree of relationship for the two individuals. 0 = self, -1 = unrelated. } \examples{ -mat.name1 <-system.file('extdata/1234_ex2.mat', package = 'seqbio.variant.scoring') +mat.name1 <-system.file('extdata/1234_ex2.mat', package = 'KinformR') mat1 <- read.relation.mat(mat.name1) } diff --git a/man/read.var.table.Rd b/man/read.var.table.Rd index ee7f129..2ad5848 100644 --- a/man/read.var.table.Rd +++ b/man/read.var.table.Rd @@ -26,6 +26,6 @@ be encoded in a specific fashion for you current purposes. } \examples{ ex.infile <-system.file('extdata/example_vcf_extract_5678.tsv', - package = 'seqbio.variant.scoring') + package = 'KinformR') read.var.table(ex.infile) } diff --git a/man/score.fam.Rd b/man/score.fam.Rd index 50dd576..3e7d09a 100644 --- a/man/score.fam.Rd +++ b/man/score.fam.Rd @@ -57,8 +57,8 @@ NOTE: if affected.only = True, the averages and sums are calculated using only t } } \examples{ -mat.name1<-system.file('extdata/1234_ex2.mat', package = 'seqbio.variant.scoring') -tsv.name1<-system.file('extdata/1234_ex2.tsv', package = 'seqbio.variant.scoring') +mat.name1<-system.file('extdata/1234_ex2.mat', package = 'KinformR') +tsv.name1<-system.file('extdata/1234_ex2.tsv', package = 'KinformR') mat.df <- read.relation.mat(mat.name1) ind.df <- read.indiv(tsv.name1) ind.df.status <- score.variant.status(ind.df) diff --git a/man/score.pedigree.Rd b/man/score.pedigree.Rd index 0a381ee..ff03942 100644 --- a/man/score.pedigree.Rd +++ b/man/score.pedigree.Rd @@ -37,7 +37,7 @@ Will slightly bias the result toward higher penetrance. } \examples{ example.pedigree.file <-system.file('extdata/example_pedigree_encoding.tsv', -package = 'seqbio.variant.scoring') +package = 'KinformR') example.pedigree.df <- read.pedigree(example.pedigree.file) penetrance.df <- score.pedigree(example.pedigree.df) } diff --git a/tests/testthat.R b/tests/testthat.R index e2341c4..f824997 100644 --- a/tests/testthat.R +++ b/tests/testthat.R @@ -1,5 +1,5 @@ library(testthat) -library(seqbio.variant.scoring) +library(KinformR) -test_check("seqbio.variant.scoring") +test_check("KinformR") diff --git a/tests/testthat/test_encoding.R b/tests/testthat/test_encoding.R index c38fa5d..3513a9d 100644 --- a/tests/testthat/test_encoding.R +++ b/tests/testthat/test_encoding.R @@ -28,7 +28,7 @@ test_that("Families are correctly encoded.", { expect_error(assign.status("U", "0|."), "Incompatible variant value! Supported encodings are: '0' '1' '0/0' '0/1' '0|0' '0|1'") #tsv.name1<-"Data/1234_ex2.tsv" - tsv.name1<-system.file('extdata/1234_ex2.tsv', package = 'seqbio.variant.scoring') + tsv.name1<-system.file('extdata/1234_ex2.tsv', package = 'KinformR') indiv.df <- read.indiv(tsv.name1) print("score values for a real family") diff --git a/tests/testthat/test_io.r b/tests/testthat/test_io.r index f1b1711..561dc8c 100644 --- a/tests/testthat/test_io.r +++ b/tests/testthat/test_io.r @@ -5,10 +5,10 @@ test_that("Data are read from files correctly", { #tsv.name1<-"extdata/1234_ex2.tsv" # TODO - switch all to this style - tsv.name1 <-system.file('extdata/1234_ex2.tsv', package = 'seqbio.variant.scoring') + tsv.name1 <-system.file('extdata/1234_ex2.tsv', package = 'KinformR') ex1234.df <- read.indiv(tsv.name1) #mat.name1<-"extdata/1234_ex2.mat" - mat.name1 <-system.file('extdata/1234_ex2.mat', package = 'seqbio.variant.scoring') + mat.name1 <-system.file('extdata/1234_ex2.mat', package = 'KinformR') ex1234.mat <- read.relation.mat(mat.name1) # TODO - could make an s3 class with the two data structures, have single func to # wrap this and read in both files. @@ -24,7 +24,7 @@ test_that("Data are read from files correctly", { "status" = c("A", "U", "U", "U", "A"), "variant" = c("0/1", "0/0", "0/0", "0/0", "0/1")) infile.test2 <-system.file('extdata/example_vcf_extract_5678.tsv', - package = 'seqbio.variant.scoring') + package = 'KinformR') test2.df <- read.var.table(infile.test2) expect_equal(test2.df, expected.test2) @@ -37,7 +37,7 @@ test_that("Data are read from files correctly", { "status" = c("U","U","A","A","U", "U","A","U","U"), "variant" = c("0|0","0|0","0|0","0|1","0|0","0|0","0|1","0|0","0|1")) infile.test3 <-system.file('extdata/example_vcf_extract_9876.tsv', - package = 'seqbio.variant.scoring') + package = 'KinformR') test3.df <- read.var.table(infile.test3) expect_equal(test3.df, expected.test3) diff --git a/tests/testthat/test_multi_fam_full_workflow.r b/tests/testthat/test_multi_fam_full_workflow.r index 612a416..b65a81b 100644 --- a/tests/testthat/test_multi_fam_full_workflow.r +++ b/tests/testthat/test_multi_fam_full_workflow.r @@ -3,14 +3,14 @@ test_that("Scoring of variant in family multiple families proceeds as expected", ################################## # test 1 - mat.name1<-system.file('extdata/5678_ex1.mat', package = 'seqbio.variant.scoring') - tsv.name1<-system.file('extdata/5678_ex1.tsv', package = 'seqbio.variant.scoring') + mat.name1<-system.file('extdata/5678_ex1.mat', package = 'KinformR') + tsv.name1<-system.file('extdata/5678_ex1.tsv', package = 'KinformR') - mat.name2<-system.file('extdata/9876_ex1.mat', package = 'seqbio.variant.scoring') - tsv.name2<-system.file('extdata/9876_ex1.tsv', package = 'seqbio.variant.scoring') + mat.name2<-system.file('extdata/9876_ex1.mat', package = 'KinformR') + tsv.name2<-system.file('extdata/9876_ex1.tsv', package = 'KinformR') - mat.name3<-system.file('extdata/5432_ex1.mat', package = 'seqbio.variant.scoring') - tsv.name3<-system.file('extdata/5432_ex1.tsv', package = 'seqbio.variant.scoring') + mat.name3<-system.file('extdata/5432_ex1.mat', package = 'KinformR') + tsv.name3<-system.file('extdata/5432_ex1.tsv', package = 'KinformR') mat.fnames <- c(mat.name1, mat.name2, mat.name3) tsv.names <- c(tsv.name1, tsv.name2, tsv.name3) diff --git a/tests/testthat/test_pedigree_eval.R b/tests/testthat/test_pedigree_eval.R index 3dddf04..d3f010b 100644 --- a/tests/testthat/test_pedigree_eval.R +++ b/tests/testthat/test_pedigree_eval.R @@ -2,20 +2,20 @@ test_that("Data are read from files correctly", { ################################## # test 1 - example.pedigree.file <-system.file('extdata/example_pedigree_encoding.tsv', package = 'seqbio.variant.scoring') + example.pedigree.file <-system.file('extdata/example_pedigree_encoding.tsv', package = 'KinformR') example.pedigree.df <- read.pedigree(example.pedigree.file) penetrance.df <- score.pedigree(example.pedigree.df) ################################## # test 2 - example.pedigree.file2 <-system.file('extdata/example_pedigree_encoding2.tsv', package = 'seqbio.variant.scoring') + example.pedigree.file2 <-system.file('extdata/example_pedigree_encoding2.tsv', package = 'KinformR') #example.pedigree.file2 <- "inst/extdata/example_pedigree_encoding2.tsv" example.pedigree.df2 <- read.pedigree(example.pedigree.file2) penetrance.df2 <- score.pedigree(example.pedigree.df2) ################################## # test 3 - example.pedigree.file3 <-system.file('extdata/example_pedigree_encoding3.tsv', package = 'seqbio.variant.scoring') + example.pedigree.file3 <-system.file('extdata/example_pedigree_encoding3.tsv', package = 'KinformR') #example.pedigree.file3 <- "inst/extdata/example_pedigree_encoding3.tsv" example.pedigree.df3 <- read.pedigree(example.pedigree.file3) penetrance.df3 <- score.pedigree(example.pedigree.df3) diff --git a/tests/testthat/test_single_fam_full_workflow.r b/tests/testthat/test_single_fam_full_workflow.r index 478edf1..b9627bd 100644 --- a/tests/testthat/test_single_fam_full_workflow.r +++ b/tests/testthat/test_single_fam_full_workflow.r @@ -1,12 +1,12 @@ -#library(seqbio.variant.scoring) +#library(KinformR) test_that("Scoring of variant in family single family proceeds as expected", { ################################## # test 1 - mat.name1<-system.file('extdata/1234_ex2.mat', package = 'seqbio.variant.scoring') + mat.name1<-system.file('extdata/1234_ex2.mat', package = 'KinformR') ex1234.mat <- read.relation.mat(mat.name1) - tsv.name1<-system.file('extdata/1234_ex2.tsv', package = 'seqbio.variant.scoring') + tsv.name1<-system.file('extdata/1234_ex2.tsv', package = 'KinformR') ex1234.df <- read.indiv(tsv.name1) ex1234.df.status <- score.variant.status(ex1234.df) diff --git a/vignettes/seqbio.variant.scoring-penetrance_and_ibd.Rmd b/vignettes/KinformR-penetrance_and_ibd.Rmd similarity index 96% rename from vignettes/seqbio.variant.scoring-penetrance_and_ibd.Rmd rename to vignettes/KinformR-penetrance_and_ibd.Rmd index dcca8f7..6c950dc 100644 --- a/vignettes/seqbio.variant.scoring-penetrance_and_ibd.Rmd +++ b/vignettes/KinformR-penetrance_and_ibd.Rmd @@ -1,16 +1,16 @@ --- -title: "seqbio.variant.scoring - penetrance and idb informed scoring of families" +title: "KinformR - penetrance and idb informed scoring of families" author: "Cameron M. Nugent" data: "`r Sys.Date()`" output: pdf_document #rmarkdown::html_vignette # vignette: > - %\VignetteIndexEntry{seqbio.variant.scoring-penetrance_and_ibd} + %\VignetteIndexEntry{KinformR-penetrance_and_ibd} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- ```{r loadlib, echo = TRUE, results = 'hide', message=FALSE, warning=FALSE} -library(seqbio.variant.scoring) +library(KinformR) ``` @@ -28,7 +28,7 @@ The `cal.penetrance` function generates both a theoretical ranking of the power The family power calculations depend on a single tab-delimited input file, where each row represents a family. The input file is read inusing the `read.pedigree` function. ```{r} -example.pedigree.file <-system.file('extdata/example_pedigree_encoding.tsv', package = 'seqbio.variant.scoring') +example.pedigree.file <-system.file('extdata/example_pedigree_encoding.tsv', package = 'KinformR') example.pedigree.df <- read.pedigree(example.pedigree.file) diff --git a/vignettes/seqbio.variant.scoring-variant_scoring.Rmd b/vignettes/KinformR-variant_scoring.Rmd similarity index 92% rename from vignettes/seqbio.variant.scoring-variant_scoring.Rmd rename to vignettes/KinformR-variant_scoring.Rmd index d6978ff..92d4d38 100644 --- a/vignettes/seqbio.variant.scoring-variant_scoring.Rmd +++ b/vignettes/KinformR-variant_scoring.Rmd @@ -1,22 +1,22 @@ --- -title: "seqbio.variant.scoring - pedigree-informed rare variant association and penetrance scoring" +title: "KinformR - pedigree-informed rare variant association and penetrance scoring" author: "Cameron M. Nugent" data: "`r Sys.Date()`" output: pdf_document #rmarkdown::html_vignette # vignette: > - %\VignetteIndexEntry{seqbio.variant.scoring-variant_scoring} + %\VignetteIndexEntry{KinformR-variant_scoring} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- ## Introduction -When looking at rare variants within a family, not all affected and unaffected individuals are of equal importance. `seqbio.variant.scoring` is an R package meant to aid in comparative evaluation of information in family-based rare-variant association studies; the package considers evidence of rare variant's association with disease status in a family and scores the variant based on relationships of individuals in a pedigree. Comparing scores across candidates can thereby help in assessing their relative merit. +When looking at rare variants within a family, not all affected and unaffected individuals are of equal importance. `KinformR` is an R package meant to aid in comparative evaluation of information in family-based rare-variant association studies; the package considers evidence of rare variant's association with disease status in a family and scores the variant based on relationships of individuals in a pedigree. Comparing scores across candidates can thereby help in assessing their relative merit. The program leverages Wright’s coefficient of relatedness to score families based on the relationship of individuals as well as their disease status and genotypes for a given variant. A candidate variant is considered strongest when it is shared by an affected individual and other distantly-related affected family members and not shared by closely related unaffected family members. The theory behind this is that more distantly related individuals have a smaller proportion of their genome that is IBD, so the chance of a false positive (shared variant through chance, not due to association with disease) is lower. Conversely, when a closely related relative is unaffected, and the affected and unaffected individuals have different genotypes for the candidate variant, this provides evidence that the disease is not associated with the large IBD portion of the genome shared by the two individuals and thereby gives evidence in favour of the candidate. ```{r loadlib, echo = TRUE, results = 'hide', message=FALSE, warning=FALSE} -library(seqbio.variant.scoring) +library(KinformR) ``` @@ -28,7 +28,7 @@ This input is a matrix containing all the pairwise relationships of individuals The function `read.relation.mat` ```{r} -mat.name1<-system.file('extdata/1234_ex2.mat', package = 'seqbio.variant.scoring') +mat.name1<-system.file('extdata/1234_ex2.mat', package = 'KinformR') rel.mat <- read.relation.mat(mat.name1) rel.mat ``` @@ -42,7 +42,7 @@ Note that the order of individuals can be different between the two files. All i ```{r} -tsv.name1<-system.file('extdata/1234_ex2.tsv', package = 'seqbio.variant.scoring') +tsv.name1<-system.file('extdata/1234_ex2.tsv', package = 'KinformR') status.df <- read.indiv(tsv.name1) status.df From f66eaa5ae197df9eb5bcf6e140cc03a6b8ca0823 Mon Sep 17 00:00:00 2001 From: CNuge Date: Thu, 12 Jun 2025 09:51:41 -0300 Subject: [PATCH 40/43] improvements to vignettes --- vignettes/KinformR-penetrance_and_ibd.Rmd | 14 +++++++---- vignettes/KinformR-variant_scoring.Rmd | 29 ++++++++++++++++------- 2 files changed, 29 insertions(+), 14 deletions(-) diff --git a/vignettes/KinformR-penetrance_and_ibd.Rmd b/vignettes/KinformR-penetrance_and_ibd.Rmd index 6c950dc..058340c 100644 --- a/vignettes/KinformR-penetrance_and_ibd.Rmd +++ b/vignettes/KinformR-penetrance_and_ibd.Rmd @@ -1,6 +1,7 @@ --- title: "KinformR - penetrance and idb informed scoring of families" author: "Cameron M. Nugent" +date: "`r format(Sys.time(), '%d %B, %Y')`" data: "`r Sys.Date()`" output: pdf_document #rmarkdown::html_vignette # vignette: > @@ -9,9 +10,6 @@ vignette: > %\VignetteEncoding{UTF-8} --- -```{r loadlib, echo = TRUE, results = 'hide', message=FALSE, warning=FALSE} -library(KinformR) -``` ## Introduction @@ -22,13 +20,19 @@ The `cal.penetrance` and associated `penetrance` and `ibd` functions facilitate The `cal.penetrance` function generates both a theoretical ranking of the power of a family assuming you were able to collect everyone on the simplified pedigree, as well as a current ranking, examining only those for whom you currently have DNA. This allows evaluation of the impact on the ranking if certain other family members are enrolled in the study. +### Load the library + +```{r loadlib, echo = TRUE, results = 'hide', message=FALSE, warning=FALSE} +library(KinformR) +``` ## The input data The family power calculations depend on a single tab-delimited input file, where each row represents a family. The input file is read inusing the `read.pedigree` function. ```{r} -example.pedigree.file <-system.file('extdata/example_pedigree_encoding.tsv', package = 'KinformR') +example.pedigree.file <- system.file('extdata/example_pedigree_encoding.tsv', + package = 'KinformR') example.pedigree.df <- read.pedigree(example.pedigree.file) @@ -50,7 +54,7 @@ the purposes of this tool, we exclude young generations (non-adults, younger tha From the simplified pedigrees, the individuals are assigned to the following categories. |Category|Description| -|---|---| +|---:|---| |a|Affected individuals| |b|Obligate carriers| |c|Children of either affecteds or carriers, with no children of their own| diff --git a/vignettes/KinformR-variant_scoring.Rmd b/vignettes/KinformR-variant_scoring.Rmd index 92d4d38..6b9d261 100644 --- a/vignettes/KinformR-variant_scoring.Rmd +++ b/vignettes/KinformR-variant_scoring.Rmd @@ -1,6 +1,7 @@ --- -title: "KinformR - pedigree-informed rare variant association and penetrance scoring" +title: "KinformR - pedigree-informed rare variant association scoring" author: "Cameron M. Nugent" +date: "`r format(Sys.time(), '%d %B, %Y')`" data: "`r Sys.Date()`" output: pdf_document #rmarkdown::html_vignette # vignette: > @@ -13,7 +14,9 @@ vignette: > When looking at rare variants within a family, not all affected and unaffected individuals are of equal importance. `KinformR` is an R package meant to aid in comparative evaluation of information in family-based rare-variant association studies; the package considers evidence of rare variant's association with disease status in a family and scores the variant based on relationships of individuals in a pedigree. Comparing scores across candidates can thereby help in assessing their relative merit. -The program leverages Wright’s coefficient of relatedness to score families based on the relationship of individuals as well as their disease status and genotypes for a given variant. A candidate variant is considered strongest when it is shared by an affected individual and other distantly-related affected family members and not shared by closely related unaffected family members. The theory behind this is that more distantly related individuals have a smaller proportion of their genome that is IBD, so the chance of a false positive (shared variant through chance, not due to association with disease) is lower. Conversely, when a closely related relative is unaffected, and the affected and unaffected individuals have different genotypes for the candidate variant, this provides evidence that the disease is not associated with the large IBD portion of the genome shared by the two individuals and thereby gives evidence in favour of the candidate. +The program leverages Wright’s coefficient of relatedness to score families based on the relationship of individuals as well as their disease status and genotypes for a given variant. A candidate variant is considered strongest when it is shared by an affected individual and other distantly-related affected family members and not shared by closely related unaffected family members. The theory behind this is that more distantly related individuals have a smaller proportion of their genome that is IBD, so the chance of a false positive (shared variant through chance, not due to association with disease) is lower. Conversely, when a closely related relative is unaffected, and the affected and unaffected individuals have different genotypes for the candidate variant, this provides evidence that the disease is likely not associated with the large IBD portion of the genome shared by the two individuals and thereby gives evidence in favour of the candidate. + +### Load the library ```{r loadlib, echo = TRUE, results = 'hide', message=FALSE, warning=FALSE} library(KinformR) @@ -22,11 +25,13 @@ library(KinformR) ## Input data +Two input are required to score a candidate variant: the family relationship information and the variant status for the individuals. + ### The relation matrix -This input is a matrix containing all the pairwise relationships of individuals in a family. The row and column names are the individual IDs, and the intersecting value denotes the degree of relationship between the individuals (self = 0, 1st degree relations. = 1, etc. Unrelated individuals are given a value of -1). As of version `0.1.0` the relation matrix is a manually created file, where relationship values are assigned via manual inspection of the family pedigree. +This input is a matrix containing all the pairwise relationships of individuals in a family. The row and column names are the individual IDs, and the intersecting value denotes the degree of relationship between the individuals (self = 0, 1st degree relations = 1, etc. Unrelated individuals are given a value of -1). As of version `0.1.0` the relation matrix is a manually created file, where relationship values are assigned via manual inspection of the family pedigree. -The function `read.relation.mat` +To read in the data, one uses the function `read.relation.mat`. ```{r} mat.name1<-system.file('extdata/1234_ex2.mat', package = 'KinformR') rel.mat <- read.relation.mat(mat.name1) @@ -38,7 +43,9 @@ rel.mat This file includes the same individual IDs used in the relationship matrix as well as the disease and variant status for all individuals. -Note that the order of individuals can be different between the two files. All individual in the status file must be present within the relationship matrix, but you can have individuals in the realtionship matrix that are not present in the status file. The program assumes there is no genotypes for individuals without a status indicated and they are ignored in the variant scoring. +Note that the order of individuals can be different between the two files. All individual in the status file must be present within the relationship matrix, but you can have individuals in the relationship matrix that are not present in the status file. The program assumes there is no genotype information for individuals not in the status file and they are ignored in the variant scoring. + +To read in the data, one uses the function `read.indiv`. ```{r} @@ -48,7 +55,7 @@ status.df <- read.indiv(tsv.name1) status.df ``` -The disease-genotype scoring can then be encoded using the `score.variant.status` function to produce the status-variant category for all individuals (new output column named: `statvar.cat`). +The disease-genotype scoring can then be encoded using the `score.variant.status` function to produce the status-variant category for all individuals. This creates a df with the new column: `statvar.cat`. ```{r} @@ -78,7 +85,8 @@ As previously noted, if an individual is present in the relationship matrix and The scoring can be changed to summing across all combinations as opposed to the mean by passing the following options. Note using the program in this way will return higher scores for more dense pedigrees. ```{r} -ex.score.sum <- score.fam(rel.mat, full.df.status, return.sums = TRUE, return.means = FALSE) +ex.score.sum <- score.fam(rel.mat, full.df.status, + return.sums = TRUE, return.means = FALSE) ex.score.sum ``` @@ -86,9 +94,11 @@ ex.score.sum To obtain a long form table with the scores for variants expressed relative to each individual, set both `return.sums` and `return.means` to `FALSE`. This output can aid in identifying which individuals are carrying the most weight in a family's score. ```{r} -ex.score.table <- score.fam(rel.mat, full.df.status, return.sums = FALSE, return.means = FALSE) +ex.score.table <- score.fam(rel.mat, full.df.status, + return.sums = FALSE, return.means = FALSE) ex.score.table ``` + ## How scoring works ### A Minimal example, scoring a variant from perspective of a single individual. @@ -135,7 +145,8 @@ Deriving a relatedness-weighted score for the variant from the perspective of th For each degree-encoded relationship, the coefficient of relatedness is used to weight the evidence for or against a variant. The coefficients for different degress of relationship are: ```{r} for(i in 0:7){ - print(paste0("Degree of relatedness: ", i, " coefficient of relatedness: ", 1 / (2 ** (i)))) + print(paste0("Degree of relatedness: ", i, + " coefficient of relatedness: ", 1 / (2 ** (i)))) } ``` From bb747825680c36206410a8c134155d5ae3116172 Mon Sep 17 00:00:00 2001 From: CNuge Date: Thu, 12 Jun 2025 10:09:16 -0300 Subject: [PATCH 41/43] improving vignette tables --- vignettes/KinformR-penetrance_and_ibd.Rmd | 22 +++++++++++++++------- vignettes/KinformR-variant_scoring.Rmd | 2 ++ 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/vignettes/KinformR-penetrance_and_ibd.Rmd b/vignettes/KinformR-penetrance_and_ibd.Rmd index 058340c..d68901f 100644 --- a/vignettes/KinformR-penetrance_and_ibd.Rmd +++ b/vignettes/KinformR-penetrance_and_ibd.Rmd @@ -4,6 +4,8 @@ author: "Cameron M. Nugent" date: "`r format(Sys.time(), '%d %B, %Y')`" data: "`r Sys.Date()`" output: pdf_document #rmarkdown::html_vignette # +pdf_document: + df_print: kable vignette: > %\VignetteIndexEntry{KinformR-penetrance_and_ibd} %\VignetteEngine{knitr::rmarkdown} @@ -28,19 +30,18 @@ library(KinformR) ## The input data -The family power calculations depend on a single tab-delimited input file, where each row represents a family. The input file is read inusing the `read.pedigree` function. +The family power calculations depend on a single tab-delimited input file, where each row represents a family. The input file is read in using the `read.pedigree` function. ```{r} example.pedigree.file <- system.file('extdata/example_pedigree_encoding.tsv', package = 'KinformR') example.pedigree.df <- read.pedigree(example.pedigree.file) - -head(example.pedigree.df) ``` The input file is expected to have the following 11 columns (with a header). ```{r} colnames(example.pedigree.df) + ``` ### Simplified summary of pedigrees @@ -64,7 +65,9 @@ From the simplified pedigrees, the individuals are assigned to the following cat The counts of individuals assigned to these categories are then added to the tab-delimited input file: ```{r} -head(example.pedigree.df) +#example.pedigree.df +knitr::kable(example.pedigree.df, format = "markdown", digits = 2) + ``` All columns with the prefix `max_` are meant to count the total number of each category in the pedigree, while @@ -85,7 +88,9 @@ With the encoded data loaded, the function `cal.penetrance` will generate both a ```{r} penetrance.df <- score.pedigree(example.pedigree.df) -penetrance.df +#penetrance.df +knitr::kable(penetrance.df, format = "markdown", digits = 2) + ``` The output includes seven columns that with the following information: @@ -105,14 +110,17 @@ With the scoring completed, the values can be queried to learn more about the re Sorting on `current_score` shows which family has the most detection power based on collected samples. ```{r} -penetrance.df[order(penetrance.df$current.score, decreasing = TRUE),] +ord.df.current <- penetrance.df[order(penetrance.df$current.score, decreasing = TRUE),] +knitr::kable(ord.df.current, format = "markdown", digits = 2) + ``` If we had to work with only what we have, then family 5031 gives the most detection power. Sorting on `max_score` shows which family could have the most detection power, if all individuals were sampled. This can be useful in targeting future sampling efforts as it shows where more samples would give the most value. ```{r} -penetrance.df[order(penetrance.df$max.score, decreasing = TRUE),] +ord.df.max <- penetrance.df[order(penetrance.df$max.score, decreasing = TRUE),] +knitr::kable(ord.df.max, format = "markdown", digits = 2) ``` Here we can see that family `0347` has a maximum score of 30.84, but a realized score of only 4.17. Given the high potential detection power for this family, it is an ideal target for future sampling efforts as current samples reveal on 13.5% of the family's potential value. diff --git a/vignettes/KinformR-variant_scoring.Rmd b/vignettes/KinformR-variant_scoring.Rmd index 6b9d261..72a01b3 100644 --- a/vignettes/KinformR-variant_scoring.Rmd +++ b/vignettes/KinformR-variant_scoring.Rmd @@ -4,6 +4,8 @@ author: "Cameron M. Nugent" date: "`r format(Sys.time(), '%d %B, %Y')`" data: "`r Sys.Date()`" output: pdf_document #rmarkdown::html_vignette # +pdf_document: + df_print: kable vignette: > %\VignetteIndexEntry{KinformR-variant_scoring} %\VignetteEngine{knitr::rmarkdown} From ca1ec249930bc7242d58e62543b947149f27cc78 Mon Sep 17 00:00:00 2001 From: CNuge Date: Thu, 12 Jun 2025 10:26:53 -0300 Subject: [PATCH 42/43] abstract away the knitr code --- vignettes/KinformR-penetrance_and_ibd.Rmd | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/vignettes/KinformR-penetrance_and_ibd.Rmd b/vignettes/KinformR-penetrance_and_ibd.Rmd index d68901f..87ae310 100644 --- a/vignettes/KinformR-penetrance_and_ibd.Rmd +++ b/vignettes/KinformR-penetrance_and_ibd.Rmd @@ -26,6 +26,10 @@ The `cal.penetrance` function generates both a theoretical ranking of the power ```{r loadlib, echo = TRUE, results = 'hide', message=FALSE, warning=FALSE} library(KinformR) + +show <- function(df){ + knitr::kable(df, format = "markdown", digits = 2) +} ``` ## The input data @@ -65,9 +69,7 @@ From the simplified pedigrees, the individuals are assigned to the following cat The counts of individuals assigned to these categories are then added to the tab-delimited input file: ```{r} -#example.pedigree.df -knitr::kable(example.pedigree.df, format = "markdown", digits = 2) - +show(example.pedigree.df) ``` All columns with the prefix `max_` are meant to count the total number of each category in the pedigree, while @@ -88,8 +90,8 @@ With the encoded data loaded, the function `cal.penetrance` will generate both a ```{r} penetrance.df <- score.pedigree(example.pedigree.df) -#penetrance.df -knitr::kable(penetrance.df, format = "markdown", digits = 2) + +show(penetrance.df) ``` @@ -111,7 +113,7 @@ Sorting on `current_score` shows which family has the most detection power based ```{r} ord.df.current <- penetrance.df[order(penetrance.df$current.score, decreasing = TRUE),] -knitr::kable(ord.df.current, format = "markdown", digits = 2) +show(ord.df.current) ``` If we had to work with only what we have, then family 5031 gives the most detection power. @@ -120,7 +122,8 @@ Sorting on `max_score` shows which family could have the most detection power, i ```{r} ord.df.max <- penetrance.df[order(penetrance.df$max.score, decreasing = TRUE),] -knitr::kable(ord.df.max, format = "markdown", digits = 2) +show(ord.df.max) + ``` Here we can see that family `0347` has a maximum score of 30.84, but a realized score of only 4.17. Given the high potential detection power for this family, it is an ideal target for future sampling efforts as current samples reveal on 13.5% of the family's potential value. From 9f435df343c3740d9c5ca02fc3b43641fe603d7a Mon Sep 17 00:00:00 2001 From: CNuge Date: Thu, 12 Jun 2025 11:51:47 -0300 Subject: [PATCH 43/43] improving vignette formatting --- vignettes/KinformR-variant_scoring.Rmd | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/vignettes/KinformR-variant_scoring.Rmd b/vignettes/KinformR-variant_scoring.Rmd index 72a01b3..1042fee 100644 --- a/vignettes/KinformR-variant_scoring.Rmd +++ b/vignettes/KinformR-variant_scoring.Rmd @@ -22,6 +22,11 @@ The program leverages Wright’s coefficient of relatedness to score families ba ```{r loadlib, echo = TRUE, results = 'hide', message=FALSE, warning=FALSE} library(KinformR) + +show <- function(df){ + knitr::kable(df, format = "markdown", digits = 2) +} + ``` @@ -37,7 +42,7 @@ To read in the data, one uses the function `read.relation.mat`. ```{r} mat.name1<-system.file('extdata/1234_ex2.mat', package = 'KinformR') rel.mat <- read.relation.mat(mat.name1) -rel.mat +show(rel.mat) ``` @@ -54,7 +59,7 @@ To read in the data, one uses the function `read.indiv`. tsv.name1<-system.file('extdata/1234_ex2.tsv', package = 'KinformR') status.df <- read.indiv(tsv.name1) -status.df +show(status.df) ``` The disease-genotype scoring can then be encoded using the `score.variant.status` function to produce the status-variant category for all individuals. This creates a df with the new column: `statvar.cat`. @@ -62,7 +67,7 @@ The disease-genotype scoring can then be encoded using the `score.variant.status ```{r} full.df.status <- score.variant.status(status.df) -full.df.status +show(full.df.status) ``` @@ -74,7 +79,7 @@ For most real-world applications, you will likely want to score family members i ```{r} ex.score.default <- score.fam(rel.mat, full.df.status) -ex.score.default +show(ex.score.default) ``` @@ -89,7 +94,7 @@ The scoring can be changed to summing across all combinations as opposed to the ex.score.sum <- score.fam(rel.mat, full.df.status, return.sums = TRUE, return.means = FALSE) -ex.score.sum +show(ex.score.sum) ``` @@ -98,7 +103,7 @@ To obtain a long form table with the scores for variants expressed relative to e ex.score.table <- score.fam(rel.mat, full.df.status, return.sums = FALSE, return.means = FALSE) -ex.score.table +show(ex.score.table) ``` ## How scoring works @@ -112,7 +117,7 @@ For example, the degrees of relationships of all other members of the example fa ```{r} rel.mat.proband <- rel.mat["MS-1234-1001",] -rel.mat.proband +show(rel.mat.proband) ``` This is taken along with a list of encoded statuses of all individuals for the given variant. Above `score.variant.status` used the disease status and a genetic variant of each individual to determine which of the following categories they fall in to: - A.c = Affected individual with variant