GitHub - elsayed-lab/gene-structure-analysis: Visualization and descriptive statistics of Trypanosome gene structure using RNA-Seq data.

Branches Tags

Name		Name	Last commit message	Last commit date
Latest commit History 60 Commits
child		child
input		input
scripts		scripts
settings		settings
static		static
.gitignore		.gitignore
README.Rmd		README.Rmd

Repository files navigation

---
title: Trypanosome Gene Structure Analysis (T. cruzi 3.1)
author: Keith Hughitt
output:
  html_document:
    theme: cosmo
    toc: true
    toc_float: true
    number_sections: true
  pdf_document:
    header-includes:
    - \usepackage{texshade}
---

Introduction
============

In previous work, *L. major* Friedlin and *T. cruzi* Y strain RNA-Seq data was
used to determine UTR boundaries and splice leader and polyadenylation acceptor
sites ([1](https://github.com/elsayed-lab/utr_analysis)), and to look for
evidence of alternative trans-splicing and poly-adenylation, and compute basic
statistics relating to SL and Poly(A) site usage
([2](http://www.umiacs.umd.edu/~keith/research/2015/110-utr-lengths/)) across
on a genome-wide scale.

Here, we expand on these analyses and attempt to map out the gene structure
for each of the parasites, and explore some of the basic properties and features 
of the trypanosome gene structure.

In order to ensure that the results are as easily reproduced as possible, all
relevant figures and tables are shown with the code used to generate them.

In a few instances where it is easier to extract the required information
outside of the R environment (for example, read mapping statistics), external
scripts are provided in the `scripts/` folder.

```{r include=FALSE}
library(knitr)
knit('settings/tcruzi_ystrain.Rmd', quiet=TRUE, output=tempfile())
```

```{r knitr_settings, include=FALSE}
opts_knit$set(progress=TRUE, verbose=TRUE)
opts_chunk$set(fig.width=1920/192,
               fig.height=1920/192,
               fig.path=file.path(CONFIG$output_dir, 'figure/'),
               dpi=192,
               cache.path=CONFIG$cache_dir,
			   echo=CONFIG$verbose)
options(digits=3)
options(stringsAsFactors=FALSE)
options(knitr.duplicate.label='allow')
options(java.parameters="-Xmx8g" )

# If rmarkdown.pandoc.to not specified, have it default to 'latex' output
if (is.null(opts_knit$get("rmarkdown.pandoc.to"))) {
    opts_knit$set(rmarkdown.pandoc.to='latex')
}

# Format-specific options
if (opts_knit$get("rmarkdown.pandoc.to") == 'latex') {
    # PDF output
    opts_chunk$set(dev=c('cairo_pdf', 'tiff'), 
  				   dev.args=list(cairo_pdf=list(family="DejaVu Sans"),
                                 tiff=list(compression="lzw")))
} else {
    # HTML output
    #opts_chunk$set(dev=c('png', 'tiff'), fig.retina=1)
    opts_chunk$set(dev=c('png'), fig.retina=1)
}
```

```{r testing, include=FALSE, eval=FALSE}
opts_knit$get("rmarkdown.pandoc.to")
```

```{r load_libraries, message=FALSE, warning=FALSE}
library('GenomicRanges')
library('Biostrings')
library('dynamicTreeCut')
library('DT')
library('rtracklayer')
library('edgeR')
library('ggplot2')
library('GO.db')
library('goseq')
library('gplots')
library('gridExtra')
library('Hmisc')
library('reshape2')
library('MASS')
library('dplyr')
library('readr')
library('viridis')
library('msa')
library('printr')
library('seqLogo')
library('stringdist')
library('tibble')
library('tidyr')
library('stringr')
library('svglite')
library('xlsx')
source('../../../2015/00-shared/R/annotations.R')
source('../../../2015/00-shared/R/enrichment_analysis.R')
source('../../../2015/00-shared/R/util.R')
source('child/helper.R')

# fix namespace for dplyr functions
assign('select', dplyr::select, envir=.GlobalEnv)

# random seed
set.seed(1)
```

```{r load_data, include=FALSE}
knit('child/load_data.Rmd', quiet=TRUE, output=tempfile())
```

Coverage Statistics
===================

```{r child='child/coverage_summary_statistics.Rmd'}
```

```{r child='child/coverage_utrs.Rmd'}
```

UTRs
====

```{r child='child/length_summary.Rmd'}
```

```{r child='child/stage_specific_utr_lengths.Rmd'}
```

```{r child='child/gene_structure_overview.Rmd'}
```

```{r child='child/functional_enrichment.Rmd'}
```

```{r child='child/length_correlations.Rmd'}
```

Intergenic Regions
==================

```{r child='child/intergenic_regions.Rmd'}
```

Alternative trans-splicing and polyadenylation
==============================================

```{r child='child/alternative_site_usage.Rmd'}
```

```{r child='child/stage_specific_site_usage.Rmd'}
```

Trans-spicing acceptor sites
============================

```{r child='child/acceptor_site.Rmd'}
```

Polyadenylation sites
=====================

```{r child='child/polyadenylation_sites.Rmd'}
```

Polypyrimidine tracts
=====================

```{r child='child/polypyrimidine_tract.Rmd'}
```

Gene expression across chromosome, PTU, and developmental stage
===============================================================

```{r child='child/expression_summary.Rmd'}
```

Comparative analysis
====================

```{r child='child/comparative_analysis.Rmd'}
```

```{r child='child/tcruzi_sylvio_comparison.Rmd'}
```

System Info
===========

```{r}
sessionInfo()
```

```{r include=FALSE, cache=TRUE, autodep=TRUE}
# write tables to file
table_dir <- file.path(CONFIG$output_dir, 'table')
dir.create(table_dir, recursive=TRUE)

for (table_name in ls()[grepl('^table_S?[0-9]', ls())]) {
    if (table_name %in% c('table_dir', 'table_name')) {
        next
    }
    message(sprintf("Saving %s", table_name))

    # Retrieve variable associated with table
    tbl_ <- get(table_name)

    # For single tables, save as CSV
    if (is.matrix(tbl_) || is.data.frame(tbl_)) {
        filename <- sprintf("%s.csv", table_name)

        # write table to file
        write.csv(format(tbl_, digits=3), 
                file=file.path(table_dir, filename))
    } else {
        # Otherwise, if multiple tables stored in a list, save as XLS with
        # separate sheets for each table
        filename <- sprintf("%s.xlsx", table_name)
        xls_filepath <- file.path(table_dir, filename)

        sheet_names <- names(tbl_)

        write.xlsx(as.data.frame(tbl_[[sheet_names[1]]]), file=xls_filepath,
                   sheetName=sheet_names[1], row.names=FALSE)

        for (sheet in sheet_names[2:length(tbl_)]) {
            # free up Java memory on each loop iteration
            # http://stackoverflow.com/questions/21937640/handling-java-lang-outofmemoryerror-when-writing-to-excel-from-r
            .jcall("java/lang/System", method = "gc")

            write.xlsx(as.data.frame(tbl_[[sheet]]), file=xls_filepath,
                       sheetName=sheet, row.names=FALSE)
        }
    }
}

# drop site identifiers for all tables, except for the 'combined' one
for (stage in CONFIG$stages) {
    sl_sites[[stage]] <- sl_sites[[stage]] %>% select(-site)
    polya_sites[[stage]] <- polya_sites[[stage]] %>% select(-site)
}
```

```{r include=FALSE, cache=TRUE, autodep=TRUE}
# copy SL and Poly(A) gff files
data_dir <- file.path(CONFIG$output_dir, 'data')
dir.create(data_dir, recursive=TRUE)

for (gff in CONFIG$site_gffs) {
    file.copy(gff, file.path(data_dir, basename(gff)))
}

# copy static alternative sl site figures (4 & S6)
file.copy('static/Figure_S7.png', file.path(CONFIG$output_dir, 'figure'))
file.copy('static/Figure_4.png', file.path(CONFIG$output_dir, 'figure'))

# copy intra-cds sl sites file to output dir
file.copy('input/tcruzi_ystrain/XX_intracds_orfs.csv.gz', file.path(CONFIG$output_dir, 'table'))

# save rendered output
message(opts_knit$get("rmarkdown.pandoc.to"))

if (opts_knit$get("rmarkdown.pandoc.to") == 'latex') {
    system(sprintf('(sleep 30 && cp -r README.pdf %s) &', CONFIG$output_dir))
} else {
    system(sprintf('(sleep 30 && cp -r README.html %s) &', CONFIG$output_dir))
}
```