introduce changes from Emma #20

lpantano · 2024-05-08T19:28:20Z

source(params$params_file)
source(params$project_file)

library(tidyverse)
library(knitr)
library(DESeq2)
library(DEGreport)
library(ggrepel)
library(pheatmap)
# library(RColorBrewer)
library(DT)
library(pheatmap)
library(bcbioR)
library(janitor)
ggplot2::theme_set(theme_light(base_size = 14))
opts_chunk[["set"]](
    cache = FALSE,
    cache.lazy = FALSE,
    dev = c("png", "pdf"),
    error = TRUE,
    highlight = TRUE,
    message = FALSE,
    prompt = FALSE,
    tidy = FALSE,
    warning = FALSE,
    fig.height = 4)

#' Create sub-chunks for plots
#'
#' taken from: https://stackoverflow.com/questions/15365829/dynamic-height-and-width-for-knitr-plots
#'
#' @param pl a plot object
#' @param fig.height figure height
#' @param fig.width figure width
#' @param chunk_name name of the chunk
#'
#' @author Andreas Scharmueller \email{andschar@@protonmail.com}
#'
subchunkify = function(pl,
                       fig.height = 7,
                       fig.width = 5,
                       chunk_name = 'plot') {
  pl_deparsed = paste0(deparse(function() {
    pl
  }), collapse = '')
  
  sub_chunk = paste0(
    "```{r ",
    chunk_name,
    ", fig.height=",
    fig.height,
    ", fig.width=",
    fig.width,
    ", dpi=72",
    ", echo=FALSE, message=FALSE, warning=FALSE, fig.align='center'}",
    "\n(",
    pl_deparsed,
    ")()",
    "\n```"
  )
  
  cat(knitr::knit(
    text = knitr::knit_expand(text = sub_chunk),
    quiet = TRUE
  ))
}

sanitize_datatable = function(df, ...) {
 # remove dashes which cause wrapping
 DT::datatable(df, ..., rownames=gsub("-", "_", rownames(df)),
                   colnames=gsub("-", "_", colnames(df)))
}

Overview

Project: r project
PI: r PI
Analyst: r analyst
Experiment: r experiment
Aim: r aim

Samples and metadata

###Main factor of interest###
# This will be used to order the samples and annotate the QC figures
FOI <- "tissue_type"


## May 8th, I edited the metadata read in to take the input csv from nf-core which MAY have duplicated lines in the case of dup lanes##
meta_df=read_csv(metadata_fn)  %>% arrange(.data[[FOI]]) %>% distinct(sample, .keep_all = T)

order <- meta_df$sample


ggplot(meta_df, aes(.data[[FOI]], fill = .data[[FOI]])) +
  geom_bar() + ylab("") + xlab("") +
  scale_fill_cb_friendly()


metrics <- read_tsv(paste0(multiqc_data_dir, 'multiqc_general_stats.txt'))
metrics_qualimap <- read_tsv(paste0(multiqc_data_dir, 'mqc_qualimap_genomic_origin_1.txt'))

metrics <- metrics %>% full_join(metrics_qualimap)
metrics <- metrics %>%
  clean_names() %>% 
  dplyr::rename_with(~gsub('.*mqc_generalstats_', '', .))

total_reads <- metrics %>% 
  dplyr::filter(grepl('_', sample)) %>% 
  remove_empty(which = 'cols') %>%
  dplyr::rename(single_sample = sample) %>%
  mutate(sample = gsub('_[12]+$', '', single_sample)) %>%
  group_by(sample) %>%
  summarize(total_reads = sum(fastqc_raw_total_sequences))

metrics <- metrics %>% 
  dplyr::filter(!grepl('_', sample)) %>% 
  remove_empty(which = 'cols') %>%
  full_join(total_reads) %>%
  mutate(mapped_reads = samtools_reads_mapped) %>%
  mutate(exonic_rate = exonic/(star_uniquely_mapped * 2)) %>%
  mutate(intronic_rate = intronic/(star_uniquely_mapped * 2)) %>%
  mutate(intergenic_rate = intergenic/(star_uniquely_mapped * 2)) %>%
 # mutate(r_rna_rate = custom_content_biotype_counts_percent_r_rna) %>%
  mutate(x5_3_bias = qualimap_5_3_bias)

metrics <- metrics %>%
    full_join(meta_df , by = c("sample" = "sample"))


### Just for this analysis, not to be permanently added#####
metrics$sample <- gsub("X4","4", metrics$sample)

meta_sm <- meta_df %>%
  as.data.frame() %>%
  column_to_rownames("sample")

meta_sm %>% sanitize_datatable()

Read metrics {.tabset}

Total reads

Here, we want to see consistency and a minimum of 20 million reads.

metrics %>%
    ggplot(aes(x = factor(sample, level = order),
               y = total_reads, 
               fill = .data[[FOI]])) +
    geom_bar(stat = "identity") +
    coord_flip() +
    scale_y_continuous(name = "million reads") +
    scale_fill_cb_friendly() + xlab("") + 
    ggtitle("Total reads")

#get min percent mapped reads for reference
min_pct_mapped <- round(min(metrics$mapped_reads/metrics$total_reads)*100,1)
max_pct_mapped <- round(max(metrics$mapped_reads/metrics$total_reads)*100,1)

Mapping rate

The genomic mapping rate represents the percentage of reads mapping to the reference genome. We want to see consistent mapping rates between samples and over 70% mapping. These samples have mapping rates (r min_pct_mapped - r max_pct_mapped%).

metrics$mapped_reads_pct <- round(metrics$mapped_reads/metrics$total_reads*100,1)
metrics %>%
    ggplot(aes(x = factor(sample, level = order), 
               y = mapped_reads_pct, 
               color = .data[[FOI]])) +
        geom_point() +
    coord_flip() +
    scale_color_cb_friendly() +
    ylim(0, 100) +
    ggtitle("Mapping rate") +
  geom_hline(yintercept=70, color = cb_friendly_cols('blue'))

Number of genes detected

The number of genes represented in every sample is expected to be consistent and over 20K (blue line).

se <- readRDS(se_object)
genes_detected <- colSums(assays(se)[["counts"]] > 0) %>% enframe()
sample_names <- metrics[,c("sample"), drop=F]
genes_detected <- left_join(genes_detected, sample_names, by = c("name" = "sample"))
genes_detected <- genes_detected %>% group_by(name)
genes_detected <- summarise(genes_detected, 
                             n_genes = max(value))

#### Code to fix stupid names, only for this analysis. Remove for git###
genes_detected$name <- gsub("X4","4", genes_detected$name)

                      
metrics <- metrics %>%
  left_join(genes_detected, by = c("sample" = "name"))

ggplot(metrics,aes(x = factor(sample, level = order),
           y = n_genes, fill = .data[[FOI]])) +
  geom_bar(stat = "identity") +
  coord_flip() +
  scale_fill_cb_friendly() +
  ggtitle("Number of genes") +
  ylab("Number of genes") +
  xlab("") +
  geom_hline(yintercept=20000, color = cb_friendly_cols('blue'))

Gene detection saturation

This plot shows how complex the samples are. We expect samples with more reads to detect more genes.

metrics %>% 
  ggplot(aes(x = total_reads, 
             y = n_genes,
             color = .data[[FOI]])) +
  geom_point()+
  scale_x_log10() +
  scale_color_cb_friendly() +
  ggtitle("Gene saturation") +
  ylab("Number of genes")

Exonic mapping rate

Here we are looking for consistency, and exonic mapping rates around 70% or 75% (blue and red lines, respectively).

metrics %>%
  ggplot(aes(x = factor(sample, level = order),
             y = exonic_rate * 100, 
             color = .data[[FOI]])) +
  geom_point() +
  ylab("Exonic rate %") + 
  ggtitle("Exonic mapping rate") + 
  scale_color_cb_friendly() +
  coord_flip()  +
  xlab("") +
  ylim(c(0,100)) +
  geom_hline(yintercept=70, color = cb_friendly_cols('blue')) +
  geom_hline(yintercept=75, color = cb_friendly_cols('brown'))

Intronic mapping rate

Here, we expect a low intronic mapping rate (≤ 15% - 20%)

metrics %>%
    ggplot(aes(x = factor(sample, level = order),
               y = intronic_rate * 100, 
               color = .data[[FOI]])) +
  geom_point() +
  ylab("Intronic rate %") +
  ggtitle("Intronic mapping rate") + 
  scale_color_cb_friendly() +
  coord_flip()  +
  xlab("") +
  ylim(c(0,100)) +
  geom_hline(yintercept=20, color = cb_friendly_cols('blue')) +
  geom_hline(yintercept=15, color = cb_friendly_cols('brown'))

Intergenic mapping rate

Here, we expect a low intergenic mapping rate, which is true for all samples.

metrics %>%
    ggplot(aes(x = factor(sample, level = order),
               y = intergenic_rate * 100, 
               color = .data[[FOI]])) +
        geom_point() +
    ylab("Intergenic rate %") +
    ggtitle("Intergenic mapping rate") + 
    coord_flip()  + xlab("") +
    scale_color_cb_friendly() +
    ylim(c(0, 100))

rRNA mapping rate

Samples should have a ribosomal RNA (rRNA) "contamination" rate below 10%

# for some bad samples it could be > 50%
rrna_ylim <- max(round(metrics$r_rna_rate*100, 2)) + 10
metrics %>%
    ggplot(aes(x = factor(sample, level = order),
               y = r_rna_rate * 100, 
               color = .data[[FOI]])) +
        geom_point() +
    ylab("rRNA rate, %")+
    ylim(0, rrna_ylim) + 
    ggtitle("rRNA mapping rate") +
    coord_flip() +
    scale_color_cb_friendly()

5'->3' bias

There should be little bias, i.e. the values should be close to 1, or at least consistent among samples

metrics %>%
  ggplot(aes(x = factor(sample, level = order),
             y = x5_3_bias, 
             color = .data[[FOI]])) +
  geom_point() +
  ggtitle("5'-3' bias") + 
  coord_flip() +
  ylim(c(0.5,1.5)) + xlab("") + ylab("5'-3' bias") +
  scale_color_cb_friendly()+
  geom_hline(yintercept=1, color = cb_friendly_cols('blue'))

Counts per gene - all genes

We expect consistency in the box plots here between the samples, i.e. the distribution of counts across the genes is similar

metrics_small <- metrics %>% dplyr::select(sample, .data[[FOI]])
metrics_small <- left_join(sample_names, metrics_small)

counts <- 
  assays(se)[["counts"]] %>% 
  as_tibble() %>%
  filter(rowSums(.)!=0) %>% 
  gather(name, counts) 

### Just for this analysis, not to be permanently added#####
counts$name <- gsub("X4","4", counts$name)


counts <- left_join(counts, metrics_small, by = c("name" = "sample"))

ggplot(counts, aes(factor(name, level = order),
                   log2(counts+1),
                   fill = .data[[FOI]])) +
  geom_boxplot() + 
  scale_fill_cb_friendly() +
  coord_flip() + xlab("") +
  ggtitle("Counts per gene, all non-zero genes") +
  scale_color_cb_friendly()

Sample similarity analysis

In this section, we look at how well the different groups in the dataset cluster with each other. Samples from the same group should ideally be clustering together. We use Principal Component Analysis (PCA).

Principal component analysis (PCA) {.tabset}

Principal Component Analysis (PCA) is a statistical technique used to simplify high-dimensional data by identifying patterns and reducing the number of variables. In the context of gene expression, PCA helps analyze large datasets containing information about the expression levels of thousands of genes across different samples (e.g., tissues, cells).

raw_counts <- assays(se)[["counts"]] %>% round() %>%
    as_tibble() %>%
    filter(rowSums(.)!=0) %>%
    as.matrix()

### Just for this analysis, not to be permanently added#####
colnames(raw_counts) <- gsub("X4","4", colnames(raw_counts))



vst <- vst(raw_counts)

#fix samples names
coldat_for_pca <- as.data.frame(metrics)
rownames(coldat_for_pca) <- coldat_for_pca$sample
coldat_for_pca <- coldat_for_pca[colnames(raw_counts),]
pca1 <- degPCA(vst, coldat_for_pca,
              condition = FOI, data = T)[["plot"]]
pca2 <- degPCA(vst, coldat_for_pca,
              condition = FOI, data = T, pc1="PC3", pc2="PC4")[["plot"]]

pca1 + scale_color_cb_friendly()
pca2 + scale_color_cb_friendly()

## Remove non-useful columns output by nf-core
coldat_2 <- data.frame(coldat_for_pca[,!(colnames(coldat_for_pca) %in% c("fastq_1", "fastq_2", "salmon_library_types", "salmon_compatible_fragment_ratio", "samtools_reads_mapped_percent", "samtools_reads_properly_paired_percent", "samtools_mapped_passed_pct", "strandedness"))])

# Remove missing data
coldat_2 <- na.omit(coldat_2)
degCovariates(vst, metadata = coldat_2)

## Hierarchical clustering

vst_cor <- cor(vst)

annotation_cols <- cb_friendly_pal('grey')(length(unique(coldat_for_pca[[FOI]])))
names(annotation_cols) <- unique(coldat_for_pca[[FOI]])

## Note am still unable to get the annotation colors with the CB palette to work##
pheatmap(vst_cor, annotation_col = coldat_for_pca[,FOI, drop=F], show_rownames = T, show_colnames = T, color = cb_friendly_pal('heatmap')(15))

R session

List and version of tools used for the QC report generation.

sessionInfo()

The text was updated successfully, but these errors were encountered:

lpantano added bug Something isn't working RNAseq labels May 8, 2024

lpantano assigned abartlett004 May 8, 2024

lpantano mentioned this issue May 10, 2024

Fix code with real nfcore/rnaseq projects #22

Merged

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

introduce changes from Emma #20

introduce changes from Emma #20

lpantano commented May 8, 2024

introduce changes from Emma #20

introduce changes from Emma #20

Comments

lpantano commented May 8, 2024

Overview

Samples and metadata

Read metrics {.tabset}

Total reads

Mapping rate

Number of genes detected

Gene detection saturation

Exonic mapping rate

Intronic mapping rate

Intergenic mapping rate

rRNA mapping rate

5'->3' bias

Counts per gene - all genes

Sample similarity analysis

Principal component analysis (PCA) {.tabset}

R session