Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

introduce changes from Emma #20

Open
lpantano opened this issue May 8, 2024 · 0 comments
Open

introduce changes from Emma #20

lpantano opened this issue May 8, 2024 · 0 comments
Assignees
Labels
bug Something isn't working RNAseq

Comments

@lpantano
Copy link
Contributor

lpantano commented May 8, 2024

source(params$params_file)
source(params$project_file)
library(tidyverse)
library(knitr)
library(DESeq2)
library(DEGreport)
library(ggrepel)
library(pheatmap)
# library(RColorBrewer)
library(DT)
library(pheatmap)
library(bcbioR)
library(janitor)
ggplot2::theme_set(theme_light(base_size = 14))
opts_chunk[["set"]](
    cache = FALSE,
    cache.lazy = FALSE,
    dev = c("png", "pdf"),
    error = TRUE,
    highlight = TRUE,
    message = FALSE,
    prompt = FALSE,
    tidy = FALSE,
    warning = FALSE,
    fig.height = 4)
#' Create sub-chunks for plots
#'
#' taken from: https://stackoverflow.com/questions/15365829/dynamic-height-and-width-for-knitr-plots
#'
#' @param pl a plot object
#' @param fig.height figure height
#' @param fig.width figure width
#' @param chunk_name name of the chunk
#'
#' @author Andreas Scharmueller \email{andschar@@protonmail.com}
#'
subchunkify = function(pl,
                       fig.height = 7,
                       fig.width = 5,
                       chunk_name = 'plot') {
  pl_deparsed = paste0(deparse(function() {
    pl
  }), collapse = '')
  
  sub_chunk = paste0(
    "```{r ",
    chunk_name,
    ", fig.height=",
    fig.height,
    ", fig.width=",
    fig.width,
    ", dpi=72",
    ", echo=FALSE, message=FALSE, warning=FALSE, fig.align='center'}",
    "\n(",
    pl_deparsed,
    ")()",
    "\n```"
  )
  
  cat(knitr::knit(
    text = knitr::knit_expand(text = sub_chunk),
    quiet = TRUE
  ))
}

sanitize_datatable = function(df, ...) {
 # remove dashes which cause wrapping
 DT::datatable(df, ..., rownames=gsub("-", "_", rownames(df)),
                   colnames=gsub("-", "_", colnames(df)))
}

Overview

  • Project: r project
  • PI: r PI
  • Analyst: r analyst
  • Experiment: r experiment
  • Aim: r aim

Samples and metadata

###Main factor of interest###
# This will be used to order the samples and annotate the QC figures
FOI <- "tissue_type"

## May 8th, I edited the metadata read in to take the input csv from nf-core which MAY have duplicated lines in the case of dup lanes##
meta_df=read_csv(metadata_fn)  %>% arrange(.data[[FOI]]) %>% distinct(sample, .keep_all = T)

order <- meta_df$sample


ggplot(meta_df, aes(.data[[FOI]], fill = .data[[FOI]])) +
  geom_bar() + ylab("") + xlab("") +
  scale_fill_cb_friendly()

metrics <- read_tsv(paste0(multiqc_data_dir, 'multiqc_general_stats.txt'))
metrics_qualimap <- read_tsv(paste0(multiqc_data_dir, 'mqc_qualimap_genomic_origin_1.txt'))

metrics <- metrics %>% full_join(metrics_qualimap)
metrics <- metrics %>%
  clean_names() %>% 
  dplyr::rename_with(~gsub('.*mqc_generalstats_', '', .))

total_reads <- metrics %>% 
  dplyr::filter(grepl('_', sample)) %>% 
  remove_empty(which = 'cols') %>%
  dplyr::rename(single_sample = sample) %>%
  mutate(sample = gsub('_[12]+$', '', single_sample)) %>%
  group_by(sample) %>%
  summarize(total_reads = sum(fastqc_raw_total_sequences))

metrics <- metrics %>% 
  dplyr::filter(!grepl('_', sample)) %>% 
  remove_empty(which = 'cols') %>%
  full_join(total_reads) %>%
  mutate(mapped_reads = samtools_reads_mapped) %>%
  mutate(exonic_rate = exonic/(star_uniquely_mapped * 2)) %>%
  mutate(intronic_rate = intronic/(star_uniquely_mapped * 2)) %>%
  mutate(intergenic_rate = intergenic/(star_uniquely_mapped * 2)) %>%
 # mutate(r_rna_rate = custom_content_biotype_counts_percent_r_rna) %>%
  mutate(x5_3_bias = qualimap_5_3_bias)

metrics <- metrics %>%
    full_join(meta_df , by = c("sample" = "sample"))


### Just for this analysis, not to be permanently added#####
metrics$sample <- gsub("X4","4", metrics$sample)
meta_sm <- meta_df %>%
  as.data.frame() %>%
  column_to_rownames("sample")

meta_sm %>% sanitize_datatable()

Read metrics {.tabset}

Total reads

Here, we want to see consistency and a minimum of 20 million reads.

metrics %>%
    ggplot(aes(x = factor(sample, level = order),
               y = total_reads, 
               fill = .data[[FOI]])) +
    geom_bar(stat = "identity") +
    coord_flip() +
    scale_y_continuous(name = "million reads") +
    scale_fill_cb_friendly() + xlab("") + 
    ggtitle("Total reads") 

#get min percent mapped reads for reference
min_pct_mapped <- round(min(metrics$mapped_reads/metrics$total_reads)*100,1)
max_pct_mapped <- round(max(metrics$mapped_reads/metrics$total_reads)*100,1)

Mapping rate

The genomic mapping rate represents the percentage of reads mapping to the reference genome. We want to see consistent mapping rates between samples and over 70% mapping. These samples have mapping rates (r min_pct_mapped - r max_pct_mapped%).

metrics$mapped_reads_pct <- round(metrics$mapped_reads/metrics$total_reads*100,1)
metrics %>%
    ggplot(aes(x = factor(sample, level = order), 
               y = mapped_reads_pct, 
               color = .data[[FOI]])) +
        geom_point() +
    coord_flip() +
    scale_color_cb_friendly() +
    ylim(0, 100) +
    ggtitle("Mapping rate") +
  geom_hline(yintercept=70, color = cb_friendly_cols('blue'))

Number of genes detected

The number of genes represented in every sample is expected to be consistent and over 20K (blue line).

se <- readRDS(se_object)
genes_detected <- colSums(assays(se)[["counts"]] > 0) %>% enframe()
sample_names <- metrics[,c("sample"), drop=F]
genes_detected <- left_join(genes_detected, sample_names, by = c("name" = "sample"))
genes_detected <- genes_detected %>% group_by(name)
genes_detected <- summarise(genes_detected, 
                             n_genes = max(value))

#### Code to fix stupid names, only for this analysis. Remove for git###
genes_detected$name <- gsub("X4","4", genes_detected$name)

                      
metrics <- metrics %>%
  left_join(genes_detected, by = c("sample" = "name"))

ggplot(metrics,aes(x = factor(sample, level = order),
           y = n_genes, fill = .data[[FOI]])) +
  geom_bar(stat = "identity") +
  coord_flip() +
  scale_fill_cb_friendly() +
  ggtitle("Number of genes") +
  ylab("Number of genes") +
  xlab("") +
  geom_hline(yintercept=20000, color = cb_friendly_cols('blue'))

Gene detection saturation

This plot shows how complex the samples are. We expect samples with more reads to detect more genes.

metrics %>% 
  ggplot(aes(x = total_reads, 
             y = n_genes,
             color = .data[[FOI]])) +
  geom_point()+
  scale_x_log10() +
  scale_color_cb_friendly() +
  ggtitle("Gene saturation") +
  ylab("Number of genes")

Exonic mapping rate

Here we are looking for consistency, and exonic mapping rates around 70% or 75% (blue and red lines, respectively).

metrics %>%
  ggplot(aes(x = factor(sample, level = order),
             y = exonic_rate * 100, 
             color = .data[[FOI]])) +
  geom_point() +
  ylab("Exonic rate %") + 
  ggtitle("Exonic mapping rate") + 
  scale_color_cb_friendly() +
  coord_flip()  +
  xlab("") +
  ylim(c(0,100)) +
  geom_hline(yintercept=70, color = cb_friendly_cols('blue')) +
  geom_hline(yintercept=75, color = cb_friendly_cols('brown'))

Intronic mapping rate

Here, we expect a low intronic mapping rate (≤ 15% - 20%)

metrics %>%
    ggplot(aes(x = factor(sample, level = order),
               y = intronic_rate * 100, 
               color = .data[[FOI]])) +
  geom_point() +
  ylab("Intronic rate %") +
  ggtitle("Intronic mapping rate") + 
  scale_color_cb_friendly() +
  coord_flip()  +
  xlab("") +
  ylim(c(0,100)) +
  geom_hline(yintercept=20, color = cb_friendly_cols('blue')) +
  geom_hline(yintercept=15, color = cb_friendly_cols('brown'))

Intergenic mapping rate

Here, we expect a low intergenic mapping rate, which is true for all samples.

metrics %>%
    ggplot(aes(x = factor(sample, level = order),
               y = intergenic_rate * 100, 
               color = .data[[FOI]])) +
        geom_point() +
    ylab("Intergenic rate %") +
    ggtitle("Intergenic mapping rate") + 
    coord_flip()  + xlab("") +
    scale_color_cb_friendly() +
    ylim(c(0, 100))

rRNA mapping rate

Samples should have a ribosomal RNA (rRNA) "contamination" rate below 10%

# for some bad samples it could be > 50%
rrna_ylim <- max(round(metrics$r_rna_rate*100, 2)) + 10
metrics %>%
    ggplot(aes(x = factor(sample, level = order),
               y = r_rna_rate * 100, 
               color = .data[[FOI]])) +
        geom_point() +
    ylab("rRNA rate, %")+
    ylim(0, rrna_ylim) + 
    ggtitle("rRNA mapping rate") +
    coord_flip() +
    scale_color_cb_friendly()

5'->3' bias

There should be little bias, i.e. the values should be close to 1, or at least consistent among samples

metrics %>%
  ggplot(aes(x = factor(sample, level = order),
             y = x5_3_bias, 
             color = .data[[FOI]])) +
  geom_point() +
  ggtitle("5'-3' bias") + 
  coord_flip() +
  ylim(c(0.5,1.5)) + xlab("") + ylab("5'-3' bias") +
  scale_color_cb_friendly()+
  geom_hline(yintercept=1, color = cb_friendly_cols('blue'))

Counts per gene - all genes

We expect consistency in the box plots here between the samples, i.e. the distribution of counts across the genes is similar

metrics_small <- metrics %>% dplyr::select(sample, .data[[FOI]])
metrics_small <- left_join(sample_names, metrics_small)

counts <- 
  assays(se)[["counts"]] %>% 
  as_tibble() %>%
  filter(rowSums(.)!=0) %>% 
  gather(name, counts) 

### Just for this analysis, not to be permanently added#####
counts$name <- gsub("X4","4", counts$name)


counts <- left_join(counts, metrics_small, by = c("name" = "sample"))

ggplot(counts, aes(factor(name, level = order),
                   log2(counts+1),
                   fill = .data[[FOI]])) +
  geom_boxplot() + 
  scale_fill_cb_friendly() +
  coord_flip() + xlab("") +
  ggtitle("Counts per gene, all non-zero genes") +
  scale_color_cb_friendly()

Sample similarity analysis

In this section, we look at how well the different groups in the dataset cluster with each other. Samples from the same group should ideally be clustering together. We use Principal Component Analysis (PCA).

Principal component analysis (PCA) {.tabset}

Principal Component Analysis (PCA) is a statistical technique used to simplify high-dimensional data by identifying patterns and reducing the number of variables. In the context of gene expression, PCA helps analyze large datasets containing information about the expression levels of thousands of genes across different samples (e.g., tissues, cells).

raw_counts <- assays(se)[["counts"]] %>% round() %>%
    as_tibble() %>%
    filter(rowSums(.)!=0) %>%
    as.matrix()

### Just for this analysis, not to be permanently added#####
colnames(raw_counts) <- gsub("X4","4", colnames(raw_counts))



vst <- vst(raw_counts)

#fix samples names
coldat_for_pca <- as.data.frame(metrics)
rownames(coldat_for_pca) <- coldat_for_pca$sample
coldat_for_pca <- coldat_for_pca[colnames(raw_counts),]
pca1 <- degPCA(vst, coldat_for_pca,
              condition = FOI, data = T)[["plot"]]
pca2 <- degPCA(vst, coldat_for_pca,
              condition = FOI, data = T, pc1="PC3", pc2="PC4")[["plot"]]

pca1 + scale_color_cb_friendly()
pca2 + scale_color_cb_friendly()
## Remove non-useful columns output by nf-core
coldat_2 <- data.frame(coldat_for_pca[,!(colnames(coldat_for_pca) %in% c("fastq_1", "fastq_2", "salmon_library_types", "salmon_compatible_fragment_ratio", "samtools_reads_mapped_percent", "samtools_reads_properly_paired_percent", "samtools_mapped_passed_pct", "strandedness"))])

# Remove missing data
coldat_2 <- na.omit(coldat_2)
degCovariates(vst, metadata = coldat_2)
## Hierarchical clustering

vst_cor <- cor(vst)

annotation_cols <- cb_friendly_pal('grey')(length(unique(coldat_for_pca[[FOI]])))
names(annotation_cols) <- unique(coldat_for_pca[[FOI]])

## Note am still unable to get the annotation colors with the CB palette to work##
pheatmap(vst_cor, annotation_col = coldat_for_pca[,FOI, drop=F], show_rownames = T, show_colnames = T, color = cb_friendly_pal('heatmap')(15))

R session

List and version of tools used for the QC report generation.

sessionInfo()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
bug Something isn't working RNAseq
Projects
None yet
Development

No branches or pull requests

2 participants