From 4683251b5f3b5774c8b688e6403fdb6cb99163f5 Mon Sep 17 00:00:00 2001
From: HDash <16350928+HDash@users.noreply.github.com>
Date: Wed, 17 Apr 2024 11:42:27 +0100
Subject: [PATCH 1/4] Populate main package vignette

---
 vignettes/MotifStats.Rmd | 172 ++++++++++++++++++++++++++++++++++++++-
 1 file changed, 170 insertions(+), 2 deletions(-)
diff --git a/vignettes/MotifStats.Rmd b/vignettes/MotifStats.Rmd
index f0e87af..967e380 100644
--- a/vignettes/MotifStats.Rmd
+++ b/vignettes/MotifStats.Rmd
@@ -7,13 +7,181 @@ vignette: >
   %\VignetteEncoding{UTF-8}
 ---
 
-```{r, include = FALSE}
+```{r include = FALSE}
 knitr::opts_chunk$set(
   collapse = TRUE,
   comment = "#>"
 )
 ```
 
-```{r setup}
+
+## Introduction
+
+`MotifStats` is a simple R package to calculate the the metrics to quantify the
+relationship between peaks and motifs. It is based on [Analysis of Motif
+Enrichment (AME)](https://meme-suite.org/meme/doc/ame.html) and [Find Individual
+Motif Occurrences (FIMO)](https://meme-suite.org/meme/doc/fimo.html) from the
+[MEME suite](https://meme-suite.org/meme/index.html).  
+
+<br>
+It has two distinct functions:
+
+1. Calculate motif enrichment motif enrichment relative to a set of background
+sequences using AME.
+2. Calculate the distance between each motif and its nearest peak summit, where
+each motif is identified using FIMO.
+
+## Data
+The `MotifStats` package comes with motif and peak data for transcription
+factors CTCF and CREB1. Details of the files are as follows:
+
+- **CREB1 TIP-seq peaks (narrowPeaks)**[^f1]
+-- *Data not publicly available*  
+
+- **CTCF TIP-seq peaks (narrowPeaks)**[^f1]
+-- Retrieved from NCBI GEO under
+accession
+[GSE188512](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE188512)  
+
+- **CREB motif**
+-- Retrieved from JASPAR2024 under matrix ID
+[MA0018.5](http://jaspar.genereg.net/matrix/MA0018.5/)  
+
+- **CTCF motif**
+-- Retrieved from JASPAR2024 under matrix ID
+[MA1930.2](http://jaspar.genereg.net/matrix/MA1930.2/)  
+
+
+## Installation
+`MotifMarker` relies on [MEME suite](https://meme-suite.org/meme/index.html) as
+a system dependency. Directions for installation can be found [here](https://www.bioconductor.org/packages/release/bioc/vignettes/memes/inst/doc/install_guide.html).  
+<br>
+To install the package, use the following command:  
+```{r eval = FALSE}
+if(!require("remotes")) install.packages("remotes")
+remotes::install_github("neurogenomics/MotifStats")
+```
+
+
+## Usage
+In this example analysis, we will compare the enrichment of the CREB motif in
+CTCF TIP-seq peaks relative to the background. We will also calculate the
+distance between the centre of each CTCF motif and its nearest peak summit.
+
+### Load packages
+Load the installed package.
+```{r include  = FALSE}
 library(MotifStats)
 ```
+
+For this example, we will also load the `BSgenome.Hsapiens.UCSC.hg38` package to
+provide the genome build our peaks have been derived from.
+```{r include  = FALSE}
+library(BSgenome.Hsapiens.UCSC.hg38)
+```
+
+### Prepare input data
+First, we need to load the motif and peak data. The motif data obtained from
+JASPAR can be loaded as follows:
+```{r eval  = FALSE}
+name_motif <- read_motif_file(
+  "/path/to/motfif/file.jaspar",
+  motif_id = "id_of_motif",
+  file_format = "jaspar"
+  )
+```
+
+Next, we load the narrowPeak files from MACS2/3.
+```{r eval  = FALSE}
+name_peaks <- read_peak_file("/path/to/peak/file.narrowPeak")
+```
+
+For this example, we will load the built-in CREB1 TIP-seq peaks (as `GRanges`
+object) and CREB motif (as `PWMatrix` object) data.
+```{r include  = TRUE}
+data("ctcf_motif")
+data("ctcf_peaks")
+```
+
+### Calculate motif enrichment
+To calculate the motif relative to a set of background sequences, we use
+`peak_proportion()`.  
+
+- An additional `out_dir` argument can be used to specify the
+directory to save the AME output files[^f2].  
+
+```{r include  = TRUE}
+ctcf_read_prop <- peak_proportion(
+  peak_input = ctcf_peaks,
+  motif = ctcf_motif,
+  genome_build = BSgenome.Hsapiens.UCSC.hg38::BSgenome.Hsapiens.UCSC.hg38
+)
+```
+
+`ctcf_read_prob` is a list with two components:  
+1. `$tp` (True positives) with the number of true positive motif occurrences in
+the given sequence followed by its relative percentage.  
+1. `$fp` (False positives) with the number of false positive motif occurrences
+in the given sequence followed by its relative percentage.  
+
+
+### Find motif-summit distances
+To calculate the distance between each motif and its nearest peak summit, we use
+`summit_to_motif()`.  
+
+- `fp_rate` argument specifies the desired false-positive rate for FIMO. A
+p-value is calculated using the formula:  
+$$p = \frac{\text{fp_rate}}{2 * \text{promoter_length}}$$  
+- An additional `out_dir` argument can be used to specify the
+directory to save the 0-order background file.  
+
+```{r include  = TRUE}
+ctcf_read_sum_motif <- summit_to_motif(
+  peak_input = ctcf_peaks,
+  motif = ctcf_motif,
+  fp_rate = 0.05,
+  genome_build = BSgenome.Hsapiens.UCSC.hg38::BSgenome.Hsapiens.UCSC.hg38
+)
+```
+
+`ctcf_read_sum_motif` is a list with two objects:  
+1. `peak_set` with peak information.  
+2. `distance_to_summit` with distances between the centre of each motif and its
+nearest peak summit.  
+
+
+#### Visualize results
+We can optionally visualize the distribution of distances by using `density_plot()`.
+```{r include  = TRUE, fig.width = 7, fig.height = 4}
+density_plot(
+  ctcf_read_sum_motif$distance_to_summit,
+  plot_title = "CTCF motif distance to summit",
+  x_label = "Distance to summit (bp)",
+  y_label = "Density"
+)
+```
+
+
+> **NOTE:** Since AME and FIMO accept different parameters and are calculated
+independently, it is not possible to obtain directly comparable results.
+
+
+## Future Enchancements
+- Calculate metrics for more than one motif at a time.  
+
+
+## Session Info
+
+<details>
+
+```{r echo = FALSE}
+sessionInfo()
+```
+
+</details>
+
+<!-- Footnotes -->
+[^f1]: The peak file is a subset of chromosome 19 to reduce the size of the
+built-in data.
+[^f2]: For more information on the output files, refer to the
+[AME documentation](https://meme-suite.org/meme/doc/ame.html).

From 910db57a0cb4624eda8739ebbc4a769d6d507bc7 Mon Sep 17 00:00:00 2001
From: HDash <16350928+HDash@users.noreply.github.com>
Date: Thu, 18 Apr 2024 15:37:45 +0100
Subject: [PATCH 2/4] Correct typos in main vignette

---
 vignettes/MotifStats.Rmd | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/vignettes/MotifStats.Rmd b/vignettes/MotifStats.Rmd
index 967e380..324cf03 100644
--- a/vignettes/MotifStats.Rmd
+++ b/vignettes/MotifStats.Rmd
@@ -53,7 +53,7 @@ accession
 
 
 ## Installation
-`MotifMarker` relies on [MEME suite](https://meme-suite.org/meme/index.html) as
+`MotifStats` relies on [MEME suite](https://meme-suite.org/meme/index.html) as
 a system dependency. Directions for installation can be found [here](https://www.bioconductor.org/packages/release/bioc/vignettes/memes/inst/doc/install_guide.html).  
 <br>
 To install the package, use the following command:  
@@ -64,9 +64,10 @@ remotes::install_github("neurogenomics/MotifStats")
 
 
 ## Usage
-In this example analysis, we will compare the enrichment of the CREB motif in
+In this example analysis, we will compare the enrichment of the CTCF motif in
 CTCF TIP-seq peaks relative to the background. We will also calculate the
-distance between the centre of each CTCF motif and its nearest peak summit.
+distance between the centre of each motif occurrence and its nearest peak
+summit.
 
 ### Load packages
 Load the installed package.
@@ -96,7 +97,7 @@ Next, we load the narrowPeak files from MACS2/3.
 name_peaks <- read_peak_file("/path/to/peak/file.narrowPeak")
 ```
 
-For this example, we will load the built-in CREB1 TIP-seq peaks (as `GRanges`
+For this example, we will load the built-in CTCF TIP-seq peaks (as `GRanges`
 object) and CREB motif (as `PWMatrix` object) data.
 ```{r include  = TRUE}
 data("ctcf_motif")
@@ -118,7 +119,7 @@ ctcf_read_prop <- peak_proportion(
 )
 ```
 
-`ctcf_read_prob` is a list with two components:  
+`ctcf_read_prop` is a list with two components:  
 1. `$tp` (True positives) with the number of true positive motif occurrences in
 the given sequence followed by its relative percentage.  
 1. `$fp` (False positives) with the number of false positive motif occurrences

From c1dad8eaa317f26b5faf3dc9088c5930582a2099 Mon Sep 17 00:00:00 2001
From: HDash <16350928+HDash@users.noreply.github.com>
Date: Thu, 18 Apr 2024 16:59:06 +0100
Subject: [PATCH 3/4] Add further explanations to the main vignette

---
 vignettes/MotifStats.Rmd | 63 +++++++++++++++++++++++++++++++++-------
 1 file changed, 52 insertions(+), 11 deletions(-)

diff --git a/vignettes/MotifStats.Rmd b/vignettes/MotifStats.Rmd
index 324cf03..f784c47 100644
--- a/vignettes/MotifStats.Rmd
+++ b/vignettes/MotifStats.Rmd
@@ -31,7 +31,9 @@ sequences using AME.
 2. Calculate the distance between each motif and its nearest peak summit, where
 each motif is identified using FIMO.
 
+
 ## Data
+
 The `MotifStats` package comes with motif and peak data for transcription
 factors CTCF and CREB1. Details of the files are as follows:
 
@@ -53,6 +55,7 @@ accession
 
 
 ## Installation
+
 `MotifStats` relies on [MEME suite](https://meme-suite.org/meme/index.html) as
 a system dependency. Directions for installation can be found [here](https://www.bioconductor.org/packages/release/bioc/vignettes/memes/inst/doc/install_guide.html).  
 <br>
@@ -64,27 +67,32 @@ remotes::install_github("neurogenomics/MotifStats")
 
 
 ## Usage
+
 In this example analysis, we will compare the enrichment of the CTCF motif in
 CTCF TIP-seq peaks relative to the background. We will also calculate the
 distance between the centre of each motif occurrence and its nearest peak
 summit.
 
+
 ### Load packages
+
 Load the installed package.
-```{r include  = FALSE}
+```{r include = TRUE, message = FALSE, warning = FALSE}
 library(MotifStats)
 ```
 
 For this example, we will also load the `BSgenome.Hsapiens.UCSC.hg38` package to
 provide the genome build our peaks have been derived from.
-```{r include  = FALSE}
+```{r include = TRUE, message = FALSE, warning = FALSE}
 library(BSgenome.Hsapiens.UCSC.hg38)
 ```
 
+
 ### Prepare input data
+
 First, we need to load the motif and peak data. The motif data obtained from
 JASPAR can be loaded as follows:
-```{r eval  = FALSE}
+```{r eval = FALSE}
 name_motif <- read_motif_file(
   "/path/to/motfif/file.jaspar",
   motif_id = "id_of_motif",
@@ -92,8 +100,8 @@ name_motif <- read_motif_file(
   )
 ```
 
-Next, we load the narrowPeak files from MACS2/3.
-```{r eval  = FALSE}
+Next, we load the narrowPeak files from MACS2/3[^f2].
+```{r eval = FALSE}
 name_peaks <- read_peak_file("/path/to/peak/file.narrowPeak")
 ```
 
@@ -104,12 +112,20 @@ data("ctcf_motif")
 data("ctcf_peaks")
 ```
 
+
 ### Calculate motif enrichment
-To calculate the motif relative to a set of background sequences, we use
-`peak_proportion()`.  
 
+To calculate the motif relative to a set of background sequences, we use
+`peak_proportion()`.
+
+- Under the hood, it calls `meme::runAme` for motif
+enrichment scoring. In context of this call, it identifies the occurrences of
+input motif in the input sequences compared with background sequences and
+outputs relevant statistics.
+- A 0-order background model with the same nucleotide composition as the input
+sequences is generated for comparison.
 - An additional `out_dir` argument can be used to specify the
-directory to save the AME output files[^f2].  
+directory to save the AME output files[^f3] and the background model.  
 
 ```{r include  = TRUE}
 ctcf_read_prop <- peak_proportion(
@@ -125,14 +141,23 @@ the given sequence followed by its relative percentage.
 1. `$fp` (False positives) with the number of false positive motif occurrences
 in the given sequence followed by its relative percentage.  
 
+In context of this function, the true positives represent to the
+number/percentage of peaks with an associated motif occurrence, while the false
+positives represent the number/percentage of peaks without an associated motif
+occurrence.  
+
+
 
 ### Find motif-summit distances
+
 To calculate the distance between each motif and its nearest peak summit, we use
 `summit_to_motif()`.  
 
 - `fp_rate` argument specifies the desired false-positive rate for FIMO. A
 p-value is calculated using the formula:  
 $$p = \frac{\text{fp_rate}}{2 * \text{promoter_length}}$$  
+- A 0-order background model with the same nucleotide composition as the input
+sequences is generated for comparison.  
 - An additional `out_dir` argument can be used to specify the
 directory to save the 0-order background file.  
 
@@ -146,13 +171,19 @@ ctcf_read_sum_motif <- summit_to_motif(
 ```
 
 `ctcf_read_sum_motif` is a list with two objects:  
-1. `peak_set` with peak information.  
+1. `peak_set` with peak information, as a `GRanges` object.  
 2. `distance_to_summit` with distances between the centre of each motif and its
 nearest peak summit.  
 
+**NOTE**: When a motif is found multiple times within a single peak, the
+`peak_set` and `distance_to_summit` objects will contain multiple entries (rows)
+corresponding to the same peak. Each of these entries represents a distinct
+occurrence of the motif within that peak.
 
 #### Visualize results
-We can optionally visualize the distribution of distances by using `density_plot()`.
+
+We can optionally visualize the distribution of distances by using
+`density_plot()`.
 ```{r include  = TRUE, fig.width = 7, fig.height = 4}
 density_plot(
   ctcf_read_sum_motif$distance_to_summit,
@@ -162,12 +193,19 @@ density_plot(
 )
 ```
 
+For this given example, we can observe the distribution of distances between the
+centre of each motif, and its nearest peak summit follows a normal distribution.
+With a mean of the distribution around 0 base pairs (bp), we can infer that the
+motif is likely to be located at the peak summit, suggesting that the identified
+peak summits are associated with binding of regulatory proteins.
+
 
 > **NOTE:** Since AME and FIMO accept different parameters and are calculated
 independently, it is not possible to obtain directly comparable results.
 
 
 ## Future Enchancements
+
 - Calculate metrics for more than one motif at a time.  
 
 
@@ -181,8 +219,11 @@ sessionInfo()
 
 </details>
 
+
 <!-- Footnotes -->
 [^f1]: The peak file is a subset of chromosome 19 to reduce the size of the
 built-in data.
-[^f2]: For more information on the output files, refer to the
+[^f2]: narrowPeak files from both version 2 and 3 of [MACS: Model-based Analysis
+for ChIP-Seq](https://github.com/macs3-project/MACS) is supported.
+[^f3]: For more information on the output files, refer to the
 [AME documentation](https://meme-suite.org/meme/doc/ame.html).

From e22b77d17620faccdf12b9c17e7f84cdb3bcb93d Mon Sep 17 00:00:00 2001
From: HDash <16350928+HDash@users.noreply.github.com>
Date: Thu, 18 Apr 2024 17:17:54 +0100
Subject: [PATCH 4/4] Update `out_dir` behavior for `runAme` per roxygen docs
 for `peak_proportion`

---
 R/peak_proportion.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/R/peak_proportion.R b/R/peak_proportion.R
index b250644..8505400 100644
--- a/R/peak_proportion.R
+++ b/R/peak_proportion.R
@@ -52,7 +52,7 @@ peak_proportion <- function(peak_input,
                                    out_dir = out_dir)
   ame_out <- memes::runAme(peak_sequences,
                            database = list(motif),
-                           outdir = "./")
+                           outdir = out_dir)
 
   return(
     list(