From 512e3ee1d594696a4659377f2431b9e1ac84849a Mon Sep 17 00:00:00 2001 From: Ignacio Tripodi Date: Wed, 6 Feb 2019 11:22:54 -0700 Subject: [PATCH 01/33] Switched to the dev docker build --- .travis.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 117d4a56..49a27db3 100644 --- a/.travis.yml +++ b/.travis.yml @@ -13,7 +13,8 @@ before_install: # Pull the docker image first so the test doesn't wait for this - docker pull ignaciot/nascent # Fake the tag locally so that the pipeline runs properly - - docker tag ignaciot/nascent ignaciot/nascent:latest + #- docker tag ignaciot/nascent ignaciot/nascent:latest + - docker tag ignaciot/nascent ignaciot/nascent:dev install: # Install Nextflow From f329d5760c3a09fc5831731508048d916cdb1e27 Mon Sep 17 00:00:00 2001 From: Ignacio Tripodi Date: Wed, 6 Feb 2019 11:23:58 -0700 Subject: [PATCH 02/33] Removed unnecessary param --- main.nf | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/main.nf b/main.nf index 9e38f5fe..7762f4ab 100644 --- a/main.nf +++ b/main.nf @@ -197,7 +197,7 @@ if (params.fastqs) { } else { Channel .fromFilePairs( params.fastqs, size: params.singleEnd ? 1 : 2 ) - .ifEmpty { exit 1, "Cannot find any reads matching: ${params.reads}\nNB: Path needs to be enclosed in quotes!\nIf this is single-end data, please specify --singleEnd on the command line." } + .ifEmpty { exit 1, "Cannot find any reads matching: ${params.fastqs}\nNB: Path needs to be enclosed in quotes!\nIf this is single-end data, please specify --singleEnd on the command line." } .into { fastq_reads_qc; fastq_reads_trim; fastq_reads_gzip } } } @@ -218,7 +218,7 @@ if (params.sras) { } else { Channel .fromFilePairs( params.sras, size: params.singleEnd ? 1 : 2 ) - .ifEmpty { exit 1, "Cannot find any reads matching: ${params.reads}\nNB: Path needs to be enclosed in quotes!\nIf this is single-end data, please specify --singleEnd on the command line." 
} + .ifEmpty { exit 1, "Cannot find any reads matching: ${params.fastqs}\nNB: Path needs to be enclosed in quotes!\nIf this is single-end data, please specify --singleEnd on the command line." } .into { fastq_reads_qc; fastq_reads_trim; fastq_reads_gzip } } } @@ -243,7 +243,6 @@ summary['Pipeline Name'] = 'nf-core/nascent' summary['Pipeline Version'] = workflow.manifest.version summary['Run Name'] = custom_runName ?: workflow.runName summary['Save Reference'] = params.saveReference ? 'Yes' : 'No' -if(params.reads) summary['Reads'] = params.reads if(params.fastqs) summary['Fastqs'] = params.fastqs if(params.sras) summary['SRAs'] = params.sras summary['Genome Ref'] = params.fasta From 79b141e6003dde71df48c929da2d76313edbd8d5 Mon Sep 17 00:00:00 2001 From: Ignacio Tripodi Date: Wed, 6 Feb 2019 12:35:08 -0700 Subject: [PATCH 03/33] Comment out post-processing of fastqc data to see if that is what's stalling the Travis build --- main.nf | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/main.nf b/main.nf index 7762f4ab..32cbb050 100644 --- a/main.nf +++ b/main.nf @@ -413,7 +413,6 @@ if(!params.hisat2_indices && params.fasta){ */ process fastqc { - validExitStatus 0,1 tag "$prefix" publishDir "${params.outdir}/qc/fastqc/", mode: 'copy', saveAs: {filename -> filename.indexOf(".zip") > 0 ? "zips/$filename" : "$filename"} @@ -427,10 +426,21 @@ process fastqc { script: prefix = reads.baseName """ - echo ${prefix} +# echo `which gunzip` fastqc $reads - extract_fastqc_stats.sh --srr=${prefix} > ${prefix}_stats_fastqc.txt + #extract_fastqc_stats.sh --srr=${prefix} > ${prefix}_stats_fastqc.txt +# GC=\$(gunzip -c "\$(find . -name *_fastqc.zip)" "${prefix}"_fastqc/fastqc_data.txt \ +# | grep "%GC" | grep -o "[0-9]*") +# SEQ=\$(gunzip -c "\$(find . -name *_fastqc.zip)" "${prefix}"_fastqc/fastqc_data.txt | \ +# grep "Total Sequences" | \ +# grep -o "[0-9]*") +# DEDUP=\$(gunzip -c "\$(find . 
-name *_fastqc.zip)" "${prefix}"_fastqc/fastqc_data.txt | \ +# grep "#Total Deduplicated Percentage" | \ +# grep -o "[0-9,.]*") +# +# echo -e "SRR\t%GC\tTotal_Sequences\t%Total_Deduplicated" > ${prefix}_stats_fastqc.txt +# echo -e "${prefix}""\$(printf "\\t")""\$GC""\$(printf "\\t")""\$SEQ""\$(printf "\\t")""\$DEDUP" >> ${prefix}_stats_fastqc.txt """ } From 741b011e7abf2b3aa61c97ee55c7f4a71541701a Mon Sep 17 00:00:00 2001 From: Ignacio Tripodi Date: Wed, 6 Feb 2019 12:37:57 -0700 Subject: [PATCH 04/33] Switched to the dev Docker container --- nextflow.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nextflow.config b/nextflow.config index 48ec7a2c..5f4ab9f1 100644 --- a/nextflow.config +++ b/nextflow.config @@ -13,7 +13,7 @@ params { // Container slug. Stable releases should specify release tag! // Developmental code should specify :latest - container = 'ignaciot/nascent:latest' + container = 'ignaciot/nascent:dev' // Workflow flags //reads = "data/*{R1,R2}*.fastq" From 9d2df3abbf6f77659cb635596d8a50d64a70714e Mon Sep 17 00:00:00 2001 From: Ignacio Tripodi Date: Wed, 6 Feb 2019 13:35:53 -0700 Subject: [PATCH 05/33] Updated Docker container URL --- nextflow.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nextflow.config b/nextflow.config index 5f4ab9f1..cb0eac90 100644 --- a/nextflow.config +++ b/nextflow.config @@ -13,7 +13,7 @@ params { // Container slug. Stable releases should specify release tag! 
// Developmental code should specify :latest - container = 'ignaciot/nascent:dev' + container = 'ignaciot/nascent' // Workflow flags //reads = "data/*{R1,R2}*.fastq" From 64bbd5193a4a1be2e23e40d329db4fd5ea65a767 Mon Sep 17 00:00:00 2001 From: Ignacio Tripodi Date: Wed, 6 Feb 2019 14:05:08 -0700 Subject: [PATCH 06/33] Fixes from lint --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 49a27db3..fbbb5508 100644 --- a/.travis.yml +++ b/.travis.yml @@ -14,7 +14,7 @@ before_install: - docker pull ignaciot/nascent # Fake the tag locally so that the pipeline runs properly #- docker tag ignaciot/nascent ignaciot/nascent:latest - - docker tag ignaciot/nascent ignaciot/nascent:dev + - docker tag ignaciot/nascent ignaciot/nascent install: # Install Nextflow From 40edbf32cf72d2c7c50843d3acc6679645025a96 Mon Sep 17 00:00:00 2001 From: Ignacio Tripodi Date: Wed, 6 Feb 2019 14:55:17 -0700 Subject: [PATCH 07/33] Debugging docker issues --- .travis.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index fbbb5508..9d1eb637 100644 --- a/.travis.yml +++ b/.travis.yml @@ -11,10 +11,10 @@ before_install: # PRs to master are only ok if coming from dev branch - '[ $TRAVIS_PULL_REQUEST = "false" ] || [ $TRAVIS_BRANCH != "master" ] || ([ $TRAVIS_PULL_REQUEST_SLUG = $TRAVIS_REPO_SLUG ] && [ $TRAVIS_PULL_REQUEST_BRANCH = "dev" ])' # Pull the docker image first so the test doesn't wait for this - - docker pull ignaciot/nascent + - docker pull ignaciot/nascent:dev # Fake the tag locally so that the pipeline runs properly #- docker tag ignaciot/nascent ignaciot/nascent:latest - - docker tag ignaciot/nascent ignaciot/nascent + - docker tag ignaciot/nascent:dev ignaciot/nascent:latest install: # Install Nextflow From ba79edac6415dc81fb8e466b415d1b017f070365 Mon Sep 17 00:00:00 2001 From: Ignacio Tripodi Date: Wed, 6 Feb 2019 15:02:05 -0700 Subject: [PATCH 08/33] Switch to the latest 
FastQC version --- environment.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environment.yml b/environment.yml index c6cb7276..ca0fab18 100644 --- a/environment.yml +++ b/environment.yml @@ -4,7 +4,7 @@ channels: - bioconda - defaults dependencies: - - fastqc=0.11.5 + - fastqc=0.11.8 - multiqc=1.6 - hisat2=2.1.0 - samtools=1.8 From 6c678898c3669d5896b9e23c4e52ec78eb00e89a Mon Sep 17 00:00:00 2001 From: Ignacio Tripodi Date: Thu, 21 Mar 2019 14:14:14 -0600 Subject: [PATCH 09/33] Lint-related cleanup --- conf/test.config | 2 +- main.nf | 24 ++++++++++++------------ nextflow.config | 5 +++-- 3 files changed, 16 insertions(+), 15 deletions(-) diff --git a/conf/test.config b/conf/test.config index 88510c5e..cbf4521a 100644 --- a/conf/test.config +++ b/conf/test.config @@ -16,6 +16,6 @@ params { // Input data singleEnd = true threadfqdump = false - fastqs = "https://raw.githubusercontent.com/nf-core/test-datasets/nascent/testdata/SRR4012402.chr21.fastq" + reads = "https://raw.githubusercontent.com/nf-core/test-datasets/nascent/testdata/SRR4012402.chr21.fastq" fasta = "https://raw.githubusercontent.com/nf-core/test-datasets/nascent/reference/chr21.fa" } diff --git a/main.nf b/main.nf index 32cbb050..4f910f9b 100644 --- a/main.nf +++ b/main.nf @@ -64,13 +64,13 @@ def helpMessage() { The typical command for running the pipeline is as follows: - nextflow run nf-core/nascent -profile slurm --fastqs '/project/*_{R1,R2}*.fastq' --outdir '/project/' + nextflow run nf-core/nascent -profile slurm --reads '/project/*_{R1,R2}*.fastq' --outdir '/project/' nextflow run nf-core/nascent --reads '*_R{1,2}.fastq.gz' -profile standard,docker Required arguments: -profile Configuration profile to use. 
- --fastqs Directory pattern for fastq files: /project/*{R1,R2}*.fastq (Required if --sras not specified) - --sras Directory pattern for SRA files: /project/*.sras (Required if --fastqs not specified) + --reads Directory pattern for fastq files: /project/*{R1,R2}*.fastq (Required if --sras not specified) + --sras Directory pattern for SRA files: /project/*.sras (Required if --reads not specified) --workdir Nextflow working directory where all intermediate files are saved. --email Where to send workflow report email. @@ -183,21 +183,21 @@ if( workflow.profile == 'awsbatch') { /* * Create a channel for input read files */ -if (params.fastqs) { +if (params.reads) { if (params.singleEnd) { fastq_reads_qc = Channel - .fromPath(params.fastqs) + .fromPath(params.reads) .map { file -> tuple(file.baseName, file) } fastq_reads_trim = Channel - .fromPath(params.fastqs) + .fromPath(params.reads) .map { file -> tuple(file.baseName, file) } fastq_reads_gzip = Channel - .fromPath(params.fastqs) + .fromPath(params.reads) .map { file -> tuple(file.baseName, file) } } else { Channel - .fromFilePairs( params.fastqs, size: params.singleEnd ? 1 : 2 ) - .ifEmpty { exit 1, "Cannot find any reads matching: ${params.fastqs}\nNB: Path needs to be enclosed in quotes!\nIf this is single-end data, please specify --singleEnd on the command line." } + .fromFilePairs( params.reads, size: params.singleEnd ? 1 : 2 ) + .ifEmpty { exit 1, "Cannot find any reads matching: ${params.reads}\nNB: Path needs to be enclosed in quotes!\nIf this is single-end data, please specify --singleEnd on the command line." } .into { fastq_reads_qc; fastq_reads_trim; fastq_reads_gzip } } } @@ -206,7 +206,7 @@ else { Channel .empty() .into { fastq_reads_qc; fastq_reads_trim; fastq_reads_gzip } - params.fastqs = null + params.reads = null } if (params.sras) { @@ -218,7 +218,7 @@ if (params.sras) { } else { Channel .fromFilePairs( params.sras, size: params.singleEnd ? 
1 : 2 ) - .ifEmpty { exit 1, "Cannot find any reads matching: ${params.fastqs}\nNB: Path needs to be enclosed in quotes!\nIf this is single-end data, please specify --singleEnd on the command line." } + .ifEmpty { exit 1, "Cannot find any reads matching: ${params.reads}\nNB: Path needs to be enclosed in quotes!\nIf this is single-end data, please specify --singleEnd on the command line." } .into { fastq_reads_qc; fastq_reads_trim; fastq_reads_gzip } } } @@ -243,7 +243,7 @@ summary['Pipeline Name'] = 'nf-core/nascent' summary['Pipeline Version'] = workflow.manifest.version summary['Run Name'] = custom_runName ?: workflow.runName summary['Save Reference'] = params.saveReference ? 'Yes' : 'No' -if(params.fastqs) summary['Fastqs'] = params.fastqs +if(params.reads) summary['Fastqs'] = params.reads if(params.sras) summary['SRAs'] = params.sras summary['Genome Ref'] = params.fasta summary['Thread fqdump'] = params.threadfqdump ? 'YES' : 'NO' diff --git a/nextflow.config b/nextflow.config index cb0eac90..50a07ae0 100644 --- a/nextflow.config +++ b/nextflow.config @@ -17,11 +17,12 @@ params { // Workflow flags //reads = "data/*{R1,R2}*.fastq" - //fastqs= "data/*{R1,R2}*.fastq" + reads = "data/*{R1,R2}*.fastq" //sras= "data/*{R1,R2}*.sra" outdir = './results' // Run arguments + singleEnd = true workdir = false clusterOptions = false flip = false @@ -31,7 +32,7 @@ params { tracedir = "${params.outdir}/pipeline_info" // nf_required_version = nf_required_version outdir = './results' - pairedEnd = false +// pairedEnd = false saveAllfq = false savefq = false saveTrim = false From 7c9b343f49d4ad0cf7e29f99bd7b8be188ff6de7 Mon Sep 17 00:00:00 2001 From: Ignacio Tripodi Date: Fri, 22 Mar 2019 08:55:18 -0600 Subject: [PATCH 10/33] Yet more lint cleanup --- bin/scrape_software_versions.py | 1 - conf/base.config | 3 +-- conf/igenomes.config | 1 - nextflow.config | 5 +++-- 4 files changed, 4 insertions(+), 6 deletions(-) diff --git a/bin/scrape_software_versions.py 
b/bin/scrape_software_versions.py index b46a5558..b65bd25a 100755 --- a/bin/scrape_software_versions.py +++ b/bin/scrape_software_versions.py @@ -3,7 +3,6 @@ from collections import OrderedDict import re -# TODO nf-core: Add additional regexes for new tools in process get_software_versions regexes = { 'nf-core/nascent': ['v_pipeline.txt', r"(\S+)"], 'Nextflow': ['v_nextflow.txt', r"(\S+)"], diff --git a/conf/base.config b/conf/base.config index ee4dfbe8..73532f1e 100644 --- a/conf/base.config +++ b/conf/base.config @@ -11,9 +11,8 @@ process { - container = params.container + //container = params.container - // TODO nf-core: Check the defaults for all processes cpus = { check_max( 2 * task.attempt, 'cpus' ) } memory = { check_max( 8.GB * task.attempt, 'memory' ) } time = { check_max( 2.h * task.attempt, 'time' ) } diff --git a/conf/igenomes.config b/conf/igenomes.config index d19e61f4..08154994 100644 --- a/conf/igenomes.config +++ b/conf/igenomes.config @@ -9,7 +9,6 @@ params { // illumina iGenomes reference file paths - // TODO nf-core: Add new reference types and strip out those that are not needed genomes { 'GRCh37' { bed12 = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Annotation/Genes/genes.bed" diff --git a/nextflow.config b/nextflow.config index 50a07ae0..96f0a683 100644 --- a/nextflow.config +++ b/nextflow.config @@ -13,7 +13,7 @@ params { // Container slug. Stable releases should specify release tag! 
// Developmental code should specify :latest - container = 'ignaciot/nascent' + //container = 'ignaciot/nascent' // Workflow flags //reads = "data/*{R1,R2}*.fastq" @@ -59,7 +59,8 @@ profiles { debug { process.beforeScript = 'echo $HOSTNAME' } docker { docker.enabled = true - process.container = params.container + //process.container = params.container + process.container = 'ignaciot/nascent' } none { // Don't load any config (for use with custom home configs) From b57c19c96971723252a57df0fbd5acf61823b64d Mon Sep 17 00:00:00 2001 From: Ignacio Tripodi Date: Fri, 22 Mar 2019 12:40:56 -0600 Subject: [PATCH 11/33] Updated the docs --- README.md | 58 ++++++++++++++++++++- docs/output.md | 134 ++++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 189 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 5ae239a1..bc41e05d 100644 --- a/README.md +++ b/README.md @@ -13,6 +13,10 @@ https://img.shields.io/badge/singularity-available-7E4C74.svg) The pipeline is built using [Nextflow](https://www.nextflow.io), a workflow tool to run tasks across multiple compute infrastructures in a very portable manner. It comes with docker / singularity containers making installation trivial and results highly reproducible. +#### Reference + +If you've used this pipeline in your research, you can cite this pipeline using DOI xxxxxxxxxxxxxxxxxxx ([OSF project](https://osf.io/xxxxxxxxxxx/)). + ### Documentation The nf-core/nascent pipeline comes with documentation about the pipeline, found in the `docs/` directory: @@ -24,7 +28,57 @@ The nf-core/nascent pipeline comes with documentation about the pipeline, found 4. [Output and how to interpret the results](docs/output.md) 5. [Troubleshooting](docs/troubleshooting.md) - +This pipeline is designed to process the sequencing output of nascent transcription assays, like GRO-seq or PRO-seq. 
It produces bedGraph- and bigWig-formatted outputs after mapping strand-specific reads, as well as other useful outputs like quality control reports or IGV-ready (Integrative Genomics Viewer) tdf files. + +### Quick start + +Edit the appropriate config file, e.g. `conf/slurm_grch38.config`, to ensure the proper paths are set for genome reference files and other executables (look for all mentions of `COMPLETE_*`). Variable names should hopefully be self-explanatory. You can specify the Nextflow working directory and output directory with flags. Note you must also now specify the email to which the report will be sent for the run. + + nextflow run nf-core/nascent --reads '*_R{1,2}.fastq.gz' -profile standard,docker + +## Arguments + +**Required Arguments** + +| Argument | Usage | Description | +|-----------|----------------------------------|----------------------------------------------------------------------| +| -profile | \ | Configuration profile to use. | +| --fastqs | \ | Directory pattern for fastq files. | +| --sras | \ | Directory pattern for sra files. | +| --genome_id | \<'hg38'> | Genome ID to which the samples will be mapped (e.g. hg38, mm10, rn6).| +| --workdir | \ | Nextflow working directory where all intermediate files are saved. | +| --email | \ | Where to send workflow report email. | + +**Save Options** + +| Arguments | Usage | Description | +|------------|---------------|-----------------------------------------------------------| +| --outdir | \ | Specifies where to save the output from the nextflow run. | +| --savefq | | Compresses and saves raw fastq reads. | +| --saveTrim | | Compresses and saves trimmed fastq reads. | +| --saveAll | | Compresses and saves all fastq reads. | +| --skipBAM | | Skips saving BAM files (only save CRAM). 
Default=False | + +**Input File Options** + +| Arguments | Usage | Description | +|--------------|-------------|------------------------------------------------------------------------------| +| --singleEnd | | Specifies that the input files are not paired reads (default is paired-end). | +| --flip | | Reverse complements each strand. Necessary for some library preps. | + +**Performance Options** + +| Arguments | Usage | Description | +|-----------------|-------------|---------------------------------------------------------| +| --threadfqdump | | Runs multi-threading for fastq-dump for sra processing. | + +**QC Options** + +| Arguments | Usage | Description | +|-----------------|-------------|---------------------------------------------------------| +| --skipMultiQC | | Skip running MultiQC. | +| --skipRSeQC | | Skip running RSeQC. | + ### Credits -nf-core/nascent was originally written by Ignacio Tripodi, Margaret Gruca. +nf-core/nascent was originally written by Ignacio Tripodi ([@ignaciot](https://github.com/ignaciot)) and Margaret Gruca ([@magruca](https://github.com/magruca)). diff --git a/docs/output.md b/docs/output.md index 2f3961a0..728e8662 100644 --- a/docs/output.md +++ b/docs/output.md @@ -8,8 +8,36 @@ This document describes the output produced by the pipeline. 
Most of the plots a The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data using the following steps: +* [fastq-dump](#fastqdump) - if needed, extract the fastq file[s] from a sample +* [SeqKit/bbduk](#seqkitbbduk) - trim reads and remove adapters * [FastQC](#fastqc) - read quality control * [MultiQC](#multiqc) - aggregate report, describing results of the whole pipeline +* [HISAT2](#hisat2) - map reads to the reference genome +* [Samtools](#samtools) - convert the mapped reads as SAM files to BAM format +* [Preseq](#preseq) - estimate complexity of the sample +* [RSeQC](#rseqc) - analyze read distributions +* [Pileup](#pileup) - analyze coverage +* [bedtools](#bedtools) - create both normalized and non-normalized coverage files in bedGraph format +* [igvtools](#igvtools) - create compressed files to visualize the sample in the Integrative Genomics Viewer ([IGV](http://software.broadinstitute.org/software/igv/home)) + + +## fastqdump +[fastq-dump](https://trace.ncbi.nlm.nih.gov/Traces/sra/sra.cgi?view=toolkit_doc&f=fastq-dump) decompresses an SRR file obtained from the Gene Expression Omnibus ([GEO](https://www.ncbi.nlm.nih.gov/geo/)) database. This will produce one or two fastq files (in the case of paired-end reads). + +**Output directory: `results/fastq-dump`** + +* `sample.fastq` + * FastQ file to process, from the corresponding sample. + + +## seqkitbbduk +[SeqKit](https://bioinf.shenwei.me/seqkit/) is a toolkit for fasta and fastq file manipulation, used in the pipeline for xxxxxxxxxxxxxxxxxxxxxxxxx. [BBDuk](https://www.geneious.com/plugins/bbduk/) is an adapter trimming tool used to leave only the useful part of a sequenced read. + +**Output directory: `results/bbduk`** + +* `sample.trim.fastq` + * Trimmed FastQ file for each sample. + ## FastQC [FastQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) gives general quality metrics about your reads. 
It provides information about the quality score distribution across your reads, the per base sequence content (%T/A/G/C). You get information about adapter contamination and other overrepresented sequences. @@ -18,7 +46,7 @@ For further reading and documentation see the [FastQC help](http://www.bioinform > **NB:** The FastQC plots displayed in the MultiQC report shows _untrimmed_ reads. They may contain adapter sequence and potentially regions with low quality. To see how your reads look after trimming, look at the FastQC reports in the `trim_galore` directory. -**Output directory: `results/fastqc`** +**Output directory: `results/qc`** * `sample_fastqc.html` * FastQC report, containing quality metrics for your untrimmed raw fastq files @@ -39,3 +67,107 @@ The pipeline has special steps which allow the software versions used to be repo * Directory containing parsed statistics from the different tools used in the pipeline For more information about how to use MultiQC reports, see http://multiqc.info + + +## hisat2 +[HISAT2](https://ccb.jhu.edu/software/hisat2/index.shtml) is a sequence alignment tool to map the trimmed sequenced reads to the corresponding reference genome. Due to their size, the resulting sam files are not conserved after the pipeline has completed execution. + +If the necessary indices for mapping are not provided/present, a separate process will build them first. This step can take a few minutes, however it should only be executed once. + +**Output directory: none** + + + +## samtools +[Samtools](http://www.htslib.org/) is a suite of tools to handle format conversions, among other things, for high-throughput sequencing data. We also use Samtools to generate the list of chromosome sizes, if not provided for the desired reference genome. 
+ +**Output directory: `results/mapped/bams`** + +* `sample.trim.sorted.bam` + * Mapped sample in BAM format +* `sample.trim.sorted.bam.bai` + * Index for the `sample.trim.sorted.bam` mapped sample in BAM format + +**Output directory: `results/qc/mapstats`** + +* `sample.trim.sorted.bam.flagstat` + * xxxxxxxxxxxxxxxxxxx +* `sample.trim.sorted.bam.millionsmapped` + * xxxxxxxxxxxxxxxxxxx + + +## preseq +[Preseq](http://smithlabresearch.org/software/preseq/) plots the estimated complexity of a sample, and estimates future yields for complexity if the sample is sequenced at higher read depths. + +**Output directory: `results/qc/preseq`** + +* `sample.trim.c_curve.txt` + * xxxxxxxxxxxxxxxxx +* `sample.trim.lc_extrap.txt` + * xxxxxxxxxxxxxxxxx + + +## rseqc +[RSeQC](http://dldcc-web.brc.bcm.edu/lilab/liguow/CGI/rseqc/_build/html/) provides a number of useful modules that can comprehensively evaluate high throughput sequence data. We use it on this pipeline to analyze read distributions. + +**Output directory: `results/qc/rseqc`** + +* `sample.trim.read_dist.txt` + * xxxxxxxxxxxxxxxxx + + +## pileup +[Pileup](xxxxxxxxxxxxxxxx) analyzes the sequencing coverage for each sample. + +**Output directory: `results/qc/pileup`** + +* `sample.trim.coverage.hist.txt` + * xxxxxxxxxxxxxxxxx +* `sample.trim.coverage.stats.txt` + * xxxxxxxxxxxxxxxxx + + +## bedtools +[bedtools](https://bedtools.readthedocs.io/en/latest/) is an extensive toolkit for BED and bedGraph format manipulation, like sorting, intersecting and joining these files. The files produced here are useful to be processed later using [Tfit](https://github.com/Dowell-Lab/Tfit) or [dReg](https://github.com/Danko-Lab/dREG) to find regions of active transcription, and transcription regulatory elements. 
+ +**Output directory: `results/mapped/bedgraphs`** + +* `sample.trim.bedGraph` + * Sample coverage file in bedGraph format +* `sample.trim.pos.bedGraph` + * Sample coverage file (positive strand only) in bedGraph format +* `sample.trim.neg.bedGraph` + * Sample coverage file (negative strand only) in bedGraph format + +**Output directory: `results/mapped/rcc_bedgraphs`** + +* `sample.trim.rcc.bedGraph` + * Normalized sample coverage file in bedGraph format +* `sample.pos.trim.rcc.bedGraph` + * Normalized sample coverage file (positive strand only) in bedGraph format +* `sample.neg.trim.rcc.bedGraph` + * Normalized sample coverage file (negative strand only) in bedGraph format + +**Output directory: `results/mapped/dreg_input`** + +* `sample.trim.pos.rcc.bw` + * Sample coverage file (positive strand only) in BigWig format +* `sample.trim.neg.rcc.bw` + * Sample coverage file (negative strand only) in BigWig format + +**Output directory: `results/mapped/rcc_bigwig`** + +* `sample.trim.pos.rcc.bw` + * Normalized sample coverage file (positive strand only) in BigWig format +* `sample.trim.neg.rcc.bw` + * Normalized sample coverage file (negative strand only) in BigWig format + + +## igvtools +[igvtools](https://software.broadinstitute.org/software/igv/igvtools) is a commandline tool we use to produce a compressed version of the sample coverage file in order to visualize it on IGV more efficiently (with a significantly smaller memory footprint). 
+ +**Output directory: `results/mapped/tdfs`** + +* `sample.trim.rpkm.tdf` + * Sample coverage file in TDF format + From 190da7e5db6e0a28c45adfe3f934a33357222ca6 Mon Sep 17 00:00:00 2001 From: Margaret Gruca Date: Fri, 22 Mar 2019 13:25:46 -0600 Subject: [PATCH 12/33] Update output.md updated trimming/qc details --- docs/output.md | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/docs/output.md b/docs/output.md index 728e8662..7f69733c 100644 --- a/docs/output.md +++ b/docs/output.md @@ -9,13 +9,13 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data using the following steps: * [fastq-dump](#fastqdump) - if needed, extract the fastq file[s] from a sample -* [SeqKit/bbduk](#seqkitbbduk) - trim reads and remove adapters +* [SeqKit/bbduk](#seqkitbbduk) - flip reads (experiment specific) & trim reads for adapters/quality/length * [FastQC](#fastqc) - read quality control * [MultiQC](#multiqc) - aggregate report, describing results of the whole pipeline * [HISAT2](#hisat2) - map reads to the reference genome * [Samtools](#samtools) - convert the mapped reads as SAM files to BAM format * [Preseq](#preseq) - estimate complexity of the sample -* [RSeQC](#rseqc) - analyze read distributions +* [RSeQC](#rseqc) - analyze read distributions, infer experiment (SE/PE, whether reads need to be flipped), & read duplication * [Pileup](#pileup) - analyze coverage * [bedtools](#bedtools) - create both normalized and non-normalized coverage files in bedGraph format * [igvtools](#igvtools) - create compressed files to visualize the sample in the Integrative Genomics Viewer ([IGV](http://software.broadinstitute.org/software/igv/home)) @@ -30,13 +30,15 @@ and processes data using the following steps: * FastQ file to process, from the corresponding sample. 
-## seqkitbbduk -[SeqKit](https://bioinf.shenwei.me/seqkit/) is a toolkit for fasta and fastq file manipulation, used in the pipeline for xxxxxxxxxxxxxxxxxxxxxxxxx. [BBDuk](https://www.geneious.com/plugins/bbduk/) is an adapter trimming tool used to leave only the useful part of a sequenced read. +## seqkit & bbduk +[SeqKit](https://bioinf.shenwei.me/seqkit/) is a toolkit for fasta and fastq file manipulation, used in the pipeline if the positive/negative strands need to be flipped (dependent on library prep protocol). [BBDuk](https://www.geneious.com/plugins/bbduk/) is a trimming tool used to filter reads for adapters, read quality, and overall length after adapter removal. -**Output directory: `results/bbduk`** +**Output directory: `results/bbduk, qc/trimstats`** * `sample.trim.fastq` * Trimmed FastQ file for each sample. +* `{refstats,trimstats,ehist}.txt` + * Trimming details including adapters removed, percentages of reads removed that did not meet minimum quality/length ## FastQC @@ -91,9 +93,9 @@ If the necessary indices for mapping are not provided/present, a separate proces **Output directory: `results/qc/mapstats`** * `sample.trim.sorted.bam.flagstat` - * xxxxxxxxxxxxxxxxxxx + * Overall mapping statistics * `sample.trim.sorted.bam.millionsmapped` - * xxxxxxxxxxxxxxxxxxx + * File that contains number of uniquely mapped reads (not total multi-mapped). Used in normalization ## preseq @@ -102,9 +104,9 @@ If the necessary indices for mapping are not provided/present, a separate proces **Output directory: `results/qc/preseq`** * `sample.trim.c_curve.txt` - * xxxxxxxxxxxxxxxxx + * Curve generated based on number of unique reads vs. 
total reads sequenced * `sample.trim.lc_extrap.txt` - * xxxxxxxxxxxxxxxxx + * Extrapolation of the c_curve that attempts to model the predicted number of unique reads if the sample was sequenced to a greater depth ## rseqc @@ -113,7 +115,7 @@ If the necessary indices for mapping are not provided/present, a separate proces **Output directory: `results/qc/rseqc`** * `sample.trim.read_dist.txt` - * xxxxxxxxxxxxxxxxx + * Relative distribution of reads relative to a gene reference file ## pileup @@ -122,9 +124,9 @@ If the necessary indices for mapping are not provided/present, a separate proces **Output directory: `results/qc/pileup`** * `sample.trim.coverage.hist.txt` - * xxxxxxxxxxxxxxxxx + * Histogram of read coverage over each chromosome * `sample.trim.coverage.stats.txt` - * xxxxxxxxxxxxxxxxx + * Coverage stats broken down by chromosome including %GC, pos/neg read coverage, total coverage, etc. ## bedtools From 05e10d22202e7698acfedb37fb08ccc7a4ff6b71 Mon Sep 17 00:00:00 2001 From: Margaret Gruca Date: Fri, 22 Mar 2019 13:28:02 -0600 Subject: [PATCH 13/33] minor change to fastqc description --- docs/output.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/output.md b/docs/output.md index 7f69733c..fb196610 100644 --- a/docs/output.md +++ b/docs/output.md @@ -46,12 +46,12 @@ and processes data using the following steps: For further reading and documentation see the [FastQC help](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/). -> **NB:** The FastQC plots displayed in the MultiQC report shows _untrimmed_ reads. They may contain adapter sequence and potentially regions with low quality. To see how your reads look after trimming, look at the FastQC reports in the `trim_galore` directory. +> **NB:** The FastQC plots displayed in the MultiQC report show both untrimmed and trimmed reads. 
**Output directory: `results/qc`** * `sample_fastqc.html` - * FastQC report, containing quality metrics for your untrimmed raw fastq files + * FastQC report, containing quality metrics for your untrimmed raw fastq files & trimmed fastq files * `zips/sample_fastqc.zip` * zip file containing the FastQC report, tab-delimited data file and plot images From 356ca90a0775bfe54352d19d70af312540b1353d Mon Sep 17 00:00:00 2001 From: Ignacio Tripodi Date: Fri, 22 Mar 2019 13:34:20 -0600 Subject: [PATCH 14/33] Proper URL for the pileup script --- docs/output.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/output.md b/docs/output.md index fb196610..1944db04 100644 --- a/docs/output.md +++ b/docs/output.md @@ -16,7 +16,7 @@ and processes data using the following steps: * [Samtools](#samtools) - convert the mapped reads as SAM files to BAM format * [Preseq](#preseq) - estimate complexity of the sample * [RSeQC](#rseqc) - analyze read distributions, infer experiment (SE/PE, whether reads need to be flipped), & read duplication -* [Pileup](#pileup) - analyze coverage +* [BBMap](#pileup) - analyze coverage * [bedtools](#bedtools) - create both normalized and non-normalized coverage files in bedGraph format * [igvtools](#igvtools) - create compressed files to visualize the sample in the Integrative Genomics Viewer ([IGV](http://software.broadinstitute.org/software/igv/home)) @@ -119,7 +119,7 @@ If the necessary indices for mapping are not provided/present, a separate proces ## pileup -[Pileup](xxxxxxxxxxxxxxxx) analyzes the sequencing coverage for each sample. +[BBMap](https://github.com/BioInfoTools/BBMap/blob/master/sh/pileup.sh) includes a tool called `pileup`, which analyzes the sequencing coverage for each sample. 
**Output directory: `results/qc/pileup`** From e1b28ba5af6e01f6088c05b781a6750b23757122 Mon Sep 17 00:00:00 2001 From: Alexander Peltzer Date: Wed, 10 Apr 2019 10:14:29 +0200 Subject: [PATCH 15/33] Fixing linting errors and stuff like that --- .travis.yml | 4 ++-- Dockerfile | 2 +- Singularity | 18 ------------------ conf/base.config | 2 -- environment.yml | 2 +- nextflow.config | 12 +++++------- 6 files changed, 9 insertions(+), 31 deletions(-) delete mode 100644 Singularity diff --git a/.travis.yml b/.travis.yml index 117d4a56..d8d2aabb 100644 --- a/.travis.yml +++ b/.travis.yml @@ -11,9 +11,9 @@ before_install: # PRs to master are only ok if coming from dev branch - '[ $TRAVIS_PULL_REQUEST = "false" ] || [ $TRAVIS_BRANCH != "master" ] || ([ $TRAVIS_PULL_REQUEST_SLUG = $TRAVIS_REPO_SLUG ] && [ $TRAVIS_PULL_REQUEST_BRANCH = "dev" ])' # Pull the docker image first so the test doesn't wait for this - - docker pull ignaciot/nascent + - docker pull nfcore/nascent:dev # Fake the tag locally so that the pipeline runs properly - - docker tag ignaciot/nascent ignaciot/nascent:latest + - docker tag nfcore/nascent:dev nfcore/nascent:dev install: # Install Nextflow diff --git a/Dockerfile b/Dockerfile index 0553b09a..bba58f69 100644 --- a/Dockerfile +++ b/Dockerfile @@ -4,4 +4,4 @@ LABEL authors="Ignacio Tripodi (ignacio.tripodi@colorado.edu), Margaret Gruca (m COPY environment.yml / RUN conda env create -f /environment.yml && conda clean -a -ENV PATH /opt/conda/envs/nf-core-nascent-1.0/bin:$PATH +ENV PATH /opt/conda/envs/nf-core-nascent-1.0dev/bin:$PATH diff --git a/Singularity b/Singularity deleted file mode 100644 index 086e81e1..00000000 --- a/Singularity +++ /dev/null @@ -1,18 +0,0 @@ -From:nfcore/base -Bootstrap:docker - -%labels - MAINTAINER Ignacio Tripodi, Margaret Gruca - DESCRIPTION Singularity image containing all requirements for the nf-core/nascent pipeline - VERSION 1.0 - -%environment - PATH=/opt/conda/envs/nf-core-nascent-1.0/bin:$PATH - export PATH - 
-%files - environment.yml / - -%post - /opt/conda/bin/conda env create -f /environment.yml - /opt/conda/bin/conda clean -a diff --git a/conf/base.config b/conf/base.config index ee4dfbe8..5f3d1199 100644 --- a/conf/base.config +++ b/conf/base.config @@ -11,8 +11,6 @@ process { - container = params.container - // TODO nf-core: Check the defaults for all processes cpus = { check_max( 2 * task.attempt, 'cpus' ) } memory = { check_max( 8.GB * task.attempt, 'memory' ) } diff --git a/environment.yml b/environment.yml index c6cb7276..86f898c3 100644 --- a/environment.yml +++ b/environment.yml @@ -1,4 +1,4 @@ -name: nf-core-nascent-1.0 +name: nf-core-nascent-1.0dev channels: - conda-forge - bioconda diff --git a/nextflow.config b/nextflow.config index 48ec7a2c..49227366 100644 --- a/nextflow.config +++ b/nextflow.config @@ -11,10 +11,6 @@ // Global default params, used in configs params { - // Container slug. Stable releases should specify release tag! - // Developmental code should specify :latest - container = 'ignaciot/nascent:latest' - // Workflow flags //reads = "data/*{R1,R2}*.fastq" //fastqs= "data/*{R1,R2}*.fastq" @@ -29,7 +25,6 @@ params { help = false igenomes_base = "./iGenomes" tracedir = "${params.outdir}/pipeline_info" -// nf_required_version = nf_required_version outdir = './results' pairedEnd = false saveAllfq = false @@ -37,9 +32,12 @@ params { saveTrim = false skipMultiQC = false threadfqdump = false -// version = version } +// Container slug. Stable releases should specify release tag! 
+// Developmental code should specify :dev +process.container = 'nfcore/nascent:dev' + profiles { awsbatch { includeConfig 'conf/base.config' @@ -114,7 +112,7 @@ manifest { description = 'Nascent Transcription Processing Pipeline' mainScript = 'main.nf' nextflowVersion = '>=0.32.0' - version = '1.0' + version = '1.0dev' } // Function to ensure that resource requirements don't go beyond From e20a5d94f69e46712e8751524ed505958e31f34d Mon Sep 17 00:00:00 2001 From: Alexander Peltzer Date: Wed, 10 Apr 2019 10:54:37 +0200 Subject: [PATCH 16/33] Initial template commit --- .gitattributes | 1 + .github/CONTRIBUTING.md | 47 +++ .github/ISSUE_TEMPLATE/bug_report.md | 31 ++ .github/ISSUE_TEMPLATE/feature_request.md | 16 + .github/PULL_REQUEST_TEMPLATE.md | 15 + .github/markdownlint.yml | 9 + .gitignore | 7 + .travis.yml | 42 +++ CHANGELOG.md | 4 + CODE_OF_CONDUCT.md | 46 +++ Dockerfile | 7 + LICENSE | 21 ++ README.md | 30 ++ assets/email_template.html | 52 +++ assets/email_template.txt | 34 ++ assets/multiqc_config.yaml | 9 + assets/sendmail_template.txt | 36 ++ bin/markdown_to_html.r | 51 +++ bin/scrape_software_versions.py | 49 +++ conf/awsbatch.config | 18 + conf/base.config | 34 ++ conf/igenomes.config | 147 ++++++++ conf/test.config | 25 ++ docs/README.md | 12 + docs/output.md | 41 +++ docs/usage.md | 282 ++++++++++++++ environment.yml | 11 + main.nf | 428 ++++++++++++++++++++++ nextflow.config | 127 +++++++ 29 files changed, 1632 insertions(+) create mode 100644 .gitattributes create mode 100644 .github/CONTRIBUTING.md create mode 100644 .github/ISSUE_TEMPLATE/bug_report.md create mode 100644 .github/ISSUE_TEMPLATE/feature_request.md create mode 100644 .github/PULL_REQUEST_TEMPLATE.md create mode 100644 .github/markdownlint.yml create mode 100644 .gitignore create mode 100644 .travis.yml create mode 100644 CHANGELOG.md create mode 100644 CODE_OF_CONDUCT.md create mode 100644 Dockerfile create mode 100644 LICENSE create mode 100644 README.md create mode 100644 
assets/email_template.html create mode 100644 assets/email_template.txt create mode 100644 assets/multiqc_config.yaml create mode 100644 assets/sendmail_template.txt create mode 100755 bin/markdown_to_html.r create mode 100755 bin/scrape_software_versions.py create mode 100644 conf/awsbatch.config create mode 100644 conf/base.config create mode 100644 conf/igenomes.config create mode 100644 conf/test.config create mode 100644 docs/README.md create mode 100644 docs/output.md create mode 100644 docs/usage.md create mode 100644 environment.yml create mode 100644 main.nf create mode 100644 nextflow.config diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 00000000..7fe55006 --- /dev/null +++ b/.gitattributes @@ -0,0 +1 @@ +*.config linguist-language=nextflow diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md new file mode 100644 index 00000000..4496ce23 --- /dev/null +++ b/.github/CONTRIBUTING.md @@ -0,0 +1,47 @@ +# nf-core/nascent: Contributing Guidelines + +Hi there! Many thanks for taking an interest in improving nf-core/nascent. + +We try to manage the required tasks for nf-core/nascent using GitHub issues, you probably came to this page when creating one. Please use the pre-filled template to save time. + +However, don't be put off by this template - other more general issues and suggestions are welcome! Contributions to the code are even more welcome ;) + +> If you need help using or modifying nf-core/nascent then the best place to ask is on the pipeline channel on [Slack](https://nf-core-invite.herokuapp.com/). + + + +## Contribution workflow +If you'd like to write some code for nf-core/nascent, the standard workflow +is as follows: + +1. Check that there isn't already an issue about your idea in the + [nf-core/nascent issues](https://github.com/nf-core/nascent/issues) to avoid + duplicating work. + * If there isn't one already, please create one so that others know you're working on this +2. 
Fork the [nf-core/nascent repository](https://github.com/nf-core/nascent) to your GitHub account +3. Make the necessary changes / additions within your forked repository +4. Submit a Pull Request against the `dev` branch and wait for the code to be reviewed and merged. + +If you're not used to this workflow with git, you can start with some [basic docs from GitHub](https://help.github.com/articles/fork-a-repo/) or even their [excellent interactive tutorial](https://try.github.io/). + + +## Tests +When you create a pull request with changes, [Travis CI](https://travis-ci.org/) will run automatic tests. +Typically, pull-requests are only fully reviewed when these tests are passing, though of course we can help out before then. + +There are typically two types of tests that run: + +### Lint Tests +The nf-core has a [set of guidelines](http://nf-co.re/guidelines) which all pipelines must adhere to. +To enforce these and ensure that all pipelines stay in sync, we have developed a helper tool which runs checks on the pipeline code. This is in the [nf-core/tools repository](https://github.com/nf-core/tools) and once installed can be run locally with the `nf-core lint ` command. + +If any failures or warnings are encountered, please follow the listed URL for more documentation. + +### Pipeline Tests +Each nf-core pipeline should be set up with a minimal set of test-data. +Travis CI then runs the pipeline on this data to ensure that it exits successfully. +If there are any failures then the automated tests fail. +These tests are run both with the latest available version of Nextflow and also the minimum required version that is stated in the pipeline code. + +## Getting help +For further information/help, please consult the [nf-core/nascent documentation](https://github.com/nf-core/nascent#documentation) and don't hesitate to get in touch on the pipeline channel on [Slack](https://nf-core-invite.herokuapp.com/).
diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md new file mode 100644 index 00000000..0bdb2be9 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -0,0 +1,31 @@ +Hi there! + +Thanks for telling us about a problem with the pipeline. Please delete this text and anything that's not relevant from the template below: + +#### Describe the bug +A clear and concise description of what the bug is. + +#### Steps to reproduce +Steps to reproduce the behaviour: +1. Command line: `nextflow run ...` +2. See error: _Please provide your error message_ + +#### Expected behaviour +A clear and concise description of what you expected to happen. + +#### System: + - Hardware: [e.g. HPC, Desktop, Cloud...] + - Executor: [e.g. slurm, local, awsbatch...] + - OS: [e.g. CentOS Linux, macOS, Linux Mint...] + - Version [e.g. 7, 10.13.6, 18.3...] + +#### Nextflow Installation: + - Version: [e.g. 0.31.0] + +#### Container engine: + - Engine: [e.g. Conda, Docker or Singularity] + - version: [e.g. 1.0.0] + - Image tag: [e.g. nfcore/nascent:1.0.0] + +#### Additional context +Add any other context about the problem here. diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md new file mode 100644 index 00000000..1f025b77 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature_request.md @@ -0,0 +1,16 @@ +Hi there! + +Thanks for suggesting a new feature for the pipeline! Please delete this text and anything that's not relevant from the template below: + +#### Is your feature request related to a problem? Please describe. +A clear and concise description of what the problem is. +Ex. I'm always frustrated when [...] + +#### Describe the solution you'd like +A clear and concise description of what you want to happen. + +#### Describe alternatives you've considered +A clear and concise description of any alternative solutions or features you've considered. 
+ +#### Additional context +Add any other context about the feature request here. diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md new file mode 100644 index 00000000..621f47ba --- /dev/null +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -0,0 +1,15 @@ +Many thanks for contributing to nf-core/nascent! + +Please fill in the appropriate checklist below (delete whatever is not relevant). These are the most common things requested on pull requests (PRs). + +## PR checklist + - [ ] This comment contains a description of changes (with reason) + - [ ] If you've fixed a bug or added code that should be tested, add tests! + - [ ] If necessary, also make a PR on the [nf-core/nascent branch on the nf-core/test-datasets repo]( https://github.com/nf-core/test-datasets/pull/new/nf-core/nascent) + - [ ] Ensure the test suite passes (`nextflow run . -profile test,docker`). + - [ ] Make sure your code lints (`nf-core lint .`). + - [ ] Documentation in `docs` is updated + - [ ] `CHANGELOG.md` is updated + - [ ] `README.md` is updated + +**Learn more about contributing:** https://github.com/nf-core/nascent/tree/master/.github/CONTRIBUTING.md diff --git a/.github/markdownlint.yml b/.github/markdownlint.yml new file mode 100644 index 00000000..e052a635 --- /dev/null +++ b/.github/markdownlint.yml @@ -0,0 +1,9 @@ +# Markdownlint configuration file +default: true, +line-length: false +no-multiple-blanks: 0 +blanks-around-headers: false +blanks-around-lists: false +header-increment: false +no-duplicate-header: + siblings_only: true diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..5b54e3e6 --- /dev/null +++ b/.gitignore @@ -0,0 +1,7 @@ +.nextflow* +work/ +data/ +results/ +.DS_Store +tests/test_data +*.pyc diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 00000000..3a57c2df --- /dev/null +++ b/.travis.yml @@ -0,0 +1,42 @@ +sudo: required +language: python +jdk: openjdk8 +services: docker +python: '3.6' +cache: pip +matrix: +
fast_finish: true + +before_install: + # PRs to master are only ok if coming from dev branch + - '[ $TRAVIS_PULL_REQUEST = "false" ] || [ $TRAVIS_BRANCH != "master" ] || ([ $TRAVIS_PULL_REQUEST_SLUG = $TRAVIS_REPO_SLUG ] && [ $TRAVIS_PULL_REQUEST_BRANCH = "dev" ])' + # Pull the docker image first so the test doesn't wait for this + - docker pull nfcore/nascent:dev + # Fake the tag locally so that the pipeline runs properly + # Looks weird when this is :dev to :dev, but makes sense when testing code for a release (:dev to :1.0.1) + - docker tag nfcore/nascent:dev nfcore/nascent:dev + +install: + # Install Nextflow + - mkdir /tmp/nextflow && cd /tmp/nextflow + - wget -qO- get.nextflow.io | bash + - sudo ln -s /tmp/nextflow/nextflow /usr/local/bin/nextflow + # Install nf-core/tools + - pip install --upgrade pip + - pip install nf-core + # Reset + - mkdir ${TRAVIS_BUILD_DIR}/tests && cd ${TRAVIS_BUILD_DIR}/tests + # Install markdownlint-cli + - sudo apt-get install npm && npm install -g markdownlint-cli + +env: + - NXF_VER='0.32.0' # Specify a minimum NF version that should be tested and work + - NXF_VER='' # Plus: get the latest NF version and check that it works + +script: + # Lint the pipeline code + - nf-core lint ${TRAVIS_BUILD_DIR} + # Lint the documentation + - markdownlint ${TRAVIS_BUILD_DIR} -c ${TRAVIS_BUILD_DIR}/.github/markdownlint.yml + # Run the pipeline with the test profile + - nextflow run ${TRAVIS_BUILD_DIR} -profile test,docker diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 00000000..a8aacf83 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,4 @@ +# nf-core/nascent: Changelog + +## v1.0dev - [date] +Initial release of nf-core/nascent, created with the [nf-core](http://nf-co.re/) template. 
diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 00000000..09226d0d --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,46 @@ +# Contributor Covenant Code of Conduct + +## Our Pledge + +In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to making participation in our project and our community a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, gender identity and expression, level of experience, nationality, personal appearance, race, religion, or sexual identity and orientation. + +## Our Standards + +Examples of behavior that contributes to creating a positive environment include: + +* Using welcoming and inclusive language +* Being respectful of differing viewpoints and experiences +* Gracefully accepting constructive criticism +* Focusing on what is best for the community +* Showing empathy towards other community members + +Examples of unacceptable behavior by participants include: + +* The use of sexualized language or imagery and unwelcome sexual attention or advances +* Trolling, insulting/derogatory comments, and personal or political attacks +* Public or private harassment +* Publishing others' private information, such as a physical or electronic address, without explicit permission +* Other conduct which could reasonably be considered inappropriate in a professional setting + +## Our Responsibilities + +Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior. 
+ +Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful. + +## Scope + +This Code of Conduct applies both within project spaces and in public spaces when an individual is representing the project or its community. Examples of representing a project or community include using an official project e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. Representation of a project may be further defined and clarified by project maintainers. + +## Enforcement + +Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team on [Slack](https://nf-core-invite.herokuapp.com/). The project team will review and investigate all complaints, and will respond in a way that it deems appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. Further details of specific enforcement policies may be posted separately. + +Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by other members of the project's leadership. 
+ +## Attribution + +This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, available at [http://contributor-covenant.org/version/1/4][version] + +[homepage]: http://contributor-covenant.org +[version]: http://contributor-covenant.org/version/1/4/ diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 00000000..072e5375 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,7 @@ +FROM nfcore/base +LABEL authors="Ignacio Tripodi" \ + description="Docker image containing all requirements for nf-core/nascent pipeline" + +COPY environment.yml / +RUN conda env create -f /environment.yml && conda clean -a +ENV PATH /opt/conda/envs/nf-core-nascent-1.0dev/bin:$PATH diff --git a/LICENSE b/LICENSE new file mode 100644 index 00000000..e2b324dd --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) Ignacio Tripodi + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
diff --git a/README.md b/README.md new file mode 100644 index 00000000..a01b9c80 --- /dev/null +++ b/README.md @@ -0,0 +1,30 @@ +# nf-core/nascent + +**Nascent Transcription Processing Pipeline**. + +[![Build Status](https://travis-ci.com/nf-core/nascent.svg?branch=master)](https://travis-ci.com/nf-core/nascent) +[![Nextflow](https://img.shields.io/badge/nextflow-%E2%89%A50.32.0-brightgreen.svg)](https://www.nextflow.io/) + +[![install with bioconda](https://img.shields.io/badge/install%20with-bioconda-brightgreen.svg)](http://bioconda.github.io/) +[![Docker](https://img.shields.io/docker/automated/nfcore/nascent.svg)](https://hub.docker.com/r/nfcore/nascent) + +## Introduction +The pipeline is built using [Nextflow](https://www.nextflow.io), a workflow tool to run tasks across multiple compute infrastructures in a very portable manner. It comes with docker containers making installation trivial and results highly reproducible. + + +## Documentation +The nf-core/nascent pipeline comes with documentation about the pipeline, found in the `docs/` directory: + +1. [Installation](https://nf-co.re/usage/installation) +2. Pipeline configuration + * [Local installation](https://nf-co.re/usage/local_installation) + * [Adding your own system config](https://nf-co.re/usage/adding_own_config) + * [Reference genomes](https://nf-co.re/usage/reference_genomes) +3. [Running the pipeline](docs/usage.md) +4. [Output and how to interpret the results](docs/output.md) +5. [Troubleshooting](https://nf-co.re/usage/troubleshooting) + + + +## Credits +nf-core/nascent was originally written by Ignacio Tripodi. diff --git a/assets/email_template.html b/assets/email_template.html new file mode 100644 index 00000000..f8fe7553 --- /dev/null +++ b/assets/email_template.html @@ -0,0 +1,52 @@ + + + + + + + + + nf-core/nascent Pipeline Report + + +
+ +

nf-core/nascent v${version}

+

Run Name: $runName

+ +<% if (!success){ + out << """ +
+

nf-core/nascent execution completed unsuccessfully!

+

The exit status of the task that caused the workflow execution to fail was: $exitStatus.

+

The full error message was:

+
${errorReport}
+
+ """ +} else { + out << """ +
+ nf-core/nascent execution completed successfully! +
+ """ +} +%> + +

The workflow was completed at $dateComplete (duration: $duration)

+

The command used to launch the workflow was as follows:

+
$commandLine
+ +

Pipeline Configuration:

+ + + <% out << summary.collect{ k,v -> "" }.join("\n") %> + +
$k
$v
+ +

nf-core/nascent

+

https://github.com/nf-core/nascent

+ +
+ + + diff --git a/assets/email_template.txt b/assets/email_template.txt new file mode 100644 index 00000000..de3d6153 --- /dev/null +++ b/assets/email_template.txt @@ -0,0 +1,34 @@ +======================================== + nf-core/nascent v${version} +======================================== +Run Name: $runName + +<% if (success){ + out << "## nf-core/nascent execution completed successfully! ##" +} else { + out << """#################################################### +## nf-core/nascent execution completed unsuccessfully! ## +#################################################### +The exit status of the task that caused the workflow execution to fail was: $exitStatus. +The full error message was: + +${errorReport} +""" +} %> + + +The workflow was completed at $dateComplete (duration: $duration) + +The command used to launch the workflow was as follows: + + $commandLine + + + +Pipeline Configuration: +----------------------- +<% out << summary.collect{ k,v -> " - $k: $v" }.join("\n") %> + +-- +nf-core/nascent +https://github.com/nf-core/nascent diff --git a/assets/multiqc_config.yaml b/assets/multiqc_config.yaml new file mode 100644 index 00000000..5ec55b92 --- /dev/null +++ b/assets/multiqc_config.yaml @@ -0,0 +1,9 @@ +report_comment: > + This report has been generated by the nf-core/nascent + analysis pipeline. For information about how to interpret these results, please see the + documentation. 
+report_section_order: + nf-core/nascent-software-versions: + order: -1000 + +export_plots: true diff --git a/assets/sendmail_template.txt b/assets/sendmail_template.txt new file mode 100644 index 00000000..2d671220 --- /dev/null +++ b/assets/sendmail_template.txt @@ -0,0 +1,36 @@ +To: $email +Subject: $subject +Mime-Version: 1.0 +Content-Type: multipart/related;boundary="nfcoremimeboundary" + +--nfcoremimeboundary +Content-Type: text/html; charset=utf-8 + +$email_html + +<% +if (mqcFile){ +def mqcFileObj = new File("$mqcFile") +if (mqcFileObj.length() < mqcMaxSize){ +out << """ +--nfcoremimeboundary +Content-Type: text/html; name=\"multiqc_report\" +Content-Transfer-Encoding: base64 +Content-ID: +Content-Disposition: attachment; filename=\"${mqcFileObj.getName()}\" + +${mqcFileObj. + bytes. + encodeBase64(). + toString(). + tokenize( '\n' )*. + toList()*. + collate( 76 )*. + collect { it.join() }. + flatten(). + join( '\n' )} +""" +}} +%> + +--nfcoremimeboundary-- diff --git a/bin/markdown_to_html.r b/bin/markdown_to_html.r new file mode 100755 index 00000000..abe13350 --- /dev/null +++ b/bin/markdown_to_html.r @@ -0,0 +1,51 @@ +#!/usr/bin/env Rscript + +# Command line argument processing +args = commandArgs(trailingOnly=TRUE) +if (length(args) < 2) { + stop("Usage: markdown_to_html.r ", call.=FALSE) +} +markdown_fn <- args[1] +output_fn <- args[2] + +# Load / install packages +if (!require("markdown")) { + install.packages("markdown", dependencies=TRUE, repos='http://cloud.r-project.org/') + library("markdown") +} + +base_css_fn <- getOption("markdown.HTML.stylesheet") +base_css <- readChar(base_css_fn, file.info(base_css_fn)$size) +custom_css <- paste(base_css, " +body { + padding: 3em; + margin-right: 350px; + max-width: 100%; +} +#toc { + position: fixed; + right: 20px; + width: 300px; + padding-top: 20px; + overflow: scroll; + height: calc(100% - 3em - 20px); +} +#toc_header { + font-size: 1.8em; + font-weight: bold; +} +#toc > ul { + padding-left: 0; + 
list-style-type: none; +} +#toc > ul ul { padding-left: 20px; } +#toc > ul > li > a { display: none; } +img { max-width: 800px; } +") + +markdownToHTML( + file = markdown_fn, + output = output_fn, + stylesheet = custom_css, + options = c('toc', 'base64_images', 'highlight_code') +) diff --git a/bin/scrape_software_versions.py b/bin/scrape_software_versions.py new file mode 100755 index 00000000..692176f4 --- /dev/null +++ b/bin/scrape_software_versions.py @@ -0,0 +1,49 @@ +#!/usr/bin/env python +from __future__ import print_function +from collections import OrderedDict +import re + +# TODO nf-core: Add additional regexes for new tools in process get_software_versions +regexes = { + 'nf-core/nascent': ['v_pipeline.txt', r"(\S+)"], + 'Nextflow': ['v_nextflow.txt', r"(\S+)"], + 'FastQC': ['v_fastqc.txt', r"FastQC v(\S+)"], + 'MultiQC': ['v_multiqc.txt', r"multiqc, version (\S+)"], +} +results = OrderedDict() +results['nf-core/nascent'] = 'N/A' +results['Nextflow'] = 'N/A' +results['FastQC'] = 'N/A' +results['MultiQC'] = 'N/A' + +# Search each file using its regex +for k, v in regexes.items(): + with open(v[0]) as x: + versions = x.read() + match = re.search(v[1], versions) + if match: + results[k] = "v{}".format(match.group(1)) + +# Remove software set to false in results +for k in results: + if not results[k]: + del(results[k]) + +# Dump to YAML +print (''' +id: 'software_versions' +section_name: 'nf-core/nascent Software Versions' +section_href: 'https://github.com/nf-core/nascent' +plot_type: 'html' +description: 'are collected at run time from the software output.' +data: | +
+''') +for k,v in results.items(): + print("
{}
{}
".format(k,v)) +print ("
") + +# Write out regexes as csv file: +with open('software_versions.csv', 'w') as f: + for k,v in results.items(): + f.write("{}\t{}\n".format(k,v)) diff --git a/conf/awsbatch.config b/conf/awsbatch.config new file mode 100644 index 00000000..14af5866 --- /dev/null +++ b/conf/awsbatch.config @@ -0,0 +1,18 @@ +/* + * ------------------------------------------------- + * Nextflow config file for running on AWS batch + * ------------------------------------------------- + * Base config needed for running with -profile awsbatch + */ +params { + config_profile_name = 'AWSBATCH' + config_profile_description = 'AWSBATCH Cloud Profile' + config_profile_contact = 'Alexander Peltzer (@apeltzer)' + config_profile_url = 'https://aws.amazon.com/de/batch/' +} + +aws.region = params.awsregion +process.executor = 'awsbatch' +process.queue = params.awsqueue +executor.awscli = '/home/ec2-user/miniconda/bin/aws' +params.tracedir = './' diff --git a/conf/base.config b/conf/base.config new file mode 100644 index 00000000..d495f403 --- /dev/null +++ b/conf/base.config @@ -0,0 +1,34 @@ +/* + * ------------------------------------------------- + * nf-core/nascent Nextflow base config file + * ------------------------------------------------- + * A 'blank slate' config file, appropriate for general + * use on most high performace compute environments. + * Assumes that all software is installed and available + * on the PATH. Runs in `local` mode - all jobs will be + * run on the logged in environment. + */ + +process { + + // TODO nf-core: Check the defaults for all processes + cpus = { check_max( 1 * task.attempt, 'cpus' ) } + memory = { check_max( 8.GB * task.attempt, 'memory' ) } + time = { check_max( 2.h * task.attempt, 'time' ) } + + errorStrategy = { task.exitStatus in [143,137,104,134,139] ? 'retry' : 'finish' } + maxRetries = 1 + maxErrors = '-1' + + // Process-specific resource requirements + // TODO nf-core: Customise requirements for specific processes. 
+ // See https://www.nextflow.io/docs/latest/config.html#config-process-selectors +} + +params { + // Defaults only, expecting to be overwritten + max_memory = 128.GB + max_cpus = 16 + max_time = 240.h + igenomes_base = 's3://ngi-igenomes/igenomes/' +} diff --git a/conf/igenomes.config b/conf/igenomes.config new file mode 100644 index 00000000..d19e61f4 --- /dev/null +++ b/conf/igenomes.config @@ -0,0 +1,147 @@ +/* + * ------------------------------------------------- + * Nextflow config file for iGenomes paths + * ------------------------------------------------- + * Defines reference genomes, using iGenome paths + * Can be used by any config that customises the base + * path using $params.igenomes_base / --igenomes_base + */ + +params { + // illumina iGenomes reference file paths + // TODO nf-core: Add new reference types and strip out those that are not needed + genomes { + 'GRCh37' { + bed12 = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Annotation/Genes/genes.bed" + fasta = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Sequence/WholeGenomeFasta/genome.fa" + gtf = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Annotation/Genes/genes.gtf" + star = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Sequence/STARIndex/" + } + 'GRCm38' { + bed12 = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/Annotation/Genes/genes.bed" + fasta = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/Sequence/WholeGenomeFasta/genome.fa" + gtf = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/Annotation/Genes/genes.gtf" + star = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/Sequence/STARIndex/" + } + 'TAIR10' { + bed12 = "${params.igenomes_base}/Arabidopsis_thaliana/Ensembl/TAIR10/Annotation/Genes/genes.bed" + fasta = "${params.igenomes_base}/Arabidopsis_thaliana/Ensembl/TAIR10/Sequence/WholeGenomeFasta/genome.fa" + gtf = "${params.igenomes_base}/Arabidopsis_thaliana/Ensembl/TAIR10/Annotation/Genes/genes.gtf" + star = 
"${params.igenomes_base}/Arabidopsis_thaliana/Ensembl/TAIR10/Sequence/STARIndex/" + } + 'EB2' { + bed12 = "${params.igenomes_base}/Bacillus_subtilis_168/Ensembl/EB2/Annotation/Genes/genes.bed" + fasta = "${params.igenomes_base}/Bacillus_subtilis_168/Ensembl/EB2/Sequence/WholeGenomeFasta/genome.fa" + gtf = "${params.igenomes_base}/Bacillus_subtilis_168/Ensembl/EB2/Annotation/Genes/genes.gtf" + star = "${params.igenomes_base}/Bacillus_subtilis_168/Ensembl/EB2/Sequence/STARIndex/" + } + 'UMD3.1' { + bed12 = "${params.igenomes_base}/Bos_taurus/Ensembl/UMD3.1/Annotation/Genes/genes.bed" + fasta = "${params.igenomes_base}/Bos_taurus/Ensembl/UMD3.1/Sequence/WholeGenomeFasta/genome.fa" + gtf = "${params.igenomes_base}/Bos_taurus/Ensembl/UMD3.1/Annotation/Genes/genes.gtf" + star = "${params.igenomes_base}/Bos_taurus/Ensembl/UMD3.1/Sequence/STARIndex/" + } + 'WBcel235' { + bed12 = "${params.igenomes_base}/Caenorhabditis_elegans/Ensembl/WBcel235/Annotation/Genes/genes.bed" + fasta = "${params.igenomes_base}/Caenorhabditis_elegans/Ensembl/WBcel235/Sequence/WholeGenomeFasta/genome.fa" + gtf = "${params.igenomes_base}/Caenorhabditis_elegans/Ensembl/WBcel235/Annotation/Genes/genes.gtf" + star = "${params.igenomes_base}/Caenorhabditis_elegans/Ensembl/WBcel235/Sequence/STARIndex/" + } + 'CanFam3.1' { + bed12 = "${params.igenomes_base}/Canis_familiaris/Ensembl/CanFam3.1/Annotation/Genes/genes.bed" + fasta = "${params.igenomes_base}/Canis_familiaris/Ensembl/CanFam3.1/Sequence/WholeGenomeFasta/genome.fa" + gtf = "${params.igenomes_base}/Canis_familiaris/Ensembl/CanFam3.1/Annotation/Genes/genes.gtf" + star = "${params.igenomes_base}/Canis_familiaris/Ensembl/CanFam3.1/Sequence/STARIndex/" + } + 'GRCz10' { + bed12 = "${params.igenomes_base}/Danio_rerio/Ensembl/GRCz10/Annotation/Genes/genes.bed" + fasta = "${params.igenomes_base}/Danio_rerio/Ensembl/GRCz10/Sequence/WholeGenomeFasta/genome.fa" + gtf = "${params.igenomes_base}/Danio_rerio/Ensembl/GRCz10/Annotation/Genes/genes.gtf" + star = 
"${params.igenomes_base}/Danio_rerio/Ensembl/GRCz10/Sequence/STARIndex/" + } + 'BDGP6' { + bed12 = "${params.igenomes_base}/Drosophila_melanogaster/Ensembl/BDGP6/Annotation/Genes/genes.bed" + fasta = "${params.igenomes_base}/Drosophila_melanogaster/Ensembl/BDGP6/Sequence/WholeGenomeFasta/genome.fa" + gtf = "${params.igenomes_base}/Drosophila_melanogaster/Ensembl/BDGP6/Annotation/Genes/genes.gtf" + star = "${params.igenomes_base}/Drosophila_melanogaster/Ensembl/BDGP6/Sequence/STARIndex/" + } + 'EquCab2' { + bed12 = "${params.igenomes_base}/Equus_caballus/Ensembl/EquCab2/Annotation/Genes/genes.bed" + fasta = "${params.igenomes_base}/Equus_caballus/Ensembl/EquCab2/Sequence/WholeGenomeFasta/genome.fa" + gtf = "${params.igenomes_base}/Equus_caballus/Ensembl/EquCab2/Annotation/Genes/genes.gtf" + star = "${params.igenomes_base}/Equus_caballus/Ensembl/EquCab2/Sequence/STARIndex/" + } + 'EB1' { + bed12 = "${params.igenomes_base}/Escherichia_coli_K_12_DH10B/Ensembl/EB1/Annotation/Genes/genes.bed" + fasta = "${params.igenomes_base}/Escherichia_coli_K_12_DH10B/Ensembl/EB1/Sequence/WholeGenomeFasta/genome.fa" + gtf = "${params.igenomes_base}/Escherichia_coli_K_12_DH10B/Ensembl/EB1/Annotation/Genes/genes.gtf" + star = "${params.igenomes_base}/Escherichia_coli_K_12_DH10B/Ensembl/EB1/Sequence/STARIndex/" + } + 'Galgal4' { + bed12 = "${params.igenomes_base}/Gallus_gallus/Ensembl/Galgal4/Annotation/Genes/genes.bed" + fasta = "${params.igenomes_base}/Gallus_gallus/Ensembl/Galgal4/Sequence/WholeGenomeFasta/genome.fa" + gtf = "${params.igenomes_base}/Gallus_gallus/Ensembl/Galgal4/Annotation/Genes/genes.gtf" + star = "${params.igenomes_base}/Gallus_gallus/Ensembl/Galgal4/Sequence/STARIndex/" + } + 'Gm01' { + bed12 = "${params.igenomes_base}/Glycine_max/Ensembl/Gm01/Annotation/Genes/genes.bed" + fasta = "${params.igenomes_base}/Glycine_max/Ensembl/Gm01/Sequence/WholeGenomeFasta/genome.fa" + gtf = "${params.igenomes_base}/Glycine_max/Ensembl/Gm01/Annotation/Genes/genes.gtf" + star = 
"${params.igenomes_base}/Glycine_max/Ensembl/Gm01/Sequence/STARIndex/" + } + 'Mmul_1' { + bed12 = "${params.igenomes_base}/Macaca_mulatta/Ensembl/Mmul_1/Annotation/Genes/genes.bed" + fasta = "${params.igenomes_base}/Macaca_mulatta/Ensembl/Mmul_1/Sequence/WholeGenomeFasta/genome.fa" + gtf = "${params.igenomes_base}/Macaca_mulatta/Ensembl/Mmul_1/Annotation/Genes/genes.gtf" + star = "${params.igenomes_base}/Macaca_mulatta/Ensembl/Mmul_1/Sequence/STARIndex/" + } + 'IRGSP-1.0' { + bed12 = "${params.igenomes_base}/Oryza_sativa_japonica/Ensembl/IRGSP-1.0/Annotation/Genes/genes.bed" + fasta = "${params.igenomes_base}/Oryza_sativa_japonica/Ensembl/IRGSP-1.0/Sequence/WholeGenomeFasta/genome.fa" + gtf = "${params.igenomes_base}/Oryza_sativa_japonica/Ensembl/IRGSP-1.0/Annotation/Genes/genes.gtf" + star = "${params.igenomes_base}/Oryza_sativa_japonica/Ensembl/IRGSP-1.0/Sequence/STARIndex/" + } + 'CHIMP2.1.4' { + bed12 = "${params.igenomes_base}/Pan_troglodytes/Ensembl/CHIMP2.1.4/Annotation/Genes/genes.bed" + fasta = "${params.igenomes_base}/Pan_troglodytes/Ensembl/CHIMP2.1.4/Sequence/WholeGenomeFasta/genome.fa" + gtf = "${params.igenomes_base}/Pan_troglodytes/Ensembl/CHIMP2.1.4/Annotation/Genes/genes.gtf" + star = "${params.igenomes_base}/Pan_troglodytes/Ensembl/CHIMP2.1.4/Sequence/STARIndex/" + } + 'Rnor_6.0' { + bed12 = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_6.0/Annotation/Genes/genes.bed" + fasta = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_6.0/Sequence/WholeGenomeFasta/genome.fa" + gtf = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_6.0/Annotation/Genes/genes.gtf" + star = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_6.0/Sequence/STARIndex/" + } + 'R64-1-1' { + bed12 = "${params.igenomes_base}/Saccharomyces_cerevisiae/Ensembl/R64-1-1/Annotation/Genes/genes.bed" + fasta = "${params.igenomes_base}/Saccharomyces_cerevisiae/Ensembl/R64-1-1/Sequence/WholeGenomeFasta/genome.fa" + gtf = 
"${params.igenomes_base}/Saccharomyces_cerevisiae/Ensembl/R64-1-1/Annotation/Genes/genes.gtf" + star = "${params.igenomes_base}/Saccharomyces_cerevisiae/Ensembl/R64-1-1/Sequence/STARIndex/" + } + 'EF2' { + bed12 = "${params.igenomes_base}/Schizosaccharomyces_pombe/Ensembl/EF2/Annotation/Genes/genes.bed" + fasta = "${params.igenomes_base}/Schizosaccharomyces_pombe/Ensembl/EF2/Sequence/WholeGenomeFasta/genome.fa" + gtf = "${params.igenomes_base}/Schizosaccharomyces_pombe/Ensembl/EF2/Annotation/Genes/genes.gtf" + star = "${params.igenomes_base}/Schizosaccharomyces_pombe/Ensembl/EF2/Sequence/STARIndex/" + } + 'Sbi1' { + bed12 = "${params.igenomes_base}/Sorghum_bicolor/Ensembl/Sbi1/Annotation/Genes/genes.bed" + fasta = "${params.igenomes_base}/Sorghum_bicolor/Ensembl/Sbi1/Sequence/WholeGenomeFasta/genome.fa" + gtf = "${params.igenomes_base}/Sorghum_bicolor/Ensembl/Sbi1/Annotation/Genes/genes.gtf" + star = "${params.igenomes_base}/Sorghum_bicolor/Ensembl/Sbi1/Sequence/STARIndex/" + } + 'Sscrofa10.2' { + bed12 = "${params.igenomes_base}/Sus_scrofa/Ensembl/Sscrofa10.2/Annotation/Genes/genes.bed" + fasta = "${params.igenomes_base}/Sus_scrofa/Ensembl/Sscrofa10.2/Sequence/WholeGenomeFasta/genome.fa" + gtf = "${params.igenomes_base}/Sus_scrofa/Ensembl/Sscrofa10.2/Annotation/Genes/genes.gtf" + star = "${params.igenomes_base}/Sus_scrofa/Ensembl/Sscrofa10.2/Sequence/STARIndex/" + } + 'AGPv3' { + bed12 = "${params.igenomes_base}/Zea_mays/Ensembl/AGPv3/Annotation/Genes/genes.bed" + fasta = "${params.igenomes_base}/Zea_mays/Ensembl/AGPv3/Sequence/WholeGenomeFasta/genome.fa" + gtf = "${params.igenomes_base}/Zea_mays/Ensembl/AGPv3/Annotation/Genes/genes.gtf" + star = "${params.igenomes_base}/Zea_mays/Ensembl/AGPv3/Sequence/STARIndex/" + } + } +} diff --git a/conf/test.config b/conf/test.config new file mode 100644 index 00000000..285d64a4 --- /dev/null +++ b/conf/test.config @@ -0,0 +1,25 @@ +/* + * ------------------------------------------------- + * Nextflow config file for running 
tests + * ------------------------------------------------- + * Defines bundled input files and everything required + * to run a fast and simple test. Use as follows: + * nextflow run nf-core/nascent -profile test + */ + +params { + config_profile_name = 'Test profile' + config_profile_description = 'Minimal test dataset to check pipeline function' + // Limit resources so that this can run on Travis + max_cpus = 2 + max_memory = 6.GB + max_time = 48.h + // Input data + // TODO nf-core: Specify the paths to your test data on nf-core/test-datasets + // TODO nf-core: Give any required params for the test so that command line flags are not needed + singleEnd = false + readPaths = [ + ['Testdata', ['https://github.com/nf-core/test-datasets/raw/exoseq/testdata/Testdata_R1.tiny.fastq.gz', 'https://github.com/nf-core/test-datasets/raw/exoseq/testdata/Testdata_R2.tiny.fastq.gz']], + ['SRR389222', ['https://github.com/nf-core/test-datasets/raw/methylseq/testdata/SRR389222_sub1.fastq.gz', 'https://github.com/nf-core/test-datasets/raw/methylseq/testdata/SRR389222_sub2.fastq.gz']] + ] +} diff --git a/docs/README.md b/docs/README.md new file mode 100644 index 00000000..1bb7e42d --- /dev/null +++ b/docs/README.md @@ -0,0 +1,12 @@ +# nf-core/nascent: Documentation + +The nf-core/nascent documentation is split into the following files: + +1. [Installation](https://nf-co.re/usage/installation) +2. Pipeline configuration + * [Local installation](https://nf-co.re/usage/local_installation) + * [Adding your own system config](https://nf-co.re/usage/adding_own_config) + * [Reference genomes](https://nf-co.re/usage/reference_genomes) +3. [Running the pipeline](usage.md) +4. [Output and how to interpret the results](output.md) +5. 
[Troubleshooting](https://nf-co.re/usage/troubleshooting) diff --git a/docs/output.md b/docs/output.md new file mode 100644 index 00000000..df496c1f --- /dev/null +++ b/docs/output.md @@ -0,0 +1,41 @@ +# nf-core/nascent: Output + +This document describes the output produced by the pipeline. Most of the plots are taken from the MultiQC report, which summarises results at the end of the pipeline. + + + +## Pipeline overview +The pipeline is built using [Nextflow](https://www.nextflow.io/) +and processes data using the following steps: + +* [FastQC](#fastqc) - read quality control +* [MultiQC](#multiqc) - aggregate report, describing results of the whole pipeline + +## FastQC +[FastQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) gives general quality metrics about your reads. It provides information about the quality score distribution across your reads, the per base sequence content (%T/A/G/C). You get information about adapter contamination and other overrepresented sequences. + +For further reading and documentation see the [FastQC help](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/). + +> **NB:** The FastQC plots displayed in the MultiQC report show _untrimmed_ reads. They may contain adapter sequence and potentially regions with low quality. To see how your reads look after trimming, look at the FastQC reports in the `trim_galore` directory. + +**Output directory: `results/fastqc`** + +* `sample_fastqc.html` + * FastQC report, containing quality metrics for your untrimmed raw fastq files +* `zips/sample_fastqc.zip` + * zip file containing the FastQC report, tab-delimited data file and plot images + + +## MultiQC +[MultiQC](http://multiqc.info) is a visualisation tool that generates a single HTML report summarising all samples in your project. Most of the pipeline QC results are visualised in the report and further statistics are available within the report data directory.
+ +The pipeline has special steps which allow the software versions used to be reported in the MultiQC output for future traceability. + +**Output directory: `results/multiqc`** + +* `Project_multiqc_report.html` + * MultiQC report - a standalone HTML file that can be viewed in your web browser +* `Project_multiqc_data/` + * Directory containing parsed statistics from the different tools used in the pipeline + +For more information about how to use MultiQC reports, see [http://multiqc.info](http://multiqc.info) diff --git a/docs/usage.md b/docs/usage.md new file mode 100644 index 00000000..bbef83e2 --- /dev/null +++ b/docs/usage.md @@ -0,0 +1,282 @@ +# nf-core/nascent: Usage + +## Table of contents + + + +* [Table of contents](#table-of-contents) +* [Introduction](#introduction) +* [Running the pipeline](#running-the-pipeline) + * [Updating the pipeline](#updating-the-pipeline) + * [Reproducibility](#reproducibility) +* [Main arguments](#main-arguments) + * [`-profile`](#-profile) + * [`--reads`](#--reads) + * [`--singleEnd`](#--singleend) +* [Reference genomes](#reference-genomes) + * [`--genome` (using iGenomes)](#--genome-using-igenomes) + * [`--fasta`](#--fasta) + * [`--igenomesIgnore`](#--igenomesignore) +* [Job resources](#job-resources) + * [Automatic resubmission](#automatic-resubmission) + * [Custom resource requests](#custom-resource-requests) +* [AWS Batch specific parameters](#aws-batch-specific-parameters) + * [`--awsqueue`](#--awsqueue) + * [`--awsregion`](#--awsregion) +* [Other command line parameters](#other-command-line-parameters) + * [`--outdir`](#--outdir) + * [`--email`](#--email) + * [`-name`](#-name) + * [`-resume`](#-resume) + * [`-c`](#-c) + * [`--custom_config_version`](#--custom_config_version) + * [`--custom_config_base`](#--custom_config_base) + * [`--max_memory`](#--max_memory) + * [`--max_time`](#--max_time) + * [`--max_cpus`](#--max_cpus) + * [`--plaintext_email`](#--plaintext_email) + * [`--monochrome_logs`](#--monochrome_logs) + * 
[`--multiqc_config`](#--multiqc_config) + + + +## Introduction +Nextflow handles job submissions on SLURM or other environments, and supervises running the jobs. Thus the Nextflow process must run until the pipeline is finished. We recommend that you put the process running in the background through `screen` / `tmux` or a similar tool. Alternatively you can run nextflow within a cluster job submitted to your job scheduler. + +It is recommended to limit the Nextflow Java virtual machine's memory. We recommend adding the following line to your environment (typically in `~/.bashrc` or `~/.bash_profile`): + +```bash +NXF_OPTS='-Xms1g -Xmx4g' +``` + + + +## Running the pipeline +The typical command for running the pipeline is as follows: + +```bash +nextflow run nf-core/nascent --reads '*_R{1,2}.fastq.gz' -profile docker +``` + +This will launch the pipeline with the `docker` configuration profile. See below for more information about profiles. + +Note that the pipeline will create the following files in your working directory: + +```bash +work # Directory containing the nextflow working files +results # Finished results (configurable, see below) +.nextflow.log # Log file from Nextflow +# Other nextflow hidden files, eg. history of pipeline runs and old logs. +``` + +### Updating the pipeline +When you run the above command, Nextflow automatically pulls the pipeline code from GitHub and stores it as a cached version. When running the pipeline after this, it will always use the cached version if available - even if the pipeline has been updated since. To make sure that you're running the latest version of the pipeline, make sure that you regularly update the cached version of the pipeline: + +```bash +nextflow pull nf-core/nascent +``` + +### Reproducibility +It's a good idea to specify a pipeline version when running the pipeline on your data. This ensures that a specific version of the pipeline code and software are used when you run your pipeline.
If you keep using the same tag, you'll be running the same version of the pipeline, even if there have been changes to the code since. + +First, go to the [nf-core/nascent releases page](https://github.com/nf-core/nascent/releases) and find the latest version number - numeric only (eg. `1.3.1`). Then specify this when running the pipeline with `-r` (one hyphen) - eg. `-r 1.3.1`. + +This version number will be logged in reports when you run the pipeline, so that you'll know what you used when you look back in the future. + + +## Main arguments + +### `-profile` +Use this parameter to choose a configuration profile. Profiles can give configuration presets for different compute environments. Note that multiple profiles can be loaded, for example: `-profile docker` - the order of arguments is important! + +If `-profile` is not specified at all the pipeline will be run locally and expects all software to be installed and available on the `PATH`. + +* `awsbatch` + * A generic configuration profile to be used with AWS Batch. +* `conda` + * A generic configuration profile to be used with [conda](https://conda.io/docs/) + * Pulls most software from [Bioconda](https://bioconda.github.io/) +* `docker` + * A generic configuration profile to be used with [Docker](http://docker.com/) + * Pulls software from dockerhub: [`nfcore/nascent`](http://hub.docker.com/r/nfcore/nascent/) +* `singularity` + * A generic configuration profile to be used with [Singularity](http://singularity.lbl.gov/) + * Pulls software from DockerHub: [`nfcore/nascent`](http://hub.docker.com/r/nfcore/nascent/) +* `test` + * A profile with a complete configuration for automated testing + * Includes links to test data so needs no other parameters + + + +### `--reads` +Use this to specify the location of your input FastQ files. For example: + +```bash +--reads 'path/to/data/sample_*_{1,2}.fastq' +``` + +Please note the following requirements: + +1. The path must be enclosed in quotes +2. 
The path must have at least one `*` wildcard character +3. When using the pipeline with paired end data, the path must use `{1,2}` notation to specify read pairs. + +If left unspecified, a default pattern is used: `data/*{1,2}.fastq.gz` + +### `--singleEnd` +By default, the pipeline expects paired-end data. If you have single-end data, you need to specify `--singleEnd` on the command line when you launch the pipeline. A normal glob pattern, enclosed in quotation marks, can then be used for `--reads`. For example: + +```bash +--singleEnd --reads '*.fastq' +``` + +It is not possible to run a mixture of single-end and paired-end files in one run. + + +## Reference genomes + +The pipeline config files come bundled with paths to the illumina iGenomes reference index files. If running with docker or AWS, the configuration is set up to use the [AWS-iGenomes](https://ewels.github.io/AWS-iGenomes/) resource. + +### `--genome` (using iGenomes) +There are 31 different species supported in the iGenomes references. To run the pipeline, you must specify which to use with the `--genome` flag. + +You can find the keys to specify the genomes in the [iGenomes config file](../conf/igenomes.config). Common genomes that are supported are: + +* Human + * `--genome GRCh37` +* Mouse + * `--genome GRCm38` +* _Drosophila_ + * `--genome BDGP6` +* _S. cerevisiae_ + * `--genome 'R64-1-1'` + +> There are numerous others - check the config file for more. + +Note that you can use the same configuration setup to save sets of reference files for your own use, even if they are not part of the iGenomes resource. See the [Nextflow documentation](https://www.nextflow.io/docs/latest/config.html) for instructions on where to save such a file. 
+ +The syntax for this reference configuration is as follows: + + + +```nextflow +params { + genomes { + 'GRCh37' { + fasta = '' // Used if no star index given + } + // Any number of additional genomes, key is used with --genome + } +} +``` + + +### `--fasta` +If you prefer, you can specify the full path to your reference genome when you run the pipeline: + +```bash +--fasta '[path to Fasta reference]' +``` + +### `--igenomesIgnore` +Do not load `igenomes.config` when running the pipeline. You may choose this option if you observe clashes between custom parameters and those supplied in `igenomes.config`. + +## Job resources +### Automatic resubmission +Each step in the pipeline has a default set of requirements for number of CPUs, memory and time. For most of the steps in the pipeline, if the job exits with an error code of `143` (exceeded requested resources) it will automatically resubmit with higher requests (2 x original, then 3 x original). If it still fails after three times then the pipeline is stopped. + +### Custom resource requests +Wherever process-specific requirements are set in the pipeline, the default value can be changed by creating a custom config file. See the files hosted at [`nf-core/configs`](https://github.com/nf-core/configs/tree/master/conf) for examples. + +If you are likely to be running `nf-core` pipelines regularly it may be a good idea to request that your custom config file is uploaded to the `nf-core/configs` git repository. Before you do this please can you test that the config file works with your pipeline of choice using the `-c` parameter (see definition below). 
You can then create a pull request to the `nf-core/configs` repository with the addition of your config file, associated documentation file (see examples in [`nf-core/configs/docs`](https://github.com/nf-core/configs/tree/master/docs)), and amending [`nfcore_custom.config`](https://github.com/nf-core/configs/blob/master/nfcore_custom.config) to include your custom profile. + +If you have any questions or issues please send us a message on [Slack](https://nf-core-invite.herokuapp.com/). + +## AWS Batch specific parameters +Running the pipeline on AWS Batch requires a couple of specific parameters to be set according to your AWS Batch configuration. Please use `-profile awsbatch` and then specify all of the following parameters. +### `--awsqueue` +The JobQueue that you intend to use on AWS Batch. +### `--awsregion` +The AWS region to run your job in. Default is set to `eu-west-1` but can be adjusted to your needs. + +Please make sure to also set the `-w/-work-dir` and `--outdir` parameters to an S3 storage bucket of your choice - you'll get an error message notifying you if you didn't. + +## Other command line parameters + + + +### `--outdir` +The output directory where the results will be saved. + +### `--email` +Set this parameter to your e-mail address to get a summary e-mail with details of the run sent to you when the workflow exits. If set in your user config file (`~/.nextflow/config`) then you don't need to specify this on the command line for every run. + +### `-name` +Name for the pipeline run. If not specified, Nextflow will automatically generate a random mnemonic. + +This is used in the MultiQC report (if not default) and in the summary HTML / e-mail (always). + +**NB:** Single hyphen (core Nextflow option) + +### `-resume` +Specify this when restarting a pipeline. Nextflow will use cached results from any pipeline steps where the inputs are the same, continuing from where it got to previously.
+ +You can also supply a run name to resume a specific run: `-resume [run-name]`. Use the `nextflow log` command to show previous run names. + +**NB:** Single hyphen (core Nextflow option) + +### `-c` +Specify the path to a specific config file (this is a core Nextflow option). + +**NB:** Single hyphen (core Nextflow option) + +Note - you can use this to override pipeline defaults. + +### `--custom_config_version` +Provide git commit id for custom Institutional configs hosted at `nf-core/configs`. This was implemented for reproducibility purposes. Default is set to `master`. + +```bash +## Download and use config file with following git commit id +--custom_config_version d52db660777c4bf36546ddb188ec530c3ada1b96 +``` + +### `--custom_config_base` +If you're running offline, nextflow will not be able to fetch the institutional config files +from the internet. If you don't need them, then this is not a problem. If you do need them, +you should download the files from the repo and tell nextflow where to find them with the +`custom_config_base` option. For example: + +```bash +## Download and unzip the config files +cd /path/to/my/configs +wget https://github.com/nf-core/configs/archive/master.zip +unzip master.zip + +## Run the pipeline +cd /path/to/my/data +nextflow run /path/to/pipeline/ --custom_config_base /path/to/my/configs/configs-master/ +``` + +> Note that the nf-core/tools helper package has a `download` command to download all required pipeline +> files + singularity containers + institutional configs in one go for you, to make this process easier. + +### `--max_memory` +Use to set a top-limit for the default memory requirement for each process. +Should be a string in the format integer-unit. eg. `--max_memory '8.GB'` + +### `--max_time` +Use to set a top-limit for the default time requirement for each process. +Should be a string in the format integer-unit. eg.
`--max_time '2.h'` + +### `--max_cpus` +Use to set a top-limit for the default CPU requirement for each process. +Should be a string in the format integer-unit. eg. `--max_cpus 1` + +### `--plaintext_email` +Set to receive plain-text e-mails instead of HTML formatted. + +### `--monochrome_logs` +Set to disable colourful command line output and live life in monochrome. + +### `--multiqc_config` +Specify a path to a custom MultiQC configuration file. diff --git a/environment.yml b/environment.yml new file mode 100644 index 00000000..4fa4152b --- /dev/null +++ b/environment.yml @@ -0,0 +1,11 @@ +# You can use this file to create a conda environment for this pipeline: +# conda env create -f environment.yml +name: nf-core-nascent-1.0dev +channels: + - conda-forge + - bioconda + - defaults +dependencies: + # TODO nf-core: Add required software dependencies here + - fastqc=0.11.8 + - multiqc=1.7 diff --git a/main.nf b/main.nf new file mode 100644 index 00000000..a1d51929 --- /dev/null +++ b/main.nf @@ -0,0 +1,428 @@ +#!/usr/bin/env nextflow +/* +======================================================================================== + nf-core/nascent +======================================================================================== + nf-core/nascent Analysis Pipeline. + #### Homepage / Documentation + https://github.com/nf-core/nascent +---------------------------------------------------------------------------------------- +*/ + + +def helpMessage() { + // TODO nf-core: Add to this help message with new command line parameters + log.info nfcoreHeader() + log.info""" + + Usage: + + The typical command for running the pipeline is as follows: + + nextflow run nf-core/nascent --reads '*_R{1,2}.fastq.gz' -profile docker + + Mandatory arguments: + --reads Path to input data (must be surrounded with quotes) + -profile Configuration profile to use. Can use multiple (comma separated) + Available: conda, docker, singularity, awsbatch, test and more. 
+ + Options: + --genome Name of iGenomes reference + --singleEnd Specifies that the input is single end reads + + References If not specified in the configuration file or you wish to overwrite any of the references. + --fasta Path to Fasta reference + + Other options: + --outdir The output directory where the results will be saved + --email Set this parameter to your e-mail address to get a summary e-mail with details of the run sent to you when the workflow exits + --maxMultiqcEmailFileSize Theshold size for MultiQC report to be attached in notification email. If file generated by pipeline exceeds the threshold, it will not be attached (Default: 25MB) + -name Name for the pipeline run. If not specified, Nextflow will automatically generate a random mnemonic. + + AWSBatch options: + --awsqueue The AWSBatch JobQueue that needs to be set when running on AWSBatch + --awsregion The AWS Region for your AWS Batch job to run on + """.stripIndent() +} + +/* + * SET UP CONFIGURATION VARIABLES + */ + +// Show help emssage +if (params.help){ + helpMessage() + exit 0 +} + +// Check if genome exists in the config file +if (params.genomes && params.genome && !params.genomes.containsKey(params.genome)) { + exit 1, "The provided genome '${params.genome}' is not available in the iGenomes file. Currently the available genomes are ${params.genomes.keySet().join(", ")}" +} + +// TODO nf-core: Add any reference files that are needed +// Configurable reference genomes +fasta = params.genome ? params.genomes[ params.genome ].fasta ?: false : false +if ( params.fasta ){ + fasta = file(params.fasta) + if( !fasta.exists() ) exit 1, "Fasta file not found: ${params.fasta}" +} +// +// NOTE - THIS IS NOT USED IN THIS PIPELINE, EXAMPLE ONLY +// If you want to use the above in a process, define the following: +// input: +// file fasta from fasta +// + + +// Has the run name been specified by the user? 
+// this has the bonus effect of catching both -name and --name +custom_runName = params.name +if( !(workflow.runName ==~ /[a-z]+_[a-z]+/) ){ + custom_runName = workflow.runName +} + + +if( workflow.profile == 'awsbatch') { + // AWSBatch sanity checking + if (!params.awsqueue || !params.awsregion) exit 1, "Specify correct --awsqueue and --awsregion parameters on AWSBatch!" + // Check outdir paths to be S3 buckets if running on AWSBatch + // related: https://github.com/nextflow-io/nextflow/issues/813 + if (!params.outdir.startsWith('s3:')) exit 1, "Outdir not on S3 - specify S3 Bucket to run on AWSBatch!" + // Prevent trace files being stored on S3 since S3 does not support rolling files. NOTE(review): the workflow object has no tracedir property, so the pipeline parameter is checked instead - confirm params.tracedir is defined in nextflow.config. + if (params.tracedir?.toString()?.startsWith('s3:')) exit 1, "Specify a local tracedir or run without trace! S3 cannot be used for tracefiles." +} + +// Stage config files +ch_multiqc_config = Channel.fromPath(params.multiqc_config) +ch_output_docs = Channel.fromPath("$baseDir/docs/output.md") + +/* + * Create a channel for input read files + */ +if(params.readPaths){ + if(params.singleEnd){ + Channel + .from(params.readPaths) + .map { row -> [ row[0], [file(row[1][0])]] } + .ifEmpty { exit 1, "params.readPaths was empty - no input files supplied" } + .into { read_files_fastqc; read_files_trimming } + } else { + Channel + .from(params.readPaths) + .map { row -> [ row[0], [file(row[1][0]), file(row[1][1])]] } + .ifEmpty { exit 1, "params.readPaths was empty - no input files supplied" } + .into { read_files_fastqc; read_files_trimming } + } +} else { + Channel + .fromFilePairs( params.reads, size: params.singleEnd ? 1 : 2 ) + .ifEmpty { exit 1, "Cannot find any reads matching: ${params.reads}\nNB: Path needs to be enclosed in quotes!\nIf this is single-end data, please specify --singleEnd on the command line."
} + .into { read_files_fastqc; read_files_trimming } +} + + +// Header log info +log.info nfcoreHeader() +def summary = [:] +if(workflow.revision) summary['Pipeline Release'] = workflow.revision +summary['Run Name'] = custom_runName ?: workflow.runName +// TODO nf-core: Report custom parameters here +summary['Reads'] = params.reads +summary['Fasta Ref'] = params.fasta +summary['Data Type'] = params.singleEnd ? 'Single-End' : 'Paired-End' +summary['Max Resources'] = "$params.max_memory memory, $params.max_cpus cpus, $params.max_time time per job" +if(workflow.containerEngine) summary['Container'] = "$workflow.containerEngine - $workflow.container" +summary['Output dir'] = params.outdir +summary['Launch dir'] = workflow.launchDir +summary['Working dir'] = workflow.workDir +summary['Script dir'] = workflow.projectDir +summary['User'] = workflow.userName +if(workflow.profile == 'awsbatch'){ + summary['AWS Region'] = params.awsregion + summary['AWS Queue'] = params.awsqueue +} +summary['Config Profile'] = workflow.profile +if(params.config_profile_description) summary['Config Description'] = params.config_profile_description +if(params.config_profile_contact) summary['Config Contact'] = params.config_profile_contact +if(params.config_profile_url) summary['Config URL'] = params.config_profile_url +if(params.email) { + summary['E-mail Address'] = params.email + summary['MultiQC maxsize'] = params.maxMultiqcEmailFileSize +} +log.info summary.collect { k,v -> "${k.padRight(18)}: $v" }.join("\n") +log.info "\033[2m----------------------------------------------------\033[0m" + +// Check the hostnames against configured profiles +checkHostname() + +def create_workflow_summary(summary) { + def yaml_file = workDir.resolve('workflow_summary_mqc.yaml') + yaml_file.text = """ + id: 'nf-core-nascent-summary' + description: " - this information is collected when the pipeline is started." 
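+ section_name: 'nf-core/nascent Workflow Summary' + section_href: 'https://github.com/nf-core/nascent' + plot_type: 'html' + data: | + <dl class=\"dl-horizontal\">
+${summary.collect { k,v -> " <dt>$k</dt><dd><samp>${v ?: '<span style=\"color:#999999;\">N/A</span>'}</samp></dd>" }.join("\n")} + </dl>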
+ """.stripIndent() + + return yaml_file +} + + +/* + * Parse software version numbers + */ +process get_software_versions { + publishDir "${params.outdir}/pipeline_info", mode: 'copy', + saveAs: {filename -> + if (filename.indexOf(".csv") > 0) filename + else null + } + + output: + file 'software_versions_mqc.yaml' into software_versions_yaml + file "software_versions.csv" + + script: + // TODO nf-core: Get all tools to print their version number here + """ + echo $workflow.manifest.version > v_pipeline.txt + echo $workflow.nextflow.version > v_nextflow.txt + fastqc --version > v_fastqc.txt + multiqc --version > v_multiqc.txt + scrape_software_versions.py &> software_versions_mqc.yaml + """ +} + + + +/* + * STEP 1 - FastQC + */ +process fastqc { + tag "$name" + publishDir "${params.outdir}/fastqc", mode: 'copy', + saveAs: {filename -> filename.indexOf(".zip") > 0 ? "zips/$filename" : "$filename"} + + input: + set val(name), file(reads) from read_files_fastqc + + output: + file "*_fastqc.{zip,html}" into fastqc_results + + script: + """ + fastqc -q $reads + """ +} + + + +/* + * STEP 2 - MultiQC + */ +process multiqc { + publishDir "${params.outdir}/MultiQC", mode: 'copy' + + input: + file multiqc_config from ch_multiqc_config + // TODO nf-core: Add in log files from your new processes for MultiQC to find! + file ('fastqc/*') from fastqc_results.collect().ifEmpty([]) + file ('software_versions/*') from software_versions_yaml.collect() + file workflow_summary from create_workflow_summary(summary) + + output: + file "*multiqc_report.html" into multiqc_report + file "*_data" + file "multiqc_plots" + + script: + rtitle = custom_runName ? "--title \"$custom_runName\"" : '' + rfilename = custom_runName ? "--filename " + custom_runName.replaceAll('\\W','_').replaceAll('_+','_') + "_multiqc_report" : '' + // TODO nf-core: Specify which MultiQC modules to use with -m for a faster run time + """ + multiqc -f $rtitle $rfilename --config $multiqc_config . 
+ """ +} + + + +/* + * STEP 3 - Output Description HTML + */ +process output_documentation { + publishDir "${params.outdir}/pipeline_info", mode: 'copy' + + input: + file output_docs from ch_output_docs + + output: + file "results_description.html" + + script: + """ + markdown_to_html.r $output_docs results_description.html + """ +} + + + +/* + * Completion e-mail notification + */ +workflow.onComplete { + + // Set up the e-mail variables + def subject = "[nf-core/nascent] Successful: $workflow.runName" + if(!workflow.success){ + subject = "[nf-core/nascent] FAILED: $workflow.runName" + } + def email_fields = [:] + email_fields['version'] = workflow.manifest.version + email_fields['runName'] = custom_runName ?: workflow.runName + email_fields['success'] = workflow.success + email_fields['dateComplete'] = workflow.complete + email_fields['duration'] = workflow.duration + email_fields['exitStatus'] = workflow.exitStatus + email_fields['errorMessage'] = (workflow.errorMessage ?: 'None') + email_fields['errorReport'] = (workflow.errorReport ?: 'None') + email_fields['commandLine'] = workflow.commandLine + email_fields['projectDir'] = workflow.projectDir + email_fields['summary'] = summary + email_fields['summary']['Date Started'] = workflow.start + email_fields['summary']['Date Completed'] = workflow.complete + email_fields['summary']['Pipeline script file path'] = workflow.scriptFile + email_fields['summary']['Pipeline script hash ID'] = workflow.scriptId + if(workflow.repository) email_fields['summary']['Pipeline repository Git URL'] = workflow.repository + if(workflow.commitId) email_fields['summary']['Pipeline repository Git Commit'] = workflow.commitId + if(workflow.revision) email_fields['summary']['Pipeline Git branch/tag'] = workflow.revision + if(workflow.container) email_fields['summary']['Docker image'] = workflow.container + email_fields['summary']['Nextflow Version'] = workflow.nextflow.version + email_fields['summary']['Nextflow Build'] = 
workflow.nextflow.build + email_fields['summary']['Nextflow Compile Timestamp'] = workflow.nextflow.timestamp + + // TODO nf-core: If not using MultiQC, strip out this code (including params.maxMultiqcEmailFileSize) + // On success try attach the multiqc report + def mqc_report = null + try { + if (workflow.success) { + mqc_report = multiqc_report.getVal() + if (mqc_report.getClass() == ArrayList){ + log.warn "[nf-core/nascent] Found multiple reports from process 'multiqc', will use only one" + mqc_report = mqc_report[0] + } + } + } catch (all) { + log.warn "[nf-core/nascent] Could not attach MultiQC report to summary email" + } + + // Render the TXT template + def engine = new groovy.text.GStringTemplateEngine() + def tf = new File("$baseDir/assets/email_template.txt") + def txt_template = engine.createTemplate(tf).make(email_fields) + def email_txt = txt_template.toString() + + // Render the HTML template + def hf = new File("$baseDir/assets/email_template.html") + def html_template = engine.createTemplate(hf).make(email_fields) + def email_html = html_template.toString() + + // Render the sendmail template + def smail_fields = [ email: params.email, subject: subject, email_txt: email_txt, email_html: email_html, baseDir: "$baseDir", mqcFile: mqc_report, mqcMaxSize: params.maxMultiqcEmailFileSize.toBytes() ] + def sf = new File("$baseDir/assets/sendmail_template.txt") + def sendmail_template = engine.createTemplate(sf).make(smail_fields) + def sendmail_html = sendmail_template.toString() + + // Send the HTML e-mail + if (params.email) { + try { + if( params.plaintext_email ){ throw GroovyException('Send plaintext e-mail, not HTML') } + // Try to send HTML e-mail using sendmail + [ 'sendmail', '-t' ].execute() << sendmail_html + log.info "[nf-core/nascent] Sent summary e-mail to $params.email (sendmail)" + } catch (all) { + // Catch failures and try with plaintext + [ 'mail', '-s', subject, params.email ].execute() << email_txt + log.info "[nf-core/nascent] Sent 
summary e-mail to $params.email (mail)" + } + } + + // Write summary e-mail HTML to a file + def output_d = new File( "${params.outdir}/pipeline_info/" ) + if( !output_d.exists() ) { + output_d.mkdirs() + } + def output_hf = new File( output_d, "pipeline_report.html" ) + output_hf.withWriter { w -> w << email_html } + def output_tf = new File( output_d, "pipeline_report.txt" ) + output_tf.withWriter { w -> w << email_txt } + + c_reset = params.monochrome_logs ? '' : "\033[0m"; + c_purple = params.monochrome_logs ? '' : "\033[0;35m"; + c_green = params.monochrome_logs ? '' : "\033[0;32m"; + c_red = params.monochrome_logs ? '' : "\033[0;31m"; + + if (workflow.stats.ignoredCountFmt > 0 && workflow.success) { + log.info "${c_purple}Warning, pipeline completed, but with errored process(es) ${c_reset}" + log.info "${c_red}Number of ignored errored process(es) : ${workflow.stats.ignoredCountFmt} ${c_reset}" + log.info "${c_green}Number of successfully ran process(es) : ${workflow.stats.succeedCountFmt} ${c_reset}" + } + + if(workflow.success){ + log.info "${c_purple}[nf-core/nascent]${c_green} Pipeline completed successfully${c_reset}" + } else { + checkHostname() + log.info "${c_purple}[nf-core/nascent]${c_red} Pipeline completed with errors${c_reset}" + } + +} + + +def nfcoreHeader(){ + // Log colors ANSI codes + c_reset = params.monochrome_logs ? '' : "\033[0m"; + c_dim = params.monochrome_logs ? '' : "\033[2m"; + c_black = params.monochrome_logs ? '' : "\033[0;30m"; + c_green = params.monochrome_logs ? '' : "\033[0;32m"; + c_yellow = params.monochrome_logs ? '' : "\033[0;33m"; + c_blue = params.monochrome_logs ? '' : "\033[0;34m"; + c_purple = params.monochrome_logs ? '' : "\033[0;35m"; + c_cyan = params.monochrome_logs ? '' : "\033[0;36m"; + c_white = params.monochrome_logs ? 
'' : "\033[0;37m"; + + return """ ${c_dim}----------------------------------------------------${c_reset} + ${c_green},--.${c_black}/${c_green},-.${c_reset} + ${c_blue} ___ __ __ __ ___ ${c_green}/,-._.--~\'${c_reset} + ${c_blue} |\\ | |__ __ / ` / \\ |__) |__ ${c_yellow}} {${c_reset} + ${c_blue} | \\| | \\__, \\__/ | \\ |___ ${c_green}\\`-._,-`-,${c_reset} + ${c_green}`._,._,\'${c_reset} + ${c_purple} nf-core/nascent v${workflow.manifest.version}${c_reset} + ${c_dim}----------------------------------------------------${c_reset} + """.stripIndent() +} + +def checkHostname(){ + def c_reset = params.monochrome_logs ? '' : "\033[0m" + def c_white = params.monochrome_logs ? '' : "\033[0;37m" + def c_red = params.monochrome_logs ? '' : "\033[1;91m" + def c_yellow_bold = params.monochrome_logs ? '' : "\033[1;93m" + if(params.hostnames){ + def hostname = "hostname".execute().text.trim() + params.hostnames.each { prof, hnames -> + hnames.each { hname -> + if(hostname.contains(hname) && !workflow.profile.contains(prof)){ + log.error "====================================================\n" + + " ${c_red}WARNING!${c_reset} You are running with `-profile $workflow.profile`\n" + + " but your machine hostname is ${c_white}'$hostname'${c_reset}\n" + + " ${c_yellow_bold}It's highly recommended that you use `-profile $prof${c_reset}`\n" + + "============================================================" + } + } + } + } +} diff --git a/nextflow.config b/nextflow.config new file mode 100644 index 00000000..bcc5088f --- /dev/null +++ b/nextflow.config @@ -0,0 +1,127 @@ +/* + * ------------------------------------------------- + * nf-core/nascent Nextflow config file + * ------------------------------------------------- + * Default config options for all environments. 
+ */ + +// Global default params, used in configs +params { + + // Workflow flags + // TODO nf-core: Specify your pipeline's command line flags + reads = "data/*{1,2}.fastq.gz" + singleEnd = false + outdir = './results' + + // Boilerplate options + name = false + multiqc_config = "$baseDir/assets/multiqc_config.yaml" + email = false + maxMultiqcEmailFileSize = 25.MB + plaintext_email = false + monochrome_logs = false + help = false + igenomes_base = "./iGenomes" + tracedir = "${params.outdir}/pipeline_info" + awsqueue = false + awsregion = 'eu-west-1' + igenomesIgnore = false + custom_config_version = 'master' + custom_config_base = "https://raw.githubusercontent.com/nf-core/configs/${params.custom_config_version}" + hostnames = false + config_profile_description = false + config_profile_contact = false + config_profile_url = false +} + +// Container slug. Stable releases should specify release tag! +// Developmental code should specify :dev +process.container = 'nfcore/nascent:dev' + +// Load base.config by default for all pipelines +includeConfig 'conf/base.config' + +// Load nf-core custom profiles from different Institutions +try { + includeConfig "${params.custom_config_base}/nfcore_custom.config" +} catch (Exception e) { + System.err.println("WARNING: Could not load nf-core/config profiles: ${params.custom_config_base}/nfcore_custom.config") +} + +profiles { + awsbatch { includeConfig 'conf/awsbatch.config' } + conda { process.conda = "$baseDir/environment.yml" } + debug { process.beforeScript = 'echo $HOSTNAME' } + docker { docker.enabled = true } + singularity { singularity.enabled = true } + test { includeConfig 'conf/test.config' } +} + +// Load igenomes.config if required +if(!params.igenomesIgnore){ + includeConfig 'conf/igenomes.config' +} + +// Capture exit codes from upstream processes when piping +process.shell = ['/bin/bash', '-euo', 'pipefail'] + +timeline { + enabled = true + file = "${params.tracedir}/execution_timeline.html" +} +report { + 
enabled = true + file = "${params.tracedir}/execution_report.html" +} +trace { + enabled = true + file = "${params.tracedir}/execution_trace.txt" +} +dag { + enabled = true + file = "${params.tracedir}/pipeline_dag.svg" +} + +manifest { + name = 'nf-core/nascent' + author = 'Ignacio Tripodi' + homePage = 'https://github.com/nf-core/nascent' + description = 'Nascent Transcription Processing Pipeline' + mainScript = 'main.nf' + nextflowVersion = '>=0.32.0' + version = '1.0dev' +} + +// Function to ensure that resource requirements don't go beyond +// a maximum limit +def check_max(obj, type) { + if(type == 'memory'){ + try { + if(obj.compareTo(params.max_memory as nextflow.util.MemoryUnit) == 1) + return params.max_memory as nextflow.util.MemoryUnit + else + return obj + } catch (all) { + println " ### ERROR ### Max memory '${params.max_memory}' is not valid! Using default value: $obj" + return obj + } + } else if(type == 'time'){ + try { + if(obj.compareTo(params.max_time as nextflow.util.Duration) == 1) + return params.max_time as nextflow.util.Duration + else + return obj + } catch (all) { + println " ### ERROR ### Max time '${params.max_time}' is not valid! Using default value: $obj" + return obj + } + } else if(type == 'cpus'){ + try { + return Math.min( obj, params.max_cpus as int ) + } catch (all) { + println " ### ERROR ### Max cpus '${params.max_cpus}' is not valid! 
Using default value: $obj" + return obj + } + } +} From d798de3a0482b5aff7dc10d1c2019087942fffc9 Mon Sep 17 00:00:00 2001 From: Alexander Peltzer Date: Wed, 10 Apr 2019 11:27:56 +0200 Subject: [PATCH 17/33] Fixing ToDos --- docs/usage.md | 2 -- main.nf | 2 -- 2 files changed, 4 deletions(-) diff --git a/docs/usage.md b/docs/usage.md index 29fae3f0..808042c6 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -193,8 +193,6 @@ Please make sure to also set the `-w/--work-dir` and `--outdir` parameters to a ## Other command line parameters - - ### `--outdir` The output directory where the results will be saved. diff --git a/main.nf b/main.nf index 8a97873f..323871ca 100644 --- a/main.nf +++ b/main.nf @@ -250,7 +250,6 @@ log.info nfcoreHeader() def summary = [:] if(workflow.revision) summary['Pipeline Release'] = workflow.revision summary['Run Name'] = custom_runName ?: workflow.runName -// TODO nf-core: Report custom parameters here summary['Save Reference'] = params.saveReference ? 'Yes' : 'No' if(params.reads) summary['Fastqs'] = params.reads if(params.sras) summary['SRAs'] = params.sras @@ -1122,7 +1121,6 @@ workflow.onComplete { email_fields['summary']['Nextflow Build'] = workflow.nextflow.build email_fields['summary']['Nextflow Compile Timestamp'] = workflow.nextflow.timestamp - // TODO nf-core: If not using MultiQC, strip out this code (including params.maxMultiqcEmailFileSize) // On success try attach the multiqc report def mqc_report = null try { From 9b534710bd2a836f6ae000220e0d2ec5568127da Mon Sep 17 00:00:00 2001 From: Alexander Peltzer Date: Wed, 10 Apr 2019 12:52:54 +0200 Subject: [PATCH 18/33] Delete unused configs --- conf/binac.config | 22 ---------------------- conf/cfc.config | 21 --------------------- conf/slurm.config | 21 --------------------- conf/uzh.config | 19 ------------------- 4 files changed, 83 deletions(-) delete mode 100644 conf/binac.config delete mode 100644 conf/cfc.config delete mode 100644 conf/slurm.config delete mode 100644 
conf/uzh.config diff --git a/conf/binac.config b/conf/binac.config deleted file mode 100644 index be69d9e8..00000000 --- a/conf/binac.config +++ /dev/null @@ -1,22 +0,0 @@ -/* - * ---------------------------------------------------------------------------- - * Nextflow config file for use with Singularity on BINAC cluster in Tuebingen - * ---------------------------------------------------------------------------- - * Defines basic usage limits and singularity image id. - */ - -singularity { - enabled = true -} - -process { - beforeScript = 'module load devel/singularity/3.0.1' - executor = 'pbs' - queue = 'short' -} - -params { - max_memory = 128.GB - max_cpus = 28 - max_time = 48.h -} diff --git a/conf/cfc.config b/conf/cfc.config deleted file mode 100644 index 6285925b..00000000 --- a/conf/cfc.config +++ /dev/null @@ -1,21 +0,0 @@ -/* - * ------------------------------------------------------------- - * Nextflow config file for use with Singularity on CFC at QBIC - * ------------------------------------------------------------- - * Defines basic usage limits and singularity image id. - */ - -singularity { - enabled = true -} - -process { - beforeScript = 'module load qbic/singularity_slurm/3.0.1' - executor = 'slurm' -} - -params { - max_memory = 60.GB - max_cpus = 24 - max_time = 140.h -} diff --git a/conf/slurm.config b/conf/slurm.config deleted file mode 100644 index 3c0c99e2..00000000 --- a/conf/slurm.config +++ /dev/null @@ -1,21 +0,0 @@ -/* - * ------------------------------------------------- - * Nextflow config file for running tests - * ------------------------------------------------- - * Defines bundled input files and everything required - * to run a fast and simple test. Use as follows: - * nextflow run nf-core/methylseq -profile test - */ - -/* Will run data as paired-end by default. 
A minimum usage example is as follows: - * nextflow run main.nf -profile fiji - * Enter nextflow run main.nf -profile fiji --help for more aguments - */ - -params { - // Genome Reference File Pathds - fasta = "COMPLETE_PATH_TO_DIRECTORY_CONTAINING_GENOME_FASTA" - hisat2_indices = "COMPLETE_PATH_TO_DIRECTORY_CONTAINING_HISAT2_INDICES" - genome_refseq = "COMPLETE_PATH_TO_REFSEQ_BEDFILE_FOR_THIS_REFERENCE_GENOME" - -} diff --git a/conf/uzh.config b/conf/uzh.config deleted file mode 100644 index 68cd7dd5..00000000 --- a/conf/uzh.config +++ /dev/null @@ -1,19 +0,0 @@ -/* - * -------------------------------------------------------------------------------- - * Nextflow config file for use with Singularity on University of Zurich Cluster - * -------------------------------------------------------------------------------- - */ - -singularity { - enabled = true -} - -process { - executor = 'slurm' -} - -params { - max_memory = 1800.GB - max_cpus = 112 - max_time = 168.h -} From d4668a25865025a8d3c165af7d7b443252eda428 Mon Sep 17 00:00:00 2001 From: Alexander Peltzer Date: Wed, 10 Apr 2019 13:10:34 +0200 Subject: [PATCH 19/33] Fixing markdown linting errors --- docs/configuration/adding_your_own.md | 86 ------------------ docs/configuration/reference_genomes.md | 49 ---------- docs/installation.md | 115 ------------------------ docs/troubleshooting.md | 28 ------ docs/usage.md | 36 ++++---- 5 files changed, 17 insertions(+), 297 deletions(-) delete mode 100644 docs/configuration/adding_your_own.md delete mode 100644 docs/configuration/reference_genomes.md delete mode 100644 docs/installation.md delete mode 100644 docs/troubleshooting.md diff --git a/docs/configuration/adding_your_own.md b/docs/configuration/adding_your_own.md deleted file mode 100644 index 29a1fde3..00000000 --- a/docs/configuration/adding_your_own.md +++ /dev/null @@ -1,86 +0,0 @@ -# nf-core/nascent: Configuration for other clusters - -It is entirely possible to run this pipeline on other clusters, though 
you will need to set up your own config file so that the pipeline knows how to work with your cluster. - -> If you think that there are other people using the pipeline who would benefit from your configuration (eg. other common cluster setups), please let us know. We can add a new configuration and profile which can used by specifying `-profile ` when running the pipeline. - -If you are the only person to be running this pipeline, you can create your config file as `~/.nextflow/config` and it will be applied every time you run Nextflow. Alternatively, save the file anywhere and reference it when running the pipeline with `-c path/to/config` (see the [Nextflow documentation](https://www.nextflow.io/docs/latest/config.html) for more). - -A basic configuration comes with the pipeline, which runs by default (the `standard` config profile - see [`conf/base.config`](../conf/base.config)). This means that you only need to configure the specifics for your system and overwrite any defaults that you want to change. - -## Cluster Environment -By default, pipeline uses the `local` Nextflow executor - in other words, all jobs are run in the login session. If you're using a simple server, this may be fine. If you're using a compute cluster, this is bad as all jobs will run on the head node. - -To specify your cluster environment, add the following line to your config file: - -```nextflow -process.executor = 'YOUR_SYSTEM_TYPE' -``` - -Many different cluster types are supported by Nextflow. For more information, please see the [Nextflow documentation](https://www.nextflow.io/docs/latest/executor.html). - -Note that you may need to specify cluster options, such as a project or queue. To do so, use the `clusterOptions` config option: - -```nextflow -process { - executor = 'SLURM' - clusterOptions = '-A myproject' -} -``` - - -## Software Requirements -To run the pipeline, several software packages are required. 
How you satisfy these requirements is essentially up to you and depends on your system. If possible, we _highly_ recommend using either Docker or Singularity. - -Please see the [`installation documentation`](../installation.md) for how to run using the below as a one-off. These instructions are about configuring a config file for repeated use. - -### Docker -Docker is a great way to run nf-core/nascent, as it manages all software installations and allows the pipeline to be run in an identical software environment across a range of systems. - -Nextflow has [excellent integration](https://www.nextflow.io/docs/latest/docker.html) with Docker, and beyond installing the two tools, not much else is required - nextflow will automatically fetch the [nfcore/nascent](https://hub.docker.com/r/nfcore/nascent/) image that we have created and is hosted at dockerhub at run time. - -To add docker support to your own config file, add the following: - -```nextflow -docker.enabled = true -process.container = "nfcore/nascent" -``` - -Note that the dockerhub organisation name annoyingly can't have a hyphen, so is `nfcore` and not `nf-core`. - - -### Singularity image -Many HPC environments are not able to run Docker due to security issues. -[Singularity](http://singularity.lbl.gov/) is a tool designed to run on such HPC systems which is very similar to Docker. - -To specify singularity usage in your pipeline config file, add the following: - -```nextflow -singularity.enabled = true -process.container = "shub://nf-core/nascent" -``` - -If you intend to run the pipeline offline, nextflow will not be able to automatically download the singularity image for you. -Instead, you'll have to do this yourself manually first, transfer the image file and then point to that. 
- -First, pull the image file where you have an internet connection: - -```bash -singularity pull --name nf-core-nascent.simg shub://nf-core/nascent -``` - -Then transfer this file and point the config file to the image: - -```nextflow -singularity.enabled = true -process.container = "/path/to/nf-core-nascent.simg" -``` - - -### Conda -If you're not able to use Docker or Singularity, you can instead use conda to manage the software requirements. -To use conda in your own config file, add the following: - -```nextflow -process.conda = "$baseDir/environment.yml" -``` diff --git a/docs/configuration/reference_genomes.md b/docs/configuration/reference_genomes.md deleted file mode 100644 index 5af328fc..00000000 --- a/docs/configuration/reference_genomes.md +++ /dev/null @@ -1,49 +0,0 @@ -# nf-core/nascent: Reference Genomes Configuration - -The nf-core/nascent pipeline needs a reference genome for alignment and annotation. - -These paths can be supplied on the command line at run time (see the [usage docs](../usage.md)), -but for convenience it's often better to save these paths in a nextflow config file. -See below for instructions on how to do this. -Read [Adding your own system](adding_your_own.md) to find out how to set up custom config files. - -## Adding paths to a config file -Specifying long paths every time you run the pipeline is a pain. -To make this easier, the pipeline comes configured to understand reference genome keywords which correspond to preconfigured paths, meaning that you can just specify `--genome ID` when running the pipeline. - -Note that this genome key can also be specified in a config file if you always use the same genome. - -To use this system, add paths to your config file using the following template: - -```nextflow -params { - genomes { - 'YOUR-ID' { - fasta = '/genome.fa' - } - 'OTHER-GENOME' { - // [..] - } - } - // Optional - default genome. 
Ignored if --genome 'OTHER-GENOME' specified on command line - genome = 'YOUR-ID' -} -``` - -You can add as many genomes as you like as long as they have unique IDs. - -## illumina iGenomes -To make the use of reference genomes easier, illumina has developed a centralised resource called [iGenomes](https://support.illumina.com/sequencing/sequencing_software/igenome.html). -Multiple reference index types are held together with consistent structure for multiple genomes. - -We have put a copy of iGenomes up onto AWS S3 hosting and this pipeline is configured to use this by default. -The hosting fees for AWS iGenomes are currently kindly funded by a grant from Amazon. -The pipeline will automatically download the required reference files when you run the pipeline. -For more information about the AWS iGenomes, see https://ewels.github.io/AWS-iGenomes/ - -Downloading the files takes time and bandwidth, so we recommend making a local copy of the iGenomes resource. -Once downloaded, you can customise the variable `params.igenomes_base` in your custom configuration file to point to the reference location. -For example: -```nextflow -params.igenomes_base = '/path/to/data/igenomes/' -``` diff --git a/docs/installation.md b/docs/installation.md deleted file mode 100644 index 94bd13f0..00000000 --- a/docs/installation.md +++ /dev/null @@ -1,115 +0,0 @@ -# nf-core/nascent: Installation - -To start using the nf-core/nascent pipeline, follow the steps below: - -1. [Install Nextflow](#1-install-nextflow) -2. [Install the pipeline](#2-install-the-pipeline) - * [Automatic](#21-automatic) - * [Offline](#22-offline) - * [Development](#23-development) -3. [Pipeline configuration](#3-pipeline-configuration) - * [Software deps: Docker and Singularity](#31-software-deps-docker-and-singularity) - * [Software deps: Bioconda](#32-software-deps-bioconda) - * [Configuration profiles](#33-configuration-profiles) -4. [Reference genomes](#4-reference-genomes) -5. 
[Appendices](#appendices) - * [Running on UPPMAX](#running-on-uppmax) - -## 1) Install NextFlow -Nextflow runs on most POSIX systems (Linux, Mac OSX etc). It can be installed by running the following commands: - -```bash -# Make sure that Java v8+ is installed: -java -version - -# Install Nextflow -curl -fsSL get.nextflow.io | bash - -# Add Nextflow binary to your PATH: -mv nextflow ~/bin/ -# OR system-wide installation: -# sudo mv nextflow /usr/local/bin -``` - -See [nextflow.io](https://www.nextflow.io/) for further instructions on how to install and configure Nextflow. - -## 2) Install the pipeline - -#### 2.1) Automatic -This pipeline itself needs no installation - NextFlow will automatically fetch it from GitHub if `nf-core/nascent` is specified as the pipeline name. - -#### 2.2) Offline -The above method requires an internet connection so that Nextflow can download the pipeline files. If you're running on a system that has no internet connection, you'll need to download and transfer the pipeline files manually: - -```bash -wget https://github.com/nf-core/nascent/archive/master.zip -mkdir -p ~/my-pipelines/nf-core/ -unzip master.zip -d ~/my-pipelines/nf-core/ -cd ~/my_data/ -nextflow run ~/my-pipelines/nf-core/nascent-master -``` - -To stop nextflow from looking for updates online, you can tell it to run in offline mode by specifying the following environment variable in your ~/.bashrc file: - -```bash -export NXF_OFFLINE='TRUE' -``` - -#### 2.3) Development - -If you would like to make changes to the pipeline, it's best to make a fork on GitHub and then clone the files. Once cloned you can run the pipeline directly as above. - - -## 3) Pipeline configuration -By default, the pipeline runs with the `standard` configuration profile. This uses a number of sensible defaults for process requirements and is suitable for running on a simple (if powerful!) basic server. You can see this configuration in [`conf/base.config`](../conf/base.config). 
- -Be warned of two important points about this default configuration: - -1. The default profile uses the `local` executor - * All jobs are run in the login session. If you're using a simple server, this may be fine. If you're using a compute cluster, this is bad as all jobs will run on the head node. - * See the [nextflow docs](https://www.nextflow.io/docs/latest/executor.html) for information about running with other hardware backends. Most job scheduler systems are natively supported. -2. Nextflow will expect all software to be installed and available on the `PATH` - -#### 3.1) Software deps: Docker -First, install docker on your system: [Docker Installation Instructions](https://docs.docker.com/engine/installation/) - -Then, running the pipeline with the option `-profile standard,docker` tells Nextflow to enable Docker for this run. An image containing all of the software requirements will be automatically fetched and used from dockerhub (https://hub.docker.com/r/nfcore/nascent). - -#### 3.1) Software deps: Singularity -If you're not able to use Docker then [Singularity](http://singularity.lbl.gov/) is a great alternative. -The process is very similar: running the pipeline with the option `-profile standard,singularity` tells Nextflow to enable singularity for this run. An image containing all of the software requirements will be automatically fetched and used from singularity hub. - -If running offline with Singularity, you'll need to download and transfer the Singularity image first: - -```bash -singularity pull --name nf-core-nascent.simg shub://nf-core/nascent -``` - -Once transferred, use `-with-singularity` and specify the path to the image file: - -```bash -nextflow run /path/to/nf-core-nascent -with-singularity nf-core-nascent.simg -``` - -Remember to pull updated versions of the singularity image if you update the pipeline. 
- - -#### 3.2) Software deps: conda -If you're not able to use Docker _or_ Singularity, you can instead use conda to manage the software requirements. -This is slower and less reproducible than the above, but is still better than having to install all requirements yourself! -The pipeline ships with a conda environment file and nextflow has built-in support for this. -To use it first ensure that you have conda installed (we recommend [miniconda](https://conda.io/miniconda.html)), then follow the same pattern as above and use the flag `-profile standard,conda` - - -## Appendices - -#### Running on UPPMAX -To run the pipeline on the [Swedish UPPMAX](https://www.uppmax.uu.se/) clusters (`rackham`, `irma`, `bianca` etc), use the command line flag `-profile uppmax`. This tells Nextflow to submit jobs using the SLURM job executor with Singularity for software dependencies. - -Note that you will need to specify your UPPMAX project ID when running a pipeline. To do this, use the command line flag `--project `. The pipeline will exit with an error message if you try to run it pipeline with the default UPPMAX config profile without a project. - -**Optional Extra:** To avoid having to specify your project every time you run Nextflow, you can add it to your personal Nextflow config file instead. Add this line to `~/.nextflow/config`: - -```nextflow -params.project = 'project_ID' // eg. b2017123 -``` diff --git a/docs/troubleshooting.md b/docs/troubleshooting.md deleted file mode 100644 index bbf339a9..00000000 --- a/docs/troubleshooting.md +++ /dev/null @@ -1,28 +0,0 @@ -# nf-core/nascent: Troubleshooting - -## Input files not found - -If only no file, only one input file , or only read one and not read two is picked up then something is wrong with your input file declaration - -1. The path must be enclosed in quotes (`'` or `"`) -2. The path must have at least one `*` wildcard character. This is even if you are only running one paired end sample. -3. 
When using the pipeline with paired end data, the path must use `{1,2}` or `{R1,R2}` notation to specify read pairs. -4. If you are running Single end data make sure to specify `--singleEnd` - -If the pipeline can't find your files then you will get the following error - -``` -ERROR ~ Cannot find any reads matching: *{1,2}.fastq.gz -``` - -Note that if your sample name is "messy" then you have to be very particular with your glob specification. A file name like `L1-1-D-2h_S1_L002_R1_001.fastq.gz` can be difficult enough for a human to read. Specifying `*{1,2}*.gz` wont work give you what you want Whilst `*{R1,R2}*.gz` will. - - -## Data organization -The pipeline can't take a list of multiple input files - it takes a glob expression. If your input files are scattered in different paths then we recommend that you generate a directory with symlinked files. If running in paired end mode please make sure that your files are sensibly named so that they can be properly paired. See the previous point. - -## Extra resources and getting help -If you still have an issue with running the pipeline then feel free to contact us. -Have a look at the [pipeline website](https://github.com/nf-core/nascent) to find out how. - -If you have problems that are related to Nextflow and not our pipeline then check out the [Nextflow gitter channel](https://gitter.im/nextflow-io/nextflow) or the [google group](https://groups.google.com/forum/#!forum/nextflow). 
diff --git a/docs/usage.md b/docs/usage.md index 808042c6..1aca6b3d 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -1,13 +1,12 @@ # nf-core/nascent: Usage ## Table of contents - * [Introduction](#general-nextflow-info) * [Running the pipeline](#running-the-pipeline) * [Updating the pipeline](#updating-the-pipeline) * [Reproducibility](#reproducibility) * [Main arguments](#main-arguments) - * [`-profile`](#-profile-single-dash) + * [`-profile`](#-profile-single-dash) * [`docker`](#docker) * [`awsbatch`](#awsbatch) * [`standard`](#standard) @@ -17,28 +16,27 @@ * [`--reads`](#--reads) * [`--singleEnd`](#--singleend) * [Reference Genomes](#reference-genomes) - * [`--genome`](#--genome) - * [`--fasta`](#--fasta) + * [`--genome`](#--genome) + * [`--fasta`](#--fasta) * [Job Resources](#job-resources) * [Automatic resubmission](#automatic-resubmission) * [Custom resource requests](#custom-resource-requests) * [AWS batch specific parameters](#aws-batch-specific-parameters) - * [`-awsbatch`](#-awsbatch) - * [`--awsqueue`](#--awsqueue) - * [`--awsregion`](#--awsregion) + * [`-awsbatch`](#-awsbatch) + * [`--awsqueue`](#--awsqueue) + * [`--awsregion`](#--awsregion) * [Other command line parameters](#other-command-line-parameters) - * [`--outdir`](#--outdir) - * [`--email`](#--email) - * [`-name`](#-name-single-dash) - * [`-resume`](#-resume-single-dash) - * [`-c`](#-c-single-dash) - * [`--max_memory`](#--max_memory) - * [`--max_time`](#--max_time) - * [`--max_cpus`](#--max_cpus) - * [`--plaintext_emails`](#--plaintext_emails) - * [`--sampleLevel`](#--sampleLevel) - * [`--multiqc_config`](#--multiqc_config) - + * [`--outdir`](#--outdir) + * [`--email`](#--email) + * [`-name`](#-name-single-dash) + * [`-resume`](#-resume-single-dash) + * [`-c`](#-c-single-dash) + * [`--max_memory`](#--max_memory) + * [`--max_time`](#--max_time) + * [`--max_cpus`](#--max_cpus) + * [`--plaintext_emails`](#--plaintext_emails) + * [`--sampleLevel`](#--sampleLevel) + * 
[`--multiqc_config`](#--multiqc_config) ## General Nextflow info Nextflow handles job submissions on SLURM or other environments, and supervises running the jobs. Thus the Nextflow process must run until the pipeline is finished. We recommend that you put the process running in the background through `screen` / `tmux` or similar tool. Alternatively you can run nextflow within a cluster job submitted your job scheduler. From d0db222940772a516c1b16b2b5373ebbebb53030 Mon Sep 17 00:00:00 2001 From: Alexander Peltzer Date: Wed, 10 Apr 2019 13:37:27 +0200 Subject: [PATCH 20/33] Update markdownlint stuff --- README.md | 9 ++------- docs/output.md | 8 ++------ 2 files changed, 4 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index b69505d9..0a61e20c 100644 --- a/README.md +++ b/README.md @@ -16,10 +16,9 @@ The pipeline is built using [Nextflow](https://www.nextflow.io), a workflow tool #### Reference If you've used this pipeline in your research, you can cite this pipeline using DOI xxxxxxxxxxxxxxxxxxx ([OSF project](https://osf.io/xxxxxxxxxxx/)). - + ### Documentation The nf-core/nascent pipeline comes with documentation about the pipeline, found in the `docs/` directory: - 1. [Installation](https://nf-co.re/usage/installation) 2. Pipeline configuration * [Local installation](https://nf-co.re/usage/local_installation) @@ -40,8 +39,7 @@ Edit the appropriate config file, e.g. `conf/slurm_grch38.config`, to ensure the ## Arguments **Required Arguments** - -| Arugment | Usage | Description | +| Argument | Usage | Description | |-----------|----------------------------------|----------------------------------------------------------------------| | -profile | \ | Configuration profile to use. | | --fastqs | \ | Directory pattern for fastq files. | @@ -51,7 +49,6 @@ Edit the appropriate config file, e.g. `conf/slurm_grch38.config`, to ensure the | --email | \ | Where to send workflow report email. 
| **Save Options** - | Arguments | Usage | Description | |------------|---------------|-----------------------------------------------------------| | --outdir | \ | Specifies where to save the output from the nextflow run. | @@ -61,7 +58,6 @@ Edit the appropriate config file, e.g. `conf/slurm_grch38.config`, to ensure the | --skipBAM | | Skips saving BAM files (only save CRAM). Default=False | **Input File Options** - | Arguments | Usage | Description | |--------------|-------------|------------------------------------------------------------------------------| | --singleEnd | | Specifies that the input files are not paired reads (default is paired-end). | @@ -80,6 +76,5 @@ Edit the appropriate config file, e.g. `conf/slurm_grch38.config`, to ensure the | --skipMultiQC | | Skip running MultiQC. | | --skipRSeQC | | Skip running RSeQC. | - ### Credits nf-core/nascent was originally written by Ignacio Tripodi ([@ignaciot](https://github.com/ignaciot)) and Margaret Gruca ([@magruca](https://github.com/magruca)). \ No newline at end of file diff --git a/docs/output.md b/docs/output.md index e188da6d..e017e7f2 100644 --- a/docs/output.md +++ b/docs/output.md @@ -55,7 +55,7 @@ For further reading and documentation see the [FastQC help](http://www.bioinform ## MultiQC -[MultiQC](http://multiqc.info) is a visualisation tool that generates a single HTML report summarising all samples in your project. Most of the pipeline QC results are visualised in the report and further statistics are available in within the report data directory. +[MultiQC](https://multiqc.info) is a visualisation tool that generates a single HTML report summarising all samples in your project. Most of the pipeline QC results are visualised in the report and further statistics are available in within the report data directory. The pipeline has special steps which allow the software versions used to be reported in the MultiQC output for future traceability. 
@@ -66,7 +66,7 @@ The pipeline has special steps which allow the software versions used to be repo * `Project_multiqc_data/` * Directory containing parsed statistics from the different tools used in the pipeline -For more information about how to use MultiQC reports, see http://multiqc.info +For more information about how to use MultiQC reports, see [https://multiqc.info](https://multiqc.info) ## hisat2 @@ -74,10 +74,6 @@ For more information about how to use MultiQC reports, see http://multiqc.info If the necessary indices for mapping are not provided/present, a separate process will build them first. This step can take a few minutes, however it should only be executed once. -**Output directory: none** - - - ## samtools [Samtools](http://www.htslib.org/) is a suite of tools to handle format conversions, among other things, for high-throughput sequencing data. We also use Samtools to generate the list of chromosome sizes, if not provided for the desired reference genome. From 6fe091cdbefede28ee7c8b831ee2e215293f2417 Mon Sep 17 00:00:00 2001 From: Alexander Peltzer Date: Wed, 10 Apr 2019 13:38:47 +0200 Subject: [PATCH 21/33] Better readme markdown --- README.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 0a61e20c..a3360f65 100644 --- a/README.md +++ b/README.md @@ -38,7 +38,7 @@ Edit the appropriate config file, e.g. `conf/slurm_grch38.config`, to ensure the ## Arguments -**Required Arguments** +### Required Arguments | Argument | Usage | Description | |-----------|----------------------------------|----------------------------------------------------------------------| | -profile | \ | Configuration profile to use. | @@ -48,7 +48,7 @@ Edit the appropriate config file, e.g. `conf/slurm_grch38.config`, to ensure the | --workdir | \ | Nextflow working directory where all intermediate files are saved. | | --email | \ | Where to send workflow report email. 
| -**Save Options** +### Save Options | Arguments | Usage | Description | |------------|---------------|-----------------------------------------------------------| | --outdir | \ | Specifies where to save the output from the nextflow run. | @@ -57,24 +57,24 @@ Edit the appropriate config file, e.g. `conf/slurm_grch38.config`, to ensure the | --saveAll | | Compresses and saves all fastq reads. | | --skipBAM | | Skips saving BAM files (only save CRAM). Default=False | -**Input File Options** +### Input File Options | Arguments | Usage | Description | |--------------|-------------|------------------------------------------------------------------------------| | --singleEnd | | Specifies that the input files are not paired reads (default is paired-end). | | --flip | | Reverse complements each strand. Necessary for some library preps. | -**Performance Options** +### Performance Options | Arguments | Usage | Description | |-----------------|-------------|---------------------------------------------------------| | --threadfqdump | | Runs multi-threading for fastq-dump for sra processing. | -**QC Options** +### QC Options | Arguments | Usage | Description | |-----------------|-------------|---------------------------------------------------------| | --skipMultiQC | | Skip running MultiQC. | | --skipRSeQC | | Skip running RSeQC. | -### Credits +## Credits nf-core/nascent was originally written by Ignacio Tripodi ([@ignaciot](https://github.com/ignaciot)) and Margaret Gruca ([@magruca](https://github.com/magruca)). 
\ No newline at end of file From 0613cb117c37d7f0a3e33418cc0897d2bf3731e8 Mon Sep 17 00:00:00 2001 From: Alexander Peltzer Date: Wed, 10 Apr 2019 13:47:15 +0200 Subject: [PATCH 22/33] Fixing markdown lint for now :+1: --- docs/usage.md | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/docs/usage.md b/docs/usage.md index 1aca6b3d..8b6d805d 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -49,6 +49,7 @@ NXF_OPTS='-Xms1g -Xmx4g' ## Running the pipeline The typical command for running the pipeline is as follows: + ```bash nextflow run nf-core/nascent --reads '*_R{1,2}.fastq.gz' -profile standard,docker ``` @@ -82,22 +83,22 @@ This version number will be logged in reports when you run the pipeline, so that ## Main Arguments ### `-profile` -Use this parameter to choose a configuration profile. Profiles can give configuration presets for different compute environments. +Use this parameter to choose a configuration profile. Profiles can give configuration presets for different compute environments. 
* `docker` - * A generic configuration profile to be used with [Docker](http://docker.com/) - * Pulls software from dockerhub: [`nfcore/nascent`](http://hub.docker.com/r/nfcore/nascent/) + * A generic configuration profile to be used with [Docker](http://docker.com/) + * Pulls software from dockerhub: [`nfcore/nascent`](http://hub.docker.com/r/nfcore/nascent/) * `singularity` - * A generic configuration profile to be used with [Singularity](http://singularity.lbl.gov/) - * Pulls software from singularity-hub + * A generic configuration profile to be used with [Singularity](http://singularity.lbl.gov/) + * Pulls software from singularity-hub * `conda` - * A generic configuration profile to be used with [conda](https://conda.io/docs/) - * Pulls most software from [Bioconda](https://bioconda.github.io/) + * A generic configuration profile to be used with [conda](https://conda.io/docs/) + * Pulls most software from [Bioconda](https://bioconda.github.io/) * `awsbatch` - * A generic configuration profile to be used with AWS Batch. + * A generic configuration profile to be used with AWS Batch. * `test` - * A profile with a complete configuration for automated testing - * Includes links to test data so needs no other parameters + * A profile with a complete configuration for automated testing + * Includes links to test data so needs no other parameters ### `--reads` Use this to specify the location of your input FastQ files. 
For example: From f5596d382be8123ae378cfafe4be26de3d6012c0 Mon Sep 17 00:00:00 2001 From: Alexander Peltzer Date: Wed, 10 Apr 2019 14:03:29 +0200 Subject: [PATCH 23/33] Getting tests to run on TravisCI now :-) --- conf/test.config | 6 ++++-- main.nf | 18 +++++++++++++++++- nextflow.config | 6 ++++++ 3 files changed, 27 insertions(+), 3 deletions(-) diff --git a/conf/test.config b/conf/test.config index cbf4521a..2fcc392f 100644 --- a/conf/test.config +++ b/conf/test.config @@ -16,6 +16,8 @@ params { // Input data singleEnd = true threadfqdump = false - reads = "https://raw.githubusercontent.com/nf-core/test-datasets/nascent/testdata/SRR4012402.chr21.fastq" - fasta = "https://raw.githubusercontent.com/nf-core/test-datasets/nascent/reference/chr21.fa" + readPaths = [ + ['SRR4012402', ['https://raw.githubusercontent.com/nf-core/test-datasets/nascent/testdata/SRR4012402.chr21.fastq']], +] + fasta = 'https://raw.githubusercontent.com/nf-core/test-datasets/nascent/reference/chr21.fa' } diff --git a/main.nf b/main.nf index 323871ca..73b24c39 100644 --- a/main.nf +++ b/main.nf @@ -175,6 +175,22 @@ if( workflow.profile == 'awsbatch') { /* * Create a channel for input read files */ + +if(params.readPaths){ + if(params.singleEnd){ + Channel + .from(params.readPaths) + .map { row -> [ row[0], [file(row[1][0])]] } + .ifEmpty { exit 1, "params.readPaths was empty - no input files supplied" } + .into { fastq_reads_qc; fastq_reads_trim; fastq_reads_gzip } + } else { + Channel + .from(params.readPaths) + .map { row -> [ row[0], [file(row[1][0]), file(row[1][1])]] } + .ifEmpty { exit 1, "params.readPaths was empty - no input files supplied" } + .into { fastq_reads_qc; fastq_reads_trim; fastq_reads_gzip } + } +} if (params.reads) { if (params.singleEnd) { fastq_reads_qc = Channel @@ -1061,7 +1077,7 @@ process multiqc { !params.skipMultiQC input: - file multiqc_config + file multiqc_config from ch_multiqc_config.collect() file (fastqc:'qc/fastqc/*') from fastqc_results.collect() file 
('qc/fastqc/*') from trimmed_fastqc_results.collect() file ('qc/trimstats/*') from trim_stats.collect() diff --git a/nextflow.config b/nextflow.config index c228d5e9..01fdaee2 100644 --- a/nextflow.config +++ b/nextflow.config @@ -18,6 +18,12 @@ params { skipMultiQC = false threadfqdump = false + //TODO IGNACIO CHECK + chrom_sizes = 0 + hisat2_indices = 0 + genome_refseq = 0 + sras = 0 + reads = "data/*{R1,R2}*.fastq" singleEnd = true outdir = './results' From 7f611e86e5387d01e0969a77d1df09c23ee2ba9e Mon Sep 17 00:00:00 2001 From: Alexander Peltzer Date: Fri, 12 Apr 2019 10:22:02 +0200 Subject: [PATCH 24/33] More cleaning up --- main.nf | 37 ++----------------------------------- 1 file changed, 2 insertions(+), 35 deletions(-) diff --git a/main.nf b/main.nf index 73b24c39..59607e33 100644 --- a/main.nf +++ b/main.nf @@ -188,10 +188,10 @@ if(params.readPaths){ .from(params.readPaths) .map { row -> [ row[0], [file(row[1][0]), file(row[1][1])]] } .ifEmpty { exit 1, "params.readPaths was empty - no input files supplied" } + .dump() .into { fastq_reads_qc; fastq_reads_trim; fastq_reads_gzip } } -} -if (params.reads) { +} else { if (params.singleEnd) { fastq_reads_qc = Channel .fromPath(params.reads) @@ -210,13 +210,6 @@ if (params.reads) { } } -else { - Channel - .empty() - .into { fastq_reads_qc; fastq_reads_trim; fastq_reads_gzip } - params.reads = null -} - if (params.sras) { if (params.singleEnd) { println("Pattern for SRAs provided") @@ -235,32 +228,6 @@ else { read_files_sra = Channel.empty() } - -/* - * Create a channel for input read files - */ -if(params.readPaths){ - if(params.singleEnd){ - Channel - .from(params.readPaths) - .map { row -> [ row[0], [file(row[1][0])]] } - .ifEmpty { exit 1, "params.readPaths was empty - no input files supplied" } - .into { read_files_fastqc; read_files_trimming } - } else { - Channel - .from(params.readPaths) - .map { row -> [ row[0], [file(row[1][0]), file(row[1][1])]] } - .ifEmpty { exit 1, "params.readPaths was empty - no 
input files supplied" } - .into { read_files_fastqc; read_files_trimming } - } -} else { - Channel - .fromFilePairs( params.reads, size: params.singleEnd ? 1 : 2 ) - .ifEmpty { exit 1, "Cannot find any reads matching: ${params.reads}\nNB: Path needs to be enclosed in quotes!\nIf this is single-end data, please specify --singleEnd on the command line." } - .into { read_files_fastqc; read_files_trimming } -} - - // Header log info log.info nfcoreHeader() def summary = [:] From 1e68451cd8d56caf51af22bdd2e886d2ebe2a56d Mon Sep 17 00:00:00 2001 From: Alexander Peltzer Date: Fri, 12 Apr 2019 11:00:22 +0200 Subject: [PATCH 25/33] Update test config to work with chr21 --- conf/test.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conf/test.config b/conf/test.config index 2fcc392f..60b09634 100644 --- a/conf/test.config +++ b/conf/test.config @@ -17,7 +17,7 @@ params { singleEnd = true threadfqdump = false readPaths = [ - ['SRR4012402', ['https://raw.githubusercontent.com/nf-core/test-datasets/nascent/testdata/SRR4012402.chr21.fastq']], + ['SRR4012402', ['https://raw.githubusercontent.com/nf-core/test-datasets/nascent/testdata/SRR4012402.fastq']], ] fasta = 'https://raw.githubusercontent.com/nf-core/test-datasets/nascent/reference/chr21.fa' } From 514fd5bdc2f6b8b606c98f22b78389d085d53d06 Mon Sep 17 00:00:00 2001 From: Alexander Peltzer Date: Fri, 12 Apr 2019 11:00:36 +0200 Subject: [PATCH 26/33] Get the environment updated for multiqc --- environment.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/environment.yml b/environment.yml index d6ae8d12..93aada3f 100644 --- a/environment.yml +++ b/environment.yml @@ -5,9 +5,9 @@ channels: - defaults dependencies: - fastqc=0.11.8 - - multiqc=1.6 + - multiqc=1.7 - hisat2=2.1.0 - - samtools=1.8 + - samtools=1.9 - preseq=2.0.3 - seqkit=0.9.0 - bedtools=2.25.0 From fcca9e746ebe5ed507976e4ae1777148695a9cef Mon Sep 17 00:00:00 2001 From: Alexander Peltzer Date: Fri, 12 Apr 2019 11:02:12 
+0200 Subject: [PATCH 27/33] Update environment --- environment.yml | 8 ++++---- main.nf | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/environment.yml b/environment.yml index 93aada3f..b8312069 100644 --- a/environment.yml +++ b/environment.yml @@ -9,10 +9,10 @@ dependencies: - hisat2=2.1.0 - samtools=1.9 - preseq=2.0.3 - - seqkit=0.9.0 - - bedtools=2.25.0 - - igvtools=2.3.75 - - bbmap=38.06 + - seqkit=0.10.1 + - bedtools=2.28.0 + - igvtools=2.3.93 + - bbmap=38.22 - fastx_toolkit=0.0.14 - sra-tools=2.9.1 - rseqc=3.0.0 diff --git a/main.nf b/main.nf index 59607e33..69012465 100644 --- a/main.nf +++ b/main.nf @@ -181,6 +181,7 @@ if(params.readPaths){ Channel .from(params.readPaths) .map { row -> [ row[0], [file(row[1][0])]] } + .dump() .ifEmpty { exit 1, "params.readPaths was empty - no input files supplied" } .into { fastq_reads_qc; fastq_reads_trim; fastq_reads_gzip } } else { @@ -533,8 +534,7 @@ process bbduk { refstats=${name}.refstats.txt \ ehist=${name}.ehist.txt """ - } - else if (!params.singleEnd) { + }else if (!params.singleEnd) { """ echo ${name} From 71b67f3d27bbaf2c7f2c58d233d4eb7729d3347e Mon Sep 17 00:00:00 2001 From: Alexander Peltzer Date: Mon, 15 Apr 2019 12:54:08 -0600 Subject: [PATCH 28/33] Update CHANGELOG.md Applied suggestion from PR review. Co-Authored-By: ignaciot --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a8aacf83..c025c0d4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ # nf-core/nascent: Changelog -## v1.0dev - [date] +## v1.0 - 2019-04-16 Initial release of nf-core/nascent, created with the [nf-core](http://nf-co.re/) template. From 9b7d49de45dfbb971a429e86957ad789d819c7cd Mon Sep 17 00:00:00 2001 From: Ignacio Tripodi Date: Mon, 15 Apr 2019 13:01:42 -0600 Subject: [PATCH 29/33] Cleanup: Removed any unnecessary commented out lines. Part of PR #5 change request. 
--- main.nf | 20 +------------------- 1 file changed, 1 insertion(+), 19 deletions(-) diff --git a/main.nf b/main.nf index 69012465..0cb4009c 100644 --- a/main.nf +++ b/main.nf @@ -374,7 +374,6 @@ process sra_dump { /* * PREPROCESSING - Build HISAT2 index (borrowed from nf-core/rnaseq) */ -// TODO: do we need --ss and --exon? probably not, need to check what was the actual hisat2-builder arguments used to generate the indices we have on fiji if(!params.hisat2_indices && params.fasta){ process make_hisat_index { tag "$fasta" @@ -420,21 +419,7 @@ process fastqc { script: prefix = reads.baseName """ -# echo `which gunzip` - fastqc $reads - #extract_fastqc_stats.sh --srr=${prefix} > ${prefix}_stats_fastqc.txt -# GC=\$(gunzip -c "\$(find . -name *_fastqc.zip)" "${prefix}"_fastqc/fastqc_data.txt \ -# | grep "%GC" | grep -o "[0-9]*") -# SEQ=\$(gunzip -c "\$(find . -name *_fastqc.zip)" "${prefix}"_fastqc/fastqc_data.txt | \ -# grep "Total Sequences" | \ -# grep -o "[0-9]*") -# DEDUP=\$(gunzip -c "\$(find . -name *_fastqc.zip)" "${prefix}"_fastqc/fastqc_data.txt | \ -# grep "#Total Deduplicated Percentage" | \ -# grep -o "[0-9,.]*") -# -# echo -e "SRR\t%GC\tTotal_Sequences\t%Total_Deduplicated" > ${prefix}_stats_fastqc.txt -# echo -e "${prefix}""\$(printf "\\t")""\$GC""\$(printf "\\t")""\$SEQ""\$(printf "\\t")""\$DEDUP" >> ${prefix}_stats_fastqc.txt """ } @@ -480,7 +465,6 @@ process bbduk { file "*.txt" into trim_stats script: -// prefix = fastq.baseName bbduk_mem = task.memory.toGiga() if (!params.singleEnd && params.flip) { """ @@ -634,7 +618,6 @@ process hisat2 { // NOTE: this tool sends output there even in successful (exit code 0) // termination, so we have to ignore errors for now, and the next // process will blow up from missing a SAM file instead. - //errorStrategy 'ignore' tag "$name" validExitStatus 0,143 @@ -700,7 +683,7 @@ process samtools { script: prefix = mapped_sam.baseName -// Note that the millionsmapped arugments below are only good for SE data. 
When PE is added, it will need to be changed to: + // Note that the millionsmapped arugments below are only good for SE data. When PE is added, it will need to be changed to: // -F 0x40 rootname.sorted.bam | cut -f1 | sort | uniq | wc -l > rootname.bam.millionsmapped if (!params.singleEnd) { """ @@ -1061,7 +1044,6 @@ process multiqc { rtitle = custom_runName ? "--title \"$custom_runName\"" : '' rfilename = custom_runName ? "--filename " + custom_runName.replaceAll('\\W','_').replaceAll('_+','_') + "_multiqc_report" : '' -//TO DO : Need to build a new multiqc container for the newest version """ export PATH=~/.local/bin:$PATH From 5e0aa886f93a1764c815b782febdb06de59e9daa Mon Sep 17 00:00:00 2001 From: Ignacio Tripodi Date: Mon, 15 Apr 2019 13:11:16 -0600 Subject: [PATCH 30/33] Added DOI from OSF Project. --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index a3360f65..28d9671c 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,7 @@ The pipeline is built using [Nextflow](https://www.nextflow.io), a workflow tool #### Reference -If you've used this pipeline in your research, you can cite this pipeline using DOI xxxxxxxxxxxxxxxxxxx ([OSF project](https://osf.io/xxxxxxxxxxx/)). +If you've used this pipeline in your research, you can cite this pipeline using DOI 10.17605/OSF.IO/SV4UB ([OSF project](https://osf.io/sv4ub/)). ### Documentation The nf-core/nascent pipeline comes with documentation about the pipeline, found in the `docs/` directory: @@ -77,4 +77,4 @@ Edit the appropriate config file, e.g. `conf/slurm_grch38.config`, to ensure the | --skipRSeQC | | Skip running RSeQC. | ## Credits -nf-core/nascent was originally written by Ignacio Tripodi ([@ignaciot](https://github.com/ignaciot)) and Margaret Gruca ([@magruca](https://github.com/magruca)). 
\ No newline at end of file +nf-core/nascent was originally written by Ignacio Tripodi ([@ignaciot](https://github.com/ignaciot)) and Margaret Gruca ([@magruca](https://github.com/magruca)). From 0bae68e8edb83745ee22e78dc2d6e733e2364969 Mon Sep 17 00:00:00 2001 From: Ignacio Tripodi Date: Mon, 15 Apr 2019 20:37:52 -0600 Subject: [PATCH 31/33] Better documentation for some of the optional pipeline arguments. --- docs/usage.md | 16 ++++++++++++++++ nextflow.config | 11 ++++++++++- 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/docs/usage.md b/docs/usage.md index 8b6d805d..4ec32ec3 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -37,6 +37,10 @@ * [`--plaintext_emails`](#--plaintext_emails) * [`--sampleLevel`](#--sampleLevel) * [`--multiqc_config`](#--multiqc_config) + * [`--chrom_sizes`](#--chrom_sizes) + * [`--hisat2_indices`](#--hisat2_indices) + * [`--genome_refseq`](#--genome_refseq) + * [`--sras`](#--sras) ## General Nextflow info Nextflow handles job submissions on SLURM or other environments, and supervises running the jobs. Thus the Nextflow process must run until the pipeline is finished. We recommend that you put the process running in the background through `screen` / `tmux` or similar tool. Alternatively you can run nextflow within a cluster job submitted your job scheduler. @@ -267,3 +271,15 @@ Set to disable colourful command line output and live life in monochrome. ### `--multiqc_config` Specify a path to a custom MultiQC configuration file. + +### `--chrom_sizes` +Specify a path to a file listing the number of nucleotides on each chromosome, for the reference genome in question. + +### `--hisat2_indices` +Specify a path to the Hisat2 index directory. If not provided, these indices will be generated the first time this pipeline is executed. + +### `--genome_refseq` +Specify a path to the RefSeq genome annotation file. Optional, but useful to collect stats via RSeQC. 
+ +### `--sras` +Specify a path to a directory (can use regular expressions) containing SRR files obtained from the Gene Expression Omnibus (GEO) platform. This is an alternative to providing fastq files if re-analyzing existing public datasets. diff --git a/nextflow.config b/nextflow.config index 01fdaee2..8ee5db76 100644 --- a/nextflow.config +++ b/nextflow.config @@ -18,10 +18,19 @@ params { skipMultiQC = false threadfqdump = false - //TODO IGNACIO CHECK + // File listing the number of nucleotides per chromosome for the reference genome used. + // Will be generated the first time each genome is used to process datasets. chrom_sizes = 0 + + // Path to the Hisat2 index directory. If not provided, these indices will be generated + // the first time this pipeline is executed. hisat2_indices = 0 + + // Path to the RefSeq genome annotation file. Optional, but useful to collect stats via RSeQC. genome_refseq = 0 + + // Path to SRR files obtained from the Gene Expression Omnibus (GEO) platform. This is an + // alternative to providing fastq files if re-analyzing existing public datasets. 
sras = 0 reads = "data/*{R1,R2}*.fastq" From cac526b11698baa5a97576aa9db25ef5d860412c Mon Sep 17 00:00:00 2001 From: Ignacio Tripodi Date: Mon, 15 Apr 2019 21:26:00 -0600 Subject: [PATCH 32/33] Version bump to 1.0 --- .travis.yml | 2 +- Dockerfile | 2 +- environment.yml | 2 +- nextflow.config | 4 ++-- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.travis.yml b/.travis.yml index 3a57c2df..b88805da 100644 --- a/.travis.yml +++ b/.travis.yml @@ -14,7 +14,7 @@ before_install: - docker pull nfcore/nascent:dev # Fake the tag locally so that the pipeline runs properly # Looks weird when this is :dev to :dev, but makes sense when testing code for a release (:dev to :1.0.1) - - docker tag nfcore/nascent:dev nfcore/nascent:dev + - docker tag nfcore/nascent:dev nfcore/nascent:1.0 install: # Install Nextflow diff --git a/Dockerfile b/Dockerfile index bba58f69..0553b09a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -4,4 +4,4 @@ LABEL authors="Ignacio Tripodi (ignacio.tripodi@colorado.edu), Margaret Gruca (m COPY environment.yml / RUN conda env create -f /environment.yml && conda clean -a -ENV PATH /opt/conda/envs/nf-core-nascent-1.0dev/bin:$PATH +ENV PATH /opt/conda/envs/nf-core-nascent-1.0/bin:$PATH diff --git a/environment.yml b/environment.yml index b8312069..3f2240bc 100644 --- a/environment.yml +++ b/environment.yml @@ -1,4 +1,4 @@ -name: nf-core-nascent-1.0dev +name: nf-core-nascent-1.0 channels: - conda-forge - bioconda diff --git a/nextflow.config b/nextflow.config index 8ee5db76..c253a402 100644 --- a/nextflow.config +++ b/nextflow.config @@ -60,7 +60,7 @@ params { // Container slug. Stable releases should specify release tag! 
// Developmental code should specify :dev -process.container = 'nfcore/nascent:dev' +process.container = 'nfcore/nascent:1.0' // Load base.config by default for all pipelines includeConfig 'conf/base.config' @@ -113,7 +113,7 @@ manifest { description = 'Nascent Transcription Processing Pipeline' mainScript = 'main.nf' nextflowVersion = '>=0.32.0' - version = '1.0dev' + version = '1.0' } // Function to ensure that resource requirements don't go beyond From da15e660010e4d30857364b8468776b0f286cba9 Mon Sep 17 00:00:00 2001 From: Ignacio Tripodi Date: Mon, 15 Apr 2019 23:11:17 -0600 Subject: [PATCH 33/33] Additional well-deserved credits --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 28d9671c..580df378 100644 --- a/README.md +++ b/README.md @@ -78,3 +78,5 @@ Edit the appropriate config file, e.g. `conf/slurm_grch38.config`, to ensure the ## Credits nf-core/nascent was originally written by Ignacio Tripodi ([@ignaciot](https://github.com/ignaciot)) and Margaret Gruca ([@magruca](https://github.com/magruca)). + +Many thanks to the nf-core team and all who provided invaluable feedback and assistance along the way, particularly to [@apeltzer](https://github.com/apeltzer), [@ewels](https://github.com/ewels), [@drpatelh](https://github.com/drpatelh), and [@pditommaso](https://github.com/pditommaso).