Skip to content

Commit

Permalink
Add support for pod5 files generated via dorado basecaller (#368)
Browse files Browse the repository at this point in the history
* Add support for pod5 files generated via dorado basecaller

* remove unnecessary newline to trigger PR checks again

* Add support for full dorado basecaller based structure

* Update JD according to Code Review
  • Loading branch information
Steffengreiner authored Nov 3, 2023
1 parent bc2d0d5 commit affe710
Show file tree
Hide file tree
Showing 18 changed files with 1,480 additions and 27 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ Make sure, that you have defined the Github package Maven repository, in order f

A Nanopore NGS measurement output is delivered to us as a nested folder structure, following this model:

![Nanopore Data Structure Model](./doc/figures/Nanopore_Data_Structure_Model.svg)
![Nanopore Data Structure Model](./doc/figures/Nanopore_Data_Structure_Model.png)


#### Nanopore usage example
Expand Down
Binary file added doc/figures/Nanopore_Data_Structure_Model.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
4 changes: 0 additions & 4 deletions doc/figures/Nanopore_Data_Structure_Model.svg

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -244,6 +244,7 @@ final class OxfordNanoporeExperiment implements ExperimentFolder {
FAST5_FILE(FQDN_FILES + ".Fast5File"),
FASTQ_FILE(FQDN_FILES + ".FastQFile"),
FASTQ_ZIPPED_FILE(FQDN_FILES + ".FastQZippedFile"),
POD5_FILE(FQDN_FILES + ".Pod5File"),
FINAL_SUMMARY_LOG(FQDN_FILES + ".FinalSummaryLog"),
MUX_SCAN_DATA_LOG(FQDN_FILES + ".MuxScanDataLog"),
REPORT_MD_LOG(FQDN_FILES + ".ReportMdLog"),
Expand Down Expand Up @@ -299,10 +300,14 @@ final class OxfordNanoporeExperiment implements ExperimentFolder {
FASTQ_FOLDER(FQDN_FOLDERS + ".FastQFolder"),
FAST5_PASS_FOLDER(FQDN_FOLDERS + ".Fast5PassFolder"),
FAST5_FAIL_FOLDER(FQDN_FOLDERS + ".Fast5FailFolder"),
FAST5_SKIP_FOLDER(FQDN_FOLDERS + ".Fast5SkipFolder"),
FASTQ_PASS_FOLDER(FQDN_FOLDERS + ".FastQPassFolder"),
FASTQ_FAIL_FOLDER(FQDN_FOLDERS + ".FastQFailFolder"),
UNCLASSIFIED_FAST5_FOLDER(FQDN_FOLDERS + ".UnclassifiedFast5Folder"),
UNCLASSIFIED_FASTQ_FOLDER(FQDN_FOLDERS + ".UnclassifiedFastQFolder"),
POD5_PASS_FOLDER(FQDN_FOLDERS + ".Pod5PassFolder"),
POD5_FAIL_FOLDER(FQDN_FOLDERS + ".Pod5FailFolder"),
POD5_SKIP_FOLDER(FQDN_FOLDERS + ".Pod5SkipFolder"),
OTHER_REPORTS_FOLDER(FQDN_FOLDERS + ".OtherReportsFolder"),
BASECALLING_FOLDER(FQDN_FOLDERS + ".BasecallingFolder"),

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -57,13 +57,13 @@ final class OxfordNanoporeMeasurement {
this.pooledSamplesMeasurement = containsAtLeastOneBarcodedFolder(folders["fast5pass"])
// There can be still pooled samples in the failed folder, worst case is all
// samples failed, so we need to check there too
if (! pooledSamplesMeasurement) {
if (!pooledSamplesMeasurement) {
this.pooledSamplesMeasurement = containsAtLeastOneBarcodedFolder(folders["fast5fail"])
}
}

private void assessBasecallingStatus() {
this.hasBasecallingData = folders["basecalling"];
this.hasBasecallingData = folders["basecalling"]
}

private static boolean containsAtLeastOneBarcodedFolder(DataFolder folder) {
Expand All @@ -84,12 +84,24 @@ final class OxfordNanoporeMeasurement {
case Fast5FailFolder:
folders["fast5fail"] = element as Fast5FailFolder
break
case Fast5SkipFolder:
folders["fast5skip"] = element as Fast5SkipFolder
break
case FastQPassFolder:
folders["fastqpass"] = element as FastQPassFolder
break
case FastQFailFolder:
folders["fastqfail"] = element as FastQFailFolder
break
case Pod5PassFolder:
folders["pod5pass"] = element as Pod5PassFolder
break
case Pod5FailFolder:
folders["pod5fail"] = element as Pod5FailFolder
break
case Pod5SkipFolder:
folders["pod5skip"] = element as Pod5SkipFolder
break
case DataFile:
logFilesCollection.add(element as DataFile)
break
Expand All @@ -101,24 +113,38 @@ final class OxfordNanoporeMeasurement {
}

private void assessState() throws IllegalStateException {
// Condition one: Don't allow Fast5 pass and fail folder are empty
assessFast5Content()
// Condition two: Don't allow Fastq pass and fail folder are empty
assessFastQContent()
boolean isValid = false
// We need to ensure that fastq and fast5 information is provided if guppy basecaller was used
if (areFast5FoldersInMeasurement() && areFastQFoldersInMeasurement()) {
isValid = true
}
// We need to ensure that pod5_skip or fast5_skip information is provided if dorado basecaller was used
if (arePod5FoldersInMeasurement()) {
isValid = true
}
if (isValid == false) {
throw new IllegalStateException("No valid data is contained in measurement")
}
}

private void assessFast5Content() throws IllegalStateException {
if (folders["fast5pass"].getChildren().isEmpty() && folders["fast5fail"].getChildren()
.isEmpty()) {
throw new IllegalStateException("The fast5 pass folder and fail folder are empty.")
}
/**
 * Condition one: at least one of the fast5 pass and fast5 fail folders
 * must be present and contain children.
 *
 * @return true if the "fast5pass" or the "fast5fail" folder is registered and non-empty
 */
private boolean areFast5FoldersInMeasurement() {
    if (isDataFolderInMeasurement("fast5pass")) {
        return true
    }
    return isDataFolderInMeasurement("fast5fail")
}
/**
 * Condition two: at least one of the fastq pass and fastq fail folders
 * must be present and contain children.
 *
 * @return true if the "fastqpass" or the "fastqfail" folder is registered and non-empty
 */
private boolean areFastQFoldersInMeasurement() {
    if (isDataFolderInMeasurement("fastqpass")) {
        return true
    }
    return isDataFolderInMeasurement("fastqfail")
}
/**
 * Condition three: at least one of the fast5 skip and pod5 skip folders
 * must be present and contain children.
 *
 * Note that, despite the method name, only the two skip folders are
 * inspected here; the pod5 pass and pod5 fail folders are not consulted.
 *
 * @return true if the "fast5skip" or the "pod5skip" folder is registered and non-empty
 */
private boolean arePod5FoldersInMeasurement() {
    if (isDataFolderInMeasurement("fast5skip")) {
        return true
    }
    return isDataFolderInMeasurement("pod5skip")
}

private void assessFastQContent() throws IllegalStateException {
if (folders["fastqpass"].getChildren().isEmpty() && folders["fastqfail"].getChildren()
.isEmpty()) {
throw new IllegalStateException("The fastq pass folder and fail folder are empty.")
/**
 * Checks whether a folder is registered under the given key and holds at least one child.
 *
 * @param string the key of the folder in the folders collection, e.g. "fast5pass"
 * @return false if no folder is registered under the key or the folder has no children, true otherwise
 */
private boolean isDataFolderInMeasurement(String string) {
    def registeredFolder = folders[string]
    // A missing folder counts as "not in measurement", same as an empty one
    return registeredFolder != null && !registeredFolder.getChildren().isEmpty()
}

/**
Expand Down Expand Up @@ -284,12 +310,25 @@ final class OxfordNanoporeMeasurement {
private Map<String, Map<String, DataFolder>> prepareRawData(String sampleId) {
final def result = new HashMap()
final def dataFolders = [
"fast5fail": (folders.get("fast5fail") as DataFolder),
"fast5pass": (folders.get("fast5pass") as DataFolder),
"fastqpass": (folders.get("fastqpass") as DataFolder),
"fastqfail": (folders.get("fastqfail") as DataFolder)
"fast5fail" : (folders.get("fast5fail") as DataFolder),
"fast5pass" : (folders.get("fast5pass") as DataFolder),
"fastqpass" : (folders.get("fastqpass") as DataFolder),
"fastqfail" : (folders.get("fastqfail") as DataFolder)
]
if(hasBasecallingData) dataFolders.put("basecalling", (folders.get("basecalling") as DataFolder))
if (hasBasecallingData) dataFolders.put("basecalling", (folders.get("basecalling") as DataFolder))
//Only add dorado based minimal required datafolders if present
if (folders.get("fast5skip") != null) {
dataFolders.put("fast5skip", (folders.get("fast5skip") as DataFolder))
}
if (folders.get("pod5skip") != null) {
dataFolders.put("pod5skip", (folders.get("pod5skip") as DataFolder))
}
if (folders.get("pod5fail") != null) {
dataFolders.put("pod5fail", (folders.get("pod5fail") as DataFolder))
}
if (folders.get("pod5pass") != null) {
dataFolders.put("pod5pass", (folders.get("pod5pass") as DataFolder))
}
result.put(sampleId, dataFolders)
return result
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
package life.qbic.datamodel.datasets.datastructure.files.nanopore

import life.qbic.datamodel.datasets.datastructure.files.DataFile

/**
 * A specialisation of a DataFile, represents an Oxford Nanopore pod5 file.
 *
 * The file name is validated on construction and has to end with the
 * <code>.pod5</code> extension.
 */
class Pod5File extends DataFile {

    /** File type that is handed to the DataFile constructor. */
    final private static String FILE_TYPE = "pod5"

    /** Regular expression a valid pod5 file name has to satisfy. */
    final private static String NAME_SCHEMA = /.*\.pod5$/

    /**
     * Creates a pod5 file representation and validates its name.
     *
     * @param name The file name, must end with ".pod5"
     * @param relativePath The relative path of the file
     * @throws IllegalArgumentException if the name does not match the pod5 file schema
     */
    protected Pod5File(String name, String relativePath) {
        super(name, relativePath, FILE_TYPE)
        validateName()
    }

    /**
     * Creates a new instance of a Pod5File object
     *
     * @param name The file name, must end with ".pod5"
     * @param relativePath The relative path of the file
     * @return A new instance of a Pod5File object
     * @throws IllegalArgumentException if the name does not match the pod5 file schema
     */
    static Pod5File create(String name, String relativePath) {
        return new Pod5File(name, relativePath)
    }

    /**
     * Ensures that the file name satisfies the pod5 naming schema.
     *
     * @throws IllegalArgumentException if the name does not match the schema
     */
    private void validateName() {
        if (!(this.name =~ NAME_SCHEMA)) {
            // The message names the pod5 schema; it previously referred to the summary file schema
            throw new IllegalArgumentException("Name must match the Nanopore pod5 file schema!")
        }
    }

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
package life.qbic.datamodel.datasets.datastructure.folders.nanopore

import life.qbic.datamodel.datasets.datastructure.files.nanopore.Fast5File
import life.qbic.datamodel.datasets.datastructure.folders.DataFolder

/**
 * A special case of a DataFolder that represents the fast5_skip directory
 * of an Oxford Nanopore measurement.
 *
 * Its children are passed as {@code List<Fast5File>}.
 *
 * The folder name is validated on construction with Groovy's find operator,
 * so every name that contains "fast5_skip" is accepted.
 */
class Fast5SkipFolder extends DataFolder {

    /** Regular expression the folder name has to contain. */
    final private static String NAME_SCHEMA = /fast5_skip/

    protected Fast5SkipFolder() {}

    /**
     * Creates the folder and validates its name.
     *
     * @param name The folder name
     * @param relativePath The relative path of the folder
     * @param children A list with child elements of the folder
     * @throws IllegalArgumentException if the name does not match the directory schema
     */
    protected Fast5SkipFolder(String name, String relativePath, List<Fast5File> children) {
        super(name, relativePath, children)
        validateName()
    }

    /**
     * Creates a new instance of a Fast5SkipFolder object
     *
     * @param name The folder name
     * @param relativePath The relative path of the folder
     * @param children A list with child elements of the folder
     * @return A new instance of a Fast5SkipFolder object
     * @throws IllegalArgumentException if the name does not match the directory schema
     */
    static Fast5SkipFolder create(String name, String relativePath, List<Fast5File> children) {
        new Fast5SkipFolder(name, relativePath, children)
    }

    /**
     * Rejects folder names that do not satisfy the fast5_skip naming schema.
     */
    private void validateName() {
        if (this.name =~ NAME_SCHEMA) {
            return
        }
        throw new IllegalArgumentException("Name must match the Nanopore Fast5Skip directory schema!")
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
package life.qbic.datamodel.datasets.datastructure.folders.nanopore

import life.qbic.datamodel.datasets.datastructure.folders.DataFolder

/**
 * A special case of a DataFolder that represents the pod5_fail directory
 * of an Oxford Nanopore measurement.
 *
 * Its children are passed as an untyped list; according to the original
 * description they are either pod5 files or pod5 folders.
 *
 * The folder name is validated on construction with Groovy's find operator,
 * so every name that contains "pod5_fail" is accepted.
 */
class Pod5FailFolder extends DataFolder {

    /** Regular expression the folder name has to contain. */
    final private static String NAME_SCHEMA = /pod5_fail/

    protected Pod5FailFolder() {}

    /**
     * Creates the folder and validates its name.
     *
     * @param name The folder name
     * @param relativePath The relative path of the folder
     * @param children A list with child elements of the folder
     * @throws IllegalArgumentException if the name does not match the directory schema
     */
    protected Pod5FailFolder(String name, String relativePath, List children) {
        super(name, relativePath, children)
        validateName()
    }

    /**
     * Creates a new instance of a Pod5FailFolder object
     *
     * @param name The folder name
     * @param relativePath The relative path of the folder
     * @param children A list with child elements of the folder
     * @return A new instance of a Pod5FailFolder object
     * @throws IllegalArgumentException if the name does not match the directory schema
     */
    static Pod5FailFolder create(String name, String relativePath, List children) {
        return new Pod5FailFolder(name, relativePath, children)
    }

    /**
     * Rejects folder names that do not satisfy the pod5_fail naming schema.
     */
    private void validateName() {
        if (this.name =~ NAME_SCHEMA) {
            return
        }
        throw new IllegalArgumentException("Name must match the Nanopore Pod5Fail directory schema!")
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
package life.qbic.datamodel.datasets.datastructure.folders.nanopore

import life.qbic.datamodel.datasets.datastructure.folders.DataFolder

/**
 * A special case of a DataFolder that represents the pod5_pass directory
 * of an Oxford Nanopore measurement.
 *
 * Its children are passed as {@code List<?>}; according to the original
 * description they are either pod5 files or pod5 folders.
 *
 * The folder name is validated on construction with Groovy's find operator,
 * so every name that contains "pod5_pass" is accepted.
 */
class Pod5PassFolder extends DataFolder {

    /** Regular expression the folder name has to contain. */
    final private static String NAME_SCHEMA = /pod5_pass/

    protected Pod5PassFolder() {}

    /**
     * Creates the folder and validates its name.
     *
     * @param name The folder name
     * @param relativePath The relative path of the folder
     * @param children A list with child elements of the folder
     * @throws IllegalArgumentException if the name does not match the directory schema
     */
    protected Pod5PassFolder(String name, String relativePath, List<?> children) {
        super(name, relativePath, children)
        validateName()
    }

    /**
     * Creates a new instance of a Pod5PassFolder object
     *
     * @param name The folder name
     * @param relativePath The relative path of the folder
     * @param children A list with child elements of the folder
     * @return A new instance of a Pod5PassFolder object
     * @throws IllegalArgumentException if the name does not match the directory schema
     */
    static Pod5PassFolder create(String name, String relativePath, List<?> children) {
        return new Pod5PassFolder(name, relativePath, children)
    }

    /**
     * Rejects folder names that do not satisfy the pod5_pass naming schema.
     */
    private void validateName() {
        if (this.name =~ NAME_SCHEMA) {
            return
        }
        throw new IllegalArgumentException("Name must match the Nanopore Pod5Pass directory schema!")
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
package life.qbic.datamodel.datasets.datastructure.folders.nanopore

import life.qbic.datamodel.datasets.datastructure.files.nanopore.Pod5File
import life.qbic.datamodel.datasets.datastructure.folders.DataFolder

/**
 * A special case of a DataFolder that represents the pod5_skip directory
 * of an Oxford Nanopore measurement.
 *
 * Its children are passed as {@code List<Pod5File>}.
 *
 * The folder name is validated on construction with Groovy's find operator,
 * so every name that contains "pod5_skip" is accepted.
 */
class Pod5SkipFolder extends DataFolder {

    /** Regular expression the folder name has to contain. */
    final private static String NAME_SCHEMA = /pod5_skip/

    protected Pod5SkipFolder() {}

    /**
     * Creates the folder and validates its name.
     *
     * @param name The folder name
     * @param relativePath The relative path of the folder
     * @param children A list with child elements of the folder
     * @throws IllegalArgumentException if the name does not match the directory schema
     */
    protected Pod5SkipFolder(String name, String relativePath, List<Pod5File> children) {
        super(name, relativePath, children)
        validateName()
    }

    /**
     * Creates a new instance of a Pod5SkipFolder object
     *
     * @param name The folder name
     * @param relativePath The relative path of the folder
     * @param children A list with child elements of the folder
     * @return A new instance of a Pod5SkipFolder object
     * @throws IllegalArgumentException if the name does not match the directory schema
     */
    static Pod5SkipFolder create(String name, String relativePath, List<Pod5File> children) {
        new Pod5SkipFolder(name, relativePath, children)
    }

    /**
     * Rejects folder names that do not satisfy the pod5_skip naming schema.
     */
    private void validateName() {
        if (this.name =~ NAME_SCHEMA) {
            return
        }
        throw new IllegalArgumentException("Name must match the Nanopore Pod5Skip directory schema!")
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
package life.qbic.datamodel.instruments


/**
 * Represents the Nanopore instrument output data structure schema that is
 * generated by employing the dorado basecaller with Pod5Files.
 *
 * The original schema is defined as a resource and is referenced here,
 * wrapped in a Groovy class, so that applications can validate the
 * instrument output structure against the schema.
 */
class OxfordNanoporeInstrumentOutputDoradoMinimal {

    /** Resource path of the JSON schema. */
    private static final String SCHEMA_PATH = "/schemas/nanopore-instrument-output_minimal_dorado.schema.json"

    /**
     * Opens the schema resource.
     *
     * @return A stream of the schema resource, as provided by Class#getResourceAsStream
     *         (which yields null when the resource cannot be found)
     */
    static InputStream getSchemaAsStream() {
        InputStream schemaStream = OxfordNanoporeInstrumentOutputDoradoMinimal.getResourceAsStream(SCHEMA_PATH)
        return schemaStream
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ package life.qbic.datamodel.instruments
*/
class OxfordNanoporeInstrumentOutputMinimal {

private static final String SCHEMA_PATH = "/schemas/nanopore-instrument-output_minimal_schema.json"
private static final String SCHEMA_PATH = "/schemas/nanopore-instrument-output_minimal.schema.json"

static InputStream getSchemaAsStream() {
return OxfordNanoporeInstrumentOutputMinimal.getResourceAsStream(SCHEMA_PATH)
Expand Down
Loading

0 comments on commit affe710

Please sign in to comment.