Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for pod5 files generated via dorado basecaller #368

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ Make sure, that you have defined the Github package Maven repository, in order f

A Nanopore NGS measurement output is delivered to us as a nested folder structure, following this model:

![Nanopore Data Structure Model](./doc/figures/Nanopore_Data_Structure_Model.svg)
![Nanopore Data Structure Model](./doc/figures/Nanopore_Data_Structure_Model.png)


#### Nanopore usage example
Expand Down
Binary file added doc/figures/Nanopore_Data_Structure_Model.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
4 changes: 0 additions & 4 deletions doc/figures/Nanopore_Data_Structure_Model.svg

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -244,6 +244,7 @@ final class OxfordNanoporeExperiment implements ExperimentFolder {
FAST5_FILE(FQDN_FILES + ".Fast5File"),
FASTQ_FILE(FQDN_FILES + ".FastQFile"),
FASTQ_ZIPPED_FILE(FQDN_FILES + ".FastQZippedFile"),
POD5_FILE(FQDN_FILES + ".Pod5File"),
FINAL_SUMMARY_LOG(FQDN_FILES + ".FinalSummaryLog"),
MUX_SCAN_DATA_LOG(FQDN_FILES + ".MuxScanDataLog"),
REPORT_MD_LOG(FQDN_FILES + ".ReportMdLog"),
Expand Down Expand Up @@ -299,10 +300,14 @@ final class OxfordNanoporeExperiment implements ExperimentFolder {
FASTQ_FOLDER(FQDN_FOLDERS + ".FastQFolder"),
FAST5_PASS_FOLDER(FQDN_FOLDERS + ".Fast5PassFolder"),
FAST5_FAIL_FOLDER(FQDN_FOLDERS + ".Fast5FailFolder"),
FAST5_SKIP_FOLDER(FQDN_FOLDERS + ".Fast5SkipFolder"),
FASTQ_PASS_FOLDER(FQDN_FOLDERS + ".FastQPassFolder"),
FASTQ_FAIL_FOLDER(FQDN_FOLDERS + ".FastQFailFolder"),
UNCLASSIFIED_FAST5_FOLDER(FQDN_FOLDERS + ".UnclassifiedFast5Folder"),
UNCLASSIFIED_FASTQ_FOLDER(FQDN_FOLDERS + ".UnclassifiedFastQFolder"),
POD5_PASS_FOLDER(FQDN_FOLDERS + ".Pod5PassFolder"),
POD5_FAIL_FOLDER(FQDN_FOLDERS + ".Pod5FailFolder"),
POD5_SKIP_FOLDER(FQDN_FOLDERS + ".Pod5SkipFolder"),
OTHER_REPORTS_FOLDER(FQDN_FOLDERS + ".OtherReportsFolder"),
BASECALLING_FOLDER(FQDN_FOLDERS + ".BasecallingFolder"),

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -57,13 +57,13 @@ final class OxfordNanoporeMeasurement {
this.pooledSamplesMeasurement = containsAtLeastOneBarcodedFolder(folders["fast5pass"])
// There can still be pooled samples in the fail folder; in the worst case all
// samples failed, so we need to check there too
if (! pooledSamplesMeasurement) {
if (!pooledSamplesMeasurement) {
this.pooledSamplesMeasurement = containsAtLeastOneBarcodedFolder(folders["fast5fail"])
}
}

private void assessBasecallingStatus() {
this.hasBasecallingData = folders["basecalling"];
this.hasBasecallingData = folders["basecalling"]
}

private static boolean containsAtLeastOneBarcodedFolder(DataFolder folder) {
Expand All @@ -84,12 +84,24 @@ final class OxfordNanoporeMeasurement {
case Fast5FailFolder:
folders["fast5fail"] = element as Fast5FailFolder
break
case Fast5SkipFolder:
folders["fast5skip"] = element as Fast5SkipFolder
break
case FastQPassFolder:
folders["fastqpass"] = element as FastQPassFolder
break
case FastQFailFolder:
folders["fastqfail"] = element as FastQFailFolder
break
case Pod5PassFolder:
folders["pod5pass"] = element as Pod5PassFolder
break
case Pod5FailFolder:
folders["pod5fail"] = element as Pod5FailFolder
break
case Pod5SkipFolder:
folders["pod5skip"] = element as Pod5SkipFolder
break
case DataFile:
logFilesCollection.add(element as DataFile)
break
Expand All @@ -101,24 +113,38 @@ final class OxfordNanoporeMeasurement {
}

private void assessState() throws IllegalStateException {
// Condition one: Don't allow Fast5 pass and fail folder are empty
assessFast5Content()
// Condition two: Don't allow Fastq pass and fail folder are empty
assessFastQContent()
boolean isValid = false
// We need to ensure that fastq and fast5 information is provided if guppy basecaller was used
if (areFast5FoldersInMeasurement() && areFastQFoldersInMeasurement()) {
isValid = true
}
// If the dorado basecaller was used, a non-empty fast5_skip or pod5_skip folder is sufficient
if (arePod5FoldersInMeasurement()) {
isValid = true
}
if (isValid == false) {
throw new IllegalStateException("No valid data is contained in measurement")
}
}

private void assessFast5Content() throws IllegalStateException {
if (folders["fast5pass"].getChildren().isEmpty() && folders["fast5fail"].getChildren()
.isEmpty()) {
throw new IllegalStateException("The fast5 pass folder and fail folder are empty.")
}
// Condition one: Don't allow empty Fast5 pass and fail folder
private boolean areFast5FoldersInMeasurement() {
    // A single non-empty fast5 folder (pass or fail) is enough.
    if (isDataFolderInMeasurement("fast5pass")) {
        return true
    }
    return isDataFolderInMeasurement("fast5fail")
}
// Condition two: Don't allow empty Fastq pass and fail folder
private boolean areFastQFoldersInMeasurement() {
    // A single non-empty fastq folder (pass or fail) is enough.
    if (isDataFolderInMeasurement("fastqpass")) {
        return true
    }
    return isDataFolderInMeasurement("fastqfail")
}
// Condition three: Don't allow empty Pod5 skip and fast5 skip folder
Steffengreiner marked this conversation as resolved.
Show resolved Hide resolved
private boolean arePod5FoldersInMeasurement() {
    // Either skip folder (fast5_skip or pod5_skip) being non-empty is enough.
    if (isDataFolderInMeasurement("fast5skip")) {
        return true
    }
    return isDataFolderInMeasurement("pod5skip")
}

private void assessFastQContent() throws IllegalStateException {
if (folders["fastqpass"].getChildren().isEmpty() && folders["fastqfail"].getChildren()
.isEmpty()) {
throw new IllegalStateException("The fastq pass folder and fail folder are empty.")
private boolean isDataFolderInMeasurement(String string) {
    // A folder counts as present only if it is registered under the given key
    // and has at least one child element.
    final def registeredFolder = folders[string]
    return registeredFolder != null && !registeredFolder.getChildren().isEmpty()
}

/**
Expand Down Expand Up @@ -284,12 +310,25 @@ final class OxfordNanoporeMeasurement {
private Map<String, Map<String, DataFolder>> prepareRawData(String sampleId) {
final def result = new HashMap()
final def dataFolders = [
"fast5fail": (folders.get("fast5fail") as DataFolder),
"fast5pass": (folders.get("fast5pass") as DataFolder),
"fastqpass": (folders.get("fastqpass") as DataFolder),
"fastqfail": (folders.get("fastqfail") as DataFolder)
"fast5fail" : (folders.get("fast5fail") as DataFolder),
"fast5pass" : (folders.get("fast5pass") as DataFolder),
"fastqpass" : (folders.get("fastqpass") as DataFolder),
"fastqfail" : (folders.get("fastqfail") as DataFolder)
]
if(hasBasecallingData) dataFolders.put("basecalling", (folders.get("basecalling") as DataFolder))
if (hasBasecallingData) dataFolders.put("basecalling", (folders.get("basecalling") as DataFolder))
// Only add the data folders of the minimal dorado-based structure if they are present
if (folders.get("fast5skip") != null) {
dataFolders.put("fast5skip", (folders.get("fast5skip") as DataFolder))
}
if (folders.get("pod5skip") != null) {
dataFolders.put("pod5skip", (folders.get("pod5skip") as DataFolder))
}
if (folders.get("pod5fail") != null) {
dataFolders.put("pod5fail", (folders.get("pod5fail") as DataFolder))
}
if (folders.get("pod5pass") != null) {
dataFolders.put("pod5pass", (folders.get("pod5pass") as DataFolder))
}
result.put(sampleId, dataFolders)
return result
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
package life.qbic.datamodel.datasets.datastructure.files.nanopore

import life.qbic.datamodel.datasets.datastructure.files.DataFile

/**
 * A specialisation of a DataFile that represents an Oxford Nanopore pod5 file.
 *
 * The file name is validated on construction and has to end with the
 * <code>.pod5</code> suffix.
 */
class Pod5File extends DataFile {

    final private static String FILE_TYPE = "pod5"

    final private static String NAME_SCHEMA = /.*\.pod5$/

    /**
     * Creates the file representation and validates its name.
     *
     * @param name The file name, must end with <code>.pod5</code>
     * @param relativePath The relative path, passed on to DataFile together with the pod5 file type
     * @throws IllegalArgumentException if the name does not match the pod5 name schema
     */
    protected Pod5File(String name, String relativePath) {
        super(name, relativePath, FILE_TYPE)
        validateName()
    }

    /**
     * Creates a new instance of a Pod5File object
     *
     * @param name The file name, must end with <code>.pod5</code>
     * @param relativePath The relative path of the file
     * @return A new instance of a Pod5File object
     * @throws IllegalArgumentException if the name does not match the pod5 name schema
     */
    static Pod5File create(String name, String relativePath) {
        return new Pod5File(name, relativePath)
    }

    /**
     * Rejects names that do not match NAME_SCHEMA.
     */
    private void validateName() {
        if (!(this.name =~ NAME_SCHEMA)) {
            // The message previously referred to the "summary" schema, a copy-paste
            // leftover that pointed users to the wrong file type.
            throw new IllegalArgumentException("Name must match the Nanopore pod5 schema!")
        }
    }

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
package life.qbic.datamodel.datasets.datastructure.folders.nanopore

import life.qbic.datamodel.datasets.datastructure.files.nanopore.Fast5File
import life.qbic.datamodel.datasets.datastructure.folders.DataFolder

/**
 * A special case of a DataFolder for the fast5_skip directory.
 *
 * The folder name is validated on construction and has to contain
 * <code>fast5_skip</code>. Its children are given as a List&lt;Fast5File&gt;.
 */
class Fast5SkipFolder extends DataFolder {

    private static final String NAME_SCHEMA = /fast5_skip/

    protected Fast5SkipFolder() {}

    /**
     * Creates the folder representation and validates its name.
     *
     * @param name The folder name
     * @param relativePath The relative path of the folder
     * @param children A list with child elements of the folder
     * @throws IllegalArgumentException if the name does not match the folder name schema
     */
    protected Fast5SkipFolder(String name, String relativePath, List<Fast5File> children) {
        super(name, relativePath, children)
        validateName()
    }

    /**
     * Creates a new instance of a Fast5SkipFolder object
     *
     * @param name The folder name
     * @param relativePath The relative path of the folder
     * @param children A list with child elements of the folder
     * @return A new instance of a Fast5SkipFolder object
     * @throws IllegalArgumentException if the name does not match the folder name schema
     */
    static Fast5SkipFolder create(String name, String relativePath, List<Fast5File> children) {
        new Fast5SkipFolder(name, relativePath, children)
    }

    /**
     * Accepts the name if NAME_SCHEMA is found in it, rejects it otherwise.
     */
    private void validateName() {
        if (this.name =~ NAME_SCHEMA) {
            return
        }
        throw new IllegalArgumentException("Name must match the Nanopore Fast5Skip directory schema!")
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
package life.qbic.datamodel.datasets.datastructure.folders.nanopore

import life.qbic.datamodel.datasets.datastructure.folders.DataFolder

/**
 * A special case of a DataFolder for the pod5_fail directory.
 *
 * The folder name is validated on construction and has to contain
 * <code>pod5_fail</code>. The children list is untyped and handed to
 * DataFolder as given.
 */
class Pod5FailFolder extends DataFolder {

    private static final String NAME_SCHEMA = /pod5_fail/

    protected Pod5FailFolder() {}

    /**
     * Creates the folder representation and validates its name.
     *
     * @param name The folder name
     * @param relativePath The relative path of the folder
     * @param children A list with child elements of the folder
     * @throws IllegalArgumentException if the name does not match the folder name schema
     */
    protected Pod5FailFolder(String name, String relativePath, List children) {
        super(name, relativePath, children)
        validateName()
    }

    /**
     * Creates a new instance of a Pod5FailFolder object
     *
     * @param name The folder name
     * @param relativePath The relative path of the folder
     * @param children A list with child elements of the folder
     * @return A new instance of a Pod5FailFolder object
     * @throws IllegalArgumentException if the name does not match the folder name schema
     */
    static Pod5FailFolder create(String name, String relativePath, List children) {
        return new Pod5FailFolder(name, relativePath, children)
    }

    /**
     * Accepts the name if NAME_SCHEMA is found in it, rejects it otherwise.
     */
    private void validateName() {
        if (this.name =~ NAME_SCHEMA) {
            return
        }
        throw new IllegalArgumentException("Name must match the Nanopore Pod5Fail directory schema!")
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
package life.qbic.datamodel.datasets.datastructure.folders.nanopore

import life.qbic.datamodel.datasets.datastructure.folders.DataFolder

/**
 * A special case of a DataFolder for the pod5_pass directory.
 *
 * The folder name is validated on construction and has to contain
 * <code>pod5_pass</code>. The children list accepts any element type and is
 * handed to DataFolder as given.
 */
class Pod5PassFolder extends DataFolder {

    private static final String NAME_SCHEMA = /pod5_pass/

    protected Pod5PassFolder() {}

    /**
     * Creates the folder representation and validates its name.
     *
     * @param name The folder name
     * @param relativePath The relative path of the folder
     * @param children A list with child elements of the folder
     * @throws IllegalArgumentException if the name does not match the folder name schema
     */
    protected Pod5PassFolder(String name, String relativePath, List<?> children) {
        super(name, relativePath, children)
        validateName()
    }

    /**
     * Creates a new instance of a Pod5PassFolder object
     *
     * @param name The folder name
     * @param relativePath The relative path of the folder
     * @param children A list with child elements of the folder
     * @return A new instance of a Pod5PassFolder object
     * @throws IllegalArgumentException if the name does not match the folder name schema
     */
    static Pod5PassFolder create(String name, String relativePath, List<?> children) {
        return new Pod5PassFolder(name, relativePath, children)
    }

    /**
     * Accepts the name if NAME_SCHEMA is found in it, rejects it otherwise.
     */
    private void validateName() {
        if (this.name =~ NAME_SCHEMA) {
            return
        }
        throw new IllegalArgumentException("Name must match the Nanopore Pod5Pass directory schema!")
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
package life.qbic.datamodel.datasets.datastructure.folders.nanopore

import life.qbic.datamodel.datasets.datastructure.files.nanopore.Pod5File
import life.qbic.datamodel.datasets.datastructure.folders.DataFolder

/**
 * A special case of a DataFolder for the pod5_skip directory.
 *
 * The folder name is validated on construction and has to contain
 * <code>pod5_skip</code>. Its children are given as a List&lt;Pod5File&gt;.
 */
class Pod5SkipFolder extends DataFolder {

    private static final String NAME_SCHEMA = /pod5_skip/

    protected Pod5SkipFolder() {}

    /**
     * Creates the folder representation and validates its name.
     *
     * @param name The folder name
     * @param relativePath The relative path of the folder
     * @param children A list with child elements of the folder
     * @throws IllegalArgumentException if the name does not match the folder name schema
     */
    protected Pod5SkipFolder(String name, String relativePath, List<Pod5File> children) {
        super(name, relativePath, children)
        validateName()
    }

    /**
     * Creates a new instance of a Pod5SkipFolder object
     *
     * @param name The folder name
     * @param relativePath The relative path of the folder
     * @param children A list with child elements of the folder
     * @return A new instance of a Pod5SkipFolder object
     * @throws IllegalArgumentException if the name does not match the folder name schema
     */
    static Pod5SkipFolder create(String name, String relativePath, List<Pod5File> children) {
        new Pod5SkipFolder(name, relativePath, children)
    }

    /**
     * Accepts the name if NAME_SCHEMA is found in it, rejects it otherwise.
     */
    private void validateName() {
        if (this.name =~ NAME_SCHEMA) {
            return
        }
        throw new IllegalArgumentException("Name must match the Nanopore Pod5Skip directory schema!")
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
package life.qbic.datamodel.instruments


/**
 * Represents the Nanopore instrument output data structure schema generated by
 * employing the dorado basecaller with Pod5Files.
 *
 * The original schema is defined as a resource and is referenced here,
 * wrapped in a Groovy class, for applications that want to validate the
 * instrument output structure against the schema.
 */
class OxfordNanoporeInstrumentOutputDoradoMinimal {

    private static final String SCHEMA_PATH = "/schemas/nanopore-instrument-output_minimal_dorado.schema.json"

    /**
     * Opens the schema resource located at SCHEMA_PATH.
     *
     * @return The stream obtained from Class#getResourceAsStream for the schema path;
     * that method returns null when no resource with this name is found
     */
    static InputStream getSchemaAsStream() {
        final InputStream schemaStream =
                OxfordNanoporeInstrumentOutputDoradoMinimal.class.getResourceAsStream(SCHEMA_PATH)
        return schemaStream
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ package life.qbic.datamodel.instruments
*/
class OxfordNanoporeInstrumentOutputMinimal {

private static final String SCHEMA_PATH = "/schemas/nanopore-instrument-output_minimal_schema.json"
private static final String SCHEMA_PATH = "/schemas/nanopore-instrument-output_minimal.schema.json"

static InputStream getSchemaAsStream() {
return OxfordNanoporeInstrumentOutputMinimal.getResourceAsStream(SCHEMA_PATH)
Expand Down
Loading
Loading