Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/make gff for new species #4

Open
wants to merge 22 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
e28cd05
Added bufferedWriter utility
tcasstevens Jun 7, 2022
b3b6591
Importing Brandon/Zach's code from Jupyter to adapt the writeBedFile
Gingerroot775 Jun 7, 2022
d7adca7
Initial pass of adapting it to make GFF
Gingerroot775 Jun 7, 2022
7985c56
Moved MakeGffForNewSpecies to gff package
tcasstevens Jun 8, 2022
e9bbd0b
Committing day 2 of hackathon work
Gingerroot775 Jun 8, 2022
5c5b949
Merge branch 'feature/makeGFFForNewSpecies' of https://bitbucket.org/…
Gingerroot775 Jun 8, 2022
ee35e44
Adding initial GFF Feature data structures
tcasstevens Jun 9, 2022
c11c45f
Adding initial GFF Feature data structures
tcasstevens Jun 9, 2022
40f4c57
Adding initial GFF Feature data structures
tcasstevens Jun 9, 2022
4657027
Adding initial GFF Feature data structures
tcasstevens Jun 10, 2022
7c6d15f
First draft of parser
Gingerroot775 Jun 10, 2022
039da2d
Added convenience method for converting from string to feature type
Gingerroot775 Jun 10, 2022
0454d78
Adding initial GFF Feature data structures
tcasstevens Jun 10, 2022
4e7c148
Merge branch 'feature/makeGFFForNewSpecies' of https://bitbucket.org/…
Gingerroot775 Jun 10, 2022
ee51523
Added toString and lazyEquals methods
Gingerroot775 Jun 10, 2022
3038651
Added GFF parser, tested to see if it works with scrambled orders (it
Gingerroot775 Jun 10, 2022
748a387
Corrected bug in Feature toString
Gingerroot775 Jun 10, 2022
da41058
Made parser sort for seqid instead of just start
Gingerroot775 Jun 10, 2022
6658fe4
Merged develop into feature/makeGFFForNewSpecies
tcasstevens Jun 15, 2022
f7b95e9
Merged develop into feature/makeGFFForNewSpecies
tcasstevens Jun 16, 2022
d6b35fd
Committing prior to branching to reduce the number of classes in the
Gingerroot775 Jun 17, 2022
208c49f
Merged master into feature/makeGFFForNewSpecies
tcasstevens Sep 22, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions src/main/kotlin/biokotlin/gff/Chromosome.kt
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
package biokotlin.gff

/**
 * A [Feature] whose [type] is [FeatureType.Chromosome].
 *
 * Only the coordinates, [attributes] and [children] can be supplied; score, strand and phase are left at the
 * defaults declared by [Feature].
 */
class Chromosome(
    seqid: String,
    source: String,
    start: Int,
    end: Int,
    attributes: Map<String, String> = emptyMap(),
    children: List<Feature> = emptyList(),
) : Feature(
    seqid = seqid,
    source = source,
    start = start,
    end = end,
    attributes = attributes,
    children = children,
) {

    /** Identifies this feature as a chromosome. */
    override fun type(): FeatureType {
        return FeatureType.Chromosome
    }
}
20 changes: 20 additions & 0 deletions src/main/kotlin/biokotlin/gff/Coding.kt
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
package biokotlin.gff

/**
 * A coding-sequence feature, also known as CDS.
 *
 * All constructor arguments are forwarded unchanged to [Feature].
 */
class Coding(
    seqid: String,
    source: String,
    start: Int,
    end: Int,
    score: Double = Double.NaN,
    strand: String = "+",
    phase: String = ".",
    attributes: Map<String, String> = emptyMap(),
    children: List<Feature> = emptyList(),
) : Feature(
    seqid = seqid,
    source = source,
    start = start,
    end = end,
    score = score,
    strand = strand,
    phase = phase,
    attributes = attributes,
    children = children,
) {

    /** Identifies this feature as a coding sequence. */
    override fun type(): FeatureType {
        return FeatureType.Coding
    }
}
29 changes: 29 additions & 0 deletions src/main/kotlin/biokotlin/gff/Exon.kt
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
package biokotlin.gff

/**
 * An exon feature. All constructor arguments are forwarded unchanged to [Feature].
 *
 * The accessors below select direct children by their concrete class; each call returns a freshly built list.
 */
class Exon(
    seqid: String,
    source: String,
    start: Int,
    end: Int,
    score: Double = Double.NaN,
    strand: String = "+",
    phase: String = ".",
    attributes: Map<String, String> = emptyMap(),
    children: List<Feature> = emptyList()
) : Feature(seqid, source, start, end, score, strand, phase, attributes, children) {

    /** Identifies this feature as an exon. */
    override fun type(): FeatureType = FeatureType.Exon

    // filterIsInstance already allocates a new List, so no further copy is needed.

    /** @return the direct children of this exon that are [Coding] features, in the order held by [children]. */
    fun coding(): List<Coding> = children.filterIsInstance<Coding>()

    /** @return the direct children of this exon that are [Leader] features, in the order held by [children]. */
    fun leaders(): List<Leader> = children.filterIsInstance<Leader>()

    /** @return the direct children of this exon that are [Terminator] features, in the order held by [children]. */
    fun terminators(): List<Terminator> = children.filterIsInstance<Terminator>()

}
158 changes: 158 additions & 0 deletions src/main/kotlin/biokotlin/gff/Feature.kt
Original file line number Diff line number Diff line change
@@ -0,0 +1,158 @@
package biokotlin.gff

/**
 * The kinds of feature this package can represent.
 */
enum class FeatureType {
    Gene, Exon, Leader, Terminator, Coding, mRNA, Intron, Chromosome, Scaffold;

    companion object {
        /**
         * Converts from standard names in GFF files to a [FeatureType]. Case-insensitive.
         *
         * Recognised names are `gene`, `exon`, `five_prime_utr`, `three_prime_utr`, `cds`, `mrna`, `intron`,
         * `chromosome` and `scaffold`.
         *
         * @param gffString the value of the type column of a GFF row.
         * @return the matching [FeatureType].
         * @throws IllegalArgumentException if [gffString] is not one of the recognised names.
         */
        fun fromGffString(gffString: String): FeatureType {
            return when (gffString.lowercase()) {
                "gene" -> Gene
                "exon" -> Exon
                "five_prime_utr" -> Leader
                "three_prime_utr" -> Terminator
                "cds" -> Coding
                "mrna" -> mRNA
                "intron" -> Intron
                "chromosome" -> Chromosome
                "scaffold" -> Scaffold
                // A bad type name is an argument error; IllegalArgumentException is still an Exception,
                // so callers that catch the broader type keep working.
                else -> throw IllegalArgumentException("Feature $gffString is not supported")
            }
        }
    }
}

/**
 * A single genomic feature, i.e. one row of a GFF file, together with the features nested beneath it.
 *
 * @param seqid The ID of the landmark used to establish the coordinate system for the current feature. IDs may contain
 * any characters, but must escape any characters not in the set `[a-zA-Z0-9.:^*$@!+_?-|]`. In particular, IDs may not
 * contain unescaped whitespace and must not begin with an unescaped ">".
 * @param source The source is a free text qualifier intended to describe the algorithm or operating procedure that
 * generated this feature. Typically this is the name of a piece of software, such as "Genescan" or a database name,
 * such as "Genbank." In effect, the source is used to extend the feature ontology by adding a qualifier to the type
 * creating a new composite type that is a subclass of the type in the type column.
 * @param start The start and end coordinates of the feature are given in positive 1-based integer coordinates, relative
 * to the landmark given in column one. Start is always less than or equal to end. For features that cross the origin
 * of a circular feature (e.g. most bacterial genomes, plasmids, and some viral genomes), the requirement for start to
 * be less than or equal to end is satisfied by making end = the position of the end + the length of the landmark feature.
 * @param end The start and end coordinates of the feature are given in positive 1-based integer coordinates, relative
 * to the landmark given in column one. Start is always less than or equal to end. For features that cross the origin
 * of a circular feature (e.g. most bacterial genomes, plasmids, and some viral genomes), the requirement for start to
 * be less than or equal to end is satisfied by making end = the position of the end + the length of the landmark feature.
 * @param score The score of the feature, a floating point number. As in earlier versions of the format, the semantics
 * of the score are ill-defined. It is strongly recommended that E-values be used for sequence similarity features,
 * and that P-values be used for ab initio gene prediction features. [Double.NaN] stands for "no score" and is
 * written as "." by [toString].
 * @param strand The strand of the feature. + for positive strand (relative to the landmark), - for minus strand,
 * and . for features that are not stranded. In addition, ? can be used for features whose strandedness is relevant,
 * but unknown.
 * @param phase For features of type "CDS", the phase indicates where the next codon begins relative to the 5' end
 * (where the 5' end of the CDS is relative to the strand of the CDS feature) of the current CDS feature. For
 * clarification the 5' end for CDS features on the plus strand is the feature's start and the 5' end for CDS
 * features on the minus strand is the feature's end. The phase is one of the integers 0, 1, or 2, indicating the
 * number of bases forward from the start of the current CDS feature the next codon begins. A phase of "0" indicates
 * that a codon begins on the first nucleotide of the CDS feature (i.e. 0 bases forward), a phase of "1" indicates
 * that the codon begins at the second nucleotide of this CDS feature and a phase of "2" indicates that the codon
 * begins at the third nucleotide of this region. Note that 'Phase' in the context of a GFF3 CDS feature should not
 * be confused with the similar concept of frame that is also a common concept in bioinformatics. Frame is generally
 * calculated as a value for a given base relative to the start of the complete open reading frame (ORF) or the
 * codon (e.g. modulo 3) while CDS phase describes the start of the next codon relative to a given CDS feature.
 * @param attributes A list of feature attributes in the format tag=value. Multiple tag=value pairs are separated
 * by semicolons. URL escaping rules are used for tags or values containing the following characters: ",=;". Spaces
 * are allowed in this field, but tabs must be replaced with the %09 URL escape. Attribute values do not need to be
 * and should not be quoted. The quotes should be included as part of the value by parsers and not stripped.
 * The supplied map is copied on construction.
 * @param children The features nested directly under this one. The supplied list is not kept as given: a copy
 * sorted with [FeatureComparator] is stored instead.
 * @see FeatureBuilder
 */
abstract class Feature(
    val seqid: String,
    val source: String,
    val start: Int,
    val end: Int,
    val score: Double = Double.NaN,
    val strand: String = "+",
    val phase: String = ".",
    attributes: Map<String, String> = emptyMap(),
    children: List<Feature> = emptyList()
) {

    /** Tag/value attributes of this feature; initialised to a copy of the map passed to the constructor. */
    var attributes: Map<String, String> = attributes.toMap()

    /** Direct children of this feature; initialised to the constructor argument sorted with [FeatureComparator]. */
    var children: List<Feature> = children.sortedWith(FeatureComparator())

    /** The kind of feature the concrete subclass represents. */
    abstract fun type(): FeatureType

    /** @return the value stored under [key] in [attributes], or null if there is none. */
    fun attribute(key: String): String? = attributes[key]

    //TODO make this an actual pointer and handle multiple parents
    /** Not implemented yet; calling it throws [NotImplementedError]. */
    fun parent(): Feature = TODO()

    /** @return the "ID" attribute, or null if absent. */
    fun id(): String? = attributes["ID"]

    /** @return the "Name" attribute, or null if absent. */
    fun name(): String? = attributes["Name"]

    /** @return the "Alias" attribute, or null if absent. */
    fun alias(): String? = attributes["Alias"]

    /** @return the "Target" attribute, or null if absent. */
    fun target(): String? = attributes["Target"]

    /** @return the "Gap" attribute, or null if absent. */
    fun gap(): String? = attributes["Gap"]

    /** @return the "Derives_from" attribute, or null if absent. */
    fun derivesFrom(): String? = attributes["Derives_from"]

    /** @return the "Note" attribute, or null if absent. */
    fun note(): String? = attributes["Note"]

    /** @return the "Dbxref" attribute, or null if absent. */
    fun dbxref(): String? = attributes["Dbxref"]

    /** @return the "Ontology_term" attribute, or null if absent. */
    fun ontologyTerm(): String? = attributes["Ontology_term"]

    /** @return the raw "Is_circular" attribute string, or null if absent. */
    fun isCircular(): String? = attributes["Is_circular"]

    /**
     * Compares this to [other] alphabetically by seqid, then by start, then by end position.
     * Returns zero if this and [other] are equal in ordering, a negative number if this is less
     * than [other], or a positive number if this is greater than [other].
     */
    fun compareTo(other: Feature): Int =
        compareValuesBy(this, other, { it.seqid }, { it.start }, { it.end })

    /**
     * Returns the feature as a string representing a row in a GFF file, terminated by a newline.
     *
     * The nine tab-separated columns are seqid, source, type, start, end, score, strand, phase and attributes.
     * The type column uses the GFF name of [type] (for example "CDS" rather than "Coding") so that the row can be
     * read back with [FeatureType.fromGffString]. A NaN score and an empty attribute map are both written as ".".
     * Attributes are written as `tag=value` pairs separated by semicolons, without escaping.
     */
    override fun toString(): String {
        val scoreString = if (score.isNaN()) "." else score.toString()

        val attributesString = if (attributes.isEmpty()) {
            "."
        } else {
            attributes.entries.joinToString(separator = ";") { (tag, value) -> "$tag=$value" }
        }

        return "$seqid\t$source\t${gffTypeName()}\t$start\t$end\t$scoreString\t$strand\t$phase\t$attributesString\n"
    }

    /**
     * Maps [type] to the name written in the type column of a GFF row. Every name returned here is accepted by
     * [FeatureType.fromGffString]. The `when` is deliberately exhaustive without an `else` so that a newly added
     * [FeatureType] fails to compile until it is given a name.
     */
    private fun gffTypeName(): String = when (type()) {
        FeatureType.Gene -> "gene"
        FeatureType.Exon -> "exon"
        FeatureType.Leader -> "five_prime_UTR"
        FeatureType.Terminator -> "three_prime_UTR"
        FeatureType.Coding -> "CDS"
        FeatureType.mRNA -> "mRNA"
        FeatureType.Intron -> "intron"
        FeatureType.Chromosome -> "chromosome"
        FeatureType.Scaffold -> "scaffold"
    }
}

/**
 * A [Comparator] that orders features by delegating to [Feature.compareTo].
 */
class FeatureComparator : Comparator<Feature> {
    /**
     * Compares two features with [p0].compareTo([p1]).
     *
     * @return the result of that comparison when both arguments are non-null; 0 when either argument is null.
     */
    override fun compare(p0: Feature?, p1: Feature?): Int =
        if (p0 != null && p1 != null) p0.compareTo(p1) else 0
}
50 changes: 50 additions & 0 deletions src/main/kotlin/biokotlin/gff/FeatureBuilder.kt
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
package biokotlin.gff

//TODO add pointer to parent(s)
/**
 * Mutable counterpart of [Feature]: collects the column values of a feature plus any child builders, and turns the
 * whole tree into [Feature] instances on [build].
 *
 * NOTE(review): [score] defaults to 0.0 here while [Feature] defaults to [Double.NaN] — confirm this is intended.
 * @see Feature
 */
class FeatureBuilder(
    val seqid: String,
    val source: String,
    val type: FeatureType,
    val start: Int,
    val end: Int,
    val score: Double = 0.0,
    val strand: String = "+",
    val phase: String = ".",
    var attributes: Map<String, String> = emptyMap(),
) {

    /** Builders for the features nested directly under this one, in insertion order. */
    val children = mutableListOf<FeatureBuilder>()

    /** @return the "ID" attribute, or null if absent. */
    fun id() = attributes["ID"]

    /** Appends [child] to [children]. */
    fun add(child: FeatureBuilder) {
        children += child
    }

    /**
     * Builds this feature and, recursively, every child builder.
     *
     * @return the [Feature] subclass matching [type], carrying this builder's values. Its children are the built
     * forms of [children]; [Feature] stores them sorted by [FeatureComparator].
     */
    fun build(): Feature {
        val builtChildren = children.map { childBuilder -> childBuilder.build() }

        return when (type) {
            FeatureType.Gene ->
                Gene(seqid, source, start, end, score, strand, phase, attributes, builtChildren)
            FeatureType.Exon ->
                Exon(seqid, source, start, end, score, strand, phase, attributes, builtChildren)
            FeatureType.Leader ->
                Leader(seqid, source, start, end, score, strand, phase, attributes, builtChildren)
            FeatureType.Terminator ->
                Terminator(seqid, source, start, end, score, strand, phase, attributes, builtChildren)
            FeatureType.Coding ->
                Coding(seqid, source, start, end, score, strand, phase, attributes, builtChildren)
            FeatureType.mRNA ->
                MRNA(seqid, source, start, end, score, strand, phase, attributes, builtChildren)
            // NOTE(review): only the coordinates are forwarded for introns; score, strand, phase, attributes and
            // children are not passed on — confirm against Intron's constructor.
            FeatureType.Intron ->
                Intron(seqid, source, start, end)
            // Chromosome and Scaffold are constructed without score, strand or phase.
            FeatureType.Chromosome ->
                Chromosome(seqid, source, start, end, attributes, builtChildren)
            FeatureType.Scaffold ->
                Scaffold(seqid, source, start, end, attributes, builtChildren)
        }
    }

}
22 changes: 22 additions & 0 deletions src/main/kotlin/biokotlin/gff/Gene.kt
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
package biokotlin.gff


/**
 * A gene feature. All constructor arguments are forwarded unchanged to [Feature].
 */
class Gene(
    seqid: String,
    source: String,
    start: Int,
    end: Int,
    score: Double = Double.NaN,
    strand: String = "+",
    phase: String = ".",
    attributes: Map<String, String> = emptyMap(),
    children: List<Feature> = emptyList()
) : Feature(seqid, source, start, end, score, strand, phase, attributes, children) {

    /** Identifies this feature as a gene. */
    override fun type(): FeatureType = FeatureType.Gene

    /**
     * @return the direct children of this gene that are [MRNA] features, in the order held by [children].
     * Each call returns a freshly built list.
     */
    // filterIsInstance already allocates a new List, so no further copy is needed.
    fun mRNAs(): List<MRNA> = children.filterIsInstance<MRNA>()

}
Loading