uniref (#107)
Adds uniref as a parser. UniRef uses a different format from UniProt, so it needs its own file format parser.
Koeng101 authored Dec 14, 2024
1 parent 486fdda commit eb32010
Showing 8 changed files with 520 additions and 11 deletions.
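
For context on the change described above, here is a minimal end-to-end sketch of how the new parser might be used. The local file name (uniref50.xml.gz), the gzip wrapping, and the error handling are illustrative assumptions; only bio.NewUnirefParser and the Next/io.EOF loop come from this commit.

package main

import (
    "compress/gzip"
    "errors"
    "fmt"
    "io"
    "log"
    "os"

    "github.com/koeng101/dnadesign/lib/bio"
)

func main() {
    // Hypothetical local UniRef50 dump downloaded from uniprot.org.
    file, err := os.Open("uniref50.xml.gz")
    if err != nil {
        log.Fatal(err)
    }
    defer file.Close()

    // UniRef dumps are distributed gzipped, so wrap the file reader.
    gz, err := gzip.NewReader(file)
    if err != nil {
        log.Fatal(err)
    }
    defer gz.Close()

    // Stream entries until the parser signals io.EOF.
    parser, err := bio.NewUnirefParser(gz)
    if err != nil {
        log.Fatal(err)
    }
    for {
        entry, err := parser.Next()
        if errors.Is(err, io.EOF) {
            break
        }
        if err != nil {
            log.Fatal(err)
        }
        fmt.Println(entry.ID, entry.Name)
    }
}
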
2 changes: 2 additions & 0 deletions README.md
@@ -76,6 +76,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [Unreleased]
- Adds uniref parser [#107](https://github.com/Koeng101/dnadesign/pull/107)
- Fixes iso-8859-1 error in reading uniref data dumps [#106](https://github.com/Koeng101/dnadesign/pull/106)
- Updates uniprot parser to read IDs [#104](https://github.com/Koeng101/dnadesign/pull/104)
- Fixes RecursiveFragment to not add flanks to the initial input [#102](https://github.com/Koeng101/dnadesign/pull/102)
- Fixes add flank bug, releases new version of python lib [#101](https://github.com/Koeng101/dnadesign/pull/101)
12 changes: 10 additions & 2 deletions lib/bio/bio.go
@@ -24,6 +24,7 @@ import (
"github.com/koeng101/dnadesign/lib/bio/sam"
"github.com/koeng101/dnadesign/lib/bio/slow5"
"github.com/koeng101/dnadesign/lib/bio/uniprot"
"github.com/koeng101/dnadesign/lib/bio/uniref"
)

// Format is an enum of different parser formats.
@@ -63,12 +64,12 @@ Lower level interfaces

// DataTypes defines the possible data types returned by every parser.
type DataTypes interface {
genbank.Genbank | fasta.Record | fastq.Read | slow5.Read | sam.Alignment | pileup.Line | uniprot.Entry
genbank.Genbank | fasta.Record | fastq.Read | slow5.Read | sam.Alignment | pileup.Line | uniprot.Entry | uniref.Entry
}

// HeaderTypes defines the possible header types returned by every parser.
type HeaderTypes interface {
genbank.Header | fasta.Header | fastq.Header | slow5.Header | sam.Header | pileup.Header | uniprot.Header
genbank.Header | fasta.Header | fastq.Header | slow5.Header | sam.Header | pileup.Header | uniprot.Header | uniref.Header
}

// ParserInterface is a generic interface that all parsers must support. It is
@@ -171,6 +172,13 @@ func NewUniprotParser(r io.Reader) *Parser[uniprot.Entry, uniprot.Header] {
return &Parser[uniprot.Entry, uniprot.Header]{ParserInterface: uniprot.NewParser(r)}
}

// NewUnirefParser initiates a new Uniref parser from an io.Reader. No
// maxLineLength is necessary.
func NewUnirefParser(r io.Reader) (*Parser[uniref.Entry, uniref.Header], error) {
parser, err := uniref.NewParser(r)
return &Parser[uniref.Entry, uniref.Header]{ParserInterface: parser}, err
}

/******************************************************************************
Parser higher-level functions
39 changes: 39 additions & 0 deletions lib/bio/example_test.go
@@ -389,6 +389,45 @@ func ExampleNewUniprotParser() {
// Output: P0C9F0
}

func ExampleNewUnirefParser() {
// The following is the first entry of UniRef50 with the sequence truncated.
// We put the XML text into an io.Reader to mock a file. You can edit the
// text here to see how the parser works.
//
// Note: Unlike the uniprot parser, the uniref parser expects that the file is
// properly terminated with </UniRef50>.
unirefEntryText := strings.NewReader(`<?xml version="1.0" encoding="ISO-8859-1" ?>
<UniRef50 xmlns="http://uniprot.org/uniref"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://uniprot.org/uniref http://www.uniprot.org/support/docs/uniref.xsd"
releaseDate="2024-11-27" version="2024_06">
<entry id="UniRef50_UPI002E2621C6" updated="2024-05-29">
<name>Cluster: uncharacterized protein LOC134193701</name>
<property type="member count" value="1"/>
<property type="common taxon" value="Corticium candelabrum"/>
<property type="common taxon ID" value="121492"/>
<representativeMember>
<dbReference type="UniParc ID" id="UPI002E2621C6">
<property type="UniRef100 ID" value="UniRef100_UPI002E2621C6"/>
<property type="UniRef90 ID" value="UniRef90_UPI002E2621C6"/>
<property type="protein name" value="uncharacterized protein LOC134193701"/>
<property type="source organism" value="Corticium candelabrum"/>
<property type="NCBI taxonomy" value="121492"/>
<property type="length" value="49499"/>
<property type="isSeed" value="true"/>
</dbReference>
<sequence length="49499" checksum="428270C7C0D6A56C">MGR</sequence>
</representativeMember>
</entry>
</UniRef50>`)
// Now we load the parser, and get the first entry out.
parser, _ := bio.NewUnirefParser(unirefEntryText)
entry, _ := parser.Next()

fmt.Println(entry.ID)
// Output: UniRef50_UPI002E2621C6
}

func ExampleNewSamParser() {
// The following can be replaced with any io.Reader. For example,
// `file, err := os.Open(path)` for a file would also work.
Expand Down
9 changes: 0 additions & 9 deletions lib/bio/uniprot/uniprot.go
@@ -28,7 +28,6 @@ import (
"io"
"net/http"
"net/url"
"strings"
)

// Decoder decodes XML elements
@@ -68,14 +67,6 @@ type Parser struct {

func NewParser(r io.Reader) *Parser {
decoder := xml.NewDecoder(r)
// Oddly enough, the uniref datasets use iso-8859-1, not UTF-8. So we need
// to incorporate this decoder charset reader.
decoder.CharsetReader = func(charset string, input io.Reader) (io.Reader, error) {
if strings.ToLower(charset) == "iso-8859-1" {
return input, nil // ISO-8859-1 bytes can be read directly as UTF-8
}
return nil, fmt.Errorf("unsupported charset: %s", charset)
}
return &Parser{decoder: decoder}
}

42 changes: 42 additions & 0 deletions lib/bio/uniref/data/uniref90_mini.xml
@@ -0,0 +1,42 @@
<?xml version="1.0" encoding="ISO-8859-1" ?>
<UniRef50 xmlns="http://uniprot.org/uniref"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://uniprot.org/uniref http://www.uniprot.org/support/docs/uniref.xsd"
releaseDate="2024-11-27" version="2024_06">
<entry id="UniRef50_UPI002E2621C6" updated="2024-05-29">
<name>Cluster: uncharacterized protein LOC134193701</name>
<property type="member count" value="1"/>
<property type="common taxon" value="Corticium candelabrum"/>
<property type="common taxon ID" value="121492"/>
<representativeMember>
<dbReference type="UniParc ID" id="UPI002E2621C6">
<property type="UniRef100 ID" value="UniRef100_UPI002E2621C6"/>
<property type="UniRef90 ID" value="UniRef90_UPI002E2621C6"/>
<property type="protein name" value="uncharacterized protein LOC134193701"/>
<property type="source organism" value="Corticium candelabrum"/>
<property type="NCBI taxonomy" value="121492"/>
<property type="length" value="49499"/>
<property type="isSeed" value="true"/>
</dbReference>
<sequence length="49499" checksum="428270C7C0D6A56C">MGR</sequence>
</representativeMember>
</entry>
<entry id="UniRef50_UPI00358F51CD" updated="2024-11-27">
<name>Cluster: LOW QUALITY PROTEIN: titin</name>
<property type="member count" value="1"/>
<property type="common taxon" value="Myxine glutinosa"/>
<property type="common taxon ID" value="7769"/>
<representativeMember>
<dbReference type="UniParc ID" id="UPI00358F51CD">
<property type="UniRef100 ID" value="UniRef100_UPI00358F51CD"/>
<property type="UniRef90 ID" value="UniRef90_UPI00358F51CD"/>
<property type="protein name" value="LOW QUALITY PROTEIN: titin"/>
<property type="source organism" value="Myxine glutinosa"/>
<property type="NCBI taxonomy" value="7769"/>
<property type="length" value="47063"/>
<property type="isSeed" value="true"/>
</dbReference>
<sequence length="47063" checksum="48729625616C010E">MSEQ</sequence>
</representativeMember>
</entry>
</UniRef50>
30 changes: 30 additions & 0 deletions lib/bio/uniref/example_test.go
@@ -0,0 +1,30 @@
package uniref_test

import (
"fmt"
"os"
"path/filepath"

"github.com/koeng101/dnadesign/lib/bio/uniref"
)

func Example() {
// Open the UniRef test file
file, _ := os.Open(filepath.Join("data", "uniref90_mini.xml"))
defer file.Close()

// Create new parser
parser, _ := uniref.NewParser(file)

// Read and print the first entry
entry, _ := parser.Next()

fmt.Printf("Entry ID: %s\n", entry.ID)
fmt.Printf("Name: %s\n", entry.Name)
fmt.Printf("Sequence Length: %d\n", entry.RepMember.Sequence.Length)

// Output:
// Entry ID: UniRef50_UPI002E2621C6
// Name: Cluster: uncharacterized protein LOC134193701
// Sequence Length: 49499
}
153 changes: 153 additions & 0 deletions lib/bio/uniref/uniref.go
@@ -0,0 +1,153 @@
/*
Package uniref provides a parser for UniRef XML files.
UniRef clusters uniprot proteins by similarity. This is useful for doing
bioinformatics on protein space, as many proteins are sequenced a ton of times
in different organisms, and you don't want those proteins to dominate your
training data.
UniRef data dumps are available as FASTA or XML formatted data. The XML has
richer data, so we use that. The parser was created using AI.
UniProt Reference Clusters (UniRef) provide clustered sets of sequences from
the UniProt Knowledgebase (including isoforms) and selected UniParc records in
order to obtain complete coverage of the sequence space at several resolutions
while hiding redundant sequences (but not their descriptions) from view.
(taken from uniref reference https://www.uniprot.org/help/uniref)
Download uniref data dumps here: https://www.uniprot.org/downloads
UniRef comes in three formats:
- UniRef100: Clusters of sequences that have 100% sequence identity and same length
- UniRef90: Clusters of sequences with at least 90% sequence identity and 80% overlap
- UniRef50: Clusters of sequences with at least 50% sequence identity and 80% overlap
*/
package uniref

import (
"bytes"
"encoding/xml"
"fmt"
"io"
"strings"
)

// Header is an empty struct since UniRef files don't have headers
type Header struct{}

// Entry represents a UniRef entry
type Entry struct {
XMLName xml.Name `xml:"entry"`
ID string `xml:"id,attr"`
Updated string `xml:"updated,attr"`
Name string `xml:"name"`
Properties []Property `xml:"property"`
RepMember RepresentativeMember `xml:"representativeMember"`
Members []Member `xml:"member"`
}

// Property represents a property element
type Property struct {
Type string `xml:"type,attr"`
Value string `xml:"value,attr"`
}

// DBReference represents a database reference
type DBReference struct {
Type string `xml:"type,attr"`
ID string `xml:"id,attr"`
Properties []Property `xml:"property"`
}

// Sequence represents a sequence element
type Sequence struct {
Length int `xml:"length,attr"`
Checksum string `xml:"checksum,attr"`
Value string `xml:",chardata"`
}

// Member represents a member element
type Member struct {
DBRef DBReference `xml:"dbReference"`
Sequence *Sequence `xml:"sequence"`
}

// RepresentativeMember represents the representative member
type RepresentativeMember Member

// UniRef represents the root element which can be UniRef50, UniRef90, or UniRef100
type UniRef struct {
XMLName xml.Name // This will automatically match the root element name
ReleaseDate string `xml:"releaseDate,attr"`
Version string `xml:"version,attr"`
Entries []Entry `xml:"entry"`
}

// GetUniRefVersion returns "50", "90", or "100" based on the XML root element name
func (u *UniRef) GetUniRefVersion() string {
name := u.XMLName.Local
if strings.HasPrefix(name, "UniRef") {
return strings.TrimPrefix(name, "UniRef")
}
return ""
}

type Parser struct {
decoder *xml.Decoder
uniref *UniRef
current int
}

func NewParser(r io.Reader) (*Parser, error) {
decoder := xml.NewDecoder(r)
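// UniRef data dumps use ISO-8859-1 rather than UTF-8; accept that charset
// and pass the bytes through instead of rejecting the document.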
decoder.CharsetReader = func(charset string, input io.Reader) (io.Reader, error) {
if strings.ToLower(charset) == "iso-8859-1" {
return input, nil
}
return nil, fmt.Errorf("unsupported charset: %s", charset)
}

return &Parser{
decoder: decoder,
current: -1,
}, nil
}

// Header returns an empty header since UniRef files don't have headers
func (p *Parser) Header() (Header, error) {
return Header{}, nil
}

// Next returns the next Entry from the UniRef file
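// The first call decodes the entire document into memory; subsequent calls
// return successive entries from that slice and end with io.EOF.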
func (p *Parser) Next() (Entry, error) {
// First time reading
if p.uniref == nil {
p.uniref = &UniRef{}
if err := p.decoder.Decode(p.uniref); err != nil {
return Entry{}, err
}
p.current = 0
}

// Check if we've reached the end of entries
if p.current >= len(p.uniref.Entries) {
return Entry{}, io.EOF
}

// Get current entry and increment counter
entry := p.uniref.Entries[p.current]
p.current++

return entry, nil
}

// ToXML converts an Entry back to its XML representation
func (e *Entry) ToXML() (string, error) {
buf := new(bytes.Buffer)
enc := xml.NewEncoder(buf)
enc.Indent("", " ")
if err := enc.Encode(e); err != nil {
return "", err
}
return buf.String(), nil
}
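
As a usage note, the exported UniRef struct, GetUniRefVersion, and Entry.ToXML can also be used without the streaming Parser. The sketch below is illustrative: the inline XML is a trimmed, UTF-8 adaptation of the test data above, and the printed output is up to the caller.

package main

import (
    "encoding/xml"
    "fmt"
    "log"

    "github.com/koeng101/dnadesign/lib/bio/uniref"
)

func main() {
    // Trimmed, UTF-8 adaptation of the uniref90_mini.xml test data.
    data := []byte(`<UniRef90 releaseDate="2024-11-27" version="2024_06">
  <entry id="UniRef90_UPI002E2621C6" updated="2024-05-29">
    <name>Cluster: uncharacterized protein LOC134193701</name>
    <representativeMember>
      <sequence length="49499" checksum="428270C7C0D6A56C">MGR</sequence>
    </representativeMember>
  </entry>
</UniRef90>`)

    var doc uniref.UniRef
    if err := xml.Unmarshal(data, &doc); err != nil {
        log.Fatal(err)
    }

    // The root element name tells us which cluster resolution this is.
    fmt.Println("UniRef version:", doc.GetUniRefVersion()) // 90

    // Round-trip the first entry back to XML.
    out, err := doc.Entries[0].ToXML()
    if err != nil {
        log.Fatal(err)
    }
    fmt.Println(out)
}
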