-
-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Adds uniref as a parser. Uniref uses a different format from uniprot, so it is necessary to add another file format.
- Loading branch information
Showing
8 changed files
with
520 additions
and
11 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
<?xml version="1.0" encoding="ISO-8859-1" ?> | ||
<UniRef50 xmlns="http://uniprot.org/uniref" | ||
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" | ||
xsi:schemaLocation="http://uniprot.org/uniref http://www.uniprot.org/support/docs/uniref.xsd" | ||
releaseDate="2024-11-27" version="2024_06"> | ||
<entry id="UniRef50_UPI002E2621C6" updated="2024-05-29"> | ||
<name>Cluster: uncharacterized protein LOC134193701</name> | ||
<property type="member count" value="1"/> | ||
<property type="common taxon" value="Corticium candelabrum"/> | ||
<property type="common taxon ID" value="121492"/> | ||
<representativeMember> | ||
<dbReference type="UniParc ID" id="UPI002E2621C6"> | ||
<property type="UniRef100 ID" value="UniRef100_UPI002E2621C6"/> | ||
<property type="UniRef90 ID" value="UniRef90_UPI002E2621C6"/> | ||
<property type="protein name" value="uncharacterized protein LOC134193701"/> | ||
<property type="source organism" value="Corticium candelabrum"/> | ||
<property type="NCBI taxonomy" value="121492"/> | ||
<property type="length" value="49499"/> | ||
<property type="isSeed" value="true"/> | ||
</dbReference> | ||
<sequence length="49499" checksum="428270C7C0D6A56C">MGR</sequence> | ||
</representativeMember> | ||
</entry> | ||
<entry id="UniRef50_UPI00358F51CD" updated="2024-11-27"> | ||
<name>Cluster: LOW QUALITY PROTEIN: titin</name> | ||
<property type="member count" value="1"/> | ||
<property type="common taxon" value="Myxine glutinosa"/> | ||
<property type="common taxon ID" value="7769"/> | ||
<representativeMember> | ||
<dbReference type="UniParc ID" id="UPI00358F51CD"> | ||
<property type="UniRef100 ID" value="UniRef100_UPI00358F51CD"/> | ||
<property type="UniRef90 ID" value="UniRef90_UPI00358F51CD"/> | ||
<property type="protein name" value="LOW QUALITY PROTEIN: titin"/> | ||
<property type="source organism" value="Myxine glutinosa"/> | ||
<property type="NCBI taxonomy" value="7769"/> | ||
<property type="length" value="47063"/> | ||
<property type="isSeed" value="true"/> | ||
</dbReference> | ||
<sequence length="47063" checksum="48729625616C010E">MSEQ</sequence> | ||
</representativeMember> | ||
</entry> | ||
</UniRef50> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
package uniref_test | ||
|
||
import ( | ||
"fmt" | ||
"os" | ||
"path/filepath" | ||
|
||
"github.com/koeng101/dnadesign/lib/bio/uniref" | ||
) | ||
|
||
func Example() { | ||
// Open the gzipped UniRef file | ||
file, _ := os.Open(filepath.Join("data", "uniref90_mini.xml")) | ||
defer file.Close() | ||
|
||
// Create new parser | ||
parser, _ := uniref.NewParser(file) | ||
|
||
// Read and print the first entry | ||
entry, _ := parser.Next() | ||
|
||
fmt.Printf("Entry ID: %s\n", entry.ID) | ||
fmt.Printf("Name: %s\n", entry.Name) | ||
fmt.Printf("Sequence Length: %d\n", entry.RepMember.Sequence.Length) | ||
|
||
// Output: | ||
// Entry ID: UniRef50_UPI002E2621C6 | ||
// Name: Cluster: uncharacterized protein LOC134193701 | ||
// Sequence Length: 49499 | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,153 @@ | ||
/* | ||
Package uniref provides a parser for UniRef XML files. | ||
UniRef clusters uniprot proteins by similarity. This is useful for doing | ||
bioinformatics on protein space, as many proteins are sequenced a ton of times | ||
in different organisms, and you don't want those proteins to dominate your | ||
training data. | ||
UniRef data dumps are available as FASTA- or XML-formatted data. The XML | ||
carries richer data, so we parse that. The parser was created using AI. | ||
UniProt Reference Clusters (UniRef) provide clustered sets of sequences from | ||
the UniProt Knowledgebase (including isoforms) and selected UniParc records in | ||
order to obtain complete coverage of the sequence space at several resolutions | ||
while hiding redundant sequences (but not their descriptions) from view. | ||
(taken from uniref reference https://www.uniprot.org/help/uniref) | ||
Download uniref data dumps here: https://www.uniprot.org/downloads | ||
UniRef comes in three formats: | ||
- UniRef100: Clusters of sequences that have 100% sequence identity and same length | ||
- UniRef90: Clusters of sequences with at least 90% sequence identity and 80% overlap | ||
- UniRef50: Clusters of sequences with at least 50% sequence identity and 80% overlap | ||
*/ | ||
package uniref | ||
|
||
import ( | ||
"bytes" | ||
"encoding/xml" | ||
"fmt" | ||
"io" | ||
"strings" | ||
) | ||
|
||
// Header carries no data: UniRef files have no header section, so this type
// is an empty placeholder.
type Header struct{}
|
||
// Entry represents a UniRef entry | ||
type Entry struct { | ||
XMLName xml.Name `xml:"entry"` | ||
ID string `xml:"id,attr"` | ||
Updated string `xml:"updated,attr"` | ||
Name string `xml:"name"` | ||
Properties []Property `xml:"property"` | ||
RepMember RepresentativeMember `xml:"representativeMember"` | ||
Members []Member `xml:"member"` | ||
} | ||
|
||
// Property is a typed key/value pair taken from the type and value
// attributes of a <property> element.
type Property struct {
	Type  string `xml:"type,attr"`
	Value string `xml:"value,attr"`
}
|
||
// DBReference represents a database reference | ||
type DBReference struct { | ||
Type string `xml:"type,attr"` | ||
ID string `xml:"id,attr"` | ||
Properties []Property `xml:"property"` | ||
} | ||
|
||
// Sequence is a <sequence> element: the length and checksum attributes plus
// the element's character data.
type Sequence struct {
	Length   int    `xml:"length,attr"`
	Checksum string `xml:"checksum,attr"`
	Value    string `xml:",chardata"`
}
|
||
// Member represents a member element | ||
type Member struct { | ||
DBRef DBReference `xml:"dbReference"` | ||
Sequence *Sequence `xml:"sequence"` | ||
} | ||
|
||
// RepresentativeMember represents the representative member | ||
type RepresentativeMember Member | ||
|
||
// UniRef represents the root element which can be UniRef50, UniRef90, or UniRef100 | ||
type UniRef struct { | ||
XMLName xml.Name // This will automatically match the root element name | ||
ReleaseDate string `xml:"releaseDate,attr"` | ||
Version string `xml:"version,attr"` | ||
Entries []Entry `xml:"entry"` | ||
} | ||
|
||
// GetUniRefVersion returns "50", "90", or "100" based on the XML root element name | ||
func (u *UniRef) GetUniRefVersion() string { | ||
name := u.XMLName.Local | ||
if strings.HasPrefix(name, "UniRef") { | ||
return strings.TrimPrefix(name, "UniRef") | ||
} | ||
return "" | ||
} | ||
|
||
type Parser struct { | ||
decoder *xml.Decoder | ||
uniref *UniRef | ||
current int | ||
} | ||
|
||
func NewParser(r io.Reader) (*Parser, error) { | ||
decoder := xml.NewDecoder(r) | ||
decoder.CharsetReader = func(charset string, input io.Reader) (io.Reader, error) { | ||
if strings.ToLower(charset) == "iso-8859-1" { | ||
return input, nil | ||
} | ||
return nil, fmt.Errorf("unsupported charset: %s", charset) | ||
} | ||
|
||
return &Parser{ | ||
decoder: decoder, | ||
current: -1, | ||
}, nil | ||
} | ||
|
||
// Header returns an empty header since UniRef files don't have headers | ||
func (p *Parser) Header() (Header, error) { | ||
return Header{}, nil | ||
} | ||
|
||
// Next returns the next Entry from the UniRef file | ||
func (p *Parser) Next() (Entry, error) { | ||
// First time reading | ||
if p.uniref == nil { | ||
p.uniref = &UniRef{} | ||
if err := p.decoder.Decode(p.uniref); err != nil { | ||
return Entry{}, err | ||
} | ||
p.current = 0 | ||
} | ||
|
||
// Check if we've reached the end of entries | ||
if p.current >= len(p.uniref.Entries) { | ||
return Entry{}, io.EOF | ||
} | ||
|
||
// Get current entry and increment counter | ||
entry := p.uniref.Entries[p.current] | ||
p.current++ | ||
|
||
return entry, nil | ||
} | ||
|
||
// ToXML converts an Entry back to its XML representation | ||
func (e *Entry) ToXML() (string, error) { | ||
buf := new(bytes.Buffer) | ||
enc := xml.NewEncoder(buf) | ||
enc.Indent("", " ") | ||
if err := enc.Encode(e); err != nil { | ||
return "", err | ||
} | ||
return buf.String(), nil | ||
} |
Oops, something went wrong.