Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Interminebio #68

Open
wants to merge 8 commits into
base: dev
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
224 changes: 224 additions & 0 deletions intermine/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1,225 @@
VERSION = "1.11.0"
from intermine import query
from intermine.iterators import *
# Prefer simplejson when it is installed, otherwise fall back to the
# standard-library json module.
try:
    import simplejson as json
except ImportError:
    try:
        import json
    except ImportError:
        # Raising a plain string is not a valid exception (it is itself a
        # TypeError on modern interpreters), so raise a real ImportError that
        # callers can catch and that carries the intended message.
        raise ImportError("No JSON module found - please install simplejson")

class SequenceDataQuery(object):
    """
    Common export methods for sequence-oriented queries
    ===================================================

    Each export method builds an iterator from this object's ``service``
    and ``query`` attributes, which this class reads but does not set.
    """

    def to_query(self):
        """Return this object unchanged (listable query interface)."""
        return self

    def bed(self, ucsc_compatible=True):
        """
        Export the results as BED
        =========================

        @param ucsc_compatible: forwarded unchanged to the BedIterator (default: True)
        @return: a BedIterator, which stringifies to the BED results and
                 iterates over the lines. Once iterated, its header()
                 method gives access to the header information.
        """
        iterator_type = BedIterator
        return iterator_type(self.service, self.query, ucsc_compatible)

    def fasta(self):
        """
        Export the results as FASTA
        ===========================

        The query may only have a single output column when requesting
        FASTA; errors will be raised otherwise.

        @return: a FastaIterator, which stringifies to the FASTA results
                 and iterates over the records (not the lines).
        """
        iterator_type = FastaIterator
        return iterator_type(self.service, self.query)

    def gff3(self):
        """
        Export the results as GFF3
        ==========================

        @return: a GFF3Iterator, which stringifies to the GFF3 results and
                 iterates over the lines. Once iterated, its header()
                 method gives access to the header information.
        """
        iterator_type = GFF3Iterator
        return iterator_type(self.service, self.query)

class _FakeRoot(object):
@property
def name(self): return "fake-root"

class RegionQuery(SequenceDataQuery):
    """
    Query InterMine web services for features in genomic intervals
    ==============================================================

    Builds requests for sequence features that overlap a set of genomic
    regions, retrievable in biologically relevant formats.
    The currently supported formats are UCSC-BED, GFF3, and FASTA.
    A region query may also be used to construct lists.
    """

    # Service paths, relative to the service root, for each kind of request.
    LIST_PATH = "/regions/list"
    BED_PATH = "/regions/bed"
    FASTA_PATH = "/regions/fasta"
    GFF3_PATH = "/regions/gff3"

    def __init__(self, service, organism, feature_types, regions, extension=0, is_interbase=False):
        """
        Constructor
        ===========

          >>> s = Service("www.flymine.org/query", "API-KEY")
          >>> org = "D. melanogaster"
          >>> regions = ["2L:14614843..14619614"]
          >>> feature_types = ["Exon", "Intron"]
          >>> q = RegionQuery(s, org, feature_types, regions)
          <interminebio.RegionQuery @xxx>

        @param service: The service to connect to.
        @type service: intermine.webservice.Service
        @param organism: The short name of the organism to look within (eg: D. melanogaster)
        @type organism: str
        @param feature_types: The types of features to look for (stored as a set)
        @type feature_types: list[str]
        @param regions: The regions to search within, either as chrX:start..end
                        or as tab-separated chrX, start, end (stored as a set)
        @type regions: list(str)
        @param extension: A number of base-pairs to extend each region on either side (default: 0)
        @type extension: int
        @param is_interbase: Whether to interpret the co-ordinates as interbase co-ordinates
        @type is_interbase: boolean
        """
        # What to search for.
        self.service = service
        self.organism = organism
        self.feature_types = set(feature_types)
        self.regions = set(regions)
        self.extension = extension
        self.is_interbase = is_interbase

        # Per-format service paths, exposed as instance attributes.
        self.bed_path = RegionQuery.BED_PATH
        self.fasta_path = RegionQuery.FASTA_PATH
        self.gff3_path = RegionQuery.GFF3_PATH

        # A region query has no real views or root class.
        self.views = []
        self.root = _FakeRoot()

    def add_view(self, *args):
        """Accept and ignore any arguments; this method does nothing."""
        pass

    def _get_region_query(self):
        """
        Build the dictionary describing this region search.

        @rtype: dict
        """
        description = {}
        description["organism"] = self.organism
        description["featureTypes"] = list(self.feature_types)
        description["regions"] = list(self.regions)
        description["extension"] = self.extension
        description["isInterbase"] = self.is_interbase
        return description

    def to_query_params(self):
        """
        Returns the query parameters for this request.
        ==============================================

        The region description is serialised to JSON under the "query" key.
        This method is a required part of the interface for creating lists.

        @rtype: dict
        """
        serialised = json.dumps(self._get_region_query())
        return {"query": serialised}

    def get_list_upload_uri(self):
        """
        Returns the full url for the list upload service
        ================================================

        This method is a required part of the interface for creating lists.

        @rtype: str
        """
        service_root = self.service.root
        return service_root + RegionQuery.LIST_PATH

    @property
    def query(self):
        """The query object used for exports: this object itself."""
        return self


class SequenceQuery(SequenceDataQuery):
    """
    Class for querying InterMine Webservices for Sequence based data
    ================================================================

    This module allows you to construct queries that retrieve data about sequences and
    sequence features in biologically relevant formats.
    The currently supported formats are UCSC-BED, GFF3, and FASTA.
    """

    def __init__(self, service_or_query, root=None):
        """
        Constructor
        ===========

          >>> s = Service("www.flymine.org/query")
          >>> bio_query = SequenceQuery(s, "Gene")
          <interminebio.SequenceQuery xxx>
          >>> q = s.new_query("Gene").where(s.model.Gene.symbol == ["h", "r", "eve", "zen"])
          >>> bio_query = SequenceQuery(q)
          <interminebio.SequenceQuery yyy>

        @param service_or_query: The service to connect to, or a query to wrap.
        @type service_or_query: intermine.webservice.Service or intermine.query.Query
        @param root: The root class of the query (only used when a new query is created)
        @type root: str
        """
        if isinstance(service_or_query, query.Query):
            # Wrap the existing query and reuse its service.
            self.service = service_or_query.service
            self.query = service_or_query
        else:
            # Treat the argument as a service and start a fresh query on it.
            self.service = service_or_query
            self.query = query.Query(self.service.model, self.service, root=root)

        # Set up delegations
        self.add_constraint = self.query.add_constraint
        self.filter = self.where

        self.to_xml = self.query.to_xml

        self.get_logic = self.query.get_logic
        self.set_logic = self.query.set_logic

        # Aliases for the sequence selection methods.
        self.select_sequence = self.set_sequence
        self.select_sequences = self.add_sequence_feature
        self.add_sequence_features = self.add_sequence_feature

    def add_sequence_feature(self, *features):
        """
        Add an arbitrarily long list of sequence features to the query.
        ===============================================================

        Fasta, GFF3 and BED queries all can read information from SequenceFeatures.
        For Fasta you are advised to use the set_sequence method instead,
        as unlike the GFF3 and BED services, the Fasta service can only handle
        queries with one output column.

        Every feature is validated before any view is added, so a rejected
        feature leaves the wrapped query's views untouched.

        @return: self, for chaining
        @raise ValueError: if a feature is an attribute or is not a SequenceFeature
        """
        new_views = []
        for f in features:
            p = self.query.column(f)._path
            if p.is_attribute() or not p.get_class().isa("SequenceFeature"):
                # (f,) keeps the formatting correct even if f is itself a tuple.
                raise ValueError("%s is not a Sequence Feature" % (f,))
            new_views.append(str(p) + ".id")

        for view in new_views:
            self.query.add_view(view)

        return self

    def where(self, *args, **kwargs):
        """
        Add a constraint to the query, and return self for chaining.
        """
        self.query.where(*args, **kwargs)
        return self

    def set_sequence(self, f):
        """
        Set the sequence column to retrieve.
        ====================================

        Add a sequence holding object to the query. It can be a SequenceFeature, Protein
        or Sequence object.
        Fasta queries, which read sequences rather than sequence features,
        currently only permit one output column.

        The existing views are only replaced once the argument has been
        validated, so a rejected argument leaves the wrapped query unchanged.

        @return: self, for chaining
        @raise ValueError: if f is an attribute or has no sequence information
        """
        p = self.query.column(f)._path
        if p.is_attribute() or not (p.get_class().isa("SequenceFeature") or
                                    p.get_class().isa("Protein") or
                                    p.get_class().isa("Sequence")):
            # (f,) keeps the formatting correct even if f is itself a tuple.
            raise ValueError("%s has no sequence information" % (f,))

        # Validation passed: replace whatever views were selected before.
        self.query.views = []
        self.query.add_view(str(p) + ".id")

        return self
149 changes: 149 additions & 0 deletions intermine/iterators.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,149 @@
class BedIterator(object):
    """
    Iterator over the data lines of a BED export.

    Header lines (those starting with "#" or "track") are collected as they
    are encountered and are available from header(). str() of the iterator
    consumes it and returns the header followed by the data lines.
    """

    # Cache of resolved "query.bed" service paths, keyed by service root.
    BED_PATHS = {}

    def __init__(self, service, query, ucsc_compatible=True):
        """
        @param service: object providing root, resolve_service_path() and get_results()
        @param query: object providing to_query_params() and views, and
                      optionally a bed_path attribute that overrides the
                      resolved service path
        @param ucsc_compatible: when false, "ucscCompatible": "no" is added
                                to the request parameters
        """
        if service.root not in BedIterator.BED_PATHS:
            BedIterator.BED_PATHS[service.root] = service.resolve_service_path("query.bed")
        self.path = BedIterator.BED_PATHS[service.root]
        self.service = service
        self.query = query
        self.ucsc_compatible = ucsc_compatible
        self._header = []
        self.it = self._get_iter()

    def header(self):
        """Return the header lines seen so far, joined with newlines."""
        return "\n".join(self._header)

    def _get_iter(self):
        """Request the results from the service and return its line iterator."""
        params = self.query.to_query_params()
        if not self.ucsc_compatible:
            params["ucscCompatible"] = "no"
        # Only a missing attribute selects the default path; any other error
        # raised while reading bed_path is allowed to surface.
        path = getattr(self.query, "bed_path", self.path)
        return self.service.get_results(path, params, "tsv", self.query.views)

    def _next_line(self):
        """Advance the underlying iterator, whichever next-protocol it implements."""
        advance = getattr(self.it, "__next__", None)
        if advance is None:
            advance = self.it.next
        return advance()

    def __str__(self):
        # The lines must be collected first: iterating fills self._header.
        lines = [line for line in self]
        return "\n".join(self._header + lines)

    def __iter__(self):
        return self

    def next(self):
        """
        Return the next data line, storing any header lines passed on the way.

        @raise StopIteration: when the results are exhausted or an empty
                              (falsy) line is received
        """
        line = self._next_line()
        # A single startswith over both prefixes; the "line and" guard now
        # covers the "track" test as well, so a None line cannot raise here.
        while line and line.startswith(("#", "track")):
            self._header.append(line)
            line = self._next_line()
        if line:
            return line
        raise StopIteration

    # Python 3 spelling of the iterator protocol.
    __next__ = next

class GFF3Iterator(object):
    """
    Iterator over the data lines of a GFF3 export.

    Header lines (those starting with "#") are collected as they are
    encountered and are available from header(). str() of the iterator
    consumes it and returns the header followed by the data lines.
    """

    # Cache of resolved "query.gff3" service paths, keyed by service root.
    GFF3_PATHS = {}

    def __init__(self, service, query):
        """
        @param service: object providing root, resolve_service_path() and get_results()
        @param query: object providing to_query_params() and views, and
                      optionally a gff3_path attribute that overrides the
                      resolved service path
        """
        if service.root not in GFF3Iterator.GFF3_PATHS:
            GFF3Iterator.GFF3_PATHS[service.root] = service.resolve_service_path("query.gff3")
        self.path = GFF3Iterator.GFF3_PATHS[service.root]
        self.service = service
        self.query = query
        self._header = []
        self.it = self._get_iter()

    def header(self):
        """Return the header lines seen so far, joined with newlines."""
        return "\n".join(self._header)

    def _get_iter(self):
        """Request the results from the service and return its line iterator."""
        params = self.query.to_query_params()
        # Only a missing attribute selects the default path; any other error
        # raised while reading gff3_path is allowed to surface.
        path = getattr(self.query, "gff3_path", self.path)
        return self.service.get_results(path, params, "tsv", self.query.views)

    def _next_line(self):
        """Advance the underlying iterator, whichever next-protocol it implements."""
        advance = getattr(self.it, "__next__", None)
        if advance is None:
            advance = self.it.next
        return advance()

    def __str__(self):
        # The lines must be collected first: iterating fills self._header.
        lines = [line for line in self]
        return "\n".join(self._header + lines)

    def __iter__(self):
        return self

    def next(self):
        """
        Return the next data line, storing any header lines passed on the way.

        @raise StopIteration: when the results are exhausted or an empty
                              (falsy) line is received
        """
        line = self._next_line()
        while line and line.startswith("#"):
            self._header.append(line)
            line = self._next_line()
        if line:
            return line
        raise StopIteration

    # Python 3 spelling of the iterator protocol.
    __next__ = next

class FastaIterator(object):
    """
    Iterator over the records of a FASTA export.

    The service results are read line by line; every line starting with ">"
    begins a new record. Each iteration step returns one whole record as a
    newline-joined string, and str() of the iterator consumes it and returns
    all records joined with newlines.
    """

    # Cache of resolved "query.fasta" service paths, keyed by service root.
    FASTA_PATHS = {}

    def __init__(self, service, query):
        """
        @param service: object providing root, resolve_service_path() and get_results()
        @param query: object providing to_query_params() and views, and
                      optionally a fasta_path attribute that overrides the
                      resolved service path
        """
        if service.root not in FastaIterator.FASTA_PATHS:
            FastaIterator.FASTA_PATHS[service.root] = service.resolve_service_path("query.fasta")
        self.path = FastaIterator.FASTA_PATHS[service.root]
        self.service = service
        self.query = query
        self.it = self._get_iter()
        # Header line of the next record, read while finishing the previous one.
        self._holdover = None

    def _get_iter(self):
        """Request the results from the service and return its line iterator."""
        params = self.query.to_query_params()
        # Only a missing attribute selects the default path; any other error
        # raised while reading fasta_path is allowed to surface.
        path = getattr(self.query, "fasta_path", self.path)
        return self.service.get_results(path, params, "tsv", self.query.views)

    def _next_line(self):
        """Advance the underlying iterator, whichever next-protocol it implements."""
        advance = getattr(self.it, "__next__", None)
        if advance is None:
            advance = self.it.next
        return advance()

    def __str__(self):
        records = [rec for rec in self]
        return "\n".join(records)

    def __iter__(self):
        return self

    def next(self):
        """
        Return the next record as a single newline-joined string.

        @raise StopIteration: when no further records are available
        """
        # self.it is set to None once the underlying results are exhausted.
        if self.it is None:
            raise StopIteration

        lines = []
        if self._holdover is not None:
            # Start with the record header read during the previous call.
            lines.append(self._holdover)
            self._holdover = None
        else:
            try:
                lines.append(self._next_line())
            except StopIteration:
                self.it = None

        try:
            while self.it is not None:
                line = self._next_line()
                if line.startswith(">"):
                    # Start of the following record: keep it for the next call.
                    self._holdover = line
                    break
                lines.append(line)
        except StopIteration:
            self.it = None

        if lines:
            return "\n".join(lines)

        # No lines means the results were already exhausted. A hold-over can
        # never be pending here, because it is only set after at least one
        # line has been collected for the current record, so there is nothing
        # left to return.
        raise StopIteration

    # Python 3 spelling of the iterator protocol.
    __next__ = next


Loading