From 03ce887741e34adeb2f545fad3c19e1e554d55e3 Mon Sep 17 00:00:00 2001 From: Basil Asad <55938020+mbasil09@users.noreply.github.com> Date: Mon, 23 Dec 2019 23:56:27 +0530 Subject: [PATCH 1/8] interminebio init --- intermine/__init__.py | 224 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 224 insertions(+) diff --git a/intermine/__init__.py b/intermine/__init__.py index d891f299..0800d69a 100644 --- a/intermine/__init__.py +++ b/intermine/__init__.py @@ -1 +1,225 @@ VERSION = "1.11.0" +from intermine import query +from interminebio.iterators import * +try: + import simplejson as json +except ImportError: + try: + import json + except ImportError: + raise "No JSON module found - please install simplejson" + +class SequenceDataQuery(object): + + def to_query(self): + """Fulfil the listable query interface""" + return self + + def bed(self, ucsc_compatible=True): + """ + Get results as BED + ================== + Return a BedIterator object, which stringifies to the BED results, + and works as an iterator over the lines. After iteration the header + information is accessible with the iter.header() method + """ + return BedIterator(self.service, self.query, ucsc_compatible) + + def fasta(self): + """ + Get results as FASTA + ==================== + Return a FastaIterator object, which stringifies to the Fasta results, + and works as an iterator over the records (not the lines). + When attempting to get results as FASTA the query may only have a single + output column. Errors will be raised otherwise. + """ + return FastaIterator(self.service, self.query) + + def gff3(self): + """ + Get results as GFF3 + =================== + Return a GFF3Iterator object, which stringifies to the GFF3 results, + and works as an iterator over the lines. After iteration the header + information is accessible with the iter.header() method + """ + return GFF3Iterator(self.service, self.query) + +class _FakeRoot(object): + @property + def name(self): return "fake-root" + +class RegionQuery(SequenceDataQuery): + """ + Class for querying InterMine Webservices for Features in Genomic Intervals + ========================================================================== + This module allows you to construct queries that retrieve data about sequences and + sequence features in biologically relevant formats, where those features are located + overlapping genomic intervals. + The currently supported formats are UCSC-BED, GFF3, and FASTA. + These queries may also be used to construct lists with. + """ + + + LIST_PATH = "/regions/list" + BED_PATH = "/regions/bed" + FASTA_PATH = "/regions/fasta" + GFF3_PATH = "/regions/gff3" + + def __init__(self, service, organism, feature_types, regions, extension=0, is_interbase=False): + """ + Constructor + =========== + >>> s = Service("www.flymine.org/query", "API-KEY") + >>> org = "D. melanogaster" + >>> regions = ["2L:14614843..14619614"] + >>> feature_types = ["Exon", "Intron"] + >>> q = RegionQuery(s, org, feature_types, regions) + + @param service: The service to connect to. + @type service: intermine.webservice.Service + @param organism: The short name of the organism to look within (eg: D. melanogaster) + @type organism: str + @param feature_types: The types of features to look for + @type feature_types: list[str] + @param regions: The regions to search within, in chrX:start..end or chrX\tstart\tend format + @type regions: list(str) + @param extension: A number of base-pairs to extend each region on either side (default: 0) + @type extension: int + @param is_interbase: Whether to interpret the co-ordinates as interbase co-ordinates + @type is_interbase: boolean + """ + self.service = service + self.organism = organism + self.feature_types = set(feature_types) + self.regions = set(regions) + self.extension = extension + self.is_interbase = is_interbase + self.bed_path = RegionQuery.BED_PATH + self.fasta_path = RegionQuery.FASTA_PATH + self.gff3_path = RegionQuery.GFF3_PATH + self.views = [] + self.root = _FakeRoot() + + def add_view(self, *args): + pass + + def _get_region_query(self): + return { + "organism": self.organism, + "featureTypes": list(self.feature_types), + "regions": list(self.regions), + "extension": self.extension, + "isInterbase": self.is_interbase + } + + def to_query_params(self): + """ + Returns the query parameters for this request. + ============================================== + This method is a required part of the interface for creating lists. + @rtype: dict + """ + return {"query": json.dumps(self._get_region_query())} + + def get_list_upload_uri(self): + """ + Returns the full url for the list upload service + ================================================ + This method is a required part of the interface for creating lists. + @rtype: str + """ + return self.service.root + RegionQuery.LIST_PATH + + @property + def query(self): + return self + + +class SequenceQuery(SequenceDataQuery): + """ + Class for querying InterMine Webservices for Sequence based data + ================================================================ + This module allows you to construct queries that retrieve data about sequences and + sequence features in biologically relevant formats. + The currently supported formats are UCSC-BED, GFF3, and FASTA. + """ + + def __init__(self, service_or_query, root=None): + """ + Constructor + =========== + >>> s = Service("www.flymine.org/query") + >>> bio_query = SequenceQuery(s, "Gene") + + >>> q = s.new_query("Gene").where(s.model.Gene.symbol == ["h", "r", "eve", "zen"]) + >>> bio_query = SequenceQuery(q) + + @param service_or_query: The service to connect to, or a query to wrap. + @type service_or_query: intermine.webservice.Service or intermine.query.Query + @param root: The root class of the query + @type root: str + """ + if isinstance(service_or_query, query.Query): + self.service = service_or_query.service + self.query = service_or_query + else: + self.service = service_or_query + self.query = query.Query(self.service.model, self.service, root=root) + + # Set up delegations + self.add_constraint = self.query.add_constraint + self.filter = self.where + + self.to_xml = self.query.to_xml + + self.get_logic = self.query.get_logic + self.set_logic = self.query.set_logic + + self.select_sequence = self.set_sequence + self.select_sequences = self.add_sequence_feature + self.add_sequence_features = self.add_sequence_feature + + def add_sequence_feature(self, *features): + """ + Add an arbitrarily long list of sequence features to the query. + =============================================================== + Fasta, GFF3 and BED queries all can read information from SequenceFeatures. + For Fasta you are advised to use the set_sequence method instead, + as unlike the GFF3 and BED services, the Fasta service can only handle + queries with one output column. + """ + for f in features: + p = self.query.column(f)._path + if p.is_attribute() or not p.get_class().isa("SequenceFeature"): + raise ValueError("%s is not a Sequence Feature" % (f)) + self.query.add_view(str(p) + ".id") + + return self + + def where(self, *args, **kwargs): + """ + Add a constraint to the query, and return self for chaining. + """ + self.query.where(*args, **kwargs) + return self + + def set_sequence(self, f): + """ + Set the sequence column to retrieve. + ==================================== + Add a sequence holding object to the query. It can be a SequenceFeature, Protein + or Sequence object. + Fasta queries, which read sequences rather than sequence features, + currently only permit one output column. + """ + self.query.views = [] + p = self.query.column(f)._path + if p.is_attribute() or not (p.get_class().isa("SequenceFeature") or + p.get_class().isa("Protein") or + p.get_class().isa("Sequence")): + raise ValueError("%s has no sequence information" % (f)) + self.query.add_view(str(p) + ".id") + + return self From 82f8e77c2042351ba09a94a96b191d3fd3a462cd Mon Sep 17 00:00:00 2001 From: Basil Asad <55938020+mbasil09@users.noreply.github.com> Date: Mon, 23 Dec 2019 23:57:20 +0530 Subject: [PATCH 2/8] interminebio --- intermine/iterators.py | 149 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 intermine/iterators.py diff --git a/intermine/iterators.py b/intermine/iterators.py new file mode 100644 index 00000000..faad2f7b --- /dev/null +++ b/intermine/iterators.py @@ -0,0 +1,149 @@ +class BedIterator(object): + + BED_PATHS = {} + + def __init__(self, service, query, ucsc_compatible=True): + if service.root not in BedIterator.BED_PATHS: + BedIterator.BED_PATHS[service.root] = service.resolve_service_path("query.bed") + self.path = BedIterator.BED_PATHS[service.root] + self.service = service + self.query = query + self.ucsc_compatible = ucsc_compatible + self._header = [] + self.it = self._get_iter() + + def header(self): + return "\n".join(self._header) + + def _get_iter(self): + params = self.query.to_query_params() + if not self.ucsc_compatible: + params["ucscCompatible"] = "no" + try: + path = self.query.bed_path + except: + path = self.path + i = self.service.get_results(path, params, "tsv", self.query.views) + return i + + def __str__(self): + lines = [line for line in self] + return "\n".join(self._header + lines) + + def __iter__(self): + return self + + def next(self): + line = self.it.next() + while line and line.startswith("#") or line.startswith("track"): + self._header.append(line) + line = self.it.next() + if line: + return line + raise StopIteration + +class GFF3Iterator(object): + + GFF3_PATHS = {} + + def __init__(self, service, query): + if service.root not in GFF3Iterator.GFF3_PATHS: + GFF3Iterator.GFF3_PATHS[service.root] = service.resolve_service_path("query.gff3") + self.path = GFF3Iterator.GFF3_PATHS[service.root] + self.service = service + self.query = query + self._header = [] + self.it = self._get_iter() + + def header(self): + return "\n".join(self._header) + + def _get_iter(self): + params = self.query.to_query_params() + try: + path = self.query.gff3_path + except: + path = self.path + i = self.service.get_results(path, params, "tsv", self.query.views) + return i + + def __str__(self): + lines = [line for line in self] + return "\n".join(self._header + lines) + + def __iter__(self): + return self + + def next(self): + line = self.it.next() + while line and line.startswith("#"): + self._header.append(line) + line = self.it.next() + if line: + return line + raise StopIteration + +class FastaIterator(object): + + FASTA_PATHS = {} + + def __init__(self, service, query): + if service.root not in FastaIterator.FASTA_PATHS: + FastaIterator.FASTA_PATHS[service.root] = service.resolve_service_path("query.fasta") + self.path = FastaIterator.FASTA_PATHS[service.root] + self.service = service + self.query = query + self.it = self._get_iter() + self._holdover = None + + def _get_iter(self): + params = self.query.to_query_params() + try: + path = self.query.fasta_path + except: + path = self.path + i = self.service.get_results(path, params, "tsv", self.query.views) + return i + + def __str__(self): + records = [rec for rec in self] + return "\n".join(records) + + def __iter__(self): + return self + + def next(self): + lines = [] + if self.it is None: + raise StopIteration + + if self._holdover is not None: + lines.append(self._holdover) + self._holdover = None + else: + try: + lines.append(self.it.next()) + except StopIteration: + self.it = None + + try: + while self.it is not None: + line = self.it.next() + if line.startswith(">"): + self._holdover = line + break + lines.append(line) + except StopIteration: + self.it = None + + if len(lines): + return "\n".join(lines) + + if self._holdover: + ret = self._holdover + self._holdover = None + return self._holdover + + raise StopIteration + + From 4d9423729b6d7f46167ca21e2058a710e37ed8fd Mon Sep 17 00:00:00 2001 From: Basil Asad <55938020+mbasil09@users.noreply.github.com> Date: Mon, 23 Dec 2019 23:59:05 +0530 Subject: [PATCH 3/8] interminebiotest --- tests/live_test.py | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100644 tests/live_test.py diff --git a/tests/live_test.py b/tests/live_test.py new file mode 100644 index 00000000..ea410704 --- /dev/null +++ b/tests/live_test.py @@ -0,0 +1,28 @@ +import sys +import os +sys.path.insert(0, os.path.abspath(os.path.dirname(__file__) + "/..")) + +from intermine.webservice import Service +from interminebio import RegionQuery, SequenceQuery +s = Service("squirrel.flymine.org/flymine", token="C1o3t1e0d4V06ep8xb47DdlFVMr") +q = RegionQuery(s, "D. melanogaster", ["Exon", "Intron"], ["2L:14614843..14619614", "Foo"]) + +print q.bed() +print q.fasta() +print q.gff3() + +l = s.create_list(q) + +print str(l) + +sq = SequenceQuery(s, "Gene") + +sq.add_sequence_features("Gene").where("symbol", "ONE OF", ["eve", "zen", "r"]) + +print sq.fasta() + +sq.add_sequence_features("exons") + +print sq.bed() +print sq.gff3() + From adbf91dc35cec45b54561ec5370ed81b9169a18a Mon Sep 17 00:00:00 2001 From: Basil Asad <55938020+mbasil09@users.noreply.github.com> Date: Tue, 24 Dec 2019 00:00:05 +0530 Subject: [PATCH 4/8] interminebio -> intermine --- intermine/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/intermine/__init__.py b/intermine/__init__.py index 0800d69a..8b98a7c7 100644 --- a/intermine/__init__.py +++ b/intermine/__init__.py @@ -1,6 +1,6 @@ VERSION = "1.11.0" from intermine import query -from interminebio.iterators import * +from intermine.iterators import * try: import simplejson as json except ImportError: From 3c5615f87ba6e22188bfe3f16179ffcb02268ec9 Mon Sep 17 00:00:00 2001 From: Basil Asad <55938020+mbasil09@users.noreply.github.com> Date: Tue, 24 Dec 2019 00:14:51 +0530 Subject: [PATCH 5/8] update to py-3 --- tests/live_test.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/live_test.py b/tests/live_test.py index ea410704..480905d7 100644 --- a/tests/live_test.py +++ b/tests/live_test.py @@ -7,9 +7,9 @@ s = Service("squirrel.flymine.org/flymine", token="C1o3t1e0d4V06ep8xb47DdlFVMr") q = RegionQuery(s, "D. melanogaster", ["Exon", "Intron"], ["2L:14614843..14619614", "Foo"]) -print q.bed() -print q.fasta() -print q.gff3() +print (q.bed()) +print (q.fasta()) +print (q.gff3()) l = s.create_list(q) From d4f8bd8931feda107c39a879e5e0a1c13439e33f Mon Sep 17 00:00:00 2001 From: Basil Asad <55938020+mbasil09@users.noreply.github.com> Date: Tue, 24 Dec 2019 00:16:57 +0530 Subject: [PATCH 6/8] brackets --- tests/live_test.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/live_test.py b/tests/live_test.py index 480905d7..fb0c362c 100644 --- a/tests/live_test.py +++ b/tests/live_test.py @@ -13,16 +13,16 @@ l = s.create_list(q) -print str(l) +print (str(l)) sq = SequenceQuery(s, "Gene") sq.add_sequence_features("Gene").where("symbol", "ONE OF", ["eve", "zen", "r"]) -print sq.fasta() +print (sq.fasta()) sq.add_sequence_features("exons") -print sq.bed() -print sq.gff3() +print (sq.bed()) +print (sq.gff3()) From cd5e34833b5f25c788ae0284c388635bb463efe3 Mon Sep 17 00:00:00 2001 From: Basil Asad <55938020+mbasil09@users.noreply.github.com> Date: Tue, 24 Dec 2019 00:35:10 +0530 Subject: [PATCH 7/8] imports --- tests/live_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/live_test.py b/tests/live_test.py index fb0c362c..2524a6ca 100644 --- a/tests/live_test.py +++ b/tests/live_test.py @@ -3,7 +3,7 @@ sys.path.insert(0, os.path.abspath(os.path.dirname(__file__) + "/..")) from intermine.webservice import Service -from interminebio import RegionQuery, SequenceQuery +from intermine import RegionQuery, SequenceQuery s = Service("squirrel.flymine.org/flymine", token="C1o3t1e0d4V06ep8xb47DdlFVMr") q = RegionQuery(s, "D. melanogaster", ["Exon", "Intron"], ["2L:14614843..14619614", "Foo"]) From 603b85e07605d89ab3b047f2376a2712b955e4a9 Mon Sep 17 00:00:00 2001 From: Basil Asad <55938020+mbasil09@users.noreply.github.com> Date: Tue, 24 Dec 2019 00:49:56 +0530 Subject: [PATCH 8/8] rename --- tests/{live_test.py => test_live.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename tests/{live_test.py => test_live.py} (100%) diff --git a/tests/live_test.py b/tests/test_live.py similarity index 100% rename from tests/live_test.py rename to tests/test_live.py