From a21cd9dc27f40de3de38470155b579f125916d69 Mon Sep 17 00:00:00 2001 From: Cyril Pommier Date: Thu, 17 Sep 2020 15:05:14 +0200 Subject: [PATCH 1/4] Add FEM --- sources/FEM.json | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) create mode 100644 sources/FEM.json diff --git a/sources/FEM.json b/sources/FEM.json new file mode 100644 index 0000000..92d1897 --- /dev/null +++ b/sources/FEM.json @@ -0,0 +1,20 @@ +{ + "@context": { + "schema": "http://schema.org/", + "brapi": "https://brapi.org/" + }, + "@type": "schema:DataCatalog", + "@id": "https://www.fmach.it", + "schema:identifier": "FEM", + "schema:name": "FEM PhenoDB", + "brapi:endpointUrl": "http://51.145.230.169:8081/brapi/v1/", + + "implemented-calls": [ + "GET germplasm", + "GET trials", + "GET studies", + "GET studies/{studyDbId}", + "GET studies/{studyDbId}/germplasm", + "GET studies/{studyDbId}/observationUnit" + ] +} From 795644f4994dbd666baf9f480ba9a061c7351130 Mon Sep 17 00:00:00 2001 From: Cyril Pommier <3224591+cpommier@users.noreply.github.com> Date: Mon, 15 Feb 2021 18:29:30 +0100 Subject: [PATCH 2/4] Fix/macos_multiprocess (#38) * Working fix for macos * Improved error message on transformation failure * Raise error with faulty endpoints * Handle case with unnecessary data list for study details call, see the comment delimiters in brapi.py --- .gitignore | 2 ++ config/extract-brapi/entities/study.json | 3 ++- etl/extract/brapi.py | 13 +++++++++++++ etl/transform/elasticsearch.py | 7 ++++--- etl/transform/uri.py | 8 ++++++-- 5 files changed, 27 insertions(+), 6 deletions(-) diff --git a/.gitignore b/.gitignore index fcc3662..96f2631 100644 --- a/.gitignore +++ b/.gitignore @@ -100,3 +100,5 @@ ENV/ .vagrant /build-centos6/*.tar.gz +/sandbox/ +/sandboxSources/ diff --git a/config/extract-brapi/entities/study.json b/config/extract-brapi/entities/study.json index 2964c90..8f2b04f 100644 --- a/config/extract-brapi/entities/study.json +++ b/config/extract-brapi/entities/study.json @@ -21,6 +21,7 @@ },
"detail": { "required": true, + "expect-single-result": true, "call": { "method": "GET", "path": "studies/{studyDbId}" @@ -96,4 +97,4 @@ "json-path": "." } ] -} \ No newline at end of file +} diff --git a/etl/extract/brapi.py b/etl/extract/brapi.py index 87f9c3e..7e29638 100755 --- a/etl/extract/brapi.py +++ b/etl/extract/brapi.py @@ -20,6 +20,9 @@ class BrokenLink(Exception): pass +class EndPointError(Exception): + pass + def link_object(dest_entity_name, dest_object, src_object_id): dest_object_ref = dest_entity_name + 'DbIds' @@ -92,6 +95,16 @@ def fetch_details(options): details = BreedingAPIIterator.fetch_all(source['brapi:endpointUrl'], detail_call, logger).__next__() details['etl:detailed'] = True + + # ----------------------------------------------------------------- + # Detect bugy endpoints that returns several studies instead of one. + if "expect-single-result" in detail_call_group and detail_call_group["expect-single-result"] and 'data' in details and len(details['data'])!=1: + logger.debug(f"More than one results for {detail_call}") + raise EndPointError(f"More than one results for {detail_call}") + if 'data' in details and len(details['data']) == 1: + details = details['data'][0] + # ----------------------------------------------------------------- + return entity_name, [details] diff --git a/etl/transform/elasticsearch.py b/etl/transform/elasticsearch.py index ac39f08..e36ca62 100755 --- a/etl/transform/elasticsearch.py +++ b/etl/transform/elasticsearch.py @@ -6,7 +6,7 @@ from xml.sax import saxutils as su import jsonschema -from jsonschema import SchemaError +from jsonschema import SchemaError, ValidationError from etl.common.brapi import get_entity_links from etl.common.store import JSONSplitStore, list_entity_files @@ -123,9 +123,10 @@ def validate_documents(document_tuples, validation_schemas, logger): schema = validation_schemas.get(document_type) try: schema and jsonschema.validate(document, schema) - except SchemaError as e: + except 
(SchemaError, ValidationError) as e: raise Exception( - f"Could not validate document of type {document_type} using the provided json schema." + f"Could not validate document {document} \n" + f"of type {document_type} using the provided json schema:\n {schema}" ) from e yield document_type, document logger.debug(f"Validated {document_count} documents.") diff --git a/etl/transform/uri.py b/etl/transform/uri.py index 5caddb4..6355f98 100644 --- a/etl/transform/uri.py +++ b/etl/transform/uri.py @@ -190,7 +190,9 @@ def step1(source: dict, entities: dict, json_dir: str, index_dir: str) -> dict: First MAJOR step: Load JSON data, Add URI, Index on disk for quick access """ # Process 1: Read JSON for each source entity - entity_line_queue = Queue(50000) + # See https://github.com/uqfoundation/multiprocess/issues/66 + # entity_line_queue = Queue(50000) + entity_line_queue = Queue(32767) Process(target=read_json_lines, args=(json_dir, entity_line_queue)).start() # Process 2 (with pool): Parse & add URI @@ -341,7 +343,9 @@ def step2(source, entities, ignore_links, json_dir: str, index_dir: str, id_indi """ Second MAJOR step: Replace DbId links with encoded URI, Index by URI on disk for quick access """ - entity_line_queue = Queue(50000) + # See https://github.com/uqfoundation/multiprocess/issues/66 + # entity_line_queue = Queue(50000) + entity_line_queue = Queue(32767) Process(target=read_json_lines, args=(json_dir, entity_line_queue)).start() # Transform URI links in process pool From 86e616fd1bf27a564966a5ddb7a6305836beacc2 Mon Sep 17 00:00:00 2001 From: Cyril Pommier Date: Tue, 16 Mar 2021 09:30:30 +0100 Subject: [PATCH 3/4] Updating FEM endpoint URL --- sources/FEM.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sources/FEM.json b/sources/FEM.json index 92d1897..c967dab 100644 --- a/sources/FEM.json +++ b/sources/FEM.json @@ -7,7 +7,7 @@ "@id": "https://www.fmach.it", "schema:identifier": "FEM", "schema:name": "FEM PhenoDB", - "brapi:endpointUrl": 
"http://51.145.230.169:8081/brapi/v1/", + "brapi:endpointUrl": "http://brapi.fmach.it:8081/brapi/v1/", "implemented-calls": [ "GET germplasm", From 2c9c0ef1d6674a679008d5151a59cdb7e90160f2 Mon Sep 17 00:00:00 2001 From: Cyril Pommier Date: Tue, 16 Mar 2021 09:42:10 +0100 Subject: [PATCH 4/4] If no @type, then defaulting to Phenotyping study. This is likely a temp workaround --- .../documents/datadiscovery_study.json | 1 + .../documents/study.json | 41 ++++++++++++++++++- 2 files changed, 41 insertions(+), 1 deletion(-) diff --git a/config/transform-elasticsearch/documents/datadiscovery_study.json b/config/transform-elasticsearch/documents/datadiscovery_study.json index 97080a9..1ea9dc7 100644 --- a/config/transform-elasticsearch/documents/datadiscovery_study.json +++ b/config/transform-elasticsearch/documents/datadiscovery_study.json @@ -21,6 +21,7 @@ { "{replace}":{ "possible_terms": [ + "", null, "Phenotypes", "Phenotyping", "Field Experiement", "Greenhouse (29\u00baC/20\u00baC)", "Green house", "Growth chamber", "Phenotyping Study", "Provenance trial", diff --git a/config/transform-elasticsearch/documents/study.json b/config/transform-elasticsearch/documents/study.json index 01436cc..5b253a9 100644 --- a/config/transform-elasticsearch/documents/study.json +++ b/config/transform-elasticsearch/documents/study.json @@ -2,6 +2,45 @@ "document-type": "study", "source-entity": "study", "document-transform": { + "@type": { + "{list}": [ + { + "{or}": [ + { + "{replace}":{ + "possible_terms": [ + "Genotyping", "Genotyping Study", + "allele size", "genotype" + ] + }, + "{with}": { + "replaced_by": "Genotyping Study" + } + }, + { + "{replace}":{ + "possible_terms": [ + "", null, + "Phenotypes", "Phenotyping", "Field Experiement", + "Greenhouse (29\u00baC/20\u00baC)", "Green house", + "Growth chamber", "Phenotyping Study", "Provenance trial", + "Half sibling progeny trial", "Clonal trial", "Progeny trial", + "Other", "Provenance and half sibling progeny trial", + "Species 
comparison", "Seed orchard", "Demonstration test", + "Full sibling progeny trial", "Juveniles comparison", + "Clonal archiva, clone bank", "Conservation plot", + "Biomasse test - sylvabiom", "Response study", "raw" + ] + }, + "{with}": { + "replaced_by": "Phenotyping Study" + } + }, + "Study" + ] + } + ] + }, "schema:url": "{.documentationURL}", "schema:includedInDataCatalog": "{.source}", "studyName": { @@ -34,4 +73,4 @@ } } } -} \ No newline at end of file +}