From 42d1a4302981ec9cba6ee25ee28d12ce1b3d6288 Mon Sep 17 00:00:00 2001
From: ecwood
Date: Thu, 11 Jul 2024 15:21:20 -0700
Subject: [PATCH 001/125] #390 first attempt at reducing dependencies

---
 setup/requirements-kg2-build.txt | 35 +-------------------------------
 1 file changed, 1 insertion(+), 34 deletions(-)

diff --git a/setup/requirements-kg2-build.txt b/setup/requirements-kg2-build.txt
index e4ae6bec..b3d36c41 100644
--- a/setup/requirements-kg2-build.txt
+++ b/setup/requirements-kg2-build.txt
@@ -1,53 +1,20 @@
 argh==0.26.2
-attrs==19.2.0
-bidict==0.21.0
 bmt==0.7.6
-CacheControl==0.12.6
-cachetools==5.1.0
-cachier==1.2.5
-certifi==2023.7.22
-chardet==3.0.4
-Click==8.0.0
 conf==0.4.
-Cython==0.29.26
-dataclasses==0.6
-decorator==4.4.0
-diskcache==4.0.0
-docker==4.4.2
 graphviz==0.20.1
 HTMLParser==0.0.2
-idna==2.8
 isodate==0.6.0
-jsobject==0.10.2
 jsonlines==3.0.0
 jsonpickle==1.0.0
 kgx==1.5.6
-lockfile==0.12.2
-marshmallow==3.0.0b11
-mysqlclient==1.4.4
-neo4j==4.3
-networkx==2.5
-numpy==1.21.6
 ontobio==2.8.0
-pandas==1.0.3
-pathtools==0.1.2
-portalocker==1.4.0
 prefixcommons==0.1.9
 pymongo==3.8.0
 PyMySQL==0.9.3
-pyparsing==2.4.7
-pysolr==3.8.1
 python-dateutil==2.8.1
-pytz==2022.2
 PyYAML==5.4
-rdflib==6.0.0
 requests==2.31.0
-scipy==1.7.3
-six==1.12.0
 snakemake==5.5.4
-SPARQLWrapper==1.8.5
 urllib3>=1.25.9
-watchdog==0.9.0
 xmltodict==0.12.0
-yamldown==0.1.8
-validators==0.15.0
+validators==0.15.0
\ No newline at end of file

From 20eeff6de3a255bbeb90e7593a01b00c4861fe0c Mon Sep 17 00:00:00 2001
From: ecwood
Date: Thu, 11 Jul 2024 15:41:11 -0700
Subject: [PATCH 002/125] #390 hopefully these can go as well

---
 setup/requirements-kg2-build.txt | 2 --
 1 file changed, 2 deletions(-)

diff --git a/setup/requirements-kg2-build.txt b/setup/requirements-kg2-build.txt
index b3d36c41..c40910aa 100644
--- a/setup/requirements-kg2-build.txt
+++ b/setup/requirements-kg2-build.txt
@@ -1,12 +1,10 @@
 argh==0.26.2
 bmt==0.7.6
-conf==0.4.
 graphviz==0.20.1
 HTMLParser==0.0.2
 isodate==0.6.0
 jsonlines==3.0.0
 jsonpickle==1.0.0
-kgx==1.5.6
 ontobio==2.8.0
 prefixcommons==0.1.9
 pymongo==3.8.0
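The two dependency-pruning commits above were evidently done by hand. A rough, illustrative way to double-check such pruning is to compare the requirement pins against the modules the codebase actually imports; the helper below is a sketch for that purpose only (not part of this repo), and since a distribution name (PyYAML) can differ from its import name (yaml), anything it flags still needs manual review before removal.

import ast
import pathlib
import sys

# Rough helper (illustrative, not part of this repo): list requirement pins
# whose names never appear as top-level imports under a source tree. Because
# a distribution name (PyYAML) can differ from its import name (yaml), the
# output is a review list, not a delete list.
def imported_names(root):
    names = set()
    for path in pathlib.Path(root).rglob('*.py'):
        try:
            tree = ast.parse(path.read_text(), filename=str(path))
        except SyntaxError:
            continue
        for node in ast.walk(tree):
            if isinstance(node, ast.Import):
                names.update(alias.name.split('.')[0] for alias in node.names)
            elif isinstance(node, ast.ImportFrom) and node.module:
                names.add(node.module.split('.')[0])
    return names

if __name__ == '__main__':
    requirements_file, source_root = sys.argv[1], sys.argv[2]
    used = {name.lower() for name in imported_names(source_root)}
    for line in open(requirements_file):
        pin = line.split('==')[0].split('>=')[0].strip()
        if pin and pin.lower() not in used:
            print('candidate for removal:', pin)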
From 281bf43aa474dc96eace072120636f130604dc83 Mon Sep 17 00:00:00 2001
From: ecwood
Date: Tue, 16 Jul 2024 03:32:53 -0700
Subject: [PATCH 003/125] #398 first pass at clinical trials kg

---
 convert/clinicaltrialskg_tsv_to_kg_jsonl.py   | 116 ++++++++++++++++++
 extract/extract-clinicaltrialskg.sh           |  29 +++++
 kg2_util.py                                   |   3 +-
 ...g2-provided-by-curie-to-infores-curie.yaml |   8 ++
 4 files changed, 155 insertions(+), 1 deletion(-)
 create mode 100644 convert/clinicaltrialskg_tsv_to_kg_jsonl.py
 create mode 100644 extract/extract-clinicaltrialskg.sh

diff --git a/convert/clinicaltrialskg_tsv_to_kg_jsonl.py b/convert/clinicaltrialskg_tsv_to_kg_jsonl.py
new file mode 100644
index 00000000..b1d425d1
--- /dev/null
+++ b/convert/clinicaltrialskg_tsv_to_kg_jsonl.py
@@ -0,0 +1,116 @@
+#!/usr/bin/env python3
+'''clinicaltrialskg_tsv_to_kg_jsonl.py: Extracts a KG2 JSON file from the ClinicalTrials Knowledge Graph in TSV format
+
+   Usage: clinicaltrialskg_tsv_to_kg_jsonl.py [--test] <inputFile> <outputNodesFile> <outputEdgesFile>
+'''
+
+import argparse
+import kg2_util
+import csv
+import datetime
+
+__author__ = 'Erica Wood'
+__copyright__ = 'Oregon State University'
+__credits__ = ['Stephen Ramsey', 'Erica Wood']
+__license__ = 'MIT'
+__version__ = '0.1.0'
+__maintainer__ = ''
+__email__ = ''
+__status__ = 'Prototype'
+
+
+CLINICALTRIALSKG_BASE_IRI = kg2_util.BASE_URL_CLINICALTRIALSKG
+CLINICALTRIALSKG_CURIE = kg2_util.CURIE_ID_CLINICALTRIALSKG
+
+TEST_MODE_LIMIT = 10000
+
+
+def get_args():
+    description = 'clinicaltrialskg_tsv_to_kg_jsonl.py: builds a KG2 JSON file from the \
+                   ClinicalTrials Knowledge Graph TSV file'
+    arg_parser = argparse.ArgumentParser(description=description)
+    arg_parser.add_argument('--test',
+                            dest='test',
+                            action="store_true",
+                            default=False)
+    arg_parser.add_argument('inputFile', type=str)
+    arg_parser.add_argument('outputNodesFile', type=str)
+    arg_parser.add_argument('outputEdgesFile', type=str)
+    return arg_parser.parse_args()
+
+
+def date():
+    return datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+
+
+def format_id(id: str, prefix: str):
+    return prefix + ':' + id.strip()
+
+
+def make_edges(input_file: str, edges_output, test_mode: bool):
+    count = 0
+    with open(input_file, 'r') as input_tsv:
+        tsvreader = csv.reader(input_tsv, delimiter='\t')
+        for line in tsvreader:
+            count += 1
+            if count == 1:
+                continue
+            if test_mode and count >= TEST_MODE_LIMIT:
+                break
+            [clinicaltrialskg_edge_id,
+             subject_id,
+             predicate,
+             object_id,
+             subject_name,
+             object_name,
+             category,
+             knowledge_level,
+             agent_type,
+             nctid,
+             phase,
+             primary_purpose,
+             intervention_model,
+             time_perspective,
+             overall_status,
+             start_date,
+             enrollment,
+             enrollment_type,
+             age_range,
+             child,
+             adult,
+             older_adult
+             unii] = line
+
+            edge = kg2_util.make_edge_biolink(subject_id,
+                                              object_id,
+                                              predicate,
+                                              CLINICALTRIALSKG_CURIE,
+                                              start_date)
+            edges_output.write(edge)
+
+
+if __name__ == '__main__':
+    print("Start time: ", date())
+    args = get_args()
+    input_file_name = args.inputFile
+    output_nodes_file_name = args.outputNodesFile
+    output_edges_file_name = args.outputEdgesFile
+    test_mode = args.test
+
+    nodes_info, edges_info = kg2_util.create_kg2_jsonlines(test_mode)
+    nodes_output = nodes_info[0]
+    edges_output = edges_info[0]
+
+    make_edges(input_file_name, edges_output, test_mode)
+
+    kp_node = kg2_util.make_node(CLINICALTRIALSKG_CURIE,
+                                 CLINICALTRIALSKG_BASE_IRI,
+                                 "Clinical Trials Knowledge Graph",
+                                 kg2_util.SOURCE_NODE_CATEGORY,
+                                 None,
+                                 CLINICALTRIALSKG_CURIE)
+    nodes_output.write(kp_node)
+
+    kg2_util.close_kg2_jsonlines(nodes_info, edges_info, output_nodes_file_name, output_edges_file_name)
+
+    print("Finish time: ", date())
diff --git a/extract/extract-clinicaltrialskg.sh b/extract/extract-clinicaltrialskg.sh
new file mode 100644
index 00000000..20f97d26
--- /dev/null
+++ b/extract/extract-clinicaltrialskg.sh
@@ -0,0 +1,29 @@
+#!/usr/bin/env bash
+# extract-clinicaltrialskg.sh: Download the ClinicalTrials Knowledge Graph
+# Copyright 2024 Stephen A. Ramsey
+# Author Erica Wood
+
+set -o nounset -o pipefail -o errexit
+
+if [[ "${1:-}" == "--help" || "${1:-}" == "-h" ]]; then
+    echo Usage: "$0 <output_file>"
+    exit 2
+fi
+
+# Usage: extract-clinicaltrialskg.sh <output_file>
+
+echo "================= starting extract-clinicaltrialskg.sh =================="
+date
+
+config_dir=`dirname "$0"`
+source ${config_dir}/master-config.shinc
+
+clinicaltrialskgoutput_file=${1:-"${BUILD_DIR}/clinicaltrialskg-edges.tsv"}
+version="2.2.6"
+
+clinicaltrialskg_download_link="https://db.systemsbiology.net/gestalt/KG/clinical_trials_kg_edges_v${version}.tsv"
+
+${curl_get} ${clinicaltrialskg_download_link} > ${clinicaltrialskg_output_file}
+
+date
+echo "================= finishing extract-clinicaltrialskg.sh =================="
\ No newline at end of file
diff --git a/kg2_util.py b/kg2_util.py
index 6e19e7e2..a686e037 100644
--- a/kg2_util.py
+++ b/kg2_util.py
@@ -143,7 +143,7 @@
 BASE_URL_CHEMBL_COMPOUND = BASE_BASE_URL_IDENTIFIERS_ORG + 'chembl.compound:'
 BASE_URL_CHEMBL_TARGET = BASE_BASE_URL_IDENTIFIERS_ORG + 'chembl.target:'
 BASE_URL_CHEMBL_MECHANISM = 'https://www.ebi.ac.uk/chembl/mechanism/inspect/'
-BASE_URL_CLINICALTRIALS = BASE_BASE_URL_IDENTIFIERS_ORG + 'clinicaltrials:'
+BASE_URL_CLINICALTRIALS_KG = 'https://github.com/NCATSTranslator/Translator-All/wiki/Clinical-Trials-KP/'
 BASE_URL_DGIDB = 'https://www.dgidb.org/interaction_types'
 BASE_URL_DISGENET = 'http://www.disgenet.org'
 BASE_URL_DRUGBANK = BASE_BASE_URL_IDENTIFIERS_ORG + 'drugbank:'
@@ -216,6 +216,7 @@
 # Since this has changed 2(?) times now, this will make it easier going forward if things change again
 SOURCE_NODE_CATEGORY = BIOLINK_CATEGORY_RETRIEVAL_SOURCE
 
+CURIE_ID_CLINICALTRIALSKG = 'ClinicalTrialsKG:'
 CURIE_ID_DCTERMS_ISSUED = CURIE_PREFIX_DCTERMS + ':' + 'issued'
 CURIE_ID_DISGENET = 'DisGeNET:'
 CURIE_ID_DRUGCENTRAL_SOURCE = CURIE_PREFIX_DRUGCENTRAL + ':'
diff --git a/maps/kg2-provided-by-curie-to-infores-curie.yaml b/maps/kg2-provided-by-curie-to-infores-curie.yaml
index ec5fcddb..efb2d639 100644
--- a/maps/kg2-provided-by-curie-to-infores-curie.yaml
+++ b/maps/kg2-provided-by-curie-to-infores-curie.yaml
@@ -1,3 +1,11 @@
+'ClinicalTrialKG:':
+  source_name: Multiomics ClinicalTrials KP API
+  infores_curie: infores:biothings-multiomics-clinicaltrials
+  knowledge_type: knowledge_source
+'DGIdb:':
+  source_name: Drug Gene Interaction Database
+  infores_curie: infores:dgidb
+  knowledge_type: aggregator_knowledge_source
 'DGIdb:':
   source_name: Drug Gene Interaction Database
   infores_curie: infores:dgidb
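One note on the converter introduced above: unpacking all 22 columns positionally means a single missing comma in the name list silently merges two names — exactly the bug sitting between older_adult and unii, which a later commit in this series corrects. A csv.DictReader-based sketch (illustrative only; the header names are assumptions inferred from the variable names used above) fails loudly instead:

import csv

# Illustrative alternative (not part of this repo): key fields by the TSV's
# own header row instead of unpacking 22 positional names. The column names
# used here are assumptions based on the converter's variable names.
def make_edges_by_header(input_file: str, edges_output, make_edge):
    with open(input_file, 'r') as input_tsv:
        for row in csv.DictReader(input_tsv, delimiter='\t'):
            # A KeyError here pinpoints a renamed or missing column at once,
            # instead of silently shifting every downstream field by one.
            edge = make_edge(row['subject'], row['object'],
                             row['predicate'], row['start_date'])
            edges_output.write(edge)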
Knowledge Graph", + kg2_util.SOURCE_NODE_CATEGORY, + None, + CLINICALTRIALSKG_CURIE) + nodes_output.write(kp_node) + + kg2_util.close_kg2_jsonlines(nodes_info, edges_info, output_nodes_file_name, output_edges_file_name) + + print("Finish time: ", date()) diff --git a/extract/extract-clinicaltrialskg.sh b/extract/extract-clinicaltrialskg.sh new file mode 100644 index 00000000..20f97d26 --- /dev/null +++ b/extract/extract-clinicaltrialskg.sh @@ -0,0 +1,29 @@ +#!/usr/bin/env bash +# extract-clinicaltrialskg.sh: Download the ClinicalTrials Knowledge Graph +# Copyright 2024 Stephen A. Ramsey +# Author Erica Wood + +set -o nounset -o pipefail -o errexit + +if [[ "${1:-}" == "--help" || "${1:-}" == "-h" ]]; then + echo Usage: "$0 " + exit 2 +fi + +# Usage: extract-clinicaltrialskg.sh + +echo "================= starting extract-clinicaltrialskg.sh ==================" +date + +config_dir=`dirname "$0"` +source ${config_dir}/master-config.shinc + +clinicaltrialskgoutput_file=${1:-"${BUILD_DIR}/clinicaltrialskg-edges.tsv"} +version="2.2.6" + +clinicaltrialskg_download_link="https://db.systemsbiology.net/gestalt/KG/clinical_trials_kg_edges_v${version}.tsv" + +${curl_get} ${clinicaltrialskg_download_link} > ${clinicaltrialskg_output_file} + +date +echo "================= finishing extract-clinicaltrialskg.sh ==================" \ No newline at end of file diff --git a/kg2_util.py b/kg2_util.py index 6e19e7e2..a686e037 100644 --- a/kg2_util.py +++ b/kg2_util.py @@ -143,7 +143,7 @@ BASE_URL_CHEMBL_COMPOUND = BASE_BASE_URL_IDENTIFIERS_ORG + 'chembl.compound:' BASE_URL_CHEMBL_TARGET = BASE_BASE_URL_IDENTIFIERS_ORG + 'chembl.target:' BASE_URL_CHEMBL_MECHANISM = 'https://www.ebi.ac.uk/chembl/mechanism/inspect/' -BASE_URL_CLINICALTRIALS = BASE_BASE_URL_IDENTIFIERS_ORG + 'clinicaltrials:' +BASE_URL_CLINICALTRIALS_KG = 'https://github.com/NCATSTranslator/Translator-All/wiki/Clinical-Trials-KP/' BASE_URL_DGIDB = 'https://www.dgidb.org/interaction_types' BASE_URL_DISGENET = 'http://www.disgenet.org' BASE_URL_DRUGBANK = BASE_BASE_URL_IDENTIFIERS_ORG + 'drugbank:' @@ -216,6 +216,7 @@ # Since this has changed 2(?) 
From 42d987224d318a228c13a4a19cfa6e3ef1f7e6a6 Mon Sep 17 00:00:00 2001
From: ecwood
Date: Tue, 16 Jul 2024 04:11:56 -0700
Subject: [PATCH 006/125] #398 updating the Snakemake pipeline

---
 build/Snakefile-conversion      | 13 +++++++++++++
 build/Snakefile-extraction      | 13 ++++++++++++-
 build/Snakefile-post-etl        | 10 +++++++---
 build/snakemake-config-var.yaml | 11 +++++++++++
 4 files changed, 43 insertions(+), 4 deletions(-)

diff --git a/build/Snakefile-conversion b/build/Snakefile-conversion
index 0d33b54f..ae80d765 100644
--- a/build/Snakefile-conversion
+++ b/build/Snakefile-conversion
@@ -275,3 +275,16 @@ rule KEGG_Conversion:
         config['KEGG_CONVERSION_LOG']
     shell:
         config['PYTHON_COMMAND'] + " {input.code} {input.real} {output.nodes} {output.edges} " + config['TEST_ARG'] + " > {log} 2>&1"
+
+rule ClinicalTrialsKG_Conversion:
+    input:
+        code = config['CLINICALTRIALSKG_CONVERSION_SCRIPT'],
+        real = config['CLINICALTRIALSKG_INPUT_FILE'],
+        validation = config['VALIDATION_PLACEHOLDER']
+    output:
+        nodes = config['CLINICALTRIALSKG_OUTPUT_NODES_FILE'],
+        edges = config['CLINICALTRIALSKG_OUTPUT_EDGES_FILE']
+    log:
+        config['CLINICALTRIALSKG_CONVERSION_LOG']
+    shell:
+        config['PYTHON_COMMAND'] + " {input.code} {input.real} {output.nodes} {output.edges} " + config['TEST_ARG'] + " > {log} 2>&1"
diff --git a/build/Snakefile-extraction b/build/Snakefile-extraction
index 14cf0eb8..ac5e19b1 100644
--- a/build/Snakefile-extraction
+++ b/build/Snakefile-extraction
@@ -218,4 +218,15 @@ rule KEGG:
     log:
         config['KEGG_EXTRACTION_LOG']
     shell:
-        "bash -x {input.code} {output} > {log} 2>&1"
\ No newline at end of file
+        "bash -x {input.code} {output} > {log} 2>&1"
+
+rule ClinicalTrialsKG:
+    input:
+        code = config['CLINICALTRIALSKG_EXTRACTION_SCRIPT'],
+        validation = config['VALIDATION_PLACEHOLDER']
+    output:
+        config['CLINICALTRIALSKG_INPUT_FILE']
+    log:
+        config['CLINICALTRIALSKG_EXTRACTION_LOG']
+    shell:
+        "bash -x {input.code} {output} > {log} 2>&1"
diff --git a/build/Snakefile-post-etl b/build/Snakefile-post-etl
index 05fd073b..47b109e8 100644
--- a/build/Snakefile-post-etl
+++ b/build/Snakefile-post-etl
@@ -42,7 +42,9 @@ rule Merge:
        disgenet_nodes = config['DISGENET_OUTPUT_NODES_FILE'],
        disgenet_edges = config['DISGENET_OUTPUT_EDGES_FILE'],
        kegg_nodes = config['KEGG_OUTPUT_NODES_FILE'],
-       kegg_edges = config['KEGG_OUTPUT_EDGES_FILE']
+       kegg_edges = config['KEGG_OUTPUT_EDGES_FILE'],
+       clinicaltrialskg_nodes = config['CLINICALTRIALSKG_OUTPUT_NODES_FILE'],
+       clinicaltrialskg_edges = config['CLINICALTRIALSKG_OUTPUT_EDGES_FILE']
    output:
        nodes = config['MERGED_OUTPUT_NODES_FILE'],
        edges = config['MERGED_OUTPUT_EDGES_FILE'],
@@ -54,7 +56,7 @@ rule Merge:
        " --kgFileOrphanEdges {output.orph}" + \
        " --outputNodesFile {output.nodes} " + \
        " --outputEdgesFile {output.edges} " + \
-       " --kgNodesFiles " + \
+       " --kgNodesFiles
        "{input.umls_nodes} " + \
        "{input.ont_nodes} " + \
        "{input.semmeddb_nodes} " + \
@@ -76,6 +78,7 @@ rule Merge:
        "{input.intact_nodes} " + \
        "{input.disgenet_nodes} " + \
        "{input.kegg_nodes} " + \
+       "{input.clinicaltrialskg_nodes} " + \
        " --kgEdgesFiles " + \
        "{input.umls_edges} " + \
        "{input.ont_edges} " + \
@@ -97,7 +100,8 @@ rule Merge:
        "{input.drugcentral_edges} " + \
        "{input.intact_edges} " + \
        "{input.disgenet_edges} " + \
-       "{input.kegg_edges} > {log} 2>&1"
+       "{input.kegg_edges} " + \
+       "{input.clinicaltrialskg_edges > {log} 2>&1"
diff --git a/build/snakemake-config-var.yaml b/build/snakemake-config-var.yaml
index eb32aa6a..66e5fea8 100644
--- a/build/snakemake-config-var.yaml
+++ b/build/snakemake-config-var.yaml
@@ -240,6 +240,17 @@ kegg_conversion_log: ${BUILD_DIR}/${kegg_conversion_base}${test_suffix}.log
 kegg_output_nodes_file: ${BUILD_DIR}/${kegg_output_base}${nodes_suffix}${test_suffix}.jsonl
 kegg_output_edges_file: ${BUILD_DIR}/${kegg_output_base}${edges_suffix}${test_suffix}.jsonl
 
+clinicaltrialskg_extraction_base: extract-clinicaltrialskg
+clinicaltrialskg_conversion_base: clinicaltrialskg_tsv_to_kg_jsonl
+clinicaltrialskg_output_base: kg2-clinicaltrialskg
+clinicaltrialskg_extraction_script: ${EXTRACT_CODE_DIR}/${clinicaltrialskg_extraction_base}.sh
+clinicaltrialskg_extraction_log: ${BUILD_DIR}/${clinicaltrialskg_extraction_base}${test_suffix}.log
+clinicaltrialskg_input_file: ${BUILD_DIR}/clinicaltrialskg-edges.tsv
+clinicaltrialskg_conversion_script: ${CONVERT_CODE_DIR}/${clinicaltrialskg_conversion_base}.py
+clinicaltrialskg_conversion_log: ${BUILD_DIR}/${clinicaltrialskg_conversion_base}${test_suffix}.log
+clinicaltrialskg_output_nodes_file: ${BUILD_DIR}/${clinicaltrialskg_conversion_base}${nodes_suffix}${test_suffix}.jsonl
+clinicaltrialskg_output_edges_file: ${BUILD_DIR}/${clinicaltrialskg_conversion_base}${edges_suffix}${test_suffix}.jsonl
+
 merge_base: merge_graphs
 merge_script: ${PROCESS_CODE_DIR}/${merge_base}.py
 merged_output_base: kg2-merged
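A pattern worth noting in the Merge rule above: the shell command hand-concatenates one fragment per source for both the nodes and edges lists, so wiring in a new source touches four separate places, and the next two commits in this series fix exactly the continuation and brace typos that invites. A sketch of deriving the fragments from a single source list instead (illustrative only, not the repo's actual Snakefile code):

# Illustrative sketch (not the repo's Snakefile): build the merge command's
# file arguments from one list of source names, so the nodes and edges lists
# stay in lockstep and no "{input...}" fragment is hand-typed.
SOURCES = ['umls', 'ont', 'semmeddb', 'uniprotkb', 'ensembl', 'unichem',
           'chembl', 'ncbigene', 'dgidb', 'repodb', 'drugbank', 'smpdb',
           'hmdb', 'go_annotations', 'reactome', 'mirbase', 'jensenlab',
           'drugcentral', 'intact', 'disgenet', 'kegg', 'clinicaltrialskg']

def merge_file_args() -> str:
    nodes = ' '.join('{input.%s_nodes}' % s for s in SOURCES)
    edges = ' '.join('{input.%s_edges}' % s for s in SOURCES)
    return ' --kgNodesFiles %s --kgEdgesFiles %s' % (nodes, edges)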
From 134f5ec545e66cdb4181d09113feb46d81b952f1 Mon Sep 17 00:00:00 2001
From: ecwood
Date: Tue, 16 Jul 2024 04:17:31 -0700
Subject: [PATCH 007/125] #398 correcting a typo

---
 build/Snakefile-post-etl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/build/Snakefile-post-etl b/build/Snakefile-post-etl
index 47b109e8..c0f73cee 100644
--- a/build/Snakefile-post-etl
+++ b/build/Snakefile-post-etl
@@ -56,7 +56,7 @@ rule Merge:
        " --kgFileOrphanEdges {output.orph}" + \
        " --outputNodesFile {output.nodes} " + \
        " --outputEdgesFile {output.edges} " + \
-       " --kgNodesFiles
+       " --kgNodesFiles " + \
        "{input.umls_nodes} " + \
        "{input.ont_nodes} " + \
        "{input.semmeddb_nodes} " + \

From a61d098785648977da07f41c55f51c9e905b6d66 Mon Sep 17 00:00:00 2001
From: ecwood
Date: Tue, 16 Jul 2024 04:21:25 -0700
Subject: [PATCH 008/125] #398 correcting another typo

---
 build/Snakefile-post-etl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/build/Snakefile-post-etl b/build/Snakefile-post-etl
index c0f73cee..e6de67ee 100644
--- a/build/Snakefile-post-etl
+++ b/build/Snakefile-post-etl
@@ -101,7 +101,7 @@ rule Merge:
        "{input.intact_edges} " + \
        "{input.disgenet_edges} " + \
        "{input.kegg_edges} " + \
-       "{input.clinicaltrialskg_edges > {log} 2>&1"
+       "{input.clinicaltrialskg_edges} > {log} 2>&1"

From 93168e76b692224469128798e8a5ca9c95e096b6 Mon Sep 17 00:00:00 2001
From: ecwood
Date: Tue, 16 Jul 2024 04:28:44 -0700
Subject: [PATCH 009/125] #398 correcting typo in extract

---
 extract/extract-clinicaltrialskg.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/extract/extract-clinicaltrialskg.sh b/extract/extract-clinicaltrialskg.sh
index 20f97d26..639e1eb2 100644
--- a/extract/extract-clinicaltrialskg.sh
+++ b/extract/extract-clinicaltrialskg.sh
@@ -18,7 +18,7 @@ date
 config_dir=`dirname "$0"`
 source ${config_dir}/master-config.shinc
 
-clinicaltrialskgoutput_file=${1:-"${BUILD_DIR}/clinicaltrialskg-edges.tsv"}
+clinicaltrialskg_output_file=${1:-"${BUILD_DIR}/clinicaltrialskg-edges.tsv"}
 version="2.2.6"
 
 clinicaltrialskg_download_link="https://db.systemsbiology.net/gestalt/KG/clinical_trials_kg_edges_v${version}.tsv"

From dd73e65b81b4dadbf748281f59d18f5cf7f6d151 Mon Sep 17 00:00:00 2001
From: ecwood
Date: Tue, 16 Jul 2024 04:30:16 -0700
Subject: [PATCH 010/125] #398 correcting typo in conversion

---
 convert/clinicaltrialskg_tsv_to_kg_jsonl.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/convert/clinicaltrialskg_tsv_to_kg_jsonl.py b/convert/clinicaltrialskg_tsv_to_kg_jsonl.py
index b1d425d1..0848205f 100644
--- a/convert/clinicaltrialskg_tsv_to_kg_jsonl.py
+++ b/convert/clinicaltrialskg_tsv_to_kg_jsonl.py
@@ -78,7 +78,7 @@ def make_edges(input_file: str, edges_output, test_mode: bool):
              age_range,
              child,
              adult,
-             older_adult
+             older_adult,
              unii] = line

From 0c16ea319cd21ef2b72a0628634d202893a1525b Mon Sep 17 00:00:00 2001
From: ecwood
Date: Tue, 16 Jul 2024 04:31:20 -0700
Subject: [PATCH 011/125] #398 correcting to standardize in kg2_util

---
 kg2_util.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kg2_util.py b/kg2_util.py
index a686e037..e61c6cba 100644
--- a/kg2_util.py
+++ b/kg2_util.py
@@ -143,7 +143,7 @@
 BASE_URL_CHEMBL_COMPOUND = BASE_BASE_URL_IDENTIFIERS_ORG + 'chembl.compound:'
 BASE_URL_CHEMBL_TARGET = BASE_BASE_URL_IDENTIFIERS_ORG + 'chembl.target:'
 BASE_URL_CHEMBL_MECHANISM = 'https://www.ebi.ac.uk/chembl/mechanism/inspect/'
-BASE_URL_CLINICALTRIALS_KG = 'https://github.com/NCATSTranslator/Translator-All/wiki/Clinical-Trials-KP/'
+BASE_URL_CLINICALTRIALSKG = 'https://github.com/NCATSTranslator/Translator-All/wiki/Clinical-Trials-KP/'
 BASE_URL_DGIDB = 'https://www.dgidb.org/interaction_types'
 BASE_URL_DISGENET = 'http://www.disgenet.org'
 BASE_URL_DRUGBANK = BASE_BASE_URL_IDENTIFIERS_ORG + 'drugbank:'
From 4a13131b7c77b25f9ddbb326e7253a2f82c19e44 Mon Sep 17 00:00:00 2001
From: ecwood
Date: Tue, 16 Jul 2024 04:38:48 -0700
Subject: [PATCH 012/125] #398 versioning attempt

---
 convert/clinicaltrialskg_tsv_to_kg_jsonl.py | 10 ++++++++--
 extract/extract-clinicaltrialskg.sh         |  3 ++-
 2 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/convert/clinicaltrialskg_tsv_to_kg_jsonl.py b/convert/clinicaltrialskg_tsv_to_kg_jsonl.py
index 0848205f..a421d505 100644
--- a/convert/clinicaltrialskg_tsv_to_kg_jsonl.py
+++ b/convert/clinicaltrialskg_tsv_to_kg_jsonl.py
@@ -49,11 +49,15 @@ def format_id(id: str, prefix: str):
 
 def make_edges(input_file: str, edges_output, test_mode: bool):
     count = 0
+    version = "v"
     with open(input_file, 'r') as input_tsv:
         tsvreader = csv.reader(input_tsv, delimiter='\t')
         for line in tsvreader:
             count += 1
             if count == 1:
+                version += str(line)
+                continue
+            if count == 2:
                 continue
             if test_mode and count >= TEST_MODE_LIMIT:
                 break
@@ -88,6 +92,8 @@ def make_edges(input_file: str, edges_output, test_mode: bool):
                                               start_date)
             edges_output.write(edge)
 
+    return version
+
 
 if __name__ == '__main__':
     print("Start time: ", date())
@@ -101,11 +107,11 @@
     nodes_output = nodes_info[0]
     edges_output = edges_info[0]
 
-    make_edges(input_file_name, edges_output, test_mode)
+    version = make_edges(input_file_name, edges_output, test_mode)
 
     kp_node = kg2_util.make_node(CLINICALTRIALSKG_CURIE,
                                  CLINICALTRIALSKG_BASE_IRI,
-                                 "Clinical Trials Knowledge Graph",
+                                 "Clinical Trials Knowledge Graph " + version,
                                  kg2_util.SOURCE_NODE_CATEGORY,
                                  None,
                                  CLINICALTRIALSKG_CURIE)
diff --git a/extract/extract-clinicaltrialskg.sh b/extract/extract-clinicaltrialskg.sh
index 639e1eb2..95278512 100644
--- a/extract/extract-clinicaltrialskg.sh
+++ b/extract/extract-clinicaltrialskg.sh
@@ -23,7 +23,8 @@ version="2.2.6"
 
 clinicaltrialskg_download_link="https://db.systemsbiology.net/gestalt/KG/clinical_trials_kg_edges_v${version}.tsv"
 
-${curl_get} ${clinicaltrialskg_download_link} > ${clinicaltrialskg_output_file}
+echo "# ${version}" > ${clinicaltrialskg_output_file}
+${curl_get} ${clinicaltrialskg_download_link} >> ${clinicaltrialskg_output_file}
 
 date
 echo "================= finishing extract-clinicaltrialskg.sh =================="
\ No newline at end of file

From 974ed5c9ed6b58a606ac954148fb9b6adf2b786b Mon Sep 17 00:00:00 2001
From: ecwood
Date: Tue, 16 Jul 2024 04:40:18 -0700
Subject: [PATCH 013/125] #398 clean up the versioning

---
 convert/clinicaltrialskg_tsv_to_kg_jsonl.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/convert/clinicaltrialskg_tsv_to_kg_jsonl.py b/convert/clinicaltrialskg_tsv_to_kg_jsonl.py
index a421d505..1a4abff0 100644
--- a/convert/clinicaltrialskg_tsv_to_kg_jsonl.py
+++ b/convert/clinicaltrialskg_tsv_to_kg_jsonl.py
@@ -55,7 +55,7 @@ def make_edges(input_file: str, edges_output, test_mode: bool):
         for line in tsvreader:
             count += 1
             if count == 1:
-                version += str(line)
+                version += line[0].strip('#').strip(' ')
                 continue
             if count == 2:
                 continue
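The two commits above settle on a small convention: the extract script prepends a '# <version>' line to the downloaded TSV, and the converter consumes row 1 as the version string and row 2 as the real header. A self-contained round-trip sketch of that convention (the sample file contents are invented for illustration):

import csv
import io

# Round-trip sketch of the version-header convention from the two commits
# above: line 1 carries "# <version>", line 2 is the TSV header, data follows.
# The sample rows here are invented for illustration.
sample = "# 2.2.6\nsubject\tpredicate\tobject\nMONDO:0005148\tbiolink:treats\tCHEBI:6801\n"

version = "v"
for count, line in enumerate(csv.reader(io.StringIO(sample), delimiter='\t'), 1):
    if count == 1:
        version += line[0].strip('#').strip(' ')   # -> "v2.2.6"
        continue
    if count == 2:
        continue                                   # skip the real TSV header
    print(version, line)                           # -> v2.2.6 ['MONDO:0005148', ...]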
From 9d65fd9af3cb91407f46a30e42e20950bfd3382d Mon Sep 17 00:00:00 2001
From: ecwood
Date: Tue, 16 Jul 2024 04:42:42 -0700
Subject: [PATCH 014/125] #398 add an entry into KL/AT map

---
 maps/knowledge-level-agent-type-map.yaml | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/maps/knowledge-level-agent-type-map.yaml b/maps/knowledge-level-agent-type-map.yaml
index 5d860e96..db99215a 100644
--- a/maps/knowledge-level-agent-type-map.yaml
+++ b/maps/knowledge-level-agent-type-map.yaml
@@ -10,6 +10,10 @@ infores:biolink-ontology:
   agent_type: manual_agent
   knowledge_level: knowledge_assertion
   reference: https://github.com/biolink/biolink-model/blob/master/biolink-model.yaml
+infores:biothings-multiomics-clinicaltrials:
+  agent_type: manual_agent
+  knowledge_level: knowledge_assertion
+  reference: https://github.com/biolink/information-resource-registry/blob/d84a524bfaf749d92a42c867b1b6798a88e905c8/infores_catalog.yaml#L650-L651
 infores:bspo:
   agent_type: manual_agent
   knowledge_level: knowledge_assertion

From bdf1ad18e066c0c4af86fecbcb541ff7b8d07413 Mon Sep 17 00:00:00 2001
From: ecwood
Date: Tue, 16 Jul 2024 04:57:05 -0700
Subject: [PATCH 015/125] #398 reworking update date

---
 convert/clinicaltrialskg_tsv_to_kg_jsonl.py | 32 ++++++++++++++++++++-
 1 file changed, 31 insertions(+), 1 deletion(-)

diff --git a/convert/clinicaltrialskg_tsv_to_kg_jsonl.py b/convert/clinicaltrialskg_tsv_to_kg_jsonl.py
index 1a4abff0..60a4e9fb 100644
--- a/convert/clinicaltrialskg_tsv_to_kg_jsonl.py
+++ b/convert/clinicaltrialskg_tsv_to_kg_jsonl.py
@@ -47,6 +47,36 @@ def format_id(id: str, prefix: str):
     return prefix + ':' + id.strip()
 
 
+def format_date(date_field):
+    dates = date_field.split(',')
+
+    # Arbitrarily far back date to improve on
+    latest_date = datetime.date(1700, 1, 1)
+
+    if len(dates) > 1:
+        split_date = date.split('-')
+        year = split_date[0]
+        month = split_date[1]
+        day = 1 # most of the time, there's no day
+        if len(split_date) > 2:
+            day = split_date[2]
+        curr_date = datetime.date(year, month, day)
+
+        if curr_date > latest_date:
+            latest_date = curr_date
+    else:
+        split_date = date.split('-')
+        year = split_date[0]
+        month = split_date[1]
+        day = 1 # most of the time, there's no day
+        if len(split_date) > 2:
+            day = split_date[2]
+        latest_date = datetime.date(year, month, day)
+
+    return latest_date
+
+
+
 def make_edges(input_file: str, edges_output, test_mode: bool):
     count = 0
     version = "v"
@@ -89,7 +119,7 @@ def make_edges(input_file: str, edges_output, test_mode: bool):
                                               object_id,
                                               predicate,
                                               CLINICALTRIALSKG_CURIE,
-                                              start_date)
+                                              format_date(start_date))
             edges_output.write(edge)
 
     return version

From f0f73fd876d419292e7c6eb185b109108173ed21 Mon Sep 17 00:00:00 2001
From: ecwood
Date: Tue, 16 Jul 2024 04:58:12 -0700
Subject: [PATCH 016/125] #398 revising some access patterns

---
 convert/clinicaltrialskg_tsv_to_kg_jsonl.py | 23 +++++++++++----------
 1 file changed, 12 insertions(+), 11 deletions(-)

diff --git a/convert/clinicaltrialskg_tsv_to_kg_jsonl.py b/convert/clinicaltrialskg_tsv_to_kg_jsonl.py
index 60a4e9fb..91f70c0e 100644
--- a/convert/clinicaltrialskg_tsv_to_kg_jsonl.py
+++ b/convert/clinicaltrialskg_tsv_to_kg_jsonl.py
@@ -54,18 +54,19 @@ def format_date(date_field):
     latest_date = datetime.date(1700, 1, 1)
 
     if len(dates) > 1:
-        split_date = date.split('-')
-        year = split_date[0]
-        month = split_date[1]
-        day = 1 # most of the time, there's no day
-        if len(split_date) > 2:
-            day = split_date[2]
-        curr_date = datetime.date(year, month, day)
-
-        if curr_date > latest_date:
-            latest_date = curr_date
+        for date in dates:
+            split_date = date.split('-')
+            year = split_date[0]
+            month = split_date[1]
+            day = 1 # most of the time, there's no day
+            if len(split_date) > 2:
+                day = split_date[2]
+            curr_date = datetime.date(year, month, day)
+
+            if curr_date > latest_date:
+                latest_date = curr_date
     else:
-        split_date = date.split('-')
+        split_date = dates[0].split('-')
         year = split_date[0]
         month = split_date[1]
         day = 1 # most of the time, there's no day

From 73c085fbeb72aef9358e9c3e9f44e559f6165aae Mon Sep 17 00:00:00 2001
From: ecwood
Date: Tue, 16 Jul 2024 04:59:16 -0700
Subject: [PATCH 017/125] #398 changing data type for datetime

---
 convert/clinicaltrialskg_tsv_to_kg_jsonl.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/convert/clinicaltrialskg_tsv_to_kg_jsonl.py b/convert/clinicaltrialskg_tsv_to_kg_jsonl.py
index 91f70c0e..2b2acd85 100644
--- a/convert/clinicaltrialskg_tsv_to_kg_jsonl.py
+++ b/convert/clinicaltrialskg_tsv_to_kg_jsonl.py
@@ -56,22 +56,22 @@ def format_date(date_field):
     if len(dates) > 1:
         for date in dates:
             split_date = date.split('-')
-            year = split_date[0]
-            month = split_date[1]
+            year = int(split_date[0])
+            month = int(split_date[1])
             day = 1 # most of the time, there's no day
             if len(split_date) > 2:
-                day = split_date[2]
+                day = int(split_date[2])
             curr_date = datetime.date(year, month, day)
 
             if curr_date > latest_date:
                 latest_date = curr_date
     else:
         split_date = dates[0].split('-')
-        year = split_date[0]
-        month = split_date[1]
+        year = int(split_date[0])
+        month = int(split_date[1])
         day = 1 # most of the time, there's no day
         if len(split_date) > 2:
-            day = split_date[2]
+            day = int(split_date[2])
         latest_date = datetime.date(year, month, day)
 
     return latest_date

From 6bdee2349a7d69a5752de1c139ceec013492cb4f Mon Sep 17 00:00:00 2001
From: ecwood
Date: Tue, 16 Jul 2024 04:59:57 -0700
Subject: [PATCH 018/125] #398 have to save as a string afterwards

---
 convert/clinicaltrialskg_tsv_to_kg_jsonl.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/convert/clinicaltrialskg_tsv_to_kg_jsonl.py b/convert/clinicaltrialskg_tsv_to_kg_jsonl.py
index 2b2acd85..09934ac1 100644
--- a/convert/clinicaltrialskg_tsv_to_kg_jsonl.py
+++ b/convert/clinicaltrialskg_tsv_to_kg_jsonl.py
@@ -74,7 +74,7 @@ def format_date(date_field):
             day = int(split_date[2])
         latest_date = datetime.date(year, month, day)
 
-    return latest_date
+    return str(latest_date)

From 00d53f4b153bd9ffa09197766f529b23d4a7e69d Mon Sep 17 00:00:00 2001
From: ecwood
Date: Tue, 16 Jul 2024 05:01:13 -0700
Subject: [PATCH 019/125] #398 handling an edge case

---
 convert/clinicaltrialskg_tsv_to_kg_jsonl.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/convert/clinicaltrialskg_tsv_to_kg_jsonl.py b/convert/clinicaltrialskg_tsv_to_kg_jsonl.py
index 09934ac1..b26da672 100644
--- a/convert/clinicaltrialskg_tsv_to_kg_jsonl.py
+++ b/convert/clinicaltrialskg_tsv_to_kg_jsonl.py
@@ -48,6 +48,8 @@ def format_id(id: str, prefix: str):
 
 
 def format_date(date_field):
+    if len(date_field) == 0:
+        return str()
     dates = date_field.split(',')
 
     # Arbitrarily far back date to improve on
From 98b6ad59b3fb8b3413c8f0af8da1c77486698a56 Mon Sep 17 00:00:00 2001
From: ecwood
Date: Tue, 16 Jul 2024 05:02:04 -0700
Subject: [PATCH 020/125] #398 some debugging code

---
 convert/clinicaltrialskg_tsv_to_kg_jsonl.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/convert/clinicaltrialskg_tsv_to_kg_jsonl.py b/convert/clinicaltrialskg_tsv_to_kg_jsonl.py
index b26da672..eab79b5e 100644
--- a/convert/clinicaltrialskg_tsv_to_kg_jsonl.py
+++ b/convert/clinicaltrialskg_tsv_to_kg_jsonl.py
@@ -69,6 +69,7 @@ def format_date(date_field):
             latest_date = curr_date
     else:
         split_date = dates[0].split('-')
+        print(split_date, date_field)
         year = int(split_date[0])
         month = int(split_date[1])
         day = 1 # most of the time, there's no day

From a6049fc1a46d22ce48ce2b2413663a38cff79fd2 Mon Sep 17 00:00:00 2001
From: ecwood
Date: Tue, 16 Jul 2024 05:03:13 -0700
Subject: [PATCH 021/125] #398 more debugging code

---
 convert/clinicaltrialskg_tsv_to_kg_jsonl.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/convert/clinicaltrialskg_tsv_to_kg_jsonl.py b/convert/clinicaltrialskg_tsv_to_kg_jsonl.py
index eab79b5e..1d60c367 100644
--- a/convert/clinicaltrialskg_tsv_to_kg_jsonl.py
+++ b/convert/clinicaltrialskg_tsv_to_kg_jsonl.py
@@ -58,6 +58,7 @@ def format_date(date_field):
     if len(dates) > 1:
         for date in dates:
             split_date = date.split('-')
+            print(split_date, date, date_field)
             year = int(split_date[0])
             month = int(split_date[1])
             day = 1 # most of the time, there's no day

From 0cc5805d10feeead18c4d4947dc2698a4db74a9b Mon Sep 17 00:00:00 2001
From: ecwood
Date: Tue, 16 Jul 2024 05:04:08 -0700
Subject: [PATCH 022/125] #398 another (very strange) edge case

---
 convert/clinicaltrialskg_tsv_to_kg_jsonl.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/convert/clinicaltrialskg_tsv_to_kg_jsonl.py b/convert/clinicaltrialskg_tsv_to_kg_jsonl.py
index 1d60c367..fe76d22a 100644
--- a/convert/clinicaltrialskg_tsv_to_kg_jsonl.py
+++ b/convert/clinicaltrialskg_tsv_to_kg_jsonl.py
@@ -57,8 +57,9 @@ def format_date(date_field):
 
     if len(dates) > 1:
         for date in dates:
+            if len(date) == 0:
+                continue
             split_date = date.split('-')
-            print(split_date, date, date_field)
             year = int(split_date[0])
             month = int(split_date[1])
             day = 1 # most of the time, there's no day
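The run of date-handling commits above (015 through 022) is easier to follow in one piece. Below is a consolidated restatement of the format_date logic they converge on, with the edge cases they guard against shown as assertions; this is an illustrative rewrite, not a diff from the repo (it also folds the multi-date and single-date branches into one loop, which yields the same results, and omits the temporary debug prints that patch 024 later removes):

import datetime

# Consolidated restatement (not a patch) of format_date: take a possibly
# comma-separated list of YYYY-MM[-DD] dates and return the latest one as an
# ISO string, tolerating an empty field ('', the patch-019 case) and empty
# tokens (',2020-01', the patch-022 case).
def format_date(date_field: str) -> str:
    if len(date_field) == 0:
        return str()
    latest = datetime.date(1700, 1, 1)      # arbitrarily far-back sentinel
    for token in date_field.split(','):
        if len(token) == 0:
            continue
        parts = token.split('-')
        year, month = int(parts[0]), int(parts[1])
        day = int(parts[2]) if len(parts) > 2 else 1   # day is usually absent
        latest = max(latest, datetime.date(year, month, day))
    return str(latest)

assert format_date('') == ''
assert format_date('2020-05') == '2020-05-01'
assert format_date('2019-01-15,2021-03') == '2021-03-01'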
From a163ae3ba2db3c38ea61af5b66ca7f7d219bf604 Mon Sep 17 00:00:00 2001
From: ecwood
Date: Tue, 16 Jul 2024 05:16:55 -0700
Subject: [PATCH 023/125] #393

---
 convert/chembl_mysql_to_kg_jsonl.py | 32 ++++++++++++++---------------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/convert/chembl_mysql_to_kg_jsonl.py b/convert/chembl_mysql_to_kg_jsonl.py
index 37e7692b..966289d9 100755
--- a/convert/chembl_mysql_to_kg_jsonl.py
+++ b/convert/chembl_mysql_to_kg_jsonl.py
@@ -435,22 +435,22 @@ def make_node(id: str,
                                                   update_date))
 
     # get molecule-to-disease indications
-
-    sql = '''select md.chembl_id, di.mesh_id from molecule_dictionary as md inner join drug_indication as di on md.molregno = di.molregno'''
-    if test_mode:
-        sql += str_sql_row_limit_test_mode
-    with connection.cursor() as cursor:
-        cursor.execute(sql)
-        results = cursor.fetchall()
-        for (chembl_id, mesh_id) in results:
-            subject_curie_id = CHEMBL_CURIE_BASE_COMPOUND + ':' + chembl_id
-            object_curie_id = kg2_util.CURIE_PREFIX_MESH + ':' + mesh_id
-            predicate_label = kg2_util.EDGE_LABEL_BIOLINK_APPLIED_TO_TREAT
-            edges_output.write(kg2_util.make_edge_biolink(subject_curie_id,
-                                                          object_curie_id,
-                                                          predicate_label,
-                                                          CHEMBL_KB_CURIE_ID,
-                                                          update_date))
+# Removed per #393
+    # sql = '''select md.chembl_id, di.mesh_id from molecule_dictionary as md inner join drug_indication as di on md.molregno = di.molregno'''
+    # if test_mode:
+    #     sql += str_sql_row_limit_test_mode
+    # with connection.cursor() as cursor:
+    #     cursor.execute(sql)
+    #     results = cursor.fetchall()
+    #     for (chembl_id, mesh_id) in results:
+    #         subject_curie_id = CHEMBL_CURIE_BASE_COMPOUND + ':' + chembl_id
+    #         object_curie_id = kg2_util.CURIE_PREFIX_MESH + ':' + mesh_id
+    #         predicate_label = kg2_util.EDGE_LABEL_BIOLINK_APPLIED_TO_TREAT
+    #         edges_output.write(kg2_util.make_edge_biolink(subject_curie_id,
+    #                                                       object_curie_id,
+    #                                                       predicate_label,
+    #                                                       CHEMBL_KB_CURIE_ID,
+    #                                                       update_date))
 
     # get metabolism information
     sql = '''select m1.chembl_id as drug_id,

From 65543c9dc8fd78fc88445d3e744cd1dc8275473d Mon Sep 17 00:00:00 2001
From: ecwood
Date: Tue, 16 Jul 2024 05:17:15 -0700
Subject: [PATCH 024/125] #398 remove debugging info

---
 convert/clinicaltrialskg_tsv_to_kg_jsonl.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/convert/clinicaltrialskg_tsv_to_kg_jsonl.py b/convert/clinicaltrialskg_tsv_to_kg_jsonl.py
index fe76d22a..4fa7ac64 100644
--- a/convert/clinicaltrialskg_tsv_to_kg_jsonl.py
+++ b/convert/clinicaltrialskg_tsv_to_kg_jsonl.py
@@ -71,7 +71,6 @@ def format_date(date_field):
             latest_date = curr_date
     else:
         split_date = dates[0].split('-')
-        print(split_date, date_field)
         year = int(split_date[0])
         month = int(split_date[1])
         day = 1 # most of the time, there's no day
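Because #393 comments the indication query out rather than deleting it, the size of the removal is easy to gauge against a local ChEMBL mirror. A hedged one-off check, not part of the build (the connection parameters are placeholders; it assumes the MySQL chembl database the converter already uses, via the PyMySQL driver pinned in requirements-kg2-build.txt):

import pymysql

# One-off sanity check (not part of the build): count the molecule-to-disease
# indication rows that the commented-out query above would have turned into
# applied_to_treat edges. Host/user values here are placeholders.
connection = pymysql.connect(host='localhost', user='ubuntu', db='chembl')
with connection.cursor() as cursor:
    cursor.execute('''select count(*) from molecule_dictionary as md
                      inner join drug_indication as di
                      on md.molregno = di.molregno''')
    print('edges dropped by #393:', cursor.fetchone()[0])
connection.close()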
"graphic" ]] then @@ -116,6 +152,13 @@ fi cd ~ && ${VENV_DIR}/bin/snakemake --snakefile ${snakefile} ${run_flag} -R Finish -j 16 ${dryrun} ${graphic} +${s3_cp_cmd} ${local_kg2_version_file} s3://${s3_bucket_public}/${kg2_version_file} + +if [[ -f ${trigger_file_is_major_release} ]] +then + rm -f ${trigger_file_is_major_release} +fi + date echo "================ script finished ============================" } diff --git a/master-config.shinc b/master-config.shinc index e5c5e8ce..c6e5de45 100644 --- a/master-config.shinc +++ b/master-config.shinc @@ -29,4 +29,5 @@ knowledge_level_agent_type_mapping_file=${MAPS_CODE_DIR}/knowledge-level-agent-t ont_load_inventory_file=${MAPS_CODE_DIR}/ont-load-inventory${test_suffix}.yaml rtx_config_file=RTXConfiguration-config.json biolink_model_version=4.2.0 -infores_registry_version=0.2.8 \ No newline at end of file +infores_registry_version=0.2.8 +version= \ No newline at end of file diff --git a/process/run-simplify.sh b/process/run-simplify.sh index 2c329430..6a8951b7 100755 --- a/process/run-simplify.sh +++ b/process/run-simplify.sh @@ -18,41 +18,12 @@ date CONFIG_DIR=`dirname "$0"` source ${CONFIG_DIR}/master-config.shinc -trigger_file_is_major_release=${BUILD_DIR}/major-release -trigger_file_is_minor_release=${BUILD_DIR}/minor-release - input_nodes_json=${1:-} input_edges_json=${2:-} output_nodes_json=${3:-} output_edges_json=${4:-} local_version_filename=${5:-"${BUILD_DIR}/kg2-version.txt"} build_flag=${6:-""} -s3_version_filename="kg2-version.txt" - -${s3_cp_cmd} s3://${s3_bucket_public}/${s3_version_filename} ${local_version_filename} -test_flag='' -increment_flag='' -if [[ "${build_flag}" == 'test' ]] -then - test_flag='--test' -else - if [ -e ${trigger_file_is_major_release} ] - then - increment_flag='--increment_major' - else - if [ -e ${trigger_file_is_minor_release} ] - then - increment_flag='--increment_minor' - fi - fi -fi - -if [[ "${increment_flag}" != '' ]] -then - ${VENV_DIR}/bin/python3 ${PROCESS_CODE_DIR}/update_version.py ${increment_flag} ${local_version_filename} -else - echo "*** TEST MODE -- NO INCREMENT ***" -fi # TODO: Inhibits and increase are not in biolink model anymore - Find out what that should be now ${VENV_DIR}/bin/python3 -u ${PROCESS_CODE_DIR}/filter_kg_and_remap_predicates.py ${test_flag} --dropNegated \ @@ -60,12 +31,6 @@ ${VENV_DIR}/bin/python3 -u ${PROCESS_CODE_DIR}/filter_kg_and_remap_predicates.py ${predicate_mapping_file} ${infores_mapping_file} ${curies_to_urls_file} \ ${knowledge_level_agent_type_mapping_file} ${input_nodes_json} ${input_edges_json} \ ${output_nodes_json} ${output_edges_json} ${local_version_filename} -${s3_cp_cmd} ${local_version_filename} s3://${s3_bucket_public}/${s3_version_filename} - -if [[ -f ${trigger_file_is_major_release} ]] -then - rm -f ${trigger_file_is_major_release} -fi date echo "================= finishing run-simplify.sh ==================" From 2796ecd62db58d9b4d36870c70e34ff0e8e59491 Mon Sep 17 00:00:00 2001 From: ecwood Date: Tue, 16 Jul 2024 15:46:50 -0700 Subject: [PATCH 026/125] #140 comment out s3 command for ci --- build/build-kg2-snakemake.sh | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/build/build-kg2-snakemake.sh b/build/build-kg2-snakemake.sh index 5fb7a8bb..4e476038 100755 --- a/build/build-kg2-snakemake.sh +++ b/build/build-kg2-snakemake.sh @@ -152,13 +152,21 @@ fi cd ~ && ${VENV_DIR}/bin/snakemake --snakefile ${snakefile} ${run_flag} -R Finish -j 16 ${dryrun} ${graphic} -${s3_cp_cmd} ${local_kg2_version_file} 
From 1270624a1caeb927b9ded2c2ff8490db056cb3a8 Mon Sep 17 00:00:00 2001
From: ecwood
Date: Tue, 16 Jul 2024 15:59:33 -0700
Subject: [PATCH 027/125] #140 a lot more changes to log file names and tsv output

---
 build/build-kg2-snakemake.sh    |  34 ++++++-----
 build/snakemake-config-var.yaml | 103 ++++++++++++++++----------------
 master-config.shinc             |   2 +-
 3 files changed, 71 insertions(+), 68 deletions(-)

diff --git a/build/build-kg2-snakemake.sh b/build/build-kg2-snakemake.sh
index 4e476038..6d313846 100755
--- a/build/build-kg2-snakemake.sh
+++ b/build/build-kg2-snakemake.sh
@@ -64,19 +64,6 @@ then
     run_flag="-F"
 fi
 
-build_kg2_log_file=${BUILD_DIR}/build-kg2-snakemake${dryrun}${test_suffix}.log
-touch ${build_kg2_log_file}
-if [[ "${ci_flag}" == "ci" ]]
-then
-    trap "cat ${build_kg2_log_file}" EXIT
-fi
-
-function build_kg2 () {
-echo "================= starting build-kg2-snakemake.sh =================="
-date
-
-export PATH=$PATH:${BUILD_DIR}
-
 kg2_version_file="kg2-version.txt"
 local_kg2_version_file="${BUILD_DIR}/${kg2_version_file}"
 trigger_file_is_major_release=${BUILD_DIR}/major-release
@@ -100,7 +87,7 @@ fi
 
 if [[ "${ci_flag}" == "ci" ]]
 then
-    sed -i "\@^version=@cversion=KG2.CI" ${CODE_DIR}/master-config.shinc
+    sed -i "\@^kg2_version=@ckg2_version=KG2.CI" ${CODE_DIR}/master-config.shinc
 else
     ${s3_cp_cmd} s3://${s3_bucket_public}/${kg2_version_file} ${local_kg2_version_file}
     if [[ "${increment_flag}" != '' ]]
@@ -109,10 +96,25 @@ else
     else
         echo "*** TEST MODE -- NO INCREMENT ***"
     fi
-    kg2_version=`cat ${local_kg2_version_file}`
-    sed -i "\@^version=@cversion=${kg2_version}" ${CODE_DIR}/master-config.shinc
+    curr_kg2_version=`cat ${local_kg2_version_file}`
+    sed -i "\@^kg2_version=@ckg2_version=${curr_kg2_version}" ${CODE_DIR}/master-config.shinc
 fi
 
+source ${config_dir}/master-config.shinc
+
+build_kg2_log_file=${BUILD_DIR}/build-kg2-snakemake-${kg2_version}${dryrun}${test_suffix}.log
+touch ${build_kg2_log_file}
+if [[ "${ci_flag}" == "ci" ]]
+then
+    trap "cat ${build_kg2_log_file}" EXIT
+fi
+
+function build_kg2 () {
+echo "================= starting build-kg2-snakemake.sh =================="
+date
+
+export PATH=$PATH:${BUILD_DIR}
+
 snakemake_config_file=${BUILD_CODE_DIR}/snakemake-config.yaml
 snakefile=${BUILD_CODE_DIR}/Snakefile
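The config file below leans heavily on ${...} references (for example, version_suffix: -${kg2_version}) that generate_snakemake_config_file.py presumably expands before Snakemake sees them. A sketch of that expansion under the assumption that it is plain repeated textual substitution (the real generator may differ):

import string

# Sketch (assumption: the config generator expands ${...} references by
# repeated substitution until the text stops changing; the real
# generate_snakemake_config_file.py may work differently).
def expand(value: str, variables: dict) -> str:
    while True:
        new = string.Template(value).safe_substitute(variables)
        if new == value:
            return new
        value = new

variables = {'kg2_version': '2.9.0', 'BUILD_DIR': '/home/ubuntu/kg2-build',
             'test_suffix': '', 'merge_base': 'merge_graphs'}
variables['version_suffix'] = expand('-${kg2_version}', variables)
print(expand('${BUILD_DIR}/${merge_base}${version_suffix}${test_suffix}.log', variables))
# -> /home/ubuntu/kg2-build/merge_graphs-2.9.0.log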
${BUILD_DIR}/${umls_extraction_base}${test_suffix}.log +umls_extraction_log: ${BUILD_DIR}/${umls_extraction_base}${version_suffix}${test_suffix}.log umls_extract_file: ${BUILD_DIR}/umls.jsonl umls_conversion_script: ${CONVERT_CODE_DIR}/${umls_conversion_base}.py -umls_conversion_log: ${BUILD_DIR}/${umls_conversion_base}${test_suffix}.log +umls_conversion_log: ${BUILD_DIR}/${umls_conversion_base}${version_suffix}${test_suffix}.log umls_name_heirarchy: ${MAPS_CODE_DIR}/umls-name-heirarchy.yaml umls_tui_map: ${MAPS_CODE_DIR}/tui_combo_mappings.json umls_output_nodes_file: ${BUILD_DIR}/${umls_output_base}${nodes_suffix}${test_suffix}.jsonl @@ -22,7 +23,7 @@ umls_output_edges_file: ${BUILD_DIR}/${umls_output_base}${edges_suffix}${test_su ont_conversion_base: build-multi-ont-kg ont_output_base: kg2-ont ont_conversion_script: ${CONVERT_CODE_DIR}/${ont_conversion_base}.sh -ont_conversion_log: ${BUILD_DIR}/${ont_conversion_base}${test_suffix}.log +ont_conversion_log: ${BUILD_DIR}/${ont_conversion_base}${version_suffix}${test_suffix}.log ont_output_nodes_file: ${BUILD_DIR}/${ont_output_base}${nodes_suffix}${test_suffix}.jsonl ont_output_edges_file: ${BUILD_DIR}/${ont_output_base}${edges_suffix}${test_suffix}.jsonl @@ -30,12 +31,12 @@ semmeddb_extraction_base: extract-semmeddb semmeddb_conversion_base: semmeddb_tuplelist_json_to_kg_jsonl semmeddb_output_base: kg2-semmeddb semmeddb_extraction_script: ${EXTRACT_CODE_DIR}/${semmeddb_extraction_base}.sh -semmeddb_extraction_log: ${BUILD_DIR}/${semmeddb_extraction_base}${test_suffix}.log +semmeddb_extraction_log: ${BUILD_DIR}/${semmeddb_extraction_base}${version_suffix}${test_suffix}.log semmeddb_tuplelist_file: ${BUILD_DIR}/semmeddb-tuplelist.jsonl semmeddb_exclusion_file: ${BUILD_DIR}/semmed-exclude-list.yaml semmeddb_version_file: ${BUILD_DIR}/semmeddb-version.txt semmeddb_conversion_script: ${CONVERT_CODE_DIR}/${semmeddb_conversion_base}.py -semmeddb_conversion_log: ${BUILD_DIR}/${semmeddb_conversion_base}${test_suffix}.log +semmeddb_conversion_log: ${BUILD_DIR}/${semmeddb_conversion_base}${version_suffix}${test_suffix}.log semmeddb_output_nodes_file: ${BUILD_DIR}/${semmeddb_output_base}${nodes_suffix}${test_suffix}.jsonl semmeddb_output_edges_file: ${BUILD_DIR}/${semmeddb_output_base}${edges_suffix}${test_suffix}.jsonl @@ -43,10 +44,10 @@ uniprotkb_extraction_base: extract-uniprotkb uniprotkb_conversion_base: uniprotkb_dat_to_kg_jsonl uniprotkb_output_base: kg2-uniprotkb uniprotkb_extraction_script: ${EXTRACT_CODE_DIR}/${uniprotkb_extraction_base}.sh -uniprotkb_extraction_log: ${BUILD_DIR}/${uniprotkb_extraction_base}${test_suffix}.log +uniprotkb_extraction_log: ${BUILD_DIR}/${uniprotkb_extraction_base}${version_suffix}${test_suffix}.log uniprotkb_dat_file: ${BUILD_DIR}/uniprotkb/uniprot_sprot.dat uniprotkb_conversion_script: ${CONVERT_CODE_DIR}/${uniprotkb_conversion_base}.py -uniprotkb_conversion_log: ${BUILD_DIR}/${uniprotkb_conversion_base}${test_suffix}.log +uniprotkb_conversion_log: ${BUILD_DIR}/${uniprotkb_conversion_base}${version_suffix}${test_suffix}.log uniprotkb_output_nodes_file: ${BUILD_DIR}/${uniprotkb_output_base}${nodes_suffix}${test_suffix}.jsonl uniprotkb_output_edges_file: ${BUILD_DIR}/${uniprotkb_output_base}${edges_suffix}${test_suffix}.jsonl @@ -54,10 +55,10 @@ ensembl_extraction_base: extract-ensembl ensembl_conversion_base: ensembl_json_to_kg_jsonl ensembl_output_base: kg2-ensembl ensembl_extraction_script: ${EXTRACT_CODE_DIR}/${ensembl_extraction_base}.sh -ensembl_extraction_log: 
${BUILD_DIR}/${ensembl_extraction_base}${test_suffix}.log +ensembl_extraction_log: ${BUILD_DIR}/${ensembl_extraction_base}${version_suffix}${test_suffix}.log ensembl_source_json_file: ${BUILD_DIR}/ensembl/ensembl_genes_homo_sapiens.json ensembl_conversion_script: ${CONVERT_CODE_DIR}/${ensembl_conversion_base}.py -ensembl_conversion_log: ${BUILD_DIR}/${ensembl_conversion_base}${test_suffix}.log +ensembl_conversion_log: ${BUILD_DIR}/${ensembl_conversion_base}${version_suffix}${test_suffix}.log ensembl_output_nodes_file: ${BUILD_DIR}/${ensembl_output_base}${nodes_suffix}${test_suffix}.jsonl ensembl_output_edges_file: ${BUILD_DIR}/${ensembl_output_base}${edges_suffix}${test_suffix}.jsonl @@ -65,10 +66,10 @@ unichem_extraction_base: extract-unichem unichem_conversion_base: unichem_tsv_to_kg_jsonl unichem_output_base: kg2-unichem unichem_extraction_script: ${EXTRACT_CODE_DIR}/${unichem_extraction_base}.sh -unichem_extraction_log: ${BUILD_DIR}/${unichem_extraction_base}${test_suffix}.log +unichem_extraction_log: ${BUILD_DIR}/${unichem_extraction_base}${version_suffix}${test_suffix}.log unichem_output_tsv_file: ${BUILD_DIR}/unichem/unichem-mappings.tsv unichem_conversion_script: ${CONVERT_CODE_DIR}/${unichem_conversion_base}.py -unichem_conversion_log: ${BUILD_DIR}/${unichem_conversion_base}${test_suffix}.log +unichem_conversion_log: ${BUILD_DIR}/${unichem_conversion_base}${version_suffix}${test_suffix}.log unichem_output_nodes_file: ${BUILD_DIR}/${unichem_output_base}${nodes_suffix}${test_suffix}.jsonl unichem_output_edges_file: ${BUILD_DIR}/${unichem_output_base}${edges_suffix}${test_suffix}.jsonl @@ -76,10 +77,10 @@ chembl_extraction_base: extract-chembl chembl_conversion_base: chembl_mysql_to_kg_jsonl chembl_output_base: kg2-chembl chembl_extraction_script: ${EXTRACT_CODE_DIR}/${chembl_extraction_base}.sh -chembl_extraction_log: ${BUILD_DIR}/${chembl_extraction_base}${test_suffix}.log +chembl_extraction_log: ${BUILD_DIR}/${chembl_extraction_base}${version_suffix}${test_suffix}.log chembl_mysql_dbname: chembl chembl_conversion_script: ${CONVERT_CODE_DIR}/${chembl_conversion_base}.py -chembl_conversion_log: ${BUILD_DIR}/${chembl_conversion_base}${test_suffix}.log +chembl_conversion_log: ${BUILD_DIR}/${chembl_conversion_base}${version_suffix}${test_suffix}.log chembl_output_nodes_file: ${BUILD_DIR}/${chembl_output_base}${nodes_suffix}${test_suffix}.jsonl chembl_output_edges_file: ${BUILD_DIR}/${chembl_output_base}${edges_suffix}${test_suffix}.jsonl @@ -87,10 +88,10 @@ ncbigene_extraction_base: extract-ncbigene ncbigene_conversion_base: ncbigene_tsv_to_kg_jsonl ncbigene_output_base: kg2-ncbigene ncbigene_extraction_script: ${EXTRACT_CODE_DIR}/${ncbigene_extraction_base}.sh -ncbigene_extraction_log: ${BUILD_DIR}/${ncbigene_extraction_base}${test_suffix}.log +ncbigene_extraction_log: ${BUILD_DIR}/${ncbigene_extraction_base}${version_suffix}${test_suffix}.log ncbigene_tsv_file: ${BUILD_DIR}/ncbigene/Homo_sapiens_gene_info.tsv ncbigene_conversion_script: ${CONVERT_CODE_DIR}/${ncbigene_conversion_base}.py -ncbigene_conversion_log: ${BUILD_DIR}/${ncbigene_conversion_base}${test_suffix}.log +ncbigene_conversion_log: ${BUILD_DIR}/${ncbigene_conversion_base}${version_suffix}${test_suffix}.log ncbigene_output_nodes_file: ${BUILD_DIR}/${ncbigene_output_base}${nodes_suffix}${test_suffix}.jsonl ncbigene_output_edges_file: ${BUILD_DIR}/${ncbigene_output_base}${edges_suffix}${test_suffix}.jsonl @@ -98,10 +99,10 @@ dgidb_extraction_base: extract-dgidb dgidb_conversion_base: dgidb_tsv_to_kg_jsonl 
dgidb_output_base: kg2-dgidb dgidb_extraction_script: ${EXTRACT_CODE_DIR}/${dgidb_extraction_base}.sh -dgidb_extraction_log: ${BUILD_DIR}/${dgidb_extraction_base}${test_suffix}.log +dgidb_extraction_log: ${BUILD_DIR}/${dgidb_extraction_base}${version_suffix}${test_suffix}.log dgidb_dir: ${BUILD_DIR}/dgidb dgidb_conversion_script: ${CONVERT_CODE_DIR}/${dgidb_conversion_base}.py -dgidb_conversion_log: ${BUILD_DIR}/${dgidb_conversion_base}${test_suffix}.log +dgidb_conversion_log: ${BUILD_DIR}/${dgidb_conversion_base}${version_suffix}${test_suffix}.log dgidb_output_nodes_file: ${BUILD_DIR}/${dgidb_output_base}${nodes_suffix}${test_suffix}.jsonl dgidb_output_edges_file: ${BUILD_DIR}/${dgidb_output_base}${edges_suffix}${test_suffix}.jsonl @@ -109,11 +110,11 @@ repodb_extraction_base: extract-repodb repodb_conversion_base: repodb_csv_to_kg_jsonl repodb_output_base: kg2-repodb repodb_extraction_script: ${EXTRACT_CODE_DIR}/${repodb_extraction_base}.sh -repodb_extraction_log: ${BUILD_DIR}/${repodb_extraction_base}${test_suffix}.log +repodb_extraction_log: ${BUILD_DIR}/${repodb_extraction_base}${version_suffix}${test_suffix}.log repodb_dir: ${BUILD_DIR}/repodb repodb_input_file: ${repodb_dir}/repodb.csv repodb_conversion_script: ${CONVERT_CODE_DIR}/${repodb_conversion_base}.py -repodb_conversion_log: ${BUILD_DIR}/${repodb_conversion_base}${test_suffix}.log +repodb_conversion_log: ${BUILD_DIR}/${repodb_conversion_base}${version_suffix}${test_suffix}.log repodb_output_nodes_file: ${BUILD_DIR}/${repodb_output_base}${nodes_suffix}${test_suffix}.jsonl repodb_output_edges_file: ${BUILD_DIR}/${repodb_output_base}${edges_suffix}${test_suffix}.jsonl @@ -121,10 +122,10 @@ drugbank_extraction_base: extract-drugbank drugbank_conversion_base: drugbank_xml_to_kg_jsonl drugbank_output_base: kg2-drugbank drugbank_extraction_script: ${EXTRACT_CODE_DIR}/${drugbank_extraction_base}.sh -drugbank_extraction_log: ${BUILD_DIR}/${drugbank_extraction_base}${test_suffix}.log +drugbank_extraction_log: ${BUILD_DIR}/${drugbank_extraction_base}${version_suffix}${test_suffix}.log drugbank_input_file: ${BUILD_DIR}/drugbank.xml drugbank_conversion_script: ${CONVERT_CODE_DIR}/${drugbank_conversion_base}.py -drugbank_conversion_log: ${BUILD_DIR}/${drugbank_conversion_base}${test_suffix}.log +drugbank_conversion_log: ${BUILD_DIR}/${drugbank_conversion_base}${version_suffix}${test_suffix}.log drugbank_output_nodes_file: ${BUILD_DIR}/${drugbank_output_base}${nodes_suffix}${test_suffix}.jsonl drugbank_output_edges_file: ${BUILD_DIR}/${drugbank_output_base}${edges_suffix}${test_suffix}.jsonl @@ -132,11 +133,11 @@ smpdb_extraction_base: extract-smpdb smpdb_conversion_base: smpdb_csv_to_kg_jsonl smpdb_output_base: kg2-smpdb smpdb_extraction_script: ${EXTRACT_CODE_DIR}/${smpdb_extraction_base}.sh -smpdb_extraction_log: ${BUILD_DIR}/${smpdb_extraction_base}${test_suffix}.log +smpdb_extraction_log: ${BUILD_DIR}/${smpdb_extraction_base}${version_suffix}${test_suffix}.log smpdb_dir: ${BUILD_DIR}/smpdb smpdb_input_file: ${smpdb_dir}/pathbank_pathways.csv smpdb_conversion_script: ${CONVERT_CODE_DIR}/${smpdb_conversion_base}.py -smpdb_conversion_log: ${BUILD_DIR}/${smpdb_conversion_base}${test_suffix}.log +smpdb_conversion_log: ${BUILD_DIR}/${smpdb_conversion_base}${version_suffix}${test_suffix}.log smpdb_output_nodes_file: ${BUILD_DIR}/${smpdb_output_base}${nodes_suffix}${test_suffix}.jsonl smpdb_output_edges_file: ${BUILD_DIR}/${smpdb_output_base}${edges_suffix}${test_suffix}.jsonl @@ -144,10 +145,10 @@ hmdb_extraction_base: extract-hmdb 
hmdb_conversion_base: hmdb_xml_to_kg_jsonl hmdb_output_base: kg2-hmdb hmdb_extraction_script: ${EXTRACT_CODE_DIR}/${hmdb_extraction_base}.sh -hmdb_extraction_log: ${BUILD_DIR}/${hmdb_extraction_base}${test_suffix}.log +hmdb_extraction_log: ${BUILD_DIR}/${hmdb_extraction_base}${version_suffix}${test_suffix}.log hmdb_input_file: ${BUILD_DIR}/hmdb_metabolites.xml hmdb_conversion_script: ${CONVERT_CODE_DIR}/${hmdb_conversion_base}.py -hmdb_conversion_log: ${BUILD_DIR}/${hmdb_conversion_base}${test_suffix}.log +hmdb_conversion_log: ${BUILD_DIR}/${hmdb_conversion_base}${version_suffix}${test_suffix}.log hmdb_output_nodes_file: ${BUILD_DIR}/${hmdb_output_base}${nodes_suffix}${test_suffix}.jsonl hmdb_output_edges_file: ${BUILD_DIR}/${hmdb_output_base}${edges_suffix}${test_suffix}.jsonl @@ -155,10 +156,10 @@ go_annotations_extraction_base: extract-go-annotations go_annotations_conversion_base: go_gpa_to_kg_jsonl go_annotations_output_base: kg2-go-annotations go_annotations_extraction_script: ${EXTRACT_CODE_DIR}/${go_annotations_extraction_base}.sh -go_annotations_extraction_log: ${BUILD_DIR}/${go_annotations_extraction_base}${test_suffix}.log +go_annotations_extraction_log: ${BUILD_DIR}/${go_annotations_extraction_base}${version_suffix}${test_suffix}.log go_annotations_input_file: ${BUILD_DIR}/goa_human.gpa go_annotations_conversion_script: ${CONVERT_CODE_DIR}/${go_annotations_conversion_base}.py -go_annotations_conversion_log: ${BUILD_DIR}/${go_annotations_conversion_base}${test_suffix}.log +go_annotations_conversion_log: ${BUILD_DIR}/${go_annotations_conversion_base}${version_suffix}${test_suffix}.log go_annotations_output_nodes_file: ${BUILD_DIR}/${go_annotations_output_base}${nodes_suffix}${test_suffix}.jsonl go_annotations_output_edges_file: ${BUILD_DIR}/${go_annotations_output_base}${edges_suffix}${test_suffix}.jsonl @@ -166,10 +167,10 @@ reactome_extraction_base: extract-reactome reactome_conversion_base: reactome_mysql_to_kg_jsonl reactome_output_base: kg2-reactome reactome_extraction_script: ${EXTRACT_CODE_DIR}/${reactome_extraction_base}.sh -reactome_extraction_log: ${BUILD_DIR}/${reactome_extraction_base}${test_suffix}.log +reactome_extraction_log: ${BUILD_DIR}/${reactome_extraction_base}${version_suffix}${test_suffix}.log reactome_mysql_dbname: reactome reactome_conversion_script: ${CONVERT_CODE_DIR}/${reactome_conversion_base}.py -reactome_conversion_log: ${BUILD_DIR}/${reactome_conversion_base}${test_suffix}.log +reactome_conversion_log: ${BUILD_DIR}/${reactome_conversion_base}${version_suffix}${test_suffix}.log reactome_output_nodes_file: ${BUILD_DIR}/${reactome_output_base}${nodes_suffix}${test_suffix}.jsonl reactome_output_edges_file: ${BUILD_DIR}/${reactome_output_base}${edges_suffix}${test_suffix}.jsonl @@ -177,10 +178,10 @@ mirbase_extraction_base: extract-mirbase mirbase_conversion_base: mirbase_dat_to_kg_jsonl mirbase_output_base: kg2-mirbase mirbase_extraction_script: ${EXTRACT_CODE_DIR}/${mirbase_extraction_base}.sh -mirbase_extraction_log: ${BUILD_DIR}/${mirbase_extraction_base}${test_suffix}.log +mirbase_extraction_log: ${BUILD_DIR}/${mirbase_extraction_base}${version_suffix}${test_suffix}.log mirbase_input_file: ${BUILD_DIR}/miRNA.dat mirbase_conversion_script: ${CONVERT_CODE_DIR}/${mirbase_conversion_base}.py -mirbase_conversion_log: ${BUILD_DIR}/${mirbase_conversion_base}${test_suffix}.log +mirbase_conversion_log: ${BUILD_DIR}/${mirbase_conversion_base}${version_suffix}${test_suffix}.log mirbase_output_nodes_file: 
${BUILD_DIR}/${mirbase_output_base}${nodes_suffix}${test_suffix}.jsonl mirbase_output_edges_file: ${BUILD_DIR}/${mirbase_output_base}${edges_suffix}${test_suffix}.jsonl @@ -188,10 +189,10 @@ jensenlab_extraction_base: extract-jensenlab jensenlab_conversion_base: jensenlab_tsv_to_kg_jsonl jensenlab_output_base: kg2-jensenlab jensenlab_extraction_script: ${EXTRACT_CODE_DIR}/${jensenlab_extraction_base}.sh -jensenlab_extraction_log: ${BUILD_DIR}/${jensenlab_extraction_base}${test_suffix}.log +jensenlab_extraction_log: ${BUILD_DIR}/${jensenlab_extraction_base}${version_suffix}${test_suffix}.log jensenlab_dir: ${BUILD_DIR}/jensenlab jensenlab_conversion_script: ${CONVERT_CODE_DIR}/${jensenlab_conversion_base}.py -jensenlab_conversion_log: ${BUILD_DIR}/${jensenlab_conversion_base}${test_suffix}.log +jensenlab_conversion_log: ${BUILD_DIR}/${jensenlab_conversion_base}${version_suffix}${test_suffix}.log jensenlab_output_nodes_file: ${BUILD_DIR}/${jensenlab_output_base}${nodes_suffix}${test_suffix}.jsonl jensenlab_output_edges_file: ${BUILD_DIR}/${jensenlab_output_base}${edges_suffix}${test_suffix}.jsonl @@ -199,11 +200,11 @@ drugcentral_extraction_base: extract-drugcentral drugcentral_conversion_base: drugcentral_json_to_kg_jsonl drugcentral_output_base: kg2-drugcentral drugcentral_extraction_script: ${EXTRACT_CODE_DIR}/${drugcentral_extraction_base}.sh -drugcentral_extraction_log: ${BUILD_DIR}/${drugcentral_extraction_base}${test_suffix}.log +drugcentral_extraction_log: ${BUILD_DIR}/${drugcentral_extraction_base}${version_suffix}${test_suffix}.log drugcentral_dir: ${BUILD_DIR}/drugcentral drugcentral_input_file: ${drugcentral_dir}/drugcentral_psql_json.json drugcentral_conversion_script: ${CONVERT_CODE_DIR}/${drugcentral_conversion_base}.py -drugcentral_conversion_log: ${BUILD_DIR}/${drugcentral_conversion_base}${test_suffix}.log +drugcentral_conversion_log: ${BUILD_DIR}/${drugcentral_conversion_base}${version_suffix}${test_suffix}.log drugcentral_output_nodes_file: ${BUILD_DIR}/${drugcentral_output_base}${nodes_suffix}${test_suffix}.jsonl drugcentral_output_edges_file: ${BUILD_DIR}/${drugcentral_output_base}${edges_suffix}${test_suffix}.jsonl @@ -211,10 +212,10 @@ intact_extraction_base: extract-intact intact_conversion_base: intact_tsv_to_kg_jsonl intact_output_base: kg2-intact intact_extraction_script: ${EXTRACT_CODE_DIR}/${intact_extraction_base}.sh -intact_extraction_log: ${BUILD_DIR}/${intact_extraction_base}${test_suffix}.log +intact_extraction_log: ${BUILD_DIR}/${intact_extraction_base}${version_suffix}${test_suffix}.log intact_input_file: ${BUILD_DIR}/intact.txt intact_conversion_script: ${CONVERT_CODE_DIR}/${intact_conversion_base}.py -intact_conversion_log: ${BUILD_DIR}/${intact_conversion_base}${test_suffix}.log +intact_conversion_log: ${BUILD_DIR}/${intact_conversion_base}${version_suffix}${test_suffix}.log intact_output_nodes_file: ${BUILD_DIR}/${intact_output_base}${nodes_suffix}${test_suffix}.jsonl intact_output_edges_file: ${BUILD_DIR}/${intact_output_base}${edges_suffix}${test_suffix}.jsonl @@ -222,10 +223,10 @@ disgenet_extraction_base: extract-disgenet disgenet_conversion_base: disgenet_tsv_to_kg_jsonl disgenet_output_base: kg2-disgenet disgenet_extraction_script: ${EXTRACT_CODE_DIR}/${disgenet_extraction_base}.sh -disgenet_extraction_log: ${BUILD_DIR}/${disgenet_extraction_base}${test_suffix}.log +disgenet_extraction_log: ${BUILD_DIR}/${disgenet_extraction_base}${version_suffix}${test_suffix}.log disgenet_input_file: ${BUILD_DIR}/all_gene_disease_pmid_associations.tsv 
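# (Annotation: a worked expansion of the ${version_suffix} pattern that this
# patch threads into every *_log path above and below. The concrete values are
# assumptions for illustration only -- e.g. BUILD_DIR=~/kg2-build,
# version_suffix=-KG2.9.0, test_suffix=-test; none of them appear in this diff.)
#
#   disgenet_conversion_log: ${BUILD_DIR}/${disgenet_conversion_base}${version_suffix}${test_suffix}.log
#   => ~/kg2-build/disgenet_tsv_to_kg_jsonl-KG2.9.0-test.log
#
# In a full (non-test) run the test suffix would be empty, so the same template
# would yield ~/kg2-build/disgenet_tsv_to_kg_jsonl-KG2.9.0.log, keeping one log
# per source, per stage, per KG2 version.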
disgenet_conversion_script: ${CONVERT_CODE_DIR}/${disgenet_conversion_base}.py
-disgenet_conversion_log: ${BUILD_DIR}/${disgenet_conversion_base}${test_suffix}.log
+disgenet_conversion_log: ${BUILD_DIR}/${disgenet_conversion_base}${version_suffix}${test_suffix}.log
disgenet_output_nodes_file: ${BUILD_DIR}/${disgenet_output_base}${nodes_suffix}${test_suffix}.jsonl
disgenet_output_edges_file: ${BUILD_DIR}/${disgenet_output_base}${edges_suffix}${test_suffix}.jsonl
@@ -233,10 +234,10 @@ kegg_extraction_base: extract-kegg
kegg_conversion_base: kegg_jsonl_to_kg_jsonl
kegg_output_base: kg2-kegg
kegg_extraction_script: ${EXTRACT_CODE_DIR}/${kegg_extraction_base}.sh
-kegg_extraction_log: ${BUILD_DIR}/${kegg_extraction_base}${test_suffix}.log
+kegg_extraction_log: ${BUILD_DIR}/${kegg_extraction_base}${version_suffix}${test_suffix}.log
kegg_input_file: ${BUILD_DIR}/kegg.jsonl
kegg_conversion_script: ${CONVERT_CODE_DIR}/${kegg_conversion_base}.py
-kegg_conversion_log: ${BUILD_DIR}/${kegg_conversion_base}${test_suffix}.log
+kegg_conversion_log: ${BUILD_DIR}/${kegg_conversion_base}${version_suffix}${test_suffix}.log
kegg_output_nodes_file: ${BUILD_DIR}/${kegg_output_base}${nodes_suffix}${test_suffix}.jsonl
kegg_output_edges_file: ${BUILD_DIR}/${kegg_output_base}${edges_suffix}${test_suffix}.jsonl
@@ -244,17 +245,17 @@ clinicaltrialskg_extraction_base: extract-clinicaltrialskg
clinicaltrialskg_conversion_base: clinicaltrialskg_tsv_to_kg_jsonl
clinicaltrialskg_output_base: kg2-clinicaltrialskg
clinicaltrialskg_extraction_script: ${EXTRACT_CODE_DIR}/${clinicaltrialskg_extraction_base}.sh
-clinicaltrialskg_extraction_log: ${BUILD_DIR}/${clinicaltrialskg_extraction_base}${test_suffix}.log
+clinicaltrialskg_extraction_log: ${BUILD_DIR}/${clinicaltrialskg_extraction_base}${version_suffix}${test_suffix}.log
clinicaltrialskg_input_file: ${BUILD_DIR}/clinicaltrialskg-edges.tsv
clinicaltrialskg_conversion_script: ${CONVERT_CODE_DIR}/${clinicaltrialskg_conversion_base}.py
-clinicaltrialskg_conversion_log: ${BUILD_DIR}/${clinicaltrialskg_conversion_base}${test_suffix}.log
+clinicaltrialskg_conversion_log: ${BUILD_DIR}/${clinicaltrialskg_conversion_base}${version_suffix}${test_suffix}.log
clinicaltrialskg_output_nodes_file: ${BUILD_DIR}/${clinicaltrialskg_output_base}${nodes_suffix}${test_suffix}.jsonl
clinicaltrialskg_output_edges_file: ${BUILD_DIR}/${clinicaltrialskg_output_base}${edges_suffix}${test_suffix}.jsonl
merge_base: merge_graphs
merge_script: ${PROCESS_CODE_DIR}/${merge_base}.py
merged_output_base: kg2-merged
-merge_log: ${BUILD_DIR}/${merge_base}${test_suffix}.log
+merge_log: ${BUILD_DIR}/${merge_base}${version_suffix}${test_suffix}.log
merged_output_nodes_file: ${BUILD_DIR}/${merged_output_base}${nodes_suffix}${test_suffix}.jsonl
merged_output_edges_file: ${BUILD_DIR}/${merged_output_base}${edges_suffix}${test_suffix}.jsonl
output_file_orphan_edges: ${BUILD_DIR}/kg2-orphan${edges_suffix}${test_suffix}.jsonl
@@ -262,31 +263,31 @@ output_file_orphan_edges: ${BUILD_DIR}/kg2-orphan${edges_suffix}${test_suffix}.j
simplify_base: run-simplify
simplified_output_base: kg2-simplified
simplify_script: ${PROCESS_CODE_DIR}/${simplify_base}.sh
-simplify_log: ${BUILD_DIR}/${simplify_base}${test_suffix}.log
+simplify_log: ${BUILD_DIR}/${simplify_base}${version_suffix}${test_suffix}.log
simplified_output_nodes_file: ${BUILD_DIR}/${simplified_output_base}${nodes_suffix}${test_suffix}.jsonl
simplified_output_edges_file: ${BUILD_DIR}/${simplified_output_base}${edges_suffix}${test_suffix}.jsonl
report_base: 
report_stats_on_kg_jsonl report_script: ${PROCESS_CODE_DIR}/${report_base}.py -report_log: ${BUILD_DIR}/${report_base}${test_suffix}.log +report_log: ${BUILD_DIR}/${report_base}${version_suffix}${test_suffix}.log report_file: ${BUILD_DIR}/kg2-report${test_suffix}.json -simplified_report_log: ${BUILD_DIR}/${report_base}-simplified${test_suffix}.log +simplified_report_log: ${BUILD_DIR}/${report_base}-simplified${version_suffix}${test_suffix}.log simplified_report_file_base: kg2-simplified-report${test_suffix}.json simplified_report_file: ${BUILD_DIR}/${simplified_report_file_base} slim_base: slim_kg2 slim_output_base: kg2-slim slim_script: ${PROCESS_CODE_DIR}/${slim_base}.py -slim_log: ${BUILD_DIR}/${slim_base}${test_suffix}.log +slim_log: ${BUILD_DIR}/${slim_base}${version_suffix}${test_suffix}.log slim_output_nodes_file: ${BUILD_DIR}/${slim_output_base}${nodes_suffix}${test_suffix}.jsonl slim_output_edges_file: ${BUILD_DIR}/${slim_output_base}${edges_suffix}${test_suffix}.jsonl tsv_base: kg_json_to_tsv tsv_script: ${PROCESS_CODE_DIR}/${tsv_base}.py -tsv_log: ${BUILD_DIR}/${tsv_base}${test_suffix}.log +tsv_log: ${BUILD_DIR}/${tsv_base}${version_suffix}${test_suffix}.log kg2_tsv_dir: ${BUILD_DIR}/TSV -kg2_tsv_tarball: ${BUILD_DIR}/kg2-tsv-for-neo4j${test_suffix}.tar.gz +kg2_tsv_tarball: ${BUILD_DIR}/kg2-tsv-for-neo4j${version_suffix}${test_suffix}.tar.gz tsv_placeholder: ${BUILD_DIR}/tsv_placeholder.empty finish_script: ${BUILD_CODE_DIR}/finish-snakemake.sh diff --git a/master-config.shinc b/master-config.shinc index c6e5de45..97fb5956 100644 --- a/master-config.shinc +++ b/master-config.shinc @@ -30,4 +30,4 @@ ont_load_inventory_file=${MAPS_CODE_DIR}/ont-load-inventory${test_suffix}.yaml rtx_config_file=RTXConfiguration-config.json biolink_model_version=4.2.0 infores_registry_version=0.2.8 -version= \ No newline at end of file +kg2_version= \ No newline at end of file From f0aee45706b445b5561777181d1df4b406c83899 Mon Sep 17 00:00:00 2001 From: ecwood Date: Wed, 17 Jul 2024 14:58:01 -0700 Subject: [PATCH 028/125] #140 on the neo4j side --- neo4j/tsv-to-neo4j.sh | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/neo4j/tsv-to-neo4j.sh b/neo4j/tsv-to-neo4j.sh index 4371b910..2fd03361 100755 --- a/neo4j/tsv-to-neo4j.sh +++ b/neo4j/tsv-to-neo4j.sh @@ -53,8 +53,13 @@ rm -f ${tsv_tarball} rm -r -f ${tsv_dir} mkdir -p ${tsv_dir} +# get the latest KG2 version +kg2_version_file="kg2-version.txt" +${s3_cp_cmd} s3://${s3_bucket}/${kg2_version_file} ${BUILD_DIR}/${kg2_version_file} +kg2_version=`cat ${BUILD_DIR}/${kg2_version_file}` + # download the latest TSV files from the S3 Bucket -${s3_cp_cmd} s3://${s3_bucket}/kg2-tsv-for-neo4j${test_arg}.tar.gz ${tsv_tarball} +${s3_cp_cmd} s3://${s3_bucket}/kg2-tsv-for-neo4j-${kg2_version}${test_arg}.tar.gz ${tsv_tarball} # unpack the TSV tarball tar -xvzf ${tsv_tarball} -C ${tsv_dir} From 07c854959da9a5fd6fcf2e299ee0354a1535b4ce Mon Sep 17 00:00:00 2001 From: ecwood Date: Wed, 17 Jul 2024 15:01:34 -0700 Subject: [PATCH 029/125] #140 adding the name to the name of other build artifacts as well --- build/snakemake-config-var.yaml | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/build/snakemake-config-var.yaml b/build/snakemake-config-var.yaml index 72164364..c811da79 100644 --- a/build/snakemake-config-var.yaml +++ b/build/snakemake-config-var.yaml @@ -256,32 +256,32 @@ merge_base: merge_graphs merge_script: ${PROCESS_CODE_DIR}/${merge_base}.py merged_output_base: kg2-merged merge_log: 
${BUILD_DIR}/${merge_base}${version_suffix}${test_suffix}.log -merged_output_nodes_file: ${BUILD_DIR}/${merged_output_base}${nodes_suffix}${test_suffix}.jsonl -merged_output_edges_file: ${BUILD_DIR}/${merged_output_base}${edges_suffix}${test_suffix}.jsonl -output_file_orphan_edges: ${BUILD_DIR}/kg2-orphan${edges_suffix}${test_suffix}.jsonl +merged_output_nodes_file: ${BUILD_DIR}/${merged_output_base}${version_suffix}${nodes_suffix}${test_suffix}.jsonl +merged_output_edges_file: ${BUILD_DIR}/${merged_output_base}${version_suffix}${edges_suffix}${test_suffix}.jsonl +output_file_orphan_edges: ${BUILD_DIR}/kg2-orphan${edges_suffix}${version_suffix}${test_suffix}.jsonl simplify_base: run-simplify simplified_output_base: kg2-simplified simplify_script: ${PROCESS_CODE_DIR}/${simplify_base}.sh simplify_log: ${BUILD_DIR}/${simplify_base}${version_suffix}${test_suffix}.log -simplified_output_nodes_file: ${BUILD_DIR}/${simplified_output_base}${nodes_suffix}${test_suffix}.jsonl -simplified_output_edges_file: ${BUILD_DIR}/${simplified_output_base}${edges_suffix}${test_suffix}.jsonl +simplified_output_nodes_file: ${BUILD_DIR}/${simplified_output_base}${version_suffix}${nodes_suffix}${test_suffix}.jsonl +simplified_output_edges_file: ${BUILD_DIR}/${simplified_output_base}${version_suffix}${edges_suffix}${test_suffix}.jsonl report_base: report_stats_on_kg_jsonl report_script: ${PROCESS_CODE_DIR}/${report_base}.py report_log: ${BUILD_DIR}/${report_base}${version_suffix}${test_suffix}.log -report_file: ${BUILD_DIR}/kg2-report${test_suffix}.json +report_file: ${BUILD_DIR}/kg2-report${version_suffix}${test_suffix}.json simplified_report_log: ${BUILD_DIR}/${report_base}-simplified${version_suffix}${test_suffix}.log -simplified_report_file_base: kg2-simplified-report${test_suffix}.json +simplified_report_file_base: kg2-simplified-report${version_suffix}${test_suffix}.json simplified_report_file: ${BUILD_DIR}/${simplified_report_file_base} slim_base: slim_kg2 slim_output_base: kg2-slim slim_script: ${PROCESS_CODE_DIR}/${slim_base}.py slim_log: ${BUILD_DIR}/${slim_base}${version_suffix}${test_suffix}.log -slim_output_nodes_file: ${BUILD_DIR}/${slim_output_base}${nodes_suffix}${test_suffix}.jsonl -slim_output_edges_file: ${BUILD_DIR}/${slim_output_base}${edges_suffix}${test_suffix}.jsonl +slim_output_nodes_file: ${BUILD_DIR}/${slim_output_base}${version_suffix}${nodes_suffix}${test_suffix}.jsonl +slim_output_edges_file: ${BUILD_DIR}/${slim_output_base}${version_suffix}${edges_suffix}${test_suffix}.jsonl tsv_base: kg_json_to_tsv tsv_script: ${PROCESS_CODE_DIR}/${tsv_base}.py From 184fa6ec204cfb488e0e9890c3eb37f9a78c008b Mon Sep 17 00:00:00 2001 From: ecwood Date: Wed, 17 Jul 2024 15:09:18 -0700 Subject: [PATCH 030/125] #140 made sure its only defined once --- build/Snakefile-post-etl | 2 +- build/build-kg2-snakemake.sh | 10 ++++------ build/snakemake-config-var.yaml | 2 -- master-config.shinc | 2 ++ neo4j/tsv-to-neo4j.sh | 5 ++--- process/run-simplify.sh | 7 +++---- 6 files changed, 12 insertions(+), 16 deletions(-) diff --git a/build/Snakefile-post-etl b/build/Snakefile-post-etl index e6de67ee..62ccffd7 100644 --- a/build/Snakefile-post-etl +++ b/build/Snakefile-post-etl @@ -126,7 +126,7 @@ rule Simplify: log: config['SIMPLIFY_LOG'] shell: - "bash -x {input.code} {input.nodes} {input.edges} {output.nodes} {output.edges} " + config['VERSION_FILE'] + " " + config['TEST_FLAG'] + " > {log} 2>&1" + "bash -x {input.code} {input.nodes} {input.edges} {output.nodes} {output.edges} " + config['TEST_FLAG'] + " > {log} 2>&1" 
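# (Annotation: why the Simplify rule's shell command shrinks in the hunk above.
# With #140, run-simplify.sh stops taking a version-file positional argument and
# instead reads the path that master-config.shinc now defines once; see the
# master-config.shinc, tsv-to-neo4j.sh, and run-simplify.sh hunks later in this
# patch. A minimal sketch of the shared resolution, with the BUILD_DIR value
# assumed for illustration:)
#
#   kg2_version_file=version.txt
#   kg2_version_file_local=${BUILD_DIR}/${kg2_version_file}   # e.g. ~/kg2-build/version.txt
#   kg2_version=`cat ${kg2_version_file_local}`
#
# Any script that sources master-config.shinc can read the same KG2 version,
# so it no longer needs to be threaded through the Snakemake shell commands.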
rule Slim: input: diff --git a/build/build-kg2-snakemake.sh b/build/build-kg2-snakemake.sh index 6d313846..96d00935 100755 --- a/build/build-kg2-snakemake.sh +++ b/build/build-kg2-snakemake.sh @@ -64,8 +64,6 @@ then run_flag="-F" fi -kg2_version_file="kg2-version.txt" -local_kg2_version_file="${BUILD_DIR}/${kg2_version_file}" trigger_file_is_major_release=${BUILD_DIR}/major-release trigger_file_is_minor_release=${BUILD_DIR}/minor-release @@ -89,14 +87,14 @@ if [[ "${ci_flag}" == "ci" ]] then sed -i "\@^kg2_version=@ckg2_version=KG2.CI" ${CODE_DIR}/master-config.shinc else - ${s3_cp_cmd} s3://${s3_bucket_public}/${kg2_version_file} ${local_kg2_version_file} + ${s3_cp_cmd} s3://${s3_bucket_public}/${kg2_version_file} ${kg2_version_file_local} if [[ "${increment_flag}" != '' ]] then - ${VENV_DIR}/bin/python3 ${PROCESS_CODE_DIR}/update_version.py ${increment_flag} ${local_kg2_version_file} + ${VENV_DIR}/bin/python3 ${PROCESS_CODE_DIR}/update_version.py ${increment_flag} ${kg2_version_file_local} else echo "*** TEST MODE -- NO INCREMENT ***" fi - curr_kg2_version=`cat ${local_kg2_version_file}` + curr_kg2_version=`cat ${kg2_version_file_local}` sed -i "\@^kg2_version=@ckg2_version=${curr_kg2_version}" ${CODE_DIR}/master-config.shinc fi @@ -156,7 +154,7 @@ cd ~ && ${VENV_DIR}/bin/snakemake --snakefile ${snakefile} ${run_flag} -R Finish if [[ "${ci_flag}" != "ci" ]] then - ${s3_cp_cmd} ${local_kg2_version_file} s3://${s3_bucket_public}/${kg2_version_file} + ${s3_cp_cmd} ${kg2_version_file_local} s3://${s3_bucket_public}/${kg2_version_file} fi if [[ -f ${trigger_file_is_major_release} ]] diff --git a/build/snakemake-config-var.yaml b/build/snakemake-config-var.yaml index c811da79..a751dc4c 100644 --- a/build/snakemake-config-var.yaml +++ b/build/snakemake-config-var.yaml @@ -291,5 +291,3 @@ kg2_tsv_tarball: ${BUILD_DIR}/kg2-tsv-for-neo4j${version_suffix}${test_suffix}.t tsv_placeholder: ${BUILD_DIR}/tsv_placeholder.empty finish_script: ${BUILD_CODE_DIR}/finish-snakemake.sh - -version_file: ${BUILD_DIR}/kg2-version.txt diff --git a/master-config.shinc b/master-config.shinc index 97fb5956..ac6dd9e5 100644 --- a/master-config.shinc +++ b/master-config.shinc @@ -30,4 +30,6 @@ ont_load_inventory_file=${MAPS_CODE_DIR}/ont-load-inventory${test_suffix}.yaml rtx_config_file=RTXConfiguration-config.json biolink_model_version=4.2.0 infores_registry_version=0.2.8 +kg2_version_file=version.txt +kg2_version_file_local=${BUILD_DIR}/${kg2_version_file} kg2_version= \ No newline at end of file diff --git a/neo4j/tsv-to-neo4j.sh b/neo4j/tsv-to-neo4j.sh index 2fd03361..07c2f692 100755 --- a/neo4j/tsv-to-neo4j.sh +++ b/neo4j/tsv-to-neo4j.sh @@ -54,9 +54,8 @@ rm -r -f ${tsv_dir} mkdir -p ${tsv_dir} # get the latest KG2 version -kg2_version_file="kg2-version.txt" -${s3_cp_cmd} s3://${s3_bucket}/${kg2_version_file} ${BUILD_DIR}/${kg2_version_file} -kg2_version=`cat ${BUILD_DIR}/${kg2_version_file}` +${s3_cp_cmd} s3://${s3_bucket}/${kg2_version_file} ${kg2_version_file_local} +kg2_version=`cat ${kg2_version_file_local}` # download the latest TSV files from the S3 Bucket ${s3_cp_cmd} s3://${s3_bucket}/kg2-tsv-for-neo4j-${kg2_version}${test_arg}.tar.gz ${tsv_tarball} diff --git a/process/run-simplify.sh b/process/run-simplify.sh index 6a8951b7..4a033273 100755 --- a/process/run-simplify.sh +++ b/process/run-simplify.sh @@ -10,7 +10,7 @@ if [[ "${1:-}" == "--help" || "${1:-}" == "-h" ]]; then exit 2 fi -# Usage: run-simplify.sh [version_filename] [test] +# Usage: run-simplify.sh [test] echo "================= starting 
run-simplify.sh ==================" date @@ -22,15 +22,14 @@ input_nodes_json=${1:-} input_edges_json=${2:-} output_nodes_json=${3:-} output_edges_json=${4:-} -local_version_filename=${5:-"${BUILD_DIR}/kg2-version.txt"} -build_flag=${6:-""} +build_flag=${5:-""} # TODO: Inhibits and increase are not in biolink model anymore - Find out what that should be now ${VENV_DIR}/bin/python3 -u ${PROCESS_CODE_DIR}/filter_kg_and_remap_predicates.py ${test_flag} --dropNegated \ --dropSelfEdgesExcept interacts_with,regulates,inhibits,increase \ ${predicate_mapping_file} ${infores_mapping_file} ${curies_to_urls_file} \ ${knowledge_level_agent_type_mapping_file} ${input_nodes_json} ${input_edges_json} \ - ${output_nodes_json} ${output_edges_json} ${local_version_filename} + ${output_nodes_json} ${output_edges_json} ${kg2_version_file_local} date echo "================= finishing run-simplify.sh ==================" From 49d75edba30e75a19c693b06365f2948de77a413 Mon Sep 17 00:00:00 2001 From: ecwood Date: Wed, 17 Jul 2024 15:19:26 -0700 Subject: [PATCH 031/125] #393 remove RepoDB from the build system --- build/Snakefile-conversion | 13 ----- build/Snakefile-extraction | 11 ---- build/Snakefile-post-etl | 4 -- build/snakemake-config-var.yaml | 12 ---- .../{ => archive}/repodb_csv_to_kg_jsonl.py | 0 extract/{ => archive}/extract-repodb.sh | 0 maps/curies-to-urls-map.yaml | 2 - ...g2-provided-by-curie-to-infores-curie.yaml | 4 -- maps/knowledge-level-agent-type-map.yaml | 4 -- maps/predicate-remap.yaml | 57 ------------------- 10 files changed, 107 deletions(-) rename convert/{ => archive}/repodb_csv_to_kg_jsonl.py (100%) rename extract/{ => archive}/extract-repodb.sh (100%) diff --git a/build/Snakefile-conversion b/build/Snakefile-conversion index ae80d765..6754be04 100644 --- a/build/Snakefile-conversion +++ b/build/Snakefile-conversion @@ -120,19 +120,6 @@ rule DGIdb_Conversion: shell: config['PYTHON_COMMAND'] + " {input.code} {input.real} {output.nodes} {output.edges} " + config['TEST_ARG'] + " > {log} 2>&1" -rule RepoDB_Conversion: - input: - code = config['REPODB_CONVERSION_SCRIPT'], - real = config['REPODB_INPUT_FILE'], - validation = config['VALIDATION_PLACEHOLDER'] - output: - nodes = config['REPODB_OUTPUT_NODES_FILE'], - edges = config['REPODB_OUTPUT_EDGES_FILE'] - log: - config['REPODB_CONVERSION_LOG'] - shell: - config['PYTHON_COMMAND'] + " {input.code} {input.real} {output.nodes} {output.edges} " + config['TEST_ARG'] + " > {log} 2>&1" - rule DrugBank_Conversion: input: code = config['DRUGBANK_CONVERSION_SCRIPT'], diff --git a/build/Snakefile-extraction b/build/Snakefile-extraction index ac5e19b1..c23d0ef0 100644 --- a/build/Snakefile-extraction +++ b/build/Snakefile-extraction @@ -88,17 +88,6 @@ rule DGIdb: shell: "bash -x {input.code} " + config['DGIDB_DIR'] + " > {log} 2>&1" -rule RepoDB: - input: - code = config['REPODB_EXTRACTION_SCRIPT'], - validation = config['VALIDATION_PLACEHOLDER'] - output: - config['REPODB_INPUT_FILE'] - log: - config['REPODB_EXTRACTION_LOG'] - shell: - "bash -x {input.code} " + config['REPODB_DIR'] + " > {log} 2>&1" - rule DrugBank: input: code = config['DRUGBANK_EXTRACTION_SCRIPT'], diff --git a/build/Snakefile-post-etl b/build/Snakefile-post-etl index 62ccffd7..eeb1a44d 100644 --- a/build/Snakefile-post-etl +++ b/build/Snakefile-post-etl @@ -19,8 +19,6 @@ rule Merge: ncbigene_edges = config['NCBIGENE_OUTPUT_EDGES_FILE'], dgidb_nodes = config['DGIDB_OUTPUT_NODES_FILE'], dgidb_edges = config['DGIDB_OUTPUT_EDGES_FILE'], - repodb_nodes = 
config['REPODB_OUTPUT_NODES_FILE'], - repodb_edges = config['REPODB_OUTPUT_EDGES_FILE'], drugbank_nodes = config['DRUGBANK_OUTPUT_NODES_FILE'], drugbank_edges = config['DRUGBANK_OUTPUT_EDGES_FILE'], smpdb_nodes = config['SMPDB_OUTPUT_NODES_FILE'], @@ -66,7 +64,6 @@ rule Merge: "{input.chembl_nodes} " + \ "{input.ncbigene_nodes} " + \ "{input.dgidb_nodes} " + \ - "{input.repodb_nodes} " + \ "{input.smpdb_nodes} " + \ "{input.drugbank_nodes} " + \ "{input.hmdb_nodes} " + \ @@ -89,7 +86,6 @@ rule Merge: "{input.chembl_edges} " + \ "{input.ncbigene_edges} " + \ "{input.dgidb_edges} " + \ - "{input.repodb_edges} " + \ "{input.smpdb_edges} " + \ "{input.drugbank_edges} " + \ "{input.hmdb_edges} " + \ diff --git a/build/snakemake-config-var.yaml b/build/snakemake-config-var.yaml index a751dc4c..209b3659 100644 --- a/build/snakemake-config-var.yaml +++ b/build/snakemake-config-var.yaml @@ -106,18 +106,6 @@ dgidb_conversion_log: ${BUILD_DIR}/${dgidb_conversion_base}${version_suffix}${te dgidb_output_nodes_file: ${BUILD_DIR}/${dgidb_output_base}${nodes_suffix}${test_suffix}.jsonl dgidb_output_edges_file: ${BUILD_DIR}/${dgidb_output_base}${edges_suffix}${test_suffix}.jsonl -repodb_extraction_base: extract-repodb -repodb_conversion_base: repodb_csv_to_kg_jsonl -repodb_output_base: kg2-repodb -repodb_extraction_script: ${EXTRACT_CODE_DIR}/${repodb_extraction_base}.sh -repodb_extraction_log: ${BUILD_DIR}/${repodb_extraction_base}${version_suffix}${test_suffix}.log -repodb_dir: ${BUILD_DIR}/repodb -repodb_input_file: ${repodb_dir}/repodb.csv -repodb_conversion_script: ${CONVERT_CODE_DIR}/${repodb_conversion_base}.py -repodb_conversion_log: ${BUILD_DIR}/${repodb_conversion_base}${version_suffix}${test_suffix}.log -repodb_output_nodes_file: ${BUILD_DIR}/${repodb_output_base}${nodes_suffix}${test_suffix}.jsonl -repodb_output_edges_file: ${BUILD_DIR}/${repodb_output_base}${edges_suffix}${test_suffix}.jsonl - drugbank_extraction_base: extract-drugbank drugbank_conversion_base: drugbank_xml_to_kg_jsonl drugbank_output_base: kg2-drugbank diff --git a/convert/repodb_csv_to_kg_jsonl.py b/convert/archive/repodb_csv_to_kg_jsonl.py similarity index 100% rename from convert/repodb_csv_to_kg_jsonl.py rename to convert/archive/repodb_csv_to_kg_jsonl.py diff --git a/extract/extract-repodb.sh b/extract/archive/extract-repodb.sh similarity index 100% rename from extract/extract-repodb.sh rename to extract/archive/extract-repodb.sh diff --git a/maps/curies-to-urls-map.yaml b/maps/curies-to-urls-map.yaml index fe4192b8..b9c2af1c 100644 --- a/maps/curies-to-urls-map.yaml +++ b/maps/curies-to-urls-map.yaml @@ -427,8 +427,6 @@ use_for_bidirectional_mapping: rdfs: http://www.w3.org/2000/01/rdf-schema# - REACT: "https://identifiers.org/reactome:" - - - REPODB: http://apps.chiragjpgroup.org/repoDB/ - RGD: "https://identifiers.org/rgd:" - diff --git a/maps/kg2-provided-by-curie-to-infores-curie.yaml b/maps/kg2-provided-by-curie-to-infores-curie.yaml index efb2d639..718cbd11 100644 --- a/maps/kg2-provided-by-curie-to-infores-curie.yaml +++ b/maps/kg2-provided-by-curie-to-infores-curie.yaml @@ -134,10 +134,6 @@ OBO:uberon.owl: source_name: PathWhiz infores_curie: infores:pathwhiz knowledge_type: primary_knowledge_source -'REPODB:': - source_name: Drug Repositioning Database - infores_curie: infores:repodb - knowledge_type: knowledge_source 'RTX:': source_name: RTX KG2 infores_curie: infores:rtx-kg2 diff --git a/maps/knowledge-level-agent-type-map.yaml b/maps/knowledge-level-agent-type-map.yaml index db99215a..85bf161a 100644 --- 
a/maps/knowledge-level-agent-type-map.yaml +++ b/maps/knowledge-level-agent-type-map.yaml @@ -222,10 +222,6 @@ infores:reactome: agent_type: manual_agent knowledge_level: knowledge_assertion reference: https://en.wikipedia.org/wiki/Reactome -infores:repodb: - agent_type: automated_agent - knowledge_level: knowledge_assertion - reference: https://www.nature.com/articles/sdata201729 infores:ro: agent_type: manual_agent knowledge_level: knowledge_assertion diff --git a/maps/predicate-remap.yaml b/maps/predicate-remap.yaml index fbff26a8..1e08b978 100644 --- a/maps/predicate-remap.yaml +++ b/maps/predicate-remap.yaml @@ -3367,63 +3367,6 @@ REACT:positively_regulates_gene_expression: REACT:related_to: operation: keep core_predicate: biolink:related_to -REPODB:clinically_tested_approved_unknown_phase: - operation: keep - core_predicate: biolink:drug_regulatory_status_world_wide -REPODB:clinically_tested_suspended_phase_0: - operation: keep - core_predicate: biolink:drug_regulatory_status_world_wide -REPODB:clinically_tested_suspended_phase_1: - operation: keep - core_predicate: biolink:drug_regulatory_status_world_wide -REPODB:clinically_tested_suspended_phase_1_or_phase_2: - operation: keep - core_predicate: biolink:drug_regulatory_status_world_wide -REPODB:clinically_tested_suspended_phase_2: - operation: keep - core_predicate: biolink:drug_regulatory_status_world_wide -REPODB:clinically_tested_suspended_phase_2_or_phase_3: - operation: keep - core_predicate: biolink:drug_regulatory_status_world_wide -REPODB:clinically_tested_suspended_phase_3: - operation: keep - core_predicate: biolink:drug_regulatory_status_world_wide -REPODB:clinically_tested_terminated_phase_0: - operation: keep - core_predicate: biolink:drug_regulatory_status_world_wide -REPODB:clinically_tested_terminated_phase_1: - operation: keep - core_predicate: biolink:drug_regulatory_status_world_wide -REPODB:clinically_tested_terminated_phase_1_or_phase_2: - operation: keep - core_predicate: biolink:drug_regulatory_status_world_wide -REPODB:clinically_tested_terminated_phase_2: - operation: keep - core_predicate: biolink:drug_regulatory_status_world_wide -REPODB:clinically_tested_terminated_phase_2_or_phase_3: - operation: keep - core_predicate: biolink:drug_regulatory_status_world_wide -REPODB:clinically_tested_terminated_phase_3: - operation: keep - core_predicate: biolink:drug_regulatory_status_world_wide -REPODB:clinically_tested_withdrawn_phase_0: - operation: keep - core_predicate: biolink:drug_regulatory_status_world_wide -REPODB:clinically_tested_withdrawn_phase_1: - operation: keep - core_predicate: biolink:drug_regulatory_status_world_wide -REPODB:clinically_tested_withdrawn_phase_1_or_phase_2: - operation: keep - core_predicate: biolink:drug_regulatory_status_world_wide -REPODB:clinically_tested_withdrawn_phase_2: - operation: keep - core_predicate: biolink:drug_regulatory_status_world_wide -REPODB:clinically_tested_withdrawn_phase_2_or_phase_3: - operation: keep - core_predicate: biolink:drug_regulatory_status_world_wide -REPODB:clinically_tested_withdrawn_phase_3: - operation: keep - core_predicate: biolink:drug_regulatory_status_world_wide RO:0000052: operation: keep core_predicate: biolink:related_to From ce8d8de7ff9324332063db753090d4d5375c83ef Mon Sep 17 00:00:00 2001 From: ecwood Date: Wed, 17 Jul 2024 15:45:29 -0700 Subject: [PATCH 032/125] #393 have to remove info from kg2_util as well --- kg2_util.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/kg2_util.py b/kg2_util.py index e61c6cba..a5aa0971 100644 
--- a/kg2_util.py +++ b/kg2_util.py @@ -112,7 +112,6 @@ CURIE_PREFIX_RDF = 'rdf' CURIE_PREFIX_RDFS = 'rdfs' CURIE_PREFIX_REACTOME='REACT' -CURIE_PREFIX_REPODB = 'REPODB' CURIE_PREFIX_RHEA = 'RHEA' CURIE_PREFIX_RHEA_COMP = 'RHEA.COMP' CURIE_PREFIX_RO = 'RO' @@ -175,7 +174,6 @@ BASE_URL_PATHWHIZ_BOUND = 'https://pathbank.org/lims#/bounds/' BASE_URL_PMID = "http://www.ncbi.nlm.nih.gov/pubmed/" BASE_URL_REACTOME = BASE_BASE_URL_IDENTIFIERS_ORG + 'reactome:' -BASE_URL_REPODB = 'http://apps.chiragjpgroup.org/repoDB/' BASE_URL_RTX = 'http://rtx.ai/identifiers#' BASE_URL_SEMMEDDB = 'https://skr3.nlm.nih.gov/SemMedDB' BASE_URL_SMPDB = BASE_BASE_URL_IDENTIFIERS_ORG + 'smpdb:' From 6bdb5a6ed0429caeda31ae6387cbb74668eb89c7 Mon Sep 17 00:00:00 2001 From: ecwood Date: Wed, 17 Jul 2024 16:23:28 -0700 Subject: [PATCH 033/125] #400 first pass at this --- maps/curies-to-urls-map.yaml | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/maps/curies-to-urls-map.yaml b/maps/curies-to-urls-map.yaml index b9c2af1c..468c842e 100644 --- a/maps/curies-to-urls-map.yaml +++ b/maps/curies-to-urls-map.yaml @@ -378,9 +378,7 @@ use_for_bidirectional_mapping: - OPL: http://purl.obolibrary.org/obo/OPL_ - - orphanet: 'http://www.orpha.net/ORDO/Orphanet_' - - - ORPHANET: http://purl.bioontology.org/ontology/ORDO/ + orphanet: http://purl.bioontology.org/ontology/ORDO/ - owl: http://www.w3.org/2002/07/owl# - @@ -665,10 +663,10 @@ use_for_contraction_only: OMIM: http://identifiers.org/omim/ - OMOP: http://purl.obolibrary.org/obo/COHD_ - # - - # ORPHANET: http://www.orpha.net/ORDO/Orphanet_ - - ORPHANET: https://data.bioontology.org/ontologies/ORDO/submissions/27/download?apikey=8b5b7825-538d-40e0-9e9e-5ab9274a9aeb + orphanet: http://www.orpha.net/ORDO/Orphanet_ + - + orphanet: https://data.bioontology.org/ontologies/ORDO/submissions/27/download?apikey=8b5b7825-538d-40e0-9e9e-5ab9274a9aeb - PATO: http://purl.obolibrary.org/obo/pato# - From 31cb00a5211c837e53e028e52ec3df4ac78ed6f0 Mon Sep 17 00:00:00 2001 From: ecwood Date: Wed, 17 Jul 2024 16:26:58 -0700 Subject: [PATCH 034/125] #400 try this instead --- maps/curies-to-urls-map.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/maps/curies-to-urls-map.yaml b/maps/curies-to-urls-map.yaml index 468c842e..48d4956c 100644 --- a/maps/curies-to-urls-map.yaml +++ b/maps/curies-to-urls-map.yaml @@ -378,7 +378,7 @@ use_for_bidirectional_mapping: - OPL: http://purl.obolibrary.org/obo/OPL_ - - orphanet: http://purl.bioontology.org/ontology/ORDO/ + orphanet: http://www.orpha.net/ORDO/Orphanet_ - owl: http://www.w3.org/2002/07/owl# - @@ -664,7 +664,7 @@ use_for_contraction_only: - OMOP: http://purl.obolibrary.org/obo/COHD_ - - orphanet: http://www.orpha.net/ORDO/Orphanet_ + orphanet: http://purl.bioontology.org/ontology/ORDO/ - orphanet: https://data.bioontology.org/ontologies/ORDO/submissions/27/download?apikey=8b5b7825-538d-40e0-9e9e-5ab9274a9aeb - From 475066576832dc62bf5c27c7be023ad34903188e Mon Sep 17 00:00:00 2001 From: ecwood Date: Wed, 17 Jul 2024 16:32:14 -0700 Subject: [PATCH 035/125] #400 handling the expansion map (hopefully) --- maps/curies-to-urls-map.yaml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/maps/curies-to-urls-map.yaml b/maps/curies-to-urls-map.yaml index 48d4956c..0e4ba4cb 100644 --- a/maps/curies-to-urls-map.yaml +++ b/maps/curies-to-urls-map.yaml @@ -728,7 +728,9 @@ use_for_expansion_only: - FlyBase: https://flybase.org/reports/ - - Orphanet: http://purl.bioontology.org/ontology/ORDO/ + 
Orphanet: http://www.orpha.net/ORDO/Orphanet_ + - + ORPHANET: http://www.orpha.net/ORDO/Orphanet_ - oboInOwl: http://www.geneontology.org/formats/oboInOwl# - From 89591c3a07608be8bdaa73f75a3e0637cf20a5c3 Mon Sep 17 00:00:00 2001 From: ecwood Date: Mon, 22 Jul 2024 11:42:44 -0700 Subject: [PATCH 036/125] #392 initial edge blocklist (no synonyms yet) --- maps/edge-blocklist.yaml | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 maps/edge-blocklist.yaml diff --git a/maps/edge-blocklist.yaml b/maps/edge-blocklist.yaml new file mode 100644 index 00000000..f40dca64 --- /dev/null +++ b/maps/edge-blocklist.yaml @@ -0,0 +1,32 @@ +- + subject_name: Vaccines + subject_ids: + - UMLS:C0042210 + predicate: biolink:causes + object_name: Autism + object_ids: + - UMLS:C0004352 +- + subject_name: Measles-Mumps-Rubella Vaccine + subject_ids: + - UMLS:C0065828 + predicate: biolink:causes + object_name: Autism + object_ids: + - UMLS:C0004352 +- + subject_name: Mercury + subject_ids: + - UMLS:C0025424 + predicate: biolink:causes + object_name: Autism + object_ids: + - UMLS:C0004352 +- + subject_name: Thimerosal + subject_ids: + - UMLS:C0039867 + predicate: biolink:causes + object_name: Autism + object_ids: + - UMLS:C0004352 From e33b320a7e8edf5895cd84479c61469d955237a4 Mon Sep 17 00:00:00 2001 From: ecwood Date: Mon, 22 Jul 2024 11:43:32 -0700 Subject: [PATCH 037/125] #387 grouping together xml blocks --- misc-tools/owlparser.py | 43 ++++++++++++++++++++++++++++------------- 1 file changed, 30 insertions(+), 13 deletions(-) diff --git a/misc-tools/owlparser.py b/misc-tools/owlparser.py index 0db4b0be..6006e409 100644 --- a/misc-tools/owlparser.py +++ b/misc-tools/owlparser.py @@ -2,8 +2,12 @@ import argparse COMMENT = "!--" +XML_TAG = "?xml" +RDF_TAG = "rdf:RDF" -LINE_TYPE_COMMENT = "comment" +OUTMOST_TAGS_SKIP = [XML_TAG, RDF_TAG] + +LINE_TYPE_IGNORE = "ignore" LINE_TYPE_START_NEST = "start nest" LINE_TYPE_START_NEST_WITH_ATTR = "start nest with attributes" LINE_TYPE_ENTRY = "entry" @@ -16,6 +20,8 @@ KEY_TEXT = "text" KEY_TYPE = "type" +IGNORED_ATTRIBUTES = ["xml:lang"] + def get_args(): arg_parser = argparse.ArgumentParser() arg_parser.add_argument('--test', dest='test', @@ -73,7 +79,8 @@ def convert_line(line): start_reading_attribute_tag = False start_reading_attribute_text = False start_reading_main = True - attributes[attribute_tag] = attribute_text.strip('/').strip('"') + if attribute_tag not in IGNORED_ATTRIBUTES: + attributes[attribute_tag] = attribute_text.strip('/').strip('"') attribute_tag = "" attribute_text = "" @@ -91,7 +98,8 @@ def convert_line(line): if letter == ' ' and start_reading_attribute_text: start_reading_attribute_tag = True start_reading_attribute_text = False - attributes[attribute_tag] = attribute_text.strip('/').strip('"') + if attribute_tag not in IGNORED_ATTRIBUTES: + attributes[attribute_tag] = attribute_text.strip('/').strip('"') attribute_tag = "" attribute_text = "" continue @@ -113,8 +121,8 @@ def convert_line(line): # Categorize the type of line line_type = str() out = dict() - if tag == COMMENT: - line_type = "comment" + if tag == COMMENT or tag in OUTMOST_TAGS_SKIP: + line_type = LINE_TYPE_IGNORE else: start_tag_exists = (tag != str()) attributes_exist = (attributes != dict()) @@ -154,7 +162,8 @@ def convert_line(line): def divide_into_lines(input_file_name): curr_str = "" - keys = set() + curr_nest = list() + curr_nest_tag = str() with open(input_file_name) as input_file: for line in input_file: @@ -174,21 +183,29 @@ def 
divide_into_lines(input_file_name): line_parsed = convert_line(curr_str) tag = line_parsed.get(KEY_TAG, None) + line_type = line_parsed.get(KEY_TYPE, None) attribute_keys = line_parsed.get(KEY_ATTRIBUTES, dict()).keys() - if tag is not None: - keys.add(tag) - for attribute_key in attribute_keys: - keys.add(attribute_key) - # print(json.dumps(convert_line(curr_str), indent=4)) + if curr_nest_tag == str(): + if line_type in [LINE_TYPE_START_NEST, LINE_TYPE_START_NEST_WITH_ATTR]: + curr_nest_tag = tag + curr_nest.append(line_parsed) + elif line_type != LINE_TYPE_IGNORE: + print(json.dumps(line_parsed, indent=4)) # replacement for processing right now + else: + if line_type == LINE_TYPE_END_NEST and curr_nest_tag == tag: + print(json.dumps(curr_nest, indent=4)) # replacement for processing right now + curr_nest = list() + curr_nest_tag = str() + else: + curr_nest.append(line_parsed) + curr_str = "" if curr_str != "": # divide lines by a space curr_str += ' ' - print(json.dumps(list(keys), indent=4)) - if __name__ == '__main__': args = get_args() From d7743bb3f96bc12cbbe63e856f425d22a02e507b Mon Sep 17 00:00:00 2001 From: ecwood Date: Mon, 22 Jul 2024 12:00:34 -0700 Subject: [PATCH 038/125] #392 autism synonyms --- maps/edge-blocklist.yaml | 103 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 100 insertions(+), 3 deletions(-) diff --git a/maps/edge-blocklist.yaml b/maps/edge-blocklist.yaml index f40dca64..67ec5623 100644 --- a/maps/edge-blocklist.yaml +++ b/maps/edge-blocklist.yaml @@ -5,28 +5,125 @@ predicate: biolink:causes object_name: Autism object_ids: + - CHV:0000001598 + - CHV:0000050438 + - DOID:0060041 + - DOID:12849 + - EFO:0003756 + - EFO:0003758 + - HP:0000717 + - HP:0000729 + - ICD9:299.0 + - MESH:D000067877 + - MESH:D001321 + - MONDO:0005258 + - MONDO:0005260 + - NCIT:C88412 + - NCIT:C97161 + - OMIM:209850 + - OMIM:MTHU004240 + - OMIM:MTHU038054 + - OMIM:MTHU043125 + - OMIM:MTHU043132 + - PSY:04850 + - PSY:04855 - UMLS:C0004352 -- + - UMLS:C0856975 + - UMLS:C1510586 + - UMLS:C1968924- subject_name: Measles-Mumps-Rubella Vaccine subject_ids: - UMLS:C0065828 predicate: biolink:causes object_name: Autism object_ids: + - CHV:0000001598 + - CHV:0000050438 + - DOID:0060041 + - DOID:12849 + - EFO:0003756 + - EFO:0003758 + - HP:0000717 + - HP:0000729 + - ICD9:299.0 + - MESH:D000067877 + - MESH:D001321 + - MONDO:0005258 + - MONDO:0005260 + - NCIT:C88412 + - NCIT:C97161 + - OMIM:209850 + - OMIM:MTHU004240 + - OMIM:MTHU038054 + - OMIM:MTHU043125 + - OMIM:MTHU043132 + - PSY:04850 + - PSY:04855 - UMLS:C0004352 -- + - UMLS:C0856975 + - UMLS:C1510586 + - UMLS:C1968924- subject_name: Mercury subject_ids: - UMLS:C0025424 predicate: biolink:causes object_name: Autism object_ids: + - CHV:0000001598 + - CHV:0000050438 + - DOID:0060041 + - DOID:12849 + - EFO:0003756 + - EFO:0003758 + - HP:0000717 + - HP:0000729 + - ICD9:299.0 + - MESH:D000067877 + - MESH:D001321 + - MONDO:0005258 + - MONDO:0005260 + - NCIT:C88412 + - NCIT:C97161 + - OMIM:209850 + - OMIM:MTHU004240 + - OMIM:MTHU038054 + - OMIM:MTHU043125 + - OMIM:MTHU043132 + - PSY:04850 + - PSY:04855 - UMLS:C0004352 -- + - UMLS:C0856975 + - UMLS:C1510586 + - UMLS:C1968924- subject_name: Thimerosal subject_ids: - UMLS:C0039867 predicate: biolink:causes object_name: Autism object_ids: + - CHV:0000001598 + - CHV:0000050438 + - DOID:0060041 + - DOID:12849 + - EFO:0003756 + - EFO:0003758 + - HP:0000717 + - HP:0000729 + - ICD9:299.0 + - MESH:D000067877 + - MESH:D001321 + - MONDO:0005258 + - MONDO:0005260 + - NCIT:C88412 + - NCIT:C97161 + - 
OMIM:209850 + - OMIM:MTHU004240 + - OMIM:MTHU038054 + - OMIM:MTHU043125 + - OMIM:MTHU043132 + - PSY:04850 + - PSY:04855 - UMLS:C0004352 + - UMLS:C0856975 + - UMLS:C1510586 + - UMLS:C1968924 \ No newline at end of file From 7c53a604774289c94eee779f2a627dc77bc81afa Mon Sep 17 00:00:00 2001 From: ecwood Date: Mon, 22 Jul 2024 16:15:46 -0700 Subject: [PATCH 039/125] #392 full edge blocklist --- maps/edge-blocklist.yaml | 35 ++++++++++++++++++++++++++++++++--- 1 file changed, 32 insertions(+), 3 deletions(-) diff --git a/maps/edge-blocklist.yaml b/maps/edge-blocklist.yaml index 67ec5623..bb64c0be 100644 --- a/maps/edge-blocklist.yaml +++ b/maps/edge-blocklist.yaml @@ -1,7 +1,10 @@ - subject_name: Vaccines subject_ids: + - ATC:J07 + - MESH:D014612 - UMLS:C0042210 + - VANDF:4021642 predicate: biolink:causes object_name: Autism object_ids: @@ -30,9 +33,13 @@ - UMLS:C0004352 - UMLS:C0856975 - UMLS:C1510586 - - UMLS:C1968924- + - UMLS:C1968924 +- subject_name: Measles-Mumps-Rubella Vaccine subject_ids: + - MESH:D022542 + - NCIT:C96403 + - PDQ:CDR0000702931 - UMLS:C0065828 predicate: biolink:causes object_name: Autism @@ -62,10 +69,19 @@ - UMLS:C0004352 - UMLS:C0856975 - UMLS:C1510586 - - UMLS:C1968924- + - UMLS:C1968924 +- subject_name: Mercury subject_ids: + - CHEBI:16170 + - CHEMBL.TARGET:CHEMBL2363061 + - KEGG.COMPOUND:C01319 + - MESH:D008628 + - NCIT:C66842 + - NCIT:C68270 + - RXNORM:6769 - UMLS:C0025424 + - VANDF:4025953 predicate: biolink:causes object_name: Autism object_ids: @@ -94,10 +110,23 @@ - UMLS:C0004352 - UMLS:C0856975 - UMLS:C1510586 - - UMLS:C1968924- + - UMLS:C1968924 +- subject_name: Thimerosal subject_ids: + - ATC:D08AK06 + - CHEBI:9546 + - CHEMBL.COMPOUND:CHEMBL508338 + - CHV:0000012180 + - DRUGBANK:DB11590 + - DrugCentral:4733 + - KEGG.DRUG:D00864 + - MESH:D013849 + - NCIT:C47751 + - NDDF:003125 + - RXNORM:10472 - UMLS:C0039867 + - VANDF:4017480 predicate: biolink:causes object_name: Autism object_ids: From f5c72743f55d8c20f50a47d8da888df00b7679c0 Mon Sep 17 00:00:00 2001 From: ecwood Date: Wed, 24 Jul 2024 19:19:24 -0700 Subject: [PATCH 040/125] #387 parses it into little dictionaries (generically) --- misc-tools/owlparser.py | 43 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/misc-tools/owlparser.py b/misc-tools/owlparser.py index 6006e409..c0f7f602 100644 --- a/misc-tools/owlparser.py +++ b/misc-tools/owlparser.py @@ -160,6 +160,46 @@ def convert_line(line): return out +def convert_nest(nest, index, working_dict): + if index >= len(nest): + return working_dict + + element = nest[index] + line_type = element[KEY_TYPE] + line_tag = element[KEY_TAG] + line_text = element.get(KEY_TEXT, None) + line_attributes = element.get(KEY_ATTRIBUTES, None) + + if line_type in [LINE_TYPE_START_NEST, LINE_TYPE_START_NEST_WITH_ATTR]: + working_dict[line_tag] = dict() + + converted_nest = convert_nest(nest, index + 1, dict()) + working_dict[line_tag] = converted_nest + + if line_type == LINE_TYPE_START_NEST_WITH_ATTR: + working_dict[line_tag][KEY_ATTRIBUTES] = line_attributes + + if line_type in [LINE_TYPE_ENTRY, LINE_TYPE_ENTRY_WITH_ATTR, LINE_TYPE_ENTRY_ONLY_ATTR]: + if line_tag not in working_dict: + working_dict[line_tag] = list() + + curr_dict = dict() + + if line_text is not None: + curr_dict[KEY_TEXT] = line_text + + if line_attributes is not None: + for attribute in line_attributes: + curr_dict[attribute] = line_attributes[attribute] + + working_dict[line_tag].append(curr_dict) + + convert_nest(nest, index + 1, working_dict) + + return 
working_dict + + + def divide_into_lines(input_file_name): curr_str = "" curr_nest = list() @@ -191,10 +231,13 @@ def divide_into_lines(input_file_name): curr_nest_tag = tag curr_nest.append(line_parsed) elif line_type != LINE_TYPE_IGNORE: + print("THIS VERSION") print(json.dumps(line_parsed, indent=4)) # replacement for processing right now else: if line_type == LINE_TYPE_END_NEST and curr_nest_tag == tag: print(json.dumps(curr_nest, indent=4)) # replacement for processing right now + nest_dict = convert_nest(curr_nest, 0, dict()) + print(json.dumps(nest_dict, indent=4)) curr_nest = list() curr_nest_tag = str() else: From 6db935f25c105c37d2d67f39fbefd07879080aee Mon Sep 17 00:00:00 2001 From: ecwood Date: Sat, 27 Jul 2024 00:35:54 -0700 Subject: [PATCH 041/125] #387 corrected some bugs with the XML parsing --- misc-tools/owlparser.py | 94 ++++++++++++++++++++++------------------- 1 file changed, 51 insertions(+), 43 deletions(-) diff --git a/misc-tools/owlparser.py b/misc-tools/owlparser.py index c0f7f602..6593d0cd 100644 --- a/misc-tools/owlparser.py +++ b/misc-tools/owlparser.py @@ -17,7 +17,7 @@ KEY_TAG = "tag" KEY_ATTRIBUTES = "attributes" -KEY_TEXT = "text" +KEY_TEXT = "ENTRY_TEXT" KEY_TYPE = "type" IGNORED_ATTRIBUTES = ["xml:lang"] @@ -121,7 +121,8 @@ def convert_line(line): # Categorize the type of line line_type = str() out = dict() - if tag == COMMENT or tag in OUTMOST_TAGS_SKIP: + + if tag == COMMENT or tag in OUTMOST_TAGS_SKIP or end_tag in OUTMOST_TAGS_SKIP: line_type = LINE_TYPE_IGNORE else: start_tag_exists = (tag != str()) @@ -160,50 +161,60 @@ def convert_line(line): return out -def convert_nest(nest, index, working_dict): - if index >= len(nest): - return working_dict +def convert_nest(nest, start_index): + nest_dict = dict() + curr_index = start_index + + while curr_index < len(nest): + element = nest[curr_index] + line_type = element[KEY_TYPE] + line_tag = element[KEY_TAG] + line_text = element.get(KEY_TEXT, None) + line_attributes = element.get(KEY_ATTRIBUTES, None) + + if line_type in [LINE_TYPE_START_NEST, LINE_TYPE_START_NEST_WITH_ATTR]: + if line_tag not in nest_dict: + nest_dict[line_tag] = list() - element = nest[index] - line_type = element[KEY_TYPE] - line_tag = element[KEY_TAG] - line_text = element.get(KEY_TEXT, None) - line_attributes = element.get(KEY_ATTRIBUTES, None) + converted_nest, ret_index = convert_nest(nest, curr_index + 1) - if line_type in [LINE_TYPE_START_NEST, LINE_TYPE_START_NEST_WITH_ATTR]: - working_dict[line_tag] = dict() + if line_attributes is not None: + for attribute in line_attributes: + converted_nest[attribute] = line_attributes[attribute] - converted_nest = convert_nest(nest, index + 1, dict()) - working_dict[line_tag] = converted_nest + nest_dict[line_tag].append(converted_nest) - if line_type == LINE_TYPE_START_NEST_WITH_ATTR: - working_dict[line_tag][KEY_ATTRIBUTES] = line_attributes + curr_index = ret_index + 1 + continue - if line_type in [LINE_TYPE_ENTRY, LINE_TYPE_ENTRY_WITH_ATTR, LINE_TYPE_ENTRY_ONLY_ATTR]: - if line_tag not in working_dict: - working_dict[line_tag] = list() + if line_type in [LINE_TYPE_ENTRY, LINE_TYPE_ENTRY_WITH_ATTR, LINE_TYPE_ENTRY_ONLY_ATTR]: + if line_tag not in nest_dict: + nest_dict[line_tag] = list() - curr_dict = dict() + curr_dict = dict() - if line_text is not None: - curr_dict[KEY_TEXT] = line_text + if line_text is not None: + curr_dict[KEY_TEXT] = line_text - if line_attributes is not None: - for attribute in line_attributes: - curr_dict[attribute] = line_attributes[attribute] + if 
line_attributes is not None: + for attribute in line_attributes: + curr_dict[attribute] = line_attributes[attribute] - working_dict[line_tag].append(curr_dict) + nest_dict[line_tag].append(curr_dict) - convert_nest(nest, index + 1, working_dict) + curr_index += 1 + continue - return working_dict + if line_type in [LINE_TYPE_END_NEST]: + return nest_dict, curr_index + return nest_dict, curr_index def divide_into_lines(input_file_name): curr_str = "" curr_nest = list() - curr_nest_tag = str() + curr_nest_tags = list() # Treating it as a stack with open(input_file_name) as input_file: for line in input_file: @@ -219,29 +230,26 @@ def divide_into_lines(input_file_name): if letter == '>' and (next_letter == '<' or next_letter == ""): # Only return if nesting - # print(curr_str) line_parsed = convert_line(curr_str) tag = line_parsed.get(KEY_TAG, None) + assert tag != KEY_TEXT # This could cause a massive conflict, but it is unlikely line_type = line_parsed.get(KEY_TYPE, None) attribute_keys = line_parsed.get(KEY_ATTRIBUTES, dict()).keys() - if curr_nest_tag == str(): - if line_type in [LINE_TYPE_START_NEST, LINE_TYPE_START_NEST_WITH_ATTR]: - curr_nest_tag = tag - curr_nest.append(line_parsed) - elif line_type != LINE_TYPE_IGNORE: - print("THIS VERSION") - print(json.dumps(line_parsed, indent=4)) # replacement for processing right now - else: - if line_type == LINE_TYPE_END_NEST and curr_nest_tag == tag: - print(json.dumps(curr_nest, indent=4)) # replacement for processing right now - nest_dict = convert_nest(curr_nest, 0, dict()) + if line_type != LINE_TYPE_IGNORE: + curr_nest.append(line_parsed) + + if line_type in [LINE_TYPE_START_NEST, LINE_TYPE_START_NEST_WITH_ATTR]: + curr_nest_tags.append(tag) + elif line_type == LINE_TYPE_END_NEST: + popped_curr_nest_tag = curr_nest_tags.pop() + assert popped_curr_nest_tag == tag + if len(curr_nest_tags) == 0: + nest_dict, _ = convert_nest(curr_nest, 0) print(json.dumps(nest_dict, indent=4)) curr_nest = list() curr_nest_tag = str() - else: - curr_nest.append(line_parsed) curr_str = "" From 7b4ac97aa6fca31b119dec1eefcea40a03f6ea1f Mon Sep 17 00:00:00 2001 From: ecwood Date: Sat, 27 Jul 2024 03:12:28 -0700 Subject: [PATCH 042/125] #387 handling case where something is just one line and not in another nest --- misc-tools/owlparser.py | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/misc-tools/owlparser.py b/misc-tools/owlparser.py index 6593d0cd..76715eb2 100644 --- a/misc-tools/owlparser.py +++ b/misc-tools/owlparser.py @@ -1,5 +1,6 @@ import json import argparse +import datetime COMMENT = "!--" XML_TAG = "?xml" @@ -29,6 +30,9 @@ def get_args(): arg_parser.add_argument('inputFile', type=str) return arg_parser.parse_args() +def date(): + return datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + def convert_line(line): tag = "" attributes = dict() @@ -240,16 +244,20 @@ def divide_into_lines(input_file_name): if line_type != LINE_TYPE_IGNORE: curr_nest.append(line_parsed) + output_nest = (line_type in [LINE_TYPE_ENTRY, LINE_TYPE_ENTRY_WITH_ATTR, LINE_TYPE_ENTRY_ONLY_ATTR] and len(curr_nest_tags) == 0) + if line_type in [LINE_TYPE_START_NEST, LINE_TYPE_START_NEST_WITH_ATTR]: curr_nest_tags.append(tag) elif line_type == LINE_TYPE_END_NEST: popped_curr_nest_tag = curr_nest_tags.pop() assert popped_curr_nest_tag == tag if len(curr_nest_tags) == 0: - nest_dict, _ = convert_nest(curr_nest, 0) - print(json.dumps(nest_dict, indent=4)) - curr_nest = list() - curr_nest_tag = str() + output_nest = True + if output_nest: + 
nest_dict, _ = convert_nest(curr_nest, 0) + print(json.dumps(nest_dict, indent=4)) + curr_nest = list() + curr_nest_tag = str() curr_str = "" @@ -262,4 +270,7 @@ def divide_into_lines(input_file_name): args = get_args() input_file_name = args.inputFile - divide_into_lines(input_file_name) \ No newline at end of file + print("File:", input_file_name) + print("Start Time:", date()) + divide_into_lines(input_file_name) + print("End Time:", date()) \ No newline at end of file From 31b47795449c33143f50d5349b3eafde8a631e2f Mon Sep 17 00:00:00 2001 From: ecwood Date: Thu, 1 Aug 2024 13:54:07 -0700 Subject: [PATCH 043/125] #404, testing it out on CI first --- master-config.shinc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/master-config.shinc b/master-config.shinc index ac6dd9e5..015cf2f7 100644 --- a/master-config.shinc +++ b/master-config.shinc @@ -28,7 +28,7 @@ infores_mapping_file=${MAPS_CODE_DIR}/kg2-provided-by-curie-to-infores-curie.yam knowledge_level_agent_type_mapping_file=${MAPS_CODE_DIR}/knowledge-level-agent-type-map.yaml ont_load_inventory_file=${MAPS_CODE_DIR}/ont-load-inventory${test_suffix}.yaml rtx_config_file=RTXConfiguration-config.json -biolink_model_version=4.2.0 +biolink_model_version=4.2.1 infores_registry_version=0.2.8 kg2_version_file=version.txt kg2_version_file_local=${BUILD_DIR}/${kg2_version_file} From b7597b948ac87fec01c87a02761cfe3cdb2f4880 Mon Sep 17 00:00:00 2001 From: ecwood Date: Thu, 1 Aug 2024 14:01:51 -0700 Subject: [PATCH 044/125] #404 predicate remapping for biolink 4.2.1 --- maps/predicate-remap.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/maps/predicate-remap.yaml b/maps/predicate-remap.yaml index 1e08b978..4da6dcac 100644 --- a/maps/predicate-remap.yaml +++ b/maps/predicate-remap.yaml @@ -4236,10 +4236,10 @@ SEMMEDDB:affects: core_predicate: biolink:affects SEMMEDDB:ASSOCIATED_WITH: operation: keep - core_predicate: biolink:associated_with + core_predicate: biolink:related_to SEMMEDDB:associated_with: operation: keep - core_predicate: biolink:associated_with + core_predicate: biolink:related_to SEMMEDDB:AUGMENTS: operation: keep core_predicate: biolink:affects From 2e62525e824ca2b650c3ff4f082248b8305f254b Mon Sep 17 00:00:00 2001 From: ecwood Date: Thu, 8 Aug 2024 16:52:17 -0700 Subject: [PATCH 045/125] #387 handle doctype special case from foodon --- misc-tools/owlparser.py | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/misc-tools/owlparser.py b/misc-tools/owlparser.py index 76715eb2..0a38ca66 100644 --- a/misc-tools/owlparser.py +++ b/misc-tools/owlparser.py @@ -5,8 +5,9 @@ COMMENT = "!--" XML_TAG = "?xml" RDF_TAG = "rdf:RDF" +DOCTYPE_TAG = "!DOCTYPE" -OUTMOST_TAGS_SKIP = [XML_TAG, RDF_TAG] +OUTMOST_TAGS_SKIP = [XML_TAG, RDF_TAG, DOCTYPE_TAG] LINE_TYPE_IGNORE = "ignore" LINE_TYPE_START_NEST = "start nest" @@ -48,6 +49,8 @@ def convert_line(line): start_reading_main = False start_reading_end_tag = False + start_brackets = 0 + for letter_index in range(len(line)): letter = line[letter_index] next_letter = "" @@ -57,6 +60,11 @@ def convert_line(line): if letter_index - 1 >= 0: prev_letter = line[letter_index - 1] + if letter == '<': + start_brackets += 1 + if letter == '>': + start_brackets -= 1 + # First < if letter == '<' and letter_index == 0: if next_letter != '/': @@ -71,14 +79,14 @@ def convert_line(line): start_reading_attributes = True start_reading_attribute_tag = True continue - elif letter == '>' and start_reading_tag: + elif letter == '>' and 
start_reading_tag and start_brackets == 0: start_reading_tag = False start_reading_main = True continue elif start_reading_tag: tag += letter - if letter == '>' and start_reading_attributes: + if letter == '>' and start_reading_attributes and start_brackets == 0: start_reading_attributes = False start_reading_attribute_tag = False start_reading_attribute_text = False @@ -117,7 +125,7 @@ def convert_line(line): elif start_reading_main: main_text += letter - if letter == '>' and start_reading_end_tag: + if letter == '>' and start_reading_end_tag and start_brackets == 0: continue elif start_reading_end_tag: end_tag += letter @@ -219,6 +227,7 @@ def divide_into_lines(input_file_name): curr_str = "" curr_nest = list() curr_nest_tags = list() # Treating it as a stack + start_brackets = 0 with open(input_file_name) as input_file: for line in input_file: @@ -226,13 +235,18 @@ def divide_into_lines(input_file_name): for letter_index in range(len(line_str)): letter = line_str[letter_index] + if letter == '<': + start_brackets += 1 + if letter == '>': + start_brackets -= 1 + next_letter = "" if letter_index + 1 < len(line_str): next_letter = line_str[letter_index + 1] curr_str += letter - if letter == '>' and (next_letter == '<' or next_letter == ""): + if letter == '>' and (next_letter == '<' or next_letter == "") and start_brackets == 0: # Only return if nesting line_parsed = convert_line(curr_str) @@ -264,6 +278,7 @@ def divide_into_lines(input_file_name): if curr_str != "": # divide lines by a space curr_str += ' ' + # print(json.dumps(curr_nest, indent=4)) if __name__ == '__main__': From 38634ddbe1ddeba8a2fb9ec349a6b7f4142584d8 Mon Sep 17 00:00:00 2001 From: ecwood Date: Thu, 8 Aug 2024 17:45:08 -0700 Subject: [PATCH 046/125] #387 handle doctype special case from foodon --- misc-tools/owlparser.py | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/misc-tools/owlparser.py b/misc-tools/owlparser.py index 0a38ca66..84fef2c2 100644 --- a/misc-tools/owlparser.py +++ b/misc-tools/owlparser.py @@ -6,6 +6,10 @@ XML_TAG = "?xml" RDF_TAG = "rdf:RDF" DOCTYPE_TAG = "!DOCTYPE" +CLASS_TAG = "owl:Class" +SUBCLASS_TAG = "rdfs:subClassOf" +NODEID_TAG = "rdf:nodeID" +GENID_PREFIX = "genid" OUTMOST_TAGS_SKIP = [XML_TAG, RDF_TAG, DOCTYPE_TAG] @@ -24,6 +28,11 @@ IGNORED_ATTRIBUTES = ["xml:lang"] +OUTPUT_NESTS = [] +GENID_REMAINING_NESTS = dict() +GENID_TO_ID = dict() +ID_TO_GENIDS = dict() + def get_args(): arg_parser = argparse.ArgumentParser() arg_parser.add_argument('--test', dest='test', @@ -223,6 +232,22 @@ def convert_nest(nest, start_index): return nest_dict, curr_index +def check_for_genids(nest_dict): + CLASS_TAG = "owl:Class" + SUBCLASS_TAG = "rdfs:subClassOf" + NODEID_TAG = "rdf:nodeID" + GENID_PREFIX = "genid" + + genids = list() + + for nest_class in nest_dict.get(CLASS_TAG, dict()): + for nest_subclass in nest_class.get(SUBCLASS_TAG, dict()): + potential_genid = nest_subclass.get(NODEID_TAG, str()) + if potential_genid.startswith(GENID_PREFIX): + genids.append(potential_genid) + + return genids + def divide_into_lines(input_file_name): curr_str = "" curr_nest = list() @@ -269,6 +294,9 @@ def divide_into_lines(input_file_name): output_nest = True if output_nest: nest_dict, _ = convert_nest(curr_nest, 0) + genids = check_for_genids(nest_dict) + if len(genids) > 0: + nest_dict['genids'] = genids print(json.dumps(nest_dict, indent=4)) curr_nest = list() curr_nest_tag = str() @@ -278,8 +306,6 @@ def divide_into_lines(input_file_name): if curr_str != "": # 
divide lines by a space curr_str += ' ' - # print(json.dumps(curr_nest, indent=4)) - if __name__ == '__main__': args = get_args() From 23ff6eaaf6f0cda76cb00eb07aa4be4cccb64ed3 Mon Sep 17 00:00:00 2001 From: ecwood Date: Sat, 10 Aug 2024 17:01:48 -0700 Subject: [PATCH 047/125] #387 refactored for clarity --- misc-tools/owlparser.py | 343 +++++++++++++++++++++++++++------------- 1 file changed, 235 insertions(+), 108 deletions(-) diff --git a/misc-tools/owlparser.py b/misc-tools/owlparser.py index 84fef2c2..f38035b9 100644 --- a/misc-tools/owlparser.py +++ b/misc-tools/owlparser.py @@ -7,8 +7,10 @@ RDF_TAG = "rdf:RDF" DOCTYPE_TAG = "!DOCTYPE" CLASS_TAG = "owl:Class" +RESTRICTION_TAG = "owl:Restriction" SUBCLASS_TAG = "rdfs:subClassOf" NODEID_TAG = "rdf:nodeID" +RDF_ABOUT_TAG = "rdf:about" GENID_PREFIX = "genid" OUTMOST_TAGS_SKIP = [XML_TAG, RDF_TAG, DOCTYPE_TAG] @@ -43,107 +45,21 @@ def get_args(): def date(): return datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") -def convert_line(line): - tag = "" - attributes = dict() - attribute_tag = "" - attribute_text = "" - main_text = "" - end_tag = "" - - start_reading_tag = False - start_reading_attributes = False - start_reading_attribute_tag = False - start_reading_attribute_text = False - start_reading_main = False - start_reading_end_tag = False - - start_brackets = 0 - - for letter_index in range(len(line)): - letter = line[letter_index] - next_letter = "" - prev_letter = "" - if letter_index + 1 < len(line): - next_letter = line[letter_index + 1] - if letter_index - 1 >= 0: - prev_letter = line[letter_index - 1] - - if letter == '<': - start_brackets += 1 - if letter == '>': - start_brackets -= 1 - - # First < - if letter == '<' and letter_index == 0: - if next_letter != '/': - start_reading_tag = True - continue - if letter == '/' and prev_letter == '<': - start_reading_end_tag = True - continue - - if letter == ' ' and start_reading_tag: - start_reading_tag = False - start_reading_attributes = True - start_reading_attribute_tag = True - continue - elif letter == '>' and start_reading_tag and start_brackets == 0: - start_reading_tag = False - start_reading_main = True - continue - elif start_reading_tag: - tag += letter - - if letter == '>' and start_reading_attributes and start_brackets == 0: - start_reading_attributes = False - start_reading_attribute_tag = False - start_reading_attribute_text = False - start_reading_main = True - if attribute_tag not in IGNORED_ATTRIBUTES: - attributes[attribute_tag] = attribute_text.strip('/').strip('"') - attribute_tag = "" - attribute_text = "" - - if prev_letter == '/': - end_tag = tag - continue - elif start_reading_attributes: - if letter == '=' and start_reading_attribute_tag: - start_reading_attribute_text = True - start_reading_attribute_tag = False - continue - elif start_reading_attribute_tag: - attribute_tag += letter - - if letter == ' ' and start_reading_attribute_text: - start_reading_attribute_tag = True - start_reading_attribute_text = False - if attribute_tag not in IGNORED_ATTRIBUTES: - attributes[attribute_tag] = attribute_text.strip('/').strip('"') - attribute_tag = "" - attribute_text = "" - continue - elif start_reading_attribute_text: - attribute_text += letter - - if letter == '<' and start_reading_main: - start_reading_main = False - start_reading_end_tag = True - continue - elif start_reading_main: - main_text += letter +class LineElementRead(): + TAG = 1 + ATTRIBUTE_TAG = 2 + ATTRIBUTE_TEXT = 3 + MAIN = 4 + END_TAG = 5 - if letter == '>' and start_reading_end_tag and 
start_brackets == 0: - continue - elif start_reading_end_tag: - end_tag += letter +def categorize_line(tag, attributes, main_text, end_tag, only_tag): # Categorize the type of line line_type = str() out = dict() - if tag == COMMENT or tag in OUTMOST_TAGS_SKIP or end_tag in OUTMOST_TAGS_SKIP: + # Putting "only_tag" here isn't necessarily the best idea, but I don't know what else to do with it + if tag == COMMENT or tag in OUTMOST_TAGS_SKIP or end_tag in OUTMOST_TAGS_SKIP or only_tag: line_type = LINE_TYPE_IGNORE else: start_tag_exists = (tag != str()) @@ -181,6 +97,143 @@ def convert_line(line): return out +def get_letters(line, letter_index, start_brackets): + letter = line[letter_index] + next_letter = "" + prev_letter = "" + if letter_index + 1 < len(line): + next_letter = line[letter_index + 1] + if letter_index - 1 >= 0: + prev_letter = line[letter_index - 1] + + if letter == '<': + start_brackets += 1 + if letter == '>': + start_brackets -= 1 + + return letter, next_letter, prev_letter, start_brackets + + +def identify_tag_type(letter_index, letter, next_letter, prev_letter, type_to_read): + changed = False + + if letter == '<' and letter_index == 0: + if next_letter != '/': + type_to_read = LineElementRead.TAG + changed = True + if letter == '/' and prev_letter == '<': + type_to_read = LineElementRead.END_TAG + changed = True + + return changed, type_to_read + + +def read_tag(letter, prev_letter, type_to_read, start_brackets, tag, line): + only_tag = False + changed = False + + if letter == ' ' and type_to_read == LineElementRead.TAG: + type_to_read = LineElementRead.ATTRIBUTE_TAG + changed = True + elif letter == '>' and type_to_read == LineElementRead.TAG and start_brackets == 0: + type_to_read = LineElementRead.MAIN + + if prev_letter == '/': + print("Warning - strange tag, ignoring", line) + only_tag = True + changed = True + elif type_to_read == LineElementRead.TAG: + tag += letter + changed = True + + return changed, type_to_read, (only_tag, tag) + + +def store_attribute(attributes, attribute_tag, attribute_text): + if attribute_tag not in IGNORED_ATTRIBUTES: + attributes[attribute_tag] = attribute_text.strip('/').strip('"') + attribute_tag = "" + attribute_text = "" + + return attributes, attribute_tag, attribute_text + + +def process_attributes(letter, prev_letter, type_to_read, start_brackets, attributes, attribute_tag, attribute_text, tag, end_tag): + changed = False + start_reading_attributes = (type_to_read == LineElementRead.ATTRIBUTE_TAG or type_to_read == LineElementRead.ATTRIBUTE_TEXT) + + if letter == '>' and start_reading_attributes and start_brackets == 0: + type_to_read = LineElementRead.MAIN + attributes, attribute_tag, attribute_text = store_attribute(attributes, attribute_tag, attribute_text) + + if prev_letter == '/': + end_tag = tag + changed = True + elif start_reading_attributes: + if letter == '=' and type_to_read == LineElementRead.ATTRIBUTE_TAG: + type_to_read = LineElementRead.ATTRIBUTE_TEXT + changed = True + elif type_to_read == LineElementRead.ATTRIBUTE_TAG: + attribute_tag += letter + changed = True + + elif letter == ' ' and type_to_read == LineElementRead.ATTRIBUTE_TEXT: + type_to_read = LineElementRead.ATTRIBUTE_TAG + attributes, attribute_tag, attribute_text = store_attribute(attributes, attribute_tag, attribute_text) + changed = True + elif type_to_read == LineElementRead.ATTRIBUTE_TEXT: + attribute_text += letter + changed = True + + return changed, type_to_read, (attributes, attribute_tag, attribute_text, end_tag) + + + +def 
convert_line(line): + tag = "" + attributes = dict() + attribute_tag = "" + attribute_text = "" + main_text = "" + end_tag = "" + + type_to_read = 0 + + only_tag = False + + start_brackets = 0 + + for letter_index in range(len(line)): + letter, next_letter, prev_letter, start_brackets = get_letters(line, letter_index, start_brackets) + + # First < + tag_identified, type_to_read = identify_tag_type(letter_index, letter, next_letter, prev_letter, type_to_read) + if tag_identified: + continue + + tag_read, type_to_read, tag_read_data = read_tag(letter, prev_letter, type_to_read, start_brackets, tag, line) + if tag_read: + (only_tag, tag) = tag_read_data + continue + + attributes_read, type_to_read, attributes_read_data = process_attributes(letter, prev_letter, type_to_read, start_brackets, attributes, attribute_tag, attribute_text, tag, end_tag) + if attributes_read: + (attributes, attribute_tag, attribute_text, end_tag) = attributes_read_data + continue + + if letter == '<' and type_to_read == LineElementRead.MAIN: + type_to_read = LineElementRead.END_TAG + continue + elif type_to_read == LineElementRead.MAIN: + main_text += letter + + if letter == '>' and type_to_read == LineElementRead.END_TAG and start_brackets == 0: + continue + elif type_to_read == LineElementRead.END_TAG: + end_tag += letter + + return categorize_line(tag, attributes, main_text, end_tag, only_tag) + def convert_nest(nest, start_index): nest_dict = dict() @@ -232,22 +285,83 @@ def convert_nest(nest, start_index): return nest_dict, curr_index -def check_for_genids(nest_dict): - CLASS_TAG = "owl:Class" - SUBCLASS_TAG = "rdfs:subClassOf" - NODEID_TAG = "rdf:nodeID" - GENID_PREFIX = "genid" - +def check_for_class_genids(nest_dict): genids = list() - for nest_class in nest_dict.get(CLASS_TAG, dict()): - for nest_subclass in nest_class.get(SUBCLASS_TAG, dict()): + nest_dict_classes = nest_dict.get(CLASS_TAG, list()) + for nest_class_index in range(len(nest_dict_classes)): + nest_class = nest_dict_classes[nest_class_index] + nest_subclasses = nest_class.get(SUBCLASS_TAG, list()) + for nest_subclass_index in range(len(nest_subclasses)): + nest_subclass = nest_subclasses[nest_subclass_index] potential_genid = nest_subclass.get(NODEID_TAG, str()) if potential_genid.startswith(GENID_PREFIX): genids.append(potential_genid) return genids + +def check_for_restriction_genids(nest_dict): + for nest_restriction in nest_dict.get(RESTRICTION_TAG, dict()): + potential_genid = nest_restriction.get(NODEID_TAG, str()) + if potential_genid.startswith(GENID_PREFIX): + return potential_genid + return None + +def extract_class_id(nest_dict): + nest_dict_classes = nest_dict.get(CLASS_TAG, list()) + # Can't have competing class_ids + assert len(nest_dict_classes) <= 1 + + for nest_class_index in range(len(nest_dict_classes)): + nest_class = nest_dict_classes[nest_class_index] + return nest_class.get(RDF_ABOUT_TAG, str()) + +def store_genid_nest_in_class_nest(genid, genid_nest, class_nest): + output_class_nest = class_nest + + nest_dict_classes = class_nest.get(CLASS_TAG, list()) + for nest_class_index in range(len(nest_dict_classes)): + nest_class = nest_dict_classes[nest_class_index] + nest_subclasses = nest_class.get(SUBCLASS_TAG, list()) + for nest_subclass_index in range(len(nest_subclasses)): + nest_subclass = nest_subclasses[nest_subclass_index] + potential_genid = nest_subclass.get(NODEID_TAG, str()) + if potential_genid == genid: + output_class_nest[CLASS_TAG][nest_class_index][SUBCLASS_TAG][nest_subclass_index][RESTRICTION_TAG] = 
genid_nest[RESTRICTION_TAG] + + return output_class_nest + + +def triage_nest_dict(nest_dict): + genids = check_for_class_genids(nest_dict) + restriction_genid = check_for_restriction_genids(nest_dict) + class_id = extract_class_id(nest_dict) + + if len(genids) > 0: + for genid in genids: + GENID_TO_ID[genid] = class_id + ID_TO_GENIDS[class_id] = genids + GENID_REMAINING_NESTS[class_id] = nest_dict + elif restriction_genid is not None: + class_id = GENID_TO_ID.get(restriction_genid, str()) + if len(class_id) == 0: + print("WARNING WITH:", restriction_genid, "- NO CLASS_ID FOUND") + OUTPUT_NESTS.append(nest_dict) + return + class_nest = GENID_REMAINING_NESTS[class_id] + ID_TO_GENIDS[class_id].remove(restriction_genid) + updated_class_nest = store_genid_nest_in_class_nest(restriction_genid, nest_dict, class_nest) + + if len(ID_TO_GENIDS[class_id]) > 0: + GENID_REMAINING_NESTS[class_id] = updated_class_nest + else: + OUTPUT_NESTS.append(updated_class_nest) + GENID_REMAINING_NESTS[class_id] = None + else: + OUTPUT_NESTS.append(nest_dict) + + def divide_into_lines(input_file_name): curr_str = "" curr_nest = list() @@ -289,15 +403,18 @@ def divide_into_lines(input_file_name): curr_nest_tags.append(tag) elif line_type == LINE_TYPE_END_NEST: popped_curr_nest_tag = curr_nest_tags.pop() - assert popped_curr_nest_tag == tag + assert popped_curr_nest_tag == tag, curr_nest if len(curr_nest_tags) == 0: output_nest = True if output_nest: nest_dict, _ = convert_nest(curr_nest, 0) - genids = check_for_genids(nest_dict) - if len(genids) > 0: - nest_dict['genids'] = genids - print(json.dumps(nest_dict, indent=4)) + # genids = check_for_class_genids(nest_dict) + triage_nest_dict(nest_dict) + # restriction_genid = check_for_restriction_genids(nest_dict) + + # if len(genids) > 0: + # nest_dict['genids'] = genids + # print(json.dumps(nest_dict, indent=4)) curr_nest = list() curr_nest_tag = str() @@ -307,6 +424,16 @@ def divide_into_lines(input_file_name): # divide lines by a space curr_str += ' ' + print(json.dumps(OUTPUT_NESTS, indent=4)) + + print("=========") + + print("Remaining:") + for item in GENID_REMAINING_NESTS: + if GENID_REMAINING_NESTS[item] != None: + print(item) + print(json.dumps(GENID_REMAINING_NESTS[item], indent=4)) + if __name__ == '__main__': args = get_args() input_file_name = args.inputFile From 9b8dfc49b5451b4622568b9168530bab991c711a Mon Sep 17 00:00:00 2001 From: ecwood Date: Sat, 10 Aug 2024 18:42:06 -0700 Subject: [PATCH 048/125] #387 more refactoring, but pre-sorting into classes --- misc-tools/owlparser.py | 65 ++++++++++++++++++++++++++--------------- 1 file changed, 41 insertions(+), 24 deletions(-) diff --git a/misc-tools/owlparser.py b/misc-tools/owlparser.py index f38035b9..114e72e9 100644 --- a/misc-tools/owlparser.py +++ b/misc-tools/owlparser.py @@ -115,36 +115,35 @@ def get_letters(line, letter_index, start_brackets): def identify_tag_type(letter_index, letter, next_letter, prev_letter, type_to_read): - changed = False + changed = True if letter == '<' and letter_index == 0: if next_letter != '/': type_to_read = LineElementRead.TAG - changed = True - if letter == '/' and prev_letter == '<': + elif letter == '/' and prev_letter == '<': type_to_read = LineElementRead.END_TAG - changed = True + else: + changed = False return changed, type_to_read def read_tag(letter, prev_letter, type_to_read, start_brackets, tag, line): only_tag = False - changed = False + changed = True if letter == ' ' and type_to_read == LineElementRead.TAG: type_to_read = LineElementRead.ATTRIBUTE_TAG - 
changed = True elif letter == '>' and type_to_read == LineElementRead.TAG and start_brackets == 0: type_to_read = LineElementRead.MAIN if prev_letter == '/': print("Warning - strange tag, ignoring", line) only_tag = True - changed = True elif type_to_read == LineElementRead.TAG: tag += letter - changed = True + else: + changed = False return changed, type_to_read, (only_tag, tag) @@ -158,8 +157,8 @@ def store_attribute(attributes, attribute_tag, attribute_text): return attributes, attribute_tag, attribute_text -def process_attributes(letter, prev_letter, type_to_read, start_brackets, attributes, attribute_tag, attribute_text, tag, end_tag): - changed = False +def read_attributes(letter, prev_letter, type_to_read, start_brackets, attributes, attribute_tag, attribute_text, tag, end_tag): + changed = True start_reading_attributes = (type_to_read == LineElementRead.ATTRIBUTE_TAG or type_to_read == LineElementRead.ATTRIBUTE_TEXT) if letter == '>' and start_reading_attributes and start_brackets == 0: @@ -168,26 +167,45 @@ def process_attributes(letter, prev_letter, type_to_read, start_brackets, attrib if prev_letter == '/': end_tag = tag - changed = True elif start_reading_attributes: if letter == '=' and type_to_read == LineElementRead.ATTRIBUTE_TAG: type_to_read = LineElementRead.ATTRIBUTE_TEXT - changed = True elif type_to_read == LineElementRead.ATTRIBUTE_TAG: attribute_tag += letter - changed = True - elif letter == ' ' and type_to_read == LineElementRead.ATTRIBUTE_TEXT: type_to_read = LineElementRead.ATTRIBUTE_TAG attributes, attribute_tag, attribute_text = store_attribute(attributes, attribute_tag, attribute_text) - changed = True elif type_to_read == LineElementRead.ATTRIBUTE_TEXT: attribute_text += letter - changed = True + else: + changed = False return changed, type_to_read, (attributes, attribute_tag, attribute_text, end_tag) +def read_main(letter, type_to_read, main_text): + changed = True + if letter == '<' and type_to_read == LineElementRead.MAIN: + type_to_read = LineElementRead.END_TAG + elif type_to_read == LineElementRead.MAIN: + main_text += letter + else: + changed = False + + return changed, type_to_read, (main_text) + + +def read_end_tag(letter, type_to_read, start_brackets, end_tag): + changed = True + if letter == '>' and type_to_read == LineElementRead.END_TAG and start_brackets == 0: + pass + elif type_to_read == LineElementRead.END_TAG: + end_tag += letter + else: + changed = False + + return changed, type_to_read, (end_tag) + def convert_line(line): tag = "" @@ -216,21 +234,20 @@ def convert_line(line): (only_tag, tag) = tag_read_data continue - attributes_read, type_to_read, attributes_read_data = process_attributes(letter, prev_letter, type_to_read, start_brackets, attributes, attribute_tag, attribute_text, tag, end_tag) + attributes_read, type_to_read, attributes_read_data = read_attributes(letter, prev_letter, type_to_read, start_brackets, attributes, attribute_tag, attribute_text, tag, end_tag) if attributes_read: (attributes, attribute_tag, attribute_text, end_tag) = attributes_read_data continue - if letter == '<' and type_to_read == LineElementRead.MAIN: - type_to_read = LineElementRead.END_TAG + main_read, type_to_read, main_read_data = read_main(letter, type_to_read, main_text) + if main_read: + (main_text) = main_read_data continue - elif type_to_read == LineElementRead.MAIN: - main_text += letter - if letter == '>' and type_to_read == LineElementRead.END_TAG and start_brackets == 0: + end_tag_read, type_to_read, end_tag_read_data = read_end_tag(letter, 
type_to_read, start_brackets, end_tag) + if end_tag_read: + (end_tag) = end_tag_read_data continue - elif type_to_read == LineElementRead.END_TAG: - end_tag += letter return categorize_line(tag, attributes, main_text, end_tag, only_tag) From bab707c4f4bdca3d3c7eed72a0f3c2a63bf696e0 Mon Sep 17 00:00:00 2001 From: ecwood Date: Mon, 12 Aug 2024 11:42:37 -0700 Subject: [PATCH 049/125] #387 refactored into class form --- misc-tools/owlparser.py | 745 ++++++++++++++++++++-------------------- 1 file changed, 379 insertions(+), 366 deletions(-) diff --git a/misc-tools/owlparser.py b/misc-tools/owlparser.py index 114e72e9..83371543 100644 --- a/misc-tools/owlparser.py +++ b/misc-tools/owlparser.py @@ -2,39 +2,6 @@ import argparse import datetime -COMMENT = "!--" -XML_TAG = "?xml" -RDF_TAG = "rdf:RDF" -DOCTYPE_TAG = "!DOCTYPE" -CLASS_TAG = "owl:Class" -RESTRICTION_TAG = "owl:Restriction" -SUBCLASS_TAG = "rdfs:subClassOf" -NODEID_TAG = "rdf:nodeID" -RDF_ABOUT_TAG = "rdf:about" -GENID_PREFIX = "genid" - -OUTMOST_TAGS_SKIP = [XML_TAG, RDF_TAG, DOCTYPE_TAG] - -LINE_TYPE_IGNORE = "ignore" -LINE_TYPE_START_NEST = "start nest" -LINE_TYPE_START_NEST_WITH_ATTR = "start nest with attributes" -LINE_TYPE_ENTRY = "entry" -LINE_TYPE_ENTRY_WITH_ATTR = "entry with attributes" -LINE_TYPE_ENTRY_ONLY_ATTR = "entry with only attributes" -LINE_TYPE_END_NEST = "end nest" - -KEY_TAG = "tag" -KEY_ATTRIBUTES = "attributes" -KEY_TEXT = "ENTRY_TEXT" -KEY_TYPE = "type" - -IGNORED_ATTRIBUTES = ["xml:lang"] - -OUTPUT_NESTS = [] -GENID_REMAINING_NESTS = dict() -GENID_TO_ID = dict() -ID_TO_GENIDS = dict() - def get_args(): arg_parser = argparse.ArgumentParser() arg_parser.add_argument('--test', dest='test', @@ -52,404 +19,449 @@ class LineElementRead(): MAIN = 4 END_TAG = 5 - -def categorize_line(tag, attributes, main_text, end_tag, only_tag): - # Categorize the type of line - line_type = str() - out = dict() - - # Putting "only_tag" here isn't necessarily the best idea, but I don't know what else to do with it - if tag == COMMENT or tag in OUTMOST_TAGS_SKIP or end_tag in OUTMOST_TAGS_SKIP or only_tag: - line_type = LINE_TYPE_IGNORE - else: - start_tag_exists = (tag != str()) - attributes_exist = (attributes != dict()) - text_exists = (main_text != str()) - end_tag_exists = (end_tag != str()) - - if start_tag_exists: - if attributes_exist: - if text_exists: - line_type = LINE_TYPE_ENTRY_WITH_ATTR - out[KEY_TAG] = tag - out[KEY_ATTRIBUTES] = attributes - out[KEY_TEXT] = main_text - elif end_tag_exists: - line_type = LINE_TYPE_ENTRY_ONLY_ATTR - out[KEY_TAG] = tag - out[KEY_ATTRIBUTES] = attributes +class XMLParser(): + def __init__(self, skip_tags, ignored_attributes, processing_func): + self.COMMENT = "!--" + self.OUTMOST_TAGS_SKIP = skip_tags + self.IGNORED_ATTRIBUTES = ignored_attributes + self.processing_func = processing_func + + self.LINE_TYPE_IGNORE = "ignore" + self.LINE_TYPE_START_NEST = "start nest" + self.LINE_TYPE_START_NEST_WITH_ATTR = "start nest with attributes" + self.LINE_TYPE_ENTRY = "entry" + self.LINE_TYPE_ENTRY_WITH_ATTR = "entry with attributes" + self.LINE_TYPE_ENTRY_ONLY_ATTR = "entry with only attributes" + self.LINE_TYPE_END_NEST = "end nest" + + self.KEY_TAG = "tag" + self.KEY_ATTRIBUTES = "attributes" + self.KEY_TEXT = "ENTRY_TEXT" + self.KEY_TYPE = "type" + + + def categorize_line(self, tag, attributes, main_text, end_tag, only_tag): + # Categorize the type of line + line_type = str() + out = dict() + + # Putting "only_tag" here isn't necessarily the best idea, but I don't know what else to do with it + 
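        # A rough map of the categorization below, as a hedged sketch (the
        # example tags are illustrative, not from the source):
        #   <tag>                     -> LINE_TYPE_START_NEST
        #   <tag attr="v">            -> LINE_TYPE_START_NEST_WITH_ATTR
        #   <tag>text</tag>           -> LINE_TYPE_ENTRY
        #   <tag attr="v">text</tag>  -> LINE_TYPE_ENTRY_WITH_ATTR
        #   <tag attr="v"/>           -> LINE_TYPE_ENTRY_ONLY_ATTR
        #   </tag>                    -> LINE_TYPE_END_NEST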
if tag == self.COMMENT or tag in self.OUTMOST_TAGS_SKIP or end_tag in self.OUTMOST_TAGS_SKIP or only_tag: + line_type = self.LINE_TYPE_IGNORE + else: + start_tag_exists = (tag != str()) + attributes_exist = (attributes != dict()) + text_exists = (main_text != str()) + end_tag_exists = (end_tag != str()) + + if start_tag_exists: + if attributes_exist: + if text_exists: + line_type = self.LINE_TYPE_ENTRY_WITH_ATTR + out[self.KEY_TAG] = tag + out[self.KEY_ATTRIBUTES] = attributes + out[self.KEY_TEXT] = main_text + elif end_tag_exists: + line_type = self.LINE_TYPE_ENTRY_ONLY_ATTR + out[self.KEY_TAG] = tag + out[self.KEY_ATTRIBUTES] = attributes + else: + line_type = self.LINE_TYPE_START_NEST_WITH_ATTR + out[self.KEY_TAG] = tag + out[self.KEY_ATTRIBUTES] = attributes + elif text_exists: + line_type = self.LINE_TYPE_ENTRY + out[self.KEY_TAG] = tag + out[self.KEY_TEXT] = main_text else: - line_type = LINE_TYPE_START_NEST_WITH_ATTR - out[KEY_TAG] = tag - out[KEY_ATTRIBUTES] = attributes - elif text_exists: - line_type = LINE_TYPE_ENTRY - out[KEY_TAG] = tag - out[KEY_TEXT] = main_text - else: - line_type = LINE_TYPE_START_NEST - out[KEY_TAG] = tag - elif end_tag_exists: - line_type = LINE_TYPE_END_NEST - out[KEY_TAG] = end_tag + line_type = self.LINE_TYPE_START_NEST + out[self.KEY_TAG] = tag + elif end_tag_exists: + line_type = self.LINE_TYPE_END_NEST + out[self.KEY_TAG] = end_tag - out[KEY_TYPE] = line_type + out[self.KEY_TYPE] = line_type - return out + return out -def get_letters(line, letter_index, start_brackets): - letter = line[letter_index] - next_letter = "" - prev_letter = "" - if letter_index + 1 < len(line): - next_letter = line[letter_index + 1] - if letter_index - 1 >= 0: - prev_letter = line[letter_index - 1] + def get_letters(self, line, letter_index, start_brackets): + letter = line[letter_index] + next_letter = "" + prev_letter = "" + if letter_index + 1 < len(line): + next_letter = line[letter_index + 1] + if letter_index - 1 >= 0: + prev_letter = line[letter_index - 1] - if letter == '<': - start_brackets += 1 - if letter == '>': - start_brackets -= 1 + if letter == '<': + start_brackets += 1 + if letter == '>': + start_brackets -= 1 - return letter, next_letter, prev_letter, start_brackets + return letter, next_letter, prev_letter, start_brackets -def identify_tag_type(letter_index, letter, next_letter, prev_letter, type_to_read): - changed = True + def identify_tag_type(self, letter_index, letter, next_letter, prev_letter, type_to_read): + changed = True - if letter == '<' and letter_index == 0: - if next_letter != '/': - type_to_read = LineElementRead.TAG - elif letter == '/' and prev_letter == '<': - type_to_read = LineElementRead.END_TAG - else: - changed = False + if letter == '<' and letter_index == 0: + if next_letter != '/': + type_to_read = LineElementRead.TAG + elif letter == '/' and prev_letter == '<': + type_to_read = LineElementRead.END_TAG + else: + changed = False - return changed, type_to_read + return changed, type_to_read -def read_tag(letter, prev_letter, type_to_read, start_brackets, tag, line): - only_tag = False - changed = True + def read_tag(self, letter, prev_letter, type_to_read, start_brackets, tag, line): + only_tag = False + changed = True - if letter == ' ' and type_to_read == LineElementRead.TAG: - type_to_read = LineElementRead.ATTRIBUTE_TAG - elif letter == '>' and type_to_read == LineElementRead.TAG and start_brackets == 0: - type_to_read = LineElementRead.MAIN + if letter == ' ' and type_to_read == LineElementRead.TAG: + type_to_read = 
LineElementRead.ATTRIBUTE_TAG + elif letter == '>' and type_to_read == LineElementRead.TAG and start_brackets == 0: + type_to_read = LineElementRead.MAIN + + if prev_letter == '/': + print("Warning - strange tag, ignoring", line) + only_tag = True + elif type_to_read == LineElementRead.TAG: + tag += letter + else: + changed = False - if prev_letter == '/': - print("Warning - strange tag, ignoring", line) - only_tag = True - elif type_to_read == LineElementRead.TAG: - tag += letter - else: - changed = False + return changed, type_to_read, (only_tag, tag) - return changed, type_to_read, (only_tag, tag) + def store_attribute(self, attributes, attribute_tag, attribute_text): + if attribute_tag not in self.IGNORED_ATTRIBUTES: + attributes[attribute_tag] = attribute_text.strip('/').strip('"') + attribute_tag = "" + attribute_text = "" -def store_attribute(attributes, attribute_tag, attribute_text): - if attribute_tag not in IGNORED_ATTRIBUTES: - attributes[attribute_tag] = attribute_text.strip('/').strip('"') - attribute_tag = "" - attribute_text = "" + return attributes, attribute_tag, attribute_text - return attributes, attribute_tag, attribute_text + def read_attributes(self, letter, prev_letter, type_to_read, start_brackets, attributes, attribute_tag, attribute_text, tag, end_tag): + changed = True + start_reading_attributes = (type_to_read == LineElementRead.ATTRIBUTE_TAG or type_to_read == LineElementRead.ATTRIBUTE_TEXT) -def read_attributes(letter, prev_letter, type_to_read, start_brackets, attributes, attribute_tag, attribute_text, tag, end_tag): - changed = True - start_reading_attributes = (type_to_read == LineElementRead.ATTRIBUTE_TAG or type_to_read == LineElementRead.ATTRIBUTE_TEXT) + if letter == '>' and start_reading_attributes and start_brackets == 0: + type_to_read = LineElementRead.MAIN + attributes, attribute_tag, attribute_text = self.store_attribute(attributes, attribute_tag, attribute_text) - if letter == '>' and start_reading_attributes and start_brackets == 0: - type_to_read = LineElementRead.MAIN - attributes, attribute_tag, attribute_text = store_attribute(attributes, attribute_tag, attribute_text) + if prev_letter == '/': + end_tag = tag + elif start_reading_attributes: + if letter == '=' and type_to_read == LineElementRead.ATTRIBUTE_TAG: + type_to_read = LineElementRead.ATTRIBUTE_TEXT + elif type_to_read == LineElementRead.ATTRIBUTE_TAG: + attribute_tag += letter + elif letter == ' ' and type_to_read == LineElementRead.ATTRIBUTE_TEXT: + type_to_read = LineElementRead.ATTRIBUTE_TAG + attributes, attribute_tag, attribute_text = self.store_attribute(attributes, attribute_tag, attribute_text) + elif type_to_read == LineElementRead.ATTRIBUTE_TEXT: + attribute_text += letter + else: + changed = False - if prev_letter == '/': - end_tag = tag - elif start_reading_attributes: - if letter == '=' and type_to_read == LineElementRead.ATTRIBUTE_TAG: - type_to_read = LineElementRead.ATTRIBUTE_TEXT - elif type_to_read == LineElementRead.ATTRIBUTE_TAG: - attribute_tag += letter - elif letter == ' ' and type_to_read == LineElementRead.ATTRIBUTE_TEXT: - type_to_read = LineElementRead.ATTRIBUTE_TAG - attributes, attribute_tag, attribute_text = store_attribute(attributes, attribute_tag, attribute_text) - elif type_to_read == LineElementRead.ATTRIBUTE_TEXT: - attribute_text += letter - else: - changed = False + return changed, type_to_read, (attributes, attribute_tag, attribute_text, end_tag) - return changed, type_to_read, (attributes, attribute_tag, attribute_text, end_tag) + def 
read_main(self, letter, type_to_read, main_text): + changed = True + if letter == '<' and type_to_read == LineElementRead.MAIN: + type_to_read = LineElementRead.END_TAG + elif type_to_read == LineElementRead.MAIN: + main_text += letter + else: + changed = False -def read_main(letter, type_to_read, main_text): - changed = True - if letter == '<' and type_to_read == LineElementRead.MAIN: - type_to_read = LineElementRead.END_TAG - elif type_to_read == LineElementRead.MAIN: - main_text += letter - else: - changed = False + return changed, type_to_read, (main_text) - return changed, type_to_read, (main_text) + def read_end_tag(self, letter, type_to_read, start_brackets, end_tag): + changed = True + if letter == '>' and type_to_read == LineElementRead.END_TAG and start_brackets == 0: + pass + elif type_to_read == LineElementRead.END_TAG: + end_tag += letter + else: + changed = False -def read_end_tag(letter, type_to_read, start_brackets, end_tag): - changed = True - if letter == '>' and type_to_read == LineElementRead.END_TAG and start_brackets == 0: - pass - elif type_to_read == LineElementRead.END_TAG: - end_tag += letter - else: - changed = False + return changed, type_to_read, (end_tag) - return changed, type_to_read, (end_tag) + def convert_line(self, line): + tag = "" + attributes = dict() + attribute_tag = "" + attribute_text = "" + main_text = "" + end_tag = "" -def convert_line(line): - tag = "" - attributes = dict() - attribute_tag = "" - attribute_text = "" - main_text = "" - end_tag = "" + type_to_read = 0 - type_to_read = 0 + only_tag = False - only_tag = False + start_brackets = 0 - start_brackets = 0 + for letter_index in range(len(line)): + letter, next_letter, prev_letter, start_brackets = self.get_letters(line, letter_index, start_brackets) - for letter_index in range(len(line)): - letter, next_letter, prev_letter, start_brackets = get_letters(line, letter_index, start_brackets) + # First < + tag_identified, type_to_read = self.identify_tag_type(letter_index, letter, next_letter, prev_letter, type_to_read) + if tag_identified: + continue - # First < - tag_identified, type_to_read = identify_tag_type(letter_index, letter, next_letter, prev_letter, type_to_read) - if tag_identified: - continue + tag_read, type_to_read, tag_read_data = self.read_tag(letter, prev_letter, type_to_read, start_brackets, tag, line) + if tag_read: + (only_tag, tag) = tag_read_data + continue - tag_read, type_to_read, tag_read_data = read_tag(letter, prev_letter, type_to_read, start_brackets, tag, line) - if tag_read: - (only_tag, tag) = tag_read_data - continue + attributes_read, type_to_read, attributes_read_data = self.read_attributes(letter, prev_letter, type_to_read, start_brackets, attributes, attribute_tag, attribute_text, tag, end_tag) + if attributes_read: + (attributes, attribute_tag, attribute_text, end_tag) = attributes_read_data + continue - attributes_read, type_to_read, attributes_read_data = read_attributes(letter, prev_letter, type_to_read, start_brackets, attributes, attribute_tag, attribute_text, tag, end_tag) - if attributes_read: - (attributes, attribute_tag, attribute_text, end_tag) = attributes_read_data - continue + main_read, type_to_read, main_read_data = self.read_main(letter, type_to_read, main_text) + if main_read: + (main_text) = main_read_data + continue - main_read, type_to_read, main_read_data = read_main(letter, type_to_read, main_text) - if main_read: - (main_text) = main_read_data - continue + end_tag_read, type_to_read, end_tag_read_data = self.read_end_tag(letter, 
type_to_read, start_brackets, end_tag) + if end_tag_read: + (end_tag) = end_tag_read_data + continue - end_tag_read, type_to_read, end_tag_read_data = read_end_tag(letter, type_to_read, start_brackets, end_tag) - if end_tag_read: - (end_tag) = end_tag_read_data - continue + return self.categorize_line(tag, attributes, main_text, end_tag, only_tag) - return categorize_line(tag, attributes, main_text, end_tag, only_tag) + def convert_nest(self, nest, start_index): + nest_dict = dict() + curr_index = start_index -def convert_nest(nest, start_index): - nest_dict = dict() - curr_index = start_index + while curr_index < len(nest): + element = nest[curr_index] + line_type = element[self.KEY_TYPE] + line_tag = element[self.KEY_TAG] + line_text = element.get(self.KEY_TEXT, None) + line_attributes = element.get(self.KEY_ATTRIBUTES, None) - while curr_index < len(nest): - element = nest[curr_index] - line_type = element[KEY_TYPE] - line_tag = element[KEY_TAG] - line_text = element.get(KEY_TEXT, None) - line_attributes = element.get(KEY_ATTRIBUTES, None) + if line_type in [self.LINE_TYPE_START_NEST, self.LINE_TYPE_START_NEST_WITH_ATTR]: + if line_tag not in nest_dict: + nest_dict[line_tag] = list() - if line_type in [LINE_TYPE_START_NEST, LINE_TYPE_START_NEST_WITH_ATTR]: - if line_tag not in nest_dict: - nest_dict[line_tag] = list() + converted_nest, ret_index = self.convert_nest(nest, curr_index + 1) - converted_nest, ret_index = convert_nest(nest, curr_index + 1) + if line_attributes is not None: + for attribute in line_attributes: + converted_nest[attribute] = line_attributes[attribute] - if line_attributes is not None: - for attribute in line_attributes: - converted_nest[attribute] = line_attributes[attribute] + nest_dict[line_tag].append(converted_nest) - nest_dict[line_tag].append(converted_nest) + curr_index = ret_index + 1 + continue - curr_index = ret_index + 1 - continue + if line_type in [self.LINE_TYPE_ENTRY, self.LINE_TYPE_ENTRY_WITH_ATTR, self.LINE_TYPE_ENTRY_ONLY_ATTR]: + if line_tag not in nest_dict: + nest_dict[line_tag] = list() - if line_type in [LINE_TYPE_ENTRY, LINE_TYPE_ENTRY_WITH_ATTR, LINE_TYPE_ENTRY_ONLY_ATTR]: - if line_tag not in nest_dict: - nest_dict[line_tag] = list() + curr_dict = dict() - curr_dict = dict() - - if line_text is not None: - curr_dict[KEY_TEXT] = line_text + if line_text is not None: + curr_dict[self.KEY_TEXT] = line_text - if line_attributes is not None: - for attribute in line_attributes: - curr_dict[attribute] = line_attributes[attribute] + if line_attributes is not None: + for attribute in line_attributes: + curr_dict[attribute] = line_attributes[attribute] - nest_dict[line_tag].append(curr_dict) + nest_dict[line_tag].append(curr_dict) - curr_index += 1 - continue - - if line_type in [LINE_TYPE_END_NEST]: - return nest_dict, curr_index + curr_index += 1 + continue - return nest_dict, curr_index + if line_type in [self.LINE_TYPE_END_NEST]: + return nest_dict, curr_index + return nest_dict, curr_index -def check_for_class_genids(nest_dict): - genids = list() - nest_dict_classes = nest_dict.get(CLASS_TAG, list()) - for nest_class_index in range(len(nest_dict_classes)): - nest_class = nest_dict_classes[nest_class_index] - nest_subclasses = nest_class.get(SUBCLASS_TAG, list()) - for nest_subclass_index in range(len(nest_subclasses)): - nest_subclass = nest_subclasses[nest_subclass_index] - potential_genid = nest_subclass.get(NODEID_TAG, str()) - if potential_genid.startswith(GENID_PREFIX): - genids.append(potential_genid) + def divide_into_lines(self, 
input_file_name): + curr_str = "" + curr_nest = list() + curr_nest_tags = list() # Treating it as a stack + start_brackets = 0 - return genids + with open(input_file_name) as input_file: + for line in input_file: + line_str = line.strip() + for letter_index in range(len(line_str)): + letter = line_str[letter_index] + if letter == '<': + start_brackets += 1 + if letter == '>': + start_brackets -= 1 -def check_for_restriction_genids(nest_dict): - for nest_restriction in nest_dict.get(RESTRICTION_TAG, dict()): - potential_genid = nest_restriction.get(NODEID_TAG, str()) - if potential_genid.startswith(GENID_PREFIX): - return potential_genid - return None - -def extract_class_id(nest_dict): - nest_dict_classes = nest_dict.get(CLASS_TAG, list()) - # Can't have competing class_ids - assert len(nest_dict_classes) <= 1 - - for nest_class_index in range(len(nest_dict_classes)): - nest_class = nest_dict_classes[nest_class_index] - return nest_class.get(RDF_ABOUT_TAG, str()) + next_letter = "" + if letter_index + 1 < len(line_str): + next_letter = line_str[letter_index + 1] -def store_genid_nest_in_class_nest(genid, genid_nest, class_nest): - output_class_nest = class_nest - - nest_dict_classes = class_nest.get(CLASS_TAG, list()) - for nest_class_index in range(len(nest_dict_classes)): - nest_class = nest_dict_classes[nest_class_index] - nest_subclasses = nest_class.get(SUBCLASS_TAG, list()) - for nest_subclass_index in range(len(nest_subclasses)): - nest_subclass = nest_subclasses[nest_subclass_index] - potential_genid = nest_subclass.get(NODEID_TAG, str()) - if potential_genid == genid: - output_class_nest[CLASS_TAG][nest_class_index][SUBCLASS_TAG][nest_subclass_index][RESTRICTION_TAG] = genid_nest[RESTRICTION_TAG] + curr_str += letter - return output_class_nest + if letter == '>' and (next_letter == '<' or next_letter == "") and start_brackets == 0: + # Only return if nesting + line_parsed = self.convert_line(curr_str) + tag = line_parsed.get(self.KEY_TAG, None) + assert tag != self.KEY_TEXT # This could cause a massive conflict, but it is unlikely + line_type = line_parsed.get(self.KEY_TYPE, None) + attribute_keys = line_parsed.get(self.KEY_ATTRIBUTES, dict()).keys() -def triage_nest_dict(nest_dict): - genids = check_for_class_genids(nest_dict) - restriction_genid = check_for_restriction_genids(nest_dict) - class_id = extract_class_id(nest_dict) - - if len(genids) > 0: - for genid in genids: - GENID_TO_ID[genid] = class_id - ID_TO_GENIDS[class_id] = genids - GENID_REMAINING_NESTS[class_id] = nest_dict - elif restriction_genid is not None: - class_id = GENID_TO_ID.get(restriction_genid, str()) - if len(class_id) == 0: - print("WARNING WITH:", restriction_genid, "- NO CLASS_ID FOUND") - OUTPUT_NESTS.append(nest_dict) - return - class_nest = GENID_REMAINING_NESTS[class_id] - ID_TO_GENIDS[class_id].remove(restriction_genid) - updated_class_nest = store_genid_nest_in_class_nest(restriction_genid, nest_dict, class_nest) - - if len(ID_TO_GENIDS[class_id]) > 0: - GENID_REMAINING_NESTS[class_id] = updated_class_nest + if line_type != self.LINE_TYPE_IGNORE: + curr_nest.append(line_parsed) + + output_nest = (line_type in [self.LINE_TYPE_ENTRY, self.LINE_TYPE_ENTRY_WITH_ATTR, self.LINE_TYPE_ENTRY_ONLY_ATTR] and len(curr_nest_tags) == 0) + + if line_type in [self.LINE_TYPE_START_NEST, self.LINE_TYPE_START_NEST_WITH_ATTR]: + curr_nest_tags.append(tag) + elif line_type == self.LINE_TYPE_END_NEST: + popped_curr_nest_tag = curr_nest_tags.pop() + assert popped_curr_nest_tag == tag, curr_nest + if 
len(curr_nest_tags) == 0: + output_nest = True + if output_nest: + nest_dict, _ = self.convert_nest(curr_nest, 0) + + self.processing_func(nest_dict) + + curr_nest = list() + curr_nest_tag = str() + + curr_str = "" + + if curr_str != "": + # divide lines by a space + curr_str += ' ' + + +class OWLParser(): + def __init__(self, input_file_name): + self.XML_TAG = "?xml" + self.RDF_TAG = "rdf:RDF" + self.DOCTYPE_TAG = "!DOCTYPE" + self.CLASS_TAG = "owl:Class" + self.RESTRICTION_TAG = "owl:Restriction" + self.SUBCLASS_TAG = "rdfs:subClassOf" + self.NODEID_TAG = "rdf:nodeID" + self.RDF_ABOUT_TAG = "rdf:about" + self.GENID_PREFIX = "genid" + + self.skip_tags = [self.XML_TAG, self.RDF_TAG, self.DOCTYPE_TAG] + + self.ignored_attributes = ["xml:lang"] + + self.xml_parser = XMLParser(self.skip_tags, self.ignored_attributes, self.triage_nest_dict) + + self.OUTPUT_NESTS = [] + self.GENID_REMAINING_NESTS = dict() + self.GENID_TO_ID = dict() + self.ID_TO_GENIDS = dict() + + self.input_file = input_file_name + + def check_for_class_genids(self, nest_dict): + genids = list() + + nest_dict_classes = nest_dict.get(self.CLASS_TAG, list()) + for nest_class_index in range(len(nest_dict_classes)): + nest_class = nest_dict_classes[nest_class_index] + nest_subclasses = nest_class.get(self.SUBCLASS_TAG, list()) + for nest_subclass_index in range(len(nest_subclasses)): + nest_subclass = nest_subclasses[nest_subclass_index] + potential_genid = nest_subclass.get(self.NODEID_TAG, str()) + if potential_genid.startswith(self.GENID_PREFIX): + genids.append(potential_genid) + + return genids + + + def check_for_restriction_genids(self, nest_dict): + for nest_restriction in nest_dict.get(self.RESTRICTION_TAG, dict()): + potential_genid = nest_restriction.get(self.NODEID_TAG, str()) + if potential_genid.startswith(self.GENID_PREFIX): + return potential_genid + return None + + def extract_class_id(self, nest_dict): + nest_dict_classes = nest_dict.get(self.CLASS_TAG, list()) + # Can't have competing class_ids + assert len(nest_dict_classes) <= 1 + + for nest_class_index in range(len(nest_dict_classes)): + nest_class = nest_dict_classes[nest_class_index] + return nest_class.get(self.RDF_ABOUT_TAG, str()) + + def store_genid_nest_in_class_nest(self, genid, genid_nest, class_nest): + output_class_nest = class_nest + + nest_dict_classes = class_nest.get(self.CLASS_TAG, list()) + for nest_class_index in range(len(nest_dict_classes)): + nest_class = nest_dict_classes[nest_class_index] + nest_subclasses = nest_class.get(self.SUBCLASS_TAG, list()) + for nest_subclass_index in range(len(nest_subclasses)): + nest_subclass = nest_subclasses[nest_subclass_index] + potential_genid = nest_subclass.get(self.NODEID_TAG, str()) + if potential_genid == genid: + output_class_nest[self.CLASS_TAG][nest_class_index][self.SUBCLASS_TAG][nest_subclass_index][self.RESTRICTION_TAG] = genid_nest[self.RESTRICTION_TAG] + + return output_class_nest + + + def triage_nest_dict(self, nest_dict): + genids = self.check_for_class_genids(nest_dict) + restriction_genid = self.check_for_restriction_genids(nest_dict) + class_id = self.extract_class_id(nest_dict) + + if len(genids) > 0: + for genid in genids: + self.GENID_TO_ID[genid] = class_id + self.ID_TO_GENIDS[class_id] = genids + self.GENID_REMAINING_NESTS[class_id] = nest_dict + elif restriction_genid is not None: + class_id = self.GENID_TO_ID.get(restriction_genid, str()) + if len(class_id) == 0: + print("WARNING WITH:", restriction_genid, "- NO CLASS_ID FOUND") + self.OUTPUT_NESTS.append(nest_dict) + return 
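            # For orientation, a hedged example of what this matching handles:
            # RDF/XML serializes anonymous restrictions as blank nodes, e.g.
            #     <rdfs:subClassOf rdf:nodeID="genid123"/>
            # paired elsewhere in the file with
            #     <owl:Restriction rdf:nodeID="genid123">...</owl:Restriction>
            # so a class nest waits in GENID_REMAINING_NESTS until every genid
            # it references has been matched to its owl:Restriction block.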
+
+            class_nest = self.GENID_REMAINING_NESTS[class_id]
+            self.ID_TO_GENIDS[class_id].remove(restriction_genid)
+            updated_class_nest = self.store_genid_nest_in_class_nest(restriction_genid, nest_dict, class_nest)
+
+            if len(self.ID_TO_GENIDS[class_id]) > 0:
+                self.GENID_REMAINING_NESTS[class_id] = updated_class_nest
+            else:
+                self.OUTPUT_NESTS.append(updated_class_nest)
+                self.GENID_REMAINING_NESTS[class_id] = None
+        else:
+            self.OUTPUT_NESTS.append(nest_dict)
+
+
+    def parse_OWL_file(self):
+        self.xml_parser.divide_into_lines(self.input_file)
+        print(json.dumps(self.OUTPUT_NESTS, indent=4))
+
+        print("=========")
+
+        print("Remaining:")
+        for item in self.GENID_REMAINING_NESTS:
+            if self.GENID_REMAINING_NESTS[item] != None:
+                print(item)
+                print(json.dumps(self.GENID_REMAINING_NESTS[item], indent=4))
+

 if __name__ == '__main__':
     args = get_args()
@@ -457,5 +469,6 @@ def divide_into_lines(input_file_name):
     print("File:", input_file_name)
     print("Start Time:", date())
-    divide_into_lines(input_file_name)
+    owl_parser = OWLParser(input_file_name)
+    owl_parser.parse_OWL_file()
     print("End Time:", date())
\ No newline at end of file

From a55212c1f10a439a2e01401d094c25b8b1b5abc5 Mon Sep 17 00:00:00 2001
From: ecwood
Date: Mon, 12 Aug 2024 19:27:47 -0700
Subject: [PATCH 050/125] #387 added in output file writing

---
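(For context: the kg2_util JSON Lines helpers introduced below never appear in
this patch series. A minimal stand-in consistent with how they are called --
create_single_jsonlines() returns a tuple whose first element exposes
write(record), and close_single_jsonlines(info, filename) flushes the records
to disk -- might look like the sketch here; it is an illustrative assumption,
not the project's actual implementation.)

import json

def create_single_jsonlines(test_mode=False):
    # Collect records in memory; the real helper may stream to a temp file.
    class _JsonLinesWriter:
        def __init__(self):
            self.records = []

        def write(self, record):
            # One dict per call, which becomes one line of JSON Lines output.
            self.records.append(record)

    return (_JsonLinesWriter(), test_mode)

def close_single_jsonlines(info, output_file_name):
    writer, _test_mode = info
    with open(output_file_name, 'w') as out:
        for record in writer.records:
            out.write(json.dumps(record) + '\n')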
misc-tools/owlparser.py | 33 +++++++++++++++++++++------------ 1 file changed, 21 insertions(+), 12 deletions(-) diff --git a/misc-tools/owlparser.py b/misc-tools/owlparser.py index 83371543..629a8226 100644 --- a/misc-tools/owlparser.py +++ b/misc-tools/owlparser.py @@ -1,12 +1,14 @@ import json import argparse import datetime +import kg2_util def get_args(): arg_parser = argparse.ArgumentParser() arg_parser.add_argument('--test', dest='test', action="store_true", default=False) arg_parser.add_argument('inputFile', type=str) + arg_parser.add_argument('outputFile', type=str) return arg_parser.parse_args() def date(): @@ -349,7 +351,7 @@ def divide_into_lines(self, input_file_name): class OWLParser(): - def __init__(self, input_file_name): + def __init__(self, input_file_name, output_file_name): self.XML_TAG = "?xml" self.RDF_TAG = "rdf:RDF" self.DOCTYPE_TAG = "!DOCTYPE" @@ -366,12 +368,16 @@ def __init__(self, input_file_name): self.xml_parser = XMLParser(self.skip_tags, self.ignored_attributes, self.triage_nest_dict) - self.OUTPUT_NESTS = [] self.GENID_REMAINING_NESTS = dict() self.GENID_TO_ID = dict() self.ID_TO_GENIDS = dict() self.input_file = input_file_name + self.output_file_name = output_file_name + + self.output_info = create_single_jsonlines() + self.output = output_info[0] + def check_for_class_genids(self, nest_dict): genids = list() @@ -435,7 +441,9 @@ def triage_nest_dict(self, nest_dict): class_id = self.GENID_TO_ID.get(restriction_genid, str()) if len(class_id) == 0: print("WARNING WITH:", restriction_genid, "- NO CLASS_ID FOUND") - self.OUTPUT_NESTS.append(nest_dict) + + # Save to output despite not matching with an existing class + self.output.write(nest_dict) return class_nest = self.GENID_REMAINING_NESTS[class_id] self.ID_TO_GENIDS[class_id].remove(restriction_genid) @@ -444,31 +452,32 @@ def triage_nest_dict(self, nest_dict): if len(self.ID_TO_GENIDS[class_id]) > 0: self.GENID_REMAINING_NESTS[class_id] = updated_class_nest else: - self.OUTPUT_NESTS.append(updated_class_nest) + # Since all of the genids used in this class have been matched, output + self.output.write(nest_dict) self.GENID_REMAINING_NESTS[class_id] = None else: - self.OUTPUT_NESTS.append(nest_dict) + # There are no genids that need to be worked with, so just output + self.output.write(nest_dict) def parse_OWL_file(self): self.xml_parser.divide_into_lines(self.input_file) - print(json.dumps(self.OUTPUT_NESTS, indent=4)) - print("=========") - - print("Remaining:") + # Genid wasn't filled, still want to include them though for item in self.GENID_REMAINING_NESTS: if self.GENID_REMAINING_NESTS[item] != None: - print(item) - print(json.dumps(self.GENID_REMAINING_NESTS[item], indent=4)) + self.output.write(self.GENID_REMAINING_NESTS[item]) + + close_single_jsonlines(self.output_info, self.output_file_name) if __name__ == '__main__': args = get_args() input_file_name = args.inputFile + output_file_name = args.outputFile print("File:", input_file_name) print("Start Time:", date()) - owl_parser = OWLParser(input_file_name) + owl_parser = OWLParser(input_file_name, output_file_name) owl_parser.parse_OWL_file() print("End Time:", date()) \ No newline at end of file From e8d9e8803bacffa0a6344c7cb778af74240b3eb5 Mon Sep 17 00:00:00 2001 From: ecwood Date: Mon, 12 Aug 2024 19:28:25 -0700 Subject: [PATCH 051/125] #387 moving bc of kg2_util --- misc-tools/owlparser.py => owlparser.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename misc-tools/owlparser.py => owlparser.py (100%) diff --git 
a/misc-tools/owlparser.py b/owlparser.py similarity index 100% rename from misc-tools/owlparser.py rename to owlparser.py From 8d6668fffcdf3556bf6b18e3553176e2d5138b9e Mon Sep 17 00:00:00 2001 From: ecwood Date: Mon, 12 Aug 2024 20:50:31 -0700 Subject: [PATCH 052/125] #387 slightly more efficient --- owlparser.py | 237 ++++++++++++++++++++++++++------------------------- 1 file changed, 119 insertions(+), 118 deletions(-) diff --git a/owlparser.py b/owlparser.py index 629a8226..2bc87905 100644 --- a/owlparser.py +++ b/owlparser.py @@ -1,7 +1,7 @@ import json import argparse import datetime -import kg2_util +import kg2_util_thin as kg2_util def get_args(): arg_parser = argparse.ArgumentParser() @@ -41,204 +41,205 @@ def __init__(self, skip_tags, ignored_attributes, processing_func): self.KEY_TEXT = "ENTRY_TEXT" self.KEY_TYPE = "type" - - def categorize_line(self, tag, attributes, main_text, end_tag, only_tag): + # Variables for line reading + self.tag = "" + self.attributes = dict() + self.attribute_tag = "" + self.attribute_text = "" + self.main_text = "" + self.end_tag = "" + self.only_tag = False + self.start_brackets = 0 + self.line = "" + self.letter = "" + self.next_letter = "" + self.prev_letter = "" + self.type_to_read = 0 + + def categorize_line(self): # Categorize the type of line line_type = str() out = dict() # Putting "only_tag" here isn't necessarily the best idea, but I don't know what else to do with it - if tag == self.COMMENT or tag in self.OUTMOST_TAGS_SKIP or end_tag in self.OUTMOST_TAGS_SKIP or only_tag: + if self.tag == self.COMMENT or self.tag in self.OUTMOST_TAGS_SKIP or self.end_tag in self.OUTMOST_TAGS_SKIP or self.only_tag: line_type = self.LINE_TYPE_IGNORE else: - start_tag_exists = (tag != str()) - attributes_exist = (attributes != dict()) - text_exists = (main_text != str()) - end_tag_exists = (end_tag != str()) + start_tag_exists = (self.tag != str()) + attributes_exist = (self.attributes != dict()) + text_exists = (self.main_text != str()) + end_tag_exists = (self.end_tag != str()) if start_tag_exists: if attributes_exist: if text_exists: line_type = self.LINE_TYPE_ENTRY_WITH_ATTR - out[self.KEY_TAG] = tag - out[self.KEY_ATTRIBUTES] = attributes - out[self.KEY_TEXT] = main_text + out[self.KEY_TAG] = self.tag + out[self.KEY_ATTRIBUTES] = self.attributes + out[self.KEY_TEXT] = self.main_text elif end_tag_exists: line_type = self.LINE_TYPE_ENTRY_ONLY_ATTR - out[self.KEY_TAG] = tag - out[self.KEY_ATTRIBUTES] = attributes + out[self.KEY_TAG] = self.tag + out[self.KEY_ATTRIBUTES] = self.attributes else: line_type = self.LINE_TYPE_START_NEST_WITH_ATTR - out[self.KEY_TAG] = tag - out[self.KEY_ATTRIBUTES] = attributes + out[self.KEY_TAG] = self.tag + out[self.KEY_ATTRIBUTES] = self.attributes elif text_exists: line_type = self.LINE_TYPE_ENTRY - out[self.KEY_TAG] = tag - out[self.KEY_TEXT] = main_text + out[self.KEY_TAG] = self.tag + out[self.KEY_TEXT] = self.main_text else: line_type = self.LINE_TYPE_START_NEST - out[self.KEY_TAG] = tag + out[self.KEY_TAG] = self.tag elif end_tag_exists: line_type = self.LINE_TYPE_END_NEST - out[self.KEY_TAG] = end_tag + out[self.KEY_TAG] = self.end_tag out[self.KEY_TYPE] = line_type return out - def get_letters(self, line, letter_index, start_brackets): - letter = line[letter_index] - next_letter = "" - prev_letter = "" - if letter_index + 1 < len(line): - next_letter = line[letter_index + 1] + def get_letters(self, letter_index): + self.letter = self.line[letter_index] + self.next_letter = "" + self.prev_letter = "" + if letter_index 
+ 1 < len(self.line): + self.next_letter = self.line[letter_index + 1] if letter_index - 1 >= 0: - prev_letter = line[letter_index - 1] - - if letter == '<': - start_brackets += 1 - if letter == '>': - start_brackets -= 1 + self.prev_letter = self.line[letter_index - 1] - return letter, next_letter, prev_letter, start_brackets + if self.letter == '<': + self.start_brackets += 1 + if self.letter == '>': + self.start_brackets -= 1 - def identify_tag_type(self, letter_index, letter, next_letter, prev_letter, type_to_read): + def identify_tag_type(self, letter_index): changed = True - if letter == '<' and letter_index == 0: - if next_letter != '/': - type_to_read = LineElementRead.TAG - elif letter == '/' and prev_letter == '<': - type_to_read = LineElementRead.END_TAG + if self.letter == '<' and letter_index == 0: + if self.next_letter != '/': + self.type_to_read = LineElementRead.TAG + elif self.letter == '/' and self.prev_letter == '<': + self.type_to_read = LineElementRead.END_TAG else: changed = False - return changed, type_to_read + return changed - def read_tag(self, letter, prev_letter, type_to_read, start_brackets, tag, line): - only_tag = False + def read_tag(self): changed = True - if letter == ' ' and type_to_read == LineElementRead.TAG: - type_to_read = LineElementRead.ATTRIBUTE_TAG - elif letter == '>' and type_to_read == LineElementRead.TAG and start_brackets == 0: - type_to_read = LineElementRead.MAIN + if self.letter == ' ' and self.type_to_read == LineElementRead.TAG: + self.type_to_read = LineElementRead.ATTRIBUTE_TAG + elif self.letter == '>' and self.type_to_read == LineElementRead.TAG and self.start_brackets == 0: + self.type_to_read = LineElementRead.MAIN - if prev_letter == '/': - print("Warning - strange tag, ignoring", line) - only_tag = True - elif type_to_read == LineElementRead.TAG: - tag += letter + if self.prev_letter == '/': + print("Warning - strange tag, ignoring", self.line) + self.only_tag = True + elif self.type_to_read == LineElementRead.TAG: + self.tag += self.letter else: changed = False - return changed, type_to_read, (only_tag, tag) - + return changed - def store_attribute(self, attributes, attribute_tag, attribute_text): - if attribute_tag not in self.IGNORED_ATTRIBUTES: - attributes[attribute_tag] = attribute_text.strip('/').strip('"') - attribute_tag = "" - attribute_text = "" - return attributes, attribute_tag, attribute_text + def store_attribute(self): + if self.attribute_tag not in self.IGNORED_ATTRIBUTES: + self.attributes[self.attribute_tag] = self.attribute_text.strip('/').strip('"') + self.attribute_tag = "" + self.attribute_text = "" - def read_attributes(self, letter, prev_letter, type_to_read, start_brackets, attributes, attribute_tag, attribute_text, tag, end_tag): + def read_attributes(self): changed = True - start_reading_attributes = (type_to_read == LineElementRead.ATTRIBUTE_TAG or type_to_read == LineElementRead.ATTRIBUTE_TEXT) + start_reading_attributes = (self.type_to_read == LineElementRead.ATTRIBUTE_TAG or self.type_to_read == LineElementRead.ATTRIBUTE_TEXT) - if letter == '>' and start_reading_attributes and start_brackets == 0: - type_to_read = LineElementRead.MAIN - attributes, attribute_tag, attribute_text = self.store_attribute(attributes, attribute_tag, attribute_text) + if self.letter == '>' and start_reading_attributes and self.start_brackets == 0: + self.type_to_read = LineElementRead.MAIN + + self.store_attribute() - if prev_letter == '/': - end_tag = tag + if self.prev_letter == '/': + self.end_tag = self.tag elif 
start_reading_attributes: - if letter == '=' and type_to_read == LineElementRead.ATTRIBUTE_TAG: - type_to_read = LineElementRead.ATTRIBUTE_TEXT - elif type_to_read == LineElementRead.ATTRIBUTE_TAG: - attribute_tag += letter - elif letter == ' ' and type_to_read == LineElementRead.ATTRIBUTE_TEXT: - type_to_read = LineElementRead.ATTRIBUTE_TAG - attributes, attribute_tag, attribute_text = self.store_attribute(attributes, attribute_tag, attribute_text) - elif type_to_read == LineElementRead.ATTRIBUTE_TEXT: - attribute_text += letter + if self.letter == '=' and self.type_to_read == LineElementRead.ATTRIBUTE_TAG: + self.type_to_read = LineElementRead.ATTRIBUTE_TEXT + elif self.type_to_read == LineElementRead.ATTRIBUTE_TAG: + self.attribute_tag += self.letter + elif self.letter == ' ' and self.type_to_read == LineElementRead.ATTRIBUTE_TEXT: + self.type_to_read = LineElementRead.ATTRIBUTE_TAG + self.store_attribute() + elif self.type_to_read == LineElementRead.ATTRIBUTE_TEXT: + self.attribute_text += self.letter else: changed = False - return changed, type_to_read, (attributes, attribute_tag, attribute_text, end_tag) + return changed - def read_main(self, letter, type_to_read, main_text): + def read_main(self): changed = True - if letter == '<' and type_to_read == LineElementRead.MAIN: - type_to_read = LineElementRead.END_TAG - elif type_to_read == LineElementRead.MAIN: - main_text += letter + if self.letter == '<' and self.type_to_read == LineElementRead.MAIN: + self.type_to_read = LineElementRead.END_TAG + elif self.type_to_read == LineElementRead.MAIN: + self.main_text += self.letter else: changed = False - return changed, type_to_read, (main_text) + return changed - def read_end_tag(self, letter, type_to_read, start_brackets, end_tag): + def read_end_tag(self): changed = True - if letter == '>' and type_to_read == LineElementRead.END_TAG and start_brackets == 0: + if self.letter == '>' and self.type_to_read == LineElementRead.END_TAG and self.start_brackets == 0: pass - elif type_to_read == LineElementRead.END_TAG: - end_tag += letter + elif self.type_to_read == LineElementRead.END_TAG: + self.end_tag += self.letter else: changed = False - return changed, type_to_read, (end_tag) + return changed - def convert_line(self, line): - tag = "" - attributes = dict() - attribute_tag = "" - attribute_text = "" - main_text = "" - end_tag = "" + def convert_line(self): + self.tag = "" + self.attributes = dict() + self.attribute_tag = "" + self.attribute_text = "" + self.main_text = "" + self.end_tag = "" - type_to_read = 0 + self.type_to_read = 0 - only_tag = False + self.only_tag = False - start_brackets = 0 + self.start_brackets = 0 - for letter_index in range(len(line)): - letter, next_letter, prev_letter, start_brackets = self.get_letters(line, letter_index, start_brackets) + for letter_index in range(len(self.line)): + self.get_letters(letter_index) # First < - tag_identified, type_to_read = self.identify_tag_type(letter_index, letter, next_letter, prev_letter, type_to_read) - if tag_identified: + if self.identify_tag_type(letter_index): continue - tag_read, type_to_read, tag_read_data = self.read_tag(letter, prev_letter, type_to_read, start_brackets, tag, line) - if tag_read: - (only_tag, tag) = tag_read_data + if self.read_tag(): continue - attributes_read, type_to_read, attributes_read_data = self.read_attributes(letter, prev_letter, type_to_read, start_brackets, attributes, attribute_tag, attribute_text, tag, end_tag) - if attributes_read: - (attributes, attribute_tag, attribute_text, end_tag) 
= attributes_read_data + if self.read_attributes(): continue - main_read, type_to_read, main_read_data = self.read_main(letter, type_to_read, main_text) - if main_read: - (main_text) = main_read_data + if self.read_main(): continue - end_tag_read, type_to_read, end_tag_read_data = self.read_end_tag(letter, type_to_read, start_brackets, end_tag) - if end_tag_read: - (end_tag) = end_tag_read_data + if self.read_end_tag(): continue - return self.categorize_line(tag, attributes, main_text, end_tag, only_tag) + return self.categorize_line() def convert_nest(self, nest, start_index): @@ -316,7 +317,8 @@ def divide_into_lines(self, input_file_name): if letter == '>' and (next_letter == '<' or next_letter == "") and start_brackets == 0: # Only return if nesting - line_parsed = self.convert_line(curr_str) + self.line = curr_str + line_parsed = self.convert_line() tag = line_parsed.get(self.KEY_TAG, None) assert tag != self.KEY_TEXT # This could cause a massive conflict, but it is unlikely @@ -375,9 +377,8 @@ def __init__(self, input_file_name, output_file_name): self.input_file = input_file_name self.output_file_name = output_file_name - self.output_info = create_single_jsonlines() - self.output = output_info[0] - + self.output_info = kg2_util.create_single_jsonlines() + self.output = self.output_info[0] def check_for_class_genids(self, nest_dict): genids = list() @@ -468,7 +469,7 @@ def parse_OWL_file(self): if self.GENID_REMAINING_NESTS[item] != None: self.output.write(self.GENID_REMAINING_NESTS[item]) - close_single_jsonlines(self.output_info, self.output_file_name) + kg2_util.close_single_jsonlines(self.output_info, self.output_file_name) if __name__ == '__main__': From b377ae97127ed8b023c00e4ad04c959741a84f9a Mon Sep 17 00:00:00 2001 From: ecwood Date: Thu, 15 Aug 2024 16:33:11 -0700 Subject: [PATCH 053/125] #387 loads multiple files now --- owlparser.py | 40 +++++++++++++++++++++++++++++----------- 1 file changed, 29 insertions(+), 11 deletions(-) diff --git a/owlparser.py b/owlparser.py index 2bc87905..1973009a 100644 --- a/owlparser.py +++ b/owlparser.py @@ -15,6 +15,7 @@ def date(): return datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") class LineElementRead(): + NONE = 0 TAG = 1 ATTRIBUTE_TAG = 2 ATTRIBUTE_TEXT = 3 @@ -54,7 +55,7 @@ def __init__(self, skip_tags, ignored_attributes, processing_func): self.letter = "" self.next_letter = "" self.prev_letter = "" - self.type_to_read = 0 + self.type_to_read = LineElementRead.NONE def categorize_line(self): # Categorize the type of line @@ -214,7 +215,7 @@ def convert_line(self): self.main_text = "" self.end_tag = "" - self.type_to_read = 0 + self.type_to_read = LineElementRead.NONE self.only_tag = False @@ -353,7 +354,7 @@ def divide_into_lines(self, input_file_name): class OWLParser(): - def __init__(self, input_file_name, output_file_name): + def __init__(self, input_files, output_file_name): self.XML_TAG = "?xml" self.RDF_TAG = "rdf:RDF" self.DOCTYPE_TAG = "!DOCTYPE" @@ -374,7 +375,7 @@ def __init__(self, input_file_name, output_file_name): self.GENID_TO_ID = dict() self.ID_TO_GENIDS = dict() - self.input_file = input_file_name + self.input_files = input_files self.output_file_name = output_file_name self.output_info = kg2_util.create_single_jsonlines() @@ -462,23 +463,40 @@ def triage_nest_dict(self, nest_dict): def parse_OWL_file(self): - self.xml_parser.divide_into_lines(self.input_file) + for input_file in self.input_files: + print("Reading:", input_file, "starting at", date()) + self.xml_parser.divide_into_lines(input_file) - # 
Genid wasn't filled, still want to include them though - for item in self.GENID_REMAINING_NESTS: - if self.GENID_REMAINING_NESTS[item] != None: - self.output.write(self.GENID_REMAINING_NESTS[item]) + # Genid wasn't filled, still want to include them though + for item in self.GENID_REMAINING_NESTS: + if self.GENID_REMAINING_NESTS[item] != None: + self.output.write(self.GENID_REMAINING_NESTS[item]) + + # Refresh everything for the next file + self.GENID_REMAINING_NESTS = dict() + self.GENID_TO_ID = dict() + self.ID_TO_GENIDS = dict() kg2_util.close_single_jsonlines(self.output_info, self.output_file_name) +def identify_input_files(ont_load_inventory): + input_files = list() + for item in ont_load_inventory: + input_files.append(item['file']) + + return input_files + if __name__ == '__main__': args = get_args() input_file_name = args.inputFile output_file_name = args.outputFile - print("File:", input_file_name) + ont_load_inventory = kg2_util.safe_load_yaml_from_string(kg2_util.read_file_to_string(input_file_name)) + input_files = identify_input_files(ont_load_inventory) + + print("Files:", input_files) print("Start Time:", date()) - owl_parser = OWLParser(input_file_name, output_file_name) + owl_parser = OWLParser(input_files, output_file_name) owl_parser.parse_OWL_file() print("End Time:", date()) \ No newline at end of file From e73585507859016e5f55067f0bc6d2b42d528cdb Mon Sep 17 00:00:00 2001 From: ecwood Date: Sat, 17 Aug 2024 01:29:30 -0700 Subject: [PATCH 054/125] #387 save the name of the output file as well --- owlparser.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/owlparser.py b/owlparser.py index 1973009a..3f271dd4 100644 --- a/owlparser.py +++ b/owlparser.py @@ -1,7 +1,7 @@ import json import argparse import datetime -import kg2_util_thin as kg2_util +import kg2_util def get_args(): arg_parser = argparse.ArgumentParser() @@ -365,6 +365,8 @@ def __init__(self, input_files, output_file_name): self.RDF_ABOUT_TAG = "rdf:about" self.GENID_PREFIX = "genid" + self.OWL_SOURCE_KEY = "owl_source" + self.skip_tags = [self.XML_TAG, self.RDF_TAG, self.DOCTYPE_TAG] self.ignored_attributes = ["xml:lang"] @@ -429,6 +431,13 @@ def store_genid_nest_in_class_nest(self, genid, genid_nest, class_nest): return output_class_nest + def write_to_output(self, output_dict, source_file): + output_dict[self.OWL_SOURCE_KEY] = source_file + self.output.write(output_dict) + + return + + def triage_nest_dict(self, nest_dict): genids = self.check_for_class_genids(nest_dict) restriction_genid = self.check_for_restriction_genids(nest_dict) @@ -445,7 +454,7 @@ def triage_nest_dict(self, nest_dict): print("WARNING WITH:", restriction_genid, "- NO CLASS_ID FOUND") # Save to output despite not matching with an existing class - self.output.write(nest_dict) + self.write_to_output(nest_dict, self.input_file) return class_nest = self.GENID_REMAINING_NESTS[class_id] self.ID_TO_GENIDS[class_id].remove(restriction_genid) @@ -464,13 +473,14 @@ def triage_nest_dict(self, nest_dict): def parse_OWL_file(self): for input_file in self.input_files: + self.input_file = input_file print("Reading:", input_file, "starting at", date()) self.xml_parser.divide_into_lines(input_file) # Genid wasn't filled, still want to include them though for item in self.GENID_REMAINING_NESTS: if self.GENID_REMAINING_NESTS[item] != None: - self.output.write(self.GENID_REMAINING_NESTS[item]) + self.write_to_output(self.GENID_REMAINING_NESTS[item], self.input_file) # Refresh everything for the next file 
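As an aside on the refactor above: patches 052-053 move the parser's per-line state onto `self`, so each `read_*` step returns only a bool and `convert_line` becomes a chain of `if ...: continue` dispatches. A minimal sketch of that pattern, with invented names that only mirror the shape of the repo code:

class TinyReader:
    # State lives on the instance; each step reports only whether it consumed input.
    def __init__(self, line):
        self.line = line
        self.pos = 0
        self.tag = ""

    def read_tag(self):
        if self.pos < len(self.line) and self.line[self.pos] != '>':
            self.tag += self.line[self.pos]
            self.pos += 1
            return True
        return False

    def parse(self):
        while self.pos < len(self.line):
            if self.read_tag():
                continue
            self.pos += 1  # no step claimed this character; skip it
        return self.tag

print(TinyReader("rdf:RDF>").parse())  # prints: rdf:RDF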
self.GENID_REMAINING_NESTS = dict() From 462c0bfec45f99ff1cd0652a020d23386790d956 Mon Sep 17 00:00:00 2001 From: ecwood Date: Fri, 23 Aug 2024 12:04:23 -0700 Subject: [PATCH 055/125] #387 start of processing the ontologies JSON Lines file --- ontologies_jsonl_to_kg_jsonl.py | 147 ++++++++++++++++++++++++++++++++ 1 file changed, 147 insertions(+) create mode 100644 ontologies_jsonl_to_kg_jsonl.py diff --git a/ontologies_jsonl_to_kg_jsonl.py b/ontologies_jsonl_to_kg_jsonl.py new file mode 100644 index 00000000..12784bc8 --- /dev/null +++ b/ontologies_jsonl_to_kg_jsonl.py @@ -0,0 +1,147 @@ +import argparse +import kg2_util +import json + +OWL_CLASS_TAG = "owl:Class" +SUBCLASS_TAG = "rdfs:subClassOf" +DESCRIPTION_TAG = "obo:IAO_0000115" +XREF_TAG = "oboInOwl:hasDbXref" +ID_TAG = "rdf:about" +NAME_TAG = "rdfs:label" +EXACT_MATCH_TAG = "skos:exactMatch" +COMMENT_TAG = "rdfs:comment" + +TEXT_KEY = "ENTRY_TEXT" +RESOURCE_KEY = "rdf:resource" + +OWL_SOURCE_KEY = "owl_source" + +KEYS_DICT = dict() + +COMMENT_PREFIX = "COMMENTS: " + +CLASSES_DICT = dict() + +def get_args(): + arg_parser = argparse.ArgumentParser() + arg_parser.add_argument('--test', dest='test', + action="store_true", default=False) + arg_parser.add_argument('inputFile', type=str) + arg_parser.add_argument('outputFile', type=str) + return arg_parser.parse_args() + +def process_ontology_item(ontology_item): + source = ontology_item.get(OWL_SOURCE_KEY, str()) + for owl_class in ontology_item.get(OWL_CLASS_TAG, list()): + # Typically genid classes which don't neatly map onto the KG2 schema + if ID_TAG not in owl_class: + continue + # TODO: MAP THIS HERE, since not all sources use same IRIs for the same nodes + node_id = owl_class.get(ID_TAG, str()) + + # Configure the name + name_list = [name.get(TEXT_KEY, None) for name in owl_class.get("rdfs:label", dict()) if TEXT_KEY in name] + if len(name_list) == 0: + continue + + # Configure the description + description_list = list() + description_list += [description.get(TEXT_KEY, None) for description in owl_class.get("obo:IAO_0000115", list()) if (TEXT_KEY in description)] + description_list += [COMMENT_PREFIX + description.get(TEXT_KEY, str()) for description in owl_class.get("rdfs:comment", list()) if (TEXT_KEY in description)] + description_list += [description.get(TEXT_KEY, None) for description in owl_class.get("obo:UBPROP_0000001", list()) if (TEXT_KEY in description)] + description_list += [description.get(TEXT_KEY, None) for description in owl_class.get("obo:UBPROP_0000005", list()) if (TEXT_KEY in description)] + description_list += [description.get(TEXT_KEY, None) for description in owl_class.get("efo1:source_description", list()) if (TEXT_KEY in description)] + + # Configure the biological sequence + has_biological_sequence = dict() + has_biological_sequence['formula'] = [biological_sequence.get(TEXT_KEY, None) for biological_sequence in owl_class.get("chebi:formula", list()) if TEXT_KEY in biological_sequence] + has_biological_sequence['smiles'] = [biological_sequence.get(TEXT_KEY, None) for biological_sequence in owl_class.get("chebi:smiles", list()) if TEXT_KEY in biological_sequence] + has_biological_sequence['inchi'] = [biological_sequence.get(TEXT_KEY, None) for biological_sequence in owl_class.get("chebi:inchi", list()) if TEXT_KEY in biological_sequence] + has_biological_sequence['inchikey'] = [biological_sequence.get(TEXT_KEY, None) for biological_sequence in owl_class.get("chebi:inchikey", list()) if TEXT_KEY in biological_sequence] + + # Extract edge triples + edges_list 
= list() + + for edge_type in ["obo:RO_0002175", "obo:RO_0002161", "obo:RO_0002604", "obo:RO_0002171", "obo:RO_0002174", "obo:RO_0002475", "obo:RO_0001900", "obo:RO_0004050"]: + for edge in owl_class.get(edge_type, list()): + if RESOURCE_KEY in edge: + edges_list.append((edge_type, edge.get(RESOURCE_KEY, None))) + + for edge_type in ["oboInOwl:hasDbXref"]: + for edge in owl_class.get(edge_type, list()): + if TEXT_KEY in edge: + edges_list.append((edge_type, edge.get(TEXT_KEY, None))) + + restriction_edges = list() + restriction_edges += [(edge, "rdfs:subClassOf") for edge in owl_class.get("rdfs:subClassOf", list())] + for equiv in owl_class.get("owl:equivalentClass", list()): + for mini_class in equiv.get("owl:Class", list()): + for edge in mini_class.get("owl:intersectionOf", list()): + restriction_edges.append((edge, "owl:equivalentClass")) + + for (edge, general_edge_type) in restriction_edges: + for restriction in edge.get("owl:Restriction", list()): + edge_type = restriction.get("owl:onProperty", list()) + edge_object = restriction.get("owl:someValuesFrom", list()) + if len(edge_type) != 1: + assert len(edge_type) <= 1, edge + continue + if len(edge_object) != 1: + assert len(edge_object) <= 1, edge + continue + edge_type = edge_type[0].get(RESOURCE_KEY, None) + edge_object = edge_object[0].get(RESOURCE_KEY, None) + + if edge_type != None and edge_object != None: + edges_list.append((edge_type, edge_object)) + + if RESOURCE_KEY in edge: + edges_list.append((general_edge_type, edge.get(RESOURCE_KEY, None))) + + + # node_id = owl_class.get(ID_TAG, list()) + + # superclasses = [superclass.get(RESOURCE_KEY, str()) for superclass in owl_class.get(SUBCLASS_TAG, list())] + + # # Also query for comments? + # # Descriptions appear to be additive in current KG2 + # descriptions = owl_class.get(DESCRIPTION_TAG, list()) + # assert len(descriptions) <= 1 + # description = str() + # for element in descriptions: + # description += element[TEXT_KEY] + + # xrefs = [xref[TEXT_KEY] for xref in owl_class.get(XREF_TAG, list())] + # for element in owl_class.get(XREF_TAG, list()): + # xrefs.append(element[TEXT_KEY]) + + # exact_matches = [exact_match[RESOURCE_KEY] for exact_match in owl_class.get(EXACT_MATCH_TAG, list())] + + # names = owl_class.get(NAME_TAG, list()) + # assert len(names) <= 1, ontology_item + # name = str() + # for element in names: + # name += element[TEXT_KEY] + + # node = {"id": node_id, "superclasses": superclasses, "description": description, "xrefs": xrefs, "name": name, "exact_matches": exact_matches} + + node = {"id": node_id, "description_list": description_list, "name": name_list, "source": source, "has_biological_sequence": has_biological_sequence, "edges": edges_list} + print(json.dumps(node, indent=4)) + + +if __name__ == '__main__': + args = get_args() + input_file_name = args.inputFile + output_file_name = args.outputFile + + input_read_jsonlines_info = kg2_util.start_read_jsonlines(input_file_name) + input_data = input_read_jsonlines_info[0] + + owl_class_count = 0 + for ontology_item in input_data: + process_ontology_item(ontology_item) + + # print("OWL Classes:", owl_class_count) + # for key in KEYS_DICT: + # KEYS_DICT[key] = KEYS_DICT[key] / owl_class_count + # print(json.dumps(KEYS_DICT, indent=4, sort_keys=True)) \ No newline at end of file From b12e98409d1d7ef739315c747e258c91bf38a145 Mon Sep 17 00:00:00 2001 From: ecwood Date: Mon, 26 Aug 2024 23:04:24 -0700 Subject: [PATCH 056/125] #387 additional weird sources due to FOODON --- maps/curies-to-urls-map.yaml | 4 ++++ 
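The converter introduced in patch 055 walks dictionaries in the shape the OWL parser emits: each tag maps to a list of small dicts, and scalar values sit under the ENTRY_TEXT key. A short illustration of that access pattern, using a made-up CHEBI entry rather than real build output:

TEXT_KEY = "ENTRY_TEXT"

# Invented example of one parsed owl:Class entry (list-of-dicts shape).
owl_class = {
    "rdf:about": "http://purl.obolibrary.org/obo/CHEBI_15377",
    "rdfs:label": [{TEXT_KEY: "water"}],
    "obo:IAO_0000115": [{TEXT_KEY: "An oxygen hydride consisting of an oxygen attached to two hydrogens."}],
}

name_list = [name[TEXT_KEY] for name in owl_class.get("rdfs:label", list()) if TEXT_KEY in name]
description_list = [d[TEXT_KEY] for d in owl_class.get("obo:IAO_0000115", list()) if TEXT_KEY in d]
print(name_list, description_list)  # ['water'] ['An oxygen hydride ...']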
1 file changed, 4 insertions(+) diff --git a/maps/curies-to-urls-map.yaml b/maps/curies-to-urls-map.yaml index 0e4ba4cb..7a57b610 100644 --- a/maps/curies-to-urls-map.yaml +++ b/maps/curies-to-urls-map.yaml @@ -5,6 +5,8 @@ use_for_bidirectional_mapping: - AEO: http://purl.obolibrary.org/obo/AEO_ + - + AGRO: http://purl.obolibrary.org/obo/AGRO_ - AIR: https://identifiers.org/umls/AIR/ - @@ -521,6 +523,8 @@ use_for_bidirectional_mapping: ZFIN: "https://identifiers.org/zfin:" ##########################################3 use_for_contraction_only: + - + AGRO: "&obo;AGRO_" - AraPort: https://www.araport.org/locus/ - From e9d6d68b80d4a14cea7658930771b0165d288d68 Mon Sep 17 00:00:00 2001 From: ecwood Date: Mon, 26 Aug 2024 23:27:22 -0700 Subject: [PATCH 057/125] #387 more additional weird source links due to FOODON --- maps/curies-to-urls-map.yaml | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/maps/curies-to-urls-map.yaml b/maps/curies-to-urls-map.yaml index 7a57b610..647ce2c8 100644 --- a/maps/curies-to-urls-map.yaml +++ b/maps/curies-to-urls-map.yaml @@ -47,6 +47,8 @@ use_for_bidirectional_mapping: CAID: 'http://reg.clinicalgenome.org/redmine/projects/registry/genboree_registry/by_caid?caid=' - CARO: http://purl.obolibrary.org/obo/CARO_ + - + CDNO: http://purl.obolibrary.org/obo/CDNO_ - CEPH: http://purl.obolibrary.org/obo/CEPH_ - @@ -77,6 +79,8 @@ use_for_bidirectional_mapping: CLO: http://purl.obolibrary.org/obo/CLO_ - COAR_RESOURCE: 'http://purl.org/coar/resource_type/' + - + COB: http://purl.obolibrary.org/obo/COB_ - COG: 'https://www.ncbi.nlm.nih.gov/research/cog-project/' - @@ -525,38 +529,56 @@ use_for_bidirectional_mapping: use_for_contraction_only: - AGRO: "&obo;AGRO_" + - + apollo: "&obo;APOLLO_" - AraPort: https://www.araport.org/locus/ - AraPort: https://bar.utoronto.ca/thalemine/portal.do?externalids= + - + ARO: "&obo;ARO_" - BAO: http://www.bioassayontology.org/bao# - BFO: http://www.ifomis.org/bfo/1.1/snap# - BFO: http://www.ifomis.org/bfo/1.1/span# + - + BFO: "&obo;BFO_" - biolink: https://w3id.org/biolink/biolinkml/meta/ - biolink: https://w3id.org/biolink/biolink-model - BTO: http://purl.obolibrary.org/obo/bto# + - + CDNO: "&obo;CDNO_" - CHEBI: http://purl.obolibrary.org/obo/chebi/ - CHEBI: http://purl.obolibrary.org/obo/chebi# + - + CHEBI: "&obo;CHEBI_" - CHEMBL.COMPOUND: https://www.ebi.ac.uk/chembl/compound/inspect/ + - + CHMO: "&obo;CHMO_" - CL: http://purl.obolibrary.org/obo/cl# + - + COB: "&obo;COB_" - CPT: http://purl.bioontology.org/ontology/HCPT/ - DDANAT: http://purl.obolibrary.org/obo/ddanat# - DGIdb: https://www.dgidb.org/interaction_types/ + - + DOID: "&obo;DOID_" - DRUGBANK: http://purl.bioontology.org/ontology/DRUGBANK/ + - + ECOCORE: "&obo;ECOCORE_" - ecogene: http://www.ecogene.org/gene/ - @@ -567,6 +589,10 @@ use_for_contraction_only: EFO: http://www.ebi.ac.uk/efoIri - ENSEMBL: http://www.ensembl.org/id/ + - + ENVO: "&obo;ENVO_" + - + EPO: "&obo;EPO_" - FBgn: https://flybase.org/reports/FBgn - From 61c3f069cdce612650faaa009821a40a78674c5d Mon Sep 17 00:00:00 2001 From: ecwood Date: Mon, 26 Aug 2024 23:39:46 -0700 Subject: [PATCH 058/125] #387 even more additional weird source links due to FOODON --- maps/curies-to-urls-map.yaml | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/maps/curies-to-urls-map.yaml b/maps/curies-to-urls-map.yaml index 647ce2c8..adf646a7 100644 --- a/maps/curies-to-urls-map.yaml +++ b/maps/curies-to-urls-map.yaml @@ -593,16 +593,30 @@ use_for_contraction_only: ENVO: "&obo;ENVO_" - 
EPO: "&obo;EPO_" + - + ExO: "&obo;ExO_" + - + FAO: "&obo;FAO_" - FBgn: https://flybase.org/reports/FBgn - FMA: http://purl.obolibrary.org/obo/fma# - FMA: http://purl.bioontology.org/ontology/FMA/ + - + FOODON: "&obo;FOODON_" + - + GAZ: "&obo;GAZ_" + - + GENEPIO: "&obo;GENEPIO_" + - + GO: "&obo;GO_" - GO: "http://purl.bioontology.org/ontology/GO/GO%3A" - GO: http://purl.bioontology.org/ontology/GO/ + - + HANCESTRO: "&obo;HANCESTRO_" - HCPCS: https://hcpcs.codes/a-codes/ - @@ -617,6 +631,8 @@ use_for_contraction_only: HP: "http://purl.bioontology.org/ontology/HPO/HP%3A" - HP: http://purl.bioontology.org/ontology/HPO/ + - + IAO: "&obo;IAO_" - ICD10: http://purl.bioontology.org/ontology/ICD10/ - @@ -633,6 +649,8 @@ use_for_contraction_only: ICD9: http://purl.bioontology.org/ontology/ICD9CM/ - ICD9: http://purl.obolibrary.org/obo/ICD9_ + - + IDO: "&obo;IDO_" - KEGG: http://purl.obolibrary.org/obo/KEGG_ - @@ -653,6 +671,8 @@ use_for_contraction_only: MESH: http://identifiers.org/mesh/ - MGI: "http://www.informatics.jax.org/marker/MGI:" + - + MI: "&obo;MI_" - miRBase: "https://identifiers.org/mirbase:" - @@ -665,6 +685,10 @@ use_for_contraction_only: NCBITaxon: http://purl.bioontology.org/ontology/NCBITAXON/ - NCBITaxon: http://purl.obolibrary.org/obo/ncbitaxon# + - + NCBITaxon: "&obo;NCBITaxon" + - + NCBITaxon: "&obo;NCBITaxon#" - NCIT: http://purl.bioontology.org/ontology/NCI/ - @@ -681,6 +705,8 @@ use_for_contraction_only: NCIT: http://purl.bioontology.org/ontology/NCI_CTCAE_5/ - NCIT: http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl# + - + NCIT: "&obo;NCIT_" - OBO: http://purl.obolibrary.org/obo# - From da66493092bb3ba90768654ad340926374aa08c8 Mon Sep 17 00:00:00 2001 From: ecwood Date: Mon, 26 Aug 2024 23:55:40 -0700 Subject: [PATCH 059/125] #387 final additional weird source links due to FOODON --- maps/curies-to-urls-map.yaml | 38 ++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/maps/curies-to-urls-map.yaml b/maps/curies-to-urls-map.yaml index adf646a7..9ce1b30d 100644 --- a/maps/curies-to-urls-map.yaml +++ b/maps/curies-to-urls-map.yaml @@ -179,6 +179,8 @@ use_for_bidirectional_mapping: FBrf: https://flybase.org/reports/FBrf - FIX: http://purl.obolibrary.org/obo/FIX_ + - + FLOPO: http://purl.obolibrary.org/obo/FLOPO_ - FLU: http://purl.obolibrary.org/obo/FLU_ - @@ -381,6 +383,8 @@ use_for_bidirectional_mapping: OMRSE: http://purl.obolibrary.org/obo/OMRSE_ - OncoTree: http://purl.obolibrary.org/obo/ONCOTREE_ + - + ONS: http://purl.obolibrary.org/obo/ONS_ - OPL: http://purl.obolibrary.org/obo/OPL_ - @@ -445,6 +449,8 @@ use_for_bidirectional_mapping: RXNORM: http://purl.bioontology.org/ontology/RXNORM/ - SEMMEDDB: https://skr3.nlm.nih.gov/SemMedDB + - + SEPIO: http://purl.obolibrary.org/obo/SEPIO_ - sgd: "https://identifiers.org/sgd:" - @@ -599,6 +605,8 @@ use_for_contraction_only: FAO: "&obo;FAO_" - FBgn: https://flybase.org/reports/FBgn + - + FLOPO: "&obo;FLOPO_" - FMA: http://purl.obolibrary.org/obo/fma# - @@ -707,6 +715,8 @@ use_for_contraction_only: NCIT: http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl# - NCIT: "&obo;NCIT_" + - + OBI: "&obo;OBI_" - OBO: http://purl.obolibrary.org/obo# - @@ -717,20 +727,38 @@ use_for_contraction_only: OMIM: http://purl.bioontology.org/ontology/OMIM/ - OMIM: http://identifiers.org/omim/ + - + OMIT: "&obo;OMIT_" - OMOP: http://purl.obolibrary.org/obo/COHD_ + - + OMP: "&obo;OMP_" + - + ONS: "&obo;ONS_" - orphanet: http://purl.bioontology.org/ontology/ORDO/ - orphanet: 
https://data.bioontology.org/ontologies/ORDO/submissions/27/download?apikey=8b5b7825-538d-40e0-9e9e-5ab9274a9aeb - PATO: http://purl.obolibrary.org/obo/pato# + - + PATO: "&obo;PATO_" + - + PCO: "&obo;PCO_" + - + PO: "&obo;PO_" - PomBase: http://www.pombase.org/spombe/result/ - PR: http://purl.obolibrary.org/obo/pr# + - + PR: "&obo;PR_" - RO: http://www.obofoundry.org/ro/ro.owl# + - + RO: "&obo;RO_" + - + SEPIO: "&obo;SEPIO_" - sgd: http://www.yeastgenome.org/cgi-bin/locus.fpl?dbid= - @@ -747,10 +775,18 @@ use_for_contraction_only: SNOMED: http://identifiers.org/snomedct/ - SO: http://purl.obolibrary.org/obo/so# + - + SO: "&obo;SO_" + - + STATO: "&obo;STATO_" + - + TRANS: "&obo;TRANS_" - UBERON: http://purl.obolibrary.org/obo/uberon/insect-anatomy# - UBERON: http://purl.obolibrary.org/obo/uberon# + - + UBERON: "&obo;UBERON_" - UMLS: http://purl.obolibrary.org/obo/UMLS_ - @@ -767,6 +803,8 @@ use_for_contraction_only: UMLS: http://purl.bioontology.org/ontology/MEDLINEPLUS/ - UniProtKB: "http://identifiers.org/uniprot/" + - + UO: "&obo;UO_" - wb: http://www.wormbase.org/species/c_elegans/gene/ - From 0d40be8a6cc6bfac28014ddcdd3c0681e8feb4da Mon Sep 17 00:00:00 2001 From: ecwood Date: Tue, 27 Aug 2024 00:30:39 -0700 Subject: [PATCH 060/125] #387 patch to get around weird ids showing up when trying to prefix match --- maps/curies-to-urls-map.yaml | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/maps/curies-to-urls-map.yaml b/maps/curies-to-urls-map.yaml index 9ce1b30d..6cb9300d 100644 --- a/maps/curies-to-urls-map.yaml +++ b/maps/curies-to-urls-map.yaml @@ -579,10 +579,14 @@ use_for_contraction_only: DDANAT: http://purl.obolibrary.org/obo/ddanat# - DGIdb: https://www.dgidb.org/interaction_types/ + - + dictybase.gene: http://dictybase.org/gene/DDB_ - DOID: "&obo;DOID_" - DRUGBANK: http://purl.bioontology.org/ontology/DRUGBANK/ + - + DRUGBANK: "DrugBank:" - ECOCORE: "&obo;ECOCORE_" - @@ -635,6 +639,8 @@ use_for_contraction_only: HGNC: "http://identifiers.org/hgnc/" - HGNC: http://purl.bioontology.org/ontology/HGNC/ + - + HMDB: "HMDB:" - HP: "http://purl.bioontology.org/ontology/HPO/HP%3A" - @@ -661,6 +667,8 @@ use_for_contraction_only: IDO: "&obo;IDO_" - KEGG: http://purl.obolibrary.org/obo/KEGG_ + - + KEGG.ENZYME: "EC:" - LOINC: http://purl.bioontology.org/ontology/LNC/ - @@ -753,6 +761,8 @@ use_for_contraction_only: PR: http://purl.obolibrary.org/obo/pr# - PR: "&obo;PR_" + - + REACT: "Reactome:" - RO: http://www.obofoundry.org/ro/ro.owl# - @@ -803,6 +813,8 @@ use_for_contraction_only: UMLS: http://purl.bioontology.org/ontology/MEDLINEPLUS/ - UniProtKB: "http://identifiers.org/uniprot/" + - + UniProtKB: "UniProtKB:" - UO: "&obo;UO_" - From 0a229649ba450ef1cbd11da78d235408244783a5 Mon Sep 17 00:00:00 2001 From: ecwood Date: Tue, 27 Aug 2024 02:10:17 -0700 Subject: [PATCH 061/125] #387 more weird prefixes --- maps/curies-to-urls-map.yaml | 90 ++++++++++++++++++++++++++++++++++++ 1 file changed, 90 insertions(+) diff --git a/maps/curies-to-urls-map.yaml b/maps/curies-to-urls-map.yaml index 6cb9300d..84a730ab 100644 --- a/maps/curies-to-urls-map.yaml +++ b/maps/curies-to-urls-map.yaml @@ -543,6 +543,8 @@ use_for_contraction_only: AraPort: https://bar.utoronto.ca/thalemine/portal.do?externalids= - ARO: "&obo;ARO_" + - + ATC: "ATC_code:" - BAO: http://www.bioassayontology.org/bao# - @@ -557,6 +559,10 @@ use_for_contraction_only: biolink: https://w3id.org/biolink/biolink-model - BTO: http://purl.obolibrary.org/obo/bto# + - + BTO: "BTO:" + - + BTO: "&obo;BTO_" - CDNO: "&obo;CDNO_" - @@ 
-571,6 +577,8 @@ use_for_contraction_only: CHMO: "&obo;CHMO_" - CL: http://purl.obolibrary.org/obo/cl# + - + CL: "&obo;CL_" - COB: "&obo;COB_" - @@ -583,10 +591,14 @@ use_for_contraction_only: dictybase.gene: http://dictybase.org/gene/DDB_ - DOID: "&obo;DOID_" + - + DOID: "DOID:" - DRUGBANK: http://purl.bioontology.org/ontology/DRUGBANK/ - DRUGBANK: "DrugBank:" + - + DrugCentral: "Drug_Central:" - ECOCORE: "&obo;ECOCORE_" - @@ -597,10 +609,18 @@ use_for_contraction_only: EFO: http://www.ebi.ac.uk/efo/ - EFO: http://www.ebi.ac.uk/efoIri + - + EFO: "EFO:" + - + EFO: "&efo;EFO_" + - + EHDAA2: "EHDAA2:" - ENSEMBL: http://www.ensembl.org/id/ - ENVO: "&obo;ENVO_" + - + ENVO: "ENVO:" - EPO: "&obo;EPO_" - @@ -615,14 +635,24 @@ use_for_contraction_only: FMA: http://purl.obolibrary.org/obo/fma# - FMA: http://purl.bioontology.org/ontology/FMA/ + - + FMA: "FMA:" - FOODON: "&obo;FOODON_" + - + GARD: "GARD:" - GAZ: "&obo;GAZ_" + - + GAZ: "GAZ:" - GENEPIO: "&obo;GENEPIO_" + - + GENEPIO: "&obo;GENEPIO:" - GO: "&obo;GO_" + - + GO: "GO:" - GO: "http://purl.bioontology.org/ontology/GO/GO%3A" - @@ -637,6 +667,8 @@ use_for_contraction_only: HGNC: "http://purl.bioontology.org/ontology/HGNC/HGNC%3A" - HGNC: "http://identifiers.org/hgnc/" + - + HGNC: "HGNC:" - HGNC: http://purl.bioontology.org/ontology/HGNC/ - @@ -645,6 +677,10 @@ use_for_contraction_only: HP: "http://purl.bioontology.org/ontology/HPO/HP%3A" - HP: http://purl.bioontology.org/ontology/HPO/ + - + HP: "HP:" + - + HP: "&obo;HP_" - IAO: "&obo;IAO_" - @@ -655,6 +691,10 @@ use_for_contraction_only: ICD10: http://purl.obolibrary.org/obo/ICD10AE_ - ICD10: http://purl.obolibrary.org/obo/ICD10CM_ + - + ICD10: "ICD10CM:" + - + ICD10: "ICD10:" - ICD10: http://purl.bioontology.org/ontology/ICD10AE/ - @@ -663,6 +703,8 @@ use_for_contraction_only: ICD9: http://purl.bioontology.org/ontology/ICD9CM/ - ICD9: http://purl.obolibrary.org/obo/ICD9_ + - + ICD9: "ICD9:" - IDO: "&obo;IDO_" - @@ -675,16 +717,30 @@ use_for_contraction_only: MEDDRA: http://purl.obolibrary.org/obo/MedDRA_ - MEDDRA: http://identifiers.org/meddra/ + - + MEDDRA: "MedDRA:" + - + MEDDRA: "MEDDRA:" - medgen: http://purl.obolibrary.org/obo/MEDGEN_ - medgen: http://identifiers.org/medgen/ + - + medgen: "MEDGEN:" + - + medgen: "MedGen:" + - + medgen: "Medgen:" - MESH: http://purl.bioontology.org/ontology/MESH/ - MESH: http://purl.bioontology.org/ontology/MSH/ - MESH: http://identifiers.org/mesh/ + - + MESH: "MESH:" + - + MESH : "MeSH:" - MGI: "http://www.informatics.jax.org/marker/MGI:" - @@ -693,6 +749,8 @@ use_for_contraction_only: miRBase: "https://identifiers.org/mirbase:" - MONDO: http://purl.obolibrary.org/obo/mondo# + - + MONDO: "MONDO:" - NCBIGene: "https://identifiers.org/ncbigene:" - @@ -723,6 +781,16 @@ use_for_contraction_only: NCIT: http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl# - NCIT: "&obo;NCIT_" + - + NCIT: "NCI:" + - + NCIT: "NCIT:" + - + NCIT: "NCIt:" + - + NCIT: "NCI Metathesaurus:" + - + NCIT: "NCI_" - OBI: "&obo;OBI_" - @@ -731,10 +799,16 @@ use_for_contraction_only: OBOREL: "http://purl.org/obo/owl/OBO_REL#" - OCRe: http://purl.org/net/OCRe/research.owl# + - + OIO: "oboInOwl:" - OMIM: http://purl.bioontology.org/ontology/OMIM/ - OMIM: http://identifiers.org/omim/ + - + OMIM: "MIM:" + - + OMIM: "OMIM:" - OMIT: "&obo;OMIT_" - @@ -745,6 +819,10 @@ use_for_contraction_only: ONS: "&obo;ONS_" - orphanet: http://purl.bioontology.org/ontology/ORDO/ + - + orphanet: "ORDO:" + - + orphanet: "Orphanet:" - orphanet: 
https://data.bioontology.org/ontologies/ORDO/submissions/27/download?apikey=8b5b7825-538d-40e0-9e9e-5ab9274a9aeb - @@ -753,6 +831,8 @@ use_for_contraction_only: PATO: "&obo;PATO_" - PCO: "&obo;PCO_" + - + PMID: "PMID:" - PO: "&obo;PO_" - @@ -761,12 +841,16 @@ use_for_contraction_only: PR: http://purl.obolibrary.org/obo/pr# - PR: "&obo;PR_" + - + rdfs: "rdfs:" - REACT: "Reactome:" - RO: http://www.obofoundry.org/ro/ro.owl# - RO: "&obo;RO_" + - + RO: "obo:RO_" - SEPIO: "&obo;SEPIO_" - @@ -811,6 +895,12 @@ use_for_contraction_only: UMLS: http://purl.bioontology.org/ontology/MED-RT/ - UMLS: http://purl.bioontology.org/ontology/MEDLINEPLUS/ + - + UMLS: "MedlinePlus:" + - + UMLS: "UMLS_CUI:" + - + UMLS: "UMLS:" - UniProtKB: "http://identifiers.org/uniprot/" - From 919d3b85d5933590144cd7906ed2145a2badd1fb Mon Sep 17 00:00:00 2001 From: ecwood Date: Tue, 27 Aug 2024 02:27:05 -0700 Subject: [PATCH 062/125] #387 today's work on the ontologies ETL --- ontologies_jsonl_to_kg_jsonl.py | 58 +++++++++++++++++++++++++++++++-- 1 file changed, 56 insertions(+), 2 deletions(-) diff --git a/ontologies_jsonl_to_kg_jsonl.py b/ontologies_jsonl_to_kg_jsonl.py index 12784bc8..233ed4d2 100644 --- a/ontologies_jsonl_to_kg_jsonl.py +++ b/ontologies_jsonl_to_kg_jsonl.py @@ -22,6 +22,11 @@ CLASSES_DICT = dict() +URI_MAP = dict() +URI_MAP_KEYS = list() + +MISSING_ID_PREFIXES = set() + def get_args(): arg_parser = argparse.ArgumentParser() arg_parser.add_argument('--test', dest='test', @@ -37,7 +42,9 @@ def process_ontology_item(ontology_item): if ID_TAG not in owl_class: continue # TODO: MAP THIS HERE, since not all sources use same IRIs for the same nodes - node_id = owl_class.get(ID_TAG, str()) + node_id = match_prefix(owl_class.get(ID_TAG, str())) + if node_id is None: + continue # Configure the name name_list = [name.get(TEXT_KEY, None) for name in owl_class.get("rdfs:label", dict()) if TEXT_KEY in name] @@ -98,6 +105,16 @@ def process_ontology_item(ontology_item): if RESOURCE_KEY in edge: edges_list.append((general_edge_type, edge.get(RESOURCE_KEY, None))) + final_edges_list = list() + for (edge_relation, edge_object) in edges_list: + edge_object = match_prefix(edge_object) + if edge_object is None: + continue + edge_relation = match_prefix(edge_relation) + if edge_relation is None: + continue + final_edges_list.append((edge_relation, edge_object)) + # node_id = owl_class.get(ID_TAG, list()) @@ -125,9 +142,43 @@ def process_ontology_item(ontology_item): # node = {"id": node_id, "superclasses": superclasses, "description": description, "xrefs": xrefs, "name": name, "exact_matches": exact_matches} - node = {"id": node_id, "description_list": description_list, "name": name_list, "source": source, "has_biological_sequence": has_biological_sequence, "edges": edges_list} + node = {"id": node_id, "description_list": description_list, "name": name_list, "source": source, "has_biological_sequence": has_biological_sequence, "edges": final_edges_list} print(json.dumps(node, indent=4)) +def generate_uri_map(): + uri_input_map = kg2_util.safe_load_yaml_from_string(kg2_util.read_file_to_string("maps/curies-to-urls-map.yaml")) + bidirectional_map = uri_input_map['use_for_bidirectional_mapping'] + contraction_map = uri_input_map['use_for_contraction_only'] + + for curie_prefix_dict in bidirectional_map: + for curie_prefix in curie_prefix_dict: + curie_url = curie_prefix_dict[curie_prefix] + URI_MAP[curie_url] = curie_prefix + + for curie_prefix_dict in contraction_map: + for curie_prefix in curie_prefix_dict: + curie_url = 
curie_prefix_dict[curie_prefix] + URI_MAP[curie_url] = curie_prefix + + # So that you get the most accurate match, you want to match to the longest url (in case one is a substring of another) + # Apparently have to use global key word to write to a module wide list (https://stackoverflow.com/questions/4630543/defining-lists-as-global-variables-in-python) + global URI_MAP_KEYS + URI_MAP_KEYS = sorted(URI_MAP.keys(), key=len, reverse=True) + +def match_prefix(node_id): + for curie_url in URI_MAP_KEYS: + if node_id.startswith(curie_url): + return node_id.replace(curie_url, URI_MAP[curie_url] + ":") + + if "http" in node_id: + MISSING_ID_PREFIXES.add('/'.join(node_id.split('/')[0:-1]) + "/") + elif ':' in node_id: + MISSING_ID_PREFIXES.add(node_id.split(':')[0] + ":") + elif '_' in node_id: + MISSING_ID_PREFIXES.add(node_id.split('_')[0] + "_") + else: + MISSING_ID_PREFIXES.add(node_id) + if __name__ == '__main__': args = get_args() @@ -138,8 +189,11 @@ def process_ontology_item(ontology_item): input_data = input_read_jsonlines_info[0] owl_class_count = 0 + ontology_prefixes = set() + generate_uri_map() for ontology_item in input_data: process_ontology_item(ontology_item) + print(json.dumps(sorted(list(MISSING_ID_PREFIXES)), indent=4)) # print("OWL Classes:", owl_class_count) # for key in KEYS_DICT: From 6613470ded6b49ab4597b5128f62b3b96d607e8e Mon Sep 17 00:00:00 2001 From: ecwood Date: Tue, 27 Aug 2024 03:07:52 -0700 Subject: [PATCH 063/125] #387 finishing up the different edge types --- maps/curies-to-urls-map.yaml | 6 ++++++ ontologies_jsonl_to_kg_jsonl.py | 32 +++++++++++++++++++++++++------- 2 files changed, 31 insertions(+), 7 deletions(-) diff --git a/maps/curies-to-urls-map.yaml b/maps/curies-to-urls-map.yaml index 84a730ab..3452641c 100644 --- a/maps/curies-to-urls-map.yaml +++ b/maps/curies-to-urls-map.yaml @@ -683,6 +683,8 @@ use_for_contraction_only: HP: "&obo;HP_" - IAO: "&obo;IAO_" + - + IAO: "obo:IAO_" - ICD10: http://purl.bioontology.org/ontology/ICD10/ - @@ -751,6 +753,8 @@ use_for_contraction_only: MONDO: http://purl.obolibrary.org/obo/mondo# - MONDO: "MONDO:" + - + MONDO: "mondo-base:" - NCBIGene: "https://identifiers.org/ncbigene:" - @@ -857,6 +861,8 @@ use_for_contraction_only: sgd: http://www.yeastgenome.org/cgi-bin/locus.fpl?dbid= - skos: http://www.w3.org/2008/05/skos# + - + skos: "skos:" - SNOMED: http://purl.bioontology.org/ontology/SNOMEDCT/ - diff --git a/ontologies_jsonl_to_kg_jsonl.py b/ontologies_jsonl_to_kg_jsonl.py index 233ed4d2..93e119db 100644 --- a/ontologies_jsonl_to_kg_jsonl.py +++ b/ontologies_jsonl_to_kg_jsonl.py @@ -20,6 +20,28 @@ COMMENT_PREFIX = "COMMENTS: " +BASE_EDGE_TYPES = {"mondo-base:exactMatch": RESOURCE_KEY, + "mondo-base:closeMatch": RESOURCE_KEY, + "mondo-base:relatedMatch": RESOURCE_KEY, + "mondo-base:broadMatch": RESOURCE_KEY, + "mondo-base:narrowMatch": RESOURCE_KEY, + "skos:exactMatch": RESOURCE_KEY, + "skos:closeMatch": RESOURCE_KEY, + "skos:broadMatch": RESOURCE_KEY, + "skos:relatedMatch": RESOURCE_KEY, + "skos:narrowMatch": RESOURCE_KEY, + "obo:IAO_0100001": RESOURCE_KEY, + "obo:RO_0002175": RESOURCE_KEY, + "obo:RO_0002161": RESOURCE_KEY, + "obo:RO_0002604": RESOURCE_KEY, + "obo:RO_0002171": RESOURCE_KEY, + "obo:RO_0002174": RESOURCE_KEY, + "obo:RO_0002475": RESOURCE_KEY, + "obo:RO_0001900": RESOURCE_KEY, + "oboInOwl:hasAlternativeId": TEXT_KEY, + "oboInOwl:hasDbXref": TEXT_KEY, + "oboInOwl:xref": TEXT_KEY} + CLASSES_DICT = dict() URI_MAP = dict() @@ -69,15 +91,11 @@ def process_ontology_item(ontology_item): # Extract edge triples 
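The prefix matching added in patch 062 depends on trying the longest base URLs first, so a specific URL wins over a shorter URL that happens to be its prefix. A self-contained sketch with an invented two-entry map (the real map is loaded from maps/curies-to-urls-map.yaml):

URI_MAP = {
    "http://purl.obolibrary.org/obo/": "OBO",            # generic fallback (toy entry)
    "http://purl.obolibrary.org/obo/CHEBI_": "CHEBI",    # more specific, must win
}
# Sort by length, longest first, exactly as generate_uri_map() does.
URI_MAP_KEYS = sorted(URI_MAP.keys(), key=len, reverse=True)

def match_prefix(node_id):
    for curie_url in URI_MAP_KEYS:
        if node_id.startswith(curie_url):
            return node_id.replace(curie_url, URI_MAP[curie_url] + ":")
    return None

print(match_prefix("http://purl.obolibrary.org/obo/CHEBI_15377"))  # CHEBI:15377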
edges_list = list() - for edge_type in ["obo:RO_0002175", "obo:RO_0002161", "obo:RO_0002604", "obo:RO_0002171", "obo:RO_0002174", "obo:RO_0002475", "obo:RO_0001900", "obo:RO_0004050"]: + for edge_type in BASE_EDGE_TYPES: for edge in owl_class.get(edge_type, list()): - if RESOURCE_KEY in edge: - edges_list.append((edge_type, edge.get(RESOURCE_KEY, None))) + if BASE_EDGE_TYPES[edge_type] in edge: + edges_list.append((edge_type, edge.get(BASE_EDGE_TYPES[edge_type], None))) - for edge_type in ["oboInOwl:hasDbXref"]: - for edge in owl_class.get(edge_type, list()): - if TEXT_KEY in edge: - edges_list.append((edge_type, edge.get(TEXT_KEY, None))) restriction_edges = list() restriction_edges += [(edge, "rdfs:subClassOf") for edge in owl_class.get("rdfs:subClassOf", list())] From 07500628e27d474ed55fcd62e1b9012cba279ae1 Mon Sep 17 00:00:00 2001 From: ecwood Date: Wed, 28 Aug 2024 21:04:45 -0700 Subject: [PATCH 064/125] #387 for testing purposes --- validate/validate_kg2_util_curies_urls_categories.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/validate/validate_kg2_util_curies_urls_categories.py b/validate/validate_kg2_util_curies_urls_categories.py index 4e670fae..e144bd21 100755 --- a/validate/validate_kg2_util_curies_urls_categories.py +++ b/validate/validate_kg2_util_curies_urls_categories.py @@ -47,6 +47,9 @@ def make_arg_parser(): kg2_util.CURIE_PREFIX_BIOLINK + ':' + kg2_util.EDGE_LABEL_BIOLINK_RELATED_TO) +print("TESTTESTTEST") +print(biolink_edge_labels) + for variable_name in dir(kg2_util): variable_value = getattr(kg2_util, variable_name) if variable_name.startswith('CURIE_PREFIX_'): From 7ed39cbfd81f551a138e861a75b5e912ced7d0d4 Mon Sep 17 00:00:00 2001 From: ecwood Date: Wed, 28 Aug 2024 23:54:09 -0700 Subject: [PATCH 065/125] #387 don't need that print statement anymore --- validate/validate_kg2_util_curies_urls_categories.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/validate/validate_kg2_util_curies_urls_categories.py b/validate/validate_kg2_util_curies_urls_categories.py index e144bd21..4e670fae 100755 --- a/validate/validate_kg2_util_curies_urls_categories.py +++ b/validate/validate_kg2_util_curies_urls_categories.py @@ -47,9 +47,6 @@ def make_arg_parser(): kg2_util.CURIE_PREFIX_BIOLINK + ':' + kg2_util.EDGE_LABEL_BIOLINK_RELATED_TO) -print("TESTTESTTEST") -print(biolink_edge_labels) - for variable_name in dir(kg2_util): variable_value = getattr(kg2_util, variable_name) if variable_name.startswith('CURIE_PREFIX_'): From 7d184003a12d5a8571de6f359988196bb7dde5b0 Mon Sep 17 00:00:00 2001 From: ecwood Date: Wed, 28 Aug 2024 23:54:35 -0700 Subject: [PATCH 066/125] #387 looks like we're not using this anymore --- validate/{ => archive}/validate_ont_load_inventory.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename validate/{ => archive}/validate_ont_load_inventory.py (100%) diff --git a/validate/validate_ont_load_inventory.py b/validate/archive/validate_ont_load_inventory.py similarity index 100% rename from validate/validate_ont_load_inventory.py rename to validate/archive/validate_ont_load_inventory.py From 2eb8b00848ac2149a5228d95f0dcc44d0c36b968 Mon Sep 17 00:00:00 2001 From: ecwood Date: Thu, 29 Aug 2024 01:33:58 -0700 Subject: [PATCH 067/125] #387 try out the new validate kg2 util --- validate/run-validation-tests.sh | 4 +- ...alidate_kg2_util_curies_urls_categories.py | 56 +++++++++++++------ 2 files changed, 42 insertions(+), 18 deletions(-) diff --git a/validate/run-validation-tests.sh b/validate/run-validation-tests.sh index f2c997de..4bcec92f 100755 
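On the restriction handling in patch 063: each rdfs:subClassOf entry can carry an owl:Restriction whose owl:onProperty supplies the predicate IRI and whose owl:someValuesFrom supplies the object IRI. A toy example of that unpacking (invented IRIs; the real code also walks owl:equivalentClass / owl:intersectionOf and enforces the length-one checks with asserts):

RESOURCE_KEY = "rdf:resource"

edge = {
    "owl:Restriction": [{
        "owl:onProperty": [{RESOURCE_KEY: "http://purl.obolibrary.org/obo/BFO_0000050"}],
        "owl:someValuesFrom": [{RESOURCE_KEY: "http://purl.obolibrary.org/obo/UBERON_0000955"}],
    }]
}

edges_list = []
for restriction in edge.get("owl:Restriction", list()):
    edge_type = restriction.get("owl:onProperty", list())
    edge_object = restriction.get("owl:someValuesFrom", list())
    if len(edge_type) == 1 and len(edge_object) == 1:
        edges_list.append((edge_type[0].get(RESOURCE_KEY), edge_object[0].get(RESOURCE_KEY)))

print(edges_list)  # [('...BFO_0000050', '...UBERON_0000955')]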
--- a/validate/run-validation-tests.sh +++ b/validate/run-validation-tests.sh @@ -68,8 +68,8 @@ ${python_command} -u ${VALIDATE_CODE_DIR}/validate_curies_to_urls_map_yaml.py \ ${python_command} -u ${VALIDATE_CODE_DIR}/validate_kg2_util_curies_urls_categories.py \ ${curies_to_urls_file} \ - ${biolink_model_owl_url} \ - ${biolink_model_owl_local_file} + ${biolink_model_yaml_url} \ + ${biolink_model_yaml_local_file} ${python_command} -u ${VALIDATE_CODE_DIR}/validate_predicate_remap_yaml.py \ ${curies_to_urls_file} \ diff --git a/validate/validate_kg2_util_curies_urls_categories.py b/validate/validate_kg2_util_curies_urls_categories.py index 4e670fae..4e09b14b 100755 --- a/validate/validate_kg2_util_curies_urls_categories.py +++ b/validate/validate_kg2_util_curies_urls_categories.py @@ -6,7 +6,7 @@ __author__ = 'Stephen Ramsey' __copyright__ = 'Oregon State University' -__credits__ = ['Stephen Ramsey'] +__credits__ = ['Stephen Ramsey', 'Erica Wood'] __license__ = 'MIT' __version__ = '0.1.0' __maintainer__ = '' @@ -15,6 +15,11 @@ import argparse import kg2_util +import json + +DESCENDANT_KEY = "is_a" +BASE_PREDICATE = "related to" +BASE_CATEGORY = "named thing" def make_arg_parser(): arg_parser = argparse.ArgumentParser(description='validate_kg2_util_curies_urls_categories.py: ' + @@ -24,28 +29,51 @@ def make_arg_parser(): arg_parser.add_argument('biolinkModelLocalFile', type=str) return arg_parser +def construct_biolink_term_set(is_a_base, biolink_terms): + output_set = set() + for key in biolink_terms: + key_is_a = biolink_terms[key] + if key_is_a == is_a_base: + for item in construct_biolink_term_set(key, biolink_terms): + output_set.add(item) + output_set.add(is_a_base) + return output_set + +def identify_biolink_terms(biolink_model): + biolink_predicate_terms = dict() + biolink_category_terms = dict() + for predicate in biolink_model["slots"]: + if DESCENDANT_KEY in biolink_model["slots"][predicate]: + biolink_predicate_terms[predicate] = biolink_model["slots"][predicate][DESCENDANT_KEY] + + for category in biolink_model["classes"]: + if DESCENDANT_KEY in biolink_model["classes"][category]: + biolink_category_terms[category] = biolink_model["classes"][category][DESCENDANT_KEY] + + biolink_predicates = construct_biolink_term_set("related to", biolink_predicate_terms) + biolink_categories = construct_biolink_term_set("named thing", biolink_category_terms) + + return list(biolink_predicates), list(biolink_categories) + + args = make_arg_parser().parse_args() biolink_model_url = args.biolinkModelURL biolink_model_file_name = args.biolinkModelLocalFile curies_to_urls_map_file_name = args.curiesToURLsMapFile -iri_shortener = kg2_util.make_uri_to_curie_shortener(kg2_util.make_curies_to_uri_map(kg2_util.read_file_to_string(curies_to_urls_map_file_name), - kg2_util.IDMapperType.CONTRACT)) - curies_to_url_map_data = kg2_util.safe_load_yaml_from_string(kg2_util.read_file_to_string(curies_to_urls_map_file_name)) curies_to_url_map_data_bidir = {key: listitem[key] for listitem in curies_to_url_map_data['use_for_bidirectional_mapping'] for key in listitem.keys()} curies_to_url_map_data_cont = {key: listitem[key] for listitem in curies_to_url_map_data['use_for_contraction_only'] for key in listitem.keys()} +valid_base_urls = list() +valid_base_urls += [value for value in curies_to_url_map_data_bidir.values()] +valid_base_urls += [value for value in curies_to_url_map_data_cont.values()] kg2_util.download_file_if_not_exist_locally(biolink_model_url, biolink_model_file_name) -biolink_ont = 
kg2_util.make_ontology_from_local_file(biolink_model_file_name) -biolink_categories_ontology_depths = kg2_util.get_biolink_categories_ontology_depths(biolink_ont) - -biolink_edge_labels = kg2_util.ont_children_recursive(biolink_ont, - kg2_util.CURIE_PREFIX_BIOLINK + ':' + - kg2_util.EDGE_LABEL_BIOLINK_RELATED_TO) +biolink_model = kg2_util.safe_load_yaml_from_string(kg2_util.read_file_to_string(biolink_model_file_name)) +biolink_edge_labels, biolink_categories = identify_biolink_terms(biolink_model) for variable_name in dir(kg2_util): variable_value = getattr(kg2_util, variable_name) @@ -53,21 +81,17 @@ def make_arg_parser(): assert variable_value in curies_to_url_map_data_bidir, variable_name elif variable_name.startswith('BASE_URL_'): url_str = variable_value - curie = iri_shortener(url_str) - assert curie is not None, url_str + assert url_str in valid_base_urls, url_str elif variable_name.startswith('BIOLINK_CATEGORY_'): category_label = variable_value category_camelcase = kg2_util.convert_space_case_to_camel_case(category_label) category_curie = kg2_util.CURIE_PREFIX_BIOLINK + ':' + category_camelcase - assert category_curie in biolink_categories_ontology_depths, category_curie + assert category_curie in biolink_categories, category_curie # assert category_label in categories_to_check, category_label elif variable_name.startswith('CURIE_ID_'): curie_id = variable_value assert ':' in curie_id, variable_name assert curie_id.split(':')[0] in curies_to_url_map_data_bidir, variable_name - elif variable_name.startswith('IRI_'): - url = variable_value - assert iri_shortener(url) is not None, url elif variable_name.startswith('EDGE_LABEL_BIOLINK_'): relation_label = variable_value assert kg2_util.CURIE_PREFIX_BIOLINK + ':' + relation_label in biolink_edge_labels, relation_label From 2375994caaf693f8127e509e407a3cb3b943932a Mon Sep 17 00:00:00 2001 From: ecwood Date: Thu, 29 Aug 2024 01:53:42 -0700 Subject: [PATCH 068/125] #387 drastic changes: REMOVAL OF ONTOBIO from kg2_util and validation --- validate/run-validation-tests.sh | 4 +- .../validate_curies_to_categories_yaml.py | 12 +++--- ...alidate_kg2_util_curies_urls_categories.py | 42 ++----------------- 3 files changed, 12 insertions(+), 46 deletions(-) diff --git a/validate/run-validation-tests.sh b/validate/run-validation-tests.sh index 4bcec92f..7cdb7974 100755 --- a/validate/run-validation-tests.sh +++ b/validate/run-validation-tests.sh @@ -58,8 +58,8 @@ ${curl_get} ${infores_catalog_yaml_url} -o ${infores_catalog_yaml} ${python_command} -u ${VALIDATE_CODE_DIR}/validate_curies_to_categories_yaml.py \ ${curies_to_categories_file} \ ${curies_to_urls_file} \ - ${biolink_model_owl_url} \ - ${biolink_model_owl_local_file} + ${biolink_model_yaml_url} \ + ${biolink_model_yaml_local_file} ${python_command} -u ${VALIDATE_CODE_DIR}/validate_curies_to_urls_map_yaml.py \ ${curies_to_urls_file} \ diff --git a/validate/validate_curies_to_categories_yaml.py b/validate/validate_curies_to_categories_yaml.py index 3af15002..80cdde53 100755 --- a/validate/validate_curies_to_categories_yaml.py +++ b/validate/validate_curies_to_categories_yaml.py @@ -7,7 +7,7 @@ __author__ = 'Stephen Ramsey' __copyright__ = 'Oregon State University' -__credits__ = ['Stephen Ramsey'] +__credits__ = ['Stephen Ramsey', 'Erica Wood'] __license__ = 'MIT' __version__ = '0.1.0' __maintainer__ = '' @@ -22,8 +22,8 @@ def make_arg_parser(): arg_parser = argparse.ArgumentParser(description='validate_curies_to_categories.py: checks the file `curies-to-categories.yaml` for correctness.') 
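The recursion at the heart of the new validation code collects every Biolink term reachable from a base term through "is_a" links. The function below is the same construct_biolink_term_set shown in the diff, exercised on an invented slice of the hierarchy (the real input is the Biolink model YAML):

def construct_biolink_term_set(is_a_base, biolink_terms):
    output_set = set()
    for key in biolink_terms:
        if biolink_terms[key] == is_a_base:
            # Recurse into each direct child, accumulating its descendants.
            for item in construct_biolink_term_set(key, biolink_terms):
                output_set.add(item)
    output_set.add(is_a_base)
    return output_set

toy_terms = {"treats": "related to",      # toy entries, not the real model
             "affects": "related to",
             "ameliorates": "treats"}
print(construct_biolink_term_set("related to", toy_terms))
# {'related to', 'treats', 'affects', 'ameliorates'}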
arg_parser.add_argument('curiesToCategoriesFile', type=str) arg_parser.add_argument('curiesToURLsMapFile', type=str) - arg_parser.add_argument('biolinkModelOWLURL', type=str) - arg_parser.add_argument('biolinkModelOWLLocalFile', type=str) + arg_parser.add_argument('biolinkModelYAMLURL', type=str) + arg_parser.add_argument('biolinkModelYAMLLocalFile', type=str) return arg_parser @@ -37,8 +37,8 @@ def make_arg_parser(): curies_to_url_map_data_bidir = {next(iter(listitem.keys())) for listitem in curies_to_url_map_data['use_for_bidirectional_mapping']} kg2_util.download_file_if_not_exist_locally(biolink_model_url, biolink_model_file_name) -biolink_ont = kg2_util.make_ontology_from_local_file(biolink_model_file_name) -biolink_categories_ontology_depths = kg2_util.get_biolink_categories_ontology_depths(biolink_ont) +biolink_model = kg2_util.safe_load_yaml_from_string(kg2_util.read_file_to_string(biolink_model_file_name)) +_, biolink_categories = kg2_util.identify_biolink_terms(biolink_model) for prefix in curies_to_categories_data['prefix-mappings'].keys(): assert prefix in curies_to_url_map_data_bidir, prefix @@ -53,4 +53,4 @@ def make_arg_parser(): for category in categories_to_check: category_camelcase = kg2_util.convert_space_case_to_camel_case(category) category_curie = kg2_util.CURIE_PREFIX_BIOLINK + ':' + category_camelcase - assert category_curie in biolink_categories_ontology_depths, category_curie + assert category_curie in biolink_categories, category_curie diff --git a/validate/validate_kg2_util_curies_urls_categories.py b/validate/validate_kg2_util_curies_urls_categories.py index 4e09b14b..4ba57d50 100755 --- a/validate/validate_kg2_util_curies_urls_categories.py +++ b/validate/validate_kg2_util_curies_urls_categories.py @@ -17,10 +17,6 @@ import kg2_util import json -DESCENDANT_KEY = "is_a" -BASE_PREDICATE = "related to" -BASE_CATEGORY = "named thing" - def make_arg_parser(): arg_parser = argparse.ArgumentParser(description='validate_kg2_util_curies_urls_categories.py: ' + 'checks the file `kg2_util.py` for correctness for its CURIE IDs, Base URLs, and biolink categories.') @@ -29,34 +25,6 @@ def make_arg_parser(): arg_parser.add_argument('biolinkModelLocalFile', type=str) return arg_parser -def construct_biolink_term_set(is_a_base, biolink_terms): - output_set = set() - for key in biolink_terms: - key_is_a = biolink_terms[key] - if key_is_a == is_a_base: - for item in construct_biolink_term_set(key, biolink_terms): - output_set.add(item) - output_set.add(is_a_base) - return output_set - -def identify_biolink_terms(biolink_model): - biolink_predicate_terms = dict() - biolink_category_terms = dict() - for predicate in biolink_model["slots"]: - if DESCENDANT_KEY in biolink_model["slots"][predicate]: - biolink_predicate_terms[predicate] = biolink_model["slots"][predicate][DESCENDANT_KEY] - - for category in biolink_model["classes"]: - if DESCENDANT_KEY in biolink_model["classes"][category]: - biolink_category_terms[category] = biolink_model["classes"][category][DESCENDANT_KEY] - - biolink_predicates = construct_biolink_term_set("related to", biolink_predicate_terms) - biolink_categories = construct_biolink_term_set("named thing", biolink_category_terms) - - return list(biolink_predicates), list(biolink_categories) - - - args = make_arg_parser().parse_args() biolink_model_url = args.biolinkModelURL biolink_model_file_name = args.biolinkModelLocalFile @@ -73,7 +41,7 @@ def identify_biolink_terms(biolink_model): kg2_util.download_file_if_not_exist_locally(biolink_model_url, 
                                             biolink_model_file_name)
 biolink_model = kg2_util.safe_load_yaml_from_string(kg2_util.read_file_to_string(biolink_model_file_name))
-biolink_edge_labels, biolink_categories = identify_biolink_terms(biolink_model)
+biolink_edge_labels, biolink_categories = kg2_util.identify_biolink_terms(biolink_model)

 for variable_name in dir(kg2_util):
     variable_value = getattr(kg2_util, variable_name)
@@ -84,14 +52,12 @@ def identify_biolink_terms(biolink_model):
         assert variable_value in curies_to_url_map_data_bidir, variable_name
     elif variable_name.startswith('BASE_URL_'):
         url_str = variable_value
-        curie = iri_shortener(url_str)
-        assert curie is not None, url_str
+        assert url_str in valid_base_urls, url_str
     elif variable_name.startswith('BIOLINK_CATEGORY_'):
         category_label = variable_value
-        category_camelcase = kg2_util.convert_space_case_to_camel_case(category_label)
-        category_curie = kg2_util.CURIE_PREFIX_BIOLINK + ':' + category_camelcase
-        assert category_curie in biolink_categories, category_curie
+        assert category_label in biolink_categories, category_label
         # assert category_label in categories_to_check, category_label
     elif variable_name.startswith('CURIE_ID_'):
         curie_id = variable_value
         assert ':' in curie_id, variable_name
         assert curie_id.split(':')[0] in curies_to_url_map_data_bidir, variable_name
     elif variable_name.startswith('EDGE_LABEL_BIOLINK_'):
-        relation_label = variable_value
-        assert kg2_util.CURIE_PREFIX_BIOLINK + ':' + relation_label in biolink_edge_labels, relation_label
+        relation_label = variable_value.replace('_', ' ')
+        assert relation_label in biolink_edge_labels, relation_label

From 5508b9b42407d43514d7f578a8bff9b58d811a0c Mon Sep 17 00:00:00 2001
From: ecwood
Date: Thu, 29 Aug 2024 01:54:55 -0700
Subject: [PATCH 069/125] #387 the ACTUAL removal of ontobio

---
 kg2_util.py | 220 +++++++---------------------------------------------
 1 file changed, 30 insertions(+), 190 deletions(-)

diff --git a/kg2_util.py b/kg2_util.py
index a5aa0971..a32815f2 100644
--- a/kg2_util.py
+++ b/kg2_util.py
@@ -24,12 +24,10 @@
 import json
 import jsonlines
 import math
-import ontobio
 import os
 import pathlib
 import pickle
 import pprint
-import prefixcommons
 import re
 import shutil
 import ssl
@@ -137,8 +135,6 @@
 BASE_URL_IDENTIFIERS_ORG_REGISTRY = \
     'https://registry.identifiers.org/registry/'
 BASE_URL_BIOLINK_CONCEPTS = 'https://w3id.org/biolink/vocab/'
-BASE_URL_BIOLINK_ONTOLOGY = 'https://w3id.org/biolink/biolink-model.owl.ttl'
-BASE_URL_BIOLINK_META = 'https://w3id.org/biolink/biolinkml/meta/'
 BASE_URL_CHEMBL_COMPOUND = BASE_BASE_URL_IDENTIFIERS_ORG + 'chembl.compound:'
 BASE_URL_CHEMBL_TARGET = BASE_BASE_URL_IDENTIFIERS_ORG + 'chembl.target:'
 BASE_URL_CHEMBL_MECHANISM = 'https://www.ebi.ac.uk/chembl/mechanism/inspect/'
@@ -245,9 +241,6 @@
 CURIE_ID_UNICHEM = CURIE_PREFIX_UNICHEM_SOURCE + ':'
 CURIE_ID_RDFS_SUBCLASS_OF = CURIE_PREFIX_RDFS + ':' + 'subClassOf'

-IRI_OBO_FORMAT_XREF = BASE_URL_OBO_FORMAT + 'xref'
-IRI_OWL_SAME_AS = BASE_URL_OWL + 'sameAs'
-
 EDGE_LABEL_OWL_SAME_AS = 'same_as'
 EDGE_LABEL_BIOLINK_GENE_ASSOCIATED_WITH_CONDITION = 'gene_associated_with_condition'
 EDGE_LABEL_BIOLINK_GENE_PRODUCT_OF = 'gene_product_of'
@@ -275,6 +268,9 @@
 OBO_ONT_CURIE_RE = re.compile(r'OBO:([^\.]+)\.owl')
 LOWER_TO_UPPER_RE = re.compile(r'([a-z0-9])([A-Z][^A-Z])')

+DESCENDANT_KEY = "is_a"
+BASE_PREDICATE = EDGE_LABEL_BIOLINK_RELATED_TO.replace('_', ' ')
+BASE_CATEGORY = BIOLINK_CATEGORY_NAMED_THING

 def convert_date(time):
     return datetime.datetime.fromtimestamp(time).strftime('%Y-%m-%d %H:%M:%S')
@@ -410,126 +406,6 @@ def allcaps_to_only_first_letter_capitalized(allcaps: str):
 def safe_load_yaml_from_string(yaml_string: str):
     return yaml.safe_load(io.StringIO(yaml_string))

-
-def shorten_iri_to_curie(iri: str,
curie_to_iri_map: list) -> str: - if iri is None: - raise ValueError('cannot shorten an IRI with value None') - curie_list = prefixcommons.contract_uri(iri, - curie_to_iri_map) - if len(curie_list) == 0: - return None - - if len(curie_list) == 1: - curie_id = curie_list[0] - else: - assert False, \ - "somehow got a list after calling prefixcommons.contract: " + \ - iri + "; list is: " + str(curie_list) - curie_id = None - - # if curie_id is not None: - # # deal with IRIs like 'https://identifiers.org/umls/ATC/L01AX02' which get converted to CURIE 'UMLS:ATC/L01AX02' - # umls_match = REGEX_UMLS_CURIE.match(curie_id) - # if umls_match is not None: - # curie_id = umls_match[1] + ':' + umls_match[2] - - return curie_id - - -def make_uri_to_curie_shortener(curie_to_iri_map=None) -> callable: - if curie_to_iri_map is None: - curie_to_iri_map = [] - return lambda iri: shorten_iri_to_curie(iri, curie_to_iri_map) - - -def expand_curie_to_iri(curie_id: str, curie_to_iri_map: list) -> Optional[str]: - if curie_id.startswith('UMLS:CN'): - curie_id = curie_id.replace('UMLS:CN', 'medgen:CN') # see GitHub issue 810 - iri = prefixcommons.expand_uri(curie_id, curie_to_iri_map) - if iri == curie_id: - iri = None - return iri - - -def make_curie_to_uri_expander(curie_to_iri_map: list = None) -> callable: - if curie_to_iri_map is None: - curie_to_iri_map = [] - return lambda curie_id: expand_curie_to_iri(curie_id, curie_to_iri_map) - - -class IDMapperType(enum.Enum): - EXPAND = 1 - CONTRACT = 2 - - -def make_curies_to_uri_map(curies_to_uri_map_yaml_string: str, mapper_type: IDMapperType) -> dict: - yaml_data_structure_dict = safe_load_yaml_from_string(curies_to_uri_map_yaml_string) - if mapper_type == IDMapperType.CONTRACT: - return typing.cast(list, typing.cast(list, typing.cast(list, yaml_data_structure_dict['use_for_bidirectional_mapping']) + - yaml_data_structure_dict['use_for_contraction_only'])) - elif mapper_type == IDMapperType.EXPAND: - return typing.cast(list, typing.cast(list, yaml_data_structure_dict['use_for_bidirectional_mapping']) + - typing.cast(list, yaml_data_structure_dict['use_for_expansion_only'])) - else: - raise ValueError("Invalid mapper type: " + str(mapper_type)) - - -def get_biolink_category_tree(biolink_ontology: ontobio.ontol.Ontology): - queue = collections.deque([CURIE_PREFIX_BIOLINK + ':' + 'NamedThing']) - biolink_category_dict = dict() - biolink_category_tree = dict() - - while len(queue) > 0: - node_id = queue.popleft() - biolink_category_dict[node_id] = [] - for child_node_id in biolink_ontology.children(node_id, ['subClassOf']): - biolink_category_dict[node_id].append(child_node_id) - queue.append(child_node_id) - - for parent, children in biolink_category_dict.items(): - parent = biolink_ontology.node(parent)['lbl'] - for child in children: - if parent not in biolink_category_tree: - biolink_category_tree[parent] = [] - child = biolink_ontology.node(child)['lbl'] - biolink_category_tree[parent].append(child) - biolink_category_tree[parent] = sorted(biolink_category_tree[parent]) - - return biolink_category_tree - - -def get_depths_of_ontology_terms(ontology: ontobio.ontol.Ontology, - top_node_id: str): - queue = collections.deque([top_node_id]) - distances = dict() - distances[top_node_id] = 0 - while len(queue) > 0: - node_id = queue.popleft() - node_dist = distances.get(node_id, math.inf) - assert not math.isinf(node_dist) - for child_node_id in ontology.children(node_id, ['subClassOf']): - if math.isinf(distances.get(child_node_id, math.inf)): - distances[child_node_id] 
= node_dist + 1 - queue.append(child_node_id) - return distances - - -def get_biolink_categories_ontology_depths(biolink_ontology: ontobio.ontol.Ontology): - url_depths = get_depths_of_ontology_terms(biolink_ontology, CURIE_PREFIX_BIOLINK + ':NamedThing') - ret_depths = {key.replace(BASE_URL_BIOLINK_META, ''): value for key, value in url_depths.items()} - ret_depths['UnknownCategory'] = -1 - return ret_depths - - -def make_uri_curie_mappers(curies_to_uri_file_name: str) -> Dict[str, callable]: - yaml_string = read_file_to_string(curies_to_uri_file_name) - expand_map = make_curies_to_uri_map(yaml_string, IDMapperType.EXPAND) - contract_map = make_curies_to_uri_map(yaml_string, IDMapperType.CONTRACT) - expander = make_curie_to_uri_expander(expand_map) - contracter = make_uri_to_curie_shortener(contract_map) - return {'expand': expander, 'contract': contracter} - - def log_message(message: str, ontology_name: str = None, node_curie_id: str = None, @@ -807,15 +683,32 @@ def predicate_label_to_curie(predicate_label: str, predicate_label_to_use = predicate_label.replace(':', '_') return relation_curie_prefix + ':' + predicate_label_to_use - -def ont_children_recursive(ont_hier: ontobio.ontol.Ontology, - node_name: str): - res_set = {node_name} - for child_node_name in ont_hier.children(node_name): - res_set |= ont_children_recursive(ont_hier, child_node_name) - return res_set - - +def construct_biolink_term_set(is_a_base, biolink_terms): + output_set = set() + for key in biolink_terms: + key_is_a = biolink_terms[key] + if key_is_a == is_a_base: + for item in construct_biolink_term_set(key, biolink_terms): + output_set.add(item) + output_set.add(is_a_base) + return output_set + +def identify_biolink_terms(biolink_model): + biolink_predicate_terms = dict() + biolink_category_terms = dict() + for predicate in biolink_model["slots"]: + if DESCENDANT_KEY in biolink_model["slots"][predicate]: + biolink_predicate_terms[predicate] = biolink_model["slots"][predicate][DESCENDANT_KEY] + + for category in biolink_model["classes"]: + if DESCENDANT_KEY in biolink_model["classes"][category]: + biolink_category_terms[category] = biolink_model["classes"][category][DESCENDANT_KEY] + + biolink_predicates = construct_biolink_term_set(BASE_PREDICATE, biolink_predicate_terms) + biolink_categories = construct_biolink_term_set(BASE_CATEGORY, biolink_category_terms) + + return list(biolink_predicates), list(biolink_categories) + def make_edge_biolink(subject_curie_id: str, object_curie_id: str, predicate_label: str, @@ -839,57 +732,4 @@ def is_a_valid_http_url(id: str) -> bool: valid = id.startswith('http://') or id.startswith('https://') except validators.ValidationFailure: valid = False - return valid - - -def load_ontology_from_owl_or_json_file(ontology_file_name: str): - if ontology_file_name.startswith('./'): - ontology_file_name = ontology_file_name[2:(len(ontology_file_name)+1)] - ont_factory = ontobio.ontol_factory.OntologyFactory() - return ont_factory.create(ontology_file_name, ignore_cache=True) - - -# This function will load the ontology object from a pickle file (if it exists) -# or it will create the ontology object by parsing the OWL-XML ontology file -# NOTE: it seems that ontobio can't directly read a TTL file (at least, it is -# not working for me), so we convert all input files (whether OWL or TTL) to -# JSON and then load the JSON files using ontobio, for "simplicity". 
A second -# reason why we load using JSON is because when it loads an OWL file, ontobio -# does some internal caching that cannot be opted out of; it does not do this -# caching if you load an ontology in JSON format. -def make_ontology_from_local_file(file_name: str, save_pickle: bool = False): - file_name_without_ext = os.path.splitext(file_name)[0] - file_name_with_pickle_ext = file_name_without_ext + ".pickle" - if not os.path.isfile(file_name_with_pickle_ext) or save_pickle: - # the ontology hsa not been saved as a pickle file, so we need to load it from a text file - if not file_name.endswith('.json'): - temp_file_name = tempfile.mkstemp(prefix=TEMP_FILE_PREFIX + '-')[1] + '.json' - size = os.path.getsize(file_name) - log_message(message="Reading ontology file: " + file_name + "; size: " + "{0:.2f}".format(size/1024) + " KiB", - ontology_name=None) - cp = subprocess.run(['owltools', file_name, '-o', '-f', 'json', temp_file_name], - check=True) - # robot commented out because it is giving a NullPointerException on umls-semantictypes.owl - # Once robot no longer gives a NullPointerException, we can use it like this: - #cp = subprocess.run(['robot', 'convert', '--input', file_name, '--output', temp_file_name]) - if cp.stdout is not None: - log_message(message="OWL convert result: " + cp.stdout, ontology_name=None, output_stream=sys.stdout) - if cp.stderr is not None: - log_message(message="OWL convert result: " + cp.stderr, ontology_name=None, output_stream=sys.stderr) - assert cp.returncode == 0 - json_file = file_name_without_ext + ".json" - shutil.move(temp_file_name, json_file) - else: - json_file = file_name - size = os.path.getsize(json_file) - log_message(message="Reading ontology JSON file: " + json_file + "; size: " + "{0:.2f}".format(size/1024) + " KiB", - ontology_name=None) - assert os.path.exists(json_file) - ont_return = load_ontology_from_owl_or_json_file(json_file) - if save_pickle: - pickle.dump(ont_return, open(file_name_with_pickle_ext, 'wb')) - else: - size = os.path.getsize(file_name_with_pickle_ext) - log_message("Reading ontology file: " + file_name_with_pickle_ext + "; size: " + "{0:.2f}".format(size/1024) + " KiB", ontology_name=None) - ont_return = pickle.load(open(file_name_with_pickle_ext, "rb")) - return ont_return + return valid \ No newline at end of file From a8906951c0ff5462592e32ab16f41af4a6f57528 Mon Sep 17 00:00:00 2001 From: ecwood Date: Thu, 29 Aug 2024 01:57:56 -0700 Subject: [PATCH 070/125] #387 addressing a name change --- validate/validate_curies_to_categories_yaml.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/validate/validate_curies_to_categories_yaml.py b/validate/validate_curies_to_categories_yaml.py index 80cdde53..8d8d6131 100755 --- a/validate/validate_curies_to_categories_yaml.py +++ b/validate/validate_curies_to_categories_yaml.py @@ -30,8 +30,8 @@ def make_arg_parser(): args = make_arg_parser().parse_args() curies_to_categories_file_name = args.curiesToCategoriesFile curies_to_urls_map_file_name = args.curiesToURLsMapFile -biolink_model_url = args.biolinkModelOWLURL -biolink_model_file_name = args.biolinkModelOWLLocalFile +biolink_model_url = args.biolinkModelYAMLURL +biolink_model_file_name = args.biolinkModelYAMLLocalFile curies_to_categories_data = kg2_util.safe_load_yaml_from_string(kg2_util.read_file_to_string(curies_to_categories_file_name)) curies_to_url_map_data = kg2_util.safe_load_yaml_from_string(kg2_util.read_file_to_string(curies_to_urls_map_file_name)) curies_to_url_map_data_bidir = 
From a8906951c0ff5462592e32ab16f41af4a6f57528 Mon Sep 17 00:00:00 2001
From: ecwood
Date: Thu, 29 Aug 2024 01:57:56 -0700
Subject: [PATCH 070/125] #387 addressing a name change

---
 validate/validate_curies_to_categories_yaml.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/validate/validate_curies_to_categories_yaml.py b/validate/validate_curies_to_categories_yaml.py
index 80cdde53..8d8d6131 100755
--- a/validate/validate_curies_to_categories_yaml.py
+++ b/validate/validate_curies_to_categories_yaml.py
@@ -30,8 +30,8 @@ def make_arg_parser():
 args = make_arg_parser().parse_args()
 curies_to_categories_file_name = args.curiesToCategoriesFile
 curies_to_urls_map_file_name = args.curiesToURLsMapFile
-biolink_model_url = args.biolinkModelOWLURL
-biolink_model_file_name = args.biolinkModelOWLLocalFile
+biolink_model_url = args.biolinkModelYAMLURL
+biolink_model_file_name = args.biolinkModelYAMLLocalFile
 curies_to_categories_data = kg2_util.safe_load_yaml_from_string(kg2_util.read_file_to_string(curies_to_categories_file_name))
 curies_to_url_map_data = kg2_util.safe_load_yaml_from_string(kg2_util.read_file_to_string(curies_to_urls_map_file_name))
 curies_to_url_map_data_bidir = {next(iter(listitem.keys())) for listitem in curies_to_url_map_data['use_for_bidirectional_mapping']}

From 86492f525239b08b74b3dc91b7a47cea9dd7a32e Mon Sep 17 00:00:00 2001
From: ecwood
Date: Thu, 29 Aug 2024 02:01:47 -0700
Subject: [PATCH 071/125] #387 format adjustment

---
 validate/validate_curies_to_categories_yaml.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/validate/validate_curies_to_categories_yaml.py b/validate/validate_curies_to_categories_yaml.py
index 8d8d6131..63b96f88 100755
--- a/validate/validate_curies_to_categories_yaml.py
+++ b/validate/validate_curies_to_categories_yaml.py
@@ -51,6 +51,4 @@ def make_arg_parser():
     list(curies_to_categories_data['term-mappings'].values())

 for category in categories_to_check:
-    category_camelcase = kg2_util.convert_space_case_to_camel_case(category)
-    category_curie = kg2_util.CURIE_PREFIX_BIOLINK + ':' + category_camelcase
-    assert category_curie in biolink_categories, category_curie
+    assert category in biolink_categories, category

From 86d431ffdc0dbfbdb299916cd81e9967a373a3c1 Mon Sep 17 00:00:00 2001
From: ecwood
Date: Thu, 29 Aug 2024 02:05:30 -0700
Subject: [PATCH 072/125] #387 adjustments to the mapping file to go along with
 new comparison system

---
 maps/curies-to-urls-map.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/maps/curies-to-urls-map.yaml b/maps/curies-to-urls-map.yaml
index 3452641c..c81cb4ed 100644
--- a/maps/curies-to-urls-map.yaml
+++ b/maps/curies-to-urls-map.yaml
@@ -324,7 +324,7 @@ use_for_bidirectional_mapping:
   -
     NBO-PROPERTY: 'http://purl.obolibrary.org/obo/nbo#'
   -
-    NCBIGene: 'http://identifiers.org/ncbigene/'
+    NCBIGene: 'https://identifiers.org/ncbigene:'
 #  -
 #    NCBITaxon: 'http://purl.obolibrary.org/obo/ncbitaxon/subsets/taxslim.owl'
   -

From e63311b38998edab071708809cbb4e7df8aa1232 Mon Sep 17 00:00:00 2001
From: ecwood
Date: Thu, 29 Aug 2024 02:09:20 -0700
Subject: [PATCH 073/125] #387 apparently these have to be different to match
 biolink

---
 maps/curies-to-urls-map.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/maps/curies-to-urls-map.yaml b/maps/curies-to-urls-map.yaml
index c81cb4ed..3452641c 100644
--- a/maps/curies-to-urls-map.yaml
+++ b/maps/curies-to-urls-map.yaml
@@ -324,7 +324,7 @@ use_for_bidirectional_mapping:
   -
     NBO-PROPERTY: 'http://purl.obolibrary.org/obo/nbo#'
   -
-    NCBIGene: 'https://identifiers.org/ncbigene:'
+    NCBIGene: 'http://identifiers.org/ncbigene/'
 #  -
 #    NCBITaxon: 'http://purl.obolibrary.org/obo/ncbitaxon/subsets/taxslim.owl'
   -

From 9d7213b31df87519265974a084b007f062f6ce13 Mon Sep 17 00:00:00 2001
From: ecwood
Date: Thu, 29 Aug 2024 02:13:21 -0700
Subject: [PATCH 074/125] #387 kg2_util didn't previously commit correctly

---
 kg2_util.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kg2_util.py b/kg2_util.py
index b694ea62..a32815f2 100644
--- a/kg2_util.py
+++ b/kg2_util.py
@@ -156,7 +156,7 @@
 BASE_URL_KEGG_GLYCAN = BASE_BASE_URL_IDENTIFIERS_ORG + 'kegg.glycan:'
 BASE_URL_KEGG_REACTION = BASE_BASE_URL_IDENTIFIERS_ORG + 'kegg.reaction:'
 BASE_URL_MIRBASE = BASE_BASE_URL_IDENTIFIERS_ORG + 'mirbase:'
-BASE_URL_NCBIGENE = BASE_BASE_URL_IDENTIFIERS_ORG + 'ncbigene:'
+BASE_URL_NCBIGENE = 'http://identifiers.org/ncbigene/'
 BASE_URL_OBO_FORMAT = 'http://purl.org/obo/owl/oboFormat#oboFormat_'
 BASE_URL_OWL = 'http://www.w3.org/2002/07/owl#'
 BASE_URL_PATHWHIZ = 'http://smpdb.ca/pathways/#'
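
Patches 072 through 074 settle the NCBIGene entry on http://identifiers.org/ncbigene/ on both sides: the bidirectional CURIE-to-URL map and the kg2_util.BASE_URL_NCBIGENE constant. A small sketch of the expansion and contraction behavior such an entry drives; expand_curie and contract_iri are hypothetical stand-ins for illustration, not the kg2_util mappers:

```python
# Illustrative only: what one bidirectional map entry implies for ID handling.
NCBIGENE_BASE = 'http://identifiers.org/ncbigene/'  # value patches 073/074 settle on

def expand_curie(curie: str) -> str:
    prefix, local_id = curie.split(':', 1)
    assert prefix == 'NCBIGene', curie
    return NCBIGENE_BASE + local_id

def contract_iri(iri: str) -> str:
    assert iri.startswith(NCBIGENE_BASE), iri
    return 'NCBIGene:' + iri[len(NCBIGENE_BASE):]

assert expand_curie('NCBIGene:1017') == 'http://identifiers.org/ncbigene/1017'
assert contract_iri('http://identifiers.org/ncbigene/1017') == 'NCBIGene:1017'
```

If the YAML map and the constant disagree, an IRI emitted under one base cannot be contracted under the other, which is the mismatch patch 074 closes.
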
From e1919881be262272479dcd09b2039560d8d6172 Mon Sep 17 00:00:00 2001
From: ecwood
Date: Thu, 29 Aug 2024 12:50:41 -0700
Subject: [PATCH 075/125] #387 the recursive category picker is working

python3 ontologies_jsonl_to_kg_jsonl.py ontologies.json maps/curies-to-categories.yaml null > ontology_nodes.json

---
 ontologies_jsonl_to_kg_jsonl.py | 122 +++++++++++++++++++++++---------
 1 file changed, 88 insertions(+), 34 deletions(-)

diff --git a/ontologies_jsonl_to_kg_jsonl.py b/ontologies_jsonl_to_kg_jsonl.py
index 93e119db..34494e56 100644
--- a/ontologies_jsonl_to_kg_jsonl.py
+++ b/ontologies_jsonl_to_kg_jsonl.py
@@ -42,6 +42,13 @@
                    "oboInOwl:hasDbXref": TEXT_KEY,
                    "oboInOwl:xref": TEXT_KEY}

+CLASS_TO_SUPERCLASSES = dict()
+SAVED_NODE_INFO = dict()
+SOURCE_INFO = dict()
+
+NODE_CATEGORY_MAPPINGS = dict()
+PREFIX_MAPPINGS = dict()
+
 CLASSES_DICT = dict()

 URI_MAP = dict()
@@ -49,21 +56,57 @@

 MISSING_ID_PREFIXES = set()

+FILE_MAPPING = "file"
+PREFIX_MAPPING = "prefix"
+RECURSE_MAPPING = "recurse"
+
 def get_args():
     arg_parser = argparse.ArgumentParser()
     arg_parser.add_argument('--test', dest='test', action="store_true", default=False)
     arg_parser.add_argument('inputFile', type=str)
+    arg_parser.add_argument('curiesToCategoriesYAML', type=str)
     arg_parser.add_argument('outputFile', type=str)
     return arg_parser.parse_args()

+def categorize_node(node_id, recursion_depth=0):
+    node_prefix = node_id.split(':')[0]
+
+    if node_id in NODE_CATEGORY_MAPPINGS and NODE_CATEGORY_MAPPINGS[node_id][1] == FILE_MAPPING:
+        return NODE_CATEGORY_MAPPINGS[node_id][0]
+
+    if node_prefix in PREFIX_MAPPINGS:
+        node_category = PREFIX_MAPPINGS[node_prefix]
+        NODE_CATEGORY_MAPPINGS[node_id] = (node_category, PREFIX_MAPPING)
+        return PREFIX_MAPPINGS[node_prefix]
+
+    # Try to get the most common superclass categorization
+    superclass_categorizations = dict()
+    highest_value = 0
+    highest_category = kg2_util.BIOLINK_CATEGORY_NAMED_THING
+    if recursion_depth == 10:
+        return kg2_util.BIOLINK_CATEGORY_NAMED_THING
+
+    for superclass in CLASS_TO_SUPERCLASSES.get(node_id, list()):
+        superclass_category = categorize_node(superclass, recursion_depth + 1)
+        if superclass_category not in superclass_categorizations:
+            superclass_categorizations[superclass_category] = 0
+        superclass_categorizations[superclass_category] += 1
+        if superclass_categorizations[superclass_category] > highest_value:
+            highest_value = superclass_categorizations[superclass_category]
+            highest_category = superclass_category
+
+    NODE_CATEGORY_MAPPINGS[node_id] = (highest_category, RECURSE_MAPPING)
+    return highest_category
+
+
 def process_ontology_item(ontology_item):
     source = ontology_item.get(OWL_SOURCE_KEY, str())
     for owl_class in ontology_item.get(OWL_CLASS_TAG, list()):
         # Typically genid classes which don't neatly map onto the KG2 schema
         if ID_TAG not in owl_class:
             continue
-        # TODO: MAP THIS HERE, since not all sources use same IRIs for the same nodes
         node_id = match_prefix(owl_class.get(ID_TAG, str()))
         if node_id is None:
             continue
@@ -123,6 +166,7 @@ def process_ontology_item(ontology_item):
         if RESOURCE_KEY in edge:
             edges_list.append((general_edge_type, edge.get(RESOURCE_KEY, None)))

+        superclasses = set()
         final_edges_list = list()
         for (edge_relation, edge_object) in edges_list:
             edge_object = match_prefix(edge_object)
@@ -131,37 +175,38 @@ def process_ontology_item(ontology_item):
             edge_relation = match_prefix(edge_relation)
             if edge_relation is None:
                 continue
+            if edge_relation in ["rdfs:subClassOf"]:
+                superclasses.add(edge_object)
final_edges_list.append((edge_relation, edge_object)) + # Imperfect way to make it deterministic + superclasses = sorted(list(superclasses)) + if node_id not in CLASS_TO_SUPERCLASSES: + CLASS_TO_SUPERCLASSES[node_id] = list() + CLASS_TO_SUPERCLASSES[node_id] += superclasses + CLASS_TO_SUPERCLASSES[node_id] = sorted(list(set(CLASS_TO_SUPERCLASSES[node_id]))) + + if node_id not in SAVED_NODE_INFO: + SAVED_NODE_INFO[node_id] = list() + SAVED_NODE_INFO[node_id].append({"id": node_id, "description_list": description_list, "name": name_list, "source": source, "has_biological_sequence": has_biological_sequence, "edges": final_edges_list}) + + for ontology_node in ontology_item.get("owl:Ontology", list()): + ontology_version = None + ontology_versions = [version.get(TEXT_KEY, str()) for version in ontology_node.get("owl:versionInfo", list()) if TEXT_KEY in version] + ontology_version_iri = [version.get(RESOURCE_KEY, str()) for version in ontology_node.get("owl:versionIRI", list()) if RESOURCE_KEY in version] + ontology_date = [version.get(TEXT_KEY, str()) for date_type in ["oboInOwl:date", "dcterms:date", "dc:date"] for version in ontology_node.get(date_type, list()) if TEXT_KEY in version] + if len(ontology_versions) == 1: + ontology_version = ontology_versions[0] + elif len(ontology_version_iri) == 1: + ontology_version = ontology_version_iri[0] + elif len(ontology_date) == 1: + ontology_version = ontology_date[0] + + if ontology_version is None: + print("Warning: source", source, "lacks any versioning information.") + if source not in SOURCE_INFO: + SOURCE_INFO[source] = {"source": source, "ontology_date": ontology_date, "ontology_version": ontology_version} - # node_id = owl_class.get(ID_TAG, list()) - - # superclasses = [superclass.get(RESOURCE_KEY, str()) for superclass in owl_class.get(SUBCLASS_TAG, list())] - - # # Also query for comments? 
- # # Descriptions appear to be additive in current KG2 - # descriptions = owl_class.get(DESCRIPTION_TAG, list()) - # assert len(descriptions) <= 1 - # description = str() - # for element in descriptions: - # description += element[TEXT_KEY] - - # xrefs = [xref[TEXT_KEY] for xref in owl_class.get(XREF_TAG, list())] - # for element in owl_class.get(XREF_TAG, list()): - # xrefs.append(element[TEXT_KEY]) - - # exact_matches = [exact_match[RESOURCE_KEY] for exact_match in owl_class.get(EXACT_MATCH_TAG, list())] - - # names = owl_class.get(NAME_TAG, list()) - # assert len(names) <= 1, ontology_item - # name = str() - # for element in names: - # name += element[TEXT_KEY] - - # node = {"id": node_id, "superclasses": superclasses, "description": description, "xrefs": xrefs, "name": name, "exact_matches": exact_matches} - - node = {"id": node_id, "description_list": description_list, "name": name_list, "source": source, "has_biological_sequence": has_biological_sequence, "edges": final_edges_list} - print(json.dumps(node, indent=4)) def generate_uri_map(): uri_input_map = kg2_util.safe_load_yaml_from_string(kg2_util.read_file_to_string("maps/curies-to-urls-map.yaml")) @@ -201,8 +246,15 @@ def match_prefix(node_id): if __name__ == '__main__': args = get_args() input_file_name = args.inputFile + curies_to_categories_file_name = args.curiesToCategoriesYAML output_file_name = args.outputFile + curies_to_categories_data = kg2_util.safe_load_yaml_from_string(kg2_util.read_file_to_string(curies_to_categories_file_name)) + for mapping_node in curies_to_categories_data["term-mappings"]: + NODE_CATEGORY_MAPPINGS[mapping_node] = (curies_to_categories_data["term-mappings"][mapping_node], FILE_MAPPING) + for prefix in curies_to_categories_data["prefix-mappings"]: + PREFIX_MAPPINGS[prefix] = curies_to_categories_data["prefix-mappings"][prefix] + input_read_jsonlines_info = kg2_util.start_read_jsonlines(input_file_name) input_data = input_read_jsonlines_info[0] @@ -211,9 +263,11 @@ def match_prefix(node_id): generate_uri_map() for ontology_item in input_data: process_ontology_item(ontology_item) - print(json.dumps(sorted(list(MISSING_ID_PREFIXES)), indent=4)) - # print("OWL Classes:", owl_class_count) - # for key in KEYS_DICT: - # KEYS_DICT[key] = KEYS_DICT[key] / owl_class_count - # print(json.dumps(KEYS_DICT, indent=4, sort_keys=True)) \ No newline at end of file + for node_id in SAVED_NODE_INFO: + categorize_node(node_id) + + print(json.dumps(NODE_CATEGORY_MAPPINGS, indent=4)) + + # Can add this back in later + # print(json.dumps(sorted(list(MISSING_ID_PREFIXES)), indent=4)) From 314b54f14b3731d50fcd6ee185347239271546de Mon Sep 17 00:00:00 2001 From: ecwood Date: Sun, 1 Sep 2024 22:55:32 -0700 Subject: [PATCH 076/125] #387 some date restructuring, ontology node versioning, and changes to handle ORDO --- ontologies_jsonl_to_kg_jsonl.py | 219 ++++++++++++++++++++++++++++---- 1 file changed, 197 insertions(+), 22 deletions(-) diff --git a/ontologies_jsonl_to_kg_jsonl.py b/ontologies_jsonl_to_kg_jsonl.py index 34494e56..c04b4b70 100644 --- a/ontologies_jsonl_to_kg_jsonl.py +++ b/ontologies_jsonl_to_kg_jsonl.py @@ -1,6 +1,7 @@ import argparse import kg2_util import json +import datetime OWL_CLASS_TAG = "owl:Class" SUBCLASS_TAG = "rdfs:subClassOf" @@ -15,10 +16,12 @@ RESOURCE_KEY = "rdf:resource" OWL_SOURCE_KEY = "owl_source" +OWL_SOURCE_NAME_KEY = "owl_source_name" KEYS_DICT = dict() COMMENT_PREFIX = "COMMENTS: " +DESCRIPTION_DELIM = " // " BASE_EDGE_TYPES = {"mondo-base:exactMatch": RESOURCE_KEY, 
"mondo-base:closeMatch": RESOURCE_KEY, @@ -53,6 +56,7 @@ URI_MAP = dict() URI_MAP_KEYS = list() +PREFIX_TO_IRI_MAP = dict() MISSING_ID_PREFIXES = set() @@ -60,13 +64,28 @@ PREFIX_MAPPING = "prefix" RECURSE_MAPPING = "recurse" +ID_KEY = "id" +DEPRECATED_KEY = "deprecated" +UPDATE_DATE_KEY = "update_date" +CREATION_DATE_KEY = "creation_date" +SYNONYM_KEY = "synonym" +DESCRIPTION_KEY = "description_list" +NAME_KEY = "name" +SOURCE_KEY = "source" +BIOLOGICAL_SEQUENCE_KEY = "has_biological_sequence" +CATEGORY_KEY = "category" +EDGES_KEY = "edges" +IRI_KEY = "iri" +VERSION_KEY = "version" + def get_args(): arg_parser = argparse.ArgumentParser() arg_parser.add_argument('--test', dest='test', action="store_true", default=False) arg_parser.add_argument('inputFile', type=str) arg_parser.add_argument('curiesToCategoriesYAML', type=str) - arg_parser.add_argument('outputFile', type=str) + arg_parser.add_argument('outputNodesFile', type=str) + arg_parser.add_argument('outputEdgesFile', type=str) return arg_parser.parse_args() def categorize_node(node_id, recursion_depth=0): @@ -99,10 +118,93 @@ def categorize_node(node_id, recursion_depth=0): NODE_CATEGORY_MAPPINGS[node_id] = (highest_category, RECURSE_MAPPING) return highest_category +def reformat_obo_date(date_str): + if date_str is None: + return None + + if '-' in date_str: + delim = 'T' + if ' ' in date_str: + delim = ' ' + date_spl = date_str.strip('Z').split(delim) + date_fh = date_spl[0].split('-') + year = int(date_fh[0]) + month = int(date_fh[1]) + day = int(date_fh[2]) + + if month < 1 or month > 12 or day < 1 or day > 31: + return None + + if len(date_spl) > 1: + date_sh = date_spl[1].split(':') + hour = int(date_sh[0]) + minute = int(date_sh[1]) + second = int(date_sh[2][0:1]) + + return datetime.datetime(year, month, day, hour, minute, second) + else: + return datetime.datetime(year, month, day) + else: + date_spl = date_str.split(' ') + date_fh = date_spl[0].split(':') + year = int(date_fh[2]) + month = int(date_fh[1]) + day = int(date_fh[0]) + + if month < 1 or month > 12 or day < 1 or day > 31: + return None + + return datetime.datetime(year, month, day) + +def pick_most_recent_date(dates, alternate_date=None): + latest_date = None + for date in dates: + if date == None: + continue + if latest_date == None or date > latest_date: + latest_date = date + + if latest_date == None: + if alternate_date is not None: + latest_date = alternate_date + else: + return None + + return latest_date.isoformat(sep=' ') + +def process_ontology_term(ontology_node, source, ontology_name, owl_source=True): + owl_prefix = "" + if owl_source: + owl_prefix = "owl:" + ontology_version = None + ontology_versions = [version.get(TEXT_KEY, str()) for version in ontology_node.get(owl_prefix + "versionInfo", list()) if TEXT_KEY in version] + ontology_version_iri = [version.get(RESOURCE_KEY, str()) for version in ontology_node.get(owl_prefix + "versionIRI", list()) if RESOURCE_KEY in version] + ontology_dates = [reformat_obo_date(version.get(TEXT_KEY, str())) for date_type in ["oboInOwl:date", "dcterms:date", "dc:date"] for version in ontology_node.get(date_type, list()) if TEXT_KEY in version] + ontology_iri = ontology_node.get("rdf:about", str()) + if len(ontology_versions) == 1: + ontology_version = ontology_versions[0] + elif len(ontology_version_iri) == 1: + ontology_version = ontology_version_iri[0] + version_replacements = [ontology_iri.replace('.owl', '') + '/', '/' + source, 'releases/'] + for replacement in version_replacements: + ontology_version = 
ontology_version.replace(replacement, "") + ontology_version = ontology_version.split('/')[0] + elif len(ontology_dates) >= 1: + ontology_version = pick_most_recent_date(ontology_dates) + + if ontology_version is None: + print("Warning: source", source, "lacks any versioning information.") + + ontology_date = reformat_obo_date(pick_most_recent_date(ontology_dates)) + source_id = kg2_util.CURIE_PREFIX_OBO + ':' + source + + if source not in SOURCE_INFO: + SOURCE_INFO[source] = {SOURCE_KEY: source_id, IRI_KEY: ontology_iri, NAME_KEY: ontology_name, UPDATE_DATE_KEY: ontology_date, VERSION_KEY: ontology_version} def process_ontology_item(ontology_item): source = ontology_item.get(OWL_SOURCE_KEY, str()) + ontology_name = ontology_item.get(OWL_SOURCE_NAME_KEY, str()) for owl_class in ontology_item.get(OWL_CLASS_TAG, list()): # Typically genid classes which don't neatly map onto the KG2 schema if ID_TAG not in owl_class: @@ -110,6 +212,8 @@ def process_ontology_item(ontology_item): node_id = match_prefix(owl_class.get(ID_TAG, str())) if node_id is None: continue + node_prefix = node_id.split(':')[0] + node_iri = PREFIX_TO_IRI_MAP[node_prefix] + node_id.replace(node_prefix + ':', '') # Configure the name name_list = [name.get(TEXT_KEY, None) for name in owl_class.get("rdfs:label", dict()) if TEXT_KEY in name] @@ -124,6 +228,26 @@ def process_ontology_item(ontology_item): description_list += [description.get(TEXT_KEY, None) for description in owl_class.get("obo:UBPROP_0000005", list()) if (TEXT_KEY in description)] description_list += [description.get(TEXT_KEY, None) for description in owl_class.get("efo1:source_description", list()) if (TEXT_KEY in description)] + deprecated = "true" in owl_class.get("owl:deprecated", list()) + for name in name_list: + if name.startswith("obsolete") or name.startswith("(obsolete") or name.endswith("obsolete"): + deprecated = True + + # Configure the synonyms + synonym_list = list() + synonym_keys = ["oboInOwl:hasExactSynonym", "oboInOwl:hasRelatedSynonym", "oboInOwl:hasNarrowSynonym", "oboInOwl:hasBroadSynonym", "go:hasExactSynonym", + "go:hasSynonym", "go:hasNarrowSynonym", "go:hasBroadSynonym", "obo:IAO_0000118", "obo:IAO_0000589", "go:hasRelatedSynonym", "obo:IAO_0000111", + "obo:IAO_0000028", "skos:prefLabel"] + synonym_list += [synonym.get(TEXT_KEY, None) for synonym_key in synonym_keys for synonym in owl_class.get(synonym_key, list()) if (TEXT_KEY in synonym)] + + update_date_list = list() + update_date_keys = ["dc:date", "dcterms:date", "terms:date"] + update_date_list += [reformat_obo_date(update_date.get(TEXT_KEY, None)) for update_date_key in update_date_keys for update_date in owl_class.get(update_date_key, list()) if (TEXT_KEY in update_date)] + + creation_date_list = list() + creation_date_keys = ["oboInOwl:creation_date", "go:creation_date"] + creation_date_list += [reformat_obo_date(creation_date.get(TEXT_KEY, None)) for creation_date_key in creation_date_keys for creation_date in owl_class.get(creation_date_key, list()) if (TEXT_KEY in creation_date)] + # Configure the biological sequence has_biological_sequence = dict() has_biological_sequence['formula'] = [biological_sequence.get(TEXT_KEY, None) for biological_sequence in owl_class.get("chebi:formula", list()) if TEXT_KEY in biological_sequence] @@ -188,25 +312,24 @@ def process_ontology_item(ontology_item): if node_id not in SAVED_NODE_INFO: SAVED_NODE_INFO[node_id] = list() - SAVED_NODE_INFO[node_id].append({"id": node_id, "description_list": description_list, "name": name_list, "source": 
source, "has_biological_sequence": has_biological_sequence, "edges": final_edges_list}) + SAVED_NODE_INFO[node_id].append({ID_KEY: node_id, + DEPRECATED_KEY: deprecated, + UPDATE_DATE_KEY: update_date_list, + CREATION_DATE_KEY: creation_date_list, + SYNONYM_KEY: synonym_list, + DESCRIPTION_KEY: description_list, + NAME_KEY: name_list, + SOURCE_KEY: source, + BIOLOGICAL_SEQUENCE_KEY: has_biological_sequence, + IRI_KEY: node_iri, + EDGES_KEY: final_edges_list}) for ontology_node in ontology_item.get("owl:Ontology", list()): - ontology_version = None - ontology_versions = [version.get(TEXT_KEY, str()) for version in ontology_node.get("owl:versionInfo", list()) if TEXT_KEY in version] - ontology_version_iri = [version.get(RESOURCE_KEY, str()) for version in ontology_node.get("owl:versionIRI", list()) if RESOURCE_KEY in version] - ontology_date = [version.get(TEXT_KEY, str()) for date_type in ["oboInOwl:date", "dcterms:date", "dc:date"] for version in ontology_node.get(date_type, list()) if TEXT_KEY in version] - if len(ontology_versions) == 1: - ontology_version = ontology_versions[0] - elif len(ontology_version_iri) == 1: - ontology_version = ontology_version_iri[0] - elif len(ontology_date) == 1: - ontology_version = ontology_date[0] - - if ontology_version is None: - print("Warning: source", source, "lacks any versioning information.") - if source not in SOURCE_INFO: - SOURCE_INFO[source] = {"source": source, "ontology_date": ontology_date, "ontology_version": ontology_version} + process_ontology_term(ontology_node, source, ontology_name) + # Because of ORDO + for ontology_node in ontology_item.get("Ontology", list()): + process_ontology_term(ontology_node, source, ontology_name, False) def generate_uri_map(): uri_input_map = kg2_util.safe_load_yaml_from_string(kg2_util.read_file_to_string("maps/curies-to-urls-map.yaml")) @@ -217,6 +340,7 @@ def generate_uri_map(): for curie_prefix in curie_prefix_dict: curie_url = curie_prefix_dict[curie_prefix] URI_MAP[curie_url] = curie_prefix + PREFIX_TO_IRI_MAP[curie_prefix] = curie_url for curie_prefix_dict in contraction_map: for curie_prefix in curie_prefix_dict: @@ -242,12 +366,62 @@ def match_prefix(node_id): else: MISSING_ID_PREFIXES.add(node_id) +def construct_nodes_and_edges(nodes_output, edges_output): + for source in SOURCE_INFO: + source_date = pick_most_recent_date([SOURCE_INFO[source][UPDATE_DATE_KEY]]) + source_name = SOURCE_INFO[source][NAME_KEY] + " v" + SOURCE_INFO[source][VERSION_KEY] + source_id = SOURCE_INFO[source][SOURCE_KEY] + source_iri = SOURCE_INFO[source][IRI_KEY] + node = kg2_util.make_node(source_id, source_iri, source_name, kg2_util.BIOLINK_CATEGORY_INFORMATION_CONTENT_ENTITY, source_date, source_id) + + nodes_output.write(node) + + + for node_id in SAVED_NODE_INFO: + for source_node_index in range(len(SAVED_NODE_INFO[node_id])): + if SAVED_NODE_INFO[node_id][source_node_index][DEPRECATED_KEY]: + continue + name = SAVED_NODE_INFO[node_id][source_node_index][NAME_KEY][0] # Imperfect way of choosing the name + node_iri = SAVED_NODE_INFO[node_id][source_node_index][IRI_KEY] + description = DESCRIPTION_DELIM.join(SAVED_NODE_INFO[node_id][source_node_index][DESCRIPTION_KEY]) + has_biological_sequence = SAVED_NODE_INFO[node_id][source_node_index][BIOLOGICAL_SEQUENCE_KEY].get("smiles", None) + synonyms = SAVED_NODE_INFO[node_id][source_node_index][SYNONYM_KEY] + category = SAVED_NODE_INFO[node_id][source_node_index][CATEGORY_KEY] + + source = SAVED_NODE_INFO[node_id][source_node_index][SOURCE_KEY] + provided_by = 
kg2_util.CURIE_PREFIX_OBO + ':' + source
+            source_date = SOURCE_INFO[source][UPDATE_DATE_KEY]
+
+            update_date = pick_most_recent_date(SAVED_NODE_INFO[node_id][source_node_index][UPDATE_DATE_KEY], source_date)
+            creation_date = pick_most_recent_date(SAVED_NODE_INFO[node_id][source_node_index][CREATION_DATE_KEY], source_date)
+
+            node = kg2_util.make_node(node_id, node_iri, name, category, update_date, provided_by)
+            node["description"] = description
+            node["has_biological_sequence"] = has_biological_sequence
+            node["creation_date"] = creation_date
+            node["synonym"] = synonyms
+
+            nodes_output.write(node)
+
+            for (edge_relation, edge_object) in SAVED_NODE_INFO[node_id][source_node_index][EDGES_KEY]:
+                relation_label = edge_relation.split(':')[1]
+                edge = kg2_util.make_edge(node_id, edge_object, edge_relation, relation_label, provided_by, update_date)
+
+                edges_output.write(edge)
+
+
 if __name__ == '__main__':
     args = get_args()
     input_file_name = args.inputFile
     curies_to_categories_file_name = args.curiesToCategoriesYAML
-    output_file_name = args.outputFile
+    output_nodes_file_name = args.outputNodesFile
+    output_edges_file_name = args.outputEdgesFile
+    test_mode = args.test
+
+    nodes_info, edges_info = kg2_util.create_kg2_jsonlines(test_mode)
+    nodes_output = nodes_info[0]
+    edges_output = edges_info[0]

     curies_to_categories_data = kg2_util.safe_load_yaml_from_string(kg2_util.read_file_to_string(curies_to_categories_file_name))
     for mapping_node in curies_to_categories_data["term-mappings"]:
@@ -258,7 +432,6 @@ def match_prefix(node_id):

     input_read_jsonlines_info = kg2_util.start_read_jsonlines(input_file_name)
     input_data = input_read_jsonlines_info[0]

-    owl_class_count = 0
     ontology_prefixes = set()
     generate_uri_map()
     for ontology_item in input_data:
@@ -266,8 +439,10 @@ def match_prefix(node_id):

     for node_id in SAVED_NODE_INFO:
         categorize_node(node_id)
+        node_category = NODE_CATEGORY_MAPPINGS[node_id][0]
+        for index in range(len(SAVED_NODE_INFO[node_id])):
+            SAVED_NODE_INFO[node_id][index][CATEGORY_KEY] = node_category

-    print(json.dumps(NODE_CATEGORY_MAPPINGS, indent=4))
+    construct_nodes_and_edges(nodes_output, edges_output)

-    # Can add this back in later
-    # print(json.dumps(sorted(list(MISSING_ID_PREFIXES)), indent=4))
+    kg2_util.close_kg2_jsonlines(nodes_info, edges_info, output_nodes_file_name, output_edges_file_name)
\ No newline at end of file
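
Patch 075 introduced categorize_node and patch 076 wires its result into every saved node record. The picker resolves a node's Biolink category by majority vote over the categories of its rdfs:subClassOf parents, after first consulting the file- and prefix-level rules from curies-to-categories.yaml and capping the recursion at depth 10. A standalone sketch of the voting logic, with invented data and without the NODE_CATEGORY_MAPPINGS memoization:

```python
# Simplified sketch of categorize_node; the toy hierarchy and category names
# below are made up. In the real build, PREFIX_MAPPINGS comes from
# maps/curies-to-categories.yaml and CLASS_TO_SUPERCLASSES from the
# rdfs:subClassOf edges harvested by process_ontology_item.
NAMED_THING = "named thing"  # stand-in for kg2_util.BIOLINK_CATEGORY_NAMED_THING
PREFIX_MAPPINGS = {"CHEBI": "chemical entity"}
CLASS_TO_SUPERCLASSES = {
    "FAKE:3": ["CHEBI:1", "CHEBI:2"],  # hypothetical IDs
    "FAKE:4": ["FAKE:3"],
}

def categorize_node(node_id, recursion_depth=0):
    node_prefix = node_id.split(':')[0]
    if node_prefix in PREFIX_MAPPINGS:
        return PREFIX_MAPPINGS[node_prefix]
    if recursion_depth == 10:  # guard against deep or cyclic hierarchies
        return NAMED_THING
    # Majority vote over the categories of this node's superclasses
    votes = dict()
    highest_value, highest_category = 0, NAMED_THING
    for superclass in CLASS_TO_SUPERCLASSES.get(node_id, list()):
        category = categorize_node(superclass, recursion_depth + 1)
        votes[category] = votes.get(category, 0) + 1
        if votes[category] > highest_value:
            highest_value, highest_category = votes[category], category
    return highest_category

print(categorize_node("FAKE:4"))  # chemical entity, inherited through FAKE:3
print(categorize_node("FAKE:9"))  # named thing: no prefix rule, no superclasses
```

Ties go to whichever category reaches the top count first, which is why the ETL sorts each superclass list before storing it ("Imperfect way to make it deterministic").
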
"owl_source_name" self.skip_tags = [self.XML_TAG, self.RDF_TAG, self.DOCTYPE_TAG] @@ -378,6 +380,8 @@ def __init__(self, input_files, output_file_name): self.ID_TO_GENIDS = dict() self.input_files = input_files + self.input_file_names = input_file_names + self.owl_file_path = owl_file_path self.output_file_name = output_file_name self.output_info = kg2_util.create_single_jsonlines() @@ -433,6 +437,7 @@ def store_genid_nest_in_class_nest(self, genid, genid_nest, class_nest): def write_to_output(self, output_dict, source_file): output_dict[self.OWL_SOURCE_KEY] = source_file + output_dict[self.OWL_SOURCE_NAME_KEY] = self.input_file_names[source_file] self.output.write(output_dict) return @@ -464,18 +469,18 @@ def triage_nest_dict(self, nest_dict): self.GENID_REMAINING_NESTS[class_id] = updated_class_nest else: # Since all of the genids used in this class have been matched, output - self.output.write(nest_dict) + self.write_to_output(nest_dict, self.input_file) self.GENID_REMAINING_NESTS[class_id] = None else: # There are no genids that need to be worked with, so just output - self.output.write(nest_dict) + self.write_to_output(nest_dict, self.input_file) def parse_OWL_file(self): for input_file in self.input_files: self.input_file = input_file print("Reading:", input_file, "starting at", date()) - self.xml_parser.divide_into_lines(input_file) + self.xml_parser.divide_into_lines(self.owl_file_path + input_file) # Genid wasn't filled, still want to include them though for item in self.GENID_REMAINING_NESTS: @@ -490,23 +495,30 @@ def parse_OWL_file(self): kg2_util.close_single_jsonlines(self.output_info, self.output_file_name) -def identify_input_files(ont_load_inventory): +def identify_and_download_input_files(ont_load_inventory, path_to_owl_files): input_files = list() + input_file_names = dict() + owl_file_path = path_to_owl_files.rstrip('/') + "/" for item in ont_load_inventory: input_files.append(item['file']) + input_file_names[item['file']] = item['title'] + print("Downloading:", item['file'], "starting at", date()) + kg2_util.download_file_if_not_exist_locally(item['url'], owl_file_path + item['file']) + print("Download of:", item['file'], "finished at", date()) - return input_files + return input_files, input_file_names, owl_file_path if __name__ == '__main__': args = get_args() input_file_name = args.inputFile + owl_path = args.owlFilePath output_file_name = args.outputFile ont_load_inventory = kg2_util.safe_load_yaml_from_string(kg2_util.read_file_to_string(input_file_name)) - input_files = identify_input_files(ont_load_inventory) + input_files, input_file_names, owl_file_path = identify_and_download_input_files(ont_load_inventory, owl_path) print("Files:", input_files) print("Start Time:", date()) - owl_parser = OWLParser(input_files, output_file_name) + owl_parser = OWLParser(input_files, input_file_names, owl_file_path, output_file_name) owl_parser.parse_OWL_file() print("End Time:", date()) \ No newline at end of file From d4ffa050bd9e9fe8aa9aab3f4f5a55d77518f301 Mon Sep 17 00:00:00 2001 From: ecwood Date: Sun, 1 Sep 2024 22:56:03 -0700 Subject: [PATCH 078/125] credits for me --- kg2_util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kg2_util.py b/kg2_util.py index a32815f2..bfb7e2a8 100644 --- a/kg2_util.py +++ b/kg2_util.py @@ -7,7 +7,7 @@ __author__ = 'Stephen Ramsey' __copyright__ = 'Oregon State University' -__credits__ = ['Stephen Ramsey'] +__credits__ = ['Stephen Ramsey', 'Erica Wood'] __license__ = 'MIT' __version__ = '0.1.0' __maintainer__ = '' From 
786050669dfbd04588f581a0e0b9446adbc28f31 Mon Sep 17 00:00:00 2001 From: ecwood Date: Sun, 1 Sep 2024 22:56:20 -0700 Subject: [PATCH 079/125] #387 removing unnecessary curies --- maps/curies-to-categories.yaml | 127 --------------------------------- 1 file changed, 127 deletions(-) diff --git a/maps/curies-to-categories.yaml b/maps/curies-to-categories.yaml index 9520387c..8b35d539 100644 --- a/maps/curies-to-categories.yaml +++ b/maps/curies-to-categories.yaml @@ -284,133 +284,6 @@ term-mappings: SNOMED:419891008: information content entity SNOMED:900000000000441003: information content entity SO:0000704: gene # formerly genomic entity - STY:T001: individual organism - STY:T002: organism taxon # formerly individual organism - STY:T004: organism taxon # formerly individual organism - STY:T005: organism taxon # formerly individual organism - STY:T007: organism taxon # formerly individual organism - STY:T008: organism taxon # formerly individual organism - STY:T010: organism taxon # formerly individual organism - STY:T011: organism taxon # formerly individual organism - STY:T012: organism taxon # formerly individual organism - STY:T013: organism taxon # formerly individual organism - STY:T014: organism taxon # formerly individual organism - STY:T015: organism taxon # formerly individual organism - STY:T016: organism taxon # formerly individual organism - STY:T017: anatomical entity - STY:T018: gross anatomical structure - STY:T019: disease - STY:T020: disease - STY:T021: gross anatomical structure - STY:T022: anatomical entity - STY:T023: gross anatomical structure - STY:T024: gross anatomical structure - STY:T025: cell - STY:T026: cellular component - STY:T028: biological entity - STY:T029: anatomical entity - STY:T030: anatomical entity - STY:T031: anatomical entity - STY:T032: named thing # formerly organism attribute - STY:T033: disease or phenotypic feature - STY:T034: phenomenon - STY:T037: pathological process - STY:T038: phenomenon - STY:T039: physiological process - STY:T040: physiological process - STY:T041: behavior - STY:T042: physiological process - STY:T043: physiological process - STY:T044: molecular activity - STY:T045: physiological process - STY:T046: pathological process - STY:T047: disease - STY:T048: disease - STY:T049: disease - STY:T050: biological entity - STY:T051: event # formerly activity - STY:T052: activity - STY:T053: behavior - STY:T054: behavior - STY:T055: behavior - STY:T056: activity - STY:T057: activity - STY:T058: activity - STY:T059: procedure - STY:T060: procedure - STY:T061: procedure - STY:T062: activity - STY:T063: procedure - STY:T064: activity - STY:T065: activity - STY:T066: activity - STY:T067: phenomenon - STY:T068: phenomenon - STY:T069: phenomenon - STY:T070: phenomenon - STY:T071: named thing - STY:T072: physical entity - STY:T073: physical entity - STY:T074: device - STY:T075: device - STY:T077: information content entity - STY:T078: information content entity - STY:T079: information content entity - STY:T080: information content entity - STY:T081: information content entity - STY:T082: information content entity - STY:T083: geographic location - STY:T085: biological entity - STY:T086: nucleic acid entity - STY:T087: polypeptide - STY:T088: biological entity - STY:T089: information content entity - STY:T090: individual organism - STY:T091: named thing - STY:T092: agent - STY:T093: agent - STY:T094: agent - STY:T095: agent - STY:T096: agent - STY:T097: cohort - STY:T098: population of individual organisms - STY:T099: cohort - STY:T100: cohort 
- STY:T101: cohort - STY:T102: information content entity - STY:T103: chemical entity # formerly chemical substance - STY:T104: chemical entity # formerly chemical substance - STY:T109: chemical entity - STY:T114: nucleic acid entity - STY:T116: polypeptide - STY:T120: chemical entity # formerly chemical substance - STY:T121: drug - STY:T122: device - STY:T123: chemical entity # formerly chemical substance - STY:T125: chemical entity # formerly chemical substance - STY:T126: protein - STY:T127: small molecule - STY:T129: biological entity # formerly chemical substance - STY:T130: chemical entity - STY:T131: chemical entity # formerly chemical substance - STY:T167: chemical entity # formerly chemical substance - STY:T168: food - STY:T169: information content entity - STY:T170: publication - STY:T171: information content entity - STY:T184: phenotypic feature - STY:T185: information content entity - STY:T190: disease - STY:T191: disease - STY:T192: protein - STY:T194: organism taxon # formerly individual organism - STY:T195: drug - STY:T196: small molecule - STY:T197: chemical entity - STY:T200: drug - STY:T201: named thing # formerly clinical attribute - STY:T203: device - STY:T204: organism taxon # formerly individual organism TRANS:0000000: named thing # formerly exposure event UBERON:0001062: anatomical entity UBERON:0000105: life stage From 0f56540ae369e55f52129da48356d9e9ece13d05 Mon Sep 17 00:00:00 2001 From: ecwood Date: Sun, 1 Sep 2024 22:56:39 -0700 Subject: [PATCH 080/125] #387 no longer want biolink as an ontology source due to the parsing hassle --- maps/ont-load-inventory.yaml | 5 ----- 1 file changed, 5 deletions(-) diff --git a/maps/ont-load-inventory.yaml b/maps/ont-load-inventory.yaml index 7b808175..590781d1 100644 --- a/maps/ont-load-inventory.yaml +++ b/maps/ont-load-inventory.yaml @@ -1,8 +1,3 @@ -- # maps to CURIE prefix: biolink - url: https://raw.githubusercontent.com/biolink/biolink-model/v4.0.0/project/owl/biolink_model.owl.ttl - file: biolink_model.owl.ttl - download: true - title: Biolink meta-model - # maps to CURIE prefix: BFO url: http://purl.obolibrary.org/obo/bfo.owl file: bfo.owl From b3c8cea3757123bc2c2e31750b0b7fcd3ce08da9 Mon Sep 17 00:00:00 2001 From: ecwood Date: Sun, 1 Sep 2024 22:57:05 -0700 Subject: [PATCH 081/125] #387 we have this predicate again --- maps/predicate-remap.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/maps/predicate-remap.yaml b/maps/predicate-remap.yaml index 4da6dcac..15452136 100644 --- a/maps/predicate-remap.yaml +++ b/maps/predicate-remap.yaml @@ -3056,9 +3056,9 @@ OBO:mondo/mondo-base#disease_responds_to: # OBO:uo#is_unit_of: # operation: invert # core_predicate: biolink:related_to -# OIO:hasDbXref: -# operation: keep -# core_predicate: biolink:close_match +OIO:hasDbXref: + operation: keep + core_predicate: biolink:close_match OMIM:CHD: operation: keep core_predicate: biolink:subclass_of From c36d3c3a9843f0f916e6200505be087e7537a084 Mon Sep 17 00:00:00 2001 From: ecwood Date: Mon, 2 Sep 2024 00:00:12 -0700 Subject: [PATCH 082/125] #387 remove sed-ing from validation tests now that biolink is gone --- validate/run-validation-tests.sh | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/validate/run-validation-tests.sh b/validate/run-validation-tests.sh index 7cdb7974..d0a3abac 100755 --- a/validate/run-validation-tests.sh +++ b/validate/run-validation-tests.sh @@ -22,13 +22,6 @@ export PATH=$PATH:${BUILD_DIR} 
biolink_base_url_no_version=https://raw.githubusercontent.com/biolink/biolink-model/ biolink_raw_base_url=${biolink_base_url_no_version}v${biolink_model_version}/ -biolink_download_url=${biolink_raw_base_url}/project/owl/biolink_model.owl.ttl -curies_urls_map_replace_string="\ biolink_download_source: ${biolink_download_url}" -ont_load_inventory_replace_string="\ url: ${biolink_download_url}" -biolink_url_context_jsonld=${biolink_raw_base_url}context.jsonld -biolink_model_owl=biolink_model.owl.ttl -biolink_model_owl_local_file=${BUILD_DIR}/${biolink_model_owl} -biolink_model_owl_url=${biolink_raw_base_url}project/owl/${biolink_model_owl} biolink_model_yaml=biolink_model.yaml biolink_model_yaml_url=${biolink_raw_base_url}src/biolink_model/schema/${biolink_model_yaml} biolink_model_yaml_local_file=${BUILD_DIR}/${biolink_model_yaml} @@ -42,13 +35,6 @@ cat ${config_dir}/master-config.shinc echo ${VALIDATE_CODE_DIR} echo ${curies_to_urls_file} -sed -i "\@${biolink_base_url_no_version}@c${curies_urls_map_replace_string}" \ - ${curies_to_urls_file} - -sed -i "\@${biolink_base_url_no_version}@c${ont_load_inventory_replace_string}" \ - ${ont_load_inventory_file} - -rm -f ${biolink_model_owl_local_file} rm -f ${biolink_model_yaml_local_file} cd ${BUILD_DIR} From 9a3c22f459dd877d4025a259e32d451e3c71c5d3 Mon Sep 17 00:00:00 2001 From: ecwood Date: Mon, 2 Sep 2024 00:06:03 -0700 Subject: [PATCH 083/125] #387 #390 --- setup/requirements-kg2-build.txt | 2 -- setup/setup-kg2-build.sh | 12 ------------ 2 files changed, 14 deletions(-) diff --git a/setup/requirements-kg2-build.txt b/setup/requirements-kg2-build.txt index c40910aa..9823d186 100644 --- a/setup/requirements-kg2-build.txt +++ b/setup/requirements-kg2-build.txt @@ -5,8 +5,6 @@ HTMLParser==0.0.2 isodate==0.6.0 jsonlines==3.0.0 jsonpickle==1.0.0 -ontobio==2.8.0 -prefixcommons==0.1.9 pymongo==3.8.0 PyMySQL==0.9.3 python-dateutil==2.8.1 diff --git a/setup/setup-kg2-build.sh b/setup/setup-kg2-build.sh index 126eff65..f119fc52 100755 --- a/setup/setup-kg2-build.sh +++ b/setup/setup-kg2-build.sh @@ -97,18 +97,6 @@ fi # we want python3.7 (also need python3.7-dev or else pip cannot install the python package "mysqlclient") source ${SETUP_CODE_DIR}/setup-python37-with-pip3-in-ubuntu.shinc ${VENV_DIR}/bin/pip3 install -r ${SETUP_CODE_DIR}/requirements-kg2-build.txt - -## install ROBOT (software: ROBOT is an OBO Tool) by downloading the jar file -## distribution and cURLing the startup script (note github uses URL redirection -## so we need the "-L" command-line option, and cURL doesn't like JAR files by -## default so we need the "application/zip") -${curl_get} -H "Accept: application/zip" https://github.com/RTXteam/robot/releases/download/v1.3.0/robot.jar > ${BUILD_DIR}/robot.jar -curl -s https://raw.githubusercontent.com/RTXteam/robot/v1.3.0/bin/robot > ${BUILD_DIR}/robot -chmod +x ${BUILD_DIR}/robot - -## setup owltools -${curl_get} ${BUILD_DIR} https://github.com/RTXteam/owltools/releases/download/v0.3.0/owltools > ${BUILD_DIR}/owltools -chmod +x ${BUILD_DIR}/owltools } function setup_kg2_build_part2 () { From e94e431f063c3657c4c71b1bc7f7dd5cedd8dbbd Mon Sep 17 00:00:00 2001 From: ecwood Date: Mon, 2 Sep 2024 00:18:59 -0700 Subject: [PATCH 084/125] #387 ordo actually included in ETL --- ontologies_jsonl_to_kg_jsonl.py | 249 ++++++++++++++++---------------- 1 file changed, 127 insertions(+), 122 deletions(-) diff --git a/ontologies_jsonl_to_kg_jsonl.py b/ontologies_jsonl_to_kg_jsonl.py index c04b4b70..3390c2ec 100644 --- 
a/ontologies_jsonl_to_kg_jsonl.py +++ b/ontologies_jsonl_to_kg_jsonl.py @@ -3,14 +3,8 @@ import json import datetime -OWL_CLASS_TAG = "owl:Class" -SUBCLASS_TAG = "rdfs:subClassOf" -DESCRIPTION_TAG = "obo:IAO_0000115" -XREF_TAG = "oboInOwl:hasDbXref" ID_TAG = "rdf:about" NAME_TAG = "rdfs:label" -EXACT_MATCH_TAG = "skos:exactMatch" -COMMENT_TAG = "rdfs:comment" TEXT_KEY = "ENTRY_TEXT" RESOURCE_KEY = "rdf:resource" @@ -202,127 +196,138 @@ def process_ontology_term(ontology_node, source, ontology_name, owl_source=True) SOURCE_INFO[source] = {SOURCE_KEY: source_id, IRI_KEY: ontology_iri, NAME_KEY: ontology_name, UPDATE_DATE_KEY: ontology_date, VERSION_KEY: ontology_version} +def process_ontology_class(owl_class, source, ontology_name, owl_source=True): + owl_prefix = "" + if owl_source: + owl_prefix = "owl:" + # Typically genid classes which don't neatly map onto the KG2 schema + if ID_TAG not in owl_class: + return + node_id = match_prefix(owl_class.get(ID_TAG, str())) + if node_id is None: + return + node_prefix = node_id.split(':')[0] + node_iri = PREFIX_TO_IRI_MAP[node_prefix] + node_id.replace(node_prefix + ':', '') + + # Configure the name + name_list = [name.get(TEXT_KEY, None) for name in owl_class.get("rdfs:label", dict()) if TEXT_KEY in name] + if len(name_list) == 0: + return + + # Configure the description + description_list = list() + description_list += [description.get(TEXT_KEY, None) for description in owl_class.get("obo:IAO_0000115", list()) if (TEXT_KEY in description)] + description_list += [COMMENT_PREFIX + description.get(TEXT_KEY, str()) for description in owl_class.get("rdfs:comment", list()) if (TEXT_KEY in description)] + description_list += [description.get(TEXT_KEY, None) for description in owl_class.get("obo:UBPROP_0000001", list()) if (TEXT_KEY in description)] + description_list += [description.get(TEXT_KEY, None) for description in owl_class.get("obo:UBPROP_0000005", list()) if (TEXT_KEY in description)] + description_list += [description.get(TEXT_KEY, None) for description in owl_class.get("efo1:source_description", list()) if (TEXT_KEY in description)] + + deprecated = "true" in owl_class.get(owl_prefix + "deprecated", list()) + for name in name_list: + search_name = name.lower() + if search_name.startswith("obsolete") or search_name.startswith("(obsolete") or search_name.endswith("obsolete"): + deprecated = True + + # Configure the synonyms + synonym_list = list() + synonym_keys = ["oboInOwl:hasExactSynonym", "oboInOwl:hasRelatedSynonym", "oboInOwl:hasNarrowSynonym", "oboInOwl:hasBroadSynonym", "go:hasExactSynonym", + "go:hasSynonym", "go:hasNarrowSynonym", "go:hasBroadSynonym", "obo:IAO_0000118", "obo:IAO_0000589", "go:hasRelatedSynonym", "obo:IAO_0000111", + "obo:IAO_0000028", "skos:prefLabel"] + synonym_list += [synonym.get(TEXT_KEY, None) for synonym_key in synonym_keys for synonym in owl_class.get(synonym_key, list()) if (TEXT_KEY in synonym)] + + update_date_list = list() + update_date_keys = ["dc:date", "dcterms:date", "terms:date"] + update_date_list += [reformat_obo_date(update_date.get(TEXT_KEY, None)) for update_date_key in update_date_keys for update_date in owl_class.get(update_date_key, list()) if (TEXT_KEY in update_date)] + + creation_date_list = list() + creation_date_keys = ["oboInOwl:creation_date", "go:creation_date"] + creation_date_list += [reformat_obo_date(creation_date.get(TEXT_KEY, None)) for creation_date_key in creation_date_keys for creation_date in owl_class.get(creation_date_key, list()) if (TEXT_KEY in creation_date)] + + # 
Configure the biological sequence + has_biological_sequence = dict() + has_biological_sequence['formula'] = [biological_sequence.get(TEXT_KEY, None) for biological_sequence in owl_class.get("chebi:formula", list()) if TEXT_KEY in biological_sequence] + has_biological_sequence['smiles'] = [biological_sequence.get(TEXT_KEY, None) for biological_sequence in owl_class.get("chebi:smiles", list()) if TEXT_KEY in biological_sequence] + has_biological_sequence['inchi'] = [biological_sequence.get(TEXT_KEY, None) for biological_sequence in owl_class.get("chebi:inchi", list()) if TEXT_KEY in biological_sequence] + has_biological_sequence['inchikey'] = [biological_sequence.get(TEXT_KEY, None) for biological_sequence in owl_class.get("chebi:inchikey", list()) if TEXT_KEY in biological_sequence] + + # Extract edge triples + edges_list = list() + + for edge_type in BASE_EDGE_TYPES: + for edge in owl_class.get(edge_type, list()): + if BASE_EDGE_TYPES[edge_type] in edge: + edges_list.append((edge_type, edge.get(BASE_EDGE_TYPES[edge_type], None))) + + + restriction_edges = list() + restriction_edges += [(edge, "rdfs:subClassOf") for edge in owl_class.get("rdfs:subClassOf", list())] + for equiv in owl_class.get(owl_prefix + "equivalentClass", list()): + for mini_class in equiv.get(owl_prefix + "Class", list()): + for edge in mini_class.get(owl_prefix + "intersectionOf", list()): + restriction_edges.append((edge, owl_prefix + "equivalentClass")) + + for (edge, general_edge_type) in restriction_edges: + for restriction in edge.get(owl_prefix + "Restriction", list()): + edge_type = restriction.get(owl_prefix + "onProperty", list()) + edge_object = restriction.get(owl_prefix + "someValuesFrom", list()) + if len(edge_type) != 1: + assert len(edge_type) <= 1, edge + continue + if len(edge_object) != 1: + assert len(edge_object) <= 1, edge + continue + edge_type = edge_type[0].get(RESOURCE_KEY, None) + edge_object = edge_object[0].get(RESOURCE_KEY, None) + + if edge_type != None and edge_object != None: + edges_list.append((edge_type, edge_object)) + + if RESOURCE_KEY in edge: + edges_list.append((general_edge_type, edge.get(RESOURCE_KEY, None))) + + superclasses = set() + final_edges_list = list() + for (edge_relation, edge_object) in edges_list: + edge_object = match_prefix(edge_object) + if edge_object is None: + continue + edge_relation = match_prefix(edge_relation) + if edge_relation is None: + continue + if edge_relation in ["rdfs:subClassOf"]: + superclasses.add(edge_object) + final_edges_list.append((edge_relation, edge_object)) + + # Imperfect way to make it deterministic + superclasses = sorted(list(superclasses)) + if node_id not in CLASS_TO_SUPERCLASSES: + CLASS_TO_SUPERCLASSES[node_id] = list() + CLASS_TO_SUPERCLASSES[node_id] += superclasses + CLASS_TO_SUPERCLASSES[node_id] = sorted(list(set(CLASS_TO_SUPERCLASSES[node_id]))) + + if node_id not in SAVED_NODE_INFO: + SAVED_NODE_INFO[node_id] = list() + SAVED_NODE_INFO[node_id].append({ID_KEY: node_id, + DEPRECATED_KEY: deprecated, + UPDATE_DATE_KEY: update_date_list, + CREATION_DATE_KEY: creation_date_list, + SYNONYM_KEY: synonym_list, + DESCRIPTION_KEY: description_list, + NAME_KEY: name_list, + SOURCE_KEY: source, + BIOLOGICAL_SEQUENCE_KEY: has_biological_sequence, + IRI_KEY: node_iri, + EDGES_KEY: final_edges_list}) + def process_ontology_item(ontology_item): source = ontology_item.get(OWL_SOURCE_KEY, str()) ontology_name = ontology_item.get(OWL_SOURCE_NAME_KEY, str()) - for owl_class in ontology_item.get(OWL_CLASS_TAG, list()): - # Typically genid 
classes which don't neatly map onto the KG2 schema - if ID_TAG not in owl_class: - continue - node_id = match_prefix(owl_class.get(ID_TAG, str())) - if node_id is None: - continue - node_prefix = node_id.split(':')[0] - node_iri = PREFIX_TO_IRI_MAP[node_prefix] + node_id.replace(node_prefix + ':', '') - # Configure the name - name_list = [name.get(TEXT_KEY, None) for name in owl_class.get("rdfs:label", dict()) if TEXT_KEY in name] - if len(name_list) == 0: - continue + for owl_class in ontology_item.get("owl:Class", list()): + process_ontology_class(owl_class, source, ontology_name) - # Configure the description - description_list = list() - description_list += [description.get(TEXT_KEY, None) for description in owl_class.get("obo:IAO_0000115", list()) if (TEXT_KEY in description)] - description_list += [COMMENT_PREFIX + description.get(TEXT_KEY, str()) for description in owl_class.get("rdfs:comment", list()) if (TEXT_KEY in description)] - description_list += [description.get(TEXT_KEY, None) for description in owl_class.get("obo:UBPROP_0000001", list()) if (TEXT_KEY in description)] - description_list += [description.get(TEXT_KEY, None) for description in owl_class.get("obo:UBPROP_0000005", list()) if (TEXT_KEY in description)] - description_list += [description.get(TEXT_KEY, None) for description in owl_class.get("efo1:source_description", list()) if (TEXT_KEY in description)] - - deprecated = "true" in owl_class.get("owl:deprecated", list()) - for name in name_list: - if name.startswith("obsolete") or name.startswith("(obsolete") or name.endswith("obsolete"): - deprecated = True - - # Configure the synonyms - synonym_list = list() - synonym_keys = ["oboInOwl:hasExactSynonym", "oboInOwl:hasRelatedSynonym", "oboInOwl:hasNarrowSynonym", "oboInOwl:hasBroadSynonym", "go:hasExactSynonym", - "go:hasSynonym", "go:hasNarrowSynonym", "go:hasBroadSynonym", "obo:IAO_0000118", "obo:IAO_0000589", "go:hasRelatedSynonym", "obo:IAO_0000111", - "obo:IAO_0000028", "skos:prefLabel"] - synonym_list += [synonym.get(TEXT_KEY, None) for synonym_key in synonym_keys for synonym in owl_class.get(synonym_key, list()) if (TEXT_KEY in synonym)] - - update_date_list = list() - update_date_keys = ["dc:date", "dcterms:date", "terms:date"] - update_date_list += [reformat_obo_date(update_date.get(TEXT_KEY, None)) for update_date_key in update_date_keys for update_date in owl_class.get(update_date_key, list()) if (TEXT_KEY in update_date)] - - creation_date_list = list() - creation_date_keys = ["oboInOwl:creation_date", "go:creation_date"] - creation_date_list += [reformat_obo_date(creation_date.get(TEXT_KEY, None)) for creation_date_key in creation_date_keys for creation_date in owl_class.get(creation_date_key, list()) if (TEXT_KEY in creation_date)] - - # Configure the biological sequence - has_biological_sequence = dict() - has_biological_sequence['formula'] = [biological_sequence.get(TEXT_KEY, None) for biological_sequence in owl_class.get("chebi:formula", list()) if TEXT_KEY in biological_sequence] - has_biological_sequence['smiles'] = [biological_sequence.get(TEXT_KEY, None) for biological_sequence in owl_class.get("chebi:smiles", list()) if TEXT_KEY in biological_sequence] - has_biological_sequence['inchi'] = [biological_sequence.get(TEXT_KEY, None) for biological_sequence in owl_class.get("chebi:inchi", list()) if TEXT_KEY in biological_sequence] - has_biological_sequence['inchikey'] = [biological_sequence.get(TEXT_KEY, None) for biological_sequence in owl_class.get("chebi:inchikey", list()) if TEXT_KEY in 
biological_sequence] - - # Extract edge triples - edges_list = list() - - for edge_type in BASE_EDGE_TYPES: - for edge in owl_class.get(edge_type, list()): - if BASE_EDGE_TYPES[edge_type] in edge: - edges_list.append((edge_type, edge.get(BASE_EDGE_TYPES[edge_type], None))) - - - restriction_edges = list() - restriction_edges += [(edge, "rdfs:subClassOf") for edge in owl_class.get("rdfs:subClassOf", list())] - for equiv in owl_class.get("owl:equivalentClass", list()): - for mini_class in equiv.get("owl:Class", list()): - for edge in mini_class.get("owl:intersectionOf", list()): - restriction_edges.append((edge, "owl:equivalentClass")) - - for (edge, general_edge_type) in restriction_edges: - for restriction in edge.get("owl:Restriction", list()): - edge_type = restriction.get("owl:onProperty", list()) - edge_object = restriction.get("owl:someValuesFrom", list()) - if len(edge_type) != 1: - assert len(edge_type) <= 1, edge - continue - if len(edge_object) != 1: - assert len(edge_object) <= 1, edge - continue - edge_type = edge_type[0].get(RESOURCE_KEY, None) - edge_object = edge_object[0].get(RESOURCE_KEY, None) - - if edge_type != None and edge_object != None: - edges_list.append((edge_type, edge_object)) - - if RESOURCE_KEY in edge: - edges_list.append((general_edge_type, edge.get(RESOURCE_KEY, None))) - - superclasses = set() - final_edges_list = list() - for (edge_relation, edge_object) in edges_list: - edge_object = match_prefix(edge_object) - if edge_object is None: - continue - edge_relation = match_prefix(edge_relation) - if edge_relation is None: - continue - if edge_relation in ["rdfs:subClassOf"]: - superclasses.add(edge_object) - final_edges_list.append((edge_relation, edge_object)) - - # Imperfect way to make it deterministic - superclasses = sorted(list(superclasses)) - if node_id not in CLASS_TO_SUPERCLASSES: - CLASS_TO_SUPERCLASSES[node_id] = list() - CLASS_TO_SUPERCLASSES[node_id] += superclasses - CLASS_TO_SUPERCLASSES[node_id] = sorted(list(set(CLASS_TO_SUPERCLASSES[node_id]))) - - if node_id not in SAVED_NODE_INFO: - SAVED_NODE_INFO[node_id] = list() - SAVED_NODE_INFO[node_id].append({ID_KEY: node_id, - DEPRECATED_KEY: deprecated, - UPDATE_DATE_KEY: update_date_list, - CREATION_DATE_KEY: creation_date_list, - SYNONYM_KEY: synonym_list, - DESCRIPTION_KEY: description_list, - NAME_KEY: name_list, - SOURCE_KEY: source, - BIOLOGICAL_SEQUENCE_KEY: has_biological_sequence, - IRI_KEY: node_iri, - EDGES_KEY: final_edges_list}) + for owl_class in ontology_item.get("Class", list()): + process_ontology_class(owl_class, source, ontology_name, False) for ontology_node in ontology_item.get("owl:Ontology", list()): process_ontology_term(ontology_node, source, ontology_name) From c8c63de5e28dbd3d4b51b967e8da77b70749b57c Mon Sep 17 00:00:00 2001 From: ecwood Date: Mon, 2 Sep 2024 00:24:38 -0700 Subject: [PATCH 085/125] #387 moving to its permanent home --- .../ontologies_jsonl_to_kg_jsonl.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename ontologies_jsonl_to_kg_jsonl.py => convert/ontologies_jsonl_to_kg_jsonl.py (100%) diff --git a/ontologies_jsonl_to_kg_jsonl.py b/convert/ontologies_jsonl_to_kg_jsonl.py similarity index 100% rename from ontologies_jsonl_to_kg_jsonl.py rename to convert/ontologies_jsonl_to_kg_jsonl.py From 2ec98bdf658c0b50f8948931772e4d9a958cd5df Mon Sep 17 00:00:00 2001 From: ecwood Date: Mon, 2 Sep 2024 00:25:44 -0700 Subject: [PATCH 086/125] #387 moving owlparser to its permanent home --- owlparser.py => extract/owlparser.py | 0 1 file changed, 0 
insertions(+), 0 deletions(-) rename owlparser.py => extract/owlparser.py (100%) diff --git a/owlparser.py b/extract/owlparser.py similarity index 100% rename from owlparser.py rename to extract/owlparser.py From 82e6acb2810ff0c22c0e09fc1fbdf2aa76c45bda Mon Sep 17 00:00:00 2001 From: ecwood Date: Mon, 2 Sep 2024 00:46:08 -0700 Subject: [PATCH 087/125] #387 don't need this one anymore --- maps/ont-load-inventory-test.yaml | 289 ------------------------------ 1 file changed, 289 deletions(-) delete mode 100644 maps/ont-load-inventory-test.yaml diff --git a/maps/ont-load-inventory-test.yaml b/maps/ont-load-inventory-test.yaml deleted file mode 100644 index f79d6587..00000000 --- a/maps/ont-load-inventory-test.yaml +++ /dev/null @@ -1,289 +0,0 @@ -- # maps to CURIE prefix: biolink - url: https://raw.githubusercontent.com/biolink/biolink-model/master/biolink-model.owl.ttl - file: biolink-model.owl.ttl - download: true - title: Biolink meta-model -# - # maps to CURIE prefix: UMLSSC (the trailling slash here is important:) -# url: http://purl.bioontology.org/ontology/STY/ -# file: umls-semantictypes.ttl -# download: false -# title: UMLS Semantic Types -# - # maps to CURIE prefix: ATC -# download: false -# file: umls-atc.ttl -# title: Anatomical Therapeutic Chemical Classification System -# url: https://www.nlm.nih.gov/research/umls/sourcereleasedocs/current/ATC -# - # maps to CURIE prefix CHV -# download: false -# file: umls-chv.ttl -# title: Consumer Health Vocabulary -# url: https://www.nlm.nih.gov/research/umls/sourcereleasedocs/current/CHV -# - # maps to CURIE prefix CPT -# download: false -# file: umls-cpt.ttl -# title: Current Procedural Terminology -# url: https://www.nlm.nih.gov/research/umls/sourcereleasedocs/current/CPT -# - # maps to CURIE prefix DRUGBANK -# download: false -# file: umls-drugbank.ttl -# title: DrugBank -# url: https://www.nlm.nih.gov/research/umls/sourcereleasedocs/current/DRUGBANK -# - # maps to CURIE prefix FMA -# download: false -# file: umls-fma.ttl -# title: Foundational Model of Anatomy -# url: https://www.nlm.nih.gov/research/umls/sourcereleasedocs/current/FMA -# - # maps to CURIE prefix GO -# download: false -# file: umls-go.ttl -# title: Gene Ontology -# url: https://www.nlm.nih.gov/research/umls/sourcereleasedocs/current/GO -# - # maps to CURIE prefix HCPCS -# download: false -# file: umls-hcpcs.ttl -# title: Healthcare Common Procedure Coding System -# url: https://www.nlm.nih.gov/research/umls/sourcereleasedocs/current/HCPCS -# - # maps to CURIE prefix CPT -# download: false -# file: umls-hcpt.ttl -# title: CPT in HCPCS -# url: https://www.nlm.nih.gov/research/umls/sourcereleasedocs/current/HCPT -# - # maps to CURIE prefix HGNC -# download: false -# file: umls-hgnc.ttl -# title: HUGO Gene Nomenclature Committee -# url: https://www.nlm.nih.gov/research/umls/sourcereleasedocs/current/HGNC -# - # maps to CURIE prefix umls -# download: false -# file: umls-hl7.ttl -# title: HL7 Version 3.0 -# url: https://www.nlm.nih.gov/research/umls/sourcereleasedocs/current/HL7 -# - # maps to CURIE prefix HP -# download: false -# file: umls-hpo.ttl -# title: Human Phenotype Ontology -# url: https://www.nlm.nih.gov/research/umls/sourcereleasedocs/current/HPO -# - # maps to CURIE prefix ICD10 -# download: false -# file: umls-icd10.ttl -# title: International Classification of Diseases and Related Health Problems, -# url: https://www.nlm.nih.gov/research/umls/sourcereleasedocs/current/ICD10 -# - # maps to CURIE prefix ICD10 -# download: false -# file: umls-icd10ae.ttl -# title: 
ICD-10, American English Equivalents -# url: https://www.nlm.nih.gov/research/umls/sourcereleasedocs/current/ICD10AE -# - # maps to CURIE prefix ICD10 -# download: false -# file: umls-icd10cm.ttl -# title: International Classification of Diseases, Tenth Revision, Clinical Modification -# url: https://www.nlm.nih.gov/research/umls/sourcereleasedocs/current/ICD10CM -# - # maps to CURIE prefix ICD10PCS -# download: false -# file: umls-icd10pcs.ttl -# title: ICD-10 Procedure Coding System -# url: https://www.nlm.nih.gov/research/umls/sourcereleasedocs/current/ICD10PCS -# - # maps to CURIE prefix ICD9 -# download: false -# file: umls-icd9cm.ttl -# title: International Classification of Diseases, Ninth Revision, Clinical Modification -# url: https://www.nlm.nih.gov/research/umls/sourcereleasedocs/current/ICD9CM -# - # maps to CURIE prefix LOINC -# download: false -# file: umls-lnc.ttl -# title: Logical Observation Identifiers Names and Codes -# url: https://www.nlm.nih.gov/research/umls/sourcereleasedocs/current/LNC -# - # maps to CURIE prefix MEDDRA -# download: false -# file: umls-mdr.ttl -# title: MedDRA -# url: https://www.nlm.nih.gov/research/umls/sourcereleasedocs/current/MEDDRA -# - # maps to CURIE prefix umls -# download: false -# file: umls-med-rt.ttl -# title: Medication Reference Terminology -# url: https://www.nlm.nih.gov/research/umls/sourcereleasedocs/current/MED-RT -# - # maps to CURIE prefix umls -# download: false -# file: umls-medlineplus.ttl -# title: MedlinePlus Health Topics -# url: https://www.nlm.nih.gov/research/umls/sourcereleasedocs/current/MEDLINEPLUS -# - # maps to CURIE prefix MESH -# download: false -# file: umls-msh.ttl -# title: Medical Subject Headings -# url: https://www.nlm.nih.gov/research/umls/sourcereleasedocs/current/MSH -# - # maps to CURIE prefix umls -# download: false - # file: umls-mth.ttl - # title: Metathesaurus Names - # url: https://www.nlm.nih.gov/research/umls/sourcereleasedocs/current/MTH -# - # maps to CURIE prefix NCBITaxon -# download: false -# file: umls-ncbi.ttl -# title: NCBI -# url: https://www.nlm.nih.gov/research/umls/sourcereleasedocs/current/NCBITAXON -# - # maps to CURIE prefix NCIT -# download: false -# file: umls-nci.ttl -# title: NCI Thesaurus -# url: https://www.nlm.nih.gov/research/umls/sourcereleasedocs/current/NCI -# - # maps to CURIE prefix NDDF -# download: false -# file: umls-nddf.ttl -# title: National Drug Data File -# url: https://www.nlm.nih.gov/research/umls/sourcereleasedocs/current/NDDF -#- # maps to CURIE prefix NDFRT -# download: false -# file: umls-ndfrt.ttl -# title: National Drug File - Reference Terminology -# url: https://www.nlm.nih.gov/research/umls/sourcereleasedocs/current/NDFRT -# - # maps to CURIE prefix OMIM -# download: false -# file: umls-omim.ttl -# title: Online Mendelian Inheritance in Man -# url: https://www.nlm.nih.gov/research/umls/sourcereleasedocs/current/OMIM -# - # maps to CURIE prefix PDQ -# download: false -# file: umls-pdq.ttl -# title: Physician Data Query -# url: https://www.nlm.nih.gov/research/umls/sourcereleasedocs/current/PDQ -# - # maps to CURIE prefix RXNORM -# download: false -# file: umls-rxnorm.ttl -# title: RXNORM -# url: https://www.nlm.nih.gov/research/umls/sourcereleasedocs/current/RXNORM -# - # maps to CURIE prefix SNOMED -# download: false -# file: umls-snomedct_us.ttl -# title: SNOMED Clinical Terms US Edition -# url: https://www.nlm.nih.gov/research/umls/sourcereleasedocs/current/SNOMEDCT -# # ==> unable to find an online set of pages for SNOMEDCT_VET concepts but I 
want to find one so that -# # I can include SNOMEDCT_VET in the kg2 build, thus am keeping this section commented out [SAR]: -# # - -# # download: false -# # file: umls-snomedct_vet.ttl -# # title: Veterinary Extension to SNOMED CT -# # url: https://www.nlm.nih.gov/research/umls/sourcereleasedocs/current/SNOMEDCT_VET -# # ==> this section (UMLS Source Terminology Names) seems like it could be useful in the future, but -# # I can't find purls to its concepts anywhere: -# # - -# # download: false -# # file: umls-src.ttl -# # title: Source Terminology Names (UMLS) -# # url: https://www.nlm.nih.gov/research/umls/sourcereleasedocs/current/SRC -# - # maps to CURIE prefix VANDF -# download: false -# file: umls-vandf.ttl -# title: National Drug File -# url: https://www.nlm.nih.gov/research/umls/sourcereleasedocs/current/VANDF -# - # maps to CURIE prefix: BFO -# url: http://purl.obolibrary.org/obo/bfo.owl -# file: bfo.owl -# download: true -# title: Basic Formal Ontology -#- maps to CURIE prefix: GO - # url: http://purl.obolibrary.org/obo/go/extensions/go-plus.owl - # file: go-plus.owl - # title: Gene Ontology - # download: true -# - # maps to CURIE prefix: RO -# url: http://purl.obolibrary.org/obo/ro.owl -# file: ro.owl -# download: true -# title: Relation Ontology -# - -# url: http://purl.obolibrary.org/obo/uberon/ext.owl -# file: uberon-ext.owl -# download: true -# title: Uber-anatomy Ontology -# - -# url: http://www.ebi.ac.uk/efo/efo.owl -# file: efo.owl -# download: true -# title: Experimental Factor Ontology -# - -# url: http://purl.obolibrary.org/obo/fma.owl -# file: fma.owl -# download: true -# title: Foundational Model of Anatomy -# - -# url: http://purl.obolibrary.org/obo/ddanat.owl -# file: ddanat.owl -# download: true -# title: Dictyostelium discoideum anatomy -- - url: http://purl.obolibrary.org/obo/cl.owl - file: cl.owl - download: true - title: Cell Ontology -# - -# url: http://purl.obolibrary.org/obo/chebi.owl -# file: chebi.owl -# download: true -# title: Chemical Entities of Biological Interest -# - - # url: http://purl.obolibrary.org/obo/foodon.owl - # file: foodon.owl - # download: false - # title: FOODON (Food Ontology) -# - -# url: http://data.bioontology.org/ontologies/ORDO/submissions/15/download?apikey=8b5b7825-538d-40e0-9e9e-5ab9274a9aeb -# file: ordo.owl -# download: true -# title: ORPHANET Rare Disease Ontology -# - -# url: http://purl.obolibrary.org/obo/ehdaa2.owl -# file: ehdaa2.owl -# download: true -# title: Human developmental anatomy, abstract -# - -# url: http://purl.obolibrary.org/obo/bspo.owl -# file: bspo.owl -# download: true -# title: Biological Spatial Ontology -# - -# url: http://purl.obolibrary.org/obo/hp.owl -# file: hp.owl -# download: true -# title: Human Phenotype Ontology -# - -# url: http://purl.obolibrary.org/obo/nbo.owl -# file: nbo.owl -# download: true -# title: Neuro Behavior Ontology -# - -# url: http://purl.obolibrary.org/obo/ncbitaxon/subsets/taxslim.owl -# file: taxslim.owl -# download: true -# title: NCBITaxon -# - -# url: http://purl.obolibrary.org/obo/pato.owl -# file: pato.owl -# download: true -# title: Phenotypic Quality Ontology -# - # maps to CURIE prefix MONDO -# url: http://purl.obolibrary.org/obo/mondo.owl -# file: mondo.owl -# download: true -# title: MONDO Disease Ontology -# - -# url: http://purl.obolibrary.org/obo/doid.owl -# file: doid.owl -# download: true -# title: Disease Ontology -# - -# url: http://purl.obolibrary.org/obo/pr.owl -# file: pr.owl -# download: true -# title: Protein Ontology -# - -# url: 
http://purl.obolibrary.org/obo/ino.owl -# file: ino.owl -# download: true -# title: Interaction Network Ontology -# - # maps to CURIE prefix GENEPIO -# url: http://purl.obolibrary.org/obo/genepio.owl -# file: genepio.owl -# download: true -# title: Genomic Epidemiology Ontology From 192039c496df3ce7e18f9b6f583a7039dac46799 Mon Sep 17 00:00:00 2001 From: ecwood Date: Mon, 2 Sep 2024 01:08:42 -0700 Subject: [PATCH 088/125] #387 #405 rethreading the pipeline for new ETL --- build/Snakefile-conversion | 13 ++++++++----- build/Snakefile-extraction | 13 +++++++++++++ build/Snakefile-post-etl | 8 ++++---- build/snakemake-config-var.yaml | 20 ++++++++++++++------ convert/ontologies_jsonl_to_kg_jsonl.py | 8 +++++--- master-config.shinc | 3 --- 6 files changed, 44 insertions(+), 21 deletions(-) diff --git a/build/Snakefile-conversion b/build/Snakefile-conversion index 6754be04..45db4eeb 100644 --- a/build/Snakefile-conversion +++ b/build/Snakefile-conversion @@ -16,15 +16,18 @@ rule UMLS_Conversion: rule Ontologies_Conversion: input: - code = config['ONT_CONVERSION_SCRIPT'], + code = config['ONTOLOGIES_CONVERSION_SCRIPT'], + real = config['ONTOLOGIES_EXTRACT_FILE'], + curies_to_categories_map = config['CURIES_TO_CATEGORIES_MAP'] + curies_to_urls_map = config['CURIES_TO_URLS_FILE'], validation = config['VALIDATION_PLACEHOLDER'] output: - nodes = config['ONT_OUTPUT_NODES_FILE'], - edges = config['ONT_OUTPUT_EDGES_FILE'] + nodes = config['ONTOLOGIES_OUTPUT_NODES_FILE'], + edges = config['ONTOLOGIES_OUTPUT_EDGES_FILE'] log: - config['ONT_CONVERSION_LOG'] + config['ONTOLOGIES_CONVERSION_LOG'] shell: - "bash -x {input.code} {output.nodes} {output.edges} " + config['TEST_FLAG'] + " > {log} 2>&1" + config['PYTHON_COMMAND'] + " {input.code} {input.real} {input.curies_to_categories_map} {input.curies_to_urls_map} {output.nodes} {output.edges} " + config['TEST_FLAG'] + " > {log} 2>&1" rule SemMedDB_Conversion: input: diff --git a/build/Snakefile-extraction b/build/Snakefile-extraction index c23d0ef0..e5c12890 100644 --- a/build/Snakefile-extraction +++ b/build/Snakefile-extraction @@ -9,6 +9,19 @@ rule UMLS: shell: "bash -x {input.code} {output} > {log} 2>&1" +rule Ontologies: + input: + code = config['ONTOLOGIES_EXTRACTION_SCRIPT'], + parser = config['ONTOLOGIES_EXTRACTION_PARSER'], + ontologies_load_inventory = config['ONTOLOGIES_LOAD_INVENTORY_FILE'], + validation = config['VALIDATION_PLACEHOLDER'] + output: + config['ONTOLOGIES_EXTRACT_FILE'] + log: + config['ONTOLOGIES_EXTRACTION_LOG'] + shell: + "bash -x {input.code} {input.parser} {input.ontologies_load_inventory} {output} > {log} 2>&1" + rule SemMedDB: input: code = config['SEMMEDDB_EXTRACTION_SCRIPT'], diff --git a/build/Snakefile-post-etl b/build/Snakefile-post-etl index eeb1a44d..dab60237 100644 --- a/build/Snakefile-post-etl +++ b/build/Snakefile-post-etl @@ -3,8 +3,8 @@ rule Merge: code = config['MERGE_SCRIPT'], umls_nodes = config['UMLS_OUTPUT_NODES_FILE'], umls_edges = config['UMLS_OUTPUT_EDGES_FILE'], - ont_nodes = config['ONT_OUTPUT_NODES_FILE'], - ont_edges = config['ONT_OUTPUT_EDGES_FILE'], + ontologies_nodes = config['ONTOLOGIES_OUTPUT_NODES_FILE'], + ontologies_edges = config['ONTOLOGIES_OUTPUT_EDGES_FILE'], uniprot_nodes = config['UNIPROTKB_OUTPUT_NODES_FILE'], uniprot_edges = config['UNIPROTKB_OUTPUT_EDGES_FILE'], semmeddb_nodes = config['SEMMEDDB_OUTPUT_NODES_FILE'], @@ -56,7 +56,7 @@ rule Merge: " --outputEdgesFile {output.edges} " + \ " --kgNodesFiles " + \ "{input.umls_nodes} " + \ - "{input.ont_nodes} " + \ + 
"{input.ontologies_nodes} " + \ "{input.semmeddb_nodes} " + \ "{input.uniprot_nodes} " + \ "{input.ensembl_nodes} " + \ @@ -78,7 +78,7 @@ rule Merge: "{input.clinicaltrialskg_nodes} " + \ " --kgEdgesFiles " + \ "{input.umls_edges} " + \ - "{input.ont_edges} " + \ + "{input.ontologies_edges} " + \ "{input.semmeddb_edges} " + \ "{input.uniprot_edges} " + \ "{input.ensembl_edges} " + \ diff --git a/build/snakemake-config-var.yaml b/build/snakemake-config-var.yaml index 209b3659..3f32aca4 100644 --- a/build/snakemake-config-var.yaml +++ b/build/snakemake-config-var.yaml @@ -13,6 +13,8 @@ umls_output_base: kg2-umls umls_extraction_script: ${EXTRACT_CODE_DIR}/${umls_extraction_base}.sh umls_extraction_log: ${BUILD_DIR}/${umls_extraction_base}${version_suffix}${test_suffix}.log umls_extract_file: ${BUILD_DIR}/umls.jsonl +umls_dir: ${BUILD_DIR}/umls +umls_dest_dir: ${umls_dir}/META umls_conversion_script: ${CONVERT_CODE_DIR}/${umls_conversion_base}.py umls_conversion_log: ${BUILD_DIR}/${umls_conversion_base}${version_suffix}${test_suffix}.log umls_name_heirarchy: ${MAPS_CODE_DIR}/umls-name-heirarchy.yaml @@ -20,12 +22,18 @@ umls_tui_map: ${MAPS_CODE_DIR}/tui_combo_mappings.json umls_output_nodes_file: ${BUILD_DIR}/${umls_output_base}${nodes_suffix}${test_suffix}.jsonl umls_output_edges_file: ${BUILD_DIR}/${umls_output_base}${edges_suffix}${test_suffix}.jsonl -ont_conversion_base: build-multi-ont-kg -ont_output_base: kg2-ont -ont_conversion_script: ${CONVERT_CODE_DIR}/${ont_conversion_base}.sh -ont_conversion_log: ${BUILD_DIR}/${ont_conversion_base}${version_suffix}${test_suffix}.log -ont_output_nodes_file: ${BUILD_DIR}/${ont_output_base}${nodes_suffix}${test_suffix}.jsonl -ont_output_edges_file: ${BUILD_DIR}/${ont_output_base}${edges_suffix}${test_suffix}.jsonl +ontologies_extraction_base: extract-ontologies +ontologies_conversion_base: ontologies_jsonl_to_kg_jsonl +ontologies_output_base: kg2-ontologies +ontologies_extraction_script: ${EXTRACT_CODE_DIR}/${ontologies_extraction_base}.sh +ontologies_extraction_parser: ${EXTRACT_CODE_DIR}/owlparser.py +ontologies_extraction_log: ${BUILD_DIR}/${ontologies_extraction_base}${version_suffix}${test_suffix}.log +ontologies_load_inventory_file: ${MAPS_CODE_DIR}/ont-load-inventory.yaml +ontologies_extract_file: ${BUILD_DIR}/ontologies.jsonl +ontologies_conversion_script: ${CONVERT_CODE_DIR}/${ont_conversion_base}.py +ontologies_conversion_log: ${BUILD_DIR}/${ont_conversion_base}${version_suffix}${test_suffix}.log +ontologies_output_nodes_file: ${BUILD_DIR}/${ontologies_output_base}${nodes_suffix}${test_suffix}.jsonl +ontologies_output_edges_file: ${BUILD_DIR}/${ontologies_output_base}${edges_suffix}${test_suffix}.jsonl semmeddb_extraction_base: extract-semmeddb semmeddb_conversion_base: semmeddb_tuplelist_json_to_kg_jsonl diff --git a/convert/ontologies_jsonl_to_kg_jsonl.py b/convert/ontologies_jsonl_to_kg_jsonl.py index 3390c2ec..4dfb9992 100644 --- a/convert/ontologies_jsonl_to_kg_jsonl.py +++ b/convert/ontologies_jsonl_to_kg_jsonl.py @@ -78,6 +78,7 @@ def get_args(): action="store_true", default=False) arg_parser.add_argument('inputFile', type=str) arg_parser.add_argument('curiesToCategoriesYAML', type=str) + arg_parser.add_argument('curiesToURLsYAML', type=str) arg_parser.add_argument('outputNodesFile', type=str) arg_parser.add_argument('outputEdgesFile', type=str) return arg_parser.parse_args() @@ -336,8 +337,8 @@ def process_ontology_item(ontology_item): for ontology_node in ontology_item.get("Ontology", list()): process_ontology_term(ontology_node, 
source, ontology_name, False)
-def generate_uri_map():
- uri_input_map = kg2_util.safe_load_yaml_from_string(kg2_util.read_file_to_string("maps/curies-to-urls-map.yaml"))
+def generate_uri_map(curies_to_urls_file_name):
+ uri_input_map = kg2_util.safe_load_yaml_from_string(kg2_util.read_file_to_string(curies_to_urls_file_name))
  bidirectional_map = uri_input_map['use_for_bidirectional_mapping']
  contraction_map = uri_input_map['use_for_contraction_only']
@@ -420,6 +421,7 @@ def construct_nodes_and_edges(nodes_output, edges_output):
  args = get_args()
  input_file_name = args.inputFile
  curies_to_categories_file_name = args.curiesToCategoriesYAML
+ curies_to_urls_file_name = args.curiesToURLsYAML
  output_nodes_file_name = args.outputNodesFile
  output_edges_file_name = args.outputEdgesFile
  test_mode = args.test
@@ -438,7 +440,7 @@ def construct_nodes_and_edges(nodes_output, edges_output):
  input_data = input_read_jsonlines_info[0]

  ontology_prefixes = set()
- generate_uri_map()
+ generate_uri_map(curies_to_urls_file_name)
  for ontology_item in input_data:
      process_ontology_item(ontology_item)
diff --git a/master-config.shinc b/master-config.shinc
index 015cf2f7..3e78c226 100644
--- a/master-config.shinc
+++ b/master-config.shinc
@@ -11,8 +11,6 @@ NEO4J_CODE_DIR=${CODE_DIR}/neo4j
 PROCESS_CODE_DIR=${CODE_DIR}/process
 SETUP_CODE_DIR=${CODE_DIR}/setup
 VALIDATE_CODE_DIR=${CODE_DIR}/validate
-umls_dir=${BUILD_DIR}/umls
-umls_dest_dir=${umls_dir}/META
 s3_region=us-west-2
 s3_bucket=rtx-kg2
 s3_bucket_public=rtx-kg2-public
@@ -26,7 +24,6 @@ curies_to_urls_file=${MAPS_CODE_DIR}/curies-to-urls-map.yaml
 predicate_mapping_file=${MAPS_CODE_DIR}/predicate-remap.yaml
 infores_mapping_file=${MAPS_CODE_DIR}/kg2-provided-by-curie-to-infores-curie.yaml
 knowledge_level_agent_type_mapping_file=${MAPS_CODE_DIR}/knowledge-level-agent-type-map.yaml
-ont_load_inventory_file=${MAPS_CODE_DIR}/ont-load-inventory${test_suffix}.yaml
 rtx_config_file=RTXConfiguration-config.json
 biolink_model_version=4.2.1
 infores_registry_version=0.2.8
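To make the handoff these two patches set up concrete: extract/owlparser.py flattens each OWL file into one JSON Lines record, and convert/ontologies_jsonl_to_kg_jsonl.py walks the "owl:Class" and "owl:Ontology" lists inside that record. A rough sketch of one record follows; the "owl_source"/"owl_source_name" key strings and all field values are illustrative stand-ins (the converter actually reads those two keys through OWL_SOURCE_KEY and OWL_SOURCE_NAME_KEY, whose literal values live in kg2_util), while "ENTRY_TEXT" is the verbatim KEY_TEXT string used by owlparser.py.

    # Hypothetical single record of ontologies.jsonl, shaped after the
    # accessors in process_ontology_item(); values are made up.
    example_record = {
        "owl_source": "cl",                  # stand-in for the OWL_SOURCE_KEY key string
        "owl_source_name": "Cell Ontology",  # stand-in for OWL_SOURCE_NAME_KEY
        "owl:Ontology": [{"rdf:about": "http://purl.obolibrary.org/obo/cl.owl"}],
        "owl:Class": [{
            "rdf:about": "http://purl.obolibrary.org/obo/CL_0000000",
            "rdfs:label": [{"ENTRY_TEXT": "cell"}],  # KEY_TEXT is verbatim
        }],
    }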
From d09ce05f3256d82e6c4823f77ae6e3297fb40489 Mon Sep 17 00:00:00 2001
From: ecwood
Date: Mon, 2 Sep 2024 01:09:20 -0700
Subject: [PATCH 089/125] #387 forgot to add the new extract

---
 extract/extract-ontologies.sh | 35 +++++++++++++++++++++++++++++++++++
 1 file changed, 35 insertions(+)
 create mode 100755 extract/extract-ontologies.sh

diff --git a/extract/extract-ontologies.sh b/extract/extract-ontologies.sh
new file mode 100755
index 00000000..3248cf4f
--- /dev/null
+++ b/extract/extract-ontologies.sh
@@ -0,0 +1,35 @@
+#!/usr/bin/env bash
+# extract-ontologies.sh: Download OWL files and convert them into a JSON Lines file
+# Copyright 2024 Stephen A. Ramsey
+# Author Erica Wood
+
+set -o nounset -o pipefail -o errexit
+
+if [[ "${1:-}" == "--help" || "${1:-}" == "-h" ]]; then
+    echo Usage: "$0 <parsing-script> <ontologies-load-inventory> <output-file> [ontologies-dir]"
+    exit 2
+fi
+
+# Usage: extract-ontologies.sh <parsing-script> <ontologies-load-inventory> <output-file> [ontologies-dir]
+
+echo "================= starting extract-ontologies.sh =================="
+date
+
+config_dir=`dirname "$0"`
+source ${config_dir}/master-config.shinc
+
+parsing_script=${1-"${EXTRACT_CODE_DIR}/owlparser.py"}
+ontologies_load_inventory=${2-"${MAPS_CODE_DIR}/ont-load-inventory.yaml"}
+output_file=${3-"${BUILD_DIR}/ontologies.jsonl"}
+ontologies_dir=${4-"${BUILD_DIR}/owl_files"}
+
+mkdir -p ${ontologies_dir}
+
+# Temporary adjustment for https://github.com/HUPO-PSI/psi-mi-CV/issues/456
+${s3_cp_cmd} s3://${s3_bucket}/mi.owl ${ontologies_dir}/mi.owl
+
+# Generate the ontologies.jsonl file
+${python_command} ${parsing_script} ${ontologies_load_inventory} ${ontologies_dir} ${output_file}
+
+date
+echo "================= finished extract-ontologies.sh =================="

From 76b996b9b2ffc9d73d592e44d131cc1e5e81af58 Mon Sep 17 00:00:00 2001
From: ecwood
Date: Mon, 2 Sep 2024 01:14:00 -0700
Subject: [PATCH 090/125] #387 adjusting some of the variables for new pipelining

---
 build/snakemake-config-var.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/build/snakemake-config-var.yaml b/build/snakemake-config-var.yaml
index 3f32aca4..2c5e1863 100644
--- a/build/snakemake-config-var.yaml
+++ b/build/snakemake-config-var.yaml
@@ -30,8 +30,8 @@
 ontologies_extraction_parser: ${EXTRACT_CODE_DIR}/owlparser.py
 ontologies_extraction_log: ${BUILD_DIR}/${ontologies_extraction_base}${version_suffix}${test_suffix}.log
 ontologies_load_inventory_file: ${MAPS_CODE_DIR}/ont-load-inventory.yaml
 ontologies_extract_file: ${BUILD_DIR}/ontologies.jsonl
-ontologies_conversion_script: ${CONVERT_CODE_DIR}/${ont_conversion_base}.py
-ontologies_conversion_log: ${BUILD_DIR}/${ont_conversion_base}${version_suffix}${test_suffix}.log
+ontologies_conversion_script: ${CONVERT_CODE_DIR}/${ontologies_conversion_base}.py
+ontologies_conversion_log: ${BUILD_DIR}/${ontologies_conversion_base}${version_suffix}${test_suffix}.log
 ontologies_output_nodes_file: ${BUILD_DIR}/${ontologies_output_base}${nodes_suffix}${test_suffix}.jsonl
 ontologies_output_edges_file: ${BUILD_DIR}/${ontologies_output_base}${edges_suffix}${test_suffix}.jsonl

From 168071ef49ba38512ca133b560c8a7a12da43f1a Mon Sep 17 00:00:00 2001
From: ecwood
Date: Mon, 2 Sep 2024 01:18:04 -0700
Subject: [PATCH 091/125] #387 adjusting for new pipelining syntax error

---
 build/Snakefile-conversion | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/build/Snakefile-conversion b/build/Snakefile-conversion
index 45db4eeb..d2e61a00 100644
--- a/build/Snakefile-conversion
+++ b/build/Snakefile-conversion
@@ -18,7 +18,7 @@ rule Ontologies_Conversion:
     input:
         code = config['ONTOLOGIES_CONVERSION_SCRIPT'],
         real = config['ONTOLOGIES_EXTRACT_FILE'],
-        curies_to_categories_map = config['CURIES_TO_CATEGORIES_MAP']
+        curies_to_categories_map = config['CURIES_TO_CATEGORIES_MAP'],
         curies_to_urls_map = config['CURIES_TO_URLS_FILE'],
         validation = config['VALIDATION_PLACEHOLDER']
     output:
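A note on the one-character fix in PATCH 091: Snakemake parses a rule's input: section like a Python argument list, so consecutive named inputs must be separated by commas, and the comma missing after the curies_to_categories_map entry was a hard parse error. A stripped-down sketch of the corrected shape, with illustrative literal paths standing in for the real config[...] lookups:

    rule Ontologies_Conversion_Example:
        input:
            code = "convert/ontologies_jsonl_to_kg_jsonl.py",
            real = "ontologies.jsonl",
            curies_to_categories_map = "curies-to-categories.yaml",  # the restored comma
            curies_to_urls_map = "curies-to-urls-map.yaml"
        output:
            nodes = "kg2-ontologies-nodes.jsonl",
            edges = "kg2-ontologies-edges.jsonl"
        shell:
            "python {input.code} {input.real} {input.curies_to_categories_map} "
            "{input.curies_to_urls_map} {output.nodes} {output.edges}"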
From 8baa76352a1ba0b98a6b98a8053af6ad28c0cc6a Mon Sep 17 00:00:00 2001
From: ecwood
Date: Mon, 2 Sep 2024 01:21:30 -0700
Subject: [PATCH 092/125] #387 adjusting for new pipelining naming error

---
 build/Snakefile-conversion | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/build/Snakefile-conversion b/build/Snakefile-conversion
index d2e61a00..d7f96c2e 100644
--- a/build/Snakefile-conversion
+++ b/build/Snakefile-conversion
@@ -18,7 +18,7 @@ rule Ontologies_Conversion:
     input:
         code = config['ONTOLOGIES_CONVERSION_SCRIPT'],
         real = config['ONTOLOGIES_EXTRACT_FILE'],
-        curies_to_categories_map = config['CURIES_TO_CATEGORIES_MAP'],
+        curies_to_categories_map = config['CURIES_TO_CATEGORIES_FILE'],
         curies_to_urls_map = config['CURIES_TO_URLS_FILE'],
         validation = config['VALIDATION_PLACEHOLDER']
     output:

From 58d1bddc4ce6bf4f1d88f0cdc0be8c6cf4a9d549 Mon Sep 17 00:00:00 2001
From: ecwood
Date: Mon, 2 Sep 2024 01:31:50 -0700
Subject: [PATCH 093/125] #387 cleaning up the formatting of the new files

---
 convert/ontologies_jsonl_to_kg_jsonl.py | 748 +++++++++----------
 extract/owlparser.py                    | 924 ++++++++++++------------
 2 files changed, 854 insertions(+), 818 deletions(-)

diff --git a/convert/ontologies_jsonl_to_kg_jsonl.py b/convert/ontologies_jsonl_to_kg_jsonl.py
index 4dfb9992..a27561f4 100644
--- a/convert/ontologies_jsonl_to_kg_jsonl.py
+++ b/convert/ontologies_jsonl_to_kg_jsonl.py
@@ -1,8 +1,25 @@
+#!/usr/bin/env python3
+''' ontologies_jsonl_to_kg_jsonl.py: Converts JSON Lines representation of ontologies into KG JSON Lines format
+
+    Usage: ontologies_jsonl_to_kg_jsonl.py [--test] <inputFile> <curiesToCategoriesYAML> <curiesToURLsYAML> <outputNodesFile> <outputEdgesFile>
+'''
+
+
 import argparse
 import kg2_util
 import json
 import datetime
+__author__ = 'Erica Wood'
+__copyright__ = 'Oregon State University'
+__credits__ = ['Stephen Ramsey', 'Erica Wood']
+__license__ = 'MIT'
+__version__ = '0.1.0'
+__maintainer__ = ''
+__email__ = ''
+__status__ = 'Prototype'
+
+
 ID_TAG = "rdf:about"
 NAME_TAG = "rdfs:label"
@@ -18,26 +35,26 @@ DESCRIPTION_DELIM = " // "
 BASE_EDGE_TYPES = {"mondo-base:exactMatch": RESOURCE_KEY,
- "mondo-base:closeMatch": RESOURCE_KEY,
- "mondo-base:relatedMatch": RESOURCE_KEY,
- "mondo-base:broadMatch": RESOURCE_KEY,
- "mondo-base:narrowMatch": RESOURCE_KEY,
- "skos:exactMatch": RESOURCE_KEY,
- "skos:closeMatch": RESOURCE_KEY,
- "skos:broadMatch": RESOURCE_KEY,
- "skos:relatedMatch": RESOURCE_KEY,
- "skos:narrowMatch": RESOURCE_KEY,
- "obo:IAO_0100001": RESOURCE_KEY,
- "obo:RO_0002175": RESOURCE_KEY,
- "obo:RO_0002161": RESOURCE_KEY,
- "obo:RO_0002604": RESOURCE_KEY,
- "obo:RO_0002171": RESOURCE_KEY,
- "obo:RO_0002174": RESOURCE_KEY,
- "obo:RO_0002475": RESOURCE_KEY,
- "obo:RO_0001900": RESOURCE_KEY,
- "oboInOwl:hasAlternativeId": TEXT_KEY,
- "oboInOwl:hasDbXref": TEXT_KEY,
- "oboInOwl:xref": TEXT_KEY}
+                   "mondo-base:closeMatch": RESOURCE_KEY,
+                   "mondo-base:relatedMatch": RESOURCE_KEY,
+                   "mondo-base:broadMatch": RESOURCE_KEY,
+                   "mondo-base:narrowMatch": RESOURCE_KEY,
+                   "skos:exactMatch": RESOURCE_KEY,
+                   "skos:closeMatch": RESOURCE_KEY,
+                   "skos:broadMatch": RESOURCE_KEY,
+                   "skos:relatedMatch": RESOURCE_KEY,
+                   "skos:narrowMatch": RESOURCE_KEY,
+                   "obo:IAO_0100001": RESOURCE_KEY,
+                   "obo:RO_0002175": RESOURCE_KEY,
+                   "obo:RO_0002161": RESOURCE_KEY,
+                   "obo:RO_0002604": RESOURCE_KEY,
+                   "obo:RO_0002171": RESOURCE_KEY,
+                   "obo:RO_0002174": RESOURCE_KEY,
+                   "obo:RO_0002475": RESOURCE_KEY,
+                   "obo:RO_0001900": RESOURCE_KEY,
+                   "oboInOwl:hasAlternativeId": TEXT_KEY,
+                   "oboInOwl:hasDbXref": TEXT_KEY,
+                   "oboInOwl:xref": TEXT_KEY}
 CLASS_TO_SUPERCLASSES = dict()
 SAVED_NODE_INFO = dict()
@@ -73,383 +90,386 @@ VERSION_KEY = "version"
 def get_args():
- arg_parser = argparse.ArgumentParser()
- arg_parser.add_argument('--test', dest='test',
- action="store_true", default=False)
- arg_parser.add_argument('inputFile', type=str)
- arg_parser.add_argument('curiesToCategoriesYAML', type=str)
- arg_parser.add_argument('curiesToURLsYAML', type=str)
- arg_parser.add_argument('outputNodesFile', type=str)
- arg_parser.add_argument('outputEdgesFile', type=str)
- return arg_parser.parse_args()
+    arg_parser = argparse.ArgumentParser()
+    arg_parser.add_argument('--test', dest='test',
+                            action="store_true", default=False)
+    arg_parser.add_argument('inputFile', type=str)
+    arg_parser.add_argument('curiesToCategoriesYAML', type=str)
+    arg_parser.add_argument('curiesToURLsYAML', type=str)
+    arg_parser.add_argument('outputNodesFile', type=str)
+    arg_parser.add_argument('outputEdgesFile', type=str)
+    return arg_parser.parse_args()
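+
+# categorize_node() below assigns a Biolink category to a node: an explicit
+# term mapping from the curies-to-categories file takes priority, then a
+# CURIE-prefix mapping, and otherwise the most common category among the
+# node's rdfs:subClassOf superclasses, recursing at most 10 levels before
+# falling back to the NamedThing default.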
 def categorize_node(node_id, recursion_depth=0):
- node_prefix = node_id.split(':')[0]
-
- if node_id in NODE_CATEGORY_MAPPINGS and NODE_CATEGORY_MAPPINGS[node_id][1] == FILE_MAPPING:
- return NODE_CATEGORY_MAPPINGS[node_id][0]
-
- if node_prefix in PREFIX_MAPPINGS:
- node_category = PREFIX_MAPPINGS[node_prefix]
- NODE_CATEGORY_MAPPINGS[node_id] = (node_category, PREFIX_MAPPING)
- return PREFIX_MAPPINGS[node_prefix]
-
- # Get try to get the most common superclass categorization
- superclass_categorizations = dict()
- highest_value = 0
- highest_category = kg2_util.BIOLINK_CATEGORY_NAMED_THING
- if recursion_depth == 10:
- return kg2_util.BIOLINK_CATEGORY_NAMED_THING
-
- for superclass in CLASS_TO_SUPERCLASSES.get(node_id, list()):
- superclass_category = categorize_node(superclass, recursion_depth + 1)
- if superclass_category not in superclass_categorizations:
- superclass_categorizations[superclass_category] = 0
- superclass_categorizations[superclass_category] += 1
- if superclass_categorizations[superclass_category] > highest_value:
- highest_value = superclass_categorizations[superclass_category]
- highest_category = superclass_category
-
- NODE_CATEGORY_MAPPINGS[node_id] = (highest_category, RECURSE_MAPPING)
- return highest_category
+    node_prefix = node_id.split(':')[0]
+
+    if node_id in NODE_CATEGORY_MAPPINGS and NODE_CATEGORY_MAPPINGS[node_id][1] == FILE_MAPPING:
+        return NODE_CATEGORY_MAPPINGS[node_id][0]
+
+    if node_prefix in PREFIX_MAPPINGS:
+        node_category = PREFIX_MAPPINGS[node_prefix]
+        NODE_CATEGORY_MAPPINGS[node_id] = (node_category, PREFIX_MAPPING)
+        return PREFIX_MAPPINGS[node_prefix]
+
+    # Try to get the most common superclass categorization
+    superclass_categorizations = dict()
+    highest_value = 0
+    highest_category = kg2_util.BIOLINK_CATEGORY_NAMED_THING
+    if recursion_depth == 10:
+        return kg2_util.BIOLINK_CATEGORY_NAMED_THING
+
+    for superclass in CLASS_TO_SUPERCLASSES.get(node_id, list()):
+        superclass_category = categorize_node(superclass, recursion_depth + 1)
+        if superclass_category not in superclass_categorizations:
+            superclass_categorizations[superclass_category] = 0
+        superclass_categorizations[superclass_category] += 1
+        if superclass_categorizations[superclass_category] > highest_value:
+            highest_value = superclass_categorizations[superclass_category]
+            highest_category = superclass_category
+
+    NODE_CATEGORY_MAPPINGS[node_id] = (highest_category, RECURSE_MAPPING)
+    return highest_category
 def reformat_obo_date(date_str):
- if date_str is None:
- return None
-
- if '-' in date_str:
- delim = 'T'
- if ' ' in date_str:
- delim = ' '
- date_spl = date_str.strip('Z').split(delim)
- date_fh = date_spl[0].split('-')
- year = int(date_fh[0])
- month = int(date_fh[1])
- day = int(date_fh[2])
-
- if month < 1 or month > 12 or day < 1 or day > 31:
- return None
-
- if len(date_spl) > 1:
- date_sh = date_spl[1].split(':')
- hour = int(date_sh[0])
- minute = int(date_sh[1])
- second = 
int(date_sh[2][0:1]) - - return datetime.datetime(year, month, day, hour, minute, second) - else: - return datetime.datetime(year, month, day) - else: - date_spl = date_str.split(' ') - date_fh = date_spl[0].split(':') - year = int(date_fh[2]) - month = int(date_fh[1]) - day = int(date_fh[0]) - - if month < 1 or month > 12 or day < 1 or day > 31: - return None - - return datetime.datetime(year, month, day) + if date_str is None: + return None + + if '-' in date_str: + delim = 'T' + if ' ' in date_str: + delim = ' ' + date_spl = date_str.strip('Z').split(delim) + date_fh = date_spl[0].split('-') + year = int(date_fh[0]) + month = int(date_fh[1]) + day = int(date_fh[2]) + + if month < 1 or month > 12 or day < 1 or day > 31: + return None + + if len(date_spl) > 1: + date_sh = date_spl[1].split(':') + hour = int(date_sh[0]) + minute = int(date_sh[1]) + second = int(date_sh[2][0:1]) + + return datetime.datetime(year, month, day, hour, minute, second) + else: + return datetime.datetime(year, month, day) + else: + date_spl = date_str.split(' ') + date_fh = date_spl[0].split(':') + year = int(date_fh[2]) + month = int(date_fh[1]) + day = int(date_fh[0]) + + if month < 1 or month > 12 or day < 1 or day > 31: + return None + + return datetime.datetime(year, month, day) def pick_most_recent_date(dates, alternate_date=None): - latest_date = None - for date in dates: - if date == None: - continue - if latest_date == None or date > latest_date: - latest_date = date - - if latest_date == None: - if alternate_date is not None: - latest_date = alternate_date - else: - return None - - return latest_date.isoformat(sep=' ') + latest_date = None + for date in dates: + if date == None: + continue + if latest_date == None or date > latest_date: + latest_date = date + + if latest_date == None: + if alternate_date is not None: + latest_date = alternate_date + else: + return None + + return latest_date.isoformat(sep=' ') def process_ontology_term(ontology_node, source, ontology_name, owl_source=True): - owl_prefix = "" - if owl_source: - owl_prefix = "owl:" - ontology_version = None - ontology_versions = [version.get(TEXT_KEY, str()) for version in ontology_node.get(owl_prefix + "versionInfo", list()) if TEXT_KEY in version] - ontology_version_iri = [version.get(RESOURCE_KEY, str()) for version in ontology_node.get(owl_prefix + "versionIRI", list()) if RESOURCE_KEY in version] - ontology_dates = [reformat_obo_date(version.get(TEXT_KEY, str())) for date_type in ["oboInOwl:date", "dcterms:date", "dc:date"] for version in ontology_node.get(date_type, list()) if TEXT_KEY in version] - ontology_iri = ontology_node.get("rdf:about", str()) - if len(ontology_versions) == 1: - ontology_version = ontology_versions[0] - elif len(ontology_version_iri) == 1: - ontology_version = ontology_version_iri[0] - version_replacements = [ontology_iri.replace('.owl', '') + '/', '/' + source, 'releases/'] - for replacement in version_replacements: - ontology_version = ontology_version.replace(replacement, "") - ontology_version = ontology_version.split('/')[0] - elif len(ontology_dates) >= 1: - ontology_version = pick_most_recent_date(ontology_dates) - - if ontology_version is None: - print("Warning: source", source, "lacks any versioning information.") - - ontology_date = reformat_obo_date(pick_most_recent_date(ontology_dates)) - source_id = kg2_util.CURIE_PREFIX_OBO + ':' + source - - if source not in SOURCE_INFO: - SOURCE_INFO[source] = {SOURCE_KEY: source_id, IRI_KEY: ontology_iri, NAME_KEY: ontology_name, UPDATE_DATE_KEY: 
ontology_date, VERSION_KEY: ontology_version} + owl_prefix = "" + if owl_source: + owl_prefix = "owl:" + ontology_version = None + ontology_versions = [version.get(TEXT_KEY, str()) for version in ontology_node.get(owl_prefix + "versionInfo", list()) if TEXT_KEY in version] + ontology_version_iri = [version.get(RESOURCE_KEY, str()) for version in ontology_node.get(owl_prefix + "versionIRI", list()) if RESOURCE_KEY in version] + ontology_dates = [reformat_obo_date(version.get(TEXT_KEY, str())) for date_type in ["oboInOwl:date", "dcterms:date", "dc:date"] for version in ontology_node.get(date_type, list()) if TEXT_KEY in version] + ontology_iri = ontology_node.get("rdf:about", str()) + if len(ontology_versions) == 1: + ontology_version = ontology_versions[0] + elif len(ontology_version_iri) == 1: + ontology_version = ontology_version_iri[0] + version_replacements = [ontology_iri.replace('.owl', '') + '/', '/' + source, 'releases/'] + for replacement in version_replacements: + ontology_version = ontology_version.replace(replacement, "") + ontology_version = ontology_version.split('/')[0] + elif len(ontology_dates) >= 1: + ontology_version = pick_most_recent_date(ontology_dates) + + if ontology_version is None: + print("Warning: source", source, "lacks any versioning information.") + + ontology_date = reformat_obo_date(pick_most_recent_date(ontology_dates)) + source_id = kg2_util.CURIE_PREFIX_OBO + ':' + source + + if source not in SOURCE_INFO: + SOURCE_INFO[source] = {SOURCE_KEY: source_id, IRI_KEY: ontology_iri, NAME_KEY: ontology_name, UPDATE_DATE_KEY: ontology_date, VERSION_KEY: ontology_version} def process_ontology_class(owl_class, source, ontology_name, owl_source=True): - owl_prefix = "" - if owl_source: - owl_prefix = "owl:" - # Typically genid classes which don't neatly map onto the KG2 schema - if ID_TAG not in owl_class: - return - node_id = match_prefix(owl_class.get(ID_TAG, str())) - if node_id is None: - return - node_prefix = node_id.split(':')[0] - node_iri = PREFIX_TO_IRI_MAP[node_prefix] + node_id.replace(node_prefix + ':', '') - - # Configure the name - name_list = [name.get(TEXT_KEY, None) for name in owl_class.get("rdfs:label", dict()) if TEXT_KEY in name] - if len(name_list) == 0: - return - - # Configure the description - description_list = list() - description_list += [description.get(TEXT_KEY, None) for description in owl_class.get("obo:IAO_0000115", list()) if (TEXT_KEY in description)] - description_list += [COMMENT_PREFIX + description.get(TEXT_KEY, str()) for description in owl_class.get("rdfs:comment", list()) if (TEXT_KEY in description)] - description_list += [description.get(TEXT_KEY, None) for description in owl_class.get("obo:UBPROP_0000001", list()) if (TEXT_KEY in description)] - description_list += [description.get(TEXT_KEY, None) for description in owl_class.get("obo:UBPROP_0000005", list()) if (TEXT_KEY in description)] - description_list += [description.get(TEXT_KEY, None) for description in owl_class.get("efo1:source_description", list()) if (TEXT_KEY in description)] - - deprecated = "true" in owl_class.get(owl_prefix + "deprecated", list()) - for name in name_list: - search_name = name.lower() - if search_name.startswith("obsolete") or search_name.startswith("(obsolete") or search_name.endswith("obsolete"): - deprecated = True - - # Configure the synonyms - synonym_list = list() - synonym_keys = ["oboInOwl:hasExactSynonym", "oboInOwl:hasRelatedSynonym", "oboInOwl:hasNarrowSynonym", "oboInOwl:hasBroadSynonym", "go:hasExactSynonym", - "go:hasSynonym", 
"go:hasNarrowSynonym", "go:hasBroadSynonym", "obo:IAO_0000118", "obo:IAO_0000589", "go:hasRelatedSynonym", "obo:IAO_0000111", - "obo:IAO_0000028", "skos:prefLabel"] - synonym_list += [synonym.get(TEXT_KEY, None) for synonym_key in synonym_keys for synonym in owl_class.get(synonym_key, list()) if (TEXT_KEY in synonym)] - - update_date_list = list() - update_date_keys = ["dc:date", "dcterms:date", "terms:date"] - update_date_list += [reformat_obo_date(update_date.get(TEXT_KEY, None)) for update_date_key in update_date_keys for update_date in owl_class.get(update_date_key, list()) if (TEXT_KEY in update_date)] - - creation_date_list = list() - creation_date_keys = ["oboInOwl:creation_date", "go:creation_date"] - creation_date_list += [reformat_obo_date(creation_date.get(TEXT_KEY, None)) for creation_date_key in creation_date_keys for creation_date in owl_class.get(creation_date_key, list()) if (TEXT_KEY in creation_date)] - - # Configure the biological sequence - has_biological_sequence = dict() - has_biological_sequence['formula'] = [biological_sequence.get(TEXT_KEY, None) for biological_sequence in owl_class.get("chebi:formula", list()) if TEXT_KEY in biological_sequence] - has_biological_sequence['smiles'] = [biological_sequence.get(TEXT_KEY, None) for biological_sequence in owl_class.get("chebi:smiles", list()) if TEXT_KEY in biological_sequence] - has_biological_sequence['inchi'] = [biological_sequence.get(TEXT_KEY, None) for biological_sequence in owl_class.get("chebi:inchi", list()) if TEXT_KEY in biological_sequence] - has_biological_sequence['inchikey'] = [biological_sequence.get(TEXT_KEY, None) for biological_sequence in owl_class.get("chebi:inchikey", list()) if TEXT_KEY in biological_sequence] - - # Extract edge triples - edges_list = list() - - for edge_type in BASE_EDGE_TYPES: - for edge in owl_class.get(edge_type, list()): - if BASE_EDGE_TYPES[edge_type] in edge: - edges_list.append((edge_type, edge.get(BASE_EDGE_TYPES[edge_type], None))) - - - restriction_edges = list() - restriction_edges += [(edge, "rdfs:subClassOf") for edge in owl_class.get("rdfs:subClassOf", list())] - for equiv in owl_class.get(owl_prefix + "equivalentClass", list()): - for mini_class in equiv.get(owl_prefix + "Class", list()): - for edge in mini_class.get(owl_prefix + "intersectionOf", list()): - restriction_edges.append((edge, owl_prefix + "equivalentClass")) - - for (edge, general_edge_type) in restriction_edges: - for restriction in edge.get(owl_prefix + "Restriction", list()): - edge_type = restriction.get(owl_prefix + "onProperty", list()) - edge_object = restriction.get(owl_prefix + "someValuesFrom", list()) - if len(edge_type) != 1: - assert len(edge_type) <= 1, edge - continue - if len(edge_object) != 1: - assert len(edge_object) <= 1, edge - continue - edge_type = edge_type[0].get(RESOURCE_KEY, None) - edge_object = edge_object[0].get(RESOURCE_KEY, None) - - if edge_type != None and edge_object != None: - edges_list.append((edge_type, edge_object)) - - if RESOURCE_KEY in edge: - edges_list.append((general_edge_type, edge.get(RESOURCE_KEY, None))) - - superclasses = set() - final_edges_list = list() - for (edge_relation, edge_object) in edges_list: - edge_object = match_prefix(edge_object) - if edge_object is None: - continue - edge_relation = match_prefix(edge_relation) - if edge_relation is None: - continue - if edge_relation in ["rdfs:subClassOf"]: - superclasses.add(edge_object) - final_edges_list.append((edge_relation, edge_object)) - - # Imperfect way to make it deterministic - 
superclasses = sorted(list(superclasses)) - if node_id not in CLASS_TO_SUPERCLASSES: - CLASS_TO_SUPERCLASSES[node_id] = list() - CLASS_TO_SUPERCLASSES[node_id] += superclasses - CLASS_TO_SUPERCLASSES[node_id] = sorted(list(set(CLASS_TO_SUPERCLASSES[node_id]))) - - if node_id not in SAVED_NODE_INFO: - SAVED_NODE_INFO[node_id] = list() - SAVED_NODE_INFO[node_id].append({ID_KEY: node_id, - DEPRECATED_KEY: deprecated, - UPDATE_DATE_KEY: update_date_list, - CREATION_DATE_KEY: creation_date_list, - SYNONYM_KEY: synonym_list, - DESCRIPTION_KEY: description_list, - NAME_KEY: name_list, - SOURCE_KEY: source, - BIOLOGICAL_SEQUENCE_KEY: has_biological_sequence, - IRI_KEY: node_iri, - EDGES_KEY: final_edges_list}) + owl_prefix = "" + if owl_source: + owl_prefix = "owl:" + # Typically genid classes which don't neatly map onto the KG2 schema + if ID_TAG not in owl_class: + return + node_id = match_prefix(owl_class.get(ID_TAG, str())) + if node_id is None: + return + node_prefix = node_id.split(':')[0] + node_iri = PREFIX_TO_IRI_MAP[node_prefix] + node_id.replace(node_prefix + ':', '') + + # Configure the name + name_list = [name.get(TEXT_KEY, None) for name in owl_class.get("rdfs:label", dict()) if TEXT_KEY in name] + if len(name_list) == 0: + return + + # Configure the description + description_list = list() + description_list += [description.get(TEXT_KEY, None) for description in owl_class.get("obo:IAO_0000115", list()) if (TEXT_KEY in description)] + description_list += [COMMENT_PREFIX + description.get(TEXT_KEY, str()) for description in owl_class.get("rdfs:comment", list()) if (TEXT_KEY in description)] + description_list += [description.get(TEXT_KEY, None) for description in owl_class.get("obo:UBPROP_0000001", list()) if (TEXT_KEY in description)] + description_list += [description.get(TEXT_KEY, None) for description in owl_class.get("obo:UBPROP_0000005", list()) if (TEXT_KEY in description)] + description_list += [description.get(TEXT_KEY, None) for description in owl_class.get("efo1:source_description", list()) if (TEXT_KEY in description)] + + deprecated = "true" in owl_class.get(owl_prefix + "deprecated", list()) + for name in name_list: + search_name = name.lower() + if search_name.startswith("obsolete") or search_name.startswith("(obsolete") or search_name.endswith("obsolete"): + deprecated = True + + # Configure the synonyms + synonym_list = list() + synonym_keys = ["oboInOwl:hasExactSynonym", "oboInOwl:hasRelatedSynonym", "oboInOwl:hasNarrowSynonym", "oboInOwl:hasBroadSynonym", "go:hasExactSynonym", + "go:hasSynonym", "go:hasNarrowSynonym", "go:hasBroadSynonym", "obo:IAO_0000118", "obo:IAO_0000589", "go:hasRelatedSynonym", "obo:IAO_0000111", + "obo:IAO_0000028", "skos:prefLabel"] + synonym_list += [synonym.get(TEXT_KEY, None) for synonym_key in synonym_keys for synonym in owl_class.get(synonym_key, list()) if (TEXT_KEY in synonym)] + + update_date_list = list() + update_date_keys = ["dc:date", "dcterms:date", "terms:date"] + update_date_list += [reformat_obo_date(update_date.get(TEXT_KEY, None)) for update_date_key in update_date_keys for update_date in owl_class.get(update_date_key, list()) if (TEXT_KEY in update_date)] + + creation_date_list = list() + creation_date_keys = ["oboInOwl:creation_date", "go:creation_date"] + creation_date_list += [reformat_obo_date(creation_date.get(TEXT_KEY, None)) for creation_date_key in creation_date_keys for creation_date in owl_class.get(creation_date_key, list()) if (TEXT_KEY in creation_date)] + + # Configure the biological sequence + 
has_biological_sequence = dict() + has_biological_sequence['formula'] = [biological_sequence.get(TEXT_KEY, None) for biological_sequence in owl_class.get("chebi:formula", list()) if TEXT_KEY in biological_sequence] + has_biological_sequence['smiles'] = [biological_sequence.get(TEXT_KEY, None) for biological_sequence in owl_class.get("chebi:smiles", list()) if TEXT_KEY in biological_sequence] + has_biological_sequence['inchi'] = [biological_sequence.get(TEXT_KEY, None) for biological_sequence in owl_class.get("chebi:inchi", list()) if TEXT_KEY in biological_sequence] + has_biological_sequence['inchikey'] = [biological_sequence.get(TEXT_KEY, None) for biological_sequence in owl_class.get("chebi:inchikey", list()) if TEXT_KEY in biological_sequence] + + # Extract edge triples + edges_list = list() + + for edge_type in BASE_EDGE_TYPES: + for edge in owl_class.get(edge_type, list()): + if BASE_EDGE_TYPES[edge_type] in edge: + edges_list.append((edge_type, edge.get(BASE_EDGE_TYPES[edge_type], None))) + + + restriction_edges = list() + restriction_edges += [(edge, "rdfs:subClassOf") for edge in owl_class.get("rdfs:subClassOf", list())] + for equiv in owl_class.get(owl_prefix + "equivalentClass", list()): + for mini_class in equiv.get(owl_prefix + "Class", list()): + for edge in mini_class.get(owl_prefix + "intersectionOf", list()): + restriction_edges.append((edge, owl_prefix + "equivalentClass")) + + for (edge, general_edge_type) in restriction_edges: + for restriction in edge.get(owl_prefix + "Restriction", list()): + edge_type = restriction.get(owl_prefix + "onProperty", list()) + edge_object = restriction.get(owl_prefix + "someValuesFrom", list()) + if len(edge_type) != 1: + assert len(edge_type) <= 1, edge + continue + if len(edge_object) != 1: + assert len(edge_object) <= 1, edge + continue + edge_type = edge_type[0].get(RESOURCE_KEY, None) + edge_object = edge_object[0].get(RESOURCE_KEY, None) + + if edge_type != None and edge_object != None: + edges_list.append((edge_type, edge_object)) + + if RESOURCE_KEY in edge: + edges_list.append((general_edge_type, edge.get(RESOURCE_KEY, None))) + + superclasses = set() + final_edges_list = list() + for (edge_relation, edge_object) in edges_list: + edge_object = match_prefix(edge_object) + if edge_object is None: + continue + edge_relation = match_prefix(edge_relation) + if edge_relation is None: + continue + if edge_relation in ["rdfs:subClassOf"]: + superclasses.add(edge_object) + final_edges_list.append((edge_relation, edge_object)) + + # Imperfect way to make it deterministic + superclasses = sorted(list(superclasses)) + if node_id not in CLASS_TO_SUPERCLASSES: + CLASS_TO_SUPERCLASSES[node_id] = list() + CLASS_TO_SUPERCLASSES[node_id] += superclasses + CLASS_TO_SUPERCLASSES[node_id] = sorted(list(set(CLASS_TO_SUPERCLASSES[node_id]))) + + if node_id not in SAVED_NODE_INFO: + SAVED_NODE_INFO[node_id] = list() + SAVED_NODE_INFO[node_id].append({ID_KEY: node_id, + DEPRECATED_KEY: deprecated, + UPDATE_DATE_KEY: update_date_list, + CREATION_DATE_KEY: creation_date_list, + SYNONYM_KEY: synonym_list, + DESCRIPTION_KEY: description_list, + NAME_KEY: name_list, + SOURCE_KEY: source, + BIOLOGICAL_SEQUENCE_KEY: has_biological_sequence, + IRI_KEY: node_iri, + EDGES_KEY: final_edges_list}) def process_ontology_item(ontology_item): - source = ontology_item.get(OWL_SOURCE_KEY, str()) - ontology_name = ontology_item.get(OWL_SOURCE_NAME_KEY, str()) + source = ontology_item.get(OWL_SOURCE_KEY, str()) + ontology_name = ontology_item.get(OWL_SOURCE_NAME_KEY, str()) 
- for owl_class in ontology_item.get("owl:Class", list()): - process_ontology_class(owl_class, source, ontology_name) + for owl_class in ontology_item.get("owl:Class", list()): + process_ontology_class(owl_class, source, ontology_name) - for owl_class in ontology_item.get("Class", list()): - process_ontology_class(owl_class, source, ontology_name, False) + for owl_class in ontology_item.get("Class", list()): + process_ontology_class(owl_class, source, ontology_name, False) - for ontology_node in ontology_item.get("owl:Ontology", list()): - process_ontology_term(ontology_node, source, ontology_name) + for ontology_node in ontology_item.get("owl:Ontology", list()): + process_ontology_term(ontology_node, source, ontology_name) - # Because of ORDO - for ontology_node in ontology_item.get("Ontology", list()): - process_ontology_term(ontology_node, source, ontology_name, False) + # Because of ORDO + for ontology_node in ontology_item.get("Ontology", list()): + process_ontology_term(ontology_node, source, ontology_name, False) def generate_uri_map(curies_to_urls_file_name): - uri_input_map = kg2_util.safe_load_yaml_from_string(kg2_util.read_file_to_string(curies_to_urls_file_name)) - bidirectional_map = uri_input_map['use_for_bidirectional_mapping'] - contraction_map = uri_input_map['use_for_contraction_only'] - - for curie_prefix_dict in bidirectional_map: - for curie_prefix in curie_prefix_dict: - curie_url = curie_prefix_dict[curie_prefix] - URI_MAP[curie_url] = curie_prefix - PREFIX_TO_IRI_MAP[curie_prefix] = curie_url - - for curie_prefix_dict in contraction_map: - for curie_prefix in curie_prefix_dict: - curie_url = curie_prefix_dict[curie_prefix] - URI_MAP[curie_url] = curie_prefix - - # So that you get the most accurate match, you want to match to the longest url (in case one is a substring of another) - # Apparently have to use global key word to write to a module wide list (https://stackoverflow.com/questions/4630543/defining-lists-as-global-variables-in-python) - global URI_MAP_KEYS - URI_MAP_KEYS = sorted(URI_MAP.keys(), key=len, reverse=True) + uri_input_map = kg2_util.safe_load_yaml_from_string(kg2_util.read_file_to_string(curies_to_urls_file_name)) + bidirectional_map = uri_input_map['use_for_bidirectional_mapping'] + contraction_map = uri_input_map['use_for_contraction_only'] + + for curie_prefix_dict in bidirectional_map: + for curie_prefix in curie_prefix_dict: + curie_url = curie_prefix_dict[curie_prefix] + URI_MAP[curie_url] = curie_prefix + PREFIX_TO_IRI_MAP[curie_prefix] = curie_url + + for curie_prefix_dict in contraction_map: + for curie_prefix in curie_prefix_dict: + curie_url = curie_prefix_dict[curie_prefix] + URI_MAP[curie_url] = curie_prefix + + # So that you get the most accurate match, you want to match to the longest url (in case one is a substring of another) + # Apparently have to use global key word to write to a module wide list (https://stackoverflow.com/questions/4630543/defining-lists-as-global-variables-in-python) + global URI_MAP_KEYS + URI_MAP_KEYS = sorted(URI_MAP.keys(), key=len, reverse=True) def match_prefix(node_id): - for curie_url in URI_MAP_KEYS: - if node_id.startswith(curie_url): - return node_id.replace(curie_url, URI_MAP[curie_url] + ":") - - if "http" in node_id: - MISSING_ID_PREFIXES.add('/'.join(node_id.split('/')[0:-1]) + "/") - elif ':' in node_id: - MISSING_ID_PREFIXES.add(node_id.split(':')[0] + ":") - elif '_' in node_id: - MISSING_ID_PREFIXES.add(node_id.split('_')[0] + "_") - else: - MISSING_ID_PREFIXES.add(node_id) + for curie_url 
in URI_MAP_KEYS: + if node_id.startswith(curie_url): + return node_id.replace(curie_url, URI_MAP[curie_url] + ":") + + if "http" in node_id: + MISSING_ID_PREFIXES.add('/'.join(node_id.split('/')[0:-1]) + "/") + elif ':' in node_id: + MISSING_ID_PREFIXES.add(node_id.split(':')[0] + ":") + elif '_' in node_id: + MISSING_ID_PREFIXES.add(node_id.split('_')[0] + "_") + else: + MISSING_ID_PREFIXES.add(node_id) def construct_nodes_and_edges(nodes_output, edges_output): - for source in SOURCE_INFO: - source_date = pick_most_recent_date([SOURCE_INFO[source][UPDATE_DATE_KEY]]) - source_name = SOURCE_INFO[source][NAME_KEY] + " v" + SOURCE_INFO[source][VERSION_KEY] - source_id = SOURCE_INFO[source][SOURCE_KEY] - source_iri = SOURCE_INFO[source][IRI_KEY] - node = kg2_util.make_node(source_id, source_iri, source_name, kg2_util.BIOLINK_CATEGORY_INFORMATION_CONTENT_ENTITY, source_date, source_id) + for source in SOURCE_INFO: + source_date = pick_most_recent_date([SOURCE_INFO[source][UPDATE_DATE_KEY]]) + source_name = SOURCE_INFO[source][NAME_KEY] + " v" + SOURCE_INFO[source][VERSION_KEY] + source_id = SOURCE_INFO[source][SOURCE_KEY] + source_iri = SOURCE_INFO[source][IRI_KEY] + node = kg2_util.make_node(source_id, source_iri, source_name, kg2_util.BIOLINK_CATEGORY_INFORMATION_CONTENT_ENTITY, source_date, source_id) - nodes_output.write(node) + nodes_output.write(node) - for node_id in SAVED_NODE_INFO: - for source_node_index in range(len(SAVED_NODE_INFO[node_id])): - if SAVED_NODE_INFO[node_id][source_node_index][DEPRECATED_KEY]: - continue - name = SAVED_NODE_INFO[node_id][source_node_index][NAME_KEY][0] # Imperfect way of choosing the name - node_iri = SAVED_NODE_INFO[node_id][source_node_index][IRI_KEY] - description = DESCRIPTION_DELIM.join(SAVED_NODE_INFO[node_id][source_node_index][DESCRIPTION_KEY]) - has_biological_sequence = SAVED_NODE_INFO[node_id][source_node_index][BIOLOGICAL_SEQUENCE_KEY].get("smiles", None) - synonyms = SAVED_NODE_INFO[node_id][source_node_index][SYNONYM_KEY] - category = SAVED_NODE_INFO[node_id][source_node_index][CATEGORY_KEY] + for node_id in SAVED_NODE_INFO: + for source_node_index in range(len(SAVED_NODE_INFO[node_id])): + if SAVED_NODE_INFO[node_id][source_node_index][DEPRECATED_KEY]: + continue + name = SAVED_NODE_INFO[node_id][source_node_index][NAME_KEY][0] # Imperfect way of choosing the name + node_iri = SAVED_NODE_INFO[node_id][source_node_index][IRI_KEY] + description = DESCRIPTION_DELIM.join(SAVED_NODE_INFO[node_id][source_node_index][DESCRIPTION_KEY]) + has_biological_sequence = SAVED_NODE_INFO[node_id][source_node_index][BIOLOGICAL_SEQUENCE_KEY].get("smiles", None) + synonyms = SAVED_NODE_INFO[node_id][source_node_index][SYNONYM_KEY] + category = SAVED_NODE_INFO[node_id][source_node_index][CATEGORY_KEY] - source = SAVED_NODE_INFO[node_id][source_node_index][SOURCE_KEY] - provided_by = kg2_util.CURIE_PREFIX_OBO + ':' + source - source_date = SOURCE_INFO[source][UPDATE_DATE_KEY] + source = SAVED_NODE_INFO[node_id][source_node_index][SOURCE_KEY] + provided_by = kg2_util.CURIE_PREFIX_OBO + ':' + source + source_date = SOURCE_INFO[source][UPDATE_DATE_KEY] - update_date = pick_most_recent_date(SAVED_NODE_INFO[node_id][source_node_index][UPDATE_DATE_KEY], source_date) - creation_date = pick_most_recent_date(SAVED_NODE_INFO[node_id][source_node_index][CREATION_DATE_KEY], source_date) + update_date = pick_most_recent_date(SAVED_NODE_INFO[node_id][source_node_index][UPDATE_DATE_KEY], source_date) + creation_date = 
pick_most_recent_date(SAVED_NODE_INFO[node_id][source_node_index][CREATION_DATE_KEY], source_date) - node = kg2_util.make_node(node_id, node_iri, name, category, update_date, provided_by) - node["description"] = description - node["has_biological_sequence"] = has_biological_sequence - node["creation_date"] = creation_date - node["synonym"] = synonyms + node = kg2_util.make_node(node_id, node_iri, name, category, update_date, provided_by) + node["description"] = description + node["has_biological_sequence"] = has_biological_sequence + node["creation_date"] = creation_date + node["synonym"] = synonyms - nodes_output.write(node) + nodes_output.write(node) - for (edge_relation, edge_object) in SAVED_NODE_INFO[node_id][source_node_index][EDGES_KEY]: - relation_label = edge_relation.split(':')[1] - edge = kg2_util.make_edge(node_id, edge_object, edge_relation, relation_label, provided_by, update_date) + for (edge_relation, edge_object) in SAVED_NODE_INFO[node_id][source_node_index][EDGES_KEY]: + relation_label = edge_relation.split(':')[1] + edge = kg2_util.make_edge(node_id, edge_object, edge_relation, relation_label, provided_by, update_date) - edges_output.write(edge) + edges_output.write(edge) if __name__ == '__main__': - args = get_args() - input_file_name = args.inputFile - curies_to_categories_file_name = args.curiesToCategoriesYAML - curies_to_urls_file_name = args.curiesToURLsYAML - output_nodes_file_name = args.outputNodesFile - output_edges_file_name = args.outputEdgesFile - test_mode = args.test - - nodes_info, edges_info = kg2_util.create_kg2_jsonlines(test_mode) - nodes_output = nodes_info[0] - edges_output = edges_info[0] - - curies_to_categories_data = kg2_util.safe_load_yaml_from_string(kg2_util.read_file_to_string(curies_to_categories_file_name)) - for mapping_node in curies_to_categories_data["term-mappings"]: - NODE_CATEGORY_MAPPINGS[mapping_node] = (curies_to_categories_data["term-mappings"][mapping_node], FILE_MAPPING) - for prefix in curies_to_categories_data["prefix-mappings"]: - PREFIX_MAPPINGS[prefix] = curies_to_categories_data["prefix-mappings"][prefix] - - input_read_jsonlines_info = kg2_util.start_read_jsonlines(input_file_name) - input_data = input_read_jsonlines_info[0] - - ontology_prefixes = set() - generate_uri_map(curies_to_urls_file_name) - for ontology_item in input_data: - process_ontology_item(ontology_item) - - for node_id in SAVED_NODE_INFO: - categorize_node(node_id) - node_category = NODE_CATEGORY_MAPPINGS[node_id][0] - for index in range(len(SAVED_NODE_INFO[node_id])): - SAVED_NODE_INFO[node_id][index][CATEGORY_KEY] = node_category - - construct_nodes_and_edges(nodes_output, edges_output) - - kg2_util.close_kg2_jsonlines(nodes_info, edges_info, output_nodes_file_name, output_edges_file_name) \ No newline at end of file + print("Start time: ", kg2_util.date()) + args = get_args() + input_file_name = args.inputFile + curies_to_categories_file_name = args.curiesToCategoriesYAML + curies_to_urls_file_name = args.curiesToURLsYAML + output_nodes_file_name = args.outputNodesFile + output_edges_file_name = args.outputEdgesFile + test_mode = args.test + + nodes_info, edges_info = kg2_util.create_kg2_jsonlines(test_mode) + nodes_output = nodes_info[0] + edges_output = edges_info[0] + + curies_to_categories_data = kg2_util.safe_load_yaml_from_string(kg2_util.read_file_to_string(curies_to_categories_file_name)) + for mapping_node in curies_to_categories_data["term-mappings"]: + NODE_CATEGORY_MAPPINGS[mapping_node] = 
(curies_to_categories_data["term-mappings"][mapping_node], FILE_MAPPING)
+    for prefix in curies_to_categories_data["prefix-mappings"]:
+        PREFIX_MAPPINGS[prefix] = curies_to_categories_data["prefix-mappings"][prefix]
+
+    input_read_jsonlines_info = kg2_util.start_read_jsonlines(input_file_name)
+    input_data = input_read_jsonlines_info[0]
+
+    ontology_prefixes = set()
+    generate_uri_map(curies_to_urls_file_name)
+    for ontology_item in input_data:
+        process_ontology_item(ontology_item)
+
+    for node_id in SAVED_NODE_INFO:
+        categorize_node(node_id)
+        node_category = NODE_CATEGORY_MAPPINGS[node_id][0]
+        for index in range(len(SAVED_NODE_INFO[node_id])):
+            SAVED_NODE_INFO[node_id][index][CATEGORY_KEY] = node_category
+
+    construct_nodes_and_edges(nodes_output, edges_output)
+
+    kg2_util.close_kg2_jsonlines(nodes_info, edges_info, output_nodes_file_name, output_edges_file_name)
+
+    print("Finish time: ", kg2_util.date())
diff --git a/extract/owlparser.py b/extract/owlparser.py
index 34e99fe3..fe540f3b 100644
--- a/extract/owlparser.py
+++ b/extract/owlparser.py
@@ -1,524 +1,540 @@
+#!/usr/bin/env python3
+''' owlparser.py: Converts OWL (XML) Files into JSON Lines Representations
+
+    Usage: owlparser.py [--test] <inputFile> <owlFilePath> <outputFile>
+'''
+
 import json
 import argparse
 import datetime
 import kg2_util
+__author__ = 'Erica Wood'
+__copyright__ = 'Oregon State University'
+__credits__ = ['Stephen Ramsey', 'Erica Wood']
+__license__ = 'MIT'
+__version__ = '0.1.0'
+__maintainer__ = ''
+__email__ = ''
+__status__ = 'Prototype'
+
+
 def get_args():
- arg_parser = argparse.ArgumentParser()
- arg_parser.add_argument('--test', dest='test',
- action="store_true", default=False)
- arg_parser.add_argument('inputFile', type=str)
- arg_parser.add_argument('owlFilePath', type=str)
- arg_parser.add_argument('outputFile', type=str)
- return arg_parser.parse_args()
+    arg_parser = argparse.ArgumentParser()
+    arg_parser.add_argument('--test', dest='test',
+                            action="store_true", default=False)
+    arg_parser.add_argument('inputFile', type=str)
+    arg_parser.add_argument('owlFilePath', type=str)
+    arg_parser.add_argument('outputFile', type=str)
+    return arg_parser.parse_args()
 def date():
- return datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+    return datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
 class LineElementRead():
- NONE = 0
- TAG = 1
- ATTRIBUTE_TAG = 2
- ATTRIBUTE_TEXT = 3
- MAIN = 4
- END_TAG = 5
+    NONE = 0
+    TAG = 1
+    ATTRIBUTE_TAG = 2
+    ATTRIBUTE_TEXT = 3
+    MAIN = 4
+    END_TAG = 5
 class XMLParser():
- def __init__(self, skip_tags, ignored_attributes, processing_func):
- self.COMMENT = "!--"
- self.OUTMOST_TAGS_SKIP = skip_tags
- self.IGNORED_ATTRIBUTES = ignored_attributes
- self.processing_func = processing_func
-
- self.LINE_TYPE_IGNORE = "ignore"
- self.LINE_TYPE_START_NEST = "start nest"
- self.LINE_TYPE_START_NEST_WITH_ATTR = "start nest with attributes"
- self.LINE_TYPE_ENTRY = "entry"
- self.LINE_TYPE_ENTRY_WITH_ATTR = "entry with attributes"
- self.LINE_TYPE_ENTRY_ONLY_ATTR = "entry with only attributes"
- self.LINE_TYPE_END_NEST = "end nest"
-
- self.KEY_TAG = "tag"
- self.KEY_ATTRIBUTES = "attributes"
- self.KEY_TEXT = "ENTRY_TEXT"
- self.KEY_TYPE = "type"
-
- # Variables for line reading
- self.tag = ""
- self.attributes = dict()
- self.attribute_tag = ""
- self.attribute_text = ""
- self.main_text = ""
- self.end_tag = ""
- self.only_tag = False
- self.start_brackets = 0
- self.line = ""
- self.letter = ""
- self.next_letter = ""
- self.prev_letter = ""
- self.type_to_read = LineElementRead.NONE
-
- def 
categorize_line(self): - # Categorize the type of line - line_type = str() - out = dict() - - # Putting "only_tag" here isn't necessarily the best idea, but I don't know what else to do with it - if self.tag == self.COMMENT or self.tag in self.OUTMOST_TAGS_SKIP or self.end_tag in self.OUTMOST_TAGS_SKIP or self.only_tag: - line_type = self.LINE_TYPE_IGNORE - else: - start_tag_exists = (self.tag != str()) - attributes_exist = (self.attributes != dict()) - text_exists = (self.main_text != str()) - end_tag_exists = (self.end_tag != str()) - - if start_tag_exists: - if attributes_exist: - if text_exists: - line_type = self.LINE_TYPE_ENTRY_WITH_ATTR - out[self.KEY_TAG] = self.tag - out[self.KEY_ATTRIBUTES] = self.attributes - out[self.KEY_TEXT] = self.main_text - elif end_tag_exists: - line_type = self.LINE_TYPE_ENTRY_ONLY_ATTR - out[self.KEY_TAG] = self.tag - out[self.KEY_ATTRIBUTES] = self.attributes - else: - line_type = self.LINE_TYPE_START_NEST_WITH_ATTR - out[self.KEY_TAG] = self.tag - out[self.KEY_ATTRIBUTES] = self.attributes - elif text_exists: - line_type = self.LINE_TYPE_ENTRY - out[self.KEY_TAG] = self.tag - out[self.KEY_TEXT] = self.main_text - else: - line_type = self.LINE_TYPE_START_NEST - out[self.KEY_TAG] = self.tag - elif end_tag_exists: - line_type = self.LINE_TYPE_END_NEST - out[self.KEY_TAG] = self.end_tag - - out[self.KEY_TYPE] = line_type - - return out - - def get_letters(self, letter_index): - self.letter = self.line[letter_index] - self.next_letter = "" - self.prev_letter = "" - if letter_index + 1 < len(self.line): - self.next_letter = self.line[letter_index + 1] - if letter_index - 1 >= 0: - self.prev_letter = self.line[letter_index - 1] - - if self.letter == '<': - self.start_brackets += 1 - if self.letter == '>': - self.start_brackets -= 1 - - - def identify_tag_type(self, letter_index): - changed = True - - if self.letter == '<' and letter_index == 0: - if self.next_letter != '/': - self.type_to_read = LineElementRead.TAG - elif self.letter == '/' and self.prev_letter == '<': - self.type_to_read = LineElementRead.END_TAG - else: - changed = False - - return changed - - - def read_tag(self): - changed = True - - if self.letter == ' ' and self.type_to_read == LineElementRead.TAG: - self.type_to_read = LineElementRead.ATTRIBUTE_TAG - elif self.letter == '>' and self.type_to_read == LineElementRead.TAG and self.start_brackets == 0: - self.type_to_read = LineElementRead.MAIN - - if self.prev_letter == '/': - print("Warning - strange tag, ignoring", self.line) - self.only_tag = True - elif self.type_to_read == LineElementRead.TAG: - self.tag += self.letter - else: - changed = False - - return changed - - - def store_attribute(self): - if self.attribute_tag not in self.IGNORED_ATTRIBUTES: - self.attributes[self.attribute_tag] = self.attribute_text.strip('/').strip('"') - self.attribute_tag = "" - self.attribute_text = "" - - - def read_attributes(self): - changed = True - start_reading_attributes = (self.type_to_read == LineElementRead.ATTRIBUTE_TAG or self.type_to_read == LineElementRead.ATTRIBUTE_TEXT) - - if self.letter == '>' and start_reading_attributes and self.start_brackets == 0: - self.type_to_read = LineElementRead.MAIN - - self.store_attribute() - - if self.prev_letter == '/': - self.end_tag = self.tag - elif start_reading_attributes: - if self.letter == '=' and self.type_to_read == LineElementRead.ATTRIBUTE_TAG: - self.type_to_read = LineElementRead.ATTRIBUTE_TEXT - elif self.type_to_read == LineElementRead.ATTRIBUTE_TAG: - self.attribute_tag += self.letter - 
elif self.letter == ' ' and self.type_to_read == LineElementRead.ATTRIBUTE_TEXT: - self.type_to_read = LineElementRead.ATTRIBUTE_TAG - self.store_attribute() - elif self.type_to_read == LineElementRead.ATTRIBUTE_TEXT: - self.attribute_text += self.letter - else: - changed = False + def __init__(self, skip_tags, ignored_attributes, processing_func): + self.COMMENT = "!--" + self.OUTMOST_TAGS_SKIP = skip_tags + self.IGNORED_ATTRIBUTES = ignored_attributes + self.processing_func = processing_func + + self.LINE_TYPE_IGNORE = "ignore" + self.LINE_TYPE_START_NEST = "start nest" + self.LINE_TYPE_START_NEST_WITH_ATTR = "start nest with attributes" + self.LINE_TYPE_ENTRY = "entry" + self.LINE_TYPE_ENTRY_WITH_ATTR = "entry with attributes" + self.LINE_TYPE_ENTRY_ONLY_ATTR = "entry with only attributes" + self.LINE_TYPE_END_NEST = "end nest" + + self.KEY_TAG = "tag" + self.KEY_ATTRIBUTES = "attributes" + self.KEY_TEXT = "ENTRY_TEXT" + self.KEY_TYPE = "type" + + # Variables for line reading + self.tag = "" + self.attributes = dict() + self.attribute_tag = "" + self.attribute_text = "" + self.main_text = "" + self.end_tag = "" + self.only_tag = False + self.start_brackets = 0 + self.line = "" + self.letter = "" + self.next_letter = "" + self.prev_letter = "" + self.type_to_read = LineElementRead.NONE + + def categorize_line(self): + # Categorize the type of line + line_type = str() + out = dict() + + # Putting "only_tag" here isn't necessarily the best idea, but I don't know what else to do with it + if self.tag == self.COMMENT or self.tag in self.OUTMOST_TAGS_SKIP or self.end_tag in self.OUTMOST_TAGS_SKIP or self.only_tag: + line_type = self.LINE_TYPE_IGNORE + else: + start_tag_exists = (self.tag != str()) + attributes_exist = (self.attributes != dict()) + text_exists = (self.main_text != str()) + end_tag_exists = (self.end_tag != str()) + + if start_tag_exists: + if attributes_exist: + if text_exists: + line_type = self.LINE_TYPE_ENTRY_WITH_ATTR + out[self.KEY_TAG] = self.tag + out[self.KEY_ATTRIBUTES] = self.attributes + out[self.KEY_TEXT] = self.main_text + elif end_tag_exists: + line_type = self.LINE_TYPE_ENTRY_ONLY_ATTR + out[self.KEY_TAG] = self.tag + out[self.KEY_ATTRIBUTES] = self.attributes + else: + line_type = self.LINE_TYPE_START_NEST_WITH_ATTR + out[self.KEY_TAG] = self.tag + out[self.KEY_ATTRIBUTES] = self.attributes + elif text_exists: + line_type = self.LINE_TYPE_ENTRY + out[self.KEY_TAG] = self.tag + out[self.KEY_TEXT] = self.main_text + else: + line_type = self.LINE_TYPE_START_NEST + out[self.KEY_TAG] = self.tag + elif end_tag_exists: + line_type = self.LINE_TYPE_END_NEST + out[self.KEY_TAG] = self.end_tag + + out[self.KEY_TYPE] = line_type + + return out + + def get_letters(self, letter_index): + self.letter = self.line[letter_index] + self.next_letter = "" + self.prev_letter = "" + if letter_index + 1 < len(self.line): + self.next_letter = self.line[letter_index + 1] + if letter_index - 1 >= 0: + self.prev_letter = self.line[letter_index - 1] + + if self.letter == '<': + self.start_brackets += 1 + if self.letter == '>': + self.start_brackets -= 1 + + + def identify_tag_type(self, letter_index): + changed = True + + if self.letter == '<' and letter_index == 0: + if self.next_letter != '/': + self.type_to_read = LineElementRead.TAG + elif self.letter == '/' and self.prev_letter == '<': + self.type_to_read = LineElementRead.END_TAG + else: + changed = False + + return changed + + + def read_tag(self): + changed = True + + if self.letter == ' ' and self.type_to_read == 
LineElementRead.TAG: + self.type_to_read = LineElementRead.ATTRIBUTE_TAG + elif self.letter == '>' and self.type_to_read == LineElementRead.TAG and self.start_brackets == 0: + self.type_to_read = LineElementRead.MAIN + + if self.prev_letter == '/': + print("Warning - strange tag, ignoring", self.line) + self.only_tag = True + elif self.type_to_read == LineElementRead.TAG: + self.tag += self.letter + else: + changed = False + + return changed + + + def store_attribute(self): + if self.attribute_tag not in self.IGNORED_ATTRIBUTES: + self.attributes[self.attribute_tag] = self.attribute_text.strip('/').strip('"') + self.attribute_tag = "" + self.attribute_text = "" + + + def read_attributes(self): + changed = True + start_reading_attributes = (self.type_to_read == LineElementRead.ATTRIBUTE_TAG or self.type_to_read == LineElementRead.ATTRIBUTE_TEXT) + + if self.letter == '>' and start_reading_attributes and self.start_brackets == 0: + self.type_to_read = LineElementRead.MAIN + + self.store_attribute() + + if self.prev_letter == '/': + self.end_tag = self.tag + elif start_reading_attributes: + if self.letter == '=' and self.type_to_read == LineElementRead.ATTRIBUTE_TAG: + self.type_to_read = LineElementRead.ATTRIBUTE_TEXT + elif self.type_to_read == LineElementRead.ATTRIBUTE_TAG: + self.attribute_tag += self.letter + elif self.letter == ' ' and self.type_to_read == LineElementRead.ATTRIBUTE_TEXT: + self.type_to_read = LineElementRead.ATTRIBUTE_TAG + self.store_attribute() + elif self.type_to_read == LineElementRead.ATTRIBUTE_TEXT: + self.attribute_text += self.letter + else: + changed = False - return changed + return changed - def read_main(self): - changed = True - if self.letter == '<' and self.type_to_read == LineElementRead.MAIN: - self.type_to_read = LineElementRead.END_TAG - elif self.type_to_read == LineElementRead.MAIN: - self.main_text += self.letter - else: - changed = False + def read_main(self): + changed = True + if self.letter == '<' and self.type_to_read == LineElementRead.MAIN: + self.type_to_read = LineElementRead.END_TAG + elif self.type_to_read == LineElementRead.MAIN: + self.main_text += self.letter + else: + changed = False - return changed + return changed - def read_end_tag(self): - changed = True - if self.letter == '>' and self.type_to_read == LineElementRead.END_TAG and self.start_brackets == 0: - pass - elif self.type_to_read == LineElementRead.END_TAG: - self.end_tag += self.letter - else: - changed = False + def read_end_tag(self): + changed = True + if self.letter == '>' and self.type_to_read == LineElementRead.END_TAG and self.start_brackets == 0: + pass + elif self.type_to_read == LineElementRead.END_TAG: + self.end_tag += self.letter + else: + changed = False - return changed + return changed - def convert_line(self): - self.tag = "" - self.attributes = dict() - self.attribute_tag = "" - self.attribute_text = "" - self.main_text = "" - self.end_tag = "" + def convert_line(self): + self.tag = "" + self.attributes = dict() + self.attribute_tag = "" + self.attribute_text = "" + self.main_text = "" + self.end_tag = "" - self.type_to_read = LineElementRead.NONE + self.type_to_read = LineElementRead.NONE - self.only_tag = False + self.only_tag = False - self.start_brackets = 0 + self.start_brackets = 0 - for letter_index in range(len(self.line)): - self.get_letters(letter_index) + for letter_index in range(len(self.line)): + self.get_letters(letter_index) - # First < - if self.identify_tag_type(letter_index): - continue + # First < + if 
self.identify_tag_type(letter_index): + continue - if self.read_tag(): - continue + if self.read_tag(): + continue - if self.read_attributes(): - continue + if self.read_attributes(): + continue - if self.read_main(): - continue + if self.read_main(): + continue - if self.read_end_tag(): - continue + if self.read_end_tag(): + continue - return self.categorize_line() + return self.categorize_line() - def convert_nest(self, nest, start_index): - nest_dict = dict() - curr_index = start_index + def convert_nest(self, nest, start_index): + nest_dict = dict() + curr_index = start_index - while curr_index < len(nest): - element = nest[curr_index] - line_type = element[self.KEY_TYPE] - line_tag = element[self.KEY_TAG] - line_text = element.get(self.KEY_TEXT, None) - line_attributes = element.get(self.KEY_ATTRIBUTES, None) + while curr_index < len(nest): + element = nest[curr_index] + line_type = element[self.KEY_TYPE] + line_tag = element[self.KEY_TAG] + line_text = element.get(self.KEY_TEXT, None) + line_attributes = element.get(self.KEY_ATTRIBUTES, None) - if line_type in [self.LINE_TYPE_START_NEST, self.LINE_TYPE_START_NEST_WITH_ATTR]: - if line_tag not in nest_dict: - nest_dict[line_tag] = list() + if line_type in [self.LINE_TYPE_START_NEST, self.LINE_TYPE_START_NEST_WITH_ATTR]: + if line_tag not in nest_dict: + nest_dict[line_tag] = list() - converted_nest, ret_index = self.convert_nest(nest, curr_index + 1) + converted_nest, ret_index = self.convert_nest(nest, curr_index + 1) - if line_attributes is not None: - for attribute in line_attributes: - converted_nest[attribute] = line_attributes[attribute] + if line_attributes is not None: + for attribute in line_attributes: + converted_nest[attribute] = line_attributes[attribute] - nest_dict[line_tag].append(converted_nest) + nest_dict[line_tag].append(converted_nest) - curr_index = ret_index + 1 - continue + curr_index = ret_index + 1 + continue - if line_type in [self.LINE_TYPE_ENTRY, self.LINE_TYPE_ENTRY_WITH_ATTR, self.LINE_TYPE_ENTRY_ONLY_ATTR]: - if line_tag not in nest_dict: - nest_dict[line_tag] = list() + if line_type in [self.LINE_TYPE_ENTRY, self.LINE_TYPE_ENTRY_WITH_ATTR, self.LINE_TYPE_ENTRY_ONLY_ATTR]: + if line_tag not in nest_dict: + nest_dict[line_tag] = list() - curr_dict = dict() + curr_dict = dict() - if line_text is not None: - curr_dict[self.KEY_TEXT] = line_text + if line_text is not None: + curr_dict[self.KEY_TEXT] = line_text - if line_attributes is not None: - for attribute in line_attributes: - curr_dict[attribute] = line_attributes[attribute] + if line_attributes is not None: + for attribute in line_attributes: + curr_dict[attribute] = line_attributes[attribute] - nest_dict[line_tag].append(curr_dict) + nest_dict[line_tag].append(curr_dict) - curr_index += 1 - continue + curr_index += 1 + continue - if line_type in [self.LINE_TYPE_END_NEST]: - return nest_dict, curr_index + if line_type in [self.LINE_TYPE_END_NEST]: + return nest_dict, curr_index - return nest_dict, curr_index + return nest_dict, curr_index - def divide_into_lines(self, input_file_name): - curr_str = "" - curr_nest = list() - curr_nest_tags = list() # Treating it as a stack - start_brackets = 0 + def divide_into_lines(self, input_file_name): + curr_str = "" + curr_nest = list() + curr_nest_tags = list() # Treating it as a stack + start_brackets = 0 - with open(input_file_name) as input_file: - for line in input_file: - line_str = line.strip() + with open(input_file_name) as input_file: + for line in input_file: + line_str = line.strip() - for 
letter_index in range(len(line_str)): - letter = line_str[letter_index] - if letter == '<': - start_brackets += 1 - if letter == '>': - start_brackets -= 1 + for letter_index in range(len(line_str)): + letter = line_str[letter_index] + if letter == '<': + start_brackets += 1 + if letter == '>': + start_brackets -= 1 - next_letter = "" - if letter_index + 1 < len(line_str): - next_letter = line_str[letter_index + 1] + next_letter = "" + if letter_index + 1 < len(line_str): + next_letter = line_str[letter_index + 1] - curr_str += letter + curr_str += letter - if letter == '>' and (next_letter == '<' or next_letter == "") and start_brackets == 0: - # Only return if nesting - self.line = curr_str - line_parsed = self.convert_line() + if letter == '>' and (next_letter == '<' or next_letter == "") and start_brackets == 0: + # Only return if nesting + self.line = curr_str + line_parsed = self.convert_line() - tag = line_parsed.get(self.KEY_TAG, None) - assert tag != self.KEY_TEXT # This could cause a massive conflict, but it is unlikely - line_type = line_parsed.get(self.KEY_TYPE, None) - attribute_keys = line_parsed.get(self.KEY_ATTRIBUTES, dict()).keys() + tag = line_parsed.get(self.KEY_TAG, None) + assert tag != self.KEY_TEXT # This could cause a massive conflict, but it is unlikely + line_type = line_parsed.get(self.KEY_TYPE, None) + attribute_keys = line_parsed.get(self.KEY_ATTRIBUTES, dict()).keys() - if line_type != self.LINE_TYPE_IGNORE: - curr_nest.append(line_parsed) + if line_type != self.LINE_TYPE_IGNORE: + curr_nest.append(line_parsed) - output_nest = (line_type in [self.LINE_TYPE_ENTRY, self.LINE_TYPE_ENTRY_WITH_ATTR, self.LINE_TYPE_ENTRY_ONLY_ATTR] and len(curr_nest_tags) == 0) + output_nest = (line_type in [self.LINE_TYPE_ENTRY, self.LINE_TYPE_ENTRY_WITH_ATTR, self.LINE_TYPE_ENTRY_ONLY_ATTR] and len(curr_nest_tags) == 0) - if line_type in [self.LINE_TYPE_START_NEST, self.LINE_TYPE_START_NEST_WITH_ATTR]: - curr_nest_tags.append(tag) - elif line_type == self.LINE_TYPE_END_NEST: - popped_curr_nest_tag = curr_nest_tags.pop() - assert popped_curr_nest_tag == tag, curr_nest - if len(curr_nest_tags) == 0: - output_nest = True - if output_nest: - nest_dict, _ = self.convert_nest(curr_nest, 0) + if line_type in [self.LINE_TYPE_START_NEST, self.LINE_TYPE_START_NEST_WITH_ATTR]: + curr_nest_tags.append(tag) + elif line_type == self.LINE_TYPE_END_NEST: + popped_curr_nest_tag = curr_nest_tags.pop() + assert popped_curr_nest_tag == tag, curr_nest + if len(curr_nest_tags) == 0: + output_nest = True + if output_nest: + nest_dict, _ = self.convert_nest(curr_nest, 0) - self.processing_func(nest_dict) + self.processing_func(nest_dict) - curr_nest = list() - curr_nest_tag = str() + curr_nest = list() + curr_nest_tag = str() - curr_str = "" + curr_str = "" - if curr_str != "": - # divide lines by a space - curr_str += ' ' + if curr_str != "": + # divide lines by a space + curr_str += ' ' class OWLParser(): - def __init__(self, input_files, input_file_names, owl_file_path, output_file_name): - self.XML_TAG = "?xml" - self.RDF_TAG = "rdf:RDF" - self.DOCTYPE_TAG = "!DOCTYPE" - self.CLASS_TAG = "owl:Class" - self.RESTRICTION_TAG = "owl:Restriction" - self.SUBCLASS_TAG = "rdfs:subClassOf" - self.NODEID_TAG = "rdf:nodeID" - self.RDF_ABOUT_TAG = "rdf:about" - self.GENID_PREFIX = "genid" - - self.OWL_SOURCE_KEY = "owl_source" - self.OWL_SOURCE_NAME_KEY = "owl_source_name" - - self.skip_tags = [self.XML_TAG, self.RDF_TAG, self.DOCTYPE_TAG] - - self.ignored_attributes = ["xml:lang"] - - self.xml_parser = 
XMLParser(self.skip_tags, self.ignored_attributes, self.triage_nest_dict) - - self.GENID_REMAINING_NESTS = dict() - self.GENID_TO_ID = dict() - self.ID_TO_GENIDS = dict() - - self.input_files = input_files - self.input_file_names = input_file_names - self.owl_file_path = owl_file_path - self.output_file_name = output_file_name - - self.output_info = kg2_util.create_single_jsonlines() - self.output = self.output_info[0] - - def check_for_class_genids(self, nest_dict): - genids = list() - - nest_dict_classes = nest_dict.get(self.CLASS_TAG, list()) - for nest_class_index in range(len(nest_dict_classes)): - nest_class = nest_dict_classes[nest_class_index] - nest_subclasses = nest_class.get(self.SUBCLASS_TAG, list()) - for nest_subclass_index in range(len(nest_subclasses)): - nest_subclass = nest_subclasses[nest_subclass_index] - potential_genid = nest_subclass.get(self.NODEID_TAG, str()) - if potential_genid.startswith(self.GENID_PREFIX): - genids.append(potential_genid) - - return genids - - - def check_for_restriction_genids(self, nest_dict): - for nest_restriction in nest_dict.get(self.RESTRICTION_TAG, dict()): - potential_genid = nest_restriction.get(self.NODEID_TAG, str()) - if potential_genid.startswith(self.GENID_PREFIX): - return potential_genid - return None - - def extract_class_id(self, nest_dict): - nest_dict_classes = nest_dict.get(self.CLASS_TAG, list()) - # Can't have competing class_ids - assert len(nest_dict_classes) <= 1 - - for nest_class_index in range(len(nest_dict_classes)): - nest_class = nest_dict_classes[nest_class_index] - return nest_class.get(self.RDF_ABOUT_TAG, str()) - - def store_genid_nest_in_class_nest(self, genid, genid_nest, class_nest): - output_class_nest = class_nest - - nest_dict_classes = class_nest.get(self.CLASS_TAG, list()) - for nest_class_index in range(len(nest_dict_classes)): - nest_class = nest_dict_classes[nest_class_index] - nest_subclasses = nest_class.get(self.SUBCLASS_TAG, list()) - for nest_subclass_index in range(len(nest_subclasses)): - nest_subclass = nest_subclasses[nest_subclass_index] - potential_genid = nest_subclass.get(self.NODEID_TAG, str()) - if potential_genid == genid: - output_class_nest[self.CLASS_TAG][nest_class_index][self.SUBCLASS_TAG][nest_subclass_index][self.RESTRICTION_TAG] = genid_nest[self.RESTRICTION_TAG] - - return output_class_nest - - - def write_to_output(self, output_dict, source_file): - output_dict[self.OWL_SOURCE_KEY] = source_file - output_dict[self.OWL_SOURCE_NAME_KEY] = self.input_file_names[source_file] - self.output.write(output_dict) - - return - - - def triage_nest_dict(self, nest_dict): - genids = self.check_for_class_genids(nest_dict) - restriction_genid = self.check_for_restriction_genids(nest_dict) - class_id = self.extract_class_id(nest_dict) - - if len(genids) > 0: - for genid in genids: - self.GENID_TO_ID[genid] = class_id - self.ID_TO_GENIDS[class_id] = genids - self.GENID_REMAINING_NESTS[class_id] = nest_dict - elif restriction_genid is not None: - class_id = self.GENID_TO_ID.get(restriction_genid, str()) - if len(class_id) == 0: - print("WARNING WITH:", restriction_genid, "- NO CLASS_ID FOUND") - - # Save to output despite not matching with an existing class - self.write_to_output(nest_dict, self.input_file) - return - class_nest = self.GENID_REMAINING_NESTS[class_id] - self.ID_TO_GENIDS[class_id].remove(restriction_genid) - updated_class_nest = self.store_genid_nest_in_class_nest(restriction_genid, nest_dict, class_nest) - - if len(self.ID_TO_GENIDS[class_id]) > 0: - 
self.GENID_REMAINING_NESTS[class_id] = updated_class_nest - else: - # Since all of the genids used in this class have been matched, output - self.write_to_output(nest_dict, self.input_file) - self.GENID_REMAINING_NESTS[class_id] = None - else: - # There are no genids that need to be worked with, so just output - self.write_to_output(nest_dict, self.input_file) - - - def parse_OWL_file(self): - for input_file in self.input_files: - self.input_file = input_file - print("Reading:", input_file, "starting at", date()) - self.xml_parser.divide_into_lines(self.owl_file_path + input_file) - - # Genid wasn't filled, still want to include them though - for item in self.GENID_REMAINING_NESTS: - if self.GENID_REMAINING_NESTS[item] != None: - self.write_to_output(self.GENID_REMAINING_NESTS[item], self.input_file) - - # Refresh everything for the next file - self.GENID_REMAINING_NESTS = dict() - self.GENID_TO_ID = dict() - self.ID_TO_GENIDS = dict() - - kg2_util.close_single_jsonlines(self.output_info, self.output_file_name) + def __init__(self, input_files, input_file_names, owl_file_path, output_file_name): + self.XML_TAG = "?xml" + self.RDF_TAG = "rdf:RDF" + self.DOCTYPE_TAG = "!DOCTYPE" + self.CLASS_TAG = "owl:Class" + self.RESTRICTION_TAG = "owl:Restriction" + self.SUBCLASS_TAG = "rdfs:subClassOf" + self.NODEID_TAG = "rdf:nodeID" + self.RDF_ABOUT_TAG = "rdf:about" + self.GENID_PREFIX = "genid" + + self.OWL_SOURCE_KEY = "owl_source" + self.OWL_SOURCE_NAME_KEY = "owl_source_name" + + self.skip_tags = [self.XML_TAG, self.RDF_TAG, self.DOCTYPE_TAG] + + self.ignored_attributes = ["xml:lang"] + + self.xml_parser = XMLParser(self.skip_tags, self.ignored_attributes, self.triage_nest_dict) + + self.GENID_REMAINING_NESTS = dict() + self.GENID_TO_ID = dict() + self.ID_TO_GENIDS = dict() + + self.input_files = input_files + self.input_file_names = input_file_names + self.owl_file_path = owl_file_path + self.output_file_name = output_file_name + + self.output_info = kg2_util.create_single_jsonlines() + self.output = self.output_info[0] + + def check_for_class_genids(self, nest_dict): + genids = list() + + nest_dict_classes = nest_dict.get(self.CLASS_TAG, list()) + for nest_class_index in range(len(nest_dict_classes)): + nest_class = nest_dict_classes[nest_class_index] + nest_subclasses = nest_class.get(self.SUBCLASS_TAG, list()) + for nest_subclass_index in range(len(nest_subclasses)): + nest_subclass = nest_subclasses[nest_subclass_index] + potential_genid = nest_subclass.get(self.NODEID_TAG, str()) + if potential_genid.startswith(self.GENID_PREFIX): + genids.append(potential_genid) + + return genids + + + def check_for_restriction_genids(self, nest_dict): + for nest_restriction in nest_dict.get(self.RESTRICTION_TAG, dict()): + potential_genid = nest_restriction.get(self.NODEID_TAG, str()) + if potential_genid.startswith(self.GENID_PREFIX): + return potential_genid + return None + + def extract_class_id(self, nest_dict): + nest_dict_classes = nest_dict.get(self.CLASS_TAG, list()) + # Can't have competing class_ids + assert len(nest_dict_classes) <= 1 + + for nest_class_index in range(len(nest_dict_classes)): + nest_class = nest_dict_classes[nest_class_index] + return nest_class.get(self.RDF_ABOUT_TAG, str()) + + def store_genid_nest_in_class_nest(self, genid, genid_nest, class_nest): + output_class_nest = class_nest + + nest_dict_classes = class_nest.get(self.CLASS_TAG, list()) + for nest_class_index in range(len(nest_dict_classes)): + nest_class = nest_dict_classes[nest_class_index] + nest_subclasses = 
nest_class.get(self.SUBCLASS_TAG, list()) + for nest_subclass_index in range(len(nest_subclasses)): + nest_subclass = nest_subclasses[nest_subclass_index] + potential_genid = nest_subclass.get(self.NODEID_TAG, str()) + if potential_genid == genid: + output_class_nest[self.CLASS_TAG][nest_class_index][self.SUBCLASS_TAG][nest_subclass_index][self.RESTRICTION_TAG] = genid_nest[self.RESTRICTION_TAG] + + return output_class_nest + + + def write_to_output(self, output_dict, source_file): + output_dict[self.OWL_SOURCE_KEY] = source_file + output_dict[self.OWL_SOURCE_NAME_KEY] = self.input_file_names[source_file] + self.output.write(output_dict) + + return + + + def triage_nest_dict(self, nest_dict): + genids = self.check_for_class_genids(nest_dict) + restriction_genid = self.check_for_restriction_genids(nest_dict) + class_id = self.extract_class_id(nest_dict) + + if len(genids) > 0: + for genid in genids: + self.GENID_TO_ID[genid] = class_id + self.ID_TO_GENIDS[class_id] = genids + self.GENID_REMAINING_NESTS[class_id] = nest_dict + elif restriction_genid is not None: + class_id = self.GENID_TO_ID.get(restriction_genid, str()) + if len(class_id) == 0: + print("WARNING WITH:", restriction_genid, "- NO CLASS_ID FOUND") + + # Save to output despite not matching with an existing class + self.write_to_output(nest_dict, self.input_file) + return + class_nest = self.GENID_REMAINING_NESTS[class_id] + self.ID_TO_GENIDS[class_id].remove(restriction_genid) + updated_class_nest = self.store_genid_nest_in_class_nest(restriction_genid, nest_dict, class_nest) + + if len(self.ID_TO_GENIDS[class_id]) > 0: + self.GENID_REMAINING_NESTS[class_id] = updated_class_nest + else: + # Since all of the genids used in this class have been matched, output + self.write_to_output(nest_dict, self.input_file) + self.GENID_REMAINING_NESTS[class_id] = None + else: + # There are no genids that need to be worked with, so just output + self.write_to_output(nest_dict, self.input_file) + + + def parse_OWL_file(self): + for input_file in self.input_files: + self.input_file = input_file + print("Reading:", input_file, "starting at", date()) + self.xml_parser.divide_into_lines(self.owl_file_path + input_file) + + # Genid wasn't filled, still want to include them though + for item in self.GENID_REMAINING_NESTS: + if self.GENID_REMAINING_NESTS[item] != None: + self.write_to_output(self.GENID_REMAINING_NESTS[item], self.input_file) + + # Refresh everything for the next file + self.GENID_REMAINING_NESTS = dict() + self.GENID_TO_ID = dict() + self.ID_TO_GENIDS = dict() + + kg2_util.close_single_jsonlines(self.output_info, self.output_file_name) def identify_and_download_input_files(ont_load_inventory, path_to_owl_files): - input_files = list() - input_file_names = dict() - owl_file_path = path_to_owl_files.rstrip('/') + "/" - for item in ont_load_inventory: - input_files.append(item['file']) - input_file_names[item['file']] = item['title'] - print("Downloading:", item['file'], "starting at", date()) - kg2_util.download_file_if_not_exist_locally(item['url'], owl_file_path + item['file']) - print("Download of:", item['file'], "finished at", date()) - - return input_files, input_file_names, owl_file_path + input_files = list() + input_file_names = dict() + owl_file_path = path_to_owl_files.rstrip('/') + "/" + for item in ont_load_inventory: + input_files.append(item['file']) + input_file_names[item['file']] = item['title'] + print("Downloading:", item['file'], "starting at", date()) + kg2_util.download_file_if_not_exist_locally(item['url'], 
owl_file_path + item['file']) + print("Download of:", item['file'], "finished at", date()) + + return input_files, input_file_names, owl_file_path if __name__ == '__main__': - args = get_args() - input_file_name = args.inputFile - owl_path = args.owlFilePath - output_file_name = args.outputFile - - ont_load_inventory = kg2_util.safe_load_yaml_from_string(kg2_util.read_file_to_string(input_file_name)) - input_files, input_file_names, owl_file_path = identify_and_download_input_files(ont_load_inventory, owl_path) - - print("Files:", input_files) - print("Start Time:", date()) - owl_parser = OWLParser(input_files, input_file_names, owl_file_path, output_file_name) - owl_parser.parse_OWL_file() - print("End Time:", date()) \ No newline at end of file + args = get_args() + input_file_name = args.inputFile + owl_path = args.owlFilePath + output_file_name = args.outputFile + + ont_load_inventory = kg2_util.safe_load_yaml_from_string(kg2_util.read_file_to_string(input_file_name)) + input_files, input_file_names, owl_file_path = identify_and_download_input_files(ont_load_inventory, owl_path) + + print("Files:", input_files) + print("Start Time:", date()) + owl_parser = OWLParser(input_files, input_file_names, owl_file_path, output_file_name) + owl_parser.parse_OWL_file() + print("End Time:", date()) \ No newline at end of file From 59c6192b9e5391f21fb331bee3106aaed597f2f3 Mon Sep 17 00:00:00 2001 From: ecwood Date: Mon, 2 Sep 2024 02:58:05 -0700 Subject: [PATCH 094/125] #387 comments about the inner workings of ontologies conversion --- convert/ontologies_jsonl_to_kg_jsonl.py | 135 ++++++++++++++++++++++-- 1 file changed, 125 insertions(+), 10 deletions(-) diff --git a/convert/ontologies_jsonl_to_kg_jsonl.py b/convert/ontologies_jsonl_to_kg_jsonl.py index a27561f4..b0adf675 100644 --- a/convert/ontologies_jsonl_to_kg_jsonl.py +++ b/convert/ontologies_jsonl_to_kg_jsonl.py @@ -34,6 +34,7 @@ COMMENT_PREFIX = "COMMENTS: " DESCRIPTION_DELIM = " // " +# Encoding styles for different predicates BASE_EDGE_TYPES = {"mondo-base:exactMatch": RESOURCE_KEY, "mondo-base:closeMatch": RESOURCE_KEY, "mondo-base:relatedMatch": RESOURCE_KEY, @@ -56,25 +57,35 @@ "oboInOwl:hasDbXref": TEXT_KEY, "oboInOwl:xref": TEXT_KEY} +# Mapping structure used to recursively determine node category CLASS_TO_SUPERCLASSES = dict() + +# Node information storage, for while categories are determined SAVED_NODE_INFO = dict() + +# Storage for source information SOURCE_INFO = dict() +# Used to store the category of nodes both from curies-to-categories.yaml and as they are recursively mapped NODE_CATEGORY_MAPPINGS = dict() -PREFIX_MAPPINGS = dict() -CLASSES_DICT = dict() +# Used to store the prefix mappings from curies-to-categories.yaml +PREFIX_MAPPINGS = dict() +# Used to store extracted information from curies-to-urls-map.yaml URI_MAP = dict() URI_MAP_KEYS = list() PREFIX_TO_IRI_MAP = dict() +# Prefixes for owl:Class elements that were unable to be mapped with curies-to-urls-map.yaml MISSING_ID_PREFIXES = set() +# Category mapping techniques FILE_MAPPING = "file" PREFIX_MAPPING = "prefix" RECURSE_MAPPING = "recurse" +# Keys for saving node and edges information between its initial processing and node/edge creation ID_KEY = "id" DEPRECATED_KEY = "deprecated" UPDATE_DATE_KEY = "update_date" @@ -89,6 +100,7 @@ IRI_KEY = "iri" VERSION_KEY = "version" + def get_args(): arg_parser = argparse.ArgumentParser() arg_parser.add_argument('--test', dest='test', @@ -100,26 +112,42 @@ def get_args(): arg_parser.add_argument('outputEdgesFile', type=str) 
return arg_parser.parse_args() + def categorize_node(node_id, recursion_depth=0): + """ + Recursively navigate the hierarchy of node superclasses to identify the optimal categorization for a node. + If a particular category for a node is desired, classify it as such within curies-to-categories.yaml. + """ + # First, retrieve the node prefix node_prefix = node_id.split(':')[0] + # If the node is directly mapped in curies-to-categories.yaml, utilize that mapping + # The [1] field of NODE_CATEGORY_MAPPINGS[node_id] refers to the way that node was mapped if node_id in NODE_CATEGORY_MAPPINGS and NODE_CATEGORY_MAPPINGS[node_id][1] == FILE_MAPPING: return NODE_CATEGORY_MAPPINGS[node_id][0] + # If the node isn't in curies-to-categories.yaml, but its prefix is, use that mapping if node_prefix in PREFIX_MAPPINGS: node_category = PREFIX_MAPPINGS[node_prefix] NODE_CATEGORY_MAPPINGS[node_id] = (node_category, PREFIX_MAPPING) return PREFIX_MAPPINGS[node_prefix] - # Get try to get the most common superclass categorization + # Try to get the most common superclass categorization (naive method for picking category of nodes with multiple superclasses) + # Initialize the category as named thing as a default superclass_categorizations = dict() highest_value = 0 highest_category = kg2_util.BIOLINK_CATEGORY_NAMED_THING + + # To avoid unnecessary recursion, stop at 10 layers (prevents errors in subclass cycles) if recursion_depth == 10: return kg2_util.BIOLINK_CATEGORY_NAMED_THING + # Perform the recursive mapping search for all of the node's superclasses for superclass in CLASS_TO_SUPERCLASSES.get(node_id, list()): + # First, recurse superclass_category = categorize_node(superclass, recursion_depth + 1) + + # Then, determine the optimal categorization for the node based on naive category determination method if superclass_category not in superclass_categorizations: superclass_categorizations[superclass_category] = 0 superclass_categorizations[superclass_category] += 1 @@ -127,10 +155,15 @@ def categorize_node(node_id, recursion_depth=0): highest_value = superclass_categorizations[superclass_category] highest_category = superclass_category + # Save the categorization NODE_CATEGORY_MAPPINGS[node_id] = (highest_category, RECURSE_MAPPING) return highest_category + def reformat_obo_date(date_str): + """ + Reformat a date from an OWL field and save it as a datetime object for comparison purposes, handling a variety of input date styles + """ if date_str is None: return None @@ -168,7 +201,11 @@ def reformat_obo_date(date_str): return datetime.datetime(year, month, day) + def pick_most_recent_date(dates, alternate_date=None): + """ + Given a list of datetime objects, determine the most recent one + """ latest_date = None for date in dates: if date == None: @@ -184,10 +221,17 @@ def pick_most_recent_date(dates, alternate_date=None): return latest_date.isoformat(sep=' ') + def process_ontology_term(ontology_node, source, ontology_name, owl_source=True): + """ + Given an owl:Ontology (or analogous) element, determine all of the relevant attributes to construct a source node + """ + # Only use the owl prefix on terms if it is an owl_source (i.e., not ORDO) owl_prefix = "" if owl_source: owl_prefix = "owl:" + + # Determine the version of the ontology through one of the three encoding methods (version number, version IRI, or date) ontology_version = None ontology_versions = [version.get(TEXT_KEY, str()) for version in ontology_node.get(owl_prefix + "versionInfo", list()) if TEXT_KEY in version] ontology_version_iri = 
[version.get(RESOURCE_KEY, str()) for version in ontology_node.get(owl_prefix + "versionIRI", list()) if RESOURCE_KEY in version]
@@ -196,6 +240,7 @@ def process_ontology_term(ontology_node, source, ontology_name, owl_source=True)
     if len(ontology_versions) == 1:
         ontology_version = ontology_versions[0]
     elif len(ontology_version_iri) == 1:
+        # Strip the version number out of the IRI
         ontology_version = ontology_version_iri[0]
         version_replacements = [ontology_iri.replace('.owl', '') + '/', '/' + source, 'releases/']
         for replacement in version_replacements:
@@ -204,22 +249,33 @@ def process_ontology_term(ontology_node, source, ontology_name, owl_source=True)
     elif len(ontology_dates) >= 1:
         ontology_version = pick_most_recent_date(ontology_dates)
 
+    # Issue a warning if there is no versioning information
     if ontology_version is None:
         print("Warning: source", source, "lacks any versioning information.")
 
     ontology_date = reformat_obo_date(pick_most_recent_date(ontology_dates))
+
+    # Convert the source file name to a CURIE ID
     source_id = kg2_util.CURIE_PREFIX_OBO + ':' + source
 
+    # Add the source information to the SOURCE_INFO dictionary to later be made into a source node
     if source not in SOURCE_INFO:
         SOURCE_INFO[source] = {SOURCE_KEY: source_id, IRI_KEY: ontology_iri, NAME_KEY: ontology_name, UPDATE_DATE_KEY: ontology_date, VERSION_KEY: ontology_version}
 
 
 def process_ontology_class(owl_class, source, ontology_name, owl_source=True):
+    """
+    Given an owl:Class (or analogous) element, strip out all of the relevant data to construct the node and edges
+    """
+    # Only use the owl prefix on terms if it is an owl_source (i.e., not ORDO)
     owl_prefix = ""
     if owl_source:
         owl_prefix = "owl:"
-    # Typically genid classes which don't neatly map onto the KG2 schema
+
+    # Configure the node_id and node_iri
+    # We only want to construct nodes for standard nodes that fit into KG2 mappings
     if ID_TAG not in owl_class:
+        # These are typically genid classes which don't neatly map onto the KG2 schema
         return
     node_id = match_prefix(owl_class.get(ID_TAG, str()))
     if node_id is None:
@@ -229,6 +285,7 @@ def process_ontology_class(owl_class, source, ontology_name, owl_source=True):
 
     # Configure the name
     name_list = [name.get(TEXT_KEY, None) for name in owl_class.get("rdfs:label", dict()) if TEXT_KEY in name]
+    # Return if the node has no names
    if len(name_list) == 0:
         return
 
@@ -240,6 +297,7 @@ def process_ontology_class(owl_class, source, ontology_name, owl_source=True):
     description_list += [description.get(TEXT_KEY, None) for description in owl_class.get("obo:UBPROP_0000005", list()) if (TEXT_KEY in description)]
     description_list += [description.get(TEXT_KEY, None) for description in owl_class.get("efo1:source_description", list()) if (TEXT_KEY in description)]
 
+    # Determine whether the node is deprecated
     deprecated = "true" in owl_class.get(owl_prefix + "deprecated", list())
     for name in name_list:
         search_name = name.lower()
@@ -253,15 +311,18 @@ def process_ontology_class(owl_class, source, ontology_name, owl_source=True):
                     "obo:IAO_0000028",
                     "skos:prefLabel"]
     synonym_list += [synonym.get(TEXT_KEY, None) for synonym_key in synonym_keys for synonym in owl_class.get(synonym_key, list()) if (TEXT_KEY in synonym)]
 
+    # Configure the update date
     update_date_list = list()
     update_date_keys = ["dc:date", "dcterms:date", "terms:date"]
     update_date_list += [reformat_obo_date(update_date.get(TEXT_KEY, None)) for update_date_key in update_date_keys for update_date in owl_class.get(update_date_key, list()) if (TEXT_KEY in update_date)]
 
+    # Configure the creation date
     creation_date_list = list()
     creation_date_keys = ["oboInOwl:creation_date", "go:creation_date"]
     creation_date_list += [reformat_obo_date(creation_date.get(TEXT_KEY, None)) for creation_date_key in creation_date_keys for creation_date in owl_class.get(creation_date_key, list()) if (TEXT_KEY in creation_date)]
 
     # Configure the biological sequence
+    # We are only currently using inchi, but we might as well extract all of them in case this changes in the future
     has_biological_sequence = dict()
     has_biological_sequence['formula'] = [biological_sequence.get(TEXT_KEY, None) for biological_sequence in owl_class.get("chebi:formula", list()) if TEXT_KEY in biological_sequence]
     has_biological_sequence['smiles'] = [biological_sequence.get(TEXT_KEY, None) for biological_sequence in owl_class.get("chebi:smiles", list()) if TEXT_KEY in biological_sequence]
@@ -271,12 +332,13 @@ def process_ontology_class(owl_class, source, ontology_name, owl_source=True):
 
     # Extract edge triples
     edges_list = list()
+    # First, extract the edges with predicates that easily map to extraction patterns
     for edge_type in BASE_EDGE_TYPES:
         for edge in owl_class.get(edge_type, list()):
             if BASE_EDGE_TYPES[edge_type] in edge:
                 edges_list.append((edge_type, edge.get(BASE_EDGE_TYPES[edge_type], None)))
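+    # (Illustrative sketch of the two extraction patterns above -- invented values,
+    #  not taken from a real ontology: an element parsed from
+    #    <oboInOwl:hasDbXref>MESH:D000001</oboInOwl:hasDbXref>
+    #  is picked up through its TEXT_KEY as ("oboInOwl:hasDbXref", "MESH:D000001"),
+    #  while <mondo-base:exactMatch rdf:resource="..."/> is picked up through RESOURCE_KEY.)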
 
-
+    # Next, identify the edges which are tightly nested under a layer of other information
     restriction_edges = list()
     restriction_edges += [(edge, "rdfs:subClassOf") for edge in owl_class.get("rdfs:subClassOf", list())]
     for equiv in owl_class.get(owl_prefix + "equivalentClass", list()):
@@ -284,10 +346,14 @@ def process_ontology_class(owl_class, source, ontology_name, owl_source=True):
         for edge in mini_class.get(owl_prefix + "intersectionOf", list()):
             restriction_edges.append((edge, owl_prefix + "equivalentClass"))
 
+    # Then, extract the actual information from those edges
     for (edge, general_edge_type) in restriction_edges:
+        # First, handle those with the restriction elements (the owl:EquivalentClass and rdfs:subClassOf sub-predicate cases)
         for restriction in edge.get(owl_prefix + "Restriction", list()):
             edge_type = restriction.get(owl_prefix + "onProperty", list())
             edge_object = restriction.get(owl_prefix + "someValuesFrom", list())
+
+            # Ensure each of those lists has only one item, so that we can pull item [0] in the next step to correctly identify the respective information
             if len(edge_type) != 1:
                 assert len(edge_type) <= 1, edge
                 continue
@@ -300,31 +366,41 @@ def process_ontology_class(owl_class, source, ontology_name, owl_source=True):
             if edge_type != None and edge_object != None:
                 edges_list.append((edge_type, edge_object))
 
+        # Then handle the generic rdfs:subClassOf case
         if RESOURCE_KEY in edge:
             edges_list.append((general_edge_type, edge.get(RESOURCE_KEY, None)))
 
+    # Convert the edge relations and objects into CURIEs so they can later be turned into KG2 edges
     superclasses = set()
     final_edges_list = list()
     for (edge_relation, edge_object) in edges_list:
+        # Ensure the edge corresponds to a KG2 mapping
         edge_object = match_prefix(edge_object)
         if edge_object is None:
             continue
         edge_relation = match_prefix(edge_relation)
         if edge_relation is None:
             continue
+
+        # Identify superclass relationships
         if edge_relation in ["rdfs:subClassOf"]:
             superclasses.add(edge_object)
+
+        # Add the processed edge to the list of this node's edges
         final_edges_list.append((edge_relation, edge_object))
 
-    # Imperfect way to make it deterministic
+    # Formally save the superclass relations to the superclass hierarchy
+    # This is an imperfect way to make it 
deterministic; We don't want duplicate superclasses, but we also want the order to remain the same across runs superclasses = sorted(list(superclasses)) if node_id not in CLASS_TO_SUPERCLASSES: CLASS_TO_SUPERCLASSES[node_id] = list() CLASS_TO_SUPERCLASSES[node_id] += superclasses CLASS_TO_SUPERCLASSES[node_id] = sorted(list(set(CLASS_TO_SUPERCLASSES[node_id]))) + # Make sure that we have an entry for the node (since multiple sources can have information on a particular node) if node_id not in SAVED_NODE_INFO: SAVED_NODE_INFO[node_id] = list() + # Save this source's version of the node information SAVED_NODE_INFO[node_id].append({ID_KEY: node_id, DEPRECATED_KEY: deprecated, UPDATE_DATE_KEY: update_date_list, @@ -337,34 +413,46 @@ def process_ontology_class(owl_class, source, ontology_name, owl_source=True): IRI_KEY: node_iri, EDGES_KEY: final_edges_list}) + def process_ontology_item(ontology_item): + """ + Handler for processing ontology subsets + """ + # Extract these custom input attributes (parts of ont-load-inventory.yaml put into owlparser.py output) source = ontology_item.get(OWL_SOURCE_KEY, str()) ontology_name = ontology_item.get(OWL_SOURCE_NAME_KEY, str()) for owl_class in ontology_item.get("owl:Class", list()): process_ontology_class(owl_class, source, ontology_name) + # Special case for non-owl prefix sources (e.g. ORDO) for owl_class in ontology_item.get("Class", list()): process_ontology_class(owl_class, source, ontology_name, False) for ontology_node in ontology_item.get("owl:Ontology", list()): process_ontology_term(ontology_node, source, ontology_name) - # Because of ORDO + # Special case for non-owl prefix sources (e.g. ORDO) for ontology_node in ontology_item.get("Ontology", list()): process_ontology_term(ontology_node, source, ontology_name, False) + def generate_uri_map(curies_to_urls_file_name): + """ + Import the curies-to-urls-map.yaml for use in CURIE ID and IRI resolution + """ uri_input_map = kg2_util.safe_load_yaml_from_string(kg2_util.read_file_to_string(curies_to_urls_file_name)) bidirectional_map = uri_input_map['use_for_bidirectional_mapping'] contraction_map = uri_input_map['use_for_contraction_only'] + # Import the bidirectional map for both ID mapping (URI_MAP) and IRI expansion, given the standard prefix (PREFIX_TO_IRI_MAP) for curie_prefix_dict in bidirectional_map: for curie_prefix in curie_prefix_dict: curie_url = curie_prefix_dict[curie_prefix] URI_MAP[curie_url] = curie_prefix PREFIX_TO_IRI_MAP[curie_prefix] = curie_url + # Import the contraction map for ID mapping (URI_MAP) for curie_prefix_dict in contraction_map: for curie_prefix in curie_prefix_dict: curie_url = curie_prefix_dict[curie_prefix] @@ -375,11 +463,17 @@ def generate_uri_map(curies_to_urls_file_name): global URI_MAP_KEYS URI_MAP_KEYS = sorted(URI_MAP.keys(), key=len, reverse=True) + def match_prefix(node_id): + """ + Given a node_id from an ontology (possibly actually an IRI), return the KG2-standard CURIE ID for the node + """ + # Iterate through the map keys, comparing with the longest urls first (for the most accurate match) for curie_url in URI_MAP_KEYS: if node_id.startswith(curie_url): return node_id.replace(curie_url, URI_MAP[curie_url] + ":") + # If there is no match, attempt to distill down the ID into just the prefix (not always possible) and add it to the list of prefixes not in KG2 if "http" in node_id: MISSING_ID_PREFIXES.add('/'.join(node_id.split('/')[0:-1]) + "/") elif ':' in node_id: @@ -389,7 +483,12 @@ def match_prefix(node_id): else: MISSING_ID_PREFIXES.add(node_id) 
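+
+# A minimal sketch of the intended behavior of match_prefix (hypothetical IRIs,
+# assuming "GO" is one of the prefixes in curies-to-urls-map.yaml; not output
+# from a real run):
+#   match_prefix("http://purl.obolibrary.org/obo/GO_0008150") -> "GO:0008150"
+#   match_prefix("http://example.org/unmapped/term/123") -> None
+# In the unmapped case, "http://example.org/unmapped/term/" is recorded in
+# MISSING_ID_PREFIXES.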
+
 def construct_nodes_and_edges(nodes_output, edges_output):
+    """
+    Output the nodes and edges from the ontologies once the node information has been extracted and categories have been assigned
+    """
+    # Construct all of the source nodes
     for source in SOURCE_INFO:
         source_date = pick_most_recent_date([SOURCE_INFO[source][UPDATE_DATE_KEY]])
         source_name = SOURCE_INFO[source][NAME_KEY] + " v" + SOURCE_INFO[source][VERSION_KEY]
@@ -399,11 +498,14 @@ def construct_nodes_and_edges(nodes_output, edges_output):
 
         nodes_output.write(node)
 
-
+    # Construct the regular nodes and edges
     for node_id in SAVED_NODE_INFO:
+        # Iterate across all of the sources which have defined this node
         for source_node_index in range(len(SAVED_NODE_INFO[node_id])):
+            # Ignore deprecated nodes
             if SAVED_NODE_INFO[node_id][source_node_index][DEPRECATED_KEY]:
                 continue
 
+            # Extract all of the information from the SAVED_NODE_INFO dictionary
             name = SAVED_NODE_INFO[node_id][source_node_index][NAME_KEY][0] # Imperfect way of choosing the name
             node_iri = SAVED_NODE_INFO[node_id][source_node_index][IRI_KEY]
             description = DESCRIPTION_DELIM.join(SAVED_NODE_INFO[node_id][source_node_index][DESCRIPTION_KEY])
@@ -411,21 +513,26 @@ def construct_nodes_and_edges(nodes_output, edges_output):
             synonyms = SAVED_NODE_INFO[node_id][source_node_index][SYNONYM_KEY]
             category = SAVED_NODE_INFO[node_id][source_node_index][CATEGORY_KEY]
 
+            # Obtain source information
             source = SAVED_NODE_INFO[node_id][source_node_index][SOURCE_KEY]
             provided_by = kg2_util.CURIE_PREFIX_OBO + ':' + source
             source_date = SOURCE_INFO[source][UPDATE_DATE_KEY]
 
+            # Determine the node's dates
             update_date = pick_most_recent_date(SAVED_NODE_INFO[node_id][source_node_index][UPDATE_DATE_KEY], source_date)
             creation_date = pick_most_recent_date(SAVED_NODE_INFO[node_id][source_node_index][CREATION_DATE_KEY], source_date)
 
+            # Construct the node and add in the other attributes
             node = kg2_util.make_node(node_id, node_iri, name, category, update_date, provided_by)
             node["description"] = description
             node["has_biological_sequence"] = has_biological_sequence
             node["creation_date"] = creation_date
             node["synonym"] = synonyms
 
+            # Output the node
             nodes_output.write(node)
 
+            # Construct the edges from the triples saved
             for (edge_relation, edge_object) in SAVED_NODE_INFO[node_id][source_node_index][EDGES_KEY]:
                 relation_label = edge_relation.split(':')[1]
                 edge = kg2_util.make_edge(node_id, edge_object, edge_relation, relation_label, provided_by, update_date)
@@ -433,9 +540,10 @@ def construct_nodes_and_edges(nodes_output, edges_output):
 
                 edges_output.write(edge)
 
-
 if __name__ == '__main__':
     print("Start time: ", kg2_util.date())
+
+    # Obtain all of the input arguments
     args = get_args()
     input_file_name = args.inputFile
     curies_to_categories_file_name = args.curiesToCategoriesYAML
@@ -444,30 +552,37 @@ def construct_nodes_and_edges(nodes_output, edges_output):
     output_edges_file_name = args.outputEdgesFile
     test_mode = args.test
 
+    # Create the output files
     nodes_info, edges_info = kg2_util.create_kg2_jsonlines(test_mode)
     nodes_output = nodes_info[0]
     edges_output = edges_info[0]
 
+    # Prepare the node category dictionaries with the information from curies-to-categories.yaml
     curies_to_categories_data = kg2_util.safe_load_yaml_from_string(kg2_util.read_file_to_string(curies_to_categories_file_name))
     for mapping_node in curies_to_categories_data["term-mappings"]:
         NODE_CATEGORY_MAPPINGS[mapping_node] = (curies_to_categories_data["term-mappings"][mapping_node], FILE_MAPPING)
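+    # (A rough sketch of the expected curies-to-categories.yaml shape, with invented
+    #  entries where "FOO" is a hypothetical prefix and each value is a category label:
+    #    term-mappings:
+    #      FOO:0000001: biological process
+    #    prefix-mappings:
+    #      FOO: named thing
+    #  Term mappings win over prefix mappings during categorization.)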
     for prefix in curies_to_categories_data["prefix-mappings"]:
         PREFIX_MAPPINGS[prefix] = curies_to_categories_data["prefix-mappings"][prefix]
 
+    # Begin reading the JSON Lines input file containing all of the ontologies
     input_read_jsonlines_info = kg2_util.start_read_jsonlines(input_file_name)
     input_data = input_read_jsonlines_info[0]
 
-    ontology_prefixes = set()
+    # Prepare the URI maps for mapping ontology information to KG2 CURIE IDs and IRIs
     generate_uri_map(curies_to_urls_file_name)
+
+    # Extract all of the necessary information from the ontologies
     for ontology_item in input_data:
         process_ontology_item(ontology_item)
 
+    # Categorize every node and save the result in each node's information dictionary
     for node_id in SAVED_NODE_INFO:
         categorize_node(node_id)
         node_category = NODE_CATEGORY_MAPPINGS[node_id][0]
         for index in range(len(SAVED_NODE_INFO[node_id])):
             SAVED_NODE_INFO[node_id][index][CATEGORY_KEY] = node_category
 
+    # Save all of the node and edge information in KG2 format
     construct_nodes_and_edges(nodes_output, edges_output)
 
     kg2_util.close_kg2_jsonlines(nodes_info, edges_info, output_nodes_file_name, output_edges_file_name)

From b0850552d01a9ac2d93ef7b6a8ead23aba16dc7c Mon Sep 17 00:00:00 2001
From: ecwood
Date: Mon, 2 Sep 2024 03:00:26 -0700
Subject: [PATCH 095/125] #387 archiving multi ont

---
 convert/{ => archive}/multi_ont_to_kg_jsonl.py | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename convert/{ => archive}/multi_ont_to_kg_jsonl.py (100%)

diff --git a/convert/multi_ont_to_kg_jsonl.py b/convert/archive/multi_ont_to_kg_jsonl.py
similarity index 100%
rename from convert/multi_ont_to_kg_jsonl.py
rename to convert/archive/multi_ont_to_kg_jsonl.py

From 321981cc505cb3d71abc740394693ed553230d7d Mon Sep 17 00:00:00 2001
From: ecwood
Date: Mon, 2 Sep 2024 03:00:59 -0700
Subject: [PATCH 096/125] #387 archiving build multi ont

---
 convert/{ => archive}/build-multi-ont-kg.sh | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename convert/{ => archive}/build-multi-ont-kg.sh (100%)

diff --git a/convert/build-multi-ont-kg.sh b/convert/archive/build-multi-ont-kg.sh
similarity index 100%
rename from convert/build-multi-ont-kg.sh
rename to convert/archive/build-multi-ont-kg.sh

From cea05b7b9645fe02636db2e1c8fa562cfabf92f9 Mon Sep 17 00:00:00 2001
From: ecwood
Date: Mon, 2 Sep 2024 03:02:40 -0700
Subject: [PATCH 097/125] updating executability for newer files

---
 convert/clinicaltrialskg_tsv_to_kg_jsonl.py | 0
 convert/ontologies_jsonl_to_kg_jsonl.py     | 0
 convert/umls_list_jsonl_to_kg_jsonl.py      | 0
 extract/extract-clinicaltrialskg.sh         | 0
 4 files changed, 0 insertions(+), 0 deletions(-)
 mode change 100644 => 100755 convert/clinicaltrialskg_tsv_to_kg_jsonl.py
 mode change 100644 => 100755 convert/ontologies_jsonl_to_kg_jsonl.py
 mode change 100644 => 100755 convert/umls_list_jsonl_to_kg_jsonl.py
 mode change 100644 => 100755 extract/extract-clinicaltrialskg.sh

diff --git a/convert/clinicaltrialskg_tsv_to_kg_jsonl.py b/convert/clinicaltrialskg_tsv_to_kg_jsonl.py
old mode 100644
new mode 100755
diff --git a/convert/ontologies_jsonl_to_kg_jsonl.py b/convert/ontologies_jsonl_to_kg_jsonl.py
old mode 100644
new mode 100755
diff --git a/convert/umls_list_jsonl_to_kg_jsonl.py b/convert/umls_list_jsonl_to_kg_jsonl.py
old mode 100644
new mode 100755
diff --git a/extract/extract-clinicaltrialskg.sh b/extract/extract-clinicaltrialskg.sh
old mode 100644
new mode 100755

From fd482ba7285c01da769d51a5b1db9b321bd08870 Mon Sep 17 00:00:00 2001
From: ecwood
Date: Mon, 2 Sep 2024 04:24:18 -0700
Subject: [PATCH 098/125] #387 comments through owlparser

---
 extract/owlparser.py | 208 ++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 194 insertions(+), 14 deletions(-)

diff --git a/extract/owlparser.py b/extract/owlparser.py
index fe540f3b..418bf0d9 100644
--- a/extract/owlparser.py
+++ b/extract/owlparser.py
@@ -28,10 +28,15 @@ def get_args():
     arg_parser.add_argument('outputFile', type=str)
     return arg_parser.parse_args()
 
+
 def date():
     return datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
 
+
 class LineElementRead():
+    """
+    Custom enum for identifying which element is currently being read in an XML Line
+    """
     NONE = 0
     TAG = 1
     ATTRIBUTE_TAG = 2
@@ -39,13 +44,21 @@ class LineElementRead():
     MAIN = 4
     END_TAG = 5
 
+
 class XMLParser():
+    """
+    General XML to JSON Lines parser optimized for XML consisting of many short nests
+    """
     def __init__(self, skip_tags, ignored_attributes, processing_func):
+        # Defining the types of lines which will be skipped by the processor
         self.COMMENT = "!--"
-        self.OUTMOST_TAGS_SKIP = skip_tags
+        self.OUTMOST_TAGS_SKIP = skip_tags # To avoid one large JSON Line, the outmost tags should be skipped
         self.IGNORED_ATTRIBUTES = ignored_attributes
+
+        # Function for processing each nest
         self.processing_func = processing_func
 
+        # Line categorization labels
         self.LINE_TYPE_IGNORE = "ignore"
         self.LINE_TYPE_START_NEST = "start nest"
         self.LINE_TYPE_START_NEST_WITH_ATTR = "start nest with attributes"
@@ -54,6 +67,7 @@ def __init__(self, skip_tags, ignored_attributes, processing_func):
         self.LINE_TYPE_ENTRY_ONLY_ATTR = "entry with only attributes"
         self.LINE_TYPE_END_NEST = "end nest"
 
+        # Processing labels for components of each line
         self.KEY_TAG = "tag"
         self.KEY_ATTRIBUTES = "attributes"
         self.KEY_TEXT = "ENTRY_TEXT"
@@ -74,11 +88,16 @@ def __init__(self, skip_tags, ignored_attributes, processing_func):
         self.prev_letter = ""
         self.type_to_read = LineElementRead.NONE
 
+
     def categorize_line(self):
+        """
+        Logic for determining which type of line is being processed based on which elements it contains
+        """
         # Categorize the type of line
         line_type = str()
         out = dict()
 
+        # If it is one of these first line types, skip it
         # Putting "only_tag" here isn't necessarily the best idea, but I don't know what else to do with it
         if self.tag == self.COMMENT or self.tag in self.OUTMOST_TAGS_SKIP or self.end_tag in self.OUTMOST_TAGS_SKIP or self.only_tag:
             line_type = self.LINE_TYPE_IGNORE
@@ -91,34 +110,45 @@ def categorize_line(self):
             if start_tag_exists:
                 if attributes_exist:
                     if text_exists:
+                        # This type of line has everything
                         line_type = self.LINE_TYPE_ENTRY_WITH_ATTR
                         out[self.KEY_TAG] = self.tag
                         out[self.KEY_ATTRIBUTES] = self.attributes
                         out[self.KEY_TEXT] = self.main_text
                     elif end_tag_exists:
+                        # This type of line acts as an entry, but doesn't have text. There is not another end_tag coming for it.
line_type = self.LINE_TYPE_ENTRY_ONLY_ATTR out[self.KEY_TAG] = self.tag out[self.KEY_ATTRIBUTES] = self.attributes else: + # This type of line does not have an entry and acts as the start of an inner nest line_type = self.LINE_TYPE_START_NEST_WITH_ATTR out[self.KEY_TAG] = self.tag out[self.KEY_ATTRIBUTES] = self.attributes elif text_exists: + # This type of line does not have attributes and only contains an entry line_type = self.LINE_TYPE_ENTRY out[self.KEY_TAG] = self.tag out[self.KEY_TEXT] = self.main_text else: + # This type of line is only starting a nest and does not contain any of its own information line_type = self.LINE_TYPE_START_NEST out[self.KEY_TAG] = self.tag elif end_tag_exists: + # This type of line ends a started nest line_type = self.LINE_TYPE_END_NEST out[self.KEY_TAG] = self.end_tag + # Assign the key type based on the determined line type out[self.KEY_TYPE] = line_type return out + def get_letters(self, letter_index): + """ + Get the current letter, previous letter, and next letter in the line and count the brackets status (in case there are brackets inside of brackets) + """ self.letter = self.line[letter_index] self.next_letter = "" self.prev_letter = "" @@ -134,6 +164,9 @@ def get_letters(self, letter_index): def identify_tag_type(self, letter_index): + """ + Depending on the presence of a "/" character, determine whether this is an end tag + """ changed = True if self.letter == '<' and letter_index == 0: @@ -148,8 +181,13 @@ def identify_tag_type(self, letter_index): def read_tag(self): + """ + Determine the tag of an XML line + """ changed = True + # Once you hit a space or bracket, switch to the next type of line element + # If not, keep adding to the tag if self.letter == ' ' and self.type_to_read == LineElementRead.TAG: self.type_to_read = LineElementRead.ATTRIBUTE_TAG elif self.letter == '>' and self.type_to_read == LineElementRead.TAG and self.start_brackets == 0: @@ -167,16 +205,28 @@ def read_tag(self): def store_attribute(self): + """ + Clean and save an attribute for later processing + """ + # Only save desired attributes if self.attribute_tag not in self.IGNORED_ATTRIBUTES: self.attributes[self.attribute_tag] = self.attribute_text.strip('/').strip('"') + + # Reset our attribute trackers self.attribute_tag = "" self.attribute_text = "" def read_attributes(self): + """ + Determine the attributes of an XML line + """ changed = True + + # Identify whether it is time to process the attributes of the line start_reading_attributes = (self.type_to_read == LineElementRead.ATTRIBUTE_TAG or self.type_to_read == LineElementRead.ATTRIBUTE_TEXT) + # At the end of the attributes section, save the attributes and switch to the text portion of the line if self.letter == '>' and start_reading_attributes and self.start_brackets == 0: self.type_to_read = LineElementRead.MAIN @@ -184,6 +234,7 @@ def read_attributes(self): if self.prev_letter == '/': self.end_tag = self.tag + # Otherwise, read the correct part of the line and switch parts based on the delimiter ('=' and ' ') elif start_reading_attributes: if self.letter == '=' and self.type_to_read == LineElementRead.ATTRIBUTE_TAG: self.type_to_read = LineElementRead.ATTRIBUTE_TEXT @@ -201,7 +252,12 @@ def read_attributes(self): def read_main(self): + """ + Determine the main textual entry of an XML line + """ changed = True + + # Stop reading and switch to reading the end tag once you hit a start bracket if self.letter == '<' and self.type_to_read == LineElementRead.MAIN: self.type_to_read = LineElementRead.END_TAG elif 
self.type_to_read == LineElementRead.MAIN: @@ -213,9 +269,15 @@ def read_main(self): def read_end_tag(self): + """ + Determine the end tag of an XML line + """ changed = True + + # Stop once you've reached the end of the line if self.letter == '>' and self.type_to_read == LineElementRead.END_TAG and self.start_brackets == 0: pass + # Otherwise, add to the end tag elif self.type_to_read == LineElementRead.END_TAG: self.end_tag += self.letter else: @@ -225,153 +287,214 @@ def read_end_tag(self): def convert_line(self): + """ + Using a streaming reading technique, convert a line into its tag, attributes, text, and type + """ + # Initialize all of the line elements for the new line self.tag = "" self.attributes = dict() self.attribute_tag = "" self.attribute_text = "" self.main_text = "" self.end_tag = "" - self.type_to_read = LineElementRead.NONE - self.only_tag = False - self.start_brackets = 0 + # Read the line letter by letter for letter_index in range(len(self.line)): + # Get the letters required for analysis regardless of the element type self.get_letters(letter_index) - # First < + # Start by determining if it is a start or end tag if self.identify_tag_type(letter_index): + # If this was the work done on this letter, move to the next continue + # Determine the tag of the line if self.read_tag(): + # If this was the work done on this letter, move to the next continue + # Determine the attributes of the line (if applicable) if self.read_attributes(): + # If this was the work done on this letter, move to the next continue + # Determine the main text given in the line (if applicable) if self.read_main(): + # If this was the work done on this letter, move to the next continue + # Determine the end tag of the line (if applicable) if self.read_end_tag(): + # If this was the work done on this letter, move to the next continue + # Categorize the line based on the saved characteristics return self.categorize_line() def convert_nest(self, nest, start_index): + """ + Recursively convert the set of lines (from the first start tag to its pairing end tag) into a dictionary (nested as necessary) + """ + # Initialize the current dictionary in the nest nest_dict = dict() + + # Start at the given index curr_index = start_index + # Iterate linearly (without repeats) through every element in the nest while curr_index < len(nest): + # Get the basic characteristics of the nest element element = nest[curr_index] line_type = element[self.KEY_TYPE] line_tag = element[self.KEY_TAG] line_text = element.get(self.KEY_TEXT, None) line_attributes = element.get(self.KEY_ATTRIBUTES, None) + # If we are starting a new nest, we need to recurse if line_type in [self.LINE_TYPE_START_NEST, self.LINE_TYPE_START_NEST_WITH_ATTR]: + # Initialize every element to a list to simplify later processing (so we don't have to deal with some entries being strings and some being lists later) if line_tag not in nest_dict: nest_dict[line_tag] = list() + # Recurse to build the inner dictionary converted_nest, ret_index = self.convert_nest(nest, curr_index + 1) + # If we have line attributes, we need to save them in the dictionary if line_attributes is not None: for attribute in line_attributes: converted_nest[attribute] = line_attributes[attribute] + # Add this converted nest to the overall list nest_dict[line_tag].append(converted_nest) + # Set the new index to prevent duplication curr_index = ret_index + 1 continue + # If we're not starting a new nest, process additively if line_type in [self.LINE_TYPE_ENTRY, self.LINE_TYPE_ENTRY_WITH_ATTR,
self.LINE_TYPE_ENTRY_ONLY_ATTR]: + # Initialize every element to a list to simplify later processing (so we don't have to deal with some entries being strings and some being lists later) if line_tag not in nest_dict: nest_dict[line_tag] = list() curr_dict = dict() + # If we have line text, we need to save it in the dictionary if line_text is not None: curr_dict[self.KEY_TEXT] = line_text + # If we have line attributes, we need to save them in the dictionary if line_attributes is not None: for attribute in line_attributes: curr_dict[attribute] = line_attributes[attribute] + # Add this converted nest to the overall list nest_dict[line_tag].append(curr_dict) + # Move to the next element curr_index += 1 continue + # Recursive base case, to exit the nest building when we hit the end of a nest if line_type in [self.LINE_TYPE_END_NEST]: return nest_dict, curr_index + # Once we reach the end, we need to return the nest return nest_dict, curr_index def divide_into_lines(self, input_file_name): + """ + Split a given XML file into sets of lines, each representing a nest (at a given level within the overall XML nesting, based on the ignored lines), and process these nests + """ + # Initialize the current nest curr_str = "" curr_nest = list() - curr_nest_tags = list() # Treating it as a stack + curr_nest_tags = list() # Treating it as a stack, since some tags will be identical within a nest and we want to make sure start and end tags match start_brackets = 0 with open(input_file_name) as input_file: + # Iterate linearly through the file for line in input_file: line_str = line.strip() + # Process each letter in the line linearly for letter_index in range(len(line_str)): letter = line_str[letter_index] + + # In case of nested brackets ("<<>>"), need to maintain matching brackets if letter == '<': start_brackets += 1 if letter == '>': start_brackets -= 1 + # Identify the next letter, to aid in identifying the end of the line next_letter = "" if letter_index + 1 < len(line_str): next_letter = line_str[letter_index + 1] + # Build up the current line curr_str += letter + # Determine when we have reached the end of the line and process accordingly if letter == '>' and (next_letter == '<' or next_letter == "") and start_brackets == 0: - # Only return if nesting + # Assign the current string to the class variable to facilitate processing self.line = curr_str + # Process the line line_parsed = self.convert_line() + # Determine important traits of the line to build the nest tag = line_parsed.get(self.KEY_TAG, None) assert tag != self.KEY_TEXT # This could cause a massive conflict, but it is unlikely line_type = line_parsed.get(self.KEY_TYPE, None) - attribute_keys = line_parsed.get(self.KEY_ATTRIBUTES, dict()).keys() + # Add non-ignore lines to the nest if line_type != self.LINE_TYPE_IGNORE: curr_nest.append(line_parsed) + # Initialize the output_nest criteria output_nest = (line_type in [self.LINE_TYPE_ENTRY, self.LINE_TYPE_ENTRY_WITH_ATTR, self.LINE_TYPE_ENTRY_ONLY_ATTR] and len(curr_nest_tags) == 0) + # If we are starting a new internal nest, push the current tag to the stack to ensure it has a matching end tag if line_type in [self.LINE_TYPE_START_NEST, self.LINE_TYPE_START_NEST_WITH_ATTR]: curr_nest_tags.append(tag) + # Ensure that the reached end tag matches the last start tag elif line_type == self.LINE_TYPE_END_NEST: popped_curr_nest_tag = curr_nest_tags.pop() assert popped_curr_nest_tag == tag, curr_nest + + # The nest is ready to process once we have matched the original start tag if len(curr_nest_tags) == 0: output_nest =
True + + # Once the nest has been finished, convert it into a dictionary and process it if output_nest: nest_dict, _ = self.convert_nest(curr_nest, 0) + # Process the given nest dictionary based on a given processing function self.processing_func(nest_dict) + # Reinitialize variables for the next loop curr_nest = list() curr_nest_tag = str() curr_str = "" + # If we have to go to the next line to finish processing one XML line, add a delimiting space if curr_str != "": - # divide lines by a space curr_str += ' ' class OWLParser(): + """ + Custom parser (into JSON Lines) for XML-style OWL files + """ def __init__(self, input_files, input_file_names, owl_file_path, output_file_name): + # Important tags within OWL files for processing self.XML_TAG = "?xml" self.RDF_TAG = "rdf:RDF" self.DOCTYPE_TAG = "!DOCTYPE" @@ -380,35 +503,50 @@ def __init__(self, input_files, input_file_names, owl_file_path, output_file_nam self.SUBCLASS_TAG = "rdfs:subClassOf" self.NODEID_TAG = "rdf:nodeID" self.RDF_ABOUT_TAG = "rdf:about" + + # Generic OWL ID prefix self.GENID_PREFIX = "genid" + # Custom additions to JSON Lines output to propagate ont-load-inventory.yaml information self.OWL_SOURCE_KEY = "owl_source" self.OWL_SOURCE_NAME_KEY = "owl_source_name" + # Tags to exclude from JSON Lines representation, to be passed into XML Parser self.skip_tags = [self.XML_TAG, self.RDF_TAG, self.DOCTYPE_TAG] + # Attributes to ignore for JSON Lines representation (due to overcrowding) self.ignored_attributes = ["xml:lang"] + # XML Parser for OWL Parser, using triage_nest_dict as the processing_func self.xml_parser = XMLParser(self.skip_tags, self.ignored_attributes, self.triage_nest_dict) + # Initialize the genid processing dictionaries required self.GENID_REMAINING_NESTS = dict() self.GENID_TO_ID = dict() self.ID_TO_GENIDS = dict() + # File names for input/output self.input_files = input_files self.input_file_names = input_file_names self.owl_file_path = owl_file_path self.output_file_name = output_file_name + # Output writer self.output_info = kg2_util.create_single_jsonlines() self.output = self.output_info[0] + def check_for_class_genids(self, nest_dict): + """ + Scanner for genids within an "owl:Class", to prepare them for later matching + """ genids = list() nest_dict_classes = nest_dict.get(self.CLASS_TAG, list()) for nest_class_index in range(len(nest_dict_classes)): nest_class = nest_dict_classes[nest_class_index] + + # genids are contained within "rdfs:subClassOf" elements nest_subclasses = nest_class.get(self.SUBCLASS_TAG, list()) for nest_subclass_index in range(len(nest_subclasses)): nest_subclass = nest_subclasses[nest_subclass_index] @@ -420,13 +558,20 @@ def check_for_class_genids(self, nest_dict): def check_for_restriction_genids(self, nest_dict): + """ + Check a nest for possibly containing a "genid" term within an "owl:Restriction" element + """ for nest_restriction in nest_dict.get(self.RESTRICTION_TAG, dict()): potential_genid = nest_restriction.get(self.NODEID_TAG, str()) if potential_genid.startswith(self.GENID_PREFIX): return potential_genid return None + def extract_class_id(self, nest_dict): + """ + Determine the id of an "owl:Class", for use as a key + """ nest_dict_classes = nest_dict.get(self.CLASS_TAG, list()) # Can't have competing class_ids assert len(nest_dict_classes) <= 1 @@ -435,7 +580,11 @@ def extract_class_id(self, nest_dict): nest_class = nest_dict_classes[nest_class_index] return nest_class.get(self.RDF_ABOUT_TAG, str()) + def store_genid_nest_in_class_nest(self, genid, genid_nest, 
class_nest): + """ + Replace a genid entry in an "rdfs:subClassOf" element with its corresponding "owl:Restriction" definition (which contains an actual identifier) + """ output_class_nest = class_nest nest_dict_classes = class_nest.get(self.CLASS_TAG, list()) @@ -452,50 +601,69 @@ def store_genid_nest_in_class_nest(self, genid, genid_nest, class_nest): def write_to_output(self, output_dict, source_file): + """ + Add source provenance information to an output dictionary before writing it to the output JSON Lines file + """ output_dict[self.OWL_SOURCE_KEY] = source_file output_dict[self.OWL_SOURCE_NAME_KEY] = self.input_file_names[source_file] self.output.write(output_dict) - return - def triage_nest_dict(self, nest_dict): + """ + Process a nest dictionary by outputting it if it's ready (no outstanding "genid" terms) + """ + # Check for elements which complicate the save pattern genids = self.check_for_class_genids(nest_dict) restriction_genid = self.check_for_restriction_genids(nest_dict) class_id = self.extract_class_id(nest_dict) + # If there are class genids, save these for future identification and store the nest to be outputted later if len(genids) > 0: for genid in genids: self.GENID_TO_ID[genid] = class_id self.ID_TO_GENIDS[class_id] = genids self.GENID_REMAINING_NESTS[class_id] = nest_dict + # If this nest contains a genid definition to be placed in its "owl:Class", place it, then output the nest elif restriction_genid is not None: class_id = self.GENID_TO_ID.get(restriction_genid, str()) + + # Issue a warning if the genid doesn't correspond to an "owl:Class" if len(class_id) == 0: print("WARNING WITH:", restriction_genid, "- NO CLASS_ID FOUND") # Save to output despite not matching with an existing class self.write_to_output(nest_dict, self.input_file) return + + # Store the genid and remove it from the list of outstanding genids class_nest = self.GENID_REMAINING_NESTS[class_id] self.ID_TO_GENIDS[class_id].remove(restriction_genid) updated_class_nest = self.store_genid_nest_in_class_nest(restriction_genid, nest_dict, class_nest) + # We must wait until all of the genids in the "owl:Class" have been matched to finally output if len(self.ID_TO_GENIDS[class_id]) > 0: self.GENID_REMAINING_NESTS[class_id] = updated_class_nest else: - # Since all of the genids used in this class have been matched, output self.write_to_output(nest_dict, self.input_file) self.GENID_REMAINING_NESTS[class_id] = None + # Otherwise, it is a normal situation else: # There are no genids that need to be worked with, so just output self.write_to_output(nest_dict, self.input_file) def parse_OWL_file(self): + """ + Handler for parsing the OWL files + """ + # Iterate through the input files, processing them for input_file in self.input_files: + # Set the current OWLParser input file to this input file self.input_file = input_file print("Reading:", input_file, "starting at", date()) + + # Process the file self.xml_parser.divide_into_lines(self.owl_file_path + input_file) # Genid wasn't filled, still want to include them though @@ -512,9 +680,14 @@ def parse_OWL_file(self): def identify_and_download_input_files(ont_load_inventory, path_to_owl_files): + """ + Download all of the input files in ont-load-inventory.yaml + """ input_files = list() input_file_names = dict() owl_file_path = path_to_owl_files.rstrip('/') + "/" + + # Download every file in the inventory and store its file name and title for later use as provenance for item in ont_load_inventory: input_files.append(item['file']) input_file_names[item['file']] = item['title'] @@ -522,19
+695,26 @@ def identify_and_download_input_files(ont_load_inventory, path_to_owl_files): kg2_util.download_file_if_not_exist_locally(item['url'], owl_file_path + item['file']) print("Download of:", item['file'], "finished at", date()) + # Return, providing the file path so the files can be opened by the XMLParser later return input_files, input_file_names, owl_file_path if __name__ == '__main__': + print("Start Time:", date()) args = get_args() + + # Obtain all arguments input_file_name = args.inputFile owl_path = args.owlFilePath output_file_name = args.outputFile + # Read ont-load-inventory.yaml to prepare for OWL processing ont_load_inventory = kg2_util.safe_load_yaml_from_string(kg2_util.read_file_to_string(input_file_name)) input_files, input_file_names, owl_file_path = identify_and_download_input_files(ont_load_inventory, owl_path) print("Files:", input_files) - print("Start Time:", date()) + # Initialize the OWLParser owl_parser = OWLParser(input_files, input_file_names, owl_file_path, output_file_name) + + # Run parsing on all of the OWL files owl_parser.parse_OWL_file() - print("End Time:", date()) \ No newline at end of file + print("End Time:", date()) From 7bd5e8f8b2aff6e445ad4ebb5e1f563f36c818ed Mon Sep 17 00:00:00 2001 From: ecwood Date: Mon, 2 Sep 2024 04:28:22 -0700 Subject: [PATCH 099/125] #387 adjusting for CHEBI issues --- extract/extract-ontologies.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/extract/extract-ontologies.sh b/extract/extract-ontologies.sh index 3248cf4f..5c90b782 100755 --- a/extract/extract-ontologies.sh +++ b/extract/extract-ontologies.sh @@ -28,6 +28,9 @@ mkdir -p ${ontologies_dir} # Temporary adjustment for https://github.com/HUPO-PSI/psi-mi-CV/issues/456 ${s3_cp_cmd} s3://${s3_bucket}/mi.owl ${ontologies_dir}/mi.owl +# Temporary adjustment due to lack of resolution of chebi PURL +${s3_cp_cmd} s3://${s3_bucket}/chebi.owl ${ontologies_dir}/mi.owl + # Generate the ontologies.jsonl file ${python_command} ${parsing_script} ${ontologies_load_inventory} ${ontologies_dir} ${output_file} From 0804556050e20ffde0b51aee048cb3b1065ccec5 Mon Sep 17 00:00:00 2001 From: ecwood Date: Mon, 2 Sep 2024 04:29:13 -0700 Subject: [PATCH 100/125] #387 want to remove old ontologies --- extract/extract-ontologies.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/extract/extract-ontologies.sh b/extract/extract-ontologies.sh index 5c90b782..359114b2 100755 --- a/extract/extract-ontologies.sh +++ b/extract/extract-ontologies.sh @@ -23,6 +23,7 @@ ontologies_load_inventory=${1-"${MAPS_CODE_DIR}/ont-load-inventory.yaml"} output_file=${2-"${BUILD_DIR}/ontologies.jsonl"} ontologies_dir=${3-"${BUILD_DIR}/owl_files"} +rm -rf ${ontologies_dir} mkdir -p ${ontologies_dir} # Temporary adjustment for https://github.com/HUPO-PSI/psi-mi-CV/issues/456 From a1a7c6ee7f3efd32ed45f045a56e5debd62e09b6 Mon Sep 17 00:00:00 2001 From: ecwood Date: Mon, 2 Sep 2024 04:30:28 -0700 Subject: [PATCH 101/125] #387 have to fully handle CHEBI --- extract/extract-ontologies.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/extract/extract-ontologies.sh b/extract/extract-ontologies.sh index 359114b2..e82d8a5b 100755 --- a/extract/extract-ontologies.sh +++ b/extract/extract-ontologies.sh @@ -30,7 +30,7 @@ mkdir -p ${ontologies_dir} ${s3_cp_cmd} s3://${s3_bucket}/mi.owl ${ontologies_dir}/mi.owl # Temporary adjustment due to lack of resolution of chebi PURL -${s3_cp_cmd} s3://${s3_bucket}/chebi.owl ${ontologies_dir}/mi.owl +${s3_cp_cmd} s3://${s3_bucket}/chebi.owl 
${ontologies_dir}/chebi.owl # Generate the ontologies.jsonl file ${python_command} ${parsing_script} ${ontologies_load_inventory} ${ontologies_dir} ${output_file} From b0aee1c1d284a29b351da76241939091fbcb68b3 Mon Sep 17 00:00:00 2001 From: ecwood Date: Mon, 2 Sep 2024 05:25:03 -0700 Subject: [PATCH 102/125] #387 revising predicate remap for new ontology etl --- maps/predicate-remap.yaml | 152 +++++++++++++++++++++++++++----------- 1 file changed, 109 insertions(+), 43 deletions(-) diff --git a/maps/predicate-remap.yaml b/maps/predicate-remap.yaml index 15452136..3d4bc482 100644 --- a/maps/predicate-remap.yaml +++ b/maps/predicate-remap.yaml @@ -415,6 +415,12 @@ CL:lacks_part: CL:lacks_plasma_membrane_part: operation: keep core_predicate: biolink:lacks_part +COB:0000078: + operation: delete +COB:0000081: + operation: delete +COB:0000087: + operation: delete CTD:increases_expression_of: operation: keep core_predicate: biolink:affects @@ -425,6 +431,9 @@ CTD:increases_expression_of: DDANAT:develops_from: operation: keep core_predicate: biolink:develops_from +DDANAT:part_of: + operation: invert + core_predicate: biolink:has_part DGIdb:activator: operation: keep core_predicate: biolink:affects @@ -1469,6 +1478,9 @@ FMA:surrounded_by: FMA:surrounds: operation: keep core_predicate: biolink:coexists_with +FMA:systemic_part_of: + operation: invert + core_predicate: biolink:has_part FMA:transforms_from: operation: invert core_predicate: biolink:precedes @@ -1759,6 +1771,8 @@ IAO:0000142: IAO:0000219: operation: keep core_predicate: biolink:related_to +IAO:0100001: + operation: delete ICD10PCS:CHD: operation: keep core_predicate: biolink:subclass_of @@ -2280,6 +2294,16 @@ MONDO:part_of_progression_of_disease: MONDO:predisposes_towards: operation: keep core_predicate: biolink:contributes_to +NBO-PROPERTY:by_means: + operation: delete +NBO-PROPERTY:has_participant: + operation: keep + core_predicate: biolink:has_participant +NBO-PROPERTY:in_response_to: + operation: invert + core_predicate: biolink:causes +NBO-PROPERTY:is_about: + operation: delete NCBITaxon:CHD: operation: keep core_predicate: biolink:subclass_of @@ -2969,6 +2993,9 @@ OBI:0000295: OBI:0000299: operation: keep core_predicate: biolink:has_output +OBI:0000312: + operation: invert + core_predicate: biolink:has_output OBI:0000417: operation: keep core_predicate: biolink:has_output @@ -3020,6 +3047,10 @@ OBO:HANCESTRO_0308: OBO:HANCESTRO_0330: operation: keep core_predicate: biolink:related_to +OBO:INO_0000154: + operation: delete +OBO:MF#manifestationOf: + operation: delete OBO:nbo#by_means: operation: invert core_predicate: biolink:actively_involved_in @@ -3032,6 +3063,18 @@ OBO:nbo#in_response_to: OBO:nbo#is_about: operation: keep core_predicate: biolink:related_to +OBO:NCIT_R163: + operation: keep + core_predicate: biolink:related_to +OBO:NCIT_R81: + operation: keep + core_predicate: biolink:related_to +OBO:NCIT_R82: + operation: keep + core_predicate: biolink:related_to +OBO:has_role: + operation: keep + core_predicate: biolink:related_to OBO:mondo/mondo-base#predisposes_towards: operation: keep core_predicate: biolink:contributes_to @@ -3056,9 +3099,15 @@ OBO:mondo/mondo-base#disease_responds_to: # OBO:uo#is_unit_of: # operation: invert # core_predicate: biolink:related_to +OIO:hasAlternativeId: + operation: keep + core_predicate: biolink:close_match OIO:hasDbXref: operation: keep core_predicate: biolink:close_match +OIO:xref: + operation: keep + core_predicate: biolink:close_match OMIM:CHD: operation: keep core_predicate: 
biolink:subclass_of @@ -3114,7 +3163,7 @@ OMIM:phenotype_of: # ORPHA:317346: # operation: keep # core_predicate: biolink:causes -# ORPHANET:327767: +# orphanet:327767: # operation: keep # core_predicate: biolink:actively_involved_in # ORPHA:410295: @@ -3126,49 +3175,46 @@ OMIM:phenotype_of: # ORPHA:465410: # operation: keep # core_predicate: biolink:biomarker_for -ORPHANET:317343: +orphanet:317343: operation: keep core_predicate: biolink:causes -ORPHANET:317344: +orphanet:317344: operation: keep core_predicate: biolink:causes -ORPHANET:317345: +orphanet:317345: operation: keep core_predicate: biolink:correlated_with orphanet:317346: operation: keep core_predicate: biolink:causes -ORPHANET:317346: - operation: keep - core_predicate: biolink:causes -ORPHANET:317348: +orphanet:317348: operation: keep core_predicate: biolink:actively_involved_in -ORPHANET:317349: +orphanet:317349: operation: keep core_predicate: biolink:actively_involved_in -ORPHANET:327767: +orphanet:327767: operation: keep core_predicate: biolink:actively_involved_in -ORPHANET:410295: +orphanet:410295: operation: keep core_predicate: biolink:causes -ORPHANET:410296: +orphanet:410296: operation: keep core_predicate: biolink:causes -ORPHANET:465410: +orphanet:465410: operation: keep core_predicate: biolink:biomarker_for -ORPHANET:C016: +orphanet:C016: operation: keep core_predicate: biolink:related_to -ORPHANET:C017: +orphanet:C017: operation: keep core_predicate: biolink:related_to -ORPHANET:C056: +orphanet:C056: operation: keep core_predicate: biolink:close_match -ORPHANET:C057: +orphanet:C057: operation: keep core_predicate: biolink:close_match # PATO:0000085: @@ -3418,6 +3464,9 @@ RO:0001022: RO:0001025: operation: keep core_predicate: biolink:located_in +RO:0001900: + operation: keep + core_predicate: biolink:related_to RO:0002001: operation: keep core_predicate: biolink:related_to @@ -3502,6 +3551,15 @@ RO:0002162: RO:0002170: operation: keep core_predicate: biolink:related_to +RO:0002171: + operation: keep + core_predicate: biolink:related_to +RO:0002174: + operation: keep + core_predicate: biolink:related_to +RO:0002175: + operation: keep + core_predicate: biolink:related_to RO:0002176: operation: keep core_predicate: biolink:related_to @@ -3768,6 +3826,8 @@ RO:0002387: core_predicate: biolink:related_to RO:0002388: operation: delete +RO:0002404: + operation: delete # RO:0002410: # operation: keep # core_predicate: biolink:causes @@ -3822,6 +3882,9 @@ RO:0002470: RO:0002473: operation: keep core_predicate: biolink:composed_primarily_of +RO:0002475: + operation: keep + core_predicate: biolink:related_to RO:0002488: operation: keep core_predicate: biolink:temporally_related_to @@ -3921,9 +3984,9 @@ RO:0002596: # capable of regulating # RO:0002599: # operation: keep # core_predicate: biolink:prevents -# RO:0002604: -# operation: keep -# core_predicate: biolink:opposite_of +RO:0002604: + operation: keep + core_predicate: biolink:opposite_of # RO:0002606: # operation: keep # core_predicate: biolink:treats @@ -4501,12 +4564,15 @@ SO:has_part: core_predicate: biolink:has_part SO:has_quality: operation: delete +SO:member_of: + operation: invert + core_predicate: biolink:has_member SO:overlaps: operation: keep core_predicate: biolink:overlaps -SO:member_of: +SO:part_of: operation: invert - core_predicate: biolink:has_member + core_predicate: biolink:has_part # SO:similar_to: # operation: keep # core_predicate: biolink:similar_to @@ -4612,9 +4678,9 @@ UBERON_CORE:in_outermost_side_of: UBERON_CORE:indirectly_supplies: operation: 
keep core_predicate: biolink:coexists_with -# UBERON_CORE:layer_part_of: -# operation: invert -# core_predicate: biolink:has_part +UBERON_CORE:layer_part_of: + operation: invert + core_predicate: biolink:has_part UBERON_CORE:posteriorly_connected_to: operation: keep core_predicate: biolink:coexists_with @@ -4648,9 +4714,9 @@ UBERON_CORE:synapsed_by: # UBERON_CORE:transitively_proximally_connected_to: # operation: keep # core_predicate: biolink:coexists_with -# UBERON_CORE:trunk_part_of: -# operation: invert -# core_predicate: biolink:has_part +UBERON_CORE:trunk_part_of: + operation: invert + core_predicate: biolink:has_part # UBERON_NONAMESPACE:connected_to: # operation: keep # core_predicate: biolink:related_to @@ -5021,21 +5087,21 @@ rdfs:subClassOf: rdfs:subPropertyOf: operation: keep core_predicate: biolink:subclass_of -# skos:broadMatch: -# operation: keep -# core_predicate: biolink:broad_match -# skos:closeMatch: -# operation: keep -# core_predicate: biolink:close_match -# skos:exactMatch: -# operation: keep -# core_predicate: biolink:exact_match +skos:broadMatch: + operation: keep + core_predicate: biolink:broad_match +skos:closeMatch: + operation: keep + core_predicate: biolink:close_match +skos:exactMatch: + operation: keep + core_predicate: biolink:exact_match skos:member: operation: keep core_predicate: biolink:has_member -# skos:narrowMatch: -# operation: invert -# core_predicate: biolink:broad_match -# skos:relatedMatch: -# operation: keep -# core_predicate: biolink:related_to +skos:narrowMatch: + operation: invert + core_predicate: biolink:broad_match +skos:relatedMatch: + operation: keep + core_predicate: biolink:related_to From 139b1d56b56bd62778383043575f84244e036c1d Mon Sep 17 00:00:00 2001 From: ecwood Date: Mon, 2 Sep 2024 05:28:51 -0700 Subject: [PATCH 103/125] #387 updating provided by to infores for new ontologies etl --- maps/kg2-provided-by-curie-to-infores-curie.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/maps/kg2-provided-by-curie-to-infores-curie.yaml b/maps/kg2-provided-by-curie-to-infores-curie.yaml index 718cbd11..37854c95 100644 --- a/maps/kg2-provided-by-curie-to-infores-curie.yaml +++ b/maps/kg2-provided-by-curie-to-infores-curie.yaml @@ -78,7 +78,7 @@ OBO:genepio.owl: source_name: Genomic Epidemiology Ontology infores_curie: infores:genepio knowledge_type: knowledge_source -OBO:go/extensions/go-plus.owl: +OBO:go-plus.owl: source_name: Gene Ontology Plus infores_curie: infores:go-plus knowledge_type: knowledge_source @@ -102,7 +102,7 @@ OBO:nbo.owl: source_name: Neuro Behavior Ontology infores_curie: infores:nbo knowledge_type: knowledge_source -OBO:ncbitaxon/subsets/taxslim.owl: +OBO:taxslim.owl: source_name: NCBI Taxonomy Ontology infores_curie: infores:ncbi-taxon knowledge_type: knowledge_source @@ -118,7 +118,7 @@ OBO:ro.owl: source_name: Relations Ontology infores_curie: infores:ro knowledge_type: knowledge_source -OBO:uberon.owl: +OBO:uberon-ext.owl: source_name: Uber Anatomy Ontology infores_curie: infores:uberon knowledge_type: knowledge_source @@ -126,7 +126,7 @@ OBO:uberon.owl: source_name: Online Mendelian Inheritance in Man (OMIM) infores_curie: infores:omim knowledge_type: knowledge_source -'ORPHANET:': +OBO:ordo.owl: source_name: Orphanet Rare Disease Ontology infores_curie: infores:ordo knowledge_type: knowledge_source From ee09f491018609ec481ff7d1565afba6241356be Mon Sep 17 00:00:00 2001 From: ecwood Date: Mon, 2 Sep 2024 13:02:23 -0700 Subject: [PATCH 104/125] #387 adding biolink version node in and 
correcting source node category --- convert/ontologies_jsonl_to_kg_jsonl.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/convert/ontologies_jsonl_to_kg_jsonl.py b/convert/ontologies_jsonl_to_kg_jsonl.py index b0adf675..797953ba 100755 --- a/convert/ontologies_jsonl_to_kg_jsonl.py +++ b/convert/ontologies_jsonl_to_kg_jsonl.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 ''' ontologies_jsonl_to_kg_jsonl.py: Converts JSON Lines representation of ontologies into KG JSON Lines format - Usage: ontologies_jsonl_to_kg_jsonl.py [--test] + Usage: ontologies_jsonl_to_kg_jsonl.py [--test] ''' @@ -108,6 +108,7 @@ def get_args(): arg_parser.add_argument('inputFile', type=str) arg_parser.add_argument('curiesToCategoriesYAML', type=str) arg_parser.add_argument('curiesToURLsYAML', type=str) + arg_parser.add_argument('biolinkVersionNumber', type=str) arg_parser.add_argument('outputNodesFile', type=str) arg_parser.add_argument('outputEdgesFile', type=str) return arg_parser.parse_args() @@ -222,6 +223,17 @@ def pick_most_recent_date(dates, alternate_date=None): return latest_date.isoformat(sep=' ') +def save_biolink_information(biolink_version_number): + """ + Save the Biolink version with the ontologies versions so we can construct a Biolink version node (hacky workaround) + """ + source = kg2_util.CURIE_PREFIX_BIOLINK_SOURCE + source_id = source + ":" + ontology_iri = URI_MAP[source] + name = "Biolink" + SOURCE_INFO[source] = {SOURCE_KEY: source_id, IRI_KEY: ontology_iri, NAME_KEY: ontology_name, UPDATE_DATE_KEY: None, VERSION_KEY: biolink_version_number} + + def process_ontology_term(ontology_node, source, ontology_name, owl_source=True): """ Given an owl:Ontology (or analogous) element, determine all of the relevant attributes to construct a source node @@ -494,7 +506,7 @@ def construct_nodes_and_edges(nodes_output, edges_output): source_name = SOURCE_INFO[source][NAME_KEY] + " v" + SOURCE_INFO[source][VERSION_KEY] source_id = SOURCE_INFO[source][SOURCE_KEY] source_iri = SOURCE_INFO[source][IRI_KEY] - node = kg2_util.make_node(source_id, source_iri, source_name, kg2_util.BIOLINK_CATEGORY_INFORMATION_CONTENT_ENTITY, source_date, source_id) + node = kg2_util.make_node(source_id, source_iri, source_name, kg2_util.SOURCE_NODE_CATEGORY, source_date, source_id) nodes_output.write(node) @@ -548,6 +560,7 @@ def construct_nodes_and_edges(nodes_output, edges_output): input_file_name = args.inputFile curies_to_categories_file_name = args.curiesToCategoriesYAML curies_to_urls_file_name = args.curiesToURLsYAML + biolink_version_number = args.biolinkVersionNumber output_nodes_file_name = args.outputNodesFile output_edges_file_name = args.outputEdgesFile test_mode = args.test From 4f17e56aa227da188982d0fbca2803a3a7f4fd49 Mon Sep 17 00:00:00 2001 From: ecwood Date: Mon, 2 Sep 2024 13:03:08 -0700 Subject: [PATCH 105/125] #392 edge blocklist logic implemented --- maps/edge-blocklist.yaml | 16 +++++------ process/filter_kg_and_remap_predicates.py | 35 +++++++++++++++++------ 2 files changed, 35 insertions(+), 16 deletions(-) diff --git a/maps/edge-blocklist.yaml b/maps/edge-blocklist.yaml index bb64c0be..617741dc 100644 --- a/maps/edge-blocklist.yaml +++ b/maps/edge-blocklist.yaml @@ -1,4 +1,4 @@ -- +- # "Vaccines---causes---Autism/Autism Spectrum Disorders" subject_name: Vaccines subject_ids: - ATC:J07 @@ -6,7 +6,7 @@ - UMLS:C0042210 - VANDF:4021642 predicate: biolink:causes - object_name: Autism + object_name: Autism/Autism Spectrum Disorders object_ids: - CHV:0000001598 - CHV:0000050438 
@@ -34,7 +34,7 @@ - UMLS:C0856975 - UMLS:C1510586 - UMLS:C1968924 -- +- # "Measles-Mumps-Rubella Vaccine---causes---Autism/Autism Spectrum Disorders" subject_name: Measles-Mumps-Rubella Vaccine subject_ids: - MESH:D022542 @@ -42,7 +42,7 @@ - PDQ:CDR0000702931 - UMLS:C0065828 predicate: biolink:causes - object_name: Autism + object_name: Autism/Autism Spectrum Disorders object_ids: - CHV:0000001598 - CHV:0000050438 @@ -70,7 +70,7 @@ - UMLS:C0856975 - UMLS:C1510586 - UMLS:C1968924 -- +- # "Mercury---causes---Autism/Autism Spectrum Disorders" subject_name: Mercury subject_ids: - CHEBI:16170 @@ -83,7 +83,7 @@ - UMLS:C0025424 - VANDF:4025953 predicate: biolink:causes - object_name: Autism + object_name: Autism/Autism Spectrum Disorders object_ids: - CHV:0000001598 - CHV:0000050438 @@ -111,7 +111,7 @@ - UMLS:C0856975 - UMLS:C1510586 - UMLS:C1968924 -- +- # "Thimerosal---causes---Autism/Autism Spectrum Disorders" subject_name: Thimerosal subject_ids: - ATC:D08AK06 @@ -128,7 +128,7 @@ - UMLS:C0039867 - VANDF:4017480 predicate: biolink:causes - object_name: Autism + object_name: Autism/Autism Spectrum Disorders object_ids: - CHV:0000001598 - CHV:0000050438 diff --git a/process/filter_kg_and_remap_predicates.py b/process/filter_kg_and_remap_predicates.py index 5daace1b..1d103df3 100644 --- a/process/filter_kg_and_remap_predicates.py +++ b/process/filter_kg_and_remap_predicates.py @@ -45,6 +45,7 @@ def make_arg_parser(): arg_parser.add_argument('inforesRemapYaml', type=str, help="The YAML file describing how knowledge_source fields should be remapped to Translator infores curies") arg_parser.add_argument('curiesToURIFile', type=str, help="The file mapping CURIE prefixes to URI fragments") arg_parser.add_argument('knowledgeLevelAgentTypeFile', type=str, help="The file mapping infores curies to knowledge_level and agent_type source information") + arg_parser.add_argument('edgeBlocklistFile', type=str, help="File containing blocked edges from KG2") arg_parser.add_argument('inputNodesFile', type=str, help="The input KG2 graph, in JSON format") arg_parser.add_argument('inputEdgesFile', type=str, help="The input KG2 graph, in JSON format") arg_parser.add_argument('outputNodesFile', type=str, help="The output KG2 graph, in JSON format") @@ -171,11 +172,11 @@ def process_nodes(input_nodes_file_name, infores_remap_config, nodes_output): return nodes_set -def process_edges(input_edges_file_name, infores_remap_config, knowledge_level_agent_type_map, predicate_remap_file_name, curies_to_uri_file_name, edges_output, drop_self_edges_except, nodes): +def process_edges(input_edges_file_name, infores_remap_config, knowledge_level_agent_type_map, predicate_remap_file_name, curies_to_uri_file_name, edges_output, drop_self_edges_except, nodes, edge_blocklist): predicate_remap_config = kg2_util.safe_load_yaml_from_string(kg2_util.read_file_to_string(predicate_remap_file_name)) - map_dict = kg2_util.make_uri_curie_mappers(curies_to_uri_file_name) + # map_dict = kg2_util.make_uri_curie_mappers(curies_to_uri_file_name) - curie_to_uri_expander = map_dict['expand'] + # curie_to_uri_expander = map_dict['expand'] source_predicate_curies_not_in_config = set() source_predicate_curies_not_in_nodes = set() knowledge_source_curies_not_in_config_edges = set() @@ -275,10 +276,11 @@ def process_edges(input_edges_file_name, infores_remap_config, knowledge_level_a if predicate_curie not in nodes: predicate_curie_prefix = predicate_curie.split(':')[0] - predicate_uri_prefix = curie_to_uri_expander(predicate_curie_prefix + ':') - # Create 
list of curies to complain about if not in biolink - if predicate_uri_prefix == predicate_curie_prefix: - source_predicate_curies_not_in_nodes.add(predicate_curie) + # predicate_uri_prefix = curie_to_uri_expander(predicate_curie_prefix + ':') + # # Create list of curies to complain about if not in biolink + # if predicate_uri_prefix == predicate_curie_prefix: + # source_predicate_curies_not_in_nodes.add(predicate_curie) + source_predicate_curies_not_in_nodes.add(predicate_curie) if edge_dict.get("primary_knowledge_source") is None: #print(f"{edge_dict}") edge_dict["primary_knowledge_source"] = edge_dict.pop("knowledge_source") @@ -305,6 +307,11 @@ def process_edges(input_edges_file_name, infores_remap_config, knowledge_level_a edge_subject = edge_dict['subject'] edge_object = edge_dict['object'] + edge_triple = (edge_subject, edge_dict['predicate'], edge_object) + if edge_triple in edge_blocklist: + print("Edge:", edge_triple, "in the edge blocklist. Not adding it to edges_output.") + continue + edge_key = f"{edge_subject} /// {predicate_curie} /// {qualified_predicate} /// {qualified_object_aspect} /// {qualified_object_direction} /// {edge_object} /// {primary_knowledge_source}" edges_output.write(edge_dict) @@ -320,12 +327,23 @@ def process_edges(input_edges_file_name, infores_remap_config, knowledge_level_a warning_knowledge_level_agent_source_not_in_config_edges(knowledge_source_not_in_klat_map) +def load_edge_blocklist(edge_blocklist_dict): + edge_blocklist = list() + for edge in edge_blocklist_dict: + for edge_subject in edge['subject_ids']: + for edge_object in edge['object_ids']: + edge_blocklist.append((edge_subject, edge['predicate'], edge_object)) + + return edge_blocklist + + if __name__ == '__main__': args = make_arg_parser().parse_args() predicate_remap_file_name = args.predicateRemapYaml infores_remap_file_name = args.inforesRemapYaml curies_to_uri_file_name = args.curiesToURIFile knowledge_level_agent_type_file_name = args.knowledgeLevelAgentTypeFile + edge_blocklist_file_name = args.edgeBlocklistFile input_nodes_file_name = args.inputNodesFile input_edges_file_name = args.inputEdgesFile output_nodes_file_name = args.outputNodesFile @@ -340,6 +358,7 @@ def process_edges(input_edges_file_name, infores_remap_config, knowledge_level_a infores_remap_config = kg2_util.safe_load_yaml_from_string(kg2_util.read_file_to_string(infores_remap_file_name)) knowledge_level_agent_type_map = kg2_util.safe_load_yaml_from_string(kg2_util.read_file_to_string(knowledge_level_agent_type_file_name)) + edge_blocklist_map = load_edge_blocklist(kg2_util.safe_load_yaml_from_string(kg2_util.read_file_to_string(edge_blocklist_file_name))) source_predicate_curies_not_in_config = set() knowledge_source_curies_not_in_config_nodes = set() @@ -352,7 +371,7 @@ def process_edges(input_edges_file_name, infores_remap_config, knowledge_level_a nodes = process_nodes(input_nodes_file_name, infores_remap_config, nodes_output) - process_edges(input_edges_file_name, infores_remap_config, knowledge_level_agent_type_map, predicate_remap_file_name, curies_to_uri_file_name, edges_output, drop_self_edges_except, nodes) + process_edges(input_edges_file_name, infores_remap_config, knowledge_level_agent_type_map, predicate_remap_file_name, curies_to_uri_file_name, edges_output, drop_self_edges_except, nodes, edge_blocklist_map) update_date = datetime.now().strftime("%Y-%m-%d %H:%M") version_file = open(args.versionFile, 'r') From e3f0f8e05abaca6fa636598cf477888b2d55863e Mon Sep 17 00:00:00 2001 From: ecwood Date: Mon, 2 
Sep 2024 13:05:30 -0700 Subject: [PATCH 106/125] #387 correcting the pipeline for new ontologies input --- build/Snakefile-conversion | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build/Snakefile-conversion b/build/Snakefile-conversion index d7f96c2e..227d5ade 100644 --- a/build/Snakefile-conversion +++ b/build/Snakefile-conversion @@ -27,7 +27,7 @@ rule Ontologies_Conversion: log: config['ONTOLOGIES_CONVERSION_LOG'] shell: - config['PYTHON_COMMAND'] + " {input.code} {input.real} {input.curies_to_categories_map} {input.curies_to_urls_map} {output.nodes} {output.edges} " + config['TEST_FLAG'] + " > {log} 2>&1" + config['PYTHON_COMMAND'] + " {input.code} {input.real} {input.curies_to_categories_map} {input.curies_to_urls_map} " + config['BIOLINK_MODEL_VERSION'] + " {output.nodes} {output.edges} " + config['TEST_FLAG'] + " > {log} 2>&1" rule SemMedDB_Conversion: input: From 0e9611064113ea1ff727f576b8c93735a17f814f Mon Sep 17 00:00:00 2001 From: ecwood Date: Mon, 2 Sep 2024 13:09:03 -0700 Subject: [PATCH 107/125] #392 restringing pipeline for edge blocklist --- master-config.shinc | 1 + process/run-simplify.sh | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/master-config.shinc b/master-config.shinc index 3e78c226..a37d6ff8 100644 --- a/master-config.shinc +++ b/master-config.shinc @@ -24,6 +24,7 @@ curies_to_urls_file=${MAPS_CODE_DIR}/curies-to-urls-map.yaml predicate_mapping_file=${MAPS_CODE_DIR}/predicate-remap.yaml infores_mapping_file=${MAPS_CODE_DIR}/kg2-provided-by-curie-to-infores-curie.yaml knowledge_level_agent_type_mapping_file=${MAPS_CODE_DIR}/knowledge-level-agent-type-map.yaml +edge_blocklist_file=${MAPS_CODE_DIR}/edge-blocklist.yaml rtx_config_file=RTXConfiguration-config.json biolink_model_version=4.2.1 infores_registry_version=0.2.8 diff --git a/process/run-simplify.sh b/process/run-simplify.sh index 4a033273..d88660dc 100755 --- a/process/run-simplify.sh +++ b/process/run-simplify.sh @@ -22,13 +22,13 @@ input_nodes_json=${1:-} input_edges_json=${2:-} output_nodes_json=${3:-} output_edges_json=${4:-} -build_flag=${5:-""} +test_flag=${5:-""} # TODO: Inhibits and increase are not in biolink model anymore - Find out what that should be now ${VENV_DIR}/bin/python3 -u ${PROCESS_CODE_DIR}/filter_kg_and_remap_predicates.py ${test_flag} --dropNegated \ --dropSelfEdgesExcept interacts_with,regulates,inhibits,increase \ ${predicate_mapping_file} ${infores_mapping_file} ${curies_to_urls_file} \ - ${knowledge_level_agent_type_mapping_file} ${input_nodes_json} ${input_edges_json} \ + ${knowledge_level_agent_type_mapping_file} ${edge_blocklist_file} ${input_nodes_json} ${input_edges_json} \ ${output_nodes_json} ${output_edges_json} ${kg2_version_file_local} date From c499f551efec02e74e3e5a506f21867d8775523e Mon Sep 17 00:00:00 2001 From: ecwood Date: Mon, 2 Sep 2024 13:25:48 -0700 Subject: [PATCH 108/125] #387 correcting biolink version number code --- convert/ontologies_jsonl_to_kg_jsonl.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/convert/ontologies_jsonl_to_kg_jsonl.py b/convert/ontologies_jsonl_to_kg_jsonl.py index 797953ba..96bc46e0 100755 --- a/convert/ontologies_jsonl_to_kg_jsonl.py +++ b/convert/ontologies_jsonl_to_kg_jsonl.py @@ -588,6 +588,9 @@ def construct_nodes_and_edges(nodes_output, edges_output): for ontology_item in input_data: process_ontology_item(ontology_item) + # Save the Biolink node information before processing + save_biolink_information(biolink_version_number) + # Categorize every node and save the information in 
the information dictionary for the node for node_id in SAVED_NODE_INFO: categorize_node(node_id) From 36c63c684a582d9d3bcce294b604541d9851b489 Mon Sep 17 00:00:00 2001 From: ecwood Date: Mon, 2 Sep 2024 13:35:46 -0700 Subject: [PATCH 109/125] #387 use correct dictionary to map IRI --- convert/ontologies_jsonl_to_kg_jsonl.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/convert/ontologies_jsonl_to_kg_jsonl.py b/convert/ontologies_jsonl_to_kg_jsonl.py index 96bc46e0..41f86943 100755 --- a/convert/ontologies_jsonl_to_kg_jsonl.py +++ b/convert/ontologies_jsonl_to_kg_jsonl.py @@ -229,7 +229,7 @@ def save_biolink_information(biolink_version_number): """ source = kg2_util.CURIE_PREFIX_BIOLINK_SOURCE source_id = source + ":" - ontology_iri = URI_MAP[source] + ontology_iri = PREFIX_TO_IRI_MAP[source] name = "Biolink" SOURCE_INFO[source] = {SOURCE_KEY: source_id, IRI_KEY: ontology_iri, NAME_KEY: ontology_name, UPDATE_DATE_KEY: None, VERSION_KEY: biolink_version_number} @@ -584,13 +584,13 @@ def construct_nodes_and_edges(nodes_output, edges_output): # Prepare the URI maps for mapping ontology information to KG2 CURIE IDs and IRIs generate_uri_map(curies_to_urls_file_name) + # Save the Biolink node information before processing + save_biolink_information(biolink_version_number) + # Extract all of the necessary information from the ontologies for ontology_item in input_data: process_ontology_item(ontology_item) - # Save the Biolink node information before processing - save_biolink_information(biolink_version_number) - # Categorize every node and save the information in the information dictionary for the node for node_id in SAVED_NODE_INFO: categorize_node(node_id) From d54a3d6161e0608267345b48e99017a145c5e016 Mon Sep 17 00:00:00 2001 From: ecwood Date: Mon, 2 Sep 2024 13:36:45 -0700 Subject: [PATCH 110/125] #387 correct variable names --- convert/ontologies_jsonl_to_kg_jsonl.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/convert/ontologies_jsonl_to_kg_jsonl.py b/convert/ontologies_jsonl_to_kg_jsonl.py index 41f86943..d21cd6b3 100755 --- a/convert/ontologies_jsonl_to_kg_jsonl.py +++ b/convert/ontologies_jsonl_to_kg_jsonl.py @@ -229,9 +229,9 @@ def save_biolink_information(biolink_version_number): """ source = kg2_util.CURIE_PREFIX_BIOLINK_SOURCE source_id = source + ":" - ontology_iri = PREFIX_TO_IRI_MAP[source] + iri = PREFIX_TO_IRI_MAP[source] name = "Biolink" - SOURCE_INFO[source] = {SOURCE_KEY: source_id, IRI_KEY: ontology_iri, NAME_KEY: ontology_name, UPDATE_DATE_KEY: None, VERSION_KEY: biolink_version_number} + SOURCE_INFO[source] = {SOURCE_KEY: source_id, IRI_KEY: iri, NAME_KEY: name, UPDATE_DATE_KEY: None, VERSION_KEY: biolink_version_number} def process_ontology_term(ontology_node, source, ontology_name, owl_source=True): From 59347ab2bc363213bd0124a5839a60e424c201ba Mon Sep 17 00:00:00 2001 From: ecwood Date: Mon, 2 Sep 2024 13:49:01 -0700 Subject: [PATCH 111/125] #387 can use shortened link now that we don't actually have to download biolink --- maps/curies-to-urls-map.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/maps/curies-to-urls-map.yaml b/maps/curies-to-urls-map.yaml index 3452641c..fbbb987e 100644 --- a/maps/curies-to-urls-map.yaml +++ b/maps/curies-to-urls-map.yaml @@ -34,7 +34,7 @@ use_for_bidirectional_mapping: # - # biolink: https://w3id.org/linkml/ - - biolink_download_source: https://raw.githubusercontent.com/biolink/biolink-model/vVERSION_HERE/project/owl/biolink_model.owl.ttl + 
biolink_download_source: https://raw.githubusercontent.com/biolink/biolink-model/ - bioschemas: 'https://bioschemas.org/' - From f71881542cb847e2f90920873b69f4324090b8a8 Mon Sep 17 00:00:00 2001 From: ecwood Date: Mon, 2 Sep 2024 13:49:59 -0700 Subject: [PATCH 112/125] #387 actually just change the biolink link to the repo --- maps/curies-to-urls-map.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/maps/curies-to-urls-map.yaml b/maps/curies-to-urls-map.yaml index fbbb987e..19faa9c9 100644 --- a/maps/curies-to-urls-map.yaml +++ b/maps/curies-to-urls-map.yaml @@ -34,7 +34,7 @@ use_for_bidirectional_mapping: # - # biolink: https://w3id.org/linkml/ - - biolink_download_source: https://raw.githubusercontent.com/biolink/biolink-model/ + biolink_download_source: https://github.com/biolink/biolink-model.git - bioschemas: 'https://bioschemas.org/' - From 0699d849b8d2527cdcc19d438df8ea99f3eff6cb Mon Sep 17 00:00:00 2001 From: ecwood Date: Mon, 2 Sep 2024 13:56:48 -0700 Subject: [PATCH 113/125] #140 correct the filename --- master-config.shinc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/master-config.shinc b/master-config.shinc index a37d6ff8..f28bdd4d 100644 --- a/master-config.shinc +++ b/master-config.shinc @@ -28,6 +28,6 @@ edge_blocklist_file=${MAPS_CODE_DIR}/edge-blocklist.yaml rtx_config_file=RTXConfiguration-config.json biolink_model_version=4.2.1 infores_registry_version=0.2.8 -kg2_version_file=version.txt +kg2_version_file=kg2-version.txt kg2_version_file_local=${BUILD_DIR}/${kg2_version_file} kg2_version= \ No newline at end of file From e104912e7349057928a9e9d7a0a3947bf896156f Mon Sep 17 00:00:00 2001 From: ecwood Date: Mon, 2 Sep 2024 14:03:29 -0700 Subject: [PATCH 114/125] #387 pipelining issue thwarted --- extract/extract-ontologies.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/extract/extract-ontologies.sh b/extract/extract-ontologies.sh index e82d8a5b..94351b87 100755 --- a/extract/extract-ontologies.sh +++ b/extract/extract-ontologies.sh @@ -19,9 +19,9 @@ config_dir=`dirname "$0"` source ${config_dir}/master-config.shinc parsing_script=${1-"${EXTRACT_CODE_DIR}/owlparser.py"} -ontologies_load_inventory=${1-"${MAPS_CODE_DIR}/ont-load-inventory.yaml"} -output_file=${2-"${BUILD_DIR}/ontologies.jsonl"} -ontologies_dir=${3-"${BUILD_DIR}/owl_files"} +ontologies_load_inventory=${2-"${MAPS_CODE_DIR}/ont-load-inventory.yaml"} +output_file=${3-"${BUILD_DIR}/ontologies.jsonl"} +ontologies_dir=${4-"${BUILD_DIR}/owl_files"} rm -rf ${ontologies_dir} mkdir -p ${ontologies_dir} From bd93687f8d20224edbeb95c31520b293d7a52b96 Mon Sep 17 00:00:00 2001 From: ecwood Date: Mon, 2 Sep 2024 14:07:28 -0700 Subject: [PATCH 115/125] #405 umls cleanup issue --- build/snakemake-config-var.yaml | 2 -- extract/extract-umls.sh | 2 ++ 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/build/snakemake-config-var.yaml b/build/snakemake-config-var.yaml index 2c5e1863..2943089f 100644 --- a/build/snakemake-config-var.yaml +++ b/build/snakemake-config-var.yaml @@ -13,8 +13,6 @@ umls_output_base: kg2-umls umls_extraction_script: ${EXTRACT_CODE_DIR}/${umls_extraction_base}.sh umls_extraction_log: ${BUILD_DIR}/${umls_extraction_base}${version_suffix}${test_suffix}.log umls_extract_file: ${BUILD_DIR}/umls.jsonl -umls_dir: ${BUILD_DIR}/umls -umls_dest_dir: ${umls_dir}/META umls_conversion_script: ${CONVERT_CODE_DIR}/${umls_conversion_base}.py umls_conversion_log: ${BUILD_DIR}/${umls_conversion_base}${version_suffix}${test_suffix}.log 
umls_name_heirarchy: ${MAPS_CODE_DIR}/umls-name-heirarchy.yaml diff --git a/extract/extract-umls.sh b/extract/extract-umls.sh index dcaae277..d25e2028 100755 --- a/extract/extract-umls.sh +++ b/extract/extract-umls.sh @@ -19,6 +19,8 @@ source ${config_dir}/master-config.shinc output_file=${2:-${BUILD_DIR}/umls.jsonl} +umls_dir=${BUILD_DIR}/umls +umls_dest_dir=${umls_dir}/META umls_ver=2023AA umls_file_base=umls-${umls_ver}-metathesaurus-full config_file=${umls_dir}/config.prop From 3488401263dc4868ebca7d5bdf91ee851a7713de Mon Sep 17 00:00:00 2001 From: ecwood Date: Mon, 2 Sep 2024 14:11:01 -0700 Subject: [PATCH 116/125] #408 #398 curl problems --- extract/extract-clinicaltrialskg.sh | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/extract/extract-clinicaltrialskg.sh b/extract/extract-clinicaltrialskg.sh index 95278512..86b6a799 100755 --- a/extract/extract-clinicaltrialskg.sh +++ b/extract/extract-clinicaltrialskg.sh @@ -24,7 +24,10 @@ version="2.2.6" clinicaltrialskg_download_link="https://db.systemsbiology.net/gestalt/KG/clinical_trials_kg_edges_v${version}.tsv" echo "# ${version}" > ${clinicaltrialskg_output_file} -${curl_get} ${clinicaltrialskg_download_link} >> ${clinicaltrialskg_output_file} +# ${curl_get} ${clinicaltrialskg_download_link} >> ${clinicaltrialskg_output_file} + +# Short term fix because download link is not resolving and I cannot identify the correct download link +${aws_s3_cp} s3://${s3_bucket}/clinicaltrialskg-edges.tsv ${clinicaltrialskg_output_file} date echo "================= finishing extract-clinicaltrialskg.sh ==================" \ No newline at end of file From 848a24fef9e69e46be1001e2ee602af594ecbb25 Mon Sep 17 00:00:00 2001 From: ecwood Date: Mon, 2 Sep 2024 14:14:38 -0700 Subject: [PATCH 117/125] #408 issue with DisGeNET download --- extract/extract-disgenet.sh | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/extract/extract-disgenet.sh b/extract/extract-disgenet.sh index c4c6c74e..5e6ddd73 100755 --- a/extract/extract-disgenet.sh +++ b/extract/extract-disgenet.sh @@ -22,9 +22,12 @@ disgenet_output_file=${1:-"${BUILD_DIR}/all_gene_disease_pmid_associations.tsv"} disgenet_download_link="https://www.disgenet.org/static/disgenet_ap1/files/downloads/all_gene_disease_pmid_associations.tsv.gz" -${curl_get} ${disgenet_download_link} > ${disgenet_output_file}.gz +# ${curl_get} ${disgenet_download_link} > ${disgenet_output_file}.gz -gzip -d ${disgenet_output_file}.gz +# gzip -d ${disgenet_output_file}.gz + +# Temporary patch due to link failing to resolve +${s3_cp_cmd} s3://${s3_bucket}/all_gene_disease_pmid_associations.tsv ${disgenet_output_file} date From e7d76d6894f53551bcec652d49927225f4e3ae21 Mon Sep 17 00:00:00 2001 From: ecwood Date: Mon, 2 Sep 2024 14:14:57 -0700 Subject: [PATCH 118/125] #408 typo for download --- extract/extract-clinicaltrialskg.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/extract/extract-clinicaltrialskg.sh b/extract/extract-clinicaltrialskg.sh index 86b6a799..18630d16 100755 --- a/extract/extract-clinicaltrialskg.sh +++ b/extract/extract-clinicaltrialskg.sh @@ -27,7 +27,7 @@ echo "# ${version}" > ${clinicaltrialskg_output_file} # ${curl_get} ${clinicaltrialskg_download_link} >> ${clinicaltrialskg_output_file} # Short term fix because download link is not resolving and I cannot identify the correct download link -${aws_s3_cp} s3://${s3_bucket}/clinicaltrialskg-edges.tsv ${clinicaltrialskg_output_file} +${s3_cp_cmd} s3://${s3_bucket}/clinicaltrialskg-edges.tsv 
${clinicaltrialskg_output_file} date echo "================= finishing extract-clinicaltrialskg.sh ==================" \ No newline at end of file From 7edd9886bf6da84f1ff34570b779c6c29ede6c9c Mon Sep 17 00:00:00 2001 From: ecwood Date: Mon, 2 Sep 2024 14:28:58 -0700 Subject: [PATCH 119/125] #408 download SMPDB while link is failing --- extract/extract-smpdb.sh | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/extract/extract-smpdb.sh b/extract/extract-smpdb.sh index 8062a446..a9d86592 100755 --- a/extract/extract-smpdb.sh +++ b/extract/extract-smpdb.sh @@ -27,8 +27,12 @@ smpdb_link="https://pathbank.org/downloads/pathbank_all_pathways.csv.zip" pwml_link="https://pathbank.org/downloads/pathbank_all_pwml.zip" smpdb_pmids_file="SMPDB_pubmed_IDs.csv" -${curl_get} ${output_dir}/ ${smpdb_link} > ${output_dir}/${smpdb_output_file}.zip -${curl_get} ${output_dir}/ ${pwml_link} > ${output_dir}/${pw_output_file} +# ${curl_get} ${output_dir}/ ${smpdb_link} > ${output_dir}/${smpdb_output_file}.zip +# ${curl_get} ${output_dir}/ ${pwml_link} > ${output_dir}/${pw_output_file} + +# Temporary patch due to cURL failure +${s3_cp_cmd} s3://${s3_bucket}/${smpdb_output_file}.zip ${output_dir}/${smpdb_output_file}.zip +${s3_cp_cmd} s3://${s3_bucket}/${pw_output_file} ${output_dir}/${pw_output_file} unzip -o ${output_dir}/${smpdb_output_file}.zip -d ${output_dir}/ unzip -o -q ${output_dir}/${pw_output_file} -d ${output_dir}/ From 5cef56ae7397a242a162827a5479dffc763a51ee Mon Sep 17 00:00:00 2001 From: ecwood Date: Mon, 2 Sep 2024 20:59:49 -0700 Subject: [PATCH 120/125] #408 cURL issue with HMDB --- extract/extract-hmdb.sh | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/extract/extract-hmdb.sh b/extract/extract-hmdb.sh index 93f1e0b1..cf362477 100755 --- a/extract/extract-hmdb.sh +++ b/extract/extract-hmdb.sh @@ -22,7 +22,10 @@ output_file=hmdb_metabolites hmdb_link="https://hmdb.ca/system/downloads/current/hmdb_metabolites.zip" -${curl_get} ${hmdb_link} > ${BUILD_DIR}/${output_file}.zip +# ${curl_get} ${hmdb_link} > ${BUILD_DIR}/${output_file}.zip + +# Temporary patch due to cURL issues +${s3_cp_cmd} s3://${s3_bucket}/hmdb_metabolites.zip ${BUILD_DIR}/${output_file}.zip unzip -o ${BUILD_DIR}/${output_file}.zip -d ${BUILD_DIR} From 893fb716362dd9eeb9c96e1913f68e26ec178d0d Mon Sep 17 00:00:00 2001 From: ecwood Date: Tue, 3 Sep 2024 22:02:56 -0700 Subject: [PATCH 121/125] #408 build issue with knowledge_source node curies --- maps/kg2-provided-by-curie-to-infores-curie.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/maps/kg2-provided-by-curie-to-infores-curie.yaml b/maps/kg2-provided-by-curie-to-infores-curie.yaml index 37854c95..25176d2c 100644 --- a/maps/kg2-provided-by-curie-to-infores-curie.yaml +++ b/maps/kg2-provided-by-curie-to-infores-curie.yaml @@ -1,4 +1,4 @@ -'ClinicalTrialKG:': +'ClinicalTrialsKG:': source_name: Multiomics ClinicalTrials KP API infores_curie: infores:biothings-multiomics-clinicaltrials knowledge_type: knowledge_source @@ -22,7 +22,7 @@ DOID:doid.owl: source_name: DrugCentral infores_curie: infores:drugcentral knowledge_type: knowledge_source -EFO:efo.owl: +OBO:efo.owl: source_name: Experimental Factor Ontology infores_curie: infores:efo knowledge_type: knowledge_source From b37ee73198fad16c5e7d448486b661d8b9f39714 Mon Sep 17 00:00:00 2001 From: ecwood Date: Tue, 3 Sep 2024 23:33:45 -0700 Subject: [PATCH 122/125] #408 missing predicate --- maps/predicate-remap.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git 
a/maps/predicate-remap.yaml b/maps/predicate-remap.yaml index 3d4bc482..3927f0e8 100644 --- a/maps/predicate-remap.yaml +++ b/maps/predicate-remap.yaml @@ -3545,6 +3545,8 @@ RO:0002159: RO:0002160: operation: keep core_predicate: biolink:in_taxon +RO:0002161: + operation: delete RO:0002162: operation: keep core_predicate: biolink:in_taxon From 257997298ac0512f3979d0fb01b74ee460d72ca9 Mon Sep 17 00:00:00 2001 From: ecwood Date: Sun, 8 Sep 2024 13:27:05 -0700 Subject: [PATCH 123/125] #408 bucket problem --- neo4j/tsv-to-neo4j.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/neo4j/tsv-to-neo4j.sh b/neo4j/tsv-to-neo4j.sh index 07c2f692..bd1a3682 100755 --- a/neo4j/tsv-to-neo4j.sh +++ b/neo4j/tsv-to-neo4j.sh @@ -54,7 +54,7 @@ rm -r -f ${tsv_dir} mkdir -p ${tsv_dir} # get the latest KG2 version -${s3_cp_cmd} s3://${s3_bucket}/${kg2_version_file} ${kg2_version_file_local} +${s3_cp_cmd} s3://${s3_bucket_public}/${kg2_version_file} ${kg2_version_file_local} kg2_version=`cat ${kg2_version_file_local}` # download the latest TSV files from the S3 Bucket From 08bd4c5d22b8f07c2043bf50c6a2ff7461f12012 Mon Sep 17 00:00:00 2001 From: ecwood Date: Sun, 8 Sep 2024 14:30:53 -0700 Subject: [PATCH 124/125] #408 kg2-versions entry for KG2.10.1 --- docs/kg2-versions.md | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/docs/kg2-versions.md b/docs/kg2-versions.md index 4f2744e1..34121b52 100644 --- a/docs/kg2-versions.md +++ b/docs/kg2-versions.md @@ -1,3 +1,38 @@ +# 2.10.1 +**Date: 2024.09.02** + +Counts: +- Nodes: 8,507,201 +- Edges: 57,418,405 + +Issues: +- Issue [#388](https://github.com/RTXteam/RTX-KG2/issues/388) +- Issue [#392](https://github.com/RTXteam/RTX-KG2/issues/392) +- Issue [#398](https://github.com/RTXteam/RTX-KG2/issues/398) +- Issue [#404](https://github.com/RTXteam/RTX-KG2/issues/404) +- Additional issues that arose during the build: [#395 (Comment)](https://github.com/RTXteam/RTX-KG2/issues/395#issuecomment-2223612095) + +Build info: +- Biolink Model version: 4.2.1 +- InfoRes Registry version: 0.2.8 +- Build host: `kg2101build.rtx.ai` +- Build directory: `/home/ubuntu/kg2-build` +- Build code branch: `midjuly24work` +- Neo4j endpoint CNAME: `kg2endpoint-kg2-10-1.rtx.ai` +- Neo4j endpoint hostname: `kg2endpoint4.rtx.ai` +- Tracking issue for the build: [#408](https://github.com/RTXteam/RTX-KG2/issues/408) +- Major knowledge source versions: + - SemMedDB: `43 (2023)` + - UMLS: `2023AA` + - ChEMBL: `33` + - DrugBank: `5.1.10` + - Ensembl: `106` + - Reactome: `80` + - UniProtKB: `2024_04` + - DrugCentral: `52` + - KEGG: `111.0` + + # 2.10.0 **Date: 2024.07.11** From b158cc578931225f06e445ad4ebb5e1f563f36c818ed Mon Sep 17 00:00:00 2001 From: ecwood Date: Sun, 8 Sep 2024 14:31:10 -0700 Subject: [PATCH 125/125] #408 rest of kg2-versions entry for KG2.10.1 --- docs/kg2-versions.md | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/docs/kg2-versions.md b/docs/kg2-versions.md index 34121b52..33846422 100644 --- a/docs/kg2-versions.md +++ b/docs/kg2-versions.md @@ -6,11 +6,18 @@ Counts: - Edges: 57,418,405 Issues: +- Issue [#140](https://github.com/RTXteam/RTX-KG2/issues/140) +- Issue [#387](https://github.com/RTXteam/RTX-KG2/issues/387) - Issue [#388](https://github.com/RTXteam/RTX-KG2/issues/388) +- Issue [#390](https://github.com/RTXteam/RTX-KG2/issues/390) - Issue [#392](https://github.com/RTXteam/RTX-KG2/issues/392) +- Issue [#393](https://github.com/RTXteam/RTX-KG2/issues/393) - Issue
[#398](https://github.com/RTXteam/RTX-KG2/issues/398) +- Issue [#399](https://github.com/RTXteam/RTX-KG2/issues/399) +- Issue [#400](https://github.com/RTXteam/RTX-KG2/issues/400) - Issue [#404](https://github.com/RTXteam/RTX-KG2/issues/404) -- Additional issues that arose during the build: [#395 (Comment)](https://github.com/RTXteam/RTX-KG2/issues/395#issuecomment-2223612095) +- Issue [#405](https://github.com/RTXteam/RTX-KG2/issues/405) +- Additional issues that arose during the build: [#408 (Comment)](https://github.com/RTXteam/RTX-KG2/issues/408#issuecomment-2336826509) Build info: - Biolink Model version: 4.2.1
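As a companion to the edge-blocklist logic introduced in patch 105 (the load_edge_blocklist helper in process/filter_kg_and_remap_predicates.py), the following is a minimal, self-contained sketch of how blocklist YAML entries expand into blocked (subject, predicate, object) triples; the embedded YAML snippet is an illustrative stand-in for the full maps/edge-blocklist.yaml, and the script assumes PyYAML is installed:

#!/usr/bin/env python3
# Illustrative sketch of the KG2 edge-blocklist expansion (see patch 105 above).
import yaml

# Hypothetical miniature blocklist in the same shape as maps/edge-blocklist.yaml
EXAMPLE_BLOCKLIST_YAML = """
- subject_name: Vaccines
  subject_ids:
    - ATC:J07
    - MESH:D014612
  predicate: biolink:causes
  object_name: Autism/Autism Spectrum Disorders
  object_ids:
    - UMLS:C0856975
"""

def load_edge_blocklist(edge_blocklist_dict):
    # Expand each entry into every (subject, predicate, object) combination,
    # mirroring load_edge_blocklist in filter_kg_and_remap_predicates.py
    edge_blocklist = list()
    for edge in edge_blocklist_dict:
        for edge_subject in edge['subject_ids']:
            for edge_object in edge['object_ids']:
                edge_blocklist.append((edge_subject, edge['predicate'], edge_object))
    return edge_blocklist

if __name__ == '__main__':
    blocklist = load_edge_blocklist(yaml.safe_load(EXAMPLE_BLOCKLIST_YAML))
    # process_edges drops an edge when its triple appears in the blocklist
    print(('ATC:J07', 'biolink:causes', 'UMLS:C0856975') in blocklist)  # prints: True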