From 42d1a4302981ec9cba6ee25ee28d12ce1b3d6288 Mon Sep 17 00:00:00 2001
From: ecwood
Date: Thu, 11 Jul 2024 15:21:20 -0700
Subject: [PATCH 001/125] #390 first attempt at reducing dependencies

---
 setup/requirements-kg2-build.txt | 35 +-------------------------------
 1 file changed, 1 insertion(+), 34 deletions(-)

diff --git a/setup/requirements-kg2-build.txt b/setup/requirements-kg2-build.txt
index e4ae6bec..b3d36c41 100644
--- a/setup/requirements-kg2-build.txt
+++ b/setup/requirements-kg2-build.txt
@@ -1,53 +1,20 @@
 argh==0.26.2
-attrs==19.2.0
-bidict==0.21.0
 bmt==0.7.6
-CacheControl==0.12.6
-cachetools==5.1.0
-cachier==1.2.5
-certifi==2023.7.22
-chardet==3.0.4
-Click==8.0.0
 conf==0.4.
-Cython==0.29.26
-dataclasses==0.6
-decorator==4.4.0
-diskcache==4.0.0
-docker==4.4.2
 graphviz==0.20.1
 HTMLParser==0.0.2
-idna==2.8
 isodate==0.6.0
-jsobject==0.10.2
 jsonlines==3.0.0
 jsonpickle==1.0.0
 kgx==1.5.6
-lockfile==0.12.2
-marshmallow==3.0.0b11
-mysqlclient==1.4.4
-neo4j==4.3
-networkx==2.5
-numpy==1.21.6
 ontobio==2.8.0
-pandas==1.0.3
-pathtools==0.1.2
-portalocker==1.4.0
 prefixcommons==0.1.9
 pymongo==3.8.0
 PyMySQL==0.9.3
-pyparsing==2.4.7
-pysolr==3.8.1
 python-dateutil==2.8.1
-pytz==2022.2
 PyYAML==5.4
-rdflib==6.0.0
 requests==2.31.0
-scipy==1.7.3
-six==1.12.0
 snakemake==5.5.4
-SPARQLWrapper==1.8.5
 urllib3>=1.25.9
-watchdog==0.9.0
 xmltodict==0.12.0
-yamldown==0.1.8
-validators==0.15.0
+validators==0.15.0
\ No newline at end of file

From 20eeff6de3a255bbeb90e7593a01b00c4861fe0c Mon Sep 17 00:00:00 2001
From: ecwood
Date: Thu, 11 Jul 2024 15:41:11 -0700
Subject: [PATCH 002/125] #390 hopefully these can go as well

---
 setup/requirements-kg2-build.txt | 2 --
 1 file changed, 2 deletions(-)

diff --git a/setup/requirements-kg2-build.txt b/setup/requirements-kg2-build.txt
index b3d36c41..c40910aa 100644
--- a/setup/requirements-kg2-build.txt
+++ b/setup/requirements-kg2-build.txt
@@ -1,12 +1,10 @@
 argh==0.26.2
 bmt==0.7.6
-conf==0.4.
 graphviz==0.20.1
 HTMLParser==0.0.2
 isodate==0.6.0
 jsonlines==3.0.0
 jsonpickle==1.0.0
-kgx==1.5.6
 ontobio==2.8.0
 prefixcommons==0.1.9
 pymongo==3.8.0
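The two dependency-pruning commits above were evidently done by hand. A rough, illustrative way to double-check such pruning is to compare the requirement pins against the modules the codebase actually imports; the helper below is a sketch for that purpose only (not part of this repo), and since a distribution name (PyYAML) can differ from its import name (yaml), anything it flags still needs manual review before removal.

import ast
import pathlib
import sys

# Rough helper (illustrative, not part of this repo): list requirement pins
# whose names never appear as top-level imports under a source tree. Because
# a distribution name (PyYAML) can differ from its import name (yaml), the
# output is a review list, not a delete list.
def imported_names(root):
    names = set()
    for path in pathlib.Path(root).rglob('*.py'):
        try:
            tree = ast.parse(path.read_text(), filename=str(path))
        except SyntaxError:
            continue
        for node in ast.walk(tree):
            if isinstance(node, ast.Import):
                names.update(alias.name.split('.')[0] for alias in node.names)
            elif isinstance(node, ast.ImportFrom) and node.module:
                names.add(node.module.split('.')[0])
    return names

if __name__ == '__main__':
    requirements_file, source_root = sys.argv[1], sys.argv[2]
    used = {name.lower() for name in imported_names(source_root)}
    for line in open(requirements_file):
        pin = line.split('==')[0].split('>=')[0].strip()
        if pin and pin.lower() not in used:
            print('candidate for removal:', pin)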
From 281bf43aa474dc96eace072120636f130604dc83 Mon Sep 17 00:00:00 2001
From: ecwood
Date: Tue, 16 Jul 2024 03:32:53 -0700
Subject: [PATCH 003/125] #398 first pass at clinical trials kg

---
 convert/clinicaltrialskg_tsv_to_kg_jsonl.py   | 116 ++++++++++++++++++
 extract/extract-clinicaltrialskg.sh           |  29 +++++
 kg2_util.py                                   |   3 +-
 ...g2-provided-by-curie-to-infores-curie.yaml |   8 ++
 4 files changed, 155 insertions(+), 1 deletion(-)
 create mode 100644 convert/clinicaltrialskg_tsv_to_kg_jsonl.py
 create mode 100644 extract/extract-clinicaltrialskg.sh

diff --git a/convert/clinicaltrialskg_tsv_to_kg_jsonl.py b/convert/clinicaltrialskg_tsv_to_kg_jsonl.py
new file mode 100644
index 00000000..b1d425d1
--- /dev/null
+++ b/convert/clinicaltrialskg_tsv_to_kg_jsonl.py
@@ -0,0 +1,116 @@
+#!/usr/bin/env python3
+'''clinicaltrialskg_tsv_to_kg_jsonl.py: Extracts a KG2 JSON file from the ClinicalTrials Knowledge Graph in TSV format
+
+   Usage: clinicaltrialskg_tsv_to_kg_jsonl.py [--test] <inputFile> <outputNodesFile> <outputEdgesFile>
+'''
+
+import argparse
+import kg2_util
+import csv
+import datetime
+
+__author__ = 'Erica Wood'
+__copyright__ = 'Oregon State University'
+__credits__ = ['Stephen Ramsey', 'Erica Wood']
+__license__ = 'MIT'
+__version__ = '0.1.0'
+__maintainer__ = ''
+__email__ = ''
+__status__ = 'Prototype'
+
+
+CLINICALTRIALSKG_BASE_IRI = kg2_util.BASE_URL_CLINICALTRIALSKG
+CLINICALTRIALSKG_CURIE = kg2_util.CURIE_ID_CLINICALTRIALSKG
+
+TEST_MODE_LIMIT = 10000
+
+
+def get_args():
+    description = 'clinicaltrialskg_tsv_to_kg_jsonl.py: builds a KG2 JSON file from the \
+                   ClinicalTrials Knowledge Graph TSV file'
+    arg_parser = argparse.ArgumentParser(description=description)
+    arg_parser.add_argument('--test',
+                            dest='test',
+                            action="store_true",
+                            default=False)
+    arg_parser.add_argument('inputFile', type=str)
+    arg_parser.add_argument('outputNodesFile', type=str)
+    arg_parser.add_argument('outputEdgesFile', type=str)
+    return arg_parser.parse_args()
+
+
+def date():
+    return datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+
+
+def format_id(id: str, prefix: str):
+    return prefix + ':' + id.strip()
+
+
+def make_edges(input_file: str, edges_output, test_mode: bool):
+    count = 0
+    with open(input_file, 'r') as input_tsv:
+        tsvreader = csv.reader(input_tsv, delimiter='\t')
+        for line in tsvreader:
+            count += 1
+            if count == 1:
+                continue
+            if test_mode and count >= TEST_MODE_LIMIT:
+                break
+            [clinicaltrialskg_edge_id,
+             subject_id,
+             predicate,
+             object_id,
+             subject_name,
+             object_name,
+             category,
+             knowledge_level,
+             agent_type,
+             nctid,
+             phase,
+             primary_purpose,
+             intervention_model,
+             time_perspective,
+             overall_status,
+             start_date,
+             enrollment,
+             enrollment_type,
+             age_range,
+             child,
+             adult,
+             older_adult
+             unii] = line
+
+            edge = kg2_util.make_edge_biolink(subject_id,
+                                              object_id,
+                                              predicate,
+                                              CLINICALTRIALSKG_CURIE,
+                                              start_date)
+            edges_output.write(edge)
+
+
+if __name__ == '__main__':
+    print("Start time: ", date())
+    args = get_args()
+    input_file_name = args.inputFile
+    output_nodes_file_name = args.outputNodesFile
+    output_edges_file_name = args.outputEdgesFile
+    test_mode = args.test
+
+    nodes_info, edges_info = kg2_util.create_kg2_jsonlines(test_mode)
+    nodes_output = nodes_info[0]
+    edges_output = edges_info[0]
+
+    make_edges(input_file_name, edges_output, test_mode)
+
+    kp_node = kg2_util.make_node(CLINICALTRIALSKG_CURIE,
+                                 CLINICALTRIALSKG_BASE_IRI,
+                                 "Clinical Trials Knowledge Graph",
+                                 kg2_util.SOURCE_NODE_CATEGORY,
+                                 None,
+                                 CLINICALTRIALSKG_CURIE)
+    nodes_output.write(kp_node)
+
+    kg2_util.close_kg2_jsonlines(nodes_info, edges_info, output_nodes_file_name, output_edges_file_name)
+
+    print("Finish time: ", date())
diff --git a/extract/extract-clinicaltrialskg.sh b/extract/extract-clinicaltrialskg.sh
new file mode 100644
index 00000000..20f97d26
--- /dev/null
+++ b/extract/extract-clinicaltrialskg.sh
@@ -0,0 +1,29 @@
+#!/usr/bin/env bash
+# extract-clinicaltrialskg.sh: Download the ClinicalTrials Knowledge Graph
+# Copyright 2024 Stephen A. Ramsey
+# Author Erica Wood
+
+set -o nounset -o pipefail -o errexit
+
+if [[ "${1:-}" == "--help" || "${1:-}" == "-h" ]]; then
+    echo Usage: "$0 <output_file>"
+    exit 2
+fi
+
+# Usage: extract-clinicaltrialskg.sh <output_file>
+
+echo "================= starting extract-clinicaltrialskg.sh =================="
+date
+
+config_dir=`dirname "$0"`
+source ${config_dir}/master-config.shinc
+
+clinicaltrialskgoutput_file=${1:-"${BUILD_DIR}/clinicaltrialskg-edges.tsv"}
+version="2.2.6"
+
+clinicaltrialskg_download_link="https://db.systemsbiology.net/gestalt/KG/clinical_trials_kg_edges_v${version}.tsv"
+
+${curl_get} ${clinicaltrialskg_download_link} > ${clinicaltrialskg_output_file}
+
+date
+echo "================= finishing extract-clinicaltrialskg.sh =================="
\ No newline at end of file
diff --git a/kg2_util.py b/kg2_util.py
index 6e19e7e2..a686e037 100644
--- a/kg2_util.py
+++ b/kg2_util.py
@@ -143,7 +143,7 @@
 BASE_URL_CHEMBL_COMPOUND = BASE_BASE_URL_IDENTIFIERS_ORG + 'chembl.compound:'
 BASE_URL_CHEMBL_TARGET = BASE_BASE_URL_IDENTIFIERS_ORG + 'chembl.target:'
 BASE_URL_CHEMBL_MECHANISM = 'https://www.ebi.ac.uk/chembl/mechanism/inspect/'
-BASE_URL_CLINICALTRIALS = BASE_BASE_URL_IDENTIFIERS_ORG + 'clinicaltrials:'
+BASE_URL_CLINICALTRIALS_KG = 'https://github.com/NCATSTranslator/Translator-All/wiki/Clinical-Trials-KP/'
 BASE_URL_DGIDB = 'https://www.dgidb.org/interaction_types'
 BASE_URL_DISGENET = 'http://www.disgenet.org'
 BASE_URL_DRUGBANK = BASE_BASE_URL_IDENTIFIERS_ORG + 'drugbank:'
@@ -216,6 +216,7 @@
 # Since this has changed 2(?) times now, this will make it easier going forward if things change again
 SOURCE_NODE_CATEGORY = BIOLINK_CATEGORY_RETRIEVAL_SOURCE
 
+CURIE_ID_CLINICALTRIALSKG = 'ClinicalTrialsKG:'
 CURIE_ID_DCTERMS_ISSUED = CURIE_PREFIX_DCTERMS + ':' + 'issued'
 CURIE_ID_DISGENET = 'DisGeNET:'
 CURIE_ID_DRUGCENTRAL_SOURCE = CURIE_PREFIX_DRUGCENTRAL + ':'
diff --git a/maps/kg2-provided-by-curie-to-infores-curie.yaml b/maps/kg2-provided-by-curie-to-infores-curie.yaml
index ec5fcddb..efb2d639 100644
--- a/maps/kg2-provided-by-curie-to-infores-curie.yaml
+++ b/maps/kg2-provided-by-curie-to-infores-curie.yaml
@@ -1,3 +1,11 @@
+'ClinicalTrialKG:':
+  source_name: Multiomics ClinicalTrials KP API
+  infores_curie: infores:biothings-multiomics-clinicaltrials
+  knowledge_type: knowledge_source
+'DGIdb:':
+  source_name: Drug Gene Interaction Database
+  infores_curie: infores:dgidb
+  knowledge_type: aggregator_knowledge_source
 'DGIdb:':
   source_name: Drug Gene Interaction Database
   infores_curie: infores:dgidb
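One note on the converter introduced above: unpacking all 22 columns positionally means a single missing comma in the name list silently merges two names — exactly the bug sitting between older_adult and unii, which a later commit in this series corrects. A csv.DictReader-based sketch (illustrative only; the header names are assumptions inferred from the variable names used above) fails loudly instead:

import csv

# Illustrative alternative (not part of this repo): key fields by the TSV's
# own header row instead of unpacking 22 positional names. The column names
# used here are assumptions based on the converter's variable names.
def make_edges_by_header(input_file: str, edges_output, make_edge):
    with open(input_file, 'r') as input_tsv:
        for row in csv.DictReader(input_tsv, delimiter='\t'):
            # A KeyError here pinpoints a renamed or missing column at once,
            # instead of silently shifting every downstream field by one.
            edge = make_edge(row['subject'], row['object'],
                             row['predicate'], row['start_date'])
            edges_output.write(edge)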
Knowledge Graph", + kg2_util.SOURCE_NODE_CATEGORY, + None, + CLINICALTRIALSKG_CURIE) + nodes_output.write(kp_node) + + kg2_util.close_kg2_jsonlines(nodes_info, edges_info, output_nodes_file_name, output_edges_file_name) + + print("Finish time: ", date()) diff --git a/extract/extract-clinicaltrialskg.sh b/extract/extract-clinicaltrialskg.sh new file mode 100644 index 00000000..20f97d26 --- /dev/null +++ b/extract/extract-clinicaltrialskg.sh @@ -0,0 +1,29 @@ +#!/usr/bin/env bash +# extract-clinicaltrialskg.sh: Download the ClinicalTrials Knowledge Graph +# Copyright 2024 Stephen A. Ramsey +# Author Erica Wood + +set -o nounset -o pipefail -o errexit + +if [[ "${1:-}" == "--help" || "${1:-}" == "-h" ]]; then + echo Usage: "$0 " + exit 2 +fi + +# Usage: extract-clinicaltrialskg.sh + +echo "================= starting extract-clinicaltrialskg.sh ==================" +date + +config_dir=`dirname "$0"` +source ${config_dir}/master-config.shinc + +clinicaltrialskgoutput_file=${1:-"${BUILD_DIR}/clinicaltrialskg-edges.tsv"} +version="2.2.6" + +clinicaltrialskg_download_link="https://db.systemsbiology.net/gestalt/KG/clinical_trials_kg_edges_v${version}.tsv" + +${curl_get} ${clinicaltrialskg_download_link} > ${clinicaltrialskg_output_file} + +date +echo "================= finishing extract-clinicaltrialskg.sh ==================" \ No newline at end of file diff --git a/kg2_util.py b/kg2_util.py index 6e19e7e2..a686e037 100644 --- a/kg2_util.py +++ b/kg2_util.py @@ -143,7 +143,7 @@ BASE_URL_CHEMBL_COMPOUND = BASE_BASE_URL_IDENTIFIERS_ORG + 'chembl.compound:' BASE_URL_CHEMBL_TARGET = BASE_BASE_URL_IDENTIFIERS_ORG + 'chembl.target:' BASE_URL_CHEMBL_MECHANISM = 'https://www.ebi.ac.uk/chembl/mechanism/inspect/' -BASE_URL_CLINICALTRIALS = BASE_BASE_URL_IDENTIFIERS_ORG + 'clinicaltrials:' +BASE_URL_CLINICALTRIALS_KG = 'https://github.com/NCATSTranslator/Translator-All/wiki/Clinical-Trials-KP/' BASE_URL_DGIDB = 'https://www.dgidb.org/interaction_types' BASE_URL_DISGENET = 'http://www.disgenet.org' BASE_URL_DRUGBANK = BASE_BASE_URL_IDENTIFIERS_ORG + 'drugbank:' @@ -216,6 +216,7 @@ # Since this has changed 2(?) 
From 42d987224d318a228c13a4a19cfa6e3ef1f7e6a6 Mon Sep 17 00:00:00 2001
From: ecwood
Date: Tue, 16 Jul 2024 04:11:56 -0700
Subject: [PATCH 006/125] #398 updating the Snakemake pipeline

---
 build/Snakefile-conversion      | 13 +++++++++++++
 build/Snakefile-extraction      | 13 ++++++++++++-
 build/Snakefile-post-etl        | 10 +++++++---
 build/snakemake-config-var.yaml | 11 +++++++++++
 4 files changed, 43 insertions(+), 4 deletions(-)

diff --git a/build/Snakefile-conversion b/build/Snakefile-conversion
index 0d33b54f..ae80d765 100644
--- a/build/Snakefile-conversion
+++ b/build/Snakefile-conversion
@@ -275,3 +275,16 @@ rule KEGG_Conversion:
         config['KEGG_CONVERSION_LOG']
     shell:
         config['PYTHON_COMMAND'] + " {input.code} {input.real} {output.nodes} {output.edges} " + config['TEST_ARG'] + " > {log} 2>&1"
+
+rule ClinicalTrialsKG_Conversion:
+    input:
+        code = config['CLINICALTRIALSKG_CONVERSION_SCRIPT'],
+        real = config['CLINICALTRIALSKG_INPUT_FILE'],
+        validation = config['VALIDATION_PLACEHOLDER']
+    output:
+        nodes = config['CLINICALTRIALSKG_OUTPUT_NODES_FILE'],
+        edges = config['CLINICALTRIALSKG_OUTPUT_EDGES_FILE']
+    log:
+        config['CLINICALTRIALSKG_CONVERSION_LOG']
+    shell:
+        config['PYTHON_COMMAND'] + " {input.code} {input.real} {output.nodes} {output.edges} " + config['TEST_ARG'] + " > {log} 2>&1"
diff --git a/build/Snakefile-extraction b/build/Snakefile-extraction
index 14cf0eb8..ac5e19b1 100644
--- a/build/Snakefile-extraction
+++ b/build/Snakefile-extraction
@@ -218,4 +218,15 @@ rule KEGG:
     log:
         config['KEGG_EXTRACTION_LOG']
     shell:
-        "bash -x {input.code} {output} > {log} 2>&1"
\ No newline at end of file
+        "bash -x {input.code} {output} > {log} 2>&1"
+
+rule ClinicalTrialsKG:
+    input:
+        code = config['CLINICALTRIALSKG_EXTRACTION_SCRIPT'],
+        validation = config['VALIDATION_PLACEHOLDER']
+    output:
+        config['CLINICALTRIALSKG_INPUT_FILE']
+    log:
+        config['CLINICALTRIALSKG_EXTRACTION_LOG']
+    shell:
+        "bash -x {input.code} {output} > {log} 2>&1"
diff --git a/build/Snakefile-post-etl b/build/Snakefile-post-etl
index 05fd073b..47b109e8 100644
--- a/build/Snakefile-post-etl
+++ b/build/Snakefile-post-etl
@@ -42,7 +42,9 @@ rule Merge:
        disgenet_nodes = config['DISGENET_OUTPUT_NODES_FILE'],
        disgenet_edges = config['DISGENET_OUTPUT_EDGES_FILE'],
        kegg_nodes = config['KEGG_OUTPUT_NODES_FILE'],
-       kegg_edges = config['KEGG_OUTPUT_EDGES_FILE']
+       kegg_edges = config['KEGG_OUTPUT_EDGES_FILE'],
+       clinicaltrialskg_nodes = config['CLINICALTRIALSKG_OUTPUT_NODES_FILE'],
+       clinicaltrialskg_edges = config['CLINICALTRIALSKG_OUTPUT_EDGES_FILE']
    output:
        nodes = config['MERGED_OUTPUT_NODES_FILE'],
        edges = config['MERGED_OUTPUT_EDGES_FILE'],
@@ -54,7 +56,7 @@ rule Merge:
        " --kgFileOrphanEdges {output.orph}" + \
        " --outputNodesFile {output.nodes} " + \
        " --outputEdgesFile {output.edges} " + \
-       " --kgNodesFiles " + \
+       " --kgNodesFiles
        "{input.umls_nodes} " + \
        "{input.ont_nodes} " + \
        "{input.semmeddb_nodes} " + \
@@ -76,6 +78,7 @@ rule Merge:
        "{input.intact_nodes} " + \
        "{input.disgenet_nodes} " + \
        "{input.kegg_nodes} " + \
+       "{input.clinicaltrialskg_nodes} " + \
        " --kgEdgesFiles " + \
        "{input.umls_edges} " + \
        "{input.ont_edges} " + \
@@ -97,7 +100,8 @@ rule Merge:
        "{input.drugcentral_edges} " + \
        "{input.intact_edges} " + \
        "{input.disgenet_edges} " + \
-       "{input.kegg_edges} > {log} 2>&1"
+       "{input.kegg_edges} " + \
+       "{input.clinicaltrialskg_edges > {log} 2>&1"
diff --git a/build/snakemake-config-var.yaml b/build/snakemake-config-var.yaml
index eb32aa6a..66e5fea8 100644
--- a/build/snakemake-config-var.yaml
+++ b/build/snakemake-config-var.yaml
@@ -240,6 +240,17 @@ kegg_conversion_log: ${BUILD_DIR}/${kegg_conversion_base}${test_suffix}.log
 kegg_output_nodes_file: ${BUILD_DIR}/${kegg_output_base}${nodes_suffix}${test_suffix}.jsonl
 kegg_output_edges_file: ${BUILD_DIR}/${kegg_output_base}${edges_suffix}${test_suffix}.jsonl
 
+clinicaltrialskg_extraction_base: extract-clinicaltrialskg
+clinicaltrialskg_conversion_base: clinicaltrialskg_tsv_to_kg_jsonl
+clinicaltrialskg_output_base: kg2-clinicaltrialskg
+clinicaltrialskg_extraction_script: ${EXTRACT_CODE_DIR}/${clinicaltrialskg_extraction_base}.sh
+clinicaltrialskg_extraction_log: ${BUILD_DIR}/${clinicaltrialskg_extraction_base}${test_suffix}.log
+clinicaltrialskg_input_file: ${BUILD_DIR}/clinicaltrialskg-edges.tsv
+clinicaltrialskg_conversion_script: ${CONVERT_CODE_DIR}/${clinicaltrialskg_conversion_base}.py
+clinicaltrialskg_conversion_log: ${BUILD_DIR}/${clinicaltrialskg_conversion_base}${test_suffix}.log
+clinicaltrialskg_output_nodes_file: ${BUILD_DIR}/${clinicaltrialskg_conversion_base}${nodes_suffix}${test_suffix}.jsonl
+clinicaltrialskg_output_edges_file: ${BUILD_DIR}/${clinicaltrialskg_conversion_base}${edges_suffix}${test_suffix}.jsonl
+
 merge_base: merge_graphs
 merge_script: ${PROCESS_CODE_DIR}/${merge_base}.py
 merged_output_base: kg2-merged
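A pattern worth noting in the Merge rule above: the shell command hand-concatenates one fragment per source for both the nodes and edges lists, so wiring in a new source touches four separate places, and the next two commits in this series fix exactly the continuation and brace typos that invites. A sketch of deriving the fragments from a single source list instead (illustrative only, not the repo's actual Snakefile code):

# Illustrative sketch (not the repo's Snakefile): build the merge command's
# file arguments from one list of source names, so the nodes and edges lists
# stay in lockstep and no "{input...}" fragment is hand-typed.
SOURCES = ['umls', 'ont', 'semmeddb', 'uniprotkb', 'ensembl', 'unichem',
           'chembl', 'ncbigene', 'dgidb', 'repodb', 'drugbank', 'smpdb',
           'hmdb', 'go_annotations', 'reactome', 'mirbase', 'jensenlab',
           'drugcentral', 'intact', 'disgenet', 'kegg', 'clinicaltrialskg']

def merge_file_args() -> str:
    nodes = ' '.join('{input.%s_nodes}' % s for s in SOURCES)
    edges = ' '.join('{input.%s_edges}' % s for s in SOURCES)
    return ' --kgNodesFiles %s --kgEdgesFiles %s' % (nodes, edges)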
From 134f5ec545e66cdb4181d09113feb46d81b952f1 Mon Sep 17 00:00:00 2001
From: ecwood
Date: Tue, 16 Jul 2024 04:17:31 -0700
Subject: [PATCH 007/125] #398 correcting a typo

---
 build/Snakefile-post-etl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/build/Snakefile-post-etl b/build/Snakefile-post-etl
index 47b109e8..c0f73cee 100644
--- a/build/Snakefile-post-etl
+++ b/build/Snakefile-post-etl
@@ -56,7 +56,7 @@ rule Merge:
        " --kgFileOrphanEdges {output.orph}" + \
        " --outputNodesFile {output.nodes} " + \
        " --outputEdgesFile {output.edges} " + \
-       " --kgNodesFiles
+       " --kgNodesFiles " + \
        "{input.umls_nodes} " + \
        "{input.ont_nodes} " + \
        "{input.semmeddb_nodes} " + \

From a61d098785648977da07f41c55f51c9e905b6d66 Mon Sep 17 00:00:00 2001
From: ecwood
Date: Tue, 16 Jul 2024 04:21:25 -0700
Subject: [PATCH 008/125] #398 correcting another typo

---
 build/Snakefile-post-etl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/build/Snakefile-post-etl b/build/Snakefile-post-etl
index c0f73cee..e6de67ee 100644
--- a/build/Snakefile-post-etl
+++ b/build/Snakefile-post-etl
@@ -101,7 +101,7 @@ rule Merge:
        "{input.intact_edges} " + \
        "{input.disgenet_edges} " + \
        "{input.kegg_edges} " + \
-       "{input.clinicaltrialskg_edges > {log} 2>&1"
+       "{input.clinicaltrialskg_edges} > {log} 2>&1"

From 93168e76b692224469128798e8a5ca9c95e096b6 Mon Sep 17 00:00:00 2001
From: ecwood
Date: Tue, 16 Jul 2024 04:28:44 -0700
Subject: [PATCH 009/125] #398 correcting typo in extract

---
 extract/extract-clinicaltrialskg.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/extract/extract-clinicaltrialskg.sh b/extract/extract-clinicaltrialskg.sh
index 20f97d26..639e1eb2 100644
--- a/extract/extract-clinicaltrialskg.sh
+++ b/extract/extract-clinicaltrialskg.sh
@@ -18,7 +18,7 @@ date
 config_dir=`dirname "$0"`
 source ${config_dir}/master-config.shinc
 
-clinicaltrialskgoutput_file=${1:-"${BUILD_DIR}/clinicaltrialskg-edges.tsv"}
+clinicaltrialskg_output_file=${1:-"${BUILD_DIR}/clinicaltrialskg-edges.tsv"}
 version="2.2.6"
 
 clinicaltrialskg_download_link="https://db.systemsbiology.net/gestalt/KG/clinical_trials_kg_edges_v${version}.tsv"

From dd73e65b81b4dadbf748281f59d18f5cf7f6d151 Mon Sep 17 00:00:00 2001
From: ecwood
Date: Tue, 16 Jul 2024 04:30:16 -0700
Subject: [PATCH 010/125] #398 correcting typo in conversion

---
 convert/clinicaltrialskg_tsv_to_kg_jsonl.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/convert/clinicaltrialskg_tsv_to_kg_jsonl.py b/convert/clinicaltrialskg_tsv_to_kg_jsonl.py
index b1d425d1..0848205f 100644
--- a/convert/clinicaltrialskg_tsv_to_kg_jsonl.py
+++ b/convert/clinicaltrialskg_tsv_to_kg_jsonl.py
@@ -78,7 +78,7 @@ def make_edges(input_file: str, edges_output, test_mode: bool):
              age_range,
              child,
              adult,
-             older_adult
+             older_adult,
              unii] = line

From 0c16ea319cd21ef2b72a0628634d202893a1525b Mon Sep 17 00:00:00 2001
From: ecwood
Date: Tue, 16 Jul 2024 04:31:20 -0700
Subject: [PATCH 011/125] #398 correcting to standardize in kg2_util

---
 kg2_util.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kg2_util.py b/kg2_util.py
index a686e037..e61c6cba 100644
--- a/kg2_util.py
+++ b/kg2_util.py
@@ -143,7 +143,7 @@
 BASE_URL_CHEMBL_COMPOUND = BASE_BASE_URL_IDENTIFIERS_ORG + 'chembl.compound:'
 BASE_URL_CHEMBL_TARGET = BASE_BASE_URL_IDENTIFIERS_ORG + 'chembl.target:'
 BASE_URL_CHEMBL_MECHANISM = 'https://www.ebi.ac.uk/chembl/mechanism/inspect/'
-BASE_URL_CLINICALTRIALS_KG = 'https://github.com/NCATSTranslator/Translator-All/wiki/Clinical-Trials-KP/'
+BASE_URL_CLINICALTRIALSKG = 'https://github.com/NCATSTranslator/Translator-All/wiki/Clinical-Trials-KP/'
 BASE_URL_DGIDB = 'https://www.dgidb.org/interaction_types'
 BASE_URL_DISGENET = 'http://www.disgenet.org'
 BASE_URL_DRUGBANK = BASE_BASE_URL_IDENTIFIERS_ORG + 'drugbank:'
From 4a13131b7c77b25f9ddbb326e7253a2f82c19e44 Mon Sep 17 00:00:00 2001
From: ecwood
Date: Tue, 16 Jul 2024 04:38:48 -0700
Subject: [PATCH 012/125] #398 versioning attempt

---
 convert/clinicaltrialskg_tsv_to_kg_jsonl.py | 10 ++++++++--
 extract/extract-clinicaltrialskg.sh         |  3 ++-
 2 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/convert/clinicaltrialskg_tsv_to_kg_jsonl.py b/convert/clinicaltrialskg_tsv_to_kg_jsonl.py
index 0848205f..a421d505 100644
--- a/convert/clinicaltrialskg_tsv_to_kg_jsonl.py
+++ b/convert/clinicaltrialskg_tsv_to_kg_jsonl.py
@@ -49,11 +49,15 @@ def format_id(id: str, prefix: str):
 
 def make_edges(input_file: str, edges_output, test_mode: bool):
     count = 0
+    version = "v"
     with open(input_file, 'r') as input_tsv:
         tsvreader = csv.reader(input_tsv, delimiter='\t')
         for line in tsvreader:
             count += 1
             if count == 1:
+                version += str(line)
+                continue
+            if count == 2:
                 continue
             if test_mode and count >= TEST_MODE_LIMIT:
                 break
@@ -88,6 +92,8 @@ def make_edges(input_file: str, edges_output, test_mode: bool):
                                               start_date)
             edges_output.write(edge)
 
+    return version
+
 
 if __name__ == '__main__':
     print("Start time: ", date())
@@ -101,11 +107,11 @@
     nodes_output = nodes_info[0]
     edges_output = edges_info[0]
 
-    make_edges(input_file_name, edges_output, test_mode)
+    version = make_edges(input_file_name, edges_output, test_mode)
 
     kp_node = kg2_util.make_node(CLINICALTRIALSKG_CURIE,
                                  CLINICALTRIALSKG_BASE_IRI,
-                                 "Clinical Trials Knowledge Graph",
+                                 "Clinical Trials Knowledge Graph " + version,
                                  kg2_util.SOURCE_NODE_CATEGORY,
                                  None,
                                  CLINICALTRIALSKG_CURIE)
diff --git a/extract/extract-clinicaltrialskg.sh b/extract/extract-clinicaltrialskg.sh
index 639e1eb2..95278512 100644
--- a/extract/extract-clinicaltrialskg.sh
+++ b/extract/extract-clinicaltrialskg.sh
@@ -23,7 +23,8 @@ version="2.2.6"
 
 clinicaltrialskg_download_link="https://db.systemsbiology.net/gestalt/KG/clinical_trials_kg_edges_v${version}.tsv"
 
-${curl_get} ${clinicaltrialskg_download_link} > ${clinicaltrialskg_output_file}
+echo "# ${version}" > ${clinicaltrialskg_output_file}
+${curl_get} ${clinicaltrialskg_download_link} >> ${clinicaltrialskg_output_file}
 
 date
 echo "================= finishing extract-clinicaltrialskg.sh =================="
\ No newline at end of file

From 974ed5c9ed6b58a606ac954148fb9b6adf2b786b Mon Sep 17 00:00:00 2001
From: ecwood
Date: Tue, 16 Jul 2024 04:40:18 -0700
Subject: [PATCH 013/125] #398 clean up the versioning

---
 convert/clinicaltrialskg_tsv_to_kg_jsonl.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/convert/clinicaltrialskg_tsv_to_kg_jsonl.py b/convert/clinicaltrialskg_tsv_to_kg_jsonl.py
index a421d505..1a4abff0 100644
--- a/convert/clinicaltrialskg_tsv_to_kg_jsonl.py
+++ b/convert/clinicaltrialskg_tsv_to_kg_jsonl.py
@@ -55,7 +55,7 @@ def make_edges(input_file: str, edges_output, test_mode: bool):
         for line in tsvreader:
             count += 1
             if count == 1:
-                version += str(line)
+                version += line[0].strip('#').strip(' ')
                 continue
             if count == 2:
                 continue
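The two commits above settle on a small convention: the extract script prepends a '# <version>' line to the downloaded TSV, and the converter consumes row 1 as the version string and row 2 as the real header. A self-contained round-trip sketch of that convention (the sample file contents are invented for illustration):

import csv
import io

# Round-trip sketch of the version-header convention from the two commits
# above: line 1 carries "# <version>", line 2 is the TSV header, data follows.
# The sample rows here are invented for illustration.
sample = "# 2.2.6\nsubject\tpredicate\tobject\nMONDO:0005148\tbiolink:treats\tCHEBI:6801\n"

version = "v"
for count, line in enumerate(csv.reader(io.StringIO(sample), delimiter='\t'), 1):
    if count == 1:
        version += line[0].strip('#').strip(' ')   # -> "v2.2.6"
        continue
    if count == 2:
        continue                                   # skip the real TSV header
    print(version, line)                           # -> v2.2.6 ['MONDO:0005148', ...]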
From 9d65fd9af3cb91407f46a30e42e20950bfd3382d Mon Sep 17 00:00:00 2001
From: ecwood
Date: Tue, 16 Jul 2024 04:42:42 -0700
Subject: [PATCH 014/125] #398 add an entry into KL/AT map

---
 maps/knowledge-level-agent-type-map.yaml | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/maps/knowledge-level-agent-type-map.yaml b/maps/knowledge-level-agent-type-map.yaml
index 5d860e96..db99215a 100644
--- a/maps/knowledge-level-agent-type-map.yaml
+++ b/maps/knowledge-level-agent-type-map.yaml
@@ -10,6 +10,10 @@ infores:biolink-ontology:
   agent_type: manual_agent
   knowledge_level: knowledge_assertion
   reference: https://github.com/biolink/biolink-model/blob/master/biolink-model.yaml
+infores:biothings-multiomics-clinicaltrials:
+  agent_type: manual_agent
+  knowledge_level: knowledge_assertion
+  reference: https://github.com/biolink/information-resource-registry/blob/d84a524bfaf749d92a42c867b1b6798a88e905c8/infores_catalog.yaml#L650-L651
 infores:bspo:
   agent_type: manual_agent
   knowledge_level: knowledge_assertion

From bdf1ad18e066c0c4af86fecbcb541ff7b8d07413 Mon Sep 17 00:00:00 2001
From: ecwood
Date: Tue, 16 Jul 2024 04:57:05 -0700
Subject: [PATCH 015/125] #398 reworking update date

---
 convert/clinicaltrialskg_tsv_to_kg_jsonl.py | 32 ++++++++++++++++++++-
 1 file changed, 31 insertions(+), 1 deletion(-)

diff --git a/convert/clinicaltrialskg_tsv_to_kg_jsonl.py b/convert/clinicaltrialskg_tsv_to_kg_jsonl.py
index 1a4abff0..60a4e9fb 100644
--- a/convert/clinicaltrialskg_tsv_to_kg_jsonl.py
+++ b/convert/clinicaltrialskg_tsv_to_kg_jsonl.py
@@ -47,6 +47,36 @@ def format_id(id: str, prefix: str):
     return prefix + ':' + id.strip()
 
 
+def format_date(date_field):
+    dates = date_field.split(',')
+
+    # Arbitrarily far back date to improve on
+    latest_date = datetime.date(1700, 1, 1)
+
+    if len(dates) > 1:
+        split_date = date.split('-')
+        year = split_date[0]
+        month = split_date[1]
+        day = 1 # most of the time, there's no day
+        if len(split_date) > 2:
+            day = split_date[2]
+        curr_date = datetime.date(year, month, day)
+
+        if curr_date > latest_date:
+            latest_date = curr_date
+    else:
+        split_date = date.split('-')
+        year = split_date[0]
+        month = split_date[1]
+        day = 1 # most of the time, there's no day
+        if len(split_date) > 2:
+            day = split_date[2]
+        latest_date = datetime.date(year, month, day)
+
+    return latest_date
+
+
+
 def make_edges(input_file: str, edges_output, test_mode: bool):
     count = 0
     version = "v"
@@ -89,7 +119,7 @@ def make_edges(input_file: str, edges_output, test_mode: bool):
                                               object_id,
                                               predicate,
                                               CLINICALTRIALSKG_CURIE,
-                                              start_date)
+                                              format_date(start_date))
             edges_output.write(edge)
 
     return version

From f0f73fd876d419292e7c6eb185b109108173ed21 Mon Sep 17 00:00:00 2001
From: ecwood
Date: Tue, 16 Jul 2024 04:58:12 -0700
Subject: [PATCH 016/125] #398 revising some access patterns

---
 convert/clinicaltrialskg_tsv_to_kg_jsonl.py | 23 +++++++++++----------
 1 file changed, 12 insertions(+), 11 deletions(-)

diff --git a/convert/clinicaltrialskg_tsv_to_kg_jsonl.py b/convert/clinicaltrialskg_tsv_to_kg_jsonl.py
index 60a4e9fb..91f70c0e 100644
--- a/convert/clinicaltrialskg_tsv_to_kg_jsonl.py
+++ b/convert/clinicaltrialskg_tsv_to_kg_jsonl.py
@@ -54,18 +54,19 @@ def format_date(date_field):
     latest_date = datetime.date(1700, 1, 1)
 
     if len(dates) > 1:
-        split_date = date.split('-')
-        year = split_date[0]
-        month = split_date[1]
-        day = 1 # most of the time, there's no day
-        if len(split_date) > 2:
-            day = split_date[2]
-        curr_date = datetime.date(year, month, day)
-
-        if curr_date > latest_date:
-            latest_date = curr_date
+        for date in dates:
+            split_date = date.split('-')
+            year = split_date[0]
+            month = split_date[1]
+            day = 1 # most of the time, there's no day
+            if len(split_date) > 2:
+                day = split_date[2]
+            curr_date = datetime.date(year, month, day)
+
+            if curr_date > latest_date:
+                latest_date = curr_date
     else:
-        split_date = date.split('-')
+        split_date = dates[0].split('-')
         year = split_date[0]
         month = split_date[1]
         day = 1 # most of the time, there's no day

From 73c085fbeb72aef9358e9c3e9f44e559f6165aae Mon Sep 17 00:00:00 2001
From: ecwood
Date: Tue, 16 Jul 2024 04:59:16 -0700
Subject: [PATCH 017/125] #398 changing data type for datetime

---
 convert/clinicaltrialskg_tsv_to_kg_jsonl.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/convert/clinicaltrialskg_tsv_to_kg_jsonl.py b/convert/clinicaltrialskg_tsv_to_kg_jsonl.py
index 91f70c0e..2b2acd85 100644
--- a/convert/clinicaltrialskg_tsv_to_kg_jsonl.py
+++ b/convert/clinicaltrialskg_tsv_to_kg_jsonl.py
@@ -56,22 +56,22 @@ def format_date(date_field):
     if len(dates) > 1:
         for date in dates:
             split_date = date.split('-')
-            year = split_date[0]
-            month = split_date[1]
+            year = int(split_date[0])
+            month = int(split_date[1])
             day = 1 # most of the time, there's no day
             if len(split_date) > 2:
-                day = split_date[2]
+                day = int(split_date[2])
             curr_date = datetime.date(year, month, day)
 
             if curr_date > latest_date:
                 latest_date = curr_date
     else:
         split_date = dates[0].split('-')
-        year = split_date[0]
-        month = split_date[1]
+        year = int(split_date[0])
+        month = int(split_date[1])
         day = 1 # most of the time, there's no day
         if len(split_date) > 2:
-            day = split_date[2]
+            day = int(split_date[2])
         latest_date = datetime.date(year, month, day)
 
     return latest_date

From 6bdee2349a7d69a5752de1c139ceec013492cb4f Mon Sep 17 00:00:00 2001
From: ecwood
Date: Tue, 16 Jul 2024 04:59:57 -0700
Subject: [PATCH 018/125] #398 have to save as a string afterwards

---
 convert/clinicaltrialskg_tsv_to_kg_jsonl.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/convert/clinicaltrialskg_tsv_to_kg_jsonl.py b/convert/clinicaltrialskg_tsv_to_kg_jsonl.py
index 2b2acd85..09934ac1 100644
--- a/convert/clinicaltrialskg_tsv_to_kg_jsonl.py
+++ b/convert/clinicaltrialskg_tsv_to_kg_jsonl.py
@@ -74,7 +74,7 @@ def format_date(date_field):
             day = int(split_date[2])
         latest_date = datetime.date(year, month, day)
 
-    return latest_date
+    return str(latest_date)

From 00d53f4b153bd9ffa09197766f529b23d4a7e69d Mon Sep 17 00:00:00 2001
From: ecwood
Date: Tue, 16 Jul 2024 05:01:13 -0700
Subject: [PATCH 019/125] #398 handling an edge case

---
 convert/clinicaltrialskg_tsv_to_kg_jsonl.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/convert/clinicaltrialskg_tsv_to_kg_jsonl.py b/convert/clinicaltrialskg_tsv_to_kg_jsonl.py
index 09934ac1..b26da672 100644
--- a/convert/clinicaltrialskg_tsv_to_kg_jsonl.py
+++ b/convert/clinicaltrialskg_tsv_to_kg_jsonl.py
@@ -48,6 +48,8 @@ def format_id(id: str, prefix: str):
 
 
 def format_date(date_field):
+    if len(date_field) == 0:
+        return str()
     dates = date_field.split(',')
 
     # Arbitrarily far back date to improve on
From 98b6ad59b3fb8b3413c8f0af8da1c77486698a56 Mon Sep 17 00:00:00 2001
From: ecwood
Date: Tue, 16 Jul 2024 05:02:04 -0700
Subject: [PATCH 020/125] #398 some debugging code

---
 convert/clinicaltrialskg_tsv_to_kg_jsonl.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/convert/clinicaltrialskg_tsv_to_kg_jsonl.py b/convert/clinicaltrialskg_tsv_to_kg_jsonl.py
index b26da672..eab79b5e 100644
--- a/convert/clinicaltrialskg_tsv_to_kg_jsonl.py
+++ b/convert/clinicaltrialskg_tsv_to_kg_jsonl.py
@@ -69,6 +69,7 @@ def format_date(date_field):
             latest_date = curr_date
     else:
         split_date = dates[0].split('-')
+        print(split_date, date_field)
         year = int(split_date[0])
         month = int(split_date[1])
         day = 1 # most of the time, there's no day

From a6049fc1a46d22ce48ce2b2413663a38cff79fd2 Mon Sep 17 00:00:00 2001
From: ecwood
Date: Tue, 16 Jul 2024 05:03:13 -0700
Subject: [PATCH 021/125] #398 more debugging code

---
 convert/clinicaltrialskg_tsv_to_kg_jsonl.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/convert/clinicaltrialskg_tsv_to_kg_jsonl.py b/convert/clinicaltrialskg_tsv_to_kg_jsonl.py
index eab79b5e..1d60c367 100644
--- a/convert/clinicaltrialskg_tsv_to_kg_jsonl.py
+++ b/convert/clinicaltrialskg_tsv_to_kg_jsonl.py
@@ -58,6 +58,7 @@ def format_date(date_field):
     if len(dates) > 1:
         for date in dates:
             split_date = date.split('-')
+            print(split_date, date, date_field)
             year = int(split_date[0])
             month = int(split_date[1])
             day = 1 # most of the time, there's no day

From 0cc5805d10feeead18c4d4947dc2698a4db74a9b Mon Sep 17 00:00:00 2001
From: ecwood
Date: Tue, 16 Jul 2024 05:04:08 -0700
Subject: [PATCH 022/125] #398 another (very strange) edge case

---
 convert/clinicaltrialskg_tsv_to_kg_jsonl.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/convert/clinicaltrialskg_tsv_to_kg_jsonl.py b/convert/clinicaltrialskg_tsv_to_kg_jsonl.py
index 1d60c367..fe76d22a 100644
--- a/convert/clinicaltrialskg_tsv_to_kg_jsonl.py
+++ b/convert/clinicaltrialskg_tsv_to_kg_jsonl.py
@@ -57,8 +57,9 @@ def format_date(date_field):
 
     if len(dates) > 1:
         for date in dates:
+            if len(date) == 0:
+                continue
             split_date = date.split('-')
-            print(split_date, date, date_field)
             year = int(split_date[0])
             month = int(split_date[1])
             day = 1 # most of the time, there's no day
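The run of date-handling commits above (015 through 022) is easier to follow in one piece. Below is a consolidated restatement of the format_date logic they converge on, with the edge cases they guard against shown as assertions; this is an illustrative rewrite, not a diff from the repo (it also folds the multi-date and single-date branches into one loop, which yields the same results, and omits the temporary debug prints that patch 024 later removes):

import datetime

# Consolidated restatement (not a patch) of format_date: take a possibly
# comma-separated list of YYYY-MM[-DD] dates and return the latest one as an
# ISO string, tolerating an empty field ('', the patch-019 case) and empty
# tokens (',2020-01', the patch-022 case).
def format_date(date_field: str) -> str:
    if len(date_field) == 0:
        return str()
    latest = datetime.date(1700, 1, 1)      # arbitrarily far-back sentinel
    for token in date_field.split(','):
        if len(token) == 0:
            continue
        parts = token.split('-')
        year, month = int(parts[0]), int(parts[1])
        day = int(parts[2]) if len(parts) > 2 else 1   # day is usually absent
        latest = max(latest, datetime.date(year, month, day))
    return str(latest)

assert format_date('') == ''
assert format_date('2020-05') == '2020-05-01'
assert format_date('2019-01-15,2021-03') == '2021-03-01'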
From a163ae3ba2db3c38ea61af5b66ca7f7d219bf604 Mon Sep 17 00:00:00 2001
From: ecwood
Date: Tue, 16 Jul 2024 05:16:55 -0700
Subject: [PATCH 023/125] #393

---
 convert/chembl_mysql_to_kg_jsonl.py | 32 ++++++++++++++---------------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/convert/chembl_mysql_to_kg_jsonl.py b/convert/chembl_mysql_to_kg_jsonl.py
index 37e7692b..966289d9 100755
--- a/convert/chembl_mysql_to_kg_jsonl.py
+++ b/convert/chembl_mysql_to_kg_jsonl.py
@@ -435,22 +435,22 @@ def make_node(id: str,
                                                   update_date))
 
     # get molecule-to-disease indications
-
-    sql = '''select md.chembl_id, di.mesh_id from molecule_dictionary as md inner join drug_indication as di on md.molregno = di.molregno'''
-    if test_mode:
-        sql += str_sql_row_limit_test_mode
-    with connection.cursor() as cursor:
-        cursor.execute(sql)
-        results = cursor.fetchall()
-        for (chembl_id, mesh_id) in results:
-            subject_curie_id = CHEMBL_CURIE_BASE_COMPOUND + ':' + chembl_id
-            object_curie_id = kg2_util.CURIE_PREFIX_MESH + ':' + mesh_id
-            predicate_label = kg2_util.EDGE_LABEL_BIOLINK_APPLIED_TO_TREAT
-            edges_output.write(kg2_util.make_edge_biolink(subject_curie_id,
-                                                          object_curie_id,
-                                                          predicate_label,
-                                                          CHEMBL_KB_CURIE_ID,
-                                                          update_date))
+# Removed per #393
+    # sql = '''select md.chembl_id, di.mesh_id from molecule_dictionary as md inner join drug_indication as di on md.molregno = di.molregno'''
+    # if test_mode:
+    #     sql += str_sql_row_limit_test_mode
+    # with connection.cursor() as cursor:
+    #     cursor.execute(sql)
+    #     results = cursor.fetchall()
+    #     for (chembl_id, mesh_id) in results:
+    #         subject_curie_id = CHEMBL_CURIE_BASE_COMPOUND + ':' + chembl_id
+    #         object_curie_id = kg2_util.CURIE_PREFIX_MESH + ':' + mesh_id
+    #         predicate_label = kg2_util.EDGE_LABEL_BIOLINK_APPLIED_TO_TREAT
+    #         edges_output.write(kg2_util.make_edge_biolink(subject_curie_id,
+    #                                                       object_curie_id,
+    #                                                       predicate_label,
+    #                                                       CHEMBL_KB_CURIE_ID,
+    #                                                       update_date))
 
     # get metabolism information
     sql = '''select m1.chembl_id as drug_id,

From 65543c9dc8fd78fc88445d3e744cd1dc8275473d Mon Sep 17 00:00:00 2001
From: ecwood
Date: Tue, 16 Jul 2024 05:17:15 -0700
Subject: [PATCH 024/125] #398 remove debugging info

---
 convert/clinicaltrialskg_tsv_to_kg_jsonl.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/convert/clinicaltrialskg_tsv_to_kg_jsonl.py b/convert/clinicaltrialskg_tsv_to_kg_jsonl.py
index fe76d22a..4fa7ac64 100644
--- a/convert/clinicaltrialskg_tsv_to_kg_jsonl.py
+++ b/convert/clinicaltrialskg_tsv_to_kg_jsonl.py
@@ -71,7 +71,6 @@ def format_date(date_field):
             latest_date = curr_date
     else:
         split_date = dates[0].split('-')
-        print(split_date, date_field)
         year = int(split_date[0])
         month = int(split_date[1])
         day = 1 # most of the time, there's no day
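Because #393 comments the indication query out rather than deleting it, the size of the removal is easy to gauge against a local ChEMBL mirror. A hedged one-off check, not part of the build (the connection parameters are placeholders; it assumes the MySQL chembl database the converter already uses, via the PyMySQL driver pinned in requirements-kg2-build.txt):

import pymysql

# One-off sanity check (not part of the build): count the molecule-to-disease
# indication rows that the commented-out query above would have turned into
# applied_to_treat edges. Host/user values here are placeholders.
connection = pymysql.connect(host='localhost', user='ubuntu', db='chembl')
with connection.cursor() as cursor:
    cursor.execute('''select count(*) from molecule_dictionary as md
                      inner join drug_indication as di
                      on md.molregno = di.molregno''')
    print('edges dropped by #393:', cursor.fetchone()[0])
connection.close()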
"graphic" ]] then @@ -116,6 +152,13 @@ fi cd ~ && ${VENV_DIR}/bin/snakemake --snakefile ${snakefile} ${run_flag} -R Finish -j 16 ${dryrun} ${graphic} +${s3_cp_cmd} ${local_kg2_version_file} s3://${s3_bucket_public}/${kg2_version_file} + +if [[ -f ${trigger_file_is_major_release} ]] +then + rm -f ${trigger_file_is_major_release} +fi + date echo "================ script finished ============================" } diff --git a/master-config.shinc b/master-config.shinc index e5c5e8ce..c6e5de45 100644 --- a/master-config.shinc +++ b/master-config.shinc @@ -29,4 +29,5 @@ knowledge_level_agent_type_mapping_file=${MAPS_CODE_DIR}/knowledge-level-agent-t ont_load_inventory_file=${MAPS_CODE_DIR}/ont-load-inventory${test_suffix}.yaml rtx_config_file=RTXConfiguration-config.json biolink_model_version=4.2.0 -infores_registry_version=0.2.8 \ No newline at end of file +infores_registry_version=0.2.8 +version= \ No newline at end of file diff --git a/process/run-simplify.sh b/process/run-simplify.sh index 2c329430..6a8951b7 100755 --- a/process/run-simplify.sh +++ b/process/run-simplify.sh @@ -18,41 +18,12 @@ date CONFIG_DIR=`dirname "$0"` source ${CONFIG_DIR}/master-config.shinc -trigger_file_is_major_release=${BUILD_DIR}/major-release -trigger_file_is_minor_release=${BUILD_DIR}/minor-release - input_nodes_json=${1:-} input_edges_json=${2:-} output_nodes_json=${3:-} output_edges_json=${4:-} local_version_filename=${5:-"${BUILD_DIR}/kg2-version.txt"} build_flag=${6:-""} -s3_version_filename="kg2-version.txt" - -${s3_cp_cmd} s3://${s3_bucket_public}/${s3_version_filename} ${local_version_filename} -test_flag='' -increment_flag='' -if [[ "${build_flag}" == 'test' ]] -then - test_flag='--test' -else - if [ -e ${trigger_file_is_major_release} ] - then - increment_flag='--increment_major' - else - if [ -e ${trigger_file_is_minor_release} ] - then - increment_flag='--increment_minor' - fi - fi -fi - -if [[ "${increment_flag}" != '' ]] -then - ${VENV_DIR}/bin/python3 ${PROCESS_CODE_DIR}/update_version.py ${increment_flag} ${local_version_filename} -else - echo "*** TEST MODE -- NO INCREMENT ***" -fi # TODO: Inhibits and increase are not in biolink model anymore - Find out what that should be now ${VENV_DIR}/bin/python3 -u ${PROCESS_CODE_DIR}/filter_kg_and_remap_predicates.py ${test_flag} --dropNegated \ @@ -60,12 +31,6 @@ ${VENV_DIR}/bin/python3 -u ${PROCESS_CODE_DIR}/filter_kg_and_remap_predicates.py ${predicate_mapping_file} ${infores_mapping_file} ${curies_to_urls_file} \ ${knowledge_level_agent_type_mapping_file} ${input_nodes_json} ${input_edges_json} \ ${output_nodes_json} ${output_edges_json} ${local_version_filename} -${s3_cp_cmd} ${local_version_filename} s3://${s3_bucket_public}/${s3_version_filename} - -if [[ -f ${trigger_file_is_major_release} ]] -then - rm -f ${trigger_file_is_major_release} -fi date echo "================= finishing run-simplify.sh ==================" From 2796ecd62db58d9b4d36870c70e34ff0e8e59491 Mon Sep 17 00:00:00 2001 From: ecwood Date: Tue, 16 Jul 2024 15:46:50 -0700 Subject: [PATCH 026/125] #140 comment out s3 command for ci --- build/build-kg2-snakemake.sh | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/build/build-kg2-snakemake.sh b/build/build-kg2-snakemake.sh index 5fb7a8bb..4e476038 100755 --- a/build/build-kg2-snakemake.sh +++ b/build/build-kg2-snakemake.sh @@ -152,13 +152,21 @@ fi cd ~ && ${VENV_DIR}/bin/snakemake --snakefile ${snakefile} ${run_flag} -R Finish -j 16 ${dryrun} ${graphic} -${s3_cp_cmd} ${local_kg2_version_file} 
From 1270624a1caeb927b9ded2c2ff8490db056cb3a8 Mon Sep 17 00:00:00 2001
From: ecwood
Date: Tue, 16 Jul 2024 15:59:33 -0700
Subject: [PATCH 027/125] #140 a lot more changes to log file names and tsv output

---
 build/build-kg2-snakemake.sh    |  34 ++++++-----
 build/snakemake-config-var.yaml | 103 ++++++++++++++++----------------
 master-config.shinc             |   2 +-
 3 files changed, 71 insertions(+), 68 deletions(-)

diff --git a/build/build-kg2-snakemake.sh b/build/build-kg2-snakemake.sh
index 4e476038..6d313846 100755
--- a/build/build-kg2-snakemake.sh
+++ b/build/build-kg2-snakemake.sh
@@ -64,19 +64,6 @@ then
     run_flag="-F"
 fi
 
-build_kg2_log_file=${BUILD_DIR}/build-kg2-snakemake${dryrun}${test_suffix}.log
-touch ${build_kg2_log_file}
-if [[ "${ci_flag}" == "ci" ]]
-then
-    trap "cat ${build_kg2_log_file}" EXIT
-fi
-
-function build_kg2 () {
-echo "================= starting build-kg2-snakemake.sh =================="
-date
-
-export PATH=$PATH:${BUILD_DIR}
-
 kg2_version_file="kg2-version.txt"
 local_kg2_version_file="${BUILD_DIR}/${kg2_version_file}"
 trigger_file_is_major_release=${BUILD_DIR}/major-release
@@ -100,7 +87,7 @@ fi
 
 if [[ "${ci_flag}" == "ci" ]]
 then
-    sed -i "\@^version=@cversion=KG2.CI" ${CODE_DIR}/master-config.shinc
+    sed -i "\@^kg2_version=@ckg2_version=KG2.CI" ${CODE_DIR}/master-config.shinc
 else
     ${s3_cp_cmd} s3://${s3_bucket_public}/${kg2_version_file} ${local_kg2_version_file}
     if [[ "${increment_flag}" != '' ]]
@@ -109,10 +96,25 @@ else
     else
         echo "*** TEST MODE -- NO INCREMENT ***"
     fi
-    kg2_version=`cat ${local_kg2_version_file}`
-    sed -i "\@^version=@cversion=${kg2_version}" ${CODE_DIR}/master-config.shinc
+    curr_kg2_version=`cat ${local_kg2_version_file}`
+    sed -i "\@^kg2_version=@ckg2_version=${curr_kg2_version}" ${CODE_DIR}/master-config.shinc
 fi
 
+source ${config_dir}/master-config.shinc
+
+build_kg2_log_file=${BUILD_DIR}/build-kg2-snakemake-${kg2_version}${dryrun}${test_suffix}.log
+touch ${build_kg2_log_file}
+if [[ "${ci_flag}" == "ci" ]]
+then
+    trap "cat ${build_kg2_log_file}" EXIT
+fi
+
+function build_kg2 () {
+echo "================= starting build-kg2-snakemake.sh =================="
+date
+
+export PATH=$PATH:${BUILD_DIR}
+
 snakemake_config_file=${BUILD_CODE_DIR}/snakemake-config.yaml
 snakefile=${BUILD_CODE_DIR}/Snakefile
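The config file below leans heavily on ${...} references (for example, version_suffix: -${kg2_version}) that generate_snakemake_config_file.py presumably expands before Snakemake sees them. A sketch of that expansion under the assumption that it is plain repeated textual substitution (the real generator may differ):

import string

# Sketch (assumption: the config generator expands ${...} references by
# repeated substitution until the text stops changing; the real
# generate_snakemake_config_file.py may work differently).
def expand(value: str, variables: dict) -> str:
    while True:
        new = string.Template(value).safe_substitute(variables)
        if new == value:
            return new
        value = new

variables = {'kg2_version': '2.9.0', 'BUILD_DIR': '/home/ubuntu/kg2-build',
             'test_suffix': '', 'merge_base': 'merge_graphs'}
variables['version_suffix'] = expand('-${kg2_version}', variables)
print(expand('${BUILD_DIR}/${merge_base}${version_suffix}${test_suffix}.log', variables))
# -> /home/ubuntu/kg2-build/merge_graphs-2.9.0.log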
${BUILD_DIR}/${umls_extraction_base}${test_suffix}.log +umls_extraction_log: ${BUILD_DIR}/${umls_extraction_base}${version_suffix}${test_suffix}.log umls_extract_file: ${BUILD_DIR}/umls.jsonl umls_conversion_script: ${CONVERT_CODE_DIR}/${umls_conversion_base}.py -umls_conversion_log: ${BUILD_DIR}/${umls_conversion_base}${test_suffix}.log +umls_conversion_log: ${BUILD_DIR}/${umls_conversion_base}${version_suffix}${test_suffix}.log umls_name_heirarchy: ${MAPS_CODE_DIR}/umls-name-heirarchy.yaml umls_tui_map: ${MAPS_CODE_DIR}/tui_combo_mappings.json umls_output_nodes_file: ${BUILD_DIR}/${umls_output_base}${nodes_suffix}${test_suffix}.jsonl @@ -22,7 +23,7 @@ umls_output_edges_file: ${BUILD_DIR}/${umls_output_base}${edges_suffix}${test_su ont_conversion_base: build-multi-ont-kg ont_output_base: kg2-ont ont_conversion_script: ${CONVERT_CODE_DIR}/${ont_conversion_base}.sh -ont_conversion_log: ${BUILD_DIR}/${ont_conversion_base}${test_suffix}.log +ont_conversion_log: ${BUILD_DIR}/${ont_conversion_base}${version_suffix}${test_suffix}.log ont_output_nodes_file: ${BUILD_DIR}/${ont_output_base}${nodes_suffix}${test_suffix}.jsonl ont_output_edges_file: ${BUILD_DIR}/${ont_output_base}${edges_suffix}${test_suffix}.jsonl @@ -30,12 +31,12 @@ semmeddb_extraction_base: extract-semmeddb semmeddb_conversion_base: semmeddb_tuplelist_json_to_kg_jsonl semmeddb_output_base: kg2-semmeddb semmeddb_extraction_script: ${EXTRACT_CODE_DIR}/${semmeddb_extraction_base}.sh -semmeddb_extraction_log: ${BUILD_DIR}/${semmeddb_extraction_base}${test_suffix}.log +semmeddb_extraction_log: ${BUILD_DIR}/${semmeddb_extraction_base}${version_suffix}${test_suffix}.log semmeddb_tuplelist_file: ${BUILD_DIR}/semmeddb-tuplelist.jsonl semmeddb_exclusion_file: ${BUILD_DIR}/semmed-exclude-list.yaml semmeddb_version_file: ${BUILD_DIR}/semmeddb-version.txt semmeddb_conversion_script: ${CONVERT_CODE_DIR}/${semmeddb_conversion_base}.py -semmeddb_conversion_log: ${BUILD_DIR}/${semmeddb_conversion_base}${test_suffix}.log +semmeddb_conversion_log: ${BUILD_DIR}/${semmeddb_conversion_base}${version_suffix}${test_suffix}.log semmeddb_output_nodes_file: ${BUILD_DIR}/${semmeddb_output_base}${nodes_suffix}${test_suffix}.jsonl semmeddb_output_edges_file: ${BUILD_DIR}/${semmeddb_output_base}${edges_suffix}${test_suffix}.jsonl @@ -43,10 +44,10 @@ uniprotkb_extraction_base: extract-uniprotkb uniprotkb_conversion_base: uniprotkb_dat_to_kg_jsonl uniprotkb_output_base: kg2-uniprotkb uniprotkb_extraction_script: ${EXTRACT_CODE_DIR}/${uniprotkb_extraction_base}.sh -uniprotkb_extraction_log: ${BUILD_DIR}/${uniprotkb_extraction_base}${test_suffix}.log +uniprotkb_extraction_log: ${BUILD_DIR}/${uniprotkb_extraction_base}${version_suffix}${test_suffix}.log uniprotkb_dat_file: ${BUILD_DIR}/uniprotkb/uniprot_sprot.dat uniprotkb_conversion_script: ${CONVERT_CODE_DIR}/${uniprotkb_conversion_base}.py -uniprotkb_conversion_log: ${BUILD_DIR}/${uniprotkb_conversion_base}${test_suffix}.log +uniprotkb_conversion_log: ${BUILD_DIR}/${uniprotkb_conversion_base}${version_suffix}${test_suffix}.log uniprotkb_output_nodes_file: ${BUILD_DIR}/${uniprotkb_output_base}${nodes_suffix}${test_suffix}.jsonl uniprotkb_output_edges_file: ${BUILD_DIR}/${uniprotkb_output_base}${edges_suffix}${test_suffix}.jsonl @@ -54,10 +55,10 @@ ensembl_extraction_base: extract-ensembl ensembl_conversion_base: ensembl_json_to_kg_jsonl ensembl_output_base: kg2-ensembl ensembl_extraction_script: ${EXTRACT_CODE_DIR}/${ensembl_extraction_base}.sh -ensembl_extraction_log: 
${BUILD_DIR}/${ensembl_extraction_base}${test_suffix}.log +ensembl_extraction_log: ${BUILD_DIR}/${ensembl_extraction_base}${version_suffix}${test_suffix}.log ensembl_source_json_file: ${BUILD_DIR}/ensembl/ensembl_genes_homo_sapiens.json ensembl_conversion_script: ${CONVERT_CODE_DIR}/${ensembl_conversion_base}.py -ensembl_conversion_log: ${BUILD_DIR}/${ensembl_conversion_base}${test_suffix}.log +ensembl_conversion_log: ${BUILD_DIR}/${ensembl_conversion_base}${version_suffix}${test_suffix}.log ensembl_output_nodes_file: ${BUILD_DIR}/${ensembl_output_base}${nodes_suffix}${test_suffix}.jsonl ensembl_output_edges_file: ${BUILD_DIR}/${ensembl_output_base}${edges_suffix}${test_suffix}.jsonl @@ -65,10 +66,10 @@ unichem_extraction_base: extract-unichem unichem_conversion_base: unichem_tsv_to_kg_jsonl unichem_output_base: kg2-unichem unichem_extraction_script: ${EXTRACT_CODE_DIR}/${unichem_extraction_base}.sh -unichem_extraction_log: ${BUILD_DIR}/${unichem_extraction_base}${test_suffix}.log +unichem_extraction_log: ${BUILD_DIR}/${unichem_extraction_base}${version_suffix}${test_suffix}.log unichem_output_tsv_file: ${BUILD_DIR}/unichem/unichem-mappings.tsv unichem_conversion_script: ${CONVERT_CODE_DIR}/${unichem_conversion_base}.py -unichem_conversion_log: ${BUILD_DIR}/${unichem_conversion_base}${test_suffix}.log +unichem_conversion_log: ${BUILD_DIR}/${unichem_conversion_base}${version_suffix}${test_suffix}.log unichem_output_nodes_file: ${BUILD_DIR}/${unichem_output_base}${nodes_suffix}${test_suffix}.jsonl unichem_output_edges_file: ${BUILD_DIR}/${unichem_output_base}${edges_suffix}${test_suffix}.jsonl @@ -76,10 +77,10 @@ chembl_extraction_base: extract-chembl chembl_conversion_base: chembl_mysql_to_kg_jsonl chembl_output_base: kg2-chembl chembl_extraction_script: ${EXTRACT_CODE_DIR}/${chembl_extraction_base}.sh -chembl_extraction_log: ${BUILD_DIR}/${chembl_extraction_base}${test_suffix}.log +chembl_extraction_log: ${BUILD_DIR}/${chembl_extraction_base}${version_suffix}${test_suffix}.log chembl_mysql_dbname: chembl chembl_conversion_script: ${CONVERT_CODE_DIR}/${chembl_conversion_base}.py -chembl_conversion_log: ${BUILD_DIR}/${chembl_conversion_base}${test_suffix}.log +chembl_conversion_log: ${BUILD_DIR}/${chembl_conversion_base}${version_suffix}${test_suffix}.log chembl_output_nodes_file: ${BUILD_DIR}/${chembl_output_base}${nodes_suffix}${test_suffix}.jsonl chembl_output_edges_file: ${BUILD_DIR}/${chembl_output_base}${edges_suffix}${test_suffix}.jsonl @@ -87,10 +88,10 @@ ncbigene_extraction_base: extract-ncbigene ncbigene_conversion_base: ncbigene_tsv_to_kg_jsonl ncbigene_output_base: kg2-ncbigene ncbigene_extraction_script: ${EXTRACT_CODE_DIR}/${ncbigene_extraction_base}.sh -ncbigene_extraction_log: ${BUILD_DIR}/${ncbigene_extraction_base}${test_suffix}.log +ncbigene_extraction_log: ${BUILD_DIR}/${ncbigene_extraction_base}${version_suffix}${test_suffix}.log ncbigene_tsv_file: ${BUILD_DIR}/ncbigene/Homo_sapiens_gene_info.tsv ncbigene_conversion_script: ${CONVERT_CODE_DIR}/${ncbigene_conversion_base}.py -ncbigene_conversion_log: ${BUILD_DIR}/${ncbigene_conversion_base}${test_suffix}.log +ncbigene_conversion_log: ${BUILD_DIR}/${ncbigene_conversion_base}${version_suffix}${test_suffix}.log ncbigene_output_nodes_file: ${BUILD_DIR}/${ncbigene_output_base}${nodes_suffix}${test_suffix}.jsonl ncbigene_output_edges_file: ${BUILD_DIR}/${ncbigene_output_base}${edges_suffix}${test_suffix}.jsonl @@ -98,10 +99,10 @@ dgidb_extraction_base: extract-dgidb dgidb_conversion_base: dgidb_tsv_to_kg_jsonl 
dgidb_output_base: kg2-dgidb dgidb_extraction_script: ${EXTRACT_CODE_DIR}/${dgidb_extraction_base}.sh -dgidb_extraction_log: ${BUILD_DIR}/${dgidb_extraction_base}${test_suffix}.log +dgidb_extraction_log: ${BUILD_DIR}/${dgidb_extraction_base}${version_suffix}${test_suffix}.log dgidb_dir: ${BUILD_DIR}/dgidb dgidb_conversion_script: ${CONVERT_CODE_DIR}/${dgidb_conversion_base}.py -dgidb_conversion_log: ${BUILD_DIR}/${dgidb_conversion_base}${test_suffix}.log +dgidb_conversion_log: ${BUILD_DIR}/${dgidb_conversion_base}${version_suffix}${test_suffix}.log dgidb_output_nodes_file: ${BUILD_DIR}/${dgidb_output_base}${nodes_suffix}${test_suffix}.jsonl dgidb_output_edges_file: ${BUILD_DIR}/${dgidb_output_base}${edges_suffix}${test_suffix}.jsonl @@ -109,11 +110,11 @@ repodb_extraction_base: extract-repodb repodb_conversion_base: repodb_csv_to_kg_jsonl repodb_output_base: kg2-repodb repodb_extraction_script: ${EXTRACT_CODE_DIR}/${repodb_extraction_base}.sh -repodb_extraction_log: ${BUILD_DIR}/${repodb_extraction_base}${test_suffix}.log +repodb_extraction_log: ${BUILD_DIR}/${repodb_extraction_base}${version_suffix}${test_suffix}.log repodb_dir: ${BUILD_DIR}/repodb repodb_input_file: ${repodb_dir}/repodb.csv repodb_conversion_script: ${CONVERT_CODE_DIR}/${repodb_conversion_base}.py -repodb_conversion_log: ${BUILD_DIR}/${repodb_conversion_base}${test_suffix}.log +repodb_conversion_log: ${BUILD_DIR}/${repodb_conversion_base}${version_suffix}${test_suffix}.log repodb_output_nodes_file: ${BUILD_DIR}/${repodb_output_base}${nodes_suffix}${test_suffix}.jsonl repodb_output_edges_file: ${BUILD_DIR}/${repodb_output_base}${edges_suffix}${test_suffix}.jsonl @@ -121,10 +122,10 @@ drugbank_extraction_base: extract-drugbank drugbank_conversion_base: drugbank_xml_to_kg_jsonl drugbank_output_base: kg2-drugbank drugbank_extraction_script: ${EXTRACT_CODE_DIR}/${drugbank_extraction_base}.sh -drugbank_extraction_log: ${BUILD_DIR}/${drugbank_extraction_base}${test_suffix}.log +drugbank_extraction_log: ${BUILD_DIR}/${drugbank_extraction_base}${version_suffix}${test_suffix}.log drugbank_input_file: ${BUILD_DIR}/drugbank.xml drugbank_conversion_script: ${CONVERT_CODE_DIR}/${drugbank_conversion_base}.py -drugbank_conversion_log: ${BUILD_DIR}/${drugbank_conversion_base}${test_suffix}.log +drugbank_conversion_log: ${BUILD_DIR}/${drugbank_conversion_base}${version_suffix}${test_suffix}.log drugbank_output_nodes_file: ${BUILD_DIR}/${drugbank_output_base}${nodes_suffix}${test_suffix}.jsonl drugbank_output_edges_file: ${BUILD_DIR}/${drugbank_output_base}${edges_suffix}${test_suffix}.jsonl @@ -132,11 +133,11 @@ smpdb_extraction_base: extract-smpdb smpdb_conversion_base: smpdb_csv_to_kg_jsonl smpdb_output_base: kg2-smpdb smpdb_extraction_script: ${EXTRACT_CODE_DIR}/${smpdb_extraction_base}.sh -smpdb_extraction_log: ${BUILD_DIR}/${smpdb_extraction_base}${test_suffix}.log +smpdb_extraction_log: ${BUILD_DIR}/${smpdb_extraction_base}${version_suffix}${test_suffix}.log smpdb_dir: ${BUILD_DIR}/smpdb smpdb_input_file: ${smpdb_dir}/pathbank_pathways.csv smpdb_conversion_script: ${CONVERT_CODE_DIR}/${smpdb_conversion_base}.py -smpdb_conversion_log: ${BUILD_DIR}/${smpdb_conversion_base}${test_suffix}.log +smpdb_conversion_log: ${BUILD_DIR}/${smpdb_conversion_base}${version_suffix}${test_suffix}.log smpdb_output_nodes_file: ${BUILD_DIR}/${smpdb_output_base}${nodes_suffix}${test_suffix}.jsonl smpdb_output_edges_file: ${BUILD_DIR}/${smpdb_output_base}${edges_suffix}${test_suffix}.jsonl @@ -144,10 +145,10 @@ hmdb_extraction_base: extract-hmdb 
hmdb_conversion_base: hmdb_xml_to_kg_jsonl hmdb_output_base: kg2-hmdb hmdb_extraction_script: ${EXTRACT_CODE_DIR}/${hmdb_extraction_base}.sh -hmdb_extraction_log: ${BUILD_DIR}/${hmdb_extraction_base}${test_suffix}.log +hmdb_extraction_log: ${BUILD_DIR}/${hmdb_extraction_base}${version_suffix}${test_suffix}.log hmdb_input_file: ${BUILD_DIR}/hmdb_metabolites.xml hmdb_conversion_script: ${CONVERT_CODE_DIR}/${hmdb_conversion_base}.py -hmdb_conversion_log: ${BUILD_DIR}/${hmdb_conversion_base}${test_suffix}.log +hmdb_conversion_log: ${BUILD_DIR}/${hmdb_conversion_base}${version_suffix}${test_suffix}.log hmdb_output_nodes_file: ${BUILD_DIR}/${hmdb_output_base}${nodes_suffix}${test_suffix}.jsonl hmdb_output_edges_file: ${BUILD_DIR}/${hmdb_output_base}${edges_suffix}${test_suffix}.jsonl @@ -155,10 +156,10 @@ go_annotations_extraction_base: extract-go-annotations go_annotations_conversion_base: go_gpa_to_kg_jsonl go_annotations_output_base: kg2-go-annotations go_annotations_extraction_script: ${EXTRACT_CODE_DIR}/${go_annotations_extraction_base}.sh -go_annotations_extraction_log: ${BUILD_DIR}/${go_annotations_extraction_base}${test_suffix}.log +go_annotations_extraction_log: ${BUILD_DIR}/${go_annotations_extraction_base}${version_suffix}${test_suffix}.log go_annotations_input_file: ${BUILD_DIR}/goa_human.gpa go_annotations_conversion_script: ${CONVERT_CODE_DIR}/${go_annotations_conversion_base}.py -go_annotations_conversion_log: ${BUILD_DIR}/${go_annotations_conversion_base}${test_suffix}.log +go_annotations_conversion_log: ${BUILD_DIR}/${go_annotations_conversion_base}${version_suffix}${test_suffix}.log go_annotations_output_nodes_file: ${BUILD_DIR}/${go_annotations_output_base}${nodes_suffix}${test_suffix}.jsonl go_annotations_output_edges_file: ${BUILD_DIR}/${go_annotations_output_base}${edges_suffix}${test_suffix}.jsonl @@ -166,10 +167,10 @@ reactome_extraction_base: extract-reactome reactome_conversion_base: reactome_mysql_to_kg_jsonl reactome_output_base: kg2-reactome reactome_extraction_script: ${EXTRACT_CODE_DIR}/${reactome_extraction_base}.sh -reactome_extraction_log: ${BUILD_DIR}/${reactome_extraction_base}${test_suffix}.log +reactome_extraction_log: ${BUILD_DIR}/${reactome_extraction_base}${version_suffix}${test_suffix}.log reactome_mysql_dbname: reactome reactome_conversion_script: ${CONVERT_CODE_DIR}/${reactome_conversion_base}.py -reactome_conversion_log: ${BUILD_DIR}/${reactome_conversion_base}${test_suffix}.log +reactome_conversion_log: ${BUILD_DIR}/${reactome_conversion_base}${version_suffix}${test_suffix}.log reactome_output_nodes_file: ${BUILD_DIR}/${reactome_output_base}${nodes_suffix}${test_suffix}.jsonl reactome_output_edges_file: ${BUILD_DIR}/${reactome_output_base}${edges_suffix}${test_suffix}.jsonl @@ -177,10 +178,10 @@ mirbase_extraction_base: extract-mirbase mirbase_conversion_base: mirbase_dat_to_kg_jsonl mirbase_output_base: kg2-mirbase mirbase_extraction_script: ${EXTRACT_CODE_DIR}/${mirbase_extraction_base}.sh -mirbase_extraction_log: ${BUILD_DIR}/${mirbase_extraction_base}${test_suffix}.log +mirbase_extraction_log: ${BUILD_DIR}/${mirbase_extraction_base}${version_suffix}${test_suffix}.log mirbase_input_file: ${BUILD_DIR}/miRNA.dat mirbase_conversion_script: ${CONVERT_CODE_DIR}/${mirbase_conversion_base}.py -mirbase_conversion_log: ${BUILD_DIR}/${mirbase_conversion_base}${test_suffix}.log +mirbase_conversion_log: ${BUILD_DIR}/${mirbase_conversion_base}${version_suffix}${test_suffix}.log mirbase_output_nodes_file: 
${BUILD_DIR}/${mirbase_output_base}${nodes_suffix}${test_suffix}.jsonl mirbase_output_edges_file: ${BUILD_DIR}/${mirbase_output_base}${edges_suffix}${test_suffix}.jsonl @@ -188,10 +189,10 @@ jensenlab_extraction_base: extract-jensenlab jensenlab_conversion_base: jensenlab_tsv_to_kg_jsonl jensenlab_output_base: kg2-jensenlab jensenlab_extraction_script: ${EXTRACT_CODE_DIR}/${jensenlab_extraction_base}.sh -jensenlab_extraction_log: ${BUILD_DIR}/${jensenlab_extraction_base}${test_suffix}.log +jensenlab_extraction_log: ${BUILD_DIR}/${jensenlab_extraction_base}${version_suffix}${test_suffix}.log jensenlab_dir: ${BUILD_DIR}/jensenlab jensenlab_conversion_script: ${CONVERT_CODE_DIR}/${jensenlab_conversion_base}.py -jensenlab_conversion_log: ${BUILD_DIR}/${jensenlab_conversion_base}${test_suffix}.log +jensenlab_conversion_log: ${BUILD_DIR}/${jensenlab_conversion_base}${version_suffix}${test_suffix}.log jensenlab_output_nodes_file: ${BUILD_DIR}/${jensenlab_output_base}${nodes_suffix}${test_suffix}.jsonl jensenlab_output_edges_file: ${BUILD_DIR}/${jensenlab_output_base}${edges_suffix}${test_suffix}.jsonl @@ -199,11 +200,11 @@ drugcentral_extraction_base: extract-drugcentral drugcentral_conversion_base: drugcentral_json_to_kg_jsonl drugcentral_output_base: kg2-drugcentral drugcentral_extraction_script: ${EXTRACT_CODE_DIR}/${drugcentral_extraction_base}.sh -drugcentral_extraction_log: ${BUILD_DIR}/${drugcentral_extraction_base}${test_suffix}.log +drugcentral_extraction_log: ${BUILD_DIR}/${drugcentral_extraction_base}${version_suffix}${test_suffix}.log drugcentral_dir: ${BUILD_DIR}/drugcentral drugcentral_input_file: ${drugcentral_dir}/drugcentral_psql_json.json drugcentral_conversion_script: ${CONVERT_CODE_DIR}/${drugcentral_conversion_base}.py -drugcentral_conversion_log: ${BUILD_DIR}/${drugcentral_conversion_base}${test_suffix}.log +drugcentral_conversion_log: ${BUILD_DIR}/${drugcentral_conversion_base}${version_suffix}${test_suffix}.log drugcentral_output_nodes_file: ${BUILD_DIR}/${drugcentral_output_base}${nodes_suffix}${test_suffix}.jsonl drugcentral_output_edges_file: ${BUILD_DIR}/${drugcentral_output_base}${edges_suffix}${test_suffix}.jsonl @@ -211,10 +212,10 @@ intact_extraction_base: extract-intact intact_conversion_base: intact_tsv_to_kg_jsonl intact_output_base: kg2-intact intact_extraction_script: ${EXTRACT_CODE_DIR}/${intact_extraction_base}.sh -intact_extraction_log: ${BUILD_DIR}/${intact_extraction_base}${test_suffix}.log +intact_extraction_log: ${BUILD_DIR}/${intact_extraction_base}${version_suffix}${test_suffix}.log intact_input_file: ${BUILD_DIR}/intact.txt intact_conversion_script: ${CONVERT_CODE_DIR}/${intact_conversion_base}.py -intact_conversion_log: ${BUILD_DIR}/${intact_conversion_base}${test_suffix}.log +intact_conversion_log: ${BUILD_DIR}/${intact_conversion_base}${version_suffix}${test_suffix}.log intact_output_nodes_file: ${BUILD_DIR}/${intact_output_base}${nodes_suffix}${test_suffix}.jsonl intact_output_edges_file: ${BUILD_DIR}/${intact_output_base}${edges_suffix}${test_suffix}.jsonl @@ -222,10 +223,10 @@ disgenet_extraction_base: extract-disgenet disgenet_conversion_base: disgenet_tsv_to_kg_jsonl disgenet_output_base: kg2-disgenet disgenet_extraction_script: ${EXTRACT_CODE_DIR}/${disgenet_extraction_base}.sh -disgenet_extraction_log: ${BUILD_DIR}/${disgenet_extraction_base}${test_suffix}.log +disgenet_extraction_log: ${BUILD_DIR}/${disgenet_extraction_base}${version_suffix}${test_suffix}.log disgenet_input_file: ${BUILD_DIR}/all_gene_disease_pmid_associations.tsv 
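# (Annotation: a worked expansion of the ${version_suffix} pattern that this
# patch threads into every *_log path above and below. The concrete values are
# assumptions for illustration only -- e.g. BUILD_DIR=~/kg2-build,
# version_suffix=-KG2.9.0, test_suffix=-test; none of them appear in this diff.)
#
#   disgenet_conversion_log: ${BUILD_DIR}/${disgenet_conversion_base}${version_suffix}${test_suffix}.log
#   => ~/kg2-build/disgenet_tsv_to_kg_jsonl-KG2.9.0-test.log
#
# In a full (non-test) run the test suffix would be empty, so the same template
# would yield ~/kg2-build/disgenet_tsv_to_kg_jsonl-KG2.9.0.log, keeping one log
# per source, per stage, per KG2 version.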
disgenet_conversion_script: ${CONVERT_CODE_DIR}/${disgenet_conversion_base}.py
-disgenet_conversion_log: ${BUILD_DIR}/${disgenet_conversion_base}${test_suffix}.log
+disgenet_conversion_log: ${BUILD_DIR}/${disgenet_conversion_base}${version_suffix}${test_suffix}.log
disgenet_output_nodes_file: ${BUILD_DIR}/${disgenet_output_base}${nodes_suffix}${test_suffix}.jsonl
disgenet_output_edges_file: ${BUILD_DIR}/${disgenet_output_base}${edges_suffix}${test_suffix}.jsonl
@@ -233,10 +234,10 @@ kegg_extraction_base: extract-kegg
kegg_conversion_base: kegg_jsonl_to_kg_jsonl
kegg_output_base: kg2-kegg
kegg_extraction_script: ${EXTRACT_CODE_DIR}/${kegg_extraction_base}.sh
-kegg_extraction_log: ${BUILD_DIR}/${kegg_extraction_base}${test_suffix}.log
+kegg_extraction_log: ${BUILD_DIR}/${kegg_extraction_base}${version_suffix}${test_suffix}.log
kegg_input_file: ${BUILD_DIR}/kegg.jsonl
kegg_conversion_script: ${CONVERT_CODE_DIR}/${kegg_conversion_base}.py
-kegg_conversion_log: ${BUILD_DIR}/${kegg_conversion_base}${test_suffix}.log
+kegg_conversion_log: ${BUILD_DIR}/${kegg_conversion_base}${version_suffix}${test_suffix}.log
kegg_output_nodes_file: ${BUILD_DIR}/${kegg_output_base}${nodes_suffix}${test_suffix}.jsonl
kegg_output_edges_file: ${BUILD_DIR}/${kegg_output_base}${edges_suffix}${test_suffix}.jsonl
@@ -244,17 +245,17 @@ clinicaltrialskg_extraction_base: extract-clinicaltrialskg
clinicaltrialskg_conversion_base: clinicaltrialskg_tsv_to_kg_jsonl
clinicaltrialskg_output_base: kg2-clinicaltrialskg
clinicaltrialskg_extraction_script: ${EXTRACT_CODE_DIR}/${clinicaltrialskg_extraction_base}.sh
-clinicaltrialskg_extraction_log: ${BUILD_DIR}/${clinicaltrialskg_extraction_base}${test_suffix}.log
+clinicaltrialskg_extraction_log: ${BUILD_DIR}/${clinicaltrialskg_extraction_base}${version_suffix}${test_suffix}.log
clinicaltrialskg_input_file: ${BUILD_DIR}/clinicaltrialskg-edges.tsv
clinicaltrialskg_conversion_script: ${CONVERT_CODE_DIR}/${clinicaltrialskg_conversion_base}.py
-clinicaltrialskg_conversion_log: ${BUILD_DIR}/${clinicaltrialskg_conversion_base}${test_suffix}.log
+clinicaltrialskg_conversion_log: ${BUILD_DIR}/${clinicaltrialskg_conversion_base}${version_suffix}${test_suffix}.log
clinicaltrialskg_output_nodes_file: ${BUILD_DIR}/${clinicaltrialskg_output_base}${nodes_suffix}${test_suffix}.jsonl
clinicaltrialskg_output_edges_file: ${BUILD_DIR}/${clinicaltrialskg_output_base}${edges_suffix}${test_suffix}.jsonl
merge_base: merge_graphs
merge_script: ${PROCESS_CODE_DIR}/${merge_base}.py
merged_output_base: kg2-merged
-merge_log: ${BUILD_DIR}/${merge_base}${test_suffix}.log
+merge_log: ${BUILD_DIR}/${merge_base}${version_suffix}${test_suffix}.log
merged_output_nodes_file: ${BUILD_DIR}/${merged_output_base}${nodes_suffix}${test_suffix}.jsonl
merged_output_edges_file: ${BUILD_DIR}/${merged_output_base}${edges_suffix}${test_suffix}.jsonl
output_file_orphan_edges: ${BUILD_DIR}/kg2-orphan${edges_suffix}${test_suffix}.jsonl
@@ -262,31 +263,31 @@ output_file_orphan_edges: ${BUILD_DIR}/kg2-orphan${edges_suffix}${test_suffix}.j
simplify_base: run-simplify
simplified_output_base: kg2-simplified
simplify_script: ${PROCESS_CODE_DIR}/${simplify_base}.sh
-simplify_log: ${BUILD_DIR}/${simplify_base}${test_suffix}.log
+simplify_log: ${BUILD_DIR}/${simplify_base}${version_suffix}${test_suffix}.log
simplified_output_nodes_file: ${BUILD_DIR}/${simplified_output_base}${nodes_suffix}${test_suffix}.jsonl
simplified_output_edges_file: ${BUILD_DIR}/${simplified_output_base}${edges_suffix}${test_suffix}.jsonl
report_base: 
report_stats_on_kg_jsonl report_script: ${PROCESS_CODE_DIR}/${report_base}.py -report_log: ${BUILD_DIR}/${report_base}${test_suffix}.log +report_log: ${BUILD_DIR}/${report_base}${version_suffix}${test_suffix}.log report_file: ${BUILD_DIR}/kg2-report${test_suffix}.json -simplified_report_log: ${BUILD_DIR}/${report_base}-simplified${test_suffix}.log +simplified_report_log: ${BUILD_DIR}/${report_base}-simplified${version_suffix}${test_suffix}.log simplified_report_file_base: kg2-simplified-report${test_suffix}.json simplified_report_file: ${BUILD_DIR}/${simplified_report_file_base} slim_base: slim_kg2 slim_output_base: kg2-slim slim_script: ${PROCESS_CODE_DIR}/${slim_base}.py -slim_log: ${BUILD_DIR}/${slim_base}${test_suffix}.log +slim_log: ${BUILD_DIR}/${slim_base}${version_suffix}${test_suffix}.log slim_output_nodes_file: ${BUILD_DIR}/${slim_output_base}${nodes_suffix}${test_suffix}.jsonl slim_output_edges_file: ${BUILD_DIR}/${slim_output_base}${edges_suffix}${test_suffix}.jsonl tsv_base: kg_json_to_tsv tsv_script: ${PROCESS_CODE_DIR}/${tsv_base}.py -tsv_log: ${BUILD_DIR}/${tsv_base}${test_suffix}.log +tsv_log: ${BUILD_DIR}/${tsv_base}${version_suffix}${test_suffix}.log kg2_tsv_dir: ${BUILD_DIR}/TSV -kg2_tsv_tarball: ${BUILD_DIR}/kg2-tsv-for-neo4j${test_suffix}.tar.gz +kg2_tsv_tarball: ${BUILD_DIR}/kg2-tsv-for-neo4j${version_suffix}${test_suffix}.tar.gz tsv_placeholder: ${BUILD_DIR}/tsv_placeholder.empty finish_script: ${BUILD_CODE_DIR}/finish-snakemake.sh diff --git a/master-config.shinc b/master-config.shinc index c6e5de45..97fb5956 100644 --- a/master-config.shinc +++ b/master-config.shinc @@ -30,4 +30,4 @@ ont_load_inventory_file=${MAPS_CODE_DIR}/ont-load-inventory${test_suffix}.yaml rtx_config_file=RTXConfiguration-config.json biolink_model_version=4.2.0 infores_registry_version=0.2.8 -version= \ No newline at end of file +kg2_version= \ No newline at end of file From f0aee45706b445b5561777181d1df4b406c83899 Mon Sep 17 00:00:00 2001 From: ecwood Date: Wed, 17 Jul 2024 14:58:01 -0700 Subject: [PATCH 028/125] #140 on the neo4j side --- neo4j/tsv-to-neo4j.sh | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/neo4j/tsv-to-neo4j.sh b/neo4j/tsv-to-neo4j.sh index 4371b910..2fd03361 100755 --- a/neo4j/tsv-to-neo4j.sh +++ b/neo4j/tsv-to-neo4j.sh @@ -53,8 +53,13 @@ rm -f ${tsv_tarball} rm -r -f ${tsv_dir} mkdir -p ${tsv_dir} +# get the latest KG2 version +kg2_version_file="kg2-version.txt" +${s3_cp_cmd} s3://${s3_bucket}/${kg2_version_file} ${BUILD_DIR}/${kg2_version_file} +kg2_version=`cat ${BUILD_DIR}/${kg2_version_file}` + # download the latest TSV files from the S3 Bucket -${s3_cp_cmd} s3://${s3_bucket}/kg2-tsv-for-neo4j${test_arg}.tar.gz ${tsv_tarball} +${s3_cp_cmd} s3://${s3_bucket}/kg2-tsv-for-neo4j-${kg2_version}${test_arg}.tar.gz ${tsv_tarball} # unpack the TSV tarball tar -xvzf ${tsv_tarball} -C ${tsv_dir} From 07c854959da9a5fd6fcf2e299ee0354a1535b4ce Mon Sep 17 00:00:00 2001 From: ecwood Date: Wed, 17 Jul 2024 15:01:34 -0700 Subject: [PATCH 029/125] #140 adding the name to the name of other build artifacts as well --- build/snakemake-config-var.yaml | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/build/snakemake-config-var.yaml b/build/snakemake-config-var.yaml index 72164364..c811da79 100644 --- a/build/snakemake-config-var.yaml +++ b/build/snakemake-config-var.yaml @@ -256,32 +256,32 @@ merge_base: merge_graphs merge_script: ${PROCESS_CODE_DIR}/${merge_base}.py merged_output_base: kg2-merged merge_log: 
${BUILD_DIR}/${merge_base}${version_suffix}${test_suffix}.log -merged_output_nodes_file: ${BUILD_DIR}/${merged_output_base}${nodes_suffix}${test_suffix}.jsonl -merged_output_edges_file: ${BUILD_DIR}/${merged_output_base}${edges_suffix}${test_suffix}.jsonl -output_file_orphan_edges: ${BUILD_DIR}/kg2-orphan${edges_suffix}${test_suffix}.jsonl +merged_output_nodes_file: ${BUILD_DIR}/${merged_output_base}${version_suffix}${nodes_suffix}${test_suffix}.jsonl +merged_output_edges_file: ${BUILD_DIR}/${merged_output_base}${version_suffix}${edges_suffix}${test_suffix}.jsonl +output_file_orphan_edges: ${BUILD_DIR}/kg2-orphan${edges_suffix}${version_suffix}${test_suffix}.jsonl simplify_base: run-simplify simplified_output_base: kg2-simplified simplify_script: ${PROCESS_CODE_DIR}/${simplify_base}.sh simplify_log: ${BUILD_DIR}/${simplify_base}${version_suffix}${test_suffix}.log -simplified_output_nodes_file: ${BUILD_DIR}/${simplified_output_base}${nodes_suffix}${test_suffix}.jsonl -simplified_output_edges_file: ${BUILD_DIR}/${simplified_output_base}${edges_suffix}${test_suffix}.jsonl +simplified_output_nodes_file: ${BUILD_DIR}/${simplified_output_base}${version_suffix}${nodes_suffix}${test_suffix}.jsonl +simplified_output_edges_file: ${BUILD_DIR}/${simplified_output_base}${version_suffix}${edges_suffix}${test_suffix}.jsonl report_base: report_stats_on_kg_jsonl report_script: ${PROCESS_CODE_DIR}/${report_base}.py report_log: ${BUILD_DIR}/${report_base}${version_suffix}${test_suffix}.log -report_file: ${BUILD_DIR}/kg2-report${test_suffix}.json +report_file: ${BUILD_DIR}/kg2-report${version_suffix}${test_suffix}.json simplified_report_log: ${BUILD_DIR}/${report_base}-simplified${version_suffix}${test_suffix}.log -simplified_report_file_base: kg2-simplified-report${test_suffix}.json +simplified_report_file_base: kg2-simplified-report${version_suffix}${test_suffix}.json simplified_report_file: ${BUILD_DIR}/${simplified_report_file_base} slim_base: slim_kg2 slim_output_base: kg2-slim slim_script: ${PROCESS_CODE_DIR}/${slim_base}.py slim_log: ${BUILD_DIR}/${slim_base}${version_suffix}${test_suffix}.log -slim_output_nodes_file: ${BUILD_DIR}/${slim_output_base}${nodes_suffix}${test_suffix}.jsonl -slim_output_edges_file: ${BUILD_DIR}/${slim_output_base}${edges_suffix}${test_suffix}.jsonl +slim_output_nodes_file: ${BUILD_DIR}/${slim_output_base}${version_suffix}${nodes_suffix}${test_suffix}.jsonl +slim_output_edges_file: ${BUILD_DIR}/${slim_output_base}${version_suffix}${edges_suffix}${test_suffix}.jsonl tsv_base: kg_json_to_tsv tsv_script: ${PROCESS_CODE_DIR}/${tsv_base}.py From 184fa6ec204cfb488e0e9890c3eb37f9a78c008b Mon Sep 17 00:00:00 2001 From: ecwood Date: Wed, 17 Jul 2024 15:09:18 -0700 Subject: [PATCH 030/125] #140 made sure its only defined once --- build/Snakefile-post-etl | 2 +- build/build-kg2-snakemake.sh | 10 ++++------ build/snakemake-config-var.yaml | 2 -- master-config.shinc | 2 ++ neo4j/tsv-to-neo4j.sh | 5 ++--- process/run-simplify.sh | 7 +++---- 6 files changed, 12 insertions(+), 16 deletions(-) diff --git a/build/Snakefile-post-etl b/build/Snakefile-post-etl index e6de67ee..62ccffd7 100644 --- a/build/Snakefile-post-etl +++ b/build/Snakefile-post-etl @@ -126,7 +126,7 @@ rule Simplify: log: config['SIMPLIFY_LOG'] shell: - "bash -x {input.code} {input.nodes} {input.edges} {output.nodes} {output.edges} " + config['VERSION_FILE'] + " " + config['TEST_FLAG'] + " > {log} 2>&1" + "bash -x {input.code} {input.nodes} {input.edges} {output.nodes} {output.edges} " + config['TEST_FLAG'] + " > {log} 2>&1" 
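# (Annotation: why the Simplify rule's shell command shrinks in the hunk above.
# With #140, run-simplify.sh stops taking a version-file positional argument and
# instead reads the path that master-config.shinc now defines once; see the
# master-config.shinc, tsv-to-neo4j.sh, and run-simplify.sh hunks later in this
# patch. A minimal sketch of the shared resolution, with the BUILD_DIR value
# assumed for illustration:)
#
#   kg2_version_file=version.txt
#   kg2_version_file_local=${BUILD_DIR}/${kg2_version_file}   # e.g. ~/kg2-build/version.txt
#   kg2_version=`cat ${kg2_version_file_local}`
#
# Any script that sources master-config.shinc can read the same KG2 version,
# so it no longer needs to be threaded through the Snakemake shell commands.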
rule Slim: input: diff --git a/build/build-kg2-snakemake.sh b/build/build-kg2-snakemake.sh index 6d313846..96d00935 100755 --- a/build/build-kg2-snakemake.sh +++ b/build/build-kg2-snakemake.sh @@ -64,8 +64,6 @@ then run_flag="-F" fi -kg2_version_file="kg2-version.txt" -local_kg2_version_file="${BUILD_DIR}/${kg2_version_file}" trigger_file_is_major_release=${BUILD_DIR}/major-release trigger_file_is_minor_release=${BUILD_DIR}/minor-release @@ -89,14 +87,14 @@ if [[ "${ci_flag}" == "ci" ]] then sed -i "\@^kg2_version=@ckg2_version=KG2.CI" ${CODE_DIR}/master-config.shinc else - ${s3_cp_cmd} s3://${s3_bucket_public}/${kg2_version_file} ${local_kg2_version_file} + ${s3_cp_cmd} s3://${s3_bucket_public}/${kg2_version_file} ${kg2_version_file_local} if [[ "${increment_flag}" != '' ]] then - ${VENV_DIR}/bin/python3 ${PROCESS_CODE_DIR}/update_version.py ${increment_flag} ${local_kg2_version_file} + ${VENV_DIR}/bin/python3 ${PROCESS_CODE_DIR}/update_version.py ${increment_flag} ${kg2_version_file_local} else echo "*** TEST MODE -- NO INCREMENT ***" fi - curr_kg2_version=`cat ${local_kg2_version_file}` + curr_kg2_version=`cat ${kg2_version_file_local}` sed -i "\@^kg2_version=@ckg2_version=${curr_kg2_version}" ${CODE_DIR}/master-config.shinc fi @@ -156,7 +154,7 @@ cd ~ && ${VENV_DIR}/bin/snakemake --snakefile ${snakefile} ${run_flag} -R Finish if [[ "${ci_flag}" != "ci" ]] then - ${s3_cp_cmd} ${local_kg2_version_file} s3://${s3_bucket_public}/${kg2_version_file} + ${s3_cp_cmd} ${kg2_version_file_local} s3://${s3_bucket_public}/${kg2_version_file} fi if [[ -f ${trigger_file_is_major_release} ]] diff --git a/build/snakemake-config-var.yaml b/build/snakemake-config-var.yaml index c811da79..a751dc4c 100644 --- a/build/snakemake-config-var.yaml +++ b/build/snakemake-config-var.yaml @@ -291,5 +291,3 @@ kg2_tsv_tarball: ${BUILD_DIR}/kg2-tsv-for-neo4j${version_suffix}${test_suffix}.t tsv_placeholder: ${BUILD_DIR}/tsv_placeholder.empty finish_script: ${BUILD_CODE_DIR}/finish-snakemake.sh - -version_file: ${BUILD_DIR}/kg2-version.txt diff --git a/master-config.shinc b/master-config.shinc index 97fb5956..ac6dd9e5 100644 --- a/master-config.shinc +++ b/master-config.shinc @@ -30,4 +30,6 @@ ont_load_inventory_file=${MAPS_CODE_DIR}/ont-load-inventory${test_suffix}.yaml rtx_config_file=RTXConfiguration-config.json biolink_model_version=4.2.0 infores_registry_version=0.2.8 +kg2_version_file=version.txt +kg2_version_file_local=${BUILD_DIR}/${kg2_version_file} kg2_version= \ No newline at end of file diff --git a/neo4j/tsv-to-neo4j.sh b/neo4j/tsv-to-neo4j.sh index 2fd03361..07c2f692 100755 --- a/neo4j/tsv-to-neo4j.sh +++ b/neo4j/tsv-to-neo4j.sh @@ -54,9 +54,8 @@ rm -r -f ${tsv_dir} mkdir -p ${tsv_dir} # get the latest KG2 version -kg2_version_file="kg2-version.txt" -${s3_cp_cmd} s3://${s3_bucket}/${kg2_version_file} ${BUILD_DIR}/${kg2_version_file} -kg2_version=`cat ${BUILD_DIR}/${kg2_version_file}` +${s3_cp_cmd} s3://${s3_bucket}/${kg2_version_file} ${kg2_version_file_local} +kg2_version=`cat ${kg2_version_file_local}` # download the latest TSV files from the S3 Bucket ${s3_cp_cmd} s3://${s3_bucket}/kg2-tsv-for-neo4j-${kg2_version}${test_arg}.tar.gz ${tsv_tarball} diff --git a/process/run-simplify.sh b/process/run-simplify.sh index 6a8951b7..4a033273 100755 --- a/process/run-simplify.sh +++ b/process/run-simplify.sh @@ -10,7 +10,7 @@ if [[ "${1:-}" == "--help" || "${1:-}" == "-h" ]]; then exit 2 fi -# Usage: run-simplify.sh [version_filename] [test] +# Usage: run-simplify.sh [test] echo "================= starting 
run-simplify.sh ==================" date @@ -22,15 +22,14 @@ input_nodes_json=${1:-} input_edges_json=${2:-} output_nodes_json=${3:-} output_edges_json=${4:-} -local_version_filename=${5:-"${BUILD_DIR}/kg2-version.txt"} -build_flag=${6:-""} +build_flag=${5:-""} # TODO: Inhibits and increase are not in biolink model anymore - Find out what that should be now ${VENV_DIR}/bin/python3 -u ${PROCESS_CODE_DIR}/filter_kg_and_remap_predicates.py ${test_flag} --dropNegated \ --dropSelfEdgesExcept interacts_with,regulates,inhibits,increase \ ${predicate_mapping_file} ${infores_mapping_file} ${curies_to_urls_file} \ ${knowledge_level_agent_type_mapping_file} ${input_nodes_json} ${input_edges_json} \ - ${output_nodes_json} ${output_edges_json} ${local_version_filename} + ${output_nodes_json} ${output_edges_json} ${kg2_version_file_local} date echo "================= finishing run-simplify.sh ==================" From 49d75edba30e75a19c693b06365f2948de77a413 Mon Sep 17 00:00:00 2001 From: ecwood Date: Wed, 17 Jul 2024 15:19:26 -0700 Subject: [PATCH 031/125] #393 remove RepoDB from the build system --- build/Snakefile-conversion | 13 ----- build/Snakefile-extraction | 11 ---- build/Snakefile-post-etl | 4 -- build/snakemake-config-var.yaml | 12 ---- .../{ => archive}/repodb_csv_to_kg_jsonl.py | 0 extract/{ => archive}/extract-repodb.sh | 0 maps/curies-to-urls-map.yaml | 2 - ...g2-provided-by-curie-to-infores-curie.yaml | 4 -- maps/knowledge-level-agent-type-map.yaml | 4 -- maps/predicate-remap.yaml | 57 ------------------- 10 files changed, 107 deletions(-) rename convert/{ => archive}/repodb_csv_to_kg_jsonl.py (100%) rename extract/{ => archive}/extract-repodb.sh (100%) diff --git a/build/Snakefile-conversion b/build/Snakefile-conversion index ae80d765..6754be04 100644 --- a/build/Snakefile-conversion +++ b/build/Snakefile-conversion @@ -120,19 +120,6 @@ rule DGIdb_Conversion: shell: config['PYTHON_COMMAND'] + " {input.code} {input.real} {output.nodes} {output.edges} " + config['TEST_ARG'] + " > {log} 2>&1" -rule RepoDB_Conversion: - input: - code = config['REPODB_CONVERSION_SCRIPT'], - real = config['REPODB_INPUT_FILE'], - validation = config['VALIDATION_PLACEHOLDER'] - output: - nodes = config['REPODB_OUTPUT_NODES_FILE'], - edges = config['REPODB_OUTPUT_EDGES_FILE'] - log: - config['REPODB_CONVERSION_LOG'] - shell: - config['PYTHON_COMMAND'] + " {input.code} {input.real} {output.nodes} {output.edges} " + config['TEST_ARG'] + " > {log} 2>&1" - rule DrugBank_Conversion: input: code = config['DRUGBANK_CONVERSION_SCRIPT'], diff --git a/build/Snakefile-extraction b/build/Snakefile-extraction index ac5e19b1..c23d0ef0 100644 --- a/build/Snakefile-extraction +++ b/build/Snakefile-extraction @@ -88,17 +88,6 @@ rule DGIdb: shell: "bash -x {input.code} " + config['DGIDB_DIR'] + " > {log} 2>&1" -rule RepoDB: - input: - code = config['REPODB_EXTRACTION_SCRIPT'], - validation = config['VALIDATION_PLACEHOLDER'] - output: - config['REPODB_INPUT_FILE'] - log: - config['REPODB_EXTRACTION_LOG'] - shell: - "bash -x {input.code} " + config['REPODB_DIR'] + " > {log} 2>&1" - rule DrugBank: input: code = config['DRUGBANK_EXTRACTION_SCRIPT'], diff --git a/build/Snakefile-post-etl b/build/Snakefile-post-etl index 62ccffd7..eeb1a44d 100644 --- a/build/Snakefile-post-etl +++ b/build/Snakefile-post-etl @@ -19,8 +19,6 @@ rule Merge: ncbigene_edges = config['NCBIGENE_OUTPUT_EDGES_FILE'], dgidb_nodes = config['DGIDB_OUTPUT_NODES_FILE'], dgidb_edges = config['DGIDB_OUTPUT_EDGES_FILE'], - repodb_nodes = 
config['REPODB_OUTPUT_NODES_FILE'], - repodb_edges = config['REPODB_OUTPUT_EDGES_FILE'], drugbank_nodes = config['DRUGBANK_OUTPUT_NODES_FILE'], drugbank_edges = config['DRUGBANK_OUTPUT_EDGES_FILE'], smpdb_nodes = config['SMPDB_OUTPUT_NODES_FILE'], @@ -66,7 +64,6 @@ rule Merge: "{input.chembl_nodes} " + \ "{input.ncbigene_nodes} " + \ "{input.dgidb_nodes} " + \ - "{input.repodb_nodes} " + \ "{input.smpdb_nodes} " + \ "{input.drugbank_nodes} " + \ "{input.hmdb_nodes} " + \ @@ -89,7 +86,6 @@ rule Merge: "{input.chembl_edges} " + \ "{input.ncbigene_edges} " + \ "{input.dgidb_edges} " + \ - "{input.repodb_edges} " + \ "{input.smpdb_edges} " + \ "{input.drugbank_edges} " + \ "{input.hmdb_edges} " + \ diff --git a/build/snakemake-config-var.yaml b/build/snakemake-config-var.yaml index a751dc4c..209b3659 100644 --- a/build/snakemake-config-var.yaml +++ b/build/snakemake-config-var.yaml @@ -106,18 +106,6 @@ dgidb_conversion_log: ${BUILD_DIR}/${dgidb_conversion_base}${version_suffix}${te dgidb_output_nodes_file: ${BUILD_DIR}/${dgidb_output_base}${nodes_suffix}${test_suffix}.jsonl dgidb_output_edges_file: ${BUILD_DIR}/${dgidb_output_base}${edges_suffix}${test_suffix}.jsonl -repodb_extraction_base: extract-repodb -repodb_conversion_base: repodb_csv_to_kg_jsonl -repodb_output_base: kg2-repodb -repodb_extraction_script: ${EXTRACT_CODE_DIR}/${repodb_extraction_base}.sh -repodb_extraction_log: ${BUILD_DIR}/${repodb_extraction_base}${version_suffix}${test_suffix}.log -repodb_dir: ${BUILD_DIR}/repodb -repodb_input_file: ${repodb_dir}/repodb.csv -repodb_conversion_script: ${CONVERT_CODE_DIR}/${repodb_conversion_base}.py -repodb_conversion_log: ${BUILD_DIR}/${repodb_conversion_base}${version_suffix}${test_suffix}.log -repodb_output_nodes_file: ${BUILD_DIR}/${repodb_output_base}${nodes_suffix}${test_suffix}.jsonl -repodb_output_edges_file: ${BUILD_DIR}/${repodb_output_base}${edges_suffix}${test_suffix}.jsonl - drugbank_extraction_base: extract-drugbank drugbank_conversion_base: drugbank_xml_to_kg_jsonl drugbank_output_base: kg2-drugbank diff --git a/convert/repodb_csv_to_kg_jsonl.py b/convert/archive/repodb_csv_to_kg_jsonl.py similarity index 100% rename from convert/repodb_csv_to_kg_jsonl.py rename to convert/archive/repodb_csv_to_kg_jsonl.py diff --git a/extract/extract-repodb.sh b/extract/archive/extract-repodb.sh similarity index 100% rename from extract/extract-repodb.sh rename to extract/archive/extract-repodb.sh diff --git a/maps/curies-to-urls-map.yaml b/maps/curies-to-urls-map.yaml index fe4192b8..b9c2af1c 100644 --- a/maps/curies-to-urls-map.yaml +++ b/maps/curies-to-urls-map.yaml @@ -427,8 +427,6 @@ use_for_bidirectional_mapping: rdfs: http://www.w3.org/2000/01/rdf-schema# - REACT: "https://identifiers.org/reactome:" - - - REPODB: http://apps.chiragjpgroup.org/repoDB/ - RGD: "https://identifiers.org/rgd:" - diff --git a/maps/kg2-provided-by-curie-to-infores-curie.yaml b/maps/kg2-provided-by-curie-to-infores-curie.yaml index efb2d639..718cbd11 100644 --- a/maps/kg2-provided-by-curie-to-infores-curie.yaml +++ b/maps/kg2-provided-by-curie-to-infores-curie.yaml @@ -134,10 +134,6 @@ OBO:uberon.owl: source_name: PathWhiz infores_curie: infores:pathwhiz knowledge_type: primary_knowledge_source -'REPODB:': - source_name: Drug Repositioning Database - infores_curie: infores:repodb - knowledge_type: knowledge_source 'RTX:': source_name: RTX KG2 infores_curie: infores:rtx-kg2 diff --git a/maps/knowledge-level-agent-type-map.yaml b/maps/knowledge-level-agent-type-map.yaml index db99215a..85bf161a 100644 --- 
a/maps/knowledge-level-agent-type-map.yaml +++ b/maps/knowledge-level-agent-type-map.yaml @@ -222,10 +222,6 @@ infores:reactome: agent_type: manual_agent knowledge_level: knowledge_assertion reference: https://en.wikipedia.org/wiki/Reactome -infores:repodb: - agent_type: automated_agent - knowledge_level: knowledge_assertion - reference: https://www.nature.com/articles/sdata201729 infores:ro: agent_type: manual_agent knowledge_level: knowledge_assertion diff --git a/maps/predicate-remap.yaml b/maps/predicate-remap.yaml index fbff26a8..1e08b978 100644 --- a/maps/predicate-remap.yaml +++ b/maps/predicate-remap.yaml @@ -3367,63 +3367,6 @@ REACT:positively_regulates_gene_expression: REACT:related_to: operation: keep core_predicate: biolink:related_to -REPODB:clinically_tested_approved_unknown_phase: - operation: keep - core_predicate: biolink:drug_regulatory_status_world_wide -REPODB:clinically_tested_suspended_phase_0: - operation: keep - core_predicate: biolink:drug_regulatory_status_world_wide -REPODB:clinically_tested_suspended_phase_1: - operation: keep - core_predicate: biolink:drug_regulatory_status_world_wide -REPODB:clinically_tested_suspended_phase_1_or_phase_2: - operation: keep - core_predicate: biolink:drug_regulatory_status_world_wide -REPODB:clinically_tested_suspended_phase_2: - operation: keep - core_predicate: biolink:drug_regulatory_status_world_wide -REPODB:clinically_tested_suspended_phase_2_or_phase_3: - operation: keep - core_predicate: biolink:drug_regulatory_status_world_wide -REPODB:clinically_tested_suspended_phase_3: - operation: keep - core_predicate: biolink:drug_regulatory_status_world_wide -REPODB:clinically_tested_terminated_phase_0: - operation: keep - core_predicate: biolink:drug_regulatory_status_world_wide -REPODB:clinically_tested_terminated_phase_1: - operation: keep - core_predicate: biolink:drug_regulatory_status_world_wide -REPODB:clinically_tested_terminated_phase_1_or_phase_2: - operation: keep - core_predicate: biolink:drug_regulatory_status_world_wide -REPODB:clinically_tested_terminated_phase_2: - operation: keep - core_predicate: biolink:drug_regulatory_status_world_wide -REPODB:clinically_tested_terminated_phase_2_or_phase_3: - operation: keep - core_predicate: biolink:drug_regulatory_status_world_wide -REPODB:clinically_tested_terminated_phase_3: - operation: keep - core_predicate: biolink:drug_regulatory_status_world_wide -REPODB:clinically_tested_withdrawn_phase_0: - operation: keep - core_predicate: biolink:drug_regulatory_status_world_wide -REPODB:clinically_tested_withdrawn_phase_1: - operation: keep - core_predicate: biolink:drug_regulatory_status_world_wide -REPODB:clinically_tested_withdrawn_phase_1_or_phase_2: - operation: keep - core_predicate: biolink:drug_regulatory_status_world_wide -REPODB:clinically_tested_withdrawn_phase_2: - operation: keep - core_predicate: biolink:drug_regulatory_status_world_wide -REPODB:clinically_tested_withdrawn_phase_2_or_phase_3: - operation: keep - core_predicate: biolink:drug_regulatory_status_world_wide -REPODB:clinically_tested_withdrawn_phase_3: - operation: keep - core_predicate: biolink:drug_regulatory_status_world_wide RO:0000052: operation: keep core_predicate: biolink:related_to From ce8d8de7ff9324332063db753090d4d5375c83ef Mon Sep 17 00:00:00 2001 From: ecwood Date: Wed, 17 Jul 2024 15:45:29 -0700 Subject: [PATCH 032/125] #393 have to remove info from kg2_util as well --- kg2_util.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/kg2_util.py b/kg2_util.py index e61c6cba..a5aa0971 100644 
--- a/kg2_util.py +++ b/kg2_util.py @@ -112,7 +112,6 @@ CURIE_PREFIX_RDF = 'rdf' CURIE_PREFIX_RDFS = 'rdfs' CURIE_PREFIX_REACTOME='REACT' -CURIE_PREFIX_REPODB = 'REPODB' CURIE_PREFIX_RHEA = 'RHEA' CURIE_PREFIX_RHEA_COMP = 'RHEA.COMP' CURIE_PREFIX_RO = 'RO' @@ -175,7 +174,6 @@ BASE_URL_PATHWHIZ_BOUND = 'https://pathbank.org/lims#/bounds/' BASE_URL_PMID = "http://www.ncbi.nlm.nih.gov/pubmed/" BASE_URL_REACTOME = BASE_BASE_URL_IDENTIFIERS_ORG + 'reactome:' -BASE_URL_REPODB = 'http://apps.chiragjpgroup.org/repoDB/' BASE_URL_RTX = 'http://rtx.ai/identifiers#' BASE_URL_SEMMEDDB = 'https://skr3.nlm.nih.gov/SemMedDB' BASE_URL_SMPDB = BASE_BASE_URL_IDENTIFIERS_ORG + 'smpdb:' From 6bdb5a6ed0429caeda31ae6387cbb74668eb89c7 Mon Sep 17 00:00:00 2001 From: ecwood Date: Wed, 17 Jul 2024 16:23:28 -0700 Subject: [PATCH 033/125] #400 first pass at this --- maps/curies-to-urls-map.yaml | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/maps/curies-to-urls-map.yaml b/maps/curies-to-urls-map.yaml index b9c2af1c..468c842e 100644 --- a/maps/curies-to-urls-map.yaml +++ b/maps/curies-to-urls-map.yaml @@ -378,9 +378,7 @@ use_for_bidirectional_mapping: - OPL: http://purl.obolibrary.org/obo/OPL_ - - orphanet: 'http://www.orpha.net/ORDO/Orphanet_' - - - ORPHANET: http://purl.bioontology.org/ontology/ORDO/ + orphanet: http://purl.bioontology.org/ontology/ORDO/ - owl: http://www.w3.org/2002/07/owl# - @@ -665,10 +663,10 @@ use_for_contraction_only: OMIM: http://identifiers.org/omim/ - OMOP: http://purl.obolibrary.org/obo/COHD_ - # - - # ORPHANET: http://www.orpha.net/ORDO/Orphanet_ - - ORPHANET: https://data.bioontology.org/ontologies/ORDO/submissions/27/download?apikey=8b5b7825-538d-40e0-9e9e-5ab9274a9aeb + orphanet: http://www.orpha.net/ORDO/Orphanet_ + - + orphanet: https://data.bioontology.org/ontologies/ORDO/submissions/27/download?apikey=8b5b7825-538d-40e0-9e9e-5ab9274a9aeb - PATO: http://purl.obolibrary.org/obo/pato# - From 31cb00a5211c837e53e028e52ec3df4ac78ed6f0 Mon Sep 17 00:00:00 2001 From: ecwood Date: Wed, 17 Jul 2024 16:26:58 -0700 Subject: [PATCH 034/125] #400 try this instead --- maps/curies-to-urls-map.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/maps/curies-to-urls-map.yaml b/maps/curies-to-urls-map.yaml index 468c842e..48d4956c 100644 --- a/maps/curies-to-urls-map.yaml +++ b/maps/curies-to-urls-map.yaml @@ -378,7 +378,7 @@ use_for_bidirectional_mapping: - OPL: http://purl.obolibrary.org/obo/OPL_ - - orphanet: http://purl.bioontology.org/ontology/ORDO/ + orphanet: http://www.orpha.net/ORDO/Orphanet_ - owl: http://www.w3.org/2002/07/owl# - @@ -664,7 +664,7 @@ use_for_contraction_only: - OMOP: http://purl.obolibrary.org/obo/COHD_ - - orphanet: http://www.orpha.net/ORDO/Orphanet_ + orphanet: http://purl.bioontology.org/ontology/ORDO/ - orphanet: https://data.bioontology.org/ontologies/ORDO/submissions/27/download?apikey=8b5b7825-538d-40e0-9e9e-5ab9274a9aeb - From 475066576832dc62bf5c27c7be023ad34903188e Mon Sep 17 00:00:00 2001 From: ecwood Date: Wed, 17 Jul 2024 16:32:14 -0700 Subject: [PATCH 035/125] #400 handling the expansion map (hopefully) --- maps/curies-to-urls-map.yaml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/maps/curies-to-urls-map.yaml b/maps/curies-to-urls-map.yaml index 48d4956c..0e4ba4cb 100644 --- a/maps/curies-to-urls-map.yaml +++ b/maps/curies-to-urls-map.yaml @@ -728,7 +728,9 @@ use_for_expansion_only: - FlyBase: https://flybase.org/reports/ - - Orphanet: http://purl.bioontology.org/ontology/ORDO/ + 
Orphanet: http://www.orpha.net/ORDO/Orphanet_ + - + ORPHANET: http://www.orpha.net/ORDO/Orphanet_ - oboInOwl: http://www.geneontology.org/formats/oboInOwl# - From 89591c3a07608be8bdaa73f75a3e0637cf20a5c3 Mon Sep 17 00:00:00 2001 From: ecwood Date: Mon, 22 Jul 2024 11:42:44 -0700 Subject: [PATCH 036/125] #392 initial edge blocklist (no synonyms yet) --- maps/edge-blocklist.yaml | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 maps/edge-blocklist.yaml diff --git a/maps/edge-blocklist.yaml b/maps/edge-blocklist.yaml new file mode 100644 index 00000000..f40dca64 --- /dev/null +++ b/maps/edge-blocklist.yaml @@ -0,0 +1,32 @@ +- + subject_name: Vaccines + subject_ids: + - UMLS:C0042210 + predicate: biolink:causes + object_name: Autism + object_ids: + - UMLS:C0004352 +- + subject_name: Measles-Mumps-Rubella Vaccine + subject_ids: + - UMLS:C0065828 + predicate: biolink:causes + object_name: Autism + object_ids: + - UMLS:C0004352 +- + subject_name: Mercury + subject_ids: + - UMLS:C0025424 + predicate: biolink:causes + object_name: Autism + object_ids: + - UMLS:C0004352 +- + subject_name: Thimerosal + subject_ids: + - UMLS:C0039867 + predicate: biolink:causes + object_name: Autism + object_ids: + - UMLS:C0004352 From e33b320a7e8edf5895cd84479c61469d955237a4 Mon Sep 17 00:00:00 2001 From: ecwood Date: Mon, 22 Jul 2024 11:43:32 -0700 Subject: [PATCH 037/125] #387 grouping together xml blocks --- misc-tools/owlparser.py | 43 ++++++++++++++++++++++++++++------------- 1 file changed, 30 insertions(+), 13 deletions(-) diff --git a/misc-tools/owlparser.py b/misc-tools/owlparser.py index 0db4b0be..6006e409 100644 --- a/misc-tools/owlparser.py +++ b/misc-tools/owlparser.py @@ -2,8 +2,12 @@ import argparse COMMENT = "!--" +XML_TAG = "?xml" +RDF_TAG = "rdf:RDF" -LINE_TYPE_COMMENT = "comment" +OUTMOST_TAGS_SKIP = [XML_TAG, RDF_TAG] + +LINE_TYPE_IGNORE = "ignore" LINE_TYPE_START_NEST = "start nest" LINE_TYPE_START_NEST_WITH_ATTR = "start nest with attributes" LINE_TYPE_ENTRY = "entry" @@ -16,6 +20,8 @@ KEY_TEXT = "text" KEY_TYPE = "type" +IGNORED_ATTRIBUTES = ["xml:lang"] + def get_args(): arg_parser = argparse.ArgumentParser() arg_parser.add_argument('--test', dest='test', @@ -73,7 +79,8 @@ def convert_line(line): start_reading_attribute_tag = False start_reading_attribute_text = False start_reading_main = True - attributes[attribute_tag] = attribute_text.strip('/').strip('"') + if attribute_tag not in IGNORED_ATTRIBUTES: + attributes[attribute_tag] = attribute_text.strip('/').strip('"') attribute_tag = "" attribute_text = "" @@ -91,7 +98,8 @@ def convert_line(line): if letter == ' ' and start_reading_attribute_text: start_reading_attribute_tag = True start_reading_attribute_text = False - attributes[attribute_tag] = attribute_text.strip('/').strip('"') + if attribute_tag not in IGNORED_ATTRIBUTES: + attributes[attribute_tag] = attribute_text.strip('/').strip('"') attribute_tag = "" attribute_text = "" continue @@ -113,8 +121,8 @@ def convert_line(line): # Categorize the type of line line_type = str() out = dict() - if tag == COMMENT: - line_type = "comment" + if tag == COMMENT or tag in OUTMOST_TAGS_SKIP: + line_type = LINE_TYPE_IGNORE else: start_tag_exists = (tag != str()) attributes_exist = (attributes != dict()) @@ -154,7 +162,8 @@ def convert_line(line): def divide_into_lines(input_file_name): curr_str = "" - keys = set() + curr_nest = list() + curr_nest_tag = str() with open(input_file_name) as input_file: for line in input_file: @@ -174,21 +183,29 @@ def 
divide_into_lines(input_file_name): line_parsed = convert_line(curr_str) tag = line_parsed.get(KEY_TAG, None) + line_type = line_parsed.get(KEY_TYPE, None) attribute_keys = line_parsed.get(KEY_ATTRIBUTES, dict()).keys() - if tag is not None: - keys.add(tag) - for attribute_key in attribute_keys: - keys.add(attribute_key) - # print(json.dumps(convert_line(curr_str), indent=4)) + if curr_nest_tag == str(): + if line_type in [LINE_TYPE_START_NEST, LINE_TYPE_START_NEST_WITH_ATTR]: + curr_nest_tag = tag + curr_nest.append(line_parsed) + elif line_type != LINE_TYPE_IGNORE: + print(json.dumps(line_parsed, indent=4)) # replacement for processing right now + else: + if line_type == LINE_TYPE_END_NEST and curr_nest_tag == tag: + print(json.dumps(curr_nest, indent=4)) # replacement for processing right now + curr_nest = list() + curr_nest_tag = str() + else: + curr_nest.append(line_parsed) + curr_str = "" if curr_str != "": # divide lines by a space curr_str += ' ' - print(json.dumps(list(keys), indent=4)) - if __name__ == '__main__': args = get_args() From d7743bb3f96bc12cbbe63e856f425d22a02e507b Mon Sep 17 00:00:00 2001 From: ecwood Date: Mon, 22 Jul 2024 12:00:34 -0700 Subject: [PATCH 038/125] #392 autism synonyms --- maps/edge-blocklist.yaml | 103 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 100 insertions(+), 3 deletions(-) diff --git a/maps/edge-blocklist.yaml b/maps/edge-blocklist.yaml index f40dca64..67ec5623 100644 --- a/maps/edge-blocklist.yaml +++ b/maps/edge-blocklist.yaml @@ -5,28 +5,125 @@ predicate: biolink:causes object_name: Autism object_ids: + - CHV:0000001598 + - CHV:0000050438 + - DOID:0060041 + - DOID:12849 + - EFO:0003756 + - EFO:0003758 + - HP:0000717 + - HP:0000729 + - ICD9:299.0 + - MESH:D000067877 + - MESH:D001321 + - MONDO:0005258 + - MONDO:0005260 + - NCIT:C88412 + - NCIT:C97161 + - OMIM:209850 + - OMIM:MTHU004240 + - OMIM:MTHU038054 + - OMIM:MTHU043125 + - OMIM:MTHU043132 + - PSY:04850 + - PSY:04855 - UMLS:C0004352 -- + - UMLS:C0856975 + - UMLS:C1510586 + - UMLS:C1968924- subject_name: Measles-Mumps-Rubella Vaccine subject_ids: - UMLS:C0065828 predicate: biolink:causes object_name: Autism object_ids: + - CHV:0000001598 + - CHV:0000050438 + - DOID:0060041 + - DOID:12849 + - EFO:0003756 + - EFO:0003758 + - HP:0000717 + - HP:0000729 + - ICD9:299.0 + - MESH:D000067877 + - MESH:D001321 + - MONDO:0005258 + - MONDO:0005260 + - NCIT:C88412 + - NCIT:C97161 + - OMIM:209850 + - OMIM:MTHU004240 + - OMIM:MTHU038054 + - OMIM:MTHU043125 + - OMIM:MTHU043132 + - PSY:04850 + - PSY:04855 - UMLS:C0004352 -- + - UMLS:C0856975 + - UMLS:C1510586 + - UMLS:C1968924- subject_name: Mercury subject_ids: - UMLS:C0025424 predicate: biolink:causes object_name: Autism object_ids: + - CHV:0000001598 + - CHV:0000050438 + - DOID:0060041 + - DOID:12849 + - EFO:0003756 + - EFO:0003758 + - HP:0000717 + - HP:0000729 + - ICD9:299.0 + - MESH:D000067877 + - MESH:D001321 + - MONDO:0005258 + - MONDO:0005260 + - NCIT:C88412 + - NCIT:C97161 + - OMIM:209850 + - OMIM:MTHU004240 + - OMIM:MTHU038054 + - OMIM:MTHU043125 + - OMIM:MTHU043132 + - PSY:04850 + - PSY:04855 - UMLS:C0004352 -- + - UMLS:C0856975 + - UMLS:C1510586 + - UMLS:C1968924- subject_name: Thimerosal subject_ids: - UMLS:C0039867 predicate: biolink:causes object_name: Autism object_ids: + - CHV:0000001598 + - CHV:0000050438 + - DOID:0060041 + - DOID:12849 + - EFO:0003756 + - EFO:0003758 + - HP:0000717 + - HP:0000729 + - ICD9:299.0 + - MESH:D000067877 + - MESH:D001321 + - MONDO:0005258 + - MONDO:0005260 + - NCIT:C88412 + - NCIT:C97161 + - 
OMIM:209850 + - OMIM:MTHU004240 + - OMIM:MTHU038054 + - OMIM:MTHU043125 + - OMIM:MTHU043132 + - PSY:04850 + - PSY:04855 - UMLS:C0004352 + - UMLS:C0856975 + - UMLS:C1510586 + - UMLS:C1968924 \ No newline at end of file From 7c53a604774289c94eee779f2a627dc77bc81afa Mon Sep 17 00:00:00 2001 From: ecwood Date: Mon, 22 Jul 2024 16:15:46 -0700 Subject: [PATCH 039/125] #392 full edge blocklist --- maps/edge-blocklist.yaml | 35 ++++++++++++++++++++++++++++++++--- 1 file changed, 32 insertions(+), 3 deletions(-) diff --git a/maps/edge-blocklist.yaml b/maps/edge-blocklist.yaml index 67ec5623..bb64c0be 100644 --- a/maps/edge-blocklist.yaml +++ b/maps/edge-blocklist.yaml @@ -1,7 +1,10 @@ - subject_name: Vaccines subject_ids: + - ATC:J07 + - MESH:D014612 - UMLS:C0042210 + - VANDF:4021642 predicate: biolink:causes object_name: Autism object_ids: @@ -30,9 +33,13 @@ - UMLS:C0004352 - UMLS:C0856975 - UMLS:C1510586 - - UMLS:C1968924- + - UMLS:C1968924 +- subject_name: Measles-Mumps-Rubella Vaccine subject_ids: + - MESH:D022542 + - NCIT:C96403 + - PDQ:CDR0000702931 - UMLS:C0065828 predicate: biolink:causes object_name: Autism @@ -62,10 +69,19 @@ - UMLS:C0004352 - UMLS:C0856975 - UMLS:C1510586 - - UMLS:C1968924- + - UMLS:C1968924 +- subject_name: Mercury subject_ids: + - CHEBI:16170 + - CHEMBL.TARGET:CHEMBL2363061 + - KEGG.COMPOUND:C01319 + - MESH:D008628 + - NCIT:C66842 + - NCIT:C68270 + - RXNORM:6769 - UMLS:C0025424 + - VANDF:4025953 predicate: biolink:causes object_name: Autism object_ids: @@ -94,10 +110,23 @@ - UMLS:C0004352 - UMLS:C0856975 - UMLS:C1510586 - - UMLS:C1968924- + - UMLS:C1968924 +- subject_name: Thimerosal subject_ids: + - ATC:D08AK06 + - CHEBI:9546 + - CHEMBL.COMPOUND:CHEMBL508338 + - CHV:0000012180 + - DRUGBANK:DB11590 + - DrugCentral:4733 + - KEGG.DRUG:D00864 + - MESH:D013849 + - NCIT:C47751 + - NDDF:003125 + - RXNORM:10472 - UMLS:C0039867 + - VANDF:4017480 predicate: biolink:causes object_name: Autism object_ids: From f5c72743f55d8c20f50a47d8da888df00b7679c0 Mon Sep 17 00:00:00 2001 From: ecwood Date: Wed, 24 Jul 2024 19:19:24 -0700 Subject: [PATCH 040/125] #387 parses it into little dictionaries (generically) --- misc-tools/owlparser.py | 43 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/misc-tools/owlparser.py b/misc-tools/owlparser.py index 6006e409..c0f7f602 100644 --- a/misc-tools/owlparser.py +++ b/misc-tools/owlparser.py @@ -160,6 +160,46 @@ def convert_line(line): return out +def convert_nest(nest, index, working_dict): + if index >= len(nest): + return working_dict + + element = nest[index] + line_type = element[KEY_TYPE] + line_tag = element[KEY_TAG] + line_text = element.get(KEY_TEXT, None) + line_attributes = element.get(KEY_ATTRIBUTES, None) + + if line_type in [LINE_TYPE_START_NEST, LINE_TYPE_START_NEST_WITH_ATTR]: + working_dict[line_tag] = dict() + + converted_nest = convert_nest(nest, index + 1, dict()) + working_dict[line_tag] = converted_nest + + if line_type == LINE_TYPE_START_NEST_WITH_ATTR: + working_dict[line_tag][KEY_ATTRIBUTES] = line_attributes + + if line_type in [LINE_TYPE_ENTRY, LINE_TYPE_ENTRY_WITH_ATTR, LINE_TYPE_ENTRY_ONLY_ATTR]: + if line_tag not in working_dict: + working_dict[line_tag] = list() + + curr_dict = dict() + + if line_text is not None: + curr_dict[KEY_TEXT] = line_text + + if line_attributes is not None: + for attribute in line_attributes: + curr_dict[attribute] = line_attributes[attribute] + + working_dict[line_tag].append(curr_dict) + + convert_nest(nest, index + 1, working_dict) + + return 
working_dict + + + def divide_into_lines(input_file_name): curr_str = "" curr_nest = list() @@ -191,10 +231,13 @@ def divide_into_lines(input_file_name): curr_nest_tag = tag curr_nest.append(line_parsed) elif line_type != LINE_TYPE_IGNORE: + print("THIS VERSION") print(json.dumps(line_parsed, indent=4)) # replacement for processing right now else: if line_type == LINE_TYPE_END_NEST and curr_nest_tag == tag: print(json.dumps(curr_nest, indent=4)) # replacement for processing right now + nest_dict = convert_nest(curr_nest, 0, dict()) + print(json.dumps(nest_dict, indent=4)) curr_nest = list() curr_nest_tag = str() else: From 6db935f25c105c37d2d67f39fbefd07879080aee Mon Sep 17 00:00:00 2001 From: ecwood Date: Sat, 27 Jul 2024 00:35:54 -0700 Subject: [PATCH 041/125] #387 corrected some bugs with the XML parsing --- misc-tools/owlparser.py | 94 ++++++++++++++++++++++------------------- 1 file changed, 51 insertions(+), 43 deletions(-) diff --git a/misc-tools/owlparser.py b/misc-tools/owlparser.py index c0f7f602..6593d0cd 100644 --- a/misc-tools/owlparser.py +++ b/misc-tools/owlparser.py @@ -17,7 +17,7 @@ KEY_TAG = "tag" KEY_ATTRIBUTES = "attributes" -KEY_TEXT = "text" +KEY_TEXT = "ENTRY_TEXT" KEY_TYPE = "type" IGNORED_ATTRIBUTES = ["xml:lang"] @@ -121,7 +121,8 @@ def convert_line(line): # Categorize the type of line line_type = str() out = dict() - if tag == COMMENT or tag in OUTMOST_TAGS_SKIP: + + if tag == COMMENT or tag in OUTMOST_TAGS_SKIP or end_tag in OUTMOST_TAGS_SKIP: line_type = LINE_TYPE_IGNORE else: start_tag_exists = (tag != str()) @@ -160,50 +161,60 @@ def convert_line(line): return out -def convert_nest(nest, index, working_dict): - if index >= len(nest): - return working_dict +def convert_nest(nest, start_index): + nest_dict = dict() + curr_index = start_index + + while curr_index < len(nest): + element = nest[curr_index] + line_type = element[KEY_TYPE] + line_tag = element[KEY_TAG] + line_text = element.get(KEY_TEXT, None) + line_attributes = element.get(KEY_ATTRIBUTES, None) + + if line_type in [LINE_TYPE_START_NEST, LINE_TYPE_START_NEST_WITH_ATTR]: + if line_tag not in nest_dict: + nest_dict[line_tag] = list() - element = nest[index] - line_type = element[KEY_TYPE] - line_tag = element[KEY_TAG] - line_text = element.get(KEY_TEXT, None) - line_attributes = element.get(KEY_ATTRIBUTES, None) + converted_nest, ret_index = convert_nest(nest, curr_index + 1) - if line_type in [LINE_TYPE_START_NEST, LINE_TYPE_START_NEST_WITH_ATTR]: - working_dict[line_tag] = dict() + if line_attributes is not None: + for attribute in line_attributes: + converted_nest[attribute] = line_attributes[attribute] - converted_nest = convert_nest(nest, index + 1, dict()) - working_dict[line_tag] = converted_nest + nest_dict[line_tag].append(converted_nest) - if line_type == LINE_TYPE_START_NEST_WITH_ATTR: - working_dict[line_tag][KEY_ATTRIBUTES] = line_attributes + curr_index = ret_index + 1 + continue - if line_type in [LINE_TYPE_ENTRY, LINE_TYPE_ENTRY_WITH_ATTR, LINE_TYPE_ENTRY_ONLY_ATTR]: - if line_tag not in working_dict: - working_dict[line_tag] = list() + if line_type in [LINE_TYPE_ENTRY, LINE_TYPE_ENTRY_WITH_ATTR, LINE_TYPE_ENTRY_ONLY_ATTR]: + if line_tag not in nest_dict: + nest_dict[line_tag] = list() - curr_dict = dict() + curr_dict = dict() - if line_text is not None: - curr_dict[KEY_TEXT] = line_text + if line_text is not None: + curr_dict[KEY_TEXT] = line_text - if line_attributes is not None: - for attribute in line_attributes: - curr_dict[attribute] = line_attributes[attribute] + if 
line_attributes is not None: + for attribute in line_attributes: + curr_dict[attribute] = line_attributes[attribute] - working_dict[line_tag].append(curr_dict) + nest_dict[line_tag].append(curr_dict) - convert_nest(nest, index + 1, working_dict) + curr_index += 1 + continue - return working_dict + if line_type in [LINE_TYPE_END_NEST]: + return nest_dict, curr_index + return nest_dict, curr_index def divide_into_lines(input_file_name): curr_str = "" curr_nest = list() - curr_nest_tag = str() + curr_nest_tags = list() # Treating it as a stack with open(input_file_name) as input_file: for line in input_file: @@ -219,29 +230,26 @@ def divide_into_lines(input_file_name): if letter == '>' and (next_letter == '<' or next_letter == ""): # Only return if nesting - # print(curr_str) line_parsed = convert_line(curr_str) tag = line_parsed.get(KEY_TAG, None) + assert tag != KEY_TEXT # This could cause a massive conflict, but it is unlikely line_type = line_parsed.get(KEY_TYPE, None) attribute_keys = line_parsed.get(KEY_ATTRIBUTES, dict()).keys() - if curr_nest_tag == str(): - if line_type in [LINE_TYPE_START_NEST, LINE_TYPE_START_NEST_WITH_ATTR]: - curr_nest_tag = tag - curr_nest.append(line_parsed) - elif line_type != LINE_TYPE_IGNORE: - print("THIS VERSION") - print(json.dumps(line_parsed, indent=4)) # replacement for processing right now - else: - if line_type == LINE_TYPE_END_NEST and curr_nest_tag == tag: - print(json.dumps(curr_nest, indent=4)) # replacement for processing right now - nest_dict = convert_nest(curr_nest, 0, dict()) + if line_type != LINE_TYPE_IGNORE: + curr_nest.append(line_parsed) + + if line_type in [LINE_TYPE_START_NEST, LINE_TYPE_START_NEST_WITH_ATTR]: + curr_nest_tags.append(tag) + elif line_type == LINE_TYPE_END_NEST: + popped_curr_nest_tag = curr_nest_tags.pop() + assert popped_curr_nest_tag == tag + if len(curr_nest_tags) == 0: + nest_dict, _ = convert_nest(curr_nest, 0) print(json.dumps(nest_dict, indent=4)) curr_nest = list() curr_nest_tag = str() - else: - curr_nest.append(line_parsed) curr_str = "" From 7b4ac97aa6fca31b119dec1eefcea40a03f6ea1f Mon Sep 17 00:00:00 2001 From: ecwood Date: Sat, 27 Jul 2024 03:12:28 -0700 Subject: [PATCH 042/125] #387 handling case where something is just one line and not in another nest --- misc-tools/owlparser.py | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/misc-tools/owlparser.py b/misc-tools/owlparser.py index 6593d0cd..76715eb2 100644 --- a/misc-tools/owlparser.py +++ b/misc-tools/owlparser.py @@ -1,5 +1,6 @@ import json import argparse +import datetime COMMENT = "!--" XML_TAG = "?xml" @@ -29,6 +30,9 @@ def get_args(): arg_parser.add_argument('inputFile', type=str) return arg_parser.parse_args() +def date(): + return datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + def convert_line(line): tag = "" attributes = dict() @@ -240,16 +244,20 @@ def divide_into_lines(input_file_name): if line_type != LINE_TYPE_IGNORE: curr_nest.append(line_parsed) + output_nest = (line_type in [LINE_TYPE_ENTRY, LINE_TYPE_ENTRY_WITH_ATTR, LINE_TYPE_ENTRY_ONLY_ATTR] and len(curr_nest_tags) == 0) + if line_type in [LINE_TYPE_START_NEST, LINE_TYPE_START_NEST_WITH_ATTR]: curr_nest_tags.append(tag) elif line_type == LINE_TYPE_END_NEST: popped_curr_nest_tag = curr_nest_tags.pop() assert popped_curr_nest_tag == tag if len(curr_nest_tags) == 0: - nest_dict, _ = convert_nest(curr_nest, 0) - print(json.dumps(nest_dict, indent=4)) - curr_nest = list() - curr_nest_tag = str() + output_nest = True + if output_nest: + 
nest_dict, _ = convert_nest(curr_nest, 0) + print(json.dumps(nest_dict, indent=4)) + curr_nest = list() + curr_nest_tag = str() curr_str = "" @@ -262,4 +270,7 @@ def divide_into_lines(input_file_name): args = get_args() input_file_name = args.inputFile - divide_into_lines(input_file_name) \ No newline at end of file + print("File:", input_file_name) + print("Start Time:", date()) + divide_into_lines(input_file_name) + print("End Time:", date()) \ No newline at end of file From 31b47795449c33143f50d5349b3eafde8a631e2f Mon Sep 17 00:00:00 2001 From: ecwood Date: Thu, 1 Aug 2024 13:54:07 -0700 Subject: [PATCH 043/125] #404, testing it out on CI first --- master-config.shinc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/master-config.shinc b/master-config.shinc index ac6dd9e5..015cf2f7 100644 --- a/master-config.shinc +++ b/master-config.shinc @@ -28,7 +28,7 @@ infores_mapping_file=${MAPS_CODE_DIR}/kg2-provided-by-curie-to-infores-curie.yam knowledge_level_agent_type_mapping_file=${MAPS_CODE_DIR}/knowledge-level-agent-type-map.yaml ont_load_inventory_file=${MAPS_CODE_DIR}/ont-load-inventory${test_suffix}.yaml rtx_config_file=RTXConfiguration-config.json -biolink_model_version=4.2.0 +biolink_model_version=4.2.1 infores_registry_version=0.2.8 kg2_version_file=version.txt kg2_version_file_local=${BUILD_DIR}/${kg2_version_file} From b7597b948ac87fec01c87a02761cfe3cdb2f4880 Mon Sep 17 00:00:00 2001 From: ecwood Date: Thu, 1 Aug 2024 14:01:51 -0700 Subject: [PATCH 044/125] #404 predicate remapping for biolink 4.2.1 --- maps/predicate-remap.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/maps/predicate-remap.yaml b/maps/predicate-remap.yaml index 1e08b978..4da6dcac 100644 --- a/maps/predicate-remap.yaml +++ b/maps/predicate-remap.yaml @@ -4236,10 +4236,10 @@ SEMMEDDB:affects: core_predicate: biolink:affects SEMMEDDB:ASSOCIATED_WITH: operation: keep - core_predicate: biolink:associated_with + core_predicate: biolink:related_to SEMMEDDB:associated_with: operation: keep - core_predicate: biolink:associated_with + core_predicate: biolink:related_to SEMMEDDB:AUGMENTS: operation: keep core_predicate: biolink:affects From 2e62525e824ca2b650c3ff4f082248b8305f254b Mon Sep 17 00:00:00 2001 From: ecwood Date: Thu, 8 Aug 2024 16:52:17 -0700 Subject: [PATCH 045/125] #387 handle doctype special case from foodon --- misc-tools/owlparser.py | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/misc-tools/owlparser.py b/misc-tools/owlparser.py index 76715eb2..0a38ca66 100644 --- a/misc-tools/owlparser.py +++ b/misc-tools/owlparser.py @@ -5,8 +5,9 @@ COMMENT = "!--" XML_TAG = "?xml" RDF_TAG = "rdf:RDF" +DOCTYPE_TAG = "!DOCTYPE" -OUTMOST_TAGS_SKIP = [XML_TAG, RDF_TAG] +OUTMOST_TAGS_SKIP = [XML_TAG, RDF_TAG, DOCTYPE_TAG] LINE_TYPE_IGNORE = "ignore" LINE_TYPE_START_NEST = "start nest" @@ -48,6 +49,8 @@ def convert_line(line): start_reading_main = False start_reading_end_tag = False + start_brackets = 0 + for letter_index in range(len(line)): letter = line[letter_index] next_letter = "" @@ -57,6 +60,11 @@ def convert_line(line): if letter_index - 1 >= 0: prev_letter = line[letter_index - 1] + if letter == '<': + start_brackets += 1 + if letter == '>': + start_brackets -= 1 + # First < if letter == '<' and letter_index == 0: if next_letter != '/': @@ -71,14 +79,14 @@ def convert_line(line): start_reading_attributes = True start_reading_attribute_tag = True continue - elif letter == '>' and start_reading_tag: + elif letter == '>' and 
start_reading_tag and start_brackets == 0: start_reading_tag = False start_reading_main = True continue elif start_reading_tag: tag += letter - if letter == '>' and start_reading_attributes: + if letter == '>' and start_reading_attributes and start_brackets == 0: start_reading_attributes = False start_reading_attribute_tag = False start_reading_attribute_text = False @@ -117,7 +125,7 @@ def convert_line(line): elif start_reading_main: main_text += letter - if letter == '>' and start_reading_end_tag: + if letter == '>' and start_reading_end_tag and start_brackets == 0: continue elif start_reading_end_tag: end_tag += letter @@ -219,6 +227,7 @@ def divide_into_lines(input_file_name): curr_str = "" curr_nest = list() curr_nest_tags = list() # Treating it as a stack + start_brackets = 0 with open(input_file_name) as input_file: for line in input_file: @@ -226,13 +235,18 @@ def divide_into_lines(input_file_name): for letter_index in range(len(line_str)): letter = line_str[letter_index] + if letter == '<': + start_brackets += 1 + if letter == '>': + start_brackets -= 1 + next_letter = "" if letter_index + 1 < len(line_str): next_letter = line_str[letter_index + 1] curr_str += letter - if letter == '>' and (next_letter == '<' or next_letter == ""): + if letter == '>' and (next_letter == '<' or next_letter == "") and start_brackets == 0: # Only return if nesting line_parsed = convert_line(curr_str) @@ -264,6 +278,7 @@ def divide_into_lines(input_file_name): if curr_str != "": # divide lines by a space curr_str += ' ' + # print(json.dumps(curr_nest, indent=4)) if __name__ == '__main__': From 38634ddbe1ddeba8a2fb9ec349a6b7f4142584d8 Mon Sep 17 00:00:00 2001 From: ecwood Date: Thu, 8 Aug 2024 17:45:08 -0700 Subject: [PATCH 046/125] #387 handle doctype special case from foodon --- misc-tools/owlparser.py | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/misc-tools/owlparser.py b/misc-tools/owlparser.py index 0a38ca66..84fef2c2 100644 --- a/misc-tools/owlparser.py +++ b/misc-tools/owlparser.py @@ -6,6 +6,10 @@ XML_TAG = "?xml" RDF_TAG = "rdf:RDF" DOCTYPE_TAG = "!DOCTYPE" +CLASS_TAG = "owl:Class" +SUBCLASS_TAG = "rdfs:subClassOf" +NODEID_TAG = "rdf:nodeID" +GENID_PREFIX = "genid" OUTMOST_TAGS_SKIP = [XML_TAG, RDF_TAG, DOCTYPE_TAG] @@ -24,6 +28,11 @@ IGNORED_ATTRIBUTES = ["xml:lang"] +OUTPUT_NESTS = [] +GENID_REMAINING_NESTS = dict() +GENID_TO_ID = dict() +ID_TO_GENIDS = dict() + def get_args(): arg_parser = argparse.ArgumentParser() arg_parser.add_argument('--test', dest='test', @@ -223,6 +232,22 @@ def convert_nest(nest, start_index): return nest_dict, curr_index +def check_for_genids(nest_dict): + CLASS_TAG = "owl:Class" + SUBCLASS_TAG = "rdfs:subClassOf" + NODEID_TAG = "rdf:nodeID" + GENID_PREFIX = "genid" + + genids = list() + + for nest_class in nest_dict.get(CLASS_TAG, dict()): + for nest_subclass in nest_class.get(SUBCLASS_TAG, dict()): + potential_genid = nest_subclass.get(NODEID_TAG, str()) + if potential_genid.startswith(GENID_PREFIX): + genids.append(potential_genid) + + return genids + def divide_into_lines(input_file_name): curr_str = "" curr_nest = list() @@ -269,6 +294,9 @@ def divide_into_lines(input_file_name): output_nest = True if output_nest: nest_dict, _ = convert_nest(curr_nest, 0) + genids = check_for_genids(nest_dict) + if len(genids) > 0: + nest_dict['genids'] = genids print(json.dumps(nest_dict, indent=4)) curr_nest = list() curr_nest_tag = str() @@ -278,8 +306,6 @@ def divide_into_lines(input_file_name): if curr_str != "": # 
divide lines by a space curr_str += ' ' - # print(json.dumps(curr_nest, indent=4)) - if __name__ == '__main__': args = get_args() From 23ff6eaaf6f0cda76cb00eb07aa4be4cccb64ed3 Mon Sep 17 00:00:00 2001 From: ecwood Date: Sat, 10 Aug 2024 17:01:48 -0700 Subject: [PATCH 047/125] #387 refactored for clarity --- misc-tools/owlparser.py | 343 +++++++++++++++++++++++++++------------- 1 file changed, 235 insertions(+), 108 deletions(-) diff --git a/misc-tools/owlparser.py b/misc-tools/owlparser.py index 84fef2c2..f38035b9 100644 --- a/misc-tools/owlparser.py +++ b/misc-tools/owlparser.py @@ -7,8 +7,10 @@ RDF_TAG = "rdf:RDF" DOCTYPE_TAG = "!DOCTYPE" CLASS_TAG = "owl:Class" +RESTRICTION_TAG = "owl:Restriction" SUBCLASS_TAG = "rdfs:subClassOf" NODEID_TAG = "rdf:nodeID" +RDF_ABOUT_TAG = "rdf:about" GENID_PREFIX = "genid" OUTMOST_TAGS_SKIP = [XML_TAG, RDF_TAG, DOCTYPE_TAG] @@ -43,107 +45,21 @@ def get_args(): def date(): return datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") -def convert_line(line): - tag = "" - attributes = dict() - attribute_tag = "" - attribute_text = "" - main_text = "" - end_tag = "" - - start_reading_tag = False - start_reading_attributes = False - start_reading_attribute_tag = False - start_reading_attribute_text = False - start_reading_main = False - start_reading_end_tag = False - - start_brackets = 0 - - for letter_index in range(len(line)): - letter = line[letter_index] - next_letter = "" - prev_letter = "" - if letter_index + 1 < len(line): - next_letter = line[letter_index + 1] - if letter_index - 1 >= 0: - prev_letter = line[letter_index - 1] - - if letter == '<': - start_brackets += 1 - if letter == '>': - start_brackets -= 1 - - # First < - if letter == '<' and letter_index == 0: - if next_letter != '/': - start_reading_tag = True - continue - if letter == '/' and prev_letter == '<': - start_reading_end_tag = True - continue - - if letter == ' ' and start_reading_tag: - start_reading_tag = False - start_reading_attributes = True - start_reading_attribute_tag = True - continue - elif letter == '>' and start_reading_tag and start_brackets == 0: - start_reading_tag = False - start_reading_main = True - continue - elif start_reading_tag: - tag += letter - - if letter == '>' and start_reading_attributes and start_brackets == 0: - start_reading_attributes = False - start_reading_attribute_tag = False - start_reading_attribute_text = False - start_reading_main = True - if attribute_tag not in IGNORED_ATTRIBUTES: - attributes[attribute_tag] = attribute_text.strip('/').strip('"') - attribute_tag = "" - attribute_text = "" - - if prev_letter == '/': - end_tag = tag - continue - elif start_reading_attributes: - if letter == '=' and start_reading_attribute_tag: - start_reading_attribute_text = True - start_reading_attribute_tag = False - continue - elif start_reading_attribute_tag: - attribute_tag += letter - - if letter == ' ' and start_reading_attribute_text: - start_reading_attribute_tag = True - start_reading_attribute_text = False - if attribute_tag not in IGNORED_ATTRIBUTES: - attributes[attribute_tag] = attribute_text.strip('/').strip('"') - attribute_tag = "" - attribute_text = "" - continue - elif start_reading_attribute_text: - attribute_text += letter - - if letter == '<' and start_reading_main: - start_reading_main = False - start_reading_end_tag = True - continue - elif start_reading_main: - main_text += letter +class LineElementRead(): + TAG = 1 + ATTRIBUTE_TAG = 2 + ATTRIBUTE_TEXT = 3 + MAIN = 4 + END_TAG = 5 - if letter == '>' and start_reading_end_tag and 
start_brackets == 0: - continue - elif start_reading_end_tag: - end_tag += letter +def categorize_line(tag, attributes, main_text, end_tag, only_tag): # Categorize the type of line line_type = str() out = dict() - if tag == COMMENT or tag in OUTMOST_TAGS_SKIP or end_tag in OUTMOST_TAGS_SKIP: + # Putting "only_tag" here isn't necessarily the best idea, but I don't know what else to do with it + if tag == COMMENT or tag in OUTMOST_TAGS_SKIP or end_tag in OUTMOST_TAGS_SKIP or only_tag: line_type = LINE_TYPE_IGNORE else: start_tag_exists = (tag != str()) @@ -181,6 +97,143 @@ def convert_line(line): return out +def get_letters(line, letter_index, start_brackets): + letter = line[letter_index] + next_letter = "" + prev_letter = "" + if letter_index + 1 < len(line): + next_letter = line[letter_index + 1] + if letter_index - 1 >= 0: + prev_letter = line[letter_index - 1] + + if letter == '<': + start_brackets += 1 + if letter == '>': + start_brackets -= 1 + + return letter, next_letter, prev_letter, start_brackets + + +def identify_tag_type(letter_index, letter, next_letter, prev_letter, type_to_read): + changed = False + + if letter == '<' and letter_index == 0: + if next_letter != '/': + type_to_read = LineElementRead.TAG + changed = True + if letter == '/' and prev_letter == '<': + type_to_read = LineElementRead.END_TAG + changed = True + + return changed, type_to_read + + +def read_tag(letter, prev_letter, type_to_read, start_brackets, tag, line): + only_tag = False + changed = False + + if letter == ' ' and type_to_read == LineElementRead.TAG: + type_to_read = LineElementRead.ATTRIBUTE_TAG + changed = True + elif letter == '>' and type_to_read == LineElementRead.TAG and start_brackets == 0: + type_to_read = LineElementRead.MAIN + + if prev_letter == '/': + print("Warning - strange tag, ignoring", line) + only_tag = True + changed = True + elif type_to_read == LineElementRead.TAG: + tag += letter + changed = True + + return changed, type_to_read, (only_tag, tag) + + +def store_attribute(attributes, attribute_tag, attribute_text): + if attribute_tag not in IGNORED_ATTRIBUTES: + attributes[attribute_tag] = attribute_text.strip('/').strip('"') + attribute_tag = "" + attribute_text = "" + + return attributes, attribute_tag, attribute_text + + +def process_attributes(letter, prev_letter, type_to_read, start_brackets, attributes, attribute_tag, attribute_text, tag, end_tag): + changed = False + start_reading_attributes = (type_to_read == LineElementRead.ATTRIBUTE_TAG or type_to_read == LineElementRead.ATTRIBUTE_TEXT) + + if letter == '>' and start_reading_attributes and start_brackets == 0: + type_to_read = LineElementRead.MAIN + attributes, attribute_tag, attribute_text = store_attribute(attributes, attribute_tag, attribute_text) + + if prev_letter == '/': + end_tag = tag + changed = True + elif start_reading_attributes: + if letter == '=' and type_to_read == LineElementRead.ATTRIBUTE_TAG: + type_to_read = LineElementRead.ATTRIBUTE_TEXT + changed = True + elif type_to_read == LineElementRead.ATTRIBUTE_TAG: + attribute_tag += letter + changed = True + + elif letter == ' ' and type_to_read == LineElementRead.ATTRIBUTE_TEXT: + type_to_read = LineElementRead.ATTRIBUTE_TAG + attributes, attribute_tag, attribute_text = store_attribute(attributes, attribute_tag, attribute_text) + changed = True + elif type_to_read == LineElementRead.ATTRIBUTE_TEXT: + attribute_text += letter + changed = True + + return changed, type_to_read, (attributes, attribute_tag, attribute_text, end_tag) + + + +def 
convert_line(line): + tag = "" + attributes = dict() + attribute_tag = "" + attribute_text = "" + main_text = "" + end_tag = "" + + type_to_read = 0 + + only_tag = False + + start_brackets = 0 + + for letter_index in range(len(line)): + letter, next_letter, prev_letter, start_brackets = get_letters(line, letter_index, start_brackets) + + # First < + tag_identified, type_to_read = identify_tag_type(letter_index, letter, next_letter, prev_letter, type_to_read) + if tag_identified: + continue + + tag_read, type_to_read, tag_read_data = read_tag(letter, prev_letter, type_to_read, start_brackets, tag, line) + if tag_read: + (only_tag, tag) = tag_read_data + continue + + attributes_read, type_to_read, attributes_read_data = process_attributes(letter, prev_letter, type_to_read, start_brackets, attributes, attribute_tag, attribute_text, tag, end_tag) + if attributes_read: + (attributes, attribute_tag, attribute_text, end_tag) = attributes_read_data + continue + + if letter == '<' and type_to_read == LineElementRead.MAIN: + type_to_read = LineElementRead.END_TAG + continue + elif type_to_read == LineElementRead.MAIN: + main_text += letter + + if letter == '>' and type_to_read == LineElementRead.END_TAG and start_brackets == 0: + continue + elif type_to_read == LineElementRead.END_TAG: + end_tag += letter + + return categorize_line(tag, attributes, main_text, end_tag, only_tag) + def convert_nest(nest, start_index): nest_dict = dict() @@ -232,22 +285,83 @@ def convert_nest(nest, start_index): return nest_dict, curr_index -def check_for_genids(nest_dict): - CLASS_TAG = "owl:Class" - SUBCLASS_TAG = "rdfs:subClassOf" - NODEID_TAG = "rdf:nodeID" - GENID_PREFIX = "genid" - +def check_for_class_genids(nest_dict): genids = list() - for nest_class in nest_dict.get(CLASS_TAG, dict()): - for nest_subclass in nest_class.get(SUBCLASS_TAG, dict()): + nest_dict_classes = nest_dict.get(CLASS_TAG, list()) + for nest_class_index in range(len(nest_dict_classes)): + nest_class = nest_dict_classes[nest_class_index] + nest_subclasses = nest_class.get(SUBCLASS_TAG, list()) + for nest_subclass_index in range(len(nest_subclasses)): + nest_subclass = nest_subclasses[nest_subclass_index] potential_genid = nest_subclass.get(NODEID_TAG, str()) if potential_genid.startswith(GENID_PREFIX): genids.append(potential_genid) return genids + +def check_for_restriction_genids(nest_dict): + for nest_restriction in nest_dict.get(RESTRICTION_TAG, dict()): + potential_genid = nest_restriction.get(NODEID_TAG, str()) + if potential_genid.startswith(GENID_PREFIX): + return potential_genid + return None + +def extract_class_id(nest_dict): + nest_dict_classes = nest_dict.get(CLASS_TAG, list()) + # Can't have competing class_ids + assert len(nest_dict_classes) <= 1 + + for nest_class_index in range(len(nest_dict_classes)): + nest_class = nest_dict_classes[nest_class_index] + return nest_class.get(RDF_ABOUT_TAG, str()) + +def store_genid_nest_in_class_nest(genid, genid_nest, class_nest): + output_class_nest = class_nest + + nest_dict_classes = class_nest.get(CLASS_TAG, list()) + for nest_class_index in range(len(nest_dict_classes)): + nest_class = nest_dict_classes[nest_class_index] + nest_subclasses = nest_class.get(SUBCLASS_TAG, list()) + for nest_subclass_index in range(len(nest_subclasses)): + nest_subclass = nest_subclasses[nest_subclass_index] + potential_genid = nest_subclass.get(NODEID_TAG, str()) + if potential_genid == genid: + output_class_nest[CLASS_TAG][nest_class_index][SUBCLASS_TAG][nest_subclass_index][RESTRICTION_TAG] = 
genid_nest[RESTRICTION_TAG] + + return output_class_nest + + +def triage_nest_dict(nest_dict): + genids = check_for_class_genids(nest_dict) + restriction_genid = check_for_restriction_genids(nest_dict) + class_id = extract_class_id(nest_dict) + + if len(genids) > 0: + for genid in genids: + GENID_TO_ID[genid] = class_id + ID_TO_GENIDS[class_id] = genids + GENID_REMAINING_NESTS[class_id] = nest_dict + elif restriction_genid is not None: + class_id = GENID_TO_ID.get(restriction_genid, str()) + if len(class_id) == 0: + print("WARNING WITH:", restriction_genid, "- NO CLASS_ID FOUND") + OUTPUT_NESTS.append(nest_dict) + return + class_nest = GENID_REMAINING_NESTS[class_id] + ID_TO_GENIDS[class_id].remove(restriction_genid) + updated_class_nest = store_genid_nest_in_class_nest(restriction_genid, nest_dict, class_nest) + + if len(ID_TO_GENIDS[class_id]) > 0: + GENID_REMAINING_NESTS[class_id] = updated_class_nest + else: + OUTPUT_NESTS.append(updated_class_nest) + GENID_REMAINING_NESTS[class_id] = None + else: + OUTPUT_NESTS.append(nest_dict) + + def divide_into_lines(input_file_name): curr_str = "" curr_nest = list() @@ -289,15 +403,18 @@ def divide_into_lines(input_file_name): curr_nest_tags.append(tag) elif line_type == LINE_TYPE_END_NEST: popped_curr_nest_tag = curr_nest_tags.pop() - assert popped_curr_nest_tag == tag + assert popped_curr_nest_tag == tag, curr_nest if len(curr_nest_tags) == 0: output_nest = True if output_nest: nest_dict, _ = convert_nest(curr_nest, 0) - genids = check_for_genids(nest_dict) - if len(genids) > 0: - nest_dict['genids'] = genids - print(json.dumps(nest_dict, indent=4)) + # genids = check_for_class_genids(nest_dict) + triage_nest_dict(nest_dict) + # restriction_genid = check_for_restriction_genids(nest_dict) + + # if len(genids) > 0: + # nest_dict['genids'] = genids + # print(json.dumps(nest_dict, indent=4)) curr_nest = list() curr_nest_tag = str() @@ -307,6 +424,16 @@ def divide_into_lines(input_file_name): # divide lines by a space curr_str += ' ' + print(json.dumps(OUTPUT_NESTS, indent=4)) + + print("=========") + + print("Remaining:") + for item in GENID_REMAINING_NESTS: + if GENID_REMAINING_NESTS[item] != None: + print(item) + print(json.dumps(GENID_REMAINING_NESTS[item], indent=4)) + if __name__ == '__main__': args = get_args() input_file_name = args.inputFile From 9b8dfc49b5451b4622568b9168530bab991c711a Mon Sep 17 00:00:00 2001 From: ecwood Date: Sat, 10 Aug 2024 18:42:06 -0700 Subject: [PATCH 048/125] #387 more refactoring, but pre-sorting into classes --- misc-tools/owlparser.py | 65 ++++++++++++++++++++++++++--------------- 1 file changed, 41 insertions(+), 24 deletions(-) diff --git a/misc-tools/owlparser.py b/misc-tools/owlparser.py index f38035b9..114e72e9 100644 --- a/misc-tools/owlparser.py +++ b/misc-tools/owlparser.py @@ -115,36 +115,35 @@ def get_letters(line, letter_index, start_brackets): def identify_tag_type(letter_index, letter, next_letter, prev_letter, type_to_read): - changed = False + changed = True if letter == '<' and letter_index == 0: if next_letter != '/': type_to_read = LineElementRead.TAG - changed = True - if letter == '/' and prev_letter == '<': + elif letter == '/' and prev_letter == '<': type_to_read = LineElementRead.END_TAG - changed = True + else: + changed = False return changed, type_to_read def read_tag(letter, prev_letter, type_to_read, start_brackets, tag, line): only_tag = False - changed = False + changed = True if letter == ' ' and type_to_read == LineElementRead.TAG: type_to_read = LineElementRead.ATTRIBUTE_TAG - 
changed = True elif letter == '>' and type_to_read == LineElementRead.TAG and start_brackets == 0: type_to_read = LineElementRead.MAIN if prev_letter == '/': print("Warning - strange tag, ignoring", line) only_tag = True - changed = True elif type_to_read == LineElementRead.TAG: tag += letter - changed = True + else: + changed = False return changed, type_to_read, (only_tag, tag) @@ -158,8 +157,8 @@ def store_attribute(attributes, attribute_tag, attribute_text): return attributes, attribute_tag, attribute_text -def process_attributes(letter, prev_letter, type_to_read, start_brackets, attributes, attribute_tag, attribute_text, tag, end_tag): - changed = False +def read_attributes(letter, prev_letter, type_to_read, start_brackets, attributes, attribute_tag, attribute_text, tag, end_tag): + changed = True start_reading_attributes = (type_to_read == LineElementRead.ATTRIBUTE_TAG or type_to_read == LineElementRead.ATTRIBUTE_TEXT) if letter == '>' and start_reading_attributes and start_brackets == 0: @@ -168,26 +167,45 @@ def process_attributes(letter, prev_letter, type_to_read, start_brackets, attrib if prev_letter == '/': end_tag = tag - changed = True elif start_reading_attributes: if letter == '=' and type_to_read == LineElementRead.ATTRIBUTE_TAG: type_to_read = LineElementRead.ATTRIBUTE_TEXT - changed = True elif type_to_read == LineElementRead.ATTRIBUTE_TAG: attribute_tag += letter - changed = True - elif letter == ' ' and type_to_read == LineElementRead.ATTRIBUTE_TEXT: type_to_read = LineElementRead.ATTRIBUTE_TAG attributes, attribute_tag, attribute_text = store_attribute(attributes, attribute_tag, attribute_text) - changed = True elif type_to_read == LineElementRead.ATTRIBUTE_TEXT: attribute_text += letter - changed = True + else: + changed = False return changed, type_to_read, (attributes, attribute_tag, attribute_text, end_tag) +def read_main(letter, type_to_read, main_text): + changed = True + if letter == '<' and type_to_read == LineElementRead.MAIN: + type_to_read = LineElementRead.END_TAG + elif type_to_read == LineElementRead.MAIN: + main_text += letter + else: + changed = False + + return changed, type_to_read, (main_text) + + +def read_end_tag(letter, type_to_read, start_brackets, end_tag): + changed = True + if letter == '>' and type_to_read == LineElementRead.END_TAG and start_brackets == 0: + pass + elif type_to_read == LineElementRead.END_TAG: + end_tag += letter + else: + changed = False + + return changed, type_to_read, (end_tag) + def convert_line(line): tag = "" @@ -216,21 +234,20 @@ def convert_line(line): (only_tag, tag) = tag_read_data continue - attributes_read, type_to_read, attributes_read_data = process_attributes(letter, prev_letter, type_to_read, start_brackets, attributes, attribute_tag, attribute_text, tag, end_tag) + attributes_read, type_to_read, attributes_read_data = read_attributes(letter, prev_letter, type_to_read, start_brackets, attributes, attribute_tag, attribute_text, tag, end_tag) if attributes_read: (attributes, attribute_tag, attribute_text, end_tag) = attributes_read_data continue - if letter == '<' and type_to_read == LineElementRead.MAIN: - type_to_read = LineElementRead.END_TAG + main_read, type_to_read, main_read_data = read_main(letter, type_to_read, main_text) + if main_read: + (main_text) = main_read_data continue - elif type_to_read == LineElementRead.MAIN: - main_text += letter - if letter == '>' and type_to_read == LineElementRead.END_TAG and start_brackets == 0: + end_tag_read, type_to_read, end_tag_read_data = read_end_tag(letter, 
type_to_read, start_brackets, end_tag) + if end_tag_read: + (end_tag) = end_tag_read_data continue - elif type_to_read == LineElementRead.END_TAG: - end_tag += letter return categorize_line(tag, attributes, main_text, end_tag, only_tag) From bab707c4f4bdca3d3c7eed72a0f3c2a63bf696e0 Mon Sep 17 00:00:00 2001 From: ecwood Date: Mon, 12 Aug 2024 11:42:37 -0700 Subject: [PATCH 049/125] #387 refactored into class form --- misc-tools/owlparser.py | 745 ++++++++++++++++++++-------------------- 1 file changed, 379 insertions(+), 366 deletions(-) diff --git a/misc-tools/owlparser.py b/misc-tools/owlparser.py index 114e72e9..83371543 100644 --- a/misc-tools/owlparser.py +++ b/misc-tools/owlparser.py @@ -2,39 +2,6 @@ import argparse import datetime -COMMENT = "!--" -XML_TAG = "?xml" -RDF_TAG = "rdf:RDF" -DOCTYPE_TAG = "!DOCTYPE" -CLASS_TAG = "owl:Class" -RESTRICTION_TAG = "owl:Restriction" -SUBCLASS_TAG = "rdfs:subClassOf" -NODEID_TAG = "rdf:nodeID" -RDF_ABOUT_TAG = "rdf:about" -GENID_PREFIX = "genid" - -OUTMOST_TAGS_SKIP = [XML_TAG, RDF_TAG, DOCTYPE_TAG] - -LINE_TYPE_IGNORE = "ignore" -LINE_TYPE_START_NEST = "start nest" -LINE_TYPE_START_NEST_WITH_ATTR = "start nest with attributes" -LINE_TYPE_ENTRY = "entry" -LINE_TYPE_ENTRY_WITH_ATTR = "entry with attributes" -LINE_TYPE_ENTRY_ONLY_ATTR = "entry with only attributes" -LINE_TYPE_END_NEST = "end nest" - -KEY_TAG = "tag" -KEY_ATTRIBUTES = "attributes" -KEY_TEXT = "ENTRY_TEXT" -KEY_TYPE = "type" - -IGNORED_ATTRIBUTES = ["xml:lang"] - -OUTPUT_NESTS = [] -GENID_REMAINING_NESTS = dict() -GENID_TO_ID = dict() -ID_TO_GENIDS = dict() - def get_args(): arg_parser = argparse.ArgumentParser() arg_parser.add_argument('--test', dest='test', @@ -52,404 +19,449 @@ class LineElementRead(): MAIN = 4 END_TAG = 5 - -def categorize_line(tag, attributes, main_text, end_tag, only_tag): - # Categorize the type of line - line_type = str() - out = dict() - - # Putting "only_tag" here isn't necessarily the best idea, but I don't know what else to do with it - if tag == COMMENT or tag in OUTMOST_TAGS_SKIP or end_tag in OUTMOST_TAGS_SKIP or only_tag: - line_type = LINE_TYPE_IGNORE - else: - start_tag_exists = (tag != str()) - attributes_exist = (attributes != dict()) - text_exists = (main_text != str()) - end_tag_exists = (end_tag != str()) - - if start_tag_exists: - if attributes_exist: - if text_exists: - line_type = LINE_TYPE_ENTRY_WITH_ATTR - out[KEY_TAG] = tag - out[KEY_ATTRIBUTES] = attributes - out[KEY_TEXT] = main_text - elif end_tag_exists: - line_type = LINE_TYPE_ENTRY_ONLY_ATTR - out[KEY_TAG] = tag - out[KEY_ATTRIBUTES] = attributes +class XMLParser(): + def __init__(self, skip_tags, ignored_attributes, processing_func): + self.COMMENT = "!--" + self.OUTMOST_TAGS_SKIP = skip_tags + self.IGNORED_ATTRIBUTES = ignored_attributes + self.processing_func = processing_func + + self.LINE_TYPE_IGNORE = "ignore" + self.LINE_TYPE_START_NEST = "start nest" + self.LINE_TYPE_START_NEST_WITH_ATTR = "start nest with attributes" + self.LINE_TYPE_ENTRY = "entry" + self.LINE_TYPE_ENTRY_WITH_ATTR = "entry with attributes" + self.LINE_TYPE_ENTRY_ONLY_ATTR = "entry with only attributes" + self.LINE_TYPE_END_NEST = "end nest" + + self.KEY_TAG = "tag" + self.KEY_ATTRIBUTES = "attributes" + self.KEY_TEXT = "ENTRY_TEXT" + self.KEY_TYPE = "type" + + + def categorize_line(self, tag, attributes, main_text, end_tag, only_tag): + # Categorize the type of line + line_type = str() + out = dict() + + # Putting "only_tag" here isn't necessarily the best idea, but I don't know what else to do with it + 
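        # A rough map of the categorization below, as a hedged sketch (the
        # example tags are illustrative, not from the source):
        #   <tag>                     -> LINE_TYPE_START_NEST
        #   <tag attr="v">            -> LINE_TYPE_START_NEST_WITH_ATTR
        #   <tag>text</tag>           -> LINE_TYPE_ENTRY
        #   <tag attr="v">text</tag>  -> LINE_TYPE_ENTRY_WITH_ATTR
        #   <tag attr="v"/>           -> LINE_TYPE_ENTRY_ONLY_ATTR
        #   </tag>                    -> LINE_TYPE_END_NEST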
if tag == self.COMMENT or tag in self.OUTMOST_TAGS_SKIP or end_tag in self.OUTMOST_TAGS_SKIP or only_tag: + line_type = self.LINE_TYPE_IGNORE + else: + start_tag_exists = (tag != str()) + attributes_exist = (attributes != dict()) + text_exists = (main_text != str()) + end_tag_exists = (end_tag != str()) + + if start_tag_exists: + if attributes_exist: + if text_exists: + line_type = self.LINE_TYPE_ENTRY_WITH_ATTR + out[self.KEY_TAG] = tag + out[self.KEY_ATTRIBUTES] = attributes + out[self.KEY_TEXT] = main_text + elif end_tag_exists: + line_type = self.LINE_TYPE_ENTRY_ONLY_ATTR + out[self.KEY_TAG] = tag + out[self.KEY_ATTRIBUTES] = attributes + else: + line_type = self.LINE_TYPE_START_NEST_WITH_ATTR + out[self.KEY_TAG] = tag + out[self.KEY_ATTRIBUTES] = attributes + elif text_exists: + line_type = self.LINE_TYPE_ENTRY + out[self.KEY_TAG] = tag + out[self.KEY_TEXT] = main_text else: - line_type = LINE_TYPE_START_NEST_WITH_ATTR - out[KEY_TAG] = tag - out[KEY_ATTRIBUTES] = attributes - elif text_exists: - line_type = LINE_TYPE_ENTRY - out[KEY_TAG] = tag - out[KEY_TEXT] = main_text - else: - line_type = LINE_TYPE_START_NEST - out[KEY_TAG] = tag - elif end_tag_exists: - line_type = LINE_TYPE_END_NEST - out[KEY_TAG] = end_tag + line_type = self.LINE_TYPE_START_NEST + out[self.KEY_TAG] = tag + elif end_tag_exists: + line_type = self.LINE_TYPE_END_NEST + out[self.KEY_TAG] = end_tag - out[KEY_TYPE] = line_type + out[self.KEY_TYPE] = line_type - return out + return out -def get_letters(line, letter_index, start_brackets): - letter = line[letter_index] - next_letter = "" - prev_letter = "" - if letter_index + 1 < len(line): - next_letter = line[letter_index + 1] - if letter_index - 1 >= 0: - prev_letter = line[letter_index - 1] + def get_letters(self, line, letter_index, start_brackets): + letter = line[letter_index] + next_letter = "" + prev_letter = "" + if letter_index + 1 < len(line): + next_letter = line[letter_index + 1] + if letter_index - 1 >= 0: + prev_letter = line[letter_index - 1] - if letter == '<': - start_brackets += 1 - if letter == '>': - start_brackets -= 1 + if letter == '<': + start_brackets += 1 + if letter == '>': + start_brackets -= 1 - return letter, next_letter, prev_letter, start_brackets + return letter, next_letter, prev_letter, start_brackets -def identify_tag_type(letter_index, letter, next_letter, prev_letter, type_to_read): - changed = True + def identify_tag_type(self, letter_index, letter, next_letter, prev_letter, type_to_read): + changed = True - if letter == '<' and letter_index == 0: - if next_letter != '/': - type_to_read = LineElementRead.TAG - elif letter == '/' and prev_letter == '<': - type_to_read = LineElementRead.END_TAG - else: - changed = False + if letter == '<' and letter_index == 0: + if next_letter != '/': + type_to_read = LineElementRead.TAG + elif letter == '/' and prev_letter == '<': + type_to_read = LineElementRead.END_TAG + else: + changed = False - return changed, type_to_read + return changed, type_to_read -def read_tag(letter, prev_letter, type_to_read, start_brackets, tag, line): - only_tag = False - changed = True + def read_tag(self, letter, prev_letter, type_to_read, start_brackets, tag, line): + only_tag = False + changed = True - if letter == ' ' and type_to_read == LineElementRead.TAG: - type_to_read = LineElementRead.ATTRIBUTE_TAG - elif letter == '>' and type_to_read == LineElementRead.TAG and start_brackets == 0: - type_to_read = LineElementRead.MAIN + if letter == ' ' and type_to_read == LineElementRead.TAG: + type_to_read = 
LineElementRead.ATTRIBUTE_TAG + elif letter == '>' and type_to_read == LineElementRead.TAG and start_brackets == 0: + type_to_read = LineElementRead.MAIN + + if prev_letter == '/': + print("Warning - strange tag, ignoring", line) + only_tag = True + elif type_to_read == LineElementRead.TAG: + tag += letter + else: + changed = False - if prev_letter == '/': - print("Warning - strange tag, ignoring", line) - only_tag = True - elif type_to_read == LineElementRead.TAG: - tag += letter - else: - changed = False + return changed, type_to_read, (only_tag, tag) - return changed, type_to_read, (only_tag, tag) + def store_attribute(self, attributes, attribute_tag, attribute_text): + if attribute_tag not in self.IGNORED_ATTRIBUTES: + attributes[attribute_tag] = attribute_text.strip('/').strip('"') + attribute_tag = "" + attribute_text = "" -def store_attribute(attributes, attribute_tag, attribute_text): - if attribute_tag not in IGNORED_ATTRIBUTES: - attributes[attribute_tag] = attribute_text.strip('/').strip('"') - attribute_tag = "" - attribute_text = "" + return attributes, attribute_tag, attribute_text - return attributes, attribute_tag, attribute_text + def read_attributes(self, letter, prev_letter, type_to_read, start_brackets, attributes, attribute_tag, attribute_text, tag, end_tag): + changed = True + start_reading_attributes = (type_to_read == LineElementRead.ATTRIBUTE_TAG or type_to_read == LineElementRead.ATTRIBUTE_TEXT) -def read_attributes(letter, prev_letter, type_to_read, start_brackets, attributes, attribute_tag, attribute_text, tag, end_tag): - changed = True - start_reading_attributes = (type_to_read == LineElementRead.ATTRIBUTE_TAG or type_to_read == LineElementRead.ATTRIBUTE_TEXT) + if letter == '>' and start_reading_attributes and start_brackets == 0: + type_to_read = LineElementRead.MAIN + attributes, attribute_tag, attribute_text = self.store_attribute(attributes, attribute_tag, attribute_text) - if letter == '>' and start_reading_attributes and start_brackets == 0: - type_to_read = LineElementRead.MAIN - attributes, attribute_tag, attribute_text = store_attribute(attributes, attribute_tag, attribute_text) + if prev_letter == '/': + end_tag = tag + elif start_reading_attributes: + if letter == '=' and type_to_read == LineElementRead.ATTRIBUTE_TAG: + type_to_read = LineElementRead.ATTRIBUTE_TEXT + elif type_to_read == LineElementRead.ATTRIBUTE_TAG: + attribute_tag += letter + elif letter == ' ' and type_to_read == LineElementRead.ATTRIBUTE_TEXT: + type_to_read = LineElementRead.ATTRIBUTE_TAG + attributes, attribute_tag, attribute_text = self.store_attribute(attributes, attribute_tag, attribute_text) + elif type_to_read == LineElementRead.ATTRIBUTE_TEXT: + attribute_text += letter + else: + changed = False - if prev_letter == '/': - end_tag = tag - elif start_reading_attributes: - if letter == '=' and type_to_read == LineElementRead.ATTRIBUTE_TAG: - type_to_read = LineElementRead.ATTRIBUTE_TEXT - elif type_to_read == LineElementRead.ATTRIBUTE_TAG: - attribute_tag += letter - elif letter == ' ' and type_to_read == LineElementRead.ATTRIBUTE_TEXT: - type_to_read = LineElementRead.ATTRIBUTE_TAG - attributes, attribute_tag, attribute_text = store_attribute(attributes, attribute_tag, attribute_text) - elif type_to_read == LineElementRead.ATTRIBUTE_TEXT: - attribute_text += letter - else: - changed = False + return changed, type_to_read, (attributes, attribute_tag, attribute_text, end_tag) - return changed, type_to_read, (attributes, attribute_tag, attribute_text, end_tag) + def 
read_main(self, letter, type_to_read, main_text): + changed = True + if letter == '<' and type_to_read == LineElementRead.MAIN: + type_to_read = LineElementRead.END_TAG + elif type_to_read == LineElementRead.MAIN: + main_text += letter + else: + changed = False -def read_main(letter, type_to_read, main_text): - changed = True - if letter == '<' and type_to_read == LineElementRead.MAIN: - type_to_read = LineElementRead.END_TAG - elif type_to_read == LineElementRead.MAIN: - main_text += letter - else: - changed = False + return changed, type_to_read, (main_text) - return changed, type_to_read, (main_text) + def read_end_tag(self, letter, type_to_read, start_brackets, end_tag): + changed = True + if letter == '>' and type_to_read == LineElementRead.END_TAG and start_brackets == 0: + pass + elif type_to_read == LineElementRead.END_TAG: + end_tag += letter + else: + changed = False -def read_end_tag(letter, type_to_read, start_brackets, end_tag): - changed = True - if letter == '>' and type_to_read == LineElementRead.END_TAG and start_brackets == 0: - pass - elif type_to_read == LineElementRead.END_TAG: - end_tag += letter - else: - changed = False + return changed, type_to_read, (end_tag) - return changed, type_to_read, (end_tag) + def convert_line(self, line): + tag = "" + attributes = dict() + attribute_tag = "" + attribute_text = "" + main_text = "" + end_tag = "" -def convert_line(line): - tag = "" - attributes = dict() - attribute_tag = "" - attribute_text = "" - main_text = "" - end_tag = "" + type_to_read = 0 - type_to_read = 0 + only_tag = False - only_tag = False + start_brackets = 0 - start_brackets = 0 + for letter_index in range(len(line)): + letter, next_letter, prev_letter, start_brackets = self.get_letters(line, letter_index, start_brackets) - for letter_index in range(len(line)): - letter, next_letter, prev_letter, start_brackets = get_letters(line, letter_index, start_brackets) + # First < + tag_identified, type_to_read = self.identify_tag_type(letter_index, letter, next_letter, prev_letter, type_to_read) + if tag_identified: + continue - # First < - tag_identified, type_to_read = identify_tag_type(letter_index, letter, next_letter, prev_letter, type_to_read) - if tag_identified: - continue + tag_read, type_to_read, tag_read_data = self.read_tag(letter, prev_letter, type_to_read, start_brackets, tag, line) + if tag_read: + (only_tag, tag) = tag_read_data + continue - tag_read, type_to_read, tag_read_data = read_tag(letter, prev_letter, type_to_read, start_brackets, tag, line) - if tag_read: - (only_tag, tag) = tag_read_data - continue + attributes_read, type_to_read, attributes_read_data = self.read_attributes(letter, prev_letter, type_to_read, start_brackets, attributes, attribute_tag, attribute_text, tag, end_tag) + if attributes_read: + (attributes, attribute_tag, attribute_text, end_tag) = attributes_read_data + continue - attributes_read, type_to_read, attributes_read_data = read_attributes(letter, prev_letter, type_to_read, start_brackets, attributes, attribute_tag, attribute_text, tag, end_tag) - if attributes_read: - (attributes, attribute_tag, attribute_text, end_tag) = attributes_read_data - continue + main_read, type_to_read, main_read_data = self.read_main(letter, type_to_read, main_text) + if main_read: + (main_text) = main_read_data + continue - main_read, type_to_read, main_read_data = read_main(letter, type_to_read, main_text) - if main_read: - (main_text) = main_read_data - continue + end_tag_read, type_to_read, end_tag_read_data = self.read_end_tag(letter, 
type_to_read, start_brackets, end_tag) + if end_tag_read: + (end_tag) = end_tag_read_data + continue - end_tag_read, type_to_read, end_tag_read_data = read_end_tag(letter, type_to_read, start_brackets, end_tag) - if end_tag_read: - (end_tag) = end_tag_read_data - continue + return self.categorize_line(tag, attributes, main_text, end_tag, only_tag) - return categorize_line(tag, attributes, main_text, end_tag, only_tag) + def convert_nest(self, nest, start_index): + nest_dict = dict() + curr_index = start_index -def convert_nest(nest, start_index): - nest_dict = dict() - curr_index = start_index + while curr_index < len(nest): + element = nest[curr_index] + line_type = element[self.KEY_TYPE] + line_tag = element[self.KEY_TAG] + line_text = element.get(self.KEY_TEXT, None) + line_attributes = element.get(self.KEY_ATTRIBUTES, None) - while curr_index < len(nest): - element = nest[curr_index] - line_type = element[KEY_TYPE] - line_tag = element[KEY_TAG] - line_text = element.get(KEY_TEXT, None) - line_attributes = element.get(KEY_ATTRIBUTES, None) + if line_type in [self.LINE_TYPE_START_NEST, self.LINE_TYPE_START_NEST_WITH_ATTR]: + if line_tag not in nest_dict: + nest_dict[line_tag] = list() - if line_type in [LINE_TYPE_START_NEST, LINE_TYPE_START_NEST_WITH_ATTR]: - if line_tag not in nest_dict: - nest_dict[line_tag] = list() + converted_nest, ret_index = self.convert_nest(nest, curr_index + 1) - converted_nest, ret_index = convert_nest(nest, curr_index + 1) + if line_attributes is not None: + for attribute in line_attributes: + converted_nest[attribute] = line_attributes[attribute] - if line_attributes is not None: - for attribute in line_attributes: - converted_nest[attribute] = line_attributes[attribute] + nest_dict[line_tag].append(converted_nest) - nest_dict[line_tag].append(converted_nest) + curr_index = ret_index + 1 + continue - curr_index = ret_index + 1 - continue + if line_type in [self.LINE_TYPE_ENTRY, self.LINE_TYPE_ENTRY_WITH_ATTR, self.LINE_TYPE_ENTRY_ONLY_ATTR]: + if line_tag not in nest_dict: + nest_dict[line_tag] = list() - if line_type in [LINE_TYPE_ENTRY, LINE_TYPE_ENTRY_WITH_ATTR, LINE_TYPE_ENTRY_ONLY_ATTR]: - if line_tag not in nest_dict: - nest_dict[line_tag] = list() + curr_dict = dict() - curr_dict = dict() - - if line_text is not None: - curr_dict[KEY_TEXT] = line_text + if line_text is not None: + curr_dict[self.KEY_TEXT] = line_text - if line_attributes is not None: - for attribute in line_attributes: - curr_dict[attribute] = line_attributes[attribute] + if line_attributes is not None: + for attribute in line_attributes: + curr_dict[attribute] = line_attributes[attribute] - nest_dict[line_tag].append(curr_dict) + nest_dict[line_tag].append(curr_dict) - curr_index += 1 - continue - - if line_type in [LINE_TYPE_END_NEST]: - return nest_dict, curr_index + curr_index += 1 + continue - return nest_dict, curr_index + if line_type in [self.LINE_TYPE_END_NEST]: + return nest_dict, curr_index + return nest_dict, curr_index -def check_for_class_genids(nest_dict): - genids = list() - nest_dict_classes = nest_dict.get(CLASS_TAG, list()) - for nest_class_index in range(len(nest_dict_classes)): - nest_class = nest_dict_classes[nest_class_index] - nest_subclasses = nest_class.get(SUBCLASS_TAG, list()) - for nest_subclass_index in range(len(nest_subclasses)): - nest_subclass = nest_subclasses[nest_subclass_index] - potential_genid = nest_subclass.get(NODEID_TAG, str()) - if potential_genid.startswith(GENID_PREFIX): - genids.append(potential_genid) + def divide_into_lines(self, 
input_file_name): + curr_str = "" + curr_nest = list() + curr_nest_tags = list() # Treating it as a stack + start_brackets = 0 - return genids + with open(input_file_name) as input_file: + for line in input_file: + line_str = line.strip() + for letter_index in range(len(line_str)): + letter = line_str[letter_index] + if letter == '<': + start_brackets += 1 + if letter == '>': + start_brackets -= 1 -def check_for_restriction_genids(nest_dict): - for nest_restriction in nest_dict.get(RESTRICTION_TAG, dict()): - potential_genid = nest_restriction.get(NODEID_TAG, str()) - if potential_genid.startswith(GENID_PREFIX): - return potential_genid - return None - -def extract_class_id(nest_dict): - nest_dict_classes = nest_dict.get(CLASS_TAG, list()) - # Can't have competing class_ids - assert len(nest_dict_classes) <= 1 - - for nest_class_index in range(len(nest_dict_classes)): - nest_class = nest_dict_classes[nest_class_index] - return nest_class.get(RDF_ABOUT_TAG, str()) + next_letter = "" + if letter_index + 1 < len(line_str): + next_letter = line_str[letter_index + 1] -def store_genid_nest_in_class_nest(genid, genid_nest, class_nest): - output_class_nest = class_nest - - nest_dict_classes = class_nest.get(CLASS_TAG, list()) - for nest_class_index in range(len(nest_dict_classes)): - nest_class = nest_dict_classes[nest_class_index] - nest_subclasses = nest_class.get(SUBCLASS_TAG, list()) - for nest_subclass_index in range(len(nest_subclasses)): - nest_subclass = nest_subclasses[nest_subclass_index] - potential_genid = nest_subclass.get(NODEID_TAG, str()) - if potential_genid == genid: - output_class_nest[CLASS_TAG][nest_class_index][SUBCLASS_TAG][nest_subclass_index][RESTRICTION_TAG] = genid_nest[RESTRICTION_TAG] + curr_str += letter - return output_class_nest + if letter == '>' and (next_letter == '<' or next_letter == "") and start_brackets == 0: + # Only return if nesting + line_parsed = self.convert_line(curr_str) + tag = line_parsed.get(self.KEY_TAG, None) + assert tag != self.KEY_TEXT # This could cause a massive conflict, but it is unlikely + line_type = line_parsed.get(self.KEY_TYPE, None) + attribute_keys = line_parsed.get(self.KEY_ATTRIBUTES, dict()).keys() -def triage_nest_dict(nest_dict): - genids = check_for_class_genids(nest_dict) - restriction_genid = check_for_restriction_genids(nest_dict) - class_id = extract_class_id(nest_dict) - - if len(genids) > 0: - for genid in genids: - GENID_TO_ID[genid] = class_id - ID_TO_GENIDS[class_id] = genids - GENID_REMAINING_NESTS[class_id] = nest_dict - elif restriction_genid is not None: - class_id = GENID_TO_ID.get(restriction_genid, str()) - if len(class_id) == 0: - print("WARNING WITH:", restriction_genid, "- NO CLASS_ID FOUND") - OUTPUT_NESTS.append(nest_dict) - return - class_nest = GENID_REMAINING_NESTS[class_id] - ID_TO_GENIDS[class_id].remove(restriction_genid) - updated_class_nest = store_genid_nest_in_class_nest(restriction_genid, nest_dict, class_nest) - - if len(ID_TO_GENIDS[class_id]) > 0: - GENID_REMAINING_NESTS[class_id] = updated_class_nest + if line_type != self.LINE_TYPE_IGNORE: + curr_nest.append(line_parsed) + + output_nest = (line_type in [self.LINE_TYPE_ENTRY, self.LINE_TYPE_ENTRY_WITH_ATTR, self.LINE_TYPE_ENTRY_ONLY_ATTR] and len(curr_nest_tags) == 0) + + if line_type in [self.LINE_TYPE_START_NEST, self.LINE_TYPE_START_NEST_WITH_ATTR]: + curr_nest_tags.append(tag) + elif line_type == self.LINE_TYPE_END_NEST: + popped_curr_nest_tag = curr_nest_tags.pop() + assert popped_curr_nest_tag == tag, curr_nest + if 
len(curr_nest_tags) == 0: + output_nest = True + if output_nest: + nest_dict, _ = self.convert_nest(curr_nest, 0) + + self.processing_func(nest_dict) + + curr_nest = list() + curr_nest_tag = str() + + curr_str = "" + + if curr_str != "": + # divide lines by a space + curr_str += ' ' + + +class OWLParser(): + def __init__(self, input_file_name): + self.XML_TAG = "?xml" + self.RDF_TAG = "rdf:RDF" + self.DOCTYPE_TAG = "!DOCTYPE" + self.CLASS_TAG = "owl:Class" + self.RESTRICTION_TAG = "owl:Restriction" + self.SUBCLASS_TAG = "rdfs:subClassOf" + self.NODEID_TAG = "rdf:nodeID" + self.RDF_ABOUT_TAG = "rdf:about" + self.GENID_PREFIX = "genid" + + self.skip_tags = [self.XML_TAG, self.RDF_TAG, self.DOCTYPE_TAG] + + self.ignored_attributes = ["xml:lang"] + + self.xml_parser = XMLParser(self.skip_tags, self.ignored_attributes, self.triage_nest_dict) + + self.OUTPUT_NESTS = [] + self.GENID_REMAINING_NESTS = dict() + self.GENID_TO_ID = dict() + self.ID_TO_GENIDS = dict() + + self.input_file = input_file_name + + def check_for_class_genids(self, nest_dict): + genids = list() + + nest_dict_classes = nest_dict.get(self.CLASS_TAG, list()) + for nest_class_index in range(len(nest_dict_classes)): + nest_class = nest_dict_classes[nest_class_index] + nest_subclasses = nest_class.get(self.SUBCLASS_TAG, list()) + for nest_subclass_index in range(len(nest_subclasses)): + nest_subclass = nest_subclasses[nest_subclass_index] + potential_genid = nest_subclass.get(self.NODEID_TAG, str()) + if potential_genid.startswith(self.GENID_PREFIX): + genids.append(potential_genid) + + return genids + + + def check_for_restriction_genids(self, nest_dict): + for nest_restriction in nest_dict.get(self.RESTRICTION_TAG, dict()): + potential_genid = nest_restriction.get(self.NODEID_TAG, str()) + if potential_genid.startswith(self.GENID_PREFIX): + return potential_genid + return None + + def extract_class_id(self, nest_dict): + nest_dict_classes = nest_dict.get(self.CLASS_TAG, list()) + # Can't have competing class_ids + assert len(nest_dict_classes) <= 1 + + for nest_class_index in range(len(nest_dict_classes)): + nest_class = nest_dict_classes[nest_class_index] + return nest_class.get(self.RDF_ABOUT_TAG, str()) + + def store_genid_nest_in_class_nest(self, genid, genid_nest, class_nest): + output_class_nest = class_nest + + nest_dict_classes = class_nest.get(self.CLASS_TAG, list()) + for nest_class_index in range(len(nest_dict_classes)): + nest_class = nest_dict_classes[nest_class_index] + nest_subclasses = nest_class.get(self.SUBCLASS_TAG, list()) + for nest_subclass_index in range(len(nest_subclasses)): + nest_subclass = nest_subclasses[nest_subclass_index] + potential_genid = nest_subclass.get(self.NODEID_TAG, str()) + if potential_genid == genid: + output_class_nest[self.CLASS_TAG][nest_class_index][self.SUBCLASS_TAG][nest_subclass_index][self.RESTRICTION_TAG] = genid_nest[self.RESTRICTION_TAG] + + return output_class_nest + + + def triage_nest_dict(self, nest_dict): + genids = self.check_for_class_genids(nest_dict) + restriction_genid = self.check_for_restriction_genids(nest_dict) + class_id = self.extract_class_id(nest_dict) + + if len(genids) > 0: + for genid in genids: + self.GENID_TO_ID[genid] = class_id + self.ID_TO_GENIDS[class_id] = genids + self.GENID_REMAINING_NESTS[class_id] = nest_dict + elif restriction_genid is not None: + class_id = self.GENID_TO_ID.get(restriction_genid, str()) + if len(class_id) == 0: + print("WARNING WITH:", restriction_genid, "- NO CLASS_ID FOUND") + self.OUTPUT_NESTS.append(nest_dict) + return 
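            # For orientation, a hedged example of what this matching handles:
            # RDF/XML serializes anonymous restrictions as blank nodes, e.g.
            #     <rdfs:subClassOf rdf:nodeID="genid123"/>
            # paired elsewhere in the file with
            #     <owl:Restriction rdf:nodeID="genid123">...</owl:Restriction>
            # so a class nest waits in GENID_REMAINING_NESTS until every genid
            # it references has been matched to its owl:Restriction block.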
+
+            class_nest = self.GENID_REMAINING_NESTS[class_id]
+            self.ID_TO_GENIDS[class_id].remove(restriction_genid)
+            updated_class_nest = self.store_genid_nest_in_class_nest(restriction_genid, nest_dict, class_nest)
+
+            if len(self.ID_TO_GENIDS[class_id]) > 0:
+                self.GENID_REMAINING_NESTS[class_id] = updated_class_nest
+            else:
+                self.OUTPUT_NESTS.append(updated_class_nest)
+                self.GENID_REMAINING_NESTS[class_id] = None
+        else:
+            self.OUTPUT_NESTS.append(nest_dict)
+
+
+    def parse_OWL_file(self):
+        self.xml_parser.divide_into_lines(self.input_file)
+        print(json.dumps(self.OUTPUT_NESTS, indent=4))
+
+        print("=========")
+
+        print("Remaining:")
+        for item in self.GENID_REMAINING_NESTS:
+            if self.GENID_REMAINING_NESTS[item] != None:
+                print(item)
+                print(json.dumps(self.GENID_REMAINING_NESTS[item], indent=4))
+

 if __name__ == '__main__':
     args = get_args()
@@ -457,5 +469,6 @@ def divide_into_lines(input_file_name):
     print("File:", input_file_name)
     print("Start Time:", date())
-    divide_into_lines(input_file_name)
+    owl_parser = OWLParser(input_file_name)
+    owl_parser.parse_OWL_file()
     print("End Time:", date())
\ No newline at end of file

From a55212c1f10a439a2e01401d094c25b8b1b5abc5 Mon Sep 17 00:00:00 2001
From: ecwood
Date: Mon, 12 Aug 2024 19:27:47 -0700
Subject: [PATCH 050/125] #387 added in output file writing

---
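(For context: the kg2_util JSON Lines helpers introduced below never appear in
this patch series. A minimal stand-in consistent with how they are called --
create_single_jsonlines() returns a tuple whose first element exposes
write(record), and close_single_jsonlines(info, filename) flushes the records
to disk -- might look like the sketch here; it is an illustrative assumption,
not the project's actual implementation.)

import json

def create_single_jsonlines(test_mode=False):
    # Collect records in memory; the real helper may stream to a temp file.
    class _JsonLinesWriter:
        def __init__(self):
            self.records = []

        def write(self, record):
            # One dict per call, which becomes one line of JSON Lines output.
            self.records.append(record)

    return (_JsonLinesWriter(), test_mode)

def close_single_jsonlines(info, output_file_name):
    writer, _test_mode = info
    with open(output_file_name, 'w') as out:
        for record in writer.records:
            out.write(json.dumps(record) + '\n')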
misc-tools/owlparser.py | 33 +++++++++++++++++++++------------ 1 file changed, 21 insertions(+), 12 deletions(-) diff --git a/misc-tools/owlparser.py b/misc-tools/owlparser.py index 83371543..629a8226 100644 --- a/misc-tools/owlparser.py +++ b/misc-tools/owlparser.py @@ -1,12 +1,14 @@ import json import argparse import datetime +import kg2_util def get_args(): arg_parser = argparse.ArgumentParser() arg_parser.add_argument('--test', dest='test', action="store_true", default=False) arg_parser.add_argument('inputFile', type=str) + arg_parser.add_argument('outputFile', type=str) return arg_parser.parse_args() def date(): @@ -349,7 +351,7 @@ def divide_into_lines(self, input_file_name): class OWLParser(): - def __init__(self, input_file_name): + def __init__(self, input_file_name, output_file_name): self.XML_TAG = "?xml" self.RDF_TAG = "rdf:RDF" self.DOCTYPE_TAG = "!DOCTYPE" @@ -366,12 +368,16 @@ def __init__(self, input_file_name): self.xml_parser = XMLParser(self.skip_tags, self.ignored_attributes, self.triage_nest_dict) - self.OUTPUT_NESTS = [] self.GENID_REMAINING_NESTS = dict() self.GENID_TO_ID = dict() self.ID_TO_GENIDS = dict() self.input_file = input_file_name + self.output_file_name = output_file_name + + self.output_info = create_single_jsonlines() + self.output = output_info[0] + def check_for_class_genids(self, nest_dict): genids = list() @@ -435,7 +441,9 @@ def triage_nest_dict(self, nest_dict): class_id = self.GENID_TO_ID.get(restriction_genid, str()) if len(class_id) == 0: print("WARNING WITH:", restriction_genid, "- NO CLASS_ID FOUND") - self.OUTPUT_NESTS.append(nest_dict) + + # Save to output despite not matching with an existing class + self.output.write(nest_dict) return class_nest = self.GENID_REMAINING_NESTS[class_id] self.ID_TO_GENIDS[class_id].remove(restriction_genid) @@ -444,31 +452,32 @@ def triage_nest_dict(self, nest_dict): if len(self.ID_TO_GENIDS[class_id]) > 0: self.GENID_REMAINING_NESTS[class_id] = updated_class_nest else: - self.OUTPUT_NESTS.append(updated_class_nest) + # Since all of the genids used in this class have been matched, output + self.output.write(nest_dict) self.GENID_REMAINING_NESTS[class_id] = None else: - self.OUTPUT_NESTS.append(nest_dict) + # There are no genids that need to be worked with, so just output + self.output.write(nest_dict) def parse_OWL_file(self): self.xml_parser.divide_into_lines(self.input_file) - print(json.dumps(self.OUTPUT_NESTS, indent=4)) - print("=========") - - print("Remaining:") + # Genid wasn't filled, still want to include them though for item in self.GENID_REMAINING_NESTS: if self.GENID_REMAINING_NESTS[item] != None: - print(item) - print(json.dumps(self.GENID_REMAINING_NESTS[item], indent=4)) + self.output.write(self.GENID_REMAINING_NESTS[item]) + + close_single_jsonlines(self.output_info, self.output_file_name) if __name__ == '__main__': args = get_args() input_file_name = args.inputFile + output_file_name = args.outputFile print("File:", input_file_name) print("Start Time:", date()) - owl_parser = OWLParser(input_file_name) + owl_parser = OWLParser(input_file_name, output_file_name) owl_parser.parse_OWL_file() print("End Time:", date()) \ No newline at end of file From e8d9e8803bacffa0a6344c7cb778af74240b3eb5 Mon Sep 17 00:00:00 2001 From: ecwood Date: Mon, 12 Aug 2024 19:28:25 -0700 Subject: [PATCH 051/125] #387 moving bc of kg2_util --- misc-tools/owlparser.py => owlparser.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename misc-tools/owlparser.py => owlparser.py (100%) diff --git 
a/misc-tools/owlparser.py b/owlparser.py similarity index 100% rename from misc-tools/owlparser.py rename to owlparser.py From 8d6668fffcdf3556bf6b18e3553176e2d5138b9e Mon Sep 17 00:00:00 2001 From: ecwood Date: Mon, 12 Aug 2024 20:50:31 -0700 Subject: [PATCH 052/125] #387 slightly more efficient --- owlparser.py | 237 ++++++++++++++++++++++++++------------------------- 1 file changed, 119 insertions(+), 118 deletions(-) diff --git a/owlparser.py b/owlparser.py index 629a8226..2bc87905 100644 --- a/owlparser.py +++ b/owlparser.py @@ -1,7 +1,7 @@ import json import argparse import datetime -import kg2_util +import kg2_util_thin as kg2_util def get_args(): arg_parser = argparse.ArgumentParser() @@ -41,204 +41,205 @@ def __init__(self, skip_tags, ignored_attributes, processing_func): self.KEY_TEXT = "ENTRY_TEXT" self.KEY_TYPE = "type" - - def categorize_line(self, tag, attributes, main_text, end_tag, only_tag): + # Variables for line reading + self.tag = "" + self.attributes = dict() + self.attribute_tag = "" + self.attribute_text = "" + self.main_text = "" + self.end_tag = "" + self.only_tag = False + self.start_brackets = 0 + self.line = "" + self.letter = "" + self.next_letter = "" + self.prev_letter = "" + self.type_to_read = 0 + + def categorize_line(self): # Categorize the type of line line_type = str() out = dict() # Putting "only_tag" here isn't necessarily the best idea, but I don't know what else to do with it - if tag == self.COMMENT or tag in self.OUTMOST_TAGS_SKIP or end_tag in self.OUTMOST_TAGS_SKIP or only_tag: + if self.tag == self.COMMENT or self.tag in self.OUTMOST_TAGS_SKIP or self.end_tag in self.OUTMOST_TAGS_SKIP or self.only_tag: line_type = self.LINE_TYPE_IGNORE else: - start_tag_exists = (tag != str()) - attributes_exist = (attributes != dict()) - text_exists = (main_text != str()) - end_tag_exists = (end_tag != str()) + start_tag_exists = (self.tag != str()) + attributes_exist = (self.attributes != dict()) + text_exists = (self.main_text != str()) + end_tag_exists = (self.end_tag != str()) if start_tag_exists: if attributes_exist: if text_exists: line_type = self.LINE_TYPE_ENTRY_WITH_ATTR - out[self.KEY_TAG] = tag - out[self.KEY_ATTRIBUTES] = attributes - out[self.KEY_TEXT] = main_text + out[self.KEY_TAG] = self.tag + out[self.KEY_ATTRIBUTES] = self.attributes + out[self.KEY_TEXT] = self.main_text elif end_tag_exists: line_type = self.LINE_TYPE_ENTRY_ONLY_ATTR - out[self.KEY_TAG] = tag - out[self.KEY_ATTRIBUTES] = attributes + out[self.KEY_TAG] = self.tag + out[self.KEY_ATTRIBUTES] = self.attributes else: line_type = self.LINE_TYPE_START_NEST_WITH_ATTR - out[self.KEY_TAG] = tag - out[self.KEY_ATTRIBUTES] = attributes + out[self.KEY_TAG] = self.tag + out[self.KEY_ATTRIBUTES] = self.attributes elif text_exists: line_type = self.LINE_TYPE_ENTRY - out[self.KEY_TAG] = tag - out[self.KEY_TEXT] = main_text + out[self.KEY_TAG] = self.tag + out[self.KEY_TEXT] = self.main_text else: line_type = self.LINE_TYPE_START_NEST - out[self.KEY_TAG] = tag + out[self.KEY_TAG] = self.tag elif end_tag_exists: line_type = self.LINE_TYPE_END_NEST - out[self.KEY_TAG] = end_tag + out[self.KEY_TAG] = self.end_tag out[self.KEY_TYPE] = line_type return out - def get_letters(self, line, letter_index, start_brackets): - letter = line[letter_index] - next_letter = "" - prev_letter = "" - if letter_index + 1 < len(line): - next_letter = line[letter_index + 1] + def get_letters(self, letter_index): + self.letter = self.line[letter_index] + self.next_letter = "" + self.prev_letter = "" + if letter_index 
+ 1 < len(self.line): + self.next_letter = self.line[letter_index + 1] if letter_index - 1 >= 0: - prev_letter = line[letter_index - 1] - - if letter == '<': - start_brackets += 1 - if letter == '>': - start_brackets -= 1 + self.prev_letter = self.line[letter_index - 1] - return letter, next_letter, prev_letter, start_brackets + if self.letter == '<': + self.start_brackets += 1 + if self.letter == '>': + self.start_brackets -= 1 - def identify_tag_type(self, letter_index, letter, next_letter, prev_letter, type_to_read): + def identify_tag_type(self, letter_index): changed = True - if letter == '<' and letter_index == 0: - if next_letter != '/': - type_to_read = LineElementRead.TAG - elif letter == '/' and prev_letter == '<': - type_to_read = LineElementRead.END_TAG + if self.letter == '<' and letter_index == 0: + if self.next_letter != '/': + self.type_to_read = LineElementRead.TAG + elif self.letter == '/' and self.prev_letter == '<': + self.type_to_read = LineElementRead.END_TAG else: changed = False - return changed, type_to_read + return changed - def read_tag(self, letter, prev_letter, type_to_read, start_brackets, tag, line): - only_tag = False + def read_tag(self): changed = True - if letter == ' ' and type_to_read == LineElementRead.TAG: - type_to_read = LineElementRead.ATTRIBUTE_TAG - elif letter == '>' and type_to_read == LineElementRead.TAG and start_brackets == 0: - type_to_read = LineElementRead.MAIN + if self.letter == ' ' and self.type_to_read == LineElementRead.TAG: + self.type_to_read = LineElementRead.ATTRIBUTE_TAG + elif self.letter == '>' and self.type_to_read == LineElementRead.TAG and self.start_brackets == 0: + self.type_to_read = LineElementRead.MAIN - if prev_letter == '/': - print("Warning - strange tag, ignoring", line) - only_tag = True - elif type_to_read == LineElementRead.TAG: - tag += letter + if self.prev_letter == '/': + print("Warning - strange tag, ignoring", self.line) + self.only_tag = True + elif self.type_to_read == LineElementRead.TAG: + self.tag += self.letter else: changed = False - return changed, type_to_read, (only_tag, tag) - + return changed - def store_attribute(self, attributes, attribute_tag, attribute_text): - if attribute_tag not in self.IGNORED_ATTRIBUTES: - attributes[attribute_tag] = attribute_text.strip('/').strip('"') - attribute_tag = "" - attribute_text = "" - return attributes, attribute_tag, attribute_text + def store_attribute(self): + if self.attribute_tag not in self.IGNORED_ATTRIBUTES: + self.attributes[self.attribute_tag] = self.attribute_text.strip('/').strip('"') + self.attribute_tag = "" + self.attribute_text = "" - def read_attributes(self, letter, prev_letter, type_to_read, start_brackets, attributes, attribute_tag, attribute_text, tag, end_tag): + def read_attributes(self): changed = True - start_reading_attributes = (type_to_read == LineElementRead.ATTRIBUTE_TAG or type_to_read == LineElementRead.ATTRIBUTE_TEXT) + start_reading_attributes = (self.type_to_read == LineElementRead.ATTRIBUTE_TAG or self.type_to_read == LineElementRead.ATTRIBUTE_TEXT) - if letter == '>' and start_reading_attributes and start_brackets == 0: - type_to_read = LineElementRead.MAIN - attributes, attribute_tag, attribute_text = self.store_attribute(attributes, attribute_tag, attribute_text) + if self.letter == '>' and start_reading_attributes and self.start_brackets == 0: + self.type_to_read = LineElementRead.MAIN + + self.store_attribute() - if prev_letter == '/': - end_tag = tag + if self.prev_letter == '/': + self.end_tag = self.tag elif 
start_reading_attributes: - if letter == '=' and type_to_read == LineElementRead.ATTRIBUTE_TAG: - type_to_read = LineElementRead.ATTRIBUTE_TEXT - elif type_to_read == LineElementRead.ATTRIBUTE_TAG: - attribute_tag += letter - elif letter == ' ' and type_to_read == LineElementRead.ATTRIBUTE_TEXT: - type_to_read = LineElementRead.ATTRIBUTE_TAG - attributes, attribute_tag, attribute_text = self.store_attribute(attributes, attribute_tag, attribute_text) - elif type_to_read == LineElementRead.ATTRIBUTE_TEXT: - attribute_text += letter + if self.letter == '=' and self.type_to_read == LineElementRead.ATTRIBUTE_TAG: + self.type_to_read = LineElementRead.ATTRIBUTE_TEXT + elif self.type_to_read == LineElementRead.ATTRIBUTE_TAG: + self.attribute_tag += self.letter + elif self.letter == ' ' and self.type_to_read == LineElementRead.ATTRIBUTE_TEXT: + self.type_to_read = LineElementRead.ATTRIBUTE_TAG + self.store_attribute() + elif self.type_to_read == LineElementRead.ATTRIBUTE_TEXT: + self.attribute_text += self.letter else: changed = False - return changed, type_to_read, (attributes, attribute_tag, attribute_text, end_tag) + return changed - def read_main(self, letter, type_to_read, main_text): + def read_main(self): changed = True - if letter == '<' and type_to_read == LineElementRead.MAIN: - type_to_read = LineElementRead.END_TAG - elif type_to_read == LineElementRead.MAIN: - main_text += letter + if self.letter == '<' and self.type_to_read == LineElementRead.MAIN: + self.type_to_read = LineElementRead.END_TAG + elif self.type_to_read == LineElementRead.MAIN: + self.main_text += self.letter else: changed = False - return changed, type_to_read, (main_text) + return changed - def read_end_tag(self, letter, type_to_read, start_brackets, end_tag): + def read_end_tag(self): changed = True - if letter == '>' and type_to_read == LineElementRead.END_TAG and start_brackets == 0: + if self.letter == '>' and self.type_to_read == LineElementRead.END_TAG and self.start_brackets == 0: pass - elif type_to_read == LineElementRead.END_TAG: - end_tag += letter + elif self.type_to_read == LineElementRead.END_TAG: + self.end_tag += self.letter else: changed = False - return changed, type_to_read, (end_tag) + return changed - def convert_line(self, line): - tag = "" - attributes = dict() - attribute_tag = "" - attribute_text = "" - main_text = "" - end_tag = "" + def convert_line(self): + self.tag = "" + self.attributes = dict() + self.attribute_tag = "" + self.attribute_text = "" + self.main_text = "" + self.end_tag = "" - type_to_read = 0 + self.type_to_read = 0 - only_tag = False + self.only_tag = False - start_brackets = 0 + self.start_brackets = 0 - for letter_index in range(len(line)): - letter, next_letter, prev_letter, start_brackets = self.get_letters(line, letter_index, start_brackets) + for letter_index in range(len(self.line)): + self.get_letters(letter_index) # First < - tag_identified, type_to_read = self.identify_tag_type(letter_index, letter, next_letter, prev_letter, type_to_read) - if tag_identified: + if self.identify_tag_type(letter_index): continue - tag_read, type_to_read, tag_read_data = self.read_tag(letter, prev_letter, type_to_read, start_brackets, tag, line) - if tag_read: - (only_tag, tag) = tag_read_data + if self.read_tag(): continue - attributes_read, type_to_read, attributes_read_data = self.read_attributes(letter, prev_letter, type_to_read, start_brackets, attributes, attribute_tag, attribute_text, tag, end_tag) - if attributes_read: - (attributes, attribute_tag, attribute_text, end_tag) 
= attributes_read_data + if self.read_attributes(): continue - main_read, type_to_read, main_read_data = self.read_main(letter, type_to_read, main_text) - if main_read: - (main_text) = main_read_data + if self.read_main(): continue - end_tag_read, type_to_read, end_tag_read_data = self.read_end_tag(letter, type_to_read, start_brackets, end_tag) - if end_tag_read: - (end_tag) = end_tag_read_data + if self.read_end_tag(): continue - return self.categorize_line(tag, attributes, main_text, end_tag, only_tag) + return self.categorize_line() def convert_nest(self, nest, start_index): @@ -316,7 +317,8 @@ def divide_into_lines(self, input_file_name): if letter == '>' and (next_letter == '<' or next_letter == "") and start_brackets == 0: # Only return if nesting - line_parsed = self.convert_line(curr_str) + self.line = curr_str + line_parsed = self.convert_line() tag = line_parsed.get(self.KEY_TAG, None) assert tag != self.KEY_TEXT # This could cause a massive conflict, but it is unlikely @@ -375,9 +377,8 @@ def __init__(self, input_file_name, output_file_name): self.input_file = input_file_name self.output_file_name = output_file_name - self.output_info = create_single_jsonlines() - self.output = output_info[0] - + self.output_info = kg2_util.create_single_jsonlines() + self.output = self.output_info[0] def check_for_class_genids(self, nest_dict): genids = list() @@ -468,7 +469,7 @@ def parse_OWL_file(self): if self.GENID_REMAINING_NESTS[item] != None: self.output.write(self.GENID_REMAINING_NESTS[item]) - close_single_jsonlines(self.output_info, self.output_file_name) + kg2_util.close_single_jsonlines(self.output_info, self.output_file_name) if __name__ == '__main__': From b377ae97127ed8b023c00e4ad04c959741a84f9a Mon Sep 17 00:00:00 2001 From: ecwood Date: Thu, 15 Aug 2024 16:33:11 -0700 Subject: [PATCH 053/125] #387 loads multiple files now --- owlparser.py | 40 +++++++++++++++++++++++++++++----------- 1 file changed, 29 insertions(+), 11 deletions(-) diff --git a/owlparser.py b/owlparser.py index 2bc87905..1973009a 100644 --- a/owlparser.py +++ b/owlparser.py @@ -15,6 +15,7 @@ def date(): return datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") class LineElementRead(): + NONE = 0 TAG = 1 ATTRIBUTE_TAG = 2 ATTRIBUTE_TEXT = 3 @@ -54,7 +55,7 @@ def __init__(self, skip_tags, ignored_attributes, processing_func): self.letter = "" self.next_letter = "" self.prev_letter = "" - self.type_to_read = 0 + self.type_to_read = LineElementRead.NONE def categorize_line(self): # Categorize the type of line @@ -214,7 +215,7 @@ def convert_line(self): self.main_text = "" self.end_tag = "" - self.type_to_read = 0 + self.type_to_read = LineElementRead.NONE self.only_tag = False @@ -353,7 +354,7 @@ def divide_into_lines(self, input_file_name): class OWLParser(): - def __init__(self, input_file_name, output_file_name): + def __init__(self, input_files, output_file_name): self.XML_TAG = "?xml" self.RDF_TAG = "rdf:RDF" self.DOCTYPE_TAG = "!DOCTYPE" @@ -374,7 +375,7 @@ def __init__(self, input_file_name, output_file_name): self.GENID_TO_ID = dict() self.ID_TO_GENIDS = dict() - self.input_file = input_file_name + self.input_files = input_files self.output_file_name = output_file_name self.output_info = kg2_util.create_single_jsonlines() @@ -462,23 +463,40 @@ def triage_nest_dict(self, nest_dict): def parse_OWL_file(self): - self.xml_parser.divide_into_lines(self.input_file) + for input_file in self.input_files: + print("Reading:", input_file, "starting at", date()) + self.xml_parser.divide_into_lines(input_file) - # 
Genid wasn't filled, still want to include them though - for item in self.GENID_REMAINING_NESTS: - if self.GENID_REMAINING_NESTS[item] != None: - self.output.write(self.GENID_REMAINING_NESTS[item]) + # Genid wasn't filled, still want to include them though + for item in self.GENID_REMAINING_NESTS: + if self.GENID_REMAINING_NESTS[item] != None: + self.output.write(self.GENID_REMAINING_NESTS[item]) + + # Refresh everything for the next file + self.GENID_REMAINING_NESTS = dict() + self.GENID_TO_ID = dict() + self.ID_TO_GENIDS = dict() kg2_util.close_single_jsonlines(self.output_info, self.output_file_name) +def identify_input_files(ont_load_inventory): + input_files = list() + for item in ont_load_inventory: + input_files.append(item['file']) + + return input_files + if __name__ == '__main__': args = get_args() input_file_name = args.inputFile output_file_name = args.outputFile - print("File:", input_file_name) + ont_load_inventory = kg2_util.safe_load_yaml_from_string(kg2_util.read_file_to_string(input_file_name)) + input_files = identify_input_files(ont_load_inventory) + + print("Files:", input_files) print("Start Time:", date()) - owl_parser = OWLParser(input_file_name, output_file_name) + owl_parser = OWLParser(input_files, output_file_name) owl_parser.parse_OWL_file() print("End Time:", date()) \ No newline at end of file From e73585507859016e5f55067f0bc6d2b42d528cdb Mon Sep 17 00:00:00 2001 From: ecwood Date: Sat, 17 Aug 2024 01:29:30 -0700 Subject: [PATCH 054/125] #387 save the name of the output file as well --- owlparser.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/owlparser.py b/owlparser.py index 1973009a..3f271dd4 100644 --- a/owlparser.py +++ b/owlparser.py @@ -1,7 +1,7 @@ import json import argparse import datetime -import kg2_util_thin as kg2_util +import kg2_util def get_args(): arg_parser = argparse.ArgumentParser() @@ -365,6 +365,8 @@ def __init__(self, input_files, output_file_name): self.RDF_ABOUT_TAG = "rdf:about" self.GENID_PREFIX = "genid" + self.OWL_SOURCE_KEY = "owl_source" + self.skip_tags = [self.XML_TAG, self.RDF_TAG, self.DOCTYPE_TAG] self.ignored_attributes = ["xml:lang"] @@ -429,6 +431,13 @@ def store_genid_nest_in_class_nest(self, genid, genid_nest, class_nest): return output_class_nest + def write_to_output(self, output_dict, source_file): + output_dict[self.OWL_SOURCE_KEY] = source_file + self.output.write(output_dict) + + return + + def triage_nest_dict(self, nest_dict): genids = self.check_for_class_genids(nest_dict) restriction_genid = self.check_for_restriction_genids(nest_dict) @@ -445,7 +454,7 @@ def triage_nest_dict(self, nest_dict): print("WARNING WITH:", restriction_genid, "- NO CLASS_ID FOUND") # Save to output despite not matching with an existing class - self.output.write(nest_dict) + self.write_to_output(nest_dict, self.input_file) return class_nest = self.GENID_REMAINING_NESTS[class_id] self.ID_TO_GENIDS[class_id].remove(restriction_genid) @@ -464,13 +473,14 @@ def triage_nest_dict(self, nest_dict): def parse_OWL_file(self): for input_file in self.input_files: + self.input_file = input_file print("Reading:", input_file, "starting at", date()) self.xml_parser.divide_into_lines(input_file) # Genid wasn't filled, still want to include them though for item in self.GENID_REMAINING_NESTS: if self.GENID_REMAINING_NESTS[item] != None: - self.output.write(self.GENID_REMAINING_NESTS[item]) + self.write_to_output(self.GENID_REMAINING_NESTS[item], self.input_file) # Refresh everything for the next file 
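As an aside on the refactor above: patches 052-053 move the parser's per-line state onto `self`, so each `read_*` step returns only a bool and `convert_line` becomes a chain of `if ...: continue` dispatches. A minimal sketch of that pattern, with invented names that only mirror the shape of the repo code:

class TinyReader:
    # State lives on the instance; each step reports only whether it consumed input.
    def __init__(self, line):
        self.line = line
        self.pos = 0
        self.tag = ""

    def read_tag(self):
        if self.pos < len(self.line) and self.line[self.pos] != '>':
            self.tag += self.line[self.pos]
            self.pos += 1
            return True
        return False

    def parse(self):
        while self.pos < len(self.line):
            if self.read_tag():
                continue
            self.pos += 1  # no step claimed this character; skip it
        return self.tag

print(TinyReader("rdf:RDF>").parse())  # prints: rdf:RDF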
self.GENID_REMAINING_NESTS = dict() From 462c0bfec45f99ff1cd0652a020d23386790d956 Mon Sep 17 00:00:00 2001 From: ecwood Date: Fri, 23 Aug 2024 12:04:23 -0700 Subject: [PATCH 055/125] #387 start of processing the ontologies JSON Lines file --- ontologies_jsonl_to_kg_jsonl.py | 147 ++++++++++++++++++++++++++++++++ 1 file changed, 147 insertions(+) create mode 100644 ontologies_jsonl_to_kg_jsonl.py diff --git a/ontologies_jsonl_to_kg_jsonl.py b/ontologies_jsonl_to_kg_jsonl.py new file mode 100644 index 00000000..12784bc8 --- /dev/null +++ b/ontologies_jsonl_to_kg_jsonl.py @@ -0,0 +1,147 @@ +import argparse +import kg2_util +import json + +OWL_CLASS_TAG = "owl:Class" +SUBCLASS_TAG = "rdfs:subClassOf" +DESCRIPTION_TAG = "obo:IAO_0000115" +XREF_TAG = "oboInOwl:hasDbXref" +ID_TAG = "rdf:about" +NAME_TAG = "rdfs:label" +EXACT_MATCH_TAG = "skos:exactMatch" +COMMENT_TAG = "rdfs:comment" + +TEXT_KEY = "ENTRY_TEXT" +RESOURCE_KEY = "rdf:resource" + +OWL_SOURCE_KEY = "owl_source" + +KEYS_DICT = dict() + +COMMENT_PREFIX = "COMMENTS: " + +CLASSES_DICT = dict() + +def get_args(): + arg_parser = argparse.ArgumentParser() + arg_parser.add_argument('--test', dest='test', + action="store_true", default=False) + arg_parser.add_argument('inputFile', type=str) + arg_parser.add_argument('outputFile', type=str) + return arg_parser.parse_args() + +def process_ontology_item(ontology_item): + source = ontology_item.get(OWL_SOURCE_KEY, str()) + for owl_class in ontology_item.get(OWL_CLASS_TAG, list()): + # Typically genid classes which don't neatly map onto the KG2 schema + if ID_TAG not in owl_class: + continue + # TODO: MAP THIS HERE, since not all sources use same IRIs for the same nodes + node_id = owl_class.get(ID_TAG, str()) + + # Configure the name + name_list = [name.get(TEXT_KEY, None) for name in owl_class.get("rdfs:label", dict()) if TEXT_KEY in name] + if len(name_list) == 0: + continue + + # Configure the description + description_list = list() + description_list += [description.get(TEXT_KEY, None) for description in owl_class.get("obo:IAO_0000115", list()) if (TEXT_KEY in description)] + description_list += [COMMENT_PREFIX + description.get(TEXT_KEY, str()) for description in owl_class.get("rdfs:comment", list()) if (TEXT_KEY in description)] + description_list += [description.get(TEXT_KEY, None) for description in owl_class.get("obo:UBPROP_0000001", list()) if (TEXT_KEY in description)] + description_list += [description.get(TEXT_KEY, None) for description in owl_class.get("obo:UBPROP_0000005", list()) if (TEXT_KEY in description)] + description_list += [description.get(TEXT_KEY, None) for description in owl_class.get("efo1:source_description", list()) if (TEXT_KEY in description)] + + # Configure the biological sequence + has_biological_sequence = dict() + has_biological_sequence['formula'] = [biological_sequence.get(TEXT_KEY, None) for biological_sequence in owl_class.get("chebi:formula", list()) if TEXT_KEY in biological_sequence] + has_biological_sequence['smiles'] = [biological_sequence.get(TEXT_KEY, None) for biological_sequence in owl_class.get("chebi:smiles", list()) if TEXT_KEY in biological_sequence] + has_biological_sequence['inchi'] = [biological_sequence.get(TEXT_KEY, None) for biological_sequence in owl_class.get("chebi:inchi", list()) if TEXT_KEY in biological_sequence] + has_biological_sequence['inchikey'] = [biological_sequence.get(TEXT_KEY, None) for biological_sequence in owl_class.get("chebi:inchikey", list()) if TEXT_KEY in biological_sequence] + + # Extract edge triples + edges_list 
= list() + + for edge_type in ["obo:RO_0002175", "obo:RO_0002161", "obo:RO_0002604", "obo:RO_0002171", "obo:RO_0002174", "obo:RO_0002475", "obo:RO_0001900", "obo:RO_0004050"]: + for edge in owl_class.get(edge_type, list()): + if RESOURCE_KEY in edge: + edges_list.append((edge_type, edge.get(RESOURCE_KEY, None))) + + for edge_type in ["oboInOwl:hasDbXref"]: + for edge in owl_class.get(edge_type, list()): + if TEXT_KEY in edge: + edges_list.append((edge_type, edge.get(TEXT_KEY, None))) + + restriction_edges = list() + restriction_edges += [(edge, "rdfs:subClassOf") for edge in owl_class.get("rdfs:subClassOf", list())] + for equiv in owl_class.get("owl:equivalentClass", list()): + for mini_class in equiv.get("owl:Class", list()): + for edge in mini_class.get("owl:intersectionOf", list()): + restriction_edges.append((edge, "owl:equivalentClass")) + + for (edge, general_edge_type) in restriction_edges: + for restriction in edge.get("owl:Restriction", list()): + edge_type = restriction.get("owl:onProperty", list()) + edge_object = restriction.get("owl:someValuesFrom", list()) + if len(edge_type) != 1: + assert len(edge_type) <= 1, edge + continue + if len(edge_object) != 1: + assert len(edge_object) <= 1, edge + continue + edge_type = edge_type[0].get(RESOURCE_KEY, None) + edge_object = edge_object[0].get(RESOURCE_KEY, None) + + if edge_type != None and edge_object != None: + edges_list.append((edge_type, edge_object)) + + if RESOURCE_KEY in edge: + edges_list.append((general_edge_type, edge.get(RESOURCE_KEY, None))) + + + # node_id = owl_class.get(ID_TAG, list()) + + # superclasses = [superclass.get(RESOURCE_KEY, str()) for superclass in owl_class.get(SUBCLASS_TAG, list())] + + # # Also query for comments? + # # Descriptions appear to be additive in current KG2 + # descriptions = owl_class.get(DESCRIPTION_TAG, list()) + # assert len(descriptions) <= 1 + # description = str() + # for element in descriptions: + # description += element[TEXT_KEY] + + # xrefs = [xref[TEXT_KEY] for xref in owl_class.get(XREF_TAG, list())] + # for element in owl_class.get(XREF_TAG, list()): + # xrefs.append(element[TEXT_KEY]) + + # exact_matches = [exact_match[RESOURCE_KEY] for exact_match in owl_class.get(EXACT_MATCH_TAG, list())] + + # names = owl_class.get(NAME_TAG, list()) + # assert len(names) <= 1, ontology_item + # name = str() + # for element in names: + # name += element[TEXT_KEY] + + # node = {"id": node_id, "superclasses": superclasses, "description": description, "xrefs": xrefs, "name": name, "exact_matches": exact_matches} + + node = {"id": node_id, "description_list": description_list, "name": name_list, "source": source, "has_biological_sequence": has_biological_sequence, "edges": edges_list} + print(json.dumps(node, indent=4)) + + +if __name__ == '__main__': + args = get_args() + input_file_name = args.inputFile + output_file_name = args.outputFile + + input_read_jsonlines_info = kg2_util.start_read_jsonlines(input_file_name) + input_data = input_read_jsonlines_info[0] + + owl_class_count = 0 + for ontology_item in input_data: + process_ontology_item(ontology_item) + + # print("OWL Classes:", owl_class_count) + # for key in KEYS_DICT: + # KEYS_DICT[key] = KEYS_DICT[key] / owl_class_count + # print(json.dumps(KEYS_DICT, indent=4, sort_keys=True)) \ No newline at end of file From b12e98409d1d7ef739315c747e258c91bf38a145 Mon Sep 17 00:00:00 2001 From: ecwood Date: Mon, 26 Aug 2024 23:04:24 -0700 Subject: [PATCH 056/125] #387 additional weird sources due to FOODON --- maps/curies-to-urls-map.yaml | 4 ++++ 
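The converter introduced in patch 055 walks dictionaries in the shape the OWL parser emits: each tag maps to a list of small dicts, and scalar values sit under the ENTRY_TEXT key. A short illustration of that access pattern, using a made-up CHEBI entry rather than real build output:

TEXT_KEY = "ENTRY_TEXT"

# Invented example of one parsed owl:Class entry (list-of-dicts shape).
owl_class = {
    "rdf:about": "http://purl.obolibrary.org/obo/CHEBI_15377",
    "rdfs:label": [{TEXT_KEY: "water"}],
    "obo:IAO_0000115": [{TEXT_KEY: "An oxygen hydride consisting of an oxygen attached to two hydrogens."}],
}

name_list = [name[TEXT_KEY] for name in owl_class.get("rdfs:label", list()) if TEXT_KEY in name]
description_list = [d[TEXT_KEY] for d in owl_class.get("obo:IAO_0000115", list()) if TEXT_KEY in d]
print(name_list, description_list)  # ['water'] ['An oxygen hydride ...']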
1 file changed, 4 insertions(+) diff --git a/maps/curies-to-urls-map.yaml b/maps/curies-to-urls-map.yaml index 0e4ba4cb..7a57b610 100644 --- a/maps/curies-to-urls-map.yaml +++ b/maps/curies-to-urls-map.yaml @@ -5,6 +5,8 @@ use_for_bidirectional_mapping: - AEO: http://purl.obolibrary.org/obo/AEO_ + - + AGRO: http://purl.obolibrary.org/obo/AGRO_ - AIR: https://identifiers.org/umls/AIR/ - @@ -521,6 +523,8 @@ use_for_bidirectional_mapping: ZFIN: "https://identifiers.org/zfin:" ##########################################3 use_for_contraction_only: + - + AGRO: "&obo;AGRO_" - AraPort: https://www.araport.org/locus/ - From e9d6d68b80d4a14cea7658930771b0165d288d68 Mon Sep 17 00:00:00 2001 From: ecwood Date: Mon, 26 Aug 2024 23:27:22 -0700 Subject: [PATCH 057/125] #387 more additional weird source links due to FOODON --- maps/curies-to-urls-map.yaml | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/maps/curies-to-urls-map.yaml b/maps/curies-to-urls-map.yaml index 7a57b610..647ce2c8 100644 --- a/maps/curies-to-urls-map.yaml +++ b/maps/curies-to-urls-map.yaml @@ -47,6 +47,8 @@ use_for_bidirectional_mapping: CAID: 'http://reg.clinicalgenome.org/redmine/projects/registry/genboree_registry/by_caid?caid=' - CARO: http://purl.obolibrary.org/obo/CARO_ + - + CDNO: http://purl.obolibrary.org/obo/CDNO_ - CEPH: http://purl.obolibrary.org/obo/CEPH_ - @@ -77,6 +79,8 @@ use_for_bidirectional_mapping: CLO: http://purl.obolibrary.org/obo/CLO_ - COAR_RESOURCE: 'http://purl.org/coar/resource_type/' + - + COB: http://purl.obolibrary.org/obo/COB_ - COG: 'https://www.ncbi.nlm.nih.gov/research/cog-project/' - @@ -525,38 +529,56 @@ use_for_bidirectional_mapping: use_for_contraction_only: - AGRO: "&obo;AGRO_" + - + apollo: "&obo;APOLLO_" - AraPort: https://www.araport.org/locus/ - AraPort: https://bar.utoronto.ca/thalemine/portal.do?externalids= + - + ARO: "&obo;ARO_" - BAO: http://www.bioassayontology.org/bao# - BFO: http://www.ifomis.org/bfo/1.1/snap# - BFO: http://www.ifomis.org/bfo/1.1/span# + - + BFO: "&obo;BFO_" - biolink: https://w3id.org/biolink/biolinkml/meta/ - biolink: https://w3id.org/biolink/biolink-model - BTO: http://purl.obolibrary.org/obo/bto# + - + CDNO: "&obo;CDNO_" - CHEBI: http://purl.obolibrary.org/obo/chebi/ - CHEBI: http://purl.obolibrary.org/obo/chebi# + - + CHEBI: "&obo;CHEBI_" - CHEMBL.COMPOUND: https://www.ebi.ac.uk/chembl/compound/inspect/ + - + CHMO: "&obo;CHMO_" - CL: http://purl.obolibrary.org/obo/cl# + - + COB: "&obo;COB_" - CPT: http://purl.bioontology.org/ontology/HCPT/ - DDANAT: http://purl.obolibrary.org/obo/ddanat# - DGIdb: https://www.dgidb.org/interaction_types/ + - + DOID: "&obo;DOID_" - DRUGBANK: http://purl.bioontology.org/ontology/DRUGBANK/ + - + ECOCORE: "&obo;ECOCORE_" - ecogene: http://www.ecogene.org/gene/ - @@ -567,6 +589,10 @@ use_for_contraction_only: EFO: http://www.ebi.ac.uk/efoIri - ENSEMBL: http://www.ensembl.org/id/ + - + ENVO: "&obo;ENVO_" + - + EPO: "&obo;EPO_" - FBgn: https://flybase.org/reports/FBgn - From 61c3f069cdce612650faaa009821a40a78674c5d Mon Sep 17 00:00:00 2001 From: ecwood Date: Mon, 26 Aug 2024 23:39:46 -0700 Subject: [PATCH 058/125] #387 even more additional weird source links due to FOODON --- maps/curies-to-urls-map.yaml | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/maps/curies-to-urls-map.yaml b/maps/curies-to-urls-map.yaml index 647ce2c8..adf646a7 100644 --- a/maps/curies-to-urls-map.yaml +++ b/maps/curies-to-urls-map.yaml @@ -593,16 +593,30 @@ use_for_contraction_only: ENVO: "&obo;ENVO_" - 
EPO: "&obo;EPO_" + - + ExO: "&obo;ExO_" + - + FAO: "&obo;FAO_" - FBgn: https://flybase.org/reports/FBgn - FMA: http://purl.obolibrary.org/obo/fma# - FMA: http://purl.bioontology.org/ontology/FMA/ + - + FOODON: "&obo;FOODON_" + - + GAZ: "&obo;GAZ_" + - + GENEPIO: "&obo;GENEPIO_" + - + GO: "&obo;GO_" - GO: "http://purl.bioontology.org/ontology/GO/GO%3A" - GO: http://purl.bioontology.org/ontology/GO/ + - + HANCESTRO: "&obo;HANCESTRO_" - HCPCS: https://hcpcs.codes/a-codes/ - @@ -617,6 +631,8 @@ use_for_contraction_only: HP: "http://purl.bioontology.org/ontology/HPO/HP%3A" - HP: http://purl.bioontology.org/ontology/HPO/ + - + IAO: "&obo;IAO_" - ICD10: http://purl.bioontology.org/ontology/ICD10/ - @@ -633,6 +649,8 @@ use_for_contraction_only: ICD9: http://purl.bioontology.org/ontology/ICD9CM/ - ICD9: http://purl.obolibrary.org/obo/ICD9_ + - + IDO: "&obo;IDO_" - KEGG: http://purl.obolibrary.org/obo/KEGG_ - @@ -653,6 +671,8 @@ use_for_contraction_only: MESH: http://identifiers.org/mesh/ - MGI: "http://www.informatics.jax.org/marker/MGI:" + - + MI: "&obo;MI_" - miRBase: "https://identifiers.org/mirbase:" - @@ -665,6 +685,10 @@ use_for_contraction_only: NCBITaxon: http://purl.bioontology.org/ontology/NCBITAXON/ - NCBITaxon: http://purl.obolibrary.org/obo/ncbitaxon# + - + NCBITaxon: "&obo;NCBITaxon" + - + NCBITaxon: "&obo;NCBITaxon#" - NCIT: http://purl.bioontology.org/ontology/NCI/ - @@ -681,6 +705,8 @@ use_for_contraction_only: NCIT: http://purl.bioontology.org/ontology/NCI_CTCAE_5/ - NCIT: http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl# + - + NCIT: "&obo;NCIT_" - OBO: http://purl.obolibrary.org/obo# - From da66493092bb3ba90768654ad340926374aa08c8 Mon Sep 17 00:00:00 2001 From: ecwood Date: Mon, 26 Aug 2024 23:55:40 -0700 Subject: [PATCH 059/125] #387 final additional weird source links due to FOODON --- maps/curies-to-urls-map.yaml | 38 ++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/maps/curies-to-urls-map.yaml b/maps/curies-to-urls-map.yaml index adf646a7..9ce1b30d 100644 --- a/maps/curies-to-urls-map.yaml +++ b/maps/curies-to-urls-map.yaml @@ -179,6 +179,8 @@ use_for_bidirectional_mapping: FBrf: https://flybase.org/reports/FBrf - FIX: http://purl.obolibrary.org/obo/FIX_ + - + FLOPO: http://purl.obolibrary.org/obo/FLOPO_ - FLU: http://purl.obolibrary.org/obo/FLU_ - @@ -381,6 +383,8 @@ use_for_bidirectional_mapping: OMRSE: http://purl.obolibrary.org/obo/OMRSE_ - OncoTree: http://purl.obolibrary.org/obo/ONCOTREE_ + - + ONS: http://purl.obolibrary.org/obo/ONS_ - OPL: http://purl.obolibrary.org/obo/OPL_ - @@ -445,6 +449,8 @@ use_for_bidirectional_mapping: RXNORM: http://purl.bioontology.org/ontology/RXNORM/ - SEMMEDDB: https://skr3.nlm.nih.gov/SemMedDB + - + SEPIO: http://purl.obolibrary.org/obo/SEPIO_ - sgd: "https://identifiers.org/sgd:" - @@ -599,6 +605,8 @@ use_for_contraction_only: FAO: "&obo;FAO_" - FBgn: https://flybase.org/reports/FBgn + - + FLOPO: "&obo;FLOPO_" - FMA: http://purl.obolibrary.org/obo/fma# - @@ -707,6 +715,8 @@ use_for_contraction_only: NCIT: http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl# - NCIT: "&obo;NCIT_" + - + OBI: "&obo;OBI_" - OBO: http://purl.obolibrary.org/obo# - @@ -717,20 +727,38 @@ use_for_contraction_only: OMIM: http://purl.bioontology.org/ontology/OMIM/ - OMIM: http://identifiers.org/omim/ + - + OMIT: "&obo;OMIT_" - OMOP: http://purl.obolibrary.org/obo/COHD_ + - + OMP: "&obo;OMP_" + - + ONS: "&obo;ONS_" - orphanet: http://purl.bioontology.org/ontology/ORDO/ - orphanet: 
https://data.bioontology.org/ontologies/ORDO/submissions/27/download?apikey=8b5b7825-538d-40e0-9e9e-5ab9274a9aeb - PATO: http://purl.obolibrary.org/obo/pato# + - + PATO: "&obo;PATO_" + - + PCO: "&obo;PCO_" + - + PO: "&obo;PO_" - PomBase: http://www.pombase.org/spombe/result/ - PR: http://purl.obolibrary.org/obo/pr# + - + PR: "&obo;PR_" - RO: http://www.obofoundry.org/ro/ro.owl# + - + RO: "&obo;RO_" + - + SEPIO: "&obo;SEPIO_" - sgd: http://www.yeastgenome.org/cgi-bin/locus.fpl?dbid= - @@ -747,10 +775,18 @@ use_for_contraction_only: SNOMED: http://identifiers.org/snomedct/ - SO: http://purl.obolibrary.org/obo/so# + - + SO: "&obo;SO_" + - + STATO: "&obo;STATO_" + - + TRANS: "&obo;TRANS_" - UBERON: http://purl.obolibrary.org/obo/uberon/insect-anatomy# - UBERON: http://purl.obolibrary.org/obo/uberon# + - + UBERON: "&obo;UBERON_" - UMLS: http://purl.obolibrary.org/obo/UMLS_ - @@ -767,6 +803,8 @@ use_for_contraction_only: UMLS: http://purl.bioontology.org/ontology/MEDLINEPLUS/ - UniProtKB: "http://identifiers.org/uniprot/" + - + UO: "&obo;UO_" - wb: http://www.wormbase.org/species/c_elegans/gene/ - From 0d40be8a6cc6bfac28014ddcdd3c0681e8feb4da Mon Sep 17 00:00:00 2001 From: ecwood Date: Tue, 27 Aug 2024 00:30:39 -0700 Subject: [PATCH 060/125] #387 patch to get around weird ids showing up when trying to prefix match --- maps/curies-to-urls-map.yaml | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/maps/curies-to-urls-map.yaml b/maps/curies-to-urls-map.yaml index 9ce1b30d..6cb9300d 100644 --- a/maps/curies-to-urls-map.yaml +++ b/maps/curies-to-urls-map.yaml @@ -579,10 +579,14 @@ use_for_contraction_only: DDANAT: http://purl.obolibrary.org/obo/ddanat# - DGIdb: https://www.dgidb.org/interaction_types/ + - + dictybase.gene: http://dictybase.org/gene/DDB_ - DOID: "&obo;DOID_" - DRUGBANK: http://purl.bioontology.org/ontology/DRUGBANK/ + - + DRUGBANK: "DrugBank:" - ECOCORE: "&obo;ECOCORE_" - @@ -635,6 +639,8 @@ use_for_contraction_only: HGNC: "http://identifiers.org/hgnc/" - HGNC: http://purl.bioontology.org/ontology/HGNC/ + - + HMDB: "HMDB:" - HP: "http://purl.bioontology.org/ontology/HPO/HP%3A" - @@ -661,6 +667,8 @@ use_for_contraction_only: IDO: "&obo;IDO_" - KEGG: http://purl.obolibrary.org/obo/KEGG_ + - + KEGG.ENZYME: "EC:" - LOINC: http://purl.bioontology.org/ontology/LNC/ - @@ -753,6 +761,8 @@ use_for_contraction_only: PR: http://purl.obolibrary.org/obo/pr# - PR: "&obo;PR_" + - + REACT: "Reactome:" - RO: http://www.obofoundry.org/ro/ro.owl# - @@ -803,6 +813,8 @@ use_for_contraction_only: UMLS: http://purl.bioontology.org/ontology/MEDLINEPLUS/ - UniProtKB: "http://identifiers.org/uniprot/" + - + UniProtKB: "UniProtKB:" - UO: "&obo;UO_" - From 0a229649ba450ef1cbd11da78d235408244783a5 Mon Sep 17 00:00:00 2001 From: ecwood Date: Tue, 27 Aug 2024 02:10:17 -0700 Subject: [PATCH 061/125] #387 more weird prefixes --- maps/curies-to-urls-map.yaml | 90 ++++++++++++++++++++++++++++++++++++ 1 file changed, 90 insertions(+) diff --git a/maps/curies-to-urls-map.yaml b/maps/curies-to-urls-map.yaml index 6cb9300d..84a730ab 100644 --- a/maps/curies-to-urls-map.yaml +++ b/maps/curies-to-urls-map.yaml @@ -543,6 +543,8 @@ use_for_contraction_only: AraPort: https://bar.utoronto.ca/thalemine/portal.do?externalids= - ARO: "&obo;ARO_" + - + ATC: "ATC_code:" - BAO: http://www.bioassayontology.org/bao# - @@ -557,6 +559,10 @@ use_for_contraction_only: biolink: https://w3id.org/biolink/biolink-model - BTO: http://purl.obolibrary.org/obo/bto# + - + BTO: "BTO:" + - + BTO: "&obo;BTO_" - CDNO: "&obo;CDNO_" - @@ 
-571,6 +577,8 @@ use_for_contraction_only: CHMO: "&obo;CHMO_" - CL: http://purl.obolibrary.org/obo/cl# + - + CL: "&obo;CL_" - COB: "&obo;COB_" - @@ -583,10 +591,14 @@ use_for_contraction_only: dictybase.gene: http://dictybase.org/gene/DDB_ - DOID: "&obo;DOID_" + - + DOID: "DOID:" - DRUGBANK: http://purl.bioontology.org/ontology/DRUGBANK/ - DRUGBANK: "DrugBank:" + - + DrugCentral: "Drug_Central:" - ECOCORE: "&obo;ECOCORE_" - @@ -597,10 +609,18 @@ use_for_contraction_only: EFO: http://www.ebi.ac.uk/efo/ - EFO: http://www.ebi.ac.uk/efoIri + - + EFO: "EFO:" + - + EFO: "&efo;EFO_" + - + EHDAA2: "EHDAA2:" - ENSEMBL: http://www.ensembl.org/id/ - ENVO: "&obo;ENVO_" + - + ENVO: "ENVO:" - EPO: "&obo;EPO_" - @@ -615,14 +635,24 @@ use_for_contraction_only: FMA: http://purl.obolibrary.org/obo/fma# - FMA: http://purl.bioontology.org/ontology/FMA/ + - + FMA: "FMA:" - FOODON: "&obo;FOODON_" + - + GARD: "GARD:" - GAZ: "&obo;GAZ_" + - + GAZ: "GAZ:" - GENEPIO: "&obo;GENEPIO_" + - + GENEPIO: "&obo;GENEPIO:" - GO: "&obo;GO_" + - + GO: "GO:" - GO: "http://purl.bioontology.org/ontology/GO/GO%3A" - @@ -637,6 +667,8 @@ use_for_contraction_only: HGNC: "http://purl.bioontology.org/ontology/HGNC/HGNC%3A" - HGNC: "http://identifiers.org/hgnc/" + - + HGNC: "HGNC:" - HGNC: http://purl.bioontology.org/ontology/HGNC/ - @@ -645,6 +677,10 @@ use_for_contraction_only: HP: "http://purl.bioontology.org/ontology/HPO/HP%3A" - HP: http://purl.bioontology.org/ontology/HPO/ + - + HP: "HP:" + - + HP: "&obo;HP_" - IAO: "&obo;IAO_" - @@ -655,6 +691,10 @@ use_for_contraction_only: ICD10: http://purl.obolibrary.org/obo/ICD10AE_ - ICD10: http://purl.obolibrary.org/obo/ICD10CM_ + - + ICD10: "ICD10CM:" + - + ICD10: "ICD10:" - ICD10: http://purl.bioontology.org/ontology/ICD10AE/ - @@ -663,6 +703,8 @@ use_for_contraction_only: ICD9: http://purl.bioontology.org/ontology/ICD9CM/ - ICD9: http://purl.obolibrary.org/obo/ICD9_ + - + ICD9: "ICD9:" - IDO: "&obo;IDO_" - @@ -675,16 +717,30 @@ use_for_contraction_only: MEDDRA: http://purl.obolibrary.org/obo/MedDRA_ - MEDDRA: http://identifiers.org/meddra/ + - + MEDDRA: "MedDRA:" + - + MEDDRA: "MEDDRA:" - medgen: http://purl.obolibrary.org/obo/MEDGEN_ - medgen: http://identifiers.org/medgen/ + - + medgen: "MEDGEN:" + - + medgen: "MedGen:" + - + medgen: "Medgen:" - MESH: http://purl.bioontology.org/ontology/MESH/ - MESH: http://purl.bioontology.org/ontology/MSH/ - MESH: http://identifiers.org/mesh/ + - + MESH: "MESH:" + - + MESH : "MeSH:" - MGI: "http://www.informatics.jax.org/marker/MGI:" - @@ -693,6 +749,8 @@ use_for_contraction_only: miRBase: "https://identifiers.org/mirbase:" - MONDO: http://purl.obolibrary.org/obo/mondo# + - + MONDO: "MONDO:" - NCBIGene: "https://identifiers.org/ncbigene:" - @@ -723,6 +781,16 @@ use_for_contraction_only: NCIT: http://ncicb.nci.nih.gov/xml/owl/EVS/Thesaurus.owl# - NCIT: "&obo;NCIT_" + - + NCIT: "NCI:" + - + NCIT: "NCIT:" + - + NCIT: "NCIt:" + - + NCIT: "NCI Metathesaurus:" + - + NCIT: "NCI_" - OBI: "&obo;OBI_" - @@ -731,10 +799,16 @@ use_for_contraction_only: OBOREL: "http://purl.org/obo/owl/OBO_REL#" - OCRe: http://purl.org/net/OCRe/research.owl# + - + OIO: "oboInOwl:" - OMIM: http://purl.bioontology.org/ontology/OMIM/ - OMIM: http://identifiers.org/omim/ + - + OMIM: "MIM:" + - + OMIM: "OMIM:" - OMIT: "&obo;OMIT_" - @@ -745,6 +819,10 @@ use_for_contraction_only: ONS: "&obo;ONS_" - orphanet: http://purl.bioontology.org/ontology/ORDO/ + - + orphanet: "ORDO:" + - + orphanet: "Orphanet:" - orphanet: 
https://data.bioontology.org/ontologies/ORDO/submissions/27/download?apikey=8b5b7825-538d-40e0-9e9e-5ab9274a9aeb - @@ -753,6 +831,8 @@ use_for_contraction_only: PATO: "&obo;PATO_" - PCO: "&obo;PCO_" + - + PMID: "PMID:" - PO: "&obo;PO_" - @@ -761,12 +841,16 @@ use_for_contraction_only: PR: http://purl.obolibrary.org/obo/pr# - PR: "&obo;PR_" + - + rdfs: "rdfs:" - REACT: "Reactome:" - RO: http://www.obofoundry.org/ro/ro.owl# - RO: "&obo;RO_" + - + RO: "obo:RO_" - SEPIO: "&obo;SEPIO_" - @@ -811,6 +895,12 @@ use_for_contraction_only: UMLS: http://purl.bioontology.org/ontology/MED-RT/ - UMLS: http://purl.bioontology.org/ontology/MEDLINEPLUS/ + - + UMLS: "MedlinePlus:" + - + UMLS: "UMLS_CUI:" + - + UMLS: "UMLS:" - UniProtKB: "http://identifiers.org/uniprot/" - From 919d3b85d5933590144cd7906ed2145a2badd1fb Mon Sep 17 00:00:00 2001 From: ecwood Date: Tue, 27 Aug 2024 02:27:05 -0700 Subject: [PATCH 062/125] #387 today's work on the ontologies ETL --- ontologies_jsonl_to_kg_jsonl.py | 58 +++++++++++++++++++++++++++++++-- 1 file changed, 56 insertions(+), 2 deletions(-) diff --git a/ontologies_jsonl_to_kg_jsonl.py b/ontologies_jsonl_to_kg_jsonl.py index 12784bc8..233ed4d2 100644 --- a/ontologies_jsonl_to_kg_jsonl.py +++ b/ontologies_jsonl_to_kg_jsonl.py @@ -22,6 +22,11 @@ CLASSES_DICT = dict() +URI_MAP = dict() +URI_MAP_KEYS = list() + +MISSING_ID_PREFIXES = set() + def get_args(): arg_parser = argparse.ArgumentParser() arg_parser.add_argument('--test', dest='test', @@ -37,7 +42,9 @@ def process_ontology_item(ontology_item): if ID_TAG not in owl_class: continue # TODO: MAP THIS HERE, since not all sources use same IRIs for the same nodes - node_id = owl_class.get(ID_TAG, str()) + node_id = match_prefix(owl_class.get(ID_TAG, str())) + if node_id is None: + continue # Configure the name name_list = [name.get(TEXT_KEY, None) for name in owl_class.get("rdfs:label", dict()) if TEXT_KEY in name] @@ -98,6 +105,16 @@ def process_ontology_item(ontology_item): if RESOURCE_KEY in edge: edges_list.append((general_edge_type, edge.get(RESOURCE_KEY, None))) + final_edges_list = list() + for (edge_relation, edge_object) in edges_list: + edge_object = match_prefix(edge_object) + if edge_object is None: + continue + edge_relation = match_prefix(edge_relation) + if edge_relation is None: + continue + final_edges_list.append((edge_relation, edge_object)) + # node_id = owl_class.get(ID_TAG, list()) @@ -125,9 +142,43 @@ def process_ontology_item(ontology_item): # node = {"id": node_id, "superclasses": superclasses, "description": description, "xrefs": xrefs, "name": name, "exact_matches": exact_matches} - node = {"id": node_id, "description_list": description_list, "name": name_list, "source": source, "has_biological_sequence": has_biological_sequence, "edges": edges_list} + node = {"id": node_id, "description_list": description_list, "name": name_list, "source": source, "has_biological_sequence": has_biological_sequence, "edges": final_edges_list} print(json.dumps(node, indent=4)) +def generate_uri_map(): + uri_input_map = kg2_util.safe_load_yaml_from_string(kg2_util.read_file_to_string("maps/curies-to-urls-map.yaml")) + bidirectional_map = uri_input_map['use_for_bidirectional_mapping'] + contraction_map = uri_input_map['use_for_contraction_only'] + + for curie_prefix_dict in bidirectional_map: + for curie_prefix in curie_prefix_dict: + curie_url = curie_prefix_dict[curie_prefix] + URI_MAP[curie_url] = curie_prefix + + for curie_prefix_dict in contraction_map: + for curie_prefix in curie_prefix_dict: + curie_url = 
curie_prefix_dict[curie_prefix] + URI_MAP[curie_url] = curie_prefix + + # So that you get the most accurate match, you want to match to the longest url (in case one is a substring of another) + # Apparently have to use global key word to write to a module wide list (https://stackoverflow.com/questions/4630543/defining-lists-as-global-variables-in-python) + global URI_MAP_KEYS + URI_MAP_KEYS = sorted(URI_MAP.keys(), key=len, reverse=True) + +def match_prefix(node_id): + for curie_url in URI_MAP_KEYS: + if node_id.startswith(curie_url): + return node_id.replace(curie_url, URI_MAP[curie_url] + ":") + + if "http" in node_id: + MISSING_ID_PREFIXES.add('/'.join(node_id.split('/')[0:-1]) + "/") + elif ':' in node_id: + MISSING_ID_PREFIXES.add(node_id.split(':')[0] + ":") + elif '_' in node_id: + MISSING_ID_PREFIXES.add(node_id.split('_')[0] + "_") + else: + MISSING_ID_PREFIXES.add(node_id) + if __name__ == '__main__': args = get_args() @@ -138,8 +189,11 @@ def process_ontology_item(ontology_item): input_data = input_read_jsonlines_info[0] owl_class_count = 0 + ontology_prefixes = set() + generate_uri_map() for ontology_item in input_data: process_ontology_item(ontology_item) + print(json.dumps(sorted(list(MISSING_ID_PREFIXES)), indent=4)) # print("OWL Classes:", owl_class_count) # for key in KEYS_DICT: From 6613470ded6b49ab4597b5128f62b3b96d607e8e Mon Sep 17 00:00:00 2001 From: ecwood Date: Tue, 27 Aug 2024 03:07:52 -0700 Subject: [PATCH 063/125] #387 finishing up the different edge types --- maps/curies-to-urls-map.yaml | 6 ++++++ ontologies_jsonl_to_kg_jsonl.py | 32 +++++++++++++++++++++++++------- 2 files changed, 31 insertions(+), 7 deletions(-) diff --git a/maps/curies-to-urls-map.yaml b/maps/curies-to-urls-map.yaml index 84a730ab..3452641c 100644 --- a/maps/curies-to-urls-map.yaml +++ b/maps/curies-to-urls-map.yaml @@ -683,6 +683,8 @@ use_for_contraction_only: HP: "&obo;HP_" - IAO: "&obo;IAO_" + - + IAO: "obo:IAO_" - ICD10: http://purl.bioontology.org/ontology/ICD10/ - @@ -751,6 +753,8 @@ use_for_contraction_only: MONDO: http://purl.obolibrary.org/obo/mondo# - MONDO: "MONDO:" + - + MONDO: "mondo-base:" - NCBIGene: "https://identifiers.org/ncbigene:" - @@ -857,6 +861,8 @@ use_for_contraction_only: sgd: http://www.yeastgenome.org/cgi-bin/locus.fpl?dbid= - skos: http://www.w3.org/2008/05/skos# + - + skos: "skos:" - SNOMED: http://purl.bioontology.org/ontology/SNOMEDCT/ - diff --git a/ontologies_jsonl_to_kg_jsonl.py b/ontologies_jsonl_to_kg_jsonl.py index 233ed4d2..93e119db 100644 --- a/ontologies_jsonl_to_kg_jsonl.py +++ b/ontologies_jsonl_to_kg_jsonl.py @@ -20,6 +20,28 @@ COMMENT_PREFIX = "COMMENTS: " +BASE_EDGE_TYPES = {"mondo-base:exactMatch": RESOURCE_KEY, + "mondo-base:closeMatch": RESOURCE_KEY, + "mondo-base:relatedMatch": RESOURCE_KEY, + "mondo-base:broadMatch": RESOURCE_KEY, + "mondo-base:narrowMatch": RESOURCE_KEY, + "skos:exactMatch": RESOURCE_KEY, + "skos:closeMatch": RESOURCE_KEY, + "skos:broadMatch": RESOURCE_KEY, + "skos:relatedMatch": RESOURCE_KEY, + "skos:narrowMatch": RESOURCE_KEY, + "obo:IAO_0100001": RESOURCE_KEY, + "obo:RO_0002175": RESOURCE_KEY, + "obo:RO_0002161": RESOURCE_KEY, + "obo:RO_0002604": RESOURCE_KEY, + "obo:RO_0002171": RESOURCE_KEY, + "obo:RO_0002174": RESOURCE_KEY, + "obo:RO_0002475": RESOURCE_KEY, + "obo:RO_0001900": RESOURCE_KEY, + "oboInOwl:hasAlternativeId": TEXT_KEY, + "oboInOwl:hasDbXref": TEXT_KEY, + "oboInOwl:xref": TEXT_KEY} + CLASSES_DICT = dict() URI_MAP = dict() @@ -69,15 +91,11 @@ def process_ontology_item(ontology_item): # Extract edge triples 
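The prefix matching added in patch 062 depends on trying the longest base URLs first, so a specific URL wins over a shorter URL that happens to be its prefix. A self-contained sketch with an invented two-entry map (the real map is loaded from maps/curies-to-urls-map.yaml):

URI_MAP = {
    "http://purl.obolibrary.org/obo/": "OBO",            # generic fallback (toy entry)
    "http://purl.obolibrary.org/obo/CHEBI_": "CHEBI",    # more specific, must win
}
# Sort by length, longest first, exactly as generate_uri_map() does.
URI_MAP_KEYS = sorted(URI_MAP.keys(), key=len, reverse=True)

def match_prefix(node_id):
    for curie_url in URI_MAP_KEYS:
        if node_id.startswith(curie_url):
            return node_id.replace(curie_url, URI_MAP[curie_url] + ":")
    return None

print(match_prefix("http://purl.obolibrary.org/obo/CHEBI_15377"))  # CHEBI:15377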
edges_list = list() - for edge_type in ["obo:RO_0002175", "obo:RO_0002161", "obo:RO_0002604", "obo:RO_0002171", "obo:RO_0002174", "obo:RO_0002475", "obo:RO_0001900", "obo:RO_0004050"]: + for edge_type in BASE_EDGE_TYPES: for edge in owl_class.get(edge_type, list()): - if RESOURCE_KEY in edge: - edges_list.append((edge_type, edge.get(RESOURCE_KEY, None))) + if BASE_EDGE_TYPES[edge_type] in edge: + edges_list.append((edge_type, edge.get(BASE_EDGE_TYPES[edge_type], None))) - for edge_type in ["oboInOwl:hasDbXref"]: - for edge in owl_class.get(edge_type, list()): - if TEXT_KEY in edge: - edges_list.append((edge_type, edge.get(TEXT_KEY, None))) restriction_edges = list() restriction_edges += [(edge, "rdfs:subClassOf") for edge in owl_class.get("rdfs:subClassOf", list())] From 07500628e27d474ed55fcd62e1b9012cba279ae1 Mon Sep 17 00:00:00 2001 From: ecwood Date: Wed, 28 Aug 2024 21:04:45 -0700 Subject: [PATCH 064/125] #387 for testing purposes --- validate/validate_kg2_util_curies_urls_categories.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/validate/validate_kg2_util_curies_urls_categories.py b/validate/validate_kg2_util_curies_urls_categories.py index 4e670fae..e144bd21 100755 --- a/validate/validate_kg2_util_curies_urls_categories.py +++ b/validate/validate_kg2_util_curies_urls_categories.py @@ -47,6 +47,9 @@ def make_arg_parser(): kg2_util.CURIE_PREFIX_BIOLINK + ':' + kg2_util.EDGE_LABEL_BIOLINK_RELATED_TO) +print("TESTTESTTEST") +print(biolink_edge_labels) + for variable_name in dir(kg2_util): variable_value = getattr(kg2_util, variable_name) if variable_name.startswith('CURIE_PREFIX_'): From 7ed39cbfd81f551a138e861a75b5e912ced7d0d4 Mon Sep 17 00:00:00 2001 From: ecwood Date: Wed, 28 Aug 2024 23:54:09 -0700 Subject: [PATCH 065/125] #387 don't need that print statement anymore --- validate/validate_kg2_util_curies_urls_categories.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/validate/validate_kg2_util_curies_urls_categories.py b/validate/validate_kg2_util_curies_urls_categories.py index e144bd21..4e670fae 100755 --- a/validate/validate_kg2_util_curies_urls_categories.py +++ b/validate/validate_kg2_util_curies_urls_categories.py @@ -47,9 +47,6 @@ def make_arg_parser(): kg2_util.CURIE_PREFIX_BIOLINK + ':' + kg2_util.EDGE_LABEL_BIOLINK_RELATED_TO) -print("TESTTESTTEST") -print(biolink_edge_labels) - for variable_name in dir(kg2_util): variable_value = getattr(kg2_util, variable_name) if variable_name.startswith('CURIE_PREFIX_'): From 7d184003a12d5a8571de6f359988196bb7dde5b0 Mon Sep 17 00:00:00 2001 From: ecwood Date: Wed, 28 Aug 2024 23:54:35 -0700 Subject: [PATCH 066/125] #387 looks like we're not using this anymore --- validate/{ => archive}/validate_ont_load_inventory.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename validate/{ => archive}/validate_ont_load_inventory.py (100%) diff --git a/validate/validate_ont_load_inventory.py b/validate/archive/validate_ont_load_inventory.py similarity index 100% rename from validate/validate_ont_load_inventory.py rename to validate/archive/validate_ont_load_inventory.py From 2eb8b00848ac2149a5228d95f0dcc44d0c36b968 Mon Sep 17 00:00:00 2001 From: ecwood Date: Thu, 29 Aug 2024 01:33:58 -0700 Subject: [PATCH 067/125] #387 try out the new validate kg2 util --- validate/run-validation-tests.sh | 4 +- ...alidate_kg2_util_curies_urls_categories.py | 56 +++++++++++++------ 2 files changed, 42 insertions(+), 18 deletions(-) diff --git a/validate/run-validation-tests.sh b/validate/run-validation-tests.sh index f2c997de..4bcec92f 100755 
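On the restriction handling in patch 063: each rdfs:subClassOf entry can carry an owl:Restriction whose owl:onProperty supplies the predicate IRI and whose owl:someValuesFrom supplies the object IRI. A toy example of that unpacking (invented IRIs; the real code also walks owl:equivalentClass / owl:intersectionOf and enforces the length-one checks with asserts):

RESOURCE_KEY = "rdf:resource"

edge = {
    "owl:Restriction": [{
        "owl:onProperty": [{RESOURCE_KEY: "http://purl.obolibrary.org/obo/BFO_0000050"}],
        "owl:someValuesFrom": [{RESOURCE_KEY: "http://purl.obolibrary.org/obo/UBERON_0000955"}],
    }]
}

edges_list = []
for restriction in edge.get("owl:Restriction", list()):
    edge_type = restriction.get("owl:onProperty", list())
    edge_object = restriction.get("owl:someValuesFrom", list())
    if len(edge_type) == 1 and len(edge_object) == 1:
        edges_list.append((edge_type[0].get(RESOURCE_KEY), edge_object[0].get(RESOURCE_KEY)))

print(edges_list)  # [('...BFO_0000050', '...UBERON_0000955')]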
--- a/validate/run-validation-tests.sh +++ b/validate/run-validation-tests.sh @@ -68,8 +68,8 @@ ${python_command} -u ${VALIDATE_CODE_DIR}/validate_curies_to_urls_map_yaml.py \ ${python_command} -u ${VALIDATE_CODE_DIR}/validate_kg2_util_curies_urls_categories.py \ ${curies_to_urls_file} \ - ${biolink_model_owl_url} \ - ${biolink_model_owl_local_file} + ${biolink_model_yaml_url} \ + ${biolink_model_yaml_local_file} ${python_command} -u ${VALIDATE_CODE_DIR}/validate_predicate_remap_yaml.py \ ${curies_to_urls_file} \ diff --git a/validate/validate_kg2_util_curies_urls_categories.py b/validate/validate_kg2_util_curies_urls_categories.py index 4e670fae..4e09b14b 100755 --- a/validate/validate_kg2_util_curies_urls_categories.py +++ b/validate/validate_kg2_util_curies_urls_categories.py @@ -6,7 +6,7 @@ __author__ = 'Stephen Ramsey' __copyright__ = 'Oregon State University' -__credits__ = ['Stephen Ramsey'] +__credits__ = ['Stephen Ramsey', 'Erica Wood'] __license__ = 'MIT' __version__ = '0.1.0' __maintainer__ = '' @@ -15,6 +15,11 @@ import argparse import kg2_util +import json + +DESCENDANT_KEY = "is_a" +BASE_PREDICATE = "related to" +BASE_CATEGORY = "named thing" def make_arg_parser(): arg_parser = argparse.ArgumentParser(description='validate_kg2_util_curies_urls_categories.py: ' + @@ -24,28 +29,51 @@ def make_arg_parser(): arg_parser.add_argument('biolinkModelLocalFile', type=str) return arg_parser +def construct_biolink_term_set(is_a_base, biolink_terms): + output_set = set() + for key in biolink_terms: + key_is_a = biolink_terms[key] + if key_is_a == is_a_base: + for item in construct_biolink_term_set(key, biolink_terms): + output_set.add(item) + output_set.add(is_a_base) + return output_set + +def identify_biolink_terms(biolink_model): + biolink_predicate_terms = dict() + biolink_category_terms = dict() + for predicate in biolink_model["slots"]: + if DESCENDANT_KEY in biolink_model["slots"][predicate]: + biolink_predicate_terms[predicate] = biolink_model["slots"][predicate][DESCENDANT_KEY] + + for category in biolink_model["classes"]: + if DESCENDANT_KEY in biolink_model["classes"][category]: + biolink_category_terms[category] = biolink_model["classes"][category][DESCENDANT_KEY] + + biolink_predicates = construct_biolink_term_set("related to", biolink_predicate_terms) + biolink_categories = construct_biolink_term_set("named thing", biolink_category_terms) + + return list(biolink_predicates), list(biolink_categories) + + args = make_arg_parser().parse_args() biolink_model_url = args.biolinkModelURL biolink_model_file_name = args.biolinkModelLocalFile curies_to_urls_map_file_name = args.curiesToURLsMapFile -iri_shortener = kg2_util.make_uri_to_curie_shortener(kg2_util.make_curies_to_uri_map(kg2_util.read_file_to_string(curies_to_urls_map_file_name), - kg2_util.IDMapperType.CONTRACT)) - curies_to_url_map_data = kg2_util.safe_load_yaml_from_string(kg2_util.read_file_to_string(curies_to_urls_map_file_name)) curies_to_url_map_data_bidir = {key: listitem[key] for listitem in curies_to_url_map_data['use_for_bidirectional_mapping'] for key in listitem.keys()} curies_to_url_map_data_cont = {key: listitem[key] for listitem in curies_to_url_map_data['use_for_contraction_only'] for key in listitem.keys()} +valid_base_urls = list() +valid_base_urls += [value for value in curies_to_url_map_data_bidir.values()] +valid_base_urls += [value for value in curies_to_url_map_data_cont.values()] kg2_util.download_file_if_not_exist_locally(biolink_model_url, biolink_model_file_name) -biolink_ont = 
kg2_util.make_ontology_from_local_file(biolink_model_file_name) -biolink_categories_ontology_depths = kg2_util.get_biolink_categories_ontology_depths(biolink_ont) - -biolink_edge_labels = kg2_util.ont_children_recursive(biolink_ont, - kg2_util.CURIE_PREFIX_BIOLINK + ':' + - kg2_util.EDGE_LABEL_BIOLINK_RELATED_TO) +biolink_model = kg2_util.safe_load_yaml_from_string(kg2_util.read_file_to_string(biolink_model_file_name)) +biolink_edge_labels, biolink_categories = identify_biolink_terms(biolink_model) for variable_name in dir(kg2_util): variable_value = getattr(kg2_util, variable_name) @@ -53,21 +81,17 @@ def make_arg_parser(): assert variable_value in curies_to_url_map_data_bidir, variable_name elif variable_name.startswith('BASE_URL_'): url_str = variable_value - curie = iri_shortener(url_str) - assert curie is not None, url_str + assert url_str in valid_base_urls, url_str elif variable_name.startswith('BIOLINK_CATEGORY_'): category_label = variable_value category_camelcase = kg2_util.convert_space_case_to_camel_case(category_label) category_curie = kg2_util.CURIE_PREFIX_BIOLINK + ':' + category_camelcase - assert category_curie in biolink_categories_ontology_depths, category_curie + assert category_curie in biolink_categories, category_curie # assert category_label in categories_to_check, category_label elif variable_name.startswith('CURIE_ID_'): curie_id = variable_value assert ':' in curie_id, variable_name assert curie_id.split(':')[0] in curies_to_url_map_data_bidir, variable_name - elif variable_name.startswith('IRI_'): - url = variable_value - assert iri_shortener(url) is not None, url elif variable_name.startswith('EDGE_LABEL_BIOLINK_'): relation_label = variable_value assert kg2_util.CURIE_PREFIX_BIOLINK + ':' + relation_label in biolink_edge_labels, relation_label From 2375994caaf693f8127e509e407a3cb3b943932a Mon Sep 17 00:00:00 2001 From: ecwood Date: Thu, 29 Aug 2024 01:53:42 -0700 Subject: [PATCH 068/125] #387 drastic changes: REMOVAL OF ONTOBIO from kg2_util and validation --- validate/run-validation-tests.sh | 4 +- .../validate_curies_to_categories_yaml.py | 12 +++--- ...alidate_kg2_util_curies_urls_categories.py | 42 ++----------------- 3 files changed, 12 insertions(+), 46 deletions(-) diff --git a/validate/run-validation-tests.sh b/validate/run-validation-tests.sh index 4bcec92f..7cdb7974 100755 --- a/validate/run-validation-tests.sh +++ b/validate/run-validation-tests.sh @@ -58,8 +58,8 @@ ${curl_get} ${infores_catalog_yaml_url} -o ${infores_catalog_yaml} ${python_command} -u ${VALIDATE_CODE_DIR}/validate_curies_to_categories_yaml.py \ ${curies_to_categories_file} \ ${curies_to_urls_file} \ - ${biolink_model_owl_url} \ - ${biolink_model_owl_local_file} + ${biolink_model_yaml_url} \ + ${biolink_model_yaml_local_file} ${python_command} -u ${VALIDATE_CODE_DIR}/validate_curies_to_urls_map_yaml.py \ ${curies_to_urls_file} \ diff --git a/validate/validate_curies_to_categories_yaml.py b/validate/validate_curies_to_categories_yaml.py index 3af15002..80cdde53 100755 --- a/validate/validate_curies_to_categories_yaml.py +++ b/validate/validate_curies_to_categories_yaml.py @@ -7,7 +7,7 @@ __author__ = 'Stephen Ramsey' __copyright__ = 'Oregon State University' -__credits__ = ['Stephen Ramsey'] +__credits__ = ['Stephen Ramsey', 'Erica Wood'] __license__ = 'MIT' __version__ = '0.1.0' __maintainer__ = '' @@ -22,8 +22,8 @@ def make_arg_parser(): arg_parser = argparse.ArgumentParser(description='validate_curies_to_categories.py: checks the file `curies-to-categories.yaml` for correctness.') 
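The recursion at the heart of the new validation code collects every Biolink term reachable from a base term through "is_a" links. The function below is the same construct_biolink_term_set shown in the diff, exercised on an invented slice of the hierarchy (the real input is the Biolink model YAML):

def construct_biolink_term_set(is_a_base, biolink_terms):
    output_set = set()
    for key in biolink_terms:
        if biolink_terms[key] == is_a_base:
            # Recurse into each direct child, accumulating its descendants.
            for item in construct_biolink_term_set(key, biolink_terms):
                output_set.add(item)
    output_set.add(is_a_base)
    return output_set

toy_terms = {"treats": "related to",      # toy entries, not the real model
             "affects": "related to",
             "ameliorates": "treats"}
print(construct_biolink_term_set("related to", toy_terms))
# {'related to', 'treats', 'affects', 'ameliorates'}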
arg_parser.add_argument('curiesToCategoriesFile', type=str) arg_parser.add_argument('curiesToURLsMapFile', type=str) - arg_parser.add_argument('biolinkModelOWLURL', type=str) - arg_parser.add_argument('biolinkModelOWLLocalFile', type=str) + arg_parser.add_argument('biolinkModelYAMLURL', type=str) + arg_parser.add_argument('biolinkModelYAMLLocalFile', type=str) return arg_parser @@ -37,8 +37,8 @@ def make_arg_parser(): curies_to_url_map_data_bidir = {next(iter(listitem.keys())) for listitem in curies_to_url_map_data['use_for_bidirectional_mapping']} kg2_util.download_file_if_not_exist_locally(biolink_model_url, biolink_model_file_name) -biolink_ont = kg2_util.make_ontology_from_local_file(biolink_model_file_name) -biolink_categories_ontology_depths = kg2_util.get_biolink_categories_ontology_depths(biolink_ont) +biolink_model = kg2_util.safe_load_yaml_from_string(kg2_util.read_file_to_string(biolink_model_file_name)) +_, biolink_categories = kg2_util.identify_biolink_terms(biolink_model) for prefix in curies_to_categories_data['prefix-mappings'].keys(): assert prefix in curies_to_url_map_data_bidir, prefix @@ -53,4 +53,4 @@ def make_arg_parser(): for category in categories_to_check: category_camelcase = kg2_util.convert_space_case_to_camel_case(category) category_curie = kg2_util.CURIE_PREFIX_BIOLINK + ':' + category_camelcase - assert category_curie in biolink_categories_ontology_depths, category_curie + assert category_curie in biolink_categories, category_curie diff --git a/validate/validate_kg2_util_curies_urls_categories.py b/validate/validate_kg2_util_curies_urls_categories.py index 4e09b14b..4ba57d50 100755 --- a/validate/validate_kg2_util_curies_urls_categories.py +++ b/validate/validate_kg2_util_curies_urls_categories.py @@ -17,10 +17,6 @@ import kg2_util import json -DESCENDANT_KEY = "is_a" -BASE_PREDICATE = "related to" -BASE_CATEGORY = "named thing" - def make_arg_parser(): arg_parser = argparse.ArgumentParser(description='validate_kg2_util_curies_urls_categories.py: ' + 'checks the file `kg2_util.py` for correctness for its CURIE IDs, Base URLs, and biolink categories.') @@ -29,34 +25,6 @@ def make_arg_parser(): arg_parser.add_argument('biolinkModelLocalFile', type=str) return arg_parser -def construct_biolink_term_set(is_a_base, biolink_terms): - output_set = set() - for key in biolink_terms: - key_is_a = biolink_terms[key] - if key_is_a == is_a_base: - for item in construct_biolink_term_set(key, biolink_terms): - output_set.add(item) - output_set.add(is_a_base) - return output_set - -def identify_biolink_terms(biolink_model): - biolink_predicate_terms = dict() - biolink_category_terms = dict() - for predicate in biolink_model["slots"]: - if DESCENDANT_KEY in biolink_model["slots"][predicate]: - biolink_predicate_terms[predicate] = biolink_model["slots"][predicate][DESCENDANT_KEY] - - for category in biolink_model["classes"]: - if DESCENDANT_KEY in biolink_model["classes"][category]: - biolink_category_terms[category] = biolink_model["classes"][category][DESCENDANT_KEY] - - biolink_predicates = construct_biolink_term_set("related to", biolink_predicate_terms) - biolink_categories = construct_biolink_term_set("named thing", biolink_category_terms) - - return list(biolink_predicates), list(biolink_categories) - - - args = make_arg_parser().parse_args() biolink_model_url = args.biolinkModelURL biolink_model_file_name = args.biolinkModelLocalFile @@ -73,7 +41,7 @@ def identify_biolink_terms(biolink_model): kg2_util.download_file_if_not_exist_locally(biolink_model_url, 
                                             biolink_model_file_name)
 biolink_model = kg2_util.safe_load_yaml_from_string(kg2_util.read_file_to_string(biolink_model_file_name))
-biolink_edge_labels, biolink_categories = identify_biolink_terms(biolink_model)
+biolink_edge_labels, biolink_categories = kg2_util.identify_biolink_terms(biolink_model)

 for variable_name in dir(kg2_util):
     variable_value = getattr(kg2_util, variable_name)
@@ -84,14 +52,12 @@ def identify_biolink_terms(biolink_model):
         assert variable_value in curies_to_url_map_data_bidir, variable_name
     elif variable_name.startswith('BASE_URL_'):
         url_str = variable_value
-        curie = iri_shortener(url_str)
-        assert curie is not None, url_str
+        assert url_str in valid_base_urls, url_str
     elif variable_name.startswith('BIOLINK_CATEGORY_'):
         category_label = variable_value
-        category_camelcase = kg2_util.convert_space_case_to_camel_case(category_label)
-        category_curie = kg2_util.CURIE_PREFIX_BIOLINK + ':' + category_camelcase
-        assert category_curie in biolink_categories, category_curie
+        assert category_label in biolink_categories, category_label
         # assert category_label in categories_to_check, category_label
     elif variable_name.startswith('CURIE_ID_'):
         curie_id = variable_value
         assert ':' in curie_id, variable_name
         assert curie_id.split(':')[0] in curies_to_url_map_data_bidir, variable_name
     elif variable_name.startswith('EDGE_LABEL_BIOLINK_'):
-        relation_label = variable_value
-        assert kg2_util.CURIE_PREFIX_BIOLINK + ':' + relation_label in biolink_edge_labels, relation_label
+        relation_label = variable_value.replace('_', ' ')
+        assert relation_label in biolink_edge_labels, relation_label

From 5508b9b42407d43514d7f578a8bff9b58d811a0c Mon Sep 17 00:00:00 2001
From: ecwood
Date: Thu, 29 Aug 2024 01:54:55 -0700
Subject: [PATCH 069/125] #387 the ACTUAL removal of ontobio

---
 kg2_util.py | 220 +++++++---------------------------------------------
 1 file changed, 30 insertions(+), 190 deletions(-)

diff --git a/kg2_util.py b/kg2_util.py
index a5aa0971..a32815f2 100644
--- a/kg2_util.py
+++ b/kg2_util.py
@@ -24,12 +24,10 @@
 import json
 import jsonlines
 import math
-import ontobio
 import os
 import pathlib
 import pickle
 import pprint
-import prefixcommons
 import re
 import shutil
 import ssl
@@ -137,8 +135,6 @@
 BASE_URL_IDENTIFIERS_ORG_REGISTRY = \
     'https://registry.identifiers.org/registry/'
 BASE_URL_BIOLINK_CONCEPTS = 'https://w3id.org/biolink/vocab/'
-BASE_URL_BIOLINK_ONTOLOGY = 'https://w3id.org/biolink/biolink-model.owl.ttl'
-BASE_URL_BIOLINK_META = 'https://w3id.org/biolink/biolinkml/meta/'
 BASE_URL_CHEMBL_COMPOUND = BASE_BASE_URL_IDENTIFIERS_ORG + 'chembl.compound:'
 BASE_URL_CHEMBL_TARGET = BASE_BASE_URL_IDENTIFIERS_ORG + 'chembl.target:'
 BASE_URL_CHEMBL_MECHANISM = 'https://www.ebi.ac.uk/chembl/mechanism/inspect/'
@@ -245,9 +241,6 @@
 CURIE_ID_UNICHEM = CURIE_PREFIX_UNICHEM_SOURCE + ':'
 CURIE_ID_RDFS_SUBCLASS_OF = CURIE_PREFIX_RDFS + ':' + 'subClassOf'

-IRI_OBO_FORMAT_XREF = BASE_URL_OBO_FORMAT + 'xref'
-IRI_OWL_SAME_AS = BASE_URL_OWL + 'sameAs'
-
 EDGE_LABEL_OWL_SAME_AS = 'same_as'
 EDGE_LABEL_BIOLINK_GENE_ASSOCIATED_WITH_CONDITION = 'gene_associated_with_condition'
 EDGE_LABEL_BIOLINK_GENE_PRODUCT_OF = 'gene_product_of'
@@ -275,6 +268,9 @@
 OBO_ONT_CURIE_RE = re.compile(r'OBO:([^\.]+)\.owl')
 LOWER_TO_UPPER_RE = re.compile(r'([a-z0-9])([A-Z][^A-Z])')

+DESCENDANT_KEY = "is_a"
+BASE_PREDICATE = EDGE_LABEL_BIOLINK_RELATED_TO.replace('_', ' ')
+BASE_CATEGORY = BIOLINK_CATEGORY_NAMED_THING

 def convert_date(time):
     return datetime.datetime.fromtimestamp(time).strftime('%Y-%m-%d %H:%M:%S')
@@ -410,126 +406,6 @@ def allcaps_to_only_first_letter_capitalized(allcaps: str):
 def safe_load_yaml_from_string(yaml_string: str):
     return yaml.safe_load(io.StringIO(yaml_string))

-
-def shorten_iri_to_curie(iri: str,
curie_to_iri_map: list) -> str: - if iri is None: - raise ValueError('cannot shorten an IRI with value None') - curie_list = prefixcommons.contract_uri(iri, - curie_to_iri_map) - if len(curie_list) == 0: - return None - - if len(curie_list) == 1: - curie_id = curie_list[0] - else: - assert False, \ - "somehow got a list after calling prefixcommons.contract: " + \ - iri + "; list is: " + str(curie_list) - curie_id = None - - # if curie_id is not None: - # # deal with IRIs like 'https://identifiers.org/umls/ATC/L01AX02' which get converted to CURIE 'UMLS:ATC/L01AX02' - # umls_match = REGEX_UMLS_CURIE.match(curie_id) - # if umls_match is not None: - # curie_id = umls_match[1] + ':' + umls_match[2] - - return curie_id - - -def make_uri_to_curie_shortener(curie_to_iri_map=None) -> callable: - if curie_to_iri_map is None: - curie_to_iri_map = [] - return lambda iri: shorten_iri_to_curie(iri, curie_to_iri_map) - - -def expand_curie_to_iri(curie_id: str, curie_to_iri_map: list) -> Optional[str]: - if curie_id.startswith('UMLS:CN'): - curie_id = curie_id.replace('UMLS:CN', 'medgen:CN') # see GitHub issue 810 - iri = prefixcommons.expand_uri(curie_id, curie_to_iri_map) - if iri == curie_id: - iri = None - return iri - - -def make_curie_to_uri_expander(curie_to_iri_map: list = None) -> callable: - if curie_to_iri_map is None: - curie_to_iri_map = [] - return lambda curie_id: expand_curie_to_iri(curie_id, curie_to_iri_map) - - -class IDMapperType(enum.Enum): - EXPAND = 1 - CONTRACT = 2 - - -def make_curies_to_uri_map(curies_to_uri_map_yaml_string: str, mapper_type: IDMapperType) -> dict: - yaml_data_structure_dict = safe_load_yaml_from_string(curies_to_uri_map_yaml_string) - if mapper_type == IDMapperType.CONTRACT: - return typing.cast(list, typing.cast(list, typing.cast(list, yaml_data_structure_dict['use_for_bidirectional_mapping']) + - yaml_data_structure_dict['use_for_contraction_only'])) - elif mapper_type == IDMapperType.EXPAND: - return typing.cast(list, typing.cast(list, yaml_data_structure_dict['use_for_bidirectional_mapping']) + - typing.cast(list, yaml_data_structure_dict['use_for_expansion_only'])) - else: - raise ValueError("Invalid mapper type: " + str(mapper_type)) - - -def get_biolink_category_tree(biolink_ontology: ontobio.ontol.Ontology): - queue = collections.deque([CURIE_PREFIX_BIOLINK + ':' + 'NamedThing']) - biolink_category_dict = dict() - biolink_category_tree = dict() - - while len(queue) > 0: - node_id = queue.popleft() - biolink_category_dict[node_id] = [] - for child_node_id in biolink_ontology.children(node_id, ['subClassOf']): - biolink_category_dict[node_id].append(child_node_id) - queue.append(child_node_id) - - for parent, children in biolink_category_dict.items(): - parent = biolink_ontology.node(parent)['lbl'] - for child in children: - if parent not in biolink_category_tree: - biolink_category_tree[parent] = [] - child = biolink_ontology.node(child)['lbl'] - biolink_category_tree[parent].append(child) - biolink_category_tree[parent] = sorted(biolink_category_tree[parent]) - - return biolink_category_tree - - -def get_depths_of_ontology_terms(ontology: ontobio.ontol.Ontology, - top_node_id: str): - queue = collections.deque([top_node_id]) - distances = dict() - distances[top_node_id] = 0 - while len(queue) > 0: - node_id = queue.popleft() - node_dist = distances.get(node_id, math.inf) - assert not math.isinf(node_dist) - for child_node_id in ontology.children(node_id, ['subClassOf']): - if math.isinf(distances.get(child_node_id, math.inf)): - distances[child_node_id] 
= node_dist + 1 - queue.append(child_node_id) - return distances - - -def get_biolink_categories_ontology_depths(biolink_ontology: ontobio.ontol.Ontology): - url_depths = get_depths_of_ontology_terms(biolink_ontology, CURIE_PREFIX_BIOLINK + ':NamedThing') - ret_depths = {key.replace(BASE_URL_BIOLINK_META, ''): value for key, value in url_depths.items()} - ret_depths['UnknownCategory'] = -1 - return ret_depths - - -def make_uri_curie_mappers(curies_to_uri_file_name: str) -> Dict[str, callable]: - yaml_string = read_file_to_string(curies_to_uri_file_name) - expand_map = make_curies_to_uri_map(yaml_string, IDMapperType.EXPAND) - contract_map = make_curies_to_uri_map(yaml_string, IDMapperType.CONTRACT) - expander = make_curie_to_uri_expander(expand_map) - contracter = make_uri_to_curie_shortener(contract_map) - return {'expand': expander, 'contract': contracter} - - def log_message(message: str, ontology_name: str = None, node_curie_id: str = None, @@ -807,15 +683,32 @@ def predicate_label_to_curie(predicate_label: str, predicate_label_to_use = predicate_label.replace(':', '_') return relation_curie_prefix + ':' + predicate_label_to_use - -def ont_children_recursive(ont_hier: ontobio.ontol.Ontology, - node_name: str): - res_set = {node_name} - for child_node_name in ont_hier.children(node_name): - res_set |= ont_children_recursive(ont_hier, child_node_name) - return res_set - - +def construct_biolink_term_set(is_a_base, biolink_terms): + output_set = set() + for key in biolink_terms: + key_is_a = biolink_terms[key] + if key_is_a == is_a_base: + for item in construct_biolink_term_set(key, biolink_terms): + output_set.add(item) + output_set.add(is_a_base) + return output_set + +def identify_biolink_terms(biolink_model): + biolink_predicate_terms = dict() + biolink_category_terms = dict() + for predicate in biolink_model["slots"]: + if DESCENDANT_KEY in biolink_model["slots"][predicate]: + biolink_predicate_terms[predicate] = biolink_model["slots"][predicate][DESCENDANT_KEY] + + for category in biolink_model["classes"]: + if DESCENDANT_KEY in biolink_model["classes"][category]: + biolink_category_terms[category] = biolink_model["classes"][category][DESCENDANT_KEY] + + biolink_predicates = construct_biolink_term_set(BASE_PREDICATE, biolink_predicate_terms) + biolink_categories = construct_biolink_term_set(BASE_CATEGORY, biolink_category_terms) + + return list(biolink_predicates), list(biolink_categories) + def make_edge_biolink(subject_curie_id: str, object_curie_id: str, predicate_label: str, @@ -839,57 +732,4 @@ def is_a_valid_http_url(id: str) -> bool: valid = id.startswith('http://') or id.startswith('https://') except validators.ValidationFailure: valid = False - return valid - - -def load_ontology_from_owl_or_json_file(ontology_file_name: str): - if ontology_file_name.startswith('./'): - ontology_file_name = ontology_file_name[2:(len(ontology_file_name)+1)] - ont_factory = ontobio.ontol_factory.OntologyFactory() - return ont_factory.create(ontology_file_name, ignore_cache=True) - - -# This function will load the ontology object from a pickle file (if it exists) -# or it will create the ontology object by parsing the OWL-XML ontology file -# NOTE: it seems that ontobio can't directly read a TTL file (at least, it is -# not working for me), so we convert all input files (whether OWL or TTL) to -# JSON and then load the JSON files using ontobio, for "simplicity". 
A second -# reason why we load using JSON is because when it loads an OWL file, ontobio -# does some internal caching that cannot be opted out of; it does not do this -# caching if you load an ontology in JSON format. -def make_ontology_from_local_file(file_name: str, save_pickle: bool = False): - file_name_without_ext = os.path.splitext(file_name)[0] - file_name_with_pickle_ext = file_name_without_ext + ".pickle" - if not os.path.isfile(file_name_with_pickle_ext) or save_pickle: - # the ontology hsa not been saved as a pickle file, so we need to load it from a text file - if not file_name.endswith('.json'): - temp_file_name = tempfile.mkstemp(prefix=TEMP_FILE_PREFIX + '-')[1] + '.json' - size = os.path.getsize(file_name) - log_message(message="Reading ontology file: " + file_name + "; size: " + "{0:.2f}".format(size/1024) + " KiB", - ontology_name=None) - cp = subprocess.run(['owltools', file_name, '-o', '-f', 'json', temp_file_name], - check=True) - # robot commented out because it is giving a NullPointerException on umls-semantictypes.owl - # Once robot no longer gives a NullPointerException, we can use it like this: - #cp = subprocess.run(['robot', 'convert', '--input', file_name, '--output', temp_file_name]) - if cp.stdout is not None: - log_message(message="OWL convert result: " + cp.stdout, ontology_name=None, output_stream=sys.stdout) - if cp.stderr is not None: - log_message(message="OWL convert result: " + cp.stderr, ontology_name=None, output_stream=sys.stderr) - assert cp.returncode == 0 - json_file = file_name_without_ext + ".json" - shutil.move(temp_file_name, json_file) - else: - json_file = file_name - size = os.path.getsize(json_file) - log_message(message="Reading ontology JSON file: " + json_file + "; size: " + "{0:.2f}".format(size/1024) + " KiB", - ontology_name=None) - assert os.path.exists(json_file) - ont_return = load_ontology_from_owl_or_json_file(json_file) - if save_pickle: - pickle.dump(ont_return, open(file_name_with_pickle_ext, 'wb')) - else: - size = os.path.getsize(file_name_with_pickle_ext) - log_message("Reading ontology file: " + file_name_with_pickle_ext + "; size: " + "{0:.2f}".format(size/1024) + " KiB", ontology_name=None) - ont_return = pickle.load(open(file_name_with_pickle_ext, "rb")) - return ont_return + return valid \ No newline at end of file From a8906951c0ff5462592e32ab16f41af4a6f57528 Mon Sep 17 00:00:00 2001 From: ecwood Date: Thu, 29 Aug 2024 01:57:56 -0700 Subject: [PATCH 070/125] #387 addressing a name change --- validate/validate_curies_to_categories_yaml.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/validate/validate_curies_to_categories_yaml.py b/validate/validate_curies_to_categories_yaml.py index 80cdde53..8d8d6131 100755 --- a/validate/validate_curies_to_categories_yaml.py +++ b/validate/validate_curies_to_categories_yaml.py @@ -30,8 +30,8 @@ def make_arg_parser(): args = make_arg_parser().parse_args() curies_to_categories_file_name = args.curiesToCategoriesFile curies_to_urls_map_file_name = args.curiesToURLsMapFile -biolink_model_url = args.biolinkModelOWLURL -biolink_model_file_name = args.biolinkModelOWLLocalFile +biolink_model_url = args.biolinkModelYAMLURL +biolink_model_file_name = args.biolinkModelYAMLLocalFile curies_to_categories_data = kg2_util.safe_load_yaml_from_string(kg2_util.read_file_to_string(curies_to_categories_file_name)) curies_to_url_map_data = kg2_util.safe_load_yaml_from_string(kg2_util.read_file_to_string(curies_to_urls_map_file_name)) curies_to_url_map_data_bidir = 
From a8906951c0ff5462592e32ab16f41af4a6f57528 Mon Sep 17 00:00:00 2001
From: ecwood
Date: Thu, 29 Aug 2024 01:57:56 -0700
Subject: [PATCH 070/125] #387 addressing a name change

---
 validate/validate_curies_to_categories_yaml.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/validate/validate_curies_to_categories_yaml.py b/validate/validate_curies_to_categories_yaml.py
index 80cdde53..8d8d6131 100755
--- a/validate/validate_curies_to_categories_yaml.py
+++ b/validate/validate_curies_to_categories_yaml.py
@@ -30,8 +30,8 @@ def make_arg_parser():
 args = make_arg_parser().parse_args()
 curies_to_categories_file_name = args.curiesToCategoriesFile
 curies_to_urls_map_file_name = args.curiesToURLsMapFile
-biolink_model_url = args.biolinkModelOWLURL
-biolink_model_file_name = args.biolinkModelOWLLocalFile
+biolink_model_url = args.biolinkModelYAMLURL
+biolink_model_file_name = args.biolinkModelYAMLLocalFile
 curies_to_categories_data = kg2_util.safe_load_yaml_from_string(kg2_util.read_file_to_string(curies_to_categories_file_name))
 curies_to_url_map_data = kg2_util.safe_load_yaml_from_string(kg2_util.read_file_to_string(curies_to_urls_map_file_name))
 curies_to_url_map_data_bidir = {next(iter(listitem.keys())) for listitem in curies_to_url_map_data['use_for_bidirectional_mapping']}

From 86492f525239b08b74b3dc91b7a47cea9dd7a32e Mon Sep 17 00:00:00 2001
From: ecwood
Date: Thu, 29 Aug 2024 02:01:47 -0700
Subject: [PATCH 071/125] #387 format adjustment

---
 validate/validate_curies_to_categories_yaml.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/validate/validate_curies_to_categories_yaml.py b/validate/validate_curies_to_categories_yaml.py
index 8d8d6131..63b96f88 100755
--- a/validate/validate_curies_to_categories_yaml.py
+++ b/validate/validate_curies_to_categories_yaml.py
@@ -51,6 +51,4 @@ def make_arg_parser():
     list(curies_to_categories_data['term-mappings'].values())

 for category in categories_to_check:
-    category_camelcase = kg2_util.convert_space_case_to_camel_case(category)
-    category_curie = kg2_util.CURIE_PREFIX_BIOLINK + ':' + category_camelcase
-    assert category_curie in biolink_categories, category_curie
+    assert category in biolink_categories, category

From 86d431ffdc0dbfbdb299916cd81e9967a373a3c1 Mon Sep 17 00:00:00 2001
From: ecwood
Date: Thu, 29 Aug 2024 02:05:30 -0700
Subject: [PATCH 072/125] #387 adjustments to the mapping file to go along with
 new comparison system

---
 maps/curies-to-urls-map.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/maps/curies-to-urls-map.yaml b/maps/curies-to-urls-map.yaml
index 3452641c..c81cb4ed 100644
--- a/maps/curies-to-urls-map.yaml
+++ b/maps/curies-to-urls-map.yaml
@@ -324,7 +324,7 @@ use_for_bidirectional_mapping:
   -
     NBO-PROPERTY: 'http://purl.obolibrary.org/obo/nbo#'
   -
-    NCBIGene: 'http://identifiers.org/ncbigene/'
+    NCBIGene: 'https://identifiers.org/ncbigene:'
 #  -
 #    NCBITaxon: 'http://purl.obolibrary.org/obo/ncbitaxon/subsets/taxslim.owl'
   -

From e63311b38998edab071708809cbb4e7df8aa1232 Mon Sep 17 00:00:00 2001
From: ecwood
Date: Thu, 29 Aug 2024 02:09:20 -0700
Subject: [PATCH 073/125] #387 apparently these have to be different to match
 biolink

---
 maps/curies-to-urls-map.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/maps/curies-to-urls-map.yaml b/maps/curies-to-urls-map.yaml
index c81cb4ed..3452641c 100644
--- a/maps/curies-to-urls-map.yaml
+++ b/maps/curies-to-urls-map.yaml
@@ -324,7 +324,7 @@ use_for_bidirectional_mapping:
   -
     NBO-PROPERTY: 'http://purl.obolibrary.org/obo/nbo#'
   -
-    NCBIGene: 'https://identifiers.org/ncbigene:'
+    NCBIGene: 'http://identifiers.org/ncbigene/'
 #  -
 #    NCBITaxon: 'http://purl.obolibrary.org/obo/ncbitaxon/subsets/taxslim.owl'
   -

From 9d7213b31df87519265974a084b007f062f6ce13 Mon Sep 17 00:00:00 2001
From: ecwood
Date: Thu, 29 Aug 2024 02:13:21 -0700
Subject: [PATCH 074/125] #387 kg2_util didn't previously commit correctly

---
 kg2_util.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kg2_util.py b/kg2_util.py
index b694ea62..a32815f2 100644
--- a/kg2_util.py
+++ b/kg2_util.py
@@ -156,7 +156,7 @@
 BASE_URL_KEGG_GLYCAN = BASE_BASE_URL_IDENTIFIERS_ORG + 'kegg.glycan:'
 BASE_URL_KEGG_REACTION = BASE_BASE_URL_IDENTIFIERS_ORG + 'kegg.reaction:'
 BASE_URL_MIRBASE = BASE_BASE_URL_IDENTIFIERS_ORG + 'mirbase:'
-BASE_URL_NCBIGENE = BASE_BASE_URL_IDENTIFIERS_ORG + 'ncbigene:'
+BASE_URL_NCBIGENE = 'http://identifiers.org/ncbigene/'
 BASE_URL_OBO_FORMAT = 'http://purl.org/obo/owl/oboFormat#oboFormat_'
 BASE_URL_OWL = 'http://www.w3.org/2002/07/owl#'
 BASE_URL_PATHWHIZ = 'http://smpdb.ca/pathways/#'
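
Patches 072 through 074 settle the NCBIGene entry on http://identifiers.org/ncbigene/ on both sides: the bidirectional CURIE-to-URL map and the kg2_util.BASE_URL_NCBIGENE constant. A small sketch of the expansion and contraction behavior such an entry drives; expand_curie and contract_iri are hypothetical stand-ins for illustration, not the kg2_util mappers:

```python
# Illustrative only: what one bidirectional map entry implies for ID handling.
NCBIGENE_BASE = 'http://identifiers.org/ncbigene/'  # value patches 073/074 settle on

def expand_curie(curie: str) -> str:
    prefix, local_id = curie.split(':', 1)
    assert prefix == 'NCBIGene', curie
    return NCBIGENE_BASE + local_id

def contract_iri(iri: str) -> str:
    assert iri.startswith(NCBIGENE_BASE), iri
    return 'NCBIGene:' + iri[len(NCBIGENE_BASE):]

assert expand_curie('NCBIGene:1017') == 'http://identifiers.org/ncbigene/1017'
assert contract_iri('http://identifiers.org/ncbigene/1017') == 'NCBIGene:1017'
```

If the YAML map and the constant disagree, an IRI emitted under one base cannot be contracted under the other, which is the mismatch patch 074 closes.
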
From e1919881be262272479dcd09b2039560d8d6172 Mon Sep 17 00:00:00 2001
From: ecwood
Date: Thu, 29 Aug 2024 12:50:41 -0700
Subject: [PATCH 075/125] #387 the recursive category picker is working

python3 ontologies_jsonl_to_kg_jsonl.py ontologies.json maps/curies-to-categories.yaml null > ontology_nodes.json

---
 ontologies_jsonl_to_kg_jsonl.py | 122 +++++++++++++++++++++++---------
 1 file changed, 88 insertions(+), 34 deletions(-)

diff --git a/ontologies_jsonl_to_kg_jsonl.py b/ontologies_jsonl_to_kg_jsonl.py
index 93e119db..34494e56 100644
--- a/ontologies_jsonl_to_kg_jsonl.py
+++ b/ontologies_jsonl_to_kg_jsonl.py
@@ -42,6 +42,13 @@
                    "oboInOwl:hasDbXref": TEXT_KEY,
                    "oboInOwl:xref": TEXT_KEY}

+CLASS_TO_SUPERCLASSES = dict()
+SAVED_NODE_INFO = dict()
+SOURCE_INFO = dict()
+
+NODE_CATEGORY_MAPPINGS = dict()
+PREFIX_MAPPINGS = dict()
+
 CLASSES_DICT = dict()

 URI_MAP = dict()
@@ -49,21 +56,57 @@

 MISSING_ID_PREFIXES = set()

+FILE_MAPPING = "file"
+PREFIX_MAPPING = "prefix"
+RECURSE_MAPPING = "recurse"
+
 def get_args():
     arg_parser = argparse.ArgumentParser()
     arg_parser.add_argument('--test', dest='test', action="store_true", default=False)
     arg_parser.add_argument('inputFile', type=str)
+    arg_parser.add_argument('curiesToCategoriesYAML', type=str)
     arg_parser.add_argument('outputFile', type=str)
     return arg_parser.parse_args()

+def categorize_node(node_id, recursion_depth=0):
+    node_prefix = node_id.split(':')[0]
+
+    if node_id in NODE_CATEGORY_MAPPINGS and NODE_CATEGORY_MAPPINGS[node_id][1] == FILE_MAPPING:
+        return NODE_CATEGORY_MAPPINGS[node_id][0]
+
+    if node_prefix in PREFIX_MAPPINGS:
+        node_category = PREFIX_MAPPINGS[node_prefix]
+        NODE_CATEGORY_MAPPINGS[node_id] = (node_category, PREFIX_MAPPING)
+        return PREFIX_MAPPINGS[node_prefix]
+
+    # Try to get the most common superclass categorization
+    superclass_categorizations = dict()
+    highest_value = 0
+    highest_category = kg2_util.BIOLINK_CATEGORY_NAMED_THING
+    if recursion_depth == 10:
+        return kg2_util.BIOLINK_CATEGORY_NAMED_THING
+
+    for superclass in CLASS_TO_SUPERCLASSES.get(node_id, list()):
+        superclass_category = categorize_node(superclass, recursion_depth + 1)
+        if superclass_category not in superclass_categorizations:
+            superclass_categorizations[superclass_category] = 0
+        superclass_categorizations[superclass_category] += 1
+        if superclass_categorizations[superclass_category] > highest_value:
+            highest_value = superclass_categorizations[superclass_category]
+            highest_category = superclass_category
+
+    NODE_CATEGORY_MAPPINGS[node_id] = (highest_category, RECURSE_MAPPING)
+    return highest_category
+
+
 def process_ontology_item(ontology_item):
     source = ontology_item.get(OWL_SOURCE_KEY, str())
     for owl_class in ontology_item.get(OWL_CLASS_TAG, list()):
         # Typically genid classes which don't neatly map onto the KG2 schema
         if ID_TAG not in owl_class:
             continue
-        # TODO: MAP THIS HERE, since not all sources use same IRIs for the same nodes
         node_id = match_prefix(owl_class.get(ID_TAG, str()))
         if node_id is None:
             continue
@@ -123,6 +166,7 @@ def process_ontology_item(ontology_item):
         if RESOURCE_KEY in edge:
             edges_list.append((general_edge_type, edge.get(RESOURCE_KEY, None)))

+        superclasses = set()
         final_edges_list = list()
         for (edge_relation, edge_object) in edges_list:
             edge_object = match_prefix(edge_object)
@@ -131,37 +175,38 @@ def process_ontology_item(ontology_item):
             edge_relation = match_prefix(edge_relation)
             if edge_relation is None:
                 continue
+            if edge_relation in ["rdfs:subClassOf"]:
+                superclasses.add(edge_object)
final_edges_list.append((edge_relation, edge_object)) + # Imperfect way to make it deterministic + superclasses = sorted(list(superclasses)) + if node_id not in CLASS_TO_SUPERCLASSES: + CLASS_TO_SUPERCLASSES[node_id] = list() + CLASS_TO_SUPERCLASSES[node_id] += superclasses + CLASS_TO_SUPERCLASSES[node_id] = sorted(list(set(CLASS_TO_SUPERCLASSES[node_id]))) + + if node_id not in SAVED_NODE_INFO: + SAVED_NODE_INFO[node_id] = list() + SAVED_NODE_INFO[node_id].append({"id": node_id, "description_list": description_list, "name": name_list, "source": source, "has_biological_sequence": has_biological_sequence, "edges": final_edges_list}) + + for ontology_node in ontology_item.get("owl:Ontology", list()): + ontology_version = None + ontology_versions = [version.get(TEXT_KEY, str()) for version in ontology_node.get("owl:versionInfo", list()) if TEXT_KEY in version] + ontology_version_iri = [version.get(RESOURCE_KEY, str()) for version in ontology_node.get("owl:versionIRI", list()) if RESOURCE_KEY in version] + ontology_date = [version.get(TEXT_KEY, str()) for date_type in ["oboInOwl:date", "dcterms:date", "dc:date"] for version in ontology_node.get(date_type, list()) if TEXT_KEY in version] + if len(ontology_versions) == 1: + ontology_version = ontology_versions[0] + elif len(ontology_version_iri) == 1: + ontology_version = ontology_version_iri[0] + elif len(ontology_date) == 1: + ontology_version = ontology_date[0] + + if ontology_version is None: + print("Warning: source", source, "lacks any versioning information.") + if source not in SOURCE_INFO: + SOURCE_INFO[source] = {"source": source, "ontology_date": ontology_date, "ontology_version": ontology_version} - # node_id = owl_class.get(ID_TAG, list()) - - # superclasses = [superclass.get(RESOURCE_KEY, str()) for superclass in owl_class.get(SUBCLASS_TAG, list())] - - # # Also query for comments? 
- # # Descriptions appear to be additive in current KG2 - # descriptions = owl_class.get(DESCRIPTION_TAG, list()) - # assert len(descriptions) <= 1 - # description = str() - # for element in descriptions: - # description += element[TEXT_KEY] - - # xrefs = [xref[TEXT_KEY] for xref in owl_class.get(XREF_TAG, list())] - # for element in owl_class.get(XREF_TAG, list()): - # xrefs.append(element[TEXT_KEY]) - - # exact_matches = [exact_match[RESOURCE_KEY] for exact_match in owl_class.get(EXACT_MATCH_TAG, list())] - - # names = owl_class.get(NAME_TAG, list()) - # assert len(names) <= 1, ontology_item - # name = str() - # for element in names: - # name += element[TEXT_KEY] - - # node = {"id": node_id, "superclasses": superclasses, "description": description, "xrefs": xrefs, "name": name, "exact_matches": exact_matches} - - node = {"id": node_id, "description_list": description_list, "name": name_list, "source": source, "has_biological_sequence": has_biological_sequence, "edges": final_edges_list} - print(json.dumps(node, indent=4)) def generate_uri_map(): uri_input_map = kg2_util.safe_load_yaml_from_string(kg2_util.read_file_to_string("maps/curies-to-urls-map.yaml")) @@ -201,8 +246,15 @@ def match_prefix(node_id): if __name__ == '__main__': args = get_args() input_file_name = args.inputFile + curies_to_categories_file_name = args.curiesToCategoriesYAML output_file_name = args.outputFile + curies_to_categories_data = kg2_util.safe_load_yaml_from_string(kg2_util.read_file_to_string(curies_to_categories_file_name)) + for mapping_node in curies_to_categories_data["term-mappings"]: + NODE_CATEGORY_MAPPINGS[mapping_node] = (curies_to_categories_data["term-mappings"][mapping_node], FILE_MAPPING) + for prefix in curies_to_categories_data["prefix-mappings"]: + PREFIX_MAPPINGS[prefix] = curies_to_categories_data["prefix-mappings"][prefix] + input_read_jsonlines_info = kg2_util.start_read_jsonlines(input_file_name) input_data = input_read_jsonlines_info[0] @@ -211,9 +263,11 @@ def match_prefix(node_id): generate_uri_map() for ontology_item in input_data: process_ontology_item(ontology_item) - print(json.dumps(sorted(list(MISSING_ID_PREFIXES)), indent=4)) - # print("OWL Classes:", owl_class_count) - # for key in KEYS_DICT: - # KEYS_DICT[key] = KEYS_DICT[key] / owl_class_count - # print(json.dumps(KEYS_DICT, indent=4, sort_keys=True)) \ No newline at end of file + for node_id in SAVED_NODE_INFO: + categorize_node(node_id) + + print(json.dumps(NODE_CATEGORY_MAPPINGS, indent=4)) + + # Can add this back in later + # print(json.dumps(sorted(list(MISSING_ID_PREFIXES)), indent=4)) From 314b54f14b3731d50fcd6ee185347239271546de Mon Sep 17 00:00:00 2001 From: ecwood Date: Sun, 1 Sep 2024 22:55:32 -0700 Subject: [PATCH 076/125] #387 some date restructuring, ontology node versioning, and changes to handle ORDO --- ontologies_jsonl_to_kg_jsonl.py | 219 ++++++++++++++++++++++++++++---- 1 file changed, 197 insertions(+), 22 deletions(-) diff --git a/ontologies_jsonl_to_kg_jsonl.py b/ontologies_jsonl_to_kg_jsonl.py index 34494e56..c04b4b70 100644 --- a/ontologies_jsonl_to_kg_jsonl.py +++ b/ontologies_jsonl_to_kg_jsonl.py @@ -1,6 +1,7 @@ import argparse import kg2_util import json +import datetime OWL_CLASS_TAG = "owl:Class" SUBCLASS_TAG = "rdfs:subClassOf" @@ -15,10 +16,12 @@ RESOURCE_KEY = "rdf:resource" OWL_SOURCE_KEY = "owl_source" +OWL_SOURCE_NAME_KEY = "owl_source_name" KEYS_DICT = dict() COMMENT_PREFIX = "COMMENTS: " +DESCRIPTION_DELIM = " // " BASE_EDGE_TYPES = {"mondo-base:exactMatch": RESOURCE_KEY, 
"mondo-base:closeMatch": RESOURCE_KEY, @@ -53,6 +56,7 @@ URI_MAP = dict() URI_MAP_KEYS = list() +PREFIX_TO_IRI_MAP = dict() MISSING_ID_PREFIXES = set() @@ -60,13 +64,28 @@ PREFIX_MAPPING = "prefix" RECURSE_MAPPING = "recurse" +ID_KEY = "id" +DEPRECATED_KEY = "deprecated" +UPDATE_DATE_KEY = "update_date" +CREATION_DATE_KEY = "creation_date" +SYNONYM_KEY = "synonym" +DESCRIPTION_KEY = "description_list" +NAME_KEY = "name" +SOURCE_KEY = "source" +BIOLOGICAL_SEQUENCE_KEY = "has_biological_sequence" +CATEGORY_KEY = "category" +EDGES_KEY = "edges" +IRI_KEY = "iri" +VERSION_KEY = "version" + def get_args(): arg_parser = argparse.ArgumentParser() arg_parser.add_argument('--test', dest='test', action="store_true", default=False) arg_parser.add_argument('inputFile', type=str) arg_parser.add_argument('curiesToCategoriesYAML', type=str) - arg_parser.add_argument('outputFile', type=str) + arg_parser.add_argument('outputNodesFile', type=str) + arg_parser.add_argument('outputEdgesFile', type=str) return arg_parser.parse_args() def categorize_node(node_id, recursion_depth=0): @@ -99,10 +118,93 @@ def categorize_node(node_id, recursion_depth=0): NODE_CATEGORY_MAPPINGS[node_id] = (highest_category, RECURSE_MAPPING) return highest_category +def reformat_obo_date(date_str): + if date_str is None: + return None + + if '-' in date_str: + delim = 'T' + if ' ' in date_str: + delim = ' ' + date_spl = date_str.strip('Z').split(delim) + date_fh = date_spl[0].split('-') + year = int(date_fh[0]) + month = int(date_fh[1]) + day = int(date_fh[2]) + + if month < 1 or month > 12 or day < 1 or day > 31: + return None + + if len(date_spl) > 1: + date_sh = date_spl[1].split(':') + hour = int(date_sh[0]) + minute = int(date_sh[1]) + second = int(date_sh[2][0:1]) + + return datetime.datetime(year, month, day, hour, minute, second) + else: + return datetime.datetime(year, month, day) + else: + date_spl = date_str.split(' ') + date_fh = date_spl[0].split(':') + year = int(date_fh[2]) + month = int(date_fh[1]) + day = int(date_fh[0]) + + if month < 1 or month > 12 or day < 1 or day > 31: + return None + + return datetime.datetime(year, month, day) + +def pick_most_recent_date(dates, alternate_date=None): + latest_date = None + for date in dates: + if date == None: + continue + if latest_date == None or date > latest_date: + latest_date = date + + if latest_date == None: + if alternate_date is not None: + latest_date = alternate_date + else: + return None + + return latest_date.isoformat(sep=' ') + +def process_ontology_term(ontology_node, source, ontology_name, owl_source=True): + owl_prefix = "" + if owl_source: + owl_prefix = "owl:" + ontology_version = None + ontology_versions = [version.get(TEXT_KEY, str()) for version in ontology_node.get(owl_prefix + "versionInfo", list()) if TEXT_KEY in version] + ontology_version_iri = [version.get(RESOURCE_KEY, str()) for version in ontology_node.get(owl_prefix + "versionIRI", list()) if RESOURCE_KEY in version] + ontology_dates = [reformat_obo_date(version.get(TEXT_KEY, str())) for date_type in ["oboInOwl:date", "dcterms:date", "dc:date"] for version in ontology_node.get(date_type, list()) if TEXT_KEY in version] + ontology_iri = ontology_node.get("rdf:about", str()) + if len(ontology_versions) == 1: + ontology_version = ontology_versions[0] + elif len(ontology_version_iri) == 1: + ontology_version = ontology_version_iri[0] + version_replacements = [ontology_iri.replace('.owl', '') + '/', '/' + source, 'releases/'] + for replacement in version_replacements: + ontology_version = 
ontology_version.replace(replacement, "") + ontology_version = ontology_version.split('/')[0] + elif len(ontology_dates) >= 1: + ontology_version = pick_most_recent_date(ontology_dates) + + if ontology_version is None: + print("Warning: source", source, "lacks any versioning information.") + + ontology_date = reformat_obo_date(pick_most_recent_date(ontology_dates)) + source_id = kg2_util.CURIE_PREFIX_OBO + ':' + source + + if source not in SOURCE_INFO: + SOURCE_INFO[source] = {SOURCE_KEY: source_id, IRI_KEY: ontology_iri, NAME_KEY: ontology_name, UPDATE_DATE_KEY: ontology_date, VERSION_KEY: ontology_version} def process_ontology_item(ontology_item): source = ontology_item.get(OWL_SOURCE_KEY, str()) + ontology_name = ontology_item.get(OWL_SOURCE_NAME_KEY, str()) for owl_class in ontology_item.get(OWL_CLASS_TAG, list()): # Typically genid classes which don't neatly map onto the KG2 schema if ID_TAG not in owl_class: @@ -110,6 +212,8 @@ def process_ontology_item(ontology_item): node_id = match_prefix(owl_class.get(ID_TAG, str())) if node_id is None: continue + node_prefix = node_id.split(':')[0] + node_iri = PREFIX_TO_IRI_MAP[node_prefix] + node_id.replace(node_prefix + ':', '') # Configure the name name_list = [name.get(TEXT_KEY, None) for name in owl_class.get("rdfs:label", dict()) if TEXT_KEY in name] @@ -124,6 +228,26 @@ def process_ontology_item(ontology_item): description_list += [description.get(TEXT_KEY, None) for description in owl_class.get("obo:UBPROP_0000005", list()) if (TEXT_KEY in description)] description_list += [description.get(TEXT_KEY, None) for description in owl_class.get("efo1:source_description", list()) if (TEXT_KEY in description)] + deprecated = "true" in owl_class.get("owl:deprecated", list()) + for name in name_list: + if name.startswith("obsolete") or name.startswith("(obsolete") or name.endswith("obsolete"): + deprecated = True + + # Configure the synonyms + synonym_list = list() + synonym_keys = ["oboInOwl:hasExactSynonym", "oboInOwl:hasRelatedSynonym", "oboInOwl:hasNarrowSynonym", "oboInOwl:hasBroadSynonym", "go:hasExactSynonym", + "go:hasSynonym", "go:hasNarrowSynonym", "go:hasBroadSynonym", "obo:IAO_0000118", "obo:IAO_0000589", "go:hasRelatedSynonym", "obo:IAO_0000111", + "obo:IAO_0000028", "skos:prefLabel"] + synonym_list += [synonym.get(TEXT_KEY, None) for synonym_key in synonym_keys for synonym in owl_class.get(synonym_key, list()) if (TEXT_KEY in synonym)] + + update_date_list = list() + update_date_keys = ["dc:date", "dcterms:date", "terms:date"] + update_date_list += [reformat_obo_date(update_date.get(TEXT_KEY, None)) for update_date_key in update_date_keys for update_date in owl_class.get(update_date_key, list()) if (TEXT_KEY in update_date)] + + creation_date_list = list() + creation_date_keys = ["oboInOwl:creation_date", "go:creation_date"] + creation_date_list += [reformat_obo_date(creation_date.get(TEXT_KEY, None)) for creation_date_key in creation_date_keys for creation_date in owl_class.get(creation_date_key, list()) if (TEXT_KEY in creation_date)] + # Configure the biological sequence has_biological_sequence = dict() has_biological_sequence['formula'] = [biological_sequence.get(TEXT_KEY, None) for biological_sequence in owl_class.get("chebi:formula", list()) if TEXT_KEY in biological_sequence] @@ -188,25 +312,24 @@ def process_ontology_item(ontology_item): if node_id not in SAVED_NODE_INFO: SAVED_NODE_INFO[node_id] = list() - SAVED_NODE_INFO[node_id].append({"id": node_id, "description_list": description_list, "name": name_list, "source": 
source, "has_biological_sequence": has_biological_sequence, "edges": final_edges_list}) + SAVED_NODE_INFO[node_id].append({ID_KEY: node_id, + DEPRECATED_KEY: deprecated, + UPDATE_DATE_KEY: update_date_list, + CREATION_DATE_KEY: creation_date_list, + SYNONYM_KEY: synonym_list, + DESCRIPTION_KEY: description_list, + NAME_KEY: name_list, + SOURCE_KEY: source, + BIOLOGICAL_SEQUENCE_KEY: has_biological_sequence, + IRI_KEY: node_iri, + EDGES_KEY: final_edges_list}) for ontology_node in ontology_item.get("owl:Ontology", list()): - ontology_version = None - ontology_versions = [version.get(TEXT_KEY, str()) for version in ontology_node.get("owl:versionInfo", list()) if TEXT_KEY in version] - ontology_version_iri = [version.get(RESOURCE_KEY, str()) for version in ontology_node.get("owl:versionIRI", list()) if RESOURCE_KEY in version] - ontology_date = [version.get(TEXT_KEY, str()) for date_type in ["oboInOwl:date", "dcterms:date", "dc:date"] for version in ontology_node.get(date_type, list()) if TEXT_KEY in version] - if len(ontology_versions) == 1: - ontology_version = ontology_versions[0] - elif len(ontology_version_iri) == 1: - ontology_version = ontology_version_iri[0] - elif len(ontology_date) == 1: - ontology_version = ontology_date[0] - - if ontology_version is None: - print("Warning: source", source, "lacks any versioning information.") - if source not in SOURCE_INFO: - SOURCE_INFO[source] = {"source": source, "ontology_date": ontology_date, "ontology_version": ontology_version} + process_ontology_term(ontology_node, source, ontology_name) + # Because of ORDO + for ontology_node in ontology_item.get("Ontology", list()): + process_ontology_term(ontology_node, source, ontology_name, False) def generate_uri_map(): uri_input_map = kg2_util.safe_load_yaml_from_string(kg2_util.read_file_to_string("maps/curies-to-urls-map.yaml")) @@ -217,6 +340,7 @@ def generate_uri_map(): for curie_prefix in curie_prefix_dict: curie_url = curie_prefix_dict[curie_prefix] URI_MAP[curie_url] = curie_prefix + PREFIX_TO_IRI_MAP[curie_prefix] = curie_url for curie_prefix_dict in contraction_map: for curie_prefix in curie_prefix_dict: @@ -242,12 +366,62 @@ def match_prefix(node_id): else: MISSING_ID_PREFIXES.add(node_id) +def construct_nodes_and_edges(nodes_output, edges_output): + for source in SOURCE_INFO: + source_date = pick_most_recent_date([SOURCE_INFO[source][UPDATE_DATE_KEY]]) + source_name = SOURCE_INFO[source][NAME_KEY] + " v" + SOURCE_INFO[source][VERSION_KEY] + source_id = SOURCE_INFO[source][SOURCE_KEY] + source_iri = SOURCE_INFO[source][IRI_KEY] + node = kg2_util.make_node(source_id, source_iri, source_name, kg2_util.BIOLINK_CATEGORY_INFORMATION_CONTENT_ENTITY, source_date, source_id) + + nodes_output.write(node) + + + for node_id in SAVED_NODE_INFO: + for source_node_index in range(len(SAVED_NODE_INFO[node_id])): + if SAVED_NODE_INFO[node_id][source_node_index][DEPRECATED_KEY]: + continue + name = SAVED_NODE_INFO[node_id][source_node_index][NAME_KEY][0] # Imperfect way of choosing the name + node_iri = SAVED_NODE_INFO[node_id][source_node_index][IRI_KEY] + description = DESCRIPTION_DELIM.join(SAVED_NODE_INFO[node_id][source_node_index][DESCRIPTION_KEY]) + has_biological_sequence = SAVED_NODE_INFO[node_id][source_node_index][BIOLOGICAL_SEQUENCE_KEY].get("smiles", None) + synonyms = SAVED_NODE_INFO[node_id][source_node_index][SYNONYM_KEY] + category = SAVED_NODE_INFO[node_id][source_node_index][CATEGORY_KEY] + + source = SAVED_NODE_INFO[node_id][source_node_index][SOURCE_KEY] + provided_by = 
kg2_util.CURIE_PREFIX_OBO + ':' + source
+            source_date = SOURCE_INFO[source][UPDATE_DATE_KEY]
+
+            update_date = pick_most_recent_date(SAVED_NODE_INFO[node_id][source_node_index][UPDATE_DATE_KEY], source_date)
+            creation_date = pick_most_recent_date(SAVED_NODE_INFO[node_id][source_node_index][CREATION_DATE_KEY], source_date)
+
+            node = kg2_util.make_node(node_id, node_iri, name, category, update_date, provided_by)
+            node["description"] = description
+            node["has_biological_sequence"] = has_biological_sequence
+            node["creation_date"] = creation_date
+            node["synonym"] = synonyms
+
+            nodes_output.write(node)
+
+            for (edge_relation, edge_object) in SAVED_NODE_INFO[node_id][source_node_index][EDGES_KEY]:
+                relation_label = edge_relation.split(':')[1]
+                edge = kg2_util.make_edge(node_id, edge_object, edge_relation, relation_label, provided_by, update_date)
+
+                edges_output.write(edge)
+
+
 if __name__ == '__main__':
     args = get_args()
     input_file_name = args.inputFile
     curies_to_categories_file_name = args.curiesToCategoriesYAML
-    output_file_name = args.outputFile
+    output_nodes_file_name = args.outputNodesFile
+    output_edges_file_name = args.outputEdgesFile
+    test_mode = args.test
+
+    nodes_info, edges_info = kg2_util.create_kg2_jsonlines(test_mode)
+    nodes_output = nodes_info[0]
+    edges_output = edges_info[0]

     curies_to_categories_data = kg2_util.safe_load_yaml_from_string(kg2_util.read_file_to_string(curies_to_categories_file_name))
     for mapping_node in curies_to_categories_data["term-mappings"]:
@@ -258,7 +432,6 @@ def match_prefix(node_id):

     input_read_jsonlines_info = kg2_util.start_read_jsonlines(input_file_name)
     input_data = input_read_jsonlines_info[0]

-    owl_class_count = 0
     ontology_prefixes = set()
     generate_uri_map()
     for ontology_item in input_data:
@@ -266,8 +439,10 @@ def match_prefix(node_id):

     for node_id in SAVED_NODE_INFO:
         categorize_node(node_id)
+        node_category = NODE_CATEGORY_MAPPINGS[node_id][0]
+        for index in range(len(SAVED_NODE_INFO[node_id])):
+            SAVED_NODE_INFO[node_id][index][CATEGORY_KEY] = node_category

-    print(json.dumps(NODE_CATEGORY_MAPPINGS, indent=4))
+    construct_nodes_and_edges(nodes_output, edges_output)

-    # Can add this back in later
-    # print(json.dumps(sorted(list(MISSING_ID_PREFIXES)), indent=4))
+    kg2_util.close_kg2_jsonlines(nodes_info, edges_info, output_nodes_file_name, output_edges_file_name)
\ No newline at end of file
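
Patch 075 introduced categorize_node and patch 076 wires its result into every saved node record. The picker resolves a node's Biolink category by majority vote over the categories of its rdfs:subClassOf parents, after first consulting the file- and prefix-level rules from curies-to-categories.yaml and capping the recursion at depth 10. A standalone sketch of the voting logic, with invented data and without the NODE_CATEGORY_MAPPINGS memoization:

```python
# Simplified sketch of categorize_node; the toy hierarchy and category names
# below are made up. In the real build, PREFIX_MAPPINGS comes from
# maps/curies-to-categories.yaml and CLASS_TO_SUPERCLASSES from the
# rdfs:subClassOf edges harvested by process_ontology_item.
NAMED_THING = "named thing"  # stand-in for kg2_util.BIOLINK_CATEGORY_NAMED_THING
PREFIX_MAPPINGS = {"CHEBI": "chemical entity"}
CLASS_TO_SUPERCLASSES = {
    "FAKE:3": ["CHEBI:1", "CHEBI:2"],  # hypothetical IDs
    "FAKE:4": ["FAKE:3"],
}

def categorize_node(node_id, recursion_depth=0):
    node_prefix = node_id.split(':')[0]
    if node_prefix in PREFIX_MAPPINGS:
        return PREFIX_MAPPINGS[node_prefix]
    if recursion_depth == 10:  # guard against deep or cyclic hierarchies
        return NAMED_THING
    # Majority vote over the categories of this node's superclasses
    votes = dict()
    highest_value, highest_category = 0, NAMED_THING
    for superclass in CLASS_TO_SUPERCLASSES.get(node_id, list()):
        category = categorize_node(superclass, recursion_depth + 1)
        votes[category] = votes.get(category, 0) + 1
        if votes[category] > highest_value:
            highest_value, highest_category = votes[category], category
    return highest_category

print(categorize_node("FAKE:4"))  # chemical entity, inherited through FAKE:3
print(categorize_node("FAKE:9"))  # named thing: no prefix rule, no superclasses
```

Ties go to whichever category reaches the top count first, which is why the ETL sorts each superclass list before storing it ("Imperfect way to make it deterministic").
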
"owl_source_name" self.skip_tags = [self.XML_TAG, self.RDF_TAG, self.DOCTYPE_TAG] @@ -378,6 +380,8 @@ def __init__(self, input_files, output_file_name): self.ID_TO_GENIDS = dict() self.input_files = input_files + self.input_file_names = input_file_names + self.owl_file_path = owl_file_path self.output_file_name = output_file_name self.output_info = kg2_util.create_single_jsonlines() @@ -433,6 +437,7 @@ def store_genid_nest_in_class_nest(self, genid, genid_nest, class_nest): def write_to_output(self, output_dict, source_file): output_dict[self.OWL_SOURCE_KEY] = source_file + output_dict[self.OWL_SOURCE_NAME_KEY] = self.input_file_names[source_file] self.output.write(output_dict) return @@ -464,18 +469,18 @@ def triage_nest_dict(self, nest_dict): self.GENID_REMAINING_NESTS[class_id] = updated_class_nest else: # Since all of the genids used in this class have been matched, output - self.output.write(nest_dict) + self.write_to_output(nest_dict, self.input_file) self.GENID_REMAINING_NESTS[class_id] = None else: # There are no genids that need to be worked with, so just output - self.output.write(nest_dict) + self.write_to_output(nest_dict, self.input_file) def parse_OWL_file(self): for input_file in self.input_files: self.input_file = input_file print("Reading:", input_file, "starting at", date()) - self.xml_parser.divide_into_lines(input_file) + self.xml_parser.divide_into_lines(self.owl_file_path + input_file) # Genid wasn't filled, still want to include them though for item in self.GENID_REMAINING_NESTS: @@ -490,23 +495,30 @@ def parse_OWL_file(self): kg2_util.close_single_jsonlines(self.output_info, self.output_file_name) -def identify_input_files(ont_load_inventory): +def identify_and_download_input_files(ont_load_inventory, path_to_owl_files): input_files = list() + input_file_names = dict() + owl_file_path = path_to_owl_files.rstrip('/') + "/" for item in ont_load_inventory: input_files.append(item['file']) + input_file_names[item['file']] = item['title'] + print("Downloading:", item['file'], "starting at", date()) + kg2_util.download_file_if_not_exist_locally(item['url'], owl_file_path + item['file']) + print("Download of:", item['file'], "finished at", date()) - return input_files + return input_files, input_file_names, owl_file_path if __name__ == '__main__': args = get_args() input_file_name = args.inputFile + owl_path = args.owlFilePath output_file_name = args.outputFile ont_load_inventory = kg2_util.safe_load_yaml_from_string(kg2_util.read_file_to_string(input_file_name)) - input_files = identify_input_files(ont_load_inventory) + input_files, input_file_names, owl_file_path = identify_and_download_input_files(ont_load_inventory, owl_path) print("Files:", input_files) print("Start Time:", date()) - owl_parser = OWLParser(input_files, output_file_name) + owl_parser = OWLParser(input_files, input_file_names, owl_file_path, output_file_name) owl_parser.parse_OWL_file() print("End Time:", date()) \ No newline at end of file From d4ffa050bd9e9fe8aa9aab3f4f5a55d77518f301 Mon Sep 17 00:00:00 2001 From: ecwood Date: Sun, 1 Sep 2024 22:56:03 -0700 Subject: [PATCH 078/125] credits for me --- kg2_util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kg2_util.py b/kg2_util.py index a32815f2..bfb7e2a8 100644 --- a/kg2_util.py +++ b/kg2_util.py @@ -7,7 +7,7 @@ __author__ = 'Stephen Ramsey' __copyright__ = 'Oregon State University' -__credits__ = ['Stephen Ramsey'] +__credits__ = ['Stephen Ramsey', 'Erica Wood'] __license__ = 'MIT' __version__ = '0.1.0' __maintainer__ = '' From 
786050669dfbd04588f581a0e0b9446adbc28f31 Mon Sep 17 00:00:00 2001 From: ecwood Date: Sun, 1 Sep 2024 22:56:20 -0700 Subject: [PATCH 079/125] #387 removing unnecessary curies --- maps/curies-to-categories.yaml | 127 --------------------------------- 1 file changed, 127 deletions(-) diff --git a/maps/curies-to-categories.yaml b/maps/curies-to-categories.yaml index 9520387c..8b35d539 100644 --- a/maps/curies-to-categories.yaml +++ b/maps/curies-to-categories.yaml @@ -284,133 +284,6 @@ term-mappings: SNOMED:419891008: information content entity SNOMED:900000000000441003: information content entity SO:0000704: gene # formerly genomic entity - STY:T001: individual organism - STY:T002: organism taxon # formerly individual organism - STY:T004: organism taxon # formerly individual organism - STY:T005: organism taxon # formerly individual organism - STY:T007: organism taxon # formerly individual organism - STY:T008: organism taxon # formerly individual organism - STY:T010: organism taxon # formerly individual organism - STY:T011: organism taxon # formerly individual organism - STY:T012: organism taxon # formerly individual organism - STY:T013: organism taxon # formerly individual organism - STY:T014: organism taxon # formerly individual organism - STY:T015: organism taxon # formerly individual organism - STY:T016: organism taxon # formerly individual organism - STY:T017: anatomical entity - STY:T018: gross anatomical structure - STY:T019: disease - STY:T020: disease - STY:T021: gross anatomical structure - STY:T022: anatomical entity - STY:T023: gross anatomical structure - STY:T024: gross anatomical structure - STY:T025: cell - STY:T026: cellular component - STY:T028: biological entity - STY:T029: anatomical entity - STY:T030: anatomical entity - STY:T031: anatomical entity - STY:T032: named thing # formerly organism attribute - STY:T033: disease or phenotypic feature - STY:T034: phenomenon - STY:T037: pathological process - STY:T038: phenomenon - STY:T039: physiological process - STY:T040: physiological process - STY:T041: behavior - STY:T042: physiological process - STY:T043: physiological process - STY:T044: molecular activity - STY:T045: physiological process - STY:T046: pathological process - STY:T047: disease - STY:T048: disease - STY:T049: disease - STY:T050: biological entity - STY:T051: event # formerly activity - STY:T052: activity - STY:T053: behavior - STY:T054: behavior - STY:T055: behavior - STY:T056: activity - STY:T057: activity - STY:T058: activity - STY:T059: procedure - STY:T060: procedure - STY:T061: procedure - STY:T062: activity - STY:T063: procedure - STY:T064: activity - STY:T065: activity - STY:T066: activity - STY:T067: phenomenon - STY:T068: phenomenon - STY:T069: phenomenon - STY:T070: phenomenon - STY:T071: named thing - STY:T072: physical entity - STY:T073: physical entity - STY:T074: device - STY:T075: device - STY:T077: information content entity - STY:T078: information content entity - STY:T079: information content entity - STY:T080: information content entity - STY:T081: information content entity - STY:T082: information content entity - STY:T083: geographic location - STY:T085: biological entity - STY:T086: nucleic acid entity - STY:T087: polypeptide - STY:T088: biological entity - STY:T089: information content entity - STY:T090: individual organism - STY:T091: named thing - STY:T092: agent - STY:T093: agent - STY:T094: agent - STY:T095: agent - STY:T096: agent - STY:T097: cohort - STY:T098: population of individual organisms - STY:T099: cohort - STY:T100: cohort 
- STY:T101: cohort - STY:T102: information content entity - STY:T103: chemical entity # formerly chemical substance - STY:T104: chemical entity # formerly chemical substance - STY:T109: chemical entity - STY:T114: nucleic acid entity - STY:T116: polypeptide - STY:T120: chemical entity # formerly chemical substance - STY:T121: drug - STY:T122: device - STY:T123: chemical entity # formerly chemical substance - STY:T125: chemical entity # formerly chemical substance - STY:T126: protein - STY:T127: small molecule - STY:T129: biological entity # formerly chemical substance - STY:T130: chemical entity - STY:T131: chemical entity # formerly chemical substance - STY:T167: chemical entity # formerly chemical substance - STY:T168: food - STY:T169: information content entity - STY:T170: publication - STY:T171: information content entity - STY:T184: phenotypic feature - STY:T185: information content entity - STY:T190: disease - STY:T191: disease - STY:T192: protein - STY:T194: organism taxon # formerly individual organism - STY:T195: drug - STY:T196: small molecule - STY:T197: chemical entity - STY:T200: drug - STY:T201: named thing # formerly clinical attribute - STY:T203: device - STY:T204: organism taxon # formerly individual organism TRANS:0000000: named thing # formerly exposure event UBERON:0001062: anatomical entity UBERON:0000105: life stage From 0f56540ae369e55f52129da48356d9e9ece13d05 Mon Sep 17 00:00:00 2001 From: ecwood Date: Sun, 1 Sep 2024 22:56:39 -0700 Subject: [PATCH 080/125] #387 no longer want biolink as an ontology source due to the parsing hassle --- maps/ont-load-inventory.yaml | 5 ----- 1 file changed, 5 deletions(-) diff --git a/maps/ont-load-inventory.yaml b/maps/ont-load-inventory.yaml index 7b808175..590781d1 100644 --- a/maps/ont-load-inventory.yaml +++ b/maps/ont-load-inventory.yaml @@ -1,8 +1,3 @@ -- # maps to CURIE prefix: biolink - url: https://raw.githubusercontent.com/biolink/biolink-model/v4.0.0/project/owl/biolink_model.owl.ttl - file: biolink_model.owl.ttl - download: true - title: Biolink meta-model - # maps to CURIE prefix: BFO url: http://purl.obolibrary.org/obo/bfo.owl file: bfo.owl From b3c8cea3757123bc2c2e31750b0b7fcd3ce08da9 Mon Sep 17 00:00:00 2001 From: ecwood Date: Sun, 1 Sep 2024 22:57:05 -0700 Subject: [PATCH 081/125] #387 we have this predicate again --- maps/predicate-remap.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/maps/predicate-remap.yaml b/maps/predicate-remap.yaml index 4da6dcac..15452136 100644 --- a/maps/predicate-remap.yaml +++ b/maps/predicate-remap.yaml @@ -3056,9 +3056,9 @@ OBO:mondo/mondo-base#disease_responds_to: # OBO:uo#is_unit_of: # operation: invert # core_predicate: biolink:related_to -# OIO:hasDbXref: -# operation: keep -# core_predicate: biolink:close_match +OIO:hasDbXref: + operation: keep + core_predicate: biolink:close_match OMIM:CHD: operation: keep core_predicate: biolink:subclass_of From c36d3c3a9843f0f916e6200505be087e7537a084 Mon Sep 17 00:00:00 2001 From: ecwood Date: Mon, 2 Sep 2024 00:00:12 -0700 Subject: [PATCH 082/125] #387 remove sed-ing from validation tests now that biolink is gone --- validate/run-validation-tests.sh | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/validate/run-validation-tests.sh b/validate/run-validation-tests.sh index 7cdb7974..d0a3abac 100755 --- a/validate/run-validation-tests.sh +++ b/validate/run-validation-tests.sh @@ -22,13 +22,6 @@ export PATH=$PATH:${BUILD_DIR} 
biolink_base_url_no_version=https://raw.githubusercontent.com/biolink/biolink-model/ biolink_raw_base_url=${biolink_base_url_no_version}v${biolink_model_version}/ -biolink_download_url=${biolink_raw_base_url}/project/owl/biolink_model.owl.ttl -curies_urls_map_replace_string="\ biolink_download_source: ${biolink_download_url}" -ont_load_inventory_replace_string="\ url: ${biolink_download_url}" -biolink_url_context_jsonld=${biolink_raw_base_url}context.jsonld -biolink_model_owl=biolink_model.owl.ttl -biolink_model_owl_local_file=${BUILD_DIR}/${biolink_model_owl} -biolink_model_owl_url=${biolink_raw_base_url}project/owl/${biolink_model_owl} biolink_model_yaml=biolink_model.yaml biolink_model_yaml_url=${biolink_raw_base_url}src/biolink_model/schema/${biolink_model_yaml} biolink_model_yaml_local_file=${BUILD_DIR}/${biolink_model_yaml} @@ -42,13 +35,6 @@ cat ${config_dir}/master-config.shinc echo ${VALIDATE_CODE_DIR} echo ${curies_to_urls_file} -sed -i "\@${biolink_base_url_no_version}@c${curies_urls_map_replace_string}" \ - ${curies_to_urls_file} - -sed -i "\@${biolink_base_url_no_version}@c${ont_load_inventory_replace_string}" \ - ${ont_load_inventory_file} - -rm -f ${biolink_model_owl_local_file} rm -f ${biolink_model_yaml_local_file} cd ${BUILD_DIR} From 9a3c22f459dd877d4025a259e32d451e3c71c5d3 Mon Sep 17 00:00:00 2001 From: ecwood Date: Mon, 2 Sep 2024 00:06:03 -0700 Subject: [PATCH 083/125] #387 #390 --- setup/requirements-kg2-build.txt | 2 -- setup/setup-kg2-build.sh | 12 ------------ 2 files changed, 14 deletions(-) diff --git a/setup/requirements-kg2-build.txt b/setup/requirements-kg2-build.txt index c40910aa..9823d186 100644 --- a/setup/requirements-kg2-build.txt +++ b/setup/requirements-kg2-build.txt @@ -5,8 +5,6 @@ HTMLParser==0.0.2 isodate==0.6.0 jsonlines==3.0.0 jsonpickle==1.0.0 -ontobio==2.8.0 -prefixcommons==0.1.9 pymongo==3.8.0 PyMySQL==0.9.3 python-dateutil==2.8.1 diff --git a/setup/setup-kg2-build.sh b/setup/setup-kg2-build.sh index 126eff65..f119fc52 100755 --- a/setup/setup-kg2-build.sh +++ b/setup/setup-kg2-build.sh @@ -97,18 +97,6 @@ fi # we want python3.7 (also need python3.7-dev or else pip cannot install the python package "mysqlclient") source ${SETUP_CODE_DIR}/setup-python37-with-pip3-in-ubuntu.shinc ${VENV_DIR}/bin/pip3 install -r ${SETUP_CODE_DIR}/requirements-kg2-build.txt - -## install ROBOT (software: ROBOT is an OBO Tool) by downloading the jar file -## distribution and cURLing the startup script (note github uses URL redirection -## so we need the "-L" command-line option, and cURL doesn't like JAR files by -## default so we need the "application/zip") -${curl_get} -H "Accept: application/zip" https://github.com/RTXteam/robot/releases/download/v1.3.0/robot.jar > ${BUILD_DIR}/robot.jar -curl -s https://raw.githubusercontent.com/RTXteam/robot/v1.3.0/bin/robot > ${BUILD_DIR}/robot -chmod +x ${BUILD_DIR}/robot - -## setup owltools -${curl_get} ${BUILD_DIR} https://github.com/RTXteam/owltools/releases/download/v0.3.0/owltools > ${BUILD_DIR}/owltools -chmod +x ${BUILD_DIR}/owltools } function setup_kg2_build_part2 () { From e94e431f063c3657c4c71b1bc7f7dd5cedd8dbbd Mon Sep 17 00:00:00 2001 From: ecwood Date: Mon, 2 Sep 2024 00:18:59 -0700 Subject: [PATCH 084/125] #387 ordo actually included in ETL --- ontologies_jsonl_to_kg_jsonl.py | 249 ++++++++++++++++---------------- 1 file changed, 127 insertions(+), 122 deletions(-) diff --git a/ontologies_jsonl_to_kg_jsonl.py b/ontologies_jsonl_to_kg_jsonl.py index c04b4b70..3390c2ec 100644 --- 
a/ontologies_jsonl_to_kg_jsonl.py +++ b/ontologies_jsonl_to_kg_jsonl.py @@ -3,14 +3,8 @@ import json import datetime -OWL_CLASS_TAG = "owl:Class" -SUBCLASS_TAG = "rdfs:subClassOf" -DESCRIPTION_TAG = "obo:IAO_0000115" -XREF_TAG = "oboInOwl:hasDbXref" ID_TAG = "rdf:about" NAME_TAG = "rdfs:label" -EXACT_MATCH_TAG = "skos:exactMatch" -COMMENT_TAG = "rdfs:comment" TEXT_KEY = "ENTRY_TEXT" RESOURCE_KEY = "rdf:resource" @@ -202,127 +196,138 @@ def process_ontology_term(ontology_node, source, ontology_name, owl_source=True) SOURCE_INFO[source] = {SOURCE_KEY: source_id, IRI_KEY: ontology_iri, NAME_KEY: ontology_name, UPDATE_DATE_KEY: ontology_date, VERSION_KEY: ontology_version} +def process_ontology_class(owl_class, source, ontology_name, owl_source=True): + owl_prefix = "" + if owl_source: + owl_prefix = "owl:" + # Typically genid classes which don't neatly map onto the KG2 schema + if ID_TAG not in owl_class: + return + node_id = match_prefix(owl_class.get(ID_TAG, str())) + if node_id is None: + return + node_prefix = node_id.split(':')[0] + node_iri = PREFIX_TO_IRI_MAP[node_prefix] + node_id.replace(node_prefix + ':', '') + + # Configure the name + name_list = [name.get(TEXT_KEY, None) for name in owl_class.get("rdfs:label", dict()) if TEXT_KEY in name] + if len(name_list) == 0: + return + + # Configure the description + description_list = list() + description_list += [description.get(TEXT_KEY, None) for description in owl_class.get("obo:IAO_0000115", list()) if (TEXT_KEY in description)] + description_list += [COMMENT_PREFIX + description.get(TEXT_KEY, str()) for description in owl_class.get("rdfs:comment", list()) if (TEXT_KEY in description)] + description_list += [description.get(TEXT_KEY, None) for description in owl_class.get("obo:UBPROP_0000001", list()) if (TEXT_KEY in description)] + description_list += [description.get(TEXT_KEY, None) for description in owl_class.get("obo:UBPROP_0000005", list()) if (TEXT_KEY in description)] + description_list += [description.get(TEXT_KEY, None) for description in owl_class.get("efo1:source_description", list()) if (TEXT_KEY in description)] + + deprecated = "true" in owl_class.get(owl_prefix + "deprecated", list()) + for name in name_list: + search_name = name.lower() + if search_name.startswith("obsolete") or search_name.startswith("(obsolete") or search_name.endswith("obsolete"): + deprecated = True + + # Configure the synonyms + synonym_list = list() + synonym_keys = ["oboInOwl:hasExactSynonym", "oboInOwl:hasRelatedSynonym", "oboInOwl:hasNarrowSynonym", "oboInOwl:hasBroadSynonym", "go:hasExactSynonym", + "go:hasSynonym", "go:hasNarrowSynonym", "go:hasBroadSynonym", "obo:IAO_0000118", "obo:IAO_0000589", "go:hasRelatedSynonym", "obo:IAO_0000111", + "obo:IAO_0000028", "skos:prefLabel"] + synonym_list += [synonym.get(TEXT_KEY, None) for synonym_key in synonym_keys for synonym in owl_class.get(synonym_key, list()) if (TEXT_KEY in synonym)] + + update_date_list = list() + update_date_keys = ["dc:date", "dcterms:date", "terms:date"] + update_date_list += [reformat_obo_date(update_date.get(TEXT_KEY, None)) for update_date_key in update_date_keys for update_date in owl_class.get(update_date_key, list()) if (TEXT_KEY in update_date)] + + creation_date_list = list() + creation_date_keys = ["oboInOwl:creation_date", "go:creation_date"] + creation_date_list += [reformat_obo_date(creation_date.get(TEXT_KEY, None)) for creation_date_key in creation_date_keys for creation_date in owl_class.get(creation_date_key, list()) if (TEXT_KEY in creation_date)] + + # 
Configure the biological sequence + has_biological_sequence = dict() + has_biological_sequence['formula'] = [biological_sequence.get(TEXT_KEY, None) for biological_sequence in owl_class.get("chebi:formula", list()) if TEXT_KEY in biological_sequence] + has_biological_sequence['smiles'] = [biological_sequence.get(TEXT_KEY, None) for biological_sequence in owl_class.get("chebi:smiles", list()) if TEXT_KEY in biological_sequence] + has_biological_sequence['inchi'] = [biological_sequence.get(TEXT_KEY, None) for biological_sequence in owl_class.get("chebi:inchi", list()) if TEXT_KEY in biological_sequence] + has_biological_sequence['inchikey'] = [biological_sequence.get(TEXT_KEY, None) for biological_sequence in owl_class.get("chebi:inchikey", list()) if TEXT_KEY in biological_sequence] + + # Extract edge triples + edges_list = list() + + for edge_type in BASE_EDGE_TYPES: + for edge in owl_class.get(edge_type, list()): + if BASE_EDGE_TYPES[edge_type] in edge: + edges_list.append((edge_type, edge.get(BASE_EDGE_TYPES[edge_type], None))) + + + restriction_edges = list() + restriction_edges += [(edge, "rdfs:subClassOf") for edge in owl_class.get("rdfs:subClassOf", list())] + for equiv in owl_class.get(owl_prefix + "equivalentClass", list()): + for mini_class in equiv.get(owl_prefix + "Class", list()): + for edge in mini_class.get(owl_prefix + "intersectionOf", list()): + restriction_edges.append((edge, owl_prefix + "equivalentClass")) + + for (edge, general_edge_type) in restriction_edges: + for restriction in edge.get(owl_prefix + "Restriction", list()): + edge_type = restriction.get(owl_prefix + "onProperty", list()) + edge_object = restriction.get(owl_prefix + "someValuesFrom", list()) + if len(edge_type) != 1: + assert len(edge_type) <= 1, edge + continue + if len(edge_object) != 1: + assert len(edge_object) <= 1, edge + continue + edge_type = edge_type[0].get(RESOURCE_KEY, None) + edge_object = edge_object[0].get(RESOURCE_KEY, None) + + if edge_type != None and edge_object != None: + edges_list.append((edge_type, edge_object)) + + if RESOURCE_KEY in edge: + edges_list.append((general_edge_type, edge.get(RESOURCE_KEY, None))) + + superclasses = set() + final_edges_list = list() + for (edge_relation, edge_object) in edges_list: + edge_object = match_prefix(edge_object) + if edge_object is None: + continue + edge_relation = match_prefix(edge_relation) + if edge_relation is None: + continue + if edge_relation in ["rdfs:subClassOf"]: + superclasses.add(edge_object) + final_edges_list.append((edge_relation, edge_object)) + + # Imperfect way to make it deterministic + superclasses = sorted(list(superclasses)) + if node_id not in CLASS_TO_SUPERCLASSES: + CLASS_TO_SUPERCLASSES[node_id] = list() + CLASS_TO_SUPERCLASSES[node_id] += superclasses + CLASS_TO_SUPERCLASSES[node_id] = sorted(list(set(CLASS_TO_SUPERCLASSES[node_id]))) + + if node_id not in SAVED_NODE_INFO: + SAVED_NODE_INFO[node_id] = list() + SAVED_NODE_INFO[node_id].append({ID_KEY: node_id, + DEPRECATED_KEY: deprecated, + UPDATE_DATE_KEY: update_date_list, + CREATION_DATE_KEY: creation_date_list, + SYNONYM_KEY: synonym_list, + DESCRIPTION_KEY: description_list, + NAME_KEY: name_list, + SOURCE_KEY: source, + BIOLOGICAL_SEQUENCE_KEY: has_biological_sequence, + IRI_KEY: node_iri, + EDGES_KEY: final_edges_list}) + def process_ontology_item(ontology_item): source = ontology_item.get(OWL_SOURCE_KEY, str()) ontology_name = ontology_item.get(OWL_SOURCE_NAME_KEY, str()) - for owl_class in ontology_item.get(OWL_CLASS_TAG, list()): - # Typically genid 
classes which don't neatly map onto the KG2 schema - if ID_TAG not in owl_class: - continue - node_id = match_prefix(owl_class.get(ID_TAG, str())) - if node_id is None: - continue - node_prefix = node_id.split(':')[0] - node_iri = PREFIX_TO_IRI_MAP[node_prefix] + node_id.replace(node_prefix + ':', '') - # Configure the name - name_list = [name.get(TEXT_KEY, None) for name in owl_class.get("rdfs:label", dict()) if TEXT_KEY in name] - if len(name_list) == 0: - continue + for owl_class in ontology_item.get("owl:Class", list()): + process_ontology_class(owl_class, source, ontology_name) - # Configure the description - description_list = list() - description_list += [description.get(TEXT_KEY, None) for description in owl_class.get("obo:IAO_0000115", list()) if (TEXT_KEY in description)] - description_list += [COMMENT_PREFIX + description.get(TEXT_KEY, str()) for description in owl_class.get("rdfs:comment", list()) if (TEXT_KEY in description)] - description_list += [description.get(TEXT_KEY, None) for description in owl_class.get("obo:UBPROP_0000001", list()) if (TEXT_KEY in description)] - description_list += [description.get(TEXT_KEY, None) for description in owl_class.get("obo:UBPROP_0000005", list()) if (TEXT_KEY in description)] - description_list += [description.get(TEXT_KEY, None) for description in owl_class.get("efo1:source_description", list()) if (TEXT_KEY in description)] - - deprecated = "true" in owl_class.get("owl:deprecated", list()) - for name in name_list: - if name.startswith("obsolete") or name.startswith("(obsolete") or name.endswith("obsolete"): - deprecated = True - - # Configure the synonyms - synonym_list = list() - synonym_keys = ["oboInOwl:hasExactSynonym", "oboInOwl:hasRelatedSynonym", "oboInOwl:hasNarrowSynonym", "oboInOwl:hasBroadSynonym", "go:hasExactSynonym", - "go:hasSynonym", "go:hasNarrowSynonym", "go:hasBroadSynonym", "obo:IAO_0000118", "obo:IAO_0000589", "go:hasRelatedSynonym", "obo:IAO_0000111", - "obo:IAO_0000028", "skos:prefLabel"] - synonym_list += [synonym.get(TEXT_KEY, None) for synonym_key in synonym_keys for synonym in owl_class.get(synonym_key, list()) if (TEXT_KEY in synonym)] - - update_date_list = list() - update_date_keys = ["dc:date", "dcterms:date", "terms:date"] - update_date_list += [reformat_obo_date(update_date.get(TEXT_KEY, None)) for update_date_key in update_date_keys for update_date in owl_class.get(update_date_key, list()) if (TEXT_KEY in update_date)] - - creation_date_list = list() - creation_date_keys = ["oboInOwl:creation_date", "go:creation_date"] - creation_date_list += [reformat_obo_date(creation_date.get(TEXT_KEY, None)) for creation_date_key in creation_date_keys for creation_date in owl_class.get(creation_date_key, list()) if (TEXT_KEY in creation_date)] - - # Configure the biological sequence - has_biological_sequence = dict() - has_biological_sequence['formula'] = [biological_sequence.get(TEXT_KEY, None) for biological_sequence in owl_class.get("chebi:formula", list()) if TEXT_KEY in biological_sequence] - has_biological_sequence['smiles'] = [biological_sequence.get(TEXT_KEY, None) for biological_sequence in owl_class.get("chebi:smiles", list()) if TEXT_KEY in biological_sequence] - has_biological_sequence['inchi'] = [biological_sequence.get(TEXT_KEY, None) for biological_sequence in owl_class.get("chebi:inchi", list()) if TEXT_KEY in biological_sequence] - has_biological_sequence['inchikey'] = [biological_sequence.get(TEXT_KEY, None) for biological_sequence in owl_class.get("chebi:inchikey", list()) if TEXT_KEY in 
biological_sequence] - - # Extract edge triples - edges_list = list() - - for edge_type in BASE_EDGE_TYPES: - for edge in owl_class.get(edge_type, list()): - if BASE_EDGE_TYPES[edge_type] in edge: - edges_list.append((edge_type, edge.get(BASE_EDGE_TYPES[edge_type], None))) - - - restriction_edges = list() - restriction_edges += [(edge, "rdfs:subClassOf") for edge in owl_class.get("rdfs:subClassOf", list())] - for equiv in owl_class.get("owl:equivalentClass", list()): - for mini_class in equiv.get("owl:Class", list()): - for edge in mini_class.get("owl:intersectionOf", list()): - restriction_edges.append((edge, "owl:equivalentClass")) - - for (edge, general_edge_type) in restriction_edges: - for restriction in edge.get("owl:Restriction", list()): - edge_type = restriction.get("owl:onProperty", list()) - edge_object = restriction.get("owl:someValuesFrom", list()) - if len(edge_type) != 1: - assert len(edge_type) <= 1, edge - continue - if len(edge_object) != 1: - assert len(edge_object) <= 1, edge - continue - edge_type = edge_type[0].get(RESOURCE_KEY, None) - edge_object = edge_object[0].get(RESOURCE_KEY, None) - - if edge_type != None and edge_object != None: - edges_list.append((edge_type, edge_object)) - - if RESOURCE_KEY in edge: - edges_list.append((general_edge_type, edge.get(RESOURCE_KEY, None))) - - superclasses = set() - final_edges_list = list() - for (edge_relation, edge_object) in edges_list: - edge_object = match_prefix(edge_object) - if edge_object is None: - continue - edge_relation = match_prefix(edge_relation) - if edge_relation is None: - continue - if edge_relation in ["rdfs:subClassOf"]: - superclasses.add(edge_object) - final_edges_list.append((edge_relation, edge_object)) - - # Imperfect way to make it deterministic - superclasses = sorted(list(superclasses)) - if node_id not in CLASS_TO_SUPERCLASSES: - CLASS_TO_SUPERCLASSES[node_id] = list() - CLASS_TO_SUPERCLASSES[node_id] += superclasses - CLASS_TO_SUPERCLASSES[node_id] = sorted(list(set(CLASS_TO_SUPERCLASSES[node_id]))) - - if node_id not in SAVED_NODE_INFO: - SAVED_NODE_INFO[node_id] = list() - SAVED_NODE_INFO[node_id].append({ID_KEY: node_id, - DEPRECATED_KEY: deprecated, - UPDATE_DATE_KEY: update_date_list, - CREATION_DATE_KEY: creation_date_list, - SYNONYM_KEY: synonym_list, - DESCRIPTION_KEY: description_list, - NAME_KEY: name_list, - SOURCE_KEY: source, - BIOLOGICAL_SEQUENCE_KEY: has_biological_sequence, - IRI_KEY: node_iri, - EDGES_KEY: final_edges_list}) + for owl_class in ontology_item.get("Class", list()): + process_ontology_class(owl_class, source, ontology_name, False) for ontology_node in ontology_item.get("owl:Ontology", list()): process_ontology_term(ontology_node, source, ontology_name) From c8c63de5e28dbd3d4b51b967e8da77b70749b57c Mon Sep 17 00:00:00 2001 From: ecwood Date: Mon, 2 Sep 2024 00:24:38 -0700 Subject: [PATCH 085/125] #387 moving to its permanent home --- .../ontologies_jsonl_to_kg_jsonl.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename ontologies_jsonl_to_kg_jsonl.py => convert/ontologies_jsonl_to_kg_jsonl.py (100%) diff --git a/ontologies_jsonl_to_kg_jsonl.py b/convert/ontologies_jsonl_to_kg_jsonl.py similarity index 100% rename from ontologies_jsonl_to_kg_jsonl.py rename to convert/ontologies_jsonl_to_kg_jsonl.py From 2ec98bdf658c0b50f8948931772e4d9a958cd5df Mon Sep 17 00:00:00 2001 From: ecwood Date: Mon, 2 Sep 2024 00:25:44 -0700 Subject: [PATCH 086/125] #387 moving owlparser to its permanent home --- owlparser.py => extract/owlparser.py | 0 1 file changed, 0 
insertions(+), 0 deletions(-) rename owlparser.py => extract/owlparser.py (100%) diff --git a/owlparser.py b/extract/owlparser.py similarity index 100% rename from owlparser.py rename to extract/owlparser.py From 82e6acb2810ff0c22c0e09fc1fbdf2aa76c45bda Mon Sep 17 00:00:00 2001 From: ecwood Date: Mon, 2 Sep 2024 00:46:08 -0700 Subject: [PATCH 087/125] #387 don't need this one anymore --- maps/ont-load-inventory-test.yaml | 289 ------------------------------ 1 file changed, 289 deletions(-) delete mode 100644 maps/ont-load-inventory-test.yaml diff --git a/maps/ont-load-inventory-test.yaml b/maps/ont-load-inventory-test.yaml deleted file mode 100644 index f79d6587..00000000 --- a/maps/ont-load-inventory-test.yaml +++ /dev/null @@ -1,289 +0,0 @@ -- # maps to CURIE prefix: biolink - url: https://raw.githubusercontent.com/biolink/biolink-model/master/biolink-model.owl.ttl - file: biolink-model.owl.ttl - download: true - title: Biolink meta-model -# - # maps to CURIE prefix: UMLSSC (the trailling slash here is important:) -# url: http://purl.bioontology.org/ontology/STY/ -# file: umls-semantictypes.ttl -# download: false -# title: UMLS Semantic Types -# - # maps to CURIE prefix: ATC -# download: false -# file: umls-atc.ttl -# title: Anatomical Therapeutic Chemical Classification System -# url: https://www.nlm.nih.gov/research/umls/sourcereleasedocs/current/ATC -# - # maps to CURIE prefix CHV -# download: false -# file: umls-chv.ttl -# title: Consumer Health Vocabulary -# url: https://www.nlm.nih.gov/research/umls/sourcereleasedocs/current/CHV -# - # maps to CURIE prefix CPT -# download: false -# file: umls-cpt.ttl -# title: Current Procedural Terminology -# url: https://www.nlm.nih.gov/research/umls/sourcereleasedocs/current/CPT -# - # maps to CURIE prefix DRUGBANK -# download: false -# file: umls-drugbank.ttl -# title: DrugBank -# url: https://www.nlm.nih.gov/research/umls/sourcereleasedocs/current/DRUGBANK -# - # maps to CURIE prefix FMA -# download: false -# file: umls-fma.ttl -# title: Foundational Model of Anatomy -# url: https://www.nlm.nih.gov/research/umls/sourcereleasedocs/current/FMA -# - # maps to CURIE prefix GO -# download: false -# file: umls-go.ttl -# title: Gene Ontology -# url: https://www.nlm.nih.gov/research/umls/sourcereleasedocs/current/GO -# - # maps to CURIE prefix HCPCS -# download: false -# file: umls-hcpcs.ttl -# title: Healthcare Common Procedure Coding System -# url: https://www.nlm.nih.gov/research/umls/sourcereleasedocs/current/HCPCS -# - # maps to CURIE prefix CPT -# download: false -# file: umls-hcpt.ttl -# title: CPT in HCPCS -# url: https://www.nlm.nih.gov/research/umls/sourcereleasedocs/current/HCPT -# - # maps to CURIE prefix HGNC -# download: false -# file: umls-hgnc.ttl -# title: HUGO Gene Nomenclature Committee -# url: https://www.nlm.nih.gov/research/umls/sourcereleasedocs/current/HGNC -# - # maps to CURIE prefix umls -# download: false -# file: umls-hl7.ttl -# title: HL7 Version 3.0 -# url: https://www.nlm.nih.gov/research/umls/sourcereleasedocs/current/HL7 -# - # maps to CURIE prefix HP -# download: false -# file: umls-hpo.ttl -# title: Human Phenotype Ontology -# url: https://www.nlm.nih.gov/research/umls/sourcereleasedocs/current/HPO -# - # maps to CURIE prefix ICD10 -# download: false -# file: umls-icd10.ttl -# title: International Classification of Diseases and Related Health Problems, -# url: https://www.nlm.nih.gov/research/umls/sourcereleasedocs/current/ICD10 -# - # maps to CURIE prefix ICD10 -# download: false -# file: umls-icd10ae.ttl -# title: 
ICD-10, American English Equivalents -# url: https://www.nlm.nih.gov/research/umls/sourcereleasedocs/current/ICD10AE -# - # maps to CURIE prefix ICD10 -# download: false -# file: umls-icd10cm.ttl -# title: International Classification of Diseases, Tenth Revision, Clinical Modification -# url: https://www.nlm.nih.gov/research/umls/sourcereleasedocs/current/ICD10CM -# - # maps to CURIE prefix ICD10PCS -# download: false -# file: umls-icd10pcs.ttl -# title: ICD-10 Procedure Coding System -# url: https://www.nlm.nih.gov/research/umls/sourcereleasedocs/current/ICD10PCS -# - # maps to CURIE prefix ICD9 -# download: false -# file: umls-icd9cm.ttl -# title: International Classification of Diseases, Ninth Revision, Clinical Modification -# url: https://www.nlm.nih.gov/research/umls/sourcereleasedocs/current/ICD9CM -# - # maps to CURIE prefix LOINC -# download: false -# file: umls-lnc.ttl -# title: Logical Observation Identifiers Names and Codes -# url: https://www.nlm.nih.gov/research/umls/sourcereleasedocs/current/LNC -# - # maps to CURIE prefix MEDDRA -# download: false -# file: umls-mdr.ttl -# title: MedDRA -# url: https://www.nlm.nih.gov/research/umls/sourcereleasedocs/current/MEDDRA -# - # maps to CURIE prefix umls -# download: false -# file: umls-med-rt.ttl -# title: Medication Reference Terminology -# url: https://www.nlm.nih.gov/research/umls/sourcereleasedocs/current/MED-RT -# - # maps to CURIE prefix umls -# download: false -# file: umls-medlineplus.ttl -# title: MedlinePlus Health Topics -# url: https://www.nlm.nih.gov/research/umls/sourcereleasedocs/current/MEDLINEPLUS -# - # maps to CURIE prefix MESH -# download: false -# file: umls-msh.ttl -# title: Medical Subject Headings -# url: https://www.nlm.nih.gov/research/umls/sourcereleasedocs/current/MSH -# - # maps to CURIE prefix umls -# download: false - # file: umls-mth.ttl - # title: Metathesaurus Names - # url: https://www.nlm.nih.gov/research/umls/sourcereleasedocs/current/MTH -# - # maps to CURIE prefix NCBITaxon -# download: false -# file: umls-ncbi.ttl -# title: NCBI -# url: https://www.nlm.nih.gov/research/umls/sourcereleasedocs/current/NCBITAXON -# - # maps to CURIE prefix NCIT -# download: false -# file: umls-nci.ttl -# title: NCI Thesaurus -# url: https://www.nlm.nih.gov/research/umls/sourcereleasedocs/current/NCI -# - # maps to CURIE prefix NDDF -# download: false -# file: umls-nddf.ttl -# title: National Drug Data File -# url: https://www.nlm.nih.gov/research/umls/sourcereleasedocs/current/NDDF -#- # maps to CURIE prefix NDFRT -# download: false -# file: umls-ndfrt.ttl -# title: National Drug File - Reference Terminology -# url: https://www.nlm.nih.gov/research/umls/sourcereleasedocs/current/NDFRT -# - # maps to CURIE prefix OMIM -# download: false -# file: umls-omim.ttl -# title: Online Mendelian Inheritance in Man -# url: https://www.nlm.nih.gov/research/umls/sourcereleasedocs/current/OMIM -# - # maps to CURIE prefix PDQ -# download: false -# file: umls-pdq.ttl -# title: Physician Data Query -# url: https://www.nlm.nih.gov/research/umls/sourcereleasedocs/current/PDQ -# - # maps to CURIE prefix RXNORM -# download: false -# file: umls-rxnorm.ttl -# title: RXNORM -# url: https://www.nlm.nih.gov/research/umls/sourcereleasedocs/current/RXNORM -# - # maps to CURIE prefix SNOMED -# download: false -# file: umls-snomedct_us.ttl -# title: SNOMED Clinical Terms US Edition -# url: https://www.nlm.nih.gov/research/umls/sourcereleasedocs/current/SNOMEDCT -# # ==> unable to find an online set of pages for SNOMEDCT_VET concepts but I 
want to find one so that -# # I can include SNOMEDCT_VET in the kg2 build, thus am keeping this section commented out [SAR]: -# # - -# # download: false -# # file: umls-snomedct_vet.ttl -# # title: Veterinary Extension to SNOMED CT -# # url: https://www.nlm.nih.gov/research/umls/sourcereleasedocs/current/SNOMEDCT_VET -# # ==> this section (UMLS Source Terminology Names) seems like it could be useful in the future, but -# # I can't find purls to its concepts anywhere: -# # - -# # download: false -# # file: umls-src.ttl -# # title: Source Terminology Names (UMLS) -# # url: https://www.nlm.nih.gov/research/umls/sourcereleasedocs/current/SRC -# - # maps to CURIE prefix VANDF -# download: false -# file: umls-vandf.ttl -# title: National Drug File -# url: https://www.nlm.nih.gov/research/umls/sourcereleasedocs/current/VANDF -# - # maps to CURIE prefix: BFO -# url: http://purl.obolibrary.org/obo/bfo.owl -# file: bfo.owl -# download: true -# title: Basic Formal Ontology -#- maps to CURIE prefix: GO - # url: http://purl.obolibrary.org/obo/go/extensions/go-plus.owl - # file: go-plus.owl - # title: Gene Ontology - # download: true -# - # maps to CURIE prefix: RO -# url: http://purl.obolibrary.org/obo/ro.owl -# file: ro.owl -# download: true -# title: Relation Ontology -# - -# url: http://purl.obolibrary.org/obo/uberon/ext.owl -# file: uberon-ext.owl -# download: true -# title: Uber-anatomy Ontology -# - -# url: http://www.ebi.ac.uk/efo/efo.owl -# file: efo.owl -# download: true -# title: Experimental Factor Ontology -# - -# url: http://purl.obolibrary.org/obo/fma.owl -# file: fma.owl -# download: true -# title: Foundational Model of Anatomy -# - -# url: http://purl.obolibrary.org/obo/ddanat.owl -# file: ddanat.owl -# download: true -# title: Dictyostelium discoideum anatomy -- - url: http://purl.obolibrary.org/obo/cl.owl - file: cl.owl - download: true - title: Cell Ontology -# - -# url: http://purl.obolibrary.org/obo/chebi.owl -# file: chebi.owl -# download: true -# title: Chemical Entities of Biological Interest -# - - # url: http://purl.obolibrary.org/obo/foodon.owl - # file: foodon.owl - # download: false - # title: FOODON (Food Ontology) -# - -# url: http://data.bioontology.org/ontologies/ORDO/submissions/15/download?apikey=8b5b7825-538d-40e0-9e9e-5ab9274a9aeb -# file: ordo.owl -# download: true -# title: ORPHANET Rare Disease Ontology -# - -# url: http://purl.obolibrary.org/obo/ehdaa2.owl -# file: ehdaa2.owl -# download: true -# title: Human developmental anatomy, abstract -# - -# url: http://purl.obolibrary.org/obo/bspo.owl -# file: bspo.owl -# download: true -# title: Biological Spatial Ontology -# - -# url: http://purl.obolibrary.org/obo/hp.owl -# file: hp.owl -# download: true -# title: Human Phenotype Ontology -# - -# url: http://purl.obolibrary.org/obo/nbo.owl -# file: nbo.owl -# download: true -# title: Neuro Behavior Ontology -# - -# url: http://purl.obolibrary.org/obo/ncbitaxon/subsets/taxslim.owl -# file: taxslim.owl -# download: true -# title: NCBITaxon -# - -# url: http://purl.obolibrary.org/obo/pato.owl -# file: pato.owl -# download: true -# title: Phenotypic Quality Ontology -# - # maps to CURIE prefix MONDO -# url: http://purl.obolibrary.org/obo/mondo.owl -# file: mondo.owl -# download: true -# title: MONDO Disease Ontology -# - -# url: http://purl.obolibrary.org/obo/doid.owl -# file: doid.owl -# download: true -# title: Disease Ontology -# - -# url: http://purl.obolibrary.org/obo/pr.owl -# file: pr.owl -# download: true -# title: Protein Ontology -# - -# url: 
http://purl.obolibrary.org/obo/ino.owl -# file: ino.owl -# download: true -# title: Interaction Network Ontology -# - # maps to CURIE prefix GENEPIO -# url: http://purl.obolibrary.org/obo/genepio.owl -# file: genepio.owl -# download: true -# title: Genomic Epidemiology Ontology From 192039c496df3ce7e18f9b6f583a7039dac46799 Mon Sep 17 00:00:00 2001 From: ecwood Date: Mon, 2 Sep 2024 01:08:42 -0700 Subject: [PATCH 088/125] #387 #405 rethreading the pipeline for new ETL --- build/Snakefile-conversion | 13 ++++++++----- build/Snakefile-extraction | 13 +++++++++++++ build/Snakefile-post-etl | 8 ++++---- build/snakemake-config-var.yaml | 20 ++++++++++++++------ convert/ontologies_jsonl_to_kg_jsonl.py | 8 +++++--- master-config.shinc | 3 --- 6 files changed, 44 insertions(+), 21 deletions(-) diff --git a/build/Snakefile-conversion b/build/Snakefile-conversion index 6754be04..45db4eeb 100644 --- a/build/Snakefile-conversion +++ b/build/Snakefile-conversion @@ -16,15 +16,18 @@ rule UMLS_Conversion: rule Ontologies_Conversion: input: - code = config['ONT_CONVERSION_SCRIPT'], + code = config['ONTOLOGIES_CONVERSION_SCRIPT'], + real = config['ONTOLOGIES_EXTRACT_FILE'], + curies_to_categories_map = config['CURIES_TO_CATEGORIES_MAP'] + curies_to_urls_map = config['CURIES_TO_URLS_FILE'], validation = config['VALIDATION_PLACEHOLDER'] output: - nodes = config['ONT_OUTPUT_NODES_FILE'], - edges = config['ONT_OUTPUT_EDGES_FILE'] + nodes = config['ONTOLOGIES_OUTPUT_NODES_FILE'], + edges = config['ONTOLOGIES_OUTPUT_EDGES_FILE'] log: - config['ONT_CONVERSION_LOG'] + config['ONTOLOGIES_CONVERSION_LOG'] shell: - "bash -x {input.code} {output.nodes} {output.edges} " + config['TEST_FLAG'] + " > {log} 2>&1" + config['PYTHON_COMMAND'] + " {input.code} {input.real} {input.curies_to_categories_map} {input.curies_to_urls_map} {output.nodes} {output.edges} " + config['TEST_FLAG'] + " > {log} 2>&1" rule SemMedDB_Conversion: input: diff --git a/build/Snakefile-extraction b/build/Snakefile-extraction index c23d0ef0..e5c12890 100644 --- a/build/Snakefile-extraction +++ b/build/Snakefile-extraction @@ -9,6 +9,19 @@ rule UMLS: shell: "bash -x {input.code} {output} > {log} 2>&1" +rule Ontologies: + input: + code = config['ONTOLOGIES_EXTRACTION_SCRIPT'], + parser = config['ONTOLOGIES_EXTRACTION_PARSER'], + ontologies_load_inventory = config['ONTOLOGIES_LOAD_INVENTORY_FILE'], + validation = config['VALIDATION_PLACEHOLDER'] + output: + config['ONTOLOGIES_EXTRACT_FILE'] + log: + config['ONTOLOGIES_EXTRACTION_LOG'] + shell: + "bash -x {input.code} {input.parser} {input.ontologies_load_inventory} {output} > {log} 2>&1" + rule SemMedDB: input: code = config['SEMMEDDB_EXTRACTION_SCRIPT'], diff --git a/build/Snakefile-post-etl b/build/Snakefile-post-etl index eeb1a44d..dab60237 100644 --- a/build/Snakefile-post-etl +++ b/build/Snakefile-post-etl @@ -3,8 +3,8 @@ rule Merge: code = config['MERGE_SCRIPT'], umls_nodes = config['UMLS_OUTPUT_NODES_FILE'], umls_edges = config['UMLS_OUTPUT_EDGES_FILE'], - ont_nodes = config['ONT_OUTPUT_NODES_FILE'], - ont_edges = config['ONT_OUTPUT_EDGES_FILE'], + ontologies_nodes = config['ONTOLOGIES_OUTPUT_NODES_FILE'], + ontologies_edges = config['ONTOLOGIES_OUTPUT_EDGES_FILE'], uniprot_nodes = config['UNIPROTKB_OUTPUT_NODES_FILE'], uniprot_edges = config['UNIPROTKB_OUTPUT_EDGES_FILE'], semmeddb_nodes = config['SEMMEDDB_OUTPUT_NODES_FILE'], @@ -56,7 +56,7 @@ rule Merge: " --outputEdgesFile {output.edges} " + \ " --kgNodesFiles " + \ "{input.umls_nodes} " + \ - "{input.ont_nodes} " + \ + 
"{input.ontologies_nodes} " + \ "{input.semmeddb_nodes} " + \ "{input.uniprot_nodes} " + \ "{input.ensembl_nodes} " + \ @@ -78,7 +78,7 @@ rule Merge: "{input.clinicaltrialskg_nodes} " + \ " --kgEdgesFiles " + \ "{input.umls_edges} " + \ - "{input.ont_edges} " + \ + "{input.ontologies_edges} " + \ "{input.semmeddb_edges} " + \ "{input.uniprot_edges} " + \ "{input.ensembl_edges} " + \ diff --git a/build/snakemake-config-var.yaml b/build/snakemake-config-var.yaml index 209b3659..3f32aca4 100644 --- a/build/snakemake-config-var.yaml +++ b/build/snakemake-config-var.yaml @@ -13,6 +13,8 @@ umls_output_base: kg2-umls umls_extraction_script: ${EXTRACT_CODE_DIR}/${umls_extraction_base}.sh umls_extraction_log: ${BUILD_DIR}/${umls_extraction_base}${version_suffix}${test_suffix}.log umls_extract_file: ${BUILD_DIR}/umls.jsonl +umls_dir: ${BUILD_DIR}/umls +umls_dest_dir: ${umls_dir}/META umls_conversion_script: ${CONVERT_CODE_DIR}/${umls_conversion_base}.py umls_conversion_log: ${BUILD_DIR}/${umls_conversion_base}${version_suffix}${test_suffix}.log umls_name_heirarchy: ${MAPS_CODE_DIR}/umls-name-heirarchy.yaml @@ -20,12 +22,18 @@ umls_tui_map: ${MAPS_CODE_DIR}/tui_combo_mappings.json umls_output_nodes_file: ${BUILD_DIR}/${umls_output_base}${nodes_suffix}${test_suffix}.jsonl umls_output_edges_file: ${BUILD_DIR}/${umls_output_base}${edges_suffix}${test_suffix}.jsonl -ont_conversion_base: build-multi-ont-kg -ont_output_base: kg2-ont -ont_conversion_script: ${CONVERT_CODE_DIR}/${ont_conversion_base}.sh -ont_conversion_log: ${BUILD_DIR}/${ont_conversion_base}${version_suffix}${test_suffix}.log -ont_output_nodes_file: ${BUILD_DIR}/${ont_output_base}${nodes_suffix}${test_suffix}.jsonl -ont_output_edges_file: ${BUILD_DIR}/${ont_output_base}${edges_suffix}${test_suffix}.jsonl +ontologies_extraction_base: extract-ontologies +ontologies_conversion_base: ontologies_jsonl_to_kg_jsonl +ontologies_output_base: kg2-ontologies +ontologies_extraction_script: ${EXTRACT_CODE_DIR}/${ontologies_extraction_base}.sh +ontologies_extraction_parser: ${EXTRACT_CODE_DIR}/owlparser.py +ontologies_extraction_log: ${BUILD_DIR}/${ontologies_extraction_base}${version_suffix}${test_suffix}.log +ontologies_load_inventory_file: ${MAPS_CODE_DIR}/ont-load-inventory.yaml +ontologies_extract_file: ${BUILD_DIR}/ontologies.jsonl +ontologies_conversion_script: ${CONVERT_CODE_DIR}/${ont_conversion_base}.py +ontologies_conversion_log: ${BUILD_DIR}/${ont_conversion_base}${version_suffix}${test_suffix}.log +ontologies_output_nodes_file: ${BUILD_DIR}/${ontologies_output_base}${nodes_suffix}${test_suffix}.jsonl +ontologies_output_edges_file: ${BUILD_DIR}/${ontologies_output_base}${edges_suffix}${test_suffix}.jsonl semmeddb_extraction_base: extract-semmeddb semmeddb_conversion_base: semmeddb_tuplelist_json_to_kg_jsonl diff --git a/convert/ontologies_jsonl_to_kg_jsonl.py b/convert/ontologies_jsonl_to_kg_jsonl.py index 3390c2ec..4dfb9992 100644 --- a/convert/ontologies_jsonl_to_kg_jsonl.py +++ b/convert/ontologies_jsonl_to_kg_jsonl.py @@ -78,6 +78,7 @@ def get_args(): action="store_true", default=False) arg_parser.add_argument('inputFile', type=str) arg_parser.add_argument('curiesToCategoriesYAML', type=str) + arg_parser.add_argument('curiesToURLsYAML', type=str) arg_parser.add_argument('outputNodesFile', type=str) arg_parser.add_argument('outputEdgesFile', type=str) return arg_parser.parse_args() @@ -336,8 +337,8 @@ def process_ontology_item(ontology_item): for ontology_node in ontology_item.get("Ontology", list()): process_ontology_term(ontology_node, 
source, ontology_name, False)
-def generate_uri_map():
- uri_input_map = kg2_util.safe_load_yaml_from_string(kg2_util.read_file_to_string("maps/curies-to-urls-map.yaml"))
+def generate_uri_map(curies_to_urls_file_name):
+ uri_input_map = kg2_util.safe_load_yaml_from_string(kg2_util.read_file_to_string(curies_to_urls_file_name))
  bidirectional_map = uri_input_map['use_for_bidirectional_mapping']
  contraction_map = uri_input_map['use_for_contraction_only']
@@ -420,6 +421,7 @@ def construct_nodes_and_edges(nodes_output, edges_output):
  args = get_args()
  input_file_name = args.inputFile
  curies_to_categories_file_name = args.curiesToCategoriesYAML
+ curies_to_urls_file_name = args.curiesToURLsYAML
  output_nodes_file_name = args.outputNodesFile
  output_edges_file_name = args.outputEdgesFile
  test_mode = args.test
@@ -438,7 +440,7 @@ def construct_nodes_and_edges(nodes_output, edges_output):
  input_data = input_read_jsonlines_info[0]

  ontology_prefixes = set()
- generate_uri_map()
+ generate_uri_map(curies_to_urls_file_name)
  for ontology_item in input_data:
      process_ontology_item(ontology_item)
diff --git a/master-config.shinc b/master-config.shinc
index 015cf2f7..3e78c226 100644
--- a/master-config.shinc
+++ b/master-config.shinc
@@ -11,8 +11,6 @@ NEO4J_CODE_DIR=${CODE_DIR}/neo4j
 PROCESS_CODE_DIR=${CODE_DIR}/process
 SETUP_CODE_DIR=${CODE_DIR}/setup
 VALIDATE_CODE_DIR=${CODE_DIR}/validate
-umls_dir=${BUILD_DIR}/umls
-umls_dest_dir=${umls_dir}/META
 s3_region=us-west-2
 s3_bucket=rtx-kg2
 s3_bucket_public=rtx-kg2-public
@@ -26,7 +24,6 @@ curies_to_urls_file=${MAPS_CODE_DIR}/curies-to-urls-map.yaml
 predicate_mapping_file=${MAPS_CODE_DIR}/predicate-remap.yaml
 infores_mapping_file=${MAPS_CODE_DIR}/kg2-provided-by-curie-to-infores-curie.yaml
 knowledge_level_agent_type_mapping_file=${MAPS_CODE_DIR}/knowledge-level-agent-type-map.yaml
-ont_load_inventory_file=${MAPS_CODE_DIR}/ont-load-inventory${test_suffix}.yaml
 rtx_config_file=RTXConfiguration-config.json
 biolink_model_version=4.2.1
 infores_registry_version=0.2.8
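To make the handoff these two patches set up concrete: extract/owlparser.py flattens each OWL file into one JSON Lines record, and convert/ontologies_jsonl_to_kg_jsonl.py walks the "owl:Class" and "owl:Ontology" lists inside that record. A rough sketch of one record follows; the "owl_source"/"owl_source_name" key strings and all field values are illustrative stand-ins (the converter actually reads those two keys through OWL_SOURCE_KEY and OWL_SOURCE_NAME_KEY, whose literal values live in kg2_util), while "ENTRY_TEXT" is the verbatim KEY_TEXT string used by owlparser.py.

    # Hypothetical single record of ontologies.jsonl, shaped after the
    # accessors in process_ontology_item(); values are made up.
    example_record = {
        "owl_source": "cl",                  # stand-in for the OWL_SOURCE_KEY key string
        "owl_source_name": "Cell Ontology",  # stand-in for OWL_SOURCE_NAME_KEY
        "owl:Ontology": [{"rdf:about": "http://purl.obolibrary.org/obo/cl.owl"}],
        "owl:Class": [{
            "rdf:about": "http://purl.obolibrary.org/obo/CL_0000000",
            "rdfs:label": [{"ENTRY_TEXT": "cell"}],  # KEY_TEXT is verbatim
        }],
    }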
From d09ce05f3256d82e6c4823f77ae6e3297fb40489 Mon Sep 17 00:00:00 2001
From: ecwood
Date: Mon, 2 Sep 2024 01:09:20 -0700
Subject: [PATCH 089/125] #387 forgot to add the new extract

---
 extract/extract-ontologies.sh | 35 +++++++++++++++++++++++++++++++++++
 1 file changed, 35 insertions(+)
 create mode 100755 extract/extract-ontologies.sh

diff --git a/extract/extract-ontologies.sh b/extract/extract-ontologies.sh
new file mode 100755
index 00000000..3248cf4f
--- /dev/null
+++ b/extract/extract-ontologies.sh
@@ -0,0 +1,35 @@
+#!/usr/bin/env bash
+# extract-ontologies.sh: Download OWL files and convert them into a JSON Lines file
+# Copyright 2024 Stephen A. Ramsey
+# Author Erica Wood
+
+set -o nounset -o pipefail -o errexit
+
+if [[ "${1:-}" == "--help" || "${1:-}" == "-h" ]]; then
+    echo Usage: "$0 <parsing-script> <ontologies-load-inventory> <output-file> [ontologies-dir]"
+    exit 2
+fi
+
+# Usage: extract-ontologies.sh <parsing-script> <ontologies-load-inventory> <output-file> [ontologies-dir]
+
+echo "================= starting extract-ontologies.sh =================="
+date
+
+config_dir=`dirname "$0"`
+source ${config_dir}/master-config.shinc
+
+parsing_script=${1-"${EXTRACT_CODE_DIR}/owlparser.py"}
+ontologies_load_inventory=${2-"${MAPS_CODE_DIR}/ont-load-inventory.yaml"}
+output_file=${3-"${BUILD_DIR}/ontologies.jsonl"}
+ontologies_dir=${4-"${BUILD_DIR}/owl_files"}
+
+mkdir -p ${ontologies_dir}
+
+# Temporary adjustment for https://github.com/HUPO-PSI/psi-mi-CV/issues/456
+${s3_cp_cmd} s3://${s3_bucket}/mi.owl ${ontologies_dir}/mi.owl
+
+# Generate the ontologies.jsonl file
+${python_command} ${parsing_script} ${ontologies_load_inventory} ${ontologies_dir} ${output_file}
+
+date
+echo "================= finished extract-ontologies.sh =================="

From 76b996b9b2ffc9d73d592e44d131cc1e5e81af58 Mon Sep 17 00:00:00 2001
From: ecwood
Date: Mon, 2 Sep 2024 01:14:00 -0700
Subject: [PATCH 090/125] #387 adjusting some of the variables for new pipelining

---
 build/snakemake-config-var.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/build/snakemake-config-var.yaml b/build/snakemake-config-var.yaml
index 3f32aca4..2c5e1863 100644
--- a/build/snakemake-config-var.yaml
+++ b/build/snakemake-config-var.yaml
@@ -30,8 +30,8 @@
 ontologies_extraction_parser: ${EXTRACT_CODE_DIR}/owlparser.py
 ontologies_extraction_log: ${BUILD_DIR}/${ontologies_extraction_base}${version_suffix}${test_suffix}.log
 ontologies_load_inventory_file: ${MAPS_CODE_DIR}/ont-load-inventory.yaml
 ontologies_extract_file: ${BUILD_DIR}/ontologies.jsonl
-ontologies_conversion_script: ${CONVERT_CODE_DIR}/${ont_conversion_base}.py
-ontologies_conversion_log: ${BUILD_DIR}/${ont_conversion_base}${version_suffix}${test_suffix}.log
+ontologies_conversion_script: ${CONVERT_CODE_DIR}/${ontologies_conversion_base}.py
+ontologies_conversion_log: ${BUILD_DIR}/${ontologies_conversion_base}${version_suffix}${test_suffix}.log
 ontologies_output_nodes_file: ${BUILD_DIR}/${ontologies_output_base}${nodes_suffix}${test_suffix}.jsonl
 ontologies_output_edges_file: ${BUILD_DIR}/${ontologies_output_base}${edges_suffix}${test_suffix}.jsonl

From 168071ef49ba38512ca133b560c8a7a12da43f1a Mon Sep 17 00:00:00 2001
From: ecwood
Date: Mon, 2 Sep 2024 01:18:04 -0700
Subject: [PATCH 091/125] #387 adjusting for new pipelining syntax error

---
 build/Snakefile-conversion | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/build/Snakefile-conversion b/build/Snakefile-conversion
index 45db4eeb..d2e61a00 100644
--- a/build/Snakefile-conversion
+++ b/build/Snakefile-conversion
@@ -18,7 +18,7 @@ rule Ontologies_Conversion:
     input:
         code = config['ONTOLOGIES_CONVERSION_SCRIPT'],
         real = config['ONTOLOGIES_EXTRACT_FILE'],
-        curies_to_categories_map = config['CURIES_TO_CATEGORIES_MAP']
+        curies_to_categories_map = config['CURIES_TO_CATEGORIES_MAP'],
         curies_to_urls_map = config['CURIES_TO_URLS_FILE'],
         validation = config['VALIDATION_PLACEHOLDER']
     output:
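A note on the one-character fix in PATCH 091: Snakemake parses a rule's input: section like a Python argument list, so consecutive named inputs must be separated by commas, and the comma missing after the curies_to_categories_map entry was a hard parse error. A stripped-down sketch of the corrected shape, with illustrative literal paths standing in for the real config[...] lookups:

    rule Ontologies_Conversion_Example:
        input:
            code = "convert/ontologies_jsonl_to_kg_jsonl.py",
            real = "ontologies.jsonl",
            curies_to_categories_map = "curies-to-categories.yaml",  # the restored comma
            curies_to_urls_map = "curies-to-urls-map.yaml"
        output:
            nodes = "kg2-ontologies-nodes.jsonl",
            edges = "kg2-ontologies-edges.jsonl"
        shell:
            "python {input.code} {input.real} {input.curies_to_categories_map} "
            "{input.curies_to_urls_map} {output.nodes} {output.edges}"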
From 8baa76352a1ba0b98a6b98a8053af6ad28c0cc6a Mon Sep 17 00:00:00 2001
From: ecwood
Date: Mon, 2 Sep 2024 01:21:30 -0700
Subject: [PATCH 092/125] #387 adjusting for new pipelining naming error

---
 build/Snakefile-conversion | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/build/Snakefile-conversion b/build/Snakefile-conversion
index d2e61a00..d7f96c2e 100644
--- a/build/Snakefile-conversion
+++ b/build/Snakefile-conversion
@@ -18,7 +18,7 @@ rule Ontologies_Conversion:
     input:
         code = config['ONTOLOGIES_CONVERSION_SCRIPT'],
         real = config['ONTOLOGIES_EXTRACT_FILE'],
-        curies_to_categories_map = config['CURIES_TO_CATEGORIES_MAP'],
+        curies_to_categories_map = config['CURIES_TO_CATEGORIES_FILE'],
         curies_to_urls_map = config['CURIES_TO_URLS_FILE'],
         validation = config['VALIDATION_PLACEHOLDER']
     output:

From 58d1bddc4ce6bf4f1d88f0cdc0be8c6cf4a9d549 Mon Sep 17 00:00:00 2001
From: ecwood
Date: Mon, 2 Sep 2024 01:31:50 -0700
Subject: [PATCH 093/125] #387 cleaning up the formatting of the new files

---
 convert/ontologies_jsonl_to_kg_jsonl.py | 748 +++++++++----------
 extract/owlparser.py                    | 924 ++++++++++++------------
 2 files changed, 854 insertions(+), 818 deletions(-)

diff --git a/convert/ontologies_jsonl_to_kg_jsonl.py b/convert/ontologies_jsonl_to_kg_jsonl.py
index 4dfb9992..a27561f4 100644
--- a/convert/ontologies_jsonl_to_kg_jsonl.py
+++ b/convert/ontologies_jsonl_to_kg_jsonl.py
@@ -1,8 +1,25 @@
+#!/usr/bin/env python3
+''' ontologies_jsonl_to_kg_jsonl.py: Converts JSON Lines representation of ontologies into KG JSON Lines format
+
+    Usage: ontologies_jsonl_to_kg_jsonl.py [--test] <inputFile> <curiesToCategoriesYAML> <curiesToURLsYAML> <outputNodesFile> <outputEdgesFile>
+'''
+
+
 import argparse
 import kg2_util
 import json
 import datetime
+__author__ = 'Erica Wood'
+__copyright__ = 'Oregon State University'
+__credits__ = ['Stephen Ramsey', 'Erica Wood']
+__license__ = 'MIT'
+__version__ = '0.1.0'
+__maintainer__ = ''
+__email__ = ''
+__status__ = 'Prototype'
+
+
 ID_TAG = "rdf:about"
 NAME_TAG = "rdfs:label"
@@ -18,26 +35,26 @@ DESCRIPTION_DELIM = " // "
 BASE_EDGE_TYPES = {"mondo-base:exactMatch": RESOURCE_KEY,
- "mondo-base:closeMatch": RESOURCE_KEY,
- "mondo-base:relatedMatch": RESOURCE_KEY,
- "mondo-base:broadMatch": RESOURCE_KEY,
- "mondo-base:narrowMatch": RESOURCE_KEY,
- "skos:exactMatch": RESOURCE_KEY,
- "skos:closeMatch": RESOURCE_KEY,
- "skos:broadMatch": RESOURCE_KEY,
- "skos:relatedMatch": RESOURCE_KEY,
- "skos:narrowMatch": RESOURCE_KEY,
- "obo:IAO_0100001": RESOURCE_KEY,
- "obo:RO_0002175": RESOURCE_KEY,
- "obo:RO_0002161": RESOURCE_KEY,
- "obo:RO_0002604": RESOURCE_KEY,
- "obo:RO_0002171": RESOURCE_KEY,
- "obo:RO_0002174": RESOURCE_KEY,
- "obo:RO_0002475": RESOURCE_KEY,
- "obo:RO_0001900": RESOURCE_KEY,
- "oboInOwl:hasAlternativeId": TEXT_KEY,
- "oboInOwl:hasDbXref": TEXT_KEY,
- "oboInOwl:xref": TEXT_KEY}
+                   "mondo-base:closeMatch": RESOURCE_KEY,
+                   "mondo-base:relatedMatch": RESOURCE_KEY,
+                   "mondo-base:broadMatch": RESOURCE_KEY,
+                   "mondo-base:narrowMatch": RESOURCE_KEY,
+                   "skos:exactMatch": RESOURCE_KEY,
+                   "skos:closeMatch": RESOURCE_KEY,
+                   "skos:broadMatch": RESOURCE_KEY,
+                   "skos:relatedMatch": RESOURCE_KEY,
+                   "skos:narrowMatch": RESOURCE_KEY,
+                   "obo:IAO_0100001": RESOURCE_KEY,
+                   "obo:RO_0002175": RESOURCE_KEY,
+                   "obo:RO_0002161": RESOURCE_KEY,
+                   "obo:RO_0002604": RESOURCE_KEY,
+                   "obo:RO_0002171": RESOURCE_KEY,
+                   "obo:RO_0002174": RESOURCE_KEY,
+                   "obo:RO_0002475": RESOURCE_KEY,
+                   "obo:RO_0001900": RESOURCE_KEY,
+                   "oboInOwl:hasAlternativeId": TEXT_KEY,
+                   "oboInOwl:hasDbXref": TEXT_KEY,
+                   "oboInOwl:xref": TEXT_KEY}
 CLASS_TO_SUPERCLASSES = dict()
 SAVED_NODE_INFO = dict()
@@ -73,383 +90,386 @@ VERSION_KEY = "version"
 def get_args():
- arg_parser = argparse.ArgumentParser()
- arg_parser.add_argument('--test', dest='test',
- action="store_true", default=False)
- arg_parser.add_argument('inputFile', type=str)
- arg_parser.add_argument('curiesToCategoriesYAML', type=str)
- arg_parser.add_argument('curiesToURLsYAML', type=str)
- arg_parser.add_argument('outputNodesFile', type=str)
- arg_parser.add_argument('outputEdgesFile', type=str)
- return arg_parser.parse_args()
+    arg_parser = argparse.ArgumentParser()
+    arg_parser.add_argument('--test', dest='test',
+                            action="store_true", default=False)
+    arg_parser.add_argument('inputFile', type=str)
+    arg_parser.add_argument('curiesToCategoriesYAML', type=str)
+    arg_parser.add_argument('curiesToURLsYAML', type=str)
+    arg_parser.add_argument('outputNodesFile', type=str)
+    arg_parser.add_argument('outputEdgesFile', type=str)
+    return arg_parser.parse_args()
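+
+# categorize_node() below assigns a Biolink category to a node: an explicit
+# term mapping from the curies-to-categories file takes priority, then a
+# CURIE-prefix mapping, and otherwise the most common category among the
+# node's rdfs:subClassOf superclasses, recursing at most 10 levels before
+# falling back to the NamedThing default.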
 def categorize_node(node_id, recursion_depth=0):
- node_prefix = node_id.split(':')[0]
-
- if node_id in NODE_CATEGORY_MAPPINGS and NODE_CATEGORY_MAPPINGS[node_id][1] == FILE_MAPPING:
- return NODE_CATEGORY_MAPPINGS[node_id][0]
-
- if node_prefix in PREFIX_MAPPINGS:
- node_category = PREFIX_MAPPINGS[node_prefix]
- NODE_CATEGORY_MAPPINGS[node_id] = (node_category, PREFIX_MAPPING)
- return PREFIX_MAPPINGS[node_prefix]
-
- # Get try to get the most common superclass categorization
- superclass_categorizations = dict()
- highest_value = 0
- highest_category = kg2_util.BIOLINK_CATEGORY_NAMED_THING
- if recursion_depth == 10:
- return kg2_util.BIOLINK_CATEGORY_NAMED_THING
-
- for superclass in CLASS_TO_SUPERCLASSES.get(node_id, list()):
- superclass_category = categorize_node(superclass, recursion_depth + 1)
- if superclass_category not in superclass_categorizations:
- superclass_categorizations[superclass_category] = 0
- superclass_categorizations[superclass_category] += 1
- if superclass_categorizations[superclass_category] > highest_value:
- highest_value = superclass_categorizations[superclass_category]
- highest_category = superclass_category
-
- NODE_CATEGORY_MAPPINGS[node_id] = (highest_category, RECURSE_MAPPING)
- return highest_category
+    node_prefix = node_id.split(':')[0]
+
+    if node_id in NODE_CATEGORY_MAPPINGS and NODE_CATEGORY_MAPPINGS[node_id][1] == FILE_MAPPING:
+        return NODE_CATEGORY_MAPPINGS[node_id][0]
+
+    if node_prefix in PREFIX_MAPPINGS:
+        node_category = PREFIX_MAPPINGS[node_prefix]
+        NODE_CATEGORY_MAPPINGS[node_id] = (node_category, PREFIX_MAPPING)
+        return PREFIX_MAPPINGS[node_prefix]
+
+    # Try to get the most common superclass categorization
+    superclass_categorizations = dict()
+    highest_value = 0
+    highest_category = kg2_util.BIOLINK_CATEGORY_NAMED_THING
+    if recursion_depth == 10:
+        return kg2_util.BIOLINK_CATEGORY_NAMED_THING
+
+    for superclass in CLASS_TO_SUPERCLASSES.get(node_id, list()):
+        superclass_category = categorize_node(superclass, recursion_depth + 1)
+        if superclass_category not in superclass_categorizations:
+            superclass_categorizations[superclass_category] = 0
+        superclass_categorizations[superclass_category] += 1
+        if superclass_categorizations[superclass_category] > highest_value:
+            highest_value = superclass_categorizations[superclass_category]
+            highest_category = superclass_category
+
+    NODE_CATEGORY_MAPPINGS[node_id] = (highest_category, RECURSE_MAPPING)
+    return highest_category
 def reformat_obo_date(date_str):
- if date_str is None:
- return None
-
- if '-' in date_str:
- delim = 'T'
- if ' ' in date_str:
- delim = ' '
- date_spl = date_str.strip('Z').split(delim)
- date_fh = date_spl[0].split('-')
- year = int(date_fh[0])
- month = int(date_fh[1])
- day = int(date_fh[2])
-
- if month < 1 or month > 12 or day < 1 or day > 31:
- return None
-
- if len(date_spl) > 1:
- date_sh = date_spl[1].split(':')
- hour = int(date_sh[0])
- minute = int(date_sh[1])
- second = 
int(date_sh[2][0:1]) - - return datetime.datetime(year, month, day, hour, minute, second) - else: - return datetime.datetime(year, month, day) - else: - date_spl = date_str.split(' ') - date_fh = date_spl[0].split(':') - year = int(date_fh[2]) - month = int(date_fh[1]) - day = int(date_fh[0]) - - if month < 1 or month > 12 or day < 1 or day > 31: - return None - - return datetime.datetime(year, month, day) + if date_str is None: + return None + + if '-' in date_str: + delim = 'T' + if ' ' in date_str: + delim = ' ' + date_spl = date_str.strip('Z').split(delim) + date_fh = date_spl[0].split('-') + year = int(date_fh[0]) + month = int(date_fh[1]) + day = int(date_fh[2]) + + if month < 1 or month > 12 or day < 1 or day > 31: + return None + + if len(date_spl) > 1: + date_sh = date_spl[1].split(':') + hour = int(date_sh[0]) + minute = int(date_sh[1]) + second = int(date_sh[2][0:1]) + + return datetime.datetime(year, month, day, hour, minute, second) + else: + return datetime.datetime(year, month, day) + else: + date_spl = date_str.split(' ') + date_fh = date_spl[0].split(':') + year = int(date_fh[2]) + month = int(date_fh[1]) + day = int(date_fh[0]) + + if month < 1 or month > 12 or day < 1 or day > 31: + return None + + return datetime.datetime(year, month, day) def pick_most_recent_date(dates, alternate_date=None): - latest_date = None - for date in dates: - if date == None: - continue - if latest_date == None or date > latest_date: - latest_date = date - - if latest_date == None: - if alternate_date is not None: - latest_date = alternate_date - else: - return None - - return latest_date.isoformat(sep=' ') + latest_date = None + for date in dates: + if date == None: + continue + if latest_date == None or date > latest_date: + latest_date = date + + if latest_date == None: + if alternate_date is not None: + latest_date = alternate_date + else: + return None + + return latest_date.isoformat(sep=' ') def process_ontology_term(ontology_node, source, ontology_name, owl_source=True): - owl_prefix = "" - if owl_source: - owl_prefix = "owl:" - ontology_version = None - ontology_versions = [version.get(TEXT_KEY, str()) for version in ontology_node.get(owl_prefix + "versionInfo", list()) if TEXT_KEY in version] - ontology_version_iri = [version.get(RESOURCE_KEY, str()) for version in ontology_node.get(owl_prefix + "versionIRI", list()) if RESOURCE_KEY in version] - ontology_dates = [reformat_obo_date(version.get(TEXT_KEY, str())) for date_type in ["oboInOwl:date", "dcterms:date", "dc:date"] for version in ontology_node.get(date_type, list()) if TEXT_KEY in version] - ontology_iri = ontology_node.get("rdf:about", str()) - if len(ontology_versions) == 1: - ontology_version = ontology_versions[0] - elif len(ontology_version_iri) == 1: - ontology_version = ontology_version_iri[0] - version_replacements = [ontology_iri.replace('.owl', '') + '/', '/' + source, 'releases/'] - for replacement in version_replacements: - ontology_version = ontology_version.replace(replacement, "") - ontology_version = ontology_version.split('/')[0] - elif len(ontology_dates) >= 1: - ontology_version = pick_most_recent_date(ontology_dates) - - if ontology_version is None: - print("Warning: source", source, "lacks any versioning information.") - - ontology_date = reformat_obo_date(pick_most_recent_date(ontology_dates)) - source_id = kg2_util.CURIE_PREFIX_OBO + ':' + source - - if source not in SOURCE_INFO: - SOURCE_INFO[source] = {SOURCE_KEY: source_id, IRI_KEY: ontology_iri, NAME_KEY: ontology_name, UPDATE_DATE_KEY: 
ontology_date, VERSION_KEY: ontology_version} + owl_prefix = "" + if owl_source: + owl_prefix = "owl:" + ontology_version = None + ontology_versions = [version.get(TEXT_KEY, str()) for version in ontology_node.get(owl_prefix + "versionInfo", list()) if TEXT_KEY in version] + ontology_version_iri = [version.get(RESOURCE_KEY, str()) for version in ontology_node.get(owl_prefix + "versionIRI", list()) if RESOURCE_KEY in version] + ontology_dates = [reformat_obo_date(version.get(TEXT_KEY, str())) for date_type in ["oboInOwl:date", "dcterms:date", "dc:date"] for version in ontology_node.get(date_type, list()) if TEXT_KEY in version] + ontology_iri = ontology_node.get("rdf:about", str()) + if len(ontology_versions) == 1: + ontology_version = ontology_versions[0] + elif len(ontology_version_iri) == 1: + ontology_version = ontology_version_iri[0] + version_replacements = [ontology_iri.replace('.owl', '') + '/', '/' + source, 'releases/'] + for replacement in version_replacements: + ontology_version = ontology_version.replace(replacement, "") + ontology_version = ontology_version.split('/')[0] + elif len(ontology_dates) >= 1: + ontology_version = pick_most_recent_date(ontology_dates) + + if ontology_version is None: + print("Warning: source", source, "lacks any versioning information.") + + ontology_date = reformat_obo_date(pick_most_recent_date(ontology_dates)) + source_id = kg2_util.CURIE_PREFIX_OBO + ':' + source + + if source not in SOURCE_INFO: + SOURCE_INFO[source] = {SOURCE_KEY: source_id, IRI_KEY: ontology_iri, NAME_KEY: ontology_name, UPDATE_DATE_KEY: ontology_date, VERSION_KEY: ontology_version} def process_ontology_class(owl_class, source, ontology_name, owl_source=True): - owl_prefix = "" - if owl_source: - owl_prefix = "owl:" - # Typically genid classes which don't neatly map onto the KG2 schema - if ID_TAG not in owl_class: - return - node_id = match_prefix(owl_class.get(ID_TAG, str())) - if node_id is None: - return - node_prefix = node_id.split(':')[0] - node_iri = PREFIX_TO_IRI_MAP[node_prefix] + node_id.replace(node_prefix + ':', '') - - # Configure the name - name_list = [name.get(TEXT_KEY, None) for name in owl_class.get("rdfs:label", dict()) if TEXT_KEY in name] - if len(name_list) == 0: - return - - # Configure the description - description_list = list() - description_list += [description.get(TEXT_KEY, None) for description in owl_class.get("obo:IAO_0000115", list()) if (TEXT_KEY in description)] - description_list += [COMMENT_PREFIX + description.get(TEXT_KEY, str()) for description in owl_class.get("rdfs:comment", list()) if (TEXT_KEY in description)] - description_list += [description.get(TEXT_KEY, None) for description in owl_class.get("obo:UBPROP_0000001", list()) if (TEXT_KEY in description)] - description_list += [description.get(TEXT_KEY, None) for description in owl_class.get("obo:UBPROP_0000005", list()) if (TEXT_KEY in description)] - description_list += [description.get(TEXT_KEY, None) for description in owl_class.get("efo1:source_description", list()) if (TEXT_KEY in description)] - - deprecated = "true" in owl_class.get(owl_prefix + "deprecated", list()) - for name in name_list: - search_name = name.lower() - if search_name.startswith("obsolete") or search_name.startswith("(obsolete") or search_name.endswith("obsolete"): - deprecated = True - - # Configure the synonyms - synonym_list = list() - synonym_keys = ["oboInOwl:hasExactSynonym", "oboInOwl:hasRelatedSynonym", "oboInOwl:hasNarrowSynonym", "oboInOwl:hasBroadSynonym", "go:hasExactSynonym", - "go:hasSynonym", 
"go:hasNarrowSynonym", "go:hasBroadSynonym", "obo:IAO_0000118", "obo:IAO_0000589", "go:hasRelatedSynonym", "obo:IAO_0000111", - "obo:IAO_0000028", "skos:prefLabel"] - synonym_list += [synonym.get(TEXT_KEY, None) for synonym_key in synonym_keys for synonym in owl_class.get(synonym_key, list()) if (TEXT_KEY in synonym)] - - update_date_list = list() - update_date_keys = ["dc:date", "dcterms:date", "terms:date"] - update_date_list += [reformat_obo_date(update_date.get(TEXT_KEY, None)) for update_date_key in update_date_keys for update_date in owl_class.get(update_date_key, list()) if (TEXT_KEY in update_date)] - - creation_date_list = list() - creation_date_keys = ["oboInOwl:creation_date", "go:creation_date"] - creation_date_list += [reformat_obo_date(creation_date.get(TEXT_KEY, None)) for creation_date_key in creation_date_keys for creation_date in owl_class.get(creation_date_key, list()) if (TEXT_KEY in creation_date)] - - # Configure the biological sequence - has_biological_sequence = dict() - has_biological_sequence['formula'] = [biological_sequence.get(TEXT_KEY, None) for biological_sequence in owl_class.get("chebi:formula", list()) if TEXT_KEY in biological_sequence] - has_biological_sequence['smiles'] = [biological_sequence.get(TEXT_KEY, None) for biological_sequence in owl_class.get("chebi:smiles", list()) if TEXT_KEY in biological_sequence] - has_biological_sequence['inchi'] = [biological_sequence.get(TEXT_KEY, None) for biological_sequence in owl_class.get("chebi:inchi", list()) if TEXT_KEY in biological_sequence] - has_biological_sequence['inchikey'] = [biological_sequence.get(TEXT_KEY, None) for biological_sequence in owl_class.get("chebi:inchikey", list()) if TEXT_KEY in biological_sequence] - - # Extract edge triples - edges_list = list() - - for edge_type in BASE_EDGE_TYPES: - for edge in owl_class.get(edge_type, list()): - if BASE_EDGE_TYPES[edge_type] in edge: - edges_list.append((edge_type, edge.get(BASE_EDGE_TYPES[edge_type], None))) - - - restriction_edges = list() - restriction_edges += [(edge, "rdfs:subClassOf") for edge in owl_class.get("rdfs:subClassOf", list())] - for equiv in owl_class.get(owl_prefix + "equivalentClass", list()): - for mini_class in equiv.get(owl_prefix + "Class", list()): - for edge in mini_class.get(owl_prefix + "intersectionOf", list()): - restriction_edges.append((edge, owl_prefix + "equivalentClass")) - - for (edge, general_edge_type) in restriction_edges: - for restriction in edge.get(owl_prefix + "Restriction", list()): - edge_type = restriction.get(owl_prefix + "onProperty", list()) - edge_object = restriction.get(owl_prefix + "someValuesFrom", list()) - if len(edge_type) != 1: - assert len(edge_type) <= 1, edge - continue - if len(edge_object) != 1: - assert len(edge_object) <= 1, edge - continue - edge_type = edge_type[0].get(RESOURCE_KEY, None) - edge_object = edge_object[0].get(RESOURCE_KEY, None) - - if edge_type != None and edge_object != None: - edges_list.append((edge_type, edge_object)) - - if RESOURCE_KEY in edge: - edges_list.append((general_edge_type, edge.get(RESOURCE_KEY, None))) - - superclasses = set() - final_edges_list = list() - for (edge_relation, edge_object) in edges_list: - edge_object = match_prefix(edge_object) - if edge_object is None: - continue - edge_relation = match_prefix(edge_relation) - if edge_relation is None: - continue - if edge_relation in ["rdfs:subClassOf"]: - superclasses.add(edge_object) - final_edges_list.append((edge_relation, edge_object)) - - # Imperfect way to make it deterministic - 
superclasses = sorted(list(superclasses)) - if node_id not in CLASS_TO_SUPERCLASSES: - CLASS_TO_SUPERCLASSES[node_id] = list() - CLASS_TO_SUPERCLASSES[node_id] += superclasses - CLASS_TO_SUPERCLASSES[node_id] = sorted(list(set(CLASS_TO_SUPERCLASSES[node_id]))) - - if node_id not in SAVED_NODE_INFO: - SAVED_NODE_INFO[node_id] = list() - SAVED_NODE_INFO[node_id].append({ID_KEY: node_id, - DEPRECATED_KEY: deprecated, - UPDATE_DATE_KEY: update_date_list, - CREATION_DATE_KEY: creation_date_list, - SYNONYM_KEY: synonym_list, - DESCRIPTION_KEY: description_list, - NAME_KEY: name_list, - SOURCE_KEY: source, - BIOLOGICAL_SEQUENCE_KEY: has_biological_sequence, - IRI_KEY: node_iri, - EDGES_KEY: final_edges_list}) + owl_prefix = "" + if owl_source: + owl_prefix = "owl:" + # Typically genid classes which don't neatly map onto the KG2 schema + if ID_TAG not in owl_class: + return + node_id = match_prefix(owl_class.get(ID_TAG, str())) + if node_id is None: + return + node_prefix = node_id.split(':')[0] + node_iri = PREFIX_TO_IRI_MAP[node_prefix] + node_id.replace(node_prefix + ':', '') + + # Configure the name + name_list = [name.get(TEXT_KEY, None) for name in owl_class.get("rdfs:label", dict()) if TEXT_KEY in name] + if len(name_list) == 0: + return + + # Configure the description + description_list = list() + description_list += [description.get(TEXT_KEY, None) for description in owl_class.get("obo:IAO_0000115", list()) if (TEXT_KEY in description)] + description_list += [COMMENT_PREFIX + description.get(TEXT_KEY, str()) for description in owl_class.get("rdfs:comment", list()) if (TEXT_KEY in description)] + description_list += [description.get(TEXT_KEY, None) for description in owl_class.get("obo:UBPROP_0000001", list()) if (TEXT_KEY in description)] + description_list += [description.get(TEXT_KEY, None) for description in owl_class.get("obo:UBPROP_0000005", list()) if (TEXT_KEY in description)] + description_list += [description.get(TEXT_KEY, None) for description in owl_class.get("efo1:source_description", list()) if (TEXT_KEY in description)] + + deprecated = "true" in owl_class.get(owl_prefix + "deprecated", list()) + for name in name_list: + search_name = name.lower() + if search_name.startswith("obsolete") or search_name.startswith("(obsolete") or search_name.endswith("obsolete"): + deprecated = True + + # Configure the synonyms + synonym_list = list() + synonym_keys = ["oboInOwl:hasExactSynonym", "oboInOwl:hasRelatedSynonym", "oboInOwl:hasNarrowSynonym", "oboInOwl:hasBroadSynonym", "go:hasExactSynonym", + "go:hasSynonym", "go:hasNarrowSynonym", "go:hasBroadSynonym", "obo:IAO_0000118", "obo:IAO_0000589", "go:hasRelatedSynonym", "obo:IAO_0000111", + "obo:IAO_0000028", "skos:prefLabel"] + synonym_list += [synonym.get(TEXT_KEY, None) for synonym_key in synonym_keys for synonym in owl_class.get(synonym_key, list()) if (TEXT_KEY in synonym)] + + update_date_list = list() + update_date_keys = ["dc:date", "dcterms:date", "terms:date"] + update_date_list += [reformat_obo_date(update_date.get(TEXT_KEY, None)) for update_date_key in update_date_keys for update_date in owl_class.get(update_date_key, list()) if (TEXT_KEY in update_date)] + + creation_date_list = list() + creation_date_keys = ["oboInOwl:creation_date", "go:creation_date"] + creation_date_list += [reformat_obo_date(creation_date.get(TEXT_KEY, None)) for creation_date_key in creation_date_keys for creation_date in owl_class.get(creation_date_key, list()) if (TEXT_KEY in creation_date)] + + # Configure the biological sequence + 
has_biological_sequence = dict() + has_biological_sequence['formula'] = [biological_sequence.get(TEXT_KEY, None) for biological_sequence in owl_class.get("chebi:formula", list()) if TEXT_KEY in biological_sequence] + has_biological_sequence['smiles'] = [biological_sequence.get(TEXT_KEY, None) for biological_sequence in owl_class.get("chebi:smiles", list()) if TEXT_KEY in biological_sequence] + has_biological_sequence['inchi'] = [biological_sequence.get(TEXT_KEY, None) for biological_sequence in owl_class.get("chebi:inchi", list()) if TEXT_KEY in biological_sequence] + has_biological_sequence['inchikey'] = [biological_sequence.get(TEXT_KEY, None) for biological_sequence in owl_class.get("chebi:inchikey", list()) if TEXT_KEY in biological_sequence] + + # Extract edge triples + edges_list = list() + + for edge_type in BASE_EDGE_TYPES: + for edge in owl_class.get(edge_type, list()): + if BASE_EDGE_TYPES[edge_type] in edge: + edges_list.append((edge_type, edge.get(BASE_EDGE_TYPES[edge_type], None))) + + + restriction_edges = list() + restriction_edges += [(edge, "rdfs:subClassOf") for edge in owl_class.get("rdfs:subClassOf", list())] + for equiv in owl_class.get(owl_prefix + "equivalentClass", list()): + for mini_class in equiv.get(owl_prefix + "Class", list()): + for edge in mini_class.get(owl_prefix + "intersectionOf", list()): + restriction_edges.append((edge, owl_prefix + "equivalentClass")) + + for (edge, general_edge_type) in restriction_edges: + for restriction in edge.get(owl_prefix + "Restriction", list()): + edge_type = restriction.get(owl_prefix + "onProperty", list()) + edge_object = restriction.get(owl_prefix + "someValuesFrom", list()) + if len(edge_type) != 1: + assert len(edge_type) <= 1, edge + continue + if len(edge_object) != 1: + assert len(edge_object) <= 1, edge + continue + edge_type = edge_type[0].get(RESOURCE_KEY, None) + edge_object = edge_object[0].get(RESOURCE_KEY, None) + + if edge_type != None and edge_object != None: + edges_list.append((edge_type, edge_object)) + + if RESOURCE_KEY in edge: + edges_list.append((general_edge_type, edge.get(RESOURCE_KEY, None))) + + superclasses = set() + final_edges_list = list() + for (edge_relation, edge_object) in edges_list: + edge_object = match_prefix(edge_object) + if edge_object is None: + continue + edge_relation = match_prefix(edge_relation) + if edge_relation is None: + continue + if edge_relation in ["rdfs:subClassOf"]: + superclasses.add(edge_object) + final_edges_list.append((edge_relation, edge_object)) + + # Imperfect way to make it deterministic + superclasses = sorted(list(superclasses)) + if node_id not in CLASS_TO_SUPERCLASSES: + CLASS_TO_SUPERCLASSES[node_id] = list() + CLASS_TO_SUPERCLASSES[node_id] += superclasses + CLASS_TO_SUPERCLASSES[node_id] = sorted(list(set(CLASS_TO_SUPERCLASSES[node_id]))) + + if node_id not in SAVED_NODE_INFO: + SAVED_NODE_INFO[node_id] = list() + SAVED_NODE_INFO[node_id].append({ID_KEY: node_id, + DEPRECATED_KEY: deprecated, + UPDATE_DATE_KEY: update_date_list, + CREATION_DATE_KEY: creation_date_list, + SYNONYM_KEY: synonym_list, + DESCRIPTION_KEY: description_list, + NAME_KEY: name_list, + SOURCE_KEY: source, + BIOLOGICAL_SEQUENCE_KEY: has_biological_sequence, + IRI_KEY: node_iri, + EDGES_KEY: final_edges_list}) def process_ontology_item(ontology_item): - source = ontology_item.get(OWL_SOURCE_KEY, str()) - ontology_name = ontology_item.get(OWL_SOURCE_NAME_KEY, str()) + source = ontology_item.get(OWL_SOURCE_KEY, str()) + ontology_name = ontology_item.get(OWL_SOURCE_NAME_KEY, str()) 
- for owl_class in ontology_item.get("owl:Class", list()): - process_ontology_class(owl_class, source, ontology_name) + for owl_class in ontology_item.get("owl:Class", list()): + process_ontology_class(owl_class, source, ontology_name) - for owl_class in ontology_item.get("Class", list()): - process_ontology_class(owl_class, source, ontology_name, False) + for owl_class in ontology_item.get("Class", list()): + process_ontology_class(owl_class, source, ontology_name, False) - for ontology_node in ontology_item.get("owl:Ontology", list()): - process_ontology_term(ontology_node, source, ontology_name) + for ontology_node in ontology_item.get("owl:Ontology", list()): + process_ontology_term(ontology_node, source, ontology_name) - # Because of ORDO - for ontology_node in ontology_item.get("Ontology", list()): - process_ontology_term(ontology_node, source, ontology_name, False) + # Because of ORDO + for ontology_node in ontology_item.get("Ontology", list()): + process_ontology_term(ontology_node, source, ontology_name, False) def generate_uri_map(curies_to_urls_file_name): - uri_input_map = kg2_util.safe_load_yaml_from_string(kg2_util.read_file_to_string(curies_to_urls_file_name)) - bidirectional_map = uri_input_map['use_for_bidirectional_mapping'] - contraction_map = uri_input_map['use_for_contraction_only'] - - for curie_prefix_dict in bidirectional_map: - for curie_prefix in curie_prefix_dict: - curie_url = curie_prefix_dict[curie_prefix] - URI_MAP[curie_url] = curie_prefix - PREFIX_TO_IRI_MAP[curie_prefix] = curie_url - - for curie_prefix_dict in contraction_map: - for curie_prefix in curie_prefix_dict: - curie_url = curie_prefix_dict[curie_prefix] - URI_MAP[curie_url] = curie_prefix - - # So that you get the most accurate match, you want to match to the longest url (in case one is a substring of another) - # Apparently have to use global key word to write to a module wide list (https://stackoverflow.com/questions/4630543/defining-lists-as-global-variables-in-python) - global URI_MAP_KEYS - URI_MAP_KEYS = sorted(URI_MAP.keys(), key=len, reverse=True) + uri_input_map = kg2_util.safe_load_yaml_from_string(kg2_util.read_file_to_string(curies_to_urls_file_name)) + bidirectional_map = uri_input_map['use_for_bidirectional_mapping'] + contraction_map = uri_input_map['use_for_contraction_only'] + + for curie_prefix_dict in bidirectional_map: + for curie_prefix in curie_prefix_dict: + curie_url = curie_prefix_dict[curie_prefix] + URI_MAP[curie_url] = curie_prefix + PREFIX_TO_IRI_MAP[curie_prefix] = curie_url + + for curie_prefix_dict in contraction_map: + for curie_prefix in curie_prefix_dict: + curie_url = curie_prefix_dict[curie_prefix] + URI_MAP[curie_url] = curie_prefix + + # So that you get the most accurate match, you want to match to the longest url (in case one is a substring of another) + # Apparently have to use global key word to write to a module wide list (https://stackoverflow.com/questions/4630543/defining-lists-as-global-variables-in-python) + global URI_MAP_KEYS + URI_MAP_KEYS = sorted(URI_MAP.keys(), key=len, reverse=True) def match_prefix(node_id): - for curie_url in URI_MAP_KEYS: - if node_id.startswith(curie_url): - return node_id.replace(curie_url, URI_MAP[curie_url] + ":") - - if "http" in node_id: - MISSING_ID_PREFIXES.add('/'.join(node_id.split('/')[0:-1]) + "/") - elif ':' in node_id: - MISSING_ID_PREFIXES.add(node_id.split(':')[0] + ":") - elif '_' in node_id: - MISSING_ID_PREFIXES.add(node_id.split('_')[0] + "_") - else: - MISSING_ID_PREFIXES.add(node_id) + for curie_url 
in URI_MAP_KEYS: + if node_id.startswith(curie_url): + return node_id.replace(curie_url, URI_MAP[curie_url] + ":") + + if "http" in node_id: + MISSING_ID_PREFIXES.add('/'.join(node_id.split('/')[0:-1]) + "/") + elif ':' in node_id: + MISSING_ID_PREFIXES.add(node_id.split(':')[0] + ":") + elif '_' in node_id: + MISSING_ID_PREFIXES.add(node_id.split('_')[0] + "_") + else: + MISSING_ID_PREFIXES.add(node_id) def construct_nodes_and_edges(nodes_output, edges_output): - for source in SOURCE_INFO: - source_date = pick_most_recent_date([SOURCE_INFO[source][UPDATE_DATE_KEY]]) - source_name = SOURCE_INFO[source][NAME_KEY] + " v" + SOURCE_INFO[source][VERSION_KEY] - source_id = SOURCE_INFO[source][SOURCE_KEY] - source_iri = SOURCE_INFO[source][IRI_KEY] - node = kg2_util.make_node(source_id, source_iri, source_name, kg2_util.BIOLINK_CATEGORY_INFORMATION_CONTENT_ENTITY, source_date, source_id) + for source in SOURCE_INFO: + source_date = pick_most_recent_date([SOURCE_INFO[source][UPDATE_DATE_KEY]]) + source_name = SOURCE_INFO[source][NAME_KEY] + " v" + SOURCE_INFO[source][VERSION_KEY] + source_id = SOURCE_INFO[source][SOURCE_KEY] + source_iri = SOURCE_INFO[source][IRI_KEY] + node = kg2_util.make_node(source_id, source_iri, source_name, kg2_util.BIOLINK_CATEGORY_INFORMATION_CONTENT_ENTITY, source_date, source_id) - nodes_output.write(node) + nodes_output.write(node) - for node_id in SAVED_NODE_INFO: - for source_node_index in range(len(SAVED_NODE_INFO[node_id])): - if SAVED_NODE_INFO[node_id][source_node_index][DEPRECATED_KEY]: - continue - name = SAVED_NODE_INFO[node_id][source_node_index][NAME_KEY][0] # Imperfect way of choosing the name - node_iri = SAVED_NODE_INFO[node_id][source_node_index][IRI_KEY] - description = DESCRIPTION_DELIM.join(SAVED_NODE_INFO[node_id][source_node_index][DESCRIPTION_KEY]) - has_biological_sequence = SAVED_NODE_INFO[node_id][source_node_index][BIOLOGICAL_SEQUENCE_KEY].get("smiles", None) - synonyms = SAVED_NODE_INFO[node_id][source_node_index][SYNONYM_KEY] - category = SAVED_NODE_INFO[node_id][source_node_index][CATEGORY_KEY] + for node_id in SAVED_NODE_INFO: + for source_node_index in range(len(SAVED_NODE_INFO[node_id])): + if SAVED_NODE_INFO[node_id][source_node_index][DEPRECATED_KEY]: + continue + name = SAVED_NODE_INFO[node_id][source_node_index][NAME_KEY][0] # Imperfect way of choosing the name + node_iri = SAVED_NODE_INFO[node_id][source_node_index][IRI_KEY] + description = DESCRIPTION_DELIM.join(SAVED_NODE_INFO[node_id][source_node_index][DESCRIPTION_KEY]) + has_biological_sequence = SAVED_NODE_INFO[node_id][source_node_index][BIOLOGICAL_SEQUENCE_KEY].get("smiles", None) + synonyms = SAVED_NODE_INFO[node_id][source_node_index][SYNONYM_KEY] + category = SAVED_NODE_INFO[node_id][source_node_index][CATEGORY_KEY] - source = SAVED_NODE_INFO[node_id][source_node_index][SOURCE_KEY] - provided_by = kg2_util.CURIE_PREFIX_OBO + ':' + source - source_date = SOURCE_INFO[source][UPDATE_DATE_KEY] + source = SAVED_NODE_INFO[node_id][source_node_index][SOURCE_KEY] + provided_by = kg2_util.CURIE_PREFIX_OBO + ':' + source + source_date = SOURCE_INFO[source][UPDATE_DATE_KEY] - update_date = pick_most_recent_date(SAVED_NODE_INFO[node_id][source_node_index][UPDATE_DATE_KEY], source_date) - creation_date = pick_most_recent_date(SAVED_NODE_INFO[node_id][source_node_index][CREATION_DATE_KEY], source_date) + update_date = pick_most_recent_date(SAVED_NODE_INFO[node_id][source_node_index][UPDATE_DATE_KEY], source_date) + creation_date = 
pick_most_recent_date(SAVED_NODE_INFO[node_id][source_node_index][CREATION_DATE_KEY], source_date) - node = kg2_util.make_node(node_id, node_iri, name, category, update_date, provided_by) - node["description"] = description - node["has_biological_sequence"] = has_biological_sequence - node["creation_date"] = creation_date - node["synonym"] = synonyms + node = kg2_util.make_node(node_id, node_iri, name, category, update_date, provided_by) + node["description"] = description + node["has_biological_sequence"] = has_biological_sequence + node["creation_date"] = creation_date + node["synonym"] = synonyms - nodes_output.write(node) + nodes_output.write(node) - for (edge_relation, edge_object) in SAVED_NODE_INFO[node_id][source_node_index][EDGES_KEY]: - relation_label = edge_relation.split(':')[1] - edge = kg2_util.make_edge(node_id, edge_object, edge_relation, relation_label, provided_by, update_date) + for (edge_relation, edge_object) in SAVED_NODE_INFO[node_id][source_node_index][EDGES_KEY]: + relation_label = edge_relation.split(':')[1] + edge = kg2_util.make_edge(node_id, edge_object, edge_relation, relation_label, provided_by, update_date) - edges_output.write(edge) + edges_output.write(edge) if __name__ == '__main__': - args = get_args() - input_file_name = args.inputFile - curies_to_categories_file_name = args.curiesToCategoriesYAML - curies_to_urls_file_name = args.curiesToURLsYAML - output_nodes_file_name = args.outputNodesFile - output_edges_file_name = args.outputEdgesFile - test_mode = args.test - - nodes_info, edges_info = kg2_util.create_kg2_jsonlines(test_mode) - nodes_output = nodes_info[0] - edges_output = edges_info[0] - - curies_to_categories_data = kg2_util.safe_load_yaml_from_string(kg2_util.read_file_to_string(curies_to_categories_file_name)) - for mapping_node in curies_to_categories_data["term-mappings"]: - NODE_CATEGORY_MAPPINGS[mapping_node] = (curies_to_categories_data["term-mappings"][mapping_node], FILE_MAPPING) - for prefix in curies_to_categories_data["prefix-mappings"]: - PREFIX_MAPPINGS[prefix] = curies_to_categories_data["prefix-mappings"][prefix] - - input_read_jsonlines_info = kg2_util.start_read_jsonlines(input_file_name) - input_data = input_read_jsonlines_info[0] - - ontology_prefixes = set() - generate_uri_map(curies_to_urls_file_name) - for ontology_item in input_data: - process_ontology_item(ontology_item) - - for node_id in SAVED_NODE_INFO: - categorize_node(node_id) - node_category = NODE_CATEGORY_MAPPINGS[node_id][0] - for index in range(len(SAVED_NODE_INFO[node_id])): - SAVED_NODE_INFO[node_id][index][CATEGORY_KEY] = node_category - - construct_nodes_and_edges(nodes_output, edges_output) - - kg2_util.close_kg2_jsonlines(nodes_info, edges_info, output_nodes_file_name, output_edges_file_name) \ No newline at end of file + print("Start time: ", kg2_util.date()) + args = get_args() + input_file_name = args.inputFile + curies_to_categories_file_name = args.curiesToCategoriesYAML + curies_to_urls_file_name = args.curiesToURLsYAML + output_nodes_file_name = args.outputNodesFile + output_edges_file_name = args.outputEdgesFile + test_mode = args.test + + nodes_info, edges_info = kg2_util.create_kg2_jsonlines(test_mode) + nodes_output = nodes_info[0] + edges_output = edges_info[0] + + curies_to_categories_data = kg2_util.safe_load_yaml_from_string(kg2_util.read_file_to_string(curies_to_categories_file_name)) + for mapping_node in curies_to_categories_data["term-mappings"]: + NODE_CATEGORY_MAPPINGS[mapping_node] = 
(curies_to_categories_data["term-mappings"][mapping_node], FILE_MAPPING)
+    for prefix in curies_to_categories_data["prefix-mappings"]:
+        PREFIX_MAPPINGS[prefix] = curies_to_categories_data["prefix-mappings"][prefix]
+
+    input_read_jsonlines_info = kg2_util.start_read_jsonlines(input_file_name)
+    input_data = input_read_jsonlines_info[0]
+
+    ontology_prefixes = set()
+    generate_uri_map(curies_to_urls_file_name)
+    for ontology_item in input_data:
+        process_ontology_item(ontology_item)
+
+    for node_id in SAVED_NODE_INFO:
+        categorize_node(node_id)
+        node_category = NODE_CATEGORY_MAPPINGS[node_id][0]
+        for index in range(len(SAVED_NODE_INFO[node_id])):
+            SAVED_NODE_INFO[node_id][index][CATEGORY_KEY] = node_category
+
+    construct_nodes_and_edges(nodes_output, edges_output)
+
+    kg2_util.close_kg2_jsonlines(nodes_info, edges_info, output_nodes_file_name, output_edges_file_name)
+
+    print("Finish time: ", kg2_util.date())
diff --git a/extract/owlparser.py b/extract/owlparser.py
index 34e99fe3..fe540f3b 100644
--- a/extract/owlparser.py
+++ b/extract/owlparser.py
@@ -1,524 +1,540 @@
+#!/usr/bin/env python3
+''' owlparser.py: Converts OWL (XML) Files into JSON Lines Representations
+
+    Usage: owlparser.py [--test] <inputFile> <owlFilePath> <outputFile>
+'''
+
 import json
 import argparse
 import datetime
 import kg2_util
+__author__ = 'Erica Wood'
+__copyright__ = 'Oregon State University'
+__credits__ = ['Stephen Ramsey', 'Erica Wood']
+__license__ = 'MIT'
+__version__ = '0.1.0'
+__maintainer__ = ''
+__email__ = ''
+__status__ = 'Prototype'
+
+
 def get_args():
- arg_parser = argparse.ArgumentParser()
- arg_parser.add_argument('--test', dest='test',
- action="store_true", default=False)
- arg_parser.add_argument('inputFile', type=str)
- arg_parser.add_argument('owlFilePath', type=str)
- arg_parser.add_argument('outputFile', type=str)
- return arg_parser.parse_args()
+    arg_parser = argparse.ArgumentParser()
+    arg_parser.add_argument('--test', dest='test',
+                            action="store_true", default=False)
+    arg_parser.add_argument('inputFile', type=str)
+    arg_parser.add_argument('owlFilePath', type=str)
+    arg_parser.add_argument('outputFile', type=str)
+    return arg_parser.parse_args()
 def date():
- return datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+    return datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
 class LineElementRead():
- NONE = 0
- TAG = 1
- ATTRIBUTE_TAG = 2
- ATTRIBUTE_TEXT = 3
- MAIN = 4
- END_TAG = 5
+    NONE = 0
+    TAG = 1
+    ATTRIBUTE_TAG = 2
+    ATTRIBUTE_TEXT = 3
+    MAIN = 4
+    END_TAG = 5
 class XMLParser():
- def __init__(self, skip_tags, ignored_attributes, processing_func):
- self.COMMENT = "!--"
- self.OUTMOST_TAGS_SKIP = skip_tags
- self.IGNORED_ATTRIBUTES = ignored_attributes
- self.processing_func = processing_func
-
- self.LINE_TYPE_IGNORE = "ignore"
- self.LINE_TYPE_START_NEST = "start nest"
- self.LINE_TYPE_START_NEST_WITH_ATTR = "start nest with attributes"
- self.LINE_TYPE_ENTRY = "entry"
- self.LINE_TYPE_ENTRY_WITH_ATTR = "entry with attributes"
- self.LINE_TYPE_ENTRY_ONLY_ATTR = "entry with only attributes"
- self.LINE_TYPE_END_NEST = "end nest"
-
- self.KEY_TAG = "tag"
- self.KEY_ATTRIBUTES = "attributes"
- self.KEY_TEXT = "ENTRY_TEXT"
- self.KEY_TYPE = "type"
-
- # Variables for line reading
- self.tag = ""
- self.attributes = dict()
- self.attribute_tag = ""
- self.attribute_text = ""
- self.main_text = ""
- self.end_tag = ""
- self.only_tag = False
- self.start_brackets = 0
- self.line = ""
- self.letter = ""
- self.next_letter = ""
- self.prev_letter = ""
- self.type_to_read = LineElementRead.NONE
-
- def 
categorize_line(self): - # Categorize the type of line - line_type = str() - out = dict() - - # Putting "only_tag" here isn't necessarily the best idea, but I don't know what else to do with it - if self.tag == self.COMMENT or self.tag in self.OUTMOST_TAGS_SKIP or self.end_tag in self.OUTMOST_TAGS_SKIP or self.only_tag: - line_type = self.LINE_TYPE_IGNORE - else: - start_tag_exists = (self.tag != str()) - attributes_exist = (self.attributes != dict()) - text_exists = (self.main_text != str()) - end_tag_exists = (self.end_tag != str()) - - if start_tag_exists: - if attributes_exist: - if text_exists: - line_type = self.LINE_TYPE_ENTRY_WITH_ATTR - out[self.KEY_TAG] = self.tag - out[self.KEY_ATTRIBUTES] = self.attributes - out[self.KEY_TEXT] = self.main_text - elif end_tag_exists: - line_type = self.LINE_TYPE_ENTRY_ONLY_ATTR - out[self.KEY_TAG] = self.tag - out[self.KEY_ATTRIBUTES] = self.attributes - else: - line_type = self.LINE_TYPE_START_NEST_WITH_ATTR - out[self.KEY_TAG] = self.tag - out[self.KEY_ATTRIBUTES] = self.attributes - elif text_exists: - line_type = self.LINE_TYPE_ENTRY - out[self.KEY_TAG] = self.tag - out[self.KEY_TEXT] = self.main_text - else: - line_type = self.LINE_TYPE_START_NEST - out[self.KEY_TAG] = self.tag - elif end_tag_exists: - line_type = self.LINE_TYPE_END_NEST - out[self.KEY_TAG] = self.end_tag - - out[self.KEY_TYPE] = line_type - - return out - - def get_letters(self, letter_index): - self.letter = self.line[letter_index] - self.next_letter = "" - self.prev_letter = "" - if letter_index + 1 < len(self.line): - self.next_letter = self.line[letter_index + 1] - if letter_index - 1 >= 0: - self.prev_letter = self.line[letter_index - 1] - - if self.letter == '<': - self.start_brackets += 1 - if self.letter == '>': - self.start_brackets -= 1 - - - def identify_tag_type(self, letter_index): - changed = True - - if self.letter == '<' and letter_index == 0: - if self.next_letter != '/': - self.type_to_read = LineElementRead.TAG - elif self.letter == '/' and self.prev_letter == '<': - self.type_to_read = LineElementRead.END_TAG - else: - changed = False - - return changed - - - def read_tag(self): - changed = True - - if self.letter == ' ' and self.type_to_read == LineElementRead.TAG: - self.type_to_read = LineElementRead.ATTRIBUTE_TAG - elif self.letter == '>' and self.type_to_read == LineElementRead.TAG and self.start_brackets == 0: - self.type_to_read = LineElementRead.MAIN - - if self.prev_letter == '/': - print("Warning - strange tag, ignoring", self.line) - self.only_tag = True - elif self.type_to_read == LineElementRead.TAG: - self.tag += self.letter - else: - changed = False - - return changed - - - def store_attribute(self): - if self.attribute_tag not in self.IGNORED_ATTRIBUTES: - self.attributes[self.attribute_tag] = self.attribute_text.strip('/').strip('"') - self.attribute_tag = "" - self.attribute_text = "" - - - def read_attributes(self): - changed = True - start_reading_attributes = (self.type_to_read == LineElementRead.ATTRIBUTE_TAG or self.type_to_read == LineElementRead.ATTRIBUTE_TEXT) - - if self.letter == '>' and start_reading_attributes and self.start_brackets == 0: - self.type_to_read = LineElementRead.MAIN - - self.store_attribute() - - if self.prev_letter == '/': - self.end_tag = self.tag - elif start_reading_attributes: - if self.letter == '=' and self.type_to_read == LineElementRead.ATTRIBUTE_TAG: - self.type_to_read = LineElementRead.ATTRIBUTE_TEXT - elif self.type_to_read == LineElementRead.ATTRIBUTE_TAG: - self.attribute_tag += self.letter - 
elif self.letter == ' ' and self.type_to_read == LineElementRead.ATTRIBUTE_TEXT: - self.type_to_read = LineElementRead.ATTRIBUTE_TAG - self.store_attribute() - elif self.type_to_read == LineElementRead.ATTRIBUTE_TEXT: - self.attribute_text += self.letter - else: - changed = False + def __init__(self, skip_tags, ignored_attributes, processing_func): + self.COMMENT = "!--" + self.OUTMOST_TAGS_SKIP = skip_tags + self.IGNORED_ATTRIBUTES = ignored_attributes + self.processing_func = processing_func + + self.LINE_TYPE_IGNORE = "ignore" + self.LINE_TYPE_START_NEST = "start nest" + self.LINE_TYPE_START_NEST_WITH_ATTR = "start nest with attributes" + self.LINE_TYPE_ENTRY = "entry" + self.LINE_TYPE_ENTRY_WITH_ATTR = "entry with attributes" + self.LINE_TYPE_ENTRY_ONLY_ATTR = "entry with only attributes" + self.LINE_TYPE_END_NEST = "end nest" + + self.KEY_TAG = "tag" + self.KEY_ATTRIBUTES = "attributes" + self.KEY_TEXT = "ENTRY_TEXT" + self.KEY_TYPE = "type" + + # Variables for line reading + self.tag = "" + self.attributes = dict() + self.attribute_tag = "" + self.attribute_text = "" + self.main_text = "" + self.end_tag = "" + self.only_tag = False + self.start_brackets = 0 + self.line = "" + self.letter = "" + self.next_letter = "" + self.prev_letter = "" + self.type_to_read = LineElementRead.NONE + + def categorize_line(self): + # Categorize the type of line + line_type = str() + out = dict() + + # Putting "only_tag" here isn't necessarily the best idea, but I don't know what else to do with it + if self.tag == self.COMMENT or self.tag in self.OUTMOST_TAGS_SKIP or self.end_tag in self.OUTMOST_TAGS_SKIP or self.only_tag: + line_type = self.LINE_TYPE_IGNORE + else: + start_tag_exists = (self.tag != str()) + attributes_exist = (self.attributes != dict()) + text_exists = (self.main_text != str()) + end_tag_exists = (self.end_tag != str()) + + if start_tag_exists: + if attributes_exist: + if text_exists: + line_type = self.LINE_TYPE_ENTRY_WITH_ATTR + out[self.KEY_TAG] = self.tag + out[self.KEY_ATTRIBUTES] = self.attributes + out[self.KEY_TEXT] = self.main_text + elif end_tag_exists: + line_type = self.LINE_TYPE_ENTRY_ONLY_ATTR + out[self.KEY_TAG] = self.tag + out[self.KEY_ATTRIBUTES] = self.attributes + else: + line_type = self.LINE_TYPE_START_NEST_WITH_ATTR + out[self.KEY_TAG] = self.tag + out[self.KEY_ATTRIBUTES] = self.attributes + elif text_exists: + line_type = self.LINE_TYPE_ENTRY + out[self.KEY_TAG] = self.tag + out[self.KEY_TEXT] = self.main_text + else: + line_type = self.LINE_TYPE_START_NEST + out[self.KEY_TAG] = self.tag + elif end_tag_exists: + line_type = self.LINE_TYPE_END_NEST + out[self.KEY_TAG] = self.end_tag + + out[self.KEY_TYPE] = line_type + + return out + + def get_letters(self, letter_index): + self.letter = self.line[letter_index] + self.next_letter = "" + self.prev_letter = "" + if letter_index + 1 < len(self.line): + self.next_letter = self.line[letter_index + 1] + if letter_index - 1 >= 0: + self.prev_letter = self.line[letter_index - 1] + + if self.letter == '<': + self.start_brackets += 1 + if self.letter == '>': + self.start_brackets -= 1 + + + def identify_tag_type(self, letter_index): + changed = True + + if self.letter == '<' and letter_index == 0: + if self.next_letter != '/': + self.type_to_read = LineElementRead.TAG + elif self.letter == '/' and self.prev_letter == '<': + self.type_to_read = LineElementRead.END_TAG + else: + changed = False + + return changed + + + def read_tag(self): + changed = True + + if self.letter == ' ' and self.type_to_read == 
LineElementRead.TAG: + self.type_to_read = LineElementRead.ATTRIBUTE_TAG + elif self.letter == '>' and self.type_to_read == LineElementRead.TAG and self.start_brackets == 0: + self.type_to_read = LineElementRead.MAIN + + if self.prev_letter == '/': + print("Warning - strange tag, ignoring", self.line) + self.only_tag = True + elif self.type_to_read == LineElementRead.TAG: + self.tag += self.letter + else: + changed = False + + return changed + + + def store_attribute(self): + if self.attribute_tag not in self.IGNORED_ATTRIBUTES: + self.attributes[self.attribute_tag] = self.attribute_text.strip('/').strip('"') + self.attribute_tag = "" + self.attribute_text = "" + + + def read_attributes(self): + changed = True + start_reading_attributes = (self.type_to_read == LineElementRead.ATTRIBUTE_TAG or self.type_to_read == LineElementRead.ATTRIBUTE_TEXT) + + if self.letter == '>' and start_reading_attributes and self.start_brackets == 0: + self.type_to_read = LineElementRead.MAIN + + self.store_attribute() + + if self.prev_letter == '/': + self.end_tag = self.tag + elif start_reading_attributes: + if self.letter == '=' and self.type_to_read == LineElementRead.ATTRIBUTE_TAG: + self.type_to_read = LineElementRead.ATTRIBUTE_TEXT + elif self.type_to_read == LineElementRead.ATTRIBUTE_TAG: + self.attribute_tag += self.letter + elif self.letter == ' ' and self.type_to_read == LineElementRead.ATTRIBUTE_TEXT: + self.type_to_read = LineElementRead.ATTRIBUTE_TAG + self.store_attribute() + elif self.type_to_read == LineElementRead.ATTRIBUTE_TEXT: + self.attribute_text += self.letter + else: + changed = False - return changed + return changed - def read_main(self): - changed = True - if self.letter == '<' and self.type_to_read == LineElementRead.MAIN: - self.type_to_read = LineElementRead.END_TAG - elif self.type_to_read == LineElementRead.MAIN: - self.main_text += self.letter - else: - changed = False + def read_main(self): + changed = True + if self.letter == '<' and self.type_to_read == LineElementRead.MAIN: + self.type_to_read = LineElementRead.END_TAG + elif self.type_to_read == LineElementRead.MAIN: + self.main_text += self.letter + else: + changed = False - return changed + return changed - def read_end_tag(self): - changed = True - if self.letter == '>' and self.type_to_read == LineElementRead.END_TAG and self.start_brackets == 0: - pass - elif self.type_to_read == LineElementRead.END_TAG: - self.end_tag += self.letter - else: - changed = False + def read_end_tag(self): + changed = True + if self.letter == '>' and self.type_to_read == LineElementRead.END_TAG and self.start_brackets == 0: + pass + elif self.type_to_read == LineElementRead.END_TAG: + self.end_tag += self.letter + else: + changed = False - return changed + return changed - def convert_line(self): - self.tag = "" - self.attributes = dict() - self.attribute_tag = "" - self.attribute_text = "" - self.main_text = "" - self.end_tag = "" + def convert_line(self): + self.tag = "" + self.attributes = dict() + self.attribute_tag = "" + self.attribute_text = "" + self.main_text = "" + self.end_tag = "" - self.type_to_read = LineElementRead.NONE + self.type_to_read = LineElementRead.NONE - self.only_tag = False + self.only_tag = False - self.start_brackets = 0 + self.start_brackets = 0 - for letter_index in range(len(self.line)): - self.get_letters(letter_index) + for letter_index in range(len(self.line)): + self.get_letters(letter_index) - # First < - if self.identify_tag_type(letter_index): - continue + # First < + if 
self.identify_tag_type(letter_index): + continue - if self.read_tag(): - continue + if self.read_tag(): + continue - if self.read_attributes(): - continue + if self.read_attributes(): + continue - if self.read_main(): - continue + if self.read_main(): + continue - if self.read_end_tag(): - continue + if self.read_end_tag(): + continue - return self.categorize_line() + return self.categorize_line() - def convert_nest(self, nest, start_index): - nest_dict = dict() - curr_index = start_index + def convert_nest(self, nest, start_index): + nest_dict = dict() + curr_index = start_index - while curr_index < len(nest): - element = nest[curr_index] - line_type = element[self.KEY_TYPE] - line_tag = element[self.KEY_TAG] - line_text = element.get(self.KEY_TEXT, None) - line_attributes = element.get(self.KEY_ATTRIBUTES, None) + while curr_index < len(nest): + element = nest[curr_index] + line_type = element[self.KEY_TYPE] + line_tag = element[self.KEY_TAG] + line_text = element.get(self.KEY_TEXT, None) + line_attributes = element.get(self.KEY_ATTRIBUTES, None) - if line_type in [self.LINE_TYPE_START_NEST, self.LINE_TYPE_START_NEST_WITH_ATTR]: - if line_tag not in nest_dict: - nest_dict[line_tag] = list() + if line_type in [self.LINE_TYPE_START_NEST, self.LINE_TYPE_START_NEST_WITH_ATTR]: + if line_tag not in nest_dict: + nest_dict[line_tag] = list() - converted_nest, ret_index = self.convert_nest(nest, curr_index + 1) + converted_nest, ret_index = self.convert_nest(nest, curr_index + 1) - if line_attributes is not None: - for attribute in line_attributes: - converted_nest[attribute] = line_attributes[attribute] + if line_attributes is not None: + for attribute in line_attributes: + converted_nest[attribute] = line_attributes[attribute] - nest_dict[line_tag].append(converted_nest) + nest_dict[line_tag].append(converted_nest) - curr_index = ret_index + 1 - continue + curr_index = ret_index + 1 + continue - if line_type in [self.LINE_TYPE_ENTRY, self.LINE_TYPE_ENTRY_WITH_ATTR, self.LINE_TYPE_ENTRY_ONLY_ATTR]: - if line_tag not in nest_dict: - nest_dict[line_tag] = list() + if line_type in [self.LINE_TYPE_ENTRY, self.LINE_TYPE_ENTRY_WITH_ATTR, self.LINE_TYPE_ENTRY_ONLY_ATTR]: + if line_tag not in nest_dict: + nest_dict[line_tag] = list() - curr_dict = dict() + curr_dict = dict() - if line_text is not None: - curr_dict[self.KEY_TEXT] = line_text + if line_text is not None: + curr_dict[self.KEY_TEXT] = line_text - if line_attributes is not None: - for attribute in line_attributes: - curr_dict[attribute] = line_attributes[attribute] + if line_attributes is not None: + for attribute in line_attributes: + curr_dict[attribute] = line_attributes[attribute] - nest_dict[line_tag].append(curr_dict) + nest_dict[line_tag].append(curr_dict) - curr_index += 1 - continue + curr_index += 1 + continue - if line_type in [self.LINE_TYPE_END_NEST]: - return nest_dict, curr_index + if line_type in [self.LINE_TYPE_END_NEST]: + return nest_dict, curr_index - return nest_dict, curr_index + return nest_dict, curr_index - def divide_into_lines(self, input_file_name): - curr_str = "" - curr_nest = list() - curr_nest_tags = list() # Treating it as a stack - start_brackets = 0 + def divide_into_lines(self, input_file_name): + curr_str = "" + curr_nest = list() + curr_nest_tags = list() # Treating it as a stack + start_brackets = 0 - with open(input_file_name) as input_file: - for line in input_file: - line_str = line.strip() + with open(input_file_name) as input_file: + for line in input_file: + line_str = line.strip() - for 
letter_index in range(len(line_str)): - letter = line_str[letter_index] - if letter == '<': - start_brackets += 1 - if letter == '>': - start_brackets -= 1 + for letter_index in range(len(line_str)): + letter = line_str[letter_index] + if letter == '<': + start_brackets += 1 + if letter == '>': + start_brackets -= 1 - next_letter = "" - if letter_index + 1 < len(line_str): - next_letter = line_str[letter_index + 1] + next_letter = "" + if letter_index + 1 < len(line_str): + next_letter = line_str[letter_index + 1] - curr_str += letter + curr_str += letter - if letter == '>' and (next_letter == '<' or next_letter == "") and start_brackets == 0: - # Only return if nesting - self.line = curr_str - line_parsed = self.convert_line() + if letter == '>' and (next_letter == '<' or next_letter == "") and start_brackets == 0: + # Only return if nesting + self.line = curr_str + line_parsed = self.convert_line() - tag = line_parsed.get(self.KEY_TAG, None) - assert tag != self.KEY_TEXT # This could cause a massive conflict, but it is unlikely - line_type = line_parsed.get(self.KEY_TYPE, None) - attribute_keys = line_parsed.get(self.KEY_ATTRIBUTES, dict()).keys() + tag = line_parsed.get(self.KEY_TAG, None) + assert tag != self.KEY_TEXT # This could cause a massive conflict, but it is unlikely + line_type = line_parsed.get(self.KEY_TYPE, None) + attribute_keys = line_parsed.get(self.KEY_ATTRIBUTES, dict()).keys() - if line_type != self.LINE_TYPE_IGNORE: - curr_nest.append(line_parsed) + if line_type != self.LINE_TYPE_IGNORE: + curr_nest.append(line_parsed) - output_nest = (line_type in [self.LINE_TYPE_ENTRY, self.LINE_TYPE_ENTRY_WITH_ATTR, self.LINE_TYPE_ENTRY_ONLY_ATTR] and len(curr_nest_tags) == 0) + output_nest = (line_type in [self.LINE_TYPE_ENTRY, self.LINE_TYPE_ENTRY_WITH_ATTR, self.LINE_TYPE_ENTRY_ONLY_ATTR] and len(curr_nest_tags) == 0) - if line_type in [self.LINE_TYPE_START_NEST, self.LINE_TYPE_START_NEST_WITH_ATTR]: - curr_nest_tags.append(tag) - elif line_type == self.LINE_TYPE_END_NEST: - popped_curr_nest_tag = curr_nest_tags.pop() - assert popped_curr_nest_tag == tag, curr_nest - if len(curr_nest_tags) == 0: - output_nest = True - if output_nest: - nest_dict, _ = self.convert_nest(curr_nest, 0) + if line_type in [self.LINE_TYPE_START_NEST, self.LINE_TYPE_START_NEST_WITH_ATTR]: + curr_nest_tags.append(tag) + elif line_type == self.LINE_TYPE_END_NEST: + popped_curr_nest_tag = curr_nest_tags.pop() + assert popped_curr_nest_tag == tag, curr_nest + if len(curr_nest_tags) == 0: + output_nest = True + if output_nest: + nest_dict, _ = self.convert_nest(curr_nest, 0) - self.processing_func(nest_dict) + self.processing_func(nest_dict) - curr_nest = list() - curr_nest_tag = str() + curr_nest = list() + curr_nest_tag = str() - curr_str = "" + curr_str = "" - if curr_str != "": - # divide lines by a space - curr_str += ' ' + if curr_str != "": + # divide lines by a space + curr_str += ' ' class OWLParser(): - def __init__(self, input_files, input_file_names, owl_file_path, output_file_name): - self.XML_TAG = "?xml" - self.RDF_TAG = "rdf:RDF" - self.DOCTYPE_TAG = "!DOCTYPE" - self.CLASS_TAG = "owl:Class" - self.RESTRICTION_TAG = "owl:Restriction" - self.SUBCLASS_TAG = "rdfs:subClassOf" - self.NODEID_TAG = "rdf:nodeID" - self.RDF_ABOUT_TAG = "rdf:about" - self.GENID_PREFIX = "genid" - - self.OWL_SOURCE_KEY = "owl_source" - self.OWL_SOURCE_NAME_KEY = "owl_source_name" - - self.skip_tags = [self.XML_TAG, self.RDF_TAG, self.DOCTYPE_TAG] - - self.ignored_attributes = ["xml:lang"] - - self.xml_parser = 
XMLParser(self.skip_tags, self.ignored_attributes, self.triage_nest_dict) - - self.GENID_REMAINING_NESTS = dict() - self.GENID_TO_ID = dict() - self.ID_TO_GENIDS = dict() - - self.input_files = input_files - self.input_file_names = input_file_names - self.owl_file_path = owl_file_path - self.output_file_name = output_file_name - - self.output_info = kg2_util.create_single_jsonlines() - self.output = self.output_info[0] - - def check_for_class_genids(self, nest_dict): - genids = list() - - nest_dict_classes = nest_dict.get(self.CLASS_TAG, list()) - for nest_class_index in range(len(nest_dict_classes)): - nest_class = nest_dict_classes[nest_class_index] - nest_subclasses = nest_class.get(self.SUBCLASS_TAG, list()) - for nest_subclass_index in range(len(nest_subclasses)): - nest_subclass = nest_subclasses[nest_subclass_index] - potential_genid = nest_subclass.get(self.NODEID_TAG, str()) - if potential_genid.startswith(self.GENID_PREFIX): - genids.append(potential_genid) - - return genids - - - def check_for_restriction_genids(self, nest_dict): - for nest_restriction in nest_dict.get(self.RESTRICTION_TAG, dict()): - potential_genid = nest_restriction.get(self.NODEID_TAG, str()) - if potential_genid.startswith(self.GENID_PREFIX): - return potential_genid - return None - - def extract_class_id(self, nest_dict): - nest_dict_classes = nest_dict.get(self.CLASS_TAG, list()) - # Can't have competing class_ids - assert len(nest_dict_classes) <= 1 - - for nest_class_index in range(len(nest_dict_classes)): - nest_class = nest_dict_classes[nest_class_index] - return nest_class.get(self.RDF_ABOUT_TAG, str()) - - def store_genid_nest_in_class_nest(self, genid, genid_nest, class_nest): - output_class_nest = class_nest - - nest_dict_classes = class_nest.get(self.CLASS_TAG, list()) - for nest_class_index in range(len(nest_dict_classes)): - nest_class = nest_dict_classes[nest_class_index] - nest_subclasses = nest_class.get(self.SUBCLASS_TAG, list()) - for nest_subclass_index in range(len(nest_subclasses)): - nest_subclass = nest_subclasses[nest_subclass_index] - potential_genid = nest_subclass.get(self.NODEID_TAG, str()) - if potential_genid == genid: - output_class_nest[self.CLASS_TAG][nest_class_index][self.SUBCLASS_TAG][nest_subclass_index][self.RESTRICTION_TAG] = genid_nest[self.RESTRICTION_TAG] - - return output_class_nest - - - def write_to_output(self, output_dict, source_file): - output_dict[self.OWL_SOURCE_KEY] = source_file - output_dict[self.OWL_SOURCE_NAME_KEY] = self.input_file_names[source_file] - self.output.write(output_dict) - - return - - - def triage_nest_dict(self, nest_dict): - genids = self.check_for_class_genids(nest_dict) - restriction_genid = self.check_for_restriction_genids(nest_dict) - class_id = self.extract_class_id(nest_dict) - - if len(genids) > 0: - for genid in genids: - self.GENID_TO_ID[genid] = class_id - self.ID_TO_GENIDS[class_id] = genids - self.GENID_REMAINING_NESTS[class_id] = nest_dict - elif restriction_genid is not None: - class_id = self.GENID_TO_ID.get(restriction_genid, str()) - if len(class_id) == 0: - print("WARNING WITH:", restriction_genid, "- NO CLASS_ID FOUND") - - # Save to output despite not matching with an existing class - self.write_to_output(nest_dict, self.input_file) - return - class_nest = self.GENID_REMAINING_NESTS[class_id] - self.ID_TO_GENIDS[class_id].remove(restriction_genid) - updated_class_nest = self.store_genid_nest_in_class_nest(restriction_genid, nest_dict, class_nest) - - if len(self.ID_TO_GENIDS[class_id]) > 0: - 
self.GENID_REMAINING_NESTS[class_id] = updated_class_nest - else: - # Since all of the genids used in this class have been matched, output - self.write_to_output(nest_dict, self.input_file) - self.GENID_REMAINING_NESTS[class_id] = None - else: - # There are no genids that need to be worked with, so just output - self.write_to_output(nest_dict, self.input_file) - - - def parse_OWL_file(self): - for input_file in self.input_files: - self.input_file = input_file - print("Reading:", input_file, "starting at", date()) - self.xml_parser.divide_into_lines(self.owl_file_path + input_file) - - # Genid wasn't filled, still want to include them though - for item in self.GENID_REMAINING_NESTS: - if self.GENID_REMAINING_NESTS[item] != None: - self.write_to_output(self.GENID_REMAINING_NESTS[item], self.input_file) - - # Refresh everything for the next file - self.GENID_REMAINING_NESTS = dict() - self.GENID_TO_ID = dict() - self.ID_TO_GENIDS = dict() - - kg2_util.close_single_jsonlines(self.output_info, self.output_file_name) + def __init__(self, input_files, input_file_names, owl_file_path, output_file_name): + self.XML_TAG = "?xml" + self.RDF_TAG = "rdf:RDF" + self.DOCTYPE_TAG = "!DOCTYPE" + self.CLASS_TAG = "owl:Class" + self.RESTRICTION_TAG = "owl:Restriction" + self.SUBCLASS_TAG = "rdfs:subClassOf" + self.NODEID_TAG = "rdf:nodeID" + self.RDF_ABOUT_TAG = "rdf:about" + self.GENID_PREFIX = "genid" + + self.OWL_SOURCE_KEY = "owl_source" + self.OWL_SOURCE_NAME_KEY = "owl_source_name" + + self.skip_tags = [self.XML_TAG, self.RDF_TAG, self.DOCTYPE_TAG] + + self.ignored_attributes = ["xml:lang"] + + self.xml_parser = XMLParser(self.skip_tags, self.ignored_attributes, self.triage_nest_dict) + + self.GENID_REMAINING_NESTS = dict() + self.GENID_TO_ID = dict() + self.ID_TO_GENIDS = dict() + + self.input_files = input_files + self.input_file_names = input_file_names + self.owl_file_path = owl_file_path + self.output_file_name = output_file_name + + self.output_info = kg2_util.create_single_jsonlines() + self.output = self.output_info[0] + + def check_for_class_genids(self, nest_dict): + genids = list() + + nest_dict_classes = nest_dict.get(self.CLASS_TAG, list()) + for nest_class_index in range(len(nest_dict_classes)): + nest_class = nest_dict_classes[nest_class_index] + nest_subclasses = nest_class.get(self.SUBCLASS_TAG, list()) + for nest_subclass_index in range(len(nest_subclasses)): + nest_subclass = nest_subclasses[nest_subclass_index] + potential_genid = nest_subclass.get(self.NODEID_TAG, str()) + if potential_genid.startswith(self.GENID_PREFIX): + genids.append(potential_genid) + + return genids + + + def check_for_restriction_genids(self, nest_dict): + for nest_restriction in nest_dict.get(self.RESTRICTION_TAG, dict()): + potential_genid = nest_restriction.get(self.NODEID_TAG, str()) + if potential_genid.startswith(self.GENID_PREFIX): + return potential_genid + return None + + def extract_class_id(self, nest_dict): + nest_dict_classes = nest_dict.get(self.CLASS_TAG, list()) + # Can't have competing class_ids + assert len(nest_dict_classes) <= 1 + + for nest_class_index in range(len(nest_dict_classes)): + nest_class = nest_dict_classes[nest_class_index] + return nest_class.get(self.RDF_ABOUT_TAG, str()) + + def store_genid_nest_in_class_nest(self, genid, genid_nest, class_nest): + output_class_nest = class_nest + + nest_dict_classes = class_nest.get(self.CLASS_TAG, list()) + for nest_class_index in range(len(nest_dict_classes)): + nest_class = nest_dict_classes[nest_class_index] + nest_subclasses = 
nest_class.get(self.SUBCLASS_TAG, list()) + for nest_subclass_index in range(len(nest_subclasses)): + nest_subclass = nest_subclasses[nest_subclass_index] + potential_genid = nest_subclass.get(self.NODEID_TAG, str()) + if potential_genid == genid: + output_class_nest[self.CLASS_TAG][nest_class_index][self.SUBCLASS_TAG][nest_subclass_index][self.RESTRICTION_TAG] = genid_nest[self.RESTRICTION_TAG] + + return output_class_nest + + + def write_to_output(self, output_dict, source_file): + output_dict[self.OWL_SOURCE_KEY] = source_file + output_dict[self.OWL_SOURCE_NAME_KEY] = self.input_file_names[source_file] + self.output.write(output_dict) + + return + + + def triage_nest_dict(self, nest_dict): + genids = self.check_for_class_genids(nest_dict) + restriction_genid = self.check_for_restriction_genids(nest_dict) + class_id = self.extract_class_id(nest_dict) + + if len(genids) > 0: + for genid in genids: + self.GENID_TO_ID[genid] = class_id + self.ID_TO_GENIDS[class_id] = genids + self.GENID_REMAINING_NESTS[class_id] = nest_dict + elif restriction_genid is not None: + class_id = self.GENID_TO_ID.get(restriction_genid, str()) + if len(class_id) == 0: + print("WARNING WITH:", restriction_genid, "- NO CLASS_ID FOUND") + + # Save to output despite not matching with an existing class + self.write_to_output(nest_dict, self.input_file) + return + class_nest = self.GENID_REMAINING_NESTS[class_id] + self.ID_TO_GENIDS[class_id].remove(restriction_genid) + updated_class_nest = self.store_genid_nest_in_class_nest(restriction_genid, nest_dict, class_nest) + + if len(self.ID_TO_GENIDS[class_id]) > 0: + self.GENID_REMAINING_NESTS[class_id] = updated_class_nest + else: + # Since all of the genids used in this class have been matched, output + self.write_to_output(nest_dict, self.input_file) + self.GENID_REMAINING_NESTS[class_id] = None + else: + # There are no genids that need to be worked with, so just output + self.write_to_output(nest_dict, self.input_file) + + + def parse_OWL_file(self): + for input_file in self.input_files: + self.input_file = input_file + print("Reading:", input_file, "starting at", date()) + self.xml_parser.divide_into_lines(self.owl_file_path + input_file) + + # Genid wasn't filled, still want to include them though + for item in self.GENID_REMAINING_NESTS: + if self.GENID_REMAINING_NESTS[item] != None: + self.write_to_output(self.GENID_REMAINING_NESTS[item], self.input_file) + + # Refresh everything for the next file + self.GENID_REMAINING_NESTS = dict() + self.GENID_TO_ID = dict() + self.ID_TO_GENIDS = dict() + + kg2_util.close_single_jsonlines(self.output_info, self.output_file_name) def identify_and_download_input_files(ont_load_inventory, path_to_owl_files): - input_files = list() - input_file_names = dict() - owl_file_path = path_to_owl_files.rstrip('/') + "/" - for item in ont_load_inventory: - input_files.append(item['file']) - input_file_names[item['file']] = item['title'] - print("Downloading:", item['file'], "starting at", date()) - kg2_util.download_file_if_not_exist_locally(item['url'], owl_file_path + item['file']) - print("Download of:", item['file'], "finished at", date()) - - return input_files, input_file_names, owl_file_path + input_files = list() + input_file_names = dict() + owl_file_path = path_to_owl_files.rstrip('/') + "/" + for item in ont_load_inventory: + input_files.append(item['file']) + input_file_names[item['file']] = item['title'] + print("Downloading:", item['file'], "starting at", date()) + kg2_util.download_file_if_not_exist_locally(item['url'], 
owl_file_path + item['file']) + print("Download of:", item['file'], "finished at", date()) + + return input_files, input_file_names, owl_file_path if __name__ == '__main__': - args = get_args() - input_file_name = args.inputFile - owl_path = args.owlFilePath - output_file_name = args.outputFile - - ont_load_inventory = kg2_util.safe_load_yaml_from_string(kg2_util.read_file_to_string(input_file_name)) - input_files, input_file_names, owl_file_path = identify_and_download_input_files(ont_load_inventory, owl_path) - - print("Files:", input_files) - print("Start Time:", date()) - owl_parser = OWLParser(input_files, input_file_names, owl_file_path, output_file_name) - owl_parser.parse_OWL_file() - print("End Time:", date()) \ No newline at end of file + args = get_args() + input_file_name = args.inputFile + owl_path = args.owlFilePath + output_file_name = args.outputFile + + ont_load_inventory = kg2_util.safe_load_yaml_from_string(kg2_util.read_file_to_string(input_file_name)) + input_files, input_file_names, owl_file_path = identify_and_download_input_files(ont_load_inventory, owl_path) + + print("Files:", input_files) + print("Start Time:", date()) + owl_parser = OWLParser(input_files, input_file_names, owl_file_path, output_file_name) + owl_parser.parse_OWL_file() + print("End Time:", date()) \ No newline at end of file From 59c6192b9e5391f21fb331bee3106aaed597f2f3 Mon Sep 17 00:00:00 2001 From: ecwood Date: Mon, 2 Sep 2024 02:58:05 -0700 Subject: [PATCH 094/125] #387 comments about the inner workings of ontologies conversion --- convert/ontologies_jsonl_to_kg_jsonl.py | 135 ++++++++++++++++++++++-- 1 file changed, 125 insertions(+), 10 deletions(-) diff --git a/convert/ontologies_jsonl_to_kg_jsonl.py b/convert/ontologies_jsonl_to_kg_jsonl.py index a27561f4..b0adf675 100644 --- a/convert/ontologies_jsonl_to_kg_jsonl.py +++ b/convert/ontologies_jsonl_to_kg_jsonl.py @@ -34,6 +34,7 @@ COMMENT_PREFIX = "COMMENTS: " DESCRIPTION_DELIM = " // " +# Encoding styles for different predicates BASE_EDGE_TYPES = {"mondo-base:exactMatch": RESOURCE_KEY, "mondo-base:closeMatch": RESOURCE_KEY, "mondo-base:relatedMatch": RESOURCE_KEY, @@ -56,25 +57,35 @@ "oboInOwl:hasDbXref": TEXT_KEY, "oboInOwl:xref": TEXT_KEY} +# Mapping structure used to recursively determine node category CLASS_TO_SUPERCLASSES = dict() + +# Node information storage, for while categories are determined SAVED_NODE_INFO = dict() + +# Storage for source information SOURCE_INFO = dict() +# Used to store the category of nodes both from curies-to-categories.yaml and as they are recursively mapped NODE_CATEGORY_MAPPINGS = dict() -PREFIX_MAPPINGS = dict() -CLASSES_DICT = dict() +# Used to store the prefix mappings from curies-to-categories.yaml +PREFIX_MAPPINGS = dict() +# Used to store extracted information from curies-to-urls-map.yaml URI_MAP = dict() URI_MAP_KEYS = list() PREFIX_TO_IRI_MAP = dict() +# Prefixes for owl:Class elements that were unable to be mapped with curies-to-urls-map.yaml MISSING_ID_PREFIXES = set() +# Category mapping techniques FILE_MAPPING = "file" PREFIX_MAPPING = "prefix" RECURSE_MAPPING = "recurse" +# Keys for saving node and edges information between its initial processing and node/edge creation ID_KEY = "id" DEPRECATED_KEY = "deprecated" UPDATE_DATE_KEY = "update_date" @@ -89,6 +100,7 @@ IRI_KEY = "iri" VERSION_KEY = "version" + def get_args(): arg_parser = argparse.ArgumentParser() arg_parser.add_argument('--test', dest='test', @@ -100,26 +112,42 @@ def get_args(): arg_parser.add_argument('outputEdgesFile', type=str) 
return arg_parser.parse_args() + def categorize_node(node_id, recursion_depth=0): + """ + Recursively navigate the hierarchy of node superclasses to identify the optimal categorization for a node. + If a particular category for a node is desired, classify it as such within curies-to-categories.yaml. + """ + # First, retrieve the node prefix node_prefix = node_id.split(':')[0] + # If the node is directly mapped in curies-to-categories.yaml, utilize that mapping + # The [1] field of NODE_CATEGORY_MAPPINGS[node_id] refers to the way that node was mapped if node_id in NODE_CATEGORY_MAPPINGS and NODE_CATEGORY_MAPPINGS[node_id][1] == FILE_MAPPING: return NODE_CATEGORY_MAPPINGS[node_id][0] + # If the node isn't in curies-to-categories.yaml, but its prefix is, use that mapping if node_prefix in PREFIX_MAPPINGS: node_category = PREFIX_MAPPINGS[node_prefix] NODE_CATEGORY_MAPPINGS[node_id] = (node_category, PREFIX_MAPPING) return PREFIX_MAPPINGS[node_prefix] - # Get try to get the most common superclass categorization + # Try to get the most common superclass categorization (naive method for picking category of nodes with multiple superclasses) + # Initialize the category as named thing as a default superclass_categorizations = dict() highest_value = 0 highest_category = kg2_util.BIOLINK_CATEGORY_NAMED_THING + + # To avoid unnecessary recursion, stop at 10 layers (prevents errors in subclass cycles) if recursion_depth == 10: return kg2_util.BIOLINK_CATEGORY_NAMED_THING + # Perform the recursive mapping search for all of the node's superclasses for superclass in CLASS_TO_SUPERCLASSES.get(node_id, list()): + # First, recurse superclass_category = categorize_node(superclass, recursion_depth + 1) + + # Then, determine the optimal categorization for the node based on naive category determination method if superclass_category not in superclass_categorizations: superclass_categorizations[superclass_category] = 0 superclass_categorizations[superclass_category] += 1 @@ -127,10 +155,15 @@ def categorize_node(node_id, recursion_depth=0): highest_value = superclass_categorizations[superclass_category] highest_category = superclass_category + # Save the categorization NODE_CATEGORY_MAPPINGS[node_id] = (highest_category, RECURSE_MAPPING) return highest_category + def reformat_obo_date(date_str): + """ + Reformat a date from an OWL field and save it as a datetime object for comparison purposes, handling a variety of input date styles + """ if date_str is None: return None @@ -168,7 +201,11 @@ def reformat_obo_date(date_str): return datetime.datetime(year, month, day) + def pick_most_recent_date(dates, alternate_date=None): + """ + Given a list of datetime objects, determine the most recent one + """ latest_date = None for date in dates: if date == None: @@ -184,10 +221,17 @@ def pick_most_recent_date(dates, alternate_date=None): return latest_date.isoformat(sep=' ') + def process_ontology_term(ontology_node, source, ontology_name, owl_source=True): + """ + Given an owl:Ontology (or analogous) element, determine all of the relevant attributes to construct a source node + """ + # Only use the owl prefix on terms if it is an owl_source (i.e., not ORDO) owl_prefix = "" if owl_source: owl_prefix = "owl:" + + # Determine the version of the ontology through one of the three encoding methods (version number, version IRI, or date) ontology_version = None ontology_versions = [version.get(TEXT_KEY, str()) for version in ontology_node.get(owl_prefix + "versionInfo", list()) if TEXT_KEY in version] ontology_version_iri = 
[version.get(RESOURCE_KEY, str()) for version in ontology_node.get(owl_prefix + "versionIRI", list()) if RESOURCE_KEY in version]
@@ -196,6 +240,7 @@ def process_ontology_term(ontology_node, source, ontology_name, owl_source=True)
     if len(ontology_versions) == 1:
         ontology_version = ontology_versions[0]
     elif len(ontology_version_iri) == 1:
+        # Strip the version number out of the IRI
         ontology_version = ontology_version_iri[0]
         version_replacements = [ontology_iri.replace('.owl', '') + '/', '/' + source, 'releases/']
         for replacement in version_replacements:
@@ -204,22 +249,33 @@ def process_ontology_term(ontology_node, source, ontology_name, owl_source=True)
     elif len(ontology_dates) >= 1:
         ontology_version = pick_most_recent_date(ontology_dates)
 
+    # Issue a warning if there is no versioning information
     if ontology_version is None:
         print("Warning: source", source, "lacks any versioning information.")
 
     ontology_date = reformat_obo_date(pick_most_recent_date(ontology_dates))
+
+    # Convert the source file name to a CURIE ID
     source_id = kg2_util.CURIE_PREFIX_OBO + ':' + source
 
+    # Add the source information to the SOURCE_INFO dictionary to later be made into a source node
     if source not in SOURCE_INFO:
         SOURCE_INFO[source] = {SOURCE_KEY: source_id, IRI_KEY: ontology_iri, NAME_KEY: ontology_name, UPDATE_DATE_KEY: ontology_date, VERSION_KEY: ontology_version}
 
 
 def process_ontology_class(owl_class, source, ontology_name, owl_source=True):
+    """
+    Given an owl:Class (or analogous) element, strip out all of the relevant data to construct the node and edges
+    """
+    # Only use the owl prefix on terms if it is an owl_source (i.e., not ORDO)
     owl_prefix = ""
     if owl_source:
         owl_prefix = "owl:"
-    # Typically genid classes which don't neatly map onto the KG2 schema
+
+    # Configure the node_id and node_iri
+    # We only want to construct nodes for standard nodes that fit into KG2 mappings
     if ID_TAG not in owl_class:
+        # These are typically genid classes which don't neatly map onto the KG2 schema
         return
     node_id = match_prefix(owl_class.get(ID_TAG, str()))
     if node_id is None:
@@ -229,6 +285,7 @@ def process_ontology_class(owl_class, source, ontology_name, owl_source=True):
 
     # Configure the name
     name_list = [name.get(TEXT_KEY, None) for name in owl_class.get("rdfs:label", dict()) if TEXT_KEY in name]
+    # Return if the node has no names
    if len(name_list) == 0:
         return
 
@@ -240,6 +297,7 @@ def process_ontology_class(owl_class, source, ontology_name, owl_source=True):
     description_list += [description.get(TEXT_KEY, None) for description in owl_class.get("obo:UBPROP_0000005", list()) if (TEXT_KEY in description)]
     description_list += [description.get(TEXT_KEY, None) for description in owl_class.get("efo1:source_description", list()) if (TEXT_KEY in description)]
 
+    # Determine whether the node is deprecated
     deprecated = "true" in owl_class.get(owl_prefix + "deprecated", list())
     for name in name_list:
         search_name = name.lower()
@@ -253,15 +311,18 @@ def process_ontology_class(owl_class, source, ontology_name, owl_source=True):
                     "obo:IAO_0000028",
                     "skos:prefLabel"]
     synonym_list += [synonym.get(TEXT_KEY, None) for synonym_key in synonym_keys for synonym in owl_class.get(synonym_key, list()) if (TEXT_KEY in synonym)]
 
+    # Configure the update date
     update_date_list = list()
     update_date_keys = ["dc:date", "dcterms:date", "terms:date"]
     update_date_list += [reformat_obo_date(update_date.get(TEXT_KEY, None)) for update_date_key in update_date_keys for update_date in owl_class.get(update_date_key, list()) if (TEXT_KEY in update_date)]
 
+    # Configure the creation date
     creation_date_list = list()
     creation_date_keys = ["oboInOwl:creation_date", "go:creation_date"]
     creation_date_list += [reformat_obo_date(creation_date.get(TEXT_KEY, None)) for creation_date_key in creation_date_keys for creation_date in owl_class.get(creation_date_key, list()) if (TEXT_KEY in creation_date)]
 
     # Configure the biological sequence
+    # We are only currently using inchi, but we might as well extract all of them in case this changes in the future
     has_biological_sequence = dict()
     has_biological_sequence['formula'] = [biological_sequence.get(TEXT_KEY, None) for biological_sequence in owl_class.get("chebi:formula", list()) if TEXT_KEY in biological_sequence]
     has_biological_sequence['smiles'] = [biological_sequence.get(TEXT_KEY, None) for biological_sequence in owl_class.get("chebi:smiles", list()) if TEXT_KEY in biological_sequence]
@@ -271,12 +332,13 @@ def process_ontology_class(owl_class, source, ontology_name, owl_source=True):
 
     # Extract edge triples
     edges_list = list()
+    # First, extract the edges with predicates that easily map to extraction patterns
     for edge_type in BASE_EDGE_TYPES:
         for edge in owl_class.get(edge_type, list()):
             if BASE_EDGE_TYPES[edge_type] in edge:
                 edges_list.append((edge_type, edge.get(BASE_EDGE_TYPES[edge_type], None)))
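+    # (Illustrative sketch of the two extraction patterns above -- invented values,
+    #  not taken from a real ontology: an element parsed from
+    #    <oboInOwl:hasDbXref>MESH:D000001</oboInOwl:hasDbXref>
+    #  is picked up through its TEXT_KEY as ("oboInOwl:hasDbXref", "MESH:D000001"),
+    #  while <mondo-base:exactMatch rdf:resource="..."/> is picked up through RESOURCE_KEY.)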
 
-
+    # Next, identify the edges which are tightly nested under a layer of other information
     restriction_edges = list()
     restriction_edges += [(edge, "rdfs:subClassOf") for edge in owl_class.get("rdfs:subClassOf", list())]
     for equiv in owl_class.get(owl_prefix + "equivalentClass", list()):
@@ -284,10 +346,14 @@ def process_ontology_class(owl_class, source, ontology_name, owl_source=True):
         for edge in mini_class.get(owl_prefix + "intersectionOf", list()):
             restriction_edges.append((edge, owl_prefix + "equivalentClass"))
 
+    # Then, extract the actual information from those edges
     for (edge, general_edge_type) in restriction_edges:
+        # First, handle those with the restriction elements (the owl:EquivalentClass and rdfs:subClassOf sub-predicate cases)
         for restriction in edge.get(owl_prefix + "Restriction", list()):
             edge_type = restriction.get(owl_prefix + "onProperty", list())
             edge_object = restriction.get(owl_prefix + "someValuesFrom", list())
+
+            # Ensure each of those lists has only one item, so that we can pull item [0] in the next step to correctly identify the respective information
             if len(edge_type) != 1:
                 assert len(edge_type) <= 1, edge
                 continue
@@ -300,31 +366,41 @@ def process_ontology_class(owl_class, source, ontology_name, owl_source=True):
             if edge_type != None and edge_object != None:
                 edges_list.append((edge_type, edge_object))
 
+        # Then handle the generic rdfs:subClassOf case
         if RESOURCE_KEY in edge:
             edges_list.append((general_edge_type, edge.get(RESOURCE_KEY, None)))
 
+    # Convert the edge relations and objects into CURIEs so they can later be turned into KG2 edges
     superclasses = set()
     final_edges_list = list()
     for (edge_relation, edge_object) in edges_list:
+        # Ensure the edge corresponds to a KG2 mapping
         edge_object = match_prefix(edge_object)
         if edge_object is None:
             continue
         edge_relation = match_prefix(edge_relation)
         if edge_relation is None:
             continue
+
+        # Identify superclass relationships
         if edge_relation in ["rdfs:subClassOf"]:
             superclasses.add(edge_object)
+
+        # Add the processed edge to the list of this node's edges
         final_edges_list.append((edge_relation, edge_object))
 
-    # Imperfect way to make it deterministic
+    # Formally save the superclass relations to the superclass hierarchy
+    # This is an imperfect way to make it 
deterministic; We don't want duplicate superclasses, but we also want the order to remain the same across runs superclasses = sorted(list(superclasses)) if node_id not in CLASS_TO_SUPERCLASSES: CLASS_TO_SUPERCLASSES[node_id] = list() CLASS_TO_SUPERCLASSES[node_id] += superclasses CLASS_TO_SUPERCLASSES[node_id] = sorted(list(set(CLASS_TO_SUPERCLASSES[node_id]))) + # Make sure that we have an entry for the node (since multiple sources can have information on a particular node) if node_id not in SAVED_NODE_INFO: SAVED_NODE_INFO[node_id] = list() + # Save this source's version of the node information SAVED_NODE_INFO[node_id].append({ID_KEY: node_id, DEPRECATED_KEY: deprecated, UPDATE_DATE_KEY: update_date_list, @@ -337,34 +413,46 @@ def process_ontology_class(owl_class, source, ontology_name, owl_source=True): IRI_KEY: node_iri, EDGES_KEY: final_edges_list}) + def process_ontology_item(ontology_item): + """ + Handler for processing ontology subsets + """ + # Extract these custom input attributes (parts of ont-load-inventory.yaml put into owlparser.py output) source = ontology_item.get(OWL_SOURCE_KEY, str()) ontology_name = ontology_item.get(OWL_SOURCE_NAME_KEY, str()) for owl_class in ontology_item.get("owl:Class", list()): process_ontology_class(owl_class, source, ontology_name) + # Special case for non-owl prefix sources (e.g. ORDO) for owl_class in ontology_item.get("Class", list()): process_ontology_class(owl_class, source, ontology_name, False) for ontology_node in ontology_item.get("owl:Ontology", list()): process_ontology_term(ontology_node, source, ontology_name) - # Because of ORDO + # Special case for non-owl prefix sources (e.g. ORDO) for ontology_node in ontology_item.get("Ontology", list()): process_ontology_term(ontology_node, source, ontology_name, False) + def generate_uri_map(curies_to_urls_file_name): + """ + Import the curies-to-urls-map.yaml for use in CURIE ID and IRI resolution + """ uri_input_map = kg2_util.safe_load_yaml_from_string(kg2_util.read_file_to_string(curies_to_urls_file_name)) bidirectional_map = uri_input_map['use_for_bidirectional_mapping'] contraction_map = uri_input_map['use_for_contraction_only'] + # Import the bidirectional map for both ID mapping (URI_MAP) and IRI expansion, given the standard prefix (PREFIX_TO_IRI_MAP) for curie_prefix_dict in bidirectional_map: for curie_prefix in curie_prefix_dict: curie_url = curie_prefix_dict[curie_prefix] URI_MAP[curie_url] = curie_prefix PREFIX_TO_IRI_MAP[curie_prefix] = curie_url + # Import the contraction map for ID mapping (URI_MAP) for curie_prefix_dict in contraction_map: for curie_prefix in curie_prefix_dict: curie_url = curie_prefix_dict[curie_prefix] @@ -375,11 +463,17 @@ def generate_uri_map(curies_to_urls_file_name): global URI_MAP_KEYS URI_MAP_KEYS = sorted(URI_MAP.keys(), key=len, reverse=True) + def match_prefix(node_id): + """ + Given a node_id from an ontology (possibly actually an IRI), return the KG2-standard CURIE ID for the node + """ + # Iterate through the map keys, comparing with the longest urls first (for the most accurate match) for curie_url in URI_MAP_KEYS: if node_id.startswith(curie_url): return node_id.replace(curie_url, URI_MAP[curie_url] + ":") + # If there is no match, attempt to distill down the ID into just the prefix (not always possible) and add it to the list of prefixes not in KG2 if "http" in node_id: MISSING_ID_PREFIXES.add('/'.join(node_id.split('/')[0:-1]) + "/") elif ':' in node_id: @@ -389,7 +483,12 @@ def match_prefix(node_id): else: MISSING_ID_PREFIXES.add(node_id) 
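+
+# A minimal sketch of the intended behavior of match_prefix (hypothetical IRIs,
+# assuming "GO" is one of the prefixes in curies-to-urls-map.yaml; not output
+# from a real run):
+#   match_prefix("http://purl.obolibrary.org/obo/GO_0008150") -> "GO:0008150"
+#   match_prefix("http://example.org/unmapped/term/123") -> None
+# In the unmapped case, "http://example.org/unmapped/term/" is recorded in
+# MISSING_ID_PREFIXES.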
+
 def construct_nodes_and_edges(nodes_output, edges_output):
+    """
+    Output the nodes and edges from the ontologies once the node information has been extracted and categories have been assigned
+    """
+    # Construct all of the source nodes
     for source in SOURCE_INFO:
         source_date = pick_most_recent_date([SOURCE_INFO[source][UPDATE_DATE_KEY]])
         source_name = SOURCE_INFO[source][NAME_KEY] + " v" + SOURCE_INFO[source][VERSION_KEY]
@@ -399,11 +498,14 @@ def construct_nodes_and_edges(nodes_output, edges_output):
 
         nodes_output.write(node)
 
-
+    # Construct the regular nodes and edges
     for node_id in SAVED_NODE_INFO:
+        # Iterate across all of the sources which have defined this node
         for source_node_index in range(len(SAVED_NODE_INFO[node_id])):
+            # Ignore deprecated nodes
             if SAVED_NODE_INFO[node_id][source_node_index][DEPRECATED_KEY]:
                 continue
 
+            # Extract all of the information from the SAVED_NODE_INFO dictionary
             name = SAVED_NODE_INFO[node_id][source_node_index][NAME_KEY][0] # Imperfect way of choosing the name
             node_iri = SAVED_NODE_INFO[node_id][source_node_index][IRI_KEY]
             description = DESCRIPTION_DELIM.join(SAVED_NODE_INFO[node_id][source_node_index][DESCRIPTION_KEY])
@@ -411,21 +513,26 @@ def construct_nodes_and_edges(nodes_output, edges_output):
             synonyms = SAVED_NODE_INFO[node_id][source_node_index][SYNONYM_KEY]
             category = SAVED_NODE_INFO[node_id][source_node_index][CATEGORY_KEY]
 
+            # Obtain source information
             source = SAVED_NODE_INFO[node_id][source_node_index][SOURCE_KEY]
             provided_by = kg2_util.CURIE_PREFIX_OBO + ':' + source
             source_date = SOURCE_INFO[source][UPDATE_DATE_KEY]
 
+            # Determine the node's dates
             update_date = pick_most_recent_date(SAVED_NODE_INFO[node_id][source_node_index][UPDATE_DATE_KEY], source_date)
             creation_date = pick_most_recent_date(SAVED_NODE_INFO[node_id][source_node_index][CREATION_DATE_KEY], source_date)
 
+            # Construct the node and add in the other attributes
             node = kg2_util.make_node(node_id, node_iri, name, category, update_date, provided_by)
             node["description"] = description
             node["has_biological_sequence"] = has_biological_sequence
             node["creation_date"] = creation_date
             node["synonym"] = synonyms
 
+            # Output the node
             nodes_output.write(node)
 
+            # Construct the edges from the triples saved
             for (edge_relation, edge_object) in SAVED_NODE_INFO[node_id][source_node_index][EDGES_KEY]:
                 relation_label = edge_relation.split(':')[1]
                 edge = kg2_util.make_edge(node_id, edge_object, edge_relation, relation_label, provided_by, update_date)
@@ -433,9 +540,10 @@ def construct_nodes_and_edges(nodes_output, edges_output):
 
                 edges_output.write(edge)
 
-
 if __name__ == '__main__':
     print("Start time: ", kg2_util.date())
+
+    # Obtain all of the input arguments
     args = get_args()
     input_file_name = args.inputFile
     curies_to_categories_file_name = args.curiesToCategoriesYAML
@@ -444,30 +552,37 @@ def construct_nodes_and_edges(nodes_output, edges_output):
     output_edges_file_name = args.outputEdgesFile
     test_mode = args.test
 
+    # Create the output files
     nodes_info, edges_info = kg2_util.create_kg2_jsonlines(test_mode)
     nodes_output = nodes_info[0]
     edges_output = edges_info[0]
 
+    # Prepare the node category dictionaries with the information from curies-to-categories.yaml
     curies_to_categories_data = kg2_util.safe_load_yaml_from_string(kg2_util.read_file_to_string(curies_to_categories_file_name))
     for mapping_node in curies_to_categories_data["term-mappings"]:
         NODE_CATEGORY_MAPPINGS[mapping_node] = (curies_to_categories_data["term-mappings"][mapping_node], FILE_MAPPING)
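+    # (A rough sketch of the expected curies-to-categories.yaml shape, with invented
+    #  entries where "FOO" is a hypothetical prefix and each value is a category label:
+    #    term-mappings:
+    #      FOO:0000001: biological process
+    #    prefix-mappings:
+    #      FOO: named thing
+    #  Term mappings win over prefix mappings during categorization.)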
     for prefix in curies_to_categories_data["prefix-mappings"]:
         PREFIX_MAPPINGS[prefix] = curies_to_categories_data["prefix-mappings"][prefix]
 
+    # Begin reading the JSON Lines input file containing all of the ontologies
     input_read_jsonlines_info = kg2_util.start_read_jsonlines(input_file_name)
     input_data = input_read_jsonlines_info[0]
 
-    ontology_prefixes = set()
+    # Prepare the URI maps for mapping ontology information to KG2 CURIE IDs and IRIs
     generate_uri_map(curies_to_urls_file_name)
+
+    # Extract all of the necessary information from the ontologies
     for ontology_item in input_data:
         process_ontology_item(ontology_item)
 
+    # Categorize every node and save the result in each node's information dictionary
     for node_id in SAVED_NODE_INFO:
         categorize_node(node_id)
         node_category = NODE_CATEGORY_MAPPINGS[node_id][0]
         for index in range(len(SAVED_NODE_INFO[node_id])):
             SAVED_NODE_INFO[node_id][index][CATEGORY_KEY] = node_category
 
+    # Save all of the node and edge information in KG2 format
     construct_nodes_and_edges(nodes_output, edges_output)
 
     kg2_util.close_kg2_jsonlines(nodes_info, edges_info, output_nodes_file_name, output_edges_file_name)

From b0850552d01a9ac2d93ef7b6a8ead23aba16dc7c Mon Sep 17 00:00:00 2001
From: ecwood
Date: Mon, 2 Sep 2024 03:00:26 -0700
Subject: [PATCH 095/125] #387 archiving multi ont

---
 convert/{ => archive}/multi_ont_to_kg_jsonl.py | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename convert/{ => archive}/multi_ont_to_kg_jsonl.py (100%)

diff --git a/convert/multi_ont_to_kg_jsonl.py b/convert/archive/multi_ont_to_kg_jsonl.py
similarity index 100%
rename from convert/multi_ont_to_kg_jsonl.py
rename to convert/archive/multi_ont_to_kg_jsonl.py

From 321981cc505cb3d71abc740394693ed553230d7d Mon Sep 17 00:00:00 2001
From: ecwood
Date: Mon, 2 Sep 2024 03:00:59 -0700
Subject: [PATCH 096/125] #387 archiving build multi ont

---
 convert/{ => archive}/build-multi-ont-kg.sh | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename convert/{ => archive}/build-multi-ont-kg.sh (100%)

diff --git a/convert/build-multi-ont-kg.sh b/convert/archive/build-multi-ont-kg.sh
similarity index 100%
rename from convert/build-multi-ont-kg.sh
rename to convert/archive/build-multi-ont-kg.sh

From cea05b7b9645fe02636db2e1c8fa562cfabf92f9 Mon Sep 17 00:00:00 2001
From: ecwood
Date: Mon, 2 Sep 2024 03:02:40 -0700
Subject: [PATCH 097/125] updating executability for newer files

---
 convert/clinicaltrialskg_tsv_to_kg_jsonl.py | 0
 convert/ontologies_jsonl_to_kg_jsonl.py     | 0
 convert/umls_list_jsonl_to_kg_jsonl.py      | 0
 extract/extract-clinicaltrialskg.sh         | 0
 4 files changed, 0 insertions(+), 0 deletions(-)
 mode change 100644 => 100755 convert/clinicaltrialskg_tsv_to_kg_jsonl.py
 mode change 100644 => 100755 convert/ontologies_jsonl_to_kg_jsonl.py
 mode change 100644 => 100755 convert/umls_list_jsonl_to_kg_jsonl.py
 mode change 100644 => 100755 extract/extract-clinicaltrialskg.sh

diff --git a/convert/clinicaltrialskg_tsv_to_kg_jsonl.py b/convert/clinicaltrialskg_tsv_to_kg_jsonl.py
old mode 100644
new mode 100755
diff --git a/convert/ontologies_jsonl_to_kg_jsonl.py b/convert/ontologies_jsonl_to_kg_jsonl.py
old mode 100644
new mode 100755
diff --git a/convert/umls_list_jsonl_to_kg_jsonl.py b/convert/umls_list_jsonl_to_kg_jsonl.py
old mode 100644
new mode 100755
diff --git a/extract/extract-clinicaltrialskg.sh b/extract/extract-clinicaltrialskg.sh
old mode 100644
new mode 100755

From fd482ba7285c01da769d51a5b1db9b321bd08870 Mon Sep 17 00:00:00 2001
From: ecwood
Date: Mon, 2 Sep 2024 04:24:18 -0700
Subject: [PATCH 098/125] #387 comments through owlparser

---
 extract/owlparser.py | 208 ++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 194 insertions(+), 14 deletions(-)

diff --git a/extract/owlparser.py b/extract/owlparser.py
index fe540f3b..418bf0d9 100644
--- a/extract/owlparser.py
+++ b/extract/owlparser.py
@@ -28,10 +28,15 @@ def get_args():
     arg_parser.add_argument('outputFile', type=str)
     return arg_parser.parse_args()
 
+
 def date():
     return datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
 
+
 class LineElementRead():
+    """
+    Custom enum for identifying which element is currently being read in an XML Line
+    """
     NONE = 0
     TAG = 1
     ATTRIBUTE_TAG = 2
@@ -39,13 +44,21 @@ class LineElementRead():
     MAIN = 4
     END_TAG = 5
 
+
 class XMLParser():
+    """
+    General XML to JSON Lines parser optimized for XML consisting of many short nests
+    """
     def __init__(self, skip_tags, ignored_attributes, processing_func):
+        # Defining the types of lines which will be skipped by the processor
         self.COMMENT = "!--"
-        self.OUTMOST_TAGS_SKIP = skip_tags
+        self.OUTMOST_TAGS_SKIP = skip_tags # To avoid one large JSON Line, the outmost tags should be skipped
         self.IGNORED_ATTRIBUTES = ignored_attributes
+
+        # Function for processing each nest
         self.processing_func = processing_func
 
+        # Line categorization labels
         self.LINE_TYPE_IGNORE = "ignore"
         self.LINE_TYPE_START_NEST = "start nest"
         self.LINE_TYPE_START_NEST_WITH_ATTR = "start nest with attributes"
@@ -54,6 +67,7 @@ def __init__(self, skip_tags, ignored_attributes, processing_func):
         self.LINE_TYPE_ENTRY_ONLY_ATTR = "entry with only attributes"
         self.LINE_TYPE_END_NEST = "end nest"
 
+        # Processing labels for components of each line
         self.KEY_TAG = "tag"
         self.KEY_ATTRIBUTES = "attributes"
         self.KEY_TEXT = "ENTRY_TEXT"
@@ -74,11 +88,16 @@ def __init__(self, skip_tags, ignored_attributes, processing_func):
         self.prev_letter = ""
         self.type_to_read = LineElementRead.NONE
 
+
     def categorize_line(self):
+        """
+        Logic for determining which type of line is being processed based on which elements it contains
+        """
         # Categorize the type of line
         line_type = str()
         out = dict()
 
+        # If it is one of these first line types, skip it
         # Putting "only_tag" here isn't necessarily the best idea, but I don't know what else to do with it
         if self.tag == self.COMMENT or self.tag in self.OUTMOST_TAGS_SKIP or self.end_tag in self.OUTMOST_TAGS_SKIP or self.only_tag:
             line_type = self.LINE_TYPE_IGNORE
@@ -91,34 +110,45 @@ def categorize_line(self):
             if start_tag_exists:
                 if attributes_exist:
                     if text_exists:
+                        # This type of line has everything
                         line_type = self.LINE_TYPE_ENTRY_WITH_ATTR
                         out[self.KEY_TAG] = self.tag
                         out[self.KEY_ATTRIBUTES] = self.attributes
                         out[self.KEY_TEXT] = self.main_text
                     elif end_tag_exists:
+                        # This type of line acts as an entry, but doesn't have text. There is not another end_tag coming for it.
line_type = self.LINE_TYPE_ENTRY_ONLY_ATTR out[self.KEY_TAG] = self.tag out[self.KEY_ATTRIBUTES] = self.attributes else: + # This type of line does not have an entry and acts as the start of an inner nest line_type = self.LINE_TYPE_START_NEST_WITH_ATTR out[self.KEY_TAG] = self.tag out[self.KEY_ATTRIBUTES] = self.attributes elif text_exists: + # This type of line does not have attributes and only contains an entry line_type = self.LINE_TYPE_ENTRY out[self.KEY_TAG] = self.tag out[self.KEY_TEXT] = self.main_text else: + # This type of line is only starting a nest and does not contain any of its own information line_type = self.LINE_TYPE_START_NEST out[self.KEY_TAG] = self.tag elif end_tag_exists: + # This type of line ends a started nest line_type = self.LINE_TYPE_END_NEST out[self.KEY_TAG] = self.end_tag + # Assign the key type based on the determined line type out[self.KEY_TYPE] = line_type return out + def get_letters(self, letter_index): + """ + Get the current letter, previous letter, and next letter in the line and count the brackets status (in case there are brackets inside of brackets) + """ self.letter = self.line[letter_index] self.next_letter = "" self.prev_letter = "" @@ -134,6 +164,9 @@ def get_letters(self, letter_index): def identify_tag_type(self, letter_index): + """ + Depending on the presence of a "/" character, determine whether this is an end tag + """ changed = True if self.letter == '<' and letter_index == 0: @@ -148,8 +181,13 @@ def identify_tag_type(self, letter_index): def read_tag(self): + """ + Determine the tag of an XML line + """ changed = True + # Once you hit a space or bracket, switch to the next type of line element + # If not, keep adding to the tag if self.letter == ' ' and self.type_to_read == LineElementRead.TAG: self.type_to_read = LineElementRead.ATTRIBUTE_TAG elif self.letter == '>' and self.type_to_read == LineElementRead.TAG and self.start_brackets == 0: @@ -167,16 +205,28 @@ def read_tag(self): def store_attribute(self): + """ + Clean and save an attribute for later processing + """ + # Only save desired attributes if self.attribute_tag not in self.IGNORED_ATTRIBUTES: self.attributes[self.attribute_tag] = self.attribute_text.strip('/').strip('"') + + # Reset our attribute trackers self.attribute_tag = "" self.attribute_text = "" def read_attributes(self): + """ + Determine the attributes of an XML line + """ changed = True + + # Identify whether it is time to process the attributes of the line start_reading_attributes = (self.type_to_read == LineElementRead.ATTRIBUTE_TAG or self.type_to_read == LineElementRead.ATTRIBUTE_TEXT) + # At the end of the attributes section, save the attributes and switch to the text portion of the line if self.letter == '>' and start_reading_attributes and self.start_brackets == 0: self.type_to_read = LineElementRead.MAIN @@ -184,6 +234,7 @@ def read_attributes(self): if self.prev_letter == '/': self.end_tag = self.tag + # Otherwise, read the correct part of the line and switch parts based on the delimiter ('=' and ' ') elif start_reading_attributes: if self.letter == '=' and self.type_to_read == LineElementRead.ATTRIBUTE_TAG: self.type_to_read = LineElementRead.ATTRIBUTE_TEXT @@ -201,7 +252,12 @@ def read_attributes(self): def read_main(self): + """ + Determine the main textual entry of an XML line + """ changed = True + + # Stop reading and switch to reading the end tag once you hit a start bracket if self.letter == '<' and self.type_to_read == LineElementRead.MAIN: self.type_to_read = LineElementRead.END_TAG elif 
self.type_to_read == LineElementRead.MAIN: @@ -213,9 +269,15 @@ def read_main(self): def read_end_tag(self): + """ + Determine the end tag of an XML line + """ changed = True + + # Stop once you've reached the end of the line if self.letter == '>' and self.type_to_read == LineElementRead.END_TAG and self.start_brackets == 0: pass + # Otherwise, add to the end tag elif self.type_to_read == LineElementRead.END_TAG: self.end_tag += self.letter else: @@ -225,153 +287,214 @@ def read_end_tag(self): def convert_line(self): + """ + Using a streaming reading technique, convert a line into its tag, attributes, text, and type + """ + # Initialize all of the line elements for the new line self.tag = "" self.attributes = dict() self.attribute_tag = "" self.attribute_text = "" self.main_text = "" self.end_tag = "" - self.type_to_read = LineElementRead.NONE - self.only_tag = False - self.start_brackets = 0 + # Read the line letter by letter for letter_index in range(len(self.line)): + # Get the letters required for analysis regardless of the element type self.get_letters(letter_index) - # First < + # Start by determining if it is a start or end tag if self.identify_tag_type(letter_index): + # If this was the work done on this letter, move to the next continue + # Determine the tag of the line if self.read_tag(): + # If this was the work done on this letter, move to the next continue + # Determine the attributes of the line (if applicable) if self.read_attributes(): + # If this was the work done on this letter, move to the next continue + # Determine the main text given in the line (if applicable) if self.read_main(): + # If this was the work done on this letter, move to the next continue + # Determine the end tag of the line (if applicable) if self.read_end_tag(): + # If this was the work done on this letter, move to the next continue + # Categorize the line based on the saved characteristics return self.categorize_line() def convert_nest(self, nest, start_index): + """ + Recursively convert the set of lines (from the first start tag to its pairing end tag) into a dictionary (nested as necessary) + """ + # Initialize the current dictionary in the nest nest_dict = dict() + + # Start at the given index curr_index = start_index + # Iterate linearly (without repeats) through every element in the nest while curr_index < len(nest): + # Get the basic characteristics of the nest element element = nest[curr_index] line_type = element[self.KEY_TYPE] line_tag = element[self.KEY_TAG] line_text = element.get(self.KEY_TEXT, None) line_attributes = element.get(self.KEY_ATTRIBUTES, None) + # If we are starting a new nest, we need to recurse if line_type in [self.LINE_TYPE_START_NEST, self.LINE_TYPE_START_NEST_WITH_ATTR]: + # Initialize every element to a list to simplify later processing (so we don't have to deal with some entries being strings and some being lists later) if line_tag not in nest_dict: nest_dict[line_tag] = list() + # Recurse to build the inner dictionary converted_nest, ret_index = self.convert_nest(nest, curr_index + 1) + # If we have line attributes, we need to save them in the dictionary if line_attributes is not None: for attribute in line_attributes: converted_nest[attribute] = line_attributes[attribute] + # Add this converted nest to the overall list nest_dict[line_tag].append(converted_nest) + # Set the new index to prevent duplication curr_index = ret_index + 1 continue + # If we're not starting a new nest, process additively if line_type in [self.LINE_TYPE_ENTRY, self.LINE_TYPE_ENTRY_WITH_ATTR,
self.LINE_TYPE_ENTRY_ONLY_ATTR]: + # Initialize every element to a list to simplify later processing (so we don't have to deal with some entries being strings and some being lists later) if line_tag not in nest_dict: nest_dict[line_tag] = list() curr_dict = dict() + # If we have line text, we need to save it in the dictionary if line_text is not None: curr_dict[self.KEY_TEXT] = line_text + # If we have line attributes, we need to save them in the dictionary if line_attributes is not None: for attribute in line_attributes: curr_dict[attribute] = line_attributes[attribute] + # Add this converted nest to the overall list nest_dict[line_tag].append(curr_dict) + # Move to the next element curr_index += 1 continue + # Recursive base case, to exit the nest building when we hit the end of a nest if line_type in [self.LINE_TYPE_END_NEST]: return nest_dict, curr_index + # Once we reach the end, we need to return the nest return nest_dict, curr_index def divide_into_lines(self, input_file_name): + """ + Split a given XML file into sets of lines, each representing a nest (at a given level within the overall XML nesting, based on the ignored lines), and process these nests + """ + # Initialize the current nest curr_str = "" curr_nest = list() - curr_nest_tags = list() # Treating it as a stack + curr_nest_tags = list() # Treating it as a stack, since some tags will be identical within a nest and we want to make sure start and end tags match start_brackets = 0 with open(input_file_name) as input_file: + # Iterate linearly through the file for line in input_file: line_str = line.strip() + # Process each letter in the line linearly for letter_index in range(len(line_str)): letter = line_str[letter_index] + + # In case of nested brackets ("<<>>"), need to maintain matching brackets if letter == '<': start_brackets += 1 if letter == '>': start_brackets -= 1 + # Identify the next letter, to aid in identifying the end of the line next_letter = "" if letter_index + 1 < len(line_str): next_letter = line_str[letter_index + 1] + # Build up the current line curr_str += letter + # Determine when we have reached the end of the line and process accordingly if letter == '>' and (next_letter == '<' or next_letter == "") and start_brackets == 0: - # Only return if nesting + # Assign the current string to the class variable to facilitate processing self.line = curr_str + # Process the line line_parsed = self.convert_line() + # Determine important traits of the line to build the nest tag = line_parsed.get(self.KEY_TAG, None) assert tag != self.KEY_TEXT # This could cause a massive conflict, but it is unlikely line_type = line_parsed.get(self.KEY_TYPE, None) - attribute_keys = line_parsed.get(self.KEY_ATTRIBUTES, dict()).keys() + # Add non-ignore lines to the nest if line_type != self.LINE_TYPE_IGNORE: curr_nest.append(line_parsed) + # Initialize the output_nest criteria output_nest = (line_type in [self.LINE_TYPE_ENTRY, self.LINE_TYPE_ENTRY_WITH_ATTR, self.LINE_TYPE_ENTRY_ONLY_ATTR] and len(curr_nest_tags) == 0) + # If we are starting a new internal nest, push the current tag to the stack to ensure it has a matching end tag if line_type in [self.LINE_TYPE_START_NEST, self.LINE_TYPE_START_NEST_WITH_ATTR]: curr_nest_tags.append(tag) + # Ensure that the reached end tag matches the last start tag elif line_type == self.LINE_TYPE_END_NEST: popped_curr_nest_tag = curr_nest_tags.pop() assert popped_curr_nest_tag == tag, curr_nest + + # The nest is ready to process once we have matched the original start tag if len(curr_nest_tags) == 0: output_nest =
True + + # Once the nest has been finished, convert it into a dictionary and process it if output_nest: nest_dict, _ = self.convert_nest(curr_nest, 0) + # Process the given nest dictionary based on a given processing function self.processing_func(nest_dict) + # Reinitialize variables for the next loop curr_nest = list() curr_nest_tag = str() curr_str = "" + # If we have to go to the next line to finish processing one XML line, add a delimiting space if curr_str != "": - # divide lines by a space curr_str += ' ' class OWLParser(): + """ + Custom parser (into JSON Lines) for XML-style OWL files + """ def __init__(self, input_files, input_file_names, owl_file_path, output_file_name): + # Important tags within OWL files for processing self.XML_TAG = "?xml" self.RDF_TAG = "rdf:RDF" self.DOCTYPE_TAG = "!DOCTYPE" @@ -380,35 +503,50 @@ def __init__(self, input_files, input_file_names, owl_file_path, output_file_nam self.SUBCLASS_TAG = "rdfs:subClassOf" self.NODEID_TAG = "rdf:nodeID" self.RDF_ABOUT_TAG = "rdf:about" + + # Generic OWL ID prefix self.GENID_PREFIX = "genid" + # Custom additions to JSON Lines output to propagate ont-load-inventory.yaml information self.OWL_SOURCE_KEY = "owl_source" self.OWL_SOURCE_NAME_KEY = "owl_source_name" + # Tags to exclude from JSON Lines representation, to be passed into XML Parser self.skip_tags = [self.XML_TAG, self.RDF_TAG, self.DOCTYPE_TAG] + # Attributes to ignore for JSON Lines representation (due to overcrowding) self.ignored_attributes = ["xml:lang"] + # XML Parser for OWL Parser, using triage_nest_dict as the processing_func self.xml_parser = XMLParser(self.skip_tags, self.ignored_attributes, self.triage_nest_dict) + # Initialize the genid processing dictionaries required self.GENID_REMAINING_NESTS = dict() self.GENID_TO_ID = dict() self.ID_TO_GENIDS = dict() + # File names for input/output self.input_files = input_files self.input_file_names = input_file_names self.owl_file_path = owl_file_path self.output_file_name = output_file_name + # Output writer self.output_info = kg2_util.create_single_jsonlines() self.output = self.output_info[0] + def check_for_class_genids(self, nest_dict): + """ + Scanner for genids within an "owl:Class", to prepare them for later matching + """ genids = list() nest_dict_classes = nest_dict.get(self.CLASS_TAG, list()) for nest_class_index in range(len(nest_dict_classes)): nest_class = nest_dict_classes[nest_class_index] + + # genids are contained within "rdfs:subClassOf" elements nest_subclasses = nest_class.get(self.SUBCLASS_TAG, list()) for nest_subclass_index in range(len(nest_subclasses)): nest_subclass = nest_subclasses[nest_subclass_index] @@ -420,13 +558,20 @@ def check_for_class_genids(self, nest_dict): def check_for_restriction_genids(self, nest_dict): + """ + Check a nest for possibly containing a "genid" term within an "owl:Restriction" element + """ for nest_restriction in nest_dict.get(self.RESTRICTION_TAG, dict()): potential_genid = nest_restriction.get(self.NODEID_TAG, str()) if potential_genid.startswith(self.GENID_PREFIX): return potential_genid return None + def extract_class_id(self, nest_dict): + """ + Determine the id of an "owl:Class", for use as a key + """ nest_dict_classes = nest_dict.get(self.CLASS_TAG, list()) # Can't have competing class_ids assert len(nest_dict_classes) <= 1 @@ -435,7 +580,11 @@ def extract_class_id(self, nest_dict): nest_class = nest_dict_classes[nest_class_index] return nest_class.get(self.RDF_ABOUT_TAG, str()) + def store_genid_nest_in_class_nest(self, genid, genid_nest, 
class_nest): + """ + Replace a genid entry in an "rdfs:subClassOf" element with its corresponding "owl:Restriction" definition (which contains an actual identifier) + """ output_class_nest = class_nest nest_dict_classes = class_nest.get(self.CLASS_TAG, list()) @@ -452,50 +601,69 @@ def store_genid_nest_in_class_nest(self, genid, genid_nest, class_nest): def write_to_output(self, output_dict, source_file): + """ + Add source provenance information to an output dictionary before writing it to the output JSON Lines file + """ output_dict[self.OWL_SOURCE_KEY] = source_file output_dict[self.OWL_SOURCE_NAME_KEY] = self.input_file_names[source_file] self.output.write(output_dict) - return - def triage_nest_dict(self, nest_dict): + """ + Process a nest dictionary by outputting it if it's ready (no outstanding "genid" terms) + """ + # Check for elements which complicate the save pattern genids = self.check_for_class_genids(nest_dict) restriction_genid = self.check_for_restriction_genids(nest_dict) class_id = self.extract_class_id(nest_dict) + # If there are class genids, save these for future identification and store the nest to be outputted later if len(genids) > 0: for genid in genids: self.GENID_TO_ID[genid] = class_id self.ID_TO_GENIDS[class_id] = genids self.GENID_REMAINING_NESTS[class_id] = nest_dict + # If this nest contains a genid definition to be placed in its "owl:Class", place it, then output the nest elif restriction_genid is not None: class_id = self.GENID_TO_ID.get(restriction_genid, str()) + + # Issue a warning if the genid doesn't correspond to an "owl:Class" if len(class_id) == 0: print("WARNING WITH:", restriction_genid, "- NO CLASS_ID FOUND") # Save to output despite not matching with an existing class self.write_to_output(nest_dict, self.input_file) return + + # Store the genid and remove it from the list of outstanding genids class_nest = self.GENID_REMAINING_NESTS[class_id] self.ID_TO_GENIDS[class_id].remove(restriction_genid) updated_class_nest = self.store_genid_nest_in_class_nest(restriction_genid, nest_dict, class_nest) + # We must wait until all of the genids in the "owl:Class" have been matched to finally output if len(self.ID_TO_GENIDS[class_id]) > 0: self.GENID_REMAINING_NESTS[class_id] = updated_class_nest else: - # Since all of the genids used in this class have been matched, output self.write_to_output(nest_dict, self.input_file) self.GENID_REMAINING_NESTS[class_id] = None + # Otherwise, it is a normal situation else: # There are no genids that need to be worked with, so just output self.write_to_output(nest_dict, self.input_file) def parse_OWL_file(self): + """ + Handler for parsing the OWL files + """ + # Iterate through the input files, processing them for input_file in self.input_files: + # Set the current OWLParser input file to this input file self.input_file = input_file print("Reading:", input_file, "starting at", date()) + + # Process the file self.xml_parser.divide_into_lines(self.owl_file_path + input_file) # Genid wasn't filled, still want to include them though @@ -512,9 +680,14 @@ def parse_OWL_file(self): def identify_and_download_input_files(ont_load_inventory, path_to_owl_files): + """ + Download all of the input files in ont-load-inventory.yaml + """ input_files = list() input_file_names = dict() owl_file_path = path_to_owl_files.rstrip('/') + "/" + + # Download every file in the inventory and store its file name and title for later use as provenance for item in ont_load_inventory: input_files.append(item['file']) input_file_names[item['file']] = item['title'] @@ -522,19
+695,26 @@ def identify_and_download_input_files(ont_load_inventory, path_to_owl_files): kg2_util.download_file_if_not_exist_locally(item['url'], owl_file_path + item['file']) print("Download of:", item['file'], "finished at", date()) + # Return, providing the file path so the files can be opened by the XMLParser later return input_files, input_file_names, owl_file_path if __name__ == '__main__': + print("Start Time:", date()) args = get_args() + + # Obtain all arguments input_file_name = args.inputFile owl_path = args.owlFilePath output_file_name = args.outputFile + # Read ont-load-inventory.yaml to prepare for OWL processing ont_load_inventory = kg2_util.safe_load_yaml_from_string(kg2_util.read_file_to_string(input_file_name)) input_files, input_file_names, owl_file_path = identify_and_download_input_files(ont_load_inventory, owl_path) print("Files:", input_files) - print("Start Time:", date()) + # Initialize the OWLParser owl_parser = OWLParser(input_files, input_file_names, owl_file_path, output_file_name) + + # Run parsing on all of the OWL files owl_parser.parse_OWL_file() - print("End Time:", date()) \ No newline at end of file + print("End Time:", date()) From 7bd5e8f8b2aff6e445ad4ebb5e1f563f36c818ed Mon Sep 17 00:00:00 2001 From: ecwood Date: Mon, 2 Sep 2024 04:28:22 -0700 Subject: [PATCH 099/125] #387 adjusting for CHEBI issues --- extract/extract-ontologies.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/extract/extract-ontologies.sh b/extract/extract-ontologies.sh index 3248cf4f..5c90b782 100755 --- a/extract/extract-ontologies.sh +++ b/extract/extract-ontologies.sh @@ -28,6 +28,9 @@ mkdir -p ${ontologies_dir} # Temporary adjustment for https://github.com/HUPO-PSI/psi-mi-CV/issues/456 ${s3_cp_cmd} s3://${s3_bucket}/mi.owl ${ontologies_dir}/mi.owl +# Temporary adjustment due to lack of resolution of chebi PURL +${s3_cp_cmd} s3://${s3_bucket}/chebi.owl ${ontologies_dir}/mi.owl + # Generate the ontologies.jsonl file ${python_command} ${parsing_script} ${ontologies_load_inventory} ${ontologies_dir} ${output_file} From 0804556050e20ffde0b51aee048cb3b1065ccec5 Mon Sep 17 00:00:00 2001 From: ecwood Date: Mon, 2 Sep 2024 04:29:13 -0700 Subject: [PATCH 100/125] #387 want to remove old ontologies --- extract/extract-ontologies.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/extract/extract-ontologies.sh b/extract/extract-ontologies.sh index 5c90b782..359114b2 100755 --- a/extract/extract-ontologies.sh +++ b/extract/extract-ontologies.sh @@ -23,6 +23,7 @@ ontologies_load_inventory=${1-"${MAPS_CODE_DIR}/ont-load-inventory.yaml"} output_file=${2-"${BUILD_DIR}/ontologies.jsonl"} ontologies_dir=${3-"${BUILD_DIR}/owl_files"} +rm -rf ${ontologies_dir} mkdir -p ${ontologies_dir} # Temporary adjustment for https://github.com/HUPO-PSI/psi-mi-CV/issues/456 From a1a7c6ee7f3efd32ed45f045a56e5debd62e09b6 Mon Sep 17 00:00:00 2001 From: ecwood Date: Mon, 2 Sep 2024 04:30:28 -0700 Subject: [PATCH 101/125] #387 have to fully handle CHEBI --- extract/extract-ontologies.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/extract/extract-ontologies.sh b/extract/extract-ontologies.sh index 359114b2..e82d8a5b 100755 --- a/extract/extract-ontologies.sh +++ b/extract/extract-ontologies.sh @@ -30,7 +30,7 @@ mkdir -p ${ontologies_dir} ${s3_cp_cmd} s3://${s3_bucket}/mi.owl ${ontologies_dir}/mi.owl # Temporary adjustment due to lack of resolution of chebi PURL -${s3_cp_cmd} s3://${s3_bucket}/chebi.owl ${ontologies_dir}/mi.owl +${s3_cp_cmd} s3://${s3_bucket}/chebi.owl 
${ontologies_dir}/chebi.owl # Generate the ontologies.jsonl file ${python_command} ${parsing_script} ${ontologies_load_inventory} ${ontologies_dir} ${output_file} From b0aee1c1d284a29b351da76241939091fbcb68b3 Mon Sep 17 00:00:00 2001 From: ecwood Date: Mon, 2 Sep 2024 05:25:03 -0700 Subject: [PATCH 102/125] #387 revising predicate remap for new ontology etl --- maps/predicate-remap.yaml | 152 +++++++++++++++++++++++++++----------- 1 file changed, 109 insertions(+), 43 deletions(-) diff --git a/maps/predicate-remap.yaml b/maps/predicate-remap.yaml index 15452136..3d4bc482 100644 --- a/maps/predicate-remap.yaml +++ b/maps/predicate-remap.yaml @@ -415,6 +415,12 @@ CL:lacks_part: CL:lacks_plasma_membrane_part: operation: keep core_predicate: biolink:lacks_part +COB:0000078: + operation: delete +COB:0000081: + operation: delete +COB:0000087: + operation: delete CTD:increases_expression_of: operation: keep core_predicate: biolink:affects @@ -425,6 +431,9 @@ CTD:increases_expression_of: DDANAT:develops_from: operation: keep core_predicate: biolink:develops_from +DDANAT:part_of: + operation: invert + core_predicate: biolink:has_part DGIdb:activator: operation: keep core_predicate: biolink:affects @@ -1469,6 +1478,9 @@ FMA:surrounded_by: FMA:surrounds: operation: keep core_predicate: biolink:coexists_with +FMA:systemic_part_of: + operation: invert + core_predicate: biolink:has_part FMA:transforms_from: operation: invert core_predicate: biolink:precedes @@ -1759,6 +1771,8 @@ IAO:0000142: IAO:0000219: operation: keep core_predicate: biolink:related_to +IAO:0100001: + operation: delete ICD10PCS:CHD: operation: keep core_predicate: biolink:subclass_of @@ -2280,6 +2294,16 @@ MONDO:part_of_progression_of_disease: MONDO:predisposes_towards: operation: keep core_predicate: biolink:contributes_to +NBO-PROPERTY:by_means: + operation: delete +NBO-PROPERTY:has_participant: + operation: keep + core_predicate: biolink:has_participant +NBO-PROPERTY:in_response_to: + operation: invert + core_predicate: biolink:causes +NBO-PROPERTY:is_about: + operation: delete NCBITaxon:CHD: operation: keep core_predicate: biolink:subclass_of @@ -2969,6 +2993,9 @@ OBI:0000295: OBI:0000299: operation: keep core_predicate: biolink:has_output +OBI:0000312: + operation: invert + core_predicate: biolink:has_output OBI:0000417: operation: keep core_predicate: biolink:has_output @@ -3020,6 +3047,10 @@ OBO:HANCESTRO_0308: OBO:HANCESTRO_0330: operation: keep core_predicate: biolink:related_to +OBO:INO_0000154: + operation: delete +OBO:MF#manifestationOf: + operation: delete OBO:nbo#by_means: operation: invert core_predicate: biolink:actively_involved_in @@ -3032,6 +3063,18 @@ OBO:nbo#in_response_to: OBO:nbo#is_about: operation: keep core_predicate: biolink:related_to +OBO:NCIT_R163: + operation: keep + core_predicate: biolink:related_to +OBO:NCIT_R81: + operation: keep + core_predicate: biolink:related_to +OBO:NCIT_R82: + operation: keep + core_predicate: biolink:related_to +OBO:has_role: + operation: keep + core_predicate: biolink:related_to OBO:mondo/mondo-base#predisposes_towards: operation: keep core_predicate: biolink:contributes_to @@ -3056,9 +3099,15 @@ OBO:mondo/mondo-base#disease_responds_to: # OBO:uo#is_unit_of: # operation: invert # core_predicate: biolink:related_to +OIO:hasAlternativeId: + operation: keep + core_predicate: biolink:close_match OIO:hasDbXref: operation: keep core_predicate: biolink:close_match +OIO:xref: + operation: keep + core_predicate: biolink:close_match OMIM:CHD: operation: keep core_predicate: 
biolink:subclass_of @@ -3114,7 +3163,7 @@ OMIM:phenotype_of: # ORPHA:317346: # operation: keep # core_predicate: biolink:causes -# ORPHANET:327767: +# orphanet:327767: # operation: keep # core_predicate: biolink:actively_involved_in # ORPHA:410295: @@ -3126,49 +3175,46 @@ OMIM:phenotype_of: # ORPHA:465410: # operation: keep # core_predicate: biolink:biomarker_for -ORPHANET:317343: +orphanet:317343: operation: keep core_predicate: biolink:causes -ORPHANET:317344: +orphanet:317344: operation: keep core_predicate: biolink:causes -ORPHANET:317345: +orphanet:317345: operation: keep core_predicate: biolink:correlated_with orphanet:317346: operation: keep core_predicate: biolink:causes -ORPHANET:317346: - operation: keep - core_predicate: biolink:causes -ORPHANET:317348: +orphanet:317348: operation: keep core_predicate: biolink:actively_involved_in -ORPHANET:317349: +orphanet:317349: operation: keep core_predicate: biolink:actively_involved_in -ORPHANET:327767: +orphanet:327767: operation: keep core_predicate: biolink:actively_involved_in -ORPHANET:410295: +orphanet:410295: operation: keep core_predicate: biolink:causes -ORPHANET:410296: +orphanet:410296: operation: keep core_predicate: biolink:causes -ORPHANET:465410: +orphanet:465410: operation: keep core_predicate: biolink:biomarker_for -ORPHANET:C016: +orphanet:C016: operation: keep core_predicate: biolink:related_to -ORPHANET:C017: +orphanet:C017: operation: keep core_predicate: biolink:related_to -ORPHANET:C056: +orphanet:C056: operation: keep core_predicate: biolink:close_match -ORPHANET:C057: +orphanet:C057: operation: keep core_predicate: biolink:close_match # PATO:0000085: @@ -3418,6 +3464,9 @@ RO:0001022: RO:0001025: operation: keep core_predicate: biolink:located_in +RO:0001900: + operation: keep + core_predicate: biolink:related_to RO:0002001: operation: keep core_predicate: biolink:related_to @@ -3502,6 +3551,15 @@ RO:0002162: RO:0002170: operation: keep core_predicate: biolink:related_to +RO:0002171: + operation: keep + core_predicate: biolink:related_to +RO:0002174: + operation: keep + core_predicate: biolink:related_to +RO:0002175: + operation: keep + core_predicate: biolink:related_to RO:0002176: operation: keep core_predicate: biolink:related_to @@ -3768,6 +3826,8 @@ RO:0002387: core_predicate: biolink:related_to RO:0002388: operation: delete +RO:0002404: + operation: delete # RO:0002410: # operation: keep # core_predicate: biolink:causes @@ -3822,6 +3882,9 @@ RO:0002470: RO:0002473: operation: keep core_predicate: biolink:composed_primarily_of +RO:0002475: + operation: keep + core_predicate: biolink:related_to RO:0002488: operation: keep core_predicate: biolink:temporally_related_to @@ -3921,9 +3984,9 @@ RO:0002596: # capable of regulating # RO:0002599: # operation: keep # core_predicate: biolink:prevents -# RO:0002604: -# operation: keep -# core_predicate: biolink:opposite_of +RO:0002604: + operation: keep + core_predicate: biolink:opposite_of # RO:0002606: # operation: keep # core_predicate: biolink:treats @@ -4501,12 +4564,15 @@ SO:has_part: core_predicate: biolink:has_part SO:has_quality: operation: delete +SO:member_of: + operation: invert + core_predicate: biolink:has_member SO:overlaps: operation: keep core_predicate: biolink:overlaps -SO:member_of: +SO:part_of: operation: invert - core_predicate: biolink:has_member + core_predicate: biolink:has_part # SO:similar_to: # operation: keep # core_predicate: biolink:similar_to @@ -4612,9 +4678,9 @@ UBERON_CORE:in_outermost_side_of: UBERON_CORE:indirectly_supplies: operation: 
keep core_predicate: biolink:coexists_with -# UBERON_CORE:layer_part_of: -# operation: invert -# core_predicate: biolink:has_part +UBERON_CORE:layer_part_of: + operation: invert + core_predicate: biolink:has_part UBERON_CORE:posteriorly_connected_to: operation: keep core_predicate: biolink:coexists_with @@ -4648,9 +4714,9 @@ UBERON_CORE:synapsed_by: # UBERON_CORE:transitively_proximally_connected_to: # operation: keep # core_predicate: biolink:coexists_with -# UBERON_CORE:trunk_part_of: -# operation: invert -# core_predicate: biolink:has_part +UBERON_CORE:trunk_part_of: + operation: invert + core_predicate: biolink:has_part # UBERON_NONAMESPACE:connected_to: # operation: keep # core_predicate: biolink:related_to @@ -5021,21 +5087,21 @@ rdfs:subClassOf: rdfs:subPropertyOf: operation: keep core_predicate: biolink:subclass_of -# skos:broadMatch: -# operation: keep -# core_predicate: biolink:broad_match -# skos:closeMatch: -# operation: keep -# core_predicate: biolink:close_match -# skos:exactMatch: -# operation: keep -# core_predicate: biolink:exact_match +skos:broadMatch: + operation: keep + core_predicate: biolink:broad_match +skos:closeMatch: + operation: keep + core_predicate: biolink:close_match +skos:exactMatch: + operation: keep + core_predicate: biolink:exact_match skos:member: operation: keep core_predicate: biolink:has_member -# skos:narrowMatch: -# operation: invert -# core_predicate: biolink:broad_match -# skos:relatedMatch: -# operation: keep -# core_predicate: biolink:related_to +skos:narrowMatch: + operation: invert + core_predicate: biolink:broad_match +skos:relatedMatch: + operation: keep + core_predicate: biolink:related_to From 139b1d56b56bd62778383043575f84244e036c1d Mon Sep 17 00:00:00 2001 From: ecwood Date: Mon, 2 Sep 2024 05:28:51 -0700 Subject: [PATCH 103/125] #387 updating provided by to infores for new ontologies etl --- maps/kg2-provided-by-curie-to-infores-curie.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/maps/kg2-provided-by-curie-to-infores-curie.yaml b/maps/kg2-provided-by-curie-to-infores-curie.yaml index 718cbd11..37854c95 100644 --- a/maps/kg2-provided-by-curie-to-infores-curie.yaml +++ b/maps/kg2-provided-by-curie-to-infores-curie.yaml @@ -78,7 +78,7 @@ OBO:genepio.owl: source_name: Genomic Epidemiology Ontology infores_curie: infores:genepio knowledge_type: knowledge_source -OBO:go/extensions/go-plus.owl: +OBO:go-plus.owl: source_name: Gene Ontology Plus infores_curie: infores:go-plus knowledge_type: knowledge_source @@ -102,7 +102,7 @@ OBO:nbo.owl: source_name: Neuro Behavior Ontology infores_curie: infores:nbo knowledge_type: knowledge_source -OBO:ncbitaxon/subsets/taxslim.owl: +OBO:taxslim.owl: source_name: NCBI Taxonomy Ontology infores_curie: infores:ncbi-taxon knowledge_type: knowledge_source @@ -118,7 +118,7 @@ OBO:ro.owl: source_name: Relations Ontology infores_curie: infores:ro knowledge_type: knowledge_source -OBO:uberon.owl: +OBO:uberon-ext.owl: source_name: Uber Anatomy Ontology infores_curie: infores:uberon knowledge_type: knowledge_source @@ -126,7 +126,7 @@ OBO:uberon.owl: source_name: Online Mendelian Inheritance in Man (OMIM) infores_curie: infores:omim knowledge_type: knowledge_source -'ORPHANET:': +OBO:ordo.owl: source_name: Orphanet Rare Disease Ontology infores_curie: infores:ordo knowledge_type: knowledge_source From ee09f491018609ec481ff7d1565afba6241356be Mon Sep 17 00:00:00 2001 From: ecwood Date: Mon, 2 Sep 2024 13:02:23 -0700 Subject: [PATCH 104/125] #387 adding biolink version node in and 
correcting source node category --- convert/ontologies_jsonl_to_kg_jsonl.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/convert/ontologies_jsonl_to_kg_jsonl.py b/convert/ontologies_jsonl_to_kg_jsonl.py index b0adf675..797953ba 100755 --- a/convert/ontologies_jsonl_to_kg_jsonl.py +++ b/convert/ontologies_jsonl_to_kg_jsonl.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 ''' ontologies_jsonl_to_kg_jsonl.py: Converts JSON Lines representation of ontologies into KG JSON Lines format - Usage: ontologies_jsonl_to_kg_jsonl.py [--test] + Usage: ontologies_jsonl_to_kg_jsonl.py [--test] ''' @@ -108,6 +108,7 @@ def get_args(): arg_parser.add_argument('inputFile', type=str) arg_parser.add_argument('curiesToCategoriesYAML', type=str) arg_parser.add_argument('curiesToURLsYAML', type=str) + arg_parser.add_argument('biolinkVersionNumber', type=str) arg_parser.add_argument('outputNodesFile', type=str) arg_parser.add_argument('outputEdgesFile', type=str) return arg_parser.parse_args() @@ -222,6 +223,17 @@ def pick_most_recent_date(dates, alternate_date=None): return latest_date.isoformat(sep=' ') +def save_biolink_information(biolink_version_number): + """ + Save the Biolink version with the ontologies versions so we can construct a Biolink version node (hacky workaround) + """ + source = kg2_util.CURIE_PREFIX_BIOLINK_SOURCE + source_id = source + ":" + ontology_iri = URI_MAP[source] + name = "Biolink" + SOURCE_INFO[source] = {SOURCE_KEY: source_id, IRI_KEY: ontology_iri, NAME_KEY: ontology_name, UPDATE_DATE_KEY: None, VERSION_KEY: biolink_version_number} + + def process_ontology_term(ontology_node, source, ontology_name, owl_source=True): """ Given an owl:Ontology (or analogous) element, determine all of the relevant attributes to construct a source node @@ -494,7 +506,7 @@ def construct_nodes_and_edges(nodes_output, edges_output): source_name = SOURCE_INFO[source][NAME_KEY] + " v" + SOURCE_INFO[source][VERSION_KEY] source_id = SOURCE_INFO[source][SOURCE_KEY] source_iri = SOURCE_INFO[source][IRI_KEY] - node = kg2_util.make_node(source_id, source_iri, source_name, kg2_util.BIOLINK_CATEGORY_INFORMATION_CONTENT_ENTITY, source_date, source_id) + node = kg2_util.make_node(source_id, source_iri, source_name, kg2_util.SOURCE_NODE_CATEGORY, source_date, source_id) nodes_output.write(node) @@ -548,6 +560,7 @@ def construct_nodes_and_edges(nodes_output, edges_output): input_file_name = args.inputFile curies_to_categories_file_name = args.curiesToCategoriesYAML curies_to_urls_file_name = args.curiesToURLsYAML + biolink_version_number = args.biolinkVersionNumber output_nodes_file_name = args.outputNodesFile output_edges_file_name = args.outputEdgesFile test_mode = args.test From 4f17e56aa227da188982d0fbca2803a3a7f4fd49 Mon Sep 17 00:00:00 2001 From: ecwood Date: Mon, 2 Sep 2024 13:03:08 -0700 Subject: [PATCH 105/125] #392 edge blocklist logic implemented --- maps/edge-blocklist.yaml | 16 +++++------ process/filter_kg_and_remap_predicates.py | 35 +++++++++++++++++------ 2 files changed, 35 insertions(+), 16 deletions(-) diff --git a/maps/edge-blocklist.yaml b/maps/edge-blocklist.yaml index bb64c0be..617741dc 100644 --- a/maps/edge-blocklist.yaml +++ b/maps/edge-blocklist.yaml @@ -1,4 +1,4 @@ -- +- # "Vaccines---causes---Autism/Autism Spectrum Disorders" subject_name: Vaccines subject_ids: - ATC:J07 @@ -6,7 +6,7 @@ - UMLS:C0042210 - VANDF:4021642 predicate: biolink:causes - object_name: Autism + object_name: Autism/Autism Spectrum Disorders object_ids: - CHV:0000001598 - CHV:0000050438 
@@ -34,7 +34,7 @@ - UMLS:C0856975 - UMLS:C1510586 - UMLS:C1968924 -- +- # "Measles-Mumps-Rubella Vaccine---causes---Autism/Autism Spectrum Disorders" subject_name: Measles-Mumps-Rubella Vaccine subject_ids: - MESH:D022542 @@ -42,7 +42,7 @@ - PDQ:CDR0000702931 - UMLS:C0065828 predicate: biolink:causes - object_name: Autism + object_name: Autism/Autism Spectrum Disorders object_ids: - CHV:0000001598 - CHV:0000050438 @@ -70,7 +70,7 @@ - UMLS:C0856975 - UMLS:C1510586 - UMLS:C1968924 -- +- # "Mercury---causes---Autism/Autism Spectrum Disorders" subject_name: Mercury subject_ids: - CHEBI:16170 @@ -83,7 +83,7 @@ - UMLS:C0025424 - VANDF:4025953 predicate: biolink:causes - object_name: Autism + object_name: Autism/Autism Spectrum Disorders object_ids: - CHV:0000001598 - CHV:0000050438 @@ -111,7 +111,7 @@ - UMLS:C0856975 - UMLS:C1510586 - UMLS:C1968924 -- +- # "Thimerosal---causes---Autism/Autism Spectrum Disorders" subject_name: Thimerosal subject_ids: - ATC:D08AK06 @@ -128,7 +128,7 @@ - UMLS:C0039867 - VANDF:4017480 predicate: biolink:causes - object_name: Autism + object_name: Autism/Autism Spectrum Disorders object_ids: - CHV:0000001598 - CHV:0000050438 diff --git a/process/filter_kg_and_remap_predicates.py b/process/filter_kg_and_remap_predicates.py index 5daace1b..1d103df3 100644 --- a/process/filter_kg_and_remap_predicates.py +++ b/process/filter_kg_and_remap_predicates.py @@ -45,6 +45,7 @@ def make_arg_parser(): arg_parser.add_argument('inforesRemapYaml', type=str, help="The YAML file describing how knowledge_source fields should be remapped to Translator infores curies") arg_parser.add_argument('curiesToURIFile', type=str, help="The file mapping CURIE prefixes to URI fragments") arg_parser.add_argument('knowledgeLevelAgentTypeFile', type=str, help="The file mapping infores curies to knowledge_level and agent_type source information") + arg_parser.add_argument('edgeBlocklistFile', type=str, help="File containing blocked edges from KG2") arg_parser.add_argument('inputNodesFile', type=str, help="The input KG2 graph, in JSON format") arg_parser.add_argument('inputEdgesFile', type=str, help="The input KG2 graph, in JSON format") arg_parser.add_argument('outputNodesFile', type=str, help="The output KG2 graph, in JSON format") @@ -171,11 +172,11 @@ def process_nodes(input_nodes_file_name, infores_remap_config, nodes_output): return nodes_set -def process_edges(input_edges_file_name, infores_remap_config, knowledge_level_agent_type_map, predicate_remap_file_name, curies_to_uri_file_name, edges_output, drop_self_edges_except, nodes): +def process_edges(input_edges_file_name, infores_remap_config, knowledge_level_agent_type_map, predicate_remap_file_name, curies_to_uri_file_name, edges_output, drop_self_edges_except, nodes, edge_blocklist): predicate_remap_config = kg2_util.safe_load_yaml_from_string(kg2_util.read_file_to_string(predicate_remap_file_name)) - map_dict = kg2_util.make_uri_curie_mappers(curies_to_uri_file_name) + # map_dict = kg2_util.make_uri_curie_mappers(curies_to_uri_file_name) - curie_to_uri_expander = map_dict['expand'] + # curie_to_uri_expander = map_dict['expand'] source_predicate_curies_not_in_config = set() source_predicate_curies_not_in_nodes = set() knowledge_source_curies_not_in_config_edges = set() @@ -275,10 +276,11 @@ def process_edges(input_edges_file_name, infores_remap_config, knowledge_level_a if predicate_curie not in nodes: predicate_curie_prefix = predicate_curie.split(':')[0] - predicate_uri_prefix = curie_to_uri_expander(predicate_curie_prefix + ':') - # Create 
list of curies to complain about if not in biolink - if predicate_uri_prefix == predicate_curie_prefix: - source_predicate_curies_not_in_nodes.add(predicate_curie) + # predicate_uri_prefix = curie_to_uri_expander(predicate_curie_prefix + ':') + # # Create list of curies to complain about if not in biolink + # if predicate_uri_prefix == predicate_curie_prefix: + # source_predicate_curies_not_in_nodes.add(predicate_curie) + source_predicate_curies_not_in_nodes.add(predicate_curie) if edge_dict.get("primary_knowledge_source") is None: #print(f"{edge_dict}") edge_dict["primary_knowledge_source"] = edge_dict.pop("knowledge_source") @@ -305,6 +307,11 @@ def process_edges(input_edges_file_name, infores_remap_config, knowledge_level_a edge_subject = edge_dict['subject'] edge_object = edge_dict['object'] + edge_triple = (edge_subject, edge_dict['predicate'], edge_object) + if edge_triple in edge_blocklist: + print("Edge:", edge_triple, "in the edge blocklist. Not adding it to edges_output.") + continue + edge_key = f"{edge_subject} /// {predicate_curie} /// {qualified_predicate} /// {qualified_object_aspect} /// {qualified_object_direction} /// {edge_object} /// {primary_knowledge_source}" edges_output.write(edge_dict) @@ -320,12 +327,23 @@ def process_edges(input_edges_file_name, infores_remap_config, knowledge_level_a warning_knowledge_level_agent_source_not_in_config_edges(knowledge_source_not_in_klat_map) +def load_edge_blocklist(edge_blocklist_dict): + edge_blocklist = list() + for edge in edge_blocklist_dict: + for edge_subject in edge['subject_ids']: + for edge_object in edge['object_ids']: + edge_blocklist.append((edge_subject, edge['predicate'], edge_object)) + + return edge_blocklist + + if __name__ == '__main__': args = make_arg_parser().parse_args() predicate_remap_file_name = args.predicateRemapYaml infores_remap_file_name = args.inforesRemapYaml curies_to_uri_file_name = args.curiesToURIFile knowledge_level_agent_type_file_name = args.knowledgeLevelAgentTypeFile + edge_blocklist_file_name = args.edgeBlocklistFile input_nodes_file_name = args.inputNodesFile input_edges_file_name = args.inputEdgesFile output_nodes_file_name = args.outputNodesFile @@ -340,6 +358,7 @@ def process_edges(input_edges_file_name, infores_remap_config, knowledge_level_a infores_remap_config = kg2_util.safe_load_yaml_from_string(kg2_util.read_file_to_string(infores_remap_file_name)) knowledge_level_agent_type_map = kg2_util.safe_load_yaml_from_string(kg2_util.read_file_to_string(knowledge_level_agent_type_file_name)) + edge_blocklist_map = load_edge_blocklist(kg2_util.safe_load_yaml_from_string(kg2_util.read_file_to_string(edge_blocklist_file_name))) source_predicate_curies_not_in_config = set() knowledge_source_curies_not_in_config_nodes = set() @@ -352,7 +371,7 @@ def process_edges(input_edges_file_name, infores_remap_config, knowledge_level_a nodes = process_nodes(input_nodes_file_name, infores_remap_config, nodes_output) - process_edges(input_edges_file_name, infores_remap_config, knowledge_level_agent_type_map, predicate_remap_file_name, curies_to_uri_file_name, edges_output, drop_self_edges_except, nodes) + process_edges(input_edges_file_name, infores_remap_config, knowledge_level_agent_type_map, predicate_remap_file_name, curies_to_uri_file_name, edges_output, drop_self_edges_except, nodes, edge_blocklist_map) update_date = datetime.now().strftime("%Y-%m-%d %H:%M") version_file = open(args.versionFile, 'r') From e3f0f8e05abaca6fa636598cf477888b2d55863e Mon Sep 17 00:00:00 2001 From: ecwood Date: Mon, 2 
Sep 2024 13:05:30 -0700 Subject: [PATCH 106/125] #387 correcting the pipeline for new ontologies input --- build/Snakefile-conversion | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build/Snakefile-conversion b/build/Snakefile-conversion index d7f96c2e..227d5ade 100644 --- a/build/Snakefile-conversion +++ b/build/Snakefile-conversion @@ -27,7 +27,7 @@ rule Ontologies_Conversion: log: config['ONTOLOGIES_CONVERSION_LOG'] shell: - config['PYTHON_COMMAND'] + " {input.code} {input.real} {input.curies_to_categories_map} {input.curies_to_urls_map} {output.nodes} {output.edges} " + config['TEST_FLAG'] + " > {log} 2>&1" + config['PYTHON_COMMAND'] + " {input.code} {input.real} {input.curies_to_categories_map} {input.curies_to_urls_map} " + config['BIOLINK_MODEL_VERSION'] + " {output.nodes} {output.edges} " + config['TEST_FLAG'] + " > {log} 2>&1" rule SemMedDB_Conversion: input: From 0e9611064113ea1ff727f576b8c93735a17f814f Mon Sep 17 00:00:00 2001 From: ecwood Date: Mon, 2 Sep 2024 13:09:03 -0700 Subject: [PATCH 107/125] #392 restringing pipeline for edge blocklist --- master-config.shinc | 1 + process/run-simplify.sh | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/master-config.shinc b/master-config.shinc index 3e78c226..a37d6ff8 100644 --- a/master-config.shinc +++ b/master-config.shinc @@ -24,6 +24,7 @@ curies_to_urls_file=${MAPS_CODE_DIR}/curies-to-urls-map.yaml predicate_mapping_file=${MAPS_CODE_DIR}/predicate-remap.yaml infores_mapping_file=${MAPS_CODE_DIR}/kg2-provided-by-curie-to-infores-curie.yaml knowledge_level_agent_type_mapping_file=${MAPS_CODE_DIR}/knowledge-level-agent-type-map.yaml +edge_blocklist_file=${MAPS_CODE_DIR}/edge-blocklist.yaml rtx_config_file=RTXConfiguration-config.json biolink_model_version=4.2.1 infores_registry_version=0.2.8 diff --git a/process/run-simplify.sh b/process/run-simplify.sh index 4a033273..d88660dc 100755 --- a/process/run-simplify.sh +++ b/process/run-simplify.sh @@ -22,13 +22,13 @@ input_nodes_json=${1:-} input_edges_json=${2:-} output_nodes_json=${3:-} output_edges_json=${4:-} -build_flag=${5:-""} +test_flag=${5:-""} # TODO: Inhibits and increase are not in biolink model anymore - Find out what that should be now ${VENV_DIR}/bin/python3 -u ${PROCESS_CODE_DIR}/filter_kg_and_remap_predicates.py ${test_flag} --dropNegated \ --dropSelfEdgesExcept interacts_with,regulates,inhibits,increase \ ${predicate_mapping_file} ${infores_mapping_file} ${curies_to_urls_file} \ - ${knowledge_level_agent_type_mapping_file} ${input_nodes_json} ${input_edges_json} \ + ${knowledge_level_agent_type_mapping_file} ${edge_blocklist_file} ${input_nodes_json} ${input_edges_json} \ ${output_nodes_json} ${output_edges_json} ${kg2_version_file_local} date From c499f551efec02e74e3e5a506f21867d8775523e Mon Sep 17 00:00:00 2001 From: ecwood Date: Mon, 2 Sep 2024 13:25:48 -0700 Subject: [PATCH 108/125] #387 correcting biolink version number code --- convert/ontologies_jsonl_to_kg_jsonl.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/convert/ontologies_jsonl_to_kg_jsonl.py b/convert/ontologies_jsonl_to_kg_jsonl.py index 797953ba..96bc46e0 100755 --- a/convert/ontologies_jsonl_to_kg_jsonl.py +++ b/convert/ontologies_jsonl_to_kg_jsonl.py @@ -588,6 +588,9 @@ def construct_nodes_and_edges(nodes_output, edges_output): for ontology_item in input_data: process_ontology_item(ontology_item) + # Save the Biolink node information before processing + save_biolink_information(biolink_version_number) + # Categorize every node and save the information in 
the information dictionary for the node for node_id in SAVED_NODE_INFO: categorize_node(node_id) From 36c63c684a582d9d3bcce294b604541d9851b489 Mon Sep 17 00:00:00 2001 From: ecwood Date: Mon, 2 Sep 2024 13:35:46 -0700 Subject: [PATCH 109/125] #387 use correct dictionary to map IRI --- convert/ontologies_jsonl_to_kg_jsonl.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/convert/ontologies_jsonl_to_kg_jsonl.py b/convert/ontologies_jsonl_to_kg_jsonl.py index 96bc46e0..41f86943 100755 --- a/convert/ontologies_jsonl_to_kg_jsonl.py +++ b/convert/ontologies_jsonl_to_kg_jsonl.py @@ -229,7 +229,7 @@ def save_biolink_information(biolink_version_number): """ source = kg2_util.CURIE_PREFIX_BIOLINK_SOURCE source_id = source + ":" - ontology_iri = URI_MAP[source] + ontology_iri = PREFIX_TO_IRI_MAP[source] name = "Biolink" SOURCE_INFO[source] = {SOURCE_KEY: source_id, IRI_KEY: ontology_iri, NAME_KEY: ontology_name, UPDATE_DATE_KEY: None, VERSION_KEY: biolink_version_number} @@ -584,13 +584,13 @@ def construct_nodes_and_edges(nodes_output, edges_output): # Prepare the URI maps for mapping ontology information to KG2 CURIE IDs and IRIs generate_uri_map(curies_to_urls_file_name) + # Save the Biolink node information before processing + save_biolink_information(biolink_version_number) + # Extract all of the necessary information from the ontologies for ontology_item in input_data: process_ontology_item(ontology_item) - # Save the Biolink node information before processing - save_biolink_information(biolink_version_number) - # Categorize every node and save the information in the information dictionary for the node for node_id in SAVED_NODE_INFO: categorize_node(node_id) From d54a3d6161e0608267345b48e99017a145c5e016 Mon Sep 17 00:00:00 2001 From: ecwood Date: Mon, 2 Sep 2024 13:36:45 -0700 Subject: [PATCH 110/125] #387 correct variable names --- convert/ontologies_jsonl_to_kg_jsonl.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/convert/ontologies_jsonl_to_kg_jsonl.py b/convert/ontologies_jsonl_to_kg_jsonl.py index 41f86943..d21cd6b3 100755 --- a/convert/ontologies_jsonl_to_kg_jsonl.py +++ b/convert/ontologies_jsonl_to_kg_jsonl.py @@ -229,9 +229,9 @@ def save_biolink_information(biolink_version_number): """ source = kg2_util.CURIE_PREFIX_BIOLINK_SOURCE source_id = source + ":" - ontology_iri = PREFIX_TO_IRI_MAP[source] + iri = PREFIX_TO_IRI_MAP[source] name = "Biolink" - SOURCE_INFO[source] = {SOURCE_KEY: source_id, IRI_KEY: ontology_iri, NAME_KEY: ontology_name, UPDATE_DATE_KEY: None, VERSION_KEY: biolink_version_number} + SOURCE_INFO[source] = {SOURCE_KEY: source_id, IRI_KEY: iri, NAME_KEY: name, UPDATE_DATE_KEY: None, VERSION_KEY: biolink_version_number} def process_ontology_term(ontology_node, source, ontology_name, owl_source=True): From 59347ab2bc363213bd0124a5839a60e424c201ba Mon Sep 17 00:00:00 2001 From: ecwood Date: Mon, 2 Sep 2024 13:49:01 -0700 Subject: [PATCH 111/125] #387 can use shortened link now that we don't actually have to download biolink --- maps/curies-to-urls-map.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/maps/curies-to-urls-map.yaml b/maps/curies-to-urls-map.yaml index 3452641c..fbbb987e 100644 --- a/maps/curies-to-urls-map.yaml +++ b/maps/curies-to-urls-map.yaml @@ -34,7 +34,7 @@ use_for_bidirectional_mapping: # - # biolink: https://w3id.org/linkml/ - - biolink_download_source: https://raw.githubusercontent.com/biolink/biolink-model/vVERSION_HERE/project/owl/biolink_model.owl.ttl + 
biolink_download_source: https://raw.githubusercontent.com/biolink/biolink-model/ - bioschemas: 'https://bioschemas.org/' - From f71881542cb847e2f90920873b69f4324090b8a8 Mon Sep 17 00:00:00 2001 From: ecwood Date: Mon, 2 Sep 2024 13:49:59 -0700 Subject: [PATCH 112/125] #387 actually just change the biolink link to the repo --- maps/curies-to-urls-map.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/maps/curies-to-urls-map.yaml b/maps/curies-to-urls-map.yaml index fbbb987e..19faa9c9 100644 --- a/maps/curies-to-urls-map.yaml +++ b/maps/curies-to-urls-map.yaml @@ -34,7 +34,7 @@ use_for_bidirectional_mapping: # - # biolink: https://w3id.org/linkml/ - - biolink_download_source: https://raw.githubusercontent.com/biolink/biolink-model/ + biolink_download_source: https://github.com/biolink/biolink-model.git - bioschemas: 'https://bioschemas.org/' - From 0699d849b8d2527cdcc19d438df8ea99f3eff6cb Mon Sep 17 00:00:00 2001 From: ecwood Date: Mon, 2 Sep 2024 13:56:48 -0700 Subject: [PATCH 113/125] #140 correct the filename --- master-config.shinc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/master-config.shinc b/master-config.shinc index a37d6ff8..f28bdd4d 100644 --- a/master-config.shinc +++ b/master-config.shinc @@ -28,6 +28,6 @@ edge_blocklist_file=${MAPS_CODE_DIR}/edge-blocklist.yaml rtx_config_file=RTXConfiguration-config.json biolink_model_version=4.2.1 infores_registry_version=0.2.8 -kg2_version_file=version.txt +kg2_version_file=kg2-version.txt kg2_version_file_local=${BUILD_DIR}/${kg2_version_file} kg2_version= \ No newline at end of file From e104912e7349057928a9e9d7a0a3947bf896156f Mon Sep 17 00:00:00 2001 From: ecwood Date: Mon, 2 Sep 2024 14:03:29 -0700 Subject: [PATCH 114/125] #387 pipelining issue thwarted --- extract/extract-ontologies.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/extract/extract-ontologies.sh b/extract/extract-ontologies.sh index e82d8a5b..94351b87 100755 --- a/extract/extract-ontologies.sh +++ b/extract/extract-ontologies.sh @@ -19,9 +19,9 @@ config_dir=`dirname "$0"` source ${config_dir}/master-config.shinc parsing_script=${1-"${EXTRACT_CODE_DIR}/owlparser.py"} -ontologies_load_inventory=${1-"${MAPS_CODE_DIR}/ont-load-inventory.yaml"} -output_file=${2-"${BUILD_DIR}/ontologies.jsonl"} -ontologies_dir=${3-"${BUILD_DIR}/owl_files"} +ontologies_load_inventory=${2-"${MAPS_CODE_DIR}/ont-load-inventory.yaml"} +output_file=${3-"${BUILD_DIR}/ontologies.jsonl"} +ontologies_dir=${4-"${BUILD_DIR}/owl_files"} rm -rf ${ontologies_dir} mkdir -p ${ontologies_dir} From bd93687f8d20224edbeb95c31520b293d7a52b96 Mon Sep 17 00:00:00 2001 From: ecwood Date: Mon, 2 Sep 2024 14:07:28 -0700 Subject: [PATCH 115/125] #405 umls cleanup issue --- build/snakemake-config-var.yaml | 2 -- extract/extract-umls.sh | 2 ++ 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/build/snakemake-config-var.yaml b/build/snakemake-config-var.yaml index 2c5e1863..2943089f 100644 --- a/build/snakemake-config-var.yaml +++ b/build/snakemake-config-var.yaml @@ -13,8 +13,6 @@ umls_output_base: kg2-umls umls_extraction_script: ${EXTRACT_CODE_DIR}/${umls_extraction_base}.sh umls_extraction_log: ${BUILD_DIR}/${umls_extraction_base}${version_suffix}${test_suffix}.log umls_extract_file: ${BUILD_DIR}/umls.jsonl -umls_dir: ${BUILD_DIR}/umls -umls_dest_dir: ${umls_dir}/META umls_conversion_script: ${CONVERT_CODE_DIR}/${umls_conversion_base}.py umls_conversion_log: ${BUILD_DIR}/${umls_conversion_base}${version_suffix}${test_suffix}.log 
umls_name_heirarchy: ${MAPS_CODE_DIR}/umls-name-heirarchy.yaml diff --git a/extract/extract-umls.sh b/extract/extract-umls.sh index dcaae277..d25e2028 100755 --- a/extract/extract-umls.sh +++ b/extract/extract-umls.sh @@ -19,6 +19,8 @@ source ${config_dir}/master-config.shinc output_file=${2:-${BUILD_DIR}/umls.jsonl} +umls_dir=${BUILD_DIR}/umls +umls_dest_dir=${umls_dir}/META umls_ver=2023AA umls_file_base=umls-${umls_ver}-metathesaurus-full config_file=${umls_dir}/config.prop From 3488401263dc4868ebca7d5bdf91ee851a7713de Mon Sep 17 00:00:00 2001 From: ecwood Date: Mon, 2 Sep 2024 14:11:01 -0700 Subject: [PATCH 116/125] #408 #398 curl problems --- extract/extract-clinicaltrialskg.sh | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/extract/extract-clinicaltrialskg.sh b/extract/extract-clinicaltrialskg.sh index 95278512..86b6a799 100755 --- a/extract/extract-clinicaltrialskg.sh +++ b/extract/extract-clinicaltrialskg.sh @@ -24,7 +24,10 @@ version="2.2.6" clinicaltrialskg_download_link="https://db.systemsbiology.net/gestalt/KG/clinical_trials_kg_edges_v${version}.tsv" echo "# ${version}" > ${clinicaltrialskg_output_file} -${curl_get} ${clinicaltrialskg_download_link} >> ${clinicaltrialskg_output_file} +# ${curl_get} ${clinicaltrialskg_download_link} >> ${clinicaltrialskg_output_file} + +# Short term fix because download link is not resolving and I cannot identify the correct download link +${aws_s3_cp} s3://${s3_bucket}/clinicaltrialskg-edges.tsv ${clinicaltrialskg_output_file} date echo "================= finishing extract-clinicaltrialskg.sh ==================" \ No newline at end of file From 848a24fef9e69e46be1001e2ee602af594ecbb25 Mon Sep 17 00:00:00 2001 From: ecwood Date: Mon, 2 Sep 2024 14:14:38 -0700 Subject: [PATCH 117/125] #408 issue with DisGeNET download --- extract/extract-disgenet.sh | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/extract/extract-disgenet.sh b/extract/extract-disgenet.sh index c4c6c74e..5e6ddd73 100755 --- a/extract/extract-disgenet.sh +++ b/extract/extract-disgenet.sh @@ -22,9 +22,12 @@ disgenet_output_file=${1:-"${BUILD_DIR}/all_gene_disease_pmid_associations.tsv"} disgenet_download_link="https://www.disgenet.org/static/disgenet_ap1/files/downloads/all_gene_disease_pmid_associations.tsv.gz" -${curl_get} ${disgenet_download_link} > ${disgenet_output_file}.gz +# ${curl_get} ${disgenet_download_link} > ${disgenet_output_file}.gz -gzip -d ${disgenet_output_file}.gz +# gzip -d ${disgenet_output_file}.gz + +# Temporary patch due to link failing to resolve +${s3_cp_cmd} s3://${s3_bucket}/all_gene_disease_pmid_associations.tsv ${disgenet_output_file} date From e7d76d6894f53551bcec652d49927225f4e3ae21 Mon Sep 17 00:00:00 2001 From: ecwood Date: Mon, 2 Sep 2024 14:14:57 -0700 Subject: [PATCH 118/125] #408 typo for download --- extract/extract-clinicaltrialskg.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/extract/extract-clinicaltrialskg.sh b/extract/extract-clinicaltrialskg.sh index 86b6a799..18630d16 100755 --- a/extract/extract-clinicaltrialskg.sh +++ b/extract/extract-clinicaltrialskg.sh @@ -27,7 +27,7 @@ echo "# ${version}" > ${clinicaltrialskg_output_file} # ${curl_get} ${clinicaltrialskg_download_link} >> ${clinicaltrialskg_output_file} # Short term fix because download link is not resolving and I cannot identify the correct download link -${aws_s3_cp} s3://${s3_bucket}/clinicaltrialskg-edges.tsv ${clinicaltrialskg_output_file} +${s3_cp_cmd} s3://${s3_bucket}/clinicaltrialskg-edges.tsv 
${clinicaltrialskg_output_file} date echo "================= finishing extract-clinicaltrialskg.sh ==================" \ No newline at end of file From 7edd9886bf6da84f1ff34570b779c6c29ede6c9c Mon Sep 17 00:00:00 2001 From: ecwood Date: Mon, 2 Sep 2024 14:28:58 -0700 Subject: [PATCH 119/125] #408 download SMPDB while link is failing --- extract/extract-smpdb.sh | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/extract/extract-smpdb.sh b/extract/extract-smpdb.sh index 8062a446..a9d86592 100755 --- a/extract/extract-smpdb.sh +++ b/extract/extract-smpdb.sh @@ -27,8 +27,12 @@ smpdb_link="https://pathbank.org/downloads/pathbank_all_pathways.csv.zip" pwml_link="https://pathbank.org/downloads/pathbank_all_pwml.zip" smpdb_pmids_file="SMPDB_pubmed_IDs.csv" -${curl_get} ${output_dir}/ ${smpdb_link} > ${output_dir}/${smpdb_output_file}.zip -${curl_get} ${output_dir}/ ${pwml_link} > ${output_dir}/${pw_output_file} +# ${curl_get} ${output_dir}/ ${smpdb_link} > ${output_dir}/${smpdb_output_file}.zip +# ${curl_get} ${output_dir}/ ${pwml_link} > ${output_dir}/${pw_output_file} + +# Temporary patch due to cURL failure +${s3_cp_cmd} s3://${s3_bucket}/${smpdb_output_file}.zip ${output_dir}/${smpdb_output_file}.zip +${s3_cp_cmd} s3://${s3_bucket}/${pw_output_file} ${output_dir}/${pw_output_file} unzip -o ${output_dir}/${smpdb_output_file}.zip -d ${output_dir}/ unzip -o -q ${output_dir}/${pw_output_file} -d ${output_dir}/ From 5cef56ae7397a242a162827a5479dffc763a51ee Mon Sep 17 00:00:00 2001 From: ecwood Date: Mon, 2 Sep 2024 20:59:49 -0700 Subject: [PATCH 120/125] #408 cURL issue with HMDB --- extract/extract-hmdb.sh | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/extract/extract-hmdb.sh b/extract/extract-hmdb.sh index 93f1e0b1..cf362477 100755 --- a/extract/extract-hmdb.sh +++ b/extract/extract-hmdb.sh @@ -22,7 +22,10 @@ output_file=hmdb_metabolites hmdb_link="https://hmdb.ca/system/downloads/current/hmdb_metabolites.zip" -${curl_get} ${hmdb_link} > ${BUILD_DIR}/${output_file}.zip +# ${curl_get} ${hmdb_link} > ${BUILD_DIR}/${output_file}.zip + +# Temporary patch due to cURL issues +${s3_cp_cmd} s3://${s3_bucket}/hmdb_metabolites.zip ${BUILD_DIR}/${output_file}.zip unzip -o ${BUILD_DIR}/${output_file}.zip -d ${BUILD_DIR} From 893fb716362dd9eeb9c96e1913f68e26ec178d0d Mon Sep 17 00:00:00 2001 From: ecwood Date: Tue, 3 Sep 2024 22:02:56 -0700 Subject: [PATCH 121/125] #408 build issue with knowledge_source node curies --- maps/kg2-provided-by-curie-to-infores-curie.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/maps/kg2-provided-by-curie-to-infores-curie.yaml b/maps/kg2-provided-by-curie-to-infores-curie.yaml index 37854c95..25176d2c 100644 --- a/maps/kg2-provided-by-curie-to-infores-curie.yaml +++ b/maps/kg2-provided-by-curie-to-infores-curie.yaml @@ -1,4 +1,4 @@ -'ClinicalTrialKG:': +'ClinicalTrialsKG:': source_name: Multiomics ClinicalTrials KP API infores_curie: infores:biothings-multiomics-clinicaltrials knowledge_type: knowledge_source @@ -22,7 +22,7 @@ DOID:doid.owl: source_name: DrugCentral infores_curie: infores:drugcentral knowledge_type: knowledge_source -EFO:efo.owl: +OBO:efo.owl: source_name: Experimental Factor Ontology infores_curie: infores:efo knowledge_type: knowledge_source From b37ee73198fad16c5e7d448486b661d8b9f39714 Mon Sep 17 00:00:00 2001 From: ecwood Date: Tue, 3 Sep 2024 23:33:45 -0700 Subject: [PATCH 122/125] #408 missing predicate --- maps/predicate-remap.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git 
a/maps/predicate-remap.yaml b/maps/predicate-remap.yaml index 3d4bc482..3927f0e8 100644 --- a/maps/predicate-remap.yaml +++ b/maps/predicate-remap.yaml @@ -3545,6 +3545,8 @@ RO:0002159: RO:0002160: operation: keep core_predicate: biolink:in_taxon +RO:0002161: + operation: delete RO:0002162: operation: keep core_predicate: biolink:in_taxon From 257997298ac0512f3979d0fb01b74ee460d72ca9 Mon Sep 17 00:00:00 2001 From: ecwood Date: Sun, 8 Sep 2024 13:27:05 -0700 Subject: [PATCH 123/125] #408 bucket problem --- neo4j/tsv-to-neo4j.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/neo4j/tsv-to-neo4j.sh b/neo4j/tsv-to-neo4j.sh index 07c2f692..bd1a3682 100755 --- a/neo4j/tsv-to-neo4j.sh +++ b/neo4j/tsv-to-neo4j.sh @@ -54,7 +54,7 @@ rm -r -f ${tsv_dir} mkdir -p ${tsv_dir} # get the latest KG2 version -${s3_cp_cmd} s3://${s3_bucket}/${kg2_version_file} ${kg2_version_file_local} +${s3_cp_cmd} s3://${s3_bucket_public}/${kg2_version_file} ${kg2_version_file_local} kg2_version=`cat ${kg2_version_file_local}` # download the latest TSV files from the S3 Bucket From 08bd4c5d22b8f07c2043bf50c6a2ff7461f12012 Mon Sep 17 00:00:00 2001 From: ecwood Date: Sun, 8 Sep 2024 14:30:53 -0700 Subject: [PATCH 124/125] #408 kg2-versions entry for KG2.10.1 --- docs/kg2-versions.md | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/docs/kg2-versions.md b/docs/kg2-versions.md index 4f2744e1..34121b52 100644 --- a/docs/kg2-versions.md +++ b/docs/kg2-versions.md @@ -1,3 +1,38 @@ +# 2.10.1 +**Date: 2024.09.02** + +Counts: +- Nodes: 8,507,201 +- Edges: 57,418,405 + +Issues: +- Issue [#388](https://github.com/RTXteam/RTX-KG2/issues/388) +- Issue [#392](https://github.com/RTXteam/RTX-KG2/issues/392) +- Issue [#398](https://github.com/RTXteam/RTX-KG2/issues/398) +- Issue [#404](https://github.com/RTXteam/RTX-KG2/issues/404) +- Additional issues that arose during the build: [#395 (Comment)](https://github.com/RTXteam/RTX-KG2/issues/395#issuecomment-2223612095) + +Build info: +- Biolink Model version: 4.2.1 +- InfoRes Registry version: 0.2.8 +- Build host: `kg2101build.rtx.ai` +- Build directory: `/home/ubuntu/kg2-build` +- Build code branch: `midjuly24work` +- Neo4j endpoint CNAME: `kg2endpoint-kg2-10-1.rtx.ai` +- Neo4j endpoint hostname: `kg2endpoint4.rtx.ai` +- Tracking issue for the build: [#408](https://github.com/RTXteam/RTX-KG2/issues/408) +- Major knowledge source versions: + - SemMedDB: `43 (2023)` + - UMLS: `2023AA` + - ChEMBL: `33` + - DrugBank: `5.1.10` + - Ensembl: `106` + - Reactome: `80` + - UniProtKB: `2024_04` + - DrugCentral: `52` + - KEGG: `111.0` + + # 2.10.0 **Date: 2024.07.11** From b158cc578931225f06e445ad4ebb5e1f563f36c818ed Mon Sep 17 00:00:00 2001 From: ecwood Date: Sun, 8 Sep 2024 14:31:10 -0700 Subject: [PATCH 125/125] #408 rest of kg2-versions entry for KG2.10.1 --- docs/kg2-versions.md | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/docs/kg2-versions.md b/docs/kg2-versions.md index 34121b52..33846422 100644 --- a/docs/kg2-versions.md +++ b/docs/kg2-versions.md @@ -6,11 +6,18 @@ Counts: - Edges: 57,418,405 Issues: +- Issue [#140](https://github.com/RTXteam/RTX-KG2/issues/140) +- Issue [#387](https://github.com/RTXteam/RTX-KG2/issues/387) - Issue [#388](https://github.com/RTXteam/RTX-KG2/issues/388) +- Issue [#390](https://github.com/RTXteam/RTX-KG2/issues/390) - Issue [#392](https://github.com/RTXteam/RTX-KG2/issues/392) +- Issue [#393](https://github.com/RTXteam/RTX-KG2/issues/393) - Issue
[#398](https://github.com/RTXteam/RTX-KG2/issues/398) +- Issue [#399](https://github.com/RTXteam/RTX-KG2/issues/399) +- Issue [#400](https://github.com/RTXteam/RTX-KG2/issues/400) - Issue [#404](https://github.com/RTXteam/RTX-KG2/issues/404) -- Additional issues that arose during the build: [#395 (Comment)](https://github.com/RTXteam/RTX-KG2/issues/395#issuecomment-2223612095) +- Issue [#405](https://github.com/RTXteam/RTX-KG2/issues/405) +- Additional issues that arose during the build: [#408 (Comment)](https://github.com/RTXteam/RTX-KG2/issues/408#issuecomment-2336826509) Build info: - Biolink Model version: 4.2.1
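As a companion to the edge-blocklist logic introduced in patch 105 (the load_edge_blocklist helper in process/filter_kg_and_remap_predicates.py), the following is a minimal, self-contained sketch of how blocklist YAML entries expand into blocked (subject, predicate, object) triples; the embedded YAML snippet is an illustrative stand-in for the full maps/edge-blocklist.yaml, and the script assumes PyYAML is installed:

#!/usr/bin/env python3
# Illustrative sketch of the KG2 edge-blocklist expansion (see patch 105 above).
import yaml

# Hypothetical miniature blocklist in the same shape as maps/edge-blocklist.yaml
EXAMPLE_BLOCKLIST_YAML = """
- subject_name: Vaccines
  subject_ids:
    - ATC:J07
    - MESH:D014612
  predicate: biolink:causes
  object_name: Autism/Autism Spectrum Disorders
  object_ids:
    - UMLS:C0856975
"""

def load_edge_blocklist(edge_blocklist_dict):
    # Expand each entry into every (subject, predicate, object) combination,
    # mirroring load_edge_blocklist in filter_kg_and_remap_predicates.py
    edge_blocklist = list()
    for edge in edge_blocklist_dict:
        for edge_subject in edge['subject_ids']:
            for edge_object in edge['object_ids']:
                edge_blocklist.append((edge_subject, edge['predicate'], edge_object))
    return edge_blocklist

if __name__ == '__main__':
    blocklist = load_edge_blocklist(yaml.safe_load(EXAMPLE_BLOCKLIST_YAML))
    # process_edges drops an edge when its triple appears in the blocklist
    print(('ATC:J07', 'biolink:causes', 'UMLS:C0856975') in blocklist)  # prints: True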