From 723b998454f6216e3aea12f0075a1b49202d6e10 Mon Sep 17 00:00:00 2001
From: Jay R Bolton <jayrbolton@gmail.com>
Date: Wed, 29 May 2019 11:11:07 -0700
Subject: [PATCH] Initial write

---
 .travis.yml |   5 ++
 README.md   |   4 ++
 config.yaml | 156 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 validate.py |   5 ++
 4 files changed, 170 insertions(+)
 create mode 100644 .travis.yml
 create mode 100644 config.yaml
 create mode 100644 validate.py

diff --git a/.travis.yml b/.travis.yml
new file mode 100644
index 0000000..8f0d593
--- /dev/null
+++ b/.travis.yml
@@ -0,0 +1,5 @@
+language: python
+python:
+  - "3.6"  # quoted so the version stays a string (an unquoted 3.10 would load as the float 3.1)
+script:
+  - python validate.py  # NOTE(review): validate.py imports yaml - confirm PyYAML is installed for this build
diff --git a/README.md b/README.md
index 007c5b9..9158815 100644
--- a/README.md
+++ b/README.md
@@ -5,6 +5,10 @@ This repo holds configuration files that can be used in other KBase search codeb
 * ES type mappings for each index
 * Mapping of KBase Workspace type names to index names in ES.
 
+## Validate syntax
+
+Run `python validate.py` (requires PyYAML) to check that `config.yaml` parses as valid YAML.
+
 ## KBase Search Stack
 
 * [Index Runner](https://github.com/kbaseIncubator/index_runner_deluxe) - Kafka consumer to construct indexes and documents.
diff --git a/config.yaml b/config.yaml
new file mode 100644
index 0000000..e7b81ee
--- /dev/null
+++ b/config.yaml
@@ -0,0 +1,156 @@
+
+# Global field mappings: applied to every workspace-object index, but not to subobject indexes (see ws_subobjects).
+global_ws_mappings:
+  timestamp: {type: date}
+  obj_name: {type: keyword}
+  guid: {type: keyword}
+  creation_date: {type: date}
+  shared_users: {type: keyword}
+  access_group: {type: integer}
+  creator: {type: keyword}
+  version: {type: integer}
+  obj_id: {type: integer}
+  is_public: {type: boolean}
+  copied: {type: keyword}
+  tags: {type: keyword}
+  obj_type_version: {type: keyword}
+  obj_type_module: {type: keyword}
+  obj_type_name: {type: keyword}
+
+# KBase Workspace type name -> index name (each value matches an alias listed under `aliases`)
+ws_type_to_indexes:
+  Narrative: narrative
+  PairedEndLibrary: reads
+  SingleEndLibrary: reads
+  Assembly: assembly
+  Genome: genome
+  Pangenome: pangenome
+  Taxon: taxon
+  Tree: tree
+
+# Indexes treated as "subobjects", i.e. documents nested under a workspace object (such as genome features)
+ws_subobjects:
+  - 'genome_features:1'
+  - 'pangenome_orthologfamily:1'
+
+aliases:  # versioned name -> unversioned alias; NOTE(review): direction inferred from the section name - confirm
+  'narrative:1': narrative
+  'reads:1': reads
+  'assembly:1': assembly
+  'genome:1': genome
+  'genome_features:1': genome_features
+  'pangenome:1': pangenome
+  'pangenome_orthologfamily:1': pangenome_orthologfamily
+  'taxon:1': taxon
+  'tree:1': tree
+
+# Elasticsearch (ES) field mappings, keyed by versioned index name (the same keys used under `aliases`)
+mappings:
+
+  "narrative:1":
+    narrative_title: {type: text}
+    data_objects:
+      type: nested  # ES nested type: each list element is indexed as its own hidden sub-document
+      properties:
+        name: {type: keyword}
+        obj_type: {type: keyword}
+    cells:
+      type: object  # ES object type: inner fields are flattened into the parent document
+      properties:
+        desc: {type: text}
+        cell_type: {type: keyword}
+    total_cells: {type: short}
+
+  "reads:1":
+    sequencing_tech: {type: keyword}
+    size: {type: integer}
+    interleaved: {type: boolean}
+    single_genome: {type: boolean}
+    provenance_services: {type: keyword}
+    phred_type: {type: text}
+    gc_content: {type: float}
+    mean_quality_score: {type: float}
+    mean_read_length: {type: float}
+
+  "assembly:1":
+    assembly_name: {type: keyword}
+    mean_contig_length: {type: float}
+    percent_complete_contigs: {type: float}
+    percent_circle_contigs: {type: float}
+    assembly_id: {type: keyword}
+    gc_content: {type: float}
+    size: {type: integer}
+    num_contigs: {type: integer}
+    taxon_ref: {type: keyword}
+    external_origination_date: {type: keyword}  # TODO: consider {type: date} if the values are reliably date-formatted
+    external_source_id: {type: keyword}
+    external_source: {type: keyword}
+
+  "genome:1":
+    genome_id: {type: keyword}
+    scientific_name: {type: keyword}
+    size: {type: integer}
+    num_contigs: {type: integer}
+    genome_type: {type: keyword}
+    gc_content: {type: float}
+    taxonomy: {type: keyword}
+    mean_contig_length: {type: float}
+    external_origination_date: {type: keyword}  # TODO: consider {type: date} if the values are reliably date-formatted
+    original_source_file_name: {type: keyword}
+    # new fields to include:
+    cds_count: {type: integer}
+    feature_count: {type: integer}
+    mrna_count: {type: integer}
+    non_coding_feature_count: {type: integer}
+    assembly_ref: {type: keyword}
+    source_id: {type: keyword}
+    feature_counts: {type: object}
+    source: {type: keyword}
+    warnings: {type: text}
+
+  "genome_features:1":
+    feature_type: {type: keyword}
+    functions: {type: keyword}
+    contig_ids: {type: keyword}
+    sequence_length: {type: integer}
+    id: {type: keyword}
+    # genome_upa: {type: keyword}
+    guid: {type: keyword}  # also listed in global_ws_mappings, which is documented as not applying to subobjects
+    genome_version: {type: integer}
+    # new fields to include:
+    assembly_ref: {type: keyword}
+    genome_feature_type: {type: keyword}
+    starts: {type: integer}
+    strands: {type: keyword}
+    stops: {type: integer}
+    aliases: {type: keyword}
+
+  "pangenome:1":
+    pangenome_id: {type: keyword}
+    pangenome_name: {type: keyword}
+    pangenome_type: {type: keyword}
+    genome_upas: {type: keyword}
+
+  "pangenome_orthologfamily:1":
+    ortholog_id: {type: keyword}
+    ortholog_type: {type: keyword}
+    function: {type: keyword}
+    gene_ids: {type: keyword}
+
+  "taxon:1":
+    scientific_name: {type: keyword}
+    scientific_lineage: {type: keyword}
+    domain: {type: keyword}
+    kingdom: {type: keyword}
+    parent_taxon_ref: {type: keyword}
+    genetic_code: {type: integer}
+    aliases: {type: keyword}
+
+  "tree:1":
+    tree_name: {type: keyword}
+    type: {type: keyword}  # a document field literally named `type`, not a datatype declaration
+    labels:
+      type: nested  # ES nested type: each list element is indexed as its own hidden sub-document
+      properties:
+        node_id: {type: text}
+        label: {type: text}
diff --git a/validate.py b/validate.py
new file mode 100644
index 0000000..9895a4d
--- /dev/null
+++ b/validate.py
@@ -0,0 +1,5 @@
+import yaml
+
+with open('./config.yaml') as fd:  # path is resolved relative to the current working directory
+    yaml.safe_load(fd)  # parse only, result discarded; safe loader needs no Loader arg and builds plain types only
+    print('YAML successfully parsed')  # reached only when parsing did not raise