From 723b998454f6216e3aea12f0075a1b49202d6e10 Mon Sep 17 00:00:00 2001 From: Jay R Bolton Date: Wed, 29 May 2019 11:11:07 -0700 Subject: [PATCH] Initial write --- .travis.yml | 5 ++ README.md | 4 ++ config.yaml | 156 ++++++++++++++++++++++++++++++++++++++++++++++++++++ validate.py | 5 ++ 4 files changed, 170 insertions(+) create mode 100644 .travis.yml create mode 100644 config.yaml create mode 100644 validate.py diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000..8f0d593 --- /dev/null +++ b/.travis.yml @@ -0,0 +1,5 @@ +language: python +python: + - 3.6 +script: + - python validate.py diff --git a/README.md b/README.md index 007c5b9..9158815 100644 --- a/README.md +++ b/README.md @@ -5,6 +5,10 @@ This repo holds configuration files that can be used in other KBase search codeb * ES type mappings for each index * Mapping of KBase Workspace type names to index names in ES. +## Validate syntax + +Run `python validate.py` to validate the yaml syntax in the config file. + ## KBase Search Stack * [Index Runner](https://github.com/kbaseIncubator/index_runner_deluxe) - Kafka consumer to construct indexes and documents. diff --git a/config.yaml b/config.yaml new file mode 100644 index 0000000..e7b81ee --- /dev/null +++ b/config.yaml @@ -0,0 +1,156 @@ + +# Generic, global type mappings. These apply to all workspace objects, but not subobjects. 
+global_ws_mappings: + timestamp: {type: date} + obj_name: {type: keyword} + guid: {type: keyword} + creation_date: {type: date} + shared_users: {type: keyword} + access_group: {type: integer} + creator: {type: keyword} + version: {type: integer} + obj_id: {type: integer} + is_public: {type: boolean} + copied: {type: keyword} + tags: {type: keyword} + obj_type_version: {type: keyword} + obj_type_module: {type: keyword} + obj_type_name: {type: keyword} + +# Mapping of KBase type names to index names +ws_type_to_indexes: + Narrative: "narrative" + PairedEndLibrary: "reads" + SingleEndLibrary: "reads" + Assembly: "assembly" + Genome: "genome" + Pangenome: "pangenome" + Taxon: "taxon" + Tree: "tree" + +# Which indexes are considered "subobjects" (nested under workspace objects, such as genome features) +ws_subobjects: + - "genome_features:1" + - "pangenome_orthologfamily:1" + +aliases: + "narrative:1": narrative + "reads:1": reads + "assembly:1": assembly + "genome:1": genome + "genome_features:1": genome_features + "pangenome:1": pangenome + "pangenome_orthologfamily:1": pangenome_orthologfamily + "taxon:1": taxon + "tree:1": tree + +# All ES type mappings +mappings: + + "narrative:1": + narrative_title: {type: text} + data_objects: + type: nested + properties: + name: {type: keyword} + obj_type: {type: keyword} + cells: + type: object + properties: + desc: {type: text} + cell_type: {type: keyword} + total_cells: {type: short} + + "reads:1": + sequencing_tech: {type: keyword} + size: {type: integer} + interleaved: {type: boolean} + single_genome: {type: boolean} + provenance_services: {type: keyword} + phred_type: {type: text} + gc_content: {type: float} + mean_quality_score: {type: float} + mean_read_length: {type: float} + + "assembly:1": + assembly_name: {type: keyword} + mean_contig_length: {type: float} + percent_complete_contigs: {type: float} + percent_circle_contigs: {type: float} + assembly_id: {type: keyword} + gc_content: {type: float} + size: {type: 
integer} + num_contigs: {type: integer} + taxon_ref: {type: keyword} + external_origination_date: {type: keyword} # should maybe be of type date? + external_source_id: {type: keyword} + external_source: {type: keyword} + + "genome:1": + genome_id: {type: keyword} + scientific_name: {type: keyword} + size: {type: integer} + num_contigs: {type: integer} + genome_type: {type: keyword} + gc_content: {type: float} + taxonomy: {type: keyword} + mean_contig_length: {type: float} + external_origination_date: {type: keyword} # should maybe be of type date? + original_source_file_name: {type: keyword} + # new fields to include: + cds_count: {type: integer} + feature_count: {type: integer} + mrna_count: {type: integer} + non_coding_feature_count: {type: integer} + assembly_ref: {type: keyword} + source_id: {type: keyword} + feature_counts: {type: object} + source: {type: keyword} + warnings: {type: text} + + "genome_features:1": + feature_type: {type: keyword} + functions: {type: keyword} + contig_ids: {type: keyword} + sequence_length: {type: integer} + id: {type: keyword} + # genome_upa: {type: keyword} + guid: {type: keyword} + genome_version: {type: integer} + # new fields to include: + assembly_ref: {type: keyword} + genome_feature_type: {type: keyword} + starts: {type: integer} + strands: {type: keyword} + stops: {type: integer} + aliases: {type: keyword} + + "pangenome:1": + pangenome_id: {type: keyword} + pangenome_name: {type: keyword} + pangenome_type: {type: keyword} + genome_upas: {type: keyword} + + "pangenome_orthologfamily:1": + ortholog_id: {type: keyword} + ortholog_type: {type: keyword} + function: {type: keyword} + gene_ids: {type: keyword} + + "taxon:1": + scientific_name: {type: keyword} + scientific_lineage: {type: keyword} + domain: {type: keyword} + kingdom: {type: keyword} + parent_taxon_ref: {type: keyword} + genetic_code: {type: integer} + aliases: {type: keyword} + + "tree:1": + tree_name: {type: keyword} + type: {type: keyword} + labels: + type: 
nested + properties: + node_id: {type: text} + label: {type: text} diff --git a/validate.py b/validate.py new file mode 100644 index 0000000..9895a4d --- /dev/null +++ b/validate.py @@ -0,0 +1,5 @@ +import yaml + +with open('./config.yaml') as fd: + yaml.safe_load(fd)  # safe_load: plain-data parse; bare yaml.load() requires a Loader on PyYAML >= 6 + print('YAML successfully parsed')