taxo-db

#!/bin/sh
#
#  taxo-db - SQL serve the taxonomy database
#  Copyright (C) 2017  Marco van Zwetselaar <io@zwets.it>
#
#  This program is free software: you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation, either version 3 of the License, or
#  (at your option) any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License
#  along with this program.  If not, see <http://www.gnu.org/licenses/>.
#
#  Home: http://github.com/zwets/taxo


# Constants -----------------------------------------------------------------

# Our name, for messages
PRG_NAME="$(basename "$0")"

# Default location for the SQLite database
DEFAULT_SQLITE_DB="$HOME/.taxo-db/taxo.db"
SQLITE_DB="${TAXO_DB_FILE:-"$DEFAULT_SQLITE_DB"}"

# The location of the NCBI taxdump file
TAXDUMP_URL="ftp://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz"


# Database definition -------------------------------------------------------
#
# Column descriptions taken from the taxdump readme.txt.
# We use TEMP tables for non-normalised tables and decompose after import.

DATABASE_CREATE_SQL='

SELECT "creating tables";

-- File nodes.dmp / tables Nodes and Ranks

CREATE TEMPORARY TABLE NodesTMP ( -- only used for import, dropped after move to Nodes
  tax_id INTEGER PRIMARY KEY,   -- node id in GenBank taxonomy database
  parent_tax_id INTEGER,        -- parent node id in GenBank taxonomy database
  rank TEXT,                    -- rank of this node (superkingdom, kingdom, ...)
                                -- note: replaced by rank_id join to table Ranks
  embl_code CHAR(2),            -- locus-name prefix; not unique (e.g. SO,PP,AR,..)
  div_id INTEGER,               -- see division.dmp file
  div_is_inh BOOLEAN,           -- 1 if node inherits division from parent
  gc_id INTEGER,                -- see gencode.dmp file
  gc_is_inh BOOLEAN,            -- 1 if node inherits genetic code from parent
  mito_gc_id INTEGER,           -- see gencode.dmp file
  mito_gc_is_inh BOOLEAN,       -- 1 if node inherits mitochondrial gencode from parent
  is_gb_hidden BOOLEAN,         -- 1 if name is suppressed in GenBank entry lineage
  is_hidden_root BOOLEAN,       -- 1 if this subtree has no sequence data yet
  comments VARCHAR              -- free-text comments and citations
  );

CREATE TABLE Nodes (
  tax_id INTEGER PRIMARY KEY,   -- node id in GenBank taxonomy database
  parent_tax_id INTEGER REFERENCES Nodes(tax_id),
  rank_id INTEGER REFERENCES Ranks(rank_id),
  embl_code CHAR(2),            -- locus-name prefix; not unique (e.g. SO,PP,AR,..)
  div_id INTEGER REFERENCES Divisions(div_id),
  div_is_inh BOOLEAN,           -- 1 if node inherits division from parent
  gc_id INTEGER REFERENCES GenCodes(gc_id),
  gc_is_inh BOOLEAN,            -- 1 if node inherits genetic code from parent
  mito_gc_id INTEGER REFERENCES GenCodes(gc_id),
  mito_gc_is_inh BOOLEAN,       -- 1 if node inherits mitochondrial gencode from parent
  is_gb_hidden BOOLEAN,         -- 1 if name is suppressed in GenBank entry lineage
  is_hidden_root BOOLEAN,       -- 1 if this subtree has no sequence data yet
  comments VARCHAR              -- free-text comments and citations
  );

CREATE TABLE Ranks ( -- normalised out from NodesTMP
  rank_id INTEGER PRIMARY KEY,
  rank_name CHAR(20)
  );

CREATE INDEX RankName1TMP ON NodesTMP (rank);
CREATE UNIQUE INDEX RankName2TMP ON Ranks (rank_name);

-- File names.dmp / tables Names and NameClass

CREATE TEMPORARY TABLE NamesTMP (
  tax_id INTEGER,       -- the id of node associated with this name
  name VARCHAR,         -- name itself
  unique_name VARCHAR,  -- the unique variant of name if name not unique
  name_class CHAR(20)   -- {"synonym", "authority", "common name", "scientific name", ...}
                        -- note: replaced by name_class_id pointing to NameClass
  );

CREATE TABLE Names (
  tax_id INTEGER REFERENCES Nodes(tax_id),
  name VARCHAR,                -- name itself
  unique_name VARCHAR,         -- the unique variant of name if name not unique
  name_class_id INTEGER REFERENCES NameClass(name_class_id)
  );

CREATE TABLE NameClass ( -- normalised out from NamesTMP
  name_class_id INTEGER PRIMARY KEY,
  name_class_name CHAR(20)     -- "synonym", "scientific name", "authority", etc.
  );

CREATE INDEX NameClassName1TMP ON NamesTMP (name_class);
CREATE UNIQUE INDEX NameClassName2TMP ON NameClass (name_class_name);

-- File divisions.dmp / table Divisions (division of tree of life)

CREATE TABLE Divisions (
  div_id INTEGER PRIMARY KEY,  -- taxonomy database division id
  div_code CHAR(3),            -- GenBank division code (three characters) BCT, PLN, VRT, MAM, PRI...
  div_name CHAR(20),           -- Bacteria, Invertebrates, Mammals, Synthetic, ...
  div_comments VARCHAR
  );

-- File gencode.dmp / table GenCodes (genetic codes)

CREATE TABLE GenCodes (
  gc_id INTEGER PRIMARY KEY,   -- GenBank genetic code id
  gc_abbr CHAR(1),	       -- genetic code name abbreviation (empty)
  gc_name VARCHAR, 	       -- genetic code name
  gc_cde CHAR(64),	       -- translation table for this genetic code
  gc_starts CHAR(64)
  );

-- File citations.dmp / tables Citations and NodesCitations

CREATE TEMPORARY TABLE CitationsTMP (
  cit_id INTEGER PRIMARY KEY,  -- the unique id of citation
  cit_key VARCHAR,     -- citation key
  pubmed_id INTEGER,   -- unique id in PubMed database (0 if not in PubMed)
  medline_id INTEGER,  -- unique id in MedLine database (0 if not in MedLine)
  cit_url VARCHAR,     -- URL associated with citation
  cit_text VARCHAR,    -- any text (usually article name and authors)
                       -- note: backslash-escaped NEWLINE, TAB, DOUBLE QUOTE, BACKSLASH
  tax_ids VARCHAR      -- list of node ids separated by a single space
                       -- note: removed and replaced by m:n table NodesCitations
  );

CREATE TABLE Citations ( -- citations in literature (m:n relation with Nodes)
  cit_id INTEGER PRIMARY KEY,  -- the unique id of citation
  cit_key VARCHAR,     -- citation key (words)
  pubmed_id INTEGER,   -- unique id in PubMed database (0 if not in PubMed)
  medline_id INTEGER,  -- unique id in MedLine database (0 if not in MedLine)
  cit_url VARCHAR,     -- URL associated with citation
  cit_text VARCHAR     -- any text (usually article name and authors)
                       -- note: backslash-escaped NEWLINE, TAB, DOUBLE QUOTE, BACKSLASH
  );

CREATE TABLE NodesCitations ( -- m:n relation of nodes and citations
  tax_id INTEGER REFERENCES Nodes(tax_id),
  cit_id INTEGER REFERENCES Citations(cit_id)
  );

CREATE UNIQUE INDEX NodesCitationsIdx ON NodesCitations (tax_id, cit_id);

-- File delnodes.dmp / table DelNodes (deleted taxonomy IDs)

CREATE TABLE DelNodes ( -- del_tax_id that exist no longer
  del_tax_id INTEGER PRIMARY KEY  -- note: no del_tax_id references a Nodes.tax_id
  );

-- File mrgnodes.dmp / table MergedNodes (merged or renamed taxonomy IDs)

CREATE TABLE MergedNodes ( -- old_tax_id that have merged into or renamed to tax_id
  old_tax_id INTEGER PRIMARY KEY, -- note: no old_tax_id references a Nodes.tax_id
  tax_id INTEGER REFERENCES Nodes (tax_id)
  );

CREATE INDEX MergedNodesIdx ON MergedNodes (tax_id);
  '

# Database decomposition ----------------------------------------------------
#
# We normalise rank names to table Ranks, name classes to table NameClass,
# and move the m:n relation of tax_id on cit_id to a linking table.

DATABASE_NORMALISE_SQL='
  
SELECT "normalising nodes";

INSERT INTO Ranks (rank_name) SELECT DISTINCT rank as rank_name from NodesTMP;
ALTER TABLE NodesTMP ADD COLUMN rank_id INTEGER;
UPDATE NodesTMP SET rank_id = (SELECT rank_id FROM Ranks WHERE rank_name = rank);
UPDATE Ranks SET rank_name = NULL WHERE rank_name = "no rank";
INSERT INTO Nodes (tax_id, parent_tax_id, rank_id, embl_code, div_id, div_is_inh, gc_id, gc_is_inh, mito_gc_id, mito_gc_is_inh, is_gb_hidden)
       SELECT tax_id, parent_tax_id, rank_id, embl_code, div_id, div_is_inh, gc_id, gc_is_inh, mito_gc_id, mito_gc_is_inh, is_gb_hidden FROM NodesTMP;
DROP INDEX RankName1TMP;
DROP INDEX RankName2TMP;
DROP TABLE NodesTMP;

SELECT "normalising names";

INSERT INTO NameClass (name_class_name) SELECT DISTINCT name_class as name_class_name from NamesTMP;
ALTER TABLE NamesTMP ADD COLUMN name_class_id INTEGER;
UPDATE NamesTMP SET name_class_id = (SELECT name_class_id FROM NameClass WHERE name_class_name = name_class);
INSERT INTO Names (tax_id, name, unique_name, name_class_id)
       SELECT tax_id, name, unique_name, name_class_id FROM NamesTMP;
DROP INDEX NameClassName1TMP;
DROP INDEX NameClassName2TMP;
DROP TABLE NamesTMP;

SELECT "normalising citations";

INSERT INTO Citations SELECT cit_id, cit_key, pubmed_id, medline_id, cit_url, cit_text FROM CitationsTMP;
DROP TABLE CitationsTMP;

SELECT "compacting database";

VACUUM;

SELECT "vacuum done";
  '

# Database views creation ---------------------------------------------------
#
# Define common views that flatten the database to a virtual single table

DATABASE_VIEWS_SQL='

CREATE VIEW SciNamesView ( -- for translating between tax_id and scientific name,
  tax_id,                  -- additionally returns rank name and parent id 
  rank_name, 
  sci_name, 
  parent_id
  ) AS
    SELECT T.tax_id, R.rank_name, N.name, T.parent_tax_id
    FROM Nodes T NATURAL JOIN Names N NATURAL JOIN NameClass C NATURAL JOIN Ranks R
    WHERE C.name_class_name = "scientific name";

CREATE VIEW AllNamesView ( -- for searching on all names
  tax_id,
  rank_name,
  name,
  name_class_name,
  parent_id
  ) AS
    SELECT T.tax_id, R.rank_name, N.name, C.name_class_name, T.parent_tax_id
    FROM Nodes T NATURAL JOIN Names N NATURAL JOIN NameClass C NATURAL JOIN Ranks R;
  '


# Functions -----------------------------------------------------------------

# Emit arguments to stderr if VERBOSE
emit () {
    [ -z "$VERBOSE" ] || echo "${PRG_NAME}: $*" >&2 || true
}

# Exit this script with an error message on stderr
err_exit () {
    echo "${PRG_NAME}: $*" >&2
    exit 1
}

# Remove the argument unless KEEP is set
clean_up () {
    while [ $# -ne 0 ]; do
        if [ -n "$KEEP" ]; then
            echo "$PRG_NAME: not removing $1"
        elif [ -f "$1" ]; then
            emit "removing file: $1"
            rm -f "$1" || true
        elif [ -d "$1" ]; then
            emit "removing directory: $1"
            rm -rf "$1" || true
        fi
        shift
    done
}

# Import database from taxdump archive $1 
import_database () {

    # Check existence of taxdump to import
    [ "$1" = "-" ] || [ -n "$1" -a -f "$1" ] || err_exit "no such file: $1"

    # Create working directory and unpack the taxdump
    local TMP_DIR="$(mktemp -d)" || err_exit "failed to create temporary directory"
    emit "unpacking taxdump in $TMP_DIR"
    tar -C "$TMP_DIR" -xzf "$1" || err_exit "failed to unpack: $1"

    # Produce the NodesCitations import file
    emit "creating $TMP_DIR/nodes_cites.tab"
    awk -b -O 'BEGIN { FS="\t\\|\t"; RS="\t\\|\n"; OFS="\t"; ORS="\n"; }
        { split ($7, tids, " "); for (i in tids) print tids[i], $1; }
        ' "$TMP_DIR/citations.dmp" > "$TMP_DIR/nodes_cites.tab"

    # Recode the field separators to single tabs, and escape unescaped double quotes
    # Note: the citation file has a rather unsanitised cit_text field; it should
    # escape double quote, tab, newline and backslash, but does not always.
    for F in nodes names division gencode citations delnodes merged; do
        local FILE="$TMP_DIR/$F.dmp"
        emit "preparing for import: $FILE"
        sed -i -e 's/\t|//g' -e 's/\([^\\]\)"/\1\\"/g' "$FILE" || exit 1
    done

    # Create temporary database so as to not overwrite existing until success
    local TMP_DBFILE="${SQLITE_DB}.$$"
    emit "create temporary database: $TMP_DBFILE"
    mkdir -p "$(dirname "$TMP_DBFILE")"

    # Run the database creation, import and normalisation
    echo "$DATABASE_CREATE_SQL
    SELECT \"Importing data ...\";
.separator '\t'
.import '$TMP_DIR/gencode.dmp' GenCodes
.import '$TMP_DIR/division.dmp' Divisions
.import '$TMP_DIR/nodes.dmp' NodesTMP
.import '$TMP_DIR/names.dmp' NamesTMP
.import '$TMP_DIR/delnodes.dmp' DelNodes
.import '$TMP_DIR/merged.dmp' MergedNodes
.import '$TMP_DIR/citations.dmp' CitationsTMP
.import '$TMP_DIR/nodes_cites.tab' NodesCitations
    $DATABASE_NORMALISE_SQL
    $DATABASE_VIEWS_SQL
" | sqlite3 -batch -bail "$TMP_DBFILE" || {
        clean_up "$TMP_DBFILE" "$TMP_DIR"
        err_exit "no changes, database import failed"
    } | while read OUTPUT; do emit "$OUTPUT"; done # 2>&1

    # Successful import
    emit "database import successful: $SQLITE_DB"
    clean_up "$TMP_DIR"
    mv -f "$TMP_DBFILE" "$SQLITE_DB"
}

# Show usage information and exit
usage_exit() {
    echo "
Usage: $PRG_NAME [OPTIONS] [SQL|-]

  Query the taxonomy database using SQL.

  OPTIONS
   -d,--db DBFILE      Name of database file (default: ~/.taxo-db/taxo.db
                       or the file pointed to by TAXO_DB_FILE if set)
   -i,--import FILE|-  Create or update DBFILE from taxdump FILE or stdin
   -k,--keep           Do not delete temporary files (for debugging)
   -r,--regexp         Enable REGEXP extension, or exit with error
   -v,--verbose        Show progress messages on stderr
   -h,--help           Show usage information

  This program executes SQL against the taxonomy database DBFILE.  If no
  SQL argument is present or when it is '-', starts a SQLite interactive
  session on stdin.

  To explore the database structure, type '.schema' in interactive mode,
  which gives a fully annotated definition of the database. Type '.help'
  for help, and ^D or '.quit' to quit.

  The DBFILE is generated from the NCBI taxdump archive.  To create or
  update it, download the archive and use --import.  In one command:

    wget $TAXDUMP_URL | ./taxo-db -v -i -

  For REGEXP search to work, SQLite3 must have the PCRE extension.  This
  extension is available on Ubuntu as package sqlite3-pcre.
" >&2
    exit ${1:-1}
}

# Check for sqlite3 

which sqlite3 >/dev/null 2>&1 || err_exit "sqlite3 command not found (is SQLite3 installed?)"

# Parse options

unset KEEP IMPORT_TGZ ENABLE_REGEXP VERBOSE
while [ $# -ne 0 -a "$(expr "$1" : '\(.\).*')" = "-" ]; do
    case $1 in
    -d|--db)      shift && SQLITE_DB="$1" || usage_exit ;;
    --db=*)       SQLITE_DB="${1#--db=}" ;;
    -i|--import)  shift && IMPORT_TGZ="$1" || usage_exit ;;
    --import=*)   IMPORT_TGZ="${1#--import=}" ;;
    -k|--keep)    KEEP=1 ;;
    -r|--regex*)  ENABLE_REGEXP=1 ;;
    -v|--verbose) VERBOSE=1 ;;
    -h|--help)    usage_exit 0 ;;
    *) usage_exit ;;
    esac
    shift
done

# Check arguments

[ $# -lt 2 ] || usage_exit
[ $# -eq 0 ] || SQL="$1;"   # Add gratuitous terminating semicolon

# Possibly import taxdump

if [ -n "$IMPORT_TGZ" ]; then
    import_database "$IMPORT_TGZ" || err_exit "taxdump import failed"
    echo "$(basename "$0"): database imported"
    exit 0
fi

# Check for the SQLite database

[ -f "$SQLITE_DB" ] || err_exit "no database file (use --import or --db): $SQLITE_DB"

# Check for the REGEXP extension, when option present

if [ -n "$ENABLE_REGEXP" ] && ! printf "SELECT 1 REGEXP 1;" | sqlite3 -bail -batch >/dev/null 2>&1; then
    if [ -f "/usr/lib/sqlite3/pcre.so" ]; then
        LOAD_STMT='.load /usr/lib/sqlite3/pcre.so'
    else
        err_exit "cannot do REGEXP search, SQLite3 lacks the required extension"
    fi
fi

# Execute either $SQL against database or start SQLite CLI

if [ -n "$SQL" ]; then # non-interactive 
    printf "${SQL}" | sqlite3 -cmd "$LOAD_STMT" -bail -batch -separator '	' "$SQLITE_DB"
else
    sqlite3 -cmd "$LOAD_STMT" -separator '	' "$SQLITE_DB"
fi

# vim: sts=4:sw=4:ai:si:et