cluster-check.sh

#!/bin/bash
#
# A simple tool to check the basics of our cluster - machines up, services running
# This tool uses the cluster hardware list file cluster.txt
#
# The first param specifies the environment
#
# For the second, optional param :
# - if no param is passed, or 'all', all nodes are checked
# - if 'head' is passed only head nodes are checked
# - if 'work' is passed only work nodes are checked
# - if an IP address or hostname is passed, just that node is checked
# 
# If any third param is passed, output is verbose; otherwise only
# output considered an exception is printed. You have to provide a 2nd
# param for a 3rd param to be recognized, because simple positional
# param processing is used
#
# This may be helpful as a quick check after completing the knife
# bootstrap phase (assigning roles to nodes).
#
# Refuse to run without the mandatory environment argument.
if [[ -z "$1" ]]; then
    echo "Usage $0 'environment' [role|IP] [verbose]" >&2
    exit 1
fi
# The tool depends on fping. This file never calls it directly, so it is
# presumably needed by the helper scripts invoked below.
if ! command -v fping >/dev/null 2>&1; then
    # The install hint is wrapped in single quotes on purpose: backticks
    # inside a double-quoted string are command substitution and would
    # actually run the install command instead of printing it.
    echo "This tool uses fping. You should be able to install fping with 'sudo apt-get install fping'" >&2
    exit 1
fi

#set -x
# Keep the positional parameters under descriptive names:
#   ENVIRONMENT - environment name, handed to every nodessh.sh call
#   HOSTWANTED  - optional selector: 'all', a role, an IP address or a hostname
#   VERBOSE     - any non-empty value switches on verbose output
ENVIRONMENT="$1"
HOSTWANTED="$2"
VERBOSE="$3"

echo "$0 : Checking which hosts are online..."
# ${HOSTWANTED:+"..."} hands the selector over as one intact word when it
# was given, and passes no argument at all when it is empty, so the helper
# sees the same argument count as before while spaces or glob characters in
# the selector are no longer split or expanded.
# The captured output is consumed later as a whitespace-separated list that
# is compared against the IP address column of cluster.txt.
UPHOSTS=$(./cluster-whatsup.sh ${HOSTWANTED:+"$HOSTWANTED"})
# verbose trace - information that's not normally needed
#
# Prints every argument on its own line, but only when VERBOSE is non-empty.
# Backslash escapes inside an argument (\t, \n, ...) are expanded, as they
# were with 'echo -e'.
# Globals:   VERBOSE (read)
# Arguments: the messages to print, one per argument
# Outputs:   the messages on stdout, or nothing when VERBOSE is empty
function vtrace {
    local msg
    if [[ -n "$VERBOSE" ]]; then
        for msg in "$@"; do
            # %b expands escapes like 'echo -e' did; quoting the argument
            # keeps its whitespace intact and stops glob characters such
            # as '*' from being expanded into file names.
            printf '%b\n' "$msg"
        done
    fi
}

# Maps an IP address (with dots replaced by dashes) to the hostname that
# cluster.txt lists for it.
declare -A HOSTNAMES

# Counters and host lists used by the summary; initialised so that a run
# which finds nothing reports 0 instead of an empty string.
UP=0
MG=0
SG=0
HOSTS=""
BADHOSTS=""

# Matches the number 2 standing on its own (not as part of 12, 20, 22, ...).
# Used for the fluentd process count below.
TWO_PROCS_RE='(^|[^0-9])2([^0-9]|$)'

if [[ -f cluster.txt ]]; then
    # Pass 1: read the hardware list, work out which of the wanted nodes
    # are online and remember their names.
    # The first column is read into NODENAME rather than HOSTNAME so the
    # shell's own HOSTNAME variable is not overwritten.
    while read -r NODENAME MACADDR IPADDR ILOIPADDR DOMAIN ROLE; do
        # Skip blank lines, the "end" marker line and bootstrap nodes.
        if [[ -z "$NODENAME" || "$NODENAME" == "end" || "$ROLE" == "bootstrap" ]]; then
            continue
        fi

        THISUP="false"
        for UPHOST in $UPHOSTS; do
            if [[ "$IPADDR" == "$UPHOST" ]]; then
                THISUP="true"
                UP=$((UP + 1))
                # one match is enough; do not count a repeated entry twice
                break
            fi
        done

        if [[ -z "$HOSTWANTED" || "$HOSTWANTED" == all || "$HOSTWANTED" == "$ROLE" || "$HOSTWANTED" == "$IPADDR" || "$HOSTWANTED" == "$NODENAME" ]]; then
            if [[ "$THISUP" == "false" ]]; then
                echo "$NODENAME is down"
                continue
            fi
            vtrace "$NODENAME is up"
            HOSTS="$HOSTS $IPADDR"
            HOSTNAMES["${IPADDR//./-}"]="$NODENAME"
        fi
    done < cluster.txt
    vtrace "HOSTS = $HOSTS"
    echo

    # Pass 2: run the remote checks on every wanted node that is online.
    for HOST in $HOSTS; do

        # The first "server" entry of /etc/ntp.conf is resolved and then
        # pinged. The value is piped straight into the consumer on the
        # remote side instead of going through a fixed file name in /tmp.
        echo "Checking name resolution"
        ./nodessh.sh "$ENVIRONMENT" "$HOST" "grep -m1 server /etc/ntp.conf | cut -f2 -d' ' | xargs -n1 host"

        echo "checking NTP server"
        ./nodessh.sh "$ENVIRONMENT" "$HOST" "grep -m1 server /etc/ntp.conf | cut -f2 -d' ' | xargs -n1 ping -c 1"

        NAME=${HOSTNAMES["${HOST//./-}"]}
        vtrace "Checking $NAME ($HOST)..."

        # Fourth whitespace-separated field of the df data line(s), in KB.
        # NOTE(review): in the usual df layout column 4 is "Available",
        # not the total size the messages talk about - confirm intent.
        ROOTSIZE=$(./nodessh.sh "$ENVIRONMENT" "$HOST" "df -k / | grep -v Filesystem")
        # Newlines are folded to spaces so awk sees one single record.
        ROOTSIZE=$(printf '%s' "$ROOTSIZE" | tr '\n' ' ' | awk '{print $4}')
        # Anything that is not a plain number (no answer, an error message
        # fragment, ...) would break the arithmetic tests below, so it is
        # treated like a size of 0.
        if [[ ! "$ROOTSIZE" =~ ^[0-9]+$ ]]; then
            ROOTSIZE=0
        fi
        ROOTGIGS=$((ROOTSIZE / (1024 * 1024)))
        if [[ $ROOTSIZE -eq 0 ]]; then
            echo "Root fileystem size = $ROOTSIZE ($ROOTGIGS GB) !!WARNING!!"
            echo "Machine may still be installing the operating system ... skipping"
            continue
        elif [[ $ROOTSIZE -lt 100*1024*1024 ]]; then
            # below 100 GB (the value is in KB)
            echo "Root fileystem size = $ROOTSIZE ($ROOTGIGS GB) !!WARNING!!"
        else
            vtrace "Root fileystem size = $ROOTSIZE ($ROOTGIGS GB) "
        fi

# ugh, so slow
#	printf "Disks : "
#	for DISK in sda sdb sdc sdd sde sdf sdg sdh sdi sdj sdk sdl sdm; do
#	    DISKTHERE=`./nodessh.sh $ENVIRONMENT $HOST "/sbin/fdisk -l /dev/$DISK"`
#	    # this is a bit tricksy. Invoking fdisk without root
#	    # privilege on a disk that is present raises an error, but
#	    # does nothing if the device is not there at all - so a
#	    # "present/notpresent" check is implemented as
#	    # non-null-string/null-string
#	    if [[ ! -z "$DISKTHERE" ]]; then
#	        printf "$DISK "
#	    else
#		printf "!$DISK "
#	    fi
#	done
#	printf "\n"

        # A node needs a default route in both the mgmt and the storage
        # routing table; a missing one marks the host as bad.
        if [[ -z $(./nodessh.sh "$ENVIRONMENT" "$HOST" "ip route show table mgmt | grep default") ]]; then
            echo "$HOST no mgmt default route !!WARNING!!"
            BADHOSTS="$BADHOSTS $HOST"
        else
            vtrace "$HOST has a default mgmt route"
            MG=$((MG + 1))
        fi
        if [[ -z $(./nodessh.sh "$ENVIRONMENT" "$HOST" "ip route show table storage | grep default") ]]; then
            echo "$HOST has no storage default route !!WARNING!!"
            BADHOSTS="$BADHOSTS $HOST"
        else
            vtrace "$HOST has a default storage route"
            SG=$((SG + 1))
        fi

        # Without chef-client the remaining service checks are skipped.
        CHEF=$(./nodessh.sh "$ENVIRONMENT" "$HOST" "which chef-client")
        if [[ -z "$CHEF" ]]; then
            echo "$HOST doesn't seem to have chef installed so probably hasn't been assigned a role"
            echo
            continue
        fi

        # ceph: keep whatever follows the first ':' of the HEALTH line.
        # $STAT is left unquoted for echo on purpose so that multi-line
        # output is folded onto one line before cut sees it.
        STAT=$(./nodessh.sh "$ENVIRONMENT" "$HOST" "ceph -s | grep HEALTH" sudo)
        STAT=$(echo $STAT | cut -f2 -d:)
        if [[ "$STAT" =~ "HEALTH_OK" ]]; then
            vtrace "$HOST ceph : healthy"
        else
            # the host is passed as an argument, never as part of the format
            printf '%s %20s %s\n' "$HOST" ceph "$STAT"
        fi

        # fluentd has a ridiculous status output from the normal
        # service reporting (something like "* ruby running"), try to
        # do better, according to this:
        # http://docs.treasure-data.com/articles/td-agent-monitoring
        # Roughly speaking if we have two lines of output from the
        # following ps command it's in good shape, if not dump the
        # entire output of that command to the status. This needs more
        # work
        FLUENTD=$(./nodessh.sh "$ENVIRONMENT" "$HOST" "ps w -C ruby -C td-agent --no-heading | grep -v chef-client" sudo)
        STAT=$(./nodessh.sh "$ENVIRONMENT" "$HOST" "ps w -C ruby -C td-agent --no-heading | grep -v chef-client | wc -l" sudo)
        STAT=$(echo $STAT | cut -f2 -d:)
        # The count has to be the number 2 itself; a bare "=~ 2" would
        # also accept 12, 20, 22 and so on.
        if [[ "$STAT" =~ $TWO_PROCS_RE ]]; then
            if [[ -n "$VERBOSE" ]]; then
                printf '%s %20s %s\n' "$HOST" "fluentd" "normal"
            fi
        else
            printf '%s %20s %s\n' "$HOST" fluentd "$FLUENTD"
        fi

        for SERVICE in keystone glance-api glance-registry \
            cinder-scheduler cinder-volume cinder-api \
            nova-api nova-novncproxy nova-scheduler nova-consoleauth \
            nova-cert nova-conductor nova-compute nova-network haproxy; do
            STAT=$(./nodessh.sh "$ENVIRONMENT" "$HOST" "service $SERVICE status | grep running" sudo)
            # answers mentioning "unrecognized" are ignored entirely
            if [[ "$STAT" =~ "unrecognized" ]]; then
                continue
            fi
            STAT=$(echo $STAT | cut -f2 -d":")
            if [[ ! "$STAT" =~ "start/running" ]]; then
                printf '%s %20s %s\n' "$HOST" "$SERVICE" "$STAT"
                BADHOSTS="$BADHOSTS $HOST"
            elif [[ -n "$VERBOSE" ]]; then
                # printed inline because vtrace has no printf-style formatting
                printf '%s %20s %s\n' "$HOST" "$SERVICE" "$STAT"
            fi
        done
        echo
    done
else
    echo "Warning 'cluster.txt' not found"
fi
# Print the whitespace-separated host list given in $1 on a single line,
# sorted and with duplicates removed. Prints nothing for an empty list.
# Arguments: $1 - host list, e.g. " 10.0.0.2 10.0.0.1 10.0.0.2"
# Outputs:   the de-duplicated list on stdout, entries separated by one space
dedupe_hosts() {
    local -a hosts
    read -r -a hosts <<<"$1"
    if ((${#hosts[@]} == 0)); then
        return 0
    fi
    # uniq and sort work line by line, so the entries are put on separate
    # lines first and joined again afterwards
    printf '%s\n' "${hosts[@]}" | sort -u | paste -s -d ' ' -
}

# Counters that were never incremented are reported as 0 rather than blank.
echo "$ENVIRONMENT cluster summary: ${UP:-0} hosts up. ${MG:-0} hosts with default mgmt route. ${SG:-0} hosts with default storage route"
# A host is appended to BADHOSTS once per failed check, so list each one once.
BADHOSTS=$(dedupe_hosts "$BADHOSTS")
if [[ -n "$BADHOSTS" ]]; then
    echo "Bad hosts $BADHOSTS - definite issues on these"
fi