forked from MariusCC/chef-bcpc
-
Notifications
You must be signed in to change notification settings - Fork 0
/
cluster-check.sh
executable file
·192 lines (181 loc) · 6.94 KB
/
cluster-check.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
#!/bin/bash
#
# A simple tool to check the basics of our cluster - machines up, services running
# This tool uses the cluster hardware list file cluster.txt
#
# The first param specifies the environment
#
# For the second, optional param :
# - if no param is passed, or 'all', all nodes are checked
# - if 'head' is passed only head nodes are checked
# - if 'work' is passed only work nodes are checked
# - if an IP address or hostname is passed, just that node is checked
#
# If any third param is passed, output is verbose; otherwise only
# output considered an exception is printed. Because simple positional
# parameter processing is used, you must supply a 2nd param (e.g. 'all')
# for a 3rd param to be recognized.
#
# This may be helpful as a quick check after completing the knife
# bootstrap phase (assigning roles to nodes).
#
# The environment name (first positional parameter) is mandatory. Without it,
# print the usage line on stderr and stop with a non-zero status so that a
# calling script can tell the run never happened.
if [[ -z "$1" ]]; then
  echo "Usage $0 'environment' [role|IP] [verbose]" >&2
  exit 1
fi
# fping must be on PATH. The hint is single-quoted on purpose: inside double
# quotes the backticks would be a command substitution and would actually run
# the apt-get command instead of printing it.
if ! command -v fping >/dev/null 2>&1; then
  echo 'This tool uses fping. You should be able to install fping with `sudo apt-get install fping`' >&2
  exit 1
fi
# Announce the reachability probe, then collect the output of the companion
# script; each whitespace-separated word of that output is later compared
# against the node IP addresses read from cluster.txt.
printf '%s\n' "$0 : Checking which hosts are online..."
# $2 is left unquoted so that an omitted second argument expands to no word at
# all instead of being handed over as an empty argument.
UPHOSTS=$(./cluster-whatsup.sh $2)
#set -x
# Descriptive names for the positional parameters used by the rest of the script.
ENVIRONMENT=$1
HOSTWANTED=$2
VERBOSE=$3
#######################################
# verbose trace - information that's not normally needed.
# Prints every argument on its own line, expanding backslash escapes
# (as `echo -e` does), but only when VERBOSE is non-empty.
# Globals:   VERBOSE (read)
# Arguments: zero or more message strings
# Outputs:   one line per argument on stdout when verbose, otherwise nothing
# Returns:   0
#######################################
vtrace() {
  local msg
  [[ -n "$VERBOSE" ]] || return 0
  for msg in "$@"; do
    # Quoted, so runs of spaces survive and '*' is not glob-expanded; printf
    # also cannot mistake a message such as "-n" for an option.
    printf '%b\n' "$msg"
  done
}
declare -A HOSTNAMES
if [[ -f cluster.txt ]]; then
# Walk cluster.txt one line at a time; the whitespace-separated fields are read,
# in order, into NODENAME MACADDR IPADDR ILOIPADDR DOMAIN ROLE.
# Results:
#   UP         - incremented for every listed, non-bootstrap node whose IP
#                address appears in UPHOSTS
#   HOSTS      - space-separated IP addresses of the selected nodes that are up
#   HOSTNAMES  - node name, keyed by the IP address with dots turned into dashes
# The first field goes into NODENAME rather than HOSTNAME so that bash's own
# HOSTNAME variable is not overwritten. The "|| [[ -n ... ]]" clause keeps a
# final line without a trailing newline from being silently dropped.
while read -r NODENAME MACADDR IPADDR ILOIPADDR DOMAIN ROLE || [[ -n "$NODENAME" ]]; do
  # Ignore blank lines, the "end" marker line and the bootstrap node.
  if [[ -z "$NODENAME" || "$NODENAME" = "end" || "$ROLE" = "bootstrap" ]]; then
    continue
  fi

  THISUP="false"
  # UPHOSTS is deliberately unquoted: it is a whitespace-separated list.
  for UPHOST in $UPHOSTS; do
    if [[ "$IPADDR" = "$UPHOST" ]]; then
      THISUP="true"
      UP=$((UP + 1))
      # One match is enough; stopping here also keeps a repeated entry in
      # UPHOSTS from being counted twice.
      break
    fi
  done

  # Only nodes matching the selection (none given, 'all', a role, an IP address
  # or a node name) are reported and queued for the detailed checks.
  if [[ -z "$HOSTWANTED" || "$HOSTWANTED" = all || "$HOSTWANTED" = "$ROLE" \
        || "$HOSTWANTED" = "$IPADDR" || "$HOSTWANTED" = "$NODENAME" ]]; then
    # HOSTS="$HOSTS $HOSTNAME"
    if [[ "$THISUP" = "false" ]]; then
      echo "$NODENAME is down"
      continue
    fi
    vtrace "$NODENAME is up"
    HOSTS="$HOSTS $IPADDR"
    # Lookup key: the IP address with every dot replaced by a dash.
    IDX=${IPADDR//./-}
    HOSTNAMES["$IDX"]="$NODENAME"
  fi
done < cluster.txt
vtrace "HOSTS = $HOSTS"
echo
for HOST in $HOSTS; do
# Remote pipeline that yields the second space-separated field of the first
# line containing "server" in the node's /etc/ntp.conf. It is spliced into both
# probes below instead of being staged in a fixed-name file under /tmp on the
# node, which avoids a predictable temp file and one ssh round-trip per host.
NTPLOOKUP="grep -m1 server /etc/ntp.conf | cut -f2 -d' '"
echo "Checking name resolution"
./nodessh.sh "$ENVIRONMENT" "$HOST" "$NTPLOOKUP | xargs -n1 host"
echo "checking NTP server"
./nodessh.sh "$ENVIRONMENT" "$HOST" "$NTPLOOKUP | xargs -n1 ping -c 1"
# Look up the node name recorded for this IP address (key: dots -> dashes).
IDX=${HOST//./-}
NAME=${HOSTNAMES["$IDX"]}
vtrace "Checking $NAME ($HOST)..."

# Fetch the df line for / from the node. Newlines are folded into spaces first
# so that output wrapped over two lines still yields a single record, then the
# fourth whitespace-separated column is taken.
# NOTE(review): in stock `df -k` output column 4 is "Available", not the total
# size - confirm which value the 100 GB threshold below is meant to test.
ROOTDF=$(./nodessh.sh "$ENVIRONMENT" "$HOST" "df -k / | grep -v Filesystem")
ROOTSIZE=$(awk '{print $4}' <<<"${ROOTDF//$'\n'/ }")
# Anything that is not a plain decimal number (empty output, an error message)
# is treated as 0 rather than being fed to arithmetic evaluation; 10# guards
# against a leading zero being read as octal.
if [[ "$ROOTSIZE" =~ ^[0-9]+$ ]]; then
  ROOTSIZE=$((10#$ROOTSIZE))
else
  ROOTSIZE=0
fi
ROOTGIGS=$((ROOTSIZE / (1024 * 1024)))
if ((ROOTSIZE == 0)); then
  echo "Root fileystem size = $ROOTSIZE ($ROOTGIGS GB) !!WARNING!!"
  echo "Machine may still be installing the operating system ... skipping"
  continue
elif ((ROOTSIZE < 100 * 1024 * 1024)); then
  # Below 100 GB, expressed in the 1K blocks that df -k reports.
  echo "Root fileystem size = $ROOTSIZE ($ROOTGIGS GB) !!WARNING!!"
else
  vtrace "Root fileystem size = $ROOTSIZE ($ROOTGIGS GB) "
fi
# ugh, so slow
# printf "Disks : "
# for DISK in sda sdb sdc sdd sde sdf sdg sdh sdi sdj sdk sdl sdm; do
# DISKTHERE=`./nodessh.sh $ENVIRONMENT $HOST "/sbin/fdisk -l /dev/$DISK"`
# # this is a bit tricksy. Invoking fdisk without root
# # privilege on a disk that is present raises an error, but
# # does nothing if the device is not there at all - so a
# # "present/notpresent" check is implemented as
# # non-null-string/null-string
# if [[ ! -z "$DISKTHERE" ]]; then
# printf "$DISK "
# else
# printf "!$DISK "
# fi
# done
# printf "\n"
# Look for a default route in the node's "mgmt" routing table. A node without
# one is reported and recorded in BADHOSTS; otherwise MG is incremented.
MGMTROUTE=$(./nodessh.sh "$ENVIRONMENT" "$HOST" "ip route show table mgmt | grep default")
if [[ -z "$MGMTROUTE" ]]; then
  echo "$HOST no mgmt default route !!WARNING!!"
  BADHOSTS="$BADHOSTS $HOST"
else
  vtrace "$HOST has a default mgmt route"
  MG=$((MG + 1))
fi

# Same check for the "storage" routing table, counted in SG.
STORAGEROUTE=$(./nodessh.sh "$ENVIRONMENT" "$HOST" "ip route show table storage | grep default")
if [[ -z "$STORAGEROUTE" ]]; then
  echo "$HOST has no storage default route !!WARNING!!"
  BADHOSTS="$BADHOSTS $HOST"
else
  vtrace "$HOST has a default storage route"
  SG=$((SG + 1))
fi
# A node on which `which chef-client` prints nothing is reported and the
# remaining checks for this host are skipped.
CHEF=$(./nodessh.sh "$ENVIRONMENT" "$HOST" "which chef-client")
if [[ -z "$CHEF" ]]; then
  echo "$HOST doesn't seem to have chef installed so probably hasn't been assigned a role"
  echo
  continue
fi

# Ceph health: grab the HEALTH line of `ceph -s` (run with the extra "sudo"
# argument to nodessh.sh) and keep the second ':'-separated field, or the whole
# line when it contains no ':'.
STAT=$(./nodessh.sh "$ENVIRONMENT" "$HOST" "ceph -s | grep HEALTH" sudo)
# shellcheck disable=SC2086 -- unquoted on purpose: flattens the output onto one line
STAT=$(echo $STAT | cut -f2 -d:)
if [[ "$STAT" == *HEALTH_OK* ]]; then
  vtrace "$HOST ceph : healthy"
else
  # Host and status are passed as arguments, never as part of the format.
  printf '%s %20s %s\n' "$HOST" ceph "$STAT"
fi
# fluentd has a ridiculous status output from the normal service reporting
# (something like "* ruby running"), so instead follow
# http://docs.treasure-data.com/articles/td-agent-monitoring :
# roughly speaking, if the ps command below prints exactly two lines fluentd is
# in good shape; if not, the entire ps output is dumped as the status.
# This needs more work.
FLUENTD=$(./nodessh.sh "$ENVIRONMENT" "$HOST" "ps w -C ruby -C td-agent --no-heading | grep -v chef-client" sudo)
STAT=$(./nodessh.sh "$ENVIRONMENT" "$HOST" "ps w -C ruby -C td-agent --no-heading | grep -v chef-client | wc -l" sudo)
# shellcheck disable=SC2086 -- unquoted on purpose: flattens the output onto one line
STAT=$(echo $STAT | cut -f2 -d:)
# Match a free-standing 2 only: a bare "2" pattern would also accept counts
# such as 12 or 20.
TWO_ONLY='(^|[^0-9])2([^0-9]|$)'
if [[ "$STAT" =~ $TWO_ONLY ]]; then
  if [[ -n "$VERBOSE" ]]; then
    printf '%s %20s %s\n' "$HOST" "fluentd" "normal"
  fi
else
  printf '%s %20s %s\n' "$HOST" fluentd "$FLUENTD"
fi
# Query `service <name> status` on the node for each service below and keep
# only the lines mentioning "running".
SERVICES=(
  keystone glance-api glance-registry
  cinder-scheduler cinder-volume cinder-api
  nova-api nova-novncproxy nova-scheduler nova-consoleauth nova-cert
  nova-conductor nova-compute nova-network
  haproxy
)
for SERVICE in "${SERVICES[@]}"; do
  STAT=$(./nodessh.sh "$ENVIRONMENT" "$HOST" "service $SERVICE status | grep running" sudo)
  # Output mentioning "unrecognized" is skipped without being reported.
  if [[ "$STAT" == *unrecognized* ]]; then
    continue
  fi
  # shellcheck disable=SC2086 -- unquoted on purpose: flattens the output onto one line
  STAT=$(echo $STAT | cut -f2 -d":")
  if [[ "$STAT" != *start/running* ]]; then
    # Not reported as start/running: always print it and flag the host.
    printf '%s %20s %s\n' "$HOST" "$SERVICE" "$STAT"
    BADHOSTS="$BADHOSTS $HOST"
  elif [[ -n "$VERBOSE" ]]; then
    # Healthy services are listed in verbose mode only (vtrace has no
    # formatted-output variant, hence the explicit test).
    printf '%s %20s %s\n' "$HOST" "$SERVICE" "$STAT"
  fi
done
echo
done
else
echo "Warning 'cluster.txt' not found"
fi
# Print the distinct members of the argument list, sorted and joined by single
# spaces on one line; prints nothing when called without arguments.
_sorted_unique() {
  (($# > 0)) || return 0
  printf '%s\n' "$@" | sort -u | paste -s -d ' ' -
}

# Final summary; counters that were never incremented are shown as 0.
echo "$ENVIRONMENT cluster summary: ${UP:-0} hosts up. ${MG:-0} hosts with default mgmt route. ${SG:-0} hosts with default storage route"

# A host is appended to BADHOSTS once per failed check, so it may be listed
# several times. The list lives on a single line, so it has to be split into
# words before sort/uniq-style de-duplication can have any effect.
read -r -a BADLIST <<<"$BADHOSTS"
BADHOSTS=$(_sorted_unique "${BADLIST[@]}")
if [[ -n "$BADHOSTS" ]]; then
  echo "Bad hosts $BADHOSTS - definite issues on these"
fi