Skip to content

Commit

Permalink
SOLR-15300: Report collection and shard "health" state in CLUSTERSTAT…
Browse files Browse the repository at this point in the history
…US response.
  • Loading branch information
sigram committed May 6, 2021
1 parent 69ff663 commit 0eb7b3e
Show file tree
Hide file tree
Showing 4 changed files with 161 additions and 6 deletions.
2 changes: 2 additions & 0 deletions solr/CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,8 @@ New Features

* SOLR-15090: A new 'gcs-repository' contrib can be used to store and retrieve backups from Google Cloud Storage. (Jason Gerlowski, Shalin Mangar, Cao Manh Dat)

* SOLR-15300: Report collection and shard "health" state in CLUSTERSTATUS response. (ab, janhoy)

Improvements
----------------------
* LUCENE-8984: MoreLikeThis MLT is biased for uncommon fields (Andy Hind via Anshum Gupta)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,49 @@ public class ClusterStatus {
private final ZkNodeProps message;
private final String collection; // maybe null

/**
 * Health of a shard or of a whole collection.
 *
 * <p>Constants are declared from best to worst, so the natural enum order can be used to
 * pick the worst of several states (see {@link #combine(Collection)}).
 */
public enum Health {
  /** Every replica is up and the shard has a leader. */
  GREEN,
  /** More than half, but not all, of the replicas are up and the shard has a leader. */
  YELLOW,
  /** At least one, but no more than half, of the replicas are up and the shard has a leader. */
  ORANGE,
  /** The shard has no leader, or none of its replicas is up. */
  RED;

  /** A shard with a leader whose fraction of replicas up is at or below this level is at best {@link #ORANGE}. */
  public static final float ORANGE_LEVEL = 0.5f;
  /** A shard whose fraction of replicas up is at or below this level is {@link #RED}. */
  public static final float RED_LEVEL = 0.0f;

  /**
   * Derives the health of a single shard.
   *
   * @param fractionReplicasUp fraction of the shard's replicas that are up; exactly {@code 1.0f}
   *     is required for {@link #GREEN}
   * @param hasLeader whether the shard has a leader; without one the shard is always {@link #RED}
   * @return the health state matching the given fraction and leader presence
   */
  public static Health calcShardHealth(float fractionReplicasUp, boolean hasLeader) {
    if (!hasLeader) {
      return RED;
    }
    if (fractionReplicasUp == 1.0f) {
      return GREEN;
    }
    if (fractionReplicasUp > ORANGE_LEVEL) {
      return YELLOW;
    }
    return fractionReplicasUp > RED_LEVEL ? ORANGE : RED;
  }

  /**
   * Folds several health states into a single one by keeping the worst of them.
   *
   * @param states the states to combine
   * @return the worst state found, or {@link #GREEN} when {@code states} is empty
   */
  public static Health combine(Collection<Health> states) {
    // Start from the best state and let any strictly worse state replace it.
    return states.stream()
        .reduce(GREEN, (worst, candidate) -> candidate.compareTo(worst) > 0 ? candidate : worst);
  }
}


public ClusterStatus(ZkStateReader zkStateReader, ZkNodeProps props) {
this.zkStateReader = zkStateReader;
this.message = props;
Expand Down Expand Up @@ -131,11 +174,12 @@ public void getClusterStatus(@SuppressWarnings({"rawtypes"})NamedList results)
requestedShards.addAll(Arrays.asList(paramShards));
}

byte[] bytes = Utils.toJSON(clusterStateCollection);
Map<String, Object> docCollection = (Map<String, Object>) Utils.fromJSON(bytes);
collectionStatus = getCollectionStatus(docCollection, name, requestedShards);
byte[] bytes = Utils.toJSON(clusterStateCollection);
Map<String, Object> docCollection = (Map<String, Object>) Utils.fromJSON(bytes);
collectionStatus = getCollectionStatus(docCollection, name, requestedShards);

collectionStatus.put("znodeVersion", clusterStateCollection.getZNodeVersion());

if (collectionVsAliases.containsKey(name) && !collectionVsAliases.get(name).isEmpty()) {
collectionStatus.put("aliases", collectionVsAliases.get(name));
}
Expand Down Expand Up @@ -248,17 +292,39 @@ public static Map<String,Object> postProcessCollectionJSON(Map<String, Object> c
final Map<String, Map<String,Object>> shards = collection != null
? (Map<String, Map<String,Object>>)collection.getOrDefault("shards", Collections.emptyMap())
: Collections.emptyMap();
shards.values().forEach(s -> {
final List<Health> healthStates = new ArrayList<>(shards.size());
shards.forEach((shardName, s) -> {
final Map<String, Map<String,Object>> replicas =
(Map<String, Map<String,Object>>)s.getOrDefault("replicas", Collections.emptyMap());
replicas.values().forEach(r -> {
int[] totalVsActive = new int[2];
boolean hasLeader = false;
for (Map<String, Object> r : replicas.values()) {
totalVsActive[0]++;
boolean active = false;
if (Replica.State.ACTIVE.toString().equals(r.get("state"))) {
totalVsActive[1]++;
active = true;
}
if ("true".equals(r.get("leader")) && active) {
hasLeader = true;
}
String nodeName = (String)r.get(ZkStateReader.NODE_NAME_PROP);
if (nodeName != null) {
// UI needs the base_url set
r.put(ZkStateReader.BASE_URL_PROP, UrlScheme.INSTANCE.getBaseUrlForNodeName(nodeName));
}
});
}
float ratioActive;
if (totalVsActive[0] == 0) {
ratioActive = 0.0f;
} else {
ratioActive = (float) totalVsActive[1] / totalVsActive[0];
}
Health health = Health.calcShardHealth(ratioActive, hasLeader);
s.put("health", health.toString());
healthStates.add(health);
});
collection.put("health", Health.combine(healthStates).toString());
return collection;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -22,17 +22,22 @@
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicLong;

import com.google.common.collect.Lists;
import org.apache.solr.client.solrj.SolrRequest;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.cloud.SolrCloudManager;
import org.apache.solr.client.solrj.embedded.JettySolrRunner;
import org.apache.solr.client.solrj.impl.BaseHttpSolrClient;
import org.apache.solr.client.solrj.impl.CloudSolrClient;
import org.apache.solr.client.solrj.impl.SolrClientCloudManager;
import org.apache.solr.client.solrj.request.CollectionAdminRequest;
import org.apache.solr.client.solrj.request.QueryRequest;
import org.apache.solr.client.solrj.request.V2Request;
import org.apache.solr.client.solrj.response.CollectionAdminResponse;
import org.apache.solr.cloud.CloudUtil;
import org.apache.solr.cloud.ZkConfigSetService;
import org.apache.solr.cloud.ZkTestServer;
import org.apache.solr.common.SolrException;
Expand All @@ -42,10 +47,12 @@
import org.apache.solr.common.cloud.Replica;
import org.apache.solr.common.cloud.Slice;
import org.apache.solr.common.cloud.SolrZkClient;
import org.apache.solr.common.cloud.ZkStateReader;
import org.apache.solr.common.params.CollectionParams;
import org.apache.solr.common.params.ModifiableSolrParams;
import org.apache.solr.common.params.ShardParams;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.util.Utils;
import org.apache.zookeeper.KeeperException;
import org.junit.Test;

Expand Down Expand Up @@ -85,6 +92,7 @@ public void test() throws Exception {
clusterStatusWithCollection();
clusterStatusWithCollectionAndShard();
clusterStatusWithCollectionAndMultipleShards();
clusterStatusWithCollectionHealthState();
clusterStatusWithRouteKey();
clusterStatusAliasTest();
if (!isDistributedCollectionApi) {
Expand Down Expand Up @@ -340,6 +348,64 @@ private void clusterStatusWithCollectionAndMultipleShards() throws IOException,
}
}

/**
 * Checks the "health" values reported by CLUSTERSTATUS for COLLECTION_NAME in three phases:
 * all nodes up (everything GREEN), one node stopped (collection and at least one shard not
 * GREEN), and the node restarted (everything GREEN again).
 */
@SuppressWarnings({"unchecked"})
private void clusterStatusWithCollectionHealthState() throws Exception {
try (CloudSolrClient client = createCloudClient(null)) {
// Phase 1: baseline - the collection and both of its shards are expected to be GREEN.
final CollectionAdminRequest.ClusterStatus request = new CollectionAdminRequest.ClusterStatus();
request.setCollectionName(COLLECTION_NAME);
NamedList<Object> rsp = request.process(client).getResponse();
NamedList<Object> cluster = (NamedList<Object>) rsp.get("cluster");
assertNotNull("Cluster state should not be null", cluster);
Map<String, Object> collection = (Map<String, Object>) Utils.getObjectByPath(cluster, false, "collections/" + COLLECTION_NAME);
assertEquals("collection health", "GREEN", collection.get("health"));
Map<String, Object> shardStatus = (Map<String, Object>) collection.get("shards");
assertEquals(2, shardStatus.size());
String health = (String) Utils.getObjectByPath(shardStatus, false, "shard1/health");
assertEquals("shard1 health", "GREEN", health);
health = (String) Utils.getObjectByPath(shardStatus, false, "shard2/health");
assertEquals("shard2 health", "GREEN", health);

// Phase 2: bring some replicas down by stopping the node chaosMonkey returns for shard1.
JettySolrRunner jetty = chaosMonkey.getShard("shard1", 0);
String nodeName = jetty.getNodeName();
jetty.stop();
ZkStateReader zkStateReader = client.getZkStateReader();
// Wait (up to 30s) until the stopped node is no longer listed among the live nodes.
zkStateReader.waitForLiveNodes(30, TimeUnit.SECONDS, (o, n) -> n != null && !n.contains(nodeName));

rsp = request.process(client).getResponse();
collection = (Map<String, Object>) Utils.getObjectByPath(rsp, false, "cluster/collections/" + COLLECTION_NAME);
assertFalse("collection health should not be GREEN", "GREEN".equals(collection.get("health")));
shardStatus = (Map<String, Object>) collection.get("shards");
assertEquals(2, shardStatus.size());
String health1 = (String) Utils.getObjectByPath(shardStatus, false, "shard1/health");
String health2 = (String) Utils.getObjectByPath(shardStatus, false, "shard2/health");
// Only requires that at least one of the two shards is degraded; which one is not asserted.
assertTrue("shard1=" + health1 + ", shard2=" + health2, !"GREEN".equals(health1) || !"GREEN".equals(health2));

// Phase 3: bring them up again and expect everything to return to GREEN.
jetty.start();
SolrCloudManager cloudManager = new SolrClientCloudManager(null, client);
// Wait (up to 30s) for the node to reappear among the live nodes ...
zkStateReader.waitForLiveNodes(30, TimeUnit.SECONDS, (o, n) -> n != null && n.contains(nodeName));
// ... and then (up to 30s) for every replica of the collection to be active again.
CloudUtil.waitForState(cloudManager, COLLECTION_NAME, 30, TimeUnit.SECONDS, (liveNodes, coll) -> {
for (Replica r : coll.getReplicas()) {
if (!r.isActive(liveNodes)) {
return false;
}
}
return true;
});
rsp = request.process(client).getResponse();
collection = (Map<String, Object>) Utils.getObjectByPath(rsp, false, "cluster/collections/" + COLLECTION_NAME);
assertEquals("collection health", "GREEN", collection.get("health"));
shardStatus = (Map<String, Object>) collection.get("shards");
assertEquals(2, shardStatus.size());
health = (String) Utils.getObjectByPath(shardStatus, false, "shard1/health");
assertEquals("shard1 health", "GREEN", health);
health = (String) Utils.getObjectByPath(shardStatus, false, "shard2/health");
assertEquals("shard2 health", "GREEN", health);

}
}


private void listCollection() throws IOException, SolrServerException {
try (CloudSolrClient client = createCloudClient(null)) {
Expand Down
21 changes: 21 additions & 0 deletions solr/solr-ref-guide/src/cluster-node-management.adoc
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,24 @@ These API commands work with a SolrCloud cluster at the entire cluster level, or

Fetch the cluster status including collections, shards, replicas, configuration name as well as collection aliases and cluster properties.

Additionally, this command reports a `health` status for each collection and shard, to
make it easier to monitor the operational state of the collections. The
following health state values are defined, ordered from best to worst, based on
the percentage of active replicas (`active`):

`GREEN`::
`active == 100%`, all replicas are active and there's a shard leader.
`YELLOW`::
`100% > active > 50%`, AND there's a shard leader.
`ORANGE`::
`50% >= active > 0%`, AND there's a shard leader.
`RED`::
No active replicas *OR* there's no active shard leader.

The collection health state is reported as the worst state of any of its shards; e.g., for a
collection with all shards `GREEN` except for one `YELLOW`, the collection health will be
reported as `YELLOW`.

[.dynamic-tabs]
--
[example.tab-pane#v1clusterstatus]
Expand Down Expand Up @@ -87,6 +105,7 @@ http://localhost:8983/solr/admin/collections?action=CLUSTERSTATUS
"shard1":{
"range":"80000000-ffffffff",
"state":"active",
"health": "GREEN",
"replicas":{
"core_node1":{
"state":"active",
Expand All @@ -102,6 +121,7 @@ http://localhost:8983/solr/admin/collections?action=CLUSTERSTATUS
"shard2":{
"range":"0-7fffffff",
"state":"active",
"health": "GREEN",
"replicas":{
"core_node2":{
"state":"active",
Expand All @@ -119,6 +139,7 @@ http://localhost:8983/solr/admin/collections?action=CLUSTERSTATUS
"znodeVersion": 11,
"autoCreated":"true",
"configName" : "my_config",
"health": "GREEN",
"aliases":["both_collections"]
},
"collection2":{
Expand Down

0 comments on commit 0eb7b3e

Please sign in to comment.