diff --git a/charms/jimm-k8s/src/charm.py b/charms/jimm-k8s/src/charm.py index 52c6e7bf2..57148f368 100755 --- a/charms/jimm-k8s/src/charm.py +++ b/charms/jimm-k8s/src/charm.py @@ -46,7 +46,7 @@ ) from ops.charm import ActionEvent, CharmBase, RelationJoinedEvent from ops.main import main -from ops.model import ActiveStatus, BlockedStatus, WaitingStatus +from ops.model import ActiveStatus, BlockedStatus, ErrorStatus, WaitingStatus from state import State, requires_state, requires_state_setter @@ -74,6 +74,13 @@ PROMETHEUS_PORT = 8080 +class DeferError(Exception): + """Used to indicate to the calling function that an event could be deferred + if the hook needs to be retried.""" + + pass + + class JimmOperatorCharm(CharmBase): """JIMM Operator Charm.""" @@ -302,15 +309,21 @@ def _update_workload(self, event): }, } container.add_layer("jimm", pebble_layer, combine=True) - if self._ready(): - if container.get_service(JIMM_SERVICE_NAME).is_running(): - logger.info("replanning service") - container.replan() + try: + if self._ready(): + if container.get_service(JIMM_SERVICE_NAME).is_running(): + logger.info("replanning service") + container.replan() + else: + logger.info("starting service") + container.start(JIMM_SERVICE_NAME) + self.unit.status = ActiveStatus("running") + if self.unit.is_leader(): + self.app.status = ActiveStatus() else: - logger.info("starting service") - container.start(JIMM_SERVICE_NAME) - self.unit.status = ActiveStatus("running") - else: + logger.info("workload not ready - returning") + return + except DeferError: logger.info("workload container not ready - deferring") event.defer() return @@ -337,11 +350,23 @@ def _on_stop(self, _): container.stop(JIMM_SERVICE_NAME) except Exception as e: logger.info("failed to stop the jimm service: {}".format(e)) - self._ready() + try: + self._ready() + except DeferError: + logger.info("workload not ready") + return def _on_update_status(self, _): """Update the status of the charm.""" - self._ready() + if self.unit.status.name == ErrorStatus.name: + # Skip ready check if unit in error to allow for error resolution. + logger.info("unit in error status, skipping ready check") + return + try: + self._ready() + except DeferError: + logger.info("workload not ready") + return @requires_state_setter def _on_dashboard_relation_joined(self, event: RelationJoinedEvent): @@ -410,9 +435,7 @@ def _ready(self): self.unit.status = WaitingStatus("stopped") return True else: - logger.error("cannot connect to workload container") - self.unit.status = WaitingStatus("waiting for jimm workload") - return False + raise DeferError def _get_network_address(self, event): return str(self.model.get_binding(event.relation).network.egress_subnets[0].network_address) @@ -562,11 +585,6 @@ def _on_openfga_store_created(self, event: OpenFGAStoreCreateEvent): @requires_state def _get_dns_name(self, event): - if not self._state.is_ready(): - event.defer() - logger.warning("State is not ready") - return None - default_dns_name = "{}.{}-endpoints.{}.svc.cluster.local".format( self.unit.name.replace("/", "-"), self.app.name,