description: rados/cephadm/workunits/{0-distro/ubuntu_22.04 agent/on mon_election/connectivity
  task/test_monitoring_stack_basic}
duration: 1049.2495551109314
failure_reason: 'Command failed on trial045 with status 4: ''sudo /home/ubuntu/cephtest/cephadm
  --image quay.ceph.io/ceph-ci/ceph:afefaa40503a4bf024e96f97dd7f852b75f00ad6 shell
  -c /etc/ceph/ceph.conf -k /etc/ceph/ceph.client.admin.keyring --fsid 45ea20b2-0c30-11f1-af2e-d404e6e7d460
  -- bash -c \''set -e\nset -x\nceph orch apply node-exporter\nceph orch apply grafana\nceph
  orch apply alertmanager\nceph orch apply prometheus\nsleep 240\nceph orch ls\nceph
  orch ps\nceph orch host ls\nMON_DAEMON=$(ceph orch ps --daemon-type mon -f json
  | jq -r \''"\''"\''last | .daemon_name\''"\''"\'')\nGRAFANA_HOST=$(ceph orch ps
  --daemon-type grafana -f json | jq -e \''"\''"\''.[]\''"\''"\'' | jq -r \''"\''"\''.hostname\''"\''"\'')\nPROM_HOST=$(ceph
  orch ps --daemon-type prometheus -f json | jq -e \''"\''"\''.[]\''"\''"\'' | jq
  -r \''"\''"\''.hostname\''"\''"\'')\nALERTM_HOST=$(ceph orch ps --daemon-type alertmanager
  -f json | jq -e \''"\''"\''.[]\''"\''"\'' | jq -r \''"\''"\''.hostname\''"\''"\'')\nGRAFANA_IP=$(ceph
  orch host ls -f json | jq -r --arg GRAFANA_HOST "$GRAFANA_HOST" \''"\''"\''.[] |
  select(.hostname==$GRAFANA_HOST) | .addr\''"\''"\'')\nPROM_IP=$(ceph orch host ls
  -f json | jq -r --arg PROM_HOST "$PROM_HOST" \''"\''"\''.[] | select(.hostname==$PROM_HOST)
  | .addr\''"\''"\'')\nALERTM_IP=$(ceph orch host ls -f json | jq -r --arg ALERTM_HOST
  "$ALERTM_HOST" \''"\''"\''.[] | select(.hostname==$ALERTM_HOST) | .addr\''"\''"\'')\n#
  check each host node-exporter metrics endpoint is responsive\nALL_HOST_IPS=$(ceph
  orch host ls -f json | jq -r \''"\''"\''.[] | .addr\''"\''"\'')\nfor ip in $ALL_HOST_IPS;
  do\n  curl -s http://${ip}:9100/metric\ndone\n# check grafana endpoints are responsive
  and database health is okay\ncurl -k -s https://${GRAFANA_IP}:3000/api/health\ncurl
  -k -s https://${GRAFANA_IP}:3000/api/health | jq -e \''"\''"\''.database == "ok"\''"\''"\''\n#
  stop mon daemon in order to trigger an alert\nceph orch daemon stop $MON_DAEMON\nsleep
  120\n# check prometheus endpoints are responsive and mon down alert is firing\ncurl
  -s http://${PROM_IP}:9095/api/v1/status/config\ncurl -s http://${PROM_IP}:9095/api/v1/status/config
  | jq -e \''"\''"\''.status == "success"\''"\''"\''\ncurl -s http://${PROM_IP}:9095/api/v1/alerts\ncurl
  -s http://${PROM_IP}:9095/api/v1/alerts | jq -e \''"\''"\''.data | .alerts | .[]
  | select(.labels | .alertname == "CephMonDown") | .state == "firing"\''"\''"\''\n#
  check alertmanager endpoints are responsive and mon down alert is active\ncurl -s
  http://${ALERTM_IP}:9093/api/v2/status\ncurl -s http://${ALERTM_IP}:9093/api/v2/alerts\ncurl
  -s http://${ALERTM_IP}:9093/api/v2/alerts | jq -e \''"\''"\''.[] | select(.labels
  | .alertname == "CephMonDown") | .status | .state == "active"\''"\''"\''\n# check
  prometheus metrics endpoint is not empty and make sure we can get metrics\nMETRICS_URL=$(ceph
  mgr services | jq -r .prometheus)\n[ -n "$METRICS_URL" ] || exit 1\ncurl -s "${METRICS_URL}metrics"
  | grep -q \''"\''"\''^ceph_health_status\''"\''"\''\n\'''''
flavor: default
owner: scheduled_yuriw@soko04.front.sepia.ceph.com
sentry_event: null
status: fail
success: false