---
# Job configuration for job_id 7606156 of the orch:cephadm suite
# (description: test_monitoring_stack_basic on centos 9.stream with runc).
# Restored to block-style YAML; every key and value is unchanged except for
# the node-exporter URL fix noted in the cephadm.shell task below.

archive_path: /home/teuthworker/archive/adking-2024-03-16_05:03:17-orch:cephadm-wip-adk-testing-2024-03-15-1513-distro-default-smithi/7606156
branch: wip-adk-testing-2024-03-15-1513
description: orch:cephadm/workunits/{0-distro/centos_9.stream_runc agent/off mon_election/classic task/test_monitoring_stack_basic}
email: adking@redhat.com
first_in_suite: false
# Quoted so the identifier stays a string rather than an integer.
job_id: '7606156'
kernel:
  kdb: true
  sha1: distro
last_in_suite: false
machine_type: smithi
name: adking-2024-03-16_05:03:17-orch:cephadm-wip-adk-testing-2024-03-15-1513-distro-default-smithi
no_nested_subset: false
nuke-on-error: true
os_type: centos
os_version: 9.stream

# Per-task overrides; every sha1 below is the same build as the top-level sha1.
overrides:
  admin_socket:
    branch: wip-adk-testing-2024-03-15-1513
  ceph:
    conf:
      global:
        mon election default strategy: 1
      mgr:
        debug mgr: 20
        debug ms: 1
        mgr/cephadm/use_agent: false
      mon:
        debug mon: 20
        debug ms: 1
        debug paxos: 20
      osd:
        debug ms: 1
        debug osd: 20
    flavor: default
    # Entries contain backslash-escaped parentheses, so they look like regex
    # patterns. The script below stops a mon on purpose, which presumably is
    # why the MON_DOWN / quorum messages are listed here -- TODO confirm.
    log-ignorelist:
      - \(MDS_ALL_DOWN\)
      - \(MDS_UP_LESS_THAN_MAX\)
      - MON_DOWN
      - mons down, quorum
      - is down \(out of quorum\)
    sha1: c33a62492e867e5c7d71887c4cfc1411e8834c1d
  ceph-deploy:
    conf:
      client:
        log file: /var/log/ceph/ceph-$name.$pid.log
      mon: {}
  install:
    ceph:
      flavor: default
      sha1: c33a62492e867e5c7d71887c4cfc1411e8834c1d
  selinux:
    allowlist:
      - scontext=system_u:system_r:logrotate_t:s0
  workunit:
    branch: wip-adk-testing-2024-03-15-1513
    sha1: c33a62492e867e5c7d71887c4cfc1411e8834c1d

owner: scheduled_adking@teuthology
priority: 80
repo: https://git.ceph.com/ceph-ci.git

# One inner list per test node; hosts a and b each carry a mon, a mgr and an
# osd, host c carries a mon and an osd only.
roles:
  - - host.a
    - mon.a
    - mgr.a
    - osd.0
  - - host.b
    - mon.b
    - mgr.b
    - osd.1
  - - host.c
    - mon.c
    - osd.2

seed: 1882
sha1: c33a62492e867e5c7d71887c4cfc1411e8834c1d
sleep_before_teardown: 0
subset: 1/15
suite: orch:cephadm
suite_branch: wip-adk-testing-2024-03-15-1513
suite_path: /home/teuthworker/src/git.ceph.com_ceph-c_c33a62492e867e5c7d71887c4cfc1411e8834c1d/qa
suite_relpath: qa
suite_repo: https://git.ceph.com/ceph-ci.git
suite_sha1: c33a62492e867e5c7d71887c4cfc1411e8834c1d

# Test machines mapped to their SSH host public keys.
targets:
  smithi029.front.sepia.ceph.com: ecdsa-sha2-nistp256 AAAAE2VjZHNhLXNoYTItbmlzdHAyNTYAAAAIbmlzdHAyNTYAAABBBKC6FtpKMY/xJrAoqNdk6vJ/EjVFajX0zYbyLFv9p6EOUqkURd1KBEW0qy69EuviIDkuIWblWec4Q92zvX1YKwg=
  smithi046.front.sepia.ceph.com: ecdsa-sha2-nistp256 AAAAE2VjZHNhLXNoYTItbmlzdHAyNTYAAAAIbmlzdHAyNTYAAABBBCeqOy8ra+XcBGrLGmChEUzTwoeGHHG5K7L2o9Ve/V6T8aKuXNUYv6/0Jb8aGocPEVYkY/Yrfd9R/4xm3cRfCBs=
  smithi179.front.sepia.ceph.com: ecdsa-sha2-nistp256 AAAAE2VjZHNhLXNoYTItbmlzdHAyNTYAAAAIbmlzdHAyNTYAAABBBN+ridr31+cIJdgUcnbbJGjinkAVH2S2g9cSQFq2sTxNXrCxUX14s0uIbStMBObCMvwnsu/H7E/iYjTaCCZnQYU=

tasks:
  # Install runc and edit containers.conf so that the "crun" runtime line is
  # replaced by / commented out in favour of "runc".
  - pexec:
      all:
        - sudo dnf install runc -y
        - sudo sed -i 's/^#runtime = "crun"/runtime = "runc"/g' /usr/share/containers/containers.conf
        - sudo sed -i 's/runtime = "crun"/#runtime = "crun"/g' /usr/share/containers/containers.conf
  - install: null
  - cephadm: null
  # Monitoring-stack smoke test, listed under host.a. Lines starting with '#'
  # inside the literal block belong to the shell script, not to YAML.
  # Fix: the node-exporter probe now requests /metrics (was /metric), the
  # path node_exporter actually serves.
  # NOTE(review): curl runs without --fail, so an HTTP error status alone
  # does not abort the script under `set -e`; only the jq -e checks do.
  - cephadm.shell:
      host.a:
        - |
          set -e
          set -x
          ceph orch apply node-exporter
          ceph orch apply grafana
          ceph orch apply alertmanager
          ceph orch apply prometheus
          sleep 240
          ceph orch ls
          ceph orch ps
          ceph orch host ls
          MON_DAEMON=$(ceph orch ps --daemon-type mon -f json | jq -r 'last | .daemon_name')
          GRAFANA_HOST=$(ceph orch ps --daemon-type grafana -f json | jq -e '.[]' | jq -r '.hostname')
          PROM_HOST=$(ceph orch ps --daemon-type prometheus -f json | jq -e '.[]' | jq -r '.hostname')
          ALERTM_HOST=$(ceph orch ps --daemon-type alertmanager -f json | jq -e '.[]' | jq -r '.hostname')
          GRAFANA_IP=$(ceph orch host ls -f json | jq -r --arg GRAFANA_HOST "$GRAFANA_HOST" '.[] | select(.hostname==$GRAFANA_HOST) | .addr')
          PROM_IP=$(ceph orch host ls -f json | jq -r --arg PROM_HOST "$PROM_HOST" '.[] | select(.hostname==$PROM_HOST) | .addr')
          ALERTM_IP=$(ceph orch host ls -f json | jq -r --arg ALERTM_HOST "$ALERTM_HOST" '.[] | select(.hostname==$ALERTM_HOST) | .addr')
          # check each host node-exporter metrics endpoint is responsive
          ALL_HOST_IPS=$(ceph orch host ls -f json | jq -r '.[] | .addr')
          for ip in $ALL_HOST_IPS; do
            curl -s http://${ip}:9100/metrics
          done
          # check grafana endpoints are responsive and database health is okay
          curl -k -s https://${GRAFANA_IP}:3000/api/health
          curl -k -s https://${GRAFANA_IP}:3000/api/health | jq -e '.database == "ok"'
          # stop mon daemon in order to trigger an alert
          ceph orch daemon stop $MON_DAEMON
          sleep 120
          # check prometheus endpoints are responsive and mon down alert is firing
          curl -s http://${PROM_IP}:9095/api/v1/status/config
          curl -s http://${PROM_IP}:9095/api/v1/status/config | jq -e '.status == "success"'
          curl -s http://${PROM_IP}:9095/api/v1/alerts
          curl -s http://${PROM_IP}:9095/api/v1/alerts | jq -e '.data | .alerts | .[] | select(.labels | .alertname == "CephMonDown") | .state == "firing"'
          # check alertmanager endpoints are responsive and mon down alert is active
          curl -s http://${ALERTM_IP}:9093/api/v1/status
          curl -s http://${ALERTM_IP}:9093/api/v1/alerts
          curl -s http://${ALERTM_IP}:9093/api/v1/alerts | jq -e '.data | .[] | select(.labels | .alertname == "CephMonDown") | .status | .state == "active"'

teuthology:
  fragments_dropped: []
  meta: {}
  postmerge: []
teuthology_branch: main
teuthology_sha1: e691533f9cbb33d85b2187bba20d7102f098636d
# Quoted defensively: the value is date-like and must remain a plain string.
timestamp: '2024-03-16_05:03:17'
tube: smithi
user: adking
verbose: false
worker_log: /home/teuthworker/archive/worker_logs/dispatcher.smithi.2226885