From db0e46ecd49b228643e111288757ad240f84d084 Mon Sep 17 00:00:00 2001
From: Aidan Hilt
Date: Wed, 3 Jul 2024 14:01:25 -0400
Subject: [PATCH] Adding a monitor for long-pending pods in the argo namespace

---
 .../argo-monitors/application.yaml            | 25 ++++++++
 .../argo-monitors/argo-pod-pending.yaml       | 58 +++++++++++++++++++
 2 files changed, 83 insertions(+)
 create mode 100644 kube/services/argo-pod-pending-monitor/argo-monitors/application.yaml
 create mode 100644 kube/services/argo-pod-pending-monitor/argo-monitors/argo-pod-pending.yaml

diff --git a/kube/services/argo-pod-pending-monitor/argo-monitors/application.yaml b/kube/services/argo-pod-pending-monitor/argo-monitors/application.yaml
new file mode 100644
index 000000000..b5778681b
--- /dev/null
+++ b/kube/services/argo-pod-pending-monitor/argo-monitors/application.yaml
@@ -0,0 +1,25 @@
+# Argo CD Application that syncs the pending-pod monitor manifests stored
+# next to this file; this file itself is excluded from the sync.
+apiVersion: argoproj.io/v1alpha1
+kind: Application
+metadata:
+  name: argo-pod-pending-monitor-application
+  namespace: argocd
+spec:
+  destination:
+    namespace: default
+    server: https://kubernetes.default.svc
+  project: default
+  source:
+    repoURL: https://github.com/uc-cdis/cloud-automation.git
+    targetRevision: master
+    # Must be the directory that actually holds the monitor manifests
+    path: kube/services/argo-pod-pending-monitor/argo-monitors
+    directory:
+      exclude: "application.yaml"
+  syncPolicy:
+    automated:
+      prune: true
+      selfHeal: true
+    syncOptions:
+    - CreateNamespace=true
diff --git a/kube/services/argo-pod-pending-monitor/argo-monitors/argo-pod-pending.yaml b/kube/services/argo-pod-pending-monitor/argo-monitors/argo-pod-pending.yaml
new file mode 100644
index 000000000..22bd91dfe
--- /dev/null
+++ b/kube/services/argo-pod-pending-monitor/argo-monitors/argo-pod-pending.yaml
@@ -0,0 +1,58 @@
+# CronJob that, every 5 minutes, looks for pods in the argo namespace stuck in
+# Pending for longer than THRESHOLD_TIME minutes and posts a Slack warning.
+apiVersion: batch/v1
+kind: CronJob
+metadata:
+  # NOTE(review): this name does not describe a pending-pod monitor; confirm it
+  # cannot collide with another CronJob in the default namespace.
+  name: argo-node-age
+  namespace: default
+spec:
+  schedule: "*/5 * * * *"
+  jobTemplate:
+    spec:
+      template:
+        metadata:
+          labels:
+            app: gen3job
+        spec:
+          serviceAccountName: node-monitor
+          containers:
+          - name: kubectl
+            image: quay.io/cdis/awshelper
+            env:
+              # Pods Pending for longer than this many minutes trigger an alert
+              - name: THRESHOLD_TIME
+                value: "15"
+              - name: SLACK_WEBHOOK_URL
+                valueFrom:
+                  configMapKeyRef:
+                    name: global
+                    key: slack_webhook
+            command: ["/bin/bash"]
+            args:
+              - "-c"
+              - |
+                #!/bin/bash
+                # Fail the job instead of printing "All good" when kubectl or jq errors out
+                set -euo pipefail
+
+                # THRESHOLD_TIME is handed to jq via --argjson because a shell variable
+                # is not expanded inside the single-quoted jq program.
+                pending_pods=$(kubectl get pods -n argo -o json \
+                  | jq -r --argjson threshold "$THRESHOLD_TIME" '
+                      [ .items[]
+                        | select(.status.phase == "Pending")
+                        | select(((now - (.metadata.creationTimestamp | fromdateiso8601)) / 60) > $threshold)
+                        | .metadata.name ]
+                      | join(", ")')
+
+                if [[ -n "$pending_pods" ]]; then
+                  echo "Pods $pending_pods have been around too long, sending an alert"
+                  # Build the payload with jq so the message is always valid JSON
+                  payload=$(jq -n --arg text "WARNING: Pods \`${pending_pods}\` are older than ${THRESHOLD_TIME} minutes!" '{text: $text}')
+                  curl -X POST -H 'Content-type: application/json' --data "$payload" "$SLACK_WEBHOOK_URL"
+                else
+                  echo "All good here!"
+                fi
+          restartPolicy: OnFailure