Skip to content

Commit

Permalink
Adding a monitor for long-pending pods in the argo namespace
Browse files Browse the repository at this point in the history
  • Loading branch information
AidanHilt committed Jul 3, 2024
1 parent f050dc8 commit db0e46e
Show file tree
Hide file tree
Showing 2 changed files with 68 additions and 0 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
---
# Argo CD Application that syncs the manifests found under
# kube/services/argo-pod-pending-monitors (cloud-automation repo, master)
# into the "default" namespace of the cluster at kubernetes.default.svc.
apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
  name: argo-pod-pending-monitor-application
  namespace: argocd
spec:
  project: default
  source:
    repoURL: https://github.com/uc-cdis/cloud-automation.git
    targetRevision: master
    path: kube/services/argo-pod-pending-monitors
    directory:
      # Leave application.yaml out of the sync.
      # NOTE(review): presumably that file is this manifest itself - confirm.
      exclude: 'application.yaml'
  destination:
    server: https://kubernetes.default.svc
    namespace: default
  syncPolicy:
    # Sync automatically, deleting resources that were removed from the repo
    # (prune) and reverting drift in the live cluster (selfHeal).
    automated:
      prune: true
      selfHeal: true
    syncOptions:
      - CreateNamespace=true
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
---
# CronJob that runs every five minutes, looks for pods in the "argo"
# namespace that are still Pending more than THRESHOLD_TIME minutes after
# creation, and posts a warning to Slack when it finds any.
#
# NOTE(review): the resource name (argo-node-age), the service account
# (node-monitor) and the NODE_LABEL variable look copied from a node-age
# monitor. They are left unchanged here so existing references keep working;
# confirm the name does not collide with another CronJob in this namespace
# and that the service account may list pods in "argo".
apiVersion: batch/v1
kind: CronJob
metadata:
  name: argo-node-age
  namespace: default
spec:
  schedule: "*/5 * * * *"
  jobTemplate:
    spec:
      template:
        metadata:
          labels:
            app: gen3job
        spec:
          serviceAccountName: node-monitor
          containers:
            - name: kubectl
              image: quay.io/cdis/awshelper
              env:
                # Not read by the script below.
                # TODO(review): drop if nothing else depends on it.
                - name: NODE_LABEL
                  value: purpose=workflow
                # Maximum time, in minutes, a pod may stay Pending before
                # an alert is sent.
                - name: THRESHOLD_TIME
                  value: "15"
                - name: SLACK_WEBHOOK_URL
                  valueFrom:
                    configMapKeyRef:
                      name: global
                      key: slack_webhook
              command: ["/bin/bash"]
              args:
                - "-c"
                - |
                  #!/bin/bash
                  # Make a kubectl failure fail the whole pipeline instead of
                  # being masked by jq's exit status.
                  set -o pipefail

                  # Collect the names (one per line) of pods in the argo
                  # namespace that are Pending and were created more than
                  # THRESHOLD_TIME minutes ago. The threshold is handed to jq
                  # with --arg because the jq program is single-quoted and
                  # the shell does not expand variables inside it.
                  if ! pending_pods=$(kubectl get pods -n argo -o json \
                    | jq -r --arg threshold "$THRESHOLD_TIME" '
                        .items[]
                        | select(.status.phase == "Pending")
                        | select(((now - (.metadata.creationTimestamp | fromdateiso8601)) / 60) > ($threshold | tonumber))
                        | .metadata.name'); then
                    echo "Failed to list pending pods in the argo namespace" >&2
                    exit 1
                  fi

                  if [[ -n "$pending_pods" ]]; then
                    echo "Pods $pending_pods has been around too long, sending an alert"
                    # Let jq build the payload so newlines and quotes in the
                    # pod list are JSON-escaped, and report the configured
                    # threshold rather than a hard-coded value.
                    payload=$(jq -n --arg pods "$pending_pods" --arg threshold "$THRESHOLD_TIME" \
                      '{text: "WARNING: Pods `\($pods)` are older than \($threshold) minutes!"}')
                    curl -X POST -H 'Content-type: application/json' --data "$payload" "$SLACK_WEBHOOK_URL"
                  else
                    echo "All good here!"
                  fi
          restartPolicy: OnFailure

0 comments on commit db0e46e

Please sign in to comment.