Skip to content

Commit

Permalink
Updating the argo workflow monitor to only alert on workflows that ha…
Browse files Browse the repository at this point in the history
…ve a started time
  • Loading branch information
AidanHilt committed Jul 12, 2024
1 parent d04fad6 commit b2d7571
Showing 1 changed file with 14 additions and 12 deletions.
26 changes: 14 additions & 12 deletions kube/services/workflow-age-monitor/argo-workflow-age.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -34,22 +34,24 @@ spec:
# Alert on every Argo workflow (in any namespace) that has actually started and
# has been running for longer than THRESHOLD_TIME seconds.
# THRESHOLD_TIME and SLACK_WEBHOOK_URL are not set in this section; they must
# already be defined by the time it runs.
#
# .status.startedAt has to be carried in the object built by the first jq call,
# otherwise the per-workflow lookup below can never see it.
kubectl get workflows --all-namespaces -o json | jq -c '.items[] | {name: .metadata.name, startedAt: .status.startedAt}' | while IFS= read -r workflow_info; do
  WORKFLOW_NAME=$(printf '%s\n' "$workflow_info" | jq -r '.name')
  # "// empty" turns a missing or null startedAt into an empty string instead
  # of the literal text "null", so the -n test below is meaningful.
  STARTED_TIMESTAMP=$(printf '%s\n' "$workflow_info" | jq -r '.startedAt // empty')

  # Workflows without a start time have no running age to measure.
  if [ -n "$STARTED_TIMESTAMP" ]; then
    # Convert the start timestamp to Unix epoch time. Skip the workflow rather
    # than doing arithmetic on an empty value if the timestamp cannot be parsed.
    if ! STARTED_EPOCH=$(date -d "$STARTED_TIMESTAMP" +%s); then
      echo "Could not parse start time '$STARTED_TIMESTAMP' of workflow $WORKFLOW_NAME, skipping" >&2
      continue
    fi

    # Get current Unix epoch time
    CURRENT_EPOCH=$(date +%s)

    # Calculate how long the workflow has been running, in seconds
    WORKFLOW_AGE=$((CURRENT_EPOCH - STARTED_EPOCH))

    # Check if the running time is greater than the threshold
    if [ "$WORKFLOW_AGE" -gt "$THRESHOLD_TIME" ]; then
      echo "Workflow $WORKFLOW_NAME has been running for over $THRESHOLD_TIME seconds, sending an alert"
      # Send alert to Slack
      curl -X POST -H 'Content-type: application/json' --data "{\"text\":\"WARNING: Workflow \`${WORKFLOW_NAME}\` has been running longer than $THRESHOLD_TIME seconds\"}" "$SLACK_WEBHOOK_URL"
    fi
  fi
done
restartPolicy: OnFailure

0 comments on commit b2d7571

Please sign in to comment.