diff --git a/doc/dbbackup.md b/doc/dbbackup.md
new file mode 100644
index 0000000000..9e21f2bde3
--- /dev/null
+++ b/doc/dbbackup.md
@@ -0,0 +1,52 @@
+# TL;DR
+
+This script facilitates the management of database backup and restore within the Gen3 environment. It can establish policies, service accounts, roles, and S3 buckets. Depending on the command provided, it can initiate a database dump, perform a restore, migrate databases to a new RDS Aurora instance, or clone databases from another namespace into an RDS Aurora cluster.
+
+## Usage
+
+```sh
+gen3 dbbackup [dump|restore|va-dump|create-sa|migrate-to-aurora|copy-to-aurora <source_namespace>]
+```
+
+### Commands
+
+#### dump
+
+Initiates a database dump and pushes it to an S3 bucket, creating the essential AWS resources if they are absent. The dump operation is intended to be executed from the namespace/commons that requires the backup.
+
+```sh
+gen3 dbbackup dump
+```
+
+#### restore
+
+Initiates a database restore from an S3 bucket, creating the essential AWS resources if they are absent. The restore operation is meant to be executed in the target namespace where the backup needs to be restored.
+
+```sh
+gen3 dbbackup restore
+```
+
+#### create-sa
+
+Creates the necessary service account and roles for DB copy.
+
+```sh
+gen3 dbbackup create-sa
+```
+
+#### migrate-to-aurora
+
+Triggers a service account creation and a job to migrate a Gen3 commons to an AWS RDS Aurora instance.
+
+```sh
+gen3 dbbackup migrate-to-aurora
+```
+
+#### copy-to-aurora
+
+Triggers a service account creation and a job that copies the indexd, sheepdog, and metadata databases from another namespace into new databases within the same RDS Aurora cluster.
+
+```sh
+gen3 dbbackup copy-to-aurora <source_namespace>
+```
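+#### va-dump
+
+Runs a va-testing DB dump (per the command summary in `gen3/bin/dbbackup.sh`).
+
+```sh
+gen3 dbbackup va-dump
+```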
diff --git a/files/scripts/ecr-access-job.md b/files/scripts/ecr-access-job.md
index 9659b186b9..5f8dff7670 100644
--- a/files/scripts/ecr-access-job.md
+++ b/files/scripts/ecr-access-job.md
@@ -59,7 +59,7 @@ Trust policy (allows Acct2):
 }
 ```
 
-- Policy in the account (Acct2) that contains the DynamoDB table (created automatically by `kube-setup-ecr-access-job.sh`):
+- Policy in the account (Acct2) that contains the DynamoDB table (created automatically by `kube-setup-ecr-access-cronjob.sh`):
 ```
 {
     "Version": "2012-10-17",
diff --git a/files/squid_whitelist/web_whitelist b/files/squid_whitelist/web_whitelist
index e32c7f483a..b0759ba32e 100644
--- a/files/squid_whitelist/web_whitelist
+++ b/files/squid_whitelist/web_whitelist
@@ -14,6 +14,7 @@ clinicaltrials.gov
 charts.bitnami.com
 ctds-planx.atlassian.net
 data.cityofchicago.org
+data.stage.qdr.org
 dataguids.org
 api.login.yahoo.com
 apt.kubernetes.io
diff --git a/gen3/bin/dbbackup.sh b/gen3/bin/dbbackup.sh
index eb9611a907..eeb5695198 100644
--- a/gen3/bin/dbbackup.sh
+++ b/gen3/bin/dbbackup.sh
@@ -1,26 +1,28 @@
 #!/bin/bash
 
 ####################################################################################################
-# Script: dbdump.sh
+# Script: dbbackup.sh
 #
 # Description:
 #   This script facilitates the management of database backups within the gen3 environment. It is
-#   equipped to establish policies, service accounts, roles, and S3 buckets. Depending on the
-#   command provided, it will either initiate a database dump or perform a restore.
+#   equipped to establish policies, service accounts, roles, and S3 buckets. Depending on the
+#   command provided, it will either initiate a database dump, perform a restore, migrate to Aurora,
+#   or copy to Aurora.
 #
 # Usage:
-#   gen3 dbbackup [dump|restore]
+#   gen3 dbbackup [dump|restore|va-dump|create-sa|migrate-to-aurora|copy-to-aurora <source_namespace>]
 #
-#   dump    - Initiates a database dump, creating the essential AWS resources if they are absent.
-#             The dump operation is intended to be executed from the namespace/commons that requires
-#             the backup.
-#   restore - Initiates a database restore, creating the essential AWS resources if they are absent.
-#             The restore operation is meant to be executed in the target namespace, where the backup
-#             needs to be restored.
+#   dump              - Initiates a database dump, creating the essential AWS resources if they are absent.
+#                       The dump operation is intended to be executed from the namespace/commons that requires
+#                       the backup.
+#   restore           - Initiates a database restore, creating the essential AWS resources if they are absent.
+#                       The restore operation is meant to be executed in the target namespace, where the backup
+#                       needs to be restored.
+#   va-dump           - Runs a va-testing DB dump.
+#   create-sa         - Creates the necessary service account and roles for DB copy.
+#   migrate-to-aurora - Triggers a service account creation and a job to migrate a Gen3 commons to an AWS RDS Aurora instance.
+#   copy-to-aurora    - Triggers a service account creation and a job to copy the databases Indexd, Sheepdog & Metadata
+#                       to new databases within an RDS Aurora cluster.
 #
-# Notes:
-#   This script extensively utilizes the AWS CLI and the gen3 CLI. Proper functioning demands a
-#   configured gen3 environment and the availability of the necessary CLI tools.
 #
 ####################################################################################################
@@ -49,7 +51,6 @@ gen3_log_info "namespace: $namespace"
 gen3_log_info "sa_name: $sa_name"
 gen3_log_info "bucket_name: $bucket_name"
 
-
 # Create an S3 access policy if it doesn't exist
 create_policy() {
   # Check if policy exists
@@ -87,7 +88,6 @@ EOM
   fi
 }
 
-
 # Create or update the Service Account and its corresponding IAM Role
 create_service_account_and_role() {
   cluster_arn=$(kubectl config current-context)
@@ -101,7 +101,6 @@ create_service_account_and_role() {
   gen3_log_info "oidc_url: $oidc_url"
   gen3_log_info "role_name: $role_name"
 
-
   cat > ${trust_policy} <<EOM
+        copy-to-aurora)
+            if [ -z "$2" ]; then
+                echo "Usage: gen3 dbbackup copy-to-aurora <source_namespace>"
+                exit 1
+            fi
+            gen3_log_info "Copying databases within Aurora..."
+            copy_to_aurora "$2"
+            ;;
         *)
-            echo "Invalid command. Usage: gen3 dbbackup [dump|restore|va-dump]"
+            echo "Invalid command. Usage: gen3 dbbackup [dump|restore|va-dump|create-sa|migrate-to-aurora|copy-to-aurora <source_namespace>]"
+            return 1
             ;;
     esac
 }
 
-main "$1"
+main "$@"
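The heredoc that writes `${trust_policy}` is truncated in this diff, so the policy body itself is not shown. For orientation only, an IRSA trust policy produced by this kind of setup typically looks like the sketch below; `account_id` and the exact `Condition` keys are assumptions here, not taken from the script:

```bash
# Illustrative sketch only -- the actual heredoc content is elided in the diff above.
# An IRSA trust policy binds the IAM role to one Kubernetes service account
# via the cluster's OIDC identity provider.
cat > "${trust_policy}" <<EOM
{
  "Version": "2012-10-17",
  "Statement": [
    {
      "Effect": "Allow",
      "Principal": {
        "Federated": "arn:aws:iam::${account_id}:oidc-provider/${oidc_url}"
      },
      "Action": "sts:AssumeRoleWithWebIdentity",
      "Condition": {
        "StringEquals": {
          "${oidc_url}:aud": "sts.amazonaws.com",
          "${oidc_url}:sub": "system:serviceaccount:${namespace}:${sa_name}"
        }
      }
    }
  ]
}
EOM
```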
diff --git a/gen3/bin/kube-setup-argo.sh b/gen3/bin/kube-setup-argo.sh
index 677f62257e..1a25a98c82 100644
--- a/gen3/bin/kube-setup-argo.sh
+++ b/gen3/bin/kube-setup-argo.sh
@@ -204,6 +204,18 @@ EOF
     aws iam put-role-policy --role-name ${roleName} --policy-name ${internalBucketPolicy} --policy-document file://$internalBucketPolicyFile || true
   fi
 
+  # Create a secret for the slack webhook
+  alarm_webhook=$(g3kubectl get cm global -o yaml | yq .data.slack_alarm_webhook | tr -d '"')
+
+  if [ -z "$alarm_webhook" ]; then
+    gen3_log_err "Please set a slack_alarm_webhook in the 'global' configmap. This is needed to alert for failed workflows."
+    exit 1
+  fi
+
+  g3kubectl -n argo delete secret slack-webhook-secret --ignore-not-found
+  g3kubectl -n argo create secret generic "slack-webhook-secret" --from-literal=SLACK_WEBHOOK_URL=$alarm_webhook
+
+
   ## if new bucket then do the following
   # Get the aws keys from secret
   # Create and attach lifecycle policy
diff --git a/gen3/bin/kube-setup-hatchery.sh b/gen3/bin/kube-setup-hatchery.sh
index dadbbd9307..97365677d3 100644
--- a/gen3/bin/kube-setup-hatchery.sh
+++ b/gen3/bin/kube-setup-hatchery.sh
@@ -175,6 +175,8 @@ $assumeImageBuilderRolePolicyBlock
       "Action": [
         "batch:DescribeComputeEnvironments",
         "batch:CreateComputeEnvironment",
+        "batch:UpdateComputeEnvironment",
+        "batch:ListJobs",
         "batch:CreateJobQueue",
         "batch:TagResource",
         "iam:ListPolicies",
@@ -197,10 +199,28 @@ $assumeImageBuilderRolePolicyBlock
         "iam:CreateInstanceProfile",
         "iam:AddRoleToInstanceProfile",
         "iam:PassRole",
-        "s3:CreateBucket"
+        "kms:CreateKey",
+        "kms:CreateAlias",
+        "kms:DescribeKey",
+        "kms:TagResource",
+        "s3:CreateBucket",
+        "s3:PutEncryptionConfiguration",
+        "s3:PutBucketPolicy",
+        "s3:PutLifecycleConfiguration"
       ],
       "Resource": "*"
     },
+    {
+      "Sid": "CreateSlrForNextflowBatchWorkspaces",
+      "Effect": "Allow",
+      "Action": "iam:CreateServiceLinkedRole",
+      "Resource": "arn:aws:iam::*:role/aws-service-role/batch.amazonaws.com/*",
+      "Condition": {
+        "StringLike": {
+          "iam:AWSServiceName": "batch.amazonaws.com"
+        }
+      }
+    },
     {
       "Sid": "PassRoleForNextflowBatchWorkspaces",
       "Effect": "Allow",
diff --git a/kube/services/argo-events/workflows/configmap.yaml b/kube/services/argo-events/workflows/configmap.yaml
index c754c36949..4ebb90f198 100644
--- a/kube/services/argo-events/workflows/configmap.yaml
+++ b/kube/services/argo-events/workflows/configmap.yaml
@@ -84,7 +84,7 @@ data:
             purpose: workflow
       limits:
         resources:
-          cpu: 2000
+          cpu: 4000
       providerRef:
         name: workflow-WORKFLOW_NAME
       # Kill nodes after 30 days to ensure they stay up to date
diff --git a/kube/services/argo-pod-pending-monitor/argo-pod-pending.yaml b/kube/services/argo-pod-pending-monitor/argo-pod-pending.yaml
index 9486d06c25..d3d75a84e1 100644
--- a/kube/services/argo-pod-pending-monitor/argo-pod-pending.yaml
+++ b/kube/services/argo-pod-pending-monitor/argo-pod-pending.yaml
@@ -25,7 +25,7 @@ spec:
               valueFrom:
                 configMapKeyRef:
                   name: global
-                  key: slack_webhook
+                  key: slack_alarm_webhook
 
           command: ["/bin/bash"]
           args:
diff --git a/kube/services/argo/values.yaml b/kube/services/argo/values.yaml
index c8178dd2a4..eeb2e9e01e 100644
--- a/kube/services/argo/values.yaml
+++ b/kube/services/argo/values.yaml
@@ -61,6 +61,20 @@ controller:
   workflowDefaults:
     spec:
       archiveLogs: true
+      onExit: alert-on-timeout
+      templates:
+        - name: alert-on-timeout
+          script:
+            image: quay.io/cdis/amazonlinux-debug:master
+            command: [sh]
+            envFrom:
+              - secretRef:
+                  name: slack-webhook-secret
+            source: |
+              failure_reason=$(echo {{workflow.failures}} | jq 'any(.[]; .message == "Step exceeded its deadline")')
+              if [ "$failure_reason" = "true" ]; then
+                curl -X POST -H 'Content-type: application/json' --data "{\"text\":\"ALERT: Workflow {{workflow.name}} has been killed due to timeout\"}" "$SLACK_WEBHOOK_URL"
+              fi
 
   # -- [Node selector]
   nodeSelector:
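Note that `jq any(...)` emits the literal string `true` or `false`, and both are non-empty, so the template must compare against `"true"` rather than relying on a bare non-empty test. A quick local sanity check of the predicate, with sample payloads invented for illustration:

```bash
# Payload shaped like Argo's {{workflow.failures}} output (illustrative).
failures='[{"displayName":"step-1","message":"Step exceeded its deadline"}]'
echo "$failures" | jq 'any(.[]; .message == "Step exceeded its deadline")'   # prints: true

# A failure with any other message yields "false", so the alert is skipped.
failures='[{"displayName":"step-1","message":"OOMKilled"}]'
echo "$failures" | jq 'any(.[]; .message == "Step exceeded its deadline")'   # prints: false
```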
diff --git a/kube/services/jobs/psql-db-aurora-migration-job.yaml b/kube/services/jobs/psql-db-aurora-migration-job.yaml
new file mode 100644
index 0000000000..dc6f40c11a
--- /dev/null
+++ b/kube/services/jobs/psql-db-aurora-migration-job.yaml
@@ -0,0 +1,219 @@
+---
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: psql-db-aurora-migration
+spec:
+  template:
+    metadata:
+      labels:
+        app: gen3job
+    spec:
+      affinity:
+        nodeAffinity:
+          preferredDuringSchedulingIgnoredDuringExecution:
+            - weight: 100
+              preference:
+                matchExpressions:
+                  - key: karpenter.sh/capacity-type
+                    operator: In
+                    values:
+                      - on-demand
+            - weight: 99
+              preference:
+                matchExpressions:
+                  - key: eks.amazonaws.com/capacityType
+                    operator: In
+                    values:
+                      - ONDEMAND
+      serviceAccountName: psql-db-copy-sa
+      containers:
+        - name: pgdump
+          image: quay.io/cdis/awshelper:master
+          imagePullPolicy: Always
+          env:
+            - name: gen3Env
+              valueFrom:
+                configMapKeyRef:
+                  name: global
+                  key: environment
+            - name: JENKINS_HOME
+              value: "devterm"
+            - name: GEN3_HOME
+              value: /home/ubuntu/cloud-automation
+          command: [ "/bin/bash" ]
+          args:
+            - "-c"
+            - |
+              # This job migrates (takes a backup of and restores) the databases in a Gen3 instance to an Aurora RDS cluster.
+              # Requirements:
+              #   1. Aurora server credentials should be present in the Gen3Secrets/creds.json with name 'aurora'.
+              #   2. Ensure that `gen3 psql aurora` and `gen3 secrets decode aurora-creds` work as expected.
+              #   3. The job needs the "psql-db-copy-sa" service account with the necessary permissions to read secrets from all relevant namespaces.
+
+              source "${GEN3_HOME}/gen3/lib/utils.sh"
+              gen3_load "gen3/gen3setup"
+              namespace=$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace)
+              default_databases=($(echo -e "$(gen3 db services)" | sort -r))
+              date_str=$(date -u +%y%m%d_%H%M%S)
+              databases=("${default_databases[@]}")
+              gen3_log_info "databases: ${databases[@]}"
+
+              # Initialize sheepdog_db_name and failed_migrations variables
+              sheepdog_db_name=""
+              failed_migrations=""
+
+              # Find Aurora server credentials
+              aurora_host_name=$(gen3 secrets decode aurora-creds creds.json | jq -r '.db_host')
+              aurora_master_username=$(gen3 secrets decode aurora-creds creds.json | jq -r '.db_username')
+              aurora_master_password=$(gen3 secrets decode aurora-creds creds.json | jq -r '.db_password')
+              aurora_master_database=$(gen3 secrets decode aurora-creds creds.json | jq -r '.db_database')
+
+              gen3_log_info "Aurora Creds: \n aurora_host_name: $aurora_host_name \n aurora_master_username: $aurora_master_username \n aurora_master_database: $aurora_master_database"
+
+              # Verify important variables are present
+              if [ -z "$aurora_host_name" ] || [ -z "$aurora_master_username" ] || [ -z "$aurora_master_password" ] || [ -z "$aurora_master_database" ]; then
+                gen3_log_err "Aurora credentials are missing. Exiting."
+                exit 1
+              fi
+
+              new_resources=""
+
+              # Function to truncate identifiers to Postgres's 63-character limit
+              function truncate_identifier() {
+                local identifier=$1
+                if [ ${#identifier} -gt 63 ]; then
+                  echo "${identifier:0:63}"
+                else
+                  echo "$identifier"
+                fi
+              }
+
+              # Function to create a database with retry logic
+              function create_database_with_retry() {
+                local db_name=$1
+                local retries=5
+                local wait_time=10
+                for i in $(seq 1 $retries); do
+                  PGPASSWORD=${db_password} psql -h $aurora_host_name -U "$db_user" -d postgres -c "CREATE DATABASE $db_name"
+                  if [ $? -eq 0 ]; then
+                    return 0
+                  fi
+                  gen3_log_err "Failed to create database $db_name. Retrying in $wait_time seconds..."
+                  sleep $wait_time
+                done
+                return 1
+              }
+
+              # Loop through each service to:
+              #   - Extract the database credentials.
+              #   - Check if the user already exists; if not, create the user.
+              #   - Grant required privileges.
+              #   - Create the database (except for peregrine).
+              #   - Backup and restore the database on the Aurora cluster.
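+              # Illustrative example of the naming scheme below (hypothetical namespace
+              # "dev-ns", run at 2024-06-01 12:00:00 UTC):
+              #   sheepdog -> db_user: sheepdog_user_dev_ns
+              #               db_name: sheepdog_dev_ns_240601_120000
+              # Both identifiers are truncated to Postgres's 63-character limit if needed.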
+              for database in "${databases[@]}"; do
+                for secret_name in "${database}-creds creds.json" "$database-g3auto dbcreds.json"; do
+                  creds=$(gen3 secrets decode $secret_name 2>/dev/null)
+                  if [ $? -eq 0 ] && [ ! -z "$creds" ]; then
+                    db_hostname=$(echo $creds | jq -r .db_host)
+                    db_username=$(echo $creds | jq -r .db_username)
+                    db_password=$(echo $creds | jq -r .db_password)
+                    db_database=$(echo $creds | jq -r .db_database)
+                    gen3_log_info "Extracting service credentials for $database from $secret_name: \n db_hostname: $db_hostname \n db_username: $db_username \n db_database: $db_database \n"
+                    break
+                  fi
+                done
+
+                if [ -z "$db_hostname" ] || [ -z "$db_username" ] || [ -z "$db_password" ] || [ -z "$db_database" ]; then
+                  gen3_log_err "Failed to extract database credentials for $database"
+                  failed_migrations="${failed_migrations}\nDatabase: $database, Error: Failed to extract credentials"
+                  continue
+                fi
+
+                # Check source database accessibility
+                PGPASSWORD=${db_password} pg_isready -h $db_hostname -U "$db_username" -d "$db_database"
+                if [ $? -ne 0 ]; then
+                  gen3_log_err "Cannot connect to source database $db_database at $db_hostname. Skipping database $database."
+                  failed_migrations="${failed_migrations}\nDatabase: $database, Error: Cannot connect to source database at $db_hostname"
+                  continue
+                fi
+
+                # Define db_user and db_name variables with hyphens replaced by underscores
+                db_user="$(echo $database | tr '-' '_')_user_$(echo $namespace | tr '-' '_')"
+                db_name="$(echo $database | tr '-' '_')_$(echo $namespace | tr '-' '_')_${date_str}"
+
+                # Truncate identifiers if necessary
+                db_user=$(truncate_identifier $db_user)
+                db_name=$(truncate_identifier $db_name)
+
+                # Try to connect to the Aurora database with the extracted credentials.
+                # If the connection succeeds, the user already exists; if not, create the user.
+                PGPASSWORD=${db_password} psql -h $aurora_host_name -U "$db_user" -d postgres -c "\q"
+                if [ $? -eq 0 ]; then
+                  gen3_log_info "User $db_user already exists and the password matches"
+                else
+                  gen3 psql aurora -c "CREATE USER \"$db_user\" WITH PASSWORD '$db_password' CREATEDB"
+                  if [ $? -ne 0 ]; then
+                    gen3_log_err "Failed to create user for $database"
+                    failed_migrations="${failed_migrations}\nDatabase: $database, Error: Failed to create user"
+                    continue
+                  else
+                    gen3_log_info "Database user $db_user created successfully"
+                  fi
+                fi
+
+                if [ "$database" != "peregrine" ]; then
+                  # Create the database with a unique name by appending namespace and date.
+                  create_database_with_retry $db_name
+                  if [ $? -ne 0 ]; then
+                    gen3_log_err "Failed to create database for $database"
+                    failed_migrations="${failed_migrations}\nDatabase: $database, Error: Failed to create database"
+                    continue
+                  else
+                    gen3_log_info "Database $db_name created successfully"
+                    if [ "$database" == "sheepdog" ]; then
+                      sheepdog_db_name=$db_name
+                    fi
+                  fi
+
+                  # Backup the current database and restore it to the newly created database.
+                  if gen3 db backup $database | PGPASSWORD=${db_password} psql -h $aurora_host_name -U "$db_user" -d "$db_name"; then
+                    gen3_log_info "Database $database restored successfully to $db_name"
+                    new_resources="${new_resources}\nSource_Database: $db_database Source_Host: $db_hostname Source_User: $db_username Restored_Database: $db_name User: $db_user"
+                  else
+                    gen3_log_err "Failed to backup and restore database for $database"
+                    failed_migrations="${failed_migrations}\nDatabase: $database, Error: Failed to backup and restore database"
+                  fi
+                fi
+
+                if [ "$database" == "peregrine" ]; then
+                  if [ -n "$sheepdog_db_name" ]; then
+                    gen3 psql aurora -d "$sheepdog_db_name" -c "GRANT ALL PRIVILEGES ON ALL TABLES IN SCHEMA public TO \"$db_user\""
+                    if [ $? -ne 0 ]; then
+                      gen3_log_err "Failed to grant access to sheepdog tables for peregrine user"
+                      failed_migrations="${failed_migrations}\nDatabase: $database, Error: Failed to grant access to sheepdog tables for peregrine user"
+                      continue
+                    else
+                      gen3_log_info "Access to sheepdog tables granted successfully for peregrine user"
+                      new_resources="${new_resources}\nUser: $db_user with access to sheepdog database $sheepdog_db_name"
+                    fi
+                  else
+                    gen3_log_err "Sheepdog database not found for granting permissions to peregrine user"
+                    failed_migrations="${failed_migrations}\nDatabase: $database, Error: Sheepdog database not found for granting permissions"
+                  fi
+                fi
+              done
+
+              # Log the newly created resources
+              gen3_log_info "New resources created on $aurora_host_name\n$new_resources"
+
+              # Log the failed migrations
+              if [ -n "$failed_migrations" ]; then
+                gen3_log_info "Failed migrations:\n$failed_migrations"
+              fi
+
+              # Sleep for 600 seconds to allow the user to check the logs
+              sleep 600
+      restartPolicy: Never
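With the manifest in place, a run follows the usual `gen3 job` flow. This is a sketch of typical usage rather than a transcript from this change; the job name is derived from the manifest filename:

```sh
gen3 job run psql-db-aurora-migration
# follow the job's pod logs while it works through the databases
kubectl logs -f job/psql-db-aurora-migration
```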
diff --git a/kube/services/jobs/psql-db-copy-aurora-job.yaml b/kube/services/jobs/psql-db-copy-aurora-job.yaml
new file mode 100644
index 0000000000..8fd6e899aa
--- /dev/null
+++ b/kube/services/jobs/psql-db-copy-aurora-job.yaml
@@ -0,0 +1,193 @@
+---
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: psql-db-copy-aurora
+spec:
+  template:
+    metadata:
+      labels:
+        app: gen3job
+    spec:
+      affinity:
+        nodeAffinity:
+          preferredDuringSchedulingIgnoredDuringExecution:
+            - weight: 100
+              preference:
+                matchExpressions:
+                  - key: karpenter.sh/capacity-type
+                    operator: In
+                    values:
+                      - on-demand
+            - weight: 99
+              preference:
+                matchExpressions:
+                  - key: eks.amazonaws.com/capacityType
+                    operator: In
+                    values:
+                      - ONDEMAND
+      serviceAccountName: psql-db-copy-sa
+      containers:
+        - name: pgdump
+          image: quay.io/cdis/awshelper:master
+          imagePullPolicy: Always
+          env:
+            - name: gen3Env
+              valueFrom:
+                configMapKeyRef:
+                  name: global
+                  key: environment
+            - name: JENKINS_HOME
+              value: "devterm"
+            - name: GEN3_HOME
+              value: /home/ubuntu/cloud-automation
+            - name: SOURCE_NAMESPACE
+              GEN3_SOURCE_NAMESPACE|-value: "staging"-| # Default value, should be overwritten by the environment variable
+          command: [ "/bin/bash" ]
+          args:
+            - "-c"
+            - |
+              # This script copies specified databases from a source namespace to the current namespace on the same Aurora RDS instance.
+              #
+              # This script requires the following to work properly:
+              #
+              #   1. Aurora server credentials must be present in the Gen3Secrets/creds.json file.
+              #      These credentials should be present as a Kubernetes secret named "aurora-creds".
+              #      This secret should contain the keys: db_host, db_username, db_password, and db_database.
+              #
+              #   2. The "gen3 psql aurora" command should be available to connect to the Aurora server.
+              #
+              #   3. The "gen3 secrets decode aurora-creds creds.json" command should work, allowing the script to decode the necessary secrets.
+              #
+              #   4. The source and the destination databases should be on the same Aurora instance.
+              #
+              #   5. The ServiceAccount, roles, and role binding must be set up using the manifest psql-db-copy-aurora-sa.yaml.
+              #      The psql-db-copy-aurora-sa.yaml manifest is configured for the default namespace.
+              #      Modify the namespace as needed before applying it where the script will run.
+              #      These can be created by executing the command:
+              #        kubectl apply -f ${GEN3_HOME}/kube/services/jobs/psql-db-copy-aurora-sa.yaml
+              #
+              # How to run the script:
+              #   gen3 job run psql-db-copy-aurora -v SOURCE_NAMESPACE <source_namespace>
+              #
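+              # For reference, "gen3 secrets decode aurora-creds creds.json" is expected to yield
+              # JSON carrying the four keys read below; the values here are illustrative only:
+              #   {
+              #     "db_host": "mycommons.cluster-abc123.us-east-1.rds.amazonaws.com",
+              #     "db_username": "postgres",
+              #     "db_password": "...",
+              #     "db_database": "postgres"
+              #   }
+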
The "gen3 psql aurora" command should be available to connect to the Aurora server. + # + # 3. The "gen3 secrets decode aurora-creds creds.json" command should work, allowing the script to decode the necessary secrets. + # + # 4. The source and the destination databases should be on the same Aurora instance. + # + # 5. The ServiceAccount, roles, and role binding must be set up using the script psql-db-copy-aurora-sa.yaml. + # The psql-db-copy-aurora-sa.yaml script is configured for the default namespace. + # Modify the namespace as needed before applying it where the script will run. + # These can be created by executing the command: + # kubectl apply -f ${GEN3_HOME}/kube/services/jobs/psql-db-copy-aurora-sa.yaml + # + # How to run the script: + # gen3 job run psql-db-copy-aurora -v SOURCE_NAMESPACE + # + + source "${GEN3_HOME}/gen3/lib/utils.sh" + gen3_load "gen3/gen3setup" + namespace=$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace) + date_str=$(date -u +%y%m%d_%H%M%S) + # Define the default databases to be copied + databases=( "indexd" "sheepdog" "metadata") + gen3_log_info "databases to be processed: ${databases[@]}" + source_namespace=$SOURCE_NAMESPACE + gen3_log_info "Source Namespace: $source_namespace" + + # find Aurora Server credentials + aurora_host_name=$(gen3 secrets decode aurora-creds creds.json | jq -r '.db_host') + aurora_master_username=$(gen3 secrets decode aurora-creds creds.json | jq -r '.db_username') + aurora_master_password=$(gen3 secrets decode aurora-creds creds.json | jq -r '.db_password') + aurora_database=$(gen3 secrets decode aurora-creds creds.json | jq -r '.db_database') + + # Verify important variables are present + if [ -z "$aurora_host_name" ] || [ -z "$aurora_master_username" ] || [ -z "$aurora_master_password" ] || [ -z "$aurora_database" ]; then + gen3_log_err "Aurora credentials are missing. Exiting." + exit 1 + fi + + # Function to truncate to 63 characters + function truncate_identifier() { + local identifier=$1 + if [ ${#identifier} -gt 63 ]; then + echo "${identifier:0:63}" + else + echo "$identifier" + fi + } + + # Function to decode Kubernetes secrets + function secrets_decode() { + local namespace=$1 + local secret=$2 + local key=$3 + local secrets_value + + secrets_value=$(kubectl get secret -n $namespace $secret -o json 2>/dev/null | jq -r --arg key "$key" '.data[$key]' | base64 --decode --ignore-garbage 2>/dev/null) + if [ $? -ne 0 ] || [ -z "$secrets_value" ]; then + echo "Secret $secret in namespace $namespace not found or failed to decode" >&2 + return 1 + else + echo "$secrets_value" + fi + } + + # Array to hold the names of newly created databases + new_databases=() + + # Looping through each database + for database in "${databases[@]}"; do + source_creds="" + creds="" + + # Try to get the source and destination credentials with the "-g3auto" suffix and key "dbcreds.json" + source_creds=$(secrets_decode $source_namespace ${database}-g3auto dbcreds.json) + if [ $? -ne 0 ]; then + source_creds="" + fi + creds=$(secrets_decode $namespace ${database}-g3auto dbcreds.json) + if [ $? -ne 0 ]; then + creds="" + fi + + # If the "-g3auto" suffix didn't work for both source_creds and creds, try with the suffix "creds" and key "creds.json" + if [ -z "$source_creds" ] && [ -z "$creds" ]; then + source_creds=$(secrets_decode $source_namespace ${database}-creds creds.json) + if [ $? -ne 0 ]; then + source_creds="" + fi + creds=$(secrets_decode $namespace ${database}-creds creds.json) + if [ $? 
+                # DB commands
+                gen3 psql aurora -c "GRANT $db_username TO $aurora_master_username"
+                gen3 psql aurora -c "SELECT pg_terminate_backend(pg_stat_activity.pid) FROM pg_stat_activity WHERE pg_stat_activity.datname = '$source_db_database' AND pid <> pg_backend_pid()"
+                gen3 psql aurora -c "CREATE DATABASE $target_db WITH TEMPLATE $source_db_database OWNER $db_username"
+                pg_command="DO \$\$ DECLARE tbl record; BEGIN FOR tbl IN (SELECT table_schema || '.' || table_name AS full_table_name FROM information_schema.tables WHERE table_schema = 'public') LOOP EXECUTE 'ALTER TABLE ' || tbl.full_table_name || ' OWNER TO $db_username;'; END LOOP; END \$\$;"
+                PGPASSWORD=${aurora_master_password} psql -h $aurora_host_name -U $aurora_master_username -d "$target_db" -c "$pg_command"
+                if [ $? -eq 0 ]; then
+                  gen3_log_info "Successfully processed $database"
+                  new_databases+=("$target_db")
+                else
+                  gen3_log_err "Failed to process $database"
+                fi
+              done
+
+              gen3_log_info "Job Completed"
+
+              # Print the list of newly created databases
+              gen3_log_info "Newly created database names:"
+              for new_db in "${new_databases[@]}"; do
+                gen3_log_info "$new_db"
+              done
+
+              sleep 600
+      restartPolicy: Never
diff --git a/kube/services/jobs/psql-db-copy-aurora-sa.yaml b/kube/services/jobs/psql-db-copy-aurora-sa.yaml
new file mode 100644
index 0000000000..e6977a187f
--- /dev/null
+++ b/kube/services/jobs/psql-db-copy-aurora-sa.yaml
@@ -0,0 +1,30 @@
+---
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: psql-db-copy-sa
+
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  name: psql-db-copy-role
+rules:
+- apiGroups: [""]
+  resources: ["secrets"]
+  verbs: ["get", "watch", "list"]
+
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  name: psql-db-copy-rolebinding
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: psql-db-copy-role
+subjects:
+- kind: ServiceAccount
+  name: psql-db-copy-sa
+  namespace: default # Ensure this references the correct namespace
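Because the ClusterRoleBinding above pins its subject to a specific namespace, it is worth confirming that the service account can actually read secrets in the source namespace before launching the copy job. A minimal check, with placeholder namespaces and assuming you hold impersonation rights:

```sh
kubectl auth can-i get secrets \
  --as=system:serviceaccount:<job-namespace>:psql-db-copy-sa \
  -n <source-namespace>
```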
diff --git a/kube/services/node-monitors/argo-monitors/argo-node-age.yaml b/kube/services/node-monitors/argo-monitors/argo-node-age.yaml
index 890495ee00..b389c072ce 100644
--- a/kube/services/node-monitors/argo-monitors/argo-node-age.yaml
+++ b/kube/services/node-monitors/argo-monitors/argo-node-age.yaml
@@ -27,7 +27,7 @@ spec:
               valueFrom:
                 configMapKeyRef:
                   name: global
-                  key: slack_webhook
+                  key: slack_alarm_webhook
 
           command: ["/bin/bash"]
           args:
@@ -55,4 +55,4 @@ spec:
                 curl -X POST -H 'Content-type: application/json' --data "{\"text\":\"WARNING: Node \`${NODE_NAME}\` is older than 3 hours!\"}" $SLACK_WEBHOOK_URL
               fi
             done
-          restartPolicy: OnFailure
\ No newline at end of file
+          restartPolicy: OnFailure
diff --git a/kube/services/node-monitors/node-not-ready.yaml b/kube/services/node-monitors/node-not-ready.yaml
index 500832fc34..15ed616e6f 100644
--- a/kube/services/node-monitors/node-not-ready.yaml
+++ b/kube/services/node-monitors/node-not-ready.yaml
@@ -21,7 +21,7 @@ spec:
               valueFrom:
                 configMapKeyRef:
                   name: global
-                  key: slack_webhook
+                  key: slack_alarm_webhook
             - name: ENVIRONMENT
               valueFrom:
                 configMapKeyRef:
diff --git a/kube/services/workflow-age-monitor/argo-workflow-age.yaml b/kube/services/workflow-age-monitor/argo-workflow-age.yaml
index 0d0c29115b..52910ad4a1 100644
--- a/kube/services/workflow-age-monitor/argo-workflow-age.yaml
+++ b/kube/services/workflow-age-monitor/argo-workflow-age.yaml
@@ -24,7 +24,7 @@ spec:
               valueFrom:
                 configMapKeyRef:
                   name: global
-                  key: slack_webhook
+                  key: slack_alarm_webhook
 
           command: ["/bin/bash"]
           args:
@@ -32,24 +32,30 @@ spec:
             - |
               #!/bin/bash
               # Get all workflows with specific label and check their age
-              kubectl get workflows --all-namespaces -o json | jq -c '.items[] | {name: .metadata.name, creationTimestamp: .metadata.creationTimestamp}' | while read workflow_info; do
+              kubectl get workflows --all-namespaces -o json | jq -c '.items[] | {name: .metadata.name, startedTimestamp: .status.startedAt}' | while read workflow_info; do
                 WORKFLOW_NAME=$(echo $workflow_info | jq -r '.name')
-                CREATION_TIMESTAMP=$(echo $workflow_info | jq -r '.creationTimestamp')
+                STARTED_TIMESTAMP=$(echo $workflow_info | jq -r '.startedTimestamp')
 
-                # Convert creation timestamp to Unix Epoch time
-                CREATION_EPOCH=$(date -d "$CREATION_TIMESTAMP" +%s)
+                echo "Checking workflow $WORKFLOW_NAME (startedAt: $STARTED_TIMESTAMP)"
 
-                # Get current Unix Epoch time
-                CURRENT_EPOCH=$(date +%s)
+                if [ "$STARTED_TIMESTAMP" != "null" ]; then
+                  # Convert start timestamp to Unix Epoch time
+                  STARTED_EPOCH=$(date -d "$STARTED_TIMESTAMP" +%s)
 
-                # Calculate workflow age in seconds
-                WORKFLOW_AGE=$(($CURRENT_EPOCH - $CREATION_EPOCH))
+                  # Get current Unix Epoch time
+                  CURRENT_EPOCH=$(date +%s)
 
-                # Check if workflow age is greater than threshold
-                if [ "$WORKFLOW_AGE" -gt "$THRESHOLD_TIME" ]; then
-                  echo "Workflow $WORKFLOW_NAME has been running for over $THRESHOLD_TIME seconds, sending an alert"
-                  # Send alert to Slack
-                  curl -X POST -H 'Content-type: application/json' --data "{\"text\":\"WARNING: Workflow \`${WORKFLOW_NAME}\` has been running longer than $THRESHOLD_TIME seconds\"}" $SLACK_WEBHOOK_URL
+                  # Calculate workflow age in seconds
+                  WORKFLOW_AGE=$(($CURRENT_EPOCH - $STARTED_EPOCH))
+
+                  # Check if workflow age is greater than threshold
+                  if [ "$WORKFLOW_AGE" -gt "$THRESHOLD_TIME" ]; then
+                    echo "Workflow $WORKFLOW_NAME has been running for over $THRESHOLD_TIME seconds, sending an alert"
+                    # Send alert to Slack
+                    curl -X POST -H 'Content-type: application/json' --data "{\"text\":\"WARNING: Workflow \`${WORKFLOW_NAME}\` has been running longer than $THRESHOLD_TIME seconds\"}" $SLACK_WEBHOOK_URL
+                  fi
                 fi
               done
           restartPolicy: OnFailure
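The switch from `creationTimestamp` to `status.startedAt` matters because a workflow that is still queued has no start time: Argo reports `startedAt` as `null`, which the guard above skips so queued workflows never trigger false alerts. A minimal illustration of the jq behavior, with an invented workflow name:

```sh
# A workflow with an empty status (not started yet) yields a null startedTimestamp.
echo '{"metadata":{"name":"wf-demo"},"status":{}}' \
  | jq -c '{name: .metadata.name, startedTimestamp: .status.startedAt}'
# -> {"name":"wf-demo","startedTimestamp":null}
# jq -r then renders that null as the string "null", which the != "null" test catches.
```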