diff --git a/Docker/jenkins/Jenkins-CI-Worker/Dockerfile b/Docker/jenkins/Jenkins-CI-Worker/Dockerfile index 242d5e74da..6eeb8f4fd6 100644 --- a/Docker/jenkins/Jenkins-CI-Worker/Dockerfile +++ b/Docker/jenkins/Jenkins-CI-Worker/Dockerfile @@ -1,4 +1,4 @@ -FROM jenkins/inbound-agent:jdk11 +FROM jenkins/inbound-agent:jdk21 USER root diff --git a/Docker/jenkins/Jenkins-Worker/Dockerfile b/Docker/jenkins/Jenkins-Worker/Dockerfile index c824690def..fec6b3203c 100644 --- a/Docker/jenkins/Jenkins-Worker/Dockerfile +++ b/Docker/jenkins/Jenkins-Worker/Dockerfile @@ -1,4 +1,4 @@ -FROM jenkins/inbound-agent:jdk11 +FROM jenkins/inbound-agent:jdk21 USER root diff --git a/Docker/jenkins/Jenkins/Dockerfile b/Docker/jenkins/Jenkins/Dockerfile index ae39ac5740..04ebe5864a 100644 --- a/Docker/jenkins/Jenkins/Dockerfile +++ b/Docker/jenkins/Jenkins/Dockerfile @@ -1,4 +1,4 @@ -FROM jenkins/jenkins:2.415-jdk11 +FROM jenkins/jenkins:2.426.3-lts-jdk21 USER root diff --git a/Docker/jenkins/Jenkins2/Dockerfile b/Docker/jenkins/Jenkins2/Dockerfile index 9976a07c20..e6b73bc76d 100644 --- a/Docker/jenkins/Jenkins2/Dockerfile +++ b/Docker/jenkins/Jenkins2/Dockerfile @@ -1,4 +1,4 @@ -FROM jenkins/jenkins:2.415-jdk11 +FROM jenkins/jenkins:2.426.3-lts-jdk21 USER root diff --git a/doc/s3-to-google-replication.md b/doc/s3-to-google-replication.md new file mode 100644 index 0000000000..82d0374c7c --- /dev/null +++ b/doc/s3-to-google-replication.md @@ -0,0 +1,68 @@ +# S3 to Google Cloud Storage Replication Pipeline + +This document will guide you through setting up a replication pipeline from AWS S3 to Google Cloud Storage (GCS) using VPC Service Controls and Storage Transfer Service. This solution follows security best practices, ensuring that data transfer between AWS S3 and GCS is secure and efficient. + +## Table of Contents + +- [Prerequisites](#prerequisites) +- [Step-by-step Guide](#step-by-step-guide) + - [Setup VPC Service Controls](#setup-vpc-service-controls) + - [Initiate Storage Transfer Service](#initiate-storage-transfer-service) +- [Compliance Benefits](#compliance-benefits) +- [Cost Benefit Analysis](#cost-benefit-analysis) + +## Prerequisites + +1. **AWS account** with access to the S3 bucket. +2. **Google Cloud account** with permissions to create buckets in GCS and set up VPC Service Controls and Storage Transfer Service. +3. Familiarity with AWS IAM for S3 bucket access and Google Cloud IAM for GCS access. + +## Step-by-step Guide + +### Setup VPC Service Controls + +1. **Access the VPC Service Controls** in the Google Cloud Console. +2. **Create a new VPC Service Control perimeter**. + - Name the perimeter and choose the desired region. + - Add the necessary GCP services. Be sure to include `storagetransfer.googleapis.com` for Storage Transfer Service. +3. **Setup VPC Service Control Policy** to allow connections from AWS. + - Refer to the [documentation](https://cloud.google.com/vpc-service-controls/docs/set-up) for help setting this up. + +### Initiate Storage Transfer Service + +1. Navigate to **Storage Transfer Service** in the Google Cloud Console. +2. Click **Create Transfer Job**. +3. **Select Source**: Choose Amazon S3 bucket and provide the necessary details. + - Make sure you have the necessary permissions for the S3 bucket in AWS IAM. +4. **Select Destination**: Choose your GCS bucket. +5. **Schedule & Advanced Settings**: Set the frequency and conditions for the transfer. Consider setting up notifications for job completion or errors. +6. 
**Review & Create**: Confirm the details and initiate the transfer job. + +## Compliance Benefits + +Setting up a secure replication pipeline from AWS S3 to GCS using VPC Service Controls and Storage Transfer Service offers the following compliance benefits: + +1. **Data Security**: The VPC Service Controls provide an additional layer of security by ensuring that the transferred data remains within a defined security perimeter, reducing potential data leak risks. +2. **Auditability**: Both AWS and GCS offer logging and monitoring tools that can provide audit trails for data transfer. This can help in meeting regulatory compliance requirements. +3. **Consistent Data Replication**: The Storage Transfer Service ensures that data in GCS is kept up to date with the source S3 bucket, which is essential for consistent backup and disaster recovery strategies. + +## Cost Benefit Analysis + +**Benefits**: + +1. **Data Redundancy**: Having data stored in multiple cloud providers can be a part of a robust disaster recovery strategy. +2. **Flexibility**: Replicating data to GCS provides flexibility in multi-cloud strategies, enabling seamless migrations or usage of GCP tools and services. +3. **Security**: Utilizing VPC Service Controls strengthens the security posture. + +**Costs**: + +1. **Data Transfer Costs**: Both AWS and Google Cloud may charge for data transfer (AWS egress fees in particular). It's crucial to analyze these costs, especially for large transfers. +2. **Storage Costs**: Storing data redundantly incurs additional storage costs in GCS. + +**Analysis**: + +To stay in compliance, we require multiple copies of our data in separate data centers or clouds. Our security audit highlighted the importance of not keeping data in a single cloud. Transferring data from AWS to GCP and storing it in two clouds simultaneously can be expensive, but when that redundancy is required, this pipeline is a straightforward way to achieve compliance. + +--- + +Please note that while this guide is based on the provided Google Cloud documentation, it's crucial to refer to the original [documentation](https://cloud.google.com/architecture/transferring-data-from-amazon-s3-to-cloud-storage-using-vpc-service-controls-and-storage-transfer-service) for the most accurate and up-to-date information. diff --git a/files/scripts/ci-env-pool-reset.sh b/files/scripts/ci-env-pool-reset.sh index 362cfbfd57..c0c1f67c6d 100644 --- a/files/scripts/ci-env-pool-reset.sh +++ b/files/scripts/ci-env-pool-reset.sh @@ -29,7 +29,6 @@ source "${GEN3_HOME}/gen3/gen3setup.sh" cat - > jenkins-envs-services.txt < /dev/null 2>&1 # Wait for the new cluster to be available @@ -60,4 +61,4 @@ else if [ $retry_count -eq $max_retries ]; then echo "New cluster creation may still be in progress. Please check the AWS Management Console for the status." 
fi -fi \ No newline at end of file +fi diff --git a/gen3/bin/kube-roll-all.sh b/gen3/bin/kube-roll-all.sh index c9cec5a25d..6a67f2bdd2 100644 --- a/gen3/bin/kube-roll-all.sh +++ b/gen3/bin/kube-roll-all.sh @@ -274,7 +274,7 @@ if [[ "$GEN3_ROLL_FAST" != "true" ]]; then else gen3 kube-setup-autoscaler & fi - gen3 kube-setup-kube-dns-autoscaler & + #gen3 kube-setup-kube-dns-autoscaler & gen3 kube-setup-metrics deploy || true gen3 kube-setup-tiller || true # diff --git a/gen3/bin/kube-setup-argo-wrapper.sh b/gen3/bin/kube-setup-argo-wrapper.sh index 5727a703e0..306050b124 100644 --- a/gen3/bin/kube-setup-argo-wrapper.sh +++ b/gen3/bin/kube-setup-argo-wrapper.sh @@ -19,5 +19,16 @@ if [[ -z "$GEN3_SOURCE_ONLY" ]]; then gen3 roll argo-wrapper g3kubectl apply -f "${GEN3_HOME}/kube/services/argo-wrapper/argo-wrapper-service.yaml" + if g3k_manifest_lookup .argo.argo_server_service_url 2> /dev/null; then + argo_server_service_url=$(g3k_manifest_lookup .argo.argo_server_service_url) + + export ARGO_HOST=${argo_server_service_url} + export ARGO_NAMESPACE=argo-$(gen3 db namespace) + envsubst <"${GEN3_HOME}/kube/services/argo-wrapper/config.ini" > /tmp/config.ini + + g3kubectl delete configmap argo-wrapper-namespace-config + g3kubectl create configmap argo-wrapper-namespace-config --from-file /tmp/config.ini + fi + gen3_log_info "the argo-wrapper service has been deployed onto the kubernetes cluster" -fi \ No newline at end of file +fi diff --git a/gen3/bin/kube-setup-cedar-wrapper.sh b/gen3/bin/kube-setup-cedar-wrapper.sh index 9a899a7700..c8f0d03c6c 100644 --- a/gen3/bin/kube-setup-cedar-wrapper.sh +++ b/gen3/bin/kube-setup-cedar-wrapper.sh @@ -1,6 +1,58 @@ source "${GEN3_HOME}/gen3/lib/utils.sh" gen3_load "gen3/lib/kube-setup-init" +create_client_and_secret() { + local hostname=$(gen3 api hostname) + local client_name="cedar_ingest_client" + gen3_log_info "kube-setup-cedar-wrapper" "creating fence ${client_name} for $hostname" + # delete any existing fence cedar clients + g3kubectl exec -c fence $(gen3 pod fence) -- fence-create client-delete --client ${client_name} > /dev/null 2>&1 + local secrets=$(g3kubectl exec -c fence $(gen3 pod fence) -- fence-create client-create --client ${client_name} --grant-types client_credentials | tail -1) + # secrets looks like ('CLIENT_ID', 'CLIENT_SECRET') + if [[ ! $secrets =~ (\'(.*)\', \'(.*)\') ]]; then + gen3_log_err "kube-setup-cedar-wrapper" "Failed generating ${client_name}" + return 1 + else + local client_id="${BASH_REMATCH[2]}" + local client_secret="${BASH_REMATCH[3]}" + gen3_log_info "Create cedar-client secrets file" + cat - < /dev/null 2>&1; then + local have_cedar_client_secret="1" + else + gen3_log_info "No g3auto cedar-client key present in secret" + fi + + local client_name="cedar_ingest_client" + local client_list=$(g3kubectl exec -c fence $(gen3 pod fence) -- fence-create client-list) + local client_count=$(echo "$client_list=" | grep -cE "'name':.*'${client_name}'") + gen3_log_info "CEDAR client count = ${client_count}" + + if [[ -z $have_cedar_client_secret ]] || [[ ${client_count} -lt 1 ]]; then + gen3_log_info "Creating new cedar-ingest client and secret" + local credsPath="$(gen3_secrets_folder)/g3auto/cedar/${cedar_creds_file}" + if ! create_client_and_secret > $credsPath; then + gen3_log_err "Failed to setup cedar-ingest secret" + return 1 + else + gen3 secrets sync + gen3 job run usersync + fi + fi +} + [[ -z "$GEN3_ROLL_ALL" ]] && gen3 kube-setup-secrets if ! g3kubectl get secrets/cedar-g3auto > /dev/null 2>&1; then @@ -8,6 +60,9 @@ if ! 
g3kubectl get secrets/cedar-g3auto > /dev/null 2>&1; then return 1 fi +gen3_log_info "Checking cedar-client creds" +setup_creds + if ! gen3 secrets decode cedar-g3auto cedar_api_key.txt > /dev/null 2>&1; then gen3_log_err "No CEDAR api key present in cedar-g3auto secret, not rolling CEDAR wrapper" return 1 diff --git a/gen3/bin/kube-setup-ingress.sh b/gen3/bin/kube-setup-ingress.sh index d0bcff9a4b..b75470f733 100644 --- a/gen3/bin/kube-setup-ingress.sh +++ b/gen3/bin/kube-setup-ingress.sh @@ -232,6 +232,28 @@ gen3_ingress_setup_role() { } } }, + { + "Effect": "Allow", + "Action": [ + "elasticloadbalancing:AddTags" + ], + "Resource": [ + "arn:aws:elasticloadbalancing:*:*:targetgroup/*/*", + "arn:aws:elasticloadbalancing:*:*:loadbalancer/net/*/*", + "arn:aws:elasticloadbalancing:*:*:loadbalancer/app/*/*" + ], + "Condition": { + "StringEquals": { + "elasticloadbalancing:CreateAction": [ + "CreateTargetGroup", + "CreateLoadBalancer" + ] + }, + "Null": { + "aws:RequestTag/elbv2.k8s.aws/cluster": "false" + } + } + }, { "Effect": "Allow", "Action": [ @@ -329,4 +351,4 @@ g3kubectl apply -f "${GEN3_HOME}/kube/services/revproxy/revproxy-service.yaml" envsubst <$scriptDir/ingress.yaml | g3kubectl apply -f - if [ "$deployWaf" = true ]; then gen3_ingress_setup_waf -fi \ No newline at end of file +fi diff --git a/gen3/bin/kube-setup-jenkins2.sh b/gen3/bin/kube-setup-jenkins2.sh new file mode 100644 index 0000000000..f5233f978c --- /dev/null +++ b/gen3/bin/kube-setup-jenkins2.sh @@ -0,0 +1,71 @@ +#!/bin/bash +# +# Just a little helper for deploying jenkins onto k8s the first time +# + +set -e + +export WORKSPACE="${WORKSPACE:-$HOME}" +source "${GEN3_HOME}/gen3/lib/utils.sh" +gen3_load "gen3/gen3setup" + +gen3 kube-setup-secrets + +# +# Assume Jenkins should use 'jenkins' profile credentials in "${WORKSPACE}"/.aws/credentials +# +aws_access_key_id="$(aws configure get jenkins.aws_access_key_id)" +aws_secret_access_key="$(aws configure get jenkins.aws_secret_access_key)" +google_acct1_email="$(jq -r '.jenkins.google_acct1.email' < $(gen3_secrets_folder)/creds.json)" +google_acct1_password="$(jq -r '.jenkins.google_acct1.password' < $(gen3_secrets_folder)/creds.json)" +google_acct2_email="$(jq -r '.jenkins.google_acct2.email' < $(gen3_secrets_folder)/creds.json)" +google_acct2_password="$(jq -r '.jenkins.google_acct2.password' < $(gen3_secrets_folder)/creds.json)" + +if [ -z "$aws_access_key_id" -o -z "$aws_secret_access_key" ]; then + gen3_log_err 'not configuring jenkins - could not extract secrets from aws configure' + exit 1 +fi +if [[ -z "$google_acct1_email" || -z "$google_acct1_password" || -z "$google_acct2_email" || -z "$google_acct2_password" ]]; then + gen3_log_err "missing google credentials in '.jenkins' of creds.json" + exit 1 +fi + +if ! g3kubectl get secrets jenkins-secret > /dev/null 2>&1; then + # make it easy to rerun kube-setup-jenkins.sh + g3kubectl create secret generic jenkins-secret "--from-literal=aws_access_key_id=$aws_access_key_id" "--from-literal=aws_secret_access_key=$aws_secret_access_key" +fi +if ! g3kubectl get secrets google-acct1 > /dev/null 2>&1; then + g3kubectl create secret generic google-acct1 "--from-literal=email=${google_acct1_email}" "--from-literal=password=${google_acct1_password}" +fi +if ! g3kubectl get secrets google-acct2 > /dev/null 2>&1; then + g3kubectl create secret generic google-acct2 "--from-literal=email=${google_acct2_email}" "--from-literal=password=${google_acct2_password}" +fi + +if ! 
g3kubectl get storageclass gp2 > /dev/null 2>&1; then + g3kubectl apply -f "${GEN3_HOME}/kube/services/jenkins/10storageclass.yaml" +fi +if ! g3kubectl get persistentvolumeclaim datadir-jenkins > /dev/null 2>&1; then + g3kubectl apply -f "${GEN3_HOME}/kube/services/jenkins/00pvc.yaml" +fi + +# Note: jenkins service account is configured by `kube-setup-roles` +gen3 kube-setup-roles +# Note: only the 'default' namespace jenkins-service account gets a cluster rolebinding +g3kubectl apply -f "${GEN3_HOME}/kube/services/jenkins/clusterrolebinding-devops.yaml" + +# Note: requires Jenkins entry in cdis-manifest +gen3 roll jenkins2 +gen3 roll jenkins2-worker +gen3 roll jenkins2-ci-worker + +# +# Get the ARN of the SSL certificate for the commons - +# We'll optimistically assume it's a wildcard cert that +# is appropriate to also attach to the jenkins ELB +# +export ARN=$(g3kubectl get configmap global --output=jsonpath='{.data.revproxy_arn}') +if [[ ! -z $ARN ]]; then + envsubst <"${GEN3_HOME}/kube/services/jenkins/jenkins-service.yaml" | g3kubectl apply -f - +else + gen3_log_info "Global configmap not configured - not launching service (require SSL cert ARN)" +fi diff --git a/gen3/bin/kube-setup-karpenter.sh b/gen3/bin/kube-setup-karpenter.sh index 8ba8ed9d97..2737ed6eeb 100644 --- a/gen3/bin/kube-setup-karpenter.sh +++ b/gen3/bin/kube-setup-karpenter.sh @@ -23,8 +23,10 @@ gen3_deploy_karpenter() { if g3k_config_lookup .global.karpenter_version; then karpenter=$(g3k_config_lookup .global.karpenter_version) fi - export clusterversion=`kubectl version --short -o json | jq -r .serverVersion.minor` - if [ "${clusterversion}" = "24+" ]; then + export clusterversion=`kubectl version -o json | jq -r .serverVersion.minor` + if [ "${clusterversion}" = "25+" ]; then + karpenter=${karpenter:-v0.27.0} + elif [ "${clusterversion}" = "24+" ]; then karpenter=${karpenter:-v0.24.0} else karpenter=${karpenter:-v0.22.0} @@ -77,6 +79,14 @@ gen3_deploy_karpenter() { "Effect": "Allow", "Resource": "*", "Sid": "ConditionalEC2Termination" + }, + { + "Sid": "VisualEditor0", + "Effect": "Allow", + "Action": [ + "kms:*" + ], + "Resource": "*" } ], "Version": "2012-10-17" diff --git a/gen3/bin/kube-setup-revproxy.sh b/gen3/bin/kube-setup-revproxy.sh index fcc2ef3b73..5db9850a18 100644 --- a/gen3/bin/kube-setup-revproxy.sh +++ b/gen3/bin/kube-setup-revproxy.sh @@ -111,15 +111,14 @@ for name in $(g3kubectl get services -o json | jq -r '.items[] | .metadata.name' fi done -if g3kubectl get namespace argo > /dev/null 2>&1; -then - for argo in $(g3kubectl get services -n argo -o jsonpath='{.items[*].metadata.name}'); - do - filePath="$scriptDir/gen3.nginx.conf/${argo}.conf" - if [[ -f "$filePath" ]]; then - confFileList+=("--from-file" "$filePath") - fi - done + +if g3k_manifest_lookup .argo.argo_server_service_url 2> /dev/null; then + argo_server_service_url=$(g3k_manifest_lookup .argo.argo_server_service_url) + g3k_kv_filter "${scriptDir}/gen3.nginx.conf/argo-server.conf" SERVICE_URL "${argo_server_service_url}" > /tmp/argo-server-with-url.conf + filePath="/tmp/argo-server-with-url.conf" + if [[ -f "$filePath" ]]; then + confFileList+=("--from-file" "$filePath") + fi fi if g3kubectl get namespace argocd > /dev/null 2>&1; diff --git a/gen3/bin/kube-setup-system-services.sh b/gen3/bin/kube-setup-system-services.sh index 609ee01c76..c26a04cb5d 100644 --- a/gen3/bin/kube-setup-system-services.sh +++ b/gen3/bin/kube-setup-system-services.sh @@ -19,7 +19,7 @@ gen3_load "gen3/gen3setup" kubeproxy=${kubeproxy:-1.24.7} 
coredns=${coredns:-1.8.7} kubednsautoscaler=${kubednsautoscaler:-1.8.6} -cni=${cni:-1.12.2} +cni=${cni:-1.14.1} calico=${calico:-1.7.8} @@ -39,7 +39,7 @@ calico_yaml="https://raw.githubusercontent.com/aws/amazon-vpc-cni-k8s/v${calico} g3kubectl set image daemonset.apps/kube-proxy -n kube-system kube-proxy=${kube_proxy_image} g3kubectl set image --namespace kube-system deployment.apps/coredns coredns=${coredns_image} -g3k_kv_filter "${GEN3_HOME}/kube/services/kube-dns-autoscaler/dns-horizontal-autoscaler.yaml" SERVICE "coredns" IMAGE "$kubednsautoscaler_image" | g3kubectl apply -f - +#g3k_kv_filter "${GEN3_HOME}/kube/services/kube-dns-autoscaler/dns-horizontal-autoscaler.yaml" SERVICE "coredns" IMAGE "$kubednsautoscaler_image" | g3kubectl apply -f - g3kubectl apply -f ${cni_image} g3kubectl apply -f ${calico_yaml} diff --git a/gen3/bin/migrate-to-vpc-cni.sh b/gen3/bin/migrate-to-vpc-cni.sh new file mode 100644 index 0000000000..510d9ebeff --- /dev/null +++ b/gen3/bin/migrate-to-vpc-cni.sh @@ -0,0 +1,138 @@ +#!/bin/bash + +source "${GEN3_HOME}/gen3/lib/utils.sh" +gen3_load "gen3/gen3setup" + +#Get the K8s NS +ctx="$(g3kubectl config current-context)" +ctxNamespace="$(g3kubectl config view -ojson | jq -r ".contexts | map(select(.name==\"$ctx\")) | .[0] | .context.namespace")" + +# Set the cluster name variable +CLUSTER_NAME=`gen3 api environment` + +# Check if in default ns +if [[ ("$ctxNamespace" != "default" && "$ctxNamespace" != "null") ]]; then + gen3_log_err "Namespace must be default" + exit 1 +fi + +# Cd into Cloud-automation repo and pull the latest from master +gen3_log_info "Pulling the latest from Cloud-Auto" +cd /home/$CLUSTER_NAME/cloud-automation || { gen3_log_err "Cloud-automation repo not found"; exit 1; } +#### Change to master +git checkout master || { gen3_log_err "Failed to checkout master branch"; exit 1; } +git pull || { gen3_log_err "Failed to pull from the repository"; exit 1; } + +# Update the Karpenter Node Template +gen3_log_info "Apply new Karpenter Node Template" +if [[ -d $(g3k_manifest_init)/$(g3k_hostname)/manifests/karpenter ]]; then + gen3_log_info "Karpenter setup in manifest. Open a cdismanifest PR and add this line to aws node templates: https://github.com/uc-cdis/cloud-automation/blob/master/kube/services/karpenter/nodeTemplateDefault.yaml#L40" + while true; do + read -p "Have you updated your manifest? (yes/no): " yn + case $yn in + [Yy]* ) + gen3_log_info "Proceeding with Karpenter deployment..." + gen3 kube-setup-karpenter deploy --force || { gen3_log_err "kube-setup-karpenter failed"; exit 1; } + break + ;; + [Nn]* ) + gen3_log_info "Please update the cdismanifest before proceeding." + exit 1 + ;; + * ) + gen3_log_info "Please answer yes or no." + ;; + esac + done +else + gen3 kube-setup-karpenter deploy --force || { gen3_log_err "kube-setup-karpenter failed"; exit 1; } +fi + +# Cordon all the nodes before running gen3 roll all" +gen3_log_info "Cordoning all nodes" +kubectl get nodes --no-headers -o custom-columns=":metadata.name" | grep -v '^fargate' | xargs -I{} kubectl cordon {} + +# Run a "gen3 roll all" so all nodes use the new mounted BPF File System +gen3_log_info "Cycling all the nodes by running gen3 roll all" +gen3 roll all --fast || exit 1 + +# Confirm that all nodes have been rotated +while true; do + read -p "Roll all complete. Have all cordoned nodes been rotated? (yes/no): " yn + case $yn in + [Yy]* ) + gen3_log_info "Continuing with script..." 
+ break + ;; + [Nn]* ) + gen3_log_info "Please drain any remaining nodes with 'kubectl drain --ignore-daemonsets --delete-emptydir-data'" + ;; + * ) + gen3_log_info "Please answer yes or no." + ;; + esac +done + + +# Delete all existing network policies +gen3_log_info "Deleting networkpolicies" +kubectl delete networkpolicies --all + +# Delete all Calico related resources from the “kube-system” namespace +gen3_log_info "Deleting all Calico related resources" +kubectl get deployments -n kube-system | grep calico | awk '{print $1}' | xargs kubectl delete deployment -n kube-system +kubectl get daemonsets -n kube-system | grep calico | awk '{print $1}' | xargs kubectl delete daemonset -n kube-system +kubectl get services -n kube-system | grep calico | awk '{print $1}' | xargs kubectl delete service -n kube-system +kubectl get replicasets -n kube-system | grep calico | awk '{print $1}' | xargs kubectl delete replicaset -n kube-system + +# Back up the current VPC CNI configuration in case of rollback +gen3_log_info "Backing up current VPC CNI Configuration..." +kubectl get daemonset aws-node -n kube-system -o yaml > aws-k8s-cni-old.yaml || { gen3_log_err "Error backing up VPC CNI configuration"; exit 1; } + +# Check to ensure we are not using an AWS plugin to manage the VPC CNI Plugin +if aws eks describe-addon --cluster-name "$CLUSTER_NAME" --addon-name vpc-cni --query addon.addonVersion --output text 2>/dev/null; then + gen3_log_err "Error: VPC CNI Plugin is managed by AWS. Please log into the AWS UI and delete the VPC CNI Plugin in Amazon EKS, then re-run this script." + exit 1 +else + gen3_log_info "No managed VPC CNI Plugin found, proceeding with the script." +fi + +# Apply the new VPC CNI Version +gen3_log_info "Applying new version of VPC CNI" +g3kubectl apply -f https://raw.githubusercontent.com/aws/amazon-vpc-cni-k8s/v1.14.1/config/master/aws-k8s-cni.yaml || { gen3_log_err "Failed to apply new VPC CNI version"; exit 1; } + +# Check the version to make sure it updated +NEW_VERSION=$(kubectl describe daemonset aws-node --namespace kube-system | grep amazon-k8s-cni: | cut -d : -f 3) +gen3_log_info "Current version of aws-k8s-cni is: $NEW_VERSION" +if [ "$NEW_VERSION" != "v1.14.1" ]; then + gen3_log_info "The version of aws-k8s-cni has not been updated correctly." + exit 1 +fi + +# Edit the amazon-vpc-cni configmap to enable network policy controller +gen3_log_info "Enabling NetworkPolicies in VPC CNI Configmap" +kubectl patch configmap -n kube-system amazon-vpc-cni --type merge -p '{"data":{"enable-network-policy-controller":"true"}}' || { gen3_log_err "Configmap patch failed"; exit 1; } + +# Edit the aws-node daemonset +gen3_log_info "Enabling NetworkPolicies in aws-node Daemonset" +kubectl patch daemonset aws-node -n kube-system --type=json -p='[{"op": "add", "path": "/spec/template/spec/containers/1/args", "value": ["--enable-network-policy=true", "--enable-ipv6=false", "--enable-cloudwatch-logs=false", "--metrics-bind-addr=:8162", "--health-probe-bind-addr=:8163"]}]' || { gen3_log_err "Daemonset edit failed"; exit 1; } + +# Ensure all the aws-nodes are running +kubectl get pods -n kube-system | grep aws +while true; do + read -p "Do all the aws-node pods in the kube-system ns have 2/2 containers running? (yes/no): " yn + case $yn in + [Yy]* ) + gen3_log_info "Running kube-setup-networkpolicy..." + gen3 kube-setup-networkpolicy || exit 1 + break + ;; + [Nn]* ) + gen3_log_err "Look at aws-node logs to figure out what went wrong. 
View this document for more details: https://docs.google.com/document/d/1fcBTciQSSwjvHktEnO_7EObY-xR_EvJ2NtgUa70wvL8" + gen3_log_info "Rollback instructions are also available in the above document" + ;; + * ) + gen3_log_info "Please answer yes or no." + ;; + esac +done \ No newline at end of file diff --git a/gen3/lib/logs/snapshot.sh b/gen3/lib/logs/snapshot.sh index 31cb80283b..d3d3b2c6cc 100644 --- a/gen3/lib/logs/snapshot.sh +++ b/gen3/lib/logs/snapshot.sh @@ -36,10 +36,11 @@ gen3_logs_snapshot_container() { # Snapshot all the pods # gen3_logs_snapshot_all() { + # For each pod for which we can list the containers, get the pod name and get its list of containers + # (container names + initContainers names). Display them as lines of "<pod> <container>". g3kubectl get pods -o json | \ - jq -r '.items | map(select(.status.phase != "Pending" and .status.phase != "Unknown")) | map( {pod: .metadata.name, containers: .spec.containers | map(.name) } ) | map( .pod as $pod | .containers | map( { pod: $pod, cont: .})[]) | map(select(.cont != "pause" and .cont != "jupyterhub"))[] | .pod + " " + .cont' | \ + jq -r '.items | map(select(.status.phase != "Pending" and .status.phase != "Unknown")) | map( {pod: .metadata.name, containers: [(.spec.containers | select(.!=null) | map(.name)), (.spec.initContainers | select(.!=null) | map(.name)) | add ] } ) | map( .pod as $pod | .containers | map( { pod: $pod, cont: .})[]) | map(select(.cont != "pause" and .cont != "jupyterhub"))[] | .pod + " " + .cont' | \ while read -r line; do gen3_logs_snapshot_container $line done } - diff --git a/gen3/lib/testData/default/expectedFenceResult.yaml b/gen3/lib/testData/default/expectedFenceResult.yaml index f6d76d790f..98c3605311 100644 --- a/gen3/lib/testData/default/expectedFenceResult.yaml +++ b/gen3/lib/testData/default/expectedFenceResult.yaml @@ -44,6 +44,13 @@ spec: nodeAffinity: preferredDuringSchedulingIgnoredDuringExecution: - weight: 100 + preference: + matchExpressions: + - key: karpenter.sh/capacity-type + operator: In + values: + - on-demand + - weight: 99 preference: matchExpressions: - key: eks.amazonaws.com/capacityType @@ -136,6 +143,7 @@ spec: ports: - containerPort: 80 - containerPort: 443 + - containerPort: 6567 volumeMounts: # ----------------------------------------------------------------------------- # DEPRECATED! 
Remove when all commons are no longer using local_settings.py diff --git a/gen3/lib/testData/default/expectedSheepdogResult.yaml b/gen3/lib/testData/default/expectedSheepdogResult.yaml index b9db85a36c..a2bd3efcc4 100644 --- a/gen3/lib/testData/default/expectedSheepdogResult.yaml +++ b/gen3/lib/testData/default/expectedSheepdogResult.yaml @@ -17,6 +17,7 @@ spec: template: metadata: labels: + netnolimit: "yes" app: sheepdog release: production public: "yes" @@ -39,12 +40,19 @@ spec: nodeAffinity: preferredDuringSchedulingIgnoredDuringExecution: - weight: 100 + preference: + matchExpressions: + - key: karpenter.sh/capacity-type + operator: In + values: + - spot + - weight: 99 preference: matchExpressions: - key: eks.amazonaws.com/capacityType operator: In values: - - ONDEMAND + - SPOT automountServiceAccountToken: false volumes: - name: config-volume diff --git a/gen3/lib/testData/test1.manifest.g3k/expectedFenceResult.yaml b/gen3/lib/testData/test1.manifest.g3k/expectedFenceResult.yaml index d4196c0708..adc35ad2f1 100644 --- a/gen3/lib/testData/test1.manifest.g3k/expectedFenceResult.yaml +++ b/gen3/lib/testData/test1.manifest.g3k/expectedFenceResult.yaml @@ -47,6 +47,13 @@ spec: nodeAffinity: preferredDuringSchedulingIgnoredDuringExecution: - weight: 100 + preference: + matchExpressions: + - key: karpenter.sh/capacity-type + operator: In + values: + - on-demand + - weight: 99 preference: matchExpressions: - key: eks.amazonaws.com/capacityType diff --git a/gen3/lib/testData/test1.manifest.g3k/expectedSheepdogResult.yaml b/gen3/lib/testData/test1.manifest.g3k/expectedSheepdogResult.yaml index f54fd3e03b..08407ae52b 100644 --- a/gen3/lib/testData/test1.manifest.g3k/expectedSheepdogResult.yaml +++ b/gen3/lib/testData/test1.manifest.g3k/expectedSheepdogResult.yaml @@ -43,6 +43,13 @@ spec: nodeAffinity: preferredDuringSchedulingIgnoredDuringExecution: - weight: 100 + preference: + matchExpressions: + - key: karpenter.sh/capacity-type + operator: In + values: + - spot + - weight: 99 preference: matchExpressions: - key: eks.amazonaws.com/capacityType diff --git a/kube/services/argo-events/workflows/configmap.yaml b/kube/services/argo-events/workflows/configmap.yaml index eb5f1b04f4..ae1c16653c 100644 --- a/kube/services/argo-events/workflows/configmap.yaml +++ b/kube/services/argo-events/workflows/configmap.yaml @@ -8,7 +8,7 @@ data: apiVersion: karpenter.sh/v1alpha5 kind: Provisioner metadata: - name: workflow-$WORKFLOW_NAME + name: workflow-WORKFLOW_NAME spec: requirements: - key: karpenter.sh/capacity-type @@ -18,23 +18,42 @@ data: operator: In values: - amd64 - - key: karpenter.k8s.aws/instance-family + - key: node.kubernetes.io/instance-type operator: In values: - - c6i - - c7i - - m7i + - c6a.large + - c6a.xlarge + - c6a.2xlarge + - c6a.4xlarge + - c6a.8xlarge + - c6a.12xlarge + - c6i.large + - c6i.xlarge + - c6i.2xlarge + - c6i.4xlarge + - c6i.8xlarge + - c6i.12xlarge + - m6a.2xlarge + - m6a.4xlarge + - m6a.8xlarge + - m6a.12xlarge + - m6a.16xlarge + - m6i.2xlarge + - m6i.4xlarge + - m6i.8xlarge + - m6i.12xlarge + - m6i.16xlarge taints: - key: role - value: $WORKFLOW_NAME + value: WORKFLOW_NAME effect: NoSchedule labels: - role: $WORKFLOW_NAME + role: WORKFLOW_NAME limits: resources: cpu: 2000 providerRef: - name: workflow-$WORKFLOW_NAME + name: workflow-WORKFLOW_NAME # Kill nodes after 30 days to ensure they stay up to date ttlSecondsUntilExpired: 2592000 ttlSecondsAfterEmpty: 10 @@ -43,18 +62,18 @@ data: apiVersion: karpenter.k8s.aws/v1alpha1 kind: AWSNodeTemplate metadata: - name: 
workflow-$WORKFLOW_NAME + name: workflow-WORKFLOW_NAME spec: subnetSelector: - karpenter.sh/discovery: $ENVIRONMENT + karpenter.sh/discovery: ENVIRONMENT securityGroupSelector: - karpenter.sh/discovery: $ENVIRONMENT-workflow + karpenter.sh/discovery: ENVIRONMENT-workflow tags: - Environment: $ENVIRONMENT - Name: eks-$ENVIRONMENT-workflow-karpenter - karpenter.sh/discovery: $ENVIRONMENT - workflowname: $WORKFLOW_NAME - gen3username: $GEN3_USERNAME + Environment: ENVIRONMENT + Name: eks-ENVIRONMENT-workflow-karpenter + karpenter.sh/discovery: ENVIRONMENT + workflowname: WORKFLOW_NAME + gen3username: GEN3_USERNAME gen3service: argo-workflows purpose: workflow metadataOptions: diff --git a/kube/services/argo-events/workflows/sensor-created.yaml b/kube/services/argo-events/workflows/sensor-created.yaml index 7b1b9d62fe..4221f57423 100644 --- a/kube/services/argo-events/workflows/sensor-created.yaml +++ b/kube/services/argo-events/workflows/sensor-created.yaml @@ -60,11 +60,11 @@ spec: - "-c" - | if ! kubectl get awsnodetemplate workflow-$WORKFLOW_NAME >/dev/null 2>&1; then - envsubst < /home/manifests/nodetemplate.yaml | kubectl apply -f - + sed -e "s/WORKFLOW_NAME/$workflow_name/" -e "s/GEN3_USERNAME/$workflow_user/" -e "s/ENVIRONMENT/$ENVIRONMENT/" /home/manifests/nodetemplate.yaml | kubectl apply -f - fi if ! kubectl get provisioner workflow-$WORKFLOW_NAME >/dev/null 2>&1; then - envsubst < /home/manifests/provisioner.yaml | kubectl apply -f - + sed -e "s/WORKFLOW_NAME/$workflow_name/" -e "s/GEN3_USERNAME/$workflow_user/" -e "s/ENVIRONMENT/$ENVIRONMENT/" /home/manifests/provisioner.yaml | kubectl apply -f - fi env: - name: WORKFLOW_NAME diff --git a/kube/services/argo-wrapper/argo-wrapper-deploy.yaml b/kube/services/argo-wrapper/argo-wrapper-deploy.yaml index 65f68d98ab..00d118746c 100644 --- a/kube/services/argo-wrapper/argo-wrapper-deploy.yaml +++ b/kube/services/argo-wrapper/argo-wrapper-deploy.yaml @@ -58,7 +58,10 @@ spec: configMap: name: manifest-argo optional: true - + - name: argo-wrapper-namespace-config + configMap: + name: argo-wrapper-namespace-config + containers: - name: argo-wrapper GEN3_ARGO-WRAPPER_IMAGE @@ -70,3 +73,7 @@ spec: readOnly: true mountPath: /argo.json subPath: argo.json + - name: argo-wrapper-namespace-config + readOnly: true + mountPath: /src/config.ini + subPath: config.ini diff --git a/kube/services/argo-wrapper/config.ini b/kube/services/argo-wrapper/config.ini new file mode 100644 index 0000000000..334438ac20 --- /dev/null +++ b/kube/services/argo-wrapper/config.ini @@ -0,0 +1,4 @@ +[DEFAULT] +ARGO_ACCESS_METHOD = access +ARGO_HOST = $ARGO_HOST +ARGO_NAMESPACE = $ARGO_NAMESPACE diff --git a/kube/services/argo/values.yaml b/kube/services/argo/values.yaml index 67fa05a09d..2b46ced0f3 100644 --- a/kube/services/argo/values.yaml +++ b/kube/services/argo/values.yaml @@ -1,5 +1,6 @@ controller: - parallelism: 5 + parallelism: 8 + namespaceParallelism: 3 metricsConfig: # -- Enables prometheus metrics server enabled: true @@ -28,11 +29,11 @@ controller: } ] } - } + } resourceRateLimit: limit: 40 - burst: 4 + burst: 4 # -- enable persistence using postgres persistence: @@ -49,7 +50,7 @@ controller: port: 5432 database: GEN3_ARGO_DB_NAME tableName: argo_workflows - # # the database secrets must be in the same namespace of the controller + # # the database secrets must be in the same namespace of the controller userNameSecret: name: argo-db-creds key: db_username @@ -58,7 +59,7 @@ controller: key: db_password nodeStatusOffLoad: true - workflowDefaults: + workflowDefaults: spec: archiveLogs: true @@ -77,11 +78,11 @@ server: baseHref: 
"/argo/" # -- Extra arguments to provide to the Argo server binary, such as for disabling authentication. extraArgs: - - --auth-mode=server - - --auth-mode=client + - --auth-mode=server + - --auth-mode=client extraEnv: - - name: ARGO_HTTP1 - value: "true" + - name: ARGO_HTTP1 + value: "true" resources: requests: memory: 8Gi diff --git a/kube/services/guppy/guppy-deploy.yaml b/kube/services/guppy/guppy-deploy.yaml index 01a8905de7..c3e8d121c4 100644 --- a/kube/services/guppy/guppy-deploy.yaml +++ b/kube/services/guppy/guppy-deploy.yaml @@ -155,6 +155,6 @@ spec: resources: requests: cpu: 100m - memory: 128Mi + memory: 256Mi limits: - memory: 1200Mi + memory: 2000Mi diff --git a/kube/services/ingress/ingress.yaml b/kube/services/ingress/ingress.yaml index 65916679a7..3f1f312592 100644 --- a/kube/services/ingress/ingress.yaml +++ b/kube/services/ingress/ingress.yaml @@ -11,7 +11,7 @@ metadata: alb.ingress.kubernetes.io/listen-ports: '[{"HTTP": 80}, {"HTTPS":443}]' alb.ingress.kubernetes.io/load-balancer-attributes: idle_timeout.timeout_seconds=600 alb.ingress.kubernetes.io/actions.ssl-redirect: '{"Type": "redirect", "RedirectConfig": { "Protocol": "HTTPS", "Port": "443", "StatusCode": "HTTP_301"}}' - alb.ingress.kubernetes.io/ssl-policy: ELBSecurityPolicy-TLS13-1-2-2021-06 + alb.ingress.kubernetes.io/ssl-policy: ELBSecurityPolicy-TLS13-1-2-Res-FIPS-2023-04 spec: ingressClassName: alb rules: diff --git a/kube/services/jenkins2-ci-worker/jenkins2-agent-service.yaml b/kube/services/jenkins2-ci-worker/jenkins2-agent-service.yaml new file mode 100644 index 0000000000..7f4e58109c --- /dev/null +++ b/kube/services/jenkins2-ci-worker/jenkins2-agent-service.yaml @@ -0,0 +1,17 @@ +apiVersion: v1 +kind: Service +metadata: + labels: + name: jenkins-agent-service + name: jenkins-agent + namespace: default +spec: + ports: + - name: slavelistener + port: 50000 + protocol: TCP + targetPort: 50000 + selector: + app: jenkins + sessionAffinity: None + type: ClusterIP diff --git a/kube/services/jenkins2-ci-worker/jenkins2-ci-worker-deploy.yaml b/kube/services/jenkins2-ci-worker/jenkins2-ci-worker-deploy.yaml new file mode 100644 index 0000000000..3dea38a5cb --- /dev/null +++ b/kube/services/jenkins2-ci-worker/jenkins2-ci-worker-deploy.yaml @@ -0,0 +1,149 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: jenkins-ci-worker-deployment +spec: + selector: + # Only select pods based on the 'app' label + matchLabels: + app: jenkins-ci-worker + template: + metadata: + labels: + app: jenkins-ci-worker + # for network policy + netnolimit: "yes" + annotations: + "cluster-autoscaler.kubernetes.io/safe-to-evict": "false" + spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: eks.amazonaws.com/capacityType + operator: In + values: + - ONDEMAND + - matchExpressions: + - key: karpenter.sh/capacity-type + operator: In + values: + - on-demand + serviceAccountName: jenkins-service + securityContext: + runAsUser: 1000 + fsGroup: 1000 + initContainers: + - args: + - -c + - | + # fix permissions for /var/run/docker.sock + chmod 666 /var/run/docker.sock + echo "done" + command: + - /bin/bash + image: quay.io/cdis/awshelper:master + imagePullPolicy: Always + name: awshelper + resources: {} + securityContext: + allowPrivilegeEscalation: false + runAsUser: 0 + terminationMessagePath: /dev/termination-log + terminationMessagePolicy: File + volumeMounts: + - mountPath: /var/run/docker.sock + name: dockersock + containers: + # + # See for details on 
running docker in a pod: + # https://estl.tech/accessing-docker-from-a-kubernetes-pod-68996709c04b + # + - name: jenkins-worker + image: "quay.io/cdis/gen3-ci-worker:master" + ports: + - containerPort: 8080 + env: + - name: JENKINS_URL + value: "https://jenkins2.planx-pla.net" + - name: JENKINS_SECRET + valueFrom: + secretKeyRef: + name: jenkins-ci-worker-g3auto + key: jenkins-jnlp-agent-secret + - name: JENKINS_AGENT_NAME + value: "gen3-ci-worker" + - name: JENKINS_TUNNEL + value: "jenkins-agent:50000" + - name: AWS_DEFAULT_REGION + value: us-east-1 + - name: JAVA_OPTS + value: "-Xmx3072m" + - name: AWS_ACCESS_KEY_ID + valueFrom: + secretKeyRef: + name: jenkins-secret + key: aws_access_key_id + - name: AWS_SECRET_ACCESS_KEY + valueFrom: + secretKeyRef: + name: jenkins-secret + key: aws_secret_access_key + - name: GOOGLE_EMAIL_AUX1 + valueFrom: + secretKeyRef: + name: google-acct1 + key: email + - name: GOOGLE_PASSWORD_AUX1 + valueFrom: + secretKeyRef: + name: google-acct1 + key: password + - name: GOOGLE_EMAIL_AUX2 + valueFrom: + secretKeyRef: + name: google-acct2 + key: email + - name: GOOGLE_PASSWORD_AUX2 + valueFrom: + secretKeyRef: + name: google-acct2 + key: password + - name: GOOGLE_APP_CREDS_JSON + valueFrom: + secretKeyRef: + name: jenkins-g3auto + key: google_app_creds.json + resources: + limits: + cpu: 0.9 + memory: 4096Mi + ephemeral-storage: 500Mi + imagePullPolicy: Always + volumeMounts: + - name: "cert-volume" + readOnly: true + mountPath: "/mnt/ssl/service.crt" + subPath: "service.crt" + - name: "cert-volume" + readOnly: true + mountPath: "/mnt/ssl/service.key" + subPath: "service.key" + - name: "ca-volume" + readOnly: true + mountPath: "/usr/local/share/ca-certificates/cdis/cdis-ca.crt" + subPath: "ca.pem" + - name: dockersock + mountPath: "/var/run/docker.sock" + imagePullPolicy: Always + volumes: + - name: cert-volume + secret: + secretName: "cert-jenkins-service" + - name: ca-volume + secret: + secretName: "service-ca" + - name: dockersock + hostPath: + path: /var/run/docker.sock diff --git a/kube/services/jenkins2-ci-worker/jenkins2-ci-worker-pvc.yaml b/kube/services/jenkins2-ci-worker/jenkins2-ci-worker-pvc.yaml new file mode 100644 index 0000000000..047e4e966e --- /dev/null +++ b/kube/services/jenkins2-ci-worker/jenkins2-ci-worker-pvc.yaml @@ -0,0 +1,12 @@ +kind: PersistentVolumeClaim +apiVersion: v1 +metadata: + name: datadir-jenkins-ci + annotations: + volume.beta.kubernetes.io/storage-class: gp2 +spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 200Gi diff --git a/kube/services/jenkins2/jenkins2-deploy.yaml b/kube/services/jenkins2/jenkins2-deploy.yaml index ee838bae60..08365f811b 100644 --- a/kube/services/jenkins2/jenkins2-deploy.yaml +++ b/kube/services/jenkins2/jenkins2-deploy.yaml @@ -48,7 +48,7 @@ spec: # https://estl.tech/accessing-docker-from-a-kubernetes-pod-68996709c04b # - name: jenkins - GEN3_JENKINS_IMAGE + GEN3_JENKINS2_IMAGE ports: - containerPort: 8080 name: http diff --git a/kube/services/karpenter-reconciler/application.yaml b/kube/services/karpenter-reconciler/application.yaml new file mode 100644 index 0000000000..fb0fab8711 --- /dev/null +++ b/kube/services/karpenter-reconciler/application.yaml @@ -0,0 +1,22 @@ +apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: karpenter-reconciler-application + namespace: argocd +spec: + destination: + namespace: kube-system + server: https://kubernetes.default.svc + project: default + source: + repoURL: https://github.com/uc-cdis/cloud-automation.git + 
targetRevision: master + path: kube/services/karpenter-reconciler + directory: + exclude: "application.yaml" + syncPolicy: + automated: + prune: true + selfHeal: true + syncOptions: + - CreateNamespace=true diff --git a/kube/services/karpenter-reconciler/auth.yaml b/kube/services/karpenter-reconciler/auth.yaml new file mode 100644 index 0000000000..c159028ab3 --- /dev/null +++ b/kube/services/karpenter-reconciler/auth.yaml @@ -0,0 +1,44 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: karpenter-reconciler + namespace: argo-events +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: karpenter-admin-binding-reconciler +subjects: + - kind: ServiceAccount + name: karpenter-reconciler + namespace: argo-events +roleRef: + kind: ClusterRole + name: karpenter-admin + apiGroup: rbac.authorization.k8s.io +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: workflow-viewer-reconciler +subjects: + - kind: ServiceAccount + name: karpenter-reconciler + namespace: argo-events +roleRef: + kind: ClusterRole + name: argo-argo-workflows-view + apiGroup: rbac.authorization.k8s.io +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: viewer-reconciler +subjects: + - kind: ServiceAccount + name: karpenter-reconciler + namespace: argo-events +roleRef: + kind: ClusterRole + name: system:aggregate-to-view + apiGroup: rbac.authorization.k8s.io diff --git a/kube/services/karpenter-reconciler/karpenter-reconciler-cronjob.yaml b/kube/services/karpenter-reconciler/karpenter-reconciler-cronjob.yaml new file mode 100644 index 0000000000..4f82e9d43e --- /dev/null +++ b/kube/services/karpenter-reconciler/karpenter-reconciler-cronjob.yaml @@ -0,0 +1,72 @@ +apiVersion: batch/v1 +kind: CronJob +metadata: + name: karpenter-reconciler-cronjob + namespace: argo-events +spec: + schedule: "*/5 * * * *" + jobTemplate: + spec: + template: + metadata: + labels: + app: gen3job + spec: + serviceAccount: karpenter-reconciler + volumes: + - name: karpenter-templates-volume + configMap: + name: karpenter-templates + containers: + - name: karpenter-reconciler + image: quay.io/cdis/awshelper + volumeMounts: + - name: karpenter-templates-volume + mountPath: /manifests + env: + - name: PROVISIONER_TEMPLATE + value: /manifests/provisioner.yaml + - name: AWSNODETEMPLATE_TEMPLATE + value: /manifests/nodetemplate.yaml + command: ["/bin/bash"] + args: + - "-c" + - | + #!/bin/bash + if [ -z "$PROVISIONER_TEMPLATE" ]; then + PROVISIONER_TEMPLATE="provisioner.yaml" + fi + + if [ -z "$AWSNODETEMPLATE_TEMPLATE" ]; then + AWSNODETEMPLATE_TEMPLATE="nodetemplate.yaml" + fi + + ENVIRONMENT=$(kubectl -n default get configmap global -o jsonpath="{.data.environment}") + + RAW_WORKFLOWS=$(kubectl get workflows -n argo -o yaml) + + WORKFLOWS=$(echo "${RAW_WORKFLOWS}" | yq -r '.items[] | [.metadata.name, .metadata.labels.gen3username] | join(" ")') + + WORKFLOW_ARRAY=() + + while IFS= read -r line; do + WORKFLOW_ARRAY+=("$line") + done <<< "$WORKFLOWS" + + for workflow in "${WORKFLOW_ARRAY[@]}" + do + workflow_name=$(echo "$workflow" | awk '{print $1}') + workflow_user=$(echo "$workflow" | awk '{print $2}') + + if ! kubectl get awsnodetemplate workflow-$workflow_name >/dev/null 2>&1; then + echo "No awsnodetemplate found for ${workflow_name}, creating one" + sed -e "s/WORKFLOW_NAME/$workflow_name/" -e "s/GEN3_USERNAME/$workflow_user/" -e "s/ENVIRONMENT/$ENVIRONMENT/" "$AWSNODETEMPLATE_TEMPLATE" | kubectl apply -f - + fi + + if ! 
kubectl get provisioner workflow-$workflow_name >/dev/null 2>&1; then + echo "No provisioner found for ${workflow_name}, creating one" + sed -e "s/WORKFLOW_NAME/$workflow_name/" -e "s/GEN3_USERNAME/$workflow_user/" -e "s/ENVIRONMENT/$ENVIRONMENT/" "$PROVISIONER_TEMPLATE" | kubectl apply -f - + + fi + done + restartPolicy: OnFailure diff --git a/kube/services/karpenter/nodeTemplateDefault.yaml b/kube/services/karpenter/nodeTemplateDefault.yaml index a3dbf64802..6ba8b3a0f7 100644 --- a/kube/services/karpenter/nodeTemplateDefault.yaml +++ b/kube/services/karpenter/nodeTemplateDefault.yaml @@ -38,7 +38,14 @@ spec: # configure grub sudo /sbin/grubby --update-kernel=ALL --args="fips=1" + # --BOUNDARY + # Content-Type: text/cloud-config; charset="us-ascii" + + # mounts: + # - ['fstype': 'bpf', 'mountpoint': '/sys/fs/bpf', 'opts': 'rw,relatime'] + --BOUNDARY + Content-Type: text/cloud-config; charset="us-ascii" power_state: diff --git a/kube/services/karpenter/nodeTemplateGPU.yaml b/kube/services/karpenter/nodeTemplateGPU.yaml index 5270b697f3..925e7a9a08 100644 --- a/kube/services/karpenter/nodeTemplateGPU.yaml +++ b/kube/services/karpenter/nodeTemplateGPU.yaml @@ -38,6 +38,12 @@ spec: # configure grub sudo /sbin/grubby --update-kernel=ALL --args="fips=1" + # --BOUNDARY + # Content-Type: text/cloud-config; charset="us-ascii" + + # mounts: + # - ['fstype': 'bpf', 'mountpoint': '/sys/fs/bpf', 'opts': 'rw,relatime'] + --BOUNDARY Content-Type: text/cloud-config; charset="us-ascii" diff --git a/kube/services/karpenter/nodeTemplateJupyter.yaml b/kube/services/karpenter/nodeTemplateJupyter.yaml index 74f24926ac..1c8970ad64 100644 --- a/kube/services/karpenter/nodeTemplateJupyter.yaml +++ b/kube/services/karpenter/nodeTemplateJupyter.yaml @@ -38,6 +38,12 @@ spec: # configure grub sudo /sbin/grubby --update-kernel=ALL --args="fips=1" + # --BOUNDARY + # Content-Type: text/cloud-config; charset="us-ascii" + + # mounts: + # - ['fstype': 'bpf', 'mountpoint': '/sys/fs/bpf', 'opts': 'rw,relatime'] + --BOUNDARY Content-Type: text/cloud-config; charset="us-ascii" diff --git a/kube/services/karpenter/nodeTemplateWorkflow.yaml b/kube/services/karpenter/nodeTemplateWorkflow.yaml index ec2b81a60c..6e47b22f97 100644 --- a/kube/services/karpenter/nodeTemplateWorkflow.yaml +++ b/kube/services/karpenter/nodeTemplateWorkflow.yaml @@ -38,6 +38,12 @@ spec: # configure grub sudo /sbin/grubby --update-kernel=ALL --args="fips=1" + # --BOUNDARY + # Content-Type: text/cloud-config; charset="us-ascii" + + # mounts: + # - ['fstype': 'bpf', 'mountpoint': '/sys/fs/bpf', 'opts': 'rw,relatime'] + --BOUNDARY Content-Type: text/cloud-config; charset="us-ascii" diff --git a/kube/services/ohdsi-webapi/ohdsi-webapi-deploy.yaml b/kube/services/ohdsi-webapi/ohdsi-webapi-deploy.yaml index 65d6ed38c9..a729ae7c41 100644 --- a/kube/services/ohdsi-webapi/ohdsi-webapi-deploy.yaml +++ b/kube/services/ohdsi-webapi/ohdsi-webapi-deploy.yaml @@ -83,7 +83,7 @@ spec: limits: memory: 4Gi - name: ohdsi-webapi-reverse-proxy - image: nginx:1.23 + image: 707767160287.dkr.ecr.us-east-1.amazonaws.com/gen3/nginx:1.23 ports: - containerPort: 80 volumeMounts: @@ -97,4 +97,4 @@ spec: cpu: 100m memory: 100Mi limits: - memory: 500Mi \ No newline at end of file + memory: 500Mi diff --git a/kube/services/revproxy/gen3.nginx.conf/argo-argo-workflows-server.conf b/kube/services/revproxy/gen3.nginx.conf/argo-server.conf similarity index 86% rename from kube/services/revproxy/gen3.nginx.conf/argo-argo-workflows-server.conf rename to 
kube/services/revproxy/gen3.nginx.conf/argo-server.conf index cb8def3aa5..1cdd4608c6 100644 --- a/kube/services/revproxy/gen3.nginx.conf/argo-argo-workflows-server.conf +++ b/kube/services/revproxy/gen3.nginx.conf/argo-server.conf @@ -7,7 +7,7 @@ auth_request /gen3-authz; set $proxy_service "argo"; - set $upstream http://argo-argo-workflows-server.argo.svc.cluster.local:2746; + set $upstream SERVICE_URL; rewrite ^/argo/(.*) /$1 break; diff --git a/kube/services/revproxy/gen3.nginx.conf/guppy-service.conf b/kube/services/revproxy/gen3.nginx.conf/guppy-service.conf index db2de58861..e6d66ec12e 100644 --- a/kube/services/revproxy/gen3.nginx.conf/guppy-service.conf +++ b/kube/services/revproxy/gen3.nginx.conf/guppy-service.conf @@ -1,4 +1,8 @@ location /guppy/ { + if ($csrf_check !~ ^ok-\S.+$) { + return 403 "failed csrf check, make sure data-portal version >= 2023.12 or >= 5.19.0"; + } + proxy_connect_timeout 600s; proxy_send_timeout 600s; proxy_read_timeout 600s; diff --git a/kube/services/revproxy/revproxy-deploy.yaml b/kube/services/revproxy/revproxy-deploy.yaml index 9d5caab1b2..9f10ce90bd 100644 --- a/kube/services/revproxy/revproxy-deploy.yaml +++ b/kube/services/revproxy/revproxy-deploy.yaml @@ -21,6 +21,7 @@ spec: app: revproxy # allow access from workspaces userhelper: "yes" + internet: "yes" GEN3_DATE_LABEL spec: affinity: diff --git a/packer/configs/web_wildcard_whitelist b/packer/configs/web_wildcard_whitelist index c58eeefe80..621dec3d58 100644 --- a/packer/configs/web_wildcard_whitelist +++ b/packer/configs/web_wildcard_whitelist @@ -44,4 +44,5 @@ .yahooapis.com .cloudfront.net .docker.io +.blob.core.windows.net .googleapis.com