From 6ee728ad3e375b1a0e521ae256ef13ca29529067 Mon Sep 17 00:00:00 2001 From: Alexander VanTol Date: Mon, 4 Mar 2024 11:04:31 -0600 Subject: [PATCH] Automation for Gen3 Discovery AI Service (#2396) * feat(gen3-openai): rough initial testing, no automation for rolling * feat(gen3-discovery-ai): initial deployment * fix(ai): fix setup jq escaping * fix(ai): fix file name * fix(ai): fix deployment configmap * fix(ai): fix configmap * fix(ai): env path * feat(image): use from manifest * chore(ai): better naming * fix(ai): fix mount path for cfg * fix(ai): first attempt to fix issue of needing write volume for chromadb persistance * fix(ai): k8s deploy command * fix(ai): fix duplicate name * chore(ai): don't sent telemetry data * chore(ai): more logging in init * chore(logs): more * fix(ai): mv instead of cp * fix(ai): back to cp, can't mv b/c of readonly * feat(ai): use s3 and service account + role to handle persisted vectorstore data instead of configmap * fix(ai): fix setup * fix(ai): fix setup * fix(ai): fix automation * fix(ai): automation * fix(ai): fix logic for setup * fix(ai): mount storage config and don't use gen3/jq since they're not available * fix(ai): fix wrong path * fix(ai): quotes * fix(ai): quoting * fix(ai): use awshelper for access to aws commands * fix(ai): move files to correct location * fix(ai): only get folder * fix(ai): fix sync * fix(ai): clear folder before syncing * fix(ai): update bucket contents every roll for updates * feat(ai): support TSV loading from manifest config * fix(ai): fix init so aws syncing is done with awshelper image and loading into vectorstore is with service image * fix(ai): fix loading * fix(ai): fix loading * fix(ai): sync all files * feat(ai): add google secret loading and mounting * fix(ai): mount to container, not inits * fix(mount): don't create another dir * fix(mount): don't create another dir * fix(mounts): fix paths * fix(mounts): mount all secrets * fix(secrets): allow .env file to be a secret * fix(secrets): revert failed attempt to support .env * chore(ai): cd to dir with pyproject.toml * chore(ai): try to fix issue with pyproject.toml * fix(ai): actually we need to poetry run * chore(ai): debug lines * chore(ai): debug lines * chore(ai): debug lines * chore(ai): debug lines * chore(ai): debug lines * fix(mount): don't overwrite whole dir * fix(ai): mounts * chore(ai): remove debug lines * fix(ai): remove debug * chore(debug): debug line * chore(debug): remove debug line * feat(ai): add to roll all, fix port in service yaml * fix(ai): fix nginx conf file name * fix(nginx): fix routing for AI service to add trailing slash after "ai" * Update web_whitelist * Update kube-setup-gen3-discovery-ai.sh * Update README.md * Update gen3-discovery-ai-deploy.yaml * Update gen3-discovery-ai-deploy.yaml * Update gen3-discovery-ai-service.yaml * Update kube-setup-gen3-discovery-ai.sh * feat(discovery): update to data load commands and strategy to support markdown --- gen3/bin/kube-roll-all.sh | 6 + gen3/bin/kube-setup-gen3-discovery-ai.sh | 154 +++++++++++++++ kube/services/gen3-discovery-ai/README.md | 42 ++++ .../gen3-discovery-ai-deploy.yaml | 181 ++++++++++++++++++ .../gen3-discovery-ai-service.yaml | 21 ++ .../gen3-discovery-ai-service.conf | 12 ++ 6 files changed, 416 insertions(+) create mode 100644 gen3/bin/kube-setup-gen3-discovery-ai.sh create mode 100644 kube/services/gen3-discovery-ai/README.md create mode 100644 kube/services/gen3-discovery-ai/gen3-discovery-ai-deploy.yaml create mode 100644 
kube/services/gen3-discovery-ai/gen3-discovery-ai-service.yaml
 create mode 100644 kube/services/revproxy/gen3.nginx.conf/gen3-discovery-ai-service.conf

diff --git a/gen3/bin/kube-roll-all.sh b/gen3/bin/kube-roll-all.sh
index 6a67f2bdd2..1dca87c68c 100644
--- a/gen3/bin/kube-roll-all.sh
+++ b/gen3/bin/kube-roll-all.sh
@@ -243,6 +243,12 @@ else
   gen3_log_info "not deploying dicom-viewer - no manifest entry for '.versions[\"dicom-viewer\"]'"
 fi
 
+if g3k_manifest_lookup '.versions["gen3-discovery-ai"]' 2> /dev/null; then
+  gen3 kube-setup-gen3-discovery-ai &
+else
+  gen3_log_info "not deploying gen3-discovery-ai - no manifest entry for '.versions[\"gen3-discovery-ai\"]'"
+fi
+
 if g3k_manifest_lookup '.versions["ohdsi-atlas"]' && g3k_manifest_lookup '.versions["ohdsi-webapi"]' 2> /dev/null; then
   gen3 kube-setup-ohdsi &
 else
diff --git a/gen3/bin/kube-setup-gen3-discovery-ai.sh b/gen3/bin/kube-setup-gen3-discovery-ai.sh
new file mode 100644
index 0000000000..44a472a74c
--- /dev/null
+++ b/gen3/bin/kube-setup-gen3-discovery-ai.sh
@@ -0,0 +1,154 @@
+#!/bin/bash
+#
+# Deploy the gen3-discovery-ai service
+#
+
+source "${GEN3_HOME}/gen3/lib/utils.sh"
+gen3_load "gen3/gen3setup"
+
+# NOTE: no db for this service yet, but we'll likely need it in the future
+setup_database() {
+  gen3_log_info "setting up gen3-discovery-ai service ..."
+
+  if g3kubectl describe secret gen3-discovery-ai-g3auto > /dev/null 2>&1; then
+    gen3_log_info "gen3-discovery-ai-g3auto secret already configured"
+    return 0
+  fi
+  if [[ -n "$JENKINS_HOME" || ! -f "$(gen3_secrets_folder)/creds.json" ]]; then
+    gen3_log_err "skipping db setup in non-adminvm environment"
+    return 0
+  fi
+  # Setup .env file that gen3-discovery-ai service consumes
+  local secretsFolder="$(gen3_secrets_folder)/g3auto/gen3-discovery-ai"
+  if [[ ! -f "$secretsFolder/gen3-discovery-ai.env" || ! -f "$secretsFolder/base64Authz.txt" ]]; then
+    if [[ ! -f "$secretsFolder/dbcreds.json" ]]; then
+      if ! gen3 db setup gen3-discovery-ai; then
+        gen3_log_err "Failed setting up database for gen3-discovery-ai service"
+        return 1
+      fi
+    fi
+    if [[ ! -f "$secretsFolder/dbcreds.json" ]]; then
+      gen3_log_err "dbcreds not present in Gen3Secrets/"
+      return 1
+    fi
+
+    # go ahead and rotate the password whenever we regen this file
+    local password="$(gen3 random)"
+    cat - > "$secretsFolder/gen3-discovery-ai.env" <<EOM
+DB_HOST=$(jq -r .db_host < "$secretsFolder/dbcreds.json")
+DB_USER=$(jq -r .db_username < "$secretsFolder/dbcreds.json")
+DB_PASSWORD=$password
+DB_DATABASE=$(jq -r .db_database < "$secretsFolder/dbcreds.json")
+EOM
+    # make it easy for the revproxy to pick up the basic-auth header
+    echo -n "gateway:$password" | base64 > "$secretsFolder/base64Authz.txt"
+  fi
+  gen3 secrets sync 'setup gen3-discovery-ai-g3auto secrets'
+}
+
+if ! g3k_manifest_lookup '.versions."gen3-discovery-ai"' 2> /dev/null; then
+  gen3_log_info "kube-setup-gen3-discovery-ai exiting - gen3-discovery-ai service not in manifest"
+  exit 0
+fi
+
+# There's no db for this service *yet*
+#
+# if ! setup_database; then
+#   gen3_log_err "kube-setup-gen3-discovery-ai bailing out - database failed setup"
+#   exit 1
+# fi
+
+setup_storage() {
+  local saName="gen3-discovery-ai-sa"
+  g3kubectl create sa "$saName" > /dev/null 2>&1 || true
+
+  local secret
+  local secretsFolder="$(gen3_secrets_folder)/g3auto/gen3-discovery-ai"
+
+  secret="$(g3kubectl get secret gen3-discovery-ai-g3auto -o json 2> /dev/null)"
+  local hasStorageCfg
+  hasStorageCfg=$(jq -r '.data | has("storage_config.json")' <<< "$secret")
+
+  if [ "$hasStorageCfg" = "false" ]; then
+    gen3_log_info "setting up storage for gen3-discovery-ai service"
+    #
+    # the gen3-discovery-ai-g3auto secret does not have storage config yet
+    # we need to set up an S3 bucket and IAM creds
+    # let's avoid creating multiple buckets for different
+    # deployments to the same k8s cluster (dev, etc)
+    #
+    local bucketName
+    local accountNumber
+    local environment
+
+    if ! accountNumber="$(aws sts get-caller-identity --output text --query 'Account')"; then
+      gen3_log_err "could not determine account number"
+      return 1
+    fi
+
+    gen3_log_info "accountNumber: ${accountNumber}"
+
+    if ! environment="$(g3kubectl get configmap manifest-global -o json | jq -r .data.environment)"; then
+      gen3_log_err "could not determine environment from manifest-global - bailing out of gen3-discovery-ai setup"
+      return 1
+    fi
+
+    gen3_log_info "environment: ${environment}"
+
+    # try to come up with a unique but composable bucket name
+    bucketName="gen3-discovery-ai-${accountNumber}-${environment//_/-}"
+
+    gen3_log_info "bucketName: ${bucketName}"
+
+    if aws s3 ls --page-size 1 "s3://${bucketName}" > /dev/null 2>&1; then
+      gen3_log_info "${bucketName} s3 bucket already exists - probably in use by another namespace - copy the creds from there to $(gen3_secrets_folder)/g3auto/gen3-discovery-ai"
+      # continue on ...
+    elif ! gen3 s3 create "${bucketName}"; then
+      gen3_log_err "maybe failed to create bucket ${bucketName}, but maybe not, because the terraform script is flaky"
+    fi
+
+    local hostname
+    hostname="$(gen3 api hostname)"
+    jq -r -n --arg bucket "${bucketName}" --arg hostname "${hostname}" '.bucket=$bucket | .prefix=$hostname' > "${secretsFolder}/storage_config.json"
+    gen3 secrets sync 'setup gen3-discovery-ai credentials'
+
+    local roleName
+    roleName="$(gen3 api safe-name gen3-discovery-ai)" || return 1
+
+    if ! gen3 awsrole info "$roleName" > /dev/null; then # setup role
+      bucketName="$( (gen3 secrets decode 'gen3-discovery-ai-g3auto' 'storage_config.json' || echo ERROR) | jq -r .bucket)" || return 1
+      gen3 awsrole create "$roleName" "$saName" || return 1
+      gen3 s3 attach-bucket-policy "$bucketName" --read-write --role-name "${roleName}"
+      # try to give the gitops role read/write permissions on the bucket
+      local gitopsRoleName
+      gitopsRoleName="$(gen3 api safe-name gitops)"
+      gen3 s3 attach-bucket-policy "$bucketName" --read-write --role-name "${gitopsRoleName}"
+    fi
+  fi
+
+  return 0
+}
+
+if ! setup_storage; then
+  gen3_log_err "kube-setup-gen3-discovery-ai bailing out - storage failed setup"
+  exit 1
+fi
+
+gen3_log_info "Setup complete, syncing configuration to bucket"
+
+bucketName="$( (gen3 secrets decode 'gen3-discovery-ai-g3auto' 'storage_config.json' || echo ERROR) | jq -r .bucket)" || exit 1
+aws s3 sync "$(dirname "$(g3k_manifest_path)")/gen3-discovery-ai/knowledge" "s3://$bucketName" --delete
+
+gen3 roll gen3-discovery-ai
+g3kubectl apply -f "${GEN3_HOME}/kube/services/gen3-discovery-ai/gen3-discovery-ai-service.yaml"
+
+if [[ -z "$GEN3_ROLL_ALL" ]]; then
+  gen3 kube-setup-networkpolicy
+  gen3 kube-setup-revproxy
+fi
+
+gen3_log_info "The gen3-discovery-ai service has been deployed onto the kubernetes cluster"
+gen3_log_info "test with: curl https://commons-host/ai"
diff --git a/kube/services/gen3-discovery-ai/README.md b/kube/services/gen3-discovery-ai/README.md
new file mode 100644
index 0000000000..4c20678e06
--- /dev/null
+++ b/kube/services/gen3-discovery-ai/README.md
@@ -0,0 +1,42 @@
+# Gen3 Discovery AI Configuration
+
+Expects data in a `gen3-discovery-ai` folder relative to
+where the `manifest.json` is.
+
+Basic setup:
+
+`{{dir where manifest.json is}}/gen3-discovery-ai/knowledge/`
+
+- `tsvs` folder
+    - TSV files, with the topic name at the beginning of each file name
+- `markdown` folder
+    - {{topic_name_1}}
+        - markdown file(s)
+    - {{topic_name_2}}
+        - markdown file(s)
+
+The `kube-setup-gen3-discovery-ai` script syncs the above `knowledge` folder to
+an S3 bucket. The deployment's init containers then pull from the S3 bucket and run load commands
+to get the data into Chromadb.
+
+> Note: See the `gen3-discovery-ai` service repo docs and README for more details on data load capabilities.
+
+Check `gen3-discovery-ai-deploy.yaml` for the exact commands the automation runs.
+
+Expects secrets set up in the `g3auto/gen3-discovery-ai` folder:
+
+  - `credentials.json`: Google service account key, if using a topic with Google Vertex AI
+  - `env`: `.env` file contents for service configuration (see the service repo for a default one)
+
+## Populating Disk for the In-Memory Vectordb Chromadb
+
+In order to set up pre-configured topics, we need to load data
+into Chromadb (an in-memory vector database with an option to persist to disk).
+
+To load topics consistently, we set up an S3 bucket to house the persisted
+data for the vectordb.
+
+### Getting data from S3 into memory
+
+We specify a path for Chromadb to use for persisted data, and when it sees
+data there, it loads it in. So the deployment automation 1. runs `aws s3 sync` to pull down the
+bucket contents and then 2. calls a script to load those files into the in-memory vectorstore,
+as shown in the example below.
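+
+For illustration, here is a minimal sketch of a knowledge folder and the load commands the
+`gen3-discovery-ai-knowledge-init` container runs against it. The topic names `default` and
+`publications` are hypothetical examples, not required values:
+
+```bash
+# {{dir where manifest.json is}}/gen3-discovery-ai/knowledge/
+#   tsvs/
+#     default.tsv             # TSV for the hypothetical "default" topic
+#   markdown/
+#     publications/           # one sub-folder per topic
+#       overview.md
+
+# After `aws s3 sync` pulls the bucket into /gen3discoveryai/knowledge/tmp,
+# the knowledge-init container loads it roughly like this (see gen3-discovery-ai-deploy.yaml):
+poetry run python /gen3discoveryai/bin/load_into_knowledge_store.py tsvs /gen3discoveryai/knowledge/tmp/tsvs
+poetry run python /gen3discoveryai/bin/load_into_knowledge_store.py markdown --topic publications /gen3discoveryai/knowledge/tmp/markdown/publications
+```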
diff --git a/kube/services/gen3-discovery-ai/gen3-discovery-ai-deploy.yaml b/kube/services/gen3-discovery-ai/gen3-discovery-ai-deploy.yaml
new file mode 100644
index 0000000000..dcfe03248a
--- /dev/null
+++ b/kube/services/gen3-discovery-ai/gen3-discovery-ai-deploy.yaml
@@ -0,0 +1,181 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: gen3-discovery-ai-deployment
+spec:
+  selector:
+    # Only select pods based on the 'app' label
+    matchLabels:
+      app: gen3-discovery-ai
+      release: production
+  strategy:
+    type: RollingUpdate
+    rollingUpdate:
+      maxSurge: 1
+      maxUnavailable: 0
+  template:
+    metadata:
+      labels:
+        app: gen3-discovery-ai
+        release: production
+        GEN3_DATE_LABEL
+    spec:
+      serviceAccountName: gen3-discovery-ai-sa
+      volumes:
+        - name: gen3-discovery-ai-g3auto-volume
+          secret:
+            secretName: gen3-discovery-ai-g3auto
+        - name: gen3-discovery-ai-knowledge-library-volume
+          emptyDir: {}
+      initContainers:
+        # chromadb's persisted disk support requires the ability to write. We don't technically need this ability
+        # since we're populating the entirety of the database from configured files (no live updates).
+        #
+        # Solution: utilize emptyDir as a writable space.
+        #
+        # Procedure: in init containers, copy files from s3 to writable
+        # temporary space in emptyDir, use files from writable space
+        # to load into knowledge library, move final knowledge library
+        # files into top-level emptyDir and make available in final container
+        - name: gen3-discovery-ai-aws-init
+          GEN3_AWSHELPER_IMAGE|-image: quay.io/cdis/awshelper:master-|
+          imagePullPolicy: Always
+          ports:
+            - containerPort: 8080
+          env:
+            - name: GEN3_DEBUG
+              GEN3_DEBUG_FLAG|-value: "False"-|
+          volumeMounts:
+            - name: gen3-discovery-ai-g3auto-volume
+              readOnly: true
+              mountPath: /gen3discoveryai/.env
+              subPath: env
+            - name: gen3-discovery-ai-g3auto-volume
+              readOnly: true
+              mountPath: /gen3discoveryai/credentials.json
+              subPath: credentials.json
+            - name: gen3-discovery-ai-g3auto-volume
+              readOnly: true
+              mountPath: /gen3discoveryai/storage_config.json
+              subPath: storage_config.json
+            - name: gen3-discovery-ai-knowledge-library-volume
+              mountPath: /gen3discoveryai/knowledge
+          resources:
+            requests:
+              cpu: 1
+            limits:
+              cpu: 2
+              memory: 512Mi
+          command: ["/bin/bash"]
+          args:
+            - "-c"
+            - |
+              bucketName=$(grep -o "\"bucket\": *\"[^\"]*\"" /gen3discoveryai/storage_config.json | awk -F'"' '{print $4}')
+              echo BUCKET: "$bucketName"
+              echo
+              echo BEFORE /gen3discoveryai/knowledge
+              ls -Ra /gen3discoveryai/knowledge
+              echo
+              echo syncing from s3
+              aws s3 sync "s3://${bucketName}" "/gen3discoveryai/knowledge/tmp"
+              echo
+              echo AFTER /gen3discoveryai/knowledge
+              ls -Ra /gen3discoveryai/knowledge
+        - name: gen3-discovery-ai-knowledge-init
+          GEN3_GEN3-DISCOVERY-AI_IMAGE
+          imagePullPolicy: Always
+          ports:
+            - containerPort: 8080
+          env:
+            - name: GEN3_DEBUG
+              GEN3_DEBUG_FLAG|-value: "False"-|
+            - name: ANONYMIZED_TELEMETRY
+              value: "False"
+            - name: GOOGLE_APPLICATION_CREDENTIALS
+              value: /gen3discoveryai/credentials.json
+          volumeMounts:
+            - name: gen3-discovery-ai-g3auto-volume
+              readOnly: true
+              mountPath: /gen3discoveryai/.env
+              subPath: env
+            - name: gen3-discovery-ai-g3auto-volume
+              readOnly: true
+              mountPath: /gen3discoveryai/credentials.json
+              subPath: credentials.json
+            - name: gen3-discovery-ai-g3auto-volume
+              readOnly: true
+              mountPath: /gen3discoveryai/storage_config.json
+              subPath: storage_config.json
+            - name: gen3-discovery-ai-knowledge-library-volume
+              mountPath: /gen3discoveryai/knowledge
+          resources:
+            requests:
+              cpu: 1
+            limits:
+              cpu: 2
+              memory: 512Mi
+          command: ["/bin/bash"]
+          args:
+            - "-c"
+            - |
+              echo
+              echo BEFORE /gen3discoveryai/knowledge
+              ls -Ra /gen3discoveryai/knowledge
+              echo running load_into_knowledge_store.py
+              poetry run python /gen3discoveryai/bin/load_into_knowledge_store.py tsvs /gen3discoveryai/knowledge/tmp/tsvs
+
+              if [ -d "/gen3discoveryai/knowledge/tmp/markdown" ]; then
+                  for dir in "/gen3discoveryai/knowledge/tmp/markdown"/*; do
+                      if [ -d "$dir" ]; then
+                          dir_name=$(basename "$dir")
+
+                          echo "Processing directory: $dir_name. Full path: $dir"
+                          poetry run python /gen3discoveryai/bin/load_into_knowledge_store.py markdown --topic "$dir_name" "$dir"
+                      fi
+                  done
+              else
+                  echo "Not syncing markdown, directory not found: /gen3discoveryai/knowledge/tmp/markdown"
+              fi
+
+              rm -r /gen3discoveryai/knowledge/tmp/
+              echo
+              echo AFTER /gen3discoveryai/knowledge
+              ls -Ra /gen3discoveryai/knowledge
+      containers:
+        - name: gen3-discovery-ai
+          GEN3_GEN3-DISCOVERY-AI_IMAGE
+          imagePullPolicy: Always
+          ports:
+            - containerPort: 8080
+          env:
+            - name: GEN3_DEBUG
+              GEN3_DEBUG_FLAG|-value: "False"-|
+            - name: ANONYMIZED_TELEMETRY
+              value: "False"
+            - name: GOOGLE_APPLICATION_CREDENTIALS
+              value: /gen3discoveryai/credentials.json
+          volumeMounts:
+            - name: gen3-discovery-ai-g3auto-volume
+              readOnly: true
+              mountPath: /gen3discoveryai/.env
+              subPath: env
+            - name: gen3-discovery-ai-g3auto-volume
+              readOnly: true
+              mountPath: /gen3discoveryai/credentials.json
+              subPath: credentials.json
+            - name: gen3-discovery-ai-g3auto-volume
+              readOnly: true
+              mountPath: /gen3discoveryai/storage_config.json
+              subPath: storage_config.json
+            - name: gen3-discovery-ai-knowledge-library-volume
+              mountPath: /gen3discoveryai/knowledge
+          resources:
+            requests:
+              cpu: 1
+            limits:
+              cpu: 2
+              # NOTE: If the configured data for the knowledge library (vector database) is large, you may need to bump this
+              memory: 512Mi
diff --git a/kube/services/gen3-discovery-ai/gen3-discovery-ai-service.yaml b/kube/services/gen3-discovery-ai/gen3-discovery-ai-service.yaml
new file mode 100644
index 0000000000..b4734c3b8a
--- /dev/null
+++ b/kube/services/gen3-discovery-ai/gen3-discovery-ai-service.yaml
@@ -0,0 +1,21 @@
+kind: Service
+apiVersion: v1
+metadata:
+  name: gen3-discovery-ai-service
+spec:
+  selector:
+    app: gen3-discovery-ai
+    release: production
+  ports:
+    - protocol: TCP
+      port: 80
+      targetPort: 8089
+      name: http
+      nodePort: null
+    - protocol: TCP
+      port: 443
+      targetPort: 443
+      name: https
+      nodePort: null
+  type: ClusterIP
+
diff --git a/kube/services/revproxy/gen3.nginx.conf/gen3-discovery-ai-service.conf b/kube/services/revproxy/gen3.nginx.conf/gen3-discovery-ai-service.conf
new file mode 100644
index 0000000000..42e9a3758b
--- /dev/null
+++ b/kube/services/revproxy/gen3.nginx.conf/gen3-discovery-ai-service.conf
@@ -0,0 +1,12 @@
+  location /ai {
+    if ($csrf_check !~ ^ok-\S.+$) {
+      return 403 "failed csrf check";
+    }
+
+    set $proxy_service "gen3-discovery-ai-service";
+    set $upstream http://gen3-discovery-ai-service$des_domain;
+    rewrite ^/ai/(.*) /$1 break;
+    proxy_pass $upstream;
+    proxy_redirect http://$host/ https://$host/ai/;
+    client_max_body_size 0;
+  }
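
For a quick sanity check of the revproxy block above: the `rewrite` strips the `/ai` prefix before proxying to `gen3-discovery-ai-service`, and `kube-setup-gen3-discovery-ai.sh` already suggests a curl smoke test. A minimal sketch of both follows; the `/topics` path is only an illustrative placeholder, see the `gen3-discovery-ai` repo for the service's actual endpoints:

```bash
# /ai/<path> is proxied to the service with the /ai prefix stripped:
#   https://<commons-host>/ai/topics  ->  http://gen3-discovery-ai-service/topics
# smoke test from the adminvm once the deployment has rolled:
curl -s "https://$(gen3 api hostname)/ai"
```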