Skip to content

Commit

Permalink
Merge branch 'master' into feat/GPE-1173
Browse files Browse the repository at this point in the history
  • Loading branch information
EliseCastle23 authored Mar 4, 2024
2 parents 190fa5d + 6ee728a commit be7ad8a
Show file tree
Hide file tree
Showing 6 changed files with 416 additions and 0 deletions.
6 changes: 6 additions & 0 deletions gen3/bin/kube-roll-all.sh
Original file line number Diff line number Diff line change
Expand Up @@ -243,6 +243,12 @@ else
gen3_log_info "not deploying dicom-viewer - no manifest entry for '.versions[\"dicom-viewer\"]'"
fi

# Deploy gen3-discovery-ai in the background only when the manifest
# declares a version for it; otherwise log that it is being skipped.
if ! g3k_manifest_lookup '.versions["gen3-discovery-ai"]' 2> /dev/null; then
  gen3_log_info "not deploying gen3-discovery-ai - no manifest entry for '.versions[\"gen3-discovery-ai\"]'"
else
  gen3 kube-setup-gen3-discovery-ai &
fi

if g3k_manifest_lookup '.versions["ohdsi-atlas"]' && g3k_manifest_lookup '.versions["ohdsi-webapi"]' 2> /dev/null; then
gen3 kube-setup-ohdsi &
else
Expand Down
154 changes: 154 additions & 0 deletions gen3/bin/kube-setup-gen3-discovery-ai.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
#!/bin/bash
#
# Deploy the gen3-discovery-ai service
#

source "${GEN3_HOME}/gen3/lib/utils.sh"
gen3_load "gen3/gen3setup"

# NOTE: no db for this service yet, but we'll likely need it in the future
# NOTE: no db for this service yet, but we'll likely need it in the future
#
# Create the gen3-discovery-ai-g3auto secret:
#   - no-op if the secret already exists, or when not on an admin VM
#   - provisions the service database via `gen3 db setup` if dbcreds are absent
#   - writes the service .env plus an nginx-friendly base64 auth token,
#     rotating the gateway password whenever those files are regenerated
# Returns: 0 on success or benign skip, 1 on db/creds failure.
setup_database() {
  gen3_log_info "setting up gen3-discovery-ai service ..."

  if g3kubectl describe secret gen3-discovery-ai-g3auto > /dev/null 2>&1; then
    gen3_log_info "gen3-discovery-ai-g3auto secret already configured"
    return 0
  fi
  if [[ -n "$JENKINS_HOME" || ! -f "$(gen3_secrets_folder)/creds.json" ]]; then
    gen3_log_err "skipping db setup in non-adminvm environment"
    return 0
  fi
  # Fix: secretsFolder must be assigned *before* the guard below reads it.
  # Previously the guard tested paths under an unset variable, so it always
  # passed and the env file (and gateway password) were regenerated and
  # rotated on every run.
  local secretsFolder="$(gen3_secrets_folder)/g3auto/gen3-discovery-ai"
  # Setup .env file that gen3-discovery-ai service consumes
  if [[ ! -f "$secretsFolder/gen3-discovery-ai.env" || ! -f "$secretsFolder/base64Authz.txt" ]]; then
    if [[ ! -f "$secretsFolder/dbcreds.json" ]]; then
      if ! gen3 db setup gen3-discovery-ai; then
        gen3_log_err "Failed setting up database for gen3-discovery-ai service"
        return 1
      fi
    fi
    if [[ ! -f "$secretsFolder/dbcreds.json" ]]; then
      gen3_log_err "dbcreds not present in Gen3Secrets/"
      return 1
    fi

    # go ahead and rotate the password whenever we regen this file
    local password="$(gen3 random)"
    cat - > "$secretsFolder/gen3-discovery-ai.env" <<EOM
DEBUG=0
DB_HOST=$(jq -r .db_host < "$secretsFolder/dbcreds.json")
DB_USER=$(jq -r .db_username < "$secretsFolder/dbcreds.json")
DB_PASSWORD=$(jq -r .db_password < "$secretsFolder/dbcreds.json")
DB_DATABASE=$(jq -r .db_database < "$secretsFolder/dbcreds.json")
ADMIN_LOGINS=gateway:$password
EOM
    # make it easy for nginx to get the Authorization header ...
    echo -n "gateway:$password" | base64 > "$secretsFolder/base64Authz.txt"
  fi
  gen3 secrets sync 'setup gen3-discovery-ai-g3auto secrets'
}

# Bail out early (successfully) when the manifest does not enable the service.
g3k_manifest_lookup '.versions."gen3-discovery-ai"' 2> /dev/null || {
  gen3_log_info "kube-setup-gen3-discovery-ai exiting - gen3-discovery-ai service not in manifest"
  exit 0
}

# There's no db for this service *yet*
#
# if ! setup_database; then
#   gen3_log_err "kube-setup-gen3-discovery-ai bailing out - database failed setup"
#   exit 1
# fi

# Provision S3 storage and IAM access for the gen3-discovery-ai service.
#   - ensures the gen3-discovery-ai-sa service account exists
#   - when storage_config.json is not yet in the g3auto secret: creates (or
#     reuses) an account/environment-scoped bucket, writes storage_config.json,
#     syncs secrets, and attaches read/write bucket policies to the service
#     role and the gitops role
# Returns: 0 on success or when storage is already configured; non-zero when
# a required lookup (account number, environment, role) fails.
setup_storage() {
  local saName="gen3-discovery-ai-sa"
  g3kubectl create sa "$saName" > /dev/null 2>&1 || true

  local secret
  local secretsFolder="$(gen3_secrets_folder)/g3auto/gen3-discovery-ai"

  secret="$(g3kubectl get secret gen3-discovery-ai-g3auto -o json 2> /dev/null)"
  local hasStorageCfg
  hasStorageCfg=$(jq -r '.data | has("storage_config.json")' <<< "$secret")

  # Fix: if the secret does not exist yet, $secret is empty and jq emits
  # nothing, so the old `= "false"` comparison silently skipped setup.
  # Run setup whenever the key is not positively present.
  if [ "$hasStorageCfg" != "true" ]; then
    gen3_log_info "setting up storage for gen3-discovery-ai service"
    #
    # gen3-discovery-ai-g3auto secret still does not exist
    # we need to setup an S3 bucket and IAM creds
    # let's avoid creating multiple buckets for different
    # deployments to the same k8s cluster (dev, etc)
    #
    local bucketName
    local accountNumber
    local environment

    if ! accountNumber="$(aws sts get-caller-identity --output text --query 'Account')"; then
      gen3_log_err "could not determine account number"
      return 1
    fi

    gen3_log_info "accountNumber: ${accountNumber}"

    if ! environment="$(g3kubectl get configmap manifest-global -o json | jq -r .data.environment)"; then
      gen3_log_err "could not determine environment from manifest-global - bailing out of gen3-discovery-ai setup"
      return 1
    fi

    gen3_log_info "environment: ${environment}"

    # try to come up with a unique but composable bucket name
    bucketName="gen3-discovery-ai-${accountNumber}-${environment//_/-}"

    gen3_log_info "bucketName: ${bucketName}"

    if aws s3 ls --page-size 1 "s3://${bucketName}" > /dev/null 2>&1; then
      gen3_log_info "${bucketName} s3 bucket already exists - probably in use by another namespace - copy the creds from there to $(gen3_secrets_folder)/g3auto/gen3-discovery-ai"
      # continue on ...
    elif ! gen3 s3 create "${bucketName}"; then
      gen3_log_err "maybe failed to create bucket ${bucketName}, but maybe not, because the terraform script is flaky"
    fi

    local hostname
    hostname="$(gen3 api hostname)"
    jq -r -n --arg bucket "${bucketName}" --arg hostname "${hostname}" '.bucket=$bucket | .prefix=$hostname' > "${secretsFolder}/storage_config.json"
    gen3 secrets sync 'setup gen3-discovery-ai credentials'

    local roleName
    roleName="$(gen3 api safe-name gen3-discovery-ai)" || return 1

    if ! gen3 awsrole info "$roleName" > /dev/null; then # setup role
      # re-read the bucket name from the synced secret so we attach the
      # policy to whatever bucket the secret actually records
      bucketName="$( (gen3 secrets decode 'gen3-discovery-ai-g3auto' 'storage_config.json' || echo ERROR) | jq -r .bucket)" || return 1
      gen3 awsrole create "$roleName" "$saName" || return 1
      gen3 s3 attach-bucket-policy "$bucketName" --read-write --role-name "${roleName}"
      # try to give the gitops role read/write permissions on the bucket
      local gitopsRoleName
      gitopsRoleName="$(gen3 api safe-name gitops)"
      gen3 s3 attach-bucket-policy "$bucketName" --read-write --role-name "${gitopsRoleName}"
    fi
  fi

  return 0
}

if ! setup_storage; then
  gen3_log_err "kube-setup-gen3-discovery-ai bailing out - storage failed setup"
  exit 1
fi

gen3_log_info "Setup complete, syncing configuration to bucket"

# jq fails (and we exit) if the secret could not be decoded
bucketName="$( (gen3 secrets decode 'gen3-discovery-ai-g3auto' 'storage_config.json' || echo ERROR) | jq -r .bucket)" || exit 1
# Fix: quote the nested command substitution so a manifest path containing
# spaces does not word-split/glob.
aws s3 sync "$(dirname "$(g3k_manifest_path)")/gen3-discovery-ai/knowledge" "s3://$bucketName" --delete

gen3 roll gen3-discovery-ai
g3kubectl apply -f "${GEN3_HOME}/kube/services/gen3-discovery-ai/gen3-discovery-ai-service.yaml"

# When run standalone (not from kube-roll-all), refresh network policy and
# the reverse proxy so the new service is routable.
if [[ -z "$GEN3_ROLL_ALL" ]]; then
  gen3 kube-setup-networkpolicy
  gen3 kube-setup-revproxy
fi

gen3_log_info "The gen3-discovery-ai service has been deployed onto the kubernetes cluster"
gen3_log_info "test with: curl https://commons-host/ai"
42 changes: 42 additions & 0 deletions kube/services/gen3-discovery-ai/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# Gen3 Discovery AI Configuration

Expects data in a `gen3-discovery-ai` folder relative to
where the `manifest.json` is.

Basic setup:

`{{dir where manifest.json is}}/gen3-discovery-ai/knowledge/`

- `tsvs` folder
- tsvs with topic_name at beginning of file
- `markdown` folder
- {{topic_name_1}}
- markdown file(s)
- {{topic_name_2}}
- markdown file(s)

The `kube-setup-gen3-discovery-ai` script syncs the above `/knowledge` folder to
an S3 bucket. The service configuration then pulls from the S3 bucket and runs load commands
to get the data into chromadb.

> Note: See the `gen3-discovery-ai` service repo docs and README for more details on data load capabilities.
> Check the `gen3-discovery-ai-deploy.yaml` for what commands are being run in the automation.

Expects secrets setup in `g3auto/gen3-discovery-ai` folder
- `credentials.json`: Google service account key if using a topic with Google Vertex AI
- `env`: .env file contents for service configuration (see service repo for a default one)

## Populating Disk for In-Memory Vectordb Chromadb

In order to setup pre-configured topics, we need to load a bunch of data
into Chromadb (which is an in-mem vectordb with an option to persist to disk).

To load topics consistently, we setup an S3 bucket to house the persisted
data for the vectordb.

### Getting data from S3 in mem

We specify a path for Chromadb to use for persisted data; when it sees
data there, it loads it in. So the deployment automation first runs `aws s3 sync`
to pull the bucket contents, then calls a script to load those files into the
in-memory vectorstore.
181 changes: 181 additions & 0 deletions kube/services/gen3-discovery-ai/gen3-discovery-ai-deploy.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,181 @@
apiVersion: apps/v1
kind: Deployment
metadata:
  name: gen3-discovery-ai-deployment
spec:
  selector:
    # Only select pods based on the 'app' label
    matchLabels:
      app: gen3-discovery-ai
      release: production
  strategy:
    type: RollingUpdate
    rollingUpdate:
      maxSurge: 1
      maxUnavailable: 0
  template:
    metadata:
      labels:
        app: gen3-discovery-ai
        release: production
        GEN3_DATE_LABEL
    spec:
      serviceAccountName: gen3-discovery-ai-sa
      volumes:
        - name: gen3-discovery-ai-g3auto-volume
          secret:
            secretName: gen3-discovery-ai-g3auto
        - name: gen3-discovery-ai-knowledge-library-volume
          emptyDir: {}
      initContainers:
        # chromadb's persisted disk support requires the ability to write. We don't technically need this ability
        # since we're populating the entirety of the database from configured files (no live updates).
        #
        # Solution: utilize emptyDir as a writable space.
        #
        # Procedure: in init containers, copy files from s3 to writable
        # temporary space in emptyDir, use files from writable space
        # to load into knowledge library, move final knowledge library
        # files into top-level emptyDir and make available in final container
        - name: gen3-discovery-ai-aws-init
          GEN3_AWSHELPER_IMAGE|-image: quay.io/cdis/awshelper:master-|
          # Fix: imagePullPolicy was declared twice in this container
          # (duplicate YAML mapping keys are invalid); kept a single entry.
          imagePullPolicy: Always
          ports:
            - containerPort: 8080
          env:
            - name: GEN3_DEBUG
              GEN3_DEBUG_FLAG|-value: "False"-|
          volumeMounts:
            - name: gen3-discovery-ai-g3auto-volume
              readOnly: true
              mountPath: /gen3discoveryai/.env
              subPath: env
            - name: gen3-discovery-ai-g3auto-volume
              readOnly: true
              mountPath: /gen3discoveryai/credentials.json
              subPath: credentials.json
            - name: gen3-discovery-ai-g3auto-volume
              readOnly: true
              mountPath: /gen3discoveryai/storage_config.json
              subPath: storage_config.json
            - name: gen3-discovery-ai-knowledge-library-volume
              mountPath: /gen3discoveryai/knowledge
          resources:
            requests:
              cpu: 1
            limits:
              cpu: 2
              memory: 512Mi
          command: ["/bin/bash"]
          args:
            - "-c"
            - |
              bucketName=$(grep -o "\"bucket\": *\"[^\"]*\"" /gen3discoveryai/storage_config.json | awk -F'"' '{print $4}')
              echo BUCKET: "$bucketName"
              echo
              echo BEFORE /gen3discoveryai/knowledge
              ls -Ra /gen3discoveryai/knowledge
              echo
              echo syncing from s3
              aws s3 sync "s3://${bucketName}" "/gen3discoveryai/knowledge/tmp"
              echo
              echo AFTER /gen3discoveryai/knowledge
              ls -Ra /gen3discoveryai/knowledge
        - name: gen3-discovery-ai-knowledge-init
          GEN3_GEN3-DISCOVERY-AI_IMAGE
          # Fix: removed duplicate imagePullPolicy key (was declared twice).
          imagePullPolicy: Always
          ports:
            - containerPort: 8080
          env:
            - name: GEN3_DEBUG
              GEN3_DEBUG_FLAG|-value: "False"-|
            - name: ANONYMIZED_TELEMETRY
              value: "False"
            - name: GOOGLE_APPLICATION_CREDENTIALS
              value: /gen3discoveryai/credentials.json
          volumeMounts:
            - name: gen3-discovery-ai-g3auto-volume
              readOnly: true
              mountPath: /gen3discoveryai/.env
              subPath: env
            - name: gen3-discovery-ai-g3auto-volume
              readOnly: true
              mountPath: /gen3discoveryai/credentials.json
              subPath: credentials.json
            - name: gen3-discovery-ai-g3auto-volume
              readOnly: true
              mountPath: /gen3discoveryai/storage_config.json
              subPath: storage_config.json
            - name: gen3-discovery-ai-knowledge-library-volume
              mountPath: /gen3discoveryai/knowledge
          resources:
            requests:
              cpu: 1
            limits:
              cpu: 2
              memory: 512Mi
          command: ["/bin/bash"]
          args:
            - "-c"
            - |
              echo
              echo BEFORE /gen3discoveryai/knowledge
              ls -Ra /gen3discoveryai/knowledge
              echo running load_into_knowledge_store.py
              poetry run python /gen3discoveryai/bin/load_into_knowledge_store.py tsvs /gen3discoveryai/knowledge/tmp/tsvs
              if [ -d "/gen3discoveryai/knowledge/tmp/markdown" ]; then
                for dir in "/gen3discoveryai/knowledge/tmp/markdown"/*; do
                  if [ -d "$dir" ]; then
                    dir_name=$(basename "$dir")
                    echo "Processing directory: $dir_name. Full path: $dir"
                    poetry run python /gen3discoveryai/bin/load_into_knowledge_store.py markdown --topic $dir_name $dir
                  fi
                done
              else
                echo "Not syncing markdown, directory not found: /gen3discoveryai/knowledge/tmp/markdown"
              fi
              rm -r /gen3discoveryai/knowledge/tmp/
              echo
              echo AFTER /gen3discoveryai/knowledge
              ls -Ra /gen3discoveryai/knowledge
      containers:
        - name: gen3-discovery-ai
          GEN3_GEN3-DISCOVERY-AI_IMAGE
          # Fix: removed duplicate imagePullPolicy key (was declared twice).
          imagePullPolicy: Always
          ports:
            - containerPort: 8080
          env:
            - name: GEN3_DEBUG
              GEN3_DEBUG_FLAG|-value: "False"-|
            - name: ANONYMIZED_TELEMETRY
              value: "False"
            - name: GOOGLE_APPLICATION_CREDENTIALS
              value: /gen3discoveryai/credentials.json
          volumeMounts:
            - name: gen3-discovery-ai-g3auto-volume
              readOnly: true
              mountPath: /gen3discoveryai/.env
              subPath: env
            - name: gen3-discovery-ai-g3auto-volume
              readOnly: true
              mountPath: /gen3discoveryai/credentials.json
              subPath: credentials.json
            - name: gen3-discovery-ai-g3auto-volume
              readOnly: true
              mountPath: /gen3discoveryai/storage_config.json
              subPath: storage_config.json
            - name: gen3-discovery-ai-knowledge-library-volume
              mountPath: /gen3discoveryai/knowledge
          resources:
            requests:
              cpu: 1
            limits:
              cpu: 2
              # NOTE: If the configured data for the knowledge library (vector database) is large, you may need to bump this
              memory: 512Mi
Loading

0 comments on commit be7ad8a

Please sign in to comment.