diff --git a/.gitignore b/.gitignore
index 9b15dfdc..71bcd867 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
 *.tfstate*
 .terraform
-terraform.tfvars
\ No newline at end of file
+terraform.tfvars
+settings.json
\ No newline at end of file
diff --git a/deployments/README.md b/deployments/README.md
deleted file mode 100644
index bd0bab18..00000000
--- a/deployments/README.md
+++ /dev/null
@@ -1 +0,0 @@
-This directory is not actively used and will be removed in the future
\ No newline at end of file
diff --git a/deployments/spacelift/dpe-k8s/main.tf b/deployments/spacelift/dpe-k8s/main.tf
index 7f9a8429..0b2e9b5d 100644
--- a/deployments/spacelift/dpe-k8s/main.tf
+++ b/deployments/spacelift/dpe-k8s/main.tf
@@ -196,3 +196,75 @@ resource "spacelift_aws_integration_attachment" "k8s-deployments-aws-integration
   read  = true
   write = true
 }
+
+resource "spacelift_stack" "k8s-stack-deployments-testing" {
+  github_enterprise {
+    namespace = "Sage-Bionetworks-Workflows"
+    id        = "sage-bionetworks-workflows-gh"
+  }
+
+  depends_on = [
+    spacelift_space.dpe-space
+  ]
+
+  administrative          = false
+  autodeploy              = var.auto_deploy
+  branch                  = "ibcdpe-1004-airflow-ops"
+  description             = "Deployments internal to an EKS cluster"
+  name                    = "${var.k8s_stack_deployments_name}-testing"
+  project_root            = "deployments/stacks/dpe-k8s-deployments-testing"
+  repository              = "eks-stack"
+  terraform_version       = var.opentofu_version
+  terraform_workflow_tool = "OPEN_TOFU"
+  space_id                = spacelift_space.dpe-space.id
+  additional_project_globs = [
+    "deployments/"
+  ]
+}
+
+resource "spacelift_environment_variable" "k8s-stack-deployments-testing-environment-variables" {
+  for_each = local.k8s_stack_deployments_variables
+
+  stack_id   = spacelift_stack.k8s-stack-deployments-testing.id
+  name       = "TF_VAR_${each.key}"
+  value      = try(tostring(each.value), jsonencode(each.value))
+  write_only = false
+}
+
+resource "spacelift_context_attachment" "k8s-kubeconfig-hooks-testing" {
+  context_id = "kubernetes-deployments-kubeconfig"
+  stack_id   = spacelift_stack.k8s-stack-deployments-testing.id
+}
+
+resource "spacelift_stack_dependency" "k8s-stack-to-deployments-testing" {
+  stack_id            = spacelift_stack.k8s-stack-deployments-testing.id
+  depends_on_stack_id = spacelift_stack.k8s-stack.id
+}
+
+resource "spacelift_stack_dependency_reference" "dependency-references-testing" {
+  for_each = local.k8s_stack_to_deployment_variables
+
+  stack_dependency_id = spacelift_stack_dependency.k8s-stack-to-deployments-testing.id
+  output_name         = each.key
+  input_name          = each.value
+}
+
+resource "spacelift_stack_dependency_reference" "region-name-testing" {
+  stack_dependency_id = spacelift_stack_dependency.k8s-stack-to-deployments-testing.id
+  output_name         = "region"
+  input_name          = "REGION"
+}
+
+resource "spacelift_stack_dependency_reference" "cluster-name-testing" {
+  stack_dependency_id = spacelift_stack_dependency.k8s-stack-to-deployments-testing.id
+  output_name         = "cluster_name"
+  input_name          = "CLUSTER_NAME"
+}
+
+resource "spacelift_aws_integration_attachment" "k8s-deployments-aws-integration-attachment-testing" {
+
+  integration_id = var.aws_integration_id
+  stack_id       = spacelift_stack.k8s-stack-deployments-testing.id
+  read           = true
+  write          = true
+}
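The `spacelift_environment_variable` resource above forwards every entry of `local.k8s_stack_deployments_variables` to the child stack as a `TF_VAR_`-prefixed environment variable, which OpenTofu picks up as input variables. A minimal sketch of how the `try(tostring(...), jsonencode(...))` fallback behaves — the map contents here are hypothetical, the real local is defined elsewhere in this stack:

```hcl
locals {
  # Hypothetical inputs: scalars pass straight through tostring(), while
  # lists and maps make tostring() error, so try() falls back to jsonencode().
  example_variables = {
    cluster_name       = "dpe-k8s"                # -> TF_VAR_cluster_name=dpe-k8s
    private_subnet_ids = ["subnet-a", "subnet-b"] # -> TF_VAR_private_subnet_ids=["subnet-a","subnet-b"]
  }

  rendered = {
    for k, v in local.example_variables :
    "TF_VAR_${k}" => try(tostring(v), jsonencode(v))
  }
}
```

The JSON encoding works because OpenTofu parses environment-variable values for complex-typed variables as expressions, and a JSON-encoded list is valid syntax for a `list(string)`.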
diff --git a/deployments/stacks/dpe-k8s-deployments-testing/data.tf b/deployments/stacks/dpe-k8s-deployments-testing/data.tf
new file mode 100644
index 00000000..c1724ceb
--- /dev/null
+++ b/deployments/stacks/dpe-k8s-deployments-testing/data.tf
@@ -0,0 +1,15 @@
+data "aws_eks_cluster" "cluster" {
+  name = var.cluster_name
+}
+
+data "aws_eks_cluster_auth" "cluster" {
+  name = var.cluster_name
+}
+
+data "aws_secretsmanager_secret" "spotinst_token" {
+  name = "spotinst_token"
+}
+
+data "aws_secretsmanager_secret_version" "secret_credentials" {
+  secret_id = data.aws_secretsmanager_secret.spotinst_token.id
+}
diff --git a/deployments/stacks/dpe-k8s-deployments-testing/main.tf b/deployments/stacks/dpe-k8s-deployments-testing/main.tf
new file mode 100644
index 00000000..aab2823a
--- /dev/null
+++ b/deployments/stacks/dpe-k8s-deployments-testing/main.tf
@@ -0,0 +1,20 @@
+module "postgres-cloud-native-operator" {
+  # source  = "spacelift.io/sagebionetworks/postgres-cloud-native/aws"
+  source = "../../../modules/postgres-cloud-native-operator/"
+  # version = "0.2.1"
+  auto_deploy  = true
+  auto_prune   = true
+  git_revision = "ibcdpe-1004-airflow-ops"
+}
+
+
+# module "postgres-cloud-native" {
+#   # source  = "spacelift.io/sagebionetworks/postgres-cloud-native/aws"
+#   source = "../../../modules/postgres-cloud-native/"
+#   # version = "0.2.1"
+#   auto_deploy          = true
+#   auto_prune           = true
+#   git_revision         = "ibcdpe-1004-airflow-ops"
+#   namespace            = "airflow"
+#   argo_deployment_name = "airflow-postgres-cloud-native"
+# }
diff --git a/deployments/stacks/dpe-k8s-deployments-testing/provider.tf b/deployments/stacks/dpe-k8s-deployments-testing/provider.tf
new file mode 100644
index 00000000..32049e25
--- /dev/null
+++ b/deployments/stacks/dpe-k8s-deployments-testing/provider.tf
@@ -0,0 +1,28 @@
+provider "aws" {
+  region = var.region
+}
+
+provider "kubernetes" {
+  config_path            = var.kube_config_path
+  host                   = data.aws_eks_cluster.cluster.endpoint
+  cluster_ca_certificate = base64decode(data.aws_eks_cluster.cluster.certificate_authority[0].data)
+  token                  = data.aws_eks_cluster_auth.cluster.token
+}
+
+provider "helm" {
+  kubernetes {
+    config_path = var.kube_config_path
+  }
+}
+
+provider "spotinst" {
+  account = var.spotinst_account
+  token   = data.aws_secretsmanager_secret_version.secret_credentials.secret_string
+}
+
+provider "kubectl" {
+  config_path            = var.kube_config_path
+  host                   = data.aws_eks_cluster.cluster.endpoint
+  cluster_ca_certificate = base64decode(data.aws_eks_cluster.cluster.certificate_authority[0].data)
+  token                  = data.aws_eks_cluster_auth.cluster.token
+}
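The `kubernetes` and `kubectl` providers above set both a kubeconfig path and inline `host`/`token` credentials sourced from the EKS data sources. An alternative sketch that avoids a kubeconfig file entirely is the exec plugin, which shells out to `aws eks get-token` whenever a short-lived token is needed — a common EKS pattern, shown here for illustration rather than what this stack currently does:

```hcl
provider "kubernetes" {
  host                   = data.aws_eks_cluster.cluster.endpoint
  cluster_ca_certificate = base64decode(data.aws_eks_cluster.cluster.certificate_authority[0].data)

  exec {
    api_version = "client.authentication.k8s.io/v1beta1"
    command     = "aws"
    # Fetch a fresh token at plan/apply time instead of reading ~/.kube/config.
    args = ["eks", "get-token", "--cluster-name", var.cluster_name]
  }
}
```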
diff --git a/deployments/stacks/dpe-k8s-deployments-testing/variables.tf b/deployments/stacks/dpe-k8s-deployments-testing/variables.tf
new file mode 100644
index 00000000..2828bba3
--- /dev/null
+++ b/deployments/stacks/dpe-k8s-deployments-testing/variables.tf
@@ -0,0 +1,62 @@
+variable "vpc_id" {
+  description = "VPC ID"
+  type        = string
+}
+
+variable "private_subnet_ids" {
+  description = "Private subnet IDs"
+  type        = list(string)
+}
+
+variable "node_security_group_id" {
+  description = "Node security group ID"
+  type        = string
+}
+
+variable "pod_to_node_dns_sg_id" {
+  description = "Pod to node DNS security group ID"
+  type        = string
+}
+
+variable "vpc_cidr_block" {
+  description = "VPC CIDR block"
+  type        = string
+}
+
+variable "kube_config_path" {
+  description = "Kube config path"
+  type        = string
+  default     = "~/.kube/config"
+}
+
+variable "region" {
+  description = "AWS region"
+  type        = string
+  default     = "us-east-1"
+}
+
+variable "cluster_name" {
+  description = "EKS cluster name"
+  type        = string
+}
+
+variable "spotinst_account" {
+  description = "Spot.io account"
+  type        = string
+}
+
+variable "auto_deploy" {
+  description = "Automatically deploy the stack"
+  type        = bool
+}
+
+variable "auto_prune" {
+  description = "Automatically prune kubernetes resources"
+  type        = bool
+}
+
+variable "git_revision" {
+  description = "The git revision to deploy"
+  type        = string
+  default     = "main"
+}
diff --git a/deployments/stacks/dpe-k8s-deployments-testing/versions.tf b/deployments/stacks/dpe-k8s-deployments-testing/versions.tf
new file mode 100644
index 00000000..e3f8c566
--- /dev/null
+++ b/deployments/stacks/dpe-k8s-deployments-testing/versions.tf
@@ -0,0 +1,11 @@
+terraform {
+  required_providers {
+    spotinst = {
+      source = "spotinst/spotinst"
+    }
+    kubectl = {
+      source  = "gavinbunney/kubectl"
+      version = "1.14.0"
+    }
+  }
+}
diff --git a/deployments/stacks/dpe-k8s-deployments/main.tf b/deployments/stacks/dpe-k8s-deployments/main.tf
index 57df99d3..591d81e0 100644
--- a/deployments/stacks/dpe-k8s-deployments/main.tf
+++ b/deployments/stacks/dpe-k8s-deployments/main.tf
@@ -13,7 +13,7 @@ module "sage-aws-eks-autoscaler" {
 module "victoria-metrics" {
   depends_on   = [module.argo-cd, module.sage-aws-eks-autoscaler]
   source       = "spacelift.io/sagebionetworks/victoria-metrics/aws"
-  version      = "0.4.7"
+  version      = "0.4.8"
   auto_deploy  = var.auto_deploy
   auto_prune   = var.auto_prune
   git_revision = var.git_revision
@@ -28,14 +28,14 @@ module "trivy-operator" {
   git_revision = var.git_revision
 }
 
-module "airflow" {
-  depends_on   = [module.victoria-metrics, module.argo-cd, module.sage-aws-eks-autoscaler]
-  source       = "spacelift.io/sagebionetworks/airflow/aws"
-  version      = "0.3.1"
-  auto_deploy  = var.auto_deploy
-  auto_prune   = var.auto_prune
-  git_revision = var.git_revision
-}
+# module "airflow" {
+#   depends_on   = [module.victoria-metrics, module.argo-cd, module.sage-aws-eks-autoscaler]
+#   source       = "spacelift.io/sagebionetworks/airflow/aws"
+#   version      = "0.3.1"
+#   auto_deploy  = var.auto_deploy
+#   auto_prune   = var.auto_prune
+#   git_revision = var.git_revision
+# }
 
 module "argo-cd" {
   depends_on = [module.sage-aws-eks-autoscaler]
diff --git a/main.tf b/main.tf
index fcdc2390..b1ce7b66 100644
--- a/main.tf
+++ b/main.tf
@@ -10,7 +10,7 @@
 # }
 
 locals {
-  git_branch = "main"
+  git_branch = "ibcdpe-1004-airflow-ops"
 }
 
 resource "spacelift_stack" "root_administrative_stack" {
diff --git a/modules/apache-airflow/main.tf b/modules/apache-airflow/main.tf
index 9546d371..ad6cb18f 100644
--- a/modules/apache-airflow/main.tf
+++ b/modules/apache-airflow/main.tf
@@ -5,7 +5,7 @@
 
 resource "kubernetes_namespace" "airflow" {
   metadata {
-    name = "airflow"
+    name = var.namespace
   }
 }
 
@@ -18,7 +18,7 @@ resource "random_password" "airflow" {
 resource "kubernetes_secret" "airflow_webserver_secret" {
   metadata {
     name      = "airflow-webserver-secret"
-    namespace = "airflow"
+    namespace = var.namespace
   }
 
   data = {
@@ -28,10 +28,26 @@
   depends_on = [kubernetes_namespace.airflow]
 }
 
+resource "random_password" "airflow-admin-user" {
+  length  = 32
+  special = false
+}
+
"airflow-admin-user-secret" { + metadata { + name = "airflow-admin-user-secret" + namespace = var.namespace + } + + data = { + "password" = random_password.airflow-admin-user.result + "username" = "admin" + } + + depends_on = [kubernetes_namespace.airflow] +} -# TODO: Should a long-term deployment use a managed RDS instance? -# https://github.com/apache/airflow/blob/main/chart/values.yaml#L2321-L2329 -resource "kubectl_manifest" "argo-deployment" { +resource "kubectl_manifest" "airflow-deployment" { depends_on = [kubernetes_namespace.airflow] yaml_body = <. +This will deploy both the operator. + +The `database` deployment is a part of another module. This allows us to add a single +operator to a cluster and deploy 1 or more databases to that cluster. + + +## Future work to expand the capabilities +- Setting up backups to S3: https://cloudnative-pg.io/documentation/current/backup/ +- Moving to database only node groups: https://cloudnative-pg.io/documentation/current/architecture/#postgresql-architecture + + +Reading: +- https://www.cncf.io/blog/2023/09/29/recommended-architectures-for-postgresql-in-kubernetes/ + - "The next level is to separate the Kubernetes worker nodes for PostgreSQL workloads from the other workloads’, using Kubernetes’ native scheduling capabilities, such as affinity, anti-affinity, node selectors and taints. You’ll still insist on the same storage, but you can get more predictability in terms of CPU and memory usage." +- Assign database persistent volumes to an expandable storage class + + +# How many databases should I use? + +From their documentation: + +"Our recommendation is to dedicate a single PostgreSQL cluster (intended as primary and multiple standby servers) to a single database, entirely managed by a single microservice application." 
diff --git a/modules/postgres-cloud-native-operator/main.tf b/modules/postgres-cloud-native-operator/main.tf
new file mode 100644
index 00000000..85f941c8
--- /dev/null
+++ b/modules/postgres-cloud-native-operator/main.tf
@@ -0,0 +1,44 @@
+locals {
+  git_revision = "ibcdpe-1004-airflow-ops"
+}
+
+resource "kubernetes_namespace" "cnpg-system" {
+  metadata {
+    name = "cnpg-system"
+  }
+}
+
+resource "kubectl_manifest" "argo-deployment-operator" {
+  depends_on = [kubernetes_namespace.cnpg-system]
+
+  yaml_body = <
+  pg_stat_bgwriter_17:
+    runonserver: ">=17.0.0"
+    name: pg_stat_bgwriter
+    query: |
+      SELECT buffers_clean
+           , maxwritten_clean
+           , buffers_alloc
+           , EXTRACT(EPOCH FROM stats_reset) AS stats_reset_time
+      FROM pg_catalog.pg_stat_bgwriter
+    metrics:
+      - buffers_clean:
+          usage: "COUNTER"
+          description: "Number of buffers written by the background writer"
+      - maxwritten_clean:
+          usage: "COUNTER"
+          description: "Number of times the background writer stopped a cleaning scan because it had written too many buffers"
+      - buffers_alloc:
+          usage: "COUNTER"
+          description: "Number of buffers allocated"
+      - stats_reset_time:
+          usage: "GAUGE"
+          description: "Time at which these statistics were last reset"
+
+  pg_stat_checkpointer:
+    runonserver: ">=17.0.0"
+    query: |
+      SELECT num_timed AS checkpoints_timed
+           , num_requested AS checkpoints_req
+           , restartpoints_timed
+           , restartpoints_req
+           , restartpoints_done
+           , write_time
+           , sync_time
+           , buffers_written
+           , EXTRACT(EPOCH FROM stats_reset) AS stats_reset_time
+      FROM pg_catalog.pg_stat_checkpointer
+    metrics:
+      - checkpoints_timed:
+          usage: "COUNTER"
+          description: "Number of scheduled checkpoints that have been performed"
+      - checkpoints_req:
+          usage: "COUNTER"
+          description: "Number of requested checkpoints that have been performed"
+      - restartpoints_timed:
+          usage: "COUNTER"
+          description: "Number of scheduled restartpoints due to timeout or after a failed attempt to perform it"
+      - restartpoints_req:
+          usage: "COUNTER"
+          description: "Number of requested restartpoints that have been performed"
+      - restartpoints_done:
+          usage: "COUNTER"
+          description: "Number of restartpoints that have been performed"
+      - write_time:
+          usage: "COUNTER"
+          description: "Total amount of time that has been spent in the portion of processing checkpoints and restartpoints where files are written to disk, in milliseconds"
+      - sync_time:
+          usage: "COUNTER"
+          description: "Total amount of time that has been spent in the portion of processing checkpoints and restartpoints where files are synchronized to disk, in milliseconds"
+      - buffers_written:
+          usage: "COUNTER"
+          description: "Number of buffers written during checkpoints and restartpoints"
+      - stats_reset_time:
+          usage: "GAUGE"
+          description: "Time at which these statistics were last reset"
+
+  pg_stat_database:
+    query: |
+      SELECT datname
+           , xact_commit
+           , xact_rollback
+           , blks_read
+           , blks_hit
+           , tup_returned
+           , tup_fetched
+           , tup_inserted
+           , tup_updated
+           , tup_deleted
+           , conflicts
+           , temp_files
+           , temp_bytes
+           , deadlocks
+           , blk_read_time
+           , blk_write_time
+      FROM pg_catalog.pg_stat_database
+    metrics:
+      - datname:
+          usage: "LABEL"
+          description: "Name of this database"
+      - xact_commit:
+          usage: "COUNTER"
+          description: "Number of transactions in this database that have been committed"
+      - xact_rollback:
+          usage: "COUNTER"
+          description: "Number of transactions in this database that have been rolled back"
+      - blks_read:
+          usage: "COUNTER"
+          description: "Number of disk blocks read in this database"
+      - blks_hit:
+          usage: "COUNTER"
+          description: "Number of times disk blocks were found already in the buffer cache, so that a read was not necessary (this only includes hits in the PostgreSQL buffer cache, not the operating system's file system cache)"
+      - tup_returned:
+          usage: "COUNTER"
+          description: "Number of rows returned by queries in this database"
+      - tup_fetched:
+          usage: "COUNTER"
+          description: "Number of rows fetched by queries in this database"
+      - tup_inserted:
+          usage: "COUNTER"
+          description: "Number of rows inserted by queries in this database"
+      - tup_updated:
+          usage: "COUNTER"
+          description: "Number of rows updated by queries in this database"
+      - tup_deleted:
+          usage: "COUNTER"
+          description: "Number of rows deleted by queries in this database"
+      - conflicts:
+          usage: "COUNTER"
+          description: "Number of queries canceled due to conflicts with recovery in this database"
+      - temp_files:
+          usage: "COUNTER"
+          description: "Number of temporary files created by queries in this database"
+      - temp_bytes:
+          usage: "COUNTER"
+          description: "Total amount of data written to temporary files by queries in this database"
+      - deadlocks:
+          usage: "COUNTER"
+          description: "Number of deadlocks detected in this database"
+      - blk_read_time:
+          usage: "COUNTER"
+          description: "Time spent reading data file blocks by backends in this database, in milliseconds"
+      - blk_write_time:
+          usage: "COUNTER"
+          description: "Time spent writing data file blocks by backends in this database, in milliseconds"
+
+  pg_stat_replication:
+    primary: true
+    query: |
+      SELECT usename
+           , COALESCE(application_name, '') AS application_name
+           , COALESCE(client_addr::text, '') AS client_addr
+           , COALESCE(client_port::text, '') AS client_port
+           , EXTRACT(EPOCH FROM backend_start) AS backend_start
+           , COALESCE(pg_catalog.age(backend_xmin), 0) AS backend_xmin_age
+           , pg_catalog.pg_wal_lsn_diff(pg_catalog.pg_current_wal_lsn(), sent_lsn) AS sent_diff_bytes
+           , pg_catalog.pg_wal_lsn_diff(pg_catalog.pg_current_wal_lsn(), write_lsn) AS write_diff_bytes
+           , pg_catalog.pg_wal_lsn_diff(pg_catalog.pg_current_wal_lsn(), flush_lsn) AS flush_diff_bytes
+           , COALESCE(pg_catalog.pg_wal_lsn_diff(pg_catalog.pg_current_wal_lsn(), replay_lsn),0) AS replay_diff_bytes
+           , COALESCE((EXTRACT(EPOCH FROM write_lag)),0)::float AS write_lag_seconds
+           , COALESCE((EXTRACT(EPOCH FROM flush_lag)),0)::float AS flush_lag_seconds
+           , COALESCE((EXTRACT(EPOCH FROM replay_lag)),0)::float AS replay_lag_seconds
+      FROM pg_catalog.pg_stat_replication
+    metrics:
+      - usename:
+          usage: "LABEL"
+          description: "Name of the replication user"
+      - application_name:
+          usage: "LABEL"
+          description: "Name of the application"
+      - client_addr:
+          usage: "LABEL"
+          description: "Client IP address"
+      - client_port:
+          usage: "LABEL"
+          description: "Client TCP port"
+      - backend_start:
+          usage: "COUNTER"
+          description: "Time when this process was started"
+      - backend_xmin_age:
+          usage: "COUNTER"
+          description: "The age of this standby's xmin horizon"
+      - sent_diff_bytes:
+          usage: "GAUGE"
+          description: "Difference in bytes from the last write-ahead log location sent on this connection"
+      - write_diff_bytes:
+          usage: "GAUGE"
+          description: "Difference in bytes from the last write-ahead log location written to disk by this standby server"
+      - flush_diff_bytes:
+          usage: "GAUGE"
+          description: "Difference in bytes from the last write-ahead log location flushed to disk by this standby server"
+      - replay_diff_bytes:
+          usage: "GAUGE"
+          description: "Difference in bytes from the last write-ahead log location replayed into the database on this standby server"
+      - write_lag_seconds:
+          usage: "GAUGE"
+          description: "Time elapsed between flushing recent WAL locally and receiving notification that this standby server has written it"
+      - flush_lag_seconds:
+          usage: "GAUGE"
+          description: "Time elapsed between flushing recent WAL locally and receiving notification that this standby server has written and flushed it"
+      - replay_lag_seconds:
+          usage: "GAUGE"
+          description: "Time elapsed between flushing recent WAL locally and receiving notification that this standby server has written, flushed and applied it"
+
+  pg_settings:
+    query: |
+      SELECT name,
+        CASE setting WHEN 'on' THEN '1' WHEN 'off' THEN '0' ELSE setting END AS setting
+      FROM pg_catalog.pg_settings
+      WHERE vartype IN ('integer', 'real', 'bool')
+      ORDER BY 1
+    metrics:
+      - name:
+          usage: "LABEL"
+          description: "Name of the setting"
+      - setting:
+          usage: "GAUGE"
+          description: "Setting value"
+
diff --git a/modules/postgres-cloud-native-operator/variables.tf b/modules/postgres-cloud-native-operator/variables.tf
new file mode 100644
index 00000000..07ea3c7a
--- /dev/null
+++ b/modules/postgres-cloud-native-operator/variables.tf
@@ -0,0 +1,17 @@
+variable "auto_deploy" {
+  description = "Auto deploy through ArgoCD"
+  type        = bool
+  default     = false
+}
+
+variable "auto_prune" {
+  description = "Auto prune through ArgoCD"
+  type        = bool
+  default     = false
+}
+
+variable "git_revision" {
+  description = "The git revision to deploy"
+  type        = string
+  default     = "main"
+}
diff --git a/modules/postgres-cloud-native-operator/versions.tf b/modules/postgres-cloud-native-operator/versions.tf
new file mode 100644
index 00000000..c35c044f
--- /dev/null
+++ b/modules/postgres-cloud-native-operator/versions.tf
@@ -0,0 +1,16 @@
+terraform {
+  required_providers {
+    aws = {
+      source  = "hashicorp/aws"
+      version = "~> 5.0"
+    }
+    kubernetes = {
+      source  = "hashicorp/kubernetes"
+      version = "~> 2.0"
+    }
+    kubectl = {
+      source  = "gavinbunney/kubectl"
+      version = "1.14.0"
+    }
+  }
+}
diff --git a/modules/postgres-cloud-native/README.md b/modules/postgres-cloud-native/README.md
new file mode 100644
index 00000000..bfa988d3
--- /dev/null
+++ b/modules/postgres-cloud-native/README.md
@@ -0,0 +1,16 @@
+# Purpose
+The purpose of this module is to deploy the `Cloudnative PG` helm chart.
+This will deploy both the operator and a database cluster.
+
+
+Future work:
+
+- Since each microservice/application is meant to receive its own database, the deployment model within this module should be changed slightly to install the operator at a cluster level, with each application having its own database.
+
+
+
+## How many databases?
+
+From their documentation:
+
+"Our recommendation is to dedicate a single PostgreSQL cluster (intended as primary and multiple standby servers) to a single database, entirely managed by a single microservice application."
\ No newline at end of file
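Following that recommendation, the intended usage is one instance of this module per application, each with its own namespace and a globally unique ArgoCD deployment name. A sketch based on the commented-out block in the testing stack — the second application is hypothetical:

```hcl
module "airflow-postgres" {
  source               = "../../../modules/postgres-cloud-native/"
  auto_deploy          = true
  auto_prune           = true
  git_revision         = "main"
  namespace            = "airflow"
  argo_deployment_name = "airflow-postgres-cloud-native"
}

module "another-app-postgres" {
  source               = "../../../modules/postgres-cloud-native/"
  auto_deploy          = true
  auto_prune           = true
  git_revision         = "main"
  namespace            = "another-app"
  argo_deployment_name = "another-app-postgres-cloud-native"
}
```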
diff --git a/modules/postgres-cloud-native/main.tf b/modules/postgres-cloud-native/main.tf
new file mode 100644
index 00000000..419cef7f
--- /dev/null
+++ b/modules/postgres-cloud-native/main.tf
@@ -0,0 +1,71 @@
+locals {
+  git_revision = "ibcdpe-1004-airflow-ops"
+}
+
+resource "kubectl_manifest" "argo-deployment-database" {
+  depends_on = [
+    kubernetes_secret.connection-secret
+  ]
+  yaml_body = <
+  # S3: https://s3.<region>.amazonaws.com"
+  # Leave empty if using the default S3 endpoint
+  endpointURL: ""
+  # -- Specifies a CA bundle to validate a privately signed certificate.
+  endpointCA:
+    # -- Creates a secret with the given value if true, otherwise uses an existing secret.
+    create: false
+    name: ""
+    key: ""
+    value: ""
+  # -- Overrides the provider specific default path. Defaults to:
+  # S3: s3://<bucket><path>
+  # Azure: https://<storageAccount>.<serviceName>.core.windows.net/<containerName><path>
+  # Google: gs://<bucket><path>
+  destinationPath: ""
+  # -- One of `s3`, `azure` or `google`
+  provider: s3
+  s3:
+    region: ""
+    bucket: ""
+    path: "/"
+    accessKey: ""
+    secretKey: ""
+  azure:
+    path: "/"
+    connectionString: ""
+    storageAccount: ""
+    storageKey: ""
+    storageSasToken: ""
+    containerName: ""
+    serviceName: blob
+    inheritFromAzureAD: false
+  google:
+    path: "/"
+    bucket: ""
+    gkeEnvironment: false
+    applicationCredentials: ""
+  secret:
+    # -- Whether to create a secret for the backup credentials
+    create: true
+    # -- Name of the backup credentials secret
+    name: ""
+
+
+cluster:
+  # -- Number of instances
+  instances: 3
+
+  # -- Name of the container image, supporting both tags (:<tag>) and digests for deterministic and repeatable deployments:
+  # <image>:<tag>@sha256:<digestValue>
+  imageName: ""  # Default value depends on type (postgresql/postgis/timescaledb)
+
+  # -- Image pull policy. One of Always, Never or IfNotPresent. If not defined, it defaults to IfNotPresent. Cannot be updated.
+  # More info: https://kubernetes.io/docs/concepts/containers/images#updating-images
+  imagePullPolicy: IfNotPresent
+
+  # -- The list of pull secrets to be used to pull the images.
+  # See: https://cloudnative-pg.io/documentation/current/cloudnative-pg.v1/#postgresql-cnpg-io-v1-LocalObjectReference
+  imagePullSecrets: []
+
+  storage:
+    size: 8Gi
+    storageClass: ""
+
+  # -- The UID of the postgres user inside the image, defaults to 26
+  postgresUID: 26
+
+  # -- The GID of the postgres user inside the image, defaults to 26
+  postgresGID: 26
+
+  # -- Resources requirements of every generated Pod.
+  # Please refer to https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ for more information.
+  # We strongly advise you use the same setting for limits and requests so that your cluster pods are given a Guaranteed QoS.
+  # See: https://kubernetes.io/docs/concepts/workloads/pods/pod-qos/
+  resources: {}
+    # limits:
+    #   cpu: 2000m
+    #   memory: 8Gi
+    # requests:
+    #   cpu: 2000m
+    #   memory: 8Gi
+
+  priorityClassName: ""
+
+  # -- Method to follow to upgrade the primary server during a rolling update procedure, after all replicas have been
+  # successfully updated. It can be switchover (default) or in-place (restart).
+  primaryUpdateMethod: switchover
+
+  # -- Strategy to follow to upgrade the primary server during a rolling update procedure, after all replicas have been
+  # successfully updated: it can be automated (unsupervised - default) or manual (supervised)
+  primaryUpdateStrategy: unsupervised
+
+  # -- The instances' log level, one of the following values: error, warning, info (default), debug, trace
+  logLevel: "info"
+
+  # -- Affinity/Anti-affinity rules for Pods.
+  # See: https://cloudnative-pg.io/documentation/current/cloudnative-pg.v1/#postgresql-cnpg-io-v1-AffinityConfiguration
+  affinity:
+    topologyKey: topology.kubernetes.io/zone
+
+  # -- The configuration for the CA and related certificates.
+  # See: https://cloudnative-pg.io/documentation/current/cloudnative-pg.v1/#postgresql-cnpg-io-v1-CertificatesConfiguration
+  certificates: {}
+
+  # -- When this option is enabled, the operator will use the SuperuserSecret to update the postgres user password.
+  # If the secret is not present, the operator will automatically create one.
+  # When this option is disabled, the operator will ignore the SuperuserSecret content, delete it when automatically created,
+  # and then blank the password of the postgres user by setting it to NULL.
+  enableSuperuserAccess: true
+  superuserSecret: ""
+
+  # -- This feature enables declarative management of existing roles, as well as the creation of new roles if they are not
+  # already present in the database.
+  # See: https://cloudnative-pg.io/documentation/current/declarative_role_management/
+  # TODO: Role management
+  roles: []
+    # - name: airflow-pg
+    #   ensure: present
+    #   comment: Service account for airflow
+    #   login: true
+    #   superuser: false
+    #   inRoles:
+    #     - pg_monitor
+    #     - pg_signal_backend
+
+  monitoring:
+    # -- Whether to enable monitoring
+    enabled: true
+    podMonitor:
+      # -- Whether to enable the PodMonitor
+      enabled: true
+    prometheusRule:
+      # -- Whether to enable the PrometheusRule automated alerts
+      enabled: true
+      # -- Exclude specified rules
+      excludeRules: []
+        # - CNPGClusterZoneSpreadWarning
+    # -- Custom Prometheus metrics
+    customQueries: []
+      # - name: "pg_cache_hit_ratio"
+      #   query: "SELECT current_database() as datname, sum(heap_blks_hit) / (sum(heap_blks_hit) + sum(heap_blks_read)) as ratio FROM pg_statio_user_tables;"
+      #   metrics:
+      #     - datname:
+      #         usage: "LABEL"
+      #         description: "Name of the database"
+      #     - ratio:
+      #         usage: GAUGE
+      #         description: "Cache hit ratio"
+
+  # -- Configuration of the PostgreSQL server.
+  # See: https://cloudnative-pg.io/documentation/current/cloudnative-pg.v1/#postgresql-cnpg-io-v1-PostgresConfiguration
+  postgresql: {}
+    # max_connections: 300
+
+  # -- BootstrapInitDB is the configuration of the bootstrap process when initdb is used.
+  # See: https://cloudnative-pg.io/documentation/current/bootstrap/
+  # See: https://cloudnative-pg.io/documentation/current/cloudnative-pg.v1/#postgresql-cnpg-io-v1-bootstrapinitdb
+  # TODO: Verify user/db setup works properly
+  initdb:
+    database: application-database
+    owner: ""  # Defaults to the database name
+    secret:
+      name: "pg-user-secret"  # Name of the secret containing the initial credentials for the owner of the user database. If empty a new secret will be created from scratch
+    # postInitSQL:
+    #   - CREATE EXTENSION IF NOT EXISTS vector;
+
+  additionalLabels: {}
+  annotations: {}
+
+
+backups:
+  # -- You need to configure backups manually, so backups are disabled by default.
+  enabled: false
+
+  # -- Overrides the provider specific default endpoint. Defaults to:
+  # S3: https://s3.<region>.amazonaws.com"
+  endpointURL: ""  # Leave empty if using the default S3 endpoint
+  # -- Specifies a CA bundle to validate a privately signed certificate.
+  endpointCA:
+    # -- Creates a secret with the given value if true, otherwise uses an existing secret.
+    create: false
+    name: ""
+    key: ""
+    value: ""
+
+  # -- Overrides the provider specific default path. Defaults to:
+  # S3: s3://<bucket><path>
+  # Azure: https://<storageAccount>.<serviceName>.core.windows.net/<containerName><path>
+  # Google: gs://<bucket><path>
+  destinationPath: ""
+  # -- One of `s3`, `azure` or `google`
+  provider: s3
+  s3:
+    region: ""
+    bucket: ""
+    path: "/"
+    accessKey: ""
+    secretKey: ""
+  azure:
+    path: "/"
+    connectionString: ""
+    storageAccount: ""
+    storageKey: ""
+    storageSasToken: ""
+    containerName: ""
+    serviceName: blob
+    inheritFromAzureAD: false
+  google:
+    path: "/"
+    bucket: ""
+    gkeEnvironment: false
+    applicationCredentials: ""
+  secret:
+    # -- Whether to create a secret for the backup credentials
+    create: true
+    # -- Name of the backup credentials secret
+    name: ""
+
+  wal:
+    # -- WAL compression method. One of `` (for no compression), `gzip`, `bzip2` or `snappy`.
+    compression: gzip
+    # -- Whether to instruct the storage provider to encrypt WAL files. One of `` (use the storage container default), `AES256` or `aws:kms`.
+    encryption: AES256
+    # -- Number of WAL files to be archived or restored in parallel.
+    maxParallel: 1
+  data:
+    # -- Data compression method. One of `` (for no compression), `gzip`, `bzip2` or `snappy`.
+    compression: gzip
+    # -- Whether to instruct the storage provider to encrypt data files. One of `` (use the storage container default), `AES256` or `aws:kms`.
+    encryption: AES256
+    # -- Number of data files to be archived or restored in parallel.
+    jobs: 2
+
+  scheduledBackups:
+    -
+      # -- Scheduled backup name
+      name: daily-backup
+      # -- Schedule in cron format (six fields, seconds first: this runs daily at midnight)
+      schedule: "0 0 0 * * *"
+      # -- Backup owner reference
+      backupOwnerReference: self
+      # -- Backup method, can be `barmanObjectStore` (default) or `volumeSnapshot`
+      method: barmanObjectStore
+
+  # -- Retention policy for backups
+  retentionPolicy: "30d"
+
+
+pooler:
+  # -- Whether to enable PgBouncer
+  enabled: false
+  # -- PgBouncer type of service to forward traffic to.
+  type: rw
+  # -- PgBouncer pooling mode
+  poolMode: transaction
+  # -- Number of PgBouncer instances
+  instances: 3
+  # -- PgBouncer configuration parameters
+  parameters:
+    max_client_conn: "1000"
+    default_pool_size: "25"
+
+  monitoring:
+    # -- Whether to enable monitoring
+    enabled: false
+    podMonitor:
+      # -- Whether to enable the PodMonitor
+      enabled: true
+
+  # -- Custom PgBouncer deployment template.
+  # Use to override image, specify resources, etc.
+  template: {}
+
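The `initdb` bootstrap above points at `pg-user-secret` for the database owner's initial credentials, and the Terraform side of this module depends on a `kubernetes_secret.connection-secret`. A sketch of the shape such a secret presumably takes — CloudNativePG expects a `kubernetes.io/basic-auth` secret with `username` and `password` keys; the names here mirror the values file, but the resource itself is illustrative:

```hcl
resource "random_password" "pg-user" {
  length  = 32
  special = false
}

resource "kubernetes_secret" "connection-secret" {
  metadata {
    name      = "pg-user-secret"  # referenced by initdb.secret.name above
    namespace = var.namespace
  }

  type = "kubernetes.io/basic-auth"

  data = {
    "username" = "application-database"  # initdb owner defaults to the database name
    "password" = random_password.pg-user.result
  }
}
```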
diff --git a/modules/postgres-cloud-native/variables.tf b/modules/postgres-cloud-native/variables.tf
new file mode 100644
index 00000000..770d80bd
--- /dev/null
+++ b/modules/postgres-cloud-native/variables.tf
@@ -0,0 +1,27 @@
+variable "auto_deploy" {
+  description = "Auto deploy through ArgoCD"
+  type        = bool
+  default     = false
+}
+
+variable "auto_prune" {
+  description = "Auto prune through ArgoCD"
+  type        = bool
+  default     = false
+}
+
+variable "git_revision" {
+  description = "The git revision to deploy"
+  type        = string
+  default     = "main"
+}
+
+variable "argo_deployment_name" {
+  description = "The name of the ArgoCD deployment, must be globally unique"
+  type        = string
+}
+
+variable "namespace" {
+  description = "The namespace to deploy into"
+  type        = string
+}
diff --git a/modules/postgres-cloud-native/versions.tf b/modules/postgres-cloud-native/versions.tf
new file mode 100644
index 00000000..c35c044f
--- /dev/null
+++ b/modules/postgres-cloud-native/versions.tf
@@ -0,0 +1,16 @@
+terraform {
+  required_providers {
+    aws = {
+      source  = "hashicorp/aws"
+      version = "~> 5.0"
+    }
+    kubernetes = {
+      source  = "hashicorp/kubernetes"
+      version = "~> 2.0"
+    }
+    kubectl = {
+      source  = "gavinbunney/kubectl"
+      version = "1.14.0"
+    }
+  }
+}
diff --git a/modules/victoria-metrics/templates/values.yaml b/modules/victoria-metrics/templates/values.yaml
index 3c4ffc64..b50a54b9 100644
--- a/modules/victoria-metrics/templates/values.yaml
+++ b/modules/victoria-metrics/templates/values.yaml
@@ -798,6 +798,10 @@ grafana:
       gnetId: 17813
       revision: 2
       datasource: VictoriaMetrics
+    cloudnativepg:
+      gnetId: 20417
+      revision: 3
+      datasource: VictoriaMetrics
 
   defaultDashboardsTimezone: utc
 
@@ -1143,7 +1147,7 @@ crds:
 
 ## install prometheus operator crds
 prometheus-operator-crds:
-  enabled: false
+  enabled: true
 
 # -- Add extra objects dynamically to this chart
 extraObjects: []
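Flipping `prometheus-operator-crds` to `enabled: true` is presumably what lets the CloudNativePG values above set `monitoring.podMonitor.enabled: true`: the operator creates a `PodMonitor` per cluster, and that resource type only exists once the `monitoring.coreos.com` CRDs are installed. Roughly what the operator generates, hand-written here for illustration (names are hypothetical and field details may differ):

```hcl
resource "kubectl_manifest" "example-podmonitor" {
  yaml_body = <<-YAML
    apiVersion: monitoring.coreos.com/v1
    kind: PodMonitor
    metadata:
      name: my-app-db
      namespace: my-app
    spec:
      selector:
        matchLabels:
          cnpg.io/cluster: my-app-db   # label CNPG puts on the cluster's pods
      podMetricsEndpoints:
        - port: metrics                # CNPG exposes Prometheus metrics on port 9187
  YAML
}
```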