From 48a9ffa5e4c25a1691e69f4b0d156cb6e3271b48 Mon Sep 17 00:00:00 2001 From: Stefano Lottini Date: Mon, 1 Jul 2024 12:08:56 +0200 Subject: [PATCH] Massive refactoring of CI and flows (#289) * user/password auth in fixtures; core testing can run on hcd * WIP: live_test_models and live_provider_info must become a db-dependent tower of fixtures * wip2, formatting * wip refactored all (nonvecze) env vars * restructure conftests to avoid quirks as much as possible * last before live_test_models becomes a fixture * full refactoring of the test suites + description in readme * improve workflows; add hcd workflow; condition nonAstra admin on env var * skip cursor timeout test on nonAstra (astra test is enough) --- .github/workflows/hcd.yaml | 39 + .github/workflows/main.yml | 4 +- Makefile | 15 +- README.md | 146 +- poetry.lock | 159 +- pyproject.toml | 1 + tests/.env.template | 20 - tests/conftest.py | 165 +- tests/core/conftest.py | 88 +- tests/core/test_async_db_ddl.py | 11 +- tests/core/test_db_ddl.py | 13 +- ...test_admin.py => test_endpoint_parsing.py} | 14 + tests/core/test_logging.py | 6 +- tests/core/test_ops.py | 38 +- tests/env_templates/env.astra.admin.template | 16 + tests/env_templates/env.astra.template | 22 + .../env.local.template} | 12 +- .../env.vectorize-minimal.template | 5 + .../env.vectorize.template} | 41 +- tests/hcd_compose/cassandra-hcd.yaml | 1546 +++++++++++++++++ tests/hcd_compose/docker-compose.yml | 51 + tests/idiomatic/conftest.py | 15 +- tests/idiomatic/integration/test_admin.py | 113 +- .../integration/test_nonastra_admin.py | 3 +- .../integration/test_timeout_async.py | 1 + .../integration/test_timeout_sync.py | 1 + tests/preprocess_env.py | 158 ++ .../integration/__init__.py | 13 + .../vectorize_idiomatic/live_provider_info.py | 64 + tests/vectorize_idiomatic/query_providers.py | 46 +- tests/vectorize_idiomatic/unit/__init__.py | 13 + tests/vectorize_idiomatic/vectorize_models.py | 48 +- 32 files changed, 2575 insertions(+), 312 
deletions(-) create mode 100644 .github/workflows/hcd.yaml delete mode 100644 tests/.env.template rename tests/core/{test_admin.py => test_endpoint_parsing.py} (79%) create mode 100644 tests/env_templates/env.astra.admin.template create mode 100644 tests/env_templates/env.astra.template rename tests/{.env.local.template => env_templates/env.local.template} (62%) create mode 100644 tests/env_templates/env.vectorize-minimal.template rename tests/{.vectorize.env.template => env_templates/env.vectorize.template} (67%) create mode 100644 tests/hcd_compose/cassandra-hcd.yaml create mode 100644 tests/hcd_compose/docker-compose.yml create mode 100644 tests/preprocess_env.py create mode 100644 tests/vectorize_idiomatic/live_provider_info.py diff --git a/.github/workflows/hcd.yaml b/.github/workflows/hcd.yaml new file mode 100644 index 00000000..49ce4f13 --- /dev/null +++ b/.github/workflows/hcd.yaml @@ -0,0 +1,39 @@ +name: Run idiomatic pytest on HCD + +on: + push: + branches: + - master + pull_request: + branches: + - master + +jobs: + test: + env: + # special setting to not drop any collection (core only) + TEST_SKIP_COLLECTION_DELETE: ${{ secrets.TEST_SKIP_COLLECTION_DELETE }} + # for admin-related testing if enabled + DO_IDIOMATIC_ADMIN_TESTS: ${{ secrets.DO_IDIOMATIC_ADMIN_TESTS }} + # hardcoding the target DB + DOCKER_COMPOSE_LOCAL_DATA_API: "yes" + runs-on: ubuntu-latest + + steps: + - name: Checkout code + uses: actions/checkout@v2 + + - name: Set up Python + uses: actions/setup-python@v2 + with: + python-version: 3.11 + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install poetry + poetry install + + - name: Run pytest + run: | + poetry run pytest tests/idiomatic diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index a0238217..92620837 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -1,4 +1,4 @@ -name: Run Pytest on Astrapy +name: Run idiomatic pytest on Astra DB on: push: @@ 
-16,7 +16,7 @@ jobs: ASTRA_DB_API_ENDPOINT: ${{ secrets.ASTRA_DB_API_ENDPOINT }} ASTRA_DB_KEYSPACE: ${{ secrets.ASTRA_DB_KEYSPACE }} ASTRA_DB_SECONDARY_KEYSPACE: ${{ secrets.ASTRA_DB_SECONDARY_KEYSPACE }} - # special setting to not drop any collection + # special setting to not drop any collection (core only) TEST_SKIP_COLLECTION_DELETE: ${{ secrets.TEST_SKIP_COLLECTION_DELETE }} # for admin-related testing if enabled DO_IDIOMATIC_ADMIN_TESTS: ${{ secrets.DO_IDIOMATIC_ADMIN_TESTS }} diff --git a/Makefile b/Makefile index a2fb233b..147ee7f5 100644 --- a/Makefile +++ b/Makefile @@ -28,7 +28,8 @@ format-fix-src: format-src format-fix-tests: FMT_FLAGS= format-fix-tests: format-tests -test-idiomatic: test-idiomatic-unit test-idiomatic-integration +test-idiomatic: + poetry run pytest tests/idiomatic -vv test-idiomatic-unit: poetry run pytest tests/idiomatic/unit -vv @@ -36,6 +37,15 @@ test-idiomatic-unit: test-idiomatic-integration: poetry run pytest tests/idiomatic/integration -vv +docker-test-idiomatic: + DOCKER_COMPOSE_LOCAL_DATA_API="yes" poetry run pytest tests/idiomatic -vv + +docker-test-idiomatic-unit: + DOCKER_COMPOSE_LOCAL_DATA_API="yes" poetry run pytest tests/idiomatic/unit -vv + +docker-test-idiomatic-integration: + DOCKER_COMPOSE_LOCAL_DATA_API="yes" poetry run pytest tests/idiomatic/integration -vv + build: rm -f dist/astrapy* poetry build @@ -53,5 +63,8 @@ help: @echo "test-idiomatic run idiomatic tests" @echo " test-idiomatic-unit unit only" @echo " test-idiomatic-integration integration only" + @echo "docker-test-idiomatic same, on docker container" + @echo " docker-test-idiomatic-unit same" + @echo " docker-test-idiomatic-integration same" @echo "build build package ready for PyPI" @echo "======================================================================" diff --git a/README.md b/README.md index b48ee88e..4aac1cbc 100644 --- a/README.md +++ b/README.md @@ -226,56 +226,136 @@ naming convention and module structure). 
### Running tests -"Full regular" testing requires environment variables: +Tests are grouped in three _blocks_ (in as many subdirs of `tests/`): -```bash -export ASTRA_DB_APPLICATION_TOKEN="AstraCS:..." -export ASTRA_DB_API_ENDPOINT="https://.......apps.astra.datastax.com" + +- **core**: pre-1.0 classes +- **idiomatic**: all 1.0+ classes and APIs, except... +- **vectorize**: ... everything making use of `$vectorize` (within the idiomatic classes) + +Actually, for convenience, _sub-blocks_ of tests are considered: + +- **core regular**: everything except DevOps interactions +- **core ops**: core DevOps operations +- **idiomatic regular**: everything except the admin parts +- **idiomatic admin Astra**: the Astra-specific admin operations +- **idiomatic admin nonAstra**: the nonAstra-specific admin operations +- **vectorize in-depth**: many Data API interactions for a single choice of provider/model. This is mostly testing the client +- **vectorize all-providers**: a slightly more shallow test repeated for all providers, models, auth methods etc. This is mostly testing the API + +Tests can be run on three types of Data API _targets_ (with slight differences in what is applicable): + +- **DockerCompose**: HCD started by the test initialization with `docker-compose`. _Note that in this case you will have to manually destroy the created containers._ +- **nonAstra**: a ready-to-use (user-supplied) local Data API +- **Astra**: an Astra DB target account (or two, as some tests are specific to dev environment) + +Depending on the (sub-block, target) combination, some environment variables may be needed. +Templates for the environment variables are to be found in `tests/env_templates`. + +The general expectation is that idiomatic non-Admin tests, and vectorize in-depth tests, are +part of the main CI flow; conversely, core, admin and vectorize all-providers are kept as a +manual task to run (locally in most cases) when circumstances require it (use your judgement). 
+ +#### Required environment variables + +Below is a detail of the reference template files needed for the various types +of testing: + +- **DockerCompose**: generally no variables needed, except: + - **vectorize in-depth**: provide as in `env.vectorize-minimal.template` + - **vectorize all-providers**: provide as in `env.vectorize.template` + - (also note that _core ops_ and _idiomatic admin Astra_ amount to nothing in this case) +- **nonAstra**: all tests require as in `env.local.template`, plus: + - **vectorize in-depth**: also provide as in `env.vectorize-minimal.template` + - **vectorize all-providers**: also provide as in `env.vectorize.template` + - (also note that _core ops_ and _idiomatic admin Astra_ amount to nothing in this case) +- **Astra**: all tests require as in `env.astra.template`, plus: + - **core ops**: the token must have at least "Database Administrator" role (possibly through definition of a separate `ASTRA_DB_OPS_APPLICATION_TOKEN`), and `ASTRA_DB_ID` must also be defined + - **idiomatic admin Astra**: also provide as in `env.astra.admin.template` + - **vectorize in-depth**: also provide as in `env.vectorize-minimal.template` + - **vectorize all-providers**: also provide as in `env.vectorize.template` + - (also note that _idiomatic admin nonAstra_ amounts to nothing in this case) + +#### Sample testing commands + +For the **DockerCompose** case, prepend all of the following with `DOCKER_COMPOSE_LOCAL_DATA_API="yes" `. + +All the usual `pytest` ways of restricting the test selection hold in addition +(e.g. `poetry run pytest tests/idiomatic/unit` or `[...] -k `). + +##### _core regular_: -export ASTRA_DB_KEYSPACE="default_keyspace" -# Optional: -export ASTRA_DB_SECONDARY_KEYSPACE="..." +``` +poetry run pytest tests/core ``` -#### "Idiomatic" testing +##### _core ops_: -Tests can be started in various ways: mostly `make tests-idiomatic`, but also: +Note the special variable needed to actually run this. 
You will have to manually clean up afterwards. -```bash -# test the "idiomatic" layer +``` +TEST_ASTRADBOPS="1" poetry run pytest tests/core/test_ops.py +``` + +##### _idiomatic regular_: + +Warning: this will also trigger the very long-running _idiomatic admin Astra_ if the vars as in `env.astra.admin.template` are also detected. Likewise, the _idiomatic admin nonAstra_ may start (if `DO_IDIOMATIC_ADMIN_TESTS` is set), which, however, takes only a few seconds. + +``` poetry run pytest tests/idiomatic -poetry run pytest tests/idiomatic/unit -poetry run pytest tests/idiomatic/integration +``` -# remove logging noise: -poetry run pytest [...] -o log_cli=0 +##### _idiomatic admin Astra_: + +``` +poetry run pytest tests/idiomatic/integration/test_admin.py ``` -The above runs the regular testing (i.e. non-Admin, non-core). -The (idiomatic) Admin part is tested manually by you, on Astra accounts with room -for up to 3 new databases, possibly both on prod and dev, and uses specific env vars, -as can be seen on `tests/idiomatic/integration/test_admin.py`. +##### _idiomatic admin nonAstra_: -#### Other tests +``` +DO_IDIOMATIC_ADMIN_TESTS="1" poetry run pytest tests/idiomatic/integration/test_nonastra_admin.py +``` -Vectorize tests are confined in `tests/vectorize_idiomatic` and are run -separately. A separate set of credentials is required to do the full testing: -refer to `tests/.vectorize.env.template` for the complete listing, including -the secrets that should be added to the database beforehand, through the UI. +##### _vectorize in-depth_: -Should you be interested in testing the "core" modules, moreover, -this is also something for you to run manually (do that if you touch "core"): +``` +poetry run pytest tests/vectorize_idiomatic/integration/test_vectorize_methods*.py +``` -```bash -# test the core modules -poetry run pytest tests/core +or just: -# do not drop collections: -TEST_SKIP_COLLECTION_DELETE=1 poetry run pytest [...] 
+``` +poetry run pytest tests/vectorize_idiomatic/integration/test_vectorize_methods_sync.py +``` + +##### _vectorize all-providers_: + +This generates all possible test cases and runs them: + +``` +poetry run pytest tests/vectorize_idiomatic +``` + +For a spot test, you may restrict to one case, e.g. -# include astrapy.core.ops testing (tester must clean up after that): -TEST_ASTRADBOPS=1 poetry run pytest [...] ``` +EMBEDDING_MODEL_TAGS="openai/text-embedding-3-large/HEADER/0" poetry run pytest tests/vectorize_idiomatic/integration/test_vectorize_providers.py -k test_vectorize_usage_auth_type_header_sync +``` + +#### Useful flags for testing + +Remove logging noise with: + +``` +poetry run pytest [...] -o log_cli=0 +``` + +Do not drop collections (core): + +``` +TEST_SKIP_COLLECTION_DELETE=1 poetry run pytest [...] +``` + ## Appendices diff --git a/poetry.lock b/poetry.lock index d690053a..e4c32b87 100644 --- a/poetry.lock +++ b/poetry.lock @@ -383,6 +383,28 @@ files = [ {file = "distlib-0.3.8.tar.gz", hash = "sha256:1530ea13e350031b6312d8580ddb6b27a104275a31106523b8f123787f494f64"}, ] +[[package]] +name = "docker" +version = "7.1.0" +description = "A Python library for the Docker Engine API." 
+optional = false +python-versions = ">=3.8" +files = [ + {file = "docker-7.1.0-py3-none-any.whl", hash = "sha256:c96b93b7f0a746f9e77d325bcfb87422a3d8bd4f03136ae8a85b37f1898d5fc0"}, + {file = "docker-7.1.0.tar.gz", hash = "sha256:ad8c70e6e3f8926cb8a92619b832b4ea5299e2831c14284663184e200546fa6c"}, +] + +[package.dependencies] +pywin32 = {version = ">=304", markers = "sys_platform == \"win32\""} +requests = ">=2.26.0" +urllib3 = ">=1.26.0" + +[package.extras] +dev = ["coverage (==7.2.7)", "pytest (==7.4.2)", "pytest-cov (==4.1.0)", "pytest-timeout (==2.1.0)", "ruff (==0.1.8)"] +docs = ["myst-parser (==0.18.0)", "sphinx (==5.1.1)"] +ssh = ["paramiko (>=2.4.3)"] +websockets = ["websocket-client (>=1.3.0)"] + [[package]] name = "exceptiongroup" version = "1.2.1" @@ -947,6 +969,29 @@ files = [ [package.extras] cli = ["click (>=5.0)"] +[[package]] +name = "pywin32" +version = "306" +description = "Python for Window Extensions" +optional = false +python-versions = "*" +files = [ + {file = "pywin32-306-cp310-cp310-win32.whl", hash = "sha256:06d3420a5155ba65f0b72f2699b5bacf3109f36acbe8923765c22938a69dfc8d"}, + {file = "pywin32-306-cp310-cp310-win_amd64.whl", hash = "sha256:84f4471dbca1887ea3803d8848a1616429ac94a4a8d05f4bc9c5dcfd42ca99c8"}, + {file = "pywin32-306-cp311-cp311-win32.whl", hash = "sha256:e65028133d15b64d2ed8f06dd9fbc268352478d4f9289e69c190ecd6818b6407"}, + {file = "pywin32-306-cp311-cp311-win_amd64.whl", hash = "sha256:a7639f51c184c0272e93f244eb24dafca9b1855707d94c192d4a0b4c01e1100e"}, + {file = "pywin32-306-cp311-cp311-win_arm64.whl", hash = "sha256:70dba0c913d19f942a2db25217d9a1b726c278f483a919f1abfed79c9cf64d3a"}, + {file = "pywin32-306-cp312-cp312-win32.whl", hash = "sha256:383229d515657f4e3ed1343da8be101000562bf514591ff383ae940cad65458b"}, + {file = "pywin32-306-cp312-cp312-win_amd64.whl", hash = "sha256:37257794c1ad39ee9be652da0462dc2e394c8159dfd913a8a4e8eb6fd346da0e"}, + {file = "pywin32-306-cp312-cp312-win_arm64.whl", hash = 
"sha256:5821ec52f6d321aa59e2db7e0a35b997de60c201943557d108af9d4ae1ec7040"}, + {file = "pywin32-306-cp37-cp37m-win32.whl", hash = "sha256:1c73ea9a0d2283d889001998059f5eaaba3b6238f767c9cf2833b13e6a685f65"}, + {file = "pywin32-306-cp37-cp37m-win_amd64.whl", hash = "sha256:72c5f621542d7bdd4fdb716227be0dd3f8565c11b280be6315b06ace35487d36"}, + {file = "pywin32-306-cp38-cp38-win32.whl", hash = "sha256:e4c092e2589b5cf0d365849e73e02c391c1349958c5ac3e9d5ccb9a28e017b3a"}, + {file = "pywin32-306-cp38-cp38-win_amd64.whl", hash = "sha256:e8ac1ae3601bee6ca9f7cb4b5363bf1c0badb935ef243c4733ff9a393b1690c0"}, + {file = "pywin32-306-cp39-cp39-win32.whl", hash = "sha256:e25fd5b485b55ac9c057f67d94bc203f3f6595078d1fb3b458c9c28b7153a802"}, + {file = "pywin32-306-cp39-cp39-win_amd64.whl", hash = "sha256:39b61c15272833b5c329a2989999dcae836b1eed650252ab1b7bfbe1d59f30f4"}, +] + [[package]] name = "pyyaml" version = "6.0.1" @@ -1065,6 +1110,39 @@ files = [ {file = "sniffio-1.3.1.tar.gz", hash = "sha256:f4324edc670a0f49750a81b895f35c3adb843cca46f0530f79fc1babb23789dc"}, ] +[[package]] +name = "testcontainers" +version = "3.7.1" +description = "Library provides lightweight, throwaway instances of common databases, Selenium web browsers, or anything else that can run in a Docker container" +optional = false +python-versions = ">=3.7" +files = [ + {file = "testcontainers-3.7.1-py2.py3-none-any.whl", hash = "sha256:7f48cef4bf0ccd78f1a4534d4b701a003a3bace851f24eae58a32f9e3f0aeba0"}, +] + +[package.dependencies] +deprecation = "*" +docker = ">=4.0.0" +wrapt = "*" + +[package.extras] +arangodb = ["python-arango"] +azurite = ["azure-storage-blob"] +clickhouse = ["clickhouse-driver"] +docker-compose = ["docker-compose"] +google-cloud-pubsub = ["google-cloud-pubsub (<2)"] +kafka = ["kafka-python"] +keycloak = ["python-keycloak"] +mongo = ["pymongo"] +mssqlserver = ["pymssql"] +mysql = ["pymysql", "sqlalchemy"] +neo4j = ["neo4j"] +oracle = ["cx-Oracle", "sqlalchemy"] +postgresql = ["psycopg2-binary", 
"sqlalchemy"] +rabbitmq = ["pika"] +redis = ["redis"] +selenium = ["selenium"] + [[package]] name = "toml" version = "0.10.2" @@ -1174,7 +1252,86 @@ MarkupSafe = ">=2.1.1" [package.extras] watchdog = ["watchdog (>=2.3)"] +[[package]] +name = "wrapt" +version = "1.16.0" +description = "Module for decorators, wrappers and monkey patching." +optional = false +python-versions = ">=3.6" +files = [ + {file = "wrapt-1.16.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:ffa565331890b90056c01db69c0fe634a776f8019c143a5ae265f9c6bc4bd6d4"}, + {file = "wrapt-1.16.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:e4fdb9275308292e880dcbeb12546df7f3e0f96c6b41197e0cf37d2826359020"}, + {file = "wrapt-1.16.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bb2dee3874a500de01c93d5c71415fcaef1d858370d405824783e7a8ef5db440"}, + {file = "wrapt-1.16.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2a88e6010048489cda82b1326889ec075a8c856c2e6a256072b28eaee3ccf487"}, + {file = "wrapt-1.16.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ac83a914ebaf589b69f7d0a1277602ff494e21f4c2f743313414378f8f50a4cf"}, + {file = "wrapt-1.16.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:73aa7d98215d39b8455f103de64391cb79dfcad601701a3aa0dddacf74911d72"}, + {file = "wrapt-1.16.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:807cc8543a477ab7422f1120a217054f958a66ef7314f76dd9e77d3f02cdccd0"}, + {file = "wrapt-1.16.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:bf5703fdeb350e36885f2875d853ce13172ae281c56e509f4e6eca049bdfb136"}, + {file = "wrapt-1.16.0-cp310-cp310-win32.whl", hash = "sha256:f6b2d0c6703c988d334f297aa5df18c45e97b0af3679bb75059e0e0bd8b1069d"}, + {file = "wrapt-1.16.0-cp310-cp310-win_amd64.whl", hash = "sha256:decbfa2f618fa8ed81c95ee18a387ff973143c656ef800c9f24fb7e9c16054e2"}, + {file = 
"wrapt-1.16.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:1a5db485fe2de4403f13fafdc231b0dbae5eca4359232d2efc79025527375b09"}, + {file = "wrapt-1.16.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:75ea7d0ee2a15733684badb16de6794894ed9c55aa5e9903260922f0482e687d"}, + {file = "wrapt-1.16.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a452f9ca3e3267cd4d0fcf2edd0d035b1934ac2bd7e0e57ac91ad6b95c0c6389"}, + {file = "wrapt-1.16.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:43aa59eadec7890d9958748db829df269f0368521ba6dc68cc172d5d03ed8060"}, + {file = "wrapt-1.16.0-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:72554a23c78a8e7aa02abbd699d129eead8b147a23c56e08d08dfc29cfdddca1"}, + {file = "wrapt-1.16.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:d2efee35b4b0a347e0d99d28e884dfd82797852d62fcd7ebdeee26f3ceb72cf3"}, + {file = "wrapt-1.16.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:6dcfcffe73710be01d90cae08c3e548d90932d37b39ef83969ae135d36ef3956"}, + {file = "wrapt-1.16.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:eb6e651000a19c96f452c85132811d25e9264d836951022d6e81df2fff38337d"}, + {file = "wrapt-1.16.0-cp311-cp311-win32.whl", hash = "sha256:66027d667efe95cc4fa945af59f92c5a02c6f5bb6012bff9e60542c74c75c362"}, + {file = "wrapt-1.16.0-cp311-cp311-win_amd64.whl", hash = "sha256:aefbc4cb0a54f91af643660a0a150ce2c090d3652cf4052a5397fb2de549cd89"}, + {file = "wrapt-1.16.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:5eb404d89131ec9b4f748fa5cfb5346802e5ee8836f57d516576e61f304f3b7b"}, + {file = "wrapt-1.16.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:9090c9e676d5236a6948330e83cb89969f433b1943a558968f659ead07cb3b36"}, + {file = "wrapt-1.16.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:94265b00870aa407bd0cbcfd536f17ecde43b94fb8d228560a1e9d3041462d73"}, + {file = "wrapt-1.16.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f2058f813d4f2b5e3a9eb2eb3faf8f1d99b81c3e51aeda4b168406443e8ba809"}, + {file = "wrapt-1.16.0-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:98b5e1f498a8ca1858a1cdbffb023bfd954da4e3fa2c0cb5853d40014557248b"}, + {file = "wrapt-1.16.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:14d7dc606219cdd7405133c713f2c218d4252f2a469003f8c46bb92d5d095d81"}, + {file = "wrapt-1.16.0-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:49aac49dc4782cb04f58986e81ea0b4768e4ff197b57324dcbd7699c5dfb40b9"}, + {file = "wrapt-1.16.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:418abb18146475c310d7a6dc71143d6f7adec5b004ac9ce08dc7a34e2babdc5c"}, + {file = "wrapt-1.16.0-cp312-cp312-win32.whl", hash = "sha256:685f568fa5e627e93f3b52fda002c7ed2fa1800b50ce51f6ed1d572d8ab3e7fc"}, + {file = "wrapt-1.16.0-cp312-cp312-win_amd64.whl", hash = "sha256:dcdba5c86e368442528f7060039eda390cc4091bfd1dca41e8046af7c910dda8"}, + {file = "wrapt-1.16.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:d462f28826f4657968ae51d2181a074dfe03c200d6131690b7d65d55b0f360f8"}, + {file = "wrapt-1.16.0-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a33a747400b94b6d6b8a165e4480264a64a78c8a4c734b62136062e9a248dd39"}, + {file = "wrapt-1.16.0-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b3646eefa23daeba62643a58aac816945cadc0afaf21800a1421eeba5f6cfb9c"}, + {file = "wrapt-1.16.0-cp36-cp36m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3ebf019be5c09d400cf7b024aa52b1f3aeebeff51550d007e92c3c1c4afc2a40"}, + {file = "wrapt-1.16.0-cp36-cp36m-musllinux_1_1_aarch64.whl", hash = 
"sha256:0d2691979e93d06a95a26257adb7bfd0c93818e89b1406f5a28f36e0d8c1e1fc"}, + {file = "wrapt-1.16.0-cp36-cp36m-musllinux_1_1_i686.whl", hash = "sha256:1acd723ee2a8826f3d53910255643e33673e1d11db84ce5880675954183ec47e"}, + {file = "wrapt-1.16.0-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:bc57efac2da352a51cc4658878a68d2b1b67dbe9d33c36cb826ca449d80a8465"}, + {file = "wrapt-1.16.0-cp36-cp36m-win32.whl", hash = "sha256:da4813f751142436b075ed7aa012a8778aa43a99f7b36afe9b742d3ed8bdc95e"}, + {file = "wrapt-1.16.0-cp36-cp36m-win_amd64.whl", hash = "sha256:6f6eac2360f2d543cc875a0e5efd413b6cbd483cb3ad7ebf888884a6e0d2e966"}, + {file = "wrapt-1.16.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:a0ea261ce52b5952bf669684a251a66df239ec6d441ccb59ec7afa882265d593"}, + {file = "wrapt-1.16.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7bd2d7ff69a2cac767fbf7a2b206add2e9a210e57947dd7ce03e25d03d2de292"}, + {file = "wrapt-1.16.0-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9159485323798c8dc530a224bd3ffcf76659319ccc7bbd52e01e73bd0241a0c5"}, + {file = "wrapt-1.16.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a86373cf37cd7764f2201b76496aba58a52e76dedfaa698ef9e9688bfd9e41cf"}, + {file = "wrapt-1.16.0-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:73870c364c11f03ed072dda68ff7aea6d2a3a5c3fe250d917a429c7432e15228"}, + {file = "wrapt-1.16.0-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:b935ae30c6e7400022b50f8d359c03ed233d45b725cfdd299462f41ee5ffba6f"}, + {file = "wrapt-1.16.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:db98ad84a55eb09b3c32a96c576476777e87c520a34e2519d3e59c44710c002c"}, + {file = "wrapt-1.16.0-cp37-cp37m-win32.whl", hash = "sha256:9153ed35fc5e4fa3b2fe97bddaa7cbec0ed22412b85bcdaf54aeba92ea37428c"}, + {file = "wrapt-1.16.0-cp37-cp37m-win_amd64.whl", hash = 
"sha256:66dfbaa7cfa3eb707bbfcd46dab2bc6207b005cbc9caa2199bcbc81d95071a00"}, + {file = "wrapt-1.16.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:1dd50a2696ff89f57bd8847647a1c363b687d3d796dc30d4dd4a9d1689a706f0"}, + {file = "wrapt-1.16.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:44a2754372e32ab315734c6c73b24351d06e77ffff6ae27d2ecf14cf3d229202"}, + {file = "wrapt-1.16.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8e9723528b9f787dc59168369e42ae1c3b0d3fadb2f1a71de14531d321ee05b0"}, + {file = "wrapt-1.16.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:dbed418ba5c3dce92619656802cc5355cb679e58d0d89b50f116e4a9d5a9603e"}, + {file = "wrapt-1.16.0-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:941988b89b4fd6b41c3f0bfb20e92bd23746579736b7343283297c4c8cbae68f"}, + {file = "wrapt-1.16.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:6a42cd0cfa8ffc1915aef79cb4284f6383d8a3e9dcca70c445dcfdd639d51267"}, + {file = "wrapt-1.16.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:1ca9b6085e4f866bd584fb135a041bfc32cab916e69f714a7d1d397f8c4891ca"}, + {file = "wrapt-1.16.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:d5e49454f19ef621089e204f862388d29e6e8d8b162efce05208913dde5b9ad6"}, + {file = "wrapt-1.16.0-cp38-cp38-win32.whl", hash = "sha256:c31f72b1b6624c9d863fc095da460802f43a7c6868c5dda140f51da24fd47d7b"}, + {file = "wrapt-1.16.0-cp38-cp38-win_amd64.whl", hash = "sha256:490b0ee15c1a55be9c1bd8609b8cecd60e325f0575fc98f50058eae366e01f41"}, + {file = "wrapt-1.16.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9b201ae332c3637a42f02d1045e1d0cccfdc41f1f2f801dafbaa7e9b4797bfc2"}, + {file = "wrapt-1.16.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:2076fad65c6736184e77d7d4729b63a6d1ae0b70da4868adeec40989858eb3fb"}, + {file = "wrapt-1.16.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:c5cd603b575ebceca7da5a3a251e69561bec509e0b46e4993e1cac402b7247b8"}, + {file = "wrapt-1.16.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b47cfad9e9bbbed2339081f4e346c93ecd7ab504299403320bf85f7f85c7d46c"}, + {file = "wrapt-1.16.0-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f8212564d49c50eb4565e502814f694e240c55551a5f1bc841d4fcaabb0a9b8a"}, + {file = "wrapt-1.16.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:5f15814a33e42b04e3de432e573aa557f9f0f56458745c2074952f564c50e664"}, + {file = "wrapt-1.16.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:db2e408d983b0e61e238cf579c09ef7020560441906ca990fe8412153e3b291f"}, + {file = "wrapt-1.16.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:edfad1d29c73f9b863ebe7082ae9321374ccb10879eeabc84ba3b69f2579d537"}, + {file = "wrapt-1.16.0-cp39-cp39-win32.whl", hash = "sha256:ed867c42c268f876097248e05b6117a65bcd1e63b779e916fe2e33cd6fd0d3c3"}, + {file = "wrapt-1.16.0-cp39-cp39-win_amd64.whl", hash = "sha256:eb1b046be06b0fce7249f1d025cd359b4b80fc1c3e24ad9eca33e0dcdb2e4a35"}, + {file = "wrapt-1.16.0-py3-none-any.whl", hash = "sha256:6906c4100a8fcbf2fa735f6059214bb13b97f75b1a61777fcf6432121ef12ef1"}, + {file = "wrapt-1.16.0.tar.gz", hash = "sha256:5f370f952971e7d17c7d1ead40e49f32345a7f7a5373571ef44d800d06b1899d"}, +] + [metadata] lock-version = "2.0" python-versions = "^3.8.0" -content-hash = "e3b42f8ee41d4e4f1e62e70f2d6f812706c842d1da5bdbc4383a6d3154e7a497" +content-hash = "7b8e7fe6ca5bc2671172233b85942735df1b9346c62178d4468d0ec41323ed09" diff --git a/pyproject.toml b/pyproject.toml index 46f30006..afcb73ec 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -46,6 +46,7 @@ pytest-testdox = "~3.1.0" pytest = "~8.0.0" python-dotenv = "~1.0.1" pytest-httpserver = "~1.0.8" +testcontainers = "~3.7.1" ruff = "~0.2.1" types-toml = "^0.10.8.7" isort = "^5.13.2" diff --git a/tests/.env.template 
b/tests/.env.template deleted file mode 100644 index 01e53557..00000000 --- a/tests/.env.template +++ /dev/null @@ -1,20 +0,0 @@ -######################## -# FOR THE REGULAR TESTS: -######################## -# -export ASTRA_DB_APPLICATION_TOKEN="AstraCS:..." -# -export ASTRA_DB_API_ENDPOINT="https://-.apps.astra.datastax.com" -# -# OPTIONAL: -# export ASTRA_DB_KEYSPACE="..." -# export ASTRA_DB_SECONDARY_KEYSPACE="..." - -################### -# FOR THE OPS TEST: -################### -# -export ASTRA_DB_ID="..." -# -# OPTIONAL (falls back to the token above) -# export ASTRA_DB_OPS_APPLICATION_TOKEN="..." diff --git a/tests/conftest.py b/tests/conftest.py index 12f7c1f4..2ccaa1ee 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,29 +1,67 @@ -# main conftest for shared fixtures (if any). +# Copyright DataStax, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Main conftest for shared fixtures (if any). 
+""" + import functools -import os import warnings from typing import Any, Awaitable, Callable, Optional, Tuple, TypedDict import pytest from deprecation import UnsupportedWarning +from astrapy import DataAPIClient from astrapy.admin import parse_api_endpoint +from astrapy.authentication import TokenProvider from astrapy.constants import Environment from astrapy.core.defaults import DEFAULT_KEYSPACE_NAME -IS_ASTRA_DB: bool -SECONDARY_NAMESPACE: Optional[str] -if "LOCAL_DATA_API_ENDPOINT" in os.environ: - IS_ASTRA_DB = False - SECONDARY_NAMESPACE = os.environ.get("LOCAL_DATA_API_SECONDARY_KEYSPACE") -elif "ASTRA_DB_API_ENDPOINT" in os.environ: - IS_ASTRA_DB = True - SECONDARY_NAMESPACE = os.environ.get("ASTRA_DB_SECONDARY_KEYSPACE") -else: - raise ValueError("No credentials.") +from .preprocess_env import ( + ADMIN_ENV_LIST, + ADMIN_ENV_VARIABLE_MAP, + ASTRA_DB_API_ENDPOINT, + ASTRA_DB_APPLICATION_TOKEN, + ASTRA_DB_ID, + ASTRA_DB_KEYSPACE, + ASTRA_DB_OPS_APPLICATION_TOKEN, + ASTRA_DB_REGION, + ASTRA_DB_TOKEN_PROVIDER, + DO_IDIOMATIC_ADMIN_TESTS, + DOCKER_COMPOSE_LOCAL_DATA_API, + IS_ASTRA_DB, + LOCAL_DATA_API_APPLICATION_TOKEN, + LOCAL_DATA_API_ENDPOINT, + LOCAL_DATA_API_KEYSPACE, + LOCAL_DATA_API_PASSWORD, + LOCAL_DATA_API_TOKEN_PROVIDER, + LOCAL_DATA_API_USERNAME, + SECONDARY_NAMESPACE, + TEST_ASTRADBOPS, + TEST_SKIP_COLLECTION_DELETE, +) class DataAPICredentials(TypedDict): + token: str | TokenProvider + api_endpoint: str + namespace: str + + +# to be used for 'core' testing, derived from above +class DataAPICoreCredentials(TypedDict): token: str api_endpoint: str namespace: str @@ -105,31 +143,65 @@ def test_inner(*args: Any, **kwargs: Any) -> Any: @pytest.fixture(scope="session") def data_api_credentials_kwargs() -> DataAPICredentials: if IS_ASTRA_DB: - ASTRA_DB_API_ENDPOINT = os.environ["ASTRA_DB_API_ENDPOINT"] - ASTRA_DB_APPLICATION_TOKEN = os.environ["ASTRA_DB_APPLICATION_TOKEN"] - ASTRA_DB_KEYSPACE = os.environ.get("ASTRA_DB_KEYSPACE", 
DEFAULT_KEYSPACE_NAME) + if ASTRA_DB_API_ENDPOINT is None: + raise ValueError("No endpoint data for local Data API") astra_db_creds: DataAPICredentials = { - "token": ASTRA_DB_APPLICATION_TOKEN, - "api_endpoint": ASTRA_DB_API_ENDPOINT, - "namespace": ASTRA_DB_KEYSPACE, + "token": ASTRA_DB_TOKEN_PROVIDER or "", + "api_endpoint": ASTRA_DB_API_ENDPOINT or "", + "namespace": ASTRA_DB_KEYSPACE or DEFAULT_KEYSPACE_NAME, } return astra_db_creds else: - LOCAL_DATA_API_APPLICATION_TOKEN = os.environ[ - "LOCAL_DATA_API_APPLICATION_TOKEN" - ] - LOCAL_DATA_API_ENDPOINT = os.environ["LOCAL_DATA_API_ENDPOINT"] - LOCAL_DATA_API_KEYSPACE = os.environ.get( - "LOCAL_DATA_API_KEYSPACE", "default_keyspace" - ) + if LOCAL_DATA_API_ENDPOINT is None: + raise ValueError("No endpoint data for local Data API") local_db_creds: DataAPICredentials = { - "token": LOCAL_DATA_API_APPLICATION_TOKEN, - "api_endpoint": LOCAL_DATA_API_ENDPOINT, - "namespace": LOCAL_DATA_API_KEYSPACE, + "token": LOCAL_DATA_API_TOKEN_PROVIDER or "", + "api_endpoint": LOCAL_DATA_API_ENDPOINT or "", + "namespace": LOCAL_DATA_API_KEYSPACE or DEFAULT_KEYSPACE_NAME, } + + # ensure keyspace(s) exist at this point + # (we have to bypass the fixture hierarchy as the ..._info fixture + # comes later, so this part instantiates and uses throwaway objects) + _env, _ = env_region_from_endpoint(local_db_creds["api_endpoint"]) + _client = DataAPIClient(environment=_env) + _database = _client.get_database( + local_db_creds["api_endpoint"], + token=local_db_creds["token"], + # namespace=local_db_creds["namespace"], + ) + _database_admin = _database.get_database_admin() + _database_admin.create_namespace(local_db_creds["namespace"]) + if SECONDARY_NAMESPACE: + _database_admin.create_namespace(SECONDARY_NAMESPACE) + # end of keyspace-ensuring block + return local_db_creds +@pytest.fixture(scope="session") +def data_api_core_credentials_kwargs( + data_api_credentials_kwargs: DataAPICredentials, +) -> DataAPICoreCredentials: + token_str: 
str + if isinstance(data_api_credentials_kwargs["token"], str): + token_str = data_api_credentials_kwargs["token"] + elif isinstance(data_api_credentials_kwargs["token"], TokenProvider): + token_str0 = data_api_credentials_kwargs["token"].get_token() + if token_str0 is None: + raise ValueError("Token cannot be made into a string in fixture") + else: + token_str = token_str0 + else: + # this should not happen + token_str = str(data_api_credentials_kwargs["token"]) + return { + "token": token_str, + "api_endpoint": data_api_credentials_kwargs["api_endpoint"], + "namespace": data_api_credentials_kwargs["namespace"], + } + + @pytest.fixture(scope="session") def data_api_credentials_info( data_api_credentials_kwargs: DataAPICredentials, @@ -147,13 +219,38 @@ def data_api_credentials_info( @pytest.fixture(scope="session") -def astra_invalid_db_credentials_kwargs( - data_api_credentials_kwargs: DataAPICredentials, -) -> DataAPICredentials: - astra_db_creds: DataAPICredentials = { - "token": data_api_credentials_kwargs["token"], - "namespace": data_api_credentials_kwargs["namespace"], +def data_api_core_bad_credentials_kwargs( + data_api_core_credentials_kwargs: DataAPICoreCredentials, +) -> DataAPICoreCredentials: + astra_db_creds: DataAPICoreCredentials = { + "token": data_api_core_credentials_kwargs["token"], + "namespace": data_api_core_credentials_kwargs["namespace"], "api_endpoint": "http://localhost:1234", } return astra_db_creds + + +__all__ = [ + "ASTRA_DB_API_ENDPOINT", + "ASTRA_DB_APPLICATION_TOKEN", + "ASTRA_DB_ID", + "ASTRA_DB_KEYSPACE", + "ASTRA_DB_OPS_APPLICATION_TOKEN", + "ASTRA_DB_REGION", + "DOCKER_COMPOSE_LOCAL_DATA_API", + "IS_ASTRA_DB", + "LOCAL_DATA_API_APPLICATION_TOKEN", + "LOCAL_DATA_API_ENDPOINT", + "LOCAL_DATA_API_KEYSPACE", + "LOCAL_DATA_API_PASSWORD", + "LOCAL_DATA_API_USERNAME", + "SECONDARY_NAMESPACE", + "TEST_ASTRADBOPS", + "TEST_SKIP_COLLECTION_DELETE", + "ADMIN_ENV_LIST", + "ADMIN_ENV_VARIABLE_MAP", + "DO_IDIOMATIC_ADMIN_TESTS", + 
"ASTRA_DB_TOKEN_PROVIDER", + "LOCAL_DATA_API_TOKEN_PROVIDER", +] diff --git a/tests/core/conftest.py b/tests/core/conftest.py index 0e7df68d..fbc109d8 100644 --- a/tests/core/conftest.py +++ b/tests/core/conftest.py @@ -1,9 +1,22 @@ +# Copyright DataStax, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + """ -Test fixtures +Test fixtures for 'core' testing """ import math -import os from typing import AsyncIterable, Dict, Iterable, List, Optional, Set, TypeVar import pytest @@ -16,15 +29,14 @@ AsyncAstraDBCollection, ) -from ..conftest import DataAPICredentials +from ..conftest import ( + TEST_SKIP_COLLECTION_DELETE, + DataAPICoreCredentials, + DataAPICredentialsInfo, +) T = TypeVar("T") -TEST_SKIP_COLLECTION_DELETE: bool -if os.getenv("TEST_SKIP_COLLECTION_DELETE"): - TEST_SKIP_COLLECTION_DELETE = int(os.environ["TEST_SKIP_COLLECTION_DELETE"]) != 0 -else: - TEST_SKIP_COLLECTION_DELETE = False # fixed TEST_WRITABLE_VECTOR_COLLECTION = "writable_v_col" @@ -86,46 +98,76 @@ def _batch_iterable(iterable: Iterable[T], batch_size: int) -> Iterable[Iterable @pytest.fixture(scope="session") -def db(data_api_credentials_kwargs: DataAPICredentials) -> AstraDB: - token = data_api_credentials_kwargs["token"] - api_endpoint = data_api_credentials_kwargs["api_endpoint"] - namespace = data_api_credentials_kwargs.get("namespace") +def db( + data_api_core_credentials_kwargs: DataAPICoreCredentials, + data_api_credentials_info: DataAPICredentialsInfo, +) -> AstraDB: + token = 
data_api_core_credentials_kwargs["token"] + api_endpoint = data_api_core_credentials_kwargs["api_endpoint"] + namespace = data_api_core_credentials_kwargs.get("namespace") if token is None or api_endpoint is None: raise ValueError("Required ASTRA DB configuration is missing") - return AstraDB(token=token, api_endpoint=api_endpoint, namespace=namespace) + db_kwargs: Dict[str, str] + if data_api_credentials_info["environment"] in {"prod", "dev", "test"}: + db_kwargs = {} + else: + db_kwargs = {"api_path": ""} + + return AstraDB( + token=token, api_endpoint=api_endpoint, namespace=namespace, **db_kwargs + ) @pytest_asyncio.fixture(scope="function") async def async_db( - data_api_credentials_kwargs: DataAPICredentials, + data_api_core_credentials_kwargs: DataAPICoreCredentials, + data_api_credentials_info: DataAPICredentialsInfo, ) -> AsyncIterable[AsyncAstraDB]: - token = data_api_credentials_kwargs["token"] - api_endpoint = data_api_credentials_kwargs["api_endpoint"] - namespace = data_api_credentials_kwargs.get("namespace") + token = data_api_core_credentials_kwargs["token"] + api_endpoint = data_api_core_credentials_kwargs["api_endpoint"] + namespace = data_api_core_credentials_kwargs.get("namespace") if token is None or api_endpoint is None: raise ValueError("Required ASTRA DB configuration is missing") + db_kwargs: Dict[str, str] + if data_api_credentials_info["environment"] in {"prod", "dev", "test"}: + db_kwargs = {} + else: + db_kwargs = {"api_path": ""} + async with AsyncAstraDB( - token=token, api_endpoint=api_endpoint, namespace=namespace + token=token, + api_endpoint=api_endpoint, + namespace=namespace, + **db_kwargs, ) as db: yield db @pytest.fixture(scope="module") def invalid_db( - astra_invalid_db_credentials_kwargs: Dict[str, Optional[str]] + data_api_core_bad_credentials_kwargs: Dict[str, Optional[str]], + data_api_credentials_info: DataAPICredentialsInfo, ) -> AstraDB: - token = astra_invalid_db_credentials_kwargs["token"] - api_endpoint = 
astra_invalid_db_credentials_kwargs["api_endpoint"] - namespace = astra_invalid_db_credentials_kwargs.get("namespace") + token = data_api_core_bad_credentials_kwargs["token"] + api_endpoint = data_api_core_bad_credentials_kwargs["api_endpoint"] + namespace = data_api_core_bad_credentials_kwargs.get("namespace") + + db_kwargs: Dict[str, str] + if data_api_credentials_info["environment"] in {"prod", "dev", "test"}: + db_kwargs = {} + else: + db_kwargs = {"api_path": ""} if token is None or api_endpoint is None: raise ValueError("Required ASTRA DB configuration is missing") - return AstraDB(token=token, api_endpoint=api_endpoint, namespace=namespace) + return AstraDB( + token=token, api_endpoint=api_endpoint, namespace=namespace, **db_kwargs + ) @pytest.fixture(scope="session") diff --git a/tests/core/test_async_db_ddl.py b/tests/core/test_async_db_ddl.py index 196b69f2..6b395c28 100644 --- a/tests/core/test_async_db_ddl.py +++ b/tests/core/test_async_db_ddl.py @@ -23,8 +23,7 @@ from astrapy.core.db import AsyncAstraDB, AsyncAstraDBCollection from astrapy.core.defaults import DEFAULT_KEYSPACE_NAME -from ..conftest import DataAPICredentials -from .conftest import TEST_SKIP_COLLECTION_DELETE +from ..conftest import TEST_SKIP_COLLECTION_DELETE, DataAPICoreCredentials TEST_CREATE_DELETE_VECTOR_COLLECTION_NAME = "ephemeral_v_col" TEST_CREATE_DELETE_NONVECTOR_COLLECTION_NAME = "ephemeral_non_v_col" @@ -34,11 +33,11 @@ @pytest.mark.describe("should confirm path handling in constructor (async)") async def test_path_handling( - data_api_credentials_kwargs: DataAPICredentials, + data_api_core_credentials_kwargs: DataAPICoreCredentials, ) -> None: - token = data_api_credentials_kwargs["token"] - api_endpoint = data_api_credentials_kwargs["api_endpoint"] - namespace = data_api_credentials_kwargs.get("namespace") + token = data_api_core_credentials_kwargs["token"] + api_endpoint = data_api_core_credentials_kwargs["api_endpoint"] + namespace = 
data_api_core_credentials_kwargs.get("namespace") if token is None or api_endpoint is None: raise ValueError("Required ASTRA DB configuration is missing") diff --git a/tests/core/test_db_ddl.py b/tests/core/test_db_ddl.py index 9b73fe6b..25f4fdde 100644 --- a/tests/core/test_db_ddl.py +++ b/tests/core/test_db_ddl.py @@ -23,8 +23,7 @@ from astrapy.core.db import AstraDB, AstraDBCollection from astrapy.core.defaults import DEFAULT_KEYSPACE_NAME -from ..conftest import DataAPICredentials -from .conftest import TEST_SKIP_COLLECTION_DELETE +from ..conftest import TEST_SKIP_COLLECTION_DELETE, DataAPICoreCredentials TEST_CREATE_DELETE_VECTOR_COLLECTION_NAME = "ephemeral_v_col" TEST_CREATE_DELETE_NONVECTOR_COLLECTION_NAME = "ephemeral_non_v_col" @@ -33,10 +32,12 @@ @pytest.mark.describe("should confirm path handling in constructor") -def test_path_handling(data_api_credentials_kwargs: DataAPICredentials) -> None: - token = data_api_credentials_kwargs["token"] - api_endpoint = data_api_credentials_kwargs["api_endpoint"] - namespace = data_api_credentials_kwargs.get("namespace") +def test_path_handling( + data_api_core_credentials_kwargs: DataAPICoreCredentials, +) -> None: + token = data_api_core_credentials_kwargs["token"] + api_endpoint = data_api_core_credentials_kwargs["api_endpoint"] + namespace = data_api_core_credentials_kwargs.get("namespace") if token is None or api_endpoint is None: raise ValueError("Required ASTRA DB configuration is missing") diff --git a/tests/core/test_admin.py b/tests/core/test_endpoint_parsing.py similarity index 79% rename from tests/core/test_admin.py rename to tests/core/test_endpoint_parsing.py index 48464f3e..7ee2b93d 100644 --- a/tests/core/test_admin.py +++ b/tests/core/test_endpoint_parsing.py @@ -1,3 +1,17 @@ +# Copyright DataStax, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import pytest from astrapy.admin import parse_generic_api_url diff --git a/tests/core/test_logging.py b/tests/core/test_logging.py index a16c9c29..7e039859 100644 --- a/tests/core/test_logging.py +++ b/tests/core/test_logging.py @@ -22,7 +22,7 @@ from astrapy.core.db import AstraDB -from ..conftest import DataAPICredentials +from ..conftest import DataAPICoreCredentials logger = logging.getLogger(__name__) @@ -30,9 +30,9 @@ @pytest.mark.describe("should obey the 'TRACE' logging level when requested") def test_trace_logging_trace( caplog: pytest.LogCaptureFixture, - data_api_credentials_kwargs: DataAPICredentials, + data_api_core_credentials_kwargs: DataAPICoreCredentials, ) -> None: - astra_db = AstraDB(**data_api_credentials_kwargs) + astra_db = AstraDB(**data_api_core_credentials_kwargs) with caplog.at_level(10): astra_db.get_collections() for record in caplog.records: diff --git a/tests/core/test_ops.py b/tests/core/test_ops.py index 2da1f764..95165d7b 100644 --- a/tests/core/test_ops.py +++ b/tests/core/test_ops.py @@ -14,29 +14,23 @@ import itertools import logging -import os from typing import Any, Dict, List, cast import pytest -from dotenv import load_dotenv from astrapy.core.defaults import DEFAULT_KEYSPACE_NAME, DEFAULT_REGION from astrapy.core.ops import AstraDBOps -logger = logging.getLogger(__name__) - - -load_dotenv() - - -# Parameters for the AstraDBOps testing -ASTRA_DB_APPLICATION_TOKEN = os.environ.get( - "ASTRA_DB_OPS_APPLICATION_TOKEN", - os.environ.get("ASTRA_DB_APPLICATION_TOKEN", "no_token!"), +from ..conftest import ( + 
ASTRA_DB_ID, + ASTRA_DB_KEYSPACE, + ASTRA_DB_OPS_APPLICATION_TOKEN, + ASTRA_DB_REGION, + IS_ASTRA_DB, + TEST_ASTRADBOPS, ) -ASTRA_DB_ID = os.environ.get("ASTRA_DB_ID", "") -ASTRA_DB_KEYSPACE = os.environ.get("ASTRA_DB_KEYSPACE", DEFAULT_KEYSPACE_NAME) -ASTRA_DB_REGION = os.environ.get("ASTRA_DB_REGION", DEFAULT_REGION) + +logger = logging.getLogger(__name__) def find_new_name(existing: List[str], prefix: str) -> str: @@ -50,15 +44,19 @@ def find_new_name(existing: List[str], prefix: str) -> str: @pytest.fixture def devops_client() -> AstraDBOps: - return AstraDBOps(token=ASTRA_DB_APPLICATION_TOKEN) + return AstraDBOps(token=ASTRA_DB_OPS_APPLICATION_TOKEN) # In the regular CI we skip these Ops tests (slow and require manual care). # To maintainers: please run them now and them while we figure out automation. @pytest.mark.skipif( - int(os.environ.get("TEST_ASTRADBOPS", "0")) == 0, + not TEST_ASTRADBOPS, reason="Ops tests not explicitly requested", ) +@pytest.mark.skipif( + not IS_ASTRA_DB, + reason="Ops tests are only for Astra DB", +) class TestAstraDBOps: @pytest.mark.describe("should initialize an AstraDB Ops Client") def test_client_type(self, devops_client: AstraDBOps) -> None: @@ -83,11 +81,11 @@ def test_create_database(self, devops_client: AstraDBOps) -> None: "name": new_database_name, "tier": "serverless", "cloudProvider": "GCP", - "keyspace": ASTRA_DB_KEYSPACE, - "region": ASTRA_DB_REGION, + "keyspace": ASTRA_DB_KEYSPACE or DEFAULT_KEYSPACE_NAME, + "region": ASTRA_DB_REGION or DEFAULT_REGION, "capacityUnits": 1, "user": "token", - "password": ASTRA_DB_APPLICATION_TOKEN, + "password": ASTRA_DB_OPS_APPLICATION_TOKEN, "dbType": "vector", } response = devops_client.create_database( diff --git a/tests/env_templates/env.astra.admin.template b/tests/env_templates/env.astra.admin.template new file mode 100644 index 00000000..fe01bf00 --- /dev/null +++ b/tests/env_templates/env.astra.admin.template @@ -0,0 +1,16 @@ +################################## +# FOR THE 
(idiomatic) ADMIN TESTS: +################################## + +# General setting +export DO_IDIOMATIC_ADMIN_TESTS="1" + +# PROD settings +export PROD_ADMIN_TEST_ASTRA_DB_APPLICATION_TOKEN="AstraCS:..." +export PROD_ADMIN_TEST_ASTRA_DB_PROVIDER="aws" +export PROD_ADMIN_TEST_ASTRA_DB_REGION="eu-west-1" + +# DEV settings (optional) +export DEV_ADMIN_TEST_ASTRA_DB_APPLICATION_TOKEN="AstraCS:..." +export DEV_ADMIN_TEST_ASTRA_DB_PROVIDER="aws" +export DEV_ADMIN_TEST_ASTRA_DB_REGION="us-west-2" diff --git a/tests/env_templates/env.astra.template b/tests/env_templates/env.astra.template new file mode 100644 index 00000000..fa72aa02 --- /dev/null +++ b/tests/env_templates/env.astra.template @@ -0,0 +1,22 @@ +######################## +# FOR THE REGULAR TESTS: +######################## + +export ASTRA_DB_APPLICATION_TOKEN="AstraCS:..." + +export ASTRA_DB_API_ENDPOINT="https://-.apps.astra.datastax.com" + +# OPTIONAL (the first has a default; a few tests are skipped if the second is missing): +# export ASTRA_DB_KEYSPACE="..." +# export ASTRA_DB_SECONDARY_KEYSPACE="..." + + + +########################## +# FOR THE (core) OPS TEST: +########################## + +export ASTRA_DB_ID="..." + +# OPTIONAL (falls back to the token above if not defined. Ensure it is a DB Admin token) +export ASTRA_DB_OPS_APPLICATION_TOKEN="..." diff --git a/tests/.env.local.template b/tests/env_templates/env.local.template similarity index 62% rename from tests/.env.local.template rename to tests/env_templates/env.local.template index c2ab4dd3..d3a61c6b 100644 --- a/tests/.env.local.template +++ b/tests/env_templates/env.local.template @@ -1,11 +1,15 @@ ######################## # FOR THE REGULAR TESTS: ######################## -# + +# Authentication. 
Choose either: export LOCAL_DATA_API_APPLICATION_TOKEN="Cassandra:Y2Fzc2FuZHJh:Y2Fzc2FuZHJh" -# +# Or: +export LOCAL_DATA_API_USERNAME="cassandra" +export LOCAL_DATA_API_PASSWORD="cassandra" + export LOCAL_DATA_API_ENDPOINT="http://localhost:8181" -# -# OPTIONAL: + +# OPTIONAL: (if defined here, they will be created as needed) export LOCAL_DATA_API_KEYSPACE="default_keyspace" export LOCAL_DATA_API_SECONDARY_KEYSPACE="alternate_keyspace" diff --git a/tests/env_templates/env.vectorize-minimal.template b/tests/env_templates/env.vectorize-minimal.template new file mode 100644 index 00000000..7deb7e54 --- /dev/null +++ b/tests/env_templates/env.vectorize-minimal.template @@ -0,0 +1,5 @@ +################################## +# FOR THE MINIMAL VECTORIZE TESTS: +################################## + +export HEADER_EMBEDDING_API_KEY_OPENAI="..." diff --git a/tests/.vectorize.env.template b/tests/env_templates/env.vectorize.template similarity index 67% rename from tests/.vectorize.env.template rename to tests/env_templates/env.vectorize.template index 763411d8..3367bf65 100644 --- a/tests/.vectorize.env.template +++ b/tests/env_templates/env.vectorize.template @@ -1,6 +1,22 @@ -# The variables defined here are to probe the HEADER vedtorize mode. +############################### +# FOR THE FULL VECTORIZE TESTS: +############################### + + +export HEADER_EMBEDDING_API_KEY_HUGGINGFACE="..." + +export HEADER_EMBEDDING_API_KEY_COHERE="..." + +export HEADER_EMBEDDING_API_KEY_VOYAGEAI="..." + +export HEADER_EMBEDDING_API_KEY_MISTRAL="..." + +export HEADER_EMBEDDING_API_KEY_UPSTAGE="..." + +export HEADER_EMBEDDING_API_KEY_OPENAI="..." +export OPENAI_ORGANIZATION_ID="..." +export OPENAI_PROJECT_ID="..." -# Azure OpenAI secrets/parameters export HEADER_EMBEDDING_API_KEY_AZURE_OPENAI="..." export AZURE_OPENAI_DEPLOY_ID_EMB3LARGE="..." export AZURE_OPENAI_RESNAME_EMB3LARGE="..." @@ -9,23 +25,24 @@ export AZURE_OPENAI_RESNAME_EMB3SMALL="..." 
export AZURE_OPENAI_DEPLOY_ID_ADA2="..." export AZURE_OPENAI_RESNAME_ADA2="..." -# VertexAI secrets/parameters +export HEADER_EMBEDDING_API_KEY_JINAAI="..." + export HEADER_EMBEDDING_API_KEY_VERTEXAI="..." + export HEADER_EMBEDDING_VERTEXAI_PROJECT_ID="..." -# All other providers' embedding api keys -export HEADER_EMBEDDING_API_KEY_COHERE="..." -export HEADER_EMBEDDING_API_KEY_HUGGINGFACE="..." -export HEADER_EMBEDDING_API_KEY_JINAAI="..." -export HEADER_EMBEDDING_API_KEY_MISTRAL="..." -export HEADER_EMBEDDING_API_KEY_OPENAI="..." -export HEADER_EMBEDDING_API_KEY_UPSTAGE="..." -export HEADER_EMBEDDING_API_KEY_VOYAGEAI="..." +export HEADER_EMBEDDING_API_KEY_HUGGINGFACEDED="..." +export HUGGINGFACEDED_DIMENSION="..." +export HUGGINGFACEDED_ENDPOINTNAME="..." +export HUGGINGFACEDED_REGIONNAME="..." +export HUGGINGFACEDED_CLOUDNAME="..." -# SHARED_SECRET testing information. Preparation to be done in the UI for now: +# Additional SHARED_SECRET testing information +# (Preparation to be done in the UI at the moment) # Scope these secrets, with these names, to the targeted DB(s): # SHARED_SECRET_EMBEDDING_API_KEY_AZURE_OPENAI # SHARED_SECRET_EMBEDDING_API_KEY_HUGGINGFACE +# SHARED_SECRET_EMBEDDING_API_KEY_HUGGINGFACEDED # SHARED_SECRET_EMBEDDING_API_KEY_JINAAI # SHARED_SECRET_EMBEDDING_API_KEY_MISTRAL # SHARED_SECRET_EMBEDDING_API_KEY_OPENAI diff --git a/tests/hcd_compose/cassandra-hcd.yaml b/tests/hcd_compose/cassandra-hcd.yaml new file mode 100644 index 00000000..4ed77101 --- /dev/null +++ b/tests/hcd_compose/cassandra-hcd.yaml @@ -0,0 +1,1546 @@ + +# Cassandra storage config YAML + +# NOTE: +# See https://cassandra.apache.org/doc/latest/configuration/ for +# full explanations of configuration directives +# /NOTE + +# The name of the cluster. This is mainly used to prevent machines in +# one logical cluster from joining another. 
+cluster_name: 'Test Cluster' + +# This defines the number of tokens randomly assigned to this node on the ring +# The more tokens, relative to other nodes, the larger the proportion of data +# that this node will store. You probably want all nodes to have the same number +# of tokens assuming they have equal hardware capability. +# +# If you leave this unspecified, Cassandra will use the default of 1 token for legacy compatibility, +# and will use the initial_token as described below. +# +# Specifying initial_token will override this setting on the node's initial start, +# on subsequent starts, this setting will apply even if initial token is set. +# +# See https://cassandra.apache.org/doc/latest/getting_started/production.html#tokens for +# best practice information about num_tokens. +# +num_tokens: 16 + +# Triggers automatic allocation of num_tokens tokens for this node. The allocation +# algorithm attempts to choose tokens in a way that optimizes replicated load over +# the nodes in the datacenter for the replica factor. +# +# The load assigned to each node will be close to proportional to its number of +# vnodes. +# +# Only supported with the Murmur3Partitioner. + +# Replica factor is determined via the replication strategy used by the specified +# keyspace. +# allocate_tokens_for_keyspace: KEYSPACE + +# Replica factor is explicitly set, regardless of keyspace or datacenter. +# This is the replica factor within the datacenter, like NTS. +allocate_tokens_for_local_replication_factor: 3 + +# initial_token allows you to specify tokens manually. While you can use it with +# vnodes (num_tokens > 1, above) -- in which case you should provide a +# comma-separated list -- it's primarily used when adding nodes to legacy clusters +# that do not have vnodes enabled. 
+# initial_token: + +# May either be "true" or "false" to enable globally +hinted_handoff_enabled: true + +# When hinted_handoff_enabled is true, a black list of data centers that will not +# perform hinted handoff +# hinted_handoff_disabled_datacenters: +# - DC1 +# - DC2 + +# this defines the maximum amount of time a dead host will have hints +# generated. After it has been dead this long, new hints for it will not be +# created until it has been seen alive and gone down again. +max_hint_window_in_ms: 10800000 # 3 hours + +# Maximum throttle in KBs per second, per delivery thread. This will be +# reduced proportionally to the number of nodes in the cluster. (If there +# are two nodes in the cluster, each delivery thread will use the maximum +# rate; if there are three, each will throttle to half of the maximum, +# since we expect two nodes to be delivering hints simultaneously.) +hinted_handoff_throttle_in_kb: 1024 + +# Number of threads with which to deliver hints; +# Consider increasing this number when you have multi-dc deployments, since +# cross-dc handoff tends to be slower +max_hints_delivery_threads: 2 + +# Directory where Cassandra should store hints. +# If not set, the default directory is $CASSANDRA_HOME/data/hints. +# hints_directory: /var/lib/cassandra/hints + +# How often hints should be flushed from the internal buffers to disk. +# Will *not* trigger fsync. +hints_flush_period_in_ms: 10000 + +# Maximum size for a single hints file, in megabytes. +max_hints_file_size_in_mb: 128 + +# Compression to apply to the hint files. If omitted, hints files +# will be written uncompressed. LZ4, Snappy, and Deflate compressors +# are supported. +#hints_compression: +# - class_name: LZ4Compressor +# parameters: +# - + +# Maximum throttle in KBs per second, total. This will be +# reduced proportionally to the number of nodes in the cluster. 
+batchlog_replay_throttle_in_kb: 1024 + +# Authentication backend, implementing IAuthenticator; used to identify users +# Out of the box, Cassandra provides org.apache.cassandra.auth.{AllowAllAuthenticator, +# PasswordAuthenticator}. +# +# - AllowAllAuthenticator performs no checks - set it to disable authentication. +# - PasswordAuthenticator relies on username/password pairs to authenticate +# users. It keeps usernames and hashed passwords in system_auth.roles table. +# Please increase system_auth keyspace replication factor if you use this authenticator. +# If using PasswordAuthenticator, CassandraRoleManager must also be used (see below) +authenticator: PasswordAuthenticator + +# Authorization backend, implementing IAuthorizer; used to limit access/provide permissions +# Out of the box, Cassandra provides org.apache.cassandra.auth.{AllowAllAuthorizer, +# CassandraAuthorizer}. +# +# - AllowAllAuthorizer allows any action to any user - set it to disable authorization. +# - CassandraAuthorizer stores permissions in system_auth.role_permissions table. Please +# increase system_auth keyspace replication factor if you use this authorizer. +authorizer: CassandraAuthorizer + +# Part of the Authentication & Authorization backend, implementing IRoleManager; used +# to maintain grants and memberships between roles. +# Out of the box, Cassandra provides org.apache.cassandra.auth.CassandraRoleManager, +# which stores role information in the system_auth keyspace. Most functions of the +# IRoleManager require an authenticated login, so unless the configured IAuthenticator +# actually implements authentication, most of this functionality will be unavailable. +# +# - CassandraRoleManager stores role data in the system_auth keyspace. Please +# increase system_auth keyspace replication factor if you use this role manager. 
+role_manager: CassandraRoleManager + +# Network authorization backend, implementing INetworkAuthorizer; used to restrict user +# access to certain DCs +# Out of the box, Cassandra provides org.apache.cassandra.auth.{AllowAllNetworkAuthorizer, +# CassandraNetworkAuthorizer}. +# +# - AllowAllNetworkAuthorizer allows access to any DC to any user - set it to disable authorization. +# - CassandraNetworkAuthorizer stores permissions in system_auth.network_permissions table. Please +# increase system_auth keyspace replication factor if you use this authorizer. +network_authorizer: AllowAllNetworkAuthorizer + +# Validity period for roles cache (fetching granted roles can be an expensive +# operation depending on the role manager, CassandraRoleManager is one example) +# Granted roles are cached for authenticated sessions in AuthenticatedUser and +# after the period specified here, become eligible for (async) reload. +# Defaults to 2000, set to 0 to disable caching entirely. +# Will be disabled automatically for AllowAllAuthenticator. +roles_validity_in_ms: 2000 + +# Refresh interval for roles cache (if enabled). +# After this interval, cache entries become eligible for refresh. Upon next +# access, an async reload is scheduled and the old value returned until it +# completes. If roles_validity_in_ms is non-zero, then this must be +# also. +# Defaults to the same value as roles_validity_in_ms. +# roles_update_interval_in_ms: 2000 + +# Validity period for permissions cache (fetching permissions can be an +# expensive operation depending on the authorizer, CassandraAuthorizer is +# one example). Defaults to 2000, set to 0 to disable. +# Will be disabled automatically for AllowAllAuthorizer. +permissions_validity_in_ms: 2000 + +# Refresh interval for permissions cache (if enabled). +# After this interval, cache entries become eligible for refresh. Upon next +# access, an async reload is scheduled and the old value returned until it +# completes. 
If permissions_validity_in_ms is non-zero, then this must be +# also. +# Defaults to the same value as permissions_validity_in_ms. +# permissions_update_interval_in_ms: 2000 + +# Validity period for credentials cache. This cache is tightly coupled to +# the provided PasswordAuthenticator implementation of IAuthenticator. If +# another IAuthenticator implementation is configured, this cache will not +# be automatically used and so the following settings will have no effect. +# Please note, credentials are cached in their encrypted form, so while +# activating this cache may reduce the number of queries made to the +# underlying table, it may not bring a significant reduction in the +# latency of individual authentication attempts. +# Defaults to 2000, set to 0 to disable credentials caching. +credentials_validity_in_ms: 2000 + +# Refresh interval for credentials cache (if enabled). +# After this interval, cache entries become eligible for refresh. Upon next +# access, an async reload is scheduled and the old value returned until it +# completes. If credentials_validity_in_ms is non-zero, then this must be +# also. +# Defaults to the same value as credentials_validity_in_ms. +# credentials_update_interval_in_ms: 2000 + +# The partitioner is responsible for distributing groups of rows (by +# partition key) across nodes in the cluster. The partitioner can NOT be +# changed without reloading all data. If you are adding nodes or upgrading, +# you should set this to the same partitioner that you are currently using. +# +# The default partitioner is the Murmur3Partitioner. Older partitioners +# such as the RandomPartitioner, ByteOrderedPartitioner, and +# OrderPreservingPartitioner have been included for backward compatibility only. +# For new clusters, you should NOT change this value. +# +partitioner: org.apache.cassandra.dht.Murmur3Partitioner + +# Directories where Cassandra should store data on disk. 
If multiple +# directories are specified, Cassandra will spread data evenly across +# them by partitioning the token ranges. +# If not set, the default directory is $CASSANDRA_HOME/data/data. +# data_file_directories: +# - /var/lib/cassandra/data + +# Metadata directory that holds information about the cluster, local node and its peers. +# Currently, only a single subdirectory called 'nodes' will be used. +# If not set, the default directory is $CASSANDRA_HOME/data/metadata. +# metadata_directory: /var/lib/cassandra/metadata + +# Directory where Cassandra should store the data of the local system keyspaces. +# By default Cassandra will store the data of the local system keyspaces in the first of the data directories specified +# by data_file_directories. +# This approach ensures that if one of the other disks is lost Cassandra can continue to operate. For extra security +# this setting allows storing that data in a different directory that provides redundancy. +# local_system_data_file_directory: + +# commit log. when running on magnetic HDD, this should be a +# separate spindle than the data directories. +# If not set, the default directory is $CASSANDRA_HOME/data/commitlog. +# commitlog_directory: /var/lib/cassandra/commitlog + +# Enable / disable CDC functionality on a per-node basis. This modifies the logic used +# for write path allocation rejection (standard: never reject. cdc: reject Mutation +# containing a CDC-enabled table if at space limit in cdc_raw_directory). +cdc_enabled: false + +# CommitLogSegments are moved to this directory on flush if cdc_enabled: true and the +# segment contains mutations for a CDC-enabled table. This should be placed on a +# separate spindle than the data directories. If not set, the default directory is +# $CASSANDRA_HOME/data/cdc_raw. 
+# cdc_raw_directory: /var/lib/cassandra/cdc_raw + +# Policy for data disk failures: +# +# die +# shut down gossip and client transports and kill the JVM for any fs errors or +# single-sstable errors, so the node can be replaced. +# +# stop_paranoid +# shut down gossip and client transports even for single-sstable errors, +# kill the JVM for errors during startup. +# +# stop +# shut down gossip and client transports, leaving the node effectively dead, but +# can still be inspected via JMX, kill the JVM for errors during startup. +# +# best_effort +# stop using the failed disk and respond to requests based on +# remaining available sstables. This means you WILL see obsolete +# data at CL.ONE! +# +# ignore +# ignore fatal errors and let requests fail, as in pre-1.2 Cassandra +disk_failure_policy: stop + +# Policy for commit disk failures: +# +# die +# shut down the node and kill the JVM, so the node can be replaced. +# +# stop +# shut down the node, leaving the node effectively dead, but +# can still be inspected via JMX. +# +# stop_commit +# shutdown the commit log, letting writes collect but +# continuing to service reads, as in pre-2.0.5 Cassandra +# +# ignore +# ignore fatal errors and let the batches fail +commit_failure_policy: stop + +# Maximum size of the native protocol prepared statement cache +# +# Valid values are either "auto" (omitting the value) or a value greater than 0. +# +# Note that specifying too large a value will result in long running GCs and possibly +# out-of-memory errors. Keep the value at a small fraction of the heap. +# +# If you constantly see "prepared statements discarded in the last minute because +# cache limit reached" messages, the first step is to investigate the root cause +# of these messages and check whether prepared statements are used correctly - +# i.e. use bind markers for variable parts. +# +# Only change the default value if you really have more prepared statements than +# fit in the cache.
In most cases it is not necessary to change this value. +# Constantly re-preparing statements is a performance penalty. +# +# Default value ("auto") is 1/256th of the heap or 10MB, whichever is greater +prepared_statements_cache_size_mb: + +# Maximum size of the key cache in memory. +# +# Each key cache hit saves 1 seek and each row cache hit saves 2 seeks at the +# minimum, sometimes more. The key cache is fairly tiny for the amount of +# time it saves, so it's worthwhile to use it at large numbers. +# The row cache saves even more time, but must contain the entire row, +# so it is extremely space-intensive. It's best to only use the +# row cache if you have hot rows or static rows. +# +# NOTE: if you reduce the size, you may not get your hottest keys loaded on startup. +# +# Default value is empty to make it "auto" (min(5% of Heap (in MB), 100MB)). Set to 0 to disable key cache. +key_cache_size_in_mb: + +# Duration in seconds after which Cassandra should +# save the key cache. Caches are saved to saved_caches_directory as +# specified in this configuration file. +# +# Saved caches greatly improve cold-start speeds, and is relatively cheap in +# terms of I/O for the key cache. Row cache saving is much more expensive and +# has limited use. +# +# Default is 14400 or 4 hours. +key_cache_save_period: 14400 + +# Number of keys from the key cache to save +# Disabled by default, meaning all keys are going to be saved +# key_cache_keys_to_save: 100 + +# Row cache implementation class name. Available implementations: +# +# org.apache.cassandra.cache.OHCProvider +# Fully off-heap row cache implementation (default). +# +# org.apache.cassandra.cache.SerializingCacheProvider +# This is the row cache implementation available +# in previous releases of Cassandra. +# row_cache_class_name: org.apache.cassandra.cache.OHCProvider + +# Maximum size of the row cache in memory.
+# Please note that OHC cache implementation requires some additional off-heap memory to manage +# the map structures and some in-flight memory during operations before/after cache entries can be +# accounted against the cache capacity. This overhead is usually small compared to the whole capacity. +# Do not specify more memory than the system can afford in the worst usual situation and leave some +# headroom for OS block level cache. Never allow your system to swap. +# +# Default value is 0, to disable row caching. +row_cache_size_in_mb: 0 + +# Duration in seconds after which Cassandra should save the row cache. +# Caches are saved to saved_caches_directory as specified in this configuration file. +# +# Saved caches greatly improve cold-start speeds, and is relatively cheap in +# terms of I/O for the key cache. Row cache saving is much more expensive and +# has limited use. +# +# Default is 0 to disable saving the row cache. +row_cache_save_period: 0 + +# Number of keys from the row cache to save. +# Specify 0 (which is the default), meaning all keys are going to be saved +# row_cache_keys_to_save: 100 + +# Maximum size of the counter cache in memory. +# +# Counter cache helps to reduce counter locks' contention for hot counter cells. +# In case of RF = 1 a counter cache hit will cause Cassandra to skip the read before +# write entirely. With RF > 1 a counter cache hit will still help to reduce the duration +# of the lock hold, helping with hot counter cell updates, but will not allow skipping +# the read entirely. Only the local (clock, count) tuple of a counter cell is kept +# in memory, not the whole counter, so it's relatively cheap. +# +# NOTE: if you reduce the size, you may not get your hottest keys loaded on startup. +# +# Default value is empty to make it "auto" (min(2.5% of Heap (in MB), 50MB)). Set to 0 to disable counter cache. +# NOTE: if you perform counter deletes and rely on low gcgs, you should disable the counter cache.
+counter_cache_size_in_mb: + +# Duration in seconds after which Cassandra should +# save the counter cache (keys only). Caches are saved to saved_caches_directory as +# specified in this configuration file. +# +# Default is 7200 or 2 hours. +counter_cache_save_period: 7200 + +# Number of keys from the counter cache to save +# Disabled by default, meaning all keys are going to be saved +# counter_cache_keys_to_save: 100 + +# saved caches +# If not set, the default directory is $CASSANDRA_HOME/data/saved_caches. +# saved_caches_directory: /var/lib/cassandra/saved_caches + +# Number of seconds the server will wait for each cache (row, key, etc ...) to load while starting +# the Cassandra process. Setting this to a negative value is equivalent to disabling all cache loading on startup +# while still having the cache during runtime. +# cache_load_timeout_seconds: 30 + +# commitlog_sync may be either "periodic", "group", or "batch." +# +# When in batch mode, Cassandra won't ack writes until the commit log +# has been flushed to disk. Each incoming write will trigger the flush task. +# commitlog_sync_batch_window_in_ms is a deprecated value. Previously it had +# almost no value, and is being removed. +# +# commitlog_sync_batch_window_in_ms: 2 +# +# group mode is similar to batch mode, where Cassandra will not ack writes +# until the commit log has been flushed to disk. The difference is group +# mode will wait up to commitlog_sync_group_window_in_ms between flushes. +# +# commitlog_sync_group_window_in_ms: 1000 +# +# the default option is "periodic" where writes may be acked immediately +# and the CommitLog is simply synced every commitlog_sync_period_in_ms +# milliseconds. +commitlog_sync: periodic +commitlog_sync_period_in_ms: 10000 + +# When in periodic commitlog mode, the number of milliseconds to block writes +# while waiting for a slow disk flush to complete. +# periodic_commitlog_sync_lag_block_in_ms: + +# The size of the individual commitlog file segments. 
A commitlog +# segment may be archived, deleted, or recycled once all the data +# in it (potentially from each columnfamily in the system) has been +# flushed to sstables. +# +# The default size is 32, which is almost always fine, but if you are +# archiving commitlog segments (see commitlog_archiving.properties), +# then you probably want a finer granularity of archiving; 8 or 16 MB +# is reasonable. +# Max mutation size is also configurable via max_mutation_size_in_kb setting in +# cassandra-hcd.yaml. The default is half the size commitlog_segment_size_in_mb * 1024. +# This should be positive and less than 2048. +# +# NOTE: If max_mutation_size_in_kb is set explicitly then commitlog_segment_size_in_mb must +# be set to at least twice the size of max_mutation_size_in_kb / 1024 +# +commitlog_segment_size_in_mb: 32 + +# Compression to apply to the commit log. If omitted, the commit log +# will be written uncompressed. LZ4, Snappy, and Deflate compressors +# are supported. +# commitlog_compression: +# - class_name: LZ4Compressor +# parameters: +# - + +# Compression to apply to SSTables as they flush for compressed tables. +# Note that tables without compression enabled do not respect this flag. +# +# As high ratio compressors like LZ4HC, Zstd, and Deflate can potentially +# block flushes for too long, the default is to flush with a known fast +# compressor in those cases. Options are: +# +# none : Flush without compressing blocks but while still doing checksums. +# fast : Flush with a fast compressor. If the table is already using a +# fast compressor that compressor is used. +# table: Always flush with the same compressor that the table uses. This +# was the pre 4.0 behavior. +# +# flush_compression: fast + +# any class that implements the SeedProvider interface and has a +# constructor that takes a Map of parameters will do. +seed_provider: + # Addresses of hosts that are deemed contact points. 
+ # Cassandra nodes use this list of hosts to find each other and learn + # the topology of the ring. You must change this if you are running + # multiple nodes! + - class_name: org.apache.cassandra.locator.SimpleSeedProvider + parameters: + # seeds is actually a comma-delimited list of addresses. + # Ex: "<ip1>,<ip2>,<ip3>" + - seeds: "localhost" + +# For workloads with more data than can fit in memory, Cassandra's +# bottleneck will be reads that need to fetch data from +# disk. "concurrent_reads" should be set to (16 * number_of_drives) in +# order to allow the operations to enqueue low enough in the stack +# that the OS and drives can reorder them. Same applies to +# "concurrent_counter_writes", since counter writes read the current +# values before incrementing and writing them back. +# +# On the other hand, since writes are almost never IO bound, the ideal +# number of "concurrent_writes" is dependent on the number of cores in +# your system; (8 * number_of_cores) is a good rule of thumb. +concurrent_reads: 32 +concurrent_writes: 32 +concurrent_counter_writes: 32 + +# For materialized view writes, as there is a read involved, this should +# be limited by the lesser of concurrent reads or concurrent writes. +concurrent_materialized_view_writes: 32 + +# Maximum memory to use for inter-node and client-server networking buffers. +# +# Defaults to the smaller of 1/16 of heap or 128MB. This pool is allocated off-heap, +# so is in addition to the memory allocated for heap. The cache also has on-heap +# overhead which is roughly 128 bytes per chunk (i.e. 0.2% of the reserved size +# if the default 64k chunk size is used). +# Memory is only allocated when needed. +# networking_cache_size_in_mb: 128 + +# Enable the sstable chunk cache. The chunk cache will store recently accessed +# sections of the sstable in-memory as uncompressed buffers. +file_cache_enabled: true + +# Maximum memory to use for sstable chunk cache and buffer pooling.
+# 32MB of this are reserved for pooling buffers, the rest is used for chunk cache +# that holds uncompressed sstable chunks. +# Defaults to the smaller of 1/4 of heap or 512MB. This pool is allocated off-heap, +# so is in addition to the memory allocated for heap. The cache also has on-heap +# overhead which is roughly 128 bytes per chunk (i.e. 0.2% of the reserved size +# if the default 64k chunk size is used). +# Memory is only allocated when needed. +# file_cache_size_in_mb: 512 + +# Flag indicating whether to allocate on or off heap when the sstable buffer +# pool is exhausted, that is when it has exceeded the maximum memory +# file_cache_size_in_mb, beyond which it will not cache buffers but allocate on request. + +# buffer_pool_use_heap_if_exhausted: true + +# The strategy for optimizing disk read +# Possible values are: +# ssd (for solid state disks, the default) +# spinning (for spinning disks) +# disk_optimization_strategy: ssd + +# Total permitted memory to use for memtables. Cassandra will stop +# accepting writes when the limit is exceeded until a flush completes, +# and will trigger a flush based on memtable_cleanup_threshold +# If omitted, Cassandra will set both to 1/4 the size of the heap. +# memtable_heap_space_in_mb: 2048 +# memtable_offheap_space_in_mb: 2048 + +# memtable_cleanup_threshold is deprecated. The default calculation +# is the only reasonable choice. See the comments on memtable_flush_writers +# for more information. +# +# Ratio of occupied non-flushing memtable size to total permitted size +# that will trigger a flush of the largest memtable. Larger mct will +# mean larger flushes and hence less compaction, but also less concurrent +# flush activity which can make it difficult to keep your disks fed +# under heavy write load. +# +# memtable_cleanup_threshold defaults to 1 / (memtable_flush_writers + 1) +# memtable_cleanup_threshold: 0.11 + +# Specify the way Cassandra allocates and manages memtable memory. 
+# Options are: +# +# heap_buffers +# on heap nio buffers +# +# offheap_buffers +# off heap (direct) nio buffers +# +# offheap_objects +# off heap objects +memtable_allocation_type: offheap_objects + +# Limit memory usage for Merkle tree calculations during repairs. The default +# is 1/16th of the available heap. The main tradeoff is that smaller trees +# have less resolution, which can lead to over-streaming data. If you see heap +# pressure during repairs, consider lowering this, but you cannot go below +# one megabyte. If you see lots of over-streaming, consider raising +# this or using subrange repair. +# +# For more details see https://issues.apache.org/jira/browse/CASSANDRA-14096. +# +# repair_session_space_in_mb: + +# Total space to use for commit logs on disk. +# +# If space gets above this value, Cassandra will flush every dirty CF +# in the oldest segment and remove it. So a small total commitlog space +# will tend to cause more flush activity on less-active columnfamilies. +# +# The default value is the smaller of 8192, and 1/4 of the total space +# of the commitlog volume. +# +# commitlog_total_space_in_mb: 8192 + +# This sets the number of memtable flush writer threads per disk +# as well as the total number of memtables that can be flushed concurrently. +# These are generally a combination of compute and IO bound. +# +# Memtable flushing is more CPU efficient than memtable ingest and a single thread +# can keep up with the ingest rate of a whole server on a single fast disk +# until it temporarily becomes IO bound under contention typically with compaction. +# At that point you need multiple flush threads. At some point in the future +# it may become CPU bound all the time. +# +# You can tell if flushing is falling behind using the MemtablePool.BlockedOnAllocation +# metric which should be 0, but will be non-zero if threads are blocked waiting on flushing +# to free memory. +# +# memtable_flush_writers defaults to two for a single data directory. 
+# This means that two memtables can be flushed concurrently to the single data directory. +# If you have multiple data directories the default is one memtable flushing at a time +# but the flush will use a thread per data directory so you will get two or more writers. +# +# Two is generally enough to flush on a fast disk [array] mounted as a single data directory. +# Adding more flush writers will result in smaller more frequent flushes that introduce more +# compaction overhead. +# +# There is a direct tradeoff between number of memtables that can be flushed concurrently +# and flush size and frequency. More is not better; you just need enough flush writers +# to never stall waiting for flushing to free memory. +# +#memtable_flush_writers: 2 + +# Total space to use for change-data-capture logs on disk. +# +# If space gets above this value, Cassandra will throw WriteTimeoutException +# on Mutations including tables with CDC enabled. A CDCCompactor is responsible +# for parsing the raw CDC logs and deleting them when parsing is completed. +# +# The default value is the min of 4096 mb and 1/8th of the total space +# of the drive where cdc_raw_directory resides. +# cdc_total_space_in_mb: 4096 + +# When we hit our cdc_raw limit and the CDCCompactor is either running behind +# or experiencing backpressure, we check at the following interval to see if any +# new space for cdc-tracked tables has been made available. Defaults to 250ms +# cdc_free_space_check_interval_ms: 250 + +# A fixed memory pool size in MB for SSTable index summaries. If left +# empty, this will default to 5% of the heap size. If the memory usage of +# all index summaries exceeds this limit, SSTables with low read rates will +# shrink their index summaries in order to meet this limit. However, this +# is a best-effort process. In extreme conditions Cassandra may need to use +# more than this amount of memory. +index_summary_capacity_in_mb: + +# How frequently index summaries should be resampled.
This is done +# periodically to redistribute memory from the fixed-size pool to sstables +# proportional to their recent read rates. Setting to -1 will disable this +# process, leaving existing index summaries at their current sampling level. +index_summary_resize_interval_in_minutes: 60 + +# Whether to, when doing sequential writing, fsync() at intervals in +# order to force the operating system to flush the dirty +# buffers. Enable this to avoid sudden dirty buffer flushing from +# impacting read latencies. Almost always a good idea on SSDs; not +# necessarily on platters. +trickle_fsync: true +trickle_fsync_interval_in_kb: 10240 + +# TCP port, for commands and data +# For security reasons, you should not expose this port to the internet. Firewall it if needed. +storage_port: 7000 + +# SSL port, for legacy encrypted communication. This property is unused unless enabled in +# server_encryption_options (see below). As of cassandra 4.0, this property is deprecated +# as a single port can be used for either/both secure and insecure connections. +# For security reasons, you should not expose this port to the internet. Firewall it if needed. +ssl_storage_port: 7001 + +# Address or interface to bind to and tell other Cassandra nodes to connect to. +# You _must_ change this if you want multiple nodes to be able to communicate! +# +# Set listen_address OR listen_interface, not both. +# +# Leaving it blank leaves it up to InetAddress.getLocalHost(). This +# will always do the Right Thing _if_ the node is properly configured +# (hostname, name resolution, etc), and the Right Thing is to use the +# address associated with the hostname (it might not be). If unresolvable +# it will fall back to InetAddress.getLoopbackAddress(), which is wrong for production systems. +# +# Setting listen_address to 0.0.0.0 is always wrong. +# +listen_address: localhost + +# Set listen_address OR listen_interface, not both.
Interfaces must correspond +# to a single address, IP aliasing is not supported. +# listen_interface: eth0 + +# If you choose to specify the interface by name and the interface has an ipv4 and an ipv6 address +# you can specify which should be chosen using listen_interface_prefer_ipv6. If false the first ipv4 +# address will be used. If true the first ipv6 address will be used. Defaults to false preferring +# ipv4. If there is only one address it will be selected regardless of ipv4/ipv6. +# listen_interface_prefer_ipv6: false + +# Address to broadcast to other Cassandra nodes +# Leaving this blank will set it to the same value as listen_address +broadcast_address: localhost + +# When using multiple physical network interfaces, set this +# to true to listen on broadcast_address in addition to +# the listen_address, allowing nodes to communicate in both +# interfaces. +# Ignore this property if the network configuration automatically +# routes between the public and private networks such as EC2. +# listen_on_broadcast_address: false + +# Internode authentication backend, implementing IInternodeAuthenticator; +# used to allow/disallow connections from peer nodes. +# internode_authenticator: org.apache.cassandra.auth.AllowAllInternodeAuthenticator + +# Whether to start the native transport server. +# The address on which the native transport is bound is defined by rpc_address. +start_native_transport: true +# port for the CQL native transport to listen for clients on +# For security reasons, you should not expose this port to the internet. Firewall it if needed. +native_transport_port: 9042 +# Enabling native transport encryption in client_encryption_options allows you to either use +# encryption for the standard port or to use a dedicated, additional port along with the unencrypted +# standard native_transport_port. +# Enabling client encryption and keeping native_transport_port_ssl disabled will use encryption +# for native_transport_port. 
Setting native_transport_port_ssl to a different value +# from native_transport_port will use encryption for native_transport_port_ssl while +# keeping native_transport_port unencrypted. +# native_transport_port_ssl: 9142 +# The maximum threads for handling requests (note that idle threads are stopped +# after 30 seconds so there is no corresponding minimum setting). +# native_transport_max_threads: 128 +# +# The maximum size of allowed frame. Frames (requests) larger than this will +# be rejected as invalid. The default is 256MB. If you're changing this parameter, +# you may want to adjust max_value_size_in_mb accordingly. This should be positive and less than 2048. +# native_transport_max_frame_size_in_mb: 256 + +# The maximum number of concurrent client connections. +# The default is -1, which means unlimited. +# native_transport_max_concurrent_connections: -1 + +# The maximum number of concurrent client connections per source ip. +# The default is -1, which means unlimited. +# native_transport_max_concurrent_connections_per_ip: -1 + +# Controls whether Cassandra honors older, yet currently supported, protocol versions. +# The default is true, which means all supported protocols will be honored. +native_transport_allow_older_protocols: true + +# Controls when idle client connections are closed. Idle connections are ones that had neither reads +# nor writes for a time period. +# +# Clients may implement heartbeats by sending OPTIONS native protocol message after a timeout, which +# will reset idle timeout timer on the server side. To close idle client connections, corresponding +# values for heartbeat intervals have to be set on the client side. +# +# Idle connection timeouts are disabled by default. +# native_transport_idle_timeout_in_ms: 60000 + +# The address or interface to bind the native transport server to. +# +# Set rpc_address OR rpc_interface, not both. +# +# Leaving rpc_address blank has the same effect as on listen_address +# (i.e.
it will be based on the configured hostname of the node). +# +# Note that unlike listen_address, you can specify 0.0.0.0, but you must also +# set broadcast_rpc_address to a value other than 0.0.0.0. +# +# For security reasons, you should not expose this port to the internet. Firewall it if needed. +rpc_address: 0.0.0.0 + +# Set rpc_address OR rpc_interface, not both. Interfaces must correspond +# to a single address, IP aliasing is not supported. +# rpc_interface: eth1 + +# If you choose to specify the interface by name and the interface has an ipv4 and an ipv6 address +# you can specify which should be chosen using rpc_interface_prefer_ipv6. If false the first ipv4 +# address will be used. If true the first ipv6 address will be used. Defaults to false preferring +# ipv4. If there is only one address it will be selected regardless of ipv4/ipv6. +# rpc_interface_prefer_ipv6: false + +# RPC address to broadcast to drivers and other Cassandra nodes. This cannot +# be set to 0.0.0.0. If left blank, this will be set to the value of +# rpc_address. If rpc_address is set to 0.0.0.0, broadcast_rpc_address must +# be set. 
+broadcast_rpc_address: localhost + +# enable or disable keepalive on rpc/native connections +rpc_keepalive: true + +# Uncomment to set socket buffer size for internode communication +# Note that when setting this, the buffer size is limited by net.core.wmem_max +# and when not setting it, it is defined by net.ipv4.tcp_wmem +# See also: +# /proc/sys/net/core/wmem_max +# /proc/sys/net/core/rmem_max +# /proc/sys/net/ipv4/tcp_wmem +# /proc/sys/net/ipv4/tcp_rmem +# and 'man tcp' +# internode_socket_send_buffer_size_in_bytes: + +# Uncomment to set socket buffer size for internode communication +# Note that when setting this, the buffer size is limited by net.core.rmem_max +# and when not setting it, it is defined by net.ipv4.tcp_rmem +# internode_socket_receive_buffer_size_in_bytes: + +# Set to true to have Cassandra create a hard link to each sstable +# flushed or streamed locally in a backups/ subdirectory of the +# keyspace data. Removing these links is the operator's +# responsibility. +incremental_backups: false + +# Whether or not to take a snapshot before each compaction. Be +# careful using this option, since Cassandra won't clean up the +# snapshots for you. Mostly useful if you're paranoid when there +# is a data format change. +snapshot_before_compaction: false + +# Whether or not a snapshot is taken of the data before keyspace truncation +# or dropping of column families. The STRONGLY advised default of true +# should be used to provide data safety. If you set this flag to false, you will +# lose data on truncation or drop. +auto_snapshot: true + +# The act of creating or clearing a snapshot involves creating or removing +# potentially tens of thousands of links, which can cause significant performance +# impact, especially on consumer grade SSDs.
A non-zero value here can +# be used to throttle these links to avoid negative performance impact of +# taking and clearing snapshots +snapshot_links_per_second: 0 + +# Granularity of the collation index of rows within a partition. +# Increase if your rows are large, or if you have a very large +# number of rows per partition. The competing goals are these: +# +# - a smaller granularity means more index entries are generated +# and looking up rows within the partition by collation column +# is faster +# - but, Cassandra will keep the collation index in memory for hot +# rows (as part of the key cache), so a larger granularity means +# you can cache more hot rows +column_index_size_in_kb: 64 + +# Per sstable indexed key cache entries (the collation index in memory +# mentioned above) exceeding this size will not be held on heap. +# This means that only partition information is held on heap and the +# index entries are read from disk. +# +# Note that this size refers to the size of the +# serialized index information and not the size of the partition. +column_index_cache_size_in_kb: 2 + +# Number of simultaneous compactions to allow, NOT including +# validation "compactions" for anti-entropy repair. Simultaneous +# compactions can help preserve read performance in a mixed read/write +# workload, by mitigating the tendency of small sstables to accumulate +# during a single long running compaction. The default is usually +# fine and if you experience problems with compaction running too +# slowly or too fast, you should look at +# compaction_throughput_mb_per_sec first. +# +# concurrent_compactors defaults to the smaller of (number of disks, +# number of cores), with a minimum of 2 and a maximum of 8. +# +# If your data directories are backed by SSD, you should increase this +# to the number of cores. +#concurrent_compactors: 1 + +# Number of simultaneous repair validations to allow.
If not set or set to +# a value less than 1, it defaults to the value of concurrent_compactors. +# To set a value greater than concurrent_compactors at startup, the system +# property cassandra.allow_unlimited_concurrent_validations must be set to +# true. To dynamically resize to a value > concurrent_compactors on a running +# node, first call the bypassConcurrentValidatorsLimit method on the +# org.apache.cassandra.db:type=StorageService mbean +# concurrent_validations: 0 + +# Number of simultaneous materialized view builder tasks to allow. +concurrent_materialized_view_builders: 1 + +# Throttles compaction to the given total throughput across the entire +# system. The faster you insert data, the faster you need to compact in +# order to keep the sstable count down, but in general, setting this to +# 16 to 32 times the rate you are inserting data is more than sufficient. +# Setting this to 0 disables throttling. Note that this accounts for all types +# of compaction, including validation compaction (building Merkle trees +# for repairs). +compaction_throughput_mb_per_sec: 64 + +# When compacting, the replacement sstable(s) can be opened before they +# are completely written, and used in place of the prior sstables for +# any range that has been written. This helps to smoothly transfer reads +# between the sstables, reducing page cache churn and keeping hot rows hot +sstable_preemptive_open_interval_in_mb: 50 + +# Starting from 4.1 sstables support UUID based generation identifiers. They are disabled by default +# because once enabled, there is no easy way to downgrade. When the node is restarted with this option +# set to true, each newly created sstable will have a UUID based generation identifier and such files are +# not readable by previous Cassandra versions. At some point, this option will become true by default +# and eventually get removed from the configuration.
+# In Converged Cassandra, we enable this option by default +enable_uuid_sstable_identifiers: true + +# When enabled, permits Cassandra to zero-copy stream entire eligible +# SSTables between nodes, including every component. +# This speeds up the network transfer significantly subject to +# throttling specified by stream_throughput_outbound_megabits_per_sec. +# Enabling this will reduce the GC pressure on sending and receiving node. +# When unset, the default is enabled. While this feature tries to keep the +# disks balanced, it cannot guarantee it. This feature will be automatically +# disabled if internode encryption is enabled. +# stream_entire_sstables: true + +# Throttles all outbound streaming file transfers on this node to the +# given total throughput in Mbps. This is necessary because Cassandra does +# mostly sequential IO when streaming data during bootstrap or repair, which +# can lead to saturating the network connection and degrading rpc performance. +# When unset, the default is 200 Mbps or 25 MB/s. +# stream_throughput_outbound_megabits_per_sec: 200 + +# Throttles all streaming file transfer between the datacenters, +# this setting allows users to throttle inter dc stream throughput in addition +# to throttling all network stream traffic as configured with +# stream_throughput_outbound_megabits_per_sec +# When unset, the default is 200 Mbps or 25 MB/s +# inter_dc_stream_throughput_outbound_megabits_per_sec: 200 + +# Server side timeouts for requests. The server will return a timeout exception +# to the client if it can't complete an operation within the corresponding +# timeout. Those settings are a protection against: +# 1) having client wait on an operation that might never terminate due to some +# failures. +# 2) operations that use too much CPU/read too much data (leading to memory build +# up) by putting a limit to how long an operation will execute. +# For this reason, you should avoid putting these settings too high. 
In other words, +# if you are timing out requests because of underlying resource constraints then +# increasing the timeout will just cause more problems. Of course putting them too +# low is equally ill-advised since clients could get timeouts even for successful +# operations just because the timeout setting is too tight. + +# How long the coordinator should wait for read operations to complete. +# Lowest acceptable value is 10 ms. +read_request_timeout_in_ms: 5000 +# How long the coordinator should wait for seq or index scans to complete. +# Lowest acceptable value is 10 ms. +range_request_timeout_in_ms: 10000 +# How long the coordinator should wait for writes to complete. +# Lowest acceptable value is 10 ms. +write_request_timeout_in_ms: 2000 +# How long the coordinator should wait for counter writes to complete. +# Lowest acceptable value is 10 ms. +counter_write_request_timeout_in_ms: 5000 +# How long a coordinator should continue to retry a CAS operation +# that contends with other proposals for the same row. +# Lowest acceptable value is 10 ms. +cas_contention_timeout_in_ms: 1000 +# How long the coordinator should wait for truncates to complete +# (This can be much longer, because unless auto_snapshot is disabled +# we need to flush first so we can snapshot before removing the data.) +# Lowest acceptable value is 10 ms. +truncate_request_timeout_in_ms: 60000 +# The default timeout for other, miscellaneous operations. +# Lowest acceptable value is 10 ms. +request_timeout_in_ms: 10000 + +# Defensive settings for protecting Cassandra from true network partitions. +# See (CASSANDRA-14358) for details. +# +# The amount of time to wait for internode tcp connections to establish. 
+# internode_tcp_connect_timeout_in_ms: 2000 +# +# The amount of time unacknowledged data is allowed on a connection before we throw out the connection +# Note this is only supported on Linux + epoll, and it appears to behave oddly above a setting of 30000 +# (it takes much longer than 30s) as of Linux 4.12. If you want something that high set this to 0 +# which picks up the OS default and configure the net.ipv4.tcp_retries2 sysctl to be ~8. +# internode_tcp_user_timeout_in_ms: 30000 + +# The amount of time unacknowledged data is allowed on a streaming connection. +# The default is 5 minutes. Increase it or set it to 0 in order to increase the timeout. +# internode_streaming_tcp_user_timeout_in_ms: 300000 + +# Global, per-endpoint and per-connection limits imposed on messages queued for delivery to other nodes +# and waiting to be processed on arrival from other nodes in the cluster. These limits are applied to the on-wire +# size of the message being sent or received. +# +# The basic per-link limit is consumed in isolation before any endpoint or global limit is imposed. +# Each node-pair has three links: urgent, small and large. So any given node may have a maximum of +# N*3*(internode_application_send_queue_capacity_in_bytes+internode_application_receive_queue_capacity_in_bytes) +# messages queued without any coordination between them although in practice, with token-aware routing, only RF*tokens +# nodes should need to communicate with significant bandwidth. +# +# The per-endpoint limit is imposed on all messages exceeding the per-link limit, simultaneously with the global limit, +# on all links to or from a single node in the cluster. +# The global limit is imposed on all messages exceeding the per-link limit, simultaneously with the per-endpoint limit, +# on all links to or from any node in the cluster. 
+# +# internode_application_send_queue_capacity_in_bytes: 4194304 #4MiB +# internode_application_send_queue_reserve_endpoint_capacity_in_bytes: 134217728 #128MiB +# internode_application_send_queue_reserve_global_capacity_in_bytes: 536870912 #512MiB +# internode_application_receive_queue_capacity_in_bytes: 4194304 #4MiB +# internode_application_receive_queue_reserve_endpoint_capacity_in_bytes: 134217728 #128MiB +# internode_application_receive_queue_reserve_global_capacity_in_bytes: 536870912 #512MiB + + +# How long before a node logs slow queries. Select queries that take longer than +# this timeout to execute, will generate an aggregated log message, so that slow queries +# can be identified. Set this value to zero to disable slow query logging. +slow_query_log_timeout_in_ms: 500 + +# Enable operation timeout information exchange between nodes to accurately +# measure request timeouts. If disabled, replicas will assume that requests +# were forwarded to them instantly by the coordinator, which means that +# under overload conditions we will waste that much extra time processing +# already-timed-out requests. +# +# Warning: It is generally assumed that users have setup NTP on their clusters, and that clocks are modestly in sync, +# since this is a requirement for general correctness of last write wins. +#cross_node_timeout: true + +# Set keep-alive period for streaming +# This node will send a keep-alive message periodically with this period. +# If the node does not receive a keep-alive message from the peer for +# 2 keep-alive cycles the stream session times out and fail +# Default value is 300s (5 minutes), which means stalled stream +# times out in 10 minutes by default +# streaming_keep_alive_period_in_secs: 300 + +# Limit number of connections per host for streaming +# Increase this when you notice that joins are CPU-bound rather that network +# bound (for example a few nodes with big files). 
+# streaming_connections_per_host: 1 + + +# phi value that must be reached for a host to be marked down. +# most users should never need to adjust this. +# phi_convict_threshold: 8 + +# endpoint_snitch -- Set this to a class that implements +# IEndpointSnitch. The snitch has two functions: +# +# - it teaches Cassandra enough about your network topology to route +# requests efficiently +# - it allows Cassandra to spread replicas around your cluster to avoid +# correlated failures. It does this by grouping machines into +# "datacenters" and "racks." Cassandra will do its best not to have +# more than one replica on the same "rack" (which may not actually +# be a physical location) +# +# CASSANDRA WILL NOT ALLOW YOU TO SWITCH TO AN INCOMPATIBLE SNITCH +# ONCE DATA IS INSERTED INTO THE CLUSTER. This would cause data loss. +# This means that if you start with the default SimpleSnitch, which +# locates every node on "rack1" in "datacenter1", your only options +# if you need to add another datacenter are GossipingPropertyFileSnitch +# (and the older PFS). From there, if you want to migrate to an +# incompatible snitch like Ec2Snitch you can do it by adding new nodes +# under Ec2Snitch (which will locate them in a new "datacenter") and +# decommissioning the old ones. +# +# Out of the box, Cassandra provides: +# +# SimpleSnitch: +# Treats Strategy order as proximity. This can improve cache +# locality when disabling read repair. Only appropriate for +# single-datacenter deployments. +# +# GossipingPropertyFileSnitch +# This should be your go-to snitch for production use. The rack +# and datacenter for the local node are defined in +# cassandra-rackdc.properties and propagated to other nodes via +# gossip. If cassandra-topology.properties exists, it is used as a +# fallback, allowing migration from the PropertyFileSnitch. +# +# PropertyFileSnitch: +# Proximity is determined by rack and data center, which are +# explicitly configured in cassandra-topology.properties. 
+# +# Ec2Snitch: +# Appropriate for EC2 deployments in a single Region. Loads Region +# and Availability Zone information from the EC2 API. The Region is +# treated as the datacenter, and the Availability Zone as the rack. +# Only private IPs are used, so this will not work across multiple +# Regions. +# +# Ec2MultiRegionSnitch: +# Uses public IPs as broadcast_address to allow cross-region +# connectivity. (Thus, you should set seed addresses to the public +# IP as well.) You will need to open the storage_port or +# ssl_storage_port on the public IP firewall. (For intra-Region +# traffic, Cassandra will switch to the private IP after +# establishing a connection.) +# +# RackInferringSnitch: +# Proximity is determined by rack and data center, which are +# assumed to correspond to the 3rd and 2nd octet of each node's IP +# address, respectively. Unless this happens to match your +# deployment conventions, this is best used as an example of +# writing a custom Snitch class and is provided in that spirit. +# +# You can use a custom Snitch by setting this to the full class name +# of the snitch, which will be assumed to be on your classpath. +endpoint_snitch: GossipingPropertyFileSnitch + +# controls how often to perform the more expensive part of host score +# calculation +dynamic_snitch_update_interval_in_ms: 100 +# controls how often to reset all host scores, allowing a bad host to +# possibly recover +dynamic_snitch_reset_interval_in_ms: 600000 +# if set greater than zero, this will allow +# 'pinning' of replicas to hosts in order to increase cache capacity. +# The badness threshold will control how much worse the pinned host has to be +# before the dynamic snitch will prefer other replicas over it. This is +# expressed as a double which represents a percentage. Thus, a value of +# 0.2 means Cassandra would continue to prefer the static snitch values +# until the pinned host was 20% worse than the fastest. 
+dynamic_snitch_badness_threshold: 1.0 + +# Configure server-to-server internode encryption +# +# JVM and netty defaults for supported SSL socket protocols and cipher suites can +# be replaced using custom encryption options. This is not recommended +# unless you have policies in place that dictate certain settings, or +# need to disable vulnerable ciphers or protocols in case the JVM cannot +# be updated. +# +# FIPS compliant settings can be configured at JVM level and should not +# involve changing encryption settings here: +# https://docs.oracle.com/javase/8/docs/technotes/guides/security/jsse/FIPS.html +# +# **NOTE** this default configuration is an insecure configuration. If you need to +# enable server-to-server encryption generate server keystores (and truststores for mutual +# authentication) per: +# http://download.oracle.com/javase/8/docs/technotes/guides/security/jsse/JSSERefGuide.html#CreateKeystore +# Then perform the following configuration changes: +# +# Step 1: Set internode_encryption= and explicitly set optional=true. Restart all nodes +# +# Step 2: Set optional=false (or remove it) and if you generated truststores and want to use mutual +# auth set require_client_auth=true. Restart all nodes +server_encryption_options: + # On outbound connections, determine which type of peers to securely connect to. + # The available options are : + # none : Do not encrypt outgoing connections + # dc : Encrypt connections to peers in other datacenters but not within datacenters + # rack : Encrypt connections to peers in other racks but not within racks + # all : Always use encrypted connections + internode_encryption: none + # When set to true, encrypted and unencrypted connections are allowed on the storage_port + # This should _only be true_ while in unencrypted or transitional operation + # optional defaults to true if internode_encryption is none + # optional: true + # If enabled, will open up an encrypted listening socket on ssl_storage_port. 
Should only be used + # during upgrade to 4.0; otherwise, set to false. + enable_legacy_ssl_storage_port: false + # Set to a valid keystore if internode_encryption is dc, rack or all + keystore: conf/.keystore + keystore_password: cassandra + # Verify peer server certificates + require_client_auth: false + # Set to a valid trustore if require_client_auth is true + truststore: conf/.truststore + truststore_password: cassandra + # Verify that the host name in the certificate matches the connected host + require_endpoint_verification: false + # More advanced defaults: + # protocol: TLS + # store_type: JKS + # cipher_suites: [ + # TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384, TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256, + # TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256, TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA, + # TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA, TLS_RSA_WITH_AES_128_GCM_SHA256, TLS_RSA_WITH_AES_128_CBC_SHA, + # TLS_RSA_WITH_AES_256_CBC_SHA + # ] + +# Configure client-to-server encryption. +# +# **NOTE** this default configuration is an insecure configuration. If you need to +# enable client-to-server encryption generate server keystores (and truststores for mutual +# authentication) per: +# http://download.oracle.com/javase/8/docs/technotes/guides/security/jsse/JSSERefGuide.html#CreateKeystore +# Then perform the following configuration changes: +# +# Step 1: Set enabled=true and explicitly set optional=true. Restart all nodes +# +# Step 2: Set optional=false (or remove it) and if you generated truststores and want to use mutual +# auth set require_client_auth=true. Restart all nodes +client_encryption_options: + # Enable client-to-server encryption + enabled: false + # When set to true, encrypted and unencrypted connections are allowed on the native_transport_port + # This should _only be true_ while in unencrypted or transitional operation + # optional defaults to true when enabled is false, and false when enabled is true. 
+ # optional: true + # Set keystore and keystore_password to valid keystores if enabled is true + keystore: conf/.keystore + keystore_password: cassandra + # Verify client certificates + require_client_auth: false + # Set trustore and truststore_password if require_client_auth is true + # truststore: conf/.truststore + # truststore_password: cassandra + # More advanced defaults: + # protocol: TLS + # store_type: JKS + # cipher_suites: [ + # TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384, TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256, + # TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256, TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA, + # TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA, TLS_RSA_WITH_AES_128_GCM_SHA256, TLS_RSA_WITH_AES_128_CBC_SHA, + # TLS_RSA_WITH_AES_256_CBC_SHA + # ] + +# internode_compression controls whether traffic between nodes is +# compressed. +# Can be: +# +# all +# all traffic is compressed +# +# dc +# traffic between different datacenters is compressed +# +# none +# nothing is compressed. +internode_compression: dc + +# Enable or disable tcp_nodelay for inter-dc communication. +# Disabling it will result in larger (but fewer) network packets being sent, +# reducing overhead from the TCP protocol itself, at the cost of increasing +# latency if you block for cross-datacenter responses. +inter_dc_tcp_nodelay: false + +# TTL for different trace types used during logging of the repair process. +tracetype_query_ttl: 86400 +tracetype_repair_ttl: 604800 + +# If unset, all GC Pauses greater than gc_log_threshold_in_ms will log at +# INFO level +# UDFs (user defined functions) are disabled by default. +# As of Cassandra 3.0 there is a sandbox in place that should prevent execution of evil code. +enable_user_defined_functions: false + +# Enables scripted UDFs (JavaScript UDFs). +# Java UDFs are always enabled, if enable_user_defined_functions is true. +# Enable this option to be able to use UDFs with "language javascript" or any custom JSR-223 provider. 
+# This option has no effect, if enable_user_defined_functions is false. +enable_scripted_user_defined_functions: false + +# The default Windows kernel timer and scheduling resolution is 15.6ms for power conservation. +# Lowering this value on Windows can provide much tighter latency and better throughput, however +# some virtualized environments may see a negative performance impact from changing this setting +# below their system default. The sysinternals 'clockres' tool can confirm your system's default +# setting. +windows_timer_interval: 1 + + +# Enables encrypting data at-rest (on disk). Different key providers can be plugged in, but the default reads from +# a JCE-style keystore. A single keystore can hold multiple keys, but the one referenced by +# the "key_alias" is the only key that will be used for encrypt operations; previously used keys +# can still (and should!) be in the keystore and will be used on decrypt operations +# (to handle the case of key rotation). +# +# It is strongly recommended to download and install Java Cryptography Extension (JCE) +# Unlimited Strength Jurisdiction Policy Files for your version of the JDK. 
+# (current link: http://www.oracle.com/technetwork/java/javase/downloads/jce8-download-2133166.html) +# +# Currently, only the following file types are supported for transparent data encryption, although +# more are coming in future cassandra releases: commitlog, hints +transparent_data_encryption_options: + enabled: false + chunk_length_kb: 64 + cipher: AES/CBC/PKCS5Padding + key_alias: testing:1 + # CBC IV length for AES needs to be 16 bytes (which is also the default size) + # iv_length: 16 + key_provider: + - class_name: org.apache.cassandra.security.JKSKeyProvider + parameters: + - keystore: conf/.keystore + keystore_password: cassandra + store_type: JCEKS + key_password: cassandra + + +##################### +# SAFETY THRESHOLDS # +##################### + +# Filtering and secondary index queries at read consistency levels above ONE/LOCAL_ONE use a +# mechanism called replica filtering protection to ensure that results from stale replicas do +# not violate consistency. (See CASSANDRA-8272 and CASSANDRA-15907 for more details.) This +# mechanism materializes replica results by partition on-heap at the coordinator. The more possibly +# stale results returned by the replicas, the more rows materialized during the query. +replica_filtering_protection: + # These thresholds exist to limit the damage severely out-of-date replicas can cause during these + # queries. They limit the number of rows from all replicas individual index and filtering queries + # can materialize on-heap to return correct results at the desired read consistency level. + # + # "cached_replica_rows_warn_threshold" is the per-query threshold at which a warning will be logged. + # "cached_replica_rows_fail_threshold" is the per-query threshold at which the query will fail. + # + # These thresholds may also be adjusted at runtime using the StorageService mbean. 
+ # + # If the failure threshold is breached, it is likely that either the current page/fetch size + # is too large or one or more replicas is severely out-of-sync and in need of repair. + cached_rows_warn_threshold: 2000 + cached_rows_fail_threshold: 32000 + +# GC Pauses greater than 200 ms will be logged at INFO level +# This threshold can be adjusted to minimize logging if necessary +# gc_log_threshold_in_ms: 200 + +# GC Pauses greater than gc_warn_threshold_in_ms will be logged at WARN level +# Adjust the threshold based on your application throughput requirement. Setting to 0 +# will deactivate the feature. +# gc_warn_threshold_in_ms: 1000 + +# Maximum size of any value in SSTables. Safety measure to detect SSTable corruption +# early. Any value size larger than this threshold will result into marking an SSTable +# as corrupted. This should be positive and less than 2048. +# max_value_size_in_mb: 256 + +# Track a metric per keyspace indicating whether replication achieved the ideal consistency +# level for writes without timing out. This is different from the consistency level requested by +# each write which may be lower in order to facilitate availability. +# ideal_consistency_level: EACH_QUORUM + +# Automatically upgrade sstables after upgrade - if there is no ordinary compaction to do, the +# oldest non-upgraded sstable will get upgraded to the latest version +# automatic_sstable_upgrade: false +# Limit the number of concurrent sstable upgrades +# max_concurrent_automatic_sstable_upgrades: 1 + +# Audit logging - Logs every incoming CQL command request, authentication to a node. See the docs +# on audit_logging for full details about the various configuration options. 
+audit_logging_options: + enabled: false + logger: + - class_name: BinAuditLogger + # audit_logs_dir: + # included_keyspaces: + # excluded_keyspaces: system, system_schema, system_virtual_schema + # included_categories: + # excluded_categories: + # included_users: + # excluded_users: + # roll_cycle: HOURLY + # block: true + # max_queue_weight: 268435456 # 256 MiB + # max_log_size: 17179869184 # 16 GiB + ## archive command is "/path/to/script.sh %path" where %path is replaced with the file being rolled: + # archive_command: + # max_archive_retries: 10 + + + # default options for full query logging - these can be overridden from command line when executing + # nodetool enablefullquerylog + #full_query_logging_options: + # log_dir: + # roll_cycle: HOURLY + # block: true + # max_queue_weight: 268435456 # 256 MiB + # max_log_size: 17179869184 # 16 GiB + ## archive command is "/path/to/script.sh %path" where %path is replaced with the file being rolled: + # archive_command: + # max_archive_retries: 10 + +# validate tombstones on reads and compaction +# can be either "disabled", "warn" or "exception" +# corrupted_tombstone_strategy: disabled + +# Diagnostic Events # +# If enabled, diagnostic events can be helpful for troubleshooting operational issues. Emitted events contain details +# on internal state and temporal relationships across events, accessible by clients via JMX. +diagnostic_events_enabled: false + +# Use native transport TCP message coalescing. If on upgrade to 4.0 you found your throughput decreasing, and in +# particular you run an old kernel or have very fewer client connections, this option might be worth evaluating. +#native_transport_flush_in_batches_legacy: false + +# Enable tracking of repaired state of data during reads and comparison between replicas +# Mismatches between the repaired sets of replicas can be characterized as either confirmed +# or unconfirmed. 
In this context, unconfirmed indicates that the presence of pending repair +# sessions, unrepaired partition tombstones, or some other condition means that the disparity +# cannot be considered conclusive. Confirmed mismatches should be a trigger for investigation +# as they may be indicative of corruption or data loss. +# There are separate flags for range vs partition reads as single partition reads are only tracked +# when CL > 1 and a digest mismatch occurs. Currently, range queries don't use digests so if +# enabled for range reads, all range reads will include repaired data tracking. As this adds +# some overhead, operators may wish to disable it whilst still enabling it for partition reads +repaired_data_tracking_for_range_reads_enabled: false +repaired_data_tracking_for_partition_reads_enabled: false +# If false, only confirmed mismatches will be reported. If true, a separate metric for unconfirmed +# mismatches will also be recorded. This is to avoid potential signal:noise issues, as unconfirmed +# mismatches are less actionable than confirmed ones. +report_unconfirmed_repaired_data_mismatches: false + +# Having many tables and/or keyspaces negatively affects performance of many operations in the +# cluster. When the number of tables/keyspaces in the cluster exceeds the following thresholds +# a client warning will be sent back to the user when creating a table or keyspace. +# table_count_warn_threshold: 150 +# keyspace_count_warn_threshold: 40 + +# This is the page size used internally by aggregation queries. It aims to limit the memory used by aggregation +# queries when there is a lot of data to aggregate. +# aggregation_subpage_size_in_kb: 2048 + +######################### +# EXPERIMENTAL FEATURES # +######################### + +# Enables materialized view creation on this node. +# Materialized views are considered experimental and are not recommended for production use. +enable_materialized_views: false + +# Enables SASI index creation on this node. 
+# SASI indexes are considered experimental and are not recommended for production use. +enable_sasi_indexes: false + +# Enables creation of transiently replicated keyspaces on this node. +# Transient replication is experimental and is not recommended for production use. +enable_transient_replication: false + +# Enables the use of 'ALTER ... DROP COMPACT STORAGE' statements on this node. +# 'ALTER ... DROP COMPACT STORAGE' is considered experimental and is not recommended for production use. +enable_drop_compact_storage: false + + # Emulates DataStax Constellation database-as-a-service defaults. + # + # When enabled, some defaults are modified to match those used by DataStax Constellation (DataStax cloud data + # platform). This includes (but is not limited to) stricter guardrails defaults. + # + # This can be used as a convenience to develop and test applications meant to run on DataStax Constellation. + # + # Warning: when enabled, the updated defaults reflect those of DataStax Constellation _at the time_ of the currently + # used DSE release. This is a best-effort emulation of said defaults. Further, all nodes must use the same + # config value. + # emulate_dbaas_defaults: false + + # Guardrails settings. + # guardrails: + # When executing a scan, within or across a partition, we need to keep the + # tombstones seen in memory so we can return them to the coordinator, which + # will use them to make sure other replicas also know about the deleted rows. + # With workloads that generate a lot of tombstones, this can cause performance + # problems and even exhaust the server heap. + # (http://www.datastax.com/dev/blog/cassandra-anti-patterns-queues-and-queue-like-datasets) + # Adjust the thresholds here if you understand the dangers and want to + # scan more tombstones anyway. These thresholds may also be adjusted at runtime + # using the StorageService mbean. 
+ # + # Default tombstone_warn_threshold is 1000, may differ if emulate_dbaas_defaults is enabled + # Default tombstone_failure_threshold is 100000, may differ if emulate_dbaas_defaults is enabled + # tombstone_warn_threshold: 1000 + # tombstone_failure_threshold: 100000 + + # Log a warning when compacting partitions larger than this value. + # Default value is 100mb, may differ if emulate_dbaas_defaults is enabled + # partition_size_warn_threshold_in_mb: 100 + + # Log WARN on any multiple-partition batch size that exceeds this value. 64kb per batch by default. + # Use caution when increasing the size of this threshold as it can lead to node instability. + # Default value is 64kb, may differ if emulate_dbaas_defaults is enabled + # batch_size_warn_threshold_in_kb: 64 + + # Fail any multiple-partition batch that exceeds this value. The calculated default is 640kb (10x warn threshold). + # Default value is 640kb, may differ if emulate_dbaas_defaults is enabled + # batch_size_fail_threshold_in_kb: 640 + + # Log WARN on any batches not of type LOGGED than span across more partitions than this limit. + # Default value is 10, may differ if emulate_dbaas_defaults is enabled + # unlogged_batch_across_partitions_warn_threshold: 10 + + # Failure threshold to prevent writing large column value into Cassandra. + # Default -1 to disable, may differ if emulate_dbaas_defaults is enabled + # column_value_size_failure_threshold_in_kb: -1 + + # Failure threshold to prevent creating more columns per table than threshold. + # Default -1 to disable, may differ if emulate_dbaas_defaults is enabled + # columns_per_table_failure_threshold: -1 + + # Failure threshold to prevent creating more fields in user-defined-type than threshold. + # Default -1 to disable, may differ if emulate_dbaas_defaults is enabled + # fields_per_udt_failure_threshold: -1 + + # Warning threshold to warn when encountering larger size of collection data than threshold. 
+ # Default -1 to disable, may differ if emulate_dbaas_defaults is enabled + # collection_size_warn_threshold_in_kb: -1 + + # Warning threshold to warn when encountering more elements in collection than threshold. + # Default -1 to disable, may differ if emulate_dbaas_defaults is enabled + # items_per_collection_warn_threshold: -1 + + # Whether read-before-write operation is allowed, eg. setting list element by index, removing list element + # by index. Note: LWT is always allowed. + # Default true to allow read before write operation, may differ if emulate_dbaas_defaults is enabled + # read_before_write_list_operations_enabled: true + + # Failure threshold to prevent creating more secondary index per table than threshold (does not apply to CUSTOM INDEX StorageAttachedIndex) + # Default -1 to disable, may differ if emulate_dbaas_defaults is enabled + # secondary_index_per_table_failure_threshold: -1 + + # Failure threshold for number of StorageAttachedIndex per table (only applies to CUSTOM INDEX StorageAttachedIndex) + # Default is 10 (same when emulate_dbaas_defaults is enabled) + # sai_indexes_per_table_failure_threshold: 10 + # + # Failure threshold for total number of StorageAttachedIndex across all keyspaces (only applies to CUSTOM INDEX StorageAttachedIndex) + # Default is 10 (same when emulate_dbaas_defaults is enabled) + # sai_indexes_total_failure_threshold: 100 + + # Failure threshold to prevent creating more materialized views per table than threshold. + # Default -1 to disable, may differ if emulate_dbaas_defaults is enabled + # materialized_view_per_table_failure_threshold: -1 + + # Warn threshold to warn creating more tables than threshold. + # Default -1 to disable, may differ if emulate_dbaas_defaults is enabled + # tables_warn_threshold: -1 + + # Failure threshold to prevent creating more tables than threshold. 
+ # Default -1 to disable, may differ if emulate_dbaas_defaults is enabled + # tables_failure_threshold: -1 + + # Preventing creating tables with provided configurations. + # Default all properties are allowed, may differ if emulate_dbaas_defaults is enabled + # table_properties_disallowed: + + # Whether to allow user-provided timestamp in write request + # Default true to allow user-provided timestamp, may differ if emulate_dbaas_defaults is enabled + # user_timestamps_enabled: true + + # Preventing query with provided consistency levels + # Default all consistency levels are allowed. + # write_consistency_levels_disallowed: + + # Failure threshold to prevent providing larger paging by bytes than threshold, also served as a hard paging limit + # when paging by rows is used. + # Default -1 to disable, may differ if emulate_dbaas_defaults is enabled + # page_size_failure_threshold_in_kb: -1 + + # Failure threshold to prevent IN query creating size of cartesian product exceeding threshold, eg. + # "a in (1,2,...10) and b in (1,2...10)" results in cartesian product of 100. + # Default -1 to disable, may differ if emulate_dbaas_defaults is enabled + # in_select_cartesian_product_failure_threshold: -1 + + # Failure threshold to prevent IN query containing more partition keys than threshold + # Default -1 to disable, may differ if emulate_dbaas_defaults is enabled + # partition_keys_in_select_failure_threshold: -1 + + # Warning threshold to warn when local disk usage exceeding threshold. Valid values: (1, 100] + # Default -1 to disable, may differ if emulate_dbaas_defaults is enabled + # disk_usage_percentage_warn_threshold: -1 + + # Failure threshold to reject write requests if replica disk usage exceeding threshold. 
Valid values: (1, 100] + # Default -1 to disable, may differ if emulate_dbaas_defaults is enabled + # disk_usage_percentage_failure_threshold: -1 + + # Allows configuring max disk size of data directories when calculating thresholds for disk_usage_percentage_warn_threshold + # and disk_usage_percentage_failure_threshold. Valid values: (1, max available disk size of all data directories] + # Default -1 to disable and use the physically available disk size of data directories during calculations. +# may differ if emulate_dbaas_defaults is enabled +# disk_usage_max_disk_size_in_gb: -1 diff --git a/tests/hcd_compose/docker-compose.yml b/tests/hcd_compose/docker-compose.yml new file mode 100644 index 00000000..edef32c1 --- /dev/null +++ b/tests/hcd_compose/docker-compose.yml @@ -0,0 +1,51 @@ +version: '3' + +services: + hcd: + image: datastax/hcd:1.0.0-early-preview + networks: + - stargate + mem_limit: 2G + environment: + - MAX_HEAP_SIZE=1536M + - CLUSTER_NAME=hcd-1.0.0-early-preview.1-cluster + - DS_LICENSE=accept + - HCD_AUTO_CONF_OFF=cassandra.yaml + volumes: + - ./cassandra-hcd.yaml:/opt/hcd/resources/cassandra/conf/cassandra.yaml:rw + ports: + - "9042:9042" + healthcheck: + test: [ "CMD-SHELL", "cqlsh -u cassandra -p cassandra -e 'describe keyspaces'" ] + interval: 15s + timeout: 10s + retries: 20 + + data-api: + image: stargateio/data-api:v1.0.12 + depends_on: + hcd: + condition: service_healthy + networks: + - stargate + ports: + - "8181:8181" + mem_limit: 2G + environment: + - JAVA_MAX_MEM_RATIO=75 + - JAVA_INITIAL_MEM_RATIO=50 + - STARGATE_DATA_STORE_IGNORE_BRIDGE=true + - GC_CONTAINER_OPTIONS=-XX:+UseG1GC + - STARGATE_JSONAPI_OPERATIONS_DATABASE_CONFIG_CASSANDRA_END_POINTS=hcd + - STARGATE_JSONAPI_OPERATIONS_DATABASE_CONFIG_LOCAL_DATACENTER=dc1 + - QUARKUS_HTTP_ACCESS_LOG_ENABLED=FALSE + - QUARKUS_LOG_LEVEL=INFO + - STARGATE_JSONAPI_OPERATIONS_VECTORIZE_ENABLED=true + - JAVA_OPTS_APPEND=-Dquarkus.http.host=0.0.0.0 
-Djava.util.logging.manager=org.jboss.logmanager.LogManager + healthcheck: + test: curl -f http://localhost:8181/stargate/health || exit 1 + interval: 5s + timeout: 10s + retries: 10 +networks: + stargate: diff --git a/tests/idiomatic/conftest.py b/tests/idiomatic/conftest.py index 14f41f80..358a93ec 100644 --- a/tests/idiomatic/conftest.py +++ b/tests/idiomatic/conftest.py @@ -22,6 +22,9 @@ from astrapy.constants import VectorMetric from ..conftest import ( + ADMIN_ENV_LIST, + ADMIN_ENV_VARIABLE_MAP, + DO_IDIOMATIC_ADMIN_TESTS, IS_ASTRA_DB, SECONDARY_NAMESPACE, DataAPICredentials, @@ -55,15 +58,6 @@ def sync_database( namespace=data_api_credentials_kwargs["namespace"], ) - if not IS_ASTRA_DB: - # ensure keyspace(s) exist - database_admin = database.get_database_admin() - database_admin.create_namespace(data_api_credentials_kwargs["namespace"]) - if data_api_credentials_info["secondary_namespace"]: - database_admin.create_namespace( - data_api_credentials_info["secondary_namespace"] - ) - yield database @@ -139,5 +133,8 @@ def async_empty_collection( "async_database", "async_fail_if_not_removed", "IS_ASTRA_DB", + "ADMIN_ENV_LIST", + "ADMIN_ENV_VARIABLE_MAP", + "DO_IDIOMATIC_ADMIN_TESTS", "SECONDARY_NAMESPACE", ] diff --git a/tests/idiomatic/integration/test_admin.py b/tests/idiomatic/integration/test_admin.py index 0520b195..ff4f04dc 100644 --- a/tests/idiomatic/integration/test_admin.py +++ b/tests/idiomatic/integration/test_admin.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import os import time from typing import Any, Awaitable, Callable, List, Optional, Tuple @@ -21,7 +20,12 @@ from astrapy import DataAPIClient from astrapy.admin import API_ENDPOINT_TEMPLATE_MAP -ENV_LIST = ["prod", "dev"] +from ..conftest import ( + ADMIN_ENV_LIST, + ADMIN_ENV_VARIABLE_MAP, + DO_IDIOMATIC_ADMIN_TESTS, + IS_ASTRA_DB, +) NAMESPACE_POLL_SLEEP_TIME = 2 NAMESPACE_TIMEOUT = 30 @@ -31,32 +35,20 @@ PRE_DROP_SAFETY_TIMEOUT = 120 -DO_IDIOMATIC_ADMIN_TESTS: bool -if "DO_IDIOMATIC_ADMIN_TESTS" in os.environ: - _do_idiomatic_admin_tests = os.environ["DO_IDIOMATIC_ADMIN_TESTS"] - if _do_idiomatic_admin_tests.strip(): - DO_IDIOMATIC_ADMIN_TESTS = int(_do_idiomatic_admin_tests) != 0 - else: - DO_IDIOMATIC_ADMIN_TESTS = False -else: - DO_IDIOMATIC_ADMIN_TESTS = False - - def admin_test_envs_tokens() -> List[Any]: """ This actually returns a List of `_pytest.mark.structures.ParameterSet` instances, each wrapping a Tuple[str, Optional[str]] = (env, token) """ envs_tokens: List[Any] = [] - for env in ENV_LIST: - varname = f"{env.upper()}_ADMIN_TEST_ASTRA_DB_APPLICATION_TOKEN" + for admin_env in ADMIN_ENV_LIST: markers = [] pair: Tuple[str, Optional[str]] - if varname in os.environ: - pair = (env, os.environ[varname]) + if ADMIN_ENV_VARIABLE_MAP[admin_env]["token"]: + pair = (admin_env, ADMIN_ENV_VARIABLE_MAP[admin_env]["token"]) else: - pair = (env, None) - markers.append(pytest.mark.skip(reason=f"{env} token not available")) + pair = (admin_env, None) + markers.append(pytest.mark.skip(reason=f"{admin_env} token not available")) envs_tokens.append(pytest.param(pair, marks=markers)) return envs_tokens @@ -82,11 +74,16 @@ async def await_until_true( time.sleep(poll_interval) +@pytest.mark.skipif(not IS_ASTRA_DB, reason="Not supported outside of Astra DB") @pytest.mark.skipif(not DO_IDIOMATIC_ADMIN_TESTS, reason="Admin tests are suppressed") class TestAdmin: - @pytest.mark.parametrize("env_token", admin_test_envs_tokens(), ids=ENV_LIST) + @pytest.mark.parametrize( + 
"admin_env_token", admin_test_envs_tokens(), ids=ADMIN_ENV_LIST + ) @pytest.mark.describe("test of the full tour with AstraDBDatabaseAdmin, sync") - def test_astra_db_database_admin_sync(self, env_token: Tuple[str, str]) -> None: + def test_astra_db_database_admin_sync( + self, admin_env_token: Tuple[str, str] + ) -> None: """ Test plan (it has to be a single giant test to use one DB throughout): - create client -> get_admin @@ -103,17 +100,17 @@ def test_astra_db_database_admin_sync(self, env_token: Tuple[str, str]) -> None: - drop database (wait) - check DB not existings """ - env, token = env_token - db_name = f"test_database_{env}" - db_provider = os.environ[f"{env.upper()}_ADMIN_TEST_ASTRA_DB_PROVIDER"] - db_region = os.environ[f"{env.upper()}_ADMIN_TEST_ASTRA_DB_REGION"] + admin_env, token = admin_env_token + db_name = f"test_database_{admin_env}" + db_provider = ADMIN_ENV_VARIABLE_MAP[admin_env]["provider"] + db_region = ADMIN_ENV_VARIABLE_MAP[admin_env]["region"] # create client, get admin client: DataAPIClient - if env == "prod": + if admin_env == "prod": client = DataAPIClient(token) else: - client = DataAPIClient(token, environment=env) + client = DataAPIClient(token, environment=admin_env) admin = client.get_admin() # create a db (wait) @@ -196,11 +193,13 @@ def test_astra_db_database_admin_sync(self, env_token: Tuple[str, str]) -> None: db_ids = {db.id for db in admin.list_databases()} assert created_db_id not in db_ids - @pytest.mark.parametrize("env_token", admin_test_envs_tokens(), ids=ENV_LIST) + @pytest.mark.parametrize( + "admin_env_token", admin_test_envs_tokens(), ids=ADMIN_ENV_LIST + ) @pytest.mark.describe( "test of the full tour with AstraDBAdmin and client methods, sync" ) - def test_astra_db_admin_sync(self, env_token: Tuple[str, str]) -> None: + def test_astra_db_admin_sync(self, admin_env_token: Tuple[str, str]) -> None: """ Test plan (it has to be a single giant test to use the two DBs throughout): - create client -> get_admin @@ 
-216,18 +215,18 @@ def test_astra_db_admin_sync(self, env_token: Tuple[str, str]) -> None: - get_async_database and check == with above - drop dbs, (wait, nonwait) """ - env, token = env_token - db_name_w = f"test_database_w_{env}" - db_name_nw = f"test_database_nw_{env}" - db_provider = os.environ[f"{env.upper()}_ADMIN_TEST_ASTRA_DB_PROVIDER"] - db_region = os.environ[f"{env.upper()}_ADMIN_TEST_ASTRA_DB_REGION"] + admin_env, token = admin_env_token + db_name_w = f"test_database_w_{admin_env}" + db_name_nw = f"test_database_nw_{admin_env}" + db_provider = ADMIN_ENV_VARIABLE_MAP[admin_env]["provider"] + db_region = ADMIN_ENV_VARIABLE_MAP[admin_env]["region"] # create client and get admin client: DataAPIClient - if env == "prod": + if admin_env == "prod": client = DataAPIClient(token) else: - client = DataAPIClient(token, environment=env) + client = DataAPIClient(token, environment=admin_env) admin = client.get_admin() # create the two dbs @@ -265,7 +264,7 @@ def _waiter1() -> bool: assert db_w_info.id == created_db_id_w # get and compare dbs obtained by the client - synthetic_api_endpoint = API_ENDPOINT_TEMPLATE_MAP[env].format( + synthetic_api_endpoint = API_ENDPOINT_TEMPLATE_MAP[admin_env].format( database_id=created_db_id_w, region=db_region, ) @@ -320,10 +319,12 @@ def _waiter2() -> bool: condition=_waiter2, ) - @pytest.mark.parametrize("env_token", admin_test_envs_tokens(), ids=ENV_LIST) + @pytest.mark.parametrize( + "admin_env_token", admin_test_envs_tokens(), ids=ADMIN_ENV_LIST + ) @pytest.mark.describe("test of the full tour with AstraDBDatabaseAdmin, async") async def test_astra_db_database_admin_async( - self, env_token: Tuple[str, str] + self, admin_env_token: Tuple[str, str] ) -> None: """ Test plan (it has to be a single giant test to use one DB throughout): @@ -341,17 +342,17 @@ async def test_astra_db_database_admin_async( - drop database (wait) - check DB not existings """ - env, token = env_token - db_name = f"test_database_{env}" - db_provider = 
os.environ[f"{env.upper()}_ADMIN_TEST_ASTRA_DB_PROVIDER"] - db_region = os.environ[f"{env.upper()}_ADMIN_TEST_ASTRA_DB_REGION"] + admin_env, token = admin_env_token + db_name = f"test_database_{admin_env}" + db_provider = ADMIN_ENV_VARIABLE_MAP[admin_env]["provider"] + db_region = ADMIN_ENV_VARIABLE_MAP[admin_env]["region"] # create client, get admin client: DataAPIClient - if env == "prod": + if admin_env == "prod": client = DataAPIClient(token) else: - client = DataAPIClient(token, environment=env) + client = DataAPIClient(token, environment=admin_env) admin = client.get_admin() # create a db (wait) @@ -447,11 +448,13 @@ async def _awaiter3() -> bool: db_ids = {db.id for db in (await admin.async_list_databases())} assert created_db_id not in db_ids - @pytest.mark.parametrize("env_token", admin_test_envs_tokens(), ids=ENV_LIST) + @pytest.mark.parametrize( + "admin_env_token", admin_test_envs_tokens(), ids=ADMIN_ENV_LIST + ) @pytest.mark.describe( "test of the full tour with AstraDBAdmin and client methods, async" ) - async def test_astra_db_admin_async(self, env_token: Tuple[str, str]) -> None: + async def test_astra_db_admin_async(self, admin_env_token: Tuple[str, str]) -> None: """ Test plan (it has to be a single giant test to use the two DBs throughout): - create client -> get_admin @@ -467,18 +470,18 @@ async def test_astra_db_admin_async(self, env_token: Tuple[str, str]) -> None: - get_async_database and check == with above - drop dbs, (wait, nonwait) """ - env, token = env_token - db_name_w = f"test_database_w_{env}" - db_name_nw = f"test_database_nw_{env}" - db_provider = os.environ[f"{env.upper()}_ADMIN_TEST_ASTRA_DB_PROVIDER"] - db_region = os.environ[f"{env.upper()}_ADMIN_TEST_ASTRA_DB_REGION"] + admin_env, token = admin_env_token + db_name_w = f"test_database_w_{admin_env}" + db_name_nw = f"test_database_nw_{admin_env}" + db_provider = ADMIN_ENV_VARIABLE_MAP[admin_env]["provider"] + db_region = ADMIN_ENV_VARIABLE_MAP[admin_env]["region"] # create 
client and get admin client: DataAPIClient - if env == "prod": + if admin_env == "prod": client = DataAPIClient(token) else: - client = DataAPIClient(token, environment=env) + client = DataAPIClient(token, environment=admin_env) admin = client.get_admin() # create the two dbs @@ -516,7 +519,7 @@ async def _awaiter1() -> bool: assert db_w_info.id == created_db_id_w # get and compare dbs obtained by the client - synthetic_api_endpoint = API_ENDPOINT_TEMPLATE_MAP[env].format( + synthetic_api_endpoint = API_ENDPOINT_TEMPLATE_MAP[admin_env].format( database_id=created_db_id_w, region=db_region, ) diff --git a/tests/idiomatic/integration/test_nonastra_admin.py b/tests/idiomatic/integration/test_nonastra_admin.py index eb825e00..d3ad69ee 100644 --- a/tests/idiomatic/integration/test_nonastra_admin.py +++ b/tests/idiomatic/integration/test_nonastra_admin.py @@ -16,10 +16,11 @@ from astrapy import AsyncDatabase, Database -from ..conftest import IS_ASTRA_DB +from ..conftest import DO_IDIOMATIC_ADMIN_TESTS, IS_ASTRA_DB @pytest.mark.skipif(IS_ASTRA_DB, reason="Not supported on Astra DB") +@pytest.mark.skipif(not DO_IDIOMATIC_ADMIN_TESTS, reason="Admin tests are suppressed") class TestNonAstraAdmin: @pytest.mark.describe( "test of the namespace crud with non-Astra DataAPIDatabaseAdmin, sync" diff --git a/tests/idiomatic/integration/test_timeout_async.py b/tests/idiomatic/integration/test_timeout_async.py index a984346b..c08dce71 100644 --- a/tests/idiomatic/integration/test_timeout_async.py +++ b/tests/idiomatic/integration/test_timeout_async.py @@ -67,6 +67,7 @@ async def test_database_info_timeout_async( assert exc.value.endpoint is not None assert exc.value.raw_payload is not None + @pytest.mark.skipif(not IS_ASTRA_DB, reason="Too fast on nonAstra") @pytest.mark.describe("test of cursor-based timeouts, async") async def test_cursor_timeouts_async( self, diff --git a/tests/idiomatic/integration/test_timeout_sync.py b/tests/idiomatic/integration/test_timeout_sync.py index 
cfa7118b..de6a498e 100644 --- a/tests/idiomatic/integration/test_timeout_sync.py +++ b/tests/idiomatic/integration/test_timeout_sync.py @@ -65,6 +65,7 @@ def test_database_info_timeout_sync( assert exc.value.endpoint is not None assert exc.value.raw_payload is not None + @pytest.mark.skipif(not IS_ASTRA_DB, reason="Too fast on nonAstra") @pytest.mark.describe("test of cursor-based timeouts, sync") def test_cursor_timeouts_sync( self, diff --git a/tests/preprocess_env.py b/tests/preprocess_env.py new file mode 100644 index 00000000..f15891fa --- /dev/null +++ b/tests/preprocess_env.py @@ -0,0 +1,158 @@ +# Copyright DataStax, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Bottleneck entrypoint for reading os.environ and exposing its contents as +(normalized) regular variables. +Except for the vectorize information, which for the time being passes as os.environ. 
+""" + +import os +import time +from typing import Optional + +from testcontainers.compose import DockerCompose + +from astrapy.authentication import ( + StaticTokenProvider, + TokenProvider, + UsernamePasswordTokenProvider, +) + +DOCKER_COMPOSE_SLEEP_TIME_SECONDS = 20 + +base_dir = os.path.abspath(os.path.dirname(__file__)) +docker_compose_filepath = os.path.join(base_dir, "hcd_compose") + + +IS_ASTRA_DB: bool +DOCKER_COMPOSE_LOCAL_DATA_API: bool +SECONDARY_NAMESPACE: Optional[str] = None +ASTRA_DB_API_ENDPOINT: Optional[str] = None +ASTRA_DB_APPLICATION_TOKEN: Optional[str] = None +ASTRA_DB_KEYSPACE: Optional[str] = None +LOCAL_DATA_API_USERNAME: Optional[str] = None +LOCAL_DATA_API_PASSWORD: Optional[str] = None +LOCAL_DATA_API_APPLICATION_TOKEN: Optional[str] = None +LOCAL_DATA_API_ENDPOINT: Optional[str] = None +LOCAL_DATA_API_KEYSPACE: Optional[str] = None + +ASTRA_DB_TOKEN_PROVIDER: Optional[TokenProvider] = None +LOCAL_DATA_API_TOKEN_PROVIDER: Optional[TokenProvider] = None + +# idiomatic-related settings +if "LOCAL_DATA_API_ENDPOINT" in os.environ: + IS_ASTRA_DB = False + DOCKER_COMPOSE_LOCAL_DATA_API = False + LOCAL_DATA_API_USERNAME = os.environ.get("LOCAL_DATA_API_USERNAME") + LOCAL_DATA_API_PASSWORD = os.environ.get("LOCAL_DATA_API_PASSWORD") + LOCAL_DATA_API_APPLICATION_TOKEN = os.environ.get( + "LOCAL_DATA_API_APPLICATION_TOKEN" + ) + LOCAL_DATA_API_ENDPOINT = os.environ["LOCAL_DATA_API_ENDPOINT"] + LOCAL_DATA_API_KEYSPACE = os.environ.get("LOCAL_DATA_API_KEYSPACE") + # no reason not to use it + SECONDARY_NAMESPACE = os.environ.get( + "LOCAL_DATA_API_SECONDARY_KEYSPACE", "alternate_keyspace" + ) +elif "DOCKER_COMPOSE_LOCAL_DATA_API" in os.environ: + IS_ASTRA_DB = False + DOCKER_COMPOSE_LOCAL_DATA_API = True + LOCAL_DATA_API_USERNAME = "cassandra" + LOCAL_DATA_API_PASSWORD = "cassandra" + LOCAL_DATA_API_ENDPOINT = "http://localhost:8181" + LOCAL_DATA_API_KEYSPACE = os.environ.get("LOCAL_DATA_API_KEYSPACE") + # no reason not to use it + 
SECONDARY_NAMESPACE = os.environ.get( + "LOCAL_DATA_API_SECONDARY_KEYSPACE", "alternate_keyspace" + ) +elif "ASTRA_DB_API_ENDPOINT" in os.environ: + IS_ASTRA_DB = True + DOCKER_COMPOSE_LOCAL_DATA_API = False + SECONDARY_NAMESPACE = os.environ.get("ASTRA_DB_SECONDARY_KEYSPACE") + ASTRA_DB_API_ENDPOINT = os.environ["ASTRA_DB_API_ENDPOINT"] + ASTRA_DB_APPLICATION_TOKEN = os.environ["ASTRA_DB_APPLICATION_TOKEN"] + ASTRA_DB_KEYSPACE = os.environ.get("ASTRA_DB_KEYSPACE") +else: + raise ValueError("No credentials.") + +# token provider setup +if IS_ASTRA_DB: + ASTRA_DB_TOKEN_PROVIDER = StaticTokenProvider(ASTRA_DB_APPLICATION_TOKEN) +else: + # either token or user/pwd pair (the latter having precedence) + if LOCAL_DATA_API_USERNAME and LOCAL_DATA_API_PASSWORD: + LOCAL_DATA_API_TOKEN_PROVIDER = UsernamePasswordTokenProvider( + username=LOCAL_DATA_API_USERNAME, + password=LOCAL_DATA_API_PASSWORD, + ) + elif LOCAL_DATA_API_APPLICATION_TOKEN: + LOCAL_DATA_API_TOKEN_PROVIDER = StaticTokenProvider( + LOCAL_DATA_API_APPLICATION_TOKEN + ) + else: + raise ValueError("No full authentication data for local Data API") + + +# Ensure docker-compose, if needed, is started and ready before anything else +# (especially querying the findEmbeddingProviders) +# if "DOCKER_COMPOSE_LOCAL_DATA_API", must spin the whole environment: +# (it is started and not cleaned up at the moment: manual cleanup if needed) +is_docker_compose_started = False +if DOCKER_COMPOSE_LOCAL_DATA_API: + if not is_docker_compose_started: + compose = DockerCompose(filepath=docker_compose_filepath) + compose.start() + time.sleep(DOCKER_COMPOSE_SLEEP_TIME_SECONDS) + is_docker_compose_started = True + + +# Idiomatic admin test flag +DO_IDIOMATIC_ADMIN_TESTS: bool +if "DO_IDIOMATIC_ADMIN_TESTS" in os.environ: + _do_idiomatic_admin_tests = os.environ["DO_IDIOMATIC_ADMIN_TESTS"] + if _do_idiomatic_admin_tests.strip(): + DO_IDIOMATIC_ADMIN_TESTS = int(_do_idiomatic_admin_tests) != 0 + else: + DO_IDIOMATIC_ADMIN_TESTS = False 
+else: + DO_IDIOMATIC_ADMIN_TESTS = False + +ADMIN_ENV_LIST = ["prod", "dev"] +ADMIN_ENV_VARIABLE_MAP = { + admin_env: { + "token": os.environ.get( + f"{admin_env.upper()}_ADMIN_TEST_ASTRA_DB_APPLICATION_TOKEN" + ), + "provider": os.environ.get(f"{admin_env.upper()}_ADMIN_TEST_ASTRA_DB_PROVIDER"), + "region": os.environ.get(f"{admin_env.upper()}_ADMIN_TEST_ASTRA_DB_REGION"), + } + for admin_env in ADMIN_ENV_LIST +} + +# core-specific (legacy) flags +TEST_SKIP_COLLECTION_DELETE: bool +if os.getenv("TEST_SKIP_COLLECTION_DELETE"): + TEST_SKIP_COLLECTION_DELETE = int(os.environ["TEST_SKIP_COLLECTION_DELETE"]) != 0 +else: + TEST_SKIP_COLLECTION_DELETE = False + +ASTRA_DB_OPS_APPLICATION_TOKEN = os.environ.get( + "ASTRA_DB_OPS_APPLICATION_TOKEN", + ASTRA_DB_APPLICATION_TOKEN or "no_token!", +) +ASTRA_DB_ID = os.environ.get("ASTRA_DB_ID", "") +ASTRA_DB_KEYSPACE = os.environ.get("ASTRA_DB_KEYSPACE") +ASTRA_DB_REGION = os.environ.get("ASTRA_DB_REGION") +TEST_ASTRADBOPS = int(os.environ.get("TEST_ASTRADBOPS", "0")) != 0 diff --git a/tests/vectorize_idiomatic/integration/__init__.py b/tests/vectorize_idiomatic/integration/__init__.py index e69de29b..2c9ca172 100644 --- a/tests/vectorize_idiomatic/integration/__init__.py +++ b/tests/vectorize_idiomatic/integration/__init__.py @@ -0,0 +1,13 @@ +# Copyright DataStax, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/tests/vectorize_idiomatic/live_provider_info.py b/tests/vectorize_idiomatic/live_provider_info.py new file mode 100644 index 00000000..f48882a6 --- /dev/null +++ b/tests/vectorize_idiomatic/live_provider_info.py @@ -0,0 +1,64 @@ +# Copyright DataStax, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Any, Dict, Optional + +from preprocess_env import ( + ASTRA_DB_API_ENDPOINT, + ASTRA_DB_TOKEN_PROVIDER, + IS_ASTRA_DB, + LOCAL_DATA_API_ENDPOINT, + LOCAL_DATA_API_TOKEN_PROVIDER, +) + +from astrapy.api_commander import APICommander + + +def live_provider_info() -> Dict[str, Any]: + """ + Query the API endpoint with the `findEmbeddingProviders` command + for the latest information. + + This is where the preprocess_env variables are read to figure out whom to ask. 
+ """ + response: Dict[str, Any] + + if IS_ASTRA_DB: + if ASTRA_DB_TOKEN_PROVIDER is None: + raise ValueError("No token provider for Astra DB") + path = "api/json/v1" + headers_a: Dict[str, Optional[str]] = { + "Token": ASTRA_DB_TOKEN_PROVIDER.get_token(), + } + cmd = APICommander( + api_endpoint=ASTRA_DB_API_ENDPOINT or "", + path=path, + headers=headers_a, + ) + response = cmd.request(payload={"findEmbeddingProviders": {}}) + else: + path = "v1" + if LOCAL_DATA_API_TOKEN_PROVIDER is None: + raise ValueError("No token provider for Local Data API") + headers_l: Dict[str, Optional[str]] = { + "Token": LOCAL_DATA_API_TOKEN_PROVIDER.get_token(), + } + cmd = APICommander( + api_endpoint=LOCAL_DATA_API_ENDPOINT or "", + path=path, + headers=headers_l, + ) + response = cmd.request(payload={"findEmbeddingProviders": {}}) + + return response diff --git a/tests/vectorize_idiomatic/query_providers.py b/tests/vectorize_idiomatic/query_providers.py index d339c157..c8bd3b40 100644 --- a/tests/vectorize_idiomatic/query_providers.py +++ b/tests/vectorize_idiomatic/query_providers.py @@ -15,17 +15,12 @@ import json import os import sys -from typing import Any, Dict, Optional -from astrapy.api_commander import APICommander +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))) -IS_ASTRA_DB: bool -if "LOCAL_DATA_API_ENDPOINT" in os.environ: - IS_ASTRA_DB = False -elif "ASTRA_DB_API_ENDPOINT" in os.environ: - IS_ASTRA_DB = True -else: - raise ValueError("No credentials.") +from typing import Any, Dict + +from live_provider_info import live_provider_info def desc_param(param_data: Dict[str, Any]) -> str: @@ -47,39 +42,12 @@ def desc_param(param_data: Dict[str, Any]) -> str: if __name__ == "__main__": + response: Dict[str, Any] if "l" in sys.argv[1:]: response = json.load(open("_providers.json")) else: - if IS_ASTRA_DB: - ASTRA_DB_APPLICATION_TOKEN = os.environ["ASTRA_DB_APPLICATION_TOKEN"] - ASTRA_DB_API_ENDPOINT = os.environ["ASTRA_DB_API_ENDPOINT"] - 
api_endpoint = ASTRA_DB_API_ENDPOINT - path = "api/json/v1" - headers_a: Dict[str, Optional[str]] = {"Token": ASTRA_DB_APPLICATION_TOKEN} - cmd = APICommander( - api_endpoint=api_endpoint, - path=path, - headers=headers_a, - ) - response = cmd.request(payload={"findEmbeddingProviders": {}}) - json.dump(response, open("_providers.json", "w"), indent=2, sort_keys=True) - else: - LOCAL_DATA_API_APPLICATION_TOKEN = os.environ[ - "LOCAL_DATA_API_APPLICATION_TOKEN" - ] - LOCAL_DATA_API_ENDPOINT = os.environ["LOCAL_DATA_API_ENDPOINT"] - api_endpoint = LOCAL_DATA_API_ENDPOINT - path = "v1" - headers_l: Dict[str, Optional[str]] = { - "Token": LOCAL_DATA_API_APPLICATION_TOKEN - } - cmd = APICommander( - api_endpoint=api_endpoint, - path=path, - headers=headers_l, - ) - response = cmd.request(payload={"findEmbeddingProviders": {}}) - json.dump(response, open("_providers.json", "w"), indent=2, sort_keys=True) + response = live_provider_info() + json.dump(response, open("_providers.json", "w"), indent=2, sort_keys=True) provider_map = response["status"]["embeddingProviders"] for provider, provider_data in sorted(provider_map.items()): diff --git a/tests/vectorize_idiomatic/unit/__init__.py b/tests/vectorize_idiomatic/unit/__init__.py index e69de29b..2c9ca172 100644 --- a/tests/vectorize_idiomatic/unit/__init__.py +++ b/tests/vectorize_idiomatic/unit/__init__.py @@ -0,0 +1,13 @@ +# Copyright DataStax, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/tests/vectorize_idiomatic/vectorize_models.py b/tests/vectorize_idiomatic/vectorize_models.py index 1387cfea..dfcde71b 100644 --- a/tests/vectorize_idiomatic/vectorize_models.py +++ b/tests/vectorize_idiomatic/vectorize_models.py @@ -13,12 +13,14 @@ # limitations under the License. import os -from typing import Any, Dict, Iterable, Optional, Tuple +import sys +from typing import Any, Dict, Iterable, Tuple + +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))) -from astrapy.api_commander import APICommander from astrapy.info import CollectionVectorServiceOptions -from .conftest import IS_ASTRA_DB +from .live_provider_info import live_provider_info alphanum = set("qwertyuiopasdfghjklzxcvbnmQWERTYUIOPASDFGHJKLZXCVBNM1234567890") @@ -157,46 +159,6 @@ } -def live_provider_info() -> Dict[str, Any]: - """ - Query the API endpoint `findEmbeddingProviders` endpoint - for the latest information. - This is later used to make sure everything is mapped/tested. 
- """ - response: Dict[str, Any] - - if IS_ASTRA_DB: - ASTRA_DB_APPLICATION_TOKEN = os.environ["ASTRA_DB_APPLICATION_TOKEN"] - ASTRA_DB_API_ENDPOINT = os.environ["ASTRA_DB_API_ENDPOINT"] - api_endpoint = ASTRA_DB_API_ENDPOINT - path = "api/json/v1" - headers_a: Dict[str, Optional[str]] = {"Token": ASTRA_DB_APPLICATION_TOKEN} - cmd = APICommander( - api_endpoint=api_endpoint, - path=path, - headers=headers_a, - ) - response = cmd.request(payload={"findEmbeddingProviders": {}}) - else: - LOCAL_DATA_API_APPLICATION_TOKEN = os.environ[ - "LOCAL_DATA_API_APPLICATION_TOKEN" - ] - LOCAL_DATA_API_ENDPOINT = os.environ["LOCAL_DATA_API_ENDPOINT"] - api_endpoint = LOCAL_DATA_API_ENDPOINT - path = "v1" - headers_l: Dict[str, Optional[str]] = { - "Token": LOCAL_DATA_API_APPLICATION_TOKEN - } - cmd = APICommander( - api_endpoint=api_endpoint, - path=path, - headers=headers_l, - ) - response = cmd.request(payload={"findEmbeddingProviders": {}}) - - return response - - def live_test_models() -> Iterable[Dict[str, Any]]: def _from_validation(pspec: Dict[str, Any]) -> int: