From 7c8d81b265f2ce9e8f8aadddd29075ee47b7cd91 Mon Sep 17 00:00:00 2001 From: Friedrich Gonzalez Date: Tue, 10 May 2022 10:37:20 +0200 Subject: [PATCH 01/60] use jsonnet-team as owners (#1) Signed-off-by: Friedrich Gonzalez --- CODEOWNERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CODEOWNERS b/CODEOWNERS index 0296909a..1f769c76 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -1,4 +1,4 @@ # https://help.github.com/articles/about-codeowners/ # https://git-scm.com/docs/gitignore#_pattern_format -* @grafana/cortex-team +* @cortexproject/jsonnet-team From 978fe497e328c78a41fc710410cd479dd38abb5d Mon Sep 17 00:00:00 2001 From: Friedrich Gonzalez Date: Tue, 10 May 2022 13:21:08 +0200 Subject: [PATCH 02/60] Updated instructions to use tanka with this repo (#3) Signed-off-by: Friedrich Gonzalez --- CHANGELOG.md | 2 ++ README.md | 33 ++++++++++++++++----------------- 2 files changed, 18 insertions(+), 17 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3135d97c..50706dd7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,8 @@ ## master / unreleased +* [CHANGE] Updated readme to use this repo with tanka + ## 1.11.0 / 2021-12-30 * [CHANGE] Store gateway: set `-blocks-storage.bucket-store.index-cache.memcached.max-get-multi-concurrency`, diff --git a/README.md b/README.md index 8864e8cf..c04f5751 100644 --- a/README.md +++ b/README.md @@ -2,6 +2,13 @@ This repo has the jsonnet for deploying Cortex and the related monitoring in Kubernetes. +--- +**NOTE** + +If you are more familiar with helm you should use the [helm chart](https://cortexproject.github.io/cortex-helm-chart/) for cortex + +--- + To generate the YAMLs for deploying Cortex: 1. 
Make sure you have tanka and jb installed: @@ -10,25 +17,19 @@ To generate the YAMLs for deploying Cortex: ```console $ # make sure to be outside of GOPATH or a go.mod project - $ GO111MODULE=on go get github.com/grafana/tanka/cmd/tk - $ GO111MODULE=on go get github.com/jsonnet-bundler/jsonnet-bundler/cmd/jb + $ GO111MODULE=on go install github.com/grafana/tanka/cmd/tk@v0.21.0 + $ GO111MODULE=on go install github.com/jsonnet-bundler/jsonnet-bundler/cmd/jb@v0.4.0 ``` -1. Initialise the Tanka, and install the Cortex and Kubernetes Jsonnet libraries. +1. Initialise the Tanka repo and install the Cortex and Kubernetes Jsonnet libraries. ```console $ mkdir <name> && cd <name> - $ tk init --k8s=false - $ # The k8s-alpha library supports Kubernetes versions 1.14+ - $ jb install github.com/jsonnet-libs/k8s-alpha/1.18 - $ cat <<EOF > lib/k.libsonnet - (import "github.com/jsonnet-libs/k8s-alpha/1.18/main.libsonnet") - + (import "github.com/jsonnet-libs/k8s-alpha/1.18/extensions/kausal-shim.libsonnet") - EOF - $ jb install github.com/grafana/cortex-jsonnet/cortex@main + $ tk init --k8s=1.21 # this includes github.com/jsonnet-libs/k8s-libsonnet/1.21@main + $ jb install github.com/cortexproject/cortex-jsonnet/cortex@main ``` -1. Use the example monitoring.jsonnet.example: +1. 
Use the example cortex-manifests.jsonnet.example: ```console $ cp vendor/cortex/cortex-manifests.jsonnet.example environments/default/main.jsonnet @@ -58,13 +59,11 @@ To generate the YAMLs for deploying Cortex: To generate the Grafana dashboards and Prometheus alerts for Cortex: ```console -$ GO111MODULE=on go get github.com/monitoring-mixins/mixtool/cmd/mixtool -$ GO111MODULE=on go get github.com/jsonnet-bundler/jsonnet-bundler/cmd/jb -$ git clone https://github.com/grafana/cortex-jsonnet +$ GO111MODULE=on go install github.com/monitoring-mixins/mixtool/cmd/mixtool@2ff523ea63d1cdeee2a10e01d1d48d20adcc7030 +$ GO111MODULE=on go install github.com/jsonnet-bundler/jsonnet-bundler/cmd/jb@v0.4.0 +$ git clone https://github.com/cortexproject/cortex-jsonnet $ cd cortex-jsonnet $ make build-mixin ``` This will leave all the alerts and dashboards in cortex-mixin/cortex-mixin.zip (or cortex-mixin/out). - -If you get an error like `cannot use cli.StringSliceFlag literal (type cli.StringSliceFlag) as type cli.Flag in slice literal` when installing [mixtool](https://github.com/monitoring-mixins/mixtool/issues/27), make sure you set `GO111MODULE=on` before `go get`. 
From f8dd8c2381c82dfd869505ea5775bc0830337d03 Mon Sep 17 00:00:00 2001 From: Niclas Schad Date: Wed, 11 May 2022 15:34:58 +0200 Subject: [PATCH 03/60] move to github actions (#4) Signed-off-by: Niclas Schad rename build to CI Signed-off-by: Niclas Schad --- .circleci/config.yml | 41 ------------------------------------ .github/workflows/lint.yaml | 42 +++++++++++++++++++++++++++++++++++++ 2 files changed, 42 insertions(+), 41 deletions(-) delete mode 100644 .circleci/config.yml create mode 100644 .github/workflows/lint.yaml diff --git a/.circleci/config.yml b/.circleci/config.yml deleted file mode 100644 index 41cd2ad2..00000000 --- a/.circleci/config.yml +++ /dev/null @@ -1,41 +0,0 @@ -version: 2 - -workflows: - version: 2 - ci: - jobs: - - lint - - build - - test-readme - -jobs: - lint: - docker: - - image: grafana/cortex-jsonnet-build-image:3527936 - steps: - - checkout - - run: - name: "Check white noise" - command: make check-white-noise - - run: - name: "Lint mixin" - command: make lint-mixin - - run: - name: "Lint playbooks" - command: make lint-playbooks - - build: - docker: - - image: grafana/cortex-jsonnet-build-image:3527936 - steps: - - checkout - - run: make build-mixin - - store_artifacts: - path: cortex-mixin/cortex-mixin.zip - - test-readme: - docker: - - image: grafana/cortex-jsonnet-build-image:3527936 - steps: - - checkout - - run: make test-readme diff --git a/.github/workflows/lint.yaml b/.github/workflows/lint.yaml new file mode 100644 index 00000000..3cee255b --- /dev/null +++ b/.github/workflows/lint.yaml @@ -0,0 +1,42 @@ +name: CI + +on: + pull_request: + +jobs: + lint: + runs-on: ubuntu-18.04 + container: grafana/cortex-jsonnet-build-image:3527936 + steps: + - uses: actions/checkout@v2 + name: Checkout + with: + fetch-depth: 0 + + - name: "Lint mixin" + run: make lint-mixin + + - name: "Lint playbooks" + run: make lint-playbooks + build: + runs-on: ubuntu-18.04 + container: grafana/cortex-jsonnet-build-image:3527936 + steps: + - uses: 
actions/checkout@v2 + name: Checkout + with: + fetch-depth: 0 + + - name: "Build mixin" + run: make build-mixin + readme: + runs-on: ubuntu-18.04 + container: grafana/cortex-jsonnet-build-image:3527936 + steps: + - uses: actions/checkout@v2 + name: Checkout + with: + fetch-depth: 0 + + - name: "Test readme" + run: make test-readme \ No newline at end of file From 149069e5f19c3df4359799d6cb107445d3a2039c Mon Sep 17 00:00:00 2001 From: Friedrich Gonzalez <35104428+friedrich-at-adobe@users.noreply.github.com> Date: Wed, 11 May 2022 16:46:26 +0200 Subject: [PATCH 04/60] How to rename buckets in AWS and Azure playbook update (#5) Signed-off-by: Friedrich Gonzalez --- CHANGELOG.md | 1 + cortex-mixin/docs/playbooks.md | 28 +++++++++++++++++++++++++++- 2 files changed, 28 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 50706dd7..62a053ce 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,7 @@ ## master / unreleased * [CHANGE] Updated readme to use this repo with tanka +* [ENHANCEMENT] How to rename buckets in AWS and Azure for `not healthy index found` playbook. #5 ## 1.11.0 / 2021-12-30 diff --git a/cortex-mixin/docs/playbooks.md b/cortex-mixin/docs/playbooks.md index 0e98a891..c1ee3ef4 100644 --- a/cortex-mixin/docs/playbooks.md +++ b/cortex-mixin/docs/playbooks.md @@ -3,7 +3,7 @@ This document contains playbooks, or at least a checklist of what to look for, for alerts in the cortex-mixin and logs from Cortex. This document assumes that you are running a Cortex cluster: 1. Using this mixin config -2. Using GCS as object store (but similar procedures apply to other backends) +2. Using GCS (Google), S3 (AWS) or Blobs (Azure). Similar procedures apply to other backends. 
## Alerts @@ -413,6 +413,32 @@ Where: - `TENANT` is the tenant id reported in the example error message above as `REDACTED-TENANT` - `BLOCK` is the last part of the file path reported as `REDACTED-BLOCK` in the example error message above +To rename a block stored on S3 you can use the `aws` CLI command: +``` +aws s3 mv s3://BUCKET/TENANT/BLOCK s3://BUCKET/TENANT/corrupted-BLOCK --recursive +``` +Where: +- `BUCKET` is the S3 bucket name the compactor is using. +- `TENANT` is the tenant id reported in the example error message above as `REDACTED-TENANT` +- `BLOCK` is the last part of the file path reported as `REDACTED-BLOCK` in the example error message above + + +To rename a block stored on Azure you can use the `azcopy` and `az` CLI commands: +``` +azcopy copy "https://$STORAGE_ACCOUNT.blob.core.windows.net/$CONTAINER/$TENANT/$BLOCK?$SASTOKEN" "https://$STORAGE_ACCOUNT.blob.core.windows.net/$CONTAINER/$TENANT/corrupted-$BLOCK?$SASTOKEN" --recursive +azcopy remove "https://$STORAGE_ACCOUNT.blob.core.windows.net/$CONTAINER/$TENANT/$BLOCK?$SASTOKEN" --recursive +``` +Where: +- `STORAGE_ACCOUNT` is the storage account the compactor is using. +- `CONTAINER` is what is specified as `-blocks-storage.azure.container-name` +- `TENANT` is the tenant id reported in the example error message above as `REDACTED-TENANT` +- `BLOCK` is the last part of the file path reported as `REDACTED-BLOCK` in the example error message above +- `SASTOKEN` is a token that can be created with the following command: + +``` +az storage container generate-sas --account-name $STORAGE_ACCOUNT --expiry $(date -v +1d +%Y-%m-%d) --name $CONTAINER --permissions dlrw +``` + ### CortexBucketIndexNotUpdated This alert fires when the bucket index, for a given tenant, is not updated since a long time. The bucket index is expected to be periodically updated by the compactor and is used by queriers and store-gateways to get an almost-updated view over the bucket store. 
From b8be2c2a811eda5c87985685e3ba63bde0bedcd0 Mon Sep 17 00:00:00 2001 From: bogdan-at-adobe <102800531+bogdan-at-adobe@users.noreply.github.com> Date: Tue, 12 Jul 2022 18:39:58 +0300 Subject: [PATCH 05/60] Update s3 endpoint from us-east-1 to aws_region (#8) * Change s3 endpoint to include correct value --- CHANGELOG.md | 1 + cortex/config.libsonnet | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 62a053ce..5117a795 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,7 @@ * [CHANGE] Updated readme to use this repo with tanka * [ENHANCEMENT] How to rename buckets in AWS and Azure for `not healthy index found` playbook. #5 +* [BUGFIX] Updated blocks_storage_s3_endpoint in config.libsonnet to include the correct aws region ## 1.11.0 / 2021-12-30 diff --git a/cortex/config.libsonnet b/cortex/config.libsonnet index 7cf316b7..248cedaf 100644 --- a/cortex/config.libsonnet +++ b/cortex/config.libsonnet @@ -57,7 +57,7 @@ storage_engine: 'chunks', // Available options are 'chunks' or 'blocks' blocks_storage_backend: 'gcs', // Available options are 'gcs', 's3', 'azure' blocks_storage_bucket_name: error 'must specify blocks storage bucket name', - blocks_storage_s3_endpoint: 's3.dualstack.us-east-1.amazonaws.com', + blocks_storage_s3_endpoint: 's3.dualstack.%s.amazonaws.com' % $._config.aws_region, blocks_storage_azure_account_name: if $._config.blocks_storage_backend == 'azure' then error 'must specify azure account name' else '', blocks_storage_azure_account_key: if $._config.blocks_storage_backend == 'azure' then error 'must specify azure account key' else '', From a6318c7e401deb4a8ab7a9f776e9bbf3156a5cad Mon Sep 17 00:00:00 2001 From: Friedrich Gonzalez Date: Tue, 12 Jul 2022 22:23:25 +0200 Subject: [PATCH 06/60] The overrides exporter is integrated in cortex since v1.9.0 (#7) The only caveat is that it doesn't support the presets. 
Everything else works the same Signed-off-by: Friedrich Gonzalez --- CHANGELOG.md | 1 + cortex/images.libsonnet | 2 +- cortex/overrides-exporter.libsonnet | 50 +++++------------------------ 3 files changed, 10 insertions(+), 43 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5117a795..d57193a2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,7 @@ ## master / unreleased * [CHANGE] Updated readme to use this repo with tanka +* [CHANGE] Use integrated cortex overrides exporter * [ENHANCEMENT] How to rename buckets in AWS and Azure for `not healthy index found` playbook. #5 * [BUGFIX] Updated blocks_storage_s3_endpoint in config.libsonnet to include the correct aws region diff --git a/cortex/images.libsonnet b/cortex/images.libsonnet index 1eb891c4..f6203326 100644 --- a/cortex/images.libsonnet +++ b/cortex/images.libsonnet @@ -19,7 +19,7 @@ store_gateway: self.cortex, query_scheduler: self.cortex, - cortex_tools: 'grafana/cortex-tools:v0.4.0', + overrides_exporter: self.cortex, query_tee: 'quay.io/cortexproject/query-tee:v1.11.0', testExporter: 'cortexproject/test-exporter:v1.11.0', }, diff --git a/cortex/overrides-exporter.libsonnet b/cortex/overrides-exporter.libsonnet index d8eb411a..1f9de4ea 100644 --- a/cortex/overrides-exporter.libsonnet +++ b/cortex/overrides-exporter.libsonnet @@ -1,56 +1,23 @@ -// this enables overrides exporter, which will expose the configured -// overrides and presets (if configured). Those metrics can be potentially -// high cardinality. +// this enables overrides exporter, which will expose the configured overrides. 
{ local name = 'overrides-exporter', - _config+: { - // overrides exporter can also make the configured presets available, this - // list references entries within $._config.overrides - - overrides_exporter_presets:: [ - 'extra_small_user', - 'small_user', - 'medium_user', - 'big_user', - 'super_user', - 'mega_user', - ], - }, - - local presets_enabled = std.length($._config.overrides_exporter_presets) > 0, - - local configMap = $.core.v1.configMap, - overrides_exporter_presets_configmap: - if presets_enabled then - configMap.new('overrides-presets') + - configMap.withData({ - 'overrides-presets.yaml': $.util.manifestYaml( - { - presets: { - [key]: $._config.overrides[key] - for key in $._config.overrides_exporter_presets - }, - } - ), - }), - local containerPort = $.core.v1.containerPort, - overrides_exporter_port:: containerPort.newNamed(name='http-metrics', containerPort=9683), + overrides_exporter_port:: containerPort.newNamed(name='http-metrics', containerPort=80), overrides_exporter_args:: { - 'overrides-file': '/etc/cortex/overrides.yaml', - } + if presets_enabled then { - 'presets-file': '/etc/cortex_presets/overrides-presets.yaml', - } else {}, + target: 'overrides-exporter', + + 'runtime-config.file': '/etc/cortex/overrides.yaml', + }, local container = $.core.v1.container, overrides_exporter_container:: - container.new(name, $._images.cortex_tools) + + container.new(name, $._images.overrides_exporter) + container.withPorts([ $.overrides_exporter_port, ]) + - container.withArgsMixin([name] + $.util.mapToFlags($.overrides_exporter_args, prefix='--')) + + container.withArgsMixin($.util.mapToFlags($.overrides_exporter_args, prefix='--')) + $.util.resourcesRequests('0.5', '0.5Gi') + $.util.readinessProbe + container.mixin.readinessProbe.httpGet.withPort($.overrides_exporter_port.name), @@ -59,7 +26,6 @@ overrides_exporter_deployment: deployment.new(name, 1, [$.overrides_exporter_container], { name: name }) + 
$.util.configVolumeMount($._config.overrides_configmap, '/etc/cortex') + - $.util.configVolumeMount('overrides-presets', '/etc/cortex_presets') + deployment.mixin.metadata.withLabels({ name: name }), overrides_exporter_service: From 92ae7d9b852b02d7fce3c090b7d93de6e15d18b1 Mon Sep 17 00:00:00 2001 From: Friedrich Gonzalez Date: Tue, 4 Oct 2022 16:48:34 +0200 Subject: [PATCH 07/60] Remove chunks (#6) * Remove chunks from jsonnet deployment * Adjust examples * Adjust changelog * Fix test-readme * Include tanka v0.21.0 updating build-image --- .github/workflows/lint.yaml | 6 +- CHANGELOG.md | 2 + Makefile | 22 +- README.md | 12 +- build-image/Dockerfile | 4 +- cortex/azure/main.jsonnet.example | 28 +++ cortex/compactor.libsonnet | 72 ++++++ cortex/config.libsonnet | 148 ++---------- cortex/cortex-manifests.jsonnet.example | 27 --- cortex/cortex.libsonnet | 4 +- cortex/flusher-job.libsonnet | 51 ----- cortex/gcs/main.jsonnet.example | 22 ++ cortex/ingester.libsonnet | 119 +++++----- cortex/memcached.libsonnet | 8 - cortex/querier.libsonnet | 10 +- cortex/ruler.libsonnet | 11 +- cortex/s3/main.jsonnet.example | 24 ++ cortex/store-gateway.libsonnet | 94 ++++++++ cortex/table-manager.libsonnet | 44 ---- cortex/tsdb-config.libsonnet | 68 ++++++ cortex/tsdb.libsonnet | 290 ------------------------ 21 files changed, 418 insertions(+), 648 deletions(-) create mode 100644 cortex/azure/main.jsonnet.example create mode 100644 cortex/compactor.libsonnet delete mode 100644 cortex/cortex-manifests.jsonnet.example delete mode 100644 cortex/flusher-job.libsonnet create mode 100644 cortex/gcs/main.jsonnet.example create mode 100644 cortex/s3/main.jsonnet.example create mode 100644 cortex/store-gateway.libsonnet delete mode 100644 cortex/table-manager.libsonnet create mode 100644 cortex/tsdb-config.libsonnet delete mode 100644 cortex/tsdb.libsonnet diff --git a/.github/workflows/lint.yaml b/.github/workflows/lint.yaml index 3cee255b..b1ad62b2 100644 --- a/.github/workflows/lint.yaml 
+++ b/.github/workflows/lint.yaml @@ -6,7 +6,7 @@ on: jobs: lint: runs-on: ubuntu-18.04 - container: grafana/cortex-jsonnet-build-image:3527936 + container: quay.io/cortexproject/cortex-jsonnet-build-image:c924d52 steps: - uses: actions/checkout@v2 name: Checkout @@ -20,7 +20,7 @@ jobs: run: make lint-playbooks build: runs-on: ubuntu-18.04 - container: grafana/cortex-jsonnet-build-image:3527936 + container: quay.io/cortexproject/cortex-jsonnet-build-image:c924d52 steps: - uses: actions/checkout@v2 name: Checkout @@ -31,7 +31,7 @@ jobs: run: make build-mixin readme: runs-on: ubuntu-18.04 - container: grafana/cortex-jsonnet-build-image:3527936 + container: quay.io/cortexproject/cortex-jsonnet-build-image:c924d52 steps: - uses: actions/checkout@v2 name: Checkout diff --git a/CHANGELOG.md b/CHANGELOG.md index d57193a2..5d86d000 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,7 +3,9 @@ ## master / unreleased * [CHANGE] Updated readme to use this repo with tanka +* [CHANGE] Removed chunks support * [CHANGE] Use integrated cortex overrides exporter +* [ENHANCEMENT] Added main.jsonnet examples for azure, gcs and s3 * [ENHANCEMENT] How to rename buckets in AWS and Azure for `not healthy index found` playbook. 
#5 * [BUGFIX] Updated blocks_storage_s3_endpoint in config.libsonnet to include the correct aws region diff --git a/Makefile b/Makefile index d0ca2f52..9f2501d5 100644 --- a/Makefile +++ b/Makefile @@ -32,10 +32,10 @@ fmt: xargs -n 1 -- $(JSONNET_FMT) -i build-image: - docker build -t grafana/cortex-jsonnet-build-image:$(shell git rev-parse --short HEAD) build-image + docker build -t quay.io/cortexproject/cortex-jsonnet-build-image:$(shell git rev-parse --short HEAD) build-image publish-build-image: - docker push grafana/cortex-jsonnet-build-image:$(shell git rev-parse --short HEAD) + docker push quay.io/cortexproject/cortex-jsonnet-build-image:$(shell git rev-parse --short HEAD) build-mixin: @cd cortex-mixin && \ @@ -44,16 +44,16 @@ build-mixin: mixtool generate all --output-alerts out/alerts.yaml --output-rules out/rules.yaml --directory out/dashboards mixin.libsonnet && \ zip -q -r cortex-mixin.zip out -test-readme: - rm -rf test-readme && \ - mkdir test-readme && cd test-readme && \ - tk init --k8s=false && \ - jb install github.com/jsonnet-libs/k8s-alpha/1.18 && \ - printf '(import "github.com/jsonnet-libs/k8s-alpha/1.18/main.libsonnet")\n+(import "github.com/jsonnet-libs/k8s-alpha/1.18/extensions/kausal-shim.libsonnet")' > lib/k.libsonnet && \ - jb install github.com/grafana/cortex-jsonnet/cortex@main && \ +test-readme: test-readme/azure test-readme/gcs test-readme/s3 + +test-readme/%: + rm -rf $@ && \ + mkdir -p $@ && cd $@ && \ + tk init --k8s=1.21 && \ + jb install github.com/cortexproject/cortex-jsonnet/cortex@main && \ rm -fr ./vendor/cortex && \ - cp -r ../cortex ./vendor/ && \ - cp vendor/cortex/cortex-manifests.jsonnet.example environments/default/main.jsonnet && \ + cp -r ../../cortex ./vendor/ && \ + cp vendor/cortex/$(notdir $@)/main.jsonnet.example environments/default/main.jsonnet && \ PAGER=cat tk show environments/default clean-white-noise: diff --git a/README.md b/README.md index c04f5751..d7eaaa6d 100644 --- a/README.md +++ b/README.md @@ 
-29,10 +29,18 @@ To generate the YAMLs for deploying Cortex: $ jb install github.com/cortexproject/cortex-jsonnet/cortex@main ``` -1. Use the example cortex-manifests.jsonnet.example: +1. Use any of the examples to get a main.jsonnet and adjust as needed ```console - $ cp vendor/cortex/cortex-manifests.jsonnet.example environments/default/main.jsonnet + $ cp vendor/cortex/azure/main.jsonnet.example environments/default/main.jsonnet + ``` + + ```console + $ cp vendor/cortex/gcs/main.jsonnet.example environments/default/main.jsonnet + ``` + + ```console + $ cp vendor/cortex/s3/main.jsonnet.example environments/default/main.jsonnet ``` 1. Check what is in the example: diff --git a/build-image/Dockerfile b/build-image/Dockerfile index 5b0f50aa..dc37595e 100644 --- a/build-image/Dockerfile +++ b/build-image/Dockerfile @@ -18,8 +18,8 @@ RUN chmod +x /usr/bin/jb # Build tanka FROM alpine:3.13 AS tk-builder -ARG TANKA_VERSION=0.11.1 -ARG TANKA_CHECKSUM="3b253ca7d7bf01189604c10a8f7cead20a553ddc04c813f0f836d80338cfad71 /usr/bin/tk" +ARG TANKA_VERSION=0.21.0 +ARG TANKA_CHECKSUM="cd60a005f84fd99763f26d07d4cb626e7585a62800aae97234d8187129eed1ec /usr/bin/tk" RUN apk add --no-cache curl RUN curl -fSL -o "/usr/bin/tk" "https://github.com/grafana/tanka/releases/download/v${TANKA_VERSION}/tk-linux-amd64" RUN echo "${TANKA_CHECKSUM}" | sha256sum -c || (printf "wanted: %s\n got: %s\n" "${TANKA_CHECKSUM}" "$(sha256sum /usr/bin/tk)"; exit 1) diff --git a/cortex/azure/main.jsonnet.example b/cortex/azure/main.jsonnet.example new file mode 100644 index 00000000..dddd337d --- /dev/null +++ b/cortex/azure/main.jsonnet.example @@ -0,0 +1,28 @@ +local cortex = import 'cortex/cortex.libsonnet'; + +cortex { + _config+:: { + namespace: 'default', + + blocks_storage_backend: 'azure', + blocks_storage_bucket_name: 'example-bucket', + blocks_storage_azure_account_key: 'replace-with-valid-key', + blocks_storage_azure_account_name: 'example-account', + + // Cortex Ruler config. 
+ ruler_enabled: true, + ruler_client_type: 'azure', + ruler_storage_bucket_name: 'ruler-example-bucket', + ruler_storage_azure_account_name: 'example-account', + ruler_storage_azure_account_key: 'replace-with-valid-key', + + // Cortex Alertmanager config + alertmanager_enabled: true, + alertmanager_client_type: 'azure', + alertmanager_azure_container_name: 'alertmanager-example-bucket', + alertmanager_azure_account_key: 'replace-with-valid-key', + alertmanager_azure_account_name: 'example-account', + external_url: 'https://cortex.example.com', //Alertmanager UI + cluster: 'cluster', + }, +} diff --git a/cortex/compactor.libsonnet b/cortex/compactor.libsonnet new file mode 100644 index 00000000..03df1ab7 --- /dev/null +++ b/cortex/compactor.libsonnet @@ -0,0 +1,72 @@ +{ + local container = $.core.v1.container, + local pvc = $.core.v1.persistentVolumeClaim, + local statefulSet = $.apps.v1.statefulSet, + local volumeMount = $.core.v1.volumeMount, + + // The compactor runs a statefulset with a single replica, because + // it does not support horizontal scalability yet. + local compactor_data_pvc = + pvc.new() + + pvc.mixin.spec.resources.withRequests({ storage: $._config.cortex_compactor_data_disk_size }) + + pvc.mixin.spec.withAccessModes(['ReadWriteOnce']) + + pvc.mixin.spec.withStorageClassName($._config.cortex_compactor_data_disk_class) + + pvc.mixin.metadata.withName('compactor-data'), + + compactor_args:: + $._config.grpcConfig + + $._config.blocksStorageConfig + + $._config.compactorLimitsConfig + + { + target: 'compactor', + + // Compactor config. + 'compactor.block-ranges': '2h,12h,24h', + 'compactor.data-dir': '/data', + 'compactor.compaction-interval': '30m', + 'compactor.compaction-concurrency': $._config.cortex_compactor_max_concurrency, + 'compactor.cleanup-interval': $._config.cortex_compactor_cleanup_interval, + + // Enable sharding. 
+ 'compactor.sharding-enabled': true, + 'compactor.ring.store': 'consul', + 'compactor.ring.consul.hostname': 'consul.%s.svc.cluster.local:8500' % $._config.namespace, + 'compactor.ring.prefix': '', + + // Limits config. + 'runtime-config.file': '/etc/cortex/overrides.yaml', + }, + + compactor_ports:: $.util.defaultPorts, + + compactor_container:: + container.new('compactor', $._images.compactor) + + container.withPorts($.compactor_ports) + + container.withArgsMixin($.util.mapToFlags($.compactor_args)) + + container.withVolumeMountsMixin([volumeMount.new('compactor-data', '/data')]) + + // Do not limit compactor CPU and request enough cores to honor configured max concurrency. + $.util.resourcesRequests($._config.cortex_compactor_max_concurrency, '6Gi') + + $.util.resourcesLimits(null, '6Gi') + + $.util.readinessProbe + + $.jaeger_mixin, + + newCompactorStatefulSet(name, container):: + statefulSet.new(name, 1, [container], compactor_data_pvc) + + statefulSet.mixin.spec.withServiceName(name) + + statefulSet.mixin.metadata.withNamespace($._config.namespace) + + statefulSet.mixin.metadata.withLabels({ name: name }) + + statefulSet.mixin.spec.template.metadata.withLabels({ name: name }) + + statefulSet.mixin.spec.selector.withMatchLabels({ name: name }) + + statefulSet.mixin.spec.template.spec.securityContext.withRunAsUser(0) + + statefulSet.mixin.spec.updateStrategy.withType('RollingUpdate') + + statefulSet.mixin.spec.template.spec.withTerminationGracePeriodSeconds(900) + + // Parallelly scale up/down compactor instances instead of starting them + // one by one. This does NOT affect rolling updates: they will continue to be + // rolled out one by one (the next pod will be rolled out once the previous is + // ready). 
+ statefulSet.mixin.spec.withPodManagementPolicy('Parallel') + + $.util.configVolumeMount($._config.overrides_configmap, '/etc/cortex'), + + compactor_statefulset: + $.newCompactorStatefulSet('compactor', $.compactor_container), +} diff --git a/cortex/config.libsonnet b/cortex/config.libsonnet index 248cedaf..4ced473a 100644 --- a/cortex/config.libsonnet +++ b/cortex/config.libsonnet @@ -5,13 +5,7 @@ replication_factor: 3, external_url: error 'must define external url for cluster', - storage_backend: error 'must specify storage backend (cassandra, gcp, aws)', - table_prefix: $._config.namespace, - cassandra_addresses: error 'must specify cassandra addresses', - bigtable_instance: error 'must specify bigtable instance', - bigtable_project: error 'must specify bigtable project', aws_region: error 'must specify AWS region', - s3_bucket_name: error 'must specify S3 bucket name', // If false, ingesters are not unregistered on shutdown and left in the ring with // the LEAVING state. Setting to false prevents series resharding during ingesters rollouts, @@ -26,15 +20,6 @@ cortex_querier_allow_multiple_replicas_on_same_node: false, cortex_query_frontend_allow_multiple_replicas_on_same_node: false, - // schema is used to generate the storage schema yaml file used by - // the Cortex chunks storage: - // - More information: https://github.com/cortexproject/cortex/pull/1072 - // - Blocks storage doesn't support / uses the schema config. - schema: if $._config.storage_engine != 'blocks' then - error 'must specify a schema config' - else - [], - max_chunk_idle: '15m', test_exporter_enabled: false, @@ -52,18 +37,12 @@ jaeger_agent_host: null, - // Use the Cortex chunks storage engine by default, while giving the ability - // to switch to blocks storage. 
- storage_engine: 'chunks', // Available options are 'chunks' or 'blocks' - blocks_storage_backend: 'gcs', // Available options are 'gcs', 's3', 'azure' + blocks_storage_backend: error "must specify $._config.blocks_storage_backend . Available options are 'gcs', 's3', 'azure'", blocks_storage_bucket_name: error 'must specify blocks storage bucket name', blocks_storage_s3_endpoint: 's3.dualstack.%s.amazonaws.com' % $._config.aws_region, blocks_storage_azure_account_name: if $._config.blocks_storage_backend == 'azure' then error 'must specify azure account name' else '', blocks_storage_azure_account_key: if $._config.blocks_storage_backend == 'azure' then error 'must specify azure account key' else '', - // Secondary storage engine is only used for querying. - querier_second_storage_engine: null, - store_gateway_replication_factor: 3, // By default ingesters will be run as StatefulSet with WAL. @@ -76,16 +55,6 @@ statefulset_disk: '150Gi', }, - // Blocks storage engine doesn't require the table manager. - // When running blocks with chunks as secondary storage engine for querier only, we need table-manager to apply - // retention policies. - table_manager_enabled: $._config.storage_engine == 'chunks' || $._config.querier_second_storage_engine == 'chunks', - - // Blocks storage engine doesn't support index-writes (for writes deduplication) cache. - memcached_index_writes_enabled: $._config.storage_engine != 'blocks', - memcached_index_writes_max_item_size_mb: 1, - - // Index and chunks caches are supported by both blocks storage engine and chunks engine. 
memcached_index_queries_enabled: true, memcached_index_queries_max_item_size_mb: 5, @@ -102,55 +71,15 @@ query_tee_backend_endpoints: [], query_tee_backend_preferred: '', - enabledBackends: [ - backend - for backend in std.split($._config.storage_backend, ',') - ], - - client_configs: { - aws: - if std.count($._config.enabledBackends, 'aws') > 0 then { - 'dynamodb.api-limit': 10, - 'dynamodb.url': 'https://%s' % $._config.aws_region, - 's3.url': 'https://%s/%s' % [$._config.aws_region, $._config.s3_bucket_name], - } else {}, - cassandra: - if std.count($._config.enabledBackends, 'cassandra') > 0 then { - 'cassandra.keyspace': $._config.namespace, - 'cassandra.addresses': $._config.cassandra_addresses, - 'cassandra.replication-factor': $._config.replication_factor, - } else {}, - gcp: - if std.count($._config.enabledBackends, 'gcp') > 0 then { - 'bigtable.project': $._config.bigtable_project, - 'bigtable.instance': $._config.bigtable_instance, - } else {}, - }, - - storeConfig: self.storeMemcachedChunksConfig, - - storeMemcachedChunksConfig: if $._config.memcached_chunks_enabled && ($._config.storage_engine == 'chunks' || $._config.querier_second_storage_engine == 'chunks') then - { - 'store.chunks-cache.memcached.hostname': 'memcached.%s.svc.cluster.local' % $._config.namespace, - 'store.chunks-cache.memcached.service': 'memcached-client', - 'store.chunks-cache.memcached.timeout': '3s', - } - else {}, - grpcConfig:: { 'server.grpc.keepalive.min-time-between-pings': '10s', 'server.grpc.keepalive.ping-without-stream-allowed': true, }, - storageConfig: - $._config.client_configs.aws + - $._config.client_configs.cassandra + - $._config.client_configs.gcp + - { 'schema-config-file': '/etc/cortex/schema/config.yaml' }, - genericBlocksStorageConfig:: { - 'store.engine': $._config.storage_engine, // May still be chunks + 'store.engine': 'blocks', }, + queryBlocksStorageConfig:: { 'blocks-storage.bucket-store.sync-dir': '/data/tsdb', 
'blocks-storage.bucket-store.ignore-deletion-marks-delay': '1h', @@ -176,59 +105,29 @@ 'blocks-storage.azure.account-name': $._config.blocks_storage_azure_account_name, 'blocks-storage.azure.account-key': $._config.blocks_storage_azure_account_key, }, - // Blocks storage configuration, used only when 'blocks' storage - // engine is explicitly enabled. - blocksStorageConfig: ( - if $._config.storage_engine == 'blocks' || $._config.querier_second_storage_engine == 'blocks' then ( - if $._config.blocks_storage_backend == 'gcs' then $._config.gcsBlocksStorageConfig - else if $._config.blocks_storage_backend == 's3' then $._config.s3BlocksStorageConfig - else if $._config.blocks_storage_backend == 'azure' then $._config.azureBlocksStorageConfig - else $._config.genericBlocksStorageConfig - ) else {} - ), + + blocksStorageConfig: + if $._config.blocks_storage_backend == 'gcs' then $._config.gcsBlocksStorageConfig + else if $._config.blocks_storage_backend == 's3' then $._config.s3BlocksStorageConfig + else if $._config.blocks_storage_backend == 'azure' then $._config.azureBlocksStorageConfig + else $._config.genericBlocksStorageConfig, // Querier component config (shared between the ruler and querier). queryConfig: { 'runtime-config.file': '/etc/cortex/overrides.yaml', - // Limit the size of the rows we read from the index. - 'store.cardinality-limit': 1e6, - // Don't allow individual queries of longer than 32days. Due to day query // splitting in the frontend, the reality is this only limits rate(foo[32d]) // type queries. 32 days to allow for comparision over the last month (31d) and // then some. 'store.max-query-length': '768h', - } + ( - if $._config.storage_engine == 'chunks' then { - // Don't query ingesters for older queries. - // Chunks are held in memory for up to 6hrs right now. Additional 6h are granted for safety reasons because - // the remote writing Prometheus may have a delay or write requests into the database are queued. 
- 'querier.query-ingesters-within': '12h', - - // Don't query the chunk store for data younger than max_chunk_idle. - 'querier.query-store-after': $._config.max_chunk_idle, - } else if $._config.storage_engine == 'blocks' then { - // Ingesters don't have data older than 13h, no need to ask them. - 'querier.query-ingesters-within': '13h', - - // No need to look at store for data younger than 12h, as ingesters have all of it. - 'querier.query-store-after': '12h', - } - ) + ( - if $._config.memcached_index_queries_enabled && ($._config.storage_engine == 'chunks' || $._config.querier_second_storage_engine == 'chunks') then - { - // Setting for index cache. - 'store.index-cache-validity': '14m', // ingester.retain-period=15m, 1m less for safety. - 'store.index-cache-read.cache.enable-fifocache': true, - 'store.index-cache-read.fifocache.max-size-items': 102400, - 'store.index-cache-read.memcached.hostname': 'memcached-index-queries.%(namespace)s.svc.cluster.local' % $._config, - 'store.index-cache-read.memcached.service': 'memcached-client', - 'store.index-cache-read.memcached.timeout': '500ms', - 'store.cache-lookups-older-than': '36h', - } - else {} - ), + + // Ingesters don't have data older than 13h, no need to ask them. + 'querier.query-ingesters-within': '13h', + + // No need to look at store for data younger than 12h, as ingesters have all of it. + 'querier.query-store-after': '12h', + }, // PromQL query engine config (shared between all services running PromQL engine, like the ruler and querier). 
queryEngineConfig: { @@ -473,8 +372,6 @@ // if not empty, passed to overrides.yaml as another top-level field multi_kv_config: {}, - schemaID: std.md5(std.toString($._config.schema)), - enable_pod_priorities: true, alertmanager_enabled: false, @@ -509,19 +406,6 @@ ), }), - storage_config: - configMap.new('schema-' + $._config.schemaID) + - configMap.withData({ - 'config.yaml': $.util.manifestYaml({ - configs: $._config.schema, - }), - }), - - local deployment = $.apps.v1.deployment, - storage_config_mixin:: - deployment.mixin.spec.template.metadata.withAnnotationsMixin({ schemaID: $._config.schemaID },) + - $.util.configVolumeMount('schema-' + $._config.schemaID, '/etc/cortex/schema'), - // This removed the CPU limit from the config. NB won't show up in subset // diffs, but ks apply will do the right thing. removeCPULimitsMixin:: { diff --git a/cortex/cortex-manifests.jsonnet.example b/cortex/cortex-manifests.jsonnet.example deleted file mode 100644 index 9abcc1b1..00000000 --- a/cortex/cortex-manifests.jsonnet.example +++ /dev/null @@ -1,27 +0,0 @@ -local cortex = import "cortex/cortex.libsonnet"; - -cortex { - _config+:: { - namespace: "default", - schema: [{ - from: '2019-11-15', - store: 'bigtable-hashed', - object_store: 'gcs', - schema: 'v10', - index: { - prefix: 'dev_index_', - period: '168h', - }, - chunks: { - prefix: 'dev_chunks_', - period: '168h', - }, - }], - - storage_backend: 'gcp', - bigtable_instance: 'example-instance-prod', - bigtable_project: 'example-project1-cortex', - ruler_client_type: 'gcs' - }, -} - diff --git a/cortex/cortex.libsonnet b/cortex/cortex.libsonnet index b8716d19..122aa80c 100644 --- a/cortex/cortex.libsonnet +++ b/cortex/cortex.libsonnet @@ -3,6 +3,7 @@ (import 'images.libsonnet') + (import 'common.libsonnet') + (import 'config.libsonnet') + +(import 'tsdb-config.libsonnet') + (import 'consul.libsonnet') + // Cortex services @@ -10,10 +11,11 @@ (import 'ingester.libsonnet') + (import 'querier.libsonnet') + (import 
'query-frontend.libsonnet') + -(import 'table-manager.libsonnet') + (import 'ruler.libsonnet') + (import 'alertmanager.libsonnet') + (import 'query-scheduler.libsonnet') + +(import 'compactor.libsonnet') + +(import 'store-gateway.libsonnet') + // Supporting services (import 'etcd.libsonnet') + diff --git a/cortex/flusher-job.libsonnet b/cortex/flusher-job.libsonnet deleted file mode 100644 index 4d9a5762..00000000 --- a/cortex/flusher-job.libsonnet +++ /dev/null @@ -1,51 +0,0 @@ -{ - // Usage example: - // local flusher_job = import 'cortex/flusher-job.libsonnet'; - // flusher_job + { - // flusher_job: - // $.flusher_job_func('pvc-af8947e6-182e-11ea-82e4-42010a9a0137', 'ingester-pvc-ingester-5'), - // } - - local container = $.core.v1.container, - local job = $.batch.v1.job, - local volumeMount = $.core.v1.volumeMount, - local volume = $.core.v1.volume, - - flusher_container:: - container.new('flusher', $._images.flusher) + - container.withPorts($.util.defaultPorts) + - container.withArgsMixin($.util.mapToFlags($.ingester_args { - target: 'flusher', - 'flusher.wal-dir': $._config.wal_dir, - })) + - $.util.resourcesRequests('4', '15Gi') + - $.util.resourcesLimits(null, '25Gi') + - $.util.readinessProbe + - $.jaeger_mixin, - - flusher_job_storage_config_mixin:: - job.mixin.metadata.withAnnotationsMixin({ schemaID: $._config.schemaID },) + - $.util.configVolumeMount('schema-' + $._config.schemaID, '/etc/cortex/schema'), - - flusher_job_func(volumeName, pvcName):: - job.new() + - job.mixin.spec.template.spec.withContainers([ - $.flusher_container + - container.withVolumeMountsMixin([ - volumeMount.new(volumeName, $._config.wal_dir), - ]), - ]) + - job.mixin.spec.template.spec.withRestartPolicy('Never') + - job.mixin.spec.template.spec.withVolumes([ - volume.fromPersistentVolumeClaim(volumeName, pvcName), - ]) + - $.flusher_job_storage_config_mixin + - job.mixin.metadata.withName('flusher') + - job.mixin.metadata.withNamespace($._config.namespace) + - 
job.mixin.metadata.withLabels({ name: 'flusher' }) + - job.mixin.spec.template.metadata.withLabels({ name: 'flusher' }) + - job.mixin.spec.template.spec.securityContext.withRunAsUser(0) + - job.mixin.spec.template.spec.withTerminationGracePeriodSeconds(300) + - $.util.configVolumeMount($._config.overrides_configmap, '/etc/cortex') + - $.util.podPriority('high'), -} diff --git a/cortex/gcs/main.jsonnet.example b/cortex/gcs/main.jsonnet.example new file mode 100644 index 00000000..99d40caf --- /dev/null +++ b/cortex/gcs/main.jsonnet.example @@ -0,0 +1,22 @@ +local cortex = import 'cortex/cortex.libsonnet'; + +cortex { + _config+:: { + namespace: 'default', + + blocks_storage_backend: 'gcs', + blocks_storage_bucket_name: 'example-bucket', + + // Cortex Ruler config. + ruler_enabled: true, + ruler_client_type: 'gcs', + ruler_storage_bucket_name: 'ruler-example-bucket', + + // Cortex Alertmanager config + alertmanager_enabled: true, + alertmanager_client_type: 'gcs', + alertmanager_gcs_bucket_name: 'alertmanager-example-bucket', + external_url: 'https://cortex.example.com', //Alertmanager UI + cluster: 'cluster', + }, +} diff --git a/cortex/ingester.libsonnet b/cortex/ingester.libsonnet index 3078db36..50ab90fd 100644 --- a/cortex/ingester.libsonnet +++ b/cortex/ingester.libsonnet @@ -1,9 +1,21 @@ { + local podDisruptionBudget = $.policy.v1beta1.podDisruptionBudget, + local pvc = $.core.v1.persistentVolumeClaim, + local statefulSet = $.apps.v1.statefulSet, + local volume = $.core.v1.volume, + + // The ingesters should persist TSDB blocks and WAL on a persistent + // volume in order to be crash resilient. 
+ local ingester_data_pvc = + pvc.new() + + pvc.mixin.spec.resources.withRequests({ storage: $._config.cortex_ingester_data_disk_size }) + + pvc.mixin.spec.withAccessModes(['ReadWriteOnce']) + + pvc.mixin.spec.withStorageClassName($._config.cortex_ingester_data_disk_class) + + pvc.mixin.metadata.withName('ingester-data'), + ingester_args:: $._config.grpcConfig + $._config.ringConfig + - $._config.storeConfig + - $._config.storageConfig + $._config.blocksStorageConfig + $._config.distributorConfig + // This adds the distributor ring flags to the ingester. $._config.ingesterLimitsConfig + @@ -12,14 +24,12 @@ // Ring config. 'ingester.num-tokens': 512, - 'ingester.join-after': '30s', - 'ingester.max-transfer-retries': 60, // Each retry is backed off by 5s, so 5mins for new ingester to come up. + 'ingester.join-after': '0s', 'ingester.heartbeat-period': '15s', 'ingester.max-stale-chunk-idle': '5m', 'ingester.unregister-on-shutdown': $._config.unregister_ingesters_on_shutdown, // Chunk building/flushing config. - 'ingester.chunk-encoding': 3, // Bigchunk encoding 'ingester.retain-period': '15m', 'ingester.max-chunk-age': '6h', @@ -29,15 +39,16 @@ 'server.grpc-max-concurrent-streams': 10000, 'server.grpc-max-send-msg-size-bytes': 10 * 1024 * 1024, 'server.grpc-max-recv-msg-size-bytes': 10 * 1024 * 1024, - } + ( - if $._config.memcached_index_writes_enabled then - { - // Setup index write deduping. - 'store.index-cache-write.memcached.hostname': 'memcached-index-writes.%(namespace)s.svc.cluster.local' % $._config, - 'store.index-cache-write.memcached.service': 'memcached-client', - } - else {} - ), + + 'blocks-storage.tsdb.dir': '/data/tsdb', + 'blocks-storage.tsdb.block-ranges-period': '2h', + 'blocks-storage.tsdb.retention-period': '96h', // 4 days protection against blocks not being uploaded from ingesters. 
+ 'blocks-storage.tsdb.ship-interval': '1m', + + // Persist ring tokens so that when the ingester will be restarted + // it will pick the same tokens + 'ingester.tokens-file-path': '/data/tokens', + }, ingester_statefulset_args:: $._config.grpcConfig @@ -76,64 +87,14 @@ ingester_deployment_labels:: {}, - local pvc = $.core.v1.persistentVolumeClaim, - local volume = $.core.v1.volume, - local statefulSet = $.apps.v1.statefulSet, - local ingester_pvc = pvc.new('ingester-pvc') + pvc.mixin.spec.resources.withRequests({ storage: $._config.ingester.statefulset_disk }) + pvc.mixin.spec.withAccessModes(['ReadWriteOnce']) + pvc.mixin.spec.withStorageClassName('fast'), - statefulset_storage_config_mixin:: - statefulSet.mixin.spec.template.metadata.withAnnotationsMixin({ schemaID: $._config.schemaID },) + - $.util.configVolumeMount('schema-' + $._config.schemaID, '/etc/cortex/schema'), - - ingester_statefulset: - if $._config.ingester_deployment_without_wal == false then - statefulSet.new('ingester', 3, [$.ingester_statefulset_container], ingester_pvc) + - statefulSet.mixin.spec.withServiceName('ingester') + - statefulSet.mixin.spec.template.spec.withVolumes([volume.fromPersistentVolumeClaim('ingester-pvc', 'ingester-pvc')]) + - statefulSet.mixin.metadata.withNamespace($._config.namespace) + - statefulSet.mixin.metadata.withLabels({ name: 'ingester' }) + - statefulSet.mixin.spec.template.metadata.withLabels({ name: 'ingester' } + $.ingester_deployment_labels) + - statefulSet.mixin.spec.selector.withMatchLabels({ name: 'ingester' }) + - statefulSet.mixin.spec.template.spec.securityContext.withRunAsUser(0) + - statefulSet.mixin.spec.template.spec.withTerminationGracePeriodSeconds(4800) + - statefulSet.mixin.spec.updateStrategy.withType('RollingUpdate') + - $.statefulset_storage_config_mixin + - $.util.configVolumeMount($._config.overrides_configmap, '/etc/cortex') + - $.util.podPriority('high') + - $.util.antiAffinityStatefulSet - else null, - - local deployment = 
$.apps.v1.deployment, - - ingester_deployment: - if $._config.ingester_deployment_without_wal then - deployment.new(name, 3, [$.ingester_container], $.ingester_deployment_labels) + - $.util.antiAffinity + - $.util.configVolumeMount($._config.overrides_configmap, '/etc/cortex') + - deployment.mixin.metadata.withLabels({ name: name }) + - deployment.mixin.spec.withMinReadySeconds(60) + - deployment.mixin.spec.strategy.rollingUpdate.withMaxSurge(0) + - deployment.mixin.spec.strategy.rollingUpdate.withMaxUnavailable(1) + - deployment.mixin.spec.template.spec.withTerminationGracePeriodSeconds(4800) + - $.storage_config_mixin + - $.util.podPriority('high') - else null, - ingester_service_ignored_labels:: [], - ingester_service: - if $._config.ingester_deployment_without_wal then - $.util.serviceFor($.ingester_deployment, $.ingester_service_ignored_labels) - else - $.util.serviceFor($.ingester_statefulset, $.ingester_service_ignored_labels), - - local podDisruptionBudget = $.policy.v1beta1.podDisruptionBudget, - newIngesterPdb(pdbName, ingesterName):: podDisruptionBudget.new() + podDisruptionBudget.mixin.metadata.withName(pdbName) + @@ -142,4 +103,34 @@ podDisruptionBudget.mixin.spec.withMaxUnavailable(1), ingester_pdb: self.newIngesterPdb('ingester-pdb', name), + + newIngesterStatefulSet(name, container, with_anti_affinity=true):: + statefulSet.new(name, 3, [ + container + $.core.v1.container.withVolumeMountsMixin([ + volumeMount.new('ingester-data', '/data'), + ]), + ], ingester_data_pvc) + + statefulSet.mixin.spec.withServiceName(name) + + statefulSet.mixin.metadata.withNamespace($._config.namespace) + + statefulSet.mixin.metadata.withLabels({ name: name }) + + statefulSet.mixin.spec.template.metadata.withLabels({ name: name } + $.ingester_deployment_labels) + + statefulSet.mixin.spec.selector.withMatchLabels({ name: name }) + + statefulSet.mixin.spec.template.spec.securityContext.withRunAsUser(0) + + // When the ingester needs to flush blocks to the storage, it may 
take quite a lot of time. + // For this reason, we grant a high termination period (20 minutes). + statefulSet.mixin.spec.template.spec.withTerminationGracePeriodSeconds(1200) + + statefulSet.mixin.spec.updateStrategy.withType('RollingUpdate') + + $.util.configVolumeMount($._config.overrides_configmap, '/etc/cortex') + + $.util.podPriority('high') + + // Parallelly scale up/down ingester instances instead of starting them + // one by one. This does NOT affect rolling updates: they will continue to be + // rolled out one by one (the next pod will be rolled out once the previous is + // ready). + statefulSet.mixin.spec.withPodManagementPolicy('Parallel') + + (if with_anti_affinity then $.util.antiAffinity else {}), + + ingester_statefulset: self.newIngesterStatefulSet('ingester', $.ingester_container), + + ingester_service: + $.util.serviceFor($.ingester_statefulset, $.ingester_service_ignored_labels), } diff --git a/cortex/memcached.libsonnet b/cortex/memcached.libsonnet index 011328c3..fb8d2e3e 100644 --- a/cortex/memcached.libsonnet +++ b/cortex/memcached.libsonnet @@ -38,14 +38,6 @@ memcached { } else {}, - // Dedicated memcached instance used to dedupe writes to the index. - memcached_index_writes: if $._config.memcached_index_writes_enabled then - $.memcached { - name: 'memcached-index-writes', - max_item_size: '%dm' % [$._config.memcached_index_writes_max_item_size_mb], - } - else {}, - // Memcached instance used to cache chunks.
memcached_chunks: if $._config.memcached_chunks_enabled then $.memcached { diff --git a/cortex/querier.libsonnet b/cortex/querier.libsonnet index eb807ee2..6ebe85fe 100644 --- a/cortex/querier.libsonnet +++ b/cortex/querier.libsonnet @@ -4,12 +4,13 @@ querier_args:: $._config.grpcConfig + $._config.ringConfig + - $._config.storeConfig + - $._config.storageConfig + $._config.blocksStorageConfig + $._config.queryConfig + $._config.queryEngineConfig + $._config.distributorConfig + + $._config.queryBlocksStorageConfig + + $.blocks_metadata_caching_config + + $.bucket_index_config + { target: 'querier', @@ -25,8 +26,6 @@ 'querier.frontend-address': 'query-frontend-discovery.%(namespace)s.svc.cluster.local:9095' % $._config, 'querier.frontend-client.grpc-max-send-msg-size': 100 << 20, - 'querier.second-store-engine': $._config.querier_second_storage_engine, - // We request high memory but the Go heap is typically very low (< 100MB) and this causes // the GC to trigger continuously. Setting a ballast of 256MB reduces GC. 
'mem-ballast-size-bytes': 1 << 28, // 256M @@ -59,8 +58,7 @@ (if $._config.cortex_querier_allow_multiple_replicas_on_same_node then {} else $.util.antiAffinity) + $.util.configVolumeMount($._config.overrides_configmap, '/etc/cortex') + deployment.mixin.spec.strategy.rollingUpdate.withMaxSurge(5) + - deployment.mixin.spec.strategy.rollingUpdate.withMaxUnavailable(1) + - $.storage_config_mixin, + deployment.mixin.spec.strategy.rollingUpdate.withMaxUnavailable(1), querier_deployment: self.newQuerierDeployment('querier', $.querier_container), diff --git a/cortex/ruler.libsonnet b/cortex/ruler.libsonnet index a7df54fd..cfb0252b 100644 --- a/cortex/ruler.libsonnet +++ b/cortex/ruler.libsonnet @@ -4,14 +4,15 @@ ruler_args:: $._config.grpcConfig + $._config.ringConfig + - $._config.storeConfig + - $._config.storageConfig + $._config.blocksStorageConfig + $._config.queryConfig + $._config.queryEngineConfig + $._config.distributorConfig + $._config.rulerClientConfig + $._config.rulerLimitsConfig + + $._config.queryBlocksStorageConfig + + $.blocks_metadata_caching_config + + $.bucket_index_config + { target: 'ruler', // Alertmanager configs @@ -27,9 +28,6 @@ 'server.grpc-max-send-msg-size-bytes': 10 * 1024 * 1024, 'server.grpc-max-recv-msg-size-bytes': 10 * 1024 * 1024, - // Storage - 'querier.second-store-engine': $._config.querier_second_storage_engine, - // Do not extend the replication set on unhealthy (or LEAVING) ingester when "unregister on shutdown" // is set to false. 
'distributor.extend-writes': $._config.unregister_ingesters_on_shutdown, @@ -55,8 +53,7 @@ deployment.mixin.spec.strategy.rollingUpdate.withMaxUnavailable(1) + deployment.mixin.spec.template.spec.withTerminationGracePeriodSeconds(600) + (if $._config.cortex_ruler_allow_multiple_replicas_on_same_node then {} else $.util.antiAffinity) + - $.util.configVolumeMount($._config.overrides_configmap, '/etc/cortex') + - $.storage_config_mixin + $.util.configVolumeMount($._config.overrides_configmap, '/etc/cortex') else {}, local service = $.core.v1.service, diff --git a/cortex/s3/main.jsonnet.example b/cortex/s3/main.jsonnet.example new file mode 100644 index 00000000..0d961ffb --- /dev/null +++ b/cortex/s3/main.jsonnet.example @@ -0,0 +1,24 @@ +local cortex = import 'cortex/cortex.libsonnet'; + +cortex { + _config+:: { + namespace: 'default', + + blocks_storage_backend: 's3', + blocks_storage_bucket_name: 'blocks-example-bucket', + + aws_region: 'replace-with-valid-region', // For example 'us-east-2', + + // Cortex Ruler config + ruler_enabled: true, + ruler_client_type: 's3', + ruler_storage_bucket_name: 'ruler-example-bucket', + + // Cortex Alertmanager config + alertmanager_enabled: true, + alertmanager_client_type: 's3', + alertmanager_s3_bucket_name: 'alertmanager-example-bucket', + external_url: 'https://cortex.example.com', //Alertmanager UI + cluster: 'cluster', + }, +} diff --git a/cortex/store-gateway.libsonnet b/cortex/store-gateway.libsonnet new file mode 100644 index 00000000..4568692b --- /dev/null +++ b/cortex/store-gateway.libsonnet @@ -0,0 +1,94 @@ +{ + local container = $.core.v1.container, + local podDisruptionBudget = $.policy.v1beta1.podDisruptionBudget, + local pvc = $.core.v1.persistentVolumeClaim, + local statefulSet = $.apps.v1.statefulSet, + local volumeMount = $.core.v1.volumeMount, + + // The store-gateway runs a statefulset. 
+ local store_gateway_data_pvc = + pvc.new() + + pvc.mixin.spec.resources.withRequests({ storage: $._config.cortex_store_gateway_data_disk_size }) + + pvc.mixin.spec.withAccessModes(['ReadWriteOnce']) + + pvc.mixin.spec.withStorageClassName($._config.cortex_store_gateway_data_disk_class) + + pvc.mixin.metadata.withName('store-gateway-data'), + + store_gateway_args:: + $._config.grpcConfig + + $._config.blocksStorageConfig + + $._config.queryBlocksStorageConfig + + { + target: 'store-gateway', + 'runtime-config.file': '/etc/cortex/overrides.yaml', + + // Persist ring tokens so that when the store-gateway is restarted + // it picks the same tokens + 'store-gateway.sharding-ring.tokens-file-path': '/data/tokens', + + // Block index-headers are pre-downloaded but lazily mmapped and loaded at query time. + 'blocks-storage.bucket-store.index-header-lazy-loading-enabled': 'true', + 'blocks-storage.bucket-store.index-header-lazy-loading-idle-timeout': '60m', + + 'blocks-storage.bucket-store.max-chunk-pool-bytes': 12 * 1024 * 1024 * 1024, + + // We should keep a number of idle connections equal to the max "get" concurrency, + // in order to avoid re-opening connections continuously (this would be slower + // and fill up the conntrack table too). + // + // The downside of this approach is that we'll end up with a higher number of + // active connections to memcached, so we have to make sure the connection limit + // set in memcached is high enough.
+ 'blocks-storage.bucket-store.index-cache.memcached.max-get-multi-concurrency': 100, + 'blocks-storage.bucket-store.chunks-cache.memcached.max-get-multi-concurrency': 100, + 'blocks-storage.bucket-store.metadata-cache.memcached.max-get-multi-concurrency': 100, + 'blocks-storage.bucket-store.index-cache.memcached.max-idle-connections': $.store_gateway_args['blocks-storage.bucket-store.index-cache.memcached.max-get-multi-concurrency'], + 'blocks-storage.bucket-store.chunks-cache.memcached.max-idle-connections': $.store_gateway_args['blocks-storage.bucket-store.chunks-cache.memcached.max-get-multi-concurrency'], + 'blocks-storage.bucket-store.metadata-cache.memcached.max-idle-connections': $.store_gateway_args['blocks-storage.bucket-store.metadata-cache.memcached.max-get-multi-concurrency'], + } + + $.blocks_chunks_caching_config + + $.blocks_metadata_caching_config + + $.bucket_index_config, + + store_gateway_ports:: $.util.defaultPorts, + + store_gateway_container:: + container.new('store-gateway', $._images.store_gateway) + + container.withPorts($.store_gateway_ports) + + container.withArgsMixin($.util.mapToFlags($.store_gateway_args)) + + container.withVolumeMountsMixin([volumeMount.new('store-gateway-data', '/data')]) + + $.util.resourcesRequests('1', '12Gi') + + $.util.resourcesLimits(null, '18Gi') + + $.util.readinessProbe + + $.jaeger_mixin, + + newStoreGatewayStatefulSet(name, container):: + statefulSet.new(name, 3, [container], store_gateway_data_pvc) + + statefulSet.mixin.spec.withServiceName(name) + + statefulSet.mixin.metadata.withNamespace($._config.namespace) + + statefulSet.mixin.metadata.withLabels({ name: name }) + + statefulSet.mixin.spec.template.metadata.withLabels({ name: name }) + + statefulSet.mixin.spec.selector.withMatchLabels({ name: name }) + + statefulSet.mixin.spec.template.spec.securityContext.withRunAsUser(0) + + statefulSet.mixin.spec.updateStrategy.withType('RollingUpdate') + + 
statefulSet.mixin.spec.template.spec.withTerminationGracePeriodSeconds(120) + + // Parallelly scale up/down store-gateway instances instead of starting them + // one by one. This does NOT affect rolling updates: they will continue to be + // rolled out one by one (the next pod will be rolled out once the previous is + // ready). + statefulSet.mixin.spec.withPodManagementPolicy('Parallel') + + $.util.configVolumeMount($._config.overrides_configmap, '/etc/cortex'), + + store_gateway_statefulset: self.newStoreGatewayStatefulSet('store-gateway', $.store_gateway_container), + + store_gateway_service: + $.util.serviceFor($.store_gateway_statefulset), + + store_gateway_pdb: + podDisruptionBudget.new() + + podDisruptionBudget.mixin.metadata.withName('store-gateway-pdb') + + podDisruptionBudget.mixin.metadata.withLabels({ name: 'store-gateway-pdb' }) + + podDisruptionBudget.mixin.spec.selector.withMatchLabels({ name: 'store-gateway' }) + + // To avoid any disruption in the read path we need at least 1 replica of each + // block available, so the disruption budget depends on the blocks replication factor. + podDisruptionBudget.mixin.spec.withMaxUnavailable(if $._config.store_gateway_replication_factor > 1 then $._config.store_gateway_replication_factor - 1 else 1), +} diff --git a/cortex/table-manager.libsonnet b/cortex/table-manager.libsonnet deleted file mode 100644 index 90cb733c..00000000 --- a/cortex/table-manager.libsonnet +++ /dev/null @@ -1,44 +0,0 @@ -{ - local container = $.core.v1.container, - - table_manager_args:: - $._config.storageConfig - { - target: 'table-manager', - - // Rate limit Bigtable Admin calls. Google seem to limit to ~100QPS, - // and given 2yrs worth of tables (~100) a sync will table 20s. This - // allows you to run upto 20 independant Cortex clusters on the same - // Google project before running into issues. 
- 'bigtable.grpc-client-rate-limit': 5.0, - 'bigtable.grpc-client-rate-limit-burst': 5, - 'bigtable.backoff-on-ratelimits': true, - 'bigtable.table-cache.enabled': true, - 'table-manager.poll-interval': '10m', - 'table-manager.periodic-table.grace-period': '3h', - }, - - table_manager_container:: - if $._config.table_manager_enabled then - container.new('table-manager', $._images.tableManager) + - container.withPorts($.util.defaultPorts) + - container.withArgsMixin($.util.mapToFlags($.table_manager_args)) + - $.util.resourcesRequests('100m', '100Mi') + - $.util.resourcesLimits('200m', '200Mi') + - $.util.readinessProbe + - $.jaeger_mixin - else {}, - - local deployment = $.apps.v1.deployment, - - table_manager_deployment: - if $._config.table_manager_enabled then - deployment.new('table-manager', 1, [$.table_manager_container]) + - $.storage_config_mixin - else {}, - - table_manager_service: - if $._config.table_manager_enabled then - $.util.serviceFor($.table_manager_deployment) - else {}, -} diff --git a/cortex/tsdb-config.libsonnet b/cortex/tsdb-config.libsonnet new file mode 100644 index 00000000..a397bdaf --- /dev/null +++ b/cortex/tsdb-config.libsonnet @@ -0,0 +1,68 @@ +{ + _config+:: { + // Enforce blocks storage + storage_backend: 'none', + storage_engine: 'blocks', + + // Allow to configure the ingester disk. + cortex_ingester_data_disk_size: '100Gi', + cortex_ingester_data_disk_class: 'fast', + + // Allow to configure the store-gateway disk. + cortex_store_gateway_data_disk_size: '50Gi', + cortex_store_gateway_data_disk_class: 'standard', + + // Allow to configure the compactor disk. + cortex_compactor_data_disk_size: '250Gi', + cortex_compactor_data_disk_class: 'standard', + + // Allow to fine tune compactor. 
+ cortex_compactor_max_concurrency: 1, + // While this is the default value, we want to pass the same to the -blocks-storage.bucket-store.sync-interval + cortex_compactor_cleanup_interval: '15m', + + // Enable use of bucket index by querier, ruler and store-gateway. + // Bucket index is generated by compactor from Cortex 1.7, there is no flag required to enable this on compactor. + cortex_bucket_index_enabled: false, + }, + + blocks_chunks_caching_config:: + ( + if $._config.memcached_index_queries_enabled then { + 'blocks-storage.bucket-store.index-cache.backend': 'memcached', + 'blocks-storage.bucket-store.index-cache.memcached.addresses': 'dnssrvnoa+memcached-index-queries.%(namespace)s.svc.cluster.local:11211' % $._config, + 'blocks-storage.bucket-store.index-cache.memcached.timeout': '200ms', + 'blocks-storage.bucket-store.index-cache.memcached.max-item-size': $._config.memcached_index_queries_max_item_size_mb * 1024 * 1024, + 'blocks-storage.bucket-store.index-cache.memcached.max-async-buffer-size': '25000', + 'blocks-storage.bucket-store.index-cache.memcached.max-async-concurrency': '50', + 'blocks-storage.bucket-store.index-cache.memcached.max-get-multi-batch-size': '100', + } else {} + ) + ( + if $._config.memcached_chunks_enabled then { + 'blocks-storage.bucket-store.chunks-cache.backend': 'memcached', + 'blocks-storage.bucket-store.chunks-cache.memcached.addresses': 'dnssrvnoa+memcached.%(namespace)s.svc.cluster.local:11211' % $._config, + 'blocks-storage.bucket-store.chunks-cache.memcached.timeout': '200ms', + 'blocks-storage.bucket-store.chunks-cache.memcached.max-item-size': $._config.memcached_chunks_max_item_size_mb * 1024 * 1024, + 'blocks-storage.bucket-store.chunks-cache.memcached.max-async-buffer-size': '25000', + 'blocks-storage.bucket-store.chunks-cache.memcached.max-async-concurrency': '50', + 'blocks-storage.bucket-store.chunks-cache.memcached.max-get-multi-batch-size': '100', + } else {} + ), + + blocks_metadata_caching_config:: if 
$._config.memcached_metadata_enabled then { + 'blocks-storage.bucket-store.metadata-cache.backend': 'memcached', + 'blocks-storage.bucket-store.metadata-cache.memcached.addresses': 'dnssrvnoa+memcached-metadata.%(namespace)s.svc.cluster.local:11211' % $._config, + 'blocks-storage.bucket-store.metadata-cache.memcached.timeout': '200ms', + 'blocks-storage.bucket-store.metadata-cache.memcached.max-item-size': $._config.memcached_metadata_max_item_size_mb * 1024 * 1024, + 'blocks-storage.bucket-store.metadata-cache.memcached.max-async-buffer-size': '25000', + 'blocks-storage.bucket-store.metadata-cache.memcached.max-async-concurrency': '50', + 'blocks-storage.bucket-store.metadata-cache.memcached.max-get-multi-batch-size': '100', + } else {}, + + bucket_index_config:: if $._config.cortex_bucket_index_enabled then { + 'blocks-storage.bucket-store.bucket-index.enabled': true, + + // Bucket index is updated by compactor on each cleanup cycle. + 'blocks-storage.bucket-store.sync-interval': $._config.cortex_compactor_cleanup_interval, + } else {}, +} diff --git a/cortex/tsdb.libsonnet b/cortex/tsdb.libsonnet deleted file mode 100644 index 15902099..00000000 --- a/cortex/tsdb.libsonnet +++ /dev/null @@ -1,290 +0,0 @@ -{ - local pvc = $.core.v1.persistentVolumeClaim, - local volumeMount = $.core.v1.volumeMount, - local container = $.core.v1.container, - local statefulSet = $.apps.v1.statefulSet, - local service = $.core.v1.service, - - _config+:: { - // Enforce blocks storage - storage_backend: 'none', - storage_engine: 'blocks', - - // Allow to configure the ingester disk. - cortex_ingester_data_disk_size: '100Gi', - cortex_ingester_data_disk_class: 'fast', - - // Allow to configure the store-gateway disk. - cortex_store_gateway_data_disk_size: '50Gi', - cortex_store_gateway_data_disk_class: 'standard', - - // Allow to configure the compactor disk. - cortex_compactor_data_disk_size: '250Gi', - cortex_compactor_data_disk_class: 'standard', - - // Allow to fine tune compactor. 
- cortex_compactor_max_concurrency: 1, - // While this is the default value, we want to pass the same to the -blocks-storage.bucket-store.sync-interval - cortex_compactor_cleanup_interval: '15m', - - // Enable use of bucket index by querier, ruler and store-gateway. - // Bucket index is generated by compactor from Cortex 1.7, there is no flag required to enable this on compactor. - cortex_bucket_index_enabled: false, - }, - - blocks_chunks_caching_config:: - ( - if $._config.memcached_index_queries_enabled then { - 'blocks-storage.bucket-store.index-cache.backend': 'memcached', - 'blocks-storage.bucket-store.index-cache.memcached.addresses': 'dnssrvnoa+memcached-index-queries.%(namespace)s.svc.cluster.local:11211' % $._config, - 'blocks-storage.bucket-store.index-cache.memcached.timeout': '200ms', - 'blocks-storage.bucket-store.index-cache.memcached.max-item-size': $._config.memcached_index_queries_max_item_size_mb * 1024 * 1024, - 'blocks-storage.bucket-store.index-cache.memcached.max-async-buffer-size': '25000', - 'blocks-storage.bucket-store.index-cache.memcached.max-async-concurrency': '50', - 'blocks-storage.bucket-store.index-cache.memcached.max-get-multi-batch-size': '100', - } else {} - ) + ( - if $._config.memcached_chunks_enabled then { - 'blocks-storage.bucket-store.chunks-cache.backend': 'memcached', - 'blocks-storage.bucket-store.chunks-cache.memcached.addresses': 'dnssrvnoa+memcached.%(namespace)s.svc.cluster.local:11211' % $._config, - 'blocks-storage.bucket-store.chunks-cache.memcached.timeout': '200ms', - 'blocks-storage.bucket-store.chunks-cache.memcached.max-item-size': $._config.memcached_chunks_max_item_size_mb * 1024 * 1024, - 'blocks-storage.bucket-store.chunks-cache.memcached.max-async-buffer-size': '25000', - 'blocks-storage.bucket-store.chunks-cache.memcached.max-async-concurrency': '50', - 'blocks-storage.bucket-store.chunks-cache.memcached.max-get-multi-batch-size': '100', - } else {} - ), - - blocks_metadata_caching_config:: if 
$._config.memcached_metadata_enabled then { - 'blocks-storage.bucket-store.metadata-cache.backend': 'memcached', - 'blocks-storage.bucket-store.metadata-cache.memcached.addresses': 'dnssrvnoa+memcached-metadata.%(namespace)s.svc.cluster.local:11211' % $._config, - 'blocks-storage.bucket-store.metadata-cache.memcached.timeout': '200ms', - 'blocks-storage.bucket-store.metadata-cache.memcached.max-item-size': $._config.memcached_metadata_max_item_size_mb * 1024 * 1024, - 'blocks-storage.bucket-store.metadata-cache.memcached.max-async-buffer-size': '25000', - 'blocks-storage.bucket-store.metadata-cache.memcached.max-async-concurrency': '50', - 'blocks-storage.bucket-store.metadata-cache.memcached.max-get-multi-batch-size': '100', - } else {}, - - bucket_index_config:: if $._config.cortex_bucket_index_enabled then { - 'blocks-storage.bucket-store.bucket-index.enabled': true, - - // Bucket index is updated by compactor on each cleanup cycle. - 'blocks-storage.bucket-store.sync-interval': $._config.cortex_compactor_cleanup_interval, - } else {}, - - querier_args+:: $._config.queryBlocksStorageConfig + $.blocks_metadata_caching_config + $.bucket_index_config, - ruler_args+:: $._config.queryBlocksStorageConfig + $.blocks_metadata_caching_config + $.bucket_index_config, - - // The ingesters should persist TSDB blocks and WAL on a persistent - // volume in order to be crash resilient. - local ingester_data_pvc = - pvc.new() + - pvc.mixin.spec.resources.withRequests({ storage: $._config.cortex_ingester_data_disk_size }) + - pvc.mixin.spec.withAccessModes(['ReadWriteOnce']) + - pvc.mixin.spec.withStorageClassName($._config.cortex_ingester_data_disk_class) + - pvc.mixin.metadata.withName('ingester-data'), - - ingester_deployment: {}, - - ingester_args+:: { - 'blocks-storage.tsdb.dir': '/data/tsdb', - 'blocks-storage.tsdb.block-ranges-period': '2h', - 'blocks-storage.tsdb.retention-period': '96h', // 4 days protection against blocks not being uploaded from ingesters. 
- 'blocks-storage.tsdb.ship-interval': '1m', - - // Disable TSDB blocks transfer because of persistent volumes - 'ingester.max-transfer-retries': 0, - 'ingester.join-after': '0s', - - // Persist ring tokens so that when the ingester will be restarted - // it will pick the same tokens - 'ingester.tokens-file-path': '/data/tokens', - }, - - newIngesterStatefulSet(name, container, with_anti_affinity=true):: - statefulSet.new(name, 3, [ - container + $.core.v1.container.withVolumeMountsMixin([ - volumeMount.new('ingester-data', '/data'), - ]), - ], ingester_data_pvc) + - statefulSet.mixin.spec.withServiceName(name) + - statefulSet.mixin.metadata.withNamespace($._config.namespace) + - statefulSet.mixin.metadata.withLabels({ name: name }) + - statefulSet.mixin.spec.template.metadata.withLabels({ name: name } + $.ingester_deployment_labels) + - statefulSet.mixin.spec.selector.withMatchLabels({ name: name }) + - statefulSet.mixin.spec.template.spec.securityContext.withRunAsUser(0) + - // When the ingester needs to flush blocks to the storage, it may take quite a lot of time. - // For this reason, we grant an high termination period (80 minutes). - statefulSet.mixin.spec.template.spec.withTerminationGracePeriodSeconds(1200) + - statefulSet.mixin.spec.updateStrategy.withType('RollingUpdate') + - $.util.configVolumeMount($._config.overrides_configmap, '/etc/cortex') + - $.util.podPriority('high') + - // Parallelly scale up/down ingester instances instead of starting them - // one by one. This does NOT affect rolling updates: they will continue to be - // rolled out one by one (the next pod will be rolled out once the previous is - // ready). 
- statefulSet.mixin.spec.withPodManagementPolicy('Parallel') + - (if with_anti_affinity then $.util.antiAffinity else {}), - - ingester_statefulset: self.newIngesterStatefulSet('ingester', $.ingester_container), - - ingester_service: - $.util.serviceFor($.ingester_statefulset, $.ingester_service_ignored_labels), - - // The compactor runs a statefulset with a single replica, because - // it does not support horizontal scalability yet. - local compactor_data_pvc = - pvc.new() + - pvc.mixin.spec.resources.withRequests({ storage: $._config.cortex_compactor_data_disk_size }) + - pvc.mixin.spec.withAccessModes(['ReadWriteOnce']) + - pvc.mixin.spec.withStorageClassName($._config.cortex_compactor_data_disk_class) + - pvc.mixin.metadata.withName('compactor-data'), - - compactor_args:: - $._config.grpcConfig + - $._config.storageConfig + - $._config.blocksStorageConfig + - $._config.compactorLimitsConfig + - { - target: 'compactor', - - // Compactor config. - 'compactor.block-ranges': '2h,12h,24h', - 'compactor.data-dir': '/data', - 'compactor.compaction-interval': '30m', - 'compactor.compaction-concurrency': $._config.cortex_compactor_max_concurrency, - 'compactor.cleanup-interval': $._config.cortex_compactor_cleanup_interval, - - // Enable sharding. - 'compactor.sharding-enabled': true, - 'compactor.ring.store': 'consul', - 'compactor.ring.consul.hostname': 'consul.%s.svc.cluster.local:8500' % $._config.namespace, - 'compactor.ring.prefix': '', - - // Limits config. - 'runtime-config.file': '/etc/cortex/overrides.yaml', - }, - - compactor_ports:: $.util.defaultPorts, - - compactor_container:: - container.new('compactor', $._images.compactor) + - container.withPorts($.compactor_ports) + - container.withArgsMixin($.util.mapToFlags($.compactor_args)) + - container.withVolumeMountsMixin([volumeMount.new('compactor-data', '/data')]) + - // Do not limit compactor CPU and request enough cores to honor configured max concurrency. 
- $.util.resourcesRequests($._config.cortex_compactor_max_concurrency, '6Gi') + - $.util.resourcesLimits(null, '6Gi') + - $.util.readinessProbe + - $.jaeger_mixin, - - newCompactorStatefulSet(name, container):: - statefulSet.new(name, 1, [container], compactor_data_pvc) + - statefulSet.mixin.spec.withServiceName(name) + - statefulSet.mixin.metadata.withNamespace($._config.namespace) + - statefulSet.mixin.metadata.withLabels({ name: name }) + - statefulSet.mixin.spec.template.metadata.withLabels({ name: name }) + - statefulSet.mixin.spec.selector.withMatchLabels({ name: name }) + - statefulSet.mixin.spec.template.spec.securityContext.withRunAsUser(0) + - statefulSet.mixin.spec.updateStrategy.withType('RollingUpdate') + - statefulSet.mixin.spec.template.spec.withTerminationGracePeriodSeconds(900) + - // Parallelly scale up/down compactor instances instead of starting them - // one by one. This does NOT affect rolling updates: they will continue to be - // rolled out one by one (the next pod will be rolled out once the previous is - // ready). - statefulSet.mixin.spec.withPodManagementPolicy('Parallel') + - $.util.configVolumeMount($._config.overrides_configmap, '/etc/cortex'), - - compactor_statefulset: - $.newCompactorStatefulSet('compactor', $.compactor_container), - - // The store-gateway runs a statefulset. 
- local store_gateway_data_pvc = - pvc.new() + - pvc.mixin.spec.resources.withRequests({ storage: $._config.cortex_store_gateway_data_disk_size }) + - pvc.mixin.spec.withAccessModes(['ReadWriteOnce']) + - pvc.mixin.spec.withStorageClassName($._config.cortex_store_gateway_data_disk_class) + - pvc.mixin.metadata.withName('store-gateway-data'), - - store_gateway_args:: - $._config.grpcConfig + - $._config.storageConfig + - $._config.blocksStorageConfig + - $._config.queryBlocksStorageConfig + - { - target: 'store-gateway', - 'runtime-config.file': '/etc/cortex/overrides.yaml', - - // Persist ring tokens so that when the store-gateway will be restarted - // it will pick the same tokens - 'store-gateway.sharding-ring.tokens-file-path': '/data/tokens', - - // Block index-headers are pre-downloaded but lazy mmaped and loaded at query time. - 'blocks-storage.bucket-store.index-header-lazy-loading-enabled': 'true', - 'blocks-storage.bucket-store.index-header-lazy-loading-idle-timeout': '60m', - - 'blocks-storage.bucket-store.max-chunk-pool-bytes': 12 * 1024 * 1024 * 1024, - - // We should keep a number of idle connections equal to the max "get" concurrency, - // in order to avoid re-opening connections continuously (this would be slower - // and fill up the conntrack table too). - // - // The downside of this approach is that we'll end up with an higher number of - // active connections to memcached, so we have to make sure connections limit - // set in memcached is high enough. 
- 'blocks-storage.bucket-store.index-cache.memcached.max-get-multi-concurrency': 100, - 'blocks-storage.bucket-store.chunks-cache.memcached.max-get-multi-concurrency': 100, - 'blocks-storage.bucket-store.metadata-cache.memcached.max-get-multi-concurrency': 100, - 'blocks-storage.bucket-store.index-cache.memcached.max-idle-connections': $.store_gateway_args['blocks-storage.bucket-store.index-cache.memcached.max-get-multi-concurrency'], - 'blocks-storage.bucket-store.chunks-cache.memcached.max-idle-connections': $.store_gateway_args['blocks-storage.bucket-store.chunks-cache.memcached.max-get-multi-concurrency'], - 'blocks-storage.bucket-store.metadata-cache.memcached.max-idle-connections': $.store_gateway_args['blocks-storage.bucket-store.metadata-cache.memcached.max-get-multi-concurrency'], - } + - $.blocks_chunks_caching_config + - $.blocks_metadata_caching_config + - $.bucket_index_config, - - store_gateway_ports:: $.util.defaultPorts, - - store_gateway_container:: - container.new('store-gateway', $._images.store_gateway) + - container.withPorts($.store_gateway_ports) + - container.withArgsMixin($.util.mapToFlags($.store_gateway_args)) + - container.withVolumeMountsMixin([volumeMount.new('store-gateway-data', '/data')]) + - $.util.resourcesRequests('1', '12Gi') + - $.util.resourcesLimits(null, '18Gi') + - $.util.readinessProbe + - $.jaeger_mixin, - - newStoreGatewayStatefulSet(name, container):: - statefulSet.new(name, 3, [container], store_gateway_data_pvc) + - statefulSet.mixin.spec.withServiceName(name) + - statefulSet.mixin.metadata.withNamespace($._config.namespace) + - statefulSet.mixin.metadata.withLabels({ name: name }) + - statefulSet.mixin.spec.template.metadata.withLabels({ name: name }) + - statefulSet.mixin.spec.selector.withMatchLabels({ name: name }) + - statefulSet.mixin.spec.template.spec.securityContext.withRunAsUser(0) + - statefulSet.mixin.spec.updateStrategy.withType('RollingUpdate') + - 
statefulSet.mixin.spec.template.spec.withTerminationGracePeriodSeconds(120) + - // Parallelly scale up/down store-gateway instances instead of starting them - // one by one. This does NOT affect rolling updates: they will continue to be - // rolled out one by one (the next pod will be rolled out once the previous is - // ready). - statefulSet.mixin.spec.withPodManagementPolicy('Parallel') + - $.util.configVolumeMount($._config.overrides_configmap, '/etc/cortex'), - - store_gateway_statefulset: self.newStoreGatewayStatefulSet('store-gateway', $.store_gateway_container), - - store_gateway_service: - $.util.serviceFor($.store_gateway_statefulset), - - local podDisruptionBudget = $.policy.v1beta1.podDisruptionBudget, - - store_gateway_pdb: - podDisruptionBudget.new() + - podDisruptionBudget.mixin.metadata.withName('store-gateway-pdb') + - podDisruptionBudget.mixin.metadata.withLabels({ name: 'store-gateway-pdb' }) + - podDisruptionBudget.mixin.spec.selector.withMatchLabels({ name: 'store-gateway' }) + - // To avoid any disruption in the read path we need at least 1 replica of each - // block available, so the disruption budget depends on the blocks replication factor. 
- podDisruptionBudget.mixin.spec.withMaxUnavailable(if $._config.store_gateway_replication_factor > 1 then $._config.store_gateway_replication_factor - 1 else 1), -} From 4c5e36c74ab41fadfdf5f07151d1ff90db0cf94d Mon Sep 17 00:00:00 2001 From: Friedrich Gonzalez Date: Wed, 19 Oct 2022 19:35:02 +0200 Subject: [PATCH 08/60] Remove chunk configurations (#9) --- cortex/config.libsonnet | 2 -- cortex/ingester.libsonnet | 6 ------ 2 files changed, 8 deletions(-) diff --git a/cortex/config.libsonnet b/cortex/config.libsonnet index 4ced473a..ac54218b 100644 --- a/cortex/config.libsonnet +++ b/cortex/config.libsonnet @@ -20,8 +20,6 @@ cortex_querier_allow_multiple_replicas_on_same_node: false, cortex_query_frontend_allow_multiple_replicas_on_same_node: false, - max_chunk_idle: '15m', - test_exporter_enabled: false, test_exporter_start_time: error 'must specify test exporter start time', test_exporter_user_id: error 'must specify test exporter used id', diff --git a/cortex/ingester.libsonnet b/cortex/ingester.libsonnet index 50ab90fd..16d91e4f 100644 --- a/cortex/ingester.libsonnet +++ b/cortex/ingester.libsonnet @@ -26,15 +26,9 @@ 'ingester.num-tokens': 512, 'ingester.join-after': '0s', 'ingester.heartbeat-period': '15s', - 'ingester.max-stale-chunk-idle': '5m', 'ingester.unregister-on-shutdown': $._config.unregister_ingesters_on_shutdown, - // Chunk building/flushing config. - 'ingester.retain-period': '15m', - 'ingester.max-chunk-age': '6h', - // Limits config. 
- 'ingester.max-chunk-idle': $._config.max_chunk_idle, 'runtime-config.file': '/etc/cortex/overrides.yaml', 'server.grpc-max-concurrent-streams': 10000, 'server.grpc-max-send-msg-size-bytes': 10 * 1024 * 1024, From 05b947efa389e0edce6932cfe3621cc7a67115dc Mon Sep 17 00:00:00 2001 From: Friedrich Gonzalez Date: Wed, 26 Oct 2022 19:18:45 +0200 Subject: [PATCH 09/60] Support new cache metrics (#11) Signed-off-by: Friedrich Gonzalez Signed-off-by: Friedrich Gonzalez --- CHANGELOG.md | 1 + cortex-mixin/dashboards/queries.libsonnet | 15 ++++++++++++--- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5d86d000..0ca5e950 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,7 @@ * [CHANGE] Use integrated cortex overrides exporter * [ENHANCEMENT] Added main.jsonnet examples for azure, gcs and s3 * [ENHANCEMENT] How to rename buckets in AWS and Azure for `not healthy index found` playbook. #5 +* [ENHANCEMENT] Support new metrics cortex_cache_fetched_keys_total and cortex_cache_fetched_keys_total * [BUGFIX] Updated blocks_storage_s3_endpoint in config.libsonnet to include the correct aws region ## 1.11.0 / 2021-12-30 diff --git a/cortex-mixin/dashboards/queries.libsonnet b/cortex-mixin/dashboards/queries.libsonnet index 259f5dfa..f18cdee3 100644 --- a/cortex-mixin/dashboards/queries.libsonnet +++ b/cortex-mixin/dashboards/queries.libsonnet @@ -46,12 +46,18 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) .addPanel( $.panel('Results Cache Hit %') + - $.queryPanel('sum(rate(cortex_cache_hits{name=~"frontend.+", %s}[1m])) / sum(rate(cortex_cache_fetched_keys{name=~"frontend.+", %s}[1m]))' % [$.jobMatcher($._config.job_names.query_frontend), $.jobMatcher($._config.job_names.query_frontend)], 'Hit Rate') + + $.queryPanel(||| + sum(rate(cortex_cache_hits{name=~"frontend.+", %(q)s}[1m])) / sum(rate(cortex_cache_fetched_keys{name=~"frontend.+", %(q)s}[1m])) or + sum(rate(cortex_cache_hits_total{name=~"frontend.+", %(q)s}[1m])) 
/ sum(rate(cortex_cache_fetched_keys_total{name=~"frontend.+", %(q)s}[1m])) + ||| % { q: $.jobMatcher($._config.job_names.query_frontend) }, 'Hit Rate') + { yaxes: $.yaxes({ format: 'percentunit', max: 1 }) }, ) .addPanel( $.panel('Results Cache misses') + - $.queryPanel('sum(rate(cortex_cache_fetched_keys{name=~"frontend.+", %s}[1m])) - sum(rate(cortex_cache_hits{name=~"frontend.+", %s}[1m]))' % [$.jobMatcher($._config.job_names.query_frontend), $.jobMatcher($._config.job_names.query_frontend)], 'Miss Rate'), + $.queryPanel(||| + sum(rate(cortex_cache_fetched_keys{name=~"frontend.+", %(q)s}[1m])) - sum(rate(cortex_cache_hits{name=~"frontend.+", %(q)s}[1m])) or + sum(rate(cortex_cache_fetched_keys_total{name=~"frontend.+", %(q)s}[1m])) - sum(rate(cortex_cache_hits_total{name=~"frontend.+", %(q)s}[1m])) + ||| % { q: $.jobMatcher($._config.job_names.query_frontend) }, 'Miss Rate'), ) ) .addRow( @@ -94,7 +100,10 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) .addPanel( $.panel('Chunk cache misses') + - $.queryPanel('sum(rate(cortex_cache_fetched_keys{%s,name="chunksmemcache"}[1m])) - sum(rate(cortex_cache_hits{%s,name="chunksmemcache"}[1m]))' % [$.jobMatcher($._config.job_names.querier), $.jobMatcher($._config.job_names.querier)], 'Hit rate'), + $.queryPanel(||| + sum(rate(cortex_cache_fetched_keys{%(q)s,name="chunksmemcache"}[1m])) - sum(rate(cortex_cache_hits{%(q)s,name="chunksmemcache"}[1m])) or + sum(rate(cortex_cache_fetched_keys_total{%(q)s,name="chunksmemcache"}[1m])) - sum(rate(cortex_cache_hits_total{%(q)s,name="chunksmemcache"}[1m])) + ||| % { q: $.jobMatcher($._config.job_names.query_frontend) }, 'Hit rate'), ) .addPanel( $.panel('Chunk cache corruptions') + From 15bb706fb2ab117a217af56826cabef23905853b Mon Sep 17 00:00:00 2001 From: Friedrich Gonzalez Date: Tue, 6 Dec 2022 10:44:12 +0100 Subject: [PATCH 10/60] Removing dead config (#13) Signed-off-by: Friedrich Gonzalez Signed-off-by: Friedrich Gonzalez --- cortex/config.libsonnet | 4 ---- 1 file 
changed, 4 deletions(-) diff --git a/cortex/config.libsonnet b/cortex/config.libsonnet index ac54218b..f7a1baba 100644 --- a/cortex/config.libsonnet +++ b/cortex/config.libsonnet @@ -43,10 +43,6 @@ store_gateway_replication_factor: 3, - // By default ingesters will be run as StatefulSet with WAL. - // If this is set to true, ingesters will use staless deployments without WAL. - ingester_deployment_without_wal: false, - ingester: { // These config options are only for the chunks storage. wal_dir: '/wal_data', From c2eef2d7f523b81a0d56ed023829f48e9f811bc8 Mon Sep 17 00:00:00 2001 From: Friedrich Gonzalez Date: Fri, 13 Jan 2023 10:24:14 +0100 Subject: [PATCH 11/60] Use default remote timeout (#15) Use default remote timeout in distributors (2s) to prevent distributors OOMs --- CHANGELOG.md | 1 + cortex/distributor.libsonnet | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0ca5e950..40979de7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,7 @@ * [CHANGE] Updated readme to use this repo with tanka * [CHANGE] Removed chunks support * [CHANGE] Use integrated cortex overrides exporter +* [CHANGE] Use default remote timeout in distributors * [ENHANCEMENT] Added main.jsonnet examples for azure, gcs and s3 * [ENHANCEMENT] How to rename buckets in AWS and Azure for `not healthy index found` playbook. 
#5 * [ENHANCEMENT] Support new metrics cortex_cache_fetched_keys_total and cortex_cache_fetched_keys_total diff --git a/cortex/distributor.libsonnet b/cortex/distributor.libsonnet index ea22523e..c2bcfe07 100644 --- a/cortex/distributor.libsonnet +++ b/cortex/distributor.libsonnet @@ -13,7 +13,6 @@ 'validation.reject-old-samples': true, 'validation.reject-old-samples.max-age': '12h', 'runtime-config.file': '/etc/cortex/overrides.yaml', - 'distributor.remote-timeout': '20s', 'distributor.ha-tracker.enable': true, 'distributor.ha-tracker.enable-for-all-users': true, From 08645ca59a148bcdc441efd30b530179a280bb27 Mon Sep 17 00:00:00 2001 From: Friedrich Gonzalez Date: Fri, 13 Jan 2023 10:28:27 +0100 Subject: [PATCH 12/60] Memcached configuration belongs in tsdb-config.libsonnet (#12) This effectively fixes max idle connections for memcached-metadata in ruler and querier --- CHANGELOG.md | 1 + cortex/store-gateway.libsonnet | 15 --------------- cortex/tsdb-config.libsonnet | 14 ++++++++++++++ 3 files changed, 15 insertions(+), 15 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 40979de7..6db7e332 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,7 @@ * [ENHANCEMENT] How to rename buckets in AWS and Azure for `not healthy index found` playbook. #5 * [ENHANCEMENT] Support new metrics cortex_cache_fetched_keys_total and cortex_cache_fetched_keys_total * [BUGFIX] Updated blocks_storage_s3_endpoint in config.libsonnet to include the correct aws region +* [BUGFIX] Fixes `-blocks-storage.bucket-store.metadata-cache.memcached.max-idle-connections` for ruler and querier ## 1.11.0 / 2021-12-30 diff --git a/cortex/store-gateway.libsonnet b/cortex/store-gateway.libsonnet index 4568692b..235adf23 100644 --- a/cortex/store-gateway.libsonnet +++ b/cortex/store-gateway.libsonnet @@ -28,22 +28,7 @@ // Block index-headers are pre-downloaded but lazy mmaped and loaded at query time. 
'blocks-storage.bucket-store.index-header-lazy-loading-enabled': 'true', 'blocks-storage.bucket-store.index-header-lazy-loading-idle-timeout': '60m', - 'blocks-storage.bucket-store.max-chunk-pool-bytes': 12 * 1024 * 1024 * 1024, - - // We should keep a number of idle connections equal to the max "get" concurrency, - // in order to avoid re-opening connections continuously (this would be slower - // and fill up the conntrack table too). - // - // The downside of this approach is that we'll end up with an higher number of - // active connections to memcached, so we have to make sure connections limit - // set in memcached is high enough. - 'blocks-storage.bucket-store.index-cache.memcached.max-get-multi-concurrency': 100, - 'blocks-storage.bucket-store.chunks-cache.memcached.max-get-multi-concurrency': 100, - 'blocks-storage.bucket-store.metadata-cache.memcached.max-get-multi-concurrency': 100, - 'blocks-storage.bucket-store.index-cache.memcached.max-idle-connections': $.store_gateway_args['blocks-storage.bucket-store.index-cache.memcached.max-get-multi-concurrency'], - 'blocks-storage.bucket-store.chunks-cache.memcached.max-idle-connections': $.store_gateway_args['blocks-storage.bucket-store.chunks-cache.memcached.max-get-multi-concurrency'], - 'blocks-storage.bucket-store.metadata-cache.memcached.max-idle-connections': $.store_gateway_args['blocks-storage.bucket-store.metadata-cache.memcached.max-get-multi-concurrency'], } + $.blocks_chunks_caching_config + $.blocks_metadata_caching_config + diff --git a/cortex/tsdb-config.libsonnet b/cortex/tsdb-config.libsonnet index a397bdaf..99b8abb7 100644 --- a/cortex/tsdb-config.libsonnet +++ b/cortex/tsdb-config.libsonnet @@ -26,6 +26,14 @@ cortex_bucket_index_enabled: false, }, + // We should keep a number of idle connections equal to the max "get" concurrency, + // in order to avoid re-opening connections continuously (this would be slower + // and fill up the conntrack table too). 
+ // + // The downside of this approach is that we'll end up with an higher number of + // active connections to memcached, so we have to make sure connections limit + // set in memcached is high enough. + blocks_chunks_caching_config:: ( if $._config.memcached_index_queries_enabled then { @@ -36,6 +44,8 @@ 'blocks-storage.bucket-store.index-cache.memcached.max-async-buffer-size': '25000', 'blocks-storage.bucket-store.index-cache.memcached.max-async-concurrency': '50', 'blocks-storage.bucket-store.index-cache.memcached.max-get-multi-batch-size': '100', + 'blocks-storage.bucket-store.index-cache.memcached.max-get-multi-concurrency': 100, + 'blocks-storage.bucket-store.index-cache.memcached.max-idle-connections': self['blocks-storage.bucket-store.index-cache.memcached.max-get-multi-concurrency'], } else {} ) + ( if $._config.memcached_chunks_enabled then { @@ -46,6 +56,8 @@ 'blocks-storage.bucket-store.chunks-cache.memcached.max-async-buffer-size': '25000', 'blocks-storage.bucket-store.chunks-cache.memcached.max-async-concurrency': '50', 'blocks-storage.bucket-store.chunks-cache.memcached.max-get-multi-batch-size': '100', + 'blocks-storage.bucket-store.chunks-cache.memcached.max-get-multi-concurrency': 100, + 'blocks-storage.bucket-store.chunks-cache.memcached.max-idle-connections': self['blocks-storage.bucket-store.chunks-cache.memcached.max-get-multi-concurrency'], } else {} ), @@ -57,6 +69,8 @@ 'blocks-storage.bucket-store.metadata-cache.memcached.max-async-buffer-size': '25000', 'blocks-storage.bucket-store.metadata-cache.memcached.max-async-concurrency': '50', 'blocks-storage.bucket-store.metadata-cache.memcached.max-get-multi-batch-size': '100', + 'blocks-storage.bucket-store.metadata-cache.memcached.max-get-multi-concurrency': 100, + 'blocks-storage.bucket-store.metadata-cache.memcached.max-idle-connections': self['blocks-storage.bucket-store.metadata-cache.memcached.max-get-multi-concurrency'], } else {}, bucket_index_config:: if 
$._config.cortex_bucket_index_enabled then { From 625394cbf0eec0dcf845cd7f880f43679e98ccfe Mon Sep 17 00:00:00 2001 From: Friedrich Gonzalez Date: Mon, 16 Jan 2023 10:30:37 +0100 Subject: [PATCH 13/60] Release v1.11.1 (#16) Signed-off-by: Friedrich Gonzalez Signed-off-by: Friedrich Gonzalez --- CHANGELOG.md | 2 +- cortex/images.libsonnet | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6db7e332..d7fd1ad3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,6 @@ # Changelog -## master / unreleased +## 1.11.1 / 2023-01-13 * [CHANGE] Updated readme to use this repo with tanka * [CHANGE] Removed chunks support diff --git a/cortex/images.libsonnet b/cortex/images.libsonnet index f6203326..7db53819 100644 --- a/cortex/images.libsonnet +++ b/cortex/images.libsonnet @@ -5,7 +5,7 @@ memcachedExporter: 'prom/memcached-exporter:v0.6.0', // Our services. - cortex: 'cortexproject/cortex:v1.11.0', + cortex: 'cortexproject/cortex:v1.11.1', alertmanager: self.cortex, distributor: self.cortex, @@ -20,7 +20,7 @@ query_scheduler: self.cortex, overrides_exporter: self.cortex, - query_tee: 'quay.io/cortexproject/query-tee:v1.11.0', - testExporter: 'cortexproject/test-exporter:v1.11.0', + query_tee: 'quay.io/cortexproject/query-tee:v1.11.1', + testExporter: 'cortexproject/test-exporter:v1.11.1', }, } From 5e810e9ce28cc0640e810a355509a6536daad402 Mon Sep 17 00:00:00 2001 From: Friedrich Gonzalez Date: Tue, 17 Jan 2023 09:56:03 +0100 Subject: [PATCH 14/60] Use policy/v1 PodDisruptionBudget to support k8s 1.25+ (#17) Signed-off-by: Friedrich Gonzalez Signed-off-by: Friedrich Gonzalez --- CHANGELOG.md | 4 ++++ Makefile | 2 +- README.md | 2 +- cortex/ingester.libsonnet | 5 ++--- cortex/store-gateway.libsonnet | 5 ++--- 5 files changed, 10 insertions(+), 8 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d7fd1ad3..8e88e470 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,9 @@ # Changelog +## master / unreleased + 
+* [CHANGE] Use policy/v1 PodDisruptionBudget to support k8s 1.25+ + ## 1.11.1 / 2023-01-13 * [CHANGE] Updated readme to use this repo with tanka diff --git a/Makefile b/Makefile index 9f2501d5..2b4b6d25 100644 --- a/Makefile +++ b/Makefile @@ -49,7 +49,7 @@ test-readme: test-readme/azure test-readme/gcs test-readme/s3 test-readme/%: rm -rf $@ && \ mkdir -p $@ && cd $@ && \ - tk init --k8s=1.21 && \ + tk init --k8s=1.23 && \ jb install github.com/cortexproject/cortex-jsonnet/cortex@main && \ rm -fr ./vendor/cortex && \ cp -r ../../cortex ./vendor/ && \ diff --git a/README.md b/README.md index d7eaaa6d..4226c7ea 100644 --- a/README.md +++ b/README.md @@ -25,7 +25,7 @@ To generate the YAMLs for deploying Cortex: ```console $ mkdir && cd - $ tk init --k8s=1.21 # this includes github.com/jsonnet-libs/k8s-libsonnet/1.21@main + $ tk init --k8s=1.23 # this includes github.com/jsonnet-libs/k8s-libsonnet/1.23@main $ jb install github.com/cortexproject/cortex-jsonnet/cortex@main ``` diff --git a/cortex/ingester.libsonnet b/cortex/ingester.libsonnet index 16d91e4f..818716e1 100644 --- a/cortex/ingester.libsonnet +++ b/cortex/ingester.libsonnet @@ -1,5 +1,5 @@ { - local podDisruptionBudget = $.policy.v1beta1.podDisruptionBudget, + local podDisruptionBudget = $.policy.v1.podDisruptionBudget, local pvc = $.core.v1.persistentVolumeClaim, local statefulSet = $.apps.v1.statefulSet, local volume = $.core.v1.volume, @@ -90,8 +90,7 @@ ingester_service_ignored_labels:: [], newIngesterPdb(pdbName, ingesterName):: - podDisruptionBudget.new() + - podDisruptionBudget.mixin.metadata.withName(pdbName) + + podDisruptionBudget.new(pdbName) + podDisruptionBudget.mixin.metadata.withLabels({ name: pdbName }) + podDisruptionBudget.mixin.spec.selector.withMatchLabels({ name: ingesterName }) + podDisruptionBudget.mixin.spec.withMaxUnavailable(1), diff --git a/cortex/store-gateway.libsonnet b/cortex/store-gateway.libsonnet index 235adf23..cea63089 100644 --- a/cortex/store-gateway.libsonnet +++ 
b/cortex/store-gateway.libsonnet @@ -1,6 +1,6 @@ { local container = $.core.v1.container, - local podDisruptionBudget = $.policy.v1beta1.podDisruptionBudget, + local podDisruptionBudget = $.policy.v1.podDisruptionBudget, local pvc = $.core.v1.persistentVolumeClaim, local statefulSet = $.apps.v1.statefulSet, local volumeMount = $.core.v1.volumeMount, @@ -69,8 +69,7 @@ $.util.serviceFor($.store_gateway_statefulset), store_gateway_pdb: - podDisruptionBudget.new() + - podDisruptionBudget.mixin.metadata.withName('store-gateway-pdb') + + podDisruptionBudget.new('store-gateway-pdb') + podDisruptionBudget.mixin.metadata.withLabels({ name: 'store-gateway-pdb' }) + podDisruptionBudget.mixin.spec.selector.withMatchLabels({ name: 'store-gateway' }) + // To avoid any disruption in the read path we need at least 1 replica of each From 6779b009ee659eb08a0cd6afae318a4c13bf872c Mon Sep 17 00:00:00 2001 From: Friedrich Gonzalez Date: Sat, 29 Apr 2023 11:39:25 +0200 Subject: [PATCH 15/60] fixes CI (#19) * Rename workflow from lint to ci Signed-off-by: Friedrich Gonzalez * Run on push and pull_requests for main Signed-off-by: Friedrich Gonzalez * Use ubuntu-latest Signed-off-by: Friedrich Gonzalez --------- Signed-off-by: Friedrich Gonzalez --- .github/workflows/{lint.yaml => ci.yaml} | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) rename .github/workflows/{lint.yaml => ci.yaml} (83%) diff --git a/.github/workflows/lint.yaml b/.github/workflows/ci.yaml similarity index 83% rename from .github/workflows/lint.yaml rename to .github/workflows/ci.yaml index b1ad62b2..dbfd6c0d 100644 --- a/.github/workflows/lint.yaml +++ b/.github/workflows/ci.yaml @@ -1,11 +1,14 @@ name: CI on: + push: + branches: [ main ] pull_request: + branches: [ main ] jobs: lint: - runs-on: ubuntu-18.04 + runs-on: ubuntu-latest container: quay.io/cortexproject/cortex-jsonnet-build-image:c924d52 steps: - uses: actions/checkout@v2 @@ -19,7 +22,7 @@ jobs: - name: "Lint playbooks" run: make 
lint-playbooks build: - runs-on: ubuntu-18.04 + runs-on: ubuntu-latest container: quay.io/cortexproject/cortex-jsonnet-build-image:c924d52 steps: - uses: actions/checkout@v2 @@ -30,7 +33,7 @@ jobs: - name: "Build mixin" run: make build-mixin readme: - runs-on: ubuntu-18.04 + runs-on: ubuntu-latest container: quay.io/cortexproject/cortex-jsonnet-build-image:c924d52 steps: - uses: actions/checkout@v2 @@ -39,4 +42,4 @@ jobs: fetch-depth: 0 - name: "Test readme" - run: make test-readme \ No newline at end of file + run: make test-readme From ed6d0f2a47af078b6fb4f93d7dbb98a6c976295f Mon Sep 17 00:00:00 2001 From: Friedrich Gonzalez Date: Sat, 29 Apr 2023 11:40:49 +0200 Subject: [PATCH 16/60] Release v1.13.2 (#18) Signed-off-by: Friedrich Gonzalez --- CHANGELOG.md | 2 ++ cortex/images.libsonnet | 6 +++--- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8e88e470..771852e1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,8 @@ ## master / unreleased +## 1.13.2 / 2023-04-29 + * [CHANGE] Use policy/v1 PodDisruptionBudget to support k8s 1.25+ ## 1.11.1 / 2023-01-13 diff --git a/cortex/images.libsonnet b/cortex/images.libsonnet index 7db53819..b980b02a 100644 --- a/cortex/images.libsonnet +++ b/cortex/images.libsonnet @@ -5,7 +5,7 @@ memcachedExporter: 'prom/memcached-exporter:v0.6.0', // Our services. 
- cortex: 'cortexproject/cortex:v1.11.1', + cortex: 'cortexproject/cortex:v1.13.2', alertmanager: self.cortex, distributor: self.cortex, @@ -20,7 +20,7 @@ query_scheduler: self.cortex, overrides_exporter: self.cortex, - query_tee: 'quay.io/cortexproject/query-tee:v1.11.1', - testExporter: 'cortexproject/test-exporter:v1.11.1', + query_tee: 'quay.io/cortexproject/query-tee:v1.13.2', + testExporter: 'cortexproject/test-exporter:v1.13.2', }, } From 536ef90260855ab73ab6c8de553ca152516827af Mon Sep 17 00:00:00 2001 From: Friedrich Gonzalez Date: Sat, 29 Apr 2023 16:16:49 +0200 Subject: [PATCH 17/60] Upgrade build image (#20) * Upgrade alpine to 3.17 Signed-off-by: Friedrich Gonzalez * Upgrade to golang 1.20 Signed-off-by: Friedrich Gonzalez * Upgrade jsonnet to v0.20.0 Signed-off-by: Friedrich Gonzalez * Upgrade to jsonnet-bundler v0.5.1 Signed-off-by: Friedrich Gonzalez * Upgrade tanka to v0.24.0 Signed-off-by: Friedrich Gonzalez * Update ci.yaml Signed-off-by: Friedrich Gonzalez --------- Signed-off-by: Friedrich Gonzalez --- .github/workflows/ci.yaml | 8 ++++---- README.md | 6 +++--- build-image/Dockerfile | 28 ++++++++++++---------------- 3 files changed, 19 insertions(+), 23 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index dbfd6c0d..e0eac22e 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -9,7 +9,7 @@ on: jobs: lint: runs-on: ubuntu-latest - container: quay.io/cortexproject/cortex-jsonnet-build-image:c924d52 + container: quay.io/cortexproject/cortex-jsonnet-build-image:e63d87f steps: - uses: actions/checkout@v2 name: Checkout @@ -23,7 +23,7 @@ jobs: run: make lint-playbooks build: runs-on: ubuntu-latest - container: quay.io/cortexproject/cortex-jsonnet-build-image:c924d52 + container: quay.io/cortexproject/cortex-jsonnet-build-image:e63d87f steps: - uses: actions/checkout@v2 name: Checkout @@ -34,7 +34,7 @@ jobs: run: make build-mixin readme: runs-on: ubuntu-latest - container: 
quay.io/cortexproject/cortex-jsonnet-build-image:c924d52 + container: quay.io/cortexproject/cortex-jsonnet-build-image:e63d87f steps: - uses: actions/checkout@v2 name: Checkout @@ -42,4 +42,4 @@ jobs: fetch-depth: 0 - name: "Test readme" - run: make test-readme + run: make test-readme diff --git a/README.md b/README.md index 4226c7ea..a3bbc2b0 100644 --- a/README.md +++ b/README.md @@ -17,8 +17,8 @@ To generate the YAMLs for deploying Cortex: ```console $ # make sure to be outside of GOPATH or a go.mod project - $ GO111MODULE=on go install github.com/grafana/tanka/cmd/tk@v0.21.0 - $ GO111MODULE=on go install github.com/jsonnet-bundler/jsonnet-bundler/cmd/jb@v0.4.0 + $ GO111MODULE=on go install github.com/grafana/tanka/cmd/tk@v0.24.0 + $ GO111MODULE=on go install github.com/jsonnet-bundler/jsonnet-bundler/cmd/jb@v0.5.1 ``` 1. Initialise the Tanka repo, install the Cortex and Kubernetes Jsonnet libraries. @@ -68,7 +68,7 @@ To generate the Grafana dashboards and Prometheus alerts for Cortex: ```console $ GO111MODULE=on go install github.com/monitoring-mixins/mixtool/cmd/mixtool@2ff523ea63d1cdeee2a10e01d1d48d20adcc7030 -$ GO111MODULE=on go install github.com/jsonnet-bundler/jsonnet-bundler/cmd/jb@v0.4.0 +$ GO111MODULE=on go install github.com/jsonnet-bundler/jsonnet-bundler/cmd/jb@v0.5.1 $ git clone https://github.com/cortexproject/cortex-jsonnet $ cd cortex-jsonnet $ make build-mixin diff --git a/build-image/Dockerfile b/build-image/Dockerfile index dc37595e..a874e693 100644 --- a/build-image/Dockerfile +++ b/build-image/Dockerfile @@ -1,43 +1,39 @@ # Build jsonnet -FROM alpine:3.13 AS jsonnet-builder +FROM alpine:3.17 AS jsonnet-builder RUN apk add --no-cache git make g++ RUN git clone https://github.com/google/jsonnet && \ - git -C jsonnet checkout v0.15.0 && \ + git -C jsonnet checkout v0.20.0 && \ make -C jsonnet 2LDFLAGS=-static && \ cp jsonnet/jsonnet /usr/bin && \ cp jsonnet/jsonnetfmt /usr/bin # Build jb -FROM alpine:3.13 AS jb-builder -ARG 
JSONNET_BUNDLER_VERSION=0.4.0 -ARG JSONNET_BUNDLER_CHECKSUM="433edab5554a88a0371e11e93080408b225d41c31decf321c02b50d2e44993ce /usr/bin/jb" +FROM alpine:3.17 AS jb-builder +ARG JSONNET_BUNDLER_VERSION=0.5.1 +ARG JSONNET_BUNDLER_CHECKSUM="f5bccc94d28fbbe8ad1d46fd4f208619e45d368a5d7924f6335f4ecfa0605c85 /usr/bin/jb" RUN apk add --no-cache curl RUN curl -fSL -o "/usr/bin/jb" "https://github.com/jsonnet-bundler/jsonnet-bundler/releases/download/v${JSONNET_BUNDLER_VERSION}/jb-linux-amd64" RUN echo "${JSONNET_BUNDLER_CHECKSUM}" | sha256sum -c || (printf "wanted: %s\n got: %s\n" "${JSONNET_BUNDLER_CHECKSUM}" "$(sha256sum /usr/bin/jb)"; exit 1) RUN chmod +x /usr/bin/jb # Build tanka -FROM alpine:3.13 AS tk-builder -ARG TANKA_VERSION=0.21.0 -ARG TANKA_CHECKSUM="cd60a005f84fd99763f26d07d4cb626e7585a62800aae97234d8187129eed1ec /usr/bin/tk" +FROM alpine:3.17 AS tk-builder +ARG TANKA_VERSION=0.24.0 +ARG TANKA_CHECKSUM="82c8c533c29eefea0af9c28f487203b19dec84ce2624702f99196e777f946ddc /usr/bin/tk" RUN apk add --no-cache curl RUN curl -fSL -o "/usr/bin/tk" "https://github.com/grafana/tanka/releases/download/v${TANKA_VERSION}/tk-linux-amd64" RUN echo "${TANKA_CHECKSUM}" | sha256sum -c || (printf "wanted: %s\n got: %s\n" "${TANKA_CHECKSUM}" "$(sha256sum /usr/bin/tk)"; exit 1) RUN chmod +x /usr/bin/tk # Build mixtool -FROM golang:1.15-alpine AS mixtool-builder -RUN GO111MODULE=on go get github.com/monitoring-mixins/mixtool/cmd/mixtool@ae18e31161ea10545b9c1ac0d23c10122f2c12b5 +FROM golang:1.20-alpine AS mixtool-builder +RUN GO111MODULE=on go install github.com/monitoring-mixins/mixtool/cmd/mixtool@ae18e31161ea10545b9c1ac0d23c10122f2c12b5 -FROM alpine:3.13 -RUN apk add --no-cache git make libgcc libstdc++ zip findutils sed +FROM alpine:3.17 +RUN apk add --no-cache git make libgcc libstdc++ zip findutils sed yq COPY --from=jsonnet-builder /usr/bin/jsonnetfmt /usr/bin COPY --from=jsonnet-builder /usr/bin/jsonnet /usr/bin COPY --from=jb-builder /usr/bin/jb /usr/bin COPY --from=tk-builder 
/usr/bin/tk /usr/bin COPY --from=mixtool-builder /go/bin/mixtool /usr/bin -# Install yq. -# TODO We can install it via apk once alpine 3.14 or above will be released. Previous versions don't package v4. -RUN wget -O /usr/bin/yq https://github.com/mikefarah/yq/releases/download/v4.9.3/yq_linux_amd64 && \ - chmod +x /usr/bin/yq From 45824a13e2a977cdaa6791c8cd259d5c51d814f2 Mon Sep 17 00:00:00 2001 From: Friedrich Gonzalez Date: Sat, 29 Apr 2023 16:25:38 +0200 Subject: [PATCH 18/60] Use k8s 1.24 (#21) Signed-off-by: Friedrich Gonzalez --- Makefile | 2 +- README.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 2b4b6d25..3b1f77d1 100644 --- a/Makefile +++ b/Makefile @@ -49,7 +49,7 @@ test-readme: test-readme/azure test-readme/gcs test-readme/s3 test-readme/%: rm -rf $@ && \ mkdir -p $@ && cd $@ && \ - tk init --k8s=1.23 && \ + tk init --k8s=1.24 && \ jb install github.com/cortexproject/cortex-jsonnet/cortex@main && \ rm -fr ./vendor/cortex && \ cp -r ../../cortex ./vendor/ && \ diff --git a/README.md b/README.md index a3bbc2b0..b0a5133e 100644 --- a/README.md +++ b/README.md @@ -25,7 +25,7 @@ To generate the YAMLs for deploying Cortex: ```console $ mkdir && cd - $ tk init --k8s=1.23 # this includes github.com/jsonnet-libs/k8s-libsonnet/1.23@main + $ tk init --k8s=1.24 # this includes github.com/jsonnet-libs/k8s-libsonnet/1.24@main $ jb install github.com/cortexproject/cortex-jsonnet/cortex@main ``` From 945a7e83f87a112ddc178ed81db3720ce9353399 Mon Sep 17 00:00:00 2001 From: Friedrich Gonzalez Date: Mon, 8 May 2023 14:33:23 +0200 Subject: [PATCH 19/60] Enables query-scheduler by default (#23) Signed-off-by: Friedrich Gonzalez --- CHANGELOG.md | 2 ++ cortex/config.libsonnet | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 771852e1..72907a18 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,8 @@ ## master / unreleased +* [CHANGE] Enables query-scheduler by default + 
## 1.13.2 / 2023-04-29 * [CHANGE] Use policy/v1 PodDisruptionBudget to support k8s 1.25+ diff --git a/cortex/config.libsonnet b/cortex/config.libsonnet index f7a1baba..0ac5b8ed 100644 --- a/cortex/config.libsonnet +++ b/cortex/config.libsonnet @@ -371,7 +371,7 @@ alertmanager_enabled: false, // Enables query-scheduler component, and reconfigures querier and query-frontend to use it. - query_scheduler_enabled: false, + query_scheduler_enabled: true, // Enables streaming of chunks from ingesters using blocks. // Changing it will not cause new rollout of ingesters, as it gets passed to them via runtime-config. From 47d550270e236e593ccec7a2ca0d40ec3484fab7 Mon Sep 17 00:00:00 2001 From: Friedrich Gonzalez Date: Mon, 8 May 2023 14:41:23 +0200 Subject: [PATCH 20/60] Enables bucket index by default (#24) Signed-off-by: Friedrich Gonzalez --- CHANGELOG.md | 1 + cortex/tsdb-config.libsonnet | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 72907a18..d0eedebf 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,7 @@ ## master / unreleased * [CHANGE] Enables query-scheduler by default +* [CHANGE] Enables bucket-index by default ## 1.13.2 / 2023-04-29 diff --git a/cortex/tsdb-config.libsonnet b/cortex/tsdb-config.libsonnet index 99b8abb7..f4779faa 100644 --- a/cortex/tsdb-config.libsonnet +++ b/cortex/tsdb-config.libsonnet @@ -23,7 +23,7 @@ // Enable use of bucket index by querier, ruler and store-gateway. // Bucket index is generated by compactor from Cortex 1.7, there is no flag required to enable this on compactor. 
- cortex_bucket_index_enabled: false, + cortex_bucket_index_enabled: true, }, // We should keep a number of idle connections equal to the max "get" concurrency, From fe7bd055ed5fdecb4221b3bac140d39d4393f22b Mon Sep 17 00:00:00 2001 From: Friedrich Gonzalez Date: Mon, 8 May 2023 14:45:25 +0200 Subject: [PATCH 21/60] Fix release doc (#25) Signed-off-by: Friedrich Gonzalez --- RELEASE.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/RELEASE.md b/RELEASE.md index 47a14659..6a255c19 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -27,7 +27,7 @@ A release of `cortex-jsonnet` should follow shortly after a release of Cortex. $ make build-mixin ``` 7. Add the `cortex-mixin/cortex-mixin.zip` and release change log to the GitHub release. - - Edit the release in GitHub by going to https://github.com/grafana/cortex-jsonnet/releases/edit/x.y.z + - Edit the release in GitHub by going to https://github.com/cortexproject/cortex-jsonnet/releases/edit/x.y.z ### How to tag a release From dd7c510d5c9ec96d69d2163b4aeec996fba56b7e Mon Sep 17 00:00:00 2001 From: Friedrich Gonzalez Date: Wed, 24 May 2023 19:06:58 +0200 Subject: [PATCH 22/60] Release v1.14.1 (#22) Signed-off-by: Friedrich Gonzalez --- cortex/images.libsonnet | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cortex/images.libsonnet b/cortex/images.libsonnet index b980b02a..9018679e 100644 --- a/cortex/images.libsonnet +++ b/cortex/images.libsonnet @@ -5,7 +5,7 @@ memcachedExporter: 'prom/memcached-exporter:v0.6.0', // Our services. 
- cortex: 'cortexproject/cortex:v1.13.2', + cortex: 'cortexproject/cortex:v1.14.1', alertmanager: self.cortex, distributor: self.cortex, @@ -20,7 +20,7 @@ query_scheduler: self.cortex, overrides_exporter: self.cortex, - query_tee: 'quay.io/cortexproject/query-tee:v1.13.2', - testExporter: 'cortexproject/test-exporter:v1.13.2', + query_tee: 'quay.io/cortexproject/query-tee:v1.14.1', + testExporter: 'cortexproject/test-exporter:v1.14.1', }, } From c311bcaf029174f424afecdc3f3c52afc6369f62 Mon Sep 17 00:00:00 2001 From: Friedrich Gonzalez Date: Wed, 24 May 2023 19:16:37 +0200 Subject: [PATCH 23/60] Use faster disks for compactor (#26) Signed-off-by: Friedrich Gonzalez --- CHANGELOG.md | 1 + cortex/tsdb-config.libsonnet | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d0eedebf..9c0f6e89 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,7 @@ ## master / unreleased +* [CHANGE] Use faster disks for compactor * [CHANGE] Enables query-scheduler by default * [CHANGE] Enables bucket-index by default diff --git a/cortex/tsdb-config.libsonnet b/cortex/tsdb-config.libsonnet index f4779faa..3d2c5e4b 100644 --- a/cortex/tsdb-config.libsonnet +++ b/cortex/tsdb-config.libsonnet @@ -14,7 +14,7 @@ // Allow to configure the compactor disk. cortex_compactor_data_disk_size: '250Gi', - cortex_compactor_data_disk_class: 'standard', + cortex_compactor_data_disk_class: 'fast', // Allow to fine tune compactor. 
cortex_compactor_max_concurrency: 1, From 02922f91ad817034aeca9702a0493cd5b0159fa7 Mon Sep 17 00:00:00 2001 From: Friedrich Gonzalez Date: Mon, 12 Jun 2023 18:04:10 +0200 Subject: [PATCH 24/60] Filter out user label added in https://github.com/cortexproject/cortex/pull/4918 (#27) Signed-off-by: Friedrich Gonzalez --- CHANGELOG.md | 1 + cortex-mixin/dashboards/queries.libsonnet | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9c0f6e89..31439905 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,7 @@ * [CHANGE] Use faster disks for compactor * [CHANGE] Enables query-scheduler by default * [CHANGE] Enables bucket-index by default +* [BUGFIX] Fix `Blocks currently loaded` in Queries ## 1.13.2 / 2023-04-29 diff --git a/cortex-mixin/dashboards/queries.libsonnet b/cortex-mixin/dashboards/queries.libsonnet index f18cdee3..853d0f8c 100644 --- a/cortex-mixin/dashboards/queries.libsonnet +++ b/cortex-mixin/dashboards/queries.libsonnet @@ -254,7 +254,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.row('') .addPanel( $.panel('Blocks currently loaded') + - $.queryPanel('cortex_bucket_store_blocks_loaded{component="store-gateway",%s}' % $.jobMatcher($._config.job_names.store_gateway), '{{%s}}' % $._config.per_instance_label) + $.queryPanel('sum(cortex_bucket_store_blocks_loaded{component="store-gateway",%s}) without (user)' % $.jobMatcher($._config.job_names.store_gateway), '{{%s}}' % $._config.per_instance_label) ) .addPanel( $.successFailurePanel( From 0464135da869beeb3e838b3f573a12545646c7ac Mon Sep 17 00:00:00 2001 From: Friedrich Gonzalez Date: Tue, 11 Jul 2023 18:10:03 +0200 Subject: [PATCH 25/60] Add release v1.14.1 to changelog (#28) Signed-off-by: Friedrich Gonzalez --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 31439905..f1a9cf59 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,6 @@ # Changelog -## master / unreleased +## 
1.14.1 / 2023-07-11 * [CHANGE] Use faster disks for compactor * [CHANGE] Enables query-scheduler by default From 31112a48d89487e032971c6c705ca86b1e5c103b Mon Sep 17 00:00:00 2001 From: Friedrich Gonzalez Date: Tue, 11 Jul 2023 18:42:17 +0200 Subject: [PATCH 26/60] Use v1.15.3 and configure default azure endpoint suffix (#29) Signed-off-by: Friedrich Gonzalez --- CHANGELOG.md | 4 ++++ cortex/config.libsonnet | 3 +++ cortex/images.libsonnet | 6 +++--- 3 files changed, 10 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f1a9cf59..7aefd628 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,9 @@ # Changelog +## master / unreleased +* [CHANGE] Use cortex v1.15.3 +* [CHANGE] Azure storage endpoint suffix is set to `blob.core.windows.net` for backward compatibility + ## 1.14.1 / 2023-07-11 * [CHANGE] Use faster disks for compactor diff --git a/cortex/config.libsonnet b/cortex/config.libsonnet index 0ac5b8ed..5eb93eb5 100644 --- a/cortex/config.libsonnet +++ b/cortex/config.libsonnet @@ -98,6 +98,7 @@ 'blocks-storage.azure.container-name': $._config.blocks_storage_bucket_name, 'blocks-storage.azure.account-name': $._config.blocks_storage_azure_account_name, 'blocks-storage.azure.account-key': $._config.blocks_storage_azure_account_key, + 'blocks-storage.azure.endpoint-suffix': 'blob.core.windows.net', }, blocksStorageConfig: @@ -164,6 +165,7 @@ 'ruler-storage.azure.container-name': $._config.ruler_storage_bucket_name, 'ruler-storage.azure.account-name': $._config.ruler_storage_azure_account_name, 'ruler-storage.azure.account-key': $._config.ruler_storage_azure_account_key, + 'ruler-storage.azure.endpoint-suffix': 'blob.core.windows.net', }, 'local': { 'ruler-storage.local.directory': $._config.ruler_local_directory, @@ -194,6 +196,7 @@ 'alertmanager-storage.azure.account-key': $._config.alertmanager_azure_account_key, 'alertmanager-storage.azure.account-name': $._config.alertmanager_azure_account_name, 
'alertmanager-storage.azure.container-name': $._config.alertmanager_azure_container_name, + 'alertmanager-storage.azure.endpoint-suffix': 'blob.core.windows.net', }, gcs: { 'alertmanager-storage.gcs.bucket-name': $._config.alertmanager_gcs_bucket_name, diff --git a/cortex/images.libsonnet b/cortex/images.libsonnet index 9018679e..05f89c02 100644 --- a/cortex/images.libsonnet +++ b/cortex/images.libsonnet @@ -5,7 +5,7 @@ memcachedExporter: 'prom/memcached-exporter:v0.6.0', // Our services. - cortex: 'cortexproject/cortex:v1.14.1', + cortex: 'cortexproject/cortex:v1.15.3', alertmanager: self.cortex, distributor: self.cortex, @@ -20,7 +20,7 @@ query_scheduler: self.cortex, overrides_exporter: self.cortex, - query_tee: 'quay.io/cortexproject/query-tee:v1.14.1', - testExporter: 'cortexproject/test-exporter:v1.14.1', + query_tee: 'quay.io/cortexproject/query-tee:v1.15.3', + testExporter: 'cortexproject/test-exporter:v1.15.3', }, } From 3829a81f0a151e05dbcbd49b7bd6a052c07f4330 Mon Sep 17 00:00:00 2001 From: Friedrich Gonzalez Date: Mon, 31 Jul 2023 17:35:01 +0200 Subject: [PATCH 27/60] Add default tenant shard sizes (#31) Signed-off-by: Friedrich Gonzalez --- CHANGELOG.md | 1 + cortex/config.libsonnet | 14 ++++++++++++++ 2 files changed, 15 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7aefd628..d106e37d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,7 @@ # Changelog ## master / unreleased +* [CHANGE] Add default tenant shard sizes * [CHANGE] Use cortex v1.15.3 * [CHANGE] Azure storage endpoint suffix is set to `blob.core.windows.net` for backward compatibility diff --git a/cortex/config.libsonnet b/cortex/config.libsonnet index 5eb93eb5..39b50ee0 100644 --- a/cortex/config.libsonnet +++ b/cortex/config.libsonnet @@ -260,6 +260,8 @@ // No retention for now. 
compactor_blocks_retention_period: '0', + + ingestion_tenant_shard_size: 3, }, medium_small_user:: { @@ -277,6 +279,8 @@ // 1000 rules ruler_max_rules_per_rule_group: 20, ruler_max_rule_groups_per_tenant: 50, + + ingestion_tenant_shard_size: 9, }, small_user:: { @@ -294,6 +298,8 @@ // 1400 rules ruler_max_rules_per_rule_group: 20, ruler_max_rule_groups_per_tenant: 70, + + ingestion_tenant_shard_size: 15, }, medium_user:: { @@ -311,6 +317,8 @@ // 1800 rules ruler_max_rules_per_rule_group: 20, ruler_max_rule_groups_per_tenant: 90, + + ingestion_tenant_shard_size: 30, }, big_user:: { @@ -328,6 +336,8 @@ // 2200 rules ruler_max_rules_per_rule_group: 20, ruler_max_rule_groups_per_tenant: 110, + + ingestion_tenant_shard_size: 60, }, super_user:: { @@ -345,6 +355,8 @@ // 2600 rules ruler_max_rules_per_rule_group: 20, ruler_max_rule_groups_per_tenant: 130, + + ingestion_tenant_shard_size: 120, }, // This user class has limits increased by +50% compared to the previous one. @@ -363,6 +375,8 @@ // 3000 rules ruler_max_rules_per_rule_group: 20, ruler_max_rule_groups_per_tenant: 150, + + ingestion_tenant_shard_size: 180, }, }, From cb68f902e53a9606479b9c2ba8fbc1df7ba919f3 Mon Sep 17 00:00:00 2001 From: Friedrich Gonzalez Date: Wed, 9 Aug 2023 10:53:18 +0200 Subject: [PATCH 28/60] Configure default GOMAXPROCS and GOMEMLIMIT (#32) Also remove mem-ballast, that is not required if using GOMEMLIMIT Signed-off-by: Friedrich Gonzalez --- CHANGELOG.md | 3 +++ cortex/alertmanager.libsonnet | 6 ++++++ cortex/compactor.libsonnet | 6 ++++++ cortex/distributor.libsonnet | 11 ++++++----- cortex/flusher-job-blocks.libsonnet | 6 ++++++ cortex/ingester.libsonnet | 28 +++++++--------------------- cortex/querier.libsonnet | 8 +++----- cortex/query-frontend.libsonnet | 6 ++++++ cortex/query-scheduler.libsonnet | 6 ++++++ cortex/query-tee.libsonnet | 6 ++++++ cortex/ruler.libsonnet | 6 ++++++ cortex/store-gateway.libsonnet | 6 ++++++ cortex/test-exporter.libsonnet | 6 ++++++ 13 files changed, 73 
insertions(+), 31 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d106e37d..c84974eb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,9 @@ # Changelog ## master / unreleased +* [CHANGE] Remove mem-ballast from distributor and querier. +* [CHANGE] Increase cpu requests for querier to 2. +* [CHANGE] Configure default GOMAXPROCS and GOMEMLIMIT for all cortex modules * [CHANGE] Add default tenant shard sizes * [CHANGE] Use cortex v1.15.3 * [CHANGE] Azure storage endpoint suffix is set to `blob.core.windows.net` for backward compatibility diff --git a/cortex/alertmanager.libsonnet b/cortex/alertmanager.libsonnet index 480112d3..4df2e77b 100644 --- a/cortex/alertmanager.libsonnet +++ b/cortex/alertmanager.libsonnet @@ -96,6 +96,7 @@ if $._config.alertmanager_enabled then container.new('alertmanager', $._images.alertmanager) + container.withPorts($.util.defaultPorts + mode.ports) + + container.withEnvMap($.alertmanager_env_map) + container.withEnvMixin([container.envType.fromFieldPath('POD_IP', 'status.podIP')]) + container.withArgsMixin( $.util.mapToFlags($.alertmanager_args) + @@ -112,6 +113,11 @@ $.jaeger_mixin else {}, + alertmanager_env_map:: { + GOMAXPROCS: '1', + GOMEMLIMIT: '1GiB', + }, + alertmanager_statefulset: if $._config.alertmanager_enabled then statefulSet.new('alertmanager', $._config.alertmanager.replicas, [$.alertmanager_container], $.alertmanager_pvc) + diff --git a/cortex/compactor.libsonnet b/cortex/compactor.libsonnet index 03df1ab7..9edfcdca 100644 --- a/cortex/compactor.libsonnet +++ b/cortex/compactor.libsonnet @@ -43,6 +43,7 @@ container.new('compactor', $._images.compactor) + container.withPorts($.compactor_ports) + container.withArgsMixin($.util.mapToFlags($.compactor_args)) + + container.withEnvMap($.compactor_env_map) + container.withVolumeMountsMixin([volumeMount.new('compactor-data', '/data')]) + // Do not limit compactor CPU and request enough cores to honor configured max concurrency. 
$.util.resourcesRequests($._config.cortex_compactor_max_concurrency, '6Gi') + @@ -50,6 +51,11 @@ $.util.readinessProbe + $.jaeger_mixin, + compactor_env_map:: { + GOMAXPROCS: std.toString($._config.cortex_compactor_max_concurrency), + GOMEMLIMIT: '6GiB', + }, + newCompactorStatefulSet(name, container):: statefulSet.new(name, 1, [container], compactor_data_pvc) + statefulSet.mixin.spec.withServiceName(name) + diff --git a/cortex/distributor.libsonnet b/cortex/distributor.libsonnet index c2bcfe07..86a17e2c 100644 --- a/cortex/distributor.libsonnet +++ b/cortex/distributor.libsonnet @@ -20,11 +20,6 @@ 'distributor.ha-tracker.etcd.endpoints': 'etcd-client.%s.svc.cluster.local.:2379' % $._config.namespace, 'distributor.ha-tracker.prefix': 'prom_ha/', - // The memory requests are 2G, and we barely use 100M. - // By adding a ballast of 1G, we can drastically reduce GC, but also keep the usage at - // around 1.25G, reducing the 99%ile. - 'mem-ballast-size-bytes': 1 << 30, // 1GB - 'server.grpc.keepalive.max-connection-age': '2m', 'server.grpc.keepalive.max-connection-age-grace': '5m', 'server.grpc.keepalive.max-connection-idle': '1m', @@ -38,12 +33,18 @@ 'distributor.extend-writes': $._config.unregister_ingesters_on_shutdown, }, + distributor_env_map:: { + GOMAXPROCS: '2', + GOMEMLIMIT: '2GiB', + }, + distributor_ports:: $.util.defaultPorts, distributor_container:: container.new('distributor', $._images.distributor) + container.withPorts($.distributor_ports) + container.withArgsMixin($.util.mapToFlags($.distributor_args)) + + container.withEnvMap($.distributor_env_map) + $.util.resourcesRequests('2', '2Gi') + $.util.resourcesLimits(null, '4Gi') + $.util.readinessProbe + diff --git a/cortex/flusher-job-blocks.libsonnet b/cortex/flusher-job-blocks.libsonnet index 1e6266ca..6917a867 100644 --- a/cortex/flusher-job-blocks.libsonnet +++ b/cortex/flusher-job-blocks.libsonnet @@ -21,11 +21,17 @@ target: 'flusher', 'blocks-storage.tsdb.retention-period': '10000h', // don't delete 
old blocks too soon. })) + + container.withEnvMap($.flusher_env_map) + $.util.resourcesRequests('4', '15Gi') + $.util.resourcesLimits(null, '25Gi') + $.util.readinessProbe + $.jaeger_mixin, + flusher_env_map:: { + GOMAXPROCS: '4', + GOMEMLIMIT: '15GiB', + }, + flusher_job_func(jobName, pvcName):: job.new() + job.mixin.spec.template.spec.withContainers([ diff --git a/cortex/ingester.libsonnet b/cortex/ingester.libsonnet index 818716e1..79945890 100644 --- a/cortex/ingester.libsonnet +++ b/cortex/ingester.libsonnet @@ -3,6 +3,7 @@ local pvc = $.core.v1.persistentVolumeClaim, local statefulSet = $.apps.v1.statefulSet, local volume = $.core.v1.volume, + local volumeMount = $.core.v1.volumeMount, // The ingesters should persist TSDB blocks and WAL on a persistent // volume in order to be crash resilient. @@ -44,18 +45,6 @@ 'ingester.tokens-file-path': '/data/tokens', }, - ingester_statefulset_args:: - $._config.grpcConfig - { - 'ingester.wal-enabled': true, - 'ingester.checkpoint-enabled': true, - 'ingester.recover-from-wal': true, - 'ingester.wal-dir': $._config.ingester.wal_dir, - 'ingester.checkpoint-duration': '15m', - '-log.level': 'info', - 'ingester.tokens-file-path': $._config.ingester.wal_dir + '/tokens', - }, - ingester_ports:: $.util.defaultPorts, local name = 'ingester', @@ -65,22 +54,19 @@ container.new(name, $._images.ingester) + container.withPorts($.ingester_ports) + container.withArgsMixin($.util.mapToFlags($.ingester_args)) + + container.withEnvMap($.ingester_env_map) + $.util.resourcesRequests('4', '15Gi') + $.util.resourcesLimits(null, '25Gi') + $.util.readinessProbe + $.jaeger_mixin, - local volumeMount = $.core.v1.volumeMount, - - ingester_statefulset_container:: - $.ingester_container + - container.withArgsMixin($.util.mapToFlags($.ingester_statefulset_args)) + - container.withVolumeMountsMixin([ - volumeMount.new('ingester-pvc', $._config.ingester.wal_dir), - ]), - ingester_deployment_labels:: {}, + ingester_env_map:: { + GOMAXPROCS: '4', + 
GOMEMLIMIT: '15GiB', + }, + local ingester_pvc = pvc.new('ingester-pvc') + pvc.mixin.spec.resources.withRequests({ storage: $._config.ingester.statefulset_disk }) + diff --git a/cortex/querier.libsonnet b/cortex/querier.libsonnet index 6ebe85fe..e5cb82e6 100644 --- a/cortex/querier.libsonnet +++ b/cortex/querier.libsonnet @@ -26,16 +26,14 @@ 'querier.frontend-address': 'query-frontend-discovery.%(namespace)s.svc.cluster.local:9095' % $._config, 'querier.frontend-client.grpc-max-send-msg-size': 100 << 20, - // We request high memory but the Go heap is typically very low (< 100MB) and this causes - // the GC to trigger continuously. Setting a ballast of 256MB reduces GC. - 'mem-ballast-size-bytes': 1 << 28, // 256M - 'log.level': 'debug', }, querier_ports:: $.util.defaultPorts, querier_env_map:: { + GOMAXPROCS: '2', + GOMEMLIMIT: '12Gi', JAEGER_REPORTER_MAX_QUEUE_SIZE: '1024', // Default is 100. }, @@ -46,7 +44,7 @@ $.jaeger_mixin + $.util.readinessProbe + container.withEnvMap($.querier_env_map) + - $.util.resourcesRequests('1', '12Gi') + + $.util.resourcesRequests('2', '12Gi') + $.util.resourcesLimits(null, '24Gi'), local deployment = $.apps.v1.deployment, diff --git a/cortex/query-frontend.libsonnet b/cortex/query-frontend.libsonnet index 80f36d04..39d4f6d3 100644 --- a/cortex/query-frontend.libsonnet +++ b/cortex/query-frontend.libsonnet @@ -42,11 +42,17 @@ container.new('query-frontend', $._images.query_frontend) + container.withPorts($.util.defaultPorts) + container.withArgsMixin($.util.mapToFlags($.query_frontend_args)) + + container.withEnvMap($.query_frontend_env_map) + $.jaeger_mixin + $.util.readinessProbe + $.util.resourcesRequests('2', '600Mi') + $.util.resourcesLimits(null, '1200Mi'), + query_frontend_env_map:: { + GOMAXPROCS: '2', + GOMEMLIMIT: '600MiB', + }, + local deployment = $.apps.v1.deployment, newQueryFrontendDeployment(name, container):: diff --git a/cortex/query-scheduler.libsonnet b/cortex/query-scheduler.libsonnet index 604d258a..b0a60a5f 
100644 --- a/cortex/query-scheduler.libsonnet +++ b/cortex/query-scheduler.libsonnet @@ -17,6 +17,7 @@ container.new('query-scheduler', $._images.query_scheduler) + container.withPorts($.util.defaultPorts) + container.withArgsMixin($.util.mapToFlags($.query_scheduler_args)) + + container.withEnvMap($.query_scheduler_env_map) + $.jaeger_mixin + $.util.readinessProbe + $.util.resourcesRequests('2', '1Gi') + @@ -30,6 +31,11 @@ deployment.mixin.spec.strategy.rollingUpdate.withMaxSurge(0) + deployment.mixin.spec.strategy.rollingUpdate.withMaxUnavailable(1), + query_scheduler_env_map:: { + GOMAXPROCS: '2', + GOMEMLIMIT: '1GiB', + }, + query_scheduler_deployment: if !$._config.query_scheduler_enabled then {} else self.newQuerySchedulerDeployment('query-scheduler', $.query_scheduler_container), diff --git a/cortex/query-tee.libsonnet b/cortex/query-tee.libsonnet index 4ac3b0a1..0e1250c0 100644 --- a/cortex/query-tee.libsonnet +++ b/cortex/query-tee.libsonnet @@ -18,9 +18,15 @@ containerPort.newNamed(name='http-metrics', containerPort=9900), ]) + container.withArgsMixin($.util.mapToFlags($.query_tee_args)) + + container.withEnvMap($.query_tee_env_map) + $.util.resourcesRequests('1', '512Mi') + $.jaeger_mixin, + query_tee_env_map:: { + GOMAXPROCS: '1', + GOMEMLIMIT: '512MiB', + }, + query_tee_deployment: if !($._config.query_tee_enabled) then {} else deployment.new('query-tee', 2, [$.query_tee_container]), diff --git a/cortex/ruler.libsonnet b/cortex/ruler.libsonnet index cfb0252b..1688ca66 100644 --- a/cortex/ruler.libsonnet +++ b/cortex/ruler.libsonnet @@ -38,6 +38,7 @@ container.new('ruler', $._images.ruler) + container.withPorts($.util.defaultPorts) + container.withArgsMixin($.util.mapToFlags($.ruler_args)) + + container.withEnvMap($.ruler_env_map) + $.util.resourcesRequests('1', '6Gi') + $.util.resourcesLimits('16', '16Gi') + $.util.readinessProbe + @@ -56,6 +57,11 @@ $.util.configVolumeMount($._config.overrides_configmap, '/etc/cortex') else {}, + ruler_env_map:: { + 
GOMAXPROCS: '2', + GOMEMLIMIT: '6GiB', + }, + local service = $.core.v1.service, ruler_service: diff --git a/cortex/store-gateway.libsonnet b/cortex/store-gateway.libsonnet index cea63089..757d9b63 100644 --- a/cortex/store-gateway.libsonnet +++ b/cortex/store-gateway.libsonnet @@ -40,12 +40,18 @@ container.new('store-gateway', $._images.store_gateway) + container.withPorts($.store_gateway_ports) + container.withArgsMixin($.util.mapToFlags($.store_gateway_args)) + + container.withEnvMap($.store_gateway_env_map) + container.withVolumeMountsMixin([volumeMount.new('store-gateway-data', '/data')]) + $.util.resourcesRequests('1', '12Gi') + $.util.resourcesLimits(null, '18Gi') + $.util.readinessProbe + $.jaeger_mixin, + store_gateway_env_map:: { + GOMAXPROCS: '2', + GOMEMLIMIT: '12GiB', + }, + newStoreGatewayStatefulSet(name, container):: statefulSet.new(name, 3, [container], store_gateway_data_pvc) + statefulSet.mixin.spec.withServiceName(name) + diff --git a/cortex/test-exporter.libsonnet b/cortex/test-exporter.libsonnet index 9d69abee..036d6fec 100644 --- a/cortex/test-exporter.libsonnet +++ b/cortex/test-exporter.libsonnet @@ -18,10 +18,16 @@ container.new('test-exporter', $._images.testExporter) + container.withPorts($.util.defaultPorts) + container.withArgsMixin($.util.mapToFlags($.test_exporter_args)) + + container.withEnvMap($.test_exporter_env_map) + $.util.resourcesRequests('100m', '100Mi') + $.util.resourcesLimits('100m', '100Mi') + $.jaeger_mixin, + test_exporter_env_map:: { + GOMAXPROCS: '1', + GOMEMLIMIT: '100MiB', + }, + local deployment = $.apps.v1.deployment, test_exporter_deployment: From 89d82f98ef5b1d1b721ab0dc0fb4d515e42df23e Mon Sep 17 00:00:00 2001 From: Friedrich Gonzalez Date: Fri, 25 Aug 2023 13:41:18 +0200 Subject: [PATCH 29/60] Add default instance limits for distributors and ingesters (#33) Signed-off-by: Friedrich Gonzalez --- CHANGELOG.md | 2 ++ cortex/config.libsonnet | 13 ++++++------- cortex/distributor.libsonnet | 1 + 3 files changed, 9 
insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c84974eb..67a2b4b6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,8 @@ # Changelog ## master / unreleased +* [CHANGE] Add default instance max series for ingesters +* [CHANGE] Add default instance max inflight pushes for distributors * [CHANGE] Remove mem-ballast from distributor and querier. * [CHANGE] Increase cpu requests for querier to 2. * [CHANGE] Configure default GOMAXPROCS and GOMEMLIMIT for all cortex modules diff --git a/cortex/config.libsonnet b/cortex/config.libsonnet index 39b50ee0..bad810f7 100644 --- a/cortex/config.libsonnet +++ b/cortex/config.libsonnet @@ -395,13 +395,12 @@ ingester_stream_chunks_when_using_blocks: true, // Ingester limits are put directly into runtime config, if not null. Available limits: - // ingester_instance_limits: { - // max_inflight_push_requests: 0, // Max inflight push requests per ingester. 0 = no limit. - // max_ingestion_rate: 0, // Max ingestion rate (samples/second) per ingester. 0 = no limit. - // max_series: 0, // Max number of series per ingester. 0 = no limit. - // max_tenants: 0, // Max number of tenants per ingester. 0 = no limit. - // }, - ingester_instance_limits: null, + ingester_instance_limits: { + // max_inflight_push_requests: 0, // Max inflight push requests per ingester. 0 = no limit. + // max_ingestion_rate: 0, // Max ingestion rate (samples/second) per ingester. 0 = no limit. + max_series: 4.8e+6, // Max number of series per ingester. 0 = no limit. 4.8 million is closely tied to 15Gb in requests per ingester + // max_tenants: 0, // Max number of tenants per ingester. 0 = no limit. 
+ }, }, local configMap = $.core.v1.configMap, diff --git a/cortex/distributor.libsonnet b/cortex/distributor.libsonnet index 86a17e2c..13501b66 100644 --- a/cortex/distributor.libsonnet +++ b/cortex/distributor.libsonnet @@ -31,6 +31,7 @@ // Do not extend the replication set on unhealthy (or LEAVING) ingester when "unregister on shutdown" // is set to false. 'distributor.extend-writes': $._config.unregister_ingesters_on_shutdown, + 'distributor.instance-limits.max-inflight-push-requests': 60, //60 is very conservative to protect the distributor from OOMs }, distributor_env_map:: { From 7a667ea291158dec375cf370d77212db24714c38 Mon Sep 17 00:00:00 2001 From: Friedrich Gonzalez Date: Mon, 28 Aug 2023 09:09:24 +0200 Subject: [PATCH 30/60] Decrease gomemlimit a bit to avoid running out of memory before trashing (#34) Signed-off-by: Friedrich Gonzalez --- cortex/compactor.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cortex/compactor.libsonnet b/cortex/compactor.libsonnet index 9edfcdca..da511fde 100644 --- a/cortex/compactor.libsonnet +++ b/cortex/compactor.libsonnet @@ -53,7 +53,7 @@ compactor_env_map:: { GOMAXPROCS: std.toString($._config.cortex_compactor_max_concurrency), - GOMEMLIMIT: '6GiB', + GOMEMLIMIT: '5GiB', }, newCompactorStatefulSet(name, container):: From c35087e44f7aec30fd8ae0c78daa07a51acfbf68 Mon Sep 17 00:00:00 2001 From: Friedrich Gonzalez Date: Thu, 28 Sep 2023 11:08:36 +0200 Subject: [PATCH 31/60] Fix querier GOMEMLIMIT (#35) Signed-off-by: Friedrich Gonzalez --- cortex/querier.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cortex/querier.libsonnet b/cortex/querier.libsonnet index e5cb82e6..9cde3f28 100644 --- a/cortex/querier.libsonnet +++ b/cortex/querier.libsonnet @@ -33,7 +33,7 @@ querier_env_map:: { GOMAXPROCS: '2', - GOMEMLIMIT: '12Gi', + GOMEMLIMIT: '12GiB', JAEGER_REPORTER_MAX_QUEUE_SIZE: '1024', // Default is 100. 
}, From 524c3b3401fbe6efdb34339772cff63c05eeee03 Mon Sep 17 00:00:00 2001 From: Friedrich Gonzalez Date: Mon, 20 Nov 2023 20:12:36 +0100 Subject: [PATCH 32/60] Configure GOMAXPROCS and GOMEMLIMIT based on requests cpu and memory Signed-off-by: Friedrich Gonzalez --- cortex/alertmanager.libsonnet | 11 ++++++++--- cortex/compactor.libsonnet | 11 ++++++++--- cortex/distributor.libsonnet | 9 +++++++-- cortex/ingester.libsonnet | 9 +++++++-- cortex/querier.libsonnet | 9 +++++++-- cortex/query-frontend.libsonnet | 9 +++++++-- cortex/query-scheduler.libsonnet | 9 +++++++-- cortex/query-tee.libsonnet | 9 +++++++-- cortex/ruler.libsonnet | 11 ++++++++--- cortex/store-gateway.libsonnet | 11 ++++++++--- 10 files changed, 74 insertions(+), 24 deletions(-) diff --git a/cortex/alertmanager.libsonnet b/cortex/alertmanager.libsonnet index 4df2e77b..719ac85b 100644 --- a/cortex/alertmanager.libsonnet +++ b/cortex/alertmanager.libsonnet @@ -3,6 +3,7 @@ local volumeMount = $.core.v1.volumeMount, local volume = $.core.v1.volume, local container = $.core.v1.container, + local envType = container.envType, local statefulSet = $.apps.v1.statefulSet, local service = $.core.v1.service, local configMap = $.core.v1.configMap, @@ -98,6 +99,12 @@ container.withPorts($.util.defaultPorts + mode.ports) + container.withEnvMap($.alertmanager_env_map) + container.withEnvMixin([container.envType.fromFieldPath('POD_IP', 'status.podIP')]) + + container.withEnvMixin([ + envType.withName('GOMAXPROCS') + + envType.valueFrom.resourceFieldRef.withResource('requests.cpu'), + envType.withName('GOMEMLIMIT') + + envType.valueFrom.resourceFieldRef.withResource('requests.memory'), + ]) + container.withArgsMixin( $.util.mapToFlags($.alertmanager_args) + mode.flags @@ -108,14 +115,12 @@ [volumeMount.new('alertmanager-fallback-config', '/configs')] else [] ) + - $.util.resourcesRequests('100m', '1Gi') + + $.util.resourcesRequests('1', '1Gi') + $.util.readinessProbe + $.jaeger_mixin else {}, alertmanager_env_map:: { - 
GOMAXPROCS: '1', - GOMEMLIMIT: '1GiB', }, alertmanager_statefulset: diff --git a/cortex/compactor.libsonnet b/cortex/compactor.libsonnet index da511fde..2e78c4d1 100644 --- a/cortex/compactor.libsonnet +++ b/cortex/compactor.libsonnet @@ -1,5 +1,6 @@ { local container = $.core.v1.container, + local envType = container.envType, local pvc = $.core.v1.persistentVolumeClaim, local statefulSet = $.apps.v1.statefulSet, local volumeMount = $.core.v1.volumeMount, @@ -44,16 +45,20 @@ container.withPorts($.compactor_ports) + container.withArgsMixin($.util.mapToFlags($.compactor_args)) + container.withEnvMap($.compactor_env_map) + + container.withEnvMixin([ + envType.withName('GOMAXPROCS') + + envType.valueFrom.resourceFieldRef.withResource('requests.cpu'), + envType.withName('GOMEMLIMIT') + + envType.valueFrom.resourceFieldRef.withResource('requests.memory'), + ]) + container.withVolumeMountsMixin([volumeMount.new('compactor-data', '/data')]) + // Do not limit compactor CPU and request enough cores to honor configured max concurrency. 
- $.util.resourcesRequests($._config.cortex_compactor_max_concurrency, '6Gi') + + $.util.resourcesRequests($._config.cortex_compactor_max_concurrency, '5Gi') + $.util.resourcesLimits(null, '6Gi') + $.util.readinessProbe + $.jaeger_mixin, compactor_env_map:: { - GOMAXPROCS: std.toString($._config.cortex_compactor_max_concurrency), - GOMEMLIMIT: '5GiB', }, newCompactorStatefulSet(name, container):: diff --git a/cortex/distributor.libsonnet b/cortex/distributor.libsonnet index 13501b66..79c40335 100644 --- a/cortex/distributor.libsonnet +++ b/cortex/distributor.libsonnet @@ -1,5 +1,6 @@ { local container = $.core.v1.container, + local envType = container.envType, local containerPort = $.core.v1.containerPort, distributor_args:: @@ -35,8 +36,6 @@ }, distributor_env_map:: { - GOMAXPROCS: '2', - GOMEMLIMIT: '2GiB', }, distributor_ports:: $.util.defaultPorts, @@ -45,6 +44,12 @@ container.new('distributor', $._images.distributor) + container.withPorts($.distributor_ports) + container.withArgsMixin($.util.mapToFlags($.distributor_args)) + + container.withEnvMixin([ + envType.withName('GOMAXPROCS') + + envType.valueFrom.resourceFieldRef.withResource('requests.cpu'), + envType.withName('GOMEMLIMIT') + + envType.valueFrom.resourceFieldRef.withResource('requests.memory'), + ]) + container.withEnvMap($.distributor_env_map) + $.util.resourcesRequests('2', '2Gi') + $.util.resourcesLimits(null, '4Gi') + diff --git a/cortex/ingester.libsonnet b/cortex/ingester.libsonnet index 79945890..93f8ab54 100644 --- a/cortex/ingester.libsonnet +++ b/cortex/ingester.libsonnet @@ -49,12 +49,19 @@ local name = 'ingester', local container = $.core.v1.container, + local envType = container.envType, ingester_container:: container.new(name, $._images.ingester) + container.withPorts($.ingester_ports) + container.withArgsMixin($.util.mapToFlags($.ingester_args)) + container.withEnvMap($.ingester_env_map) + + container.withEnvMixin([ + envType.withName('GOMAXPROCS') + + 
envType.valueFrom.resourceFieldRef.withResource('requests.cpu'), + envType.withName('GOMEMLIMIT') + + envType.valueFrom.resourceFieldRef.withResource('requests.memory'), + ]) + $.util.resourcesRequests('4', '15Gi') + $.util.resourcesLimits(null, '25Gi') + $.util.readinessProbe + @@ -63,8 +70,6 @@ ingester_deployment_labels:: {}, ingester_env_map:: { - GOMAXPROCS: '4', - GOMEMLIMIT: '15GiB', }, local ingester_pvc = diff --git a/cortex/querier.libsonnet b/cortex/querier.libsonnet index 9cde3f28..d58759ca 100644 --- a/cortex/querier.libsonnet +++ b/cortex/querier.libsonnet @@ -1,5 +1,6 @@ { local container = $.core.v1.container, + local envType = container.envType, querier_args:: $._config.grpcConfig + @@ -32,8 +33,6 @@ querier_ports:: $.util.defaultPorts, querier_env_map:: { - GOMAXPROCS: '2', - GOMEMLIMIT: '12GiB', JAEGER_REPORTER_MAX_QUEUE_SIZE: '1024', // Default is 100. }, @@ -44,6 +43,12 @@ $.jaeger_mixin + $.util.readinessProbe + container.withEnvMap($.querier_env_map) + + container.withEnvMixin([ + envType.withName('GOMAXPROCS') + + envType.valueFrom.resourceFieldRef.withResource('requests.cpu'), + envType.withName('GOMEMLIMIT') + + envType.valueFrom.resourceFieldRef.withResource('requests.memory'), + ]) + $.util.resourcesRequests('2', '12Gi') + $.util.resourcesLimits(null, '24Gi'), diff --git a/cortex/query-frontend.libsonnet b/cortex/query-frontend.libsonnet index 39d4f6d3..e3b36670 100644 --- a/cortex/query-frontend.libsonnet +++ b/cortex/query-frontend.libsonnet @@ -1,5 +1,6 @@ { local container = $.core.v1.container, + local envType = container.envType, query_frontend_args:: $._config.grpcConfig @@ -43,14 +44,18 @@ container.withPorts($.util.defaultPorts) + container.withArgsMixin($.util.mapToFlags($.query_frontend_args)) + container.withEnvMap($.query_frontend_env_map) + + container.withEnvMixin([ + envType.withName('GOMAXPROCS') + + envType.valueFrom.resourceFieldRef.withResource('requests.cpu'), + envType.withName('GOMEMLIMIT') + + 
envType.valueFrom.resourceFieldRef.withResource('requests.memory'), + ]) + $.jaeger_mixin + $.util.readinessProbe + $.util.resourcesRequests('2', '600Mi') + $.util.resourcesLimits(null, '1200Mi'), query_frontend_env_map:: { - GOMAXPROCS: '2', - GOMEMLIMIT: '600MiB', }, local deployment = $.apps.v1.deployment, diff --git a/cortex/query-scheduler.libsonnet b/cortex/query-scheduler.libsonnet index b0a60a5f..fab92958 100644 --- a/cortex/query-scheduler.libsonnet +++ b/cortex/query-scheduler.libsonnet @@ -3,6 +3,7 @@ { local container = $.core.v1.container, local deployment = $.apps.v1.deployment, + local envType = container.envType, local service = $.core.v1.service, query_scheduler_args+:: @@ -18,6 +19,12 @@ container.withPorts($.util.defaultPorts) + container.withArgsMixin($.util.mapToFlags($.query_scheduler_args)) + container.withEnvMap($.query_scheduler_env_map) + + container.withEnvMixin([ + envType.withName('GOMAXPROCS') + + envType.valueFrom.resourceFieldRef.withResource('requests.cpu'), + envType.withName('GOMEMLIMIT') + + envType.valueFrom.resourceFieldRef.withResource('requests.memory'), + ]) + $.jaeger_mixin + $.util.readinessProbe + $.util.resourcesRequests('2', '1Gi') + @@ -32,8 +39,6 @@ deployment.mixin.spec.strategy.rollingUpdate.withMaxUnavailable(1), query_scheduler_env_map:: { - GOMAXPROCS: '2', - GOMEMLIMIT: '1GiB', }, query_scheduler_deployment: if !$._config.query_scheduler_enabled then {} else diff --git a/cortex/query-tee.libsonnet b/cortex/query-tee.libsonnet index 0e1250c0..6a89e990 100644 --- a/cortex/query-tee.libsonnet +++ b/cortex/query-tee.libsonnet @@ -2,6 +2,7 @@ local container = $.core.v1.container, local containerPort = $.core.v1.containerPort, local deployment = $.apps.v1.deployment, + local envType = container.envType, local service = $.core.v1.service, local servicePort = $.core.v1.servicePort, @@ -19,12 +20,16 @@ ]) + container.withArgsMixin($.util.mapToFlags($.query_tee_args)) + container.withEnvMap($.query_tee_env_map) + + 
container.withEnvMixin([ + envType.withName('GOMAXPROCS') + + envType.valueFrom.resourceFieldRef.withResource('requests.cpu'), + envType.withName('GOMEMLIMIT') + + envType.valueFrom.resourceFieldRef.withResource('requests.memory'), + ]) + $.util.resourcesRequests('1', '512Mi') + $.jaeger_mixin, query_tee_env_map:: { - GOMAXPROCS: '1', - GOMEMLIMIT: '512MiB', }, query_tee_deployment: if !($._config.query_tee_enabled) then {} else diff --git a/cortex/ruler.libsonnet b/cortex/ruler.libsonnet index 1688ca66..3735684d 100644 --- a/cortex/ruler.libsonnet +++ b/cortex/ruler.libsonnet @@ -1,5 +1,6 @@ { local container = $.core.v1.container, + local envType = container.envType, ruler_args:: $._config.grpcConfig + @@ -39,7 +40,13 @@ container.withPorts($.util.defaultPorts) + container.withArgsMixin($.util.mapToFlags($.ruler_args)) + container.withEnvMap($.ruler_env_map) + - $.util.resourcesRequests('1', '6Gi') + + container.withEnvMixin([ + envType.withName('GOMAXPROCS') + + envType.valueFrom.resourceFieldRef.withResource('requests.cpu'), + envType.withName('GOMEMLIMIT') + + envType.valueFrom.resourceFieldRef.withResource('requests.memory'), + ]) + + $.util.resourcesRequests('2', '6Gi') + $.util.resourcesLimits('16', '16Gi') + $.util.readinessProbe + $.jaeger_mixin @@ -58,8 +65,6 @@ else {}, ruler_env_map:: { - GOMAXPROCS: '2', - GOMEMLIMIT: '6GiB', }, local service = $.core.v1.service, diff --git a/cortex/store-gateway.libsonnet b/cortex/store-gateway.libsonnet index 757d9b63..7250b25d 100644 --- a/cortex/store-gateway.libsonnet +++ b/cortex/store-gateway.libsonnet @@ -1,5 +1,6 @@ { local container = $.core.v1.container, + local envType = container.envType, local podDisruptionBudget = $.policy.v1.podDisruptionBudget, local pvc = $.core.v1.persistentVolumeClaim, local statefulSet = $.apps.v1.statefulSet, @@ -41,15 +42,19 @@ container.withPorts($.store_gateway_ports) + container.withArgsMixin($.util.mapToFlags($.store_gateway_args)) + 
container.withEnvMap($.store_gateway_env_map) + + container.withEnvMixin([ + envType.withName('GOMAXPROCS') + + envType.valueFrom.resourceFieldRef.withResource('requests.cpu'), + envType.withName('GOMEMLIMIT') + + envType.valueFrom.resourceFieldRef.withResource('requests.memory'), + ]) + container.withVolumeMountsMixin([volumeMount.new('store-gateway-data', '/data')]) + - $.util.resourcesRequests('1', '12Gi') + + $.util.resourcesRequests('2', '12Gi') + $.util.resourcesLimits(null, '18Gi') + $.util.readinessProbe + $.jaeger_mixin, store_gateway_env_map:: { - GOMAXPROCS: '2', - GOMEMLIMIT: '12GiB', }, newStoreGatewayStatefulSet(name, container):: From d9260dbc41c6bbc00e8c82931a20883595b841d7 Mon Sep 17 00:00:00 2001 From: Friedrich Gonzalez Date: Mon, 20 Nov 2023 20:25:49 +0100 Subject: [PATCH 33/60] Update changelog for https://github.com/cortexproject/cortex-jsonnet/commit/524c3b3401fbe6efdb34339772cff63c05eeee03 (#36) Signed-off-by: Friedrich Gonzalez --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 67a2b4b6..11238281 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,7 +5,7 @@ * [CHANGE] Add default instance max inflight pushes for distributors * [CHANGE] Remove mem-ballast from distributor and querier. * [CHANGE] Increase cpu requests for querier to 2. 
-* [CHANGE] Configure default GOMAXPROCS and GOMEMLIMIT for all cortex modules +* [CHANGE] Configure GOMAXPROCS and GOMEMLIMIT for all cortex modules based on requested cpu and memory * [CHANGE] Add default tenant shard sizes * [CHANGE] Use cortex v1.15.3 * [CHANGE] Azure storage endpoint suffix is set to `blob.core.windows.net` for backward compatibility From 006c8fb25bcbd13780e83c1b7275f5d5fe595f5d Mon Sep 17 00:00:00 2001 From: Friedrich Gonzalez Date: Mon, 20 Nov 2023 20:33:21 +0100 Subject: [PATCH 34/60] Revert alertmanager back to 100m for requests CPU Signed-off-by: Friedrich Gonzalez --- cortex/alertmanager.libsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cortex/alertmanager.libsonnet b/cortex/alertmanager.libsonnet index 719ac85b..f9725cef 100644 --- a/cortex/alertmanager.libsonnet +++ b/cortex/alertmanager.libsonnet @@ -115,7 +115,7 @@ [volumeMount.new('alertmanager-fallback-config', '/configs')] else [] ) + - $.util.resourcesRequests('1', '1Gi') + + $.util.resourcesRequests('100m', '1Gi') + $.util.readinessProbe + $.jaeger_mixin else {}, From 272aaee2086e84f5c5ca26de12ad80a3eaa63932 Mon Sep 17 00:00:00 2001 From: Friedrich Gonzalez Date: Fri, 24 Nov 2023 03:24:21 +0100 Subject: [PATCH 35/60] Reorganize limits better and allow to use limits.cpu and limits.memory too (#37) Signed-off-by: Friedrich Gonzalez --- CHANGELOG.md | 2 +- cortex/alertmanager.libsonnet | 7 +------ cortex/compactor.libsonnet | 7 +------ cortex/config.libsonnet | 27 +++++++++++++++++++++++++++ cortex/distributor.libsonnet | 7 +------ cortex/flusher-job-blocks.libsonnet | 3 +-- cortex/ingester.libsonnet | 7 +------ cortex/overrides-exporter.libsonnet | 1 + cortex/querier.libsonnet | 7 +------ cortex/query-frontend.libsonnet | 7 +------ cortex/query-scheduler.libsonnet | 7 +------ cortex/query-tee.libsonnet | 7 +------ cortex/ruler.libsonnet | 7 +------ cortex/store-gateway.libsonnet | 7 +------ cortex/test-exporter.libsonnet | 3 +-- 15 files changed, 41 
insertions(+), 65 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 11238281..f5e7ae14 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,7 +5,7 @@ * [CHANGE] Add default instance max inflight pushes for distributors * [CHANGE] Remove mem-ballast from distributor and querier. * [CHANGE] Increase cpu requests for querier to 2. -* [CHANGE] Configure GOMAXPROCS and GOMEMLIMIT for all cortex modules based on requested cpu and memory +* [CHANGE] Configure GOMAXPROCS and GOMEMLIMIT for all cortex modules based on cpu and memory requests or limits * [CHANGE] Add default tenant shard sizes * [CHANGE] Use cortex v1.15.3 * [CHANGE] Azure storage endpoint suffix is set to `blob.core.windows.net` for backward compatibility diff --git a/cortex/alertmanager.libsonnet b/cortex/alertmanager.libsonnet index f9725cef..1e870c67 100644 --- a/cortex/alertmanager.libsonnet +++ b/cortex/alertmanager.libsonnet @@ -99,12 +99,6 @@ container.withPorts($.util.defaultPorts + mode.ports) + container.withEnvMap($.alertmanager_env_map) + container.withEnvMixin([container.envType.fromFieldPath('POD_IP', 'status.podIP')]) + - container.withEnvMixin([ - envType.withName('GOMAXPROCS') + - envType.valueFrom.resourceFieldRef.withResource('requests.cpu'), - envType.withName('GOMEMLIMIT') + - envType.valueFrom.resourceFieldRef.withResource('requests.memory'), - ]) + container.withArgsMixin( $.util.mapToFlags($.alertmanager_args) + mode.flags @@ -117,6 +111,7 @@ ) + $.util.resourcesRequests('100m', '1Gi') + $.util.readinessProbe + + $.go_container_mixin + $.jaeger_mixin else {}, diff --git a/cortex/compactor.libsonnet b/cortex/compactor.libsonnet index 2e78c4d1..d12d5de5 100644 --- a/cortex/compactor.libsonnet +++ b/cortex/compactor.libsonnet @@ -45,17 +45,12 @@ container.withPorts($.compactor_ports) + container.withArgsMixin($.util.mapToFlags($.compactor_args)) + container.withEnvMap($.compactor_env_map) + - container.withEnvMixin([ - envType.withName('GOMAXPROCS') + - 
envType.valueFrom.resourceFieldRef.withResource('requests.cpu'), - envType.withName('GOMEMLIMIT') + - envType.valueFrom.resourceFieldRef.withResource('requests.memory'), - ]) + container.withVolumeMountsMixin([volumeMount.new('compactor-data', '/data')]) + // Do not limit compactor CPU and request enough cores to honor configured max concurrency. $.util.resourcesRequests($._config.cortex_compactor_max_concurrency, '5Gi') + $.util.resourcesLimits(null, '6Gi') + $.util.readinessProbe + + $.go_container_mixin + $.jaeger_mixin, compactor_env_map:: { diff --git a/cortex/config.libsonnet b/cortex/config.libsonnet index bad810f7..11c3f070 100644 --- a/cortex/config.libsonnet +++ b/cortex/config.libsonnet @@ -401,8 +401,35 @@ max_series: 4.8e+6, // Max number of series per ingester. 0 = no limit. 4.8 million is closely tied to 15Gb in requests per ingester // max_tenants: 0, // Max number of tenants per ingester. 0 = no limit. }, + + // if we disable this, we need to make sure we set the resource limits + // Disabling this can potentially increase cortex performance, + // but it will also cause performance inconsistencies + gomaxprocs_based_on_cpu_requests: true, + gomemlimit_based_on_mem_requests: true, + + gomaxprocs_resource: + if $._config.gomaxprocs_based_on_cpu_requests then + 'requests.cpu' + else + 'limits.cpu', + + gomemlimit_resource: + if $._config.gomemlimit_based_on_mem_requests then + 'requests.memory' + else + 'limits.memory', }, + go_container_mixin:: + local container = $.core.v1.container; + container.withEnvMixin([ + container.envType.withName('GOMAXPROCS') + + container.envType.valueFrom.resourceFieldRef.withResource($._config.gomaxprocs_resource), + container.envType.withName('GOMEMLIMIT') + + container.envType.valueFrom.resourceFieldRef.withResource($._config.gomemlimit_resource), + ]), + local configMap = $.core.v1.configMap, overrides_config: diff --git a/cortex/distributor.libsonnet b/cortex/distributor.libsonnet index 79c40335..27591569 100644 --- 
a/cortex/distributor.libsonnet +++ b/cortex/distributor.libsonnet @@ -44,16 +44,11 @@ container.new('distributor', $._images.distributor) + container.withPorts($.distributor_ports) + container.withArgsMixin($.util.mapToFlags($.distributor_args)) + - container.withEnvMixin([ - envType.withName('GOMAXPROCS') + - envType.valueFrom.resourceFieldRef.withResource('requests.cpu'), - envType.withName('GOMEMLIMIT') + - envType.valueFrom.resourceFieldRef.withResource('requests.memory'), - ]) + container.withEnvMap($.distributor_env_map) + $.util.resourcesRequests('2', '2Gi') + $.util.resourcesLimits(null, '4Gi') + $.util.readinessProbe + + $.go_container_mixin + $.jaeger_mixin, local deployment = $.apps.v1.deployment, diff --git a/cortex/flusher-job-blocks.libsonnet b/cortex/flusher-job-blocks.libsonnet index 6917a867..56264c13 100644 --- a/cortex/flusher-job-blocks.libsonnet +++ b/cortex/flusher-job-blocks.libsonnet @@ -25,11 +25,10 @@ $.util.resourcesRequests('4', '15Gi') + $.util.resourcesLimits(null, '25Gi') + $.util.readinessProbe + + $.go_container_mixin + $.jaeger_mixin, flusher_env_map:: { - GOMAXPROCS: '4', - GOMEMLIMIT: '15GiB', }, flusher_job_func(jobName, pvcName):: diff --git a/cortex/ingester.libsonnet b/cortex/ingester.libsonnet index 93f8ab54..11e22f58 100644 --- a/cortex/ingester.libsonnet +++ b/cortex/ingester.libsonnet @@ -56,15 +56,10 @@ container.withPorts($.ingester_ports) + container.withArgsMixin($.util.mapToFlags($.ingester_args)) + container.withEnvMap($.ingester_env_map) + - container.withEnvMixin([ - envType.withName('GOMAXPROCS') + - envType.valueFrom.resourceFieldRef.withResource('requests.cpu'), - envType.withName('GOMEMLIMIT') + - envType.valueFrom.resourceFieldRef.withResource('requests.memory'), - ]) + $.util.resourcesRequests('4', '15Gi') + $.util.resourcesLimits(null, '25Gi') + $.util.readinessProbe + + $.go_container_mixin + $.jaeger_mixin, ingester_deployment_labels:: {}, diff --git a/cortex/overrides-exporter.libsonnet 
b/cortex/overrides-exporter.libsonnet index 1f9de4ea..8fbf4acc 100644 --- a/cortex/overrides-exporter.libsonnet +++ b/cortex/overrides-exporter.libsonnet @@ -20,6 +20,7 @@ container.withArgsMixin($.util.mapToFlags($.overrides_exporter_args, prefix='--')) + $.util.resourcesRequests('0.5', '0.5Gi') + $.util.readinessProbe + + $.go_container_mixin + container.mixin.readinessProbe.httpGet.withPort($.overrides_exporter_port.name), local deployment = $.apps.v1.deployment, diff --git a/cortex/querier.libsonnet b/cortex/querier.libsonnet index d58759ca..15e22459 100644 --- a/cortex/querier.libsonnet +++ b/cortex/querier.libsonnet @@ -43,12 +43,7 @@ $.jaeger_mixin + $.util.readinessProbe + container.withEnvMap($.querier_env_map) + - container.withEnvMixin([ - envType.withName('GOMAXPROCS') + - envType.valueFrom.resourceFieldRef.withResource('requests.cpu'), - envType.withName('GOMEMLIMIT') + - envType.valueFrom.resourceFieldRef.withResource('requests.memory'), - ]) + + $.go_container_mixin + $.util.resourcesRequests('2', '12Gi') + $.util.resourcesLimits(null, '24Gi'), diff --git a/cortex/query-frontend.libsonnet b/cortex/query-frontend.libsonnet index e3b36670..5cbabaf0 100644 --- a/cortex/query-frontend.libsonnet +++ b/cortex/query-frontend.libsonnet @@ -44,12 +44,7 @@ container.withPorts($.util.defaultPorts) + container.withArgsMixin($.util.mapToFlags($.query_frontend_args)) + container.withEnvMap($.query_frontend_env_map) + - container.withEnvMixin([ - envType.withName('GOMAXPROCS') + - envType.valueFrom.resourceFieldRef.withResource('requests.cpu'), - envType.withName('GOMEMLIMIT') + - envType.valueFrom.resourceFieldRef.withResource('requests.memory'), - ]) + + $.go_container_mixin + $.jaeger_mixin + $.util.readinessProbe + $.util.resourcesRequests('2', '600Mi') + diff --git a/cortex/query-scheduler.libsonnet b/cortex/query-scheduler.libsonnet index fab92958..8aa5cf74 100644 --- a/cortex/query-scheduler.libsonnet +++ b/cortex/query-scheduler.libsonnet @@ -19,12 +19,7 @@ 
container.withPorts($.util.defaultPorts) + container.withArgsMixin($.util.mapToFlags($.query_scheduler_args)) + container.withEnvMap($.query_scheduler_env_map) + - container.withEnvMixin([ - envType.withName('GOMAXPROCS') + - envType.valueFrom.resourceFieldRef.withResource('requests.cpu'), - envType.withName('GOMEMLIMIT') + - envType.valueFrom.resourceFieldRef.withResource('requests.memory'), - ]) + + $.go_container_mixin + $.jaeger_mixin + $.util.readinessProbe + $.util.resourcesRequests('2', '1Gi') + diff --git a/cortex/query-tee.libsonnet b/cortex/query-tee.libsonnet index 6a89e990..9856c34b 100644 --- a/cortex/query-tee.libsonnet +++ b/cortex/query-tee.libsonnet @@ -20,13 +20,8 @@ ]) + container.withArgsMixin($.util.mapToFlags($.query_tee_args)) + container.withEnvMap($.query_tee_env_map) + - container.withEnvMixin([ - envType.withName('GOMAXPROCS') + - envType.valueFrom.resourceFieldRef.withResource('requests.cpu'), - envType.withName('GOMEMLIMIT') + - envType.valueFrom.resourceFieldRef.withResource('requests.memory'), - ]) + $.util.resourcesRequests('1', '512Mi') + + $.go_container_mixin + $.jaeger_mixin, query_tee_env_map:: { diff --git a/cortex/ruler.libsonnet b/cortex/ruler.libsonnet index 3735684d..c60a8740 100644 --- a/cortex/ruler.libsonnet +++ b/cortex/ruler.libsonnet @@ -40,12 +40,7 @@ container.withPorts($.util.defaultPorts) + container.withArgsMixin($.util.mapToFlags($.ruler_args)) + container.withEnvMap($.ruler_env_map) + - container.withEnvMixin([ - envType.withName('GOMAXPROCS') + - envType.valueFrom.resourceFieldRef.withResource('requests.cpu'), - envType.withName('GOMEMLIMIT') + - envType.valueFrom.resourceFieldRef.withResource('requests.memory'), - ]) + + $.go_container_mixin + $.util.resourcesRequests('2', '6Gi') + $.util.resourcesLimits('16', '16Gi') + $.util.readinessProbe + diff --git a/cortex/store-gateway.libsonnet b/cortex/store-gateway.libsonnet index 7250b25d..c21ee302 100644 --- a/cortex/store-gateway.libsonnet +++ 
b/cortex/store-gateway.libsonnet @@ -42,12 +42,7 @@ container.withPorts($.store_gateway_ports) + container.withArgsMixin($.util.mapToFlags($.store_gateway_args)) + container.withEnvMap($.store_gateway_env_map) + - container.withEnvMixin([ - envType.withName('GOMAXPROCS') + - envType.valueFrom.resourceFieldRef.withResource('requests.cpu'), - envType.withName('GOMEMLIMIT') + - envType.valueFrom.resourceFieldRef.withResource('requests.memory'), - ]) + + $.go_container_mixin + container.withVolumeMountsMixin([volumeMount.new('store-gateway-data', '/data')]) + $.util.resourcesRequests('2', '12Gi') + $.util.resourcesLimits(null, '18Gi') + diff --git a/cortex/test-exporter.libsonnet b/cortex/test-exporter.libsonnet index 036d6fec..e7d088ef 100644 --- a/cortex/test-exporter.libsonnet +++ b/cortex/test-exporter.libsonnet @@ -21,11 +21,10 @@ container.withEnvMap($.test_exporter_env_map) + $.util.resourcesRequests('100m', '100Mi') + $.util.resourcesLimits('100m', '100Mi') + + $.go_container_mixin + $.jaeger_mixin, test_exporter_env_map:: { - GOMAXPROCS: '1', - GOMEMLIMIT: '100MiB', }, local deployment = $.apps.v1.deployment, From c9f3e2060eb2b3f4c4d47fc811b02d7b23e41757 Mon Sep 17 00:00:00 2001 From: Friedrich Gonzalez Date: Fri, 24 Nov 2023 03:32:50 +0100 Subject: [PATCH 36/60] Release v1.15.3 (#38) Signed-off-by: Friedrich Gonzalez --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index f5e7ae14..70c494d8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,8 @@ # Changelog ## master / unreleased + +## 1.15.3 / 2023-11-24 * [CHANGE] Add default instance max series for ingesters * [CHANGE] Add default instance max inflight pushes for distributors * [CHANGE] Remove mem-ballast from distributor and querier. 
From cbb7997c018bb124553ce965c4e0993c1ce86827 Mon Sep 17 00:00:00 2001 From: Friedrich Gonzalez Date: Fri, 24 Nov 2023 04:12:51 +0100 Subject: [PATCH 37/60] Upgrade build image (#39) * Upgrade build-image alpine 3.18 tanka 0.26.0 go 1.21 Signed-off-by: Friedrich Gonzalez * Update ci.yaml Signed-off-by: Friedrich Gonzalez --------- Signed-off-by: Friedrich Gonzalez --- .github/workflows/ci.yaml | 6 +++--- README.md | 4 ++-- build-image/Dockerfile | 14 +++++++------- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index e0eac22e..fa49025d 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -9,7 +9,7 @@ on: jobs: lint: runs-on: ubuntu-latest - container: quay.io/cortexproject/cortex-jsonnet-build-image:e63d87f + container: quay.io/cortexproject/cortex-jsonnet-build-image:e158eda steps: - uses: actions/checkout@v2 name: Checkout @@ -23,7 +23,7 @@ jobs: run: make lint-playbooks build: runs-on: ubuntu-latest - container: quay.io/cortexproject/cortex-jsonnet-build-image:e63d87f + container: quay.io/cortexproject/cortex-jsonnet-build-image:e158eda steps: - uses: actions/checkout@v2 name: Checkout @@ -34,7 +34,7 @@ jobs: run: make build-mixin readme: runs-on: ubuntu-latest - container: quay.io/cortexproject/cortex-jsonnet-build-image:e63d87f + container: quay.io/cortexproject/cortex-jsonnet-build-image:e158eda steps: - uses: actions/checkout@v2 name: Checkout diff --git a/README.md b/README.md index b0a5133e..539559a0 100644 --- a/README.md +++ b/README.md @@ -17,7 +17,7 @@ To generate the YAMLs for deploying Cortex: ```console $ # make sure to be outside of GOPATH or a go.mod project - $ GO111MODULE=on go install github.com/grafana/tanka/cmd/tk@v0.24.0 + $ GO111MODULE=on go install github.com/grafana/tanka/cmd/tk@v0.26.0 $ GO111MODULE=on go install github.com/jsonnet-bundler/jsonnet-bundler/cmd/jb@v0.5.1 ``` @@ -25,7 +25,7 @@ To generate the YAMLs for deploying Cortex: ```console $ 
mkdir && cd - $ tk init --k8s=1.24 # this includes github.com/jsonnet-libs/k8s-libsonnet/1.24@main + $ tk init --k8s=1.26 # this includes github.com/jsonnet-libs/k8s-libsonnet/1.26@main $ jb install github.com/cortexproject/cortex-jsonnet/cortex@main ``` diff --git a/build-image/Dockerfile b/build-image/Dockerfile index a874e693..70d2fd9b 100644 --- a/build-image/Dockerfile +++ b/build-image/Dockerfile @@ -1,5 +1,5 @@ # Build jsonnet -FROM alpine:3.17 AS jsonnet-builder +FROM alpine:3.18 AS jsonnet-builder RUN apk add --no-cache git make g++ RUN git clone https://github.com/google/jsonnet && \ git -C jsonnet checkout v0.20.0 && \ @@ -8,7 +8,7 @@ RUN git clone https://github.com/google/jsonnet && \ cp jsonnet/jsonnetfmt /usr/bin # Build jb -FROM alpine:3.17 AS jb-builder +FROM alpine:3.18 AS jb-builder ARG JSONNET_BUNDLER_VERSION=0.5.1 ARG JSONNET_BUNDLER_CHECKSUM="f5bccc94d28fbbe8ad1d46fd4f208619e45d368a5d7924f6335f4ecfa0605c85 /usr/bin/jb" RUN apk add --no-cache curl @@ -17,19 +17,19 @@ RUN echo "${JSONNET_BUNDLER_CHECKSUM}" | sha256sum -c || (printf "wanted: %s\n RUN chmod +x /usr/bin/jb # Build tanka -FROM alpine:3.17 AS tk-builder -ARG TANKA_VERSION=0.24.0 -ARG TANKA_CHECKSUM="82c8c533c29eefea0af9c28f487203b19dec84ce2624702f99196e777f946ddc /usr/bin/tk" +FROM alpine:3.18 AS tk-builder +ARG TANKA_VERSION=0.26.0 +ARG TANKA_CHECKSUM="089796ae2ce65390501b2c68ceca1ce99ff12787d5ae3b4823c825a07e6e22f4 /usr/bin/tk" RUN apk add --no-cache curl RUN curl -fSL -o "/usr/bin/tk" "https://github.com/grafana/tanka/releases/download/v${TANKA_VERSION}/tk-linux-amd64" RUN echo "${TANKA_CHECKSUM}" | sha256sum -c || (printf "wanted: %s\n got: %s\n" "${TANKA_CHECKSUM}" "$(sha256sum /usr/bin/tk)"; exit 1) RUN chmod +x /usr/bin/tk # Build mixtool -FROM golang:1.20-alpine AS mixtool-builder +FROM golang:1.21-alpine AS mixtool-builder RUN GO111MODULE=on go install github.com/monitoring-mixins/mixtool/cmd/mixtool@ae18e31161ea10545b9c1ac0d23c10122f2c12b5 -FROM alpine:3.17 +FROM 
alpine:3.18 RUN apk add --no-cache git make libgcc libstdc++ zip findutils sed yq COPY --from=jsonnet-builder /usr/bin/jsonnetfmt /usr/bin COPY --from=jsonnet-builder /usr/bin/jsonnet /usr/bin From 967325b522ad51c703648f33563b8ea31547ba46 Mon Sep 17 00:00:00 2001 From: Friedrich Gonzalez Date: Fri, 24 Nov 2023 04:20:44 +0100 Subject: [PATCH 38/60] Use cortex v1.16.0 (#40) Signed-off-by: Friedrich Gonzalez --- CHANGELOG.md | 1 + cortex/images.libsonnet | 6 +++--- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 70c494d8..f3fd0474 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,7 @@ # Changelog ## master / unreleased +* [CHANGE] Use cortex v1.16.0 ## 1.15.3 / 2023-11-24 * [CHANGE] Add default instance max series for ingesters diff --git a/cortex/images.libsonnet b/cortex/images.libsonnet index 05f89c02..fed6dc71 100644 --- a/cortex/images.libsonnet +++ b/cortex/images.libsonnet @@ -5,7 +5,7 @@ memcachedExporter: 'prom/memcached-exporter:v0.6.0', // Our services. 
- cortex: 'cortexproject/cortex:v1.15.3', + cortex: 'cortexproject/cortex:v1.16.0', alertmanager: self.cortex, distributor: self.cortex, @@ -20,7 +20,7 @@ query_scheduler: self.cortex, overrides_exporter: self.cortex, - query_tee: 'quay.io/cortexproject/query-tee:v1.15.3', - testExporter: 'cortexproject/test-exporter:v1.15.3', + query_tee: 'quay.io/cortexproject/query-tee:v1.16.0', + testExporter: 'cortexproject/test-exporter:v1.16.0', }, } From 8a792a819c53b74cd36da4ee44504f326c8b7cd1 Mon Sep 17 00:00:00 2001 From: Friedrich Gonzalez Date: Wed, 20 Dec 2023 15:31:11 +0100 Subject: [PATCH 39/60] Enable query stats on frontend by default (#41) Signed-off-by: Friedrich Gonzalez --- CHANGELOG.md | 1 + cortex/query-frontend.libsonnet | 5 ++++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f3fd0474..2387656c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,7 @@ ## master / unreleased * [CHANGE] Use cortex v1.16.0 +* [ENHANCEMENT] Enable frontend query stats by default ## 1.15.3 / 2023-11-24 * [CHANGE] Add default instance max series for ingesters diff --git a/cortex/query-frontend.libsonnet b/cortex/query-frontend.libsonnet index 5cbabaf0..a0552215 100644 --- a/cortex/query-frontend.libsonnet +++ b/cortex/query-frontend.libsonnet @@ -7,9 +7,12 @@ { target: 'query-frontend', - // Need log.level=debug so all queries are logged, needed for analyse.py. + // Need log.level=debug to see trace id for queries 'log.level': 'debug', + // a message with some statistics is logged for every query. + 'frontend.query-stats-enabled': true, + // Increase HTTP server response write timeout, as we were seeing some // queries that return a lot of data timeing out. 
'server.http-write-timeout': '1m', From 810c37b0396f23072c32a26d878452c53be758a0 Mon Sep 17 00:00:00 2001 From: Friedrich Gonzalez Date: Wed, 20 Dec 2023 16:07:07 +0100 Subject: [PATCH 40/60] Enable ruler query stats by default (#42) Signed-off-by: Friedrich Gonzalez --- CHANGELOG.md | 1 + cortex/ruler.libsonnet | 3 +++ 2 files changed, 4 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2387656c..65a43a06 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,7 @@ ## master / unreleased * [CHANGE] Use cortex v1.16.0 * [ENHANCEMENT] Enable frontend query stats by default +* [ENHANCEMENT] Enable ruler query stats by default ## 1.15.3 / 2023-11-24 * [CHANGE] Add default instance max series for ingesters diff --git a/cortex/ruler.libsonnet b/cortex/ruler.libsonnet index c60a8740..b3f47027 100644 --- a/cortex/ruler.libsonnet +++ b/cortex/ruler.libsonnet @@ -32,6 +32,9 @@ // Do not extend the replication set on unhealthy (or LEAVING) ingester when "unregister on shutdown" // is set to false. 'distributor.extend-writes': $._config.unregister_ingesters_on_shutdown, + + // a message with some statistics is logged for every query. 
+ 'ruler.query-stats-enabled': true, }, ruler_container:: From 2e0f64ddc17be61a5c4779500470a27df24fc4ee Mon Sep 17 00:00:00 2001 From: Friedrich Gonzalez Date: Tue, 20 Feb 2024 10:40:01 +0100 Subject: [PATCH 41/60] Upgrade memcached to 1.6.23 and memcached-exporter to v0.14.2 (#43) Signed-off-by: Friedrich Gonzalez --- CHANGELOG.md | 1 + cortex/images.libsonnet | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 65a43a06..db11f825 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,7 @@ # Changelog ## master / unreleased +* [CHANGE] Upgrade memcached to 1.6.23-alpine and memcached-exporter to v0.14.2 * [CHANGE] Use cortex v1.16.0 * [ENHANCEMENT] Enable frontend query stats by default * [ENHANCEMENT] Enable ruler query stats by default diff --git a/cortex/images.libsonnet b/cortex/images.libsonnet index fed6dc71..b87f9653 100644 --- a/cortex/images.libsonnet +++ b/cortex/images.libsonnet @@ -1,8 +1,8 @@ { _images+:: { // Various third-party images. - memcached: 'memcached:1.6.9-alpine', - memcachedExporter: 'prom/memcached-exporter:v0.6.0', + memcached: 'memcached:1.6.23-alpine', + memcachedExporter: 'prom/memcached-exporter:v0.14.2', // Our services. 
cortex: 'cortexproject/cortex:v1.16.0', From 8f6ad0d64a547f0389e0ad75a6d5c1b7437e846f Mon Sep 17 00:00:00 2001 From: Friedrich Gonzalez Date: Tue, 30 Apr 2024 10:32:15 -0700 Subject: [PATCH 42/60] Use ignore-blocks-within (#45) Signed-off-by: Friedrich Gonzalez --- CHANGELOG.md | 1 + cortex/config.libsonnet | 18 ++++++++++++------ 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index db11f825..2773b30b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,7 @@ * [CHANGE] Use cortex v1.16.0 * [ENHANCEMENT] Enable frontend query stats by default * [ENHANCEMENT] Enable ruler query stats by default +* [ENHANCEMENT] Configure `-blocks-storage.bucket-store.ignore-blocks-within` in queriers, rulers and store-gateways ## 1.15.3 / 2023-11-24 * [CHANGE] Add default instance max series for ingesters diff --git a/cortex/config.libsonnet b/cortex/config.libsonnet index 11c3f070..f9649856 100644 --- a/cortex/config.libsonnet +++ b/cortex/config.libsonnet @@ -74,8 +74,18 @@ 'store.engine': 'blocks', }, + // Ignore blocks in querier, ruler and store-gateways for the last 11h + ignore_blocks_within: '11h', + + // No need to look at store for data younger than 12h, as ingesters have all of it. + query_store_after: '12h', + + // Ingesters don't have data older than 13h, no need to ask them. + query_ingesters_within: '13h', + queryBlocksStorageConfig:: { 'blocks-storage.bucket-store.sync-dir': '/data/tsdb', + 'blocks-storage.bucket-store.ignore-blocks-within': $._config.ignore_blocks_within, 'blocks-storage.bucket-store.ignore-deletion-marks-delay': '1h', 'store-gateway.sharding-enabled': true, @@ -116,12 +126,8 @@ // type queries. 32 days to allow for comparision over the last month (31d) and // then some. 'store.max-query-length': '768h', - - // Ingesters don't have data older than 13h, no need to ask them. - 'querier.query-ingesters-within': '13h', - - // No need to look at store for data younger than 12h, as ingesters have all of it. 
- 'querier.query-store-after': '12h', + 'querier.query-ingesters-within': $._config.query_ingesters_within, + 'querier.query-store-after': $._config.query_store_after, }, // PromQL query engine config (shared between all services running PromQL engine, like the ruler and querier). From 34f241a07bec0c46bffa64757c9eb8899751b9b0 Mon Sep 17 00:00:00 2001 From: Friedrich Gonzalez Date: Tue, 30 Apr 2024 10:36:24 -0700 Subject: [PATCH 43/60] Release v1.16.1 (#46) Signed-off-by: Friedrich Gonzalez --- CHANGELOG.md | 4 ++-- cortex/images.libsonnet | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2773b30b..a900b307 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,8 +1,8 @@ # Changelog -## master / unreleased +## 1.16.1 * [CHANGE] Upgrade memcached to 1.6.23-alpine and memcached-exporter to v0.14.2 -* [CHANGE] Use cortex v1.16.0 +* [CHANGE] Use cortex v1.16.1 * [ENHANCEMENT] Enable frontend query stats by default * [ENHANCEMENT] Enable ruler query stats by default * [ENHANCEMENT] Configure `-blocks-storage.bucket-store.ignore-blocks-within` in queriers, rulers and store-gateways diff --git a/cortex/images.libsonnet b/cortex/images.libsonnet index b87f9653..54659454 100644 --- a/cortex/images.libsonnet +++ b/cortex/images.libsonnet @@ -5,7 +5,7 @@ memcachedExporter: 'prom/memcached-exporter:v0.14.2', // Our services. 
- cortex: 'cortexproject/cortex:v1.16.0', + cortex: 'cortexproject/cortex:v1.16.1', alertmanager: self.cortex, distributor: self.cortex, @@ -20,7 +20,7 @@ query_scheduler: self.cortex, overrides_exporter: self.cortex, - query_tee: 'quay.io/cortexproject/query-tee:v1.16.0', - testExporter: 'cortexproject/test-exporter:v1.16.0', + query_tee: 'quay.io/cortexproject/query-tee:v1.16.1', + testExporter: 'cortexproject/test-exporter:v1.16.1', }, } From f9386862998f8dc5229186fc2eafb2a1a6ca19aa Mon Sep 17 00:00:00 2001 From: Friedrich Gonzalez Date: Wed, 8 May 2024 10:43:35 -0700 Subject: [PATCH 44/60] Enable grpc compression to be snappy-block (#47) Reduces 93.6% traffic between ingester and distributor Causes no additional latency, cpu or memory usage Signed-off-by: Friedrich Gonzalez --- CHANGELOG.md | 3 +++ cortex/config.libsonnet | 1 + 2 files changed, 4 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index a900b307..c8e1c01e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,8 @@ # Changelog +## master / unreleased +* [ENHANCEMENT] Configure `-ingester.client.grpc-compression` to be `snappy-block` + ## 1.16.1 * [CHANGE] Upgrade memcached to 1.6.23-alpine and memcached-exporter to v0.14.2 * [CHANGE] Use cortex v1.16.1 diff --git a/cortex/config.libsonnet b/cortex/config.libsonnet index f9649856..708d2921 100644 --- a/cortex/config.libsonnet +++ b/cortex/config.libsonnet @@ -145,6 +145,7 @@ 'distributor.replication-factor': $._config.replication_factor, 'distributor.shard-by-all-labels': true, 'distributor.health-check-ingesters': true, + 'ingester.client.grpc-compression': 'snappy-block', 'ring.heartbeat-timeout': '10m', }, From 9cc367fa65b6e807093b85b18ae211daf3395a03 Mon Sep 17 00:00:00 2001 From: Friedrich Gonzalez Date: Wed, 8 May 2024 10:54:13 -0700 Subject: [PATCH 45/60] Move grpc-compression to ingesterClientConfig (#48) Signed-off-by: Friedrich Gonzalez --- cortex/config.libsonnet | 5 ++++- cortex/distributor.libsonnet | 1 + cortex/querier.libsonnet 
| 1 + cortex/ruler.libsonnet | 1 + 4 files changed, 7 insertions(+), 1 deletion(-) diff --git a/cortex/config.libsonnet b/cortex/config.libsonnet index 708d2921..d70f4e00 100644 --- a/cortex/config.libsonnet +++ b/cortex/config.libsonnet @@ -70,6 +70,10 @@ 'server.grpc.keepalive.ping-without-stream-allowed': true, }, + ingesterClientConfig:: { + 'ingester.client.grpc-compression': 'snappy-block', + }, + genericBlocksStorageConfig:: { 'store.engine': 'blocks', }, @@ -145,7 +149,6 @@ 'distributor.replication-factor': $._config.replication_factor, 'distributor.shard-by-all-labels': true, 'distributor.health-check-ingesters': true, - 'ingester.client.grpc-compression': 'snappy-block', 'ring.heartbeat-timeout': '10m', }, diff --git a/cortex/distributor.libsonnet b/cortex/distributor.libsonnet index 27591569..28c6ea2f 100644 --- a/cortex/distributor.libsonnet +++ b/cortex/distributor.libsonnet @@ -8,6 +8,7 @@ $._config.ringConfig + $._config.distributorConfig + $._config.distributorLimitsConfig + + $._config.ingesterClientConfig + { target: 'distributor', diff --git a/cortex/querier.libsonnet b/cortex/querier.libsonnet index 15e22459..026b2825 100644 --- a/cortex/querier.libsonnet +++ b/cortex/querier.libsonnet @@ -9,6 +9,7 @@ $._config.queryConfig + $._config.queryEngineConfig + $._config.distributorConfig + + $._config.ingesterClientConfig + $._config.queryBlocksStorageConfig + $.blocks_metadata_caching_config + $.bucket_index_config + diff --git a/cortex/ruler.libsonnet b/cortex/ruler.libsonnet index b3f47027..96781a19 100644 --- a/cortex/ruler.libsonnet +++ b/cortex/ruler.libsonnet @@ -9,6 +9,7 @@ $._config.queryConfig + $._config.queryEngineConfig + $._config.distributorConfig + + $._config.ingesterClientConfig + $._config.rulerClientConfig + $._config.rulerLimitsConfig + $._config.queryBlocksStorageConfig + From 410244b3baded0e37339fbbaf17093848c6bdcfd Mon Sep 17 00:00:00 2001 From: Friedrich Gonzalez <1517449+friedrichg@users.noreply.github.com> Date: Mon, 13 May 
2024 05:20:15 -0700 Subject: [PATCH 46/60] Enable shuffle sharding in compactors (#51) Signed-off-by: Friedrich Gonzalez --- CHANGELOG.md | 1 + cortex/compactor.libsonnet | 2 ++ 2 files changed, 3 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index c8e1c01e..df972e8c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,7 @@ # Changelog ## master / unreleased +* [CHANGE] Enable shuffle sharding in compactors * [ENHANCEMENT] Configure `-ingester.client.grpc-compression` to be `snappy-block` ## 1.16.1 diff --git a/cortex/compactor.libsonnet b/cortex/compactor.libsonnet index d12d5de5..65c10c81 100644 --- a/cortex/compactor.libsonnet +++ b/cortex/compactor.libsonnet @@ -30,6 +30,8 @@ // Enable sharding. 'compactor.sharding-enabled': true, + 'compactor.sharding-strategy': 'shuffle-sharding', + 'compactor.tenant-shard-size': 1, 'compactor.ring.store': 'consul', 'compactor.ring.consul.hostname': 'consul.%s.svc.cluster.local:8500' % $._config.namespace, 'compactor.ring.prefix': '', From 6804d0ab1c7defcd1ed0c59dbb6d867c6d718111 Mon Sep 17 00:00:00 2001 From: Friedrich Gonzalez <1517449+friedrichg@users.noreply.github.com> Date: Mon, 27 May 2024 02:22:40 -0700 Subject: [PATCH 47/60] Fix Cortex Service Scaling Dashboard for Grafana 11 (#52) * Fix Cortex Service Scaling for Grafana 11 Signed-off-by: Friedrich Gonzalez * Update CHANGELOG Signed-off-by: Friedrich Gonzalez --------- Signed-off-by: Friedrich Gonzalez --- .gitignore | 1 + CHANGELOG.md | 1 + .../dashboards/dashboard-utils.libsonnet | 38 +++++++++++++++++++ cortex-mixin/dashboards/scaling.libsonnet | 17 +++++---- 4 files changed, 49 insertions(+), 8 deletions(-) diff --git a/.gitignore b/.gitignore index 9d64df89..41d67a00 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,4 @@ cortex-mixin.zip cortex-mixin/out cortex-mixin/vendor /test-readme/ +.vscode diff --git a/CHANGELOG.md b/CHANGELOG.md index df972e8c..4ff1583e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,7 @@ ## master / unreleased * 
[CHANGE] Enable shuffle sharding in compactors * [ENHANCEMENT] Configure `-ingester.client.grpc-compression` to be `snappy-block` +* [ENHANCEMENT] Support Grafana 11 in Cortex Service Scaling Dashboard ## 1.16.1 * [CHANGE] Upgrade memcached to 1.6.23-alpine and memcached-exporter to v0.14.2 diff --git a/cortex-mixin/dashboards/dashboard-utils.libsonnet b/cortex-mixin/dashboards/dashboard-utils.libsonnet index e4268192..c0d2b087 100644 --- a/cortex-mixin/dashboards/dashboard-utils.libsonnet +++ b/cortex-mixin/dashboards/dashboard-utils.libsonnet @@ -503,4 +503,42 @@ local utils = import 'mixin-utils/utils.libsonnet'; %s ||| % [title, description], }, + + overrideHidden(name):: + { + matcher: { + id: 'byName', + options: name, + }, + properties: [ + { + id: 'custom.hidden', + value: true, + }, + ], + }, + + overrideDisplayName(name, displayName):: + { + matcher: { + id: 'byName', + options: name, + }, + properties: [ + { + id: 'displayName', + value: displayName, + }, + ], + }, + + + tablePanel(queries, overrides):: + super.tablePanel(queries, {}) + { + fieldConfig+: { + overrides+: overrides, + }, + styles:: null, + }, + } diff --git a/cortex-mixin/dashboards/scaling.libsonnet b/cortex-mixin/dashboards/scaling.libsonnet index a01a7db3..6ac244ea 100644 --- a/cortex-mixin/dashboards/scaling.libsonnet +++ b/cortex-mixin/dashboards/scaling.libsonnet @@ -47,14 +47,15 @@ local utils = import 'mixin-utils/utils.libsonnet'; cluster_namespace_deployment:actual_replicas:count{cluster=~"$cluster", namespace=~"$namespace"} ) |||, - ], { - __name__: { alias: 'Cluster', type: 'hidden' }, - cluster: { alias: 'Cluster' }, - namespace: { alias: 'Namespace' }, - deployment: { alias: 'Service' }, - reason: { alias: 'Reason' }, - Value: { alias: 'Required Replicas', decimals: 0 }, - }) + ], [ + $.overrideHidden('__name__'), + $.overrideHidden('Time'), + $.overrideDisplayName('cluster', 'Cluster'), + $.overrideDisplayName('namespace', 'Namespace'), + $.overrideDisplayName('deployment', 
'Service'), + $.overrideDisplayName('reason', 'Reason'), + $.overrideDisplayName('Value', 'Required Replicas'), + ]) ) ), } From 47240d3729f8bf6ce2201bd97775c9b457b1853e Mon Sep 17 00:00:00 2001 From: Friedrich Gonzalez <1517449+friedrichg@users.noreply.github.com> Date: Thu, 30 May 2024 06:04:26 -0700 Subject: [PATCH 48/60] Remove deprecated chunks dashboards (#54) Signed-off-by: Friedrich Gonzalez --- CHANGELOG.md | 1 + cortex-mixin/alerts/alerts.libsonnet | 161 ------------------- cortex-mixin/config.libsonnet | 14 +- cortex-mixin/dashboards.libsonnet | 19 +-- cortex-mixin/dashboards/chunks.libsonnet | 100 ------------ cortex-mixin/dashboards/comparison.libsonnet | 105 ------------ cortex-mixin/dashboards/queries.libsonnet | 41 ----- cortex-mixin/dashboards/reads.libsonnet | 76 --------- cortex-mixin/dashboards/ruler.libsonnet | 41 ----- cortex-mixin/dashboards/writes.libsonnet | 64 -------- cortex-mixin/docs/playbooks.md | 41 ----- 11 files changed, 5 insertions(+), 658 deletions(-) delete mode 100644 cortex-mixin/dashboards/chunks.libsonnet delete mode 100644 cortex-mixin/dashboards/comparison.libsonnet diff --git a/CHANGELOG.md b/CHANGELOG.md index 4ff1583e..0c6f70c3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,7 @@ ## master / unreleased * [CHANGE] Enable shuffle sharding in compactors +* [CHANGE] Remove chunks support for dashboards * [ENHANCEMENT] Configure `-ingester.client.grpc-compression` to be `snappy-block` * [ENHANCEMENT] Support Grafana 11 in Cortex Service Scaling Dashboard diff --git a/cortex-mixin/alerts/alerts.libsonnet b/cortex-mixin/alerts/alerts.libsonnet index a6287e5e..e67ef449 100644 --- a/cortex-mixin/alerts/alerts.libsonnet +++ b/cortex-mixin/alerts/alerts.libsonnet @@ -71,27 +71,6 @@ |||, }, }, - { - // We're syncing every 10mins, and this means with a 5min rate, we will have a NaN when syncs fail - // and we will never trigger the alert. 
- // We also have a 3h grace-period for creation of tables which means the we can fail for 3h before it's an outage. - alert: 'CortexTableSyncFailure', - expr: ||| - 100 * rate(cortex_table_manager_sync_duration_seconds_count{status_code!~"2.."}[15m]) - / - rate(cortex_table_manager_sync_duration_seconds_count[15m]) - > 10 - |||, - 'for': '30m', - labels: { - severity: 'critical', - }, - annotations: { - message: ||| - {{ $labels.job }} is experiencing {{ printf "%.2f" $value }}% errors syncing tables. - |||, - }, - }, { alert: 'CortexQueriesIncorrect', expr: ||| @@ -206,41 +185,6 @@ |||, }, }, - { - alert: 'CortexTransferFailed', - expr: ||| - max_over_time(cortex_shutdown_duration_seconds_count{op="transfer",status!="success"}[15m]) - |||, - 'for': '5m', - labels: { - severity: 'critical', - }, - annotations: { - message: ||| - {{ $labels.job }}/{{ $labels.instance }} transfer failed. - |||, - }, - }, - { - alert: 'CortexOldChunkInMemory', - // Even though we should flush chunks after 6h, we see that 99p of age of flushed chunks is closer - // to 10 hours. - // Ignore cortex_oldest_unflushed_chunk_timestamp_seconds that are zero (eg. distributors). - expr: ||| - (time() - cortex_oldest_unflushed_chunk_timestamp_seconds > 36000) - and - (cortex_oldest_unflushed_chunk_timestamp_seconds > 0) - |||, - 'for': '5m', - labels: { - severity: 'warning', - }, - annotations: { - message: ||| - {{ $labels.job }}/{{ $labels.instance }} has very old unflushed chunk in memory. - |||, - }, - }, { alert: 'CortexKVStoreFailure', expr: ||| @@ -379,87 +323,6 @@ }, ], }, - { - name: 'cortex_wal_alerts', - rules: [ - { - // Alert immediately if WAL is corrupt. - alert: 'CortexWALCorruption', - expr: ||| - increase(cortex_ingester_wal_corruptions_total[5m]) > 0 - |||, - labels: { - severity: 'critical', - }, - annotations: { - message: ||| - {{ $labels.job }}/{{ $labels.instance }} has a corrupted WAL or checkpoint. 
- |||, - }, - }, - { - // One or more failed checkpoint creation is a warning. - alert: 'CortexCheckpointCreationFailed', - expr: ||| - increase(cortex_ingester_checkpoint_creations_failed_total[10m]) > 0 - |||, - labels: { - severity: 'warning', - }, - annotations: { - message: ||| - {{ $labels.job }}/{{ $labels.instance }} failed to create checkpoint. - |||, - }, - }, - { - // Two or more failed checkpoint creation in 1h means something is wrong. - alert: 'CortexCheckpointCreationFailed', - expr: ||| - increase(cortex_ingester_checkpoint_creations_failed_total[1h]) > 1 - |||, - labels: { - severity: 'critical', - }, - annotations: { - message: ||| - {{ $labels.job }}/{{ $labels.instance }} is failing to create checkpoint. - |||, - }, - }, - { - // One or more failed checkpoint deletion is a warning. - alert: 'CortexCheckpointDeletionFailed', - expr: ||| - increase(cortex_ingester_checkpoint_deletions_failed_total[10m]) > 0 - |||, - labels: { - severity: 'warning', - }, - annotations: { - message: ||| - {{ $labels.job }}/{{ $labels.instance }} failed to delete checkpoint. - |||, - }, - }, - { - // Two or more failed checkpoint deletion in 2h means something is wrong. - // We give this more buffer than creation as this is a less critical operation. - alert: 'CortexCheckpointDeletionFailed', - expr: ||| - increase(cortex_ingester_checkpoint_deletions_failed_total[2h]) > 1 - |||, - labels: { - severity: 'critical', - }, - annotations: { - message: ||| - {{ $labels.instance }} is failing to delete checkpoint. - |||, - }, - }, - ], - }, { name: 'cortex-rollout-alerts', rules: [ @@ -524,30 +387,6 @@ { name: 'cortex-provisioning', rules: [ - { - alert: 'CortexProvisioningMemcachedTooSmall', - // 4 x in-memory series size = 24hrs of data. 
- expr: ||| - ( - 4 * - sum by (%s) (cortex_ingester_memory_series * cortex_ingester_chunk_size_bytes_sum / cortex_ingester_chunk_size_bytes_count) - / 1e9 - ) - > - ( - sum by (%s) (memcached_limit_bytes{job=~".+/memcached"}) / 1e9 - ) - ||| % [$._config.alert_aggregation_labels, $._config.alert_aggregation_labels], - 'for': '15m', - labels: { - severity: 'warning', - }, - annotations: { - message: ||| - Chunk memcached cluster in %(alert_aggregation_variables)s is too small, should be at least {{ printf "%%.2f" $value }}GB. - ||| % $._config, - }, - }, { alert: 'CortexProvisioningTooManyActiveSeries', // We target each ingester to 1.5M in-memory series. This alert fires if the average diff --git a/cortex-mixin/config.libsonnet b/cortex-mixin/config.libsonnet index 06941b6d..2f620703 100644 --- a/cortex-mixin/config.libsonnet +++ b/cortex-mixin/config.libsonnet @@ -3,18 +3,7 @@ grafanaDashboardShards: 4, _config+:: { - // Switch for overall storage engine. - // May contain 'chunks', 'blocks' or both. - // Enables chunks- or blocks- specific panels and dashboards. - storage_engine: ['blocks'], - - // For chunks backend, switch for chunk index type. - // May contain 'bigtable', 'dynamodb' or 'cassandra'. - chunk_index_backend: ['bigtable', 'dynamodb', 'cassandra'], - - // For chunks backend, switch for chunk store type. - // May contain 'bigtable', 'dynamodb', 'cassandra', 's3' or 'gcs'. - chunk_store_backend: ['bigtable', 'dynamodb', 'cassandra', 's3', 'gcs'], + storage_engine: ['blocks'], // TODO: Remove this option, it's not needed // Tags for dashboards. tags: ['cortex'], @@ -32,7 +21,6 @@ ruler: '(ruler|cortex$)', query_frontend: '(query-frontend.*|cortex$)', // Match also custom query-frontend deployments. query_scheduler: 'query-scheduler.*', // Not part of single-binary. Match also custom query-scheduler deployments. 
- table_manager: '(table-manager|cortex$)', ring_members: ['compactor', 'distributor', 'ingester.*', 'querier.*', 'ruler', 'store-gateway', 'cortex'], store_gateway: '(store-gateway|cortex$)', gateway: '(gateway|cortex-gw|cortex-gw-internal)', diff --git a/cortex-mixin/dashboards.libsonnet b/cortex-mixin/dashboards.libsonnet index 9e7f71c2..e4b68c4b 100644 --- a/cortex-mixin/dashboards.libsonnet +++ b/cortex-mixin/dashboards.libsonnet @@ -9,22 +9,9 @@ (import 'dashboards/writes.libsonnet') + (import 'dashboards/slow-queries.libsonnet') + (import 'dashboards/rollout-progress.libsonnet') + - - (if std.member($._config.storage_engine, 'blocks') - then - (import 'dashboards/compactor.libsonnet') + - (import 'dashboards/compactor-resources.libsonnet') + - (import 'dashboards/object-store.libsonnet') - else {}) + - - (if std.member($._config.storage_engine, 'chunks') - then import 'dashboards/chunks.libsonnet' - else {}) + - - (if std.member($._config.storage_engine, 'blocks') - && std.member($._config.storage_engine, 'chunks') - then import 'dashboards/comparison.libsonnet' - else {}) + + (import 'dashboards/compactor.libsonnet') + + (import 'dashboards/compactor-resources.libsonnet') + + (import 'dashboards/object-store.libsonnet') + (if !$._config.resources_dashboards_enabled then {} else (import 'dashboards/reads-resources.libsonnet') + diff --git a/cortex-mixin/dashboards/chunks.libsonnet b/cortex-mixin/dashboards/chunks.libsonnet deleted file mode 100644 index b82c6880..00000000 --- a/cortex-mixin/dashboards/chunks.libsonnet +++ /dev/null @@ -1,100 +0,0 @@ -local utils = import 'mixin-utils/utils.libsonnet'; - -(import 'dashboard-utils.libsonnet') { - 'cortex-chunks.json': - ($.dashboard('Cortex / Chunks') + { uid: 'a56a3fa6284064eb392a115f3acbf744' }) - .addClusterSelectorTemplates() - .addRow( - $.row('Active Series / Chunks') - .addPanel( - $.panel('Series') + - $.queryPanel('sum(cortex_ingester_memory_series{%s})' % $.jobMatcher($._config.job_names.ingester), 
'series'), - ) - .addPanel( - $.panel('Chunks per series') + - $.queryPanel('sum(cortex_ingester_memory_chunks{%s}) / sum(cortex_ingester_memory_series{%s})' % [$.jobMatcher($._config.job_names.ingester), $.jobMatcher($._config.job_names.ingester)], 'chunks'), - ) - ) - .addRow( - $.row('Flush Stats') - .addPanel( - $.panel('Utilization') + - $.latencyPanel('cortex_ingester_chunk_utilization', '{%s}' % $.jobMatcher($._config.job_names.ingester), multiplier='1') + - { yaxes: $.yaxes('percentunit') }, - ) - .addPanel( - $.panel('Age') + - $.latencyPanel('cortex_ingester_chunk_age_seconds', '{%s}' % $.jobMatcher($._config.job_names.ingester)), - ), - ) - .addRow( - $.row('Flush Stats') - .addPanel( - $.panel('Size') + - $.latencyPanel('cortex_ingester_chunk_length', '{%s}' % $.jobMatcher($._config.job_names.ingester), multiplier='1') + - { yaxes: $.yaxes('short') }, - ) - .addPanel( - $.panel('Entries') + - $.queryPanel('sum(rate(cortex_chunk_store_index_entries_per_chunk_sum{%s}[5m])) / sum(rate(cortex_chunk_store_index_entries_per_chunk_count{%s}[5m]))' % [$.jobMatcher($._config.job_names.ingester), $.jobMatcher($._config.job_names.ingester)], 'entries'), - ), - ) - .addRow( - $.row('Flush Stats') - .addPanel( - $.panel('Queue Length') + - $.queryPanel('cortex_ingester_flush_queue_length{%s}' % $.jobMatcher($._config.job_names.ingester), '{{%s}}' % $._config.per_instance_label), - ) - .addPanel( - $.panel('Flush Rate') + - $.qpsPanel('cortex_ingester_chunk_age_seconds_count{%s}' % $.jobMatcher($._config.job_names.ingester)), - ), - ), - - 'cortex-wal.json': - ($.dashboard('Cortex / WAL') + { uid: 'd4fb924cdc1581cd8e870e3eb0110bda' }) - .addClusterSelectorTemplates() - .addRow( - $.row('') - .addPanel( - $.panel('Bytes Logged (WAL+Checkpoint) / ingester / second') + - $.queryPanel('avg(rate(cortex_ingester_wal_logged_bytes_total{%(m)s}[$__rate_interval])) + avg(rate(cortex_ingester_checkpoint_logged_bytes_total{%(m)s}[$__rate_interval]))' % { m: 
$.jobMatcher($._config.job_names.ingester) }, 'bytes') + - { yaxes: $.yaxes('bytes') }, - ) - ) - .addRow( - $.row('WAL') - .addPanel( - $.panel('Records logged / ingester / second') + - $.queryPanel('avg(rate(cortex_ingester_wal_records_logged_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.ingester), 'records'), - ) - .addPanel( - $.panel('Bytes per record') + - $.queryPanel('avg(rate(cortex_ingester_wal_logged_bytes_total{%(m)s}[$__rate_interval]) / rate(cortex_ingester_wal_records_logged_total{%(m)s}[$__rate_interval]))' % { m: $.jobMatcher($._config.job_names.ingester) }, 'bytes') + - { yaxes: $.yaxes('bytes') }, - ) - .addPanel( - $.panel('Bytes per sample') + - $.queryPanel('avg(rate(cortex_ingester_wal_logged_bytes_total{%(m)s}[$__rate_interval]) / rate(cortex_ingester_ingested_samples_total{%(m)s}[$__rate_interval]))' % { m: $.jobMatcher($._config.job_names.ingester) }, 'bytes') + - { yaxes: $.yaxes('bytes') }, - ) - .addPanel( - $.panel('Min(available disk space)') + - $.queryPanel('min(kubelet_volume_stats_available_bytes{cluster=~"$cluster", namespace=~"$namespace", persistentvolumeclaim=~"ingester.*"})', 'bytes') + - { yaxes: $.yaxes('bytes') }, - ) - ) - .addRow( - $.row('Checkpoint') - .addPanel( - $.panel('Checkpoint creation/deletion / sec') + - $.queryPanel('rate(cortex_ingester_checkpoint_creations_total{%s}[$__rate_interval])' % $.jobMatcher($._config.job_names.ingester), '{{%s}}-creation' % $._config.per_instance_label) + - $.queryPanel('rate(cortex_ingester_checkpoint_deletions_total{%s}[$__rate_interval])' % $.jobMatcher($._config.job_names.ingester), '{{%s}}-deletion' % $._config.per_instance_label), - ) - .addPanel( - $.panel('Checkpoint creation/deletion failed / sec') + - $.queryPanel('rate(cortex_ingester_checkpoint_creations_failed_total{%s}[$__rate_interval])' % $.jobMatcher($._config.job_names.ingester), '{{%s}}-creation' % $._config.per_instance_label) + - 
$.queryPanel('rate(cortex_ingester_checkpoint_deletions_failed_total{%s}[$__rate_interval])' % $.jobMatcher($._config.job_names.ingester), '{{%s}}-deletion' % $._config.per_instance_label), - ) - ), -} diff --git a/cortex-mixin/dashboards/comparison.libsonnet b/cortex-mixin/dashboards/comparison.libsonnet deleted file mode 100644 index 1716f7d4..00000000 --- a/cortex-mixin/dashboards/comparison.libsonnet +++ /dev/null @@ -1,105 +0,0 @@ -local utils = import 'mixin-utils/utils.libsonnet'; - -(import 'dashboard-utils.libsonnet') -{ - 'cortex-blocks-vs-chunks.json': - ($.dashboard('Cortex / Blocks vs Chunks') + { uid: '0e2b4dd23df9921972e3fb554c0fc483' }) - .addMultiTemplate('cluster', 'kube_pod_container_info{image=~".*cortex.*"}', 'cluster') - .addTemplate('blocks_namespace', 'kube_pod_container_info{image=~".*cortex.*"}', 'namespace') - .addTemplate('chunks_namespace', 'kube_pod_container_info{image=~".*cortex.*"}', 'namespace') - .addRow( - $.row('Ingesters') - .addPanel( - $.panel('Samples / sec') + - $.queryPanel('sum(rate(cortex_ingester_ingested_samples_total{cluster=~"$cluster",job=~"($blocks_namespace)/ingester"}[$__rate_interval]))', 'blocks') + - $.queryPanel('sum(rate(cortex_ingester_ingested_samples_total{cluster=~"$cluster",job=~"($chunks_namespace)/ingester"}[$__rate_interval]))', 'chunks') - ) - ) - .addRow( - $.row('') - .addPanel( - $.panel('Blocks Latency') + - utils.latencyRecordingRulePanel('cortex_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($blocks_namespace)/ingester'), utils.selector.eq('route', '/cortex.Ingester/Push')]) - ) - .addPanel( - $.panel('Chunks Latency') + - utils.latencyRecordingRulePanel('cortex_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($chunks_namespace)/ingester'), utils.selector.eq('route', '/cortex.Ingester/Push')]) - ) - ) - .addRow( - $.row('') - .addPanel( - $.panel('CPU per sample') + - 
$.queryPanel('sum(rate(container_cpu_usage_seconds_total{cluster=~"$cluster",namespace="$blocks_namespace",container="ingester"}[$__rate_interval])) / sum(rate(cortex_ingester_ingested_samples_total{cluster=~"$cluster",job="$blocks_namespace/ingester"}[$__rate_interval]))', 'blocks') + - $.queryPanel('sum(rate(container_cpu_usage_seconds_total{cluster=~"$cluster",namespace="$chunks_namespace",container="ingester"}[$__rate_interval])) / sum(rate(cortex_ingester_ingested_samples_total{cluster=~"$cluster",job="$chunks_namespace/ingester"}[$__rate_interval]))', 'chunks') - ) - .addPanel( - $.panel('Memory per active series') + - $.queryPanel('sum(container_memory_working_set_bytes{cluster=~"$cluster",namespace="$blocks_namespace",container="ingester"}) / sum(cortex_ingester_memory_series{cluster=~"$cluster",job=~"$blocks_namespace/ingester"})', 'blocks - working set') + - $.queryPanel('sum(container_memory_working_set_bytes{cluster=~"$cluster",namespace="$chunks_namespace",container="ingester"}) / sum(cortex_ingester_memory_series{cluster=~"$cluster",job=~"$chunks_namespace/ingester"})', 'chunks - working set') + - $.queryPanel('sum(go_memstats_heap_inuse_bytes{cluster=~"$cluster",job=~"$blocks_namespace/ingester"}) / sum(cortex_ingester_memory_series{cluster=~"$cluster",job=~"$blocks_namespace/ingester"})', 'blocks - heap inuse') + - $.queryPanel('sum(go_memstats_heap_inuse_bytes{cluster=~"$cluster",job=~"$chunks_namespace/ingester"}) / sum(cortex_ingester_memory_series{cluster=~"$cluster",job=~"$chunks_namespace/ingester"})', 'chunks - heap inuse') + - { yaxes: $.yaxes('bytes') } - ) - ) - .addRow( - $.row('') - .addPanel( - $.panel('CPU') + - $.queryPanel('sum(rate(container_cpu_usage_seconds_total{cluster=~"$cluster",namespace="$blocks_namespace",container="ingester"}[$__rate_interval]))', 'blocks') + - $.queryPanel('sum(rate(container_cpu_usage_seconds_total{cluster=~"$cluster",namespace="$chunks_namespace",container="ingester"}[$__rate_interval]))', 'chunks') - ) 
- .addPanel( - $.panel('Memory') + - $.queryPanel('sum(container_memory_working_set_bytes{cluster=~"$cluster",namespace="$blocks_namespace",container="ingester"})', 'blocks - working set') + - $.queryPanel('sum(container_memory_working_set_bytes{cluster=~"$cluster",namespace="$chunks_namespace",container="ingester"})', 'chunks - working set') + - $.queryPanel('sum(go_memstats_heap_inuse_bytes{cluster=~"$cluster",job=~"$blocks_namespace/ingester"})', 'blocks - heap inuse') + - $.queryPanel('sum(go_memstats_heap_inuse_bytes{cluster=~"$cluster",job=~"$chunks_namespace/ingester"})', 'chunks - heap inuse') + - { yaxes: $.yaxes('bytes') } - ) - ) - .addRow( - $.row('Queriers') - .addPanel( - $.panel('Queries / sec (query-frontend)') + - $.queryPanel('sum(rate(cortex_request_duration_seconds_count{cluster=~"$cluster",job="$blocks_namespace/query-frontend",route!="metrics"}[$__rate_interval]))', 'blocks') + - $.queryPanel('sum(rate(cortex_request_duration_seconds_count{cluster=~"$cluster",job="$chunks_namespace/query-frontend",route!="metrics"}[$__rate_interval]))', 'chunks') - ) - .addPanel( - $.panel('Queries / sec (query-tee)') + - $.queryPanel('sum(rate(cortex_querytee_request_duration_seconds_count{cluster=~"$cluster",backend=~".*\\\\.$blocks_namespace\\\\..*"}[$__rate_interval]))', 'blocks') + - $.queryPanel('sum(rate(cortex_querytee_request_duration_seconds_count{cluster=~"$cluster",backend=~".*\\\\.$chunks_namespace\\\\..*"}[$__rate_interval]))', 'chunks') - ) - ) - .addRow( - $.row('') - .addPanel( - $.panel('Latency 99th') + - $.queryPanel('histogram_quantile(0.99, sum by(backend, le) (rate(cortex_querytee_request_duration_seconds_bucket{cluster=~"$cluster",backend=~".*\\\\.$blocks_namespace\\\\..*"}[$__rate_interval])))', 'blocks') + - $.queryPanel('histogram_quantile(0.99, sum by(backend, le) (rate(cortex_querytee_request_duration_seconds_bucket{cluster=~"$cluster",backend=~".*\\\\.$chunks_namespace\\\\..*"}[$__rate_interval])))', 'chunks') + - { yaxes: 
$.yaxes('s') } - ) - .addPanel( - $.panel('Latency average') + - $.queryPanel('sum by(backend) (rate(cortex_querytee_request_duration_seconds_sum{cluster=~"$cluster",backend=~".*\\\\.$blocks_namespace\\\\..*"}[$__rate_interval])) / sum by(backend) (rate(cortex_querytee_request_duration_seconds_count{cluster=~"$cluster",backend=~".*\\\\.$blocks_namespace\\\\..*"}[$__rate_interval]))', 'blocks') + - $.queryPanel('sum by(backend) (rate(cortex_querytee_request_duration_seconds_sum{cluster=~"$cluster",backend=~".*\\\\.$chunks_namespace\\\\..*"}[$__rate_interval])) / sum by(backend) (rate(cortex_querytee_request_duration_seconds_count{cluster=~"$cluster",backend=~".*\\\\.$chunks_namespace\\\\..*"}[$__rate_interval]))', 'chunks') + - { yaxes: $.yaxes('s') } - ) - ) - .addRow( - $.row('') - .addPanel( - $.panel('CPU') + - $.queryPanel('sum(rate(container_cpu_usage_seconds_total{cluster=~"$cluster",namespace="$blocks_namespace",container="querier"}[$__rate_interval]))', 'blocks') + - $.queryPanel('sum(rate(container_cpu_usage_seconds_total{cluster=~"$cluster",namespace="$chunks_namespace",container="querier"}[$__rate_interval]))', 'chunks') - ) - .addPanel( - $.panel('Memory') + - $.queryPanel('sum(container_memory_working_set_bytes{cluster=~"$cluster",namespace="$blocks_namespace",container="querier"})', 'blocks - working set') + - $.queryPanel('sum(container_memory_working_set_bytes{cluster=~"$cluster",namespace="$chunks_namespace",container="querier"})', 'chunks - working set') + - $.queryPanel('sum(go_memstats_heap_inuse_bytes{cluster=~"$cluster",job=~"$blocks_namespace/querier"})', 'blocks - heap inuse') + - $.queryPanel('sum(go_memstats_heap_inuse_bytes{cluster=~"$cluster",job=~"$chunks_namespace/querier"})', 'chunks - heap inuse') + - { yaxes: $.yaxes('bytes') } - ) - ), -} diff --git a/cortex-mixin/dashboards/queries.libsonnet b/cortex-mixin/dashboards/queries.libsonnet index 853d0f8c..cada5c8e 100644 --- a/cortex-mixin/dashboards/queries.libsonnet +++ 
b/cortex-mixin/dashboards/queries.libsonnet @@ -110,23 +110,6 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.queryPanel('sum(rate(cortex_cache_corrupt_chunks_total{%s}[1m]))' % $.jobMatcher($._config.job_names.querier), 'Corrupt chunks'), ) ) - .addRowIf( - std.member($._config.storage_engine, 'chunks'), - $.row('Querier - Chunks storage - Index Cache') - .addPanel( - $.panel('Total entries') + - $.queryPanel('sum(querier_cache_added_new_total{cache="store.index-cache-read.fifocache",%s}) - sum(querier_cache_evicted_total{cache="store.index-cache-read.fifocache",%s})' % [$.jobMatcher($._config.job_names.querier), $.jobMatcher($._config.job_names.querier)], 'Entries'), - ) - .addPanel( - $.panel('Cache Hit %') + - $.queryPanel('(sum(rate(querier_cache_gets_total{cache="store.index-cache-read.fifocache",%s}[1m])) - sum(rate(querier_cache_misses_total{cache="store.index-cache-read.fifocache",%s}[1m]))) / sum(rate(querier_cache_gets_total{cache="store.index-cache-read.fifocache",%s}[1m]))' % [$.jobMatcher($._config.job_names.querier), $.jobMatcher($._config.job_names.querier), $.jobMatcher($._config.job_names.querier)], 'hit rate') - { yaxes: $.yaxes({ format: 'percentunit', max: 1 }) }, - ) - .addPanel( - $.panel('Churn Rate') + - $.queryPanel('sum(rate(querier_cache_evicted_total{cache="store.index-cache-read.fifocache",%s}[1m]))' % $.jobMatcher($._config.job_names.querier), 'churn rate'), - ) - ) .addRow( $.row('Ingester') .addPanel( @@ -145,30 +128,6 @@ local utils = import 'mixin-utils/utils.libsonnet'; { yaxes: $.yaxes('short') }, ) ) - .addRowIf( - std.member($._config.storage_engine, 'chunks'), - $.row('Querier - Chunks storage - Store') - .addPanel( - $.panel('Index Lookups per Query') + - utils.latencyRecordingRulePanel('cortex_chunk_store_index_lookups_per_query', $.jobSelector($._config.job_names.querier), multiplier=1) + - { yaxes: $.yaxes('short') }, - ) - .addPanel( - $.panel('Series (pre-intersection) per Query') + - 
utils.latencyRecordingRulePanel('cortex_chunk_store_series_pre_intersection_per_query', $.jobSelector($._config.job_names.querier), multiplier=1) + - { yaxes: $.yaxes('short') }, - ) - .addPanel( - $.panel('Series (post-intersection) per Query') + - utils.latencyRecordingRulePanel('cortex_chunk_store_series_post_intersection_per_query', $.jobSelector($._config.job_names.querier), multiplier=1) + - { yaxes: $.yaxes('short') }, - ) - .addPanel( - $.panel('Chunks per Query') + - utils.latencyRecordingRulePanel('cortex_chunk_store_chunks_per_query', $.jobSelector($._config.job_names.querier), multiplier=1) + - { yaxes: $.yaxes('short') }, - ) - ) .addRowIf( std.member($._config.storage_engine, 'blocks'), $.row('Querier - Blocks storage') diff --git a/cortex-mixin/dashboards/reads.libsonnet b/cortex-mixin/dashboards/reads.libsonnet index 9bc9b7d6..5a720784 100644 --- a/cortex-mixin/dashboards/reads.libsonnet +++ b/cortex-mixin/dashboards/reads.libsonnet @@ -216,30 +216,6 @@ local utils = import 'mixin-utils/utils.libsonnet'; { yaxes: $.yaxes('s') } ) ) - .addRowIf( - std.member($._config.storage_engine, 'chunks'), - $.row('Memcached - Chunks storage - Index') - .addPanel( - $.panel('Requests / sec') + - $.qpsPanel('cortex_cache_request_duration_seconds_count{%s,method="store.index-cache-read.memcache.fetch"}' % $.jobMatcher($._config.job_names.querier)) - ) - .addPanel( - $.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_cache_request_duration_seconds', $.jobSelector($._config.job_names.querier) + [utils.selector.eq('method', 'store.index-cache-read.memcache.fetch')]) - ) - ) - .addRowIf( - std.member($._config.storage_engine, 'chunks'), - $.row('Memcached - Chunks storage - Chunks') - .addPanel( - $.panel('Requests / sec') + - $.qpsPanel('cortex_cache_request_duration_seconds_count{%s,method="chunksmemcache.fetch"}' % $.jobMatcher($._config.job_names.querier)) - ) - .addPanel( - $.panel('Latency') + - 
utils.latencyRecordingRulePanel('cortex_cache_request_duration_seconds', $.jobSelector($._config.job_names.querier) + [utils.selector.eq('method', 'chunksmemcache.fetch')]) - ) - ) .addRowIf( std.member($._config.storage_engine, 'blocks'), $.row('Memcached – Blocks storage – Block index cache (store-gateway accesses)') // Resembles thanosMemcachedCache @@ -339,58 +315,6 @@ local utils = import 'mixin-utils/utils.libsonnet'; 'metadata-cache' ) ) - .addRowIf( - std.member($._config.storage_engine, 'chunks') && - std.member($._config.chunk_index_backend + $._config.chunk_store_backend, 'cassandra'), - $.row('Cassandra') - .addPanel( - $.panel('Requests / sec') + - $.qpsPanel('cortex_cassandra_request_duration_seconds_count{%s, operation="SELECT"}' % $.jobMatcher($._config.job_names.querier)) - ) - .addPanel( - $.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_cassandra_request_duration_seconds', $.jobSelector($._config.job_names.querier) + [utils.selector.eq('operation', 'SELECT')]) - ) - ) - .addRowIf( - std.member($._config.storage_engine, 'chunks') && - std.member($._config.chunk_index_backend + $._config.chunk_store_backend, 'bigtable'), - $.row('BigTable') - .addPanel( - $.panel('Requests / sec') + - $.qpsPanel('cortex_bigtable_request_duration_seconds_count{%s, operation="/google.bigtable.v2.Bigtable/ReadRows"}' % $.jobMatcher($._config.job_names.querier)) - ) - .addPanel( - $.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_bigtable_request_duration_seconds', $.jobSelector($._config.job_names.querier) + [utils.selector.eq('operation', '/google.bigtable.v2.Bigtable/ReadRows')]) - ), - ) - .addRowIf( - std.member($._config.storage_engine, 'chunks') && - std.member($._config.chunk_index_backend + $._config.chunk_store_backend, 'dynamodb'), - $.row('DynamoDB') - .addPanel( - $.panel('Requests / sec') + - $.qpsPanel('cortex_dynamo_request_duration_seconds_count{%s, operation="DynamoDB.QueryPages"}' % 
$.jobMatcher($._config.job_names.querier)) - ) - .addPanel( - $.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_dynamo_request_duration_seconds', $.jobSelector($._config.job_names.querier) + [utils.selector.eq('operation', 'DynamoDB.QueryPages')]) - ), - ) - .addRowIf( - std.member($._config.storage_engine, 'chunks') && - std.member($._config.chunk_store_backend, 'gcs'), - $.row('GCS') - .addPanel( - $.panel('Requests / sec') + - $.qpsPanel('cortex_gcs_request_duration_seconds_count{%s, operation="GET"}' % $.jobMatcher($._config.job_names.querier)) - ) - .addPanel( - $.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_gcs_request_duration_seconds', $.jobSelector($._config.job_names.querier) + [utils.selector.eq('operation', 'GET')]) - ) - ) // Object store metrics for the store-gateway. .addRowsIf( std.member($._config.storage_engine, 'blocks'), diff --git a/cortex-mixin/dashboards/ruler.libsonnet b/cortex-mixin/dashboards/ruler.libsonnet index d1062581..b243198c 100644 --- a/cortex-mixin/dashboards/ruler.libsonnet +++ b/cortex-mixin/dashboards/ruler.libsonnet @@ -144,47 +144,6 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.latencyPanel('cortex_ingester_client_request_duration_seconds', '{%s, operation="/cortex.Ingester/QueryStream"}' % $.jobMatcher($._config.job_names.ruler)) ) ) - .addRowIf( - std.member($._config.storage_engine, 'chunks'), - $.row('Ruler - Chunks storage - Index Cache') - .addPanel( - $.panel('Total entries') + - $.queryPanel('sum(querier_cache_added_new_total{cache="store.index-cache-read.fifocache",%s}) - sum(querier_cache_evicted_total{cache="store.index-cache-read.fifocache",%s})' % [$.jobMatcher($._config.job_names.ruler), $.jobMatcher($._config.job_names.ruler)], 'Entries'), - ) - .addPanel( - $.panel('Cache Hit %') + - $.queryPanel('(sum(rate(querier_cache_gets_total{cache="store.index-cache-read.fifocache",%s}[1m])) - sum(rate(querier_cache_misses_total{cache="store.index-cache-read.fifocache",%s}[1m]))) 
/ sum(rate(querier_cache_gets_total{cache="store.index-cache-read.fifocache",%s}[1m]))' % [$.jobMatcher($._config.job_names.ruler), $.jobMatcher($._config.job_names.ruler), $.jobMatcher($._config.job_names.ruler)], 'hit rate') - { yaxes: $.yaxes({ format: 'percentunit', max: 1 }) }, - ) - .addPanel( - $.panel('Churn Rate') + - $.queryPanel('sum(rate(querier_cache_evicted_total{cache="store.index-cache-read.fifocache",%s}[1m]))' % $.jobMatcher($._config.job_names.ruler), 'churn rate'), - ) - ) - .addRowIf( - std.member($._config.storage_engine, 'chunks'), - $.row('Ruler - Chunks storage - Store') - .addPanel( - $.panel('Index Lookups per Query') + - utils.latencyRecordingRulePanel('cortex_chunk_store_index_lookups_per_query', $.jobSelector($._config.job_names.ruler), multiplier=1) + - { yaxes: $.yaxes('short') }, - ) - .addPanel( - $.panel('Series (pre-intersection) per Query') + - utils.latencyRecordingRulePanel('cortex_chunk_store_series_pre_intersection_per_query', $.jobSelector($._config.job_names.ruler), multiplier=1) + - { yaxes: $.yaxes('short') }, - ) - .addPanel( - $.panel('Series (post-intersection) per Query') + - utils.latencyRecordingRulePanel('cortex_chunk_store_series_post_intersection_per_query', $.jobSelector($._config.job_names.ruler), multiplier=1) + - { yaxes: $.yaxes('short') }, - ) - .addPanel( - $.panel('Chunks per Query') + - utils.latencyRecordingRulePanel('cortex_chunk_store_chunks_per_query', $.jobSelector($._config.job_names.ruler), multiplier=1) + - { yaxes: $.yaxes('short') }, - ) - ) .addRowIf( std.member($._config.storage_engine, 'blocks'), $.row('Ruler - Blocks storage') diff --git a/cortex-mixin/dashboards/writes.libsonnet b/cortex-mixin/dashboards/writes.libsonnet index e99faee4..c6563645 100644 --- a/cortex-mixin/dashboards/writes.libsonnet +++ b/cortex-mixin/dashboards/writes.libsonnet @@ -140,70 +140,6 @@ local utils = import 'mixin-utils/utils.libsonnet'; utils.latencyRecordingRulePanel('cortex_kv_request_duration_seconds', 
$.jobSelector($._config.job_names.ingester)) ) ) - .addRowIf( - std.member($._config.storage_engine, 'chunks'), - $.row('Memcached') - .addPanel( - $.panel('Requests / sec') + - $.qpsPanel('cortex_memcache_request_duration_seconds_count{%s,method="Memcache.Put"}' % $.jobMatcher($._config.job_names.ingester)) - ) - .addPanel( - $.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_memcache_request_duration_seconds', $.jobSelector($._config.job_names.ingester) + [utils.selector.eq('method', 'Memcache.Put')]) - ) - ) - .addRowIf( - std.member($._config.storage_engine, 'chunks') && - std.member($._config.chunk_index_backend + $._config.chunk_store_backend, 'cassandra'), - $.row('Cassandra') - .addPanel( - $.panel('Requests / sec') + - $.qpsPanel('cortex_cassandra_request_duration_seconds_count{%s, operation="INSERT"}' % $.jobMatcher($._config.job_names.ingester)) - ) - .addPanel( - $.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_cassandra_request_duration_seconds', $.jobSelector($._config.job_names.ingester) + [utils.selector.eq('operation', 'INSERT')]) - ) - ) - .addRowIf( - std.member($._config.storage_engine, 'chunks') && - std.member($._config.chunk_index_backend + $._config.chunk_store_backend, 'bigtable'), - $.row('BigTable') - .addPanel( - $.panel('Requests / sec') + - $.qpsPanel('cortex_bigtable_request_duration_seconds_count{%s, operation="/google.bigtable.v2.Bigtable/MutateRows"}' % $.jobMatcher($._config.job_names.ingester)) - ) - .addPanel( - $.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_bigtable_request_duration_seconds', $.jobSelector($._config.job_names.ingester) + [utils.selector.eq('operation', '/google.bigtable.v2.Bigtable/MutateRows')]) - ) - ) - .addRowIf( - std.member($._config.storage_engine, 'chunks') && - std.member($._config.chunk_index_backend + $._config.chunk_store_backend, 'dynamodb'), - $.row('DynamoDB') - .addPanel( - $.panel('Requests / sec') + - 
$.qpsPanel('cortex_dynamo_request_duration_seconds_count{%s, operation="DynamoDB.BatchWriteItem"}' % $.jobMatcher($._config.job_names.ingester)) - ) - .addPanel( - $.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_dynamo_request_duration_seconds', $.jobSelector($._config.job_names.ingester) + [utils.selector.eq('operation', 'DynamoDB.BatchWriteItem')]) - ) - ) - .addRowIf( - std.member($._config.storage_engine, 'chunks') && - std.member($._config.chunk_store_backend, 'gcs'), - $.row('GCS') - .addPanel( - $.panel('Requests / sec') + - $.qpsPanel('cortex_gcs_request_duration_seconds_count{%s, operation="POST"}' % $.jobMatcher($._config.job_names.ingester)) - ) - .addPanel( - $.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_gcs_request_duration_seconds', $.jobSelector($._config.job_names.ingester) + [utils.selector.eq('operation', 'POST')]) - ) - ) .addRowIf( std.member($._config.storage_engine, 'blocks'), $.row('Ingester - Blocks storage - Shipper') diff --git a/cortex-mixin/docs/playbooks.md b/cortex-mixin/docs/playbooks.md index c1ee3ef4..b5b68895 100644 --- a/cortex-mixin/docs/playbooks.md +++ b/cortex-mixin/docs/playbooks.md @@ -198,8 +198,6 @@ How to **investigate**: - If the failing service is going OOM (`OOMKilled`): scale up or increase the memory - If the failing service is crashing / panicking: look for the stack trace in the logs and investigate from there -### CortexTransferFailed -This alert goes off when an ingester fails to find another node to transfer its data to when it was shutting down. If there is both a pod stuck terminating and one stuck joining, look at the kubernetes events. This may be due to scheduling problems caused by some combination of anti affinity rules/resource utilization. Adding a new node can help in these circumstances. 
You can see recent events associated with a resource via kubectl describe, ex: `kubectl -n describe pod ` ### CortexIngesterUnhealthy This alert goes off when an ingester is marked as unhealthy. Check the ring web page to see which is marked as unhealthy. You could then check the logs to see if there are any related to that ingester ex: `kubectl logs -f ingester-01 --namespace=prod`. A simple way to resolve this may be to click the "Forgot" button on the ring page, especially if the pod doesn't exist anymore. It might not exist anymore because it was on a node that got shut down, so you could check to see if there are any logs related to the node that pod is/was on, ex: `kubectl get events --namespace=prod | grep cloud-provider-node`. @@ -464,29 +462,6 @@ How to **investigate**: - Safely manually delete the block from the bucket if was a partial delete or an upload failed by a compactor - Further investigate if was an upload failed by an ingester but not later retried (ingesters are expected to retry uploads until succeed) -### CortexWALCorruption - -This alert is only related to the chunks storage. This can happen because of 2 reasons: (1) Non graceful shutdown of ingesters. (2) Faulty storage or NFS. - -WAL corruptions are only detected at startups, so at this point the WAL/Checkpoint would have been repaired automatically. So we can only check what happened and if there was any data loss and take actions to avoid this happening in future. - -1. Check if there was any node restarts that force killed pods. If there is, then the corruption is from the non graceful shutdown of ingesters, which is generally fine. You can: - * Describe the pod to see the last state. - * Use `kube_pod_info` to check the node for the pod. `node_boot_time_seconds` to see if node just booted (which also indicates restart). - * You can use `eventrouter` logs to double check. - * Check ingester logs to check if the shutdown logs are missing at that time. -2. 
To confirm this, in the logs, check the WAL segment on which the corruption happened (let's say `X`) and the last checkpoint attempt number (let's say `Y`, this is the last WAL segment that was present when checkpointing started). -3. If `X > Y`, then it's most likely an abrupt restart of ingester and the corruption would be on the last few records of the last segment. To verify this, check the file timestamps of WAL segment `X` and `X - 1` if they were recent. -4. If `X < Y`, then the corruption was in some WAL segment which was not the last one. This indicates faulty disk and some data loss on that ingester. -5. In case of faulty disk corruption, if the number or ingesters that had corruption within the chunk flush age: - 1. Less than the quorum number for your replication factor: No data loss, because there is a guarantee that the data is replicated. For example, if replication factor is 3, then it's fine if corruption was on 1 ingester. - 2. Equal or more than the quorum number but less than replication factor: There is a good chance that there is no data loss if it was replicated to desired number of ingesters. But it's good to check once for data loss. - 3. Equal or more than the replication factor: Then there is definitely some data loss. 
- ### CortexTableSyncFailure - -_This alert applies to Cortex chunks storage only._ - ### CortexQueriesIncorrect _TODO: this playbook has not been written yet._ @@ -578,22 +553,6 @@ How to **investigate**: - `other` - Check both Cortex and memcached logs to find more details -### CortexOldChunkInMemory - -_This alert applies to Cortex chunks storage only._ - -### CortexCheckpointCreationFailed - -_This alert applies to Cortex chunks storage only._ - -### CortexCheckpointDeletionFailed - -_This alert applies to Cortex chunks storage only._ - -### CortexProvisioningMemcachedTooSmall - -_This alert applies to Cortex chunks storage only._ - ### CortexProvisioningTooManyActiveSeries This alert fires if the average number of in-memory series per ingester is above our target (1.5M). From 1fcee6a596b2ceea7deda6a7f234f2e7a4d0be6d Mon Sep 17 00:00:00 2001 From: Friedrich Gonzalez <1517449+friedrichg@users.noreply.github.com> Date: Sat, 8 Jun 2024 11:26:36 -0700 Subject: [PATCH 49/60] Fix test-readme (#50) * Fix test-readme Signed-off-by: Friedrich Gonzalez * Add error Signed-off-by: Friedrich Gonzalez * Try again Signed-off-by: Friedrich Gonzalez * Fix test Signed-off-by: Friedrich Gonzalez * Remove unneeded Signed-off-by: Friedrich Gonzalez * Really show content Signed-off-by: Friedrich Gonzalez --------- Signed-off-by: Friedrich Gonzalez --- .github/workflows/ci.yaml | 14 +++++++++----- Makefile | 17 ++++++++--------- scripts/test-readme.sh | 11 +++++++++++ 3 files changed, 28 insertions(+), 14 deletions(-) create mode 100755 scripts/test-readme.sh diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index fa49025d..9e484516 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -11,7 +11,7 @@ jobs: runs-on: ubuntu-latest container: quay.io/cortexproject/cortex-jsonnet-build-image:e158eda steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 name: Checkout with: fetch-depth: 0 @@ -25,7 +25,7 @@ jobs: runs-on: ubuntu-latest
container: quay.io/cortexproject/cortex-jsonnet-build-image:e158eda steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 name: Checkout with: fetch-depth: 0 @@ -36,10 +36,14 @@ jobs: runs-on: ubuntu-latest container: quay.io/cortexproject/cortex-jsonnet-build-image:e158eda steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 name: Checkout with: fetch-depth: 0 - - name: "Test readme" - run: make test-readme + - name: "Test readme s3" + run: make test-readme/s3 + - name: "Test readme azure" + run: make test-readme/azure + - name: "Test readme gcs" + run: make test-readme/gcs diff --git a/Makefile b/Makefile index 3b1f77d1..7ddbd645 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -.PHONY: lint build-image publish-build-image test-readme +.PHONY: lint build-image publish-build-image test-readme clean JSONNET_FMT := jsonnetfmt @@ -47,14 +47,7 @@ build-mixin: test-readme: test-readme/azure test-readme/gcs test-readme/s3 test-readme/%: - rm -rf $@ && \ - mkdir -p $@ && cd $@ && \ - tk init --k8s=1.24 && \ - jb install github.com/cortexproject/cortex-jsonnet/cortex@main && \ - rm -fr ./vendor/cortex && \ - cp -r ../../cortex ./vendor/ && \ - cp vendor/cortex/$(notdir $@)/main.jsonnet.example environments/default/main.jsonnet && \ - PAGER=cat tk show environments/default + @./scripts/test-readme.sh $@ clean-white-noise: @$(FIND) . 
-type f -regextype posix-extended -regex '.*(md|libsonnet)' -print | \ @@ -62,3 +55,9 @@ clean-white-noise: check-white-noise: clean-white-noise @git diff --exit-code --quiet || (echo "Please remove trailing whitespaces running 'make clean-white-noise'" && false) + +clean: + rm -rf cortex-mixin/out + rm -rf cortex-mixin/vendor + rm -f cortex-mixin/cortex-mixin.zip + rm -rf test-readme diff --git a/scripts/test-readme.sh b/scripts/test-readme.sh new file mode 100755 index 00000000..84a51a60 --- /dev/null +++ b/scripts/test-readme.sh @@ -0,0 +1,11 @@ +#!/bin/sh +set -xe +rm -rf $1 +mkdir -p $1 +cd $1 +tk init --k8s=1.26 +jb install github.com/cortexproject/cortex-jsonnet/cortex@main +rm -fr ./vendor/cortex +cp -r ../../cortex ./vendor/ +cp vendor/cortex/$(basename $1)/main.jsonnet.example environments/default/main.jsonnet +PAGER=cat tk show --dangerous-allow-redirect environments/default From a9fec63ae225f2ea236132e28f86900ac74e341b Mon Sep 17 00:00:00 2001 From: Friedrich Gonzalez <1517449+friedrichg@users.noreply.github.com> Date: Wed, 12 Jun 2024 05:36:08 -0700 Subject: [PATCH 50/60] Upgrade to cortex v1.17.1 (#49) Signed-off-by: Friedrich Gonzalez --- CHANGELOG.md | 1 + cortex/images.libsonnet | 6 +++--- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0c6f70c3..14cbab6b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,7 @@ # Changelog ## master / unreleased +* [CHANGE] Use cortex v1.17.1 * [CHANGE] Enable shuffle sharding in compactors * [CHANGE] Remove chunks support for dashboards * [ENHANCEMENT] Configure `-ingester.client.grpc-compression` to be `snappy-block` diff --git a/cortex/images.libsonnet b/cortex/images.libsonnet index 54659454..0a0238cf 100644 --- a/cortex/images.libsonnet +++ b/cortex/images.libsonnet @@ -5,7 +5,7 @@ memcachedExporter: 'prom/memcached-exporter:v0.14.2', // Our services. 
- cortex: 'cortexproject/cortex:v1.16.1', + cortex: 'cortexproject/cortex:v1.17.1', alertmanager: self.cortex, distributor: self.cortex, @@ -20,7 +20,7 @@ query_scheduler: self.cortex, overrides_exporter: self.cortex, - query_tee: 'quay.io/cortexproject/query-tee:v1.16.1', - testExporter: 'cortexproject/test-exporter:v1.16.1', + query_tee: 'quay.io/cortexproject/query-tee:v1.17.1', + testExporter: 'cortexproject/test-exporter:v1.17.1', }, } From fbe4726b91b5bb20a608e1c82ca21933cc132075 Mon Sep 17 00:00:00 2001 From: Friedrich Gonzalez <1517449+friedrichg@users.noreply.github.com> Date: Sun, 16 Jun 2024 10:53:19 -0700 Subject: [PATCH 51/60] Build and push build-image (#55) Signed-off-by: Friedrich Gonzalez --- .github/workflows/build-image.yaml | 56 ++++++++++++++++++++++++++++++ .github/workflows/ci.yaml | 18 +++++++--- .gitignore | 1 + Makefile | 6 ++++ 4 files changed, 77 insertions(+), 4 deletions(-) create mode 100644 .github/workflows/build-image.yaml diff --git a/.github/workflows/build-image.yaml b/.github/workflows/build-image.yaml new file mode 100644 index 00000000..107e6beb --- /dev/null +++ b/.github/workflows/build-image.yaml @@ -0,0 +1,56 @@ +name: Build Image + +on: + push: + branches: [ main ] + paths: + - 'build-image/Dockerfile' + - '.github/workflows/build-image.yaml' + pull_request: + branches: [ main ] + paths: + - 'build-image/Dockerfile' + - '.github/workflows/build-image.yaml' + +jobs: + build: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + name: Checkout + + - name: Build & save image + run: make build-image save-build-image + + - name: Upload Docker Images Artifact + uses: actions/upload-artifact@v4 + with: + name: build-image + path: ./build-image.tar + if-no-files-found: error + + push: + if: github.ref == 'refs/heads/main' && github.repository == 'cortexproject/cortex-jsonnet' + needs: build + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + name: Checkout + + - name: Download Docker Images Artifacts + 
uses: actions/download-artifact@v4 + with: + name: build-image + + - name: Load image + run: make load-build-image + + - name: Login to Quay.io + uses: docker/login-action@v3 + with: + registry: quay.io + username: ${{secrets.QUAY_REGISTRY_USER}} + password: ${{secrets.QUAY_REGISTRY_PASSWORD}} + + - name: Push image + run: make publish-build-image diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 9e484516..90341011 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -3,8 +3,14 @@ name: CI on: push: branches: [ main ] + paths-ignore: + - 'build-image/Dockerfile' + - '.github/workflows/build-image.yaml' pull_request: branches: [ main ] + paths-ignore: + - 'build-image/Dockerfile' + - '.github/workflows/build-image.yaml' jobs: lint: @@ -15,12 +21,13 @@ jobs: name: Checkout with: fetch-depth: 0 - + - name: "Lint mixin" run: make lint-mixin - + - name: "Lint playbooks" run: make lint-playbooks + build: runs-on: ubuntu-latest container: quay.io/cortexproject/cortex-jsonnet-build-image:e158eda @@ -29,9 +36,10 @@ jobs: name: Checkout with: fetch-depth: 0 - + - name: "Build mixin" run: make build-mixin + readme: runs-on: ubuntu-latest container: quay.io/cortexproject/cortex-jsonnet-build-image:e158eda @@ -40,10 +48,12 @@ jobs: name: Checkout with: fetch-depth: 0 - + - name: "Test readme s3" run: make test-readme/s3 + - name: "Test readme azure" run: make test-readme/azure + - name: "Test readme gcs" run: make test-readme/gcs diff --git a/.gitignore b/.gitignore index 41d67a00..4c7277c4 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,4 @@ cortex-mixin/out cortex-mixin/vendor /test-readme/ .vscode +build-image.tar diff --git a/Makefile b/Makefile index 7ddbd645..a1f0de73 100644 --- a/Makefile +++ b/Makefile @@ -34,6 +34,12 @@ fmt: build-image: docker build -t quay.io/cortexproject/cortex-jsonnet-build-image:$(shell git rev-parse --short HEAD) build-image +save-build-image: + docker save 
quay.io/cortexproject/cortex-jsonnet-build-image:$(shell git rev-parse --short HEAD) > build-image.tar + +load-build-image: + docker load < build-image.tar + publish-build-image: docker push quay.io/cortexproject/cortex-jsonnet-build-image:$(shell git rev-parse --short HEAD) From 5038e0cc8739790b061123ac318debf4c4a8b2f1 Mon Sep 17 00:00:00 2001 From: Friedrich Gonzalez <1517449+friedrichg@users.noreply.github.com> Date: Mon, 17 Jun 2024 23:29:57 -0700 Subject: [PATCH 52/60] Update build image https://github.com/cortexproject/cortex-jsonnet/commit/fbe4726b91b5bb20a608e1c82ca21933cc132075 (#56) Signed-off-by: Friedrich Gonzalez --- .github/workflows/ci.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 90341011..423e0c96 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -15,7 +15,7 @@ on: jobs: lint: runs-on: ubuntu-latest - container: quay.io/cortexproject/cortex-jsonnet-build-image:e158eda + container: quay.io/cortexproject/cortex-jsonnet-build-image:fbe4726 steps: - uses: actions/checkout@v4 name: Checkout @@ -30,7 +30,7 @@ jobs: build: runs-on: ubuntu-latest - container: quay.io/cortexproject/cortex-jsonnet-build-image:e158eda + container: quay.io/cortexproject/cortex-jsonnet-build-image:fbe4726 steps: - uses: actions/checkout@v4 name: Checkout @@ -42,7 +42,7 @@ jobs: readme: runs-on: ubuntu-latest - container: quay.io/cortexproject/cortex-jsonnet-build-image:e158eda + container: quay.io/cortexproject/cortex-jsonnet-build-image:fbe4726 steps: - uses: actions/checkout@v4 name: Checkout From 0dad9895dea9c26d3c5777fcc3a3562c0aa54b37 Mon Sep 17 00:00:00 2001 From: Charlie Le <3375195+CharlieTLe@users.noreply.github.com> Date: Tue, 20 Aug 2024 08:11:46 -0700 Subject: [PATCH 53/60] Update jsonnet-libs to Fri Jul 19 12:51:49 2024 (#57) Updates grafana-builder and mixin-utils to latest version in master branch. 
This will be helpful in creating Grafana panels that use the timeseriesPanel instead of the deprecated panel. grafana-builder changes: - grafana-builder: add support for native/classic stat panel query (grafana/jsonnet-libs#1285) - More native histograms related utils and renaming (grafana/jsonnet-libs#1270) - Support recording and switching between native and classic latency histograms (grafana/jsonnet-libs#1150) - Basic native histogram utilities (grafana/jsonnet-libs#1164) - grafana-builder: rename template variable "Data Source" to "Data source" (grafana/jsonnet-libs#1111) - Mixins: draw graphs at full resolution (grafana/jsonnet-libs#825) - Allow dashboards to show gRPC codes as labels (grafana/jsonnet-libs#1098) - Allow configuring sort order for variables (grafana/jsonnet-libs#1014) - remove unused/wrong step param (grafana/jsonnet-libs#999) - Show cancelled requests in grey on QPS dashboards. (grafana/jsonnet-libs#988) - Show cancelled requests in yellow on QPS dashboards. (grafana/jsonnet-libs#986) - Add timeseriesPanel (grafana/jsonnet-libs#824) - Allow including "All" for single template var - Allow datasource's regex to be configured - grafana-builder: make allValue configurable (grafana/jsonnet-libs#703) - grafana_builder: add dashboard link func (grafana/jsonnet-libs#683) - Add 'Data Source' label for the default datasource template variable.
(grafana/jsonnet-libs#672) - enable tooltip by default (grafana/jsonnet-libs#665) mixin-utils changes: - grafana-builder: add support for native/classic stat panel query (grafana/jsonnet-libs#1285) - More native histograms related utils and renaming (grafana/jsonnet-libs#1270) - nativeClassicSumBy: format list of labels nicer (grafana/jsonnet-libs#1204) - Support recording and switching between native and classic latency histograms (grafana/jsonnet-libs#1150) - chore: fix hardcoded range interval (grafana/jsonnet-libs#1190) - Basic native histogram utilities (grafana/jsonnet-libs#1164) - utils: allow defining native histogram recording rule (grafana/jsonnet-libs#1156) - modify withRunbookURL to allow internal annotation (grafana/jsonnet-libs#1139) - mixin-utils: drop unsupported step target parameter (grafana/jsonnet-libs#1128) - Mixins: draw graphs at full resolution (grafana/jsonnet-libs#825) - Align with style conventions (grafana/jsonnet-libs#1038) - Add a function to remove an alert rule (grafana/jsonnet-libs#812) - mixin-utils: Parameterize interval for histogramRules (grafana/jsonnet-libs#806) - refactor(prometheus): shard mixins over multiple configmaps (grafana/jsonnet-libs#497) - Not all Prometheus rules are alerts.
(grafana/jsonnet-libs#490) Signed-off-by: Charlie Le --- CHANGELOG.md | 1 + cortex-mixin/jsonnetfile.lock.json | 8 ++++---- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 14cbab6b..3e0206ac 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,7 @@ * [CHANGE] Use cortex v1.17.1 * [CHANGE] Enable shuffle sharding in compactors * [CHANGE] Remove chunks support for dashboards +* [CHANGE] Update jsonnet-libs to Fri Jul 19 12:51:49 2024 #57 * [ENHANCEMENT] Configure `-ingester.client.grpc-compression` to be `snappy-block` * [ENHANCEMENT] Support Grafana 11 in Cortex Service Scaling Dashboard diff --git a/cortex-mixin/jsonnetfile.lock.json b/cortex-mixin/jsonnetfile.lock.json index a1b02191..ff6dd095 100644 --- a/cortex-mixin/jsonnetfile.lock.json +++ b/cortex-mixin/jsonnetfile.lock.json @@ -8,8 +8,8 @@ "subdir": "grafana-builder" } }, - "version": "0d13e5ba1b3a4c29015738c203d92ea39f71ebe2", - "sum": "GRf2GvwEU4jhXV+JOonXSZ4wdDv8mnHBPCQ6TUVd+g8=" + "version": "1d877bb0651ef92176f651d0be473c06e372a8a0", + "sum": "udZaafkbKYMGodLqsFhEe+Oy/St2p0edrK7hiMPEey0=" }, { "source": { @@ -18,8 +18,8 @@ "subdir": "mixin-utils" } }, - "version": "21b638f4e4922c0b6fde12120ed45d8ef803edc7", - "sum": "Je2SxBKu+1WrKEEG60zjSKaY/6TPX8uRz5bsaw0a8oA=" + "version": "1d877bb0651ef92176f651d0be473c06e372a8a0", + "sum": "mzLmCv9n3ldLChVGPfyRJOVKoBw+dfK40vU9792aHIM=" } ], "legacyImports": false From 435c753777f8e333876a372c78e130c5458dd9e3 Mon Sep 17 00:00:00 2001 From: Friedrich Gonzalez <1517449+friedrichg@users.noreply.github.com> Date: Sun, 8 Sep 2024 10:36:17 -0700 Subject: [PATCH 54/60] Update CortexProvisioningTooManyActiveSeries to 3.2M series per ingester (#59) * Update CortexProvisioningTooManyActiveSeries to 3.2M series per ingester Signed-off-by: Friedrich Gonzalez * Adjust more things Signed-off-by: Friedrich Gonzalez * Update CHANGELOG.md --------- Signed-off-by: Friedrich Gonzalez --- CHANGELOG.md | 1 + 
cortex-mixin/alerts/alerts.libsonnet | 6 +++--- cortex-mixin/docs/playbooks.md | 6 +++--- cortex-mixin/recording_rules.libsonnet | 6 +++--- 4 files changed, 10 insertions(+), 9 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3e0206ac..e6895ec1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,7 @@ * [CHANGE] Use cortex v1.17.1 * [CHANGE] Enable shuffle sharding in compactors * [CHANGE] Remove chunks support for dashboards +* [CHANGE] Target 3M memory series per ingester instead of 1.5M * [CHANGE] Update jsonnet-libs to Fri Jul 19 12:51:49 2024 #57 * [ENHANCEMENT] Configure `-ingester.client.grpc-compression` to be `snappy-block` * [ENHANCEMENT] Support Grafana 11 in Cortex Service Scaling Dashboard diff --git a/cortex-mixin/alerts/alerts.libsonnet b/cortex-mixin/alerts/alerts.libsonnet index e67ef449..7145d028 100644 --- a/cortex-mixin/alerts/alerts.libsonnet +++ b/cortex-mixin/alerts/alerts.libsonnet @@ -389,11 +389,11 @@ rules: [ { alert: 'CortexProvisioningTooManyActiveSeries', - // We target each ingester to 1.5M in-memory series. This alert fires if the average - // number of series / ingester in a Cortex cluster is > 1.6M for 2h (we compact + // We target each ingester to 3.0M in-memory series. This alert fires if the average + // number of series / ingester in a Cortex cluster is > 3.2M for 2h (we compact // the TSDB head every 2h). expr: ||| - avg by (%s) (cortex_ingester_memory_series) > 1.6e6 + avg by (%s) (cortex_ingester_memory_series) > 3.2e6 ||| % [$._config.alert_aggregation_labels], 'for': '2h', labels: { diff --git a/cortex-mixin/docs/playbooks.md b/cortex-mixin/docs/playbooks.md index b5b68895..39586870 100644 --- a/cortex-mixin/docs/playbooks.md +++ b/cortex-mixin/docs/playbooks.md @@ -555,13 +555,13 @@ How to **investigate**: ### CortexProvisioningTooManyActiveSeries -This alert fires if the average number of in-memory series per ingester is above our target (1.5M). 
+This alert fires if the average number of in-memory series per ingester is above our target (3.0M). How to **fix**: - Scale up ingesters - To find out the Cortex clusters where ingesters should be scaled up and how many minimum replicas are expected: ``` - ceil(sum by(cluster, namespace) (cortex_ingester_memory_series) / 1.5e6) > + ceil(sum by(cluster, namespace) (cortex_ingester_memory_series) / 3.0e6) > count by(cluster, namespace) (cortex_ingester_memory_series) ``` - After the scale up, the in-memory series are expected to be reduced at the next TSDB head compaction (occurring every 2h) @@ -595,7 +595,7 @@ How to **fix**: kubectl -n delete pod ingester-XXX ``` - Restarting an ingester typically reduces the memory allocated by mmap-ed files. After the restart, ingester may allocate this memory again over time, but it may give more time while working on a longer term solution -- Check the `Cortex / Writes Resources` dashboard to see if the number of series per ingester is above the target (1.5M). If so: +- Check the `Cortex / Writes Resources` dashboard to see if the number of series per ingester is above the target (3.0M). 
If so: - Scale up ingesters - Memory is expected to be reclaimed at the next TSDB head compaction (occurring every 2h) diff --git a/cortex-mixin/recording_rules.libsonnet b/cortex-mixin/recording_rules.libsonnet index 03835247..86650fa5 100644 --- a/cortex-mixin/recording_rules.libsonnet +++ b/cortex-mixin/recording_rules.libsonnet @@ -2,7 +2,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; { local _config = { - max_series_per_ingester: 1.5e6, + max_series_per_ingester: 3.0e6, max_samples_per_sec_per_ingester: 80e3, max_samples_per_sec_per_distributor: 240e3, limit_utilisation_target: 0.6, @@ -148,7 +148,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ||| % _config, }, { - // Ingester should have 1.5M series in memory + // Ingester should have 3.0M series in memory record: 'cluster_namespace_deployment_reason:required_replicas:count', labels: { deployment: 'ingester', @@ -167,7 +167,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; }, { // We should be about to cover 60% of our limits, - // and ingester can have 1.5M series in memory + // and ingester can have 3.0M series in memory record: 'cluster_namespace_deployment_reason:required_replicas:count', labels: { deployment: 'ingester', From 4bde8b0629b72a708ceb5c641613875f84f57b38 Mon Sep 17 00:00:00 2001 From: Narsing Metpally Date: Wed, 25 Sep 2024 12:43:12 -0600 Subject: [PATCH 55/60] Increase CortexProvisioningTooManyWrites alert threshold to 160k (#60) * Increase CortexProvisioningTooManyWrites alert threshold to 160k --- CHANGELOG.md | 1 + cortex-mixin/alerts/alerts.libsonnet | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e6895ec1..74252e88 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,7 @@ * [CHANGE] Remove chunks support for dashboards * [CHANGE] Target 3M memory series per ingester instead of 1.5M * [CHANGE] Update jsonnet-libs to Fri Jul 19 12:51:49 2024 #57 +* [CHANGE] Increase CortexProvisioningTooManyWrites alert 
threshold to 160e3 * [ENHANCEMENT] Configure `-ingester.client.grpc-compression` to be `snappy-block` * [ENHANCEMENT] Support Grafana 11 in Cortex Service Scaling Dashboard diff --git a/cortex-mixin/alerts/alerts.libsonnet b/cortex-mixin/alerts/alerts.libsonnet index 7145d028..ec44565c 100644 --- a/cortex-mixin/alerts/alerts.libsonnet +++ b/cortex-mixin/alerts/alerts.libsonnet @@ -407,9 +407,9 @@ }, { alert: 'CortexProvisioningTooManyWrites', - // 80k writes / s per ingester max. + // 160k writes / s per ingester max. expr: ||| - avg by (%s) (rate(cortex_ingester_ingested_samples_total[1m])) > 80e3 + avg by (%s) (rate(cortex_ingester_ingested_samples_total[1m])) > 160e3 ||| % $._config.alert_aggregation_labels, 'for': '15m', labels: { From a630e0e1dbe6296bd7272f3b77a9edfa6abe21b0 Mon Sep 17 00:00:00 2001 From: Justin D Holcomb Date: Tue, 22 Oct 2024 06:21:21 -0600 Subject: [PATCH 56/60] Remove deprecated option - max_series_per_query (#62) Signed-off-by: Justin Holcomb --- CHANGELOG.md | 1 + cortex/config.libsonnet | 15 --------------- 2 files changed, 1 insertion(+), 15 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 74252e88..79419a38 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ * [CHANGE] Increase CortexProvisioningTooManyWrites alert threshold to 160e3 * [ENHANCEMENT] Configure `-ingester.client.grpc-compression` to be `snappy-block` * [ENHANCEMENT] Support Grafana 11 in Cortex Service Scaling Dashboard +* [BUGFIX] Remove deprecated option `max_series_per_query` ## 1.16.1 * [CHANGE] Upgrade memcached to 1.6.23-alpine and memcached-exporter to v0.14.2 diff --git a/cortex/config.libsonnet b/cortex/config.libsonnet index d70f4e00..5311921a 100644 --- a/cortex/config.libsonnet +++ b/cortex/config.libsonnet @@ -236,7 +236,6 @@ 'ingester.max-series-per-metric': $._config.limits.max_series_per_metric, 'ingester.max-global-series-per-user': $._config.limits.max_global_series_per_user, 'ingester.max-global-series-per-metric': 
$._config.limits.max_global_series_per_metric, - 'ingester.max-series-per-query': $._config.limits.max_series_per_query, }, rulerLimitsConfig: { 'ruler.max-rules-per-rule-group': $._config.limits.ruler_max_rules_per_rule_group, @@ -259,8 +258,6 @@ max_global_series_per_user: 150000, max_global_series_per_metric: 20000, - max_series_per_query: 100000, - ingestion_rate: 10000, ingestion_burst_size: 200000, @@ -281,8 +278,6 @@ max_global_series_per_user: 300000, max_global_series_per_metric: 30000, - max_series_per_query: 100000, - ingestion_rate: 30000, ingestion_burst_size: 300000, @@ -300,8 +295,6 @@ max_global_series_per_user: 1000000, max_global_series_per_metric: 100000, - max_series_per_query: 100000, - ingestion_rate: 100000, ingestion_burst_size: 1000000, @@ -319,8 +312,6 @@ max_global_series_per_user: 3000000, // 3M max_global_series_per_metric: 300000, // 300K - max_series_per_query: 100000, - ingestion_rate: 350000, // 350K ingestion_burst_size: 3500000, // 3.5M @@ -335,8 +326,6 @@ max_series_per_metric: 0, // Disabled in favour of the max global limit max_series_per_user: 0, // Disabled in favour of the max global limit - max_series_per_query: 100000, - max_global_series_per_user: 6000000, // 6M max_global_series_per_metric: 600000, // 600K @@ -357,8 +346,6 @@ max_global_series_per_user: 12000000, // 12M max_global_series_per_metric: 1200000, // 1.2M - max_series_per_query: 100000, - ingestion_rate: 1500000, // 1.5M ingestion_burst_size: 15000000, // 15M @@ -377,8 +364,6 @@ max_global_series_per_user: 16000000, // 16M max_global_series_per_metric: 1600000, // 1.6M - max_series_per_query: 100000, - ingestion_rate: 2250000, // 2.25M ingestion_burst_size: 22500000, // 22.5M From b3faa7be33681265141eee621d8f592b44428405 Mon Sep 17 00:00:00 2001 From: Charlie Le <3375195+CharlieTLe@users.noreply.github.com> Date: Wed, 23 Oct 2024 02:17:43 -0700 Subject: [PATCH 57/60] Use `timeseriesPanel` instead of `panel` when creating panels (#58) * Update jsonnet-libs to 
Fri Jul 19 12:51:49 2024 Updates grafana-builder and mixin-utils to latest version in master branch. This will be helpful in creating Grafana panels that use the timeseriesPanel instead of the deprecated panel. grafana-builder changes: - grafana-builder: add support for native/classic stat panel query (grafana/jsonnet-libs#1285) - More native histograms related utils and renaming (grafana/jsonnet-libs#1270) - Support recording and switching between native and classic latency histograms (grafana/jsonnet-libs#1150) - Basic native histogram utilities (grafana/jsonnet-libs#1164) - grafana-builder: rename template variable "Data Source" to "Data source" (grafana/jsonnet-libs#1111) - Mixins: draw graphs at full resolution (grafana/jsonnet-libs#825) - Allow dashboards to show gRPC codes as labels (grafana/jsonnet-libs#1098) - Allow configuring sort order for variables (grafana/jsonnet-libs#1014) - remove unused/wrong step param (grafana/jsonnet-libs#999) - Show cancelled requests in grey on QPS dashboards. (grafana/jsonnet-libs#988) - Show cancelled requests in yellow on QPS dashboards. (grafana/jsonnet-libs#986) - Add timeseriesPanel (grafana/jsonnet-libs#824) - Allow including "All" for single template var - Allow datasource's regex to be configured - grafana-builder: make allValue configurable (grafana/jsonnet-libs#703) - grafana_builder: add dashboard link func (grafana/jsonnet-libs#683) - Add 'Data Source' label for the default datasource template variable. 
(grafana/jsonnet-libs#672) - enable tooltip by default (grafana/jsonnet-libs#665) mixin-utils changes: - grafana-builder: add support for native/classic stat panel query (grafana/jsonnet-libs#1285) - More native histograms related utils and renaming (grafana/jsonnet-libs#1270) - nativeClassicSumBy: format list of labels nicer (grafana/jsonnet-libs#1204) - Support recording and switching between native and classic latency histograms (grafana/jsonnet-libs#1150) - chore: fix hardcoded range interval (grafana/jsonnet-libs#1190) - Basic native histogram utilities (grafana/jsonnet-libs#1164) - utils: allow defining native histogram recording rule (grafana/jsonnet-libs#1156) - modify withRunbookURL to allow internal annotation (grafana/jsonnet-libs#1139) - mixin-utils: drop unsupported step target parameter (grafana/jsonnet-libs#1128) - Mixins: draw graphs at full resolution (grafana/jsonnet-libs#825) - Align with style conventions (grafana/jsonnet-libs#1038) - Add a function to remove an alert rule (grafana/jsonnet-libs#812) - mixin-utils: Parameterize interval for histogramRules (grafana/jsonnet-libs#806) - refactor(grafana/jsonnet-libsprometheus): shard mixins over multiple configmaps (grafana/jsonnet-libs#497) - Not all Prometheus rules are alerts. (grafana/jsonnet-libs#490) Signed-off-by: Charlie Le * Use `timeseriesPanel` instead of `panel` when creating panels Fixes: #44 Depends on: #57 Signed-off-by: Charlie Le * Add units to timeseries panel The yaxes field doesn't seem to do anything in the timeseries panel and was replaced with the units field instead. So I defaulted the units to be short and allowed it to be set for the panel. 
Signed-off-by: Charlie Le * Update CHANGELOG.md --------- Signed-off-by: Charlie Le Co-authored-by: Friedrich Gonzalez <1517449+friedrichg@users.noreply.github.com> --- CHANGELOG.md | 3 +- .../dashboards/alertmanager.libsonnet | 42 +++---- cortex-mixin/dashboards/compactor.libsonnet | 19 ++- cortex-mixin/dashboards/config.libsonnet | 10 +- .../dashboards/dashboard-utils.libsonnet | 110 ++++++++++++++---- .../dashboards/object-store.libsonnet | 32 +++-- cortex-mixin/dashboards/queries.libsonnet | 94 +++++++-------- .../dashboards/reads-resources.libsonnet | 2 +- cortex-mixin/dashboards/reads.libsonnet | 66 +++++------ .../dashboards/rollout-progress.libsonnet | 22 ++-- cortex-mixin/dashboards/ruler.libsonnet | 55 +++++---- cortex-mixin/dashboards/scaling.libsonnet | 2 +- .../dashboards/writes-resources.libsonnet | 2 +- cortex-mixin/dashboards/writes.libsonnet | 52 ++++----- 14 files changed, 272 insertions(+), 239 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 79419a38..66792572 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,8 +7,9 @@ * [CHANGE] Target 3M memory series per ingester instead of 1.5M * [CHANGE] Update jsonnet-libs to Fri Jul 19 12:51:49 2024 #57 * [CHANGE] Increase CortexProvisioningTooManyWrites alert threshold to 160e3 +* [CHANGE] Use `timeseriesPanel` instead of `panel` when creating panels #58 * [ENHANCEMENT] Configure `-ingester.client.grpc-compression` to be `snappy-block` -* [ENHANCEMENT] Support Grafana 11 in Cortex Service Scaling Dashboard +* [ENHANCEMENT] Support Grafana 11 in all dashboards * [BUGFIX] Remove deprecated option `max_series_per_query` ## 1.16.1 diff --git a/cortex-mixin/dashboards/alertmanager.libsonnet b/cortex-mixin/dashboards/alertmanager.libsonnet index 0bf88c43..731135db 100644 --- a/cortex-mixin/dashboards/alertmanager.libsonnet +++ b/cortex-mixin/dashboards/alertmanager.libsonnet @@ -10,22 +10,22 @@ local utils = import 'mixin-utils/utils.libsonnet'; showTitle: false, }) .addPanel( - $.panel('Total 
Alerts') + + $.timeseriesPanel('Total Alerts') + $.statPanel('sum(cluster_job_%s:cortex_alertmanager_alerts:sum{%s})' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.alertmanager)], format='short') ) .addPanel( - $.panel('Total Silences') + + $.timeseriesPanel('Total Silences') + $.statPanel('sum(cluster_job_%s:cortex_alertmanager_silences:sum{%s})' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.alertmanager)], format='short') ) .addPanel( - $.panel('Tenants') + + $.timeseriesPanel('Tenants') + $.statPanel('max(cortex_alertmanager_tenants_discovered{%s})' % $.jobMatcher($._config.job_names.alertmanager), format='short') ) ) .addRow( $.row('Alerts Received') .addPanel( - $.panel('APS') + + $.timeseriesPanel('APS') + $.queryPanel( [ ||| @@ -42,7 +42,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('Alert Notifications') .addPanel( - $.panel('NPS') + + $.timeseriesPanel('NPS') + $.queryPanel( [ ||| @@ -56,7 +56,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) .addPanel( - $.panel('NPS by integration') + + $.timeseriesPanel('NPS by integration') + $.queryPanel( [ ||| @@ -73,18 +73,18 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) .addPanel( - $.panel('Latency') + + $.timeseriesPanel('Latency', unit='ms') + $.latencyPanel('cortex_alertmanager_notification_latency_seconds', '{%s}' % $.jobMatcher($._config.job_names.alertmanager)) ) ) .addRow( $.row('Configuration API (gateway) + Alertmanager UI') .addPanel( - $.panel('QPS') + + $.timeseriesPanel('QPS') + $.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"api_v1_alerts|alertmanager"}' % $.jobMatcher($._config.job_names.gateway)) ) .addPanel( - $.panel('Latency') + + $.timeseriesPanel('Latency', unit='ms') + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.gateway) + [utils.selector.re('route', 'api_v1_alerts|alertmanager')]) ) ) @@ -94,7 +94,7 @@ local utils = import 
'mixin-utils/utils.libsonnet'; .addRow( $.row('Replication') .addPanel( - $.panel('Per %s Tenants' % $._config.per_instance_label) + + $.timeseriesPanel('Per %s Tenants' % $._config.per_instance_label) + $.queryPanel( 'max by(%s) (cortex_alertmanager_tenants_owned{%s})' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.alertmanager)], '{{%s}}' % $._config.per_instance_label @@ -102,7 +102,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.stack ) .addPanel( - $.panel('Per %s Alerts' % $._config.per_instance_label) + + $.timeseriesPanel('Per %s Alerts' % $._config.per_instance_label) + $.queryPanel( 'sum by(%s) (cluster_job_%s:cortex_alertmanager_alerts:sum{%s})' % [$._config.per_instance_label, $._config.per_instance_label, $.jobMatcher($._config.job_names.alertmanager)], '{{%s}}' % $._config.per_instance_label @@ -110,7 +110,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.stack ) .addPanel( - $.panel('Per %s Silences' % $._config.per_instance_label) + + $.timeseriesPanel('Per %s Silences' % $._config.per_instance_label) + $.queryPanel( 'sum by(%s) (cluster_job_%s:cortex_alertmanager_silences:sum{%s})' % [$._config.per_instance_label, $._config.per_instance_label, $.jobMatcher($._config.job_names.alertmanager)], '{{%s}}' % $._config.per_instance_label @@ -121,7 +121,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('Tenant Configuration Sync') .addPanel( - $.panel('Syncs/sec') + + $.timeseriesPanel('Syncs/sec') + $.queryPanel( [ ||| @@ -135,14 +135,14 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) .addPanel( - $.panel('Syncs/sec (By Reason)') + + $.timeseriesPanel('Syncs/sec (By Reason)') + $.queryPanel( 'sum by(reason) (rate(cortex_alertmanager_sync_configs_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.alertmanager), '{{reason}}' ) ) .addPanel( - $.panel('Ring Check Errors/sec') + + $.timeseriesPanel('Ring Check Errors/sec') + $.queryPanel( 'sum 
(rate(cortex_alertmanager_ring_check_errors_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.alertmanager), 'errors' @@ -152,7 +152,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('Sharding Initial State Sync') .addPanel( - $.panel('Initial syncs /sec') + + $.timeseriesPanel('Initial syncs /sec') + $.queryPanel( 'sum by(outcome) (rate(cortex_alertmanager_state_initial_sync_completed_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.alertmanager), '{{outcome}}' @@ -166,7 +166,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; } ) .addPanel( - $.panel('Initial sync duration') + + $.timeseriesPanel('Initial sync duration', unit='s') + $.latencyPanel('cortex_alertmanager_state_initial_sync_duration_seconds', '{%s}' % $.jobMatcher($._config.job_names.alertmanager)) + { targets: [ target { @@ -177,7 +177,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; } ) .addPanel( - $.panel('Fetch state from other alertmanagers /sec') + + $.timeseriesPanel('Fetch state from other alertmanagers /sec') + $.queryPanel( [ ||| @@ -201,7 +201,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('Sharding Runtime State Sync') .addPanel( - $.panel('Replicate state to other alertmanagers /sec') + + $.timeseriesPanel('Replicate state to other alertmanagers /sec') + $.queryPanel( [ ||| @@ -215,7 +215,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) .addPanel( - $.panel('Merge state from other alertmanagers /sec') + + $.timeseriesPanel('Merge state from other alertmanagers /sec') + $.queryPanel( [ ||| @@ -229,7 +229,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) .addPanel( - $.panel('Persist state to remote storage /sec') + + $.timeseriesPanel('Persist state to remote storage /sec') + $.queryPanel( [ ||| diff --git a/cortex-mixin/dashboards/compactor.libsonnet b/cortex-mixin/dashboards/compactor.libsonnet index aeb64491..720b6fff 100644 --- 
a/cortex-mixin/dashboards/compactor.libsonnet +++ b/cortex-mixin/dashboards/compactor.libsonnet @@ -14,7 +14,6 @@ local utils = import 'mixin-utils/utils.libsonnet'; 'sum(rate(cortex_compactor_runs_failed_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.compactor) ) + $.bars + - { yaxes: $.yaxes('ops') } + $.panelDescription( 'Per-instance runs', ||| @@ -23,7 +22,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ), ) .addPanel( - $.panel('Tenants compaction progress') + + $.timeseriesPanel('Tenants compaction progress') + $.queryPanel(||| ( cortex_compactor_tenants_processing_succeeded{%s} + @@ -44,9 +43,8 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('') .addPanel( - $.panel('Compacted blocks / sec') + + $.timeseriesPanel('Compacted blocks / sec', unit='ops') + $.queryPanel('sum(rate(prometheus_tsdb_compactions_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.compactor), 'blocks') + - { yaxes: $.yaxes('ops') } + $.panelDescription( 'Compacted blocks / sec', ||| @@ -55,7 +53,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ), ) .addPanel( - $.panel('Per-block compaction duration') + + $.timeseriesPanel('Per-block compaction duration', unit='s') + $.latencyPanel('prometheus_tsdb_compaction_duration_seconds', '{%s}' % $.jobMatcher($._config.job_names.compactor)) + $.panelDescription( 'Per-block compaction duration', @@ -68,11 +66,11 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('') .addPanel( - $.panel('Average blocks / tenant') + + $.timeseriesPanel('Average blocks / tenant') + $.queryPanel('avg(max by(user) (cortex_bucket_blocks_count{%s}))' % $.jobMatcher($._config.job_names.compactor), 'avg'), ) .addPanel( - $.panel('Tenants with largest number of blocks') + + $.timeseriesPanel('Tenants with largest number of blocks') + $.queryPanel('topk(10, max by(user) (cortex_bucket_blocks_count{%s}))' % $.jobMatcher($._config.job_names.compactor), '{{user}}') + 
$.panelDescription( 'Tenants with largest number of blocks', @@ -85,9 +83,8 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('Garbage Collector') .addPanel( - $.panel('Blocks marked for deletion / sec') + - $.queryPanel('sum(rate(cortex_compactor_blocks_marked_for_deletion_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.compactor), 'blocks') + - { yaxes: $.yaxes('ops') }, + $.timeseriesPanel('Blocks marked for deletion / sec', unit='ops') + + $.queryPanel('sum(rate(cortex_compactor_blocks_marked_for_deletion_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.compactor), 'blocks'), ) .addPanel( $.successFailurePanel( @@ -111,7 +108,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) + { yaxes: $.yaxes('ops') } ) .addPanel( - $.panel('Metadata Sync Duration') + + $.timeseriesPanel('Metadata Sync Duration', unit='ms') + // This metric tracks the duration of a per-tenant metadata sync. $.latencyPanel('cortex_compactor_meta_sync_duration_seconds', '{%s}' % $.jobMatcher($._config.job_names.compactor)), ) diff --git a/cortex-mixin/dashboards/config.libsonnet b/cortex-mixin/dashboards/config.libsonnet index 9240ef89..10692a3d 100644 --- a/cortex-mixin/dashboards/config.libsonnet +++ b/cortex-mixin/dashboards/config.libsonnet @@ -8,19 +8,17 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('Startup config file') .addPanel( - $.panel('Startup config file hashes') + + $.timeseriesPanel('Startup config file hashes', unit='instances') + $.queryPanel('count(cortex_config_hash{%s}) by (sha256)' % $.namespaceMatcher(), 'sha256:{{sha256}}') + - $.stack + - { yaxes: $.yaxes('instances') }, + $.stack, ) ) .addRow( $.row('Runtime config file') .addPanel( - $.panel('Runtime config file hashes') + + $.timeseriesPanel('Runtime config file hashes', unit='instances') + $.queryPanel('count(cortex_runtime_config_hash{%s}) by (sha256)' % $.namespaceMatcher(), 'sha256:{{sha256}}') + - $.stack + - { yaxes: 
$.yaxes('instances') }, + $.stack, ) ), } diff --git a/cortex-mixin/dashboards/dashboard-utils.libsonnet b/cortex-mixin/dashboards/dashboard-utils.libsonnet index c0d2b087..3d9eea30 100644 --- a/cortex-mixin/dashboards/dashboard-utils.libsonnet +++ b/cortex-mixin/dashboards/dashboard-utils.libsonnet @@ -62,6 +62,44 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addTemplate('cluster', 'cortex_build_info', 'cluster') .addTemplate('namespace', 'cortex_build_info{cluster=~"$cluster"}', 'namespace'), }, + timeseriesPanel(title, unit='short'):: { + datasource: '$datasource', + fieldConfig: { + defaults: { + custom: { + drawStyle: 'line', + fillOpacity: 1, + lineWidth: 1, + pointSize: 5, + showPoints: 'never', + spanNulls: false, + stacking: { + group: 'A', + mode: 'none', + }, + }, + thresholds: { + mode: 'absolute', + steps: [], + }, + unit: unit, + }, + overrides: [], + }, + options: { + legend: { + showLegend: true, + }, + tooltip: { + mode: 'single', + sort: 'none', + }, + }, + links: [], + targets: [], + title: title, + type: 'timeseries', + }, // The mixin allow specialism of the job selector depending on if its a single binary // deployment or a namespaced one. @@ -108,6 +146,35 @@ local utils = import 'mixin-utils/utils.libsonnet'; } for target in super.targets ], + fieldConfig+: { + defaults+: { + custom+: { + lineWidth: 0, + fillOpacity: 100, // Get solid fill. 
+ stacking: { + mode: 'normal', + group: 'A', + }, + }, + unit: 'reqps', + min: 0, + }, + overrides+: [{ + matcher: { + id: 'byName', + options: status, + }, + properties: [ + { + id: 'color', + value: { + mode: 'fixed', + fixedColor: $.httpStatusColors[status], + }, + }, + ], + } for status in std.objectFieldsAll($.httpStatusColors)], + }, }, latencyPanel(metricName, selector, multiplier='1e3'):: @@ -121,7 +188,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; }, successFailurePanel(title, successMetric, failureMetric):: - $.panel(title) + + $.timeseriesPanel(title, unit='short') + $.queryPanel([successMetric, failureMetric], ['successful', 'failed']) + $.stack + { aliasColors: { @@ -132,7 +199,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; // Displays started, completed and failed rate. startedCompletedFailedPanel(title, startedMetric, completedMetric, failedMetric):: - $.panel(title) + + $.timeseriesPanel(title, unit='ops') + $.queryPanel([startedMetric, completedMetric, failedMetric], ['started', 'completed', 'failed']) + $.stack + { aliasColors: { @@ -143,7 +210,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; }, containerCPUUsagePanel(title, containerName):: - $.panel(title) + + $.timeseriesPanel(title) + $.queryPanel([ 'sum by(%s) (rate(container_cpu_usage_seconds_total{%s,container=~"%s"}[$__rate_interval]))' % [$._config.per_instance_label, $.namespaceMatcher(), containerName], 'min(container_spec_cpu_quota{%s,container=~"%s"} / container_spec_cpu_period{%s,container=~"%s"})' % [$.namespaceMatcher(), containerName, $.namespaceMatcher(), containerName], @@ -160,7 +227,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; }, containerMemoryWorkingSetPanel(title, containerName):: - $.panel(title) + + $.timeseriesPanel(title, unit='bytes') + $.queryPanel([ // We use "max" instead of "sum" otherwise during a rolling update of a statefulset we will end up // summing the memory of the old instance/pod (whose metric will be stale for 
5m) to the new instance/pod. @@ -180,7 +247,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; }, containerNetworkPanel(title, metric, instanceName):: - $.panel(title) + + $.timeseriesPanel(title, unit='Bps') + $.queryPanel( 'sum by(%(instance)s) (rate(%(metric)s{%(namespace)s,%(instance)s=~"%(instanceName)s"}[$__rate_interval]))' % { namespace: $.namespaceMatcher(), @@ -199,7 +266,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.containerNetworkPanel('Transmit Bandwidth', 'container_network_transmit_bytes_total', instanceName), containerDiskWritesPanel(title, containerName):: - $.panel(title) + + $.timeseriesPanel(title, unit='Bps') + $.queryPanel( ||| sum by(%s, %s, device) ( @@ -220,7 +287,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; { yaxes: $.yaxes('Bps') }, containerDiskReadsPanel(title, containerName):: - $.panel(title) + + $.timeseriesPanel(title, unit='Bps') + $.queryPanel( ||| sum by(%s, %s, device) ( @@ -239,7 +306,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; { yaxes: $.yaxes('Bps') }, containerDiskSpaceUtilization(title, containerName):: - $.panel(title) + + $.timeseriesPanel(title, unit='percentunit') + $.queryPanel( ||| max by(persistentvolumeclaim) ( @@ -266,7 +333,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; else 'label_name="%s"' % containerName, goHeapInUsePanel(title, jobName):: - $.panel(title) + + $.timeseriesPanel(title, unit='bytes') + $.queryPanel( 'sum by(%s) (go_memstats_heap_inuse_bytes{%s})' % [$._config.per_instance_label, $.jobMatcher(jobName)], '{{%s}}' % $._config.per_instance_label @@ -361,39 +428,38 @@ local utils = import 'mixin-utils/utils.libsonnet'; getObjectStoreRows(title, component):: [ super.row(title) .addPanel( - $.panel('Operations / sec') + + $.timeseriesPanel('Operations / sec', unit='rps') + $.queryPanel('sum by(operation) (rate(thanos_objstore_bucket_operations_total{%s,component="%s"}[$__rate_interval]))' % [$.namespaceMatcher(), component], '{{operation}}') + - 
$.stack + - { yaxes: $.yaxes('rps') }, + $.stack ) .addPanel( - $.panel('Error rate') + + $.timeseriesPanel('Error rate', unit='percentunit') + $.queryPanel('sum by(operation) (rate(thanos_objstore_bucket_operation_failures_total{%s,component="%s"}[$__rate_interval])) / sum by(operation) (rate(thanos_objstore_bucket_operations_total{%s,component="%s"}[$__rate_interval]))' % [$.namespaceMatcher(), component, $.namespaceMatcher(), component], '{{operation}}') + { yaxes: $.yaxes('percentunit') }, ) .addPanel( - $.panel('Latency of Op: Attributes') + + $.timeseriesPanel('Latency of Op: Attributes') + $.latencyPanel('thanos_objstore_bucket_operation_duration_seconds', '{%s,component="%s",operation="attributes"}' % [$.namespaceMatcher(), component]), ) .addPanel( - $.panel('Latency of Op: Exists') + + $.timeseriesPanel('Latency of Op: Exists') + $.latencyPanel('thanos_objstore_bucket_operation_duration_seconds', '{%s,component="%s",operation="exists"}' % [$.namespaceMatcher(), component]), ), $.row('') .addPanel( - $.panel('Latency of Op: Get') + + $.timeseriesPanel('Latency of Op: Get') + $.latencyPanel('thanos_objstore_bucket_operation_duration_seconds', '{%s,component="%s",operation="get"}' % [$.namespaceMatcher(), component]), ) .addPanel( - $.panel('Latency of Op: GetRange') + + $.timeseriesPanel('Latency of Op: GetRange') + $.latencyPanel('thanos_objstore_bucket_operation_duration_seconds', '{%s,component="%s",operation="get_range"}' % [$.namespaceMatcher(), component]), ) .addPanel( - $.panel('Latency of Op: Upload') + + $.timeseriesPanel('Latency of Op: Upload') + $.latencyPanel('thanos_objstore_bucket_operation_duration_seconds', '{%s,component="%s",operation="upload"}' % [$.namespaceMatcher(), component]), ) .addPanel( - $.panel('Latency of Op: Delete') + + $.timeseriesPanel('Latency of Op: Delete') + $.latencyPanel('thanos_objstore_bucket_operation_duration_seconds', '{%s,component="%s",operation="delete"}' % [$.namespaceMatcher(), component]), ), ], @@ -406,7 
+472,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; }; super.row(title) .addPanel( - $.panel('Requests / sec') + + $.timeseriesPanel('Requests / sec', unit='ops') + $.queryPanel( ||| sum by(operation) ( @@ -425,7 +491,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; { yaxes: $.yaxes('ops') } ) .addPanel( - $.panel('Latency (getmulti)') + + $.timeseriesPanel('Latency (getmulti)') + $.latencyPanel( 'thanos_memcached_operation_duration_seconds', ||| @@ -439,7 +505,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) .addPanel( - $.panel('Hit ratio') + + $.timeseriesPanel('Hit ratio', unit='percentunit') + $.queryPanel( ||| sum( diff --git a/cortex-mixin/dashboards/object-store.libsonnet b/cortex-mixin/dashboards/object-store.libsonnet index 69e257b6..d58976a2 100644 --- a/cortex-mixin/dashboards/object-store.libsonnet +++ b/cortex-mixin/dashboards/object-store.libsonnet @@ -7,58 +7,54 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('Components') .addPanel( - $.panel('RPS / component') + + $.timeseriesPanel('RPS / component', unit='rps') + $.queryPanel('sum by(component) (rate(thanos_objstore_bucket_operations_total{%s}[$__rate_interval]))' % $.namespaceMatcher(), '{{component}}') + - $.stack + - { yaxes: $.yaxes('rps') }, + $.stack, ) .addPanel( - $.panel('Error rate / component') + - $.queryPanel('sum by(component) (rate(thanos_objstore_bucket_operation_failures_total{%s}[$__rate_interval])) / sum by(component) (rate(thanos_objstore_bucket_operations_total{%s}[$__rate_interval]))' % [$.namespaceMatcher(), $.namespaceMatcher()], '{{component}}') + - { yaxes: $.yaxes('percentunit') }, + $.timeseriesPanel('Error rate / component', unit='percentunit') + + $.queryPanel('sum by(component) (rate(thanos_objstore_bucket_operation_failures_total{%s}[$__rate_interval])) / sum by(component) (rate(thanos_objstore_bucket_operations_total{%s}[$__rate_interval]))' % [$.namespaceMatcher(), $.namespaceMatcher()], '{{component}}') ) ) 
.addRow( $.row('Operations') .addPanel( - $.panel('RPS / operation') + + $.timeseriesPanel('RPS / operation', unit='rps') + $.queryPanel('sum by(operation) (rate(thanos_objstore_bucket_operations_total{%s}[$__rate_interval]))' % $.namespaceMatcher(), '{{operation}}') + - $.stack + - { yaxes: $.yaxes('rps') }, + $.stack, ) .addPanel( - $.panel('Error rate / operation') + - $.queryPanel('sum by(operation) (rate(thanos_objstore_bucket_operation_failures_total{%s}[$__rate_interval])) / sum by(operation) (rate(thanos_objstore_bucket_operations_total{%s}[$__rate_interval]))' % [$.namespaceMatcher(), $.namespaceMatcher()], '{{operation}}') + - { yaxes: $.yaxes('percentunit') }, + $.timeseriesPanel('Error rate / operation', unit='percentunit') + + $.queryPanel('sum by(operation) (rate(thanos_objstore_bucket_operation_failures_total{%s}[$__rate_interval])) / sum by(operation) (rate(thanos_objstore_bucket_operations_total{%s}[$__rate_interval]))' % [$.namespaceMatcher(), $.namespaceMatcher()], '{{operation}}') ) ) .addRow( $.row('') .addPanel( - $.panel('Op: Get') + + $.timeseriesPanel('Op: Get') + $.latencyPanel('thanos_objstore_bucket_operation_duration_seconds', '{%s,operation="get"}' % $.namespaceMatcher()), ) .addPanel( - $.panel('Op: GetRange') + + $.timeseriesPanel('Op: GetRange') + $.latencyPanel('thanos_objstore_bucket_operation_duration_seconds', '{%s,operation="get_range"}' % $.namespaceMatcher()), ) .addPanel( - $.panel('Op: Exists') + + $.timeseriesPanel('Op: Exists') + $.latencyPanel('thanos_objstore_bucket_operation_duration_seconds', '{%s,operation="exists"}' % $.namespaceMatcher()), ) ) .addRow( $.row('') .addPanel( - $.panel('Op: Attributes') + + $.timeseriesPanel('Op: Attributes') + $.latencyPanel('thanos_objstore_bucket_operation_duration_seconds', '{%s,operation="attributes"}' % $.namespaceMatcher()), ) .addPanel( - $.panel('Op: Upload') + + $.timeseriesPanel('Op: Upload') + $.latencyPanel('thanos_objstore_bucket_operation_duration_seconds', 
'{%s,operation="upload"}' % $.namespaceMatcher()), ) .addPanel( - $.panel('Op: Delete') + + $.timeseriesPanel('Op: Delete') + $.latencyPanel('thanos_objstore_bucket_operation_duration_seconds', '{%s,operation="delete"}' % $.namespaceMatcher()), ) ), diff --git a/cortex-mixin/dashboards/queries.libsonnet b/cortex-mixin/dashboards/queries.libsonnet index cada5c8e..212ab9d2 100644 --- a/cortex-mixin/dashboards/queries.libsonnet +++ b/cortex-mixin/dashboards/queries.libsonnet @@ -8,34 +8,33 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('Query Frontend') .addPanel( - $.panel('Queue Duration') + + $.timeseriesPanel('Queue Duration', unit='ms') + $.latencyPanel('cortex_query_frontend_queue_duration_seconds', '{%s}' % $.jobMatcher($._config.job_names.query_frontend)), ) .addPanel( - $.panel('Retries') + - $.latencyPanel('cortex_query_frontend_retries', '{%s}' % $.jobMatcher($._config.job_names.query_frontend), multiplier=1) + - { yaxes: $.yaxes('short') }, + $.timeseriesPanel('Retries', unit='short') + + $.latencyPanel('cortex_query_frontend_retries', '{%s}' % $.jobMatcher($._config.job_names.query_frontend), multiplier=1), ) .addPanel( - $.panel('Queue Length') + + $.timeseriesPanel('Queue Length') + $.queryPanel('cortex_query_frontend_queue_length{%s}' % $.jobMatcher($._config.job_names.query_frontend), '{{cluster}} / {{namespace}} / {{%s}}' % $._config.per_instance_label), ) ) .addRow( $.row('Query Scheduler') .addPanel( - $.panel('Queue Duration') + + $.timeseriesPanel('Queue Duration', unit='ms') + $.latencyPanel('cortex_query_scheduler_queue_duration_seconds', '{%s}' % $.jobMatcher($._config.job_names.query_scheduler)), ) .addPanel( - $.panel('Queue Length') + + $.timeseriesPanel('Queue Length') + $.queryPanel('cortex_query_scheduler_queue_length{%s}' % $.jobMatcher($._config.job_names.query_scheduler), '{{cluster}} / {{namespace}} / {{%s}}' % $._config.per_instance_label), ) ) .addRow( $.row('Query Frontend - Query Splitting and Results 
Cache') .addPanel( - $.panel('Intervals per Query') + + $.timeseriesPanel('Intervals per Query') + $.queryPanel('sum(rate(cortex_frontend_split_queries_total{%s}[1m])) / sum(rate(cortex_frontend_query_range_duration_seconds_count{%s, method="split_by_interval"}[1m]))' % [$.jobMatcher($._config.job_names.query_frontend), $.jobMatcher($._config.job_names.query_frontend)], 'splitting rate') + $.panelDescription( 'Intervals per Query', @@ -45,7 +44,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ), ) .addPanel( - $.panel('Results Cache Hit %') + + $.timeseriesPanel('Results Cache Hit %') + $.queryPanel(||| sum(rate(cortex_cache_hits{name=~"frontend.+", %(q)s}[1m])) / sum(rate(cortex_cache_fetched_keys{name=~"frontend.+", %(q)s}[1m])) or sum(rate(cortex_cache_hits_total{name=~"frontend.+", %(q)s}[1m])) / sum(rate(cortex_cache_fetched_keys_total{name=~"frontend.+", %(q)s}[1m])) @@ -53,7 +52,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; { yaxes: $.yaxes({ format: 'percentunit', max: 1 }) }, ) .addPanel( - $.panel('Results Cache misses') + + $.timeseriesPanel('Results Cache misses') + $.queryPanel(||| sum(rate(cortex_cache_fetched_keys{name=~"frontend.+", %(q)s}[1m])) - sum(rate(cortex_cache_hits{name=~"frontend.+", %(q)s}[1m])) or sum(rate(cortex_cache_fetched_keys_total{name=~"frontend.+", %(q)s}[1m])) - sum(rate(cortex_cache_hits_total{name=~"frontend.+", %(q)s}[1m])) @@ -63,7 +62,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('Query Frontend - Query sharding') .addPanel( - $.panel('Sharded Queries Ratio') + + $.timeseriesPanel('Sharded Queries Ratio') + $.queryPanel(||| sum(rate(cortex_frontend_query_sharding_rewrites_succeeded_total{%s}[$__rate_interval])) / sum(rate(cortex_frontend_query_sharding_rewrites_attempted_total{%s}[$__rate_interval])) @@ -78,9 +77,8 @@ local utils = import 'mixin-utils/utils.libsonnet'; ), ) .addPanel( - $.panel('Number of Sharded Queries per Query') + + $.timeseriesPanel('Number of Sharded 
Queries per Query', unit='short') + $.latencyPanel('cortex_frontend_sharded_queries_per_query', '{%s}' % $.jobMatcher($._config.job_names.query_frontend), multiplier=1) + - { yaxes: $.yaxes('short') } + $.panelDescription( 'Number of Sharded Queries per Query', ||| @@ -93,56 +91,50 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('Querier') .addPanel( - $.panel('Stages') + + $.timeseriesPanel('Stages', unit='ms') + $.queryPanel('max by (slice) (prometheus_engine_query_duration_seconds{quantile="0.9",%s}) * 1e3' % $.jobMatcher($._config.job_names.querier), '{{slice}}') + - { yaxes: $.yaxes('ms') } + $.stack, ) .addPanel( - $.panel('Chunk cache misses') + + $.timeseriesPanel('Chunk cache misses') + $.queryPanel(||| sum(rate(cortex_cache_fetched_keys{%(q)s,name="chunksmemcache"}[1m])) - sum(rate(cortex_cache_hits{%(q)s,name="chunksmemcache"}[1m])) or sum(rate(cortex_cache_fetched_keys_total{%(q)s,name="chunksmemcache"}[1m])) - sum(rate(cortex_cache_hits_total{%(q)s,name="chunksmemcache"}[1m])) ||| % { q: $.jobMatcher($._config.job_names.query_frontend) }, 'Hit rate'), ) .addPanel( - $.panel('Chunk cache corruptions') + + $.timeseriesPanel('Chunk cache corruptions') + $.queryPanel('sum(rate(cortex_cache_corrupt_chunks_total{%s}[1m]))' % $.jobMatcher($._config.job_names.querier), 'Corrupt chunks'), ) ) .addRow( $.row('Ingester') .addPanel( - $.panel('Series per Query') + - utils.latencyRecordingRulePanel('cortex_ingester_queried_series', $.jobSelector($._config.job_names.ingester), multiplier=1) + - { yaxes: $.yaxes('short') }, + $.timeseriesPanel('Series per Query', unit='short') + + utils.latencyRecordingRulePanel('cortex_ingester_queried_series', $.jobSelector($._config.job_names.ingester), multiplier=1), ) .addPanel( - $.panel('Chunks per Query') + - utils.latencyRecordingRulePanel('cortex_ingester_queried_chunks', $.jobSelector($._config.job_names.ingester), multiplier=1) + - { yaxes: $.yaxes('short') }, + $.timeseriesPanel('Chunks per Query', 
unit='short') + + utils.latencyRecordingRulePanel('cortex_ingester_queried_chunks', $.jobSelector($._config.job_names.ingester), multiplier=1), ) .addPanel( - $.panel('Samples per Query') + - utils.latencyRecordingRulePanel('cortex_ingester_queried_samples', $.jobSelector($._config.job_names.ingester), multiplier=1) + - { yaxes: $.yaxes('short') }, + $.timeseriesPanel('Samples per Query', unit='short') + + utils.latencyRecordingRulePanel('cortex_ingester_queried_samples', $.jobSelector($._config.job_names.ingester), multiplier=1), ) ) .addRowIf( std.member($._config.storage_engine, 'blocks'), $.row('Querier - Blocks storage') .addPanel( - $.panel('Number of store-gateways hit per Query') + - $.latencyPanel('cortex_querier_storegateway_instances_hit_per_query', '{%s}' % $.jobMatcher($._config.job_names.querier), multiplier=1) + - { yaxes: $.yaxes('short') }, + $.timeseriesPanel('Number of store-gateways hit per Query', unit='short') + + $.latencyPanel('cortex_querier_storegateway_instances_hit_per_query', '{%s}' % $.jobMatcher($._config.job_names.querier), multiplier=1), ) .addPanel( - $.panel('Refetches of missing blocks per Query') + - $.latencyPanel('cortex_querier_storegateway_refetches_per_query', '{%s}' % $.jobMatcher($._config.job_names.querier), multiplier=1) + - { yaxes: $.yaxes('short') }, + $.timeseriesPanel('Refetches of missing blocks per Query', unit='short') + + $.latencyPanel('cortex_querier_storegateway_refetches_per_query', '{%s}' % $.jobMatcher($._config.job_names.querier), multiplier=1), ) .addPanel( - $.panel('Consistency checks failed') + + $.timeseriesPanel('Consistency checks failed') + $.queryPanel('sum(rate(cortex_querier_blocks_consistency_checks_failed_total{%s}[1m])) / sum(rate(cortex_querier_blocks_consistency_checks_total{%s}[1m]))' % [$.jobMatcher($._config.job_names.querier), $.jobMatcher($._config.job_names.querier)], 'Failure Rate') + { yaxes: $.yaxes({ format: 'percentunit', max: 1 }) }, ) @@ -151,13 +143,12 @@ local utils = 
import 'mixin-utils/utils.libsonnet'; std.member($._config.storage_engine, 'blocks'), $.row('') .addPanel( - $.panel('Bucket indexes loaded (per querier)') + + $.timeseriesPanel('Bucket indexes loaded (per querier)', unit='short') + $.queryPanel([ 'max(cortex_bucket_index_loaded{%s})' % $.jobMatcher($._config.job_names.querier), 'min(cortex_bucket_index_loaded{%s})' % $.jobMatcher($._config.job_names.querier), 'avg(cortex_bucket_index_loaded{%s})' % $.jobMatcher($._config.job_names.querier), - ], ['Max', 'Min', 'Average']) + - { yaxes: $.yaxes('short') }, + ], ['Max', 'Min', 'Average']), ) .addPanel( $.successFailurePanel( @@ -167,7 +158,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) .addPanel( - $.panel('Bucket indexes load latency') + + $.timeseriesPanel('Bucket indexes load latency', unit='ms') + $.latencyPanel('cortex_bucket_index_load_duration_seconds', '{%s}' % $.jobMatcher($._config.job_names.querier)), ) ) @@ -175,36 +166,33 @@ local utils = import 'mixin-utils/utils.libsonnet'; std.member($._config.storage_engine, 'blocks'), $.row('Store-gateway - Blocks storage') .addPanel( - $.panel('Blocks queried / sec') + - $.queryPanel('sum(rate(cortex_bucket_store_series_blocks_queried_sum{component="store-gateway",%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.store_gateway), 'blocks') + - { yaxes: $.yaxes('ops') }, + $.timeseriesPanel('Blocks queried / sec', unit='ops') + + $.queryPanel('sum(rate(cortex_bucket_store_series_blocks_queried_sum{component="store-gateway",%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.store_gateway), 'blocks'), ) .addPanel( - $.panel('Data fetched / sec') + + $.timeseriesPanel('Data fetched / sec', unit='ops') + $.queryPanel('sum by(data_type) (rate(cortex_bucket_store_series_data_fetched_sum{component="store-gateway",%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.store_gateway), '{{data_type}}') + - $.stack + - { yaxes: $.yaxes('ops') }, + $.stack, ) .addPanel( - $.panel('Data 
touched / sec') + + $.timeseriesPanel('Data touched / sec', unit='ops') + $.queryPanel('sum by(data_type) (rate(cortex_bucket_store_series_data_touched_sum{component="store-gateway",%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.store_gateway), '{{data_type}}') + - $.stack + - { yaxes: $.yaxes('ops') }, + $.stack, ) ) .addRowIf( std.member($._config.storage_engine, 'blocks'), $.row('') .addPanel( - $.panel('Series fetch duration (per request)') + + $.timeseriesPanel('Series fetch duration (per request)') + $.latencyPanel('cortex_bucket_store_series_get_all_duration_seconds', '{component="store-gateway",%s}' % $.jobMatcher($._config.job_names.store_gateway)), ) .addPanel( - $.panel('Series merge duration (per request)') + + $.timeseriesPanel('Series merge duration (per request)') + $.latencyPanel('cortex_bucket_store_series_merge_duration_seconds', '{component="store-gateway",%s}' % $.jobMatcher($._config.job_names.store_gateway)), ) .addPanel( - $.panel('Series returned (per request)') + + $.timeseriesPanel('Series returned (per request)') + $.queryPanel('sum(rate(cortex_bucket_store_series_result_series_sum{component="store-gateway",%s}[$__rate_interval])) / sum(rate(cortex_bucket_store_series_result_series_count{component="store-gateway",%s}[$__rate_interval]))' % [$.jobMatcher($._config.job_names.store_gateway), $.jobMatcher($._config.job_names.store_gateway)], 'avg series returned'), ) ) @@ -212,7 +200,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; std.member($._config.storage_engine, 'blocks'), $.row('') .addPanel( - $.panel('Blocks currently loaded') + + $.timeseriesPanel('Blocks currently loaded') + $.queryPanel('sum(cortex_bucket_store_blocks_loaded{component="store-gateway",%s}) without (user)' % $.jobMatcher($._config.job_names.store_gateway), '{{%s}}' % $._config.per_instance_label) ) .addPanel( @@ -234,15 +222,15 @@ local utils = import 'mixin-utils/utils.libsonnet'; std.member($._config.storage_engine, 'blocks'), $.row('') 
.addPanel( - $.panel('Lazy loaded index-headers') + + $.timeseriesPanel('Lazy loaded index-headers') + $.queryPanel('cortex_bucket_store_indexheader_lazy_load_total{%s} - cortex_bucket_store_indexheader_lazy_unload_total{%s}' % [$.jobMatcher($._config.job_names.store_gateway), $.jobMatcher($._config.job_names.store_gateway)], '{{%s}}' % $._config.per_instance_label) ) .addPanel( - $.panel('Index-header lazy load duration') + + $.timeseriesPanel('Index-header lazy load duration', unit='ms') + $.latencyPanel('cortex_bucket_store_indexheader_lazy_load_duration_seconds', '{%s}' % $.jobMatcher($._config.job_names.store_gateway)), ) .addPanel( - $.panel('Series hash cache hit ratio') + + $.timeseriesPanel('Series hash cache hit ratio') + $.queryPanel(||| sum(rate(cortex_bucket_store_series_hash_cache_hits_total{%s}[$__rate_interval])) / diff --git a/cortex-mixin/dashboards/reads-resources.libsonnet b/cortex-mixin/dashboards/reads-resources.libsonnet index f0750c88..437a57a2 100644 --- a/cortex-mixin/dashboards/reads-resources.libsonnet +++ b/cortex-mixin/dashboards/reads-resources.libsonnet @@ -67,7 +67,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('Ruler') .addPanel( - $.panel('Rules') + + $.timeseriesPanel('Rules') + $.queryPanel( 'sum by(%s) (cortex_prometheus_rule_group_rules{%s})' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.ruler)], '{{%s}}' % $._config.per_instance_label diff --git a/cortex-mixin/dashboards/reads.libsonnet b/cortex-mixin/dashboards/reads.libsonnet index 5a720784..c0ddbe4f 100644 --- a/cortex-mixin/dashboards/reads.libsonnet +++ b/cortex-mixin/dashboards/reads.libsonnet @@ -36,7 +36,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; showTitle: false, }) .addPanel( - $.panel('Instant queries / sec') + + $.timeseriesPanel('Instant queries / sec') + $.statPanel(||| sum( rate( @@ -67,7 +67,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ), ) .addPanel( - $.panel('Range queries / sec') + + 
$.timeseriesPanel('Range queries / sec') + $.statPanel(||| sum( rate( @@ -92,37 +92,35 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('Gateway') .addPanel( - $.panel('Requests / sec') + + $.timeseriesPanel('Requests / sec') + $.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"(prometheus|api_prom)_api_v1_.+"}' % $.jobMatcher($._config.job_names.gateway)) ) .addPanel( - $.panel('Latency') + + $.timeseriesPanel('Latency', unit='ms') + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.gateway) + [utils.selector.re('route', '(prometheus|api_prom)_api_v1_.+')]) ) .addPanel( - $.panel('Per %s p99 Latency' % $._config.per_instance_label) + + $.timeseriesPanel('Per %s p99 Latency' % $._config.per_instance_label, unit='s') + $.hiddenLegendQueryPanel( 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"(prometheus|api_prom)_api_v1_.+"}[$__rate_interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.gateway)], '' - ) + - { yaxes: $.yaxes('s') } + ) ) ) .addRow( $.row('Query Frontend') .addPanel( - $.panel('Requests / sec') + + $.timeseriesPanel('Requests / sec') + $.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"(prometheus|api_prom)_api_v1_.+"}' % $.jobMatcher($._config.job_names.query_frontend)) ) .addPanel( - $.panel('Latency') + + $.timeseriesPanel('Latency', unit='ms') + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.query_frontend) + [utils.selector.re('route', '(prometheus|api_prom)_api_v1_.+')]) ) .addPanel( - $.panel('Per %s p99 Latency' % $._config.per_instance_label) + + $.timeseriesPanel('Per %s p99 Latency' % $._config.per_instance_label, unit='s') + $.hiddenLegendQueryPanel( 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"(prometheus|api_prom)_api_v1_.+"}[$__rate_interval])))' % 
[$._config.per_instance_label, $.jobMatcher($._config.job_names.query_frontend)], '' - ) + - { yaxes: $.yaxes('s') } + ) ) ) .addRow( @@ -142,85 +140,82 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) .addPanel( - $.panel('Requests / sec') + + $.timeseriesPanel('Requests / sec') + $.qpsPanel('cortex_query_scheduler_queue_duration_seconds_count{%s}' % $.jobMatcher($._config.job_names.query_scheduler)) ) .addPanel( - $.panel('Latency (Time in Queue)') + + $.timeseriesPanel('Latency (Time in Queue)') + $.latencyPanel('cortex_query_scheduler_queue_duration_seconds', '{%s}' % $.jobMatcher($._config.job_names.query_scheduler)) ) ) .addRow( $.row('Cache - Query Results') .addPanel( - $.panel('Requests / sec') + + $.timeseriesPanel('Requests / sec') + $.qpsPanel('cortex_cache_request_duration_seconds_count{method=~"frontend.+", %s}' % $.jobMatcher($._config.job_names.query_frontend)) ) .addPanel( - $.panel('Latency') + + $.timeseriesPanel('Latency', unit='ms') + utils.latencyRecordingRulePanel('cortex_cache_request_duration_seconds', $.jobSelector($._config.job_names.query_frontend) + [utils.selector.re('method', 'frontend.+')]) ) ) .addRow( $.row('Querier') .addPanel( - $.panel('Requests / sec') + + $.timeseriesPanel('Requests / sec') + $.qpsPanel('cortex_querier_request_duration_seconds_count{%s, route=~"(prometheus|api_prom)_api_v1_.+"}' % $.jobMatcher($._config.job_names.querier)) ) .addPanel( - $.panel('Latency') + + $.timeseriesPanel('Latency', unit='ms') + utils.latencyRecordingRulePanel('cortex_querier_request_duration_seconds', $.jobSelector($._config.job_names.querier) + [utils.selector.re('route', '(prometheus|api_prom)_api_v1_.+')]) ) .addPanel( - $.panel('Per %s p99 Latency' % $._config.per_instance_label) + + $.timeseriesPanel('Per %s p99 Latency' % $._config.per_instance_label, unit='s') + $.hiddenLegendQueryPanel( 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_querier_request_duration_seconds_bucket{%s, 
route=~"(prometheus|api_prom)_api_v1_.+"}[$__rate_interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.querier)], '' - ) + - { yaxes: $.yaxes('s') } + ) ) ) .addRow( $.row('Ingester') .addPanel( - $.panel('Requests / sec') + + $.timeseriesPanel('Requests / sec') + $.qpsPanel('cortex_request_duration_seconds_count{%s,route=~"/cortex.Ingester/Query(Stream)?|/cortex.Ingester/MetricsForLabelMatchers|/cortex.Ingester/LabelValues|/cortex.Ingester/MetricsMetadata"}' % $.jobMatcher($._config.job_names.ingester)) ) .addPanel( - $.panel('Latency') + + $.timeseriesPanel('Latency', unit='ms') + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.ingester) + [utils.selector.re('route', '/cortex.Ingester/Query(Stream)?|/cortex.Ingester/MetricsForLabelMatchers|/cortex.Ingester/LabelValues|/cortex.Ingester/MetricsMetadata')]) ) .addPanel( - $.panel('Per %s p99 Latency' % $._config.per_instance_label) + + $.timeseriesPanel('Per %s p99 Latency' % $._config.per_instance_label, unit='s') + $.hiddenLegendQueryPanel( 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"/cortex.Ingester/Query(Stream)?|/cortex.Ingester/MetricsForLabelMatchers|/cortex.Ingester/LabelValues|/cortex.Ingester/MetricsMetadata"}[$__rate_interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.ingester)], '' - ) + - { yaxes: $.yaxes('s') } + ) ) ) .addRowIf( std.member($._config.storage_engine, 'blocks'), $.row('Store-gateway') .addPanel( - $.panel('Requests / sec') + + $.timeseriesPanel('Requests / sec') + $.qpsPanel('cortex_request_duration_seconds_count{%s,route=~"/gatewaypb.StoreGateway/.*"}' % $.jobMatcher($._config.job_names.store_gateway)) ) .addPanel( - $.panel('Latency') + + $.timeseriesPanel('Latency', unit='ms') + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.store_gateway) + [utils.selector.re('route', 
'/gatewaypb.StoreGateway/.*')]) ) .addPanel( - $.panel('Per %s p99 Latency' % $._config.per_instance_label) + + $.timeseriesPanel('Per %s p99 Latency' % $._config.per_instance_label, unit='s') + $.hiddenLegendQueryPanel( 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"/gatewaypb.StoreGateway/.*"}[$__rate_interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.store_gateway)], '' - ) + - { yaxes: $.yaxes('s') } + ) ) ) .addRowIf( std.member($._config.storage_engine, 'blocks'), $.row('Memcached – Blocks storage – Block index cache (store-gateway accesses)') // Resembles thanosMemcachedCache .addPanel( - $.panel('Requests / sec') + + $.timeseriesPanel('Requests / sec', unit='ops') + $.queryPanel( ||| sum by(operation) ( @@ -234,11 +229,10 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ||| % $.jobMatcher($._config.job_names.store_gateway), '{{operation}}' ) + - $.stack + - { yaxes: $.yaxes('ops') }, + $.stack, ) .addPanel( - $.panel('Latency (getmulti)') + + $.timeseriesPanel('Latency (getmulti)') + $.latencyPanel( 'thanos_memcached_operation_duration_seconds', ||| @@ -252,7 +246,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) .addPanel( - $.panel('Hit ratio') + + $.timeseriesPanel('Hit ratio') + $.queryPanel( ||| sum by(item_type) ( diff --git a/cortex-mixin/dashboards/rollout-progress.libsonnet b/cortex-mixin/dashboards/rollout-progress.libsonnet index 16c54095..775a199e 100644 --- a/cortex-mixin/dashboards/rollout-progress.libsonnet +++ b/cortex-mixin/dashboards/rollout-progress.libsonnet @@ -20,7 +20,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; // // Rollout progress // - $.panel('Rollout progress') + + $.timeseriesPanel('Rollout progress') + $.barGauge([ // Multi-zone deployments are grouped together removing the "zone-X" suffix. // After the grouping, the resulting label is called "cortex_service". 
@@ -89,7 +89,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; // // Writes // - $.panel('Writes - 2xx') + + $.timeseriesPanel('Writes - 2xx') + $.newStatPanel(||| sum(rate(cortex_request_duration_seconds_count{%(gateway_job_matcher)s, route=~"%(gateway_write_routes_regex)s",status_code=~"2.+"}[$__rate_interval])) / sum(rate(cortex_request_duration_seconds_count{%(gateway_job_matcher)s, route=~"%(gateway_write_routes_regex)s"}[$__rate_interval])) @@ -100,7 +100,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; gridPos: { h: 4, w: 2, x: 10, y: 0 }, }, - $.panel('Writes - 4xx') + + $.timeseriesPanel('Writes - 4xx') + $.newStatPanel(||| sum(rate(cortex_request_duration_seconds_count{%(gateway_job_matcher)s, route=~"%(gateway_write_routes_regex)s",status_code=~"4.+"}[$__rate_interval])) / sum(rate(cortex_request_duration_seconds_count{%(gateway_job_matcher)s, route=~"%(gateway_write_routes_regex)s"}[$__rate_interval])) @@ -113,7 +113,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; gridPos: { h: 4, w: 2, x: 12, y: 0 }, }, - $.panel('Writes - 5xx') + + $.timeseriesPanel('Writes - 5xx') + $.newStatPanel(||| sum(rate(cortex_request_duration_seconds_count{%(gateway_job_matcher)s, route=~"%(gateway_write_routes_regex)s",status_code=~"5.+"}[$__rate_interval])) / sum(rate(cortex_request_duration_seconds_count{%(gateway_job_matcher)s, route=~"%(gateway_write_routes_regex)s"}[$__rate_interval])) @@ -125,7 +125,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; gridPos: { h: 4, w: 2, x: 14, y: 0 }, }, - $.panel('Writes 99th Latency') + + $.timeseriesPanel('Writes 99th latency', unit='s') + $.newStatPanel(||| histogram_quantile(0.99, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{%(gateway_job_matcher)s, route=~"%(gateway_write_routes_regex)s"})) ||| % config, unit='s', thresholds=[ @@ -140,7 +140,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; // // Reads // - $.panel('Reads - 2xx') + + $.timeseriesPanel('Reads 
- 2xx') + $.newStatPanel(||| sum(rate(cortex_request_duration_seconds_count{%(gateway_job_matcher)s, route=~"%(gateway_read_routes_regex)s",status_code=~"2.+"}[$__rate_interval])) / sum(rate(cortex_request_duration_seconds_count{%(gateway_job_matcher)s, route=~"%(gateway_read_routes_regex)s"}[$__rate_interval])) @@ -151,7 +151,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; gridPos: { h: 4, w: 2, x: 10, y: 4 }, }, - $.panel('Reads - 4xx') + + $.timeseriesPanel('Reads - 4xx') + $.newStatPanel(||| sum(rate(cortex_request_duration_seconds_count{%(gateway_job_matcher)s, route=~"%(gateway_read_routes_regex)s",status_code=~"4.+"}[$__rate_interval])) / sum(rate(cortex_request_duration_seconds_count{%(gateway_job_matcher)s, route=~"%(gateway_read_routes_regex)s"}[$__rate_interval])) @@ -164,7 +164,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; gridPos: { h: 4, w: 2, x: 12, y: 4 }, }, - $.panel('Reads - 5xx') + + $.timeseriesPanel('Reads - 5xx') + $.newStatPanel(||| sum(rate(cortex_request_duration_seconds_count{%(gateway_job_matcher)s, route=~"%(gateway_read_routes_regex)s",status_code=~"5.+"}[$__rate_interval])) / sum(rate(cortex_request_duration_seconds_count{%(gateway_job_matcher)s, route=~"%(gateway_read_routes_regex)s"}[$__rate_interval])) @@ -176,7 +176,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; gridPos: { h: 4, w: 2, x: 14, y: 4 }, }, - $.panel('Reads 99th Latency') + + $.timeseriesPanel('Reads 99th latency', unit='s') + $.newStatPanel(||| histogram_quantile(0.99, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{%(gateway_job_matcher)s, route=~"%(gateway_read_routes_regex)s"})) ||| % config, unit='s', thresholds=[ @@ -191,7 +191,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; // // Unhealthy pods // - $.panel('Unhealthy pods') + + $.timeseriesPanel('Unhealthy pods') + $.newStatPanel([ ||| kube_deployment_status_replicas_unavailable{%(namespace_matcher)s, deployment=~"%(all_services_regex)s"} @@ 
-280,7 +280,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; // // Performance comparison with 24h ago // - $.panel('Latency vs 24h ago') + + $.timeseriesPanel('Latency vs 24h ago') + $.queryPanel([||| 1 - ( avg_over_time(histogram_quantile(0.99, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{%(gateway_job_matcher)s, route=~"%(gateway_write_routes_regex)s"} offset 24h))[1h:]) diff --git a/cortex-mixin/dashboards/ruler.libsonnet b/cortex-mixin/dashboards/ruler.libsonnet index b243198c..88742e23 100644 --- a/cortex-mixin/dashboards/ruler.libsonnet +++ b/cortex-mixin/dashboards/ruler.libsonnet @@ -67,26 +67,26 @@ local utils = import 'mixin-utils/utils.libsonnet'; showTitle: false, }) .addPanel( - $.panel('Active Configurations') + + $.timeseriesPanel('Active Configurations') + $.statPanel('sum(cortex_ruler_managers_total{%s})' % $.jobMatcher($._config.job_names.ruler), format='short') ) .addPanel( - $.panel('Total Rules') + + $.timeseriesPanel('Total Rules') + $.statPanel('sum(cortex_prometheus_rule_group_rules{%s})' % $.jobMatcher($._config.job_names.ruler), format='short') ) .addPanel( - $.panel('Read from Ingesters - QPS') + + $.timeseriesPanel('Read from Ingesters - QPS') + $.statPanel('sum(rate(cortex_ingester_client_request_duration_seconds_count{%s, operation="/cortex.Ingester/QueryStream"}[5m]))' % $.jobMatcher($._config.job_names.ruler), format='reqps') ) .addPanel( - $.panel('Write to Ingesters - QPS') + + $.timeseriesPanel('Write to Ingesters - QPS') + $.statPanel('sum(rate(cortex_ingester_client_request_duration_seconds_count{%s, operation="/cortex.Ingester/Push"}[5m]))' % $.jobMatcher($._config.job_names.ruler), format='reqps') ) ) .addRow( $.row('Rule Evaluations Global') .addPanel( - $.panel('EPS') + + $.timeseriesPanel('EPS') + $.queryPanel( [ $.rulerQueries.ruleEvaluations.success % [$.jobMatcher($._config.job_names.ruler), $.jobMatcher($._config.job_names.ruler)], @@ -96,7 +96,7 @@ local utils = import 
'mixin-utils/utils.libsonnet'; ), ) .addPanel( - $.panel('Latency') + + $.timeseriesPanel('Latency', unit='s') + $.queryPanel( $.rulerQueries.ruleEvaluations.latency % [$.jobMatcher($._config.job_names.ruler), $.jobMatcher($._config.job_names.ruler)], 'average' @@ -106,41 +106,40 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('Configuration API (gateway)') .addPanel( - $.panel('QPS') + + $.timeseriesPanel('QPS') + $.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"%s"}' % [$.jobMatcher($._config.job_names.gateway), ruler_config_api_routes_re]) ) .addPanel( - $.panel('Latency') + + $.timeseriesPanel('Latency', unit='ms') + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.gateway) + [utils.selector.re('route', ruler_config_api_routes_re)]) ) .addPanel( - $.panel('Per route p99 Latency') + + $.timeseriesPanel('Per route p99 latency', unit='s') + $.queryPanel( 'histogram_quantile(0.99, sum by (route, le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{%s, route=~"%s"}))' % [$.jobMatcher($._config.job_names.gateway), ruler_config_api_routes_re], '{{ route }}' - ) + - { yaxes: $.yaxes('s') } + ) ) ) .addRow( $.row('Writes (Ingesters)') .addPanel( - $.panel('QPS') + + $.timeseriesPanel('QPS') + $.qpsPanel('cortex_ingester_client_request_duration_seconds_count{%s, operation="/cortex.Ingester/Push"}' % $.jobMatcher($._config.job_names.ruler)) ) .addPanel( - $.panel('Latency') + + $.timeseriesPanel('Latency', unit='ms') + $.latencyPanel('cortex_ingester_client_request_duration_seconds', '{%s, operation="/cortex.Ingester/Push"}' % $.jobMatcher($._config.job_names.ruler)) ) ) .addRow( $.row('Reads (Ingesters)') .addPanel( - $.panel('QPS') + + $.timeseriesPanel('QPS') + $.qpsPanel('cortex_ingester_client_request_duration_seconds_count{%s, operation="/cortex.Ingester/QueryStream"}' % $.jobMatcher($._config.job_names.ruler)) ) .addPanel( - $.panel('Latency') + + 
$.timeseriesPanel('Latency', unit='ms') + $.latencyPanel('cortex_ingester_client_request_duration_seconds', '{%s, operation="/cortex.Ingester/QueryStream"}' % $.jobMatcher($._config.job_names.ruler)) ) ) @@ -148,17 +147,15 @@ local utils = import 'mixin-utils/utils.libsonnet'; std.member($._config.storage_engine, 'blocks'), $.row('Ruler - Blocks storage') .addPanel( - $.panel('Number of store-gateways hit per Query') + - $.latencyPanel('cortex_querier_storegateway_instances_hit_per_query', '{%s}' % $.jobMatcher($._config.job_names.ruler), multiplier=1) + - { yaxes: $.yaxes('short') }, + $.timeseriesPanel('Number of store-gateways hit per Query', unit='short') + + $.latencyPanel('cortex_querier_storegateway_instances_hit_per_query', '{%s}' % $.jobMatcher($._config.job_names.ruler), multiplier=1), ) .addPanel( - $.panel('Refetches of missing blocks per Query') + - $.latencyPanel('cortex_querier_storegateway_refetches_per_query', '{%s}' % $.jobMatcher($._config.job_names.ruler), multiplier=1) + - { yaxes: $.yaxes('short') }, + $.timeseriesPanel('Refetches of missing blocks per Query', unit='short') + + $.latencyPanel('cortex_querier_storegateway_refetches_per_query', '{%s}' % $.jobMatcher($._config.job_names.ruler), multiplier=1), ) .addPanel( - $.panel('Consistency checks failed') + + $.timeseriesPanel('Consistency checks failed') + $.queryPanel('sum(rate(cortex_querier_blocks_consistency_checks_failed_total{%s}[1m])) / sum(rate(cortex_querier_blocks_consistency_checks_total{%s}[1m]))' % [$.jobMatcher($._config.job_names.ruler), $.jobMatcher($._config.job_names.ruler)], 'Failure Rate') + { yaxes: $.yaxes({ format: 'percentunit', max: 1 }) }, ) @@ -166,33 +163,33 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('Notifications') .addPanel( - $.panel('Delivery Errors') + + $.timeseriesPanel('Delivery Errors') + $.queryPanel($.rulerQueries.notifications.failure % [$.jobMatcher($._config.job_names.ruler), $.jobMatcher($._config.job_names.ruler)], '{{ 
user }}') ) .addPanel( - $.panel('Queue Length') + + $.timeseriesPanel('Queue Length') + $.queryPanel($.rulerQueries.notifications.queue % [$.jobMatcher($._config.job_names.ruler), $.jobMatcher($._config.job_names.ruler)], '{{ user }}') ) .addPanel( - $.panel('Dropped') + + $.timeseriesPanel('Dropped') + $.queryPanel($.rulerQueries.notifications.dropped % $.jobMatcher($._config.job_names.ruler), '{{ user }}') ) ) .addRow( ($.row('Group Evaluations') + { collapse: true }) .addPanel( - $.panel('Missed Iterations') + + $.timeseriesPanel('Missed Iterations') + $.queryPanel($.rulerQueries.groupEvaluations.missedIterations % $.jobMatcher($._config.job_names.ruler), '{{ user }}'), ) .addPanel( - $.panel('Latency') + + $.timeseriesPanel('Latency', unit='s') + $.queryPanel( $.rulerQueries.groupEvaluations.latency % [$.jobMatcher($._config.job_names.ruler), $.jobMatcher($._config.job_names.ruler)], '{{ user }}' ), ) .addPanel( - $.panel('Failures') + + $.timeseriesPanel('Failures') + $.queryPanel( $.rulerQueries.perUserPerGroupEvaluations.failure % [$.jobMatcher($._config.job_names.ruler)], '{{ rule_group }}' ) @@ -201,7 +198,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( ($.row('Rule Evaluation per User') + { collapse: true }) .addPanel( - $.panel('Latency') + + $.timeseriesPanel('Latency', unit='s') + $.queryPanel( $.rulerQueries.perUserPerGroupEvaluations.latency % [$.jobMatcher($._config.job_names.ruler), $.jobMatcher($._config.job_names.ruler)], '{{ user }}' diff --git a/cortex-mixin/dashboards/scaling.libsonnet b/cortex-mixin/dashboards/scaling.libsonnet index 6ac244ea..e078a350 100644 --- a/cortex-mixin/dashboards/scaling.libsonnet +++ b/cortex-mixin/dashboards/scaling.libsonnet @@ -38,7 +38,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( ($.row('Scaling') + { height: '400px' }) .addPanel( - $.panel('Workload-based scaling') + { sort: { col: 0, desc: false } } + + $.timeseriesPanel('Workload-based scaling') + { sort: { col: 0, 
desc: false } } + $.tablePanel([ ||| sort_desc( diff --git a/cortex-mixin/dashboards/writes-resources.libsonnet b/cortex-mixin/dashboards/writes-resources.libsonnet index 64f83ef1..e11ac223 100644 --- a/cortex-mixin/dashboards/writes-resources.libsonnet +++ b/cortex-mixin/dashboards/writes-resources.libsonnet @@ -31,7 +31,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('Ingester') .addPanel( - $.panel('In-memory series') + + $.timeseriesPanel('In-memory series') + $.queryPanel( 'sum by(%s) (cortex_ingester_memory_series{%s})' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.ingester)], '{{%s}}' % $._config.per_instance_label diff --git a/cortex-mixin/dashboards/writes.libsonnet b/cortex-mixin/dashboards/writes.libsonnet index c6563645..67d10581 100644 --- a/cortex-mixin/dashboards/writes.libsonnet +++ b/cortex-mixin/dashboards/writes.libsonnet @@ -34,7 +34,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; showTitle: false, }) .addPanel( - $.panel('Samples / sec') + + $.timeseriesPanel('Samples / sec') + $.statPanel( 'sum(%(group_prefix_jobs)s:cortex_distributor_received_samples:rate5m{%(job)s})' % ( $._config { @@ -45,7 +45,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) .addPanel( - $.panel('Active Series') + + $.timeseriesPanel('Active Series') + $.statPanel(||| sum(cortex_ingester_memory_series{%(ingester)s} / on(%(group_by_cluster)s) group_left @@ -56,87 +56,84 @@ local utils = import 'mixin-utils/utils.libsonnet'; }, format='short') ) .addPanel( - $.panel('Tenants') + + $.timeseriesPanel('Tenants') + $.statPanel('count(count by(user) (cortex_ingester_active_series{%s}))' % $.jobMatcher($._config.job_names.ingester), format='short') ) .addPanel( - $.panel('Requests / sec') + + $.timeseriesPanel('Requests / sec') + $.statPanel('sum(rate(cortex_request_duration_seconds_count{%s, route=~"api_(v1|prom)_push"}[5m]))' % $.jobMatcher($._config.job_names.gateway), format='reqps') ) ) .addRow( $.row('Gateway') 
.addPanel( - $.panel('Requests / sec') + + $.timeseriesPanel('Requests / sec') + $.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"api_(v1|prom)_push"}' % $.jobMatcher($._config.job_names.gateway)) ) .addPanel( - $.panel('Latency') + + $.timeseriesPanel('Latency', unit='ms') + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.gateway) + [utils.selector.re('route', 'api_(v1|prom)_push')]) ) .addPanel( - $.panel('Per %s p99 Latency' % $._config.per_instance_label) + + $.timeseriesPanel('Per %s p99 Latency' % $._config.per_instance_label, unit='s') + $.hiddenLegendQueryPanel( 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"api_(v1|prom)_push"}[$__rate_interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.gateway)], '' - ) + - { yaxes: $.yaxes('s') } + ) ) ) .addRow( $.row('Distributor') .addPanel( - $.panel('Requests / sec') + + $.timeseriesPanel('Requests / sec') + $.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push"}' % $.jobMatcher($._config.job_names.distributor)) ) .addPanel( - $.panel('Latency') + + $.timeseriesPanel('Latency', unit='ms') + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.distributor) + [utils.selector.re('route', '/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push')]) ) .addPanel( - $.panel('Per %s p99 Latency' % $._config.per_instance_label) + + $.timeseriesPanel('Per %s p99 Latency' % $._config.per_instance_label, unit='s') + $.hiddenLegendQueryPanel( 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push"}[$__rate_interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.distributor)], '' - ) + - { yaxes: $.yaxes('s') } + ) ) ) .addRow( 
$.row('Key-value store for high-availability (HA) deduplication') .addPanel( - $.panel('Requests / sec') + + $.timeseriesPanel('Requests / sec') + $.qpsPanel('cortex_kv_request_duration_seconds_count{%s}' % $.jobMatcher($._config.job_names.distributor)) ) .addPanel( - $.panel('Latency') + + $.timeseriesPanel('Latency', unit='ms') + utils.latencyRecordingRulePanel('cortex_kv_request_duration_seconds', $.jobSelector($._config.job_names.distributor)) ) ) .addRow( $.row('Ingester') .addPanel( - $.panel('Requests / sec') + + $.timeseriesPanel('Requests / sec') + $.qpsPanel('cortex_request_duration_seconds_count{%s,route="/cortex.Ingester/Push"}' % $.jobMatcher($._config.job_names.ingester)) ) .addPanel( - $.panel('Latency') + + $.timeseriesPanel('Latency', unit='ms') + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.ingester) + [utils.selector.eq('route', '/cortex.Ingester/Push')]) ) .addPanel( - $.panel('Per %s p99 Latency' % $._config.per_instance_label) + + $.timeseriesPanel('Per %s p99 Latency' % $._config.per_instance_label, unit='s') + $.hiddenLegendQueryPanel( 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route="/cortex.Ingester/Push"}[$__rate_interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.ingester)], '' - ) + - { yaxes: $.yaxes('s') } + ) ) ) .addRow( $.row('Key-value store for the ingesters ring') .addPanel( - $.panel('Requests / sec') + + $.timeseriesPanel('Requests / sec') + $.qpsPanel('cortex_kv_request_duration_seconds_count{%s}' % $.jobMatcher($._config.job_names.ingester)) ) .addPanel( - $.panel('Latency') + + $.timeseriesPanel('Latency', unit='ms') + utils.latencyRecordingRulePanel('cortex_kv_request_duration_seconds', $.jobSelector($._config.job_names.ingester)) ) ) @@ -158,7 +155,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ), ) .addPanel( - $.panel('Upload latency') + + $.timeseriesPanel('Upload latency', 
unit='ms') + $.latencyPanel('thanos_objstore_bucket_operation_duration_seconds', '{%s,component="ingester",operation="upload"}' % $.jobMatcher($._config.job_names.ingester)) + $.panelDescription( 'Upload latency', @@ -188,7 +185,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ), ) .addPanel( - $.panel('Compactions latency') + + $.timeseriesPanel('Compactions latency', unit='ms') + $.latencyPanel('cortex_ingester_tsdb_compaction_duration_seconds', '{%s}' % $.jobMatcher($._config.job_names.ingester)) + $.panelDescription( 'Compaction latency', @@ -231,9 +228,8 @@ local utils = import 'mixin-utils/utils.libsonnet'; ), ) .addPanel( - $.panel('WAL truncations latency (includes checkpointing)') + + $.timeseriesPanel('WAL truncations latency (includes checkpointing)', unit='s') + $.queryPanel('sum(rate(cortex_ingester_tsdb_wal_truncate_duration_seconds_sum{%s}[$__rate_interval])) / sum(rate(cortex_ingester_tsdb_wal_truncate_duration_seconds_count{%s}[$__rate_interval]))' % [$.jobMatcher($._config.job_names.ingester), $.jobMatcher($._config.job_names.ingester)], 'avg') + - { yaxes: $.yaxes('s') } + $.panelDescription( 'WAL truncations latency (including checkpointing)', ||| @@ -243,7 +239,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ), ) .addPanel( - $.panel('Corruptions / sec') + + $.timeseriesPanel('Corruptions / sec', unit='ops') + $.queryPanel([ 'sum(rate(cortex_ingester_wal_corruptions_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.ingester), 'sum(rate(cortex_ingester_tsdb_mmap_chunk_corruptions_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.ingester), From 65ab2e31e1edecd0761127815a40e3150c9bb972 Mon Sep 17 00:00:00 2001 From: Friedrich Gonzalez <1517449+friedrichg@users.noreply.github.com> Date: Mon, 28 Oct 2024 01:57:54 -0700 Subject: [PATCH 58/60] Release v1.17.1 (#63) Signed-off-by: Friedrich Gonzalez --- CHANGELOG.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md 
b/CHANGELOG.md index 66792572..528ab104 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,6 @@ # Changelog -## master / unreleased +## 1.17.1 / 2024-10-23 * [CHANGE] Use cortex v1.17.1 * [CHANGE] Enable shuffle sharding in compactors * [CHANGE] Remove chunks support for dashboards @@ -12,7 +12,7 @@ * [ENHANCEMENT] Support Grafana 11 in all dashboards * [BUGFIX] Remove deprecated option `max_series_per_query` -## 1.16.1 +## 1.16.1 / 2024-04-30 * [CHANGE] Upgrade memcached to 1.6.23-alpine and memcached-exporter to v0.14.2 * [CHANGE] Use cortex v1.16.1 * [ENHANCEMENT] Enable frontend query stats by default From a7ba53eb1eb3ee18349a4ea918bcf22d6b7b4311 Mon Sep 17 00:00:00 2001 From: Friedrich Gonzalez <1517449+friedrichg@users.noreply.github.com> Date: Fri, 11 Apr 2025 04:15:40 -0700 Subject: [PATCH 59/60] Update to k8s-libsonnet to 1.30 (#66) Signed-off-by: Friedrich Gonzalez --- README.md | 2 +- scripts/test-readme.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 539559a0..65d91709 100644 --- a/README.md +++ b/README.md @@ -25,7 +25,7 @@ To generate the YAMLs for deploying Cortex: ```console $ mkdir && cd - $ tk init --k8s=1.26 # this includes github.com/jsonnet-libs/k8s-libsonnet/1.26@main + $ tk init --k8s=1.30 # this includes github.com/jsonnet-libs/k8s-libsonnet/1.30@main $ jb install github.com/cortexproject/cortex-jsonnet/cortex@main ``` diff --git a/scripts/test-readme.sh b/scripts/test-readme.sh index 84a51a60..55120b06 100755 --- a/scripts/test-readme.sh +++ b/scripts/test-readme.sh @@ -3,7 +3,7 @@ set -xe rm -rf $1 mkdir -p $1 cd $1 -tk init --k8s=1.26 +tk init --k8s=1.30 jb install github.com/cortexproject/cortex-jsonnet/cortex@main rm -fr ./vendor/cortex cp -r ../../cortex ./vendor/ From c99aab2b2aca15b54648875a60a20869e91eb018 Mon Sep 17 00:00:00 2001 From: Friedrich Gonzalez <1517449+friedrichg@users.noreply.github.com> Date: Wed, 16 Apr 2025 02:05:23 -0700 Subject: [PATCH 60/60] Improve default 
tenant sizes (#65) * Improve default tenant sizes Signed-off-by: Friedrich Gonzalez * Increase default compactor to handle bigger tenants Signed-off-by: Friedrich Gonzalez --------- Signed-off-by: Friedrich Gonzalez --- CHANGELOG.md | 3 ++ cortex/config.libsonnet | 56 ++++++++++++++++++++++++++++++++++++ cortex/tsdb-config.libsonnet | 2 +- 3 files changed, 60 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 528ab104..03e0f873 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,8 @@ # Changelog +## master +* [ENHANCEMENT] Add bigger tenants and configure default compactor tenant shards + ## 1.17.1 / 2024-10-23 * [CHANGE] Use cortex v1.17.1 * [CHANGE] Enable shuffle sharding in compactors diff --git a/cortex/config.libsonnet b/cortex/config.libsonnet index 5311921a..3f103009 100644 --- a/cortex/config.libsonnet +++ b/cortex/config.libsonnet @@ -340,6 +340,7 @@ }, super_user:: { + compactor_tenant_shard_size: 2, max_series_per_metric: 0, // Disabled in favour of the max global limit max_series_per_user: 0, // Disabled in favour of the max global limit @@ -358,6 +359,7 @@ // This user class has limits increased by +50% compared to the previous one. 
mega_user+:: { + compactor_tenant_shard_size: 2, max_series_per_metric: 0, // Disabled in favour of the max global limit max_series_per_user: 0, // Disabled in favour of the max global limit @@ -373,6 +375,60 @@ ingestion_tenant_shard_size: 180, }, + + user_24M:: { // 50% more than previous + compactor_tenant_shard_size: 4, + max_series_per_metric: 0, // Disabled in favour of the max global limit + max_series_per_user: 0, // Disabled in favour of the max global limit + + max_global_series_per_user: 24000000, // 24M + max_global_series_per_metric: 2400000, // 2.4M + + ingestion_rate: 3000000, // 3M + ingestion_burst_size: 30000000, // 30M + + // 3400 rules + ruler_max_rules_per_rule_group: 20, + ruler_max_rule_groups_per_tenant: 170, + + ingestion_tenant_shard_size: 270, + }, + + user_32M:: { // 33% more than previous + compactor_tenant_shard_size: 4, + max_series_per_metric: 0, // Disabled in favour of the max global limit + max_series_per_user: 0, // Disabled in favour of the max global limit + + max_global_series_per_user: 32000000, // 32M + max_global_series_per_metric: 3200000, // 3.2M + + ingestion_rate: 4500000, // 4.5M + ingestion_burst_size: 45000000, // 45M + + // 3800 rules + ruler_max_rules_per_rule_group: 20, + ruler_max_rule_groups_per_tenant: 190, + + ingestion_tenant_shard_size: 360, + }, + + user_48M:: { // 50% more than previous + compactor_tenant_shard_size: 8, + max_series_per_metric: 0, // Disabled in favour of the max global limit + max_series_per_user: 0, // Disabled in favour of the max global limit + + max_global_series_per_user: 48000000, // 48M + max_global_series_per_metric: 4800000, // 4.8M + + ingestion_rate: 6000000, // 6M + ingestion_burst_size: 60000000, // 60M + + // 4200 rules + ruler_max_rules_per_rule_group: 20, + ruler_max_rule_groups_per_tenant: 210, + + ingestion_tenant_shard_size: 540, + }, }, // if not empty, passed to overrides.yaml as another top-level field diff --git a/cortex/tsdb-config.libsonnet 
b/cortex/tsdb-config.libsonnet index 3d2c5e4b..365a9b5e 100644 --- a/cortex/tsdb-config.libsonnet +++ b/cortex/tsdb-config.libsonnet @@ -13,7 +13,7 @@ cortex_store_gateway_data_disk_class: 'standard', // Allow to configure the compactor disk. - cortex_compactor_data_disk_size: '250Gi', + cortex_compactor_data_disk_size: '500Gi', cortex_compactor_data_disk_class: 'fast', // Allow to fine tune compactor.
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy