diff --git a/.circleci/config.yml b/.circleci/config.yml deleted file mode 100644 index 41cd2ad2..00000000 --- a/.circleci/config.yml +++ /dev/null @@ -1,41 +0,0 @@ -version: 2 - -workflows: - version: 2 - ci: - jobs: - - lint - - build - - test-readme - -jobs: - lint: - docker: - - image: grafana/cortex-jsonnet-build-image:3527936 - steps: - - checkout - - run: - name: "Check white noise" - command: make check-white-noise - - run: - name: "Lint mixin" - command: make lint-mixin - - run: - name: "Lint playbooks" - command: make lint-playbooks - - build: - docker: - - image: grafana/cortex-jsonnet-build-image:3527936 - steps: - - checkout - - run: make build-mixin - - store_artifacts: - path: cortex-mixin/cortex-mixin.zip - - test-readme: - docker: - - image: grafana/cortex-jsonnet-build-image:3527936 - steps: - - checkout - - run: make test-readme diff --git a/.github/workflows/build-image.yaml b/.github/workflows/build-image.yaml new file mode 100644 index 00000000..107e6beb --- /dev/null +++ b/.github/workflows/build-image.yaml @@ -0,0 +1,56 @@ +name: Build Image + +on: + push: + branches: [ main ] + paths: + - 'build-image/Dockerfile' + - '.github/workflows/build-image.yaml' + pull_request: + branches: [ main ] + paths: + - 'build-image/Dockerfile' + - '.github/workflows/build-image.yaml' + +jobs: + build: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + name: Checkout + + - name: Build & save image + run: make build-image save-build-image + + - name: Upload Docker Images Artifact + uses: actions/upload-artifact@v4 + with: + name: build-image + path: ./build-image.tar + if-no-files-found: error + + push: + if: github.ref == 'refs/heads/main' && github.repository == 'cortexproject/cortex-jsonnet' + needs: build + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + name: Checkout + + - name: Download Docker Images Artifacts + uses: actions/download-artifact@v4 + with: + name: build-image + + - name: Load image + run: make load-build-image + + - name: Login to Quay.io + uses: docker/login-action@v3 + with: + registry: quay.io + username: ${{secrets.QUAY_REGISTRY_USER}} + password: ${{secrets.QUAY_REGISTRY_PASSWORD}} + + - name: Push image + run: make publish-build-image diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml new file mode 100644 index 00000000..423e0c96 --- /dev/null +++ b/.github/workflows/ci.yaml @@ -0,0 +1,59 @@ +name: CI + +on: + push: + branches: [ main ] + paths-ignore: + - 'build-image/Dockerfile' + - '.github/workflows/build-image.yaml' + pull_request: + branches: [ main ] + paths-ignore: + - 'build-image/Dockerfile' + - '.github/workflows/build-image.yaml' + +jobs: + lint: + runs-on: ubuntu-latest + container: quay.io/cortexproject/cortex-jsonnet-build-image:fbe4726 + steps: + - uses: actions/checkout@v4 + name: Checkout + with: + fetch-depth: 0 + + - name: "Lint mixin" + run: make lint-mixin + + - name: "Lint playbooks" + run: make lint-playbooks + + build: + runs-on: ubuntu-latest + container: quay.io/cortexproject/cortex-jsonnet-build-image:fbe4726 + steps: + - uses: actions/checkout@v4 + name: Checkout + with: + fetch-depth: 0 + + - name: "Build mixin" + run: make build-mixin + + readme: + runs-on: ubuntu-latest + container: quay.io/cortexproject/cortex-jsonnet-build-image:fbe4726 + steps: + - uses: actions/checkout@v4 + name: Checkout + with: + fetch-depth: 0 + + - name: "Test readme s3" + run: make test-readme/s3 + + - name: "Test readme azure" + run: make test-readme/azure + + - name: "Test readme gcs" + run: make 
test-readme/gcs diff --git a/.gitignore b/.gitignore index 9d64df89..4c7277c4 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,5 @@ cortex-mixin.zip cortex-mixin/out cortex-mixin/vendor /test-readme/ +.vscode +build-image.tar diff --git a/CHANGELOG.md b/CHANGELOG.md index 3135d97c..03e0f873 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,59 @@ # Changelog -## master / unreleased +## master +* [ENHANCEMENT] Add bigger tenants and configure default compactor tenant shards + +## 1.17.1 / 2024-10-23 +* [CHANGE] Use cortex v1.17.1 +* [CHANGE] Enable shuffle sharding in compactors +* [CHANGE] Remove chunks support for dashboards +* [CHANGE] Target 3M memory series per ingester instead of 1.5M +* [CHANGE] Update jsonnet-libs to Fri Jul 19 12:51:49 2024 #57 +* [CHANGE] Increase CortexProvisioningTooManyWrites alert threshold to 160e3 +* [CHANGE] Use `timeseriesPanel` instead of `panel` when creating panels #58 +* [ENHANCEMENT] Configure `-ingester.client.grpc-compression` to be `snappy-block` +* [ENHANCEMENT] Support Grafana 11 in all dashboards +* [BUGFIX] Remove deprecated option `max_series_per_query` + +## 1.16.1 / 2024-04-30 +* [CHANGE] Upgrade memcached to 1.6.23-alpine and memcached-exporter to v0.14.2 +* [CHANGE] Use cortex v1.16.1 +* [ENHANCEMENT] Enable frontend query stats by default +* [ENHANCEMENT] Enable ruler query stats by default +* [ENHANCEMENT] Configure `-blocks-storage.bucket-store.ignore-blocks-within` in queriers, rulers and store-gateways + +## 1.15.3 / 2023-11-24 +* [CHANGE] Add default instance max series for ingesters +* [CHANGE] Add default instance max inflight pushes for distributors +* [CHANGE] Remove mem-ballast from distributor and querier. +* [CHANGE] Increase CPU requests for querier to 2. +* [CHANGE] Configure GOMAXPROCS and GOMEMLIMIT for all cortex modules based on CPU and memory requests or limits +* [CHANGE] Add default tenant shard sizes +* [CHANGE] Use cortex v1.15.3 +* [CHANGE] Azure storage endpoint suffix is set to `blob.core.windows.net` for backward compatibility + +## 1.14.1 / 2023-07-11 + +* [CHANGE] Use faster disks for compactor +* [CHANGE] Enable query-scheduler by default +* [CHANGE] Enable bucket-index by default +* [BUGFIX] Fix `Blocks currently loaded` in Queries + +## 1.13.2 / 2023-04-29 + +* [CHANGE] Use policy/v1 PodDisruptionBudget to support k8s 1.25+ + +## 1.11.1 / 2023-01-13 + +* [CHANGE] Updated readme to use this repo with tanka +* [CHANGE] Removed chunks support +* [CHANGE] Use integrated cortex overrides exporter +* [CHANGE] Use default remote timeout in distributors +* [ENHANCEMENT] Added main.jsonnet examples for azure, gcs and s3 +* [ENHANCEMENT] Documented how to rename buckets in AWS and Azure in the `not healthy index found` playbook.
#5 +* [ENHANCEMENT] Support new metrics cortex_cache_fetched_keys_total and cortex_cache_hits_total +* [BUGFIX] Updated blocks_storage_s3_endpoint in config.libsonnet to include the correct AWS region +* [BUGFIX] Fix `-blocks-storage.bucket-store.metadata-cache.memcached.max-idle-connections` for ruler and querier ## 1.11.0 / 2021-12-30 diff --git a/CODEOWNERS b/CODEOWNERS index 0296909a..1f769c76 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -1,4 +1,4 @@ # https://help.github.com/articles/about-codeowners/ # https://git-scm.com/docs/gitignore#_pattern_format -* @grafana/cortex-team +* @cortexproject/jsonnet-team diff --git a/Makefile b/Makefile index d0ca2f52..a1f0de73 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -.PHONY: lint build-image publish-build-image test-readme +.PHONY: lint build-image publish-build-image test-readme clean JSONNET_FMT := jsonnetfmt @@ -32,10 +32,16 @@ fmt: xargs -n 1 -- $(JSONNET_FMT) -i build-image: - docker build -t grafana/cortex-jsonnet-build-image:$(shell git rev-parse --short HEAD) build-image + docker build -t quay.io/cortexproject/cortex-jsonnet-build-image:$(shell git rev-parse --short HEAD) build-image + +save-build-image: + docker save quay.io/cortexproject/cortex-jsonnet-build-image:$(shell git rev-parse --short HEAD) > build-image.tar + +load-build-image: + docker load < build-image.tar publish-build-image: - docker push grafana/cortex-jsonnet-build-image:$(shell git rev-parse --short HEAD) + docker push quay.io/cortexproject/cortex-jsonnet-build-image:$(shell git rev-parse --short HEAD) build-mixin: @cd cortex-mixin && \ @@ -44,17 +50,10 @@ build-mixin: mixtool generate all --output-alerts out/alerts.yaml --output-rules out/rules.yaml --directory out/dashboards mixin.libsonnet && \ zip -q -r cortex-mixin.zip out -test-readme: - rm -rf test-readme && \ - mkdir test-readme && cd test-readme && \ - tk init --k8s=false && \ - jb install github.com/jsonnet-libs/k8s-alpha/1.18 && \ - printf '(import "github.com/jsonnet-libs/k8s-alpha/1.18/main.libsonnet")\n+(import "github.com/jsonnet-libs/k8s-alpha/1.18/extensions/kausal-shim.libsonnet")' > lib/k.libsonnet && \ - jb install github.com/grafana/cortex-jsonnet/cortex@main && \ - rm -fr ./vendor/cortex && \ - cp -r ../cortex ./vendor/ && \ - cp vendor/cortex/cortex-manifests.jsonnet.example environments/default/main.jsonnet && \ - PAGER=cat tk show environments/default +test-readme: test-readme/azure test-readme/gcs test-readme/s3 + +test-readme/%: + @./scripts/test-readme.sh $@ clean-white-noise: @$(FIND) . -type f -regextype posix-extended -regex '.*(md|libsonnet)' -print | \ check-white-noise: clean-white-noise @git diff --exit-code --quiet || (echo "Please remove trailing whitespaces running 'make clean-white-noise'" && false) + +clean: + rm -rf cortex-mixin/out + rm -rf cortex-mixin/vendor + rm -f cortex-mixin/cortex-mixin.zip + rm -rf test-readme diff --git a/README.md b/README.md index 8864e8cf..65d91709 100644 --- a/README.md +++ b/README.md @@ -2,6 +2,13 @@ This repo has the jsonnet for deploying Cortex and the related monitoring in Kubernetes. +--- +**NOTE** + +If you are more familiar with Helm, you should use the [Helm chart](https://cortexproject.github.io/cortex-helm-chart/) for Cortex + +--- + To generate the YAMLs for deploying Cortex: 1.
Make sure you have tanka and jb installed: @@ -10,28 +17,30 @@ To generate the YAMLs for deploying Cortex: ```console $ # make sure to be outside of GOPATH or a go.mod project - $ GO111MODULE=on go get github.com/grafana/tanka/cmd/tk - $ GO111MODULE=on go get github.com/jsonnet-bundler/jsonnet-bundler/cmd/jb + $ GO111MODULE=on go install github.com/grafana/tanka/cmd/tk@v0.26.0 + $ GO111MODULE=on go install github.com/jsonnet-bundler/jsonnet-bundler/cmd/jb@v0.5.1 ``` -1. Initialise the Tanka, and install the Cortex and Kubernetes Jsonnet libraries. +1. Initialise the Tanka repo and install the Cortex and Kubernetes Jsonnet libraries. ```console $ mkdir && cd - $ tk init --k8s=false - $ # The k8s-alpha library supports Kubernetes versions 1.14+ - $ jb install github.com/jsonnet-libs/k8s-alpha/1.18 - $ cat < lib/k.libsonnet - (import "github.com/jsonnet-libs/k8s-alpha/1.18/main.libsonnet") - + (import "github.com/jsonnet-libs/k8s-alpha/1.18/extensions/kausal-shim.libsonnet") - EOF - $ jb install github.com/grafana/cortex-jsonnet/cortex@main + $ tk init --k8s=1.30 # this includes github.com/jsonnet-libs/k8s-libsonnet/1.30@main + $ jb install github.com/cortexproject/cortex-jsonnet/cortex@main + ``` + +1. Use one of the examples to get a main.jsonnet and adjust it as needed: + + ```console + $ cp vendor/cortex/azure/main.jsonnet.example environments/default/main.jsonnet ``` -1. Use the example monitoring.jsonnet.example: + ```console + $ cp vendor/cortex/gcs/main.jsonnet.example environments/default/main.jsonnet + ``` ```console - $ cp vendor/cortex/cortex-manifests.jsonnet.example environments/default/main.jsonnet + $ cp vendor/cortex/s3/main.jsonnet.example environments/default/main.jsonnet ``` 1. Check what is in the example: @@ -58,13 +67,11 @@ To generate the YAMLs for deploying Cortex: To generate the Grafana dashboards and Prometheus alerts for Cortex: ```console -$ GO111MODULE=on go get github.com/monitoring-mixins/mixtool/cmd/mixtool -$ GO111MODULE=on go get github.com/jsonnet-bundler/jsonnet-bundler/cmd/jb -$ git clone https://github.com/grafana/cortex-jsonnet +$ GO111MODULE=on go install github.com/monitoring-mixins/mixtool/cmd/mixtool@2ff523ea63d1cdeee2a10e01d1d48d20adcc7030 +$ GO111MODULE=on go install github.com/jsonnet-bundler/jsonnet-bundler/cmd/jb@v0.5.1 +$ git clone https://github.com/cortexproject/cortex-jsonnet $ cd cortex-jsonnet $ make build-mixin ``` This will leave all the alerts and dashboards in cortex-mixin/cortex-mixin.zip (or cortex-mixin/out). - -If you get an error like `cannot use cli.StringSliceFlag literal (type cli.StringSliceFlag) as type cli.Flag in slice literal` when installing [mixtool](https://github.com/monitoring-mixins/mixtool/issues/27), make sure you set `GO111MODULE=on` before `go get`. diff --git a/RELEASE.md b/RELEASE.md index 47a14659..6a255c19 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -27,7 +27,7 @@ A release of `cortex-jsonnet` should follow shortly after a release of Cortex. $ make build-mixin ``` 7. Add the `cortex-mixin/cortex-mixin.zip` and release change log to the GitHub release.
- - Edit the release in GitHub by going to https://github.com/grafana/cortex-jsonnet/releases/edit/x.y.z + - Edit the release in GitHub by going to https://github.com/cortexproject/cortex-jsonnet/releases/edit/x.y.z ### How to tag a release diff --git a/build-image/Dockerfile b/build-image/Dockerfile index 5b0f50aa..70d2fd9b 100644 --- a/build-image/Dockerfile +++ b/build-image/Dockerfile @@ -1,43 +1,39 @@ # Build jsonnet -FROM alpine:3.13 AS jsonnet-builder +FROM alpine:3.18 AS jsonnet-builder RUN apk add --no-cache git make g++ RUN git clone https://github.com/google/jsonnet && \ - git -C jsonnet checkout v0.15.0 && \ + git -C jsonnet checkout v0.20.0 && \ make -C jsonnet 2LDFLAGS=-static && \ cp jsonnet/jsonnet /usr/bin && \ cp jsonnet/jsonnetfmt /usr/bin # Build jb -FROM alpine:3.13 AS jb-builder -ARG JSONNET_BUNDLER_VERSION=0.4.0 -ARG JSONNET_BUNDLER_CHECKSUM="433edab5554a88a0371e11e93080408b225d41c31decf321c02b50d2e44993ce /usr/bin/jb" +FROM alpine:3.18 AS jb-builder +ARG JSONNET_BUNDLER_VERSION=0.5.1 +ARG JSONNET_BUNDLER_CHECKSUM="f5bccc94d28fbbe8ad1d46fd4f208619e45d368a5d7924f6335f4ecfa0605c85 /usr/bin/jb" RUN apk add --no-cache curl RUN curl -fSL -o "/usr/bin/jb" "https://github.com/jsonnet-bundler/jsonnet-bundler/releases/download/v${JSONNET_BUNDLER_VERSION}/jb-linux-amd64" RUN echo "${JSONNET_BUNDLER_CHECKSUM}" | sha256sum -c || (printf "wanted: %s\n got: %s\n" "${JSONNET_BUNDLER_CHECKSUM}" "$(sha256sum /usr/bin/jb)"; exit 1) RUN chmod +x /usr/bin/jb # Build tanka -FROM alpine:3.13 AS tk-builder -ARG TANKA_VERSION=0.11.1 -ARG TANKA_CHECKSUM="3b253ca7d7bf01189604c10a8f7cead20a553ddc04c813f0f836d80338cfad71 /usr/bin/tk" +FROM alpine:3.18 AS tk-builder +ARG TANKA_VERSION=0.26.0 +ARG TANKA_CHECKSUM="089796ae2ce65390501b2c68ceca1ce99ff12787d5ae3b4823c825a07e6e22f4 /usr/bin/tk" RUN apk add --no-cache curl RUN curl -fSL -o "/usr/bin/tk" "https://github.com/grafana/tanka/releases/download/v${TANKA_VERSION}/tk-linux-amd64" RUN echo "${TANKA_CHECKSUM}" | sha256sum -c || (printf "wanted: %s\n got: %s\n" "${TANKA_CHECKSUM}" "$(sha256sum /usr/bin/tk)"; exit 1) RUN chmod +x /usr/bin/tk # Build mixtool -FROM golang:1.15-alpine AS mixtool-builder -RUN GO111MODULE=on go get github.com/monitoring-mixins/mixtool/cmd/mixtool@ae18e31161ea10545b9c1ac0d23c10122f2c12b5 +FROM golang:1.21-alpine AS mixtool-builder +RUN GO111MODULE=on go install github.com/monitoring-mixins/mixtool/cmd/mixtool@ae18e31161ea10545b9c1ac0d23c10122f2c12b5 -FROM alpine:3.13 -RUN apk add --no-cache git make libgcc libstdc++ zip findutils sed +FROM alpine:3.18 +RUN apk add --no-cache git make libgcc libstdc++ zip findutils sed yq COPY --from=jsonnet-builder /usr/bin/jsonnetfmt /usr/bin COPY --from=jsonnet-builder /usr/bin/jsonnet /usr/bin COPY --from=jb-builder /usr/bin/jb /usr/bin COPY --from=tk-builder /usr/bin/tk /usr/bin COPY --from=mixtool-builder /go/bin/mixtool /usr/bin -# Install yq. -# TODO We can install it via apk once alpine 3.14 or above will be released. Previous versions don't package v4. -RUN wget -O /usr/bin/yq https://github.com/mikefarah/yq/releases/download/v4.9.3/yq_linux_amd64 && \ - chmod +x /usr/bin/yq diff --git a/cortex-mixin/alerts/alerts.libsonnet b/cortex-mixin/alerts/alerts.libsonnet index a6287e5e..ec44565c 100644 --- a/cortex-mixin/alerts/alerts.libsonnet +++ b/cortex-mixin/alerts/alerts.libsonnet @@ -71,27 +71,6 @@ |||, }, }, - { - // We're syncing every 10mins, and this means with a 5min rate, we will have a NaN when syncs fail - // and we will never trigger the alert. 
- // We also have a 3h grace-period for creation of tables which means the we can fail for 3h before it's an outage. - alert: 'CortexTableSyncFailure', - expr: ||| - 100 * rate(cortex_table_manager_sync_duration_seconds_count{status_code!~"2.."}[15m]) - / - rate(cortex_table_manager_sync_duration_seconds_count[15m]) - > 10 - |||, - 'for': '30m', - labels: { - severity: 'critical', - }, - annotations: { - message: ||| - {{ $labels.job }} is experiencing {{ printf "%.2f" $value }}% errors syncing tables. - |||, - }, - }, { alert: 'CortexQueriesIncorrect', expr: ||| @@ -206,41 +185,6 @@ |||, }, }, - { - alert: 'CortexTransferFailed', - expr: ||| - max_over_time(cortex_shutdown_duration_seconds_count{op="transfer",status!="success"}[15m]) - |||, - 'for': '5m', - labels: { - severity: 'critical', - }, - annotations: { - message: ||| - {{ $labels.job }}/{{ $labels.instance }} transfer failed. - |||, - }, - }, - { - alert: 'CortexOldChunkInMemory', - // Even though we should flush chunks after 6h, we see that 99p of age of flushed chunks is closer - // to 10 hours. - // Ignore cortex_oldest_unflushed_chunk_timestamp_seconds that are zero (eg. distributors). - expr: ||| - (time() - cortex_oldest_unflushed_chunk_timestamp_seconds > 36000) - and - (cortex_oldest_unflushed_chunk_timestamp_seconds > 0) - |||, - 'for': '5m', - labels: { - severity: 'warning', - }, - annotations: { - message: ||| - {{ $labels.job }}/{{ $labels.instance }} has very old unflushed chunk in memory. - |||, - }, - }, { alert: 'CortexKVStoreFailure', expr: ||| @@ -379,87 +323,6 @@ }, ], }, - { - name: 'cortex_wal_alerts', - rules: [ - { - // Alert immediately if WAL is corrupt. - alert: 'CortexWALCorruption', - expr: ||| - increase(cortex_ingester_wal_corruptions_total[5m]) > 0 - |||, - labels: { - severity: 'critical', - }, - annotations: { - message: ||| - {{ $labels.job }}/{{ $labels.instance }} has a corrupted WAL or checkpoint. - |||, - }, - }, - { - // One or more failed checkpoint creation is a warning. - alert: 'CortexCheckpointCreationFailed', - expr: ||| - increase(cortex_ingester_checkpoint_creations_failed_total[10m]) > 0 - |||, - labels: { - severity: 'warning', - }, - annotations: { - message: ||| - {{ $labels.job }}/{{ $labels.instance }} failed to create checkpoint. - |||, - }, - }, - { - // Two or more failed checkpoint creation in 1h means something is wrong. - alert: 'CortexCheckpointCreationFailed', - expr: ||| - increase(cortex_ingester_checkpoint_creations_failed_total[1h]) > 1 - |||, - labels: { - severity: 'critical', - }, - annotations: { - message: ||| - {{ $labels.job }}/{{ $labels.instance }} is failing to create checkpoint. - |||, - }, - }, - { - // One or more failed checkpoint deletion is a warning. - alert: 'CortexCheckpointDeletionFailed', - expr: ||| - increase(cortex_ingester_checkpoint_deletions_failed_total[10m]) > 0 - |||, - labels: { - severity: 'warning', - }, - annotations: { - message: ||| - {{ $labels.job }}/{{ $labels.instance }} failed to delete checkpoint. - |||, - }, - }, - { - // Two or more failed checkpoint deletion in 2h means something is wrong. - // We give this more buffer than creation as this is a less critical operation. - alert: 'CortexCheckpointDeletionFailed', - expr: ||| - increase(cortex_ingester_checkpoint_deletions_failed_total[2h]) > 1 - |||, - labels: { - severity: 'critical', - }, - annotations: { - message: ||| - {{ $labels.instance }} is failing to delete checkpoint. 
- |||, - }, - }, - ], - }, { name: 'cortex-rollout-alerts', rules: [ @@ -524,37 +387,13 @@ { name: 'cortex-provisioning', rules: [ - { - alert: 'CortexProvisioningMemcachedTooSmall', - // 4 x in-memory series size = 24hrs of data. - expr: ||| - ( - 4 * - sum by (%s) (cortex_ingester_memory_series * cortex_ingester_chunk_size_bytes_sum / cortex_ingester_chunk_size_bytes_count) - / 1e9 - ) - > - ( - sum by (%s) (memcached_limit_bytes{job=~".+/memcached"}) / 1e9 - ) - ||| % [$._config.alert_aggregation_labels, $._config.alert_aggregation_labels], - 'for': '15m', - labels: { - severity: 'warning', - }, - annotations: { - message: ||| - Chunk memcached cluster in %(alert_aggregation_variables)s is too small, should be at least {{ printf "%%.2f" $value }}GB. - ||| % $._config, - }, - }, { alert: 'CortexProvisioningTooManyActiveSeries', - // We target each ingester to 1.5M in-memory series. This alert fires if the average - // number of series / ingester in a Cortex cluster is > 1.6M for 2h (we compact + // We target each ingester to 3.0M in-memory series. This alert fires if the average + // number of series / ingester in a Cortex cluster is > 3.2M for 2h (we compact // the TSDB head every 2h). expr: ||| - avg by (%s) (cortex_ingester_memory_series) > 1.6e6 + avg by (%s) (cortex_ingester_memory_series) > 3.2e6 ||| % [$._config.alert_aggregation_labels], 'for': '2h', labels: { @@ -568,9 +407,9 @@ }, { alert: 'CortexProvisioningTooManyWrites', - // 80k writes / s per ingester max. + // 160k writes / s per ingester max. expr: ||| - avg by (%s) (rate(cortex_ingester_ingested_samples_total[1m])) > 80e3 + avg by (%s) (rate(cortex_ingester_ingested_samples_total[1m])) > 160e3 ||| % $._config.alert_aggregation_labels, 'for': '15m', labels: { diff --git a/cortex-mixin/config.libsonnet b/cortex-mixin/config.libsonnet index 06941b6d..2f620703 100644 --- a/cortex-mixin/config.libsonnet +++ b/cortex-mixin/config.libsonnet @@ -3,18 +3,7 @@ grafanaDashboardShards: 4, _config+:: { - // Switch for overall storage engine. - // May contain 'chunks', 'blocks' or both. - // Enables chunks- or blocks- specific panels and dashboards. - storage_engine: ['blocks'], - - // For chunks backend, switch for chunk index type. - // May contain 'bigtable', 'dynamodb' or 'cassandra'. - chunk_index_backend: ['bigtable', 'dynamodb', 'cassandra'], - - // For chunks backend, switch for chunk store type. - // May contain 'bigtable', 'dynamodb', 'cassandra', 's3' or 'gcs'. - chunk_store_backend: ['bigtable', 'dynamodb', 'cassandra', 's3', 'gcs'], + storage_engine: ['blocks'], // TODO: Remove this option, it's not needed // Tags for dashboards. tags: ['cortex'], @@ -32,7 +21,6 @@ ruler: '(ruler|cortex$)', query_frontend: '(query-frontend.*|cortex$)', // Match also custom query-frontend deployments. query_scheduler: 'query-scheduler.*', // Not part of single-binary. Match also custom query-scheduler deployments. 
- table_manager: '(table-manager|cortex$)', ring_members: ['compactor', 'distributor', 'ingester.*', 'querier.*', 'ruler', 'store-gateway', 'cortex'], store_gateway: '(store-gateway|cortex$)', gateway: '(gateway|cortex-gw|cortex-gw-internal)', diff --git a/cortex-mixin/dashboards.libsonnet b/cortex-mixin/dashboards.libsonnet index 9e7f71c2..e4b68c4b 100644 --- a/cortex-mixin/dashboards.libsonnet +++ b/cortex-mixin/dashboards.libsonnet @@ -9,22 +9,9 @@ (import 'dashboards/writes.libsonnet') + (import 'dashboards/slow-queries.libsonnet') + (import 'dashboards/rollout-progress.libsonnet') + - - (if std.member($._config.storage_engine, 'blocks') - then - (import 'dashboards/compactor.libsonnet') + - (import 'dashboards/compactor-resources.libsonnet') + - (import 'dashboards/object-store.libsonnet') - else {}) + - - (if std.member($._config.storage_engine, 'chunks') - then import 'dashboards/chunks.libsonnet' - else {}) + - - (if std.member($._config.storage_engine, 'blocks') - && std.member($._config.storage_engine, 'chunks') - then import 'dashboards/comparison.libsonnet' - else {}) + + (import 'dashboards/compactor.libsonnet') + + (import 'dashboards/compactor-resources.libsonnet') + + (import 'dashboards/object-store.libsonnet') + (if !$._config.resources_dashboards_enabled then {} else (import 'dashboards/reads-resources.libsonnet') + diff --git a/cortex-mixin/dashboards/alertmanager.libsonnet b/cortex-mixin/dashboards/alertmanager.libsonnet index 0bf88c43..731135db 100644 --- a/cortex-mixin/dashboards/alertmanager.libsonnet +++ b/cortex-mixin/dashboards/alertmanager.libsonnet @@ -10,22 +10,22 @@ local utils = import 'mixin-utils/utils.libsonnet'; showTitle: false, }) .addPanel( - $.panel('Total Alerts') + + $.timeseriesPanel('Total Alerts') + $.statPanel('sum(cluster_job_%s:cortex_alertmanager_alerts:sum{%s})' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.alertmanager)], format='short') ) .addPanel( - $.panel('Total Silences') + + $.timeseriesPanel('Total Silences') + $.statPanel('sum(cluster_job_%s:cortex_alertmanager_silences:sum{%s})' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.alertmanager)], format='short') ) .addPanel( - $.panel('Tenants') + + $.timeseriesPanel('Tenants') + $.statPanel('max(cortex_alertmanager_tenants_discovered{%s})' % $.jobMatcher($._config.job_names.alertmanager), format='short') ) ) .addRow( $.row('Alerts Received') .addPanel( - $.panel('APS') + + $.timeseriesPanel('APS') + $.queryPanel( [ ||| @@ -42,7 +42,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('Alert Notifications') .addPanel( - $.panel('NPS') + + $.timeseriesPanel('NPS') + $.queryPanel( [ ||| @@ -56,7 +56,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) .addPanel( - $.panel('NPS by integration') + + $.timeseriesPanel('NPS by integration') + $.queryPanel( [ ||| @@ -73,18 +73,18 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) .addPanel( - $.panel('Latency') + + $.timeseriesPanel('Latency', unit='ms') + $.latencyPanel('cortex_alertmanager_notification_latency_seconds', '{%s}' % $.jobMatcher($._config.job_names.alertmanager)) ) ) .addRow( $.row('Configuration API (gateway) + Alertmanager UI') .addPanel( - $.panel('QPS') + + $.timeseriesPanel('QPS') + $.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"api_v1_alerts|alertmanager"}' % $.jobMatcher($._config.job_names.gateway)) ) .addPanel( - $.panel('Latency') + + $.timeseriesPanel('Latency', unit='ms') + 
utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.gateway) + [utils.selector.re('route', 'api_v1_alerts|alertmanager')]) ) ) @@ -94,7 +94,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('Replication') .addPanel( - $.panel('Per %s Tenants' % $._config.per_instance_label) + + $.timeseriesPanel('Per %s Tenants' % $._config.per_instance_label) + $.queryPanel( 'max by(%s) (cortex_alertmanager_tenants_owned{%s})' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.alertmanager)], '{{%s}}' % $._config.per_instance_label @@ -102,7 +102,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.stack ) .addPanel( - $.panel('Per %s Alerts' % $._config.per_instance_label) + + $.timeseriesPanel('Per %s Alerts' % $._config.per_instance_label) + $.queryPanel( 'sum by(%s) (cluster_job_%s:cortex_alertmanager_alerts:sum{%s})' % [$._config.per_instance_label, $._config.per_instance_label, $.jobMatcher($._config.job_names.alertmanager)], '{{%s}}' % $._config.per_instance_label @@ -110,7 +110,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.stack ) .addPanel( - $.panel('Per %s Silences' % $._config.per_instance_label) + + $.timeseriesPanel('Per %s Silences' % $._config.per_instance_label) + $.queryPanel( 'sum by(%s) (cluster_job_%s:cortex_alertmanager_silences:sum{%s})' % [$._config.per_instance_label, $._config.per_instance_label, $.jobMatcher($._config.job_names.alertmanager)], '{{%s}}' % $._config.per_instance_label @@ -121,7 +121,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('Tenant Configuration Sync') .addPanel( - $.panel('Syncs/sec') + + $.timeseriesPanel('Syncs/sec') + $.queryPanel( [ ||| @@ -135,14 +135,14 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) .addPanel( - $.panel('Syncs/sec (By Reason)') + + $.timeseriesPanel('Syncs/sec (By Reason)') + $.queryPanel( 'sum by(reason) (rate(cortex_alertmanager_sync_configs_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.alertmanager), '{{reason}}' ) ) .addPanel( - $.panel('Ring Check Errors/sec') + + $.timeseriesPanel('Ring Check Errors/sec') + $.queryPanel( 'sum (rate(cortex_alertmanager_ring_check_errors_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.alertmanager), 'errors' @@ -152,7 +152,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('Sharding Initial State Sync') .addPanel( - $.panel('Initial syncs /sec') + + $.timeseriesPanel('Initial syncs /sec') + $.queryPanel( 'sum by(outcome) (rate(cortex_alertmanager_state_initial_sync_completed_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.alertmanager), '{{outcome}}' @@ -166,7 +166,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; } ) .addPanel( - $.panel('Initial sync duration') + + $.timeseriesPanel('Initial sync duration', unit='s') + $.latencyPanel('cortex_alertmanager_state_initial_sync_duration_seconds', '{%s}' % $.jobMatcher($._config.job_names.alertmanager)) + { targets: [ target { @@ -177,7 +177,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; } ) .addPanel( - $.panel('Fetch state from other alertmanagers /sec') + + $.timeseriesPanel('Fetch state from other alertmanagers /sec') + $.queryPanel( [ ||| @@ -201,7 +201,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('Sharding Runtime State Sync') .addPanel( - $.panel('Replicate state to other alertmanagers /sec') + + $.timeseriesPanel('Replicate state to other alertmanagers /sec') + $.queryPanel( [ ||| @@ 
-215,7 +215,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) .addPanel( - $.panel('Merge state from other alertmanagers /sec') + + $.timeseriesPanel('Merge state from other alertmanagers /sec') + $.queryPanel( [ ||| @@ -229,7 +229,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) .addPanel( - $.panel('Persist state to remote storage /sec') + + $.timeseriesPanel('Persist state to remote storage /sec') + $.queryPanel( [ ||| diff --git a/cortex-mixin/dashboards/chunks.libsonnet b/cortex-mixin/dashboards/chunks.libsonnet deleted file mode 100644 index b82c6880..00000000 --- a/cortex-mixin/dashboards/chunks.libsonnet +++ /dev/null @@ -1,100 +0,0 @@ -local utils = import 'mixin-utils/utils.libsonnet'; - -(import 'dashboard-utils.libsonnet') { - 'cortex-chunks.json': - ($.dashboard('Cortex / Chunks') + { uid: 'a56a3fa6284064eb392a115f3acbf744' }) - .addClusterSelectorTemplates() - .addRow( - $.row('Active Series / Chunks') - .addPanel( - $.panel('Series') + - $.queryPanel('sum(cortex_ingester_memory_series{%s})' % $.jobMatcher($._config.job_names.ingester), 'series'), - ) - .addPanel( - $.panel('Chunks per series') + - $.queryPanel('sum(cortex_ingester_memory_chunks{%s}) / sum(cortex_ingester_memory_series{%s})' % [$.jobMatcher($._config.job_names.ingester), $.jobMatcher($._config.job_names.ingester)], 'chunks'), - ) - ) - .addRow( - $.row('Flush Stats') - .addPanel( - $.panel('Utilization') + - $.latencyPanel('cortex_ingester_chunk_utilization', '{%s}' % $.jobMatcher($._config.job_names.ingester), multiplier='1') + - { yaxes: $.yaxes('percentunit') }, - ) - .addPanel( - $.panel('Age') + - $.latencyPanel('cortex_ingester_chunk_age_seconds', '{%s}' % $.jobMatcher($._config.job_names.ingester)), - ), - ) - .addRow( - $.row('Flush Stats') - .addPanel( - $.panel('Size') + - $.latencyPanel('cortex_ingester_chunk_length', '{%s}' % $.jobMatcher($._config.job_names.ingester), multiplier='1') + - { yaxes: $.yaxes('short') }, - ) - .addPanel( - $.panel('Entries') + - $.queryPanel('sum(rate(cortex_chunk_store_index_entries_per_chunk_sum{%s}[5m])) / sum(rate(cortex_chunk_store_index_entries_per_chunk_count{%s}[5m]))' % [$.jobMatcher($._config.job_names.ingester), $.jobMatcher($._config.job_names.ingester)], 'entries'), - ), - ) - .addRow( - $.row('Flush Stats') - .addPanel( - $.panel('Queue Length') + - $.queryPanel('cortex_ingester_flush_queue_length{%s}' % $.jobMatcher($._config.job_names.ingester), '{{%s}}' % $._config.per_instance_label), - ) - .addPanel( - $.panel('Flush Rate') + - $.qpsPanel('cortex_ingester_chunk_age_seconds_count{%s}' % $.jobMatcher($._config.job_names.ingester)), - ), - ), - - 'cortex-wal.json': - ($.dashboard('Cortex / WAL') + { uid: 'd4fb924cdc1581cd8e870e3eb0110bda' }) - .addClusterSelectorTemplates() - .addRow( - $.row('') - .addPanel( - $.panel('Bytes Logged (WAL+Checkpoint) / ingester / second') + - $.queryPanel('avg(rate(cortex_ingester_wal_logged_bytes_total{%(m)s}[$__rate_interval])) + avg(rate(cortex_ingester_checkpoint_logged_bytes_total{%(m)s}[$__rate_interval]))' % { m: $.jobMatcher($._config.job_names.ingester) }, 'bytes') + - { yaxes: $.yaxes('bytes') }, - ) - ) - .addRow( - $.row('WAL') - .addPanel( - $.panel('Records logged / ingester / second') + - $.queryPanel('avg(rate(cortex_ingester_wal_records_logged_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.ingester), 'records'), - ) - .addPanel( - $.panel('Bytes per record') + - $.queryPanel('avg(rate(cortex_ingester_wal_logged_bytes_total{%(m)s}[$__rate_interval]) / 
rate(cortex_ingester_wal_records_logged_total{%(m)s}[$__rate_interval]))' % { m: $.jobMatcher($._config.job_names.ingester) }, 'bytes') + - { yaxes: $.yaxes('bytes') }, - ) - .addPanel( - $.panel('Bytes per sample') + - $.queryPanel('avg(rate(cortex_ingester_wal_logged_bytes_total{%(m)s}[$__rate_interval]) / rate(cortex_ingester_ingested_samples_total{%(m)s}[$__rate_interval]))' % { m: $.jobMatcher($._config.job_names.ingester) }, 'bytes') + - { yaxes: $.yaxes('bytes') }, - ) - .addPanel( - $.panel('Min(available disk space)') + - $.queryPanel('min(kubelet_volume_stats_available_bytes{cluster=~"$cluster", namespace=~"$namespace", persistentvolumeclaim=~"ingester.*"})', 'bytes') + - { yaxes: $.yaxes('bytes') }, - ) - ) - .addRow( - $.row('Checkpoint') - .addPanel( - $.panel('Checkpoint creation/deletion / sec') + - $.queryPanel('rate(cortex_ingester_checkpoint_creations_total{%s}[$__rate_interval])' % $.jobMatcher($._config.job_names.ingester), '{{%s}}-creation' % $._config.per_instance_label) + - $.queryPanel('rate(cortex_ingester_checkpoint_deletions_total{%s}[$__rate_interval])' % $.jobMatcher($._config.job_names.ingester), '{{%s}}-deletion' % $._config.per_instance_label), - ) - .addPanel( - $.panel('Checkpoint creation/deletion failed / sec') + - $.queryPanel('rate(cortex_ingester_checkpoint_creations_failed_total{%s}[$__rate_interval])' % $.jobMatcher($._config.job_names.ingester), '{{%s}}-creation' % $._config.per_instance_label) + - $.queryPanel('rate(cortex_ingester_checkpoint_deletions_failed_total{%s}[$__rate_interval])' % $.jobMatcher($._config.job_names.ingester), '{{%s}}-deletion' % $._config.per_instance_label), - ) - ), -} diff --git a/cortex-mixin/dashboards/compactor.libsonnet b/cortex-mixin/dashboards/compactor.libsonnet index aeb64491..720b6fff 100644 --- a/cortex-mixin/dashboards/compactor.libsonnet +++ b/cortex-mixin/dashboards/compactor.libsonnet @@ -14,7 +14,6 @@ local utils = import 'mixin-utils/utils.libsonnet'; 'sum(rate(cortex_compactor_runs_failed_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.compactor) ) + $.bars + - { yaxes: $.yaxes('ops') } + $.panelDescription( 'Per-instance runs', ||| @@ -23,7 +22,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ), ) .addPanel( - $.panel('Tenants compaction progress') + + $.timeseriesPanel('Tenants compaction progress') + $.queryPanel(||| ( cortex_compactor_tenants_processing_succeeded{%s} + @@ -44,9 +43,8 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('') .addPanel( - $.panel('Compacted blocks / sec') + + $.timeseriesPanel('Compacted blocks / sec', unit='ops') + $.queryPanel('sum(rate(prometheus_tsdb_compactions_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.compactor), 'blocks') + - { yaxes: $.yaxes('ops') } + $.panelDescription( 'Compacted blocks / sec', ||| @@ -55,7 +53,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ), ) .addPanel( - $.panel('Per-block compaction duration') + + $.timeseriesPanel('Per-block compaction duration', unit='s') + $.latencyPanel('prometheus_tsdb_compaction_duration_seconds', '{%s}' % $.jobMatcher($._config.job_names.compactor)) + $.panelDescription( 'Per-block compaction duration', @@ -68,11 +66,11 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('') .addPanel( - $.panel('Average blocks / tenant') + + $.timeseriesPanel('Average blocks / tenant') + $.queryPanel('avg(max by(user) (cortex_bucket_blocks_count{%s}))' % $.jobMatcher($._config.job_names.compactor), 'avg'), ) .addPanel( - 
$.panel('Tenants with largest number of blocks') + + $.timeseriesPanel('Tenants with largest number of blocks') + $.queryPanel('topk(10, max by(user) (cortex_bucket_blocks_count{%s}))' % $.jobMatcher($._config.job_names.compactor), '{{user}}') + $.panelDescription( 'Tenants with largest number of blocks', @@ -85,9 +83,8 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('Garbage Collector') .addPanel( - $.panel('Blocks marked for deletion / sec') + - $.queryPanel('sum(rate(cortex_compactor_blocks_marked_for_deletion_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.compactor), 'blocks') + - { yaxes: $.yaxes('ops') }, + $.timeseriesPanel('Blocks marked for deletion / sec', unit='ops') + + $.queryPanel('sum(rate(cortex_compactor_blocks_marked_for_deletion_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.compactor), 'blocks'), ) .addPanel( $.successFailurePanel( @@ -111,7 +108,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) + { yaxes: $.yaxes('ops') } ) .addPanel( - $.panel('Metadata Sync Duration') + + $.timeseriesPanel('Metadata Sync Duration', unit='ms') + // This metric tracks the duration of a per-tenant metadata sync. $.latencyPanel('cortex_compactor_meta_sync_duration_seconds', '{%s}' % $.jobMatcher($._config.job_names.compactor)), ) diff --git a/cortex-mixin/dashboards/comparison.libsonnet b/cortex-mixin/dashboards/comparison.libsonnet deleted file mode 100644 index 1716f7d4..00000000 --- a/cortex-mixin/dashboards/comparison.libsonnet +++ /dev/null @@ -1,105 +0,0 @@ -local utils = import 'mixin-utils/utils.libsonnet'; - -(import 'dashboard-utils.libsonnet') -{ - 'cortex-blocks-vs-chunks.json': - ($.dashboard('Cortex / Blocks vs Chunks') + { uid: '0e2b4dd23df9921972e3fb554c0fc483' }) - .addMultiTemplate('cluster', 'kube_pod_container_info{image=~".*cortex.*"}', 'cluster') - .addTemplate('blocks_namespace', 'kube_pod_container_info{image=~".*cortex.*"}', 'namespace') - .addTemplate('chunks_namespace', 'kube_pod_container_info{image=~".*cortex.*"}', 'namespace') - .addRow( - $.row('Ingesters') - .addPanel( - $.panel('Samples / sec') + - $.queryPanel('sum(rate(cortex_ingester_ingested_samples_total{cluster=~"$cluster",job=~"($blocks_namespace)/ingester"}[$__rate_interval]))', 'blocks') + - $.queryPanel('sum(rate(cortex_ingester_ingested_samples_total{cluster=~"$cluster",job=~"($chunks_namespace)/ingester"}[$__rate_interval]))', 'chunks') - ) - ) - .addRow( - $.row('') - .addPanel( - $.panel('Blocks Latency') + - utils.latencyRecordingRulePanel('cortex_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($blocks_namespace)/ingester'), utils.selector.eq('route', '/cortex.Ingester/Push')]) - ) - .addPanel( - $.panel('Chunks Latency') + - utils.latencyRecordingRulePanel('cortex_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($chunks_namespace)/ingester'), utils.selector.eq('route', '/cortex.Ingester/Push')]) - ) - ) - .addRow( - $.row('') - .addPanel( - $.panel('CPU per sample') + - $.queryPanel('sum(rate(container_cpu_usage_seconds_total{cluster=~"$cluster",namespace="$blocks_namespace",container="ingester"}[$__rate_interval])) / sum(rate(cortex_ingester_ingested_samples_total{cluster=~"$cluster",job="$blocks_namespace/ingester"}[$__rate_interval]))', 'blocks') + - $.queryPanel('sum(rate(container_cpu_usage_seconds_total{cluster=~"$cluster",namespace="$chunks_namespace",container="ingester"}[$__rate_interval])) / 
sum(rate(cortex_ingester_ingested_samples_total{cluster=~"$cluster",job="$chunks_namespace/ingester"}[$__rate_interval]))', 'chunks') - ) - .addPanel( - $.panel('Memory per active series') + - $.queryPanel('sum(container_memory_working_set_bytes{cluster=~"$cluster",namespace="$blocks_namespace",container="ingester"}) / sum(cortex_ingester_memory_series{cluster=~"$cluster",job=~"$blocks_namespace/ingester"})', 'blocks - working set') + - $.queryPanel('sum(container_memory_working_set_bytes{cluster=~"$cluster",namespace="$chunks_namespace",container="ingester"}) / sum(cortex_ingester_memory_series{cluster=~"$cluster",job=~"$chunks_namespace/ingester"})', 'chunks - working set') + - $.queryPanel('sum(go_memstats_heap_inuse_bytes{cluster=~"$cluster",job=~"$blocks_namespace/ingester"}) / sum(cortex_ingester_memory_series{cluster=~"$cluster",job=~"$blocks_namespace/ingester"})', 'blocks - heap inuse') + - $.queryPanel('sum(go_memstats_heap_inuse_bytes{cluster=~"$cluster",job=~"$chunks_namespace/ingester"}) / sum(cortex_ingester_memory_series{cluster=~"$cluster",job=~"$chunks_namespace/ingester"})', 'chunks - heap inuse') + - { yaxes: $.yaxes('bytes') } - ) - ) - .addRow( - $.row('') - .addPanel( - $.panel('CPU') + - $.queryPanel('sum(rate(container_cpu_usage_seconds_total{cluster=~"$cluster",namespace="$blocks_namespace",container="ingester"}[$__rate_interval]))', 'blocks') + - $.queryPanel('sum(rate(container_cpu_usage_seconds_total{cluster=~"$cluster",namespace="$chunks_namespace",container="ingester"}[$__rate_interval]))', 'chunks') - ) - .addPanel( - $.panel('Memory') + - $.queryPanel('sum(container_memory_working_set_bytes{cluster=~"$cluster",namespace="$blocks_namespace",container="ingester"})', 'blocks - working set') + - $.queryPanel('sum(container_memory_working_set_bytes{cluster=~"$cluster",namespace="$chunks_namespace",container="ingester"})', 'chunks - working set') + - $.queryPanel('sum(go_memstats_heap_inuse_bytes{cluster=~"$cluster",job=~"$blocks_namespace/ingester"})', 'blocks - heap inuse') + - $.queryPanel('sum(go_memstats_heap_inuse_bytes{cluster=~"$cluster",job=~"$chunks_namespace/ingester"})', 'chunks - heap inuse') + - { yaxes: $.yaxes('bytes') } - ) - ) - .addRow( - $.row('Queriers') - .addPanel( - $.panel('Queries / sec (query-frontend)') + - $.queryPanel('sum(rate(cortex_request_duration_seconds_count{cluster=~"$cluster",job="$blocks_namespace/query-frontend",route!="metrics"}[$__rate_interval]))', 'blocks') + - $.queryPanel('sum(rate(cortex_request_duration_seconds_count{cluster=~"$cluster",job="$chunks_namespace/query-frontend",route!="metrics"}[$__rate_interval]))', 'chunks') - ) - .addPanel( - $.panel('Queries / sec (query-tee)') + - $.queryPanel('sum(rate(cortex_querytee_request_duration_seconds_count{cluster=~"$cluster",backend=~".*\\\\.$blocks_namespace\\\\..*"}[$__rate_interval]))', 'blocks') + - $.queryPanel('sum(rate(cortex_querytee_request_duration_seconds_count{cluster=~"$cluster",backend=~".*\\\\.$chunks_namespace\\\\..*"}[$__rate_interval]))', 'chunks') - ) - ) - .addRow( - $.row('') - .addPanel( - $.panel('Latency 99th') + - $.queryPanel('histogram_quantile(0.99, sum by(backend, le) (rate(cortex_querytee_request_duration_seconds_bucket{cluster=~"$cluster",backend=~".*\\\\.$blocks_namespace\\\\..*"}[$__rate_interval])))', 'blocks') + - $.queryPanel('histogram_quantile(0.99, sum by(backend, le) (rate(cortex_querytee_request_duration_seconds_bucket{cluster=~"$cluster",backend=~".*\\\\.$chunks_namespace\\\\..*"}[$__rate_interval])))', 'chunks') + - { yaxes: 
$.yaxes('s') } - ) - .addPanel( - $.panel('Latency average') + - $.queryPanel('sum by(backend) (rate(cortex_querytee_request_duration_seconds_sum{cluster=~"$cluster",backend=~".*\\\\.$blocks_namespace\\\\..*"}[$__rate_interval])) / sum by(backend) (rate(cortex_querytee_request_duration_seconds_count{cluster=~"$cluster",backend=~".*\\\\.$blocks_namespace\\\\..*"}[$__rate_interval]))', 'blocks') + - $.queryPanel('sum by(backend) (rate(cortex_querytee_request_duration_seconds_sum{cluster=~"$cluster",backend=~".*\\\\.$chunks_namespace\\\\..*"}[$__rate_interval])) / sum by(backend) (rate(cortex_querytee_request_duration_seconds_count{cluster=~"$cluster",backend=~".*\\\\.$chunks_namespace\\\\..*"}[$__rate_interval]))', 'chunks') + - { yaxes: $.yaxes('s') } - ) - ) - .addRow( - $.row('') - .addPanel( - $.panel('CPU') + - $.queryPanel('sum(rate(container_cpu_usage_seconds_total{cluster=~"$cluster",namespace="$blocks_namespace",container="querier"}[$__rate_interval]))', 'blocks') + - $.queryPanel('sum(rate(container_cpu_usage_seconds_total{cluster=~"$cluster",namespace="$chunks_namespace",container="querier"}[$__rate_interval]))', 'chunks') - ) - .addPanel( - $.panel('Memory') + - $.queryPanel('sum(container_memory_working_set_bytes{cluster=~"$cluster",namespace="$blocks_namespace",container="querier"})', 'blocks - working set') + - $.queryPanel('sum(container_memory_working_set_bytes{cluster=~"$cluster",namespace="$chunks_namespace",container="querier"})', 'chunks - working set') + - $.queryPanel('sum(go_memstats_heap_inuse_bytes{cluster=~"$cluster",job=~"$blocks_namespace/querier"})', 'blocks - heap inuse') + - $.queryPanel('sum(go_memstats_heap_inuse_bytes{cluster=~"$cluster",job=~"$chunks_namespace/querier"})', 'chunks - heap inuse') + - { yaxes: $.yaxes('bytes') } - ) - ), -} diff --git a/cortex-mixin/dashboards/config.libsonnet b/cortex-mixin/dashboards/config.libsonnet index 9240ef89..10692a3d 100644 --- a/cortex-mixin/dashboards/config.libsonnet +++ b/cortex-mixin/dashboards/config.libsonnet @@ -8,19 +8,17 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('Startup config file') .addPanel( - $.panel('Startup config file hashes') + + $.timeseriesPanel('Startup config file hashes', unit='instances') + $.queryPanel('count(cortex_config_hash{%s}) by (sha256)' % $.namespaceMatcher(), 'sha256:{{sha256}}') + - $.stack + - { yaxes: $.yaxes('instances') }, + $.stack, ) ) .addRow( $.row('Runtime config file') .addPanel( - $.panel('Runtime config file hashes') + + $.timeseriesPanel('Runtime config file hashes', unit='instances') + $.queryPanel('count(cortex_runtime_config_hash{%s}) by (sha256)' % $.namespaceMatcher(), 'sha256:{{sha256}}') + - $.stack + - { yaxes: $.yaxes('instances') }, + $.stack, ) ), } diff --git a/cortex-mixin/dashboards/dashboard-utils.libsonnet b/cortex-mixin/dashboards/dashboard-utils.libsonnet index e4268192..3d9eea30 100644 --- a/cortex-mixin/dashboards/dashboard-utils.libsonnet +++ b/cortex-mixin/dashboards/dashboard-utils.libsonnet @@ -62,6 +62,44 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addTemplate('cluster', 'cortex_build_info', 'cluster') .addTemplate('namespace', 'cortex_build_info{cluster=~"$cluster"}', 'namespace'), }, + timeseriesPanel(title, unit='short'):: { + datasource: '$datasource', + fieldConfig: { + defaults: { + custom: { + drawStyle: 'line', + fillOpacity: 1, + lineWidth: 1, + pointSize: 5, + showPoints: 'never', + spanNulls: false, + stacking: { + group: 'A', + mode: 'none', + }, + }, + thresholds: { + mode: 'absolute', + 
steps: [], + }, + unit: unit, + }, + overrides: [], + }, + options: { + legend: { + showLegend: true, + }, + tooltip: { + mode: 'single', + sort: 'none', + }, + }, + links: [], + targets: [], + title: title, + type: 'timeseries', + }, // The mixin allow specialism of the job selector depending on if its a single binary // deployment or a namespaced one. @@ -108,6 +146,35 @@ local utils = import 'mixin-utils/utils.libsonnet'; } for target in super.targets ], + fieldConfig+: { + defaults+: { + custom+: { + lineWidth: 0, + fillOpacity: 100, // Get solid fill. + stacking: { + mode: 'normal', + group: 'A', + }, + }, + unit: 'reqps', + min: 0, + }, + overrides+: [{ + matcher: { + id: 'byName', + options: status, + }, + properties: [ + { + id: 'color', + value: { + mode: 'fixed', + fixedColor: $.httpStatusColors[status], + }, + }, + ], + } for status in std.objectFieldsAll($.httpStatusColors)], + }, }, latencyPanel(metricName, selector, multiplier='1e3'):: @@ -121,7 +188,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; }, successFailurePanel(title, successMetric, failureMetric):: - $.panel(title) + + $.timeseriesPanel(title, unit='short') + $.queryPanel([successMetric, failureMetric], ['successful', 'failed']) + $.stack + { aliasColors: { @@ -132,7 +199,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; // Displays started, completed and failed rate. startedCompletedFailedPanel(title, startedMetric, completedMetric, failedMetric):: - $.panel(title) + + $.timeseriesPanel(title, unit='ops') + $.queryPanel([startedMetric, completedMetric, failedMetric], ['started', 'completed', 'failed']) + $.stack + { aliasColors: { @@ -143,7 +210,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; }, containerCPUUsagePanel(title, containerName):: - $.panel(title) + + $.timeseriesPanel(title) + $.queryPanel([ 'sum by(%s) (rate(container_cpu_usage_seconds_total{%s,container=~"%s"}[$__rate_interval]))' % [$._config.per_instance_label, $.namespaceMatcher(), containerName], 'min(container_spec_cpu_quota{%s,container=~"%s"} / container_spec_cpu_period{%s,container=~"%s"})' % [$.namespaceMatcher(), containerName, $.namespaceMatcher(), containerName], @@ -160,7 +227,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; }, containerMemoryWorkingSetPanel(title, containerName):: - $.panel(title) + + $.timeseriesPanel(title, unit='bytes') + $.queryPanel([ // We use "max" instead of "sum" otherwise during a rolling update of a statefulset we will end up // summing the memory of the old instance/pod (whose metric will be stale for 5m) to the new instance/pod. 
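// Aside (illustrative only, not part of this patch): a minimal sketch of how the new
// timeseriesPanel helper composes with the existing queryPanel and jobMatcher utilities,
// mirroring the pattern used throughout the converted dashboards in this diff. The
// dashboard file name, title, and row below are hypothetical examples; the metric and
// helper names are taken from this repo.
(import 'dashboard-utils.libsonnet') {
  'cortex-example.json':
    $.dashboard('Cortex / Example')
    .addRow(
      $.row('Writes')
      .addPanel(
        // timeseriesPanel supplies the Grafana timeseries scaffolding and unit;
        // queryPanel attaches the PromQL target and its legend format.
        $.timeseriesPanel('Samples / sec', unit='ops') +
        $.queryPanel(
          'sum(rate(cortex_ingester_ingested_samples_total{%s}[$__rate_interval]))'
          % $.jobMatcher($._config.job_names.ingester),
          'samples'
        )
      )
    ),
}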
@@ -180,7 +247,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; }, containerNetworkPanel(title, metric, instanceName):: - $.panel(title) + + $.timeseriesPanel(title, unit='Bps') + $.queryPanel( 'sum by(%(instance)s) (rate(%(metric)s{%(namespace)s,%(instance)s=~"%(instanceName)s"}[$__rate_interval]))' % { namespace: $.namespaceMatcher(), @@ -199,7 +266,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.containerNetworkPanel('Transmit Bandwidth', 'container_network_transmit_bytes_total', instanceName), containerDiskWritesPanel(title, containerName):: - $.panel(title) + + $.timeseriesPanel(title, unit='Bps') + $.queryPanel( ||| sum by(%s, %s, device) ( @@ -220,7 +287,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; { yaxes: $.yaxes('Bps') }, containerDiskReadsPanel(title, containerName):: - $.panel(title) + + $.timeseriesPanel(title, unit='Bps') + $.queryPanel( ||| sum by(%s, %s, device) ( @@ -239,7 +306,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; { yaxes: $.yaxes('Bps') }, containerDiskSpaceUtilization(title, containerName):: - $.panel(title) + + $.timeseriesPanel(title, unit='percentunit') + $.queryPanel( ||| max by(persistentvolumeclaim) ( @@ -266,7 +333,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; else 'label_name="%s"' % containerName, goHeapInUsePanel(title, jobName):: - $.panel(title) + + $.timeseriesPanel(title, unit='bytes') + $.queryPanel( 'sum by(%s) (go_memstats_heap_inuse_bytes{%s})' % [$._config.per_instance_label, $.jobMatcher(jobName)], '{{%s}}' % $._config.per_instance_label @@ -361,39 +428,38 @@ local utils = import 'mixin-utils/utils.libsonnet'; getObjectStoreRows(title, component):: [ super.row(title) .addPanel( - $.panel('Operations / sec') + + $.timeseriesPanel('Operations / sec', unit='rps') + $.queryPanel('sum by(operation) (rate(thanos_objstore_bucket_operations_total{%s,component="%s"}[$__rate_interval]))' % [$.namespaceMatcher(), component], '{{operation}}') + - $.stack + - { yaxes: $.yaxes('rps') }, + $.stack ) .addPanel( - $.panel('Error rate') + + $.timeseriesPanel('Error rate', unit='percentunit') + $.queryPanel('sum by(operation) (rate(thanos_objstore_bucket_operation_failures_total{%s,component="%s"}[$__rate_interval])) / sum by(operation) (rate(thanos_objstore_bucket_operations_total{%s,component="%s"}[$__rate_interval]))' % [$.namespaceMatcher(), component, $.namespaceMatcher(), component], '{{operation}}') + { yaxes: $.yaxes('percentunit') }, ) .addPanel( - $.panel('Latency of Op: Attributes') + + $.timeseriesPanel('Latency of Op: Attributes') + $.latencyPanel('thanos_objstore_bucket_operation_duration_seconds', '{%s,component="%s",operation="attributes"}' % [$.namespaceMatcher(), component]), ) .addPanel( - $.panel('Latency of Op: Exists') + + $.timeseriesPanel('Latency of Op: Exists') + $.latencyPanel('thanos_objstore_bucket_operation_duration_seconds', '{%s,component="%s",operation="exists"}' % [$.namespaceMatcher(), component]), ), $.row('') .addPanel( - $.panel('Latency of Op: Get') + + $.timeseriesPanel('Latency of Op: Get') + $.latencyPanel('thanos_objstore_bucket_operation_duration_seconds', '{%s,component="%s",operation="get"}' % [$.namespaceMatcher(), component]), ) .addPanel( - $.panel('Latency of Op: GetRange') + + $.timeseriesPanel('Latency of Op: GetRange') + $.latencyPanel('thanos_objstore_bucket_operation_duration_seconds', '{%s,component="%s",operation="get_range"}' % [$.namespaceMatcher(), component]), ) .addPanel( - $.panel('Latency of Op: Upload') + + $.timeseriesPanel('Latency of Op: 
Upload') + $.latencyPanel('thanos_objstore_bucket_operation_duration_seconds', '{%s,component="%s",operation="upload"}' % [$.namespaceMatcher(), component]), ) .addPanel( - $.panel('Latency of Op: Delete') + + $.timeseriesPanel('Latency of Op: Delete') + $.latencyPanel('thanos_objstore_bucket_operation_duration_seconds', '{%s,component="%s",operation="delete"}' % [$.namespaceMatcher(), component]), ), ], @@ -406,7 +472,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; }; super.row(title) .addPanel( - $.panel('Requests / sec') + + $.timeseriesPanel('Requests / sec', unit='ops') + $.queryPanel( ||| sum by(operation) ( @@ -425,7 +491,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; { yaxes: $.yaxes('ops') } ) .addPanel( - $.panel('Latency (getmulti)') + + $.timeseriesPanel('Latency (getmulti)') + $.latencyPanel( 'thanos_memcached_operation_duration_seconds', ||| @@ -439,7 +505,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) .addPanel( - $.panel('Hit ratio') + + $.timeseriesPanel('Hit ratio', unit='percentunit') + $.queryPanel( ||| sum( @@ -503,4 +569,42 @@ local utils = import 'mixin-utils/utils.libsonnet'; %s ||| % [title, description], }, + + overrideHidden(name):: + { + matcher: { + id: 'byName', + options: name, + }, + properties: [ + { + id: 'custom.hidden', + value: true, + }, + ], + }, + + overrideDisplayName(name, displayName):: + { + matcher: { + id: 'byName', + options: name, + }, + properties: [ + { + id: 'displayName', + value: displayName, + }, + ], + }, + + + tablePanel(queries, overrides):: + super.tablePanel(queries, {}) + { + fieldConfig+: { + overrides+: overrides, + }, + styles:: null, + }, + } diff --git a/cortex-mixin/dashboards/object-store.libsonnet b/cortex-mixin/dashboards/object-store.libsonnet index 69e257b6..d58976a2 100644 --- a/cortex-mixin/dashboards/object-store.libsonnet +++ b/cortex-mixin/dashboards/object-store.libsonnet @@ -7,58 +7,54 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('Components') .addPanel( - $.panel('RPS / component') + + $.timeseriesPanel('RPS / component', unit='rps') + $.queryPanel('sum by(component) (rate(thanos_objstore_bucket_operations_total{%s}[$__rate_interval]))' % $.namespaceMatcher(), '{{component}}') + - $.stack + - { yaxes: $.yaxes('rps') }, + $.stack, ) .addPanel( - $.panel('Error rate / component') + - $.queryPanel('sum by(component) (rate(thanos_objstore_bucket_operation_failures_total{%s}[$__rate_interval])) / sum by(component) (rate(thanos_objstore_bucket_operations_total{%s}[$__rate_interval]))' % [$.namespaceMatcher(), $.namespaceMatcher()], '{{component}}') + - { yaxes: $.yaxes('percentunit') }, + $.timeseriesPanel('Error rate / component', unit='percentunit') + + $.queryPanel('sum by(component) (rate(thanos_objstore_bucket_operation_failures_total{%s}[$__rate_interval])) / sum by(component) (rate(thanos_objstore_bucket_operations_total{%s}[$__rate_interval]))' % [$.namespaceMatcher(), $.namespaceMatcher()], '{{component}}') ) ) .addRow( $.row('Operations') .addPanel( - $.panel('RPS / operation') + + $.timeseriesPanel('RPS / operation', unit='rps') + $.queryPanel('sum by(operation) (rate(thanos_objstore_bucket_operations_total{%s}[$__rate_interval]))' % $.namespaceMatcher(), '{{operation}}') + - $.stack + - { yaxes: $.yaxes('rps') }, + $.stack, ) .addPanel( - $.panel('Error rate / operation') + - $.queryPanel('sum by(operation) (rate(thanos_objstore_bucket_operation_failures_total{%s}[$__rate_interval])) / sum by(operation) 
(rate(thanos_objstore_bucket_operations_total{%s}[$__rate_interval]))' % [$.namespaceMatcher(), $.namespaceMatcher()], '{{operation}}') + - { yaxes: $.yaxes('percentunit') }, + $.timeseriesPanel('Error rate / operation', unit='percentunit') + + $.queryPanel('sum by(operation) (rate(thanos_objstore_bucket_operation_failures_total{%s}[$__rate_interval])) / sum by(operation) (rate(thanos_objstore_bucket_operations_total{%s}[$__rate_interval]))' % [$.namespaceMatcher(), $.namespaceMatcher()], '{{operation}}') ) ) .addRow( $.row('') .addPanel( - $.panel('Op: Get') + + $.timeseriesPanel('Op: Get') + $.latencyPanel('thanos_objstore_bucket_operation_duration_seconds', '{%s,operation="get"}' % $.namespaceMatcher()), ) .addPanel( - $.panel('Op: GetRange') + + $.timeseriesPanel('Op: GetRange') + $.latencyPanel('thanos_objstore_bucket_operation_duration_seconds', '{%s,operation="get_range"}' % $.namespaceMatcher()), ) .addPanel( - $.panel('Op: Exists') + + $.timeseriesPanel('Op: Exists') + $.latencyPanel('thanos_objstore_bucket_operation_duration_seconds', '{%s,operation="exists"}' % $.namespaceMatcher()), ) ) .addRow( $.row('') .addPanel( - $.panel('Op: Attributes') + + $.timeseriesPanel('Op: Attributes') + $.latencyPanel('thanos_objstore_bucket_operation_duration_seconds', '{%s,operation="attributes"}' % $.namespaceMatcher()), ) .addPanel( - $.panel('Op: Upload') + + $.timeseriesPanel('Op: Upload') + $.latencyPanel('thanos_objstore_bucket_operation_duration_seconds', '{%s,operation="upload"}' % $.namespaceMatcher()), ) .addPanel( - $.panel('Op: Delete') + + $.timeseriesPanel('Op: Delete') + $.latencyPanel('thanos_objstore_bucket_operation_duration_seconds', '{%s,operation="delete"}' % $.namespaceMatcher()), ) ), diff --git a/cortex-mixin/dashboards/queries.libsonnet b/cortex-mixin/dashboards/queries.libsonnet index 259f5dfa..212ab9d2 100644 --- a/cortex-mixin/dashboards/queries.libsonnet +++ b/cortex-mixin/dashboards/queries.libsonnet @@ -8,34 +8,33 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('Query Frontend') .addPanel( - $.panel('Queue Duration') + + $.timeseriesPanel('Queue Duration', unit='ms') + $.latencyPanel('cortex_query_frontend_queue_duration_seconds', '{%s}' % $.jobMatcher($._config.job_names.query_frontend)), ) .addPanel( - $.panel('Retries') + - $.latencyPanel('cortex_query_frontend_retries', '{%s}' % $.jobMatcher($._config.job_names.query_frontend), multiplier=1) + - { yaxes: $.yaxes('short') }, + $.timeseriesPanel('Retries', unit='short') + + $.latencyPanel('cortex_query_frontend_retries', '{%s}' % $.jobMatcher($._config.job_names.query_frontend), multiplier=1), ) .addPanel( - $.panel('Queue Length') + + $.timeseriesPanel('Queue Length') + $.queryPanel('cortex_query_frontend_queue_length{%s}' % $.jobMatcher($._config.job_names.query_frontend), '{{cluster}} / {{namespace}} / {{%s}}' % $._config.per_instance_label), ) ) .addRow( $.row('Query Scheduler') .addPanel( - $.panel('Queue Duration') + + $.timeseriesPanel('Queue Duration', unit='ms') + $.latencyPanel('cortex_query_scheduler_queue_duration_seconds', '{%s}' % $.jobMatcher($._config.job_names.query_scheduler)), ) .addPanel( - $.panel('Queue Length') + + $.timeseriesPanel('Queue Length') + $.queryPanel('cortex_query_scheduler_queue_length{%s}' % $.jobMatcher($._config.job_names.query_scheduler), '{{cluster}} / {{namespace}} / {{%s}}' % $._config.per_instance_label), ) ) .addRow( $.row('Query Frontend - Query Splitting and Results Cache') .addPanel( - $.panel('Intervals per Query') + + 
$.timeseriesPanel('Intervals per Query') + $.queryPanel('sum(rate(cortex_frontend_split_queries_total{%s}[1m])) / sum(rate(cortex_frontend_query_range_duration_seconds_count{%s, method="split_by_interval"}[1m]))' % [$.jobMatcher($._config.job_names.query_frontend), $.jobMatcher($._config.job_names.query_frontend)], 'splitting rate') + $.panelDescription( 'Intervals per Query', @@ -45,19 +44,25 @@ local utils = import 'mixin-utils/utils.libsonnet'; ), ) .addPanel( - $.panel('Results Cache Hit %') + - $.queryPanel('sum(rate(cortex_cache_hits{name=~"frontend.+", %s}[1m])) / sum(rate(cortex_cache_fetched_keys{name=~"frontend.+", %s}[1m]))' % [$.jobMatcher($._config.job_names.query_frontend), $.jobMatcher($._config.job_names.query_frontend)], 'Hit Rate') + + $.timeseriesPanel('Results Cache Hit %') + + $.queryPanel(||| + sum(rate(cortex_cache_hits{name=~"frontend.+", %(q)s}[1m])) / sum(rate(cortex_cache_fetched_keys{name=~"frontend.+", %(q)s}[1m])) or + sum(rate(cortex_cache_hits_total{name=~"frontend.+", %(q)s}[1m])) / sum(rate(cortex_cache_fetched_keys_total{name=~"frontend.+", %(q)s}[1m])) + ||| % { q: $.jobMatcher($._config.job_names.query_frontend) }, 'Hit Rate') + { yaxes: $.yaxes({ format: 'percentunit', max: 1 }) }, ) .addPanel( - $.panel('Results Cache misses') + - $.queryPanel('sum(rate(cortex_cache_fetched_keys{name=~"frontend.+", %s}[1m])) - sum(rate(cortex_cache_hits{name=~"frontend.+", %s}[1m]))' % [$.jobMatcher($._config.job_names.query_frontend), $.jobMatcher($._config.job_names.query_frontend)], 'Miss Rate'), + $.timeseriesPanel('Results Cache misses') + + $.queryPanel(||| + sum(rate(cortex_cache_fetched_keys{name=~"frontend.+", %(q)s}[1m])) - sum(rate(cortex_cache_hits{name=~"frontend.+", %(q)s}[1m])) or + sum(rate(cortex_cache_fetched_keys_total{name=~"frontend.+", %(q)s}[1m])) - sum(rate(cortex_cache_hits_total{name=~"frontend.+", %(q)s}[1m])) + ||| % { q: $.jobMatcher($._config.job_names.query_frontend) }, 'Miss Rate'), ) ) .addRow( $.row('Query Frontend - Query sharding') .addPanel( - $.panel('Sharded Queries Ratio') + + $.timeseriesPanel('Sharded Queries Ratio') + $.queryPanel(||| sum(rate(cortex_frontend_query_sharding_rewrites_succeeded_total{%s}[$__rate_interval])) / sum(rate(cortex_frontend_query_sharding_rewrites_attempted_total{%s}[$__rate_interval])) @@ -72,9 +77,8 @@ local utils = import 'mixin-utils/utils.libsonnet'; ), ) .addPanel( - $.panel('Number of Sharded Queries per Query') + + $.timeseriesPanel('Number of Sharded Queries per Query', unit='short') + $.latencyPanel('cortex_frontend_sharded_queries_per_query', '{%s}' % $.jobMatcher($._config.job_names.query_frontend), multiplier=1) + - { yaxes: $.yaxes('short') } + $.panelDescription( 'Number of Sharded Queries per Query', ||| @@ -87,94 +91,50 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('Querier') .addPanel( - $.panel('Stages') + + $.timeseriesPanel('Stages', unit='ms') + $.queryPanel('max by (slice) (prometheus_engine_query_duration_seconds{quantile="0.9",%s}) * 1e3' % $.jobMatcher($._config.job_names.querier), '{{slice}}') + - { yaxes: $.yaxes('ms') } + $.stack, ) .addPanel( - $.panel('Chunk cache misses') + - $.queryPanel('sum(rate(cortex_cache_fetched_keys{%s,name="chunksmemcache"}[1m])) - sum(rate(cortex_cache_hits{%s,name="chunksmemcache"}[1m]))' % [$.jobMatcher($._config.job_names.querier), $.jobMatcher($._config.job_names.querier)], 'Hit rate'), + $.timeseriesPanel('Chunk cache misses') + + $.queryPanel(||| + 
sum(rate(cortex_cache_fetched_keys{%(q)s,name="chunksmemcache"}[1m])) - sum(rate(cortex_cache_hits{%(q)s,name="chunksmemcache"}[1m])) or + sum(rate(cortex_cache_fetched_keys_total{%(q)s,name="chunksmemcache"}[1m])) - sum(rate(cortex_cache_hits_total{%(q)s,name="chunksmemcache"}[1m])) + ||| % { q: $.jobMatcher($._config.job_names.querier) }, 'Miss rate'), ) .addPanel( - $.panel('Chunk cache corruptions') + + $.timeseriesPanel('Chunk cache corruptions') + $.queryPanel('sum(rate(cortex_cache_corrupt_chunks_total{%s}[1m]))' % $.jobMatcher($._config.job_names.querier), 'Corrupt chunks'), ) ) - .addRowIf( - std.member($._config.storage_engine, 'chunks'), - $.row('Querier - Chunks storage - Index Cache') - .addPanel( - $.panel('Total entries') + - $.queryPanel('sum(querier_cache_added_new_total{cache="store.index-cache-read.fifocache",%s}) - sum(querier_cache_evicted_total{cache="store.index-cache-read.fifocache",%s})' % [$.jobMatcher($._config.job_names.querier), $.jobMatcher($._config.job_names.querier)], 'Entries'), - ) - .addPanel( - $.panel('Cache Hit %') + - $.queryPanel('(sum(rate(querier_cache_gets_total{cache="store.index-cache-read.fifocache",%s}[1m])) - sum(rate(querier_cache_misses_total{cache="store.index-cache-read.fifocache",%s}[1m]))) / sum(rate(querier_cache_gets_total{cache="store.index-cache-read.fifocache",%s}[1m]))' % [$.jobMatcher($._config.job_names.querier), $.jobMatcher($._config.job_names.querier), $.jobMatcher($._config.job_names.querier)], 'hit rate') - { yaxes: $.yaxes({ format: 'percentunit', max: 1 }) }, - ) - .addPanel( - $.panel('Churn Rate') + - $.queryPanel('sum(rate(querier_cache_evicted_total{cache="store.index-cache-read.fifocache",%s}[1m]))' % $.jobMatcher($._config.job_names.querier), 'churn rate'), - ) - ) .addRow( $.row('Ingester') .addPanel( - $.panel('Series per Query') + - utils.latencyRecordingRulePanel('cortex_ingester_queried_series', $.jobSelector($._config.job_names.ingester), multiplier=1) + - { yaxes: $.yaxes('short') }, + $.timeseriesPanel('Series per Query', unit='short') + + utils.latencyRecordingRulePanel('cortex_ingester_queried_series', $.jobSelector($._config.job_names.ingester), multiplier=1), ) .addPanel( - $.panel('Chunks per Query') + - utils.latencyRecordingRulePanel('cortex_ingester_queried_chunks', $.jobSelector($._config.job_names.ingester), multiplier=1) + - { yaxes: $.yaxes('short') }, + $.timeseriesPanel('Chunks per Query', unit='short') + + utils.latencyRecordingRulePanel('cortex_ingester_queried_chunks', $.jobSelector($._config.job_names.ingester), multiplier=1), ) .addPanel( - $.panel('Samples per Query') + - utils.latencyRecordingRulePanel('cortex_ingester_queried_samples', $.jobSelector($._config.job_names.ingester), multiplier=1) + - { yaxes: $.yaxes('short') }, - ) - ) - .addRowIf( - std.member($._config.storage_engine, 'chunks'), - $.row('Querier - Chunks storage - Store') - .addPanel( - $.panel('Index Lookups per Query') + - utils.latencyRecordingRulePanel('cortex_chunk_store_index_lookups_per_query', $.jobSelector($._config.job_names.querier), multiplier=1) + - { yaxes: $.yaxes('short') }, - ) - .addPanel( - $.panel('Series (pre-intersection) per Query') + - utils.latencyRecordingRulePanel('cortex_chunk_store_series_pre_intersection_per_query', $.jobSelector($._config.job_names.querier), multiplier=1) + - { yaxes: $.yaxes('short') }, - ) - .addPanel( - $.panel('Series (post-intersection) per Query') + - utils.latencyRecordingRulePanel('cortex_chunk_store_series_post_intersection_per_query', 
$.jobSelector($._config.job_names.querier), multiplier=1) + - { yaxes: $.yaxes('short') }, - ) - .addPanel( - $.panel('Chunks per Query') + - utils.latencyRecordingRulePanel('cortex_chunk_store_chunks_per_query', $.jobSelector($._config.job_names.querier), multiplier=1) + - { yaxes: $.yaxes('short') }, + $.timeseriesPanel('Samples per Query', unit='short') + + utils.latencyRecordingRulePanel('cortex_ingester_queried_samples', $.jobSelector($._config.job_names.ingester), multiplier=1), ) ) .addRowIf( std.member($._config.storage_engine, 'blocks'), $.row('Querier - Blocks storage') .addPanel( - $.panel('Number of store-gateways hit per Query') + - $.latencyPanel('cortex_querier_storegateway_instances_hit_per_query', '{%s}' % $.jobMatcher($._config.job_names.querier), multiplier=1) + - { yaxes: $.yaxes('short') }, + $.timeseriesPanel('Number of store-gateways hit per Query', unit='short') + + $.latencyPanel('cortex_querier_storegateway_instances_hit_per_query', '{%s}' % $.jobMatcher($._config.job_names.querier), multiplier=1), ) .addPanel( - $.panel('Refetches of missing blocks per Query') + - $.latencyPanel('cortex_querier_storegateway_refetches_per_query', '{%s}' % $.jobMatcher($._config.job_names.querier), multiplier=1) + - { yaxes: $.yaxes('short') }, + $.timeseriesPanel('Refetches of missing blocks per Query', unit='short') + + $.latencyPanel('cortex_querier_storegateway_refetches_per_query', '{%s}' % $.jobMatcher($._config.job_names.querier), multiplier=1), ) .addPanel( - $.panel('Consistency checks failed') + + $.timeseriesPanel('Consistency checks failed') + $.queryPanel('sum(rate(cortex_querier_blocks_consistency_checks_failed_total{%s}[1m])) / sum(rate(cortex_querier_blocks_consistency_checks_total{%s}[1m]))' % [$.jobMatcher($._config.job_names.querier), $.jobMatcher($._config.job_names.querier)], 'Failure Rate') + { yaxes: $.yaxes({ format: 'percentunit', max: 1 }) }, ) @@ -183,13 +143,12 @@ local utils = import 'mixin-utils/utils.libsonnet'; std.member($._config.storage_engine, 'blocks'), $.row('') .addPanel( - $.panel('Bucket indexes loaded (per querier)') + + $.timeseriesPanel('Bucket indexes loaded (per querier)', unit='short') + $.queryPanel([ 'max(cortex_bucket_index_loaded{%s})' % $.jobMatcher($._config.job_names.querier), 'min(cortex_bucket_index_loaded{%s})' % $.jobMatcher($._config.job_names.querier), 'avg(cortex_bucket_index_loaded{%s})' % $.jobMatcher($._config.job_names.querier), - ], ['Max', 'Min', 'Average']) + - { yaxes: $.yaxes('short') }, + ], ['Max', 'Min', 'Average']), ) .addPanel( $.successFailurePanel( @@ -199,7 +158,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) .addPanel( - $.panel('Bucket indexes load latency') + + $.timeseriesPanel('Bucket indexes load latency', unit='ms') + $.latencyPanel('cortex_bucket_index_load_duration_seconds', '{%s}' % $.jobMatcher($._config.job_names.querier)), ) ) @@ -207,36 +166,33 @@ local utils = import 'mixin-utils/utils.libsonnet'; std.member($._config.storage_engine, 'blocks'), $.row('Store-gateway - Blocks storage') .addPanel( - $.panel('Blocks queried / sec') + - $.queryPanel('sum(rate(cortex_bucket_store_series_blocks_queried_sum{component="store-gateway",%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.store_gateway), 'blocks') + - { yaxes: $.yaxes('ops') }, + $.timeseriesPanel('Blocks queried / sec', unit='ops') + + $.queryPanel('sum(rate(cortex_bucket_store_series_blocks_queried_sum{component="store-gateway",%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.store_gateway), 'blocks'), 
) .addPanel( - $.panel('Data fetched / sec') + + $.timeseriesPanel('Data fetched / sec', unit='ops') + $.queryPanel('sum by(data_type) (rate(cortex_bucket_store_series_data_fetched_sum{component="store-gateway",%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.store_gateway), '{{data_type}}') + - $.stack + - { yaxes: $.yaxes('ops') }, + $.stack, ) .addPanel( - $.panel('Data touched / sec') + + $.timeseriesPanel('Data touched / sec', unit='ops') + $.queryPanel('sum by(data_type) (rate(cortex_bucket_store_series_data_touched_sum{component="store-gateway",%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.store_gateway), '{{data_type}}') + - $.stack + - { yaxes: $.yaxes('ops') }, + $.stack, ) ) .addRowIf( std.member($._config.storage_engine, 'blocks'), $.row('') .addPanel( - $.panel('Series fetch duration (per request)') + + $.timeseriesPanel('Series fetch duration (per request)') + $.latencyPanel('cortex_bucket_store_series_get_all_duration_seconds', '{component="store-gateway",%s}' % $.jobMatcher($._config.job_names.store_gateway)), ) .addPanel( - $.panel('Series merge duration (per request)') + + $.timeseriesPanel('Series merge duration (per request)') + $.latencyPanel('cortex_bucket_store_series_merge_duration_seconds', '{component="store-gateway",%s}' % $.jobMatcher($._config.job_names.store_gateway)), ) .addPanel( - $.panel('Series returned (per request)') + + $.timeseriesPanel('Series returned (per request)') + $.queryPanel('sum(rate(cortex_bucket_store_series_result_series_sum{component="store-gateway",%s}[$__rate_interval])) / sum(rate(cortex_bucket_store_series_result_series_count{component="store-gateway",%s}[$__rate_interval]))' % [$.jobMatcher($._config.job_names.store_gateway), $.jobMatcher($._config.job_names.store_gateway)], 'avg series returned'), ) ) @@ -244,8 +200,8 @@ local utils = import 'mixin-utils/utils.libsonnet'; std.member($._config.storage_engine, 'blocks'), $.row('') .addPanel( - $.panel('Blocks currently loaded') + - $.queryPanel('cortex_bucket_store_blocks_loaded{component="store-gateway",%s}' % $.jobMatcher($._config.job_names.store_gateway), '{{%s}}' % $._config.per_instance_label) + $.timeseriesPanel('Blocks currently loaded') + + $.queryPanel('sum(cortex_bucket_store_blocks_loaded{component="store-gateway",%s}) without (user)' % $.jobMatcher($._config.job_names.store_gateway), '{{%s}}' % $._config.per_instance_label) ) .addPanel( $.successFailurePanel( @@ -266,15 +222,15 @@ local utils = import 'mixin-utils/utils.libsonnet'; std.member($._config.storage_engine, 'blocks'), $.row('') .addPanel( - $.panel('Lazy loaded index-headers') + + $.timeseriesPanel('Lazy loaded index-headers') + $.queryPanel('cortex_bucket_store_indexheader_lazy_load_total{%s} - cortex_bucket_store_indexheader_lazy_unload_total{%s}' % [$.jobMatcher($._config.job_names.store_gateway), $.jobMatcher($._config.job_names.store_gateway)], '{{%s}}' % $._config.per_instance_label) ) .addPanel( - $.panel('Index-header lazy load duration') + + $.timeseriesPanel('Index-header lazy load duration', unit='ms') + $.latencyPanel('cortex_bucket_store_indexheader_lazy_load_duration_seconds', '{%s}' % $.jobMatcher($._config.job_names.store_gateway)), ) .addPanel( - $.panel('Series hash cache hit ratio') + + $.timeseriesPanel('Series hash cache hit ratio') + $.queryPanel(||| sum(rate(cortex_bucket_store_series_hash_cache_hits_total{%s}[$__rate_interval])) / diff --git a/cortex-mixin/dashboards/reads-resources.libsonnet b/cortex-mixin/dashboards/reads-resources.libsonnet index 
f0750c88..437a57a2 100644 --- a/cortex-mixin/dashboards/reads-resources.libsonnet +++ b/cortex-mixin/dashboards/reads-resources.libsonnet @@ -67,7 +67,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('Ruler') .addPanel( - $.panel('Rules') + + $.timeseriesPanel('Rules') + $.queryPanel( 'sum by(%s) (cortex_prometheus_rule_group_rules{%s})' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.ruler)], '{{%s}}' % $._config.per_instance_label diff --git a/cortex-mixin/dashboards/reads.libsonnet b/cortex-mixin/dashboards/reads.libsonnet index 9bc9b7d6..c0ddbe4f 100644 --- a/cortex-mixin/dashboards/reads.libsonnet +++ b/cortex-mixin/dashboards/reads.libsonnet @@ -36,7 +36,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; showTitle: false, }) .addPanel( - $.panel('Instant queries / sec') + + $.timeseriesPanel('Instant queries / sec') + $.statPanel(||| sum( rate( @@ -67,7 +67,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ), ) .addPanel( - $.panel('Range queries / sec') + + $.timeseriesPanel('Range queries / sec') + $.statPanel(||| sum( rate( @@ -92,37 +92,35 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('Gateway') .addPanel( - $.panel('Requests / sec') + + $.timeseriesPanel('Requests / sec') + $.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"(prometheus|api_prom)_api_v1_.+"}' % $.jobMatcher($._config.job_names.gateway)) ) .addPanel( - $.panel('Latency') + + $.timeseriesPanel('Latency', unit='ms') + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.gateway) + [utils.selector.re('route', '(prometheus|api_prom)_api_v1_.+')]) ) .addPanel( - $.panel('Per %s p99 Latency' % $._config.per_instance_label) + + $.timeseriesPanel('Per %s p99 Latency' % $._config.per_instance_label, unit='s') + $.hiddenLegendQueryPanel( 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"(prometheus|api_prom)_api_v1_.+"}[$__rate_interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.gateway)], '' - ) + - { yaxes: $.yaxes('s') } + ) ) ) .addRow( $.row('Query Frontend') .addPanel( - $.panel('Requests / sec') + + $.timeseriesPanel('Requests / sec') + $.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"(prometheus|api_prom)_api_v1_.+"}' % $.jobMatcher($._config.job_names.query_frontend)) ) .addPanel( - $.panel('Latency') + + $.timeseriesPanel('Latency', unit='ms') + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.query_frontend) + [utils.selector.re('route', '(prometheus|api_prom)_api_v1_.+')]) ) .addPanel( - $.panel('Per %s p99 Latency' % $._config.per_instance_label) + + $.timeseriesPanel('Per %s p99 Latency' % $._config.per_instance_label, unit='s') + $.hiddenLegendQueryPanel( 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"(prometheus|api_prom)_api_v1_.+"}[$__rate_interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.query_frontend)], '' - ) + - { yaxes: $.yaxes('s') } + ) ) ) .addRow( @@ -142,109 +140,82 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) .addPanel( - $.panel('Requests / sec') + + $.timeseriesPanel('Requests / sec') + $.qpsPanel('cortex_query_scheduler_queue_duration_seconds_count{%s}' % $.jobMatcher($._config.job_names.query_scheduler)) ) .addPanel( - $.panel('Latency (Time in Queue)') + + $.timeseriesPanel('Latency (Time in Queue)') + 
$.latencyPanel('cortex_query_scheduler_queue_duration_seconds', '{%s}' % $.jobMatcher($._config.job_names.query_scheduler)) ) ) .addRow( $.row('Cache - Query Results') .addPanel( - $.panel('Requests / sec') + + $.timeseriesPanel('Requests / sec') + $.qpsPanel('cortex_cache_request_duration_seconds_count{method=~"frontend.+", %s}' % $.jobMatcher($._config.job_names.query_frontend)) ) .addPanel( - $.panel('Latency') + + $.timeseriesPanel('Latency', unit='ms') + utils.latencyRecordingRulePanel('cortex_cache_request_duration_seconds', $.jobSelector($._config.job_names.query_frontend) + [utils.selector.re('method', 'frontend.+')]) ) ) .addRow( $.row('Querier') .addPanel( - $.panel('Requests / sec') + + $.timeseriesPanel('Requests / sec') + $.qpsPanel('cortex_querier_request_duration_seconds_count{%s, route=~"(prometheus|api_prom)_api_v1_.+"}' % $.jobMatcher($._config.job_names.querier)) ) .addPanel( - $.panel('Latency') + + $.timeseriesPanel('Latency', unit='ms') + utils.latencyRecordingRulePanel('cortex_querier_request_duration_seconds', $.jobSelector($._config.job_names.querier) + [utils.selector.re('route', '(prometheus|api_prom)_api_v1_.+')]) ) .addPanel( - $.panel('Per %s p99 Latency' % $._config.per_instance_label) + + $.timeseriesPanel('Per %s p99 Latency' % $._config.per_instance_label, unit='s') + $.hiddenLegendQueryPanel( 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_querier_request_duration_seconds_bucket{%s, route=~"(prometheus|api_prom)_api_v1_.+"}[$__rate_interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.querier)], '' - ) + - { yaxes: $.yaxes('s') } + ) ) ) .addRow( $.row('Ingester') .addPanel( - $.panel('Requests / sec') + + $.timeseriesPanel('Requests / sec') + $.qpsPanel('cortex_request_duration_seconds_count{%s,route=~"/cortex.Ingester/Query(Stream)?|/cortex.Ingester/MetricsForLabelMatchers|/cortex.Ingester/LabelValues|/cortex.Ingester/MetricsMetadata"}' % $.jobMatcher($._config.job_names.ingester)) ) .addPanel( - $.panel('Latency') + + $.timeseriesPanel('Latency', unit='ms') + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.ingester) + [utils.selector.re('route', '/cortex.Ingester/Query(Stream)?|/cortex.Ingester/MetricsForLabelMatchers|/cortex.Ingester/LabelValues|/cortex.Ingester/MetricsMetadata')]) ) .addPanel( - $.panel('Per %s p99 Latency' % $._config.per_instance_label) + + $.timeseriesPanel('Per %s p99 Latency' % $._config.per_instance_label, unit='s') + $.hiddenLegendQueryPanel( 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"/cortex.Ingester/Query(Stream)?|/cortex.Ingester/MetricsForLabelMatchers|/cortex.Ingester/LabelValues|/cortex.Ingester/MetricsMetadata"}[$__rate_interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.ingester)], '' - ) + - { yaxes: $.yaxes('s') } + ) ) ) .addRowIf( std.member($._config.storage_engine, 'blocks'), $.row('Store-gateway') .addPanel( - $.panel('Requests / sec') + + $.timeseriesPanel('Requests / sec') + $.qpsPanel('cortex_request_duration_seconds_count{%s,route=~"/gatewaypb.StoreGateway/.*"}' % $.jobMatcher($._config.job_names.store_gateway)) ) .addPanel( - $.panel('Latency') + + $.timeseriesPanel('Latency', unit='ms') + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.store_gateway) + [utils.selector.re('route', '/gatewaypb.StoreGateway/.*')]) ) .addPanel( - $.panel('Per %s p99 Latency' % $._config.per_instance_label) + + 
$.timeseriesPanel('Per %s p99 Latency' % $._config.per_instance_label, unit='s') + $.hiddenLegendQueryPanel( 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"/gatewaypb.StoreGateway/.*"}[$__rate_interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.store_gateway)], '' - ) + - { yaxes: $.yaxes('s') } - ) - ) - .addRowIf( - std.member($._config.storage_engine, 'chunks'), - $.row('Memcached - Chunks storage - Index') - .addPanel( - $.panel('Requests / sec') + - $.qpsPanel('cortex_cache_request_duration_seconds_count{%s,method="store.index-cache-read.memcache.fetch"}' % $.jobMatcher($._config.job_names.querier)) - ) - .addPanel( - $.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_cache_request_duration_seconds', $.jobSelector($._config.job_names.querier) + [utils.selector.eq('method', 'store.index-cache-read.memcache.fetch')]) - ) - ) - .addRowIf( - std.member($._config.storage_engine, 'chunks'), - $.row('Memcached - Chunks storage - Chunks') - .addPanel( - $.panel('Requests / sec') + - $.qpsPanel('cortex_cache_request_duration_seconds_count{%s,method="chunksmemcache.fetch"}' % $.jobMatcher($._config.job_names.querier)) - ) - .addPanel( - $.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_cache_request_duration_seconds', $.jobSelector($._config.job_names.querier) + [utils.selector.eq('method', 'chunksmemcache.fetch')]) + ) ) ) .addRowIf( std.member($._config.storage_engine, 'blocks'), $.row('Memcached – Blocks storage – Block index cache (store-gateway accesses)') // Resembles thanosMemcachedCache .addPanel( - $.panel('Requests / sec') + + $.timeseriesPanel('Requests / sec', unit='ops') + $.queryPanel( ||| sum by(operation) ( @@ -258,11 +229,10 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ||| % $.jobMatcher($._config.job_names.store_gateway), '{{operation}}' ) + - $.stack + - { yaxes: $.yaxes('ops') }, + $.stack, ) .addPanel( - $.panel('Latency (getmulti)') + + $.timeseriesPanel('Latency (getmulti)') + $.latencyPanel( 'thanos_memcached_operation_duration_seconds', ||| @@ -276,7 +246,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) .addPanel( - $.panel('Hit ratio') + + $.timeseriesPanel('Hit ratio') + $.queryPanel( ||| sum by(item_type) ( @@ -339,58 +309,6 @@ local utils = import 'mixin-utils/utils.libsonnet'; 'metadata-cache' ) ) - .addRowIf( - std.member($._config.storage_engine, 'chunks') && - std.member($._config.chunk_index_backend + $._config.chunk_store_backend, 'cassandra'), - $.row('Cassandra') - .addPanel( - $.panel('Requests / sec') + - $.qpsPanel('cortex_cassandra_request_duration_seconds_count{%s, operation="SELECT"}' % $.jobMatcher($._config.job_names.querier)) - ) - .addPanel( - $.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_cassandra_request_duration_seconds', $.jobSelector($._config.job_names.querier) + [utils.selector.eq('operation', 'SELECT')]) - ) - ) - .addRowIf( - std.member($._config.storage_engine, 'chunks') && - std.member($._config.chunk_index_backend + $._config.chunk_store_backend, 'bigtable'), - $.row('BigTable') - .addPanel( - $.panel('Requests / sec') + - $.qpsPanel('cortex_bigtable_request_duration_seconds_count{%s, operation="/google.bigtable.v2.Bigtable/ReadRows"}' % $.jobMatcher($._config.job_names.querier)) - ) - .addPanel( - $.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_bigtable_request_duration_seconds', $.jobSelector($._config.job_names.querier) + [utils.selector.eq('operation', 
'/google.bigtable.v2.Bigtable/ReadRows')]) - ), - ) - .addRowIf( - std.member($._config.storage_engine, 'chunks') && - std.member($._config.chunk_index_backend + $._config.chunk_store_backend, 'dynamodb'), - $.row('DynamoDB') - .addPanel( - $.panel('Requests / sec') + - $.qpsPanel('cortex_dynamo_request_duration_seconds_count{%s, operation="DynamoDB.QueryPages"}' % $.jobMatcher($._config.job_names.querier)) - ) - .addPanel( - $.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_dynamo_request_duration_seconds', $.jobSelector($._config.job_names.querier) + [utils.selector.eq('operation', 'DynamoDB.QueryPages')]) - ), - ) - .addRowIf( - std.member($._config.storage_engine, 'chunks') && - std.member($._config.chunk_store_backend, 'gcs'), - $.row('GCS') - .addPanel( - $.panel('Requests / sec') + - $.qpsPanel('cortex_gcs_request_duration_seconds_count{%s, operation="GET"}' % $.jobMatcher($._config.job_names.querier)) - ) - .addPanel( - $.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_gcs_request_duration_seconds', $.jobSelector($._config.job_names.querier) + [utils.selector.eq('operation', 'GET')]) - ) - ) // Object store metrics for the store-gateway. .addRowsIf( std.member($._config.storage_engine, 'blocks'), diff --git a/cortex-mixin/dashboards/rollout-progress.libsonnet b/cortex-mixin/dashboards/rollout-progress.libsonnet index 16c54095..775a199e 100644 --- a/cortex-mixin/dashboards/rollout-progress.libsonnet +++ b/cortex-mixin/dashboards/rollout-progress.libsonnet @@ -20,7 +20,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; // // Rollout progress // - $.panel('Rollout progress') + + $.timeseriesPanel('Rollout progress') + $.barGauge([ // Multi-zone deployments are grouped together removing the "zone-X" suffix. // After the grouping, the resulting label is called "cortex_service". 
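The recurring change across these dashboard files follows one recipe: the legacy `$.panel(title)` constructor plus a trailing `{ yaxes: $.yaxes(unit) }` override is replaced by `$.timeseriesPanel(title, unit=...)`, which accepts the unit directly. A minimal before/after sketch in the style of the mixin's libsonnet files; the helpers are assumed to come from dashboard-utils, and the panel title, metric and `bytes` unit are illustrative rather than taken from any specific hunk:

```
{
  // Before: Graphite-style graph panel; the unit lives in a yaxes override.
  heapPanelOld::
    $.panel('Heap in use') +
    $.queryPanel('go_memstats_heap_inuse_bytes', '{{instance}}') +
    { yaxes: $.yaxes('bytes') },

  // After: native timeseries panel; the unit is passed as an argument,
  // so the yaxes override is dropped.
  heapPanelNew::
    $.timeseriesPanel('Heap in use', unit='bytes') +
    $.queryPanel('go_memstats_heap_inuse_bytes', '{{instance}}'),
}
```

Panels that need a non-default axis configuration, such as the ratio panels capped at 1, keep an explicit `{ yaxes: $.yaxes({ format: 'percentunit', max: 1 }) }` override on top of `timeseriesPanel`, as several hunks elsewhere in the patch show.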
@@ -89,7 +89,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; // // Writes // - $.panel('Writes - 2xx') + + $.timeseriesPanel('Writes - 2xx') + $.newStatPanel(||| sum(rate(cortex_request_duration_seconds_count{%(gateway_job_matcher)s, route=~"%(gateway_write_routes_regex)s",status_code=~"2.+"}[$__rate_interval])) / sum(rate(cortex_request_duration_seconds_count{%(gateway_job_matcher)s, route=~"%(gateway_write_routes_regex)s"}[$__rate_interval])) @@ -100,7 +100,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; gridPos: { h: 4, w: 2, x: 10, y: 0 }, }, - $.panel('Writes - 4xx') + + $.timeseriesPanel('Writes - 4xx') + $.newStatPanel(||| sum(rate(cortex_request_duration_seconds_count{%(gateway_job_matcher)s, route=~"%(gateway_write_routes_regex)s",status_code=~"4.+"}[$__rate_interval])) / sum(rate(cortex_request_duration_seconds_count{%(gateway_job_matcher)s, route=~"%(gateway_write_routes_regex)s"}[$__rate_interval])) @@ -113,7 +113,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; gridPos: { h: 4, w: 2, x: 12, y: 0 }, }, - $.panel('Writes - 5xx') + + $.timeseriesPanel('Writes - 5xx') + $.newStatPanel(||| sum(rate(cortex_request_duration_seconds_count{%(gateway_job_matcher)s, route=~"%(gateway_write_routes_regex)s",status_code=~"5.+"}[$__rate_interval])) / sum(rate(cortex_request_duration_seconds_count{%(gateway_job_matcher)s, route=~"%(gateway_write_routes_regex)s"}[$__rate_interval])) @@ -125,7 +125,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; gridPos: { h: 4, w: 2, x: 14, y: 0 }, }, - $.panel('Writes 99th Latency') + + $.timeseriesPanel('Writes 99th latency', unit='s') + $.newStatPanel(||| histogram_quantile(0.99, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{%(gateway_job_matcher)s, route=~"%(gateway_write_routes_regex)s"})) ||| % config, unit='s', thresholds=[ @@ -140,7 +140,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; // // Reads // - $.panel('Reads - 2xx') + + $.timeseriesPanel('Reads - 2xx') + $.newStatPanel(||| sum(rate(cortex_request_duration_seconds_count{%(gateway_job_matcher)s, route=~"%(gateway_read_routes_regex)s",status_code=~"2.+"}[$__rate_interval])) / sum(rate(cortex_request_duration_seconds_count{%(gateway_job_matcher)s, route=~"%(gateway_read_routes_regex)s"}[$__rate_interval])) @@ -151,7 +151,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; gridPos: { h: 4, w: 2, x: 10, y: 4 }, }, - $.panel('Reads - 4xx') + + $.timeseriesPanel('Reads - 4xx') + $.newStatPanel(||| sum(rate(cortex_request_duration_seconds_count{%(gateway_job_matcher)s, route=~"%(gateway_read_routes_regex)s",status_code=~"4.+"}[$__rate_interval])) / sum(rate(cortex_request_duration_seconds_count{%(gateway_job_matcher)s, route=~"%(gateway_read_routes_regex)s"}[$__rate_interval])) @@ -164,7 +164,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; gridPos: { h: 4, w: 2, x: 12, y: 4 }, }, - $.panel('Reads - 5xx') + + $.timeseriesPanel('Reads - 5xx') + $.newStatPanel(||| sum(rate(cortex_request_duration_seconds_count{%(gateway_job_matcher)s, route=~"%(gateway_read_routes_regex)s",status_code=~"5.+"}[$__rate_interval])) / sum(rate(cortex_request_duration_seconds_count{%(gateway_job_matcher)s, route=~"%(gateway_read_routes_regex)s"}[$__rate_interval])) @@ -176,7 +176,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; gridPos: { h: 4, w: 2, x: 14, y: 4 }, }, - $.panel('Reads 99th Latency') + + $.timeseriesPanel('Reads 99th latency', unit='s') + $.newStatPanel(||| histogram_quantile(0.99, sum by (le) 
(cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{%(gateway_job_matcher)s, route=~"%(gateway_read_routes_regex)s"})) ||| % config, unit='s', thresholds=[ @@ -191,7 +191,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; // // Unhealthy pods // - $.panel('Unhealthy pods') + + $.timeseriesPanel('Unhealthy pods') + $.newStatPanel([ ||| kube_deployment_status_replicas_unavailable{%(namespace_matcher)s, deployment=~"%(all_services_regex)s"} @@ -280,7 +280,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; // // Performance comparison with 24h ago // - $.panel('Latency vs 24h ago') + + $.timeseriesPanel('Latency vs 24h ago') + $.queryPanel([||| 1 - ( avg_over_time(histogram_quantile(0.99, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{%(gateway_job_matcher)s, route=~"%(gateway_write_routes_regex)s"} offset 24h))[1h:]) diff --git a/cortex-mixin/dashboards/ruler.libsonnet b/cortex-mixin/dashboards/ruler.libsonnet index d1062581..88742e23 100644 --- a/cortex-mixin/dashboards/ruler.libsonnet +++ b/cortex-mixin/dashboards/ruler.libsonnet @@ -67,26 +67,26 @@ local utils = import 'mixin-utils/utils.libsonnet'; showTitle: false, }) .addPanel( - $.panel('Active Configurations') + + $.timeseriesPanel('Active Configurations') + $.statPanel('sum(cortex_ruler_managers_total{%s})' % $.jobMatcher($._config.job_names.ruler), format='short') ) .addPanel( - $.panel('Total Rules') + + $.timeseriesPanel('Total Rules') + $.statPanel('sum(cortex_prometheus_rule_group_rules{%s})' % $.jobMatcher($._config.job_names.ruler), format='short') ) .addPanel( - $.panel('Read from Ingesters - QPS') + + $.timeseriesPanel('Read from Ingesters - QPS') + $.statPanel('sum(rate(cortex_ingester_client_request_duration_seconds_count{%s, operation="/cortex.Ingester/QueryStream"}[5m]))' % $.jobMatcher($._config.job_names.ruler), format='reqps') ) .addPanel( - $.panel('Write to Ingesters - QPS') + + $.timeseriesPanel('Write to Ingesters - QPS') + $.statPanel('sum(rate(cortex_ingester_client_request_duration_seconds_count{%s, operation="/cortex.Ingester/Push"}[5m]))' % $.jobMatcher($._config.job_names.ruler), format='reqps') ) ) .addRow( $.row('Rule Evaluations Global') .addPanel( - $.panel('EPS') + + $.timeseriesPanel('EPS') + $.queryPanel( [ $.rulerQueries.ruleEvaluations.success % [$.jobMatcher($._config.job_names.ruler), $.jobMatcher($._config.job_names.ruler)], @@ -96,7 +96,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ), ) .addPanel( - $.panel('Latency') + + $.timeseriesPanel('Latency', unit='s') + $.queryPanel( $.rulerQueries.ruleEvaluations.latency % [$.jobMatcher($._config.job_names.ruler), $.jobMatcher($._config.job_names.ruler)], 'average' @@ -106,100 +106,56 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('Configuration API (gateway)') .addPanel( - $.panel('QPS') + + $.timeseriesPanel('QPS') + $.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"%s"}' % [$.jobMatcher($._config.job_names.gateway), ruler_config_api_routes_re]) ) .addPanel( - $.panel('Latency') + + $.timeseriesPanel('Latency', unit='ms') + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.gateway) + [utils.selector.re('route', ruler_config_api_routes_re)]) ) .addPanel( - $.panel('Per route p99 Latency') + + $.timeseriesPanel('Per route p99 latency', unit='s') + $.queryPanel( 'histogram_quantile(0.99, sum by (route, le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{%s, route=~"%s"}))' % 
[$.jobMatcher($._config.job_names.gateway), ruler_config_api_routes_re], '{{ route }}' - ) + - { yaxes: $.yaxes('s') } + ) ) ) .addRow( $.row('Writes (Ingesters)') .addPanel( - $.panel('QPS') + + $.timeseriesPanel('QPS') + $.qpsPanel('cortex_ingester_client_request_duration_seconds_count{%s, operation="/cortex.Ingester/Push"}' % $.jobMatcher($._config.job_names.ruler)) ) .addPanel( - $.panel('Latency') + + $.timeseriesPanel('Latency', unit='ms') + $.latencyPanel('cortex_ingester_client_request_duration_seconds', '{%s, operation="/cortex.Ingester/Push"}' % $.jobMatcher($._config.job_names.ruler)) ) ) .addRow( $.row('Reads (Ingesters)') .addPanel( - $.panel('QPS') + + $.timeseriesPanel('QPS') + $.qpsPanel('cortex_ingester_client_request_duration_seconds_count{%s, operation="/cortex.Ingester/QueryStream"}' % $.jobMatcher($._config.job_names.ruler)) ) .addPanel( - $.panel('Latency') + + $.timeseriesPanel('Latency', unit='ms') + $.latencyPanel('cortex_ingester_client_request_duration_seconds', '{%s, operation="/cortex.Ingester/QueryStream"}' % $.jobMatcher($._config.job_names.ruler)) ) ) - .addRowIf( - std.member($._config.storage_engine, 'chunks'), - $.row('Ruler - Chunks storage - Index Cache') - .addPanel( - $.panel('Total entries') + - $.queryPanel('sum(querier_cache_added_new_total{cache="store.index-cache-read.fifocache",%s}) - sum(querier_cache_evicted_total{cache="store.index-cache-read.fifocache",%s})' % [$.jobMatcher($._config.job_names.ruler), $.jobMatcher($._config.job_names.ruler)], 'Entries'), - ) - .addPanel( - $.panel('Cache Hit %') + - $.queryPanel('(sum(rate(querier_cache_gets_total{cache="store.index-cache-read.fifocache",%s}[1m])) - sum(rate(querier_cache_misses_total{cache="store.index-cache-read.fifocache",%s}[1m]))) / sum(rate(querier_cache_gets_total{cache="store.index-cache-read.fifocache",%s}[1m]))' % [$.jobMatcher($._config.job_names.ruler), $.jobMatcher($._config.job_names.ruler), $.jobMatcher($._config.job_names.ruler)], 'hit rate') - { yaxes: $.yaxes({ format: 'percentunit', max: 1 }) }, - ) - .addPanel( - $.panel('Churn Rate') + - $.queryPanel('sum(rate(querier_cache_evicted_total{cache="store.index-cache-read.fifocache",%s}[1m]))' % $.jobMatcher($._config.job_names.ruler), 'churn rate'), - ) - ) - .addRowIf( - std.member($._config.storage_engine, 'chunks'), - $.row('Ruler - Chunks storage - Store') - .addPanel( - $.panel('Index Lookups per Query') + - utils.latencyRecordingRulePanel('cortex_chunk_store_index_lookups_per_query', $.jobSelector($._config.job_names.ruler), multiplier=1) + - { yaxes: $.yaxes('short') }, - ) - .addPanel( - $.panel('Series (pre-intersection) per Query') + - utils.latencyRecordingRulePanel('cortex_chunk_store_series_pre_intersection_per_query', $.jobSelector($._config.job_names.ruler), multiplier=1) + - { yaxes: $.yaxes('short') }, - ) - .addPanel( - $.panel('Series (post-intersection) per Query') + - utils.latencyRecordingRulePanel('cortex_chunk_store_series_post_intersection_per_query', $.jobSelector($._config.job_names.ruler), multiplier=1) + - { yaxes: $.yaxes('short') }, - ) - .addPanel( - $.panel('Chunks per Query') + - utils.latencyRecordingRulePanel('cortex_chunk_store_chunks_per_query', $.jobSelector($._config.job_names.ruler), multiplier=1) + - { yaxes: $.yaxes('short') }, - ) - ) .addRowIf( std.member($._config.storage_engine, 'blocks'), $.row('Ruler - Blocks storage') .addPanel( - $.panel('Number of store-gateways hit per Query') + - $.latencyPanel('cortex_querier_storegateway_instances_hit_per_query', '{%s}' % 
$.jobMatcher($._config.job_names.ruler), multiplier=1) + - { yaxes: $.yaxes('short') }, + $.timeseriesPanel('Number of store-gateways hit per Query', unit='short') + + $.latencyPanel('cortex_querier_storegateway_instances_hit_per_query', '{%s}' % $.jobMatcher($._config.job_names.ruler), multiplier=1), ) .addPanel( - $.panel('Refetches of missing blocks per Query') + - $.latencyPanel('cortex_querier_storegateway_refetches_per_query', '{%s}' % $.jobMatcher($._config.job_names.ruler), multiplier=1) + - { yaxes: $.yaxes('short') }, + $.timeseriesPanel('Refetches of missing blocks per Query', unit='short') + + $.latencyPanel('cortex_querier_storegateway_refetches_per_query', '{%s}' % $.jobMatcher($._config.job_names.ruler), multiplier=1), ) .addPanel( - $.panel('Consistency checks failed') + + $.timeseriesPanel('Consistency checks failed') + $.queryPanel('sum(rate(cortex_querier_blocks_consistency_checks_failed_total{%s}[1m])) / sum(rate(cortex_querier_blocks_consistency_checks_total{%s}[1m]))' % [$.jobMatcher($._config.job_names.ruler), $.jobMatcher($._config.job_names.ruler)], 'Failure Rate') + { yaxes: $.yaxes({ format: 'percentunit', max: 1 }) }, ) @@ -207,33 +163,33 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('Notifications') .addPanel( - $.panel('Delivery Errors') + + $.timeseriesPanel('Delivery Errors') + $.queryPanel($.rulerQueries.notifications.failure % [$.jobMatcher($._config.job_names.ruler), $.jobMatcher($._config.job_names.ruler)], '{{ user }}') ) .addPanel( - $.panel('Queue Length') + + $.timeseriesPanel('Queue Length') + $.queryPanel($.rulerQueries.notifications.queue % [$.jobMatcher($._config.job_names.ruler), $.jobMatcher($._config.job_names.ruler)], '{{ user }}') ) .addPanel( - $.panel('Dropped') + + $.timeseriesPanel('Dropped') + $.queryPanel($.rulerQueries.notifications.dropped % $.jobMatcher($._config.job_names.ruler), '{{ user }}') ) ) .addRow( ($.row('Group Evaluations') + { collapse: true }) .addPanel( - $.panel('Missed Iterations') + + $.timeseriesPanel('Missed Iterations') + $.queryPanel($.rulerQueries.groupEvaluations.missedIterations % $.jobMatcher($._config.job_names.ruler), '{{ user }}'), ) .addPanel( - $.panel('Latency') + + $.timeseriesPanel('Latency', unit='s') + $.queryPanel( $.rulerQueries.groupEvaluations.latency % [$.jobMatcher($._config.job_names.ruler), $.jobMatcher($._config.job_names.ruler)], '{{ user }}' ), ) .addPanel( - $.panel('Failures') + + $.timeseriesPanel('Failures') + $.queryPanel( $.rulerQueries.perUserPerGroupEvaluations.failure % [$.jobMatcher($._config.job_names.ruler)], '{{ rule_group }}' ) @@ -242,7 +198,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( ($.row('Rule Evaluation per User') + { collapse: true }) .addPanel( - $.panel('Latency') + + $.timeseriesPanel('Latency', unit='s') + $.queryPanel( $.rulerQueries.perUserPerGroupEvaluations.latency % [$.jobMatcher($._config.job_names.ruler), $.jobMatcher($._config.job_names.ruler)], '{{ user }}' diff --git a/cortex-mixin/dashboards/scaling.libsonnet b/cortex-mixin/dashboards/scaling.libsonnet index a01a7db3..e078a350 100644 --- a/cortex-mixin/dashboards/scaling.libsonnet +++ b/cortex-mixin/dashboards/scaling.libsonnet @@ -38,7 +38,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( ($.row('Scaling') + { height: '400px' }) .addPanel( - $.panel('Workload-based scaling') + { sort: { col: 0, desc: false } } + + $.timeseriesPanel('Workload-based scaling') + { sort: { col: 0, desc: false } } + $.tablePanel([ ||| sort_desc( @@ -47,14 +47,15 
@@ local utils = import 'mixin-utils/utils.libsonnet'; cluster_namespace_deployment:actual_replicas:count{cluster=~"$cluster", namespace=~"$namespace"} ) |||, - ], { - __name__: { alias: 'Cluster', type: 'hidden' }, - cluster: { alias: 'Cluster' }, - namespace: { alias: 'Namespace' }, - deployment: { alias: 'Service' }, - reason: { alias: 'Reason' }, - Value: { alias: 'Required Replicas', decimals: 0 }, - }) + ], [ + $.overrideHidden('__name__'), + $.overrideHidden('Time'), + $.overrideDisplayName('cluster', 'Cluster'), + $.overrideDisplayName('namespace', 'Namespace'), + $.overrideDisplayName('deployment', 'Service'), + $.overrideDisplayName('reason', 'Reason'), + $.overrideDisplayName('Value', 'Required Replicas'), + ]) ) ), } diff --git a/cortex-mixin/dashboards/writes-resources.libsonnet b/cortex-mixin/dashboards/writes-resources.libsonnet index 64f83ef1..e11ac223 100644 --- a/cortex-mixin/dashboards/writes-resources.libsonnet +++ b/cortex-mixin/dashboards/writes-resources.libsonnet @@ -31,7 +31,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('Ingester') .addPanel( - $.panel('In-memory series') + + $.timeseriesPanel('In-memory series') + $.queryPanel( 'sum by(%s) (cortex_ingester_memory_series{%s})' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.ingester)], '{{%s}}' % $._config.per_instance_label diff --git a/cortex-mixin/dashboards/writes.libsonnet b/cortex-mixin/dashboards/writes.libsonnet index e99faee4..67d10581 100644 --- a/cortex-mixin/dashboards/writes.libsonnet +++ b/cortex-mixin/dashboards/writes.libsonnet @@ -34,7 +34,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; showTitle: false, }) .addPanel( - $.panel('Samples / sec') + + $.timeseriesPanel('Samples / sec') + $.statPanel( 'sum(%(group_prefix_jobs)s:cortex_distributor_received_samples:rate5m{%(job)s})' % ( $._config { @@ -45,7 +45,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) .addPanel( - $.panel('Active Series') + + $.timeseriesPanel('Active Series') + $.statPanel(||| sum(cortex_ingester_memory_series{%(ingester)s} / on(%(group_by_cluster)s) group_left @@ -56,154 +56,87 @@ local utils = import 'mixin-utils/utils.libsonnet'; }, format='short') ) .addPanel( - $.panel('Tenants') + + $.timeseriesPanel('Tenants') + $.statPanel('count(count by(user) (cortex_ingester_active_series{%s}))' % $.jobMatcher($._config.job_names.ingester), format='short') ) .addPanel( - $.panel('Requests / sec') + + $.timeseriesPanel('Requests / sec') + $.statPanel('sum(rate(cortex_request_duration_seconds_count{%s, route=~"api_(v1|prom)_push"}[5m]))' % $.jobMatcher($._config.job_names.gateway), format='reqps') ) ) .addRow( $.row('Gateway') .addPanel( - $.panel('Requests / sec') + + $.timeseriesPanel('Requests / sec') + $.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"api_(v1|prom)_push"}' % $.jobMatcher($._config.job_names.gateway)) ) .addPanel( - $.panel('Latency') + + $.timeseriesPanel('Latency', unit='ms') + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.gateway) + [utils.selector.re('route', 'api_(v1|prom)_push')]) ) .addPanel( - $.panel('Per %s p99 Latency' % $._config.per_instance_label) + + $.timeseriesPanel('Per %s p99 Latency' % $._config.per_instance_label, unit='s') + $.hiddenLegendQueryPanel( 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"api_(v1|prom)_push"}[$__rate_interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.gateway)], 
'' - ) + - { yaxes: $.yaxes('s') } + ) ) ) .addRow( $.row('Distributor') .addPanel( - $.panel('Requests / sec') + + $.timeseriesPanel('Requests / sec') + $.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push"}' % $.jobMatcher($._config.job_names.distributor)) ) .addPanel( - $.panel('Latency') + + $.timeseriesPanel('Latency', unit='ms') + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.distributor) + [utils.selector.re('route', '/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push')]) ) .addPanel( - $.panel('Per %s p99 Latency' % $._config.per_instance_label) + + $.timeseriesPanel('Per %s p99 Latency' % $._config.per_instance_label, unit='s') + $.hiddenLegendQueryPanel( 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push"}[$__rate_interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.distributor)], '' - ) + - { yaxes: $.yaxes('s') } + ) ) ) .addRow( $.row('Key-value store for high-availability (HA) deduplication') .addPanel( - $.panel('Requests / sec') + + $.timeseriesPanel('Requests / sec') + $.qpsPanel('cortex_kv_request_duration_seconds_count{%s}' % $.jobMatcher($._config.job_names.distributor)) ) .addPanel( - $.panel('Latency') + + $.timeseriesPanel('Latency', unit='ms') + utils.latencyRecordingRulePanel('cortex_kv_request_duration_seconds', $.jobSelector($._config.job_names.distributor)) ) ) .addRow( $.row('Ingester') .addPanel( - $.panel('Requests / sec') + + $.timeseriesPanel('Requests / sec') + $.qpsPanel('cortex_request_duration_seconds_count{%s,route="/cortex.Ingester/Push"}' % $.jobMatcher($._config.job_names.ingester)) ) .addPanel( - $.panel('Latency') + + $.timeseriesPanel('Latency', unit='ms') + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.ingester) + [utils.selector.eq('route', '/cortex.Ingester/Push')]) ) .addPanel( - $.panel('Per %s p99 Latency' % $._config.per_instance_label) + + $.timeseriesPanel('Per %s p99 Latency' % $._config.per_instance_label, unit='s') + $.hiddenLegendQueryPanel( 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route="/cortex.Ingester/Push"}[$__rate_interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.ingester)], '' - ) + - { yaxes: $.yaxes('s') } + ) ) ) .addRow( $.row('Key-value store for the ingesters ring') .addPanel( - $.panel('Requests / sec') + + $.timeseriesPanel('Requests / sec') + $.qpsPanel('cortex_kv_request_duration_seconds_count{%s}' % $.jobMatcher($._config.job_names.ingester)) ) .addPanel( - $.panel('Latency') + + $.timeseriesPanel('Latency', unit='ms') + utils.latencyRecordingRulePanel('cortex_kv_request_duration_seconds', $.jobSelector($._config.job_names.ingester)) ) ) - .addRowIf( - std.member($._config.storage_engine, 'chunks'), - $.row('Memcached') - .addPanel( - $.panel('Requests / sec') + - $.qpsPanel('cortex_memcache_request_duration_seconds_count{%s,method="Memcache.Put"}' % $.jobMatcher($._config.job_names.ingester)) - ) - .addPanel( - $.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_memcache_request_duration_seconds', $.jobSelector($._config.job_names.ingester) + [utils.selector.eq('method', 'Memcache.Put')]) - ) - ) - .addRowIf( - std.member($._config.storage_engine, 'chunks') && - std.member($._config.chunk_index_backend + 
$._config.chunk_store_backend, 'cassandra'), - $.row('Cassandra') - .addPanel( - $.panel('Requests / sec') + - $.qpsPanel('cortex_cassandra_request_duration_seconds_count{%s, operation="INSERT"}' % $.jobMatcher($._config.job_names.ingester)) - ) - .addPanel( - $.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_cassandra_request_duration_seconds', $.jobSelector($._config.job_names.ingester) + [utils.selector.eq('operation', 'INSERT')]) - ) - ) - .addRowIf( - std.member($._config.storage_engine, 'chunks') && - std.member($._config.chunk_index_backend + $._config.chunk_store_backend, 'bigtable'), - $.row('BigTable') - .addPanel( - $.panel('Requests / sec') + - $.qpsPanel('cortex_bigtable_request_duration_seconds_count{%s, operation="/google.bigtable.v2.Bigtable/MutateRows"}' % $.jobMatcher($._config.job_names.ingester)) - ) - .addPanel( - $.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_bigtable_request_duration_seconds', $.jobSelector($._config.job_names.ingester) + [utils.selector.eq('operation', '/google.bigtable.v2.Bigtable/MutateRows')]) - ) - ) - .addRowIf( - std.member($._config.storage_engine, 'chunks') && - std.member($._config.chunk_index_backend + $._config.chunk_store_backend, 'dynamodb'), - $.row('DynamoDB') - .addPanel( - $.panel('Requests / sec') + - $.qpsPanel('cortex_dynamo_request_duration_seconds_count{%s, operation="DynamoDB.BatchWriteItem"}' % $.jobMatcher($._config.job_names.ingester)) - ) - .addPanel( - $.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_dynamo_request_duration_seconds', $.jobSelector($._config.job_names.ingester) + [utils.selector.eq('operation', 'DynamoDB.BatchWriteItem')]) - ) - ) - .addRowIf( - std.member($._config.storage_engine, 'chunks') && - std.member($._config.chunk_store_backend, 'gcs'), - $.row('GCS') - .addPanel( - $.panel('Requests / sec') + - $.qpsPanel('cortex_gcs_request_duration_seconds_count{%s, operation="POST"}' % $.jobMatcher($._config.job_names.ingester)) - ) - .addPanel( - $.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_gcs_request_duration_seconds', $.jobSelector($._config.job_names.ingester) + [utils.selector.eq('operation', 'POST')]) - ) - ) .addRowIf( std.member($._config.storage_engine, 'blocks'), $.row('Ingester - Blocks storage - Shipper') @@ -222,7 +155,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ), ) .addPanel( - $.panel('Upload latency') + + $.timeseriesPanel('Upload latency', unit='ms') + $.latencyPanel('thanos_objstore_bucket_operation_duration_seconds', '{%s,component="ingester",operation="upload"}' % $.jobMatcher($._config.job_names.ingester)) + $.panelDescription( 'Upload latency', @@ -252,7 +185,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ), ) .addPanel( - $.panel('Compactions latency') + + $.timeseriesPanel('Compactions latency', unit='ms') + $.latencyPanel('cortex_ingester_tsdb_compaction_duration_seconds', '{%s}' % $.jobMatcher($._config.job_names.ingester)) + $.panelDescription( 'Compaction latency', @@ -295,9 +228,8 @@ local utils = import 'mixin-utils/utils.libsonnet'; ), ) .addPanel( - $.panel('WAL truncations latency (includes checkpointing)') + + $.timeseriesPanel('WAL truncations latency (includes checkpointing)', unit='s') + $.queryPanel('sum(rate(cortex_ingester_tsdb_wal_truncate_duration_seconds_sum{%s}[$__rate_interval])) / sum(rate(cortex_ingester_tsdb_wal_truncate_duration_seconds_count{%s}[$__rate_interval]))' % [$.jobMatcher($._config.job_names.ingester), $.jobMatcher($._config.job_names.ingester)], 'avg') + - { yaxes: 
$.yaxes('s') } + $.panelDescription( 'WAL truncations latency (including checkpointing)', ||| @@ -307,7 +239,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ), ) .addPanel( - $.panel('Corruptions / sec') + + $.timeseriesPanel('Corruptions / sec', unit='ops') + $.queryPanel([ 'sum(rate(cortex_ingester_wal_corruptions_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.ingester), 'sum(rate(cortex_ingester_tsdb_mmap_chunk_corruptions_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.ingester), diff --git a/cortex-mixin/docs/playbooks.md b/cortex-mixin/docs/playbooks.md index 0e98a891..39586870 100644 --- a/cortex-mixin/docs/playbooks.md +++ b/cortex-mixin/docs/playbooks.md @@ -3,7 +3,7 @@ This document contains playbooks, or at least a checklist of what to look for, for alerts in the cortex-mixin and logs from Cortex. This document assumes that you are running a Cortex cluster: 1. Using this mixin config -2. Using GCS as object store (but similar procedures apply to other backends) +2. Using GCS (Google), S3 (AWS) or Azure Blob Storage as object store. Similar procedures apply to other backends. ## Alerts @@ -198,8 +198,6 @@ How to **investigate**: - If the failing service is going OOM (`OOMKilled`): scale up or increase the memory - If the failing service is crashing / panicking: look for the stack trace in the logs and investigate from there -### CortexTransferFailed -This alert goes off when an ingester fails to find another node to transfer its data to when it was shutting down. If there is both a pod stuck terminating and one stuck joining, look at the kubernetes events. This may be due to scheduling problems caused by some combination of anti affinity rules/resource utilization. Adding a new node can help in these circumstances. You can see recent events associated with a resource via kubectl describe, ex: `kubectl -n describe pod ` ### CortexIngesterUnhealthy This alert goes off when an ingester is marked as unhealthy. Check the ring web page to see which is marked as unhealthy. You could then check the logs to see if there are any related to that ingester ex: `kubectl logs -f ingester-01 --namespace=prod`. A simple way to resolve this may be to click the "Forgot" button on the ring page, especially if the pod doesn't exist anymore. It might not exist anymore because it was on a node that got shut down, so you could check to see if there are any logs related to the node that pod is/was on, ex: `kubectl get events --namespace=prod | grep cloud-provider-node`. @@ -413,6 +411,32 @@ Where: - `TENANT` is the tenant id reported in the example error message above as `REDACTED-TENANT` - `BLOCK` is the last part of the file path reported as `REDACTED-BLOCK` in the example error message above +To rename a block stored on S3 you can use the `aws` CLI command: +``` +aws s3 mv s3://BUCKET/TENANT/BLOCK s3://BUCKET/TENANT/corrupted-BLOCK --recursive +``` +Where: +- `BUCKET` is the S3 bucket name the compactor is using. 
+ + +To rename a block stored on Azure you can use the `azcopy` and `az` CLI commands: +``` +azcopy copy "https://$STORAGE_ACCOUNT.blob.core.windows.net/$CONTAINER/$TENANT/$BLOCK?$SASTOKEN" "https://$STORAGE_ACCOUNT.blob.core.windows.net/$CONTAINER/$TENANT/corrupted-$BLOCK?$SASTOKEN" --recursive +azcopy remove "https://$STORAGE_ACCOUNT.blob.core.windows.net/$CONTAINER/$TENANT/$BLOCK?$SASTOKEN" --recursive +``` +Where: +- `STORAGE_ACCOUNT` is the storage account the compactor is using. +- `CONTAINER` is what is specified as `-blocks-storage.azure.container-name` +- `TENANT` is the tenant id reported in the example error message above as `REDACTED-TENANT` +- `BLOCK` is the last part of the file path reported as `REDACTED-BLOCK` in the example error message above +- `SASTOKEN` is a token that can be created with the following command (`date -v +1d` is BSD/macOS syntax; on GNU/Linux use `$(date -d '+1 day' +%Y-%m-%d)`; append `-o tsv` to the `az` command to get the bare token without surrounding quotes): + +``` +az storage container generate-sas --account-name $STORAGE_ACCOUNT --expiry $(date -v +1d +%Y-%m-%d) --name $CONTAINER --permissions dlrw +``` + ### CortexBucketIndexNotUpdated This alert fires when the bucket index, for a given tenant, has not been updated for a long time. The bucket index is expected to be periodically updated by the compactor and is used by queriers and store-gateways to get an almost up-to-date view of the bucket store. @@ -438,29 +462,6 @@ How to **investigate**: - Safely manually delete the block from the bucket if it was a partial delete or an upload failed by a compactor - Further investigate if it was an upload failed by an ingester but not later retried (ingesters are expected to retry uploads until they succeed) -### CortexWALCorruption - -This alert is only related to the chunks storage. This can happen because of 2 reasons: (1) Non graceful shutdown of ingesters. (2) Faulty storage or NFS. - -WAL corruptions are only detected at startups, so at this point the WAL/Checkpoint would have been repaired automatically. So we can only check what happened and if there was any data loss and take actions to avoid this happening in future. - -1. Check if there was any node restarts that force killed pods. If there is, then the corruption is from the non graceful shutdown of ingesters, which is generally fine. You can: - * Describe the pod to see the last state. - * Use `kube_pod_info` to check the node for the pod. `node_boot_time_seconds` to see if node just booted (which also indicates restart). - * You can use `eventrouter` logs to double check. - * Check ingester logs to check if the shutdown logs are missing at that time. -2. To confirm this, in the logs, check the WAL segment on which the corruption happened (let's say `X`) and the last checkpoint attempt number (let's say `Y`, this is the last WAL segment that was present when checkpointing started). -3. If `X > Y`, then it's most likely an abrupt restart of ingester and the corruption would be on the last few records of the last segment. To verify this, check the file timestamps of WAL segment `X` and `X - 1` if they were recent. -4. If `X < Y`, then the corruption was in some WAL segment which was not the last one. This indicates faulty disk and some data loss on that ingester. -5. In case of faulty disk corruption, if the number or ingesters that had corruption within the chunk flush age:
Less than the quorum number for your replication factor: No data loss, because there is a guarantee that the data is replicated. For example, if replication factor is 3, then it's fine if corruption was on 1 ingester. - 2. Equal or more than the quorum number but less than replication factor: There is a good chance that there is no data loss if it was replicated to desired number of ingesters. But it's good to check once for data loss. - 3. Equal or more than the replication factor: Then there is definitely some data loss. - -### CortexTableSyncFailure - -_This alert applies to Cortex chunks storage only._ - ### CortexQueriesIncorrect _TODO: this playbook has not been written yet._ @@ -552,31 +553,15 @@ How to **investigate**: - `other` - Check both Cortex and memcached logs to find more details -### CortexOldChunkInMemory - -_This alert applies to Cortex chunks storage only._ - -### CortexCheckpointCreationFailed - -_This alert applies to Cortex chunks storage only._ - -### CortexCheckpointDeletionFailed - -_This alert applies to Cortex chunks storage only._ - -### CortexProvisioningMemcachedTooSmall - -_This alert applies to Cortex chunks storage only._ - ### CortexProvisioningTooManyActiveSeries -This alert fires if the average number of in-memory series per ingester is above our target (1.5M). +This alert fires if the average number of in-memory series per ingester is above our target (3.0M). How to **fix**: - Scale up ingesters - To find out the Cortex clusters where ingesters should be scaled up and how many minimum replicas are expected: ``` - ceil(sum by(cluster, namespace) (cortex_ingester_memory_series) / 1.5e6) > + ceil(sum by(cluster, namespace) (cortex_ingester_memory_series) / 3.0e6) > count by(cluster, namespace) (cortex_ingester_memory_series) ``` - After the scale up, the in-memory series are expected to be reduced at the next TSDB head compaction (occurring every 2h) @@ -610,7 +595,7 @@ How to **fix**: kubectl -n delete pod ingester-XXX ``` - Restarting an ingester typically reduces the memory allocated by mmap-ed files. After the restart, ingester may allocate this memory again over time, but it may give more time while working on a longer term solution -- Check the `Cortex / Writes Resources` dashboard to see if the number of series per ingester is above the target (1.5M). If so: +- Check the `Cortex / Writes Resources` dashboard to see if the number of series per ingester is above the target (3.0M). 
If so: - Scale up ingesters - Memory is expected to be reclaimed at the next TSDB head compaction (occurring every 2h) diff --git a/cortex-mixin/jsonnetfile.lock.json b/cortex-mixin/jsonnetfile.lock.json index a1b02191..ff6dd095 100644 --- a/cortex-mixin/jsonnetfile.lock.json +++ b/cortex-mixin/jsonnetfile.lock.json @@ -8,8 +8,8 @@ "subdir": "grafana-builder" } }, - "version": "0d13e5ba1b3a4c29015738c203d92ea39f71ebe2", - "sum": "GRf2GvwEU4jhXV+JOonXSZ4wdDv8mnHBPCQ6TUVd+g8=" + "version": "1d877bb0651ef92176f651d0be473c06e372a8a0", + "sum": "udZaafkbKYMGodLqsFhEe+Oy/St2p0edrK7hiMPEey0=" }, { "source": { @@ -18,8 +18,8 @@ "subdir": "mixin-utils" } }, - "version": "21b638f4e4922c0b6fde12120ed45d8ef803edc7", - "sum": "Je2SxBKu+1WrKEEG60zjSKaY/6TPX8uRz5bsaw0a8oA=" + "version": "1d877bb0651ef92176f651d0be473c06e372a8a0", + "sum": "mzLmCv9n3ldLChVGPfyRJOVKoBw+dfK40vU9792aHIM=" } ], "legacyImports": false diff --git a/cortex-mixin/recording_rules.libsonnet b/cortex-mixin/recording_rules.libsonnet index 03835247..86650fa5 100644 --- a/cortex-mixin/recording_rules.libsonnet +++ b/cortex-mixin/recording_rules.libsonnet @@ -2,7 +2,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; { local _config = { - max_series_per_ingester: 1.5e6, + max_series_per_ingester: 3.0e6, max_samples_per_sec_per_ingester: 80e3, max_samples_per_sec_per_distributor: 240e3, limit_utilisation_target: 0.6, @@ -148,7 +148,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ||| % _config, }, { - // Ingester should have 1.5M series in memory + // Ingester should have 3.0M series in memory record: 'cluster_namespace_deployment_reason:required_replicas:count', labels: { deployment: 'ingester', @@ -167,7 +167,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; }, { // We should be able to cover 60% of our limits, - // and ingester can have 1.5M series in memory + // and ingester can have 3.0M series in memory record: 'cluster_namespace_deployment_reason:required_replicas:count', labels: { deployment: 'ingester', diff --git a/cortex/alertmanager.libsonnet b/cortex/alertmanager.libsonnet index 480112d3..1e870c67 100644 --- a/cortex/alertmanager.libsonnet +++ b/cortex/alertmanager.libsonnet @@ -3,6 +3,7 @@ local volumeMount = $.core.v1.volumeMount, local volume = $.core.v1.volume, local container = $.core.v1.container, + local envType = container.envType, local statefulSet = $.apps.v1.statefulSet, local service = $.core.v1.service, local configMap = $.core.v1.configMap, @@ -96,6 +97,7 @@ if $._config.alertmanager_enabled then container.new('alertmanager', $._images.alertmanager) + container.withPorts($.util.defaultPorts + mode.ports) + + container.withEnvMap($.alertmanager_env_map) + container.withEnvMixin([container.envType.fromFieldPath('POD_IP', 'status.podIP')]) + container.withArgsMixin( $.util.mapToFlags($.alertmanager_args) + @@ -109,9 +111,13 @@ ) + $.util.resourcesRequests('100m', '1Gi') + $.util.readinessProbe + + $.go_container_mixin + $.jaeger_mixin else {}, + alertmanager_env_map:: { + }, + alertmanager_statefulset: if $._config.alertmanager_enabled then statefulSet.new('alertmanager', $._config.alertmanager.replicas, [$.alertmanager_container], $.alertmanager_pvc) + diff --git a/cortex/azure/main.jsonnet.example b/cortex/azure/main.jsonnet.example new file mode 100644 index 00000000..dddd337d --- /dev/null +++ b/cortex/azure/main.jsonnet.example @@ -0,0 +1,28 @@ +local cortex = import 'cortex/cortex.libsonnet'; + +cortex { + _config+:: { + namespace: 'default', + + blocks_storage_backend: 'azure', + 
blocks_storage_bucket_name: 'example-bucket', + blocks_storage_azure_account_key: 'replace-with-valid-key', + blocks_storage_azure_account_name: 'example-account', + + // Cortex Ruler config. + ruler_enabled: true, + ruler_client_type: 'azure', + ruler_storage_bucket_name: 'ruler-example-bucket', + ruler_storage_azure_account_name: 'example-account', + ruler_storage_azure_account_key: 'replace-with-valid-key', + + // Cortex Alertmanager config + alertmanager_enabled: true, + alertmanager_client_type: 'azure', + alertmanager_azure_container_name: 'alertmanager-example-bucket', + alertmanager_azure_account_key: 'replace-with-valid-key', + alertmanager_azure_account_name: 'example-account', + external_url: 'https://cortex.example.com', //Alertmanager UI + cluster: 'cluster', + }, +} diff --git a/cortex/compactor.libsonnet b/cortex/compactor.libsonnet new file mode 100644 index 00000000..65c10c81 --- /dev/null +++ b/cortex/compactor.libsonnet @@ -0,0 +1,80 @@ +{ + local container = $.core.v1.container, + local envType = container.envType, + local pvc = $.core.v1.persistentVolumeClaim, + local statefulSet = $.apps.v1.statefulSet, + local volumeMount = $.core.v1.volumeMount, + + // The compactor runs a statefulset with a single replica, because + // it does not support horizontal scalability yet. + local compactor_data_pvc = + pvc.new() + + pvc.mixin.spec.resources.withRequests({ storage: $._config.cortex_compactor_data_disk_size }) + + pvc.mixin.spec.withAccessModes(['ReadWriteOnce']) + + pvc.mixin.spec.withStorageClassName($._config.cortex_compactor_data_disk_class) + + pvc.mixin.metadata.withName('compactor-data'), + + compactor_args:: + $._config.grpcConfig + + $._config.blocksStorageConfig + + $._config.compactorLimitsConfig + + { + target: 'compactor', + + // Compactor config. + 'compactor.block-ranges': '2h,12h,24h', + 'compactor.data-dir': '/data', + 'compactor.compaction-interval': '30m', + 'compactor.compaction-concurrency': $._config.cortex_compactor_max_concurrency, + 'compactor.cleanup-interval': $._config.cortex_compactor_cleanup_interval, + + // Enable sharding. + 'compactor.sharding-enabled': true, + 'compactor.sharding-strategy': 'shuffle-sharding', + 'compactor.tenant-shard-size': 1, + 'compactor.ring.store': 'consul', + 'compactor.ring.consul.hostname': 'consul.%s.svc.cluster.local:8500' % $._config.namespace, + 'compactor.ring.prefix': '', + + // Limits config. + 'runtime-config.file': '/etc/cortex/overrides.yaml', + }, + + compactor_ports:: $.util.defaultPorts, + + compactor_container:: + container.new('compactor', $._images.compactor) + + container.withPorts($.compactor_ports) + + container.withArgsMixin($.util.mapToFlags($.compactor_args)) + + container.withEnvMap($.compactor_env_map) + + container.withVolumeMountsMixin([volumeMount.new('compactor-data', '/data')]) + + // Do not limit compactor CPU and request enough cores to honor configured max concurrency. 
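+ // With this in place, the CPU request below equals $._config.cortex_compactor_max_concurrency cores, while memory stays at a fixed 5Gi request / 6Gi limit.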
+ $.util.resourcesRequests($._config.cortex_compactor_max_concurrency, '5Gi') + + $.util.resourcesLimits(null, '6Gi') + + $.util.readinessProbe + + $.go_container_mixin + + $.jaeger_mixin, + + compactor_env_map:: { + }, + + newCompactorStatefulSet(name, container):: + statefulSet.new(name, 1, [container], compactor_data_pvc) + + statefulSet.mixin.spec.withServiceName(name) + + statefulSet.mixin.metadata.withNamespace($._config.namespace) + + statefulSet.mixin.metadata.withLabels({ name: name }) + + statefulSet.mixin.spec.template.metadata.withLabels({ name: name }) + + statefulSet.mixin.spec.selector.withMatchLabels({ name: name }) + + statefulSet.mixin.spec.template.spec.securityContext.withRunAsUser(0) + + statefulSet.mixin.spec.updateStrategy.withType('RollingUpdate') + + statefulSet.mixin.spec.template.spec.withTerminationGracePeriodSeconds(900) + + // Parallelly scale up/down compactor instances instead of starting them + // one by one. This does NOT affect rolling updates: they will continue to be + // rolled out one by one (the next pod will be rolled out once the previous is + // ready). + statefulSet.mixin.spec.withPodManagementPolicy('Parallel') + + $.util.configVolumeMount($._config.overrides_configmap, '/etc/cortex'), + + compactor_statefulset: + $.newCompactorStatefulSet('compactor', $.compactor_container), +} diff --git a/cortex/config.libsonnet b/cortex/config.libsonnet index 7cf316b7..3f103009 100644 --- a/cortex/config.libsonnet +++ b/cortex/config.libsonnet @@ -5,13 +5,7 @@ replication_factor: 3, external_url: error 'must define external url for cluster', - storage_backend: error 'must specify storage backend (cassandra, gcp, aws)', - table_prefix: $._config.namespace, - cassandra_addresses: error 'must specify cassandra addresses', - bigtable_instance: error 'must specify bigtable instance', - bigtable_project: error 'must specify bigtable project', aws_region: error 'must specify AWS region', - s3_bucket_name: error 'must specify S3 bucket name', // If false, ingesters are not unregistered on shutdown and left in the ring with // the LEAVING state. Setting to false prevents series resharding during ingesters rollouts, @@ -26,17 +20,6 @@ cortex_querier_allow_multiple_replicas_on_same_node: false, cortex_query_frontend_allow_multiple_replicas_on_same_node: false, - // schema is used to generate the storage schema yaml file used by - // the Cortex chunks storage: - // - More information: https://github.com/cortexproject/cortex/pull/1072 - // - Blocks storage doesn't support / uses the schema config. - schema: if $._config.storage_engine != 'blocks' then - error 'must specify a schema config' - else - [], - - max_chunk_idle: '15m', - test_exporter_enabled: false, test_exporter_start_time: error 'must specify test exporter start time', test_exporter_user_id: error 'must specify test exporter used id', @@ -52,40 +35,20 @@ jaeger_agent_host: null, - // Use the Cortex chunks storage engine by default, while giving the ability - // to switch to blocks storage. - storage_engine: 'chunks', // Available options are 'chunks' or 'blocks' - blocks_storage_backend: 'gcs', // Available options are 'gcs', 's3', 'azure' + blocks_storage_backend: error "must specify $._config.blocks_storage_backend . 
Available options are 'gcs', 's3', 'azure'", blocks_storage_bucket_name: error 'must specify blocks storage bucket name', - blocks_storage_s3_endpoint: 's3.dualstack.us-east-1.amazonaws.com', + blocks_storage_s3_endpoint: 's3.dualstack.%s.amazonaws.com' % $._config.aws_region, blocks_storage_azure_account_name: if $._config.blocks_storage_backend == 'azure' then error 'must specify azure account name' else '', blocks_storage_azure_account_key: if $._config.blocks_storage_backend == 'azure' then error 'must specify azure account key' else '', - // Secondary storage engine is only used for querying. - querier_second_storage_engine: null, - store_gateway_replication_factor: 3, - // By default ingesters will be run as StatefulSet with WAL. - // If this is set to true, ingesters will use staless deployments without WAL. - ingester_deployment_without_wal: false, - ingester: { // These config options are only for the chunks storage. wal_dir: '/wal_data', statefulset_disk: '150Gi', }, - // Blocks storage engine doesn't require the table manager. - // When running blocks with chunks as secondary storage engine for querier only, we need table-manager to apply - // retention policies. - table_manager_enabled: $._config.storage_engine == 'chunks' || $._config.querier_second_storage_engine == 'chunks', - - // Blocks storage engine doesn't support index-writes (for writes deduplication) cache. - memcached_index_writes_enabled: $._config.storage_engine != 'blocks', - memcached_index_writes_max_item_size_mb: 1, - - // Index and chunks caches are supported by both blocks storage engine and chunks engine. memcached_index_queries_enabled: true, memcached_index_queries_max_item_size_mb: 5, @@ -102,57 +65,31 @@ query_tee_backend_endpoints: [], query_tee_backend_preferred: '', - enabledBackends: [ - backend - for backend in std.split($._config.storage_backend, ',') - ], - - client_configs: { - aws: - if std.count($._config.enabledBackends, 'aws') > 0 then { - 'dynamodb.api-limit': 10, - 'dynamodb.url': 'https://%s' % $._config.aws_region, - 's3.url': 'https://%s/%s' % [$._config.aws_region, $._config.s3_bucket_name], - } else {}, - cassandra: - if std.count($._config.enabledBackends, 'cassandra') > 0 then { - 'cassandra.keyspace': $._config.namespace, - 'cassandra.addresses': $._config.cassandra_addresses, - 'cassandra.replication-factor': $._config.replication_factor, - } else {}, - gcp: - if std.count($._config.enabledBackends, 'gcp') > 0 then { - 'bigtable.project': $._config.bigtable_project, - 'bigtable.instance': $._config.bigtable_instance, - } else {}, - }, - - storeConfig: self.storeMemcachedChunksConfig, - - storeMemcachedChunksConfig: if $._config.memcached_chunks_enabled && ($._config.storage_engine == 'chunks' || $._config.querier_second_storage_engine == 'chunks') then - { - 'store.chunks-cache.memcached.hostname': 'memcached.%s.svc.cluster.local' % $._config.namespace, - 'store.chunks-cache.memcached.service': 'memcached-client', - 'store.chunks-cache.memcached.timeout': '3s', - } - else {}, - grpcConfig:: { 'server.grpc.keepalive.min-time-between-pings': '10s', 'server.grpc.keepalive.ping-without-stream-allowed': true, }, - storageConfig: - $._config.client_configs.aws + - $._config.client_configs.cassandra + - $._config.client_configs.gcp + - { 'schema-config-file': '/etc/cortex/schema/config.yaml' }, + ingesterClientConfig:: { + 'ingester.client.grpc-compression': 'snappy-block', + }, genericBlocksStorageConfig:: { - 'store.engine': $._config.storage_engine, // May still be chunks + 
'store.engine': 'blocks', }, + + // Ignore blocks in querier, ruler and store-gateways for the last 11h + ignore_blocks_within: '11h', + + // No need to look at store for data younger than 12h, as ingesters have all of it. + query_store_after: '12h', + + // Ingesters don't have data older than 13h, no need to ask them. + query_ingesters_within: '13h', + queryBlocksStorageConfig:: { 'blocks-storage.bucket-store.sync-dir': '/data/tsdb', + 'blocks-storage.bucket-store.ignore-blocks-within': $._config.ignore_blocks_within, 'blocks-storage.bucket-store.ignore-deletion-marks-delay': '1h', 'store-gateway.sharding-enabled': true, @@ -175,60 +112,27 @@ 'blocks-storage.azure.container-name': $._config.blocks_storage_bucket_name, 'blocks-storage.azure.account-name': $._config.blocks_storage_azure_account_name, 'blocks-storage.azure.account-key': $._config.blocks_storage_azure_account_key, + 'blocks-storage.azure.endpoint-suffix': 'blob.core.windows.net', }, - // Blocks storage configuration, used only when 'blocks' storage - // engine is explicitly enabled. - blocksStorageConfig: ( - if $._config.storage_engine == 'blocks' || $._config.querier_second_storage_engine == 'blocks' then ( - if $._config.blocks_storage_backend == 'gcs' then $._config.gcsBlocksStorageConfig - else if $._config.blocks_storage_backend == 's3' then $._config.s3BlocksStorageConfig - else if $._config.blocks_storage_backend == 'azure' then $._config.azureBlocksStorageConfig - else $._config.genericBlocksStorageConfig - ) else {} - ), + + blocksStorageConfig: + if $._config.blocks_storage_backend == 'gcs' then $._config.gcsBlocksStorageConfig + else if $._config.blocks_storage_backend == 's3' then $._config.s3BlocksStorageConfig + else if $._config.blocks_storage_backend == 'azure' then $._config.azureBlocksStorageConfig + else $._config.genericBlocksStorageConfig, // Querier component config (shared between the ruler and querier). queryConfig: { 'runtime-config.file': '/etc/cortex/overrides.yaml', - // Limit the size of the rows we read from the index. - 'store.cardinality-limit': 1e6, - // Don't allow individual queries longer than 32 days. Due to day query // splitting in the frontend, the reality is this only limits rate(foo[32d]) // type queries. 32 days to allow for comparison over the last month (31d) and // then some. 'store.max-query-length': '768h', } + ( - if $._config.storage_engine == 'chunks' then { - // Don't query ingesters for older queries. - // Chunks are held in memory for up to 6hrs right now. Additional 6h are granted for safety reasons because - // the remote writing Prometheus may have a delay or write requests into the database are queued. - 'querier.query-ingesters-within': '12h', - - // Don't query the chunk store for data younger than max_chunk_idle. - 'querier.query-store-after': $._config.max_chunk_idle, - } else if $._config.storage_engine == 'blocks' then { - // Ingesters don't have data older than 13h, no need to ask them. - 'querier.query-ingesters-within': '13h', - - // No need to look at store for data younger than 12h, as ingesters have all of it. - 'querier.query-store-after': '12h', - } - ) + ( - if $._config.memcached_index_queries_enabled && ($._config.storage_engine == 'chunks' || $._config.querier_second_storage_engine == 'chunks') then - { - // Setting for index cache. - 'store.index-cache-validity': '14m', // ingester.retain-period=15m, 1m less for safety. 
- 'store.index-cache-read.cache.enable-fifocache': true, - 'store.index-cache-read.fifocache.max-size-items': 102400, - 'store.index-cache-read.memcached.hostname': 'memcached-index-queries.%(namespace)s.svc.cluster.local' % $._config, - 'store.index-cache-read.memcached.service': 'memcached-client', - 'store.index-cache-read.memcached.timeout': '500ms', - 'store.cache-lookups-older-than': '36h', - } - else {} - ), + 'querier.query-ingesters-within': $._config.query_ingesters_within, + 'querier.query-store-after': $._config.query_store_after, + }, // PromQL query engine config (shared between all services running PromQL engine, like the ruler and querier). queryEngineConfig: { @@ -271,6 +175,7 @@ 'ruler-storage.azure.container-name': $._config.ruler_storage_bucket_name, 'ruler-storage.azure.account-name': $._config.ruler_storage_azure_account_name, 'ruler-storage.azure.account-key': $._config.ruler_storage_azure_account_key, + 'ruler-storage.azure.endpoint-suffix': 'blob.core.windows.net', }, 'local': { 'ruler-storage.local.directory': $._config.ruler_local_directory, @@ -301,6 +206,7 @@ 'alertmanager-storage.azure.account-key': $._config.alertmanager_azure_account_key, 'alertmanager-storage.azure.account-name': $._config.alertmanager_azure_account_name, 'alertmanager-storage.azure.container-name': $._config.alertmanager_azure_container_name, + 'alertmanager-storage.azure.endpoint-suffix': 'blob.core.windows.net', }, gcs: { 'alertmanager-storage.gcs.bucket-name': $._config.alertmanager_gcs_bucket_name, @@ -330,7 +236,6 @@ 'ingester.max-series-per-metric': $._config.limits.max_series_per_metric, 'ingester.max-global-series-per-user': $._config.limits.max_global_series_per_user, 'ingester.max-global-series-per-metric': $._config.limits.max_global_series_per_metric, - 'ingester.max-series-per-query': $._config.limits.max_series_per_query, }, rulerLimitsConfig: { 'ruler.max-rules-per-rule-group': $._config.limits.ruler_max_rules_per_rule_group, @@ -353,8 +258,6 @@ max_global_series_per_user: 150000, max_global_series_per_metric: 20000, - max_series_per_query: 100000, - ingestion_rate: 10000, ingestion_burst_size: 200000, @@ -364,6 +267,8 @@ // No retention for now. 
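+ // ('0' disables retention: blocks are kept forever. Set a duration such as '720h' to retain 30 days.)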
compactor_blocks_retention_period: '0', + + ingestion_tenant_shard_size: 3, }, medium_small_user:: { @@ -373,14 +278,14 @@ max_global_series_per_user: 300000, max_global_series_per_metric: 30000, - max_series_per_query: 100000, - ingestion_rate: 30000, ingestion_burst_size: 300000, // 1000 rules ruler_max_rules_per_rule_group: 20, ruler_max_rule_groups_per_tenant: 50, + + ingestion_tenant_shard_size: 9, }, small_user:: { @@ -390,14 +295,14 @@ max_global_series_per_user: 1000000, max_global_series_per_metric: 100000, - max_series_per_query: 100000, - ingestion_rate: 100000, ingestion_burst_size: 1000000, // 1400 rules ruler_max_rules_per_rule_group: 20, ruler_max_rule_groups_per_tenant: 70, + + ingestion_tenant_shard_size: 15, }, medium_user:: { @@ -407,22 +312,20 @@ max_global_series_per_user: 3000000, // 3M max_global_series_per_metric: 300000, // 300K - max_series_per_query: 100000, - ingestion_rate: 350000, // 350K ingestion_burst_size: 3500000, // 3.5M // 1800 rules ruler_max_rules_per_rule_group: 20, ruler_max_rule_groups_per_tenant: 90, + + ingestion_tenant_shard_size: 30, }, big_user:: { max_series_per_metric: 0, // Disabled in favour of the max global limit max_series_per_user: 0, // Disabled in favour of the max global limit - max_series_per_query: 100000, - max_global_series_per_user: 6000000, // 6M max_global_series_per_metric: 600000, // 600K @@ -432,70 +335,152 @@ // 2200 rules ruler_max_rules_per_rule_group: 20, ruler_max_rule_groups_per_tenant: 110, + + ingestion_tenant_shard_size: 60, }, super_user:: { + compactor_tenant_shard_size: 2, max_series_per_metric: 0, // Disabled in favour of the max global limit max_series_per_user: 0, // Disabled in favour of the max global limit max_global_series_per_user: 12000000, // 12M max_global_series_per_metric: 1200000, // 1.2M - max_series_per_query: 100000, - ingestion_rate: 1500000, // 1.5M ingestion_burst_size: 15000000, // 15M // 2600 rules ruler_max_rules_per_rule_group: 20, ruler_max_rule_groups_per_tenant: 130, + + ingestion_tenant_shard_size: 120, }, // This user class has limits increased by +50% compared to the previous one. 
mega_user+:: { + compactor_tenant_shard_size: 2, max_series_per_metric: 0, // Disabled in favour of the max global limit max_series_per_user: 0, // Disabled in favour of the max global limit max_global_series_per_user: 16000000, // 16M max_global_series_per_metric: 1600000, // 1.6M - max_series_per_query: 100000, - ingestion_rate: 2250000, // 2.25M ingestion_burst_size: 22500000, // 22.5M // 3000 rules ruler_max_rules_per_rule_group: 20, ruler_max_rule_groups_per_tenant: 150, + + ingestion_tenant_shard_size: 180, + }, + + user_24M:: { // 50% more than previous + compactor_tenant_shard_size: 4, + max_series_per_metric: 0, // Disabled in favour of the max global limit + max_series_per_user: 0, // Disabled in favour of the max global limit + + max_global_series_per_user: 24000000, // 24M + max_global_series_per_metric: 2400000, // 2.4M + + ingestion_rate: 3000000, // 3M + ingestion_burst_size: 30000000, // 30M + + // 3400 rules + ruler_max_rules_per_rule_group: 20, + ruler_max_rule_groups_per_tenant: 170, + + ingestion_tenant_shard_size: 270, + }, + + user_32M:: { // 33% more than previous + compactor_tenant_shard_size: 4, + max_series_per_metric: 0, // Disabled in favour of the max global limit + max_series_per_user: 0, // Disabled in favour of the max global limit + + max_global_series_per_user: 32000000, // 32M + max_global_series_per_metric: 3200000, // 3.2M + + ingestion_rate: 4500000, // 4.5M + ingestion_burst_size: 45000000, // 45M + + // 3800 rules + ruler_max_rules_per_rule_group: 20, + ruler_max_rule_groups_per_tenant: 190, + + ingestion_tenant_shard_size: 360, + }, + + user_48M:: { // 50% more than previous + compactor_tenant_shard_size: 8, + max_series_per_metric: 0, // Disabled in favour of the max global limit + max_series_per_user: 0, // Disabled in favour of the max global limit + + max_global_series_per_user: 48000000, // 48M + max_global_series_per_metric: 4800000, // 4.8M + + ingestion_rate: 6000000, // 6M + ingestion_burst_size: 60000000, // 60M + + // 4200 rules + ruler_max_rules_per_rule_group: 20, + ruler_max_rule_groups_per_tenant: 210, + + ingestion_tenant_shard_size: 540, }, }, // if not empty, passed to overrides.yaml as another top-level field multi_kv_config: {}, - schemaID: std.md5(std.toString($._config.schema)), - enable_pod_priorities: true, alertmanager_enabled: false, // Enables query-scheduler component, and reconfigures querier and query-frontend to use it. - query_scheduler_enabled: false, + query_scheduler_enabled: true, // Enables streaming of chunks from ingesters using blocks. // Changing it will not cause new rollout of ingesters, as it gets passed to them via runtime-config. ingester_stream_chunks_when_using_blocks: true, // Ingester limits are put directly into runtime config, if not null. Available limits: - // ingester_instance_limits: { - // max_inflight_push_requests: 0, // Max inflight push requests per ingester. 0 = no limit. - // max_ingestion_rate: 0, // Max ingestion rate (samples/second) per ingester. 0 = no limit. - // max_series: 0, // Max number of series per ingester. 0 = no limit. - // max_tenants: 0, // Max number of tenants per ingester. 0 = no limit. - // }, - ingester_instance_limits: null, + ingester_instance_limits: { + // max_inflight_push_requests: 0, // Max inflight push requests per ingester. 0 = no limit. + // max_ingestion_rate: 0, // Max ingestion rate (samples/second) per ingester. 0 = no limit. + max_series: 4.8e+6, // Max number of series per ingester. 0 = no limit. 
4.8 million is closely tied to the 15Gi memory request per ingester + // max_tenants: 0, // Max number of tenants per ingester. 0 = no limit. + }, + + // If these are disabled, GOMAXPROCS and GOMEMLIMIT are derived from the resource limits instead, + // so make sure limits are set. Basing them on limits can potentially increase Cortex performance, + // but it also makes performance less consistent. + gomaxprocs_based_on_cpu_requests: true, + gomemlimit_based_on_mem_requests: true, + + gomaxprocs_resource: + if $._config.gomaxprocs_based_on_cpu_requests then + 'requests.cpu' + else + 'limits.cpu', + + gomemlimit_resource: + if $._config.gomemlimit_based_on_mem_requests then + 'requests.memory' + else + 'limits.memory', }, + go_container_mixin:: + local container = $.core.v1.container; + container.withEnvMixin([ + container.envType.withName('GOMAXPROCS') + + container.envType.valueFrom.resourceFieldRef.withResource($._config.gomaxprocs_resource), + container.envType.withName('GOMEMLIMIT') + + container.envType.valueFrom.resourceFieldRef.withResource($._config.gomemlimit_resource), + ]), + local configMap = $.core.v1.configMap, overrides_config: @@ -509,19 +494,6 @@ ), }), - storage_config: - configMap.new('schema-' + $._config.schemaID) + - configMap.withData({ - 'config.yaml': $.util.manifestYaml({ - configs: $._config.schema, - }), - }), - - local deployment = $.apps.v1.deployment, - storage_config_mixin:: - deployment.mixin.spec.template.metadata.withAnnotationsMixin({ schemaID: $._config.schemaID },) + - $.util.configVolumeMount('schema-' + $._config.schemaID, '/etc/cortex/schema'), - // This removed the CPU limit from the config. NB won't show up in subset // diffs, but ks apply will do the right thing. removeCPULimitsMixin:: { diff --git a/cortex/cortex-manifests.jsonnet.example b/cortex/cortex-manifests.jsonnet.example deleted file mode 100644 index 9abcc1b1..00000000 --- a/cortex/cortex-manifests.jsonnet.example +++ /dev/null @@ -1,27 +0,0 @@ -local cortex = import "cortex/cortex.libsonnet"; - -cortex { - _config+:: { - namespace: "default", - schema: [{ - from: '2019-11-15', - store: 'bigtable-hashed', - object_store: 'gcs', - schema: 'v10', - index: { - prefix: 'dev_index_', - period: '168h', - }, - chunks: { - prefix: 'dev_chunks_', - period: '168h', - }, - }], - - storage_backend: 'gcp', - bigtable_instance: 'example-instance-prod', - bigtable_project: 'example-project1-cortex', - ruler_client_type: 'gcs' - }, -} - diff --git a/cortex/cortex.libsonnet b/cortex/cortex.libsonnet index b8716d19..122aa80c 100644 --- a/cortex/cortex.libsonnet +++ b/cortex/cortex.libsonnet @@ -3,6 +3,7 @@ (import 'images.libsonnet') + (import 'common.libsonnet') + (import 'config.libsonnet') + +(import 'tsdb-config.libsonnet') + (import 'consul.libsonnet') + // Cortex services @@ -10,10 +11,11 @@ (import 'ingester.libsonnet') + (import 'querier.libsonnet') + (import 'query-frontend.libsonnet') + -(import 'table-manager.libsonnet') + (import 'ruler.libsonnet') + (import 'alertmanager.libsonnet') + (import 'query-scheduler.libsonnet') + +(import 'compactor.libsonnet') + +(import 'store-gateway.libsonnet') + // Supporting services (import 'etcd.libsonnet') + diff --git a/cortex/distributor.libsonnet b/cortex/distributor.libsonnet index ea22523e..28c6ea2f 100644 --- a/cortex/distributor.libsonnet +++ b/cortex/distributor.libsonnet @@ -1,5 +1,6 @@ { local container = $.core.v1.container, + local envType = container.envType, local containerPort = $.core.v1.containerPort, distributor_args:: $._config.grpcConfig + $._config.ringConfig + $._config.distributorConfig + 
$._config.distributorLimitsConfig + + $._config.ingesterClientConfig + { target: 'distributor', 'validation.reject-old-samples': true, 'validation.reject-old-samples.max-age': '12h', 'runtime-config.file': '/etc/cortex/overrides.yaml', - 'distributor.remote-timeout': '20s', 'distributor.ha-tracker.enable': true, 'distributor.ha-tracker.enable-for-all-users': true, @@ -21,11 +22,6 @@ 'distributor.ha-tracker.etcd.endpoints': 'etcd-client.%s.svc.cluster.local.:2379' % $._config.namespace, 'distributor.ha-tracker.prefix': 'prom_ha/', - // The memory requests are 2G, and we barely use 100M. - // By adding a ballast of 1G, we can drastically reduce GC, but also keep the usage at - // around 1.25G, reducing the 99%ile. - 'mem-ballast-size-bytes': 1 << 30, // 1GB - 'server.grpc.keepalive.max-connection-age': '2m', 'server.grpc.keepalive.max-connection-age-grace': '5m', 'server.grpc.keepalive.max-connection-idle': '1m', @@ -37,17 +33,23 @@ // Do not extend the replication set on unhealthy (or LEAVING) ingester when "unregister on shutdown" // is set to false. 'distributor.extend-writes': $._config.unregister_ingesters_on_shutdown, + 'distributor.instance-limits.max-inflight-push-requests': 60, //60 is very conservative to protect the distributor from OOMs }, + distributor_env_map:: { + }, + distributor_ports:: $.util.defaultPorts, distributor_container:: container.new('distributor', $._images.distributor) + container.withPorts($.distributor_ports) + container.withArgsMixin($.util.mapToFlags($.distributor_args)) + + container.withEnvMap($.distributor_env_map) + $.util.resourcesRequests('2', '2Gi') + $.util.resourcesLimits(null, '4Gi') + $.util.readinessProbe + + $.go_container_mixin + $.jaeger_mixin, local deployment = $.apps.v1.deployment, diff --git a/cortex/flusher-job-blocks.libsonnet b/cortex/flusher-job-blocks.libsonnet index 1e6266ca..56264c13 100644 --- a/cortex/flusher-job-blocks.libsonnet +++ b/cortex/flusher-job-blocks.libsonnet @@ -21,11 +21,16 @@ target: 'flusher', 'blocks-storage.tsdb.retention-period': '10000h', // don't delete old blocks too soon. 
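+ // 10000h is roughly 13 months, which effectively disables local block deletion while the flusher is uploading.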
})) + + container.withEnvMap($.flusher_env_map) + $.util.resourcesRequests('4', '15Gi') + $.util.resourcesLimits(null, '25Gi') + $.util.readinessProbe + + $.go_container_mixin + $.jaeger_mixin, + flusher_env_map:: { + }, + flusher_job_func(jobName, pvcName):: job.new() + job.mixin.spec.template.spec.withContainers([ diff --git a/cortex/flusher-job.libsonnet b/cortex/flusher-job.libsonnet deleted file mode 100644 index 4d9a5762..00000000 --- a/cortex/flusher-job.libsonnet +++ /dev/null @@ -1,51 +0,0 @@ -{ - // Usage example: - // local flusher_job = import 'cortex/flusher-job.libsonnet'; - // flusher_job + { - // flusher_job: - // $.flusher_job_func('pvc-af8947e6-182e-11ea-82e4-42010a9a0137', 'ingester-pvc-ingester-5'), - // } - - local container = $.core.v1.container, - local job = $.batch.v1.job, - local volumeMount = $.core.v1.volumeMount, - local volume = $.core.v1.volume, - - flusher_container:: - container.new('flusher', $._images.flusher) + - container.withPorts($.util.defaultPorts) + - container.withArgsMixin($.util.mapToFlags($.ingester_args { - target: 'flusher', - 'flusher.wal-dir': $._config.wal_dir, - })) + - $.util.resourcesRequests('4', '15Gi') + - $.util.resourcesLimits(null, '25Gi') + - $.util.readinessProbe + - $.jaeger_mixin, - - flusher_job_storage_config_mixin:: - job.mixin.metadata.withAnnotationsMixin({ schemaID: $._config.schemaID },) + - $.util.configVolumeMount('schema-' + $._config.schemaID, '/etc/cortex/schema'), - - flusher_job_func(volumeName, pvcName):: - job.new() + - job.mixin.spec.template.spec.withContainers([ - $.flusher_container + - container.withVolumeMountsMixin([ - volumeMount.new(volumeName, $._config.wal_dir), - ]), - ]) + - job.mixin.spec.template.spec.withRestartPolicy('Never') + - job.mixin.spec.template.spec.withVolumes([ - volume.fromPersistentVolumeClaim(volumeName, pvcName), - ]) + - $.flusher_job_storage_config_mixin + - job.mixin.metadata.withName('flusher') + - job.mixin.metadata.withNamespace($._config.namespace) + - job.mixin.metadata.withLabels({ name: 'flusher' }) + - job.mixin.spec.template.metadata.withLabels({ name: 'flusher' }) + - job.mixin.spec.template.spec.securityContext.withRunAsUser(0) + - job.mixin.spec.template.spec.withTerminationGracePeriodSeconds(300) + - $.util.configVolumeMount($._config.overrides_configmap, '/etc/cortex') + - $.util.podPriority('high'), -} diff --git a/cortex/gcs/main.jsonnet.example b/cortex/gcs/main.jsonnet.example new file mode 100644 index 00000000..99d40caf --- /dev/null +++ b/cortex/gcs/main.jsonnet.example @@ -0,0 +1,22 @@ +local cortex = import 'cortex/cortex.libsonnet'; + +cortex { + _config+:: { + namespace: 'default', + + blocks_storage_backend: 'gcs', + blocks_storage_bucket_name: 'example-bucket', + + // Cortex Ruler config. + ruler_enabled: true, + ruler_client_type: 'gcs', + ruler_storage_bucket_name: 'ruler-example-bucket', + + // Cortex Alertmanager config + alertmanager_enabled: true, + alertmanager_client_type: 'gcs', + alertmanager_gcs_bucket_name: 'alertmanager-example-bucket', + external_url: 'https://cortex.example.com', //Alertmanager UI + cluster: 'cluster', + }, +} diff --git a/cortex/images.libsonnet b/cortex/images.libsonnet index 1eb891c4..0a0238cf 100644 --- a/cortex/images.libsonnet +++ b/cortex/images.libsonnet @@ -1,11 +1,11 @@ { _images+:: { // Various third-party images. 
- memcached: 'memcached:1.6.9-alpine', - memcachedExporter: 'prom/memcached-exporter:v0.6.0', + memcached: 'memcached:1.6.23-alpine', + memcachedExporter: 'prom/memcached-exporter:v0.14.2', // Our services. - cortex: 'cortexproject/cortex:v1.11.0', + cortex: 'cortexproject/cortex:v1.17.1', alertmanager: self.cortex, distributor: self.cortex, @@ -19,8 +19,8 @@ store_gateway: self.cortex, query_scheduler: self.cortex, - cortex_tools: 'grafana/cortex-tools:v0.4.0', - query_tee: 'quay.io/cortexproject/query-tee:v1.11.0', - testExporter: 'cortexproject/test-exporter:v1.11.0', + overrides_exporter: self.cortex, + query_tee: 'quay.io/cortexproject/query-tee:v1.17.1', + testExporter: 'cortexproject/test-exporter:v1.17.1', }, } diff --git a/cortex/ingester.libsonnet b/cortex/ingester.libsonnet index 3078db36..11e22f58 100644 --- a/cortex/ingester.libsonnet +++ b/cortex/ingester.libsonnet @@ -1,9 +1,22 @@ { + local podDisruptionBudget = $.policy.v1.podDisruptionBudget, + local pvc = $.core.v1.persistentVolumeClaim, + local statefulSet = $.apps.v1.statefulSet, + local volume = $.core.v1.volume, + local volumeMount = $.core.v1.volumeMount, + + // The ingesters should persist TSDB blocks and WAL on a persistent + // volume in order to be crash resilient. + local ingester_data_pvc = + pvc.new() + + pvc.mixin.spec.resources.withRequests({ storage: $._config.cortex_ingester_data_disk_size }) + + pvc.mixin.spec.withAccessModes(['ReadWriteOnce']) + + pvc.mixin.spec.withStorageClassName($._config.cortex_ingester_data_disk_class) + + pvc.mixin.metadata.withName('ingester-data'), + ingester_args:: $._config.grpcConfig + $._config.ringConfig + - $._config.storeConfig + - $._config.storageConfig + $._config.blocksStorageConfig + $._config.distributorConfig + // This adds the distributor ring flags to the ingester. $._config.ingesterLimitsConfig + @@ -12,73 +25,47 @@ // Ring config. 'ingester.num-tokens': 512, - 'ingester.join-after': '30s', - 'ingester.max-transfer-retries': 60, // Each retry is backed off by 5s, so 5mins for new ingester to come up. + 'ingester.join-after': '0s', 'ingester.heartbeat-period': '15s', - 'ingester.max-stale-chunk-idle': '5m', 'ingester.unregister-on-shutdown': $._config.unregister_ingesters_on_shutdown, - // Chunk building/flushing config. - 'ingester.chunk-encoding': 3, // Bigchunk encoding - 'ingester.retain-period': '15m', - 'ingester.max-chunk-age': '6h', - // Limits config. - 'ingester.max-chunk-idle': $._config.max_chunk_idle, 'runtime-config.file': '/etc/cortex/overrides.yaml', 'server.grpc-max-concurrent-streams': 10000, 'server.grpc-max-send-msg-size-bytes': 10 * 1024 * 1024, 'server.grpc-max-recv-msg-size-bytes': 10 * 1024 * 1024, - } + ( - if $._config.memcached_index_writes_enabled then - { - // Setup index write deduping. 
- 'store.index-cache-write.memcached.hostname': 'memcached-index-writes.%(namespace)s.svc.cluster.local' % $._config, - 'store.index-cache-write.memcached.service': 'memcached-client', - } - else {} - ), - - ingester_statefulset_args:: - $._config.grpcConfig - { - 'ingester.wal-enabled': true, - 'ingester.checkpoint-enabled': true, - 'ingester.recover-from-wal': true, - 'ingester.wal-dir': $._config.ingester.wal_dir, - 'ingester.checkpoint-duration': '15m', - '-log.level': 'info', - 'ingester.tokens-file-path': $._config.ingester.wal_dir + '/tokens', + + 'blocks-storage.tsdb.dir': '/data/tsdb', + 'blocks-storage.tsdb.block-ranges-period': '2h', + 'blocks-storage.tsdb.retention-period': '96h', // 4 days protection against blocks not being uploaded from ingesters. + 'blocks-storage.tsdb.ship-interval': '1m', + + // Persist ring tokens so that when the ingester will be restarted + // it will pick the same tokens + 'ingester.tokens-file-path': '/data/tokens', }, ingester_ports:: $.util.defaultPorts, local name = 'ingester', local container = $.core.v1.container, + local envType = container.envType, ingester_container:: container.new(name, $._images.ingester) + container.withPorts($.ingester_ports) + container.withArgsMixin($.util.mapToFlags($.ingester_args)) + + container.withEnvMap($.ingester_env_map) + $.util.resourcesRequests('4', '15Gi') + $.util.resourcesLimits(null, '25Gi') + $.util.readinessProbe + + $.go_container_mixin + $.jaeger_mixin, - local volumeMount = $.core.v1.volumeMount, - - ingester_statefulset_container:: - $.ingester_container + - container.withArgsMixin($.util.mapToFlags($.ingester_statefulset_args)) + - container.withVolumeMountsMixin([ - volumeMount.new('ingester-pvc', $._config.ingester.wal_dir), - ]), - ingester_deployment_labels:: {}, - local pvc = $.core.v1.persistentVolumeClaim, - local volume = $.core.v1.volume, - local statefulSet = $.apps.v1.statefulSet, + ingester_env_map:: { + }, local ingester_pvc = pvc.new('ingester-pvc') + @@ -86,60 +73,43 @@ pvc.mixin.spec.withAccessModes(['ReadWriteOnce']) + pvc.mixin.spec.withStorageClassName('fast'), - statefulset_storage_config_mixin:: - statefulSet.mixin.spec.template.metadata.withAnnotationsMixin({ schemaID: $._config.schemaID },) + - $.util.configVolumeMount('schema-' + $._config.schemaID, '/etc/cortex/schema'), - - ingester_statefulset: - if $._config.ingester_deployment_without_wal == false then - statefulSet.new('ingester', 3, [$.ingester_statefulset_container], ingester_pvc) + - statefulSet.mixin.spec.withServiceName('ingester') + - statefulSet.mixin.spec.template.spec.withVolumes([volume.fromPersistentVolumeClaim('ingester-pvc', 'ingester-pvc')]) + - statefulSet.mixin.metadata.withNamespace($._config.namespace) + - statefulSet.mixin.metadata.withLabels({ name: 'ingester' }) + - statefulSet.mixin.spec.template.metadata.withLabels({ name: 'ingester' } + $.ingester_deployment_labels) + - statefulSet.mixin.spec.selector.withMatchLabels({ name: 'ingester' }) + - statefulSet.mixin.spec.template.spec.securityContext.withRunAsUser(0) + - statefulSet.mixin.spec.template.spec.withTerminationGracePeriodSeconds(4800) + - statefulSet.mixin.spec.updateStrategy.withType('RollingUpdate') + - $.statefulset_storage_config_mixin + - $.util.configVolumeMount($._config.overrides_configmap, '/etc/cortex') + - $.util.podPriority('high') + - $.util.antiAffinityStatefulSet - else null, - - local deployment = $.apps.v1.deployment, - - ingester_deployment: - if $._config.ingester_deployment_without_wal then - deployment.new(name, 3, 
[$.ingester_container], $.ingester_deployment_labels) + - $.util.antiAffinity + - $.util.configVolumeMount($._config.overrides_configmap, '/etc/cortex') + - deployment.mixin.metadata.withLabels({ name: name }) + - deployment.mixin.spec.withMinReadySeconds(60) + - deployment.mixin.spec.strategy.rollingUpdate.withMaxSurge(0) + - deployment.mixin.spec.strategy.rollingUpdate.withMaxUnavailable(1) + - deployment.mixin.spec.template.spec.withTerminationGracePeriodSeconds(4800) + - $.storage_config_mixin + - $.util.podPriority('high') - else null, - ingester_service_ignored_labels:: [], - ingester_service: - if $._config.ingester_deployment_without_wal then - $.util.serviceFor($.ingester_deployment, $.ingester_service_ignored_labels) - else - $.util.serviceFor($.ingester_statefulset, $.ingester_service_ignored_labels), - - local podDisruptionBudget = $.policy.v1beta1.podDisruptionBudget, - newIngesterPdb(pdbName, ingesterName):: - podDisruptionBudget.new() + - podDisruptionBudget.mixin.metadata.withName(pdbName) + + podDisruptionBudget.new(pdbName) + podDisruptionBudget.mixin.metadata.withLabels({ name: pdbName }) + podDisruptionBudget.mixin.spec.selector.withMatchLabels({ name: ingesterName }) + podDisruptionBudget.mixin.spec.withMaxUnavailable(1), ingester_pdb: self.newIngesterPdb('ingester-pdb', name), + + newIngesterStatefulSet(name, container, with_anti_affinity=true):: + statefulSet.new(name, 3, [ + container + $.core.v1.container.withVolumeMountsMixin([ + volumeMount.new('ingester-data', '/data'), + ]), + ], ingester_data_pvc) + + statefulSet.mixin.spec.withServiceName(name) + + statefulSet.mixin.metadata.withNamespace($._config.namespace) + + statefulSet.mixin.metadata.withLabels({ name: name }) + + statefulSet.mixin.spec.template.metadata.withLabels({ name: name } + $.ingester_deployment_labels) + + statefulSet.mixin.spec.selector.withMatchLabels({ name: name }) + + statefulSet.mixin.spec.template.spec.securityContext.withRunAsUser(0) + + // When the ingester needs to flush blocks to the storage, it may take quite a lot of time. + // For this reason, we grant a high termination period (20 minutes). + statefulSet.mixin.spec.template.spec.withTerminationGracePeriodSeconds(1200) + + statefulSet.mixin.spec.updateStrategy.withType('RollingUpdate') + + $.util.configVolumeMount($._config.overrides_configmap, '/etc/cortex') + + $.util.podPriority('high') + + // Parallelly scale up/down ingester instances instead of starting them + // one by one. This does NOT affect rolling updates: they will continue to be + // rolled out one by one (the next pod will be rolled out once the previous is + // ready). + statefulSet.mixin.spec.withPodManagementPolicy('Parallel') + + (if with_anti_affinity then $.util.antiAffinity else {}), + + ingester_statefulset: self.newIngesterStatefulSet('ingester', $.ingester_container), + + ingester_service: + $.util.serviceFor($.ingester_statefulset, $.ingester_service_ignored_labels), } diff --git a/cortex/memcached.libsonnet b/cortex/memcached.libsonnet index 011328c3..fb8d2e3e 100644 --- a/cortex/memcached.libsonnet +++ b/cortex/memcached.libsonnet @@ -38,14 +38,6 @@ memcached { } else {}, - // Dedicated memcached instance used to dedupe writes to the index. - memcached_index_writes: if $._config.memcached_index_writes_enabled then - $.memcached { - name: 'memcached-index-writes', - max_item_size: '%dm' % [$._config.memcached_index_writes_max_item_size_mb], - } - else {}, - // Memcached instance used to cache chunks. 
memcached_chunks: if $._config.memcached_chunks_enabled then $.memcached { diff --git a/cortex/overrides-exporter.libsonnet b/cortex/overrides-exporter.libsonnet index d8eb411a..8fbf4acc 100644 --- a/cortex/overrides-exporter.libsonnet +++ b/cortex/overrides-exporter.libsonnet @@ -1,65 +1,32 @@ -// this enables overrides exporter, which will expose the configured -// overrides and presets (if configured). Those metrics can be potentially -// high cardinality. +// this enables overrides exporter, which will expose the configured overrides. { local name = 'overrides-exporter', - _config+: { - // overrides exporter can also make the configured presets available, this - // list references entries within $._config.overrides - - overrides_exporter_presets:: [ - 'extra_small_user', - 'small_user', - 'medium_user', - 'big_user', - 'super_user', - 'mega_user', - ], - }, - - local presets_enabled = std.length($._config.overrides_exporter_presets) > 0, - - local configMap = $.core.v1.configMap, - overrides_exporter_presets_configmap: - if presets_enabled then - configMap.new('overrides-presets') + - configMap.withData({ - 'overrides-presets.yaml': $.util.manifestYaml( - { - presets: { - [key]: $._config.overrides[key] - for key in $._config.overrides_exporter_presets - }, - } - ), - }), - local containerPort = $.core.v1.containerPort, - overrides_exporter_port:: containerPort.newNamed(name='http-metrics', containerPort=9683), + overrides_exporter_port:: containerPort.newNamed(name='http-metrics', containerPort=80), overrides_exporter_args:: { - 'overrides-file': '/etc/cortex/overrides.yaml', - } + if presets_enabled then { - 'presets-file': '/etc/cortex_presets/overrides-presets.yaml', - } else {}, + target: 'overrides-exporter', + + 'runtime-config.file': '/etc/cortex/overrides.yaml', + }, local container = $.core.v1.container, overrides_exporter_container:: - container.new(name, $._images.cortex_tools) + + container.new(name, $._images.overrides_exporter) + container.withPorts([ $.overrides_exporter_port, ]) + - container.withArgsMixin([name] + $.util.mapToFlags($.overrides_exporter_args, prefix='--')) + + container.withArgsMixin($.util.mapToFlags($.overrides_exporter_args, prefix='--')) + $.util.resourcesRequests('0.5', '0.5Gi') + $.util.readinessProbe + + $.go_container_mixin + container.mixin.readinessProbe.httpGet.withPort($.overrides_exporter_port.name), local deployment = $.apps.v1.deployment, overrides_exporter_deployment: deployment.new(name, 1, [$.overrides_exporter_container], { name: name }) + $.util.configVolumeMount($._config.overrides_configmap, '/etc/cortex') + - $.util.configVolumeMount('overrides-presets', '/etc/cortex_presets') + deployment.mixin.metadata.withLabels({ name: name }), overrides_exporter_service: diff --git a/cortex/querier.libsonnet b/cortex/querier.libsonnet index eb807ee2..026b2825 100644 --- a/cortex/querier.libsonnet +++ b/cortex/querier.libsonnet @@ -1,15 +1,18 @@ { local container = $.core.v1.container, + local envType = container.envType, querier_args:: $._config.grpcConfig + $._config.ringConfig + - $._config.storeConfig + - $._config.storageConfig + $._config.blocksStorageConfig + $._config.queryConfig + $._config.queryEngineConfig + $._config.distributorConfig + + $._config.ingesterClientConfig + + $._config.queryBlocksStorageConfig + + $.blocks_metadata_caching_config + + $.bucket_index_config + { target: 'querier', @@ -25,12 +28,6 @@ 'querier.frontend-address': 'query-frontend-discovery.%(namespace)s.svc.cluster.local:9095' % $._config, 
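+ // query-frontend-discovery is assumed to be a headless service, so each querier resolves and connects to every query-frontend replica instead of load-balancing through a single ClusterIP.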
'querier.frontend-client.grpc-max-send-msg-size': 100 << 20, - 'querier.second-store-engine': $._config.querier_second_storage_engine, - - // We request high memory but the Go heap is typically very low (< 100MB) and this causes - // the GC to trigger continuously. Setting a ballast of 256MB reduces GC. - 'mem-ballast-size-bytes': 1 << 28, // 256M - 'log.level': 'debug', }, @@ -47,7 +44,8 @@ $.jaeger_mixin + $.util.readinessProbe + container.withEnvMap($.querier_env_map) + - $.util.resourcesRequests('1', '12Gi') + + $.go_container_mixin + + $.util.resourcesRequests('2', '12Gi') + $.util.resourcesLimits(null, '24Gi'), local deployment = $.apps.v1.deployment, @@ -59,8 +57,7 @@ (if $._config.cortex_querier_allow_multiple_replicas_on_same_node then {} else $.util.antiAffinity) + $.util.configVolumeMount($._config.overrides_configmap, '/etc/cortex') + deployment.mixin.spec.strategy.rollingUpdate.withMaxSurge(5) + - deployment.mixin.spec.strategy.rollingUpdate.withMaxUnavailable(1) + - $.storage_config_mixin, + deployment.mixin.spec.strategy.rollingUpdate.withMaxUnavailable(1), querier_deployment: self.newQuerierDeployment('querier', $.querier_container), diff --git a/cortex/query-frontend.libsonnet b/cortex/query-frontend.libsonnet index 80f36d04..a0552215 100644 --- a/cortex/query-frontend.libsonnet +++ b/cortex/query-frontend.libsonnet @@ -1,14 +1,18 @@ { local container = $.core.v1.container, + local envType = container.envType, query_frontend_args:: $._config.grpcConfig { target: 'query-frontend', - // Need log.level=debug so all queries are logged, needed for analyse.py. + // Need log.level=debug to see trace id for queries 'log.level': 'debug', + // a message with some statistics is logged for every query. + 'frontend.query-stats-enabled': true, + // Increase HTTP server response write timeout, as we were seeing some // queries that return a lot of data timing out. 
'server.http-write-timeout': '1m', @@ -42,11 +46,16 @@ container.new('query-frontend', $._images.query_frontend) + container.withPorts($.util.defaultPorts) + container.withArgsMixin($.util.mapToFlags($.query_frontend_args)) + + container.withEnvMap($.query_frontend_env_map) + + $.go_container_mixin + $.jaeger_mixin + $.util.readinessProbe + $.util.resourcesRequests('2', '600Mi') + $.util.resourcesLimits(null, '1200Mi'), + query_frontend_env_map:: { + }, + local deployment = $.apps.v1.deployment, newQueryFrontendDeployment(name, container):: diff --git a/cortex/query-scheduler.libsonnet b/cortex/query-scheduler.libsonnet index 604d258a..8aa5cf74 100644 --- a/cortex/query-scheduler.libsonnet +++ b/cortex/query-scheduler.libsonnet @@ -3,6 +3,7 @@ { local container = $.core.v1.container, local deployment = $.apps.v1.deployment, + local envType = container.envType, local service = $.core.v1.service, query_scheduler_args+:: @@ -17,6 +18,8 @@ container.new('query-scheduler', $._images.query_scheduler) + container.withPorts($.util.defaultPorts) + container.withArgsMixin($.util.mapToFlags($.query_scheduler_args)) + + container.withEnvMap($.query_scheduler_env_map) + + $.go_container_mixin + $.jaeger_mixin + $.util.readinessProbe + $.util.resourcesRequests('2', '1Gi') + @@ -30,6 +33,9 @@ deployment.mixin.spec.strategy.rollingUpdate.withMaxSurge(0) + deployment.mixin.spec.strategy.rollingUpdate.withMaxUnavailable(1), + query_scheduler_env_map:: { + }, + query_scheduler_deployment: if !$._config.query_scheduler_enabled then {} else self.newQuerySchedulerDeployment('query-scheduler', $.query_scheduler_container), diff --git a/cortex/query-tee.libsonnet b/cortex/query-tee.libsonnet index 4ac3b0a1..9856c34b 100644 --- a/cortex/query-tee.libsonnet +++ b/cortex/query-tee.libsonnet @@ -2,6 +2,7 @@ local container = $.core.v1.container, local containerPort = $.core.v1.containerPort, local deployment = $.apps.v1.deployment, + local envType = container.envType, local service = $.core.v1.service, local servicePort = $.core.v1.servicePort, @@ -18,9 +19,14 @@ containerPort.newNamed(name='http-metrics', containerPort=9900), ]) + container.withArgsMixin($.util.mapToFlags($.query_tee_args)) + + container.withEnvMap($.query_tee_env_map) + $.util.resourcesRequests('1', '512Mi') + + $.go_container_mixin + $.jaeger_mixin, + query_tee_env_map:: { + }, + query_tee_deployment: if !($._config.query_tee_enabled) then {} else deployment.new('query-tee', 2, [$.query_tee_container]), diff --git a/cortex/ruler.libsonnet b/cortex/ruler.libsonnet index a7df54fd..96781a19 100644 --- a/cortex/ruler.libsonnet +++ b/cortex/ruler.libsonnet @@ -1,17 +1,20 @@ { local container = $.core.v1.container, + local envType = container.envType, ruler_args:: $._config.grpcConfig + $._config.ringConfig + - $._config.storeConfig + - $._config.storageConfig + $._config.blocksStorageConfig + $._config.queryConfig + $._config.queryEngineConfig + $._config.distributorConfig + + $._config.ingesterClientConfig + $._config.rulerClientConfig + $._config.rulerLimitsConfig + + $._config.queryBlocksStorageConfig + + $.blocks_metadata_caching_config + + $.bucket_index_config + { target: 'ruler', // Alertmanager configs @@ -27,12 +30,12 @@ 'server.grpc-max-send-msg-size-bytes': 10 * 1024 * 1024, 'server.grpc-max-recv-msg-size-bytes': 10 * 1024 * 1024, - // Storage - 'querier.second-store-engine': $._config.querier_second_storage_engine, - // Do not extend the replication set on unhealthy (or LEAVING) ingester when "unregister on shutdown" // is set to false. 
'distributor.extend-writes': $._config.unregister_ingesters_on_shutdown, + + // A message with some statistics is logged for every query. + 'ruler.query-stats-enabled': true, }, ruler_container:: @@ -40,7 +43,9 @@ container.new('ruler', $._images.ruler) + container.withPorts($.util.defaultPorts) + container.withArgsMixin($.util.mapToFlags($.ruler_args)) + - $.util.resourcesRequests('1', '6Gi') + + container.withEnvMap($.ruler_env_map) + + $.go_container_mixin + + $.util.resourcesRequests('2', '6Gi') + $.util.resourcesLimits('16', '16Gi') + $.util.readinessProbe + $.jaeger_mixin @@ -55,10 +60,12 @@ deployment.mixin.spec.strategy.rollingUpdate.withMaxUnavailable(1) + deployment.mixin.spec.template.spec.withTerminationGracePeriodSeconds(600) + (if $._config.cortex_ruler_allow_multiple_replicas_on_same_node then {} else $.util.antiAffinity) + - $.util.configVolumeMount($._config.overrides_configmap, '/etc/cortex') + - $.storage_config_mixin + $.util.configVolumeMount($._config.overrides_configmap, '/etc/cortex') else {}, + ruler_env_map:: { + }, + local service = $.core.v1.service, ruler_service: diff --git a/cortex/s3/main.jsonnet.example b/cortex/s3/main.jsonnet.example new file mode 100644 index 00000000..0d961ffb --- /dev/null +++ b/cortex/s3/main.jsonnet.example @@ -0,0 +1,24 @@ +local cortex = import 'cortex/cortex.libsonnet'; + +cortex { + _config+:: { + namespace: 'default', + + blocks_storage_backend: 's3', + blocks_storage_bucket_name: 'blocks-example-bucket', + + aws_region: 'replace-with-valid-region', // For example 'us-east-2', + + // Cortex Ruler config + ruler_enabled: true, + ruler_client_type: 's3', + ruler_storage_bucket_name: 'ruler-example-bucket', + + // Cortex Alertmanager config + alertmanager_enabled: true, + alertmanager_client_type: 's3', + alertmanager_s3_bucket_name: 'alertmanager-example-bucket', + external_url: 'https://cortex.example.com', // Alertmanager UI + cluster: 'cluster', + }, +} diff --git a/cortex/store-gateway.libsonnet b/cortex/store-gateway.libsonnet new file mode 100644 index 00000000..c21ee302 --- /dev/null +++ b/cortex/store-gateway.libsonnet @@ -0,0 +1,84 @@ +{ + local container = $.core.v1.container, + local envType = container.envType, + local podDisruptionBudget = $.policy.v1.podDisruptionBudget, + local pvc = $.core.v1.persistentVolumeClaim, + local statefulSet = $.apps.v1.statefulSet, + local volumeMount = $.core.v1.volumeMount, + + // The store-gateway runs a statefulset. + local store_gateway_data_pvc = + pvc.new() + + pvc.mixin.spec.resources.withRequests({ storage: $._config.cortex_store_gateway_data_disk_size }) + + pvc.mixin.spec.withAccessModes(['ReadWriteOnce']) + + pvc.mixin.spec.withStorageClassName($._config.cortex_store_gateway_data_disk_class) + + pvc.mixin.metadata.withName('store-gateway-data'), + + store_gateway_args:: + $._config.grpcConfig + + $._config.blocksStorageConfig + + $._config.queryBlocksStorageConfig + + { + target: 'store-gateway', + 'runtime-config.file': '/etc/cortex/overrides.yaml', + + // Persist ring tokens so that when the store-gateway is restarted + // it will pick up the same tokens + 'store-gateway.sharding-ring.tokens-file-path': '/data/tokens', + + // Block index-headers are pre-downloaded but lazily mmapped and loaded at query time.
+ 'blocks-storage.bucket-store.index-header-lazy-loading-enabled': 'true', + 'blocks-storage.bucket-store.index-header-lazy-loading-idle-timeout': '60m', + 'blocks-storage.bucket-store.max-chunk-pool-bytes': 12 * 1024 * 1024 * 1024, + } + + $.blocks_chunks_caching_config + + $.blocks_metadata_caching_config + + $.bucket_index_config, + + store_gateway_ports:: $.util.defaultPorts, + + store_gateway_container:: + container.new('store-gateway', $._images.store_gateway) + + container.withPorts($.store_gateway_ports) + + container.withArgsMixin($.util.mapToFlags($.store_gateway_args)) + + container.withEnvMap($.store_gateway_env_map) + + $.go_container_mixin + + container.withVolumeMountsMixin([volumeMount.new('store-gateway-data', '/data')]) + + $.util.resourcesRequests('2', '12Gi') + + $.util.resourcesLimits(null, '18Gi') + + $.util.readinessProbe + + $.jaeger_mixin, + + store_gateway_env_map:: { + }, + + newStoreGatewayStatefulSet(name, container):: + statefulSet.new(name, 3, [container], store_gateway_data_pvc) + + statefulSet.mixin.spec.withServiceName(name) + + statefulSet.mixin.metadata.withNamespace($._config.namespace) + + statefulSet.mixin.metadata.withLabels({ name: name }) + + statefulSet.mixin.spec.template.metadata.withLabels({ name: name }) + + statefulSet.mixin.spec.selector.withMatchLabels({ name: name }) + + statefulSet.mixin.spec.template.spec.securityContext.withRunAsUser(0) + + statefulSet.mixin.spec.updateStrategy.withType('RollingUpdate') + + statefulSet.mixin.spec.template.spec.withTerminationGracePeriodSeconds(120) + + // Scale up/down store-gateway instances in parallel instead of starting them + // one by one. This does NOT affect rolling updates: they will continue to be + // rolled out one by one (the next pod will be rolled out once the previous is + // ready). + statefulSet.mixin.spec.withPodManagementPolicy('Parallel') + + $.util.configVolumeMount($._config.overrides_configmap, '/etc/cortex'), + + store_gateway_statefulset: self.newStoreGatewayStatefulSet('store-gateway', $.store_gateway_container), + + store_gateway_service: + $.util.serviceFor($.store_gateway_statefulset), + + store_gateway_pdb: + podDisruptionBudget.new('store-gateway-pdb') + + podDisruptionBudget.mixin.metadata.withLabels({ name: 'store-gateway-pdb' }) + + podDisruptionBudget.mixin.spec.selector.withMatchLabels({ name: 'store-gateway' }) + + // To avoid any disruption in the read path we need at least 1 replica of each + // block available, so the disruption budget depends on the blocks replication factor. + podDisruptionBudget.mixin.spec.withMaxUnavailable(if $._config.store_gateway_replication_factor > 1 then $._config.store_gateway_replication_factor - 1 else 1), +} diff --git a/cortex/table-manager.libsonnet b/cortex/table-manager.libsonnet deleted file mode 100644 index 90cb733c..00000000 --- a/cortex/table-manager.libsonnet +++ /dev/null @@ -1,44 +0,0 @@ -{ - local container = $.core.v1.container, - - table_manager_args:: - $._config.storageConfig - { - target: 'table-manager', - - // Rate limit Bigtable Admin calls. Google seem to limit to ~100QPS, - // and given 2yrs worth of tables (~100) a sync will table 20s. This - // allows you to run upto 20 independant Cortex clusters on the same - // Google project before running into issues.
- 'bigtable.grpc-client-rate-limit': 5.0, - 'bigtable.grpc-client-rate-limit-burst': 5, - 'bigtable.backoff-on-ratelimits': true, - 'bigtable.table-cache.enabled': true, - 'table-manager.poll-interval': '10m', - 'table-manager.periodic-table.grace-period': '3h', - }, - - table_manager_container:: - if $._config.table_manager_enabled then - container.new('table-manager', $._images.tableManager) + - container.withPorts($.util.defaultPorts) + - container.withArgsMixin($.util.mapToFlags($.table_manager_args)) + - $.util.resourcesRequests('100m', '100Mi') + - $.util.resourcesLimits('200m', '200Mi') + - $.util.readinessProbe + - $.jaeger_mixin - else {}, - - local deployment = $.apps.v1.deployment, - - table_manager_deployment: - if $._config.table_manager_enabled then - deployment.new('table-manager', 1, [$.table_manager_container]) + - $.storage_config_mixin - else {}, - - table_manager_service: - if $._config.table_manager_enabled then - $.util.serviceFor($.table_manager_deployment) - else {}, -} diff --git a/cortex/test-exporter.libsonnet b/cortex/test-exporter.libsonnet index 9d69abee..e7d088ef 100644 --- a/cortex/test-exporter.libsonnet +++ b/cortex/test-exporter.libsonnet @@ -18,10 +18,15 @@ container.new('test-exporter', $._images.testExporter) + container.withPorts($.util.defaultPorts) + container.withArgsMixin($.util.mapToFlags($.test_exporter_args)) + + container.withEnvMap($.test_exporter_env_map) + $.util.resourcesRequests('100m', '100Mi') + $.util.resourcesLimits('100m', '100Mi') + + $.go_container_mixin + $.jaeger_mixin, + test_exporter_env_map:: { + }, + local deployment = $.apps.v1.deployment, test_exporter_deployment: diff --git a/cortex/tsdb-config.libsonnet b/cortex/tsdb-config.libsonnet new file mode 100644 index 00000000..365a9b5e --- /dev/null +++ b/cortex/tsdb-config.libsonnet @@ -0,0 +1,82 @@ +{ + _config+:: { + // Enforce blocks storage + storage_backend: 'none', + storage_engine: 'blocks', + + // Allow configuring the ingester disk. + cortex_ingester_data_disk_size: '100Gi', + cortex_ingester_data_disk_class: 'fast', + + // Allow configuring the store-gateway disk. + cortex_store_gateway_data_disk_size: '50Gi', + cortex_store_gateway_data_disk_class: 'standard', + + // Allow configuring the compactor disk. + cortex_compactor_data_disk_size: '500Gi', + cortex_compactor_data_disk_class: 'fast', + + // Allow fine-tuning the compactor. + cortex_compactor_max_concurrency: 1, + // While this is the default value, we want to pass the same value to -blocks-storage.bucket-store.sync-interval. + cortex_compactor_cleanup_interval: '15m', + + // Enable use of the bucket index by the querier, ruler and store-gateway. + // The bucket index has been generated by the compactor since Cortex 1.7; no flag is required to enable it on the compactor. + cortex_bucket_index_enabled: true, + }, + + // We should keep a number of idle connections equal to the max "get" concurrency, + // in order to avoid re-opening connections continuously (this would be slower + // and fill up the conntrack table too). + // + // The downside of this approach is that we'll end up with a higher number of + // active connections to memcached, so we have to make sure the connection limit + // set in memcached is high enough.
+ + blocks_chunks_caching_config:: + ( + if $._config.memcached_index_queries_enabled then { + 'blocks-storage.bucket-store.index-cache.backend': 'memcached', + 'blocks-storage.bucket-store.index-cache.memcached.addresses': 'dnssrvnoa+memcached-index-queries.%(namespace)s.svc.cluster.local:11211' % $._config, + 'blocks-storage.bucket-store.index-cache.memcached.timeout': '200ms', + 'blocks-storage.bucket-store.index-cache.memcached.max-item-size': $._config.memcached_index_queries_max_item_size_mb * 1024 * 1024, + 'blocks-storage.bucket-store.index-cache.memcached.max-async-buffer-size': '25000', + 'blocks-storage.bucket-store.index-cache.memcached.max-async-concurrency': '50', + 'blocks-storage.bucket-store.index-cache.memcached.max-get-multi-batch-size': '100', + 'blocks-storage.bucket-store.index-cache.memcached.max-get-multi-concurrency': 100, + 'blocks-storage.bucket-store.index-cache.memcached.max-idle-connections': self['blocks-storage.bucket-store.index-cache.memcached.max-get-multi-concurrency'], + } else {} + ) + ( + if $._config.memcached_chunks_enabled then { + 'blocks-storage.bucket-store.chunks-cache.backend': 'memcached', + 'blocks-storage.bucket-store.chunks-cache.memcached.addresses': 'dnssrvnoa+memcached.%(namespace)s.svc.cluster.local:11211' % $._config, + 'blocks-storage.bucket-store.chunks-cache.memcached.timeout': '200ms', + 'blocks-storage.bucket-store.chunks-cache.memcached.max-item-size': $._config.memcached_chunks_max_item_size_mb * 1024 * 1024, + 'blocks-storage.bucket-store.chunks-cache.memcached.max-async-buffer-size': '25000', + 'blocks-storage.bucket-store.chunks-cache.memcached.max-async-concurrency': '50', + 'blocks-storage.bucket-store.chunks-cache.memcached.max-get-multi-batch-size': '100', + 'blocks-storage.bucket-store.chunks-cache.memcached.max-get-multi-concurrency': 100, + 'blocks-storage.bucket-store.chunks-cache.memcached.max-idle-connections': self['blocks-storage.bucket-store.chunks-cache.memcached.max-get-multi-concurrency'], + } else {} + ), + + blocks_metadata_caching_config:: if $._config.memcached_metadata_enabled then { + 'blocks-storage.bucket-store.metadata-cache.backend': 'memcached', + 'blocks-storage.bucket-store.metadata-cache.memcached.addresses': 'dnssrvnoa+memcached-metadata.%(namespace)s.svc.cluster.local:11211' % $._config, + 'blocks-storage.bucket-store.metadata-cache.memcached.timeout': '200ms', + 'blocks-storage.bucket-store.metadata-cache.memcached.max-item-size': $._config.memcached_metadata_max_item_size_mb * 1024 * 1024, + 'blocks-storage.bucket-store.metadata-cache.memcached.max-async-buffer-size': '25000', + 'blocks-storage.bucket-store.metadata-cache.memcached.max-async-concurrency': '50', + 'blocks-storage.bucket-store.metadata-cache.memcached.max-get-multi-batch-size': '100', + 'blocks-storage.bucket-store.metadata-cache.memcached.max-get-multi-concurrency': 100, + 'blocks-storage.bucket-store.metadata-cache.memcached.max-idle-connections': self['blocks-storage.bucket-store.metadata-cache.memcached.max-get-multi-concurrency'], + } else {}, + + bucket_index_config:: if $._config.cortex_bucket_index_enabled then { + 'blocks-storage.bucket-store.bucket-index.enabled': true, + + // Bucket index is updated by compactor on each cleanup cycle. 
+ 'blocks-storage.bucket-store.sync-interval': $._config.cortex_compactor_cleanup_interval, + } else {}, +} diff --git a/cortex/tsdb.libsonnet b/cortex/tsdb.libsonnet deleted file mode 100644 index 15902099..00000000 --- a/cortex/tsdb.libsonnet +++ /dev/null @@ -1,290 +0,0 @@ -{ - local pvc = $.core.v1.persistentVolumeClaim, - local volumeMount = $.core.v1.volumeMount, - local container = $.core.v1.container, - local statefulSet = $.apps.v1.statefulSet, - local service = $.core.v1.service, - - _config+:: { - // Enforce blocks storage - storage_backend: 'none', - storage_engine: 'blocks', - - // Allow to configure the ingester disk. - cortex_ingester_data_disk_size: '100Gi', - cortex_ingester_data_disk_class: 'fast', - - // Allow to configure the store-gateway disk. - cortex_store_gateway_data_disk_size: '50Gi', - cortex_store_gateway_data_disk_class: 'standard', - - // Allow to configure the compactor disk. - cortex_compactor_data_disk_size: '250Gi', - cortex_compactor_data_disk_class: 'standard', - - // Allow to fine tune compactor. - cortex_compactor_max_concurrency: 1, - // While this is the default value, we want to pass the same to the -blocks-storage.bucket-store.sync-interval - cortex_compactor_cleanup_interval: '15m', - - // Enable use of bucket index by querier, ruler and store-gateway. - // Bucket index is generated by compactor from Cortex 1.7, there is no flag required to enable this on compactor. - cortex_bucket_index_enabled: false, - }, - - blocks_chunks_caching_config:: - ( - if $._config.memcached_index_queries_enabled then { - 'blocks-storage.bucket-store.index-cache.backend': 'memcached', - 'blocks-storage.bucket-store.index-cache.memcached.addresses': 'dnssrvnoa+memcached-index-queries.%(namespace)s.svc.cluster.local:11211' % $._config, - 'blocks-storage.bucket-store.index-cache.memcached.timeout': '200ms', - 'blocks-storage.bucket-store.index-cache.memcached.max-item-size': $._config.memcached_index_queries_max_item_size_mb * 1024 * 1024, - 'blocks-storage.bucket-store.index-cache.memcached.max-async-buffer-size': '25000', - 'blocks-storage.bucket-store.index-cache.memcached.max-async-concurrency': '50', - 'blocks-storage.bucket-store.index-cache.memcached.max-get-multi-batch-size': '100', - } else {} - ) + ( - if $._config.memcached_chunks_enabled then { - 'blocks-storage.bucket-store.chunks-cache.backend': 'memcached', - 'blocks-storage.bucket-store.chunks-cache.memcached.addresses': 'dnssrvnoa+memcached.%(namespace)s.svc.cluster.local:11211' % $._config, - 'blocks-storage.bucket-store.chunks-cache.memcached.timeout': '200ms', - 'blocks-storage.bucket-store.chunks-cache.memcached.max-item-size': $._config.memcached_chunks_max_item_size_mb * 1024 * 1024, - 'blocks-storage.bucket-store.chunks-cache.memcached.max-async-buffer-size': '25000', - 'blocks-storage.bucket-store.chunks-cache.memcached.max-async-concurrency': '50', - 'blocks-storage.bucket-store.chunks-cache.memcached.max-get-multi-batch-size': '100', - } else {} - ), - - blocks_metadata_caching_config:: if $._config.memcached_metadata_enabled then { - 'blocks-storage.bucket-store.metadata-cache.backend': 'memcached', - 'blocks-storage.bucket-store.metadata-cache.memcached.addresses': 'dnssrvnoa+memcached-metadata.%(namespace)s.svc.cluster.local:11211' % $._config, - 'blocks-storage.bucket-store.metadata-cache.memcached.timeout': '200ms', - 'blocks-storage.bucket-store.metadata-cache.memcached.max-item-size': $._config.memcached_metadata_max_item_size_mb * 1024 * 1024, - 
'blocks-storage.bucket-store.metadata-cache.memcached.max-async-buffer-size': '25000', - 'blocks-storage.bucket-store.metadata-cache.memcached.max-async-concurrency': '50', - 'blocks-storage.bucket-store.metadata-cache.memcached.max-get-multi-batch-size': '100', - } else {}, - - bucket_index_config:: if $._config.cortex_bucket_index_enabled then { - 'blocks-storage.bucket-store.bucket-index.enabled': true, - - // Bucket index is updated by compactor on each cleanup cycle. - 'blocks-storage.bucket-store.sync-interval': $._config.cortex_compactor_cleanup_interval, - } else {}, - - querier_args+:: $._config.queryBlocksStorageConfig + $.blocks_metadata_caching_config + $.bucket_index_config, - ruler_args+:: $._config.queryBlocksStorageConfig + $.blocks_metadata_caching_config + $.bucket_index_config, - - // The ingesters should persist TSDB blocks and WAL on a persistent - // volume in order to be crash resilient. - local ingester_data_pvc = - pvc.new() + - pvc.mixin.spec.resources.withRequests({ storage: $._config.cortex_ingester_data_disk_size }) + - pvc.mixin.spec.withAccessModes(['ReadWriteOnce']) + - pvc.mixin.spec.withStorageClassName($._config.cortex_ingester_data_disk_class) + - pvc.mixin.metadata.withName('ingester-data'), - - ingester_deployment: {}, - - ingester_args+:: { - 'blocks-storage.tsdb.dir': '/data/tsdb', - 'blocks-storage.tsdb.block-ranges-period': '2h', - 'blocks-storage.tsdb.retention-period': '96h', // 4 days protection against blocks not being uploaded from ingesters. - 'blocks-storage.tsdb.ship-interval': '1m', - - // Disable TSDB blocks transfer because of persistent volumes - 'ingester.max-transfer-retries': 0, - 'ingester.join-after': '0s', - - // Persist ring tokens so that when the ingester will be restarted - // it will pick the same tokens - 'ingester.tokens-file-path': '/data/tokens', - }, - - newIngesterStatefulSet(name, container, with_anti_affinity=true):: - statefulSet.new(name, 3, [ - container + $.core.v1.container.withVolumeMountsMixin([ - volumeMount.new('ingester-data', '/data'), - ]), - ], ingester_data_pvc) + - statefulSet.mixin.spec.withServiceName(name) + - statefulSet.mixin.metadata.withNamespace($._config.namespace) + - statefulSet.mixin.metadata.withLabels({ name: name }) + - statefulSet.mixin.spec.template.metadata.withLabels({ name: name } + $.ingester_deployment_labels) + - statefulSet.mixin.spec.selector.withMatchLabels({ name: name }) + - statefulSet.mixin.spec.template.spec.securityContext.withRunAsUser(0) + - // When the ingester needs to flush blocks to the storage, it may take quite a lot of time. - // For this reason, we grant an high termination period (80 minutes). - statefulSet.mixin.spec.template.spec.withTerminationGracePeriodSeconds(1200) + - statefulSet.mixin.spec.updateStrategy.withType('RollingUpdate') + - $.util.configVolumeMount($._config.overrides_configmap, '/etc/cortex') + - $.util.podPriority('high') + - // Parallelly scale up/down ingester instances instead of starting them - // one by one. This does NOT affect rolling updates: they will continue to be - // rolled out one by one (the next pod will be rolled out once the previous is - // ready). 
- statefulSet.mixin.spec.withPodManagementPolicy('Parallel') + - (if with_anti_affinity then $.util.antiAffinity else {}), - - ingester_statefulset: self.newIngesterStatefulSet('ingester', $.ingester_container), - - ingester_service: - $.util.serviceFor($.ingester_statefulset, $.ingester_service_ignored_labels), - - // The compactor runs a statefulset with a single replica, because - // it does not support horizontal scalability yet. - local compactor_data_pvc = - pvc.new() + - pvc.mixin.spec.resources.withRequests({ storage: $._config.cortex_compactor_data_disk_size }) + - pvc.mixin.spec.withAccessModes(['ReadWriteOnce']) + - pvc.mixin.spec.withStorageClassName($._config.cortex_compactor_data_disk_class) + - pvc.mixin.metadata.withName('compactor-data'), - - compactor_args:: - $._config.grpcConfig + - $._config.storageConfig + - $._config.blocksStorageConfig + - $._config.compactorLimitsConfig + - { - target: 'compactor', - - // Compactor config. - 'compactor.block-ranges': '2h,12h,24h', - 'compactor.data-dir': '/data', - 'compactor.compaction-interval': '30m', - 'compactor.compaction-concurrency': $._config.cortex_compactor_max_concurrency, - 'compactor.cleanup-interval': $._config.cortex_compactor_cleanup_interval, - - // Enable sharding. - 'compactor.sharding-enabled': true, - 'compactor.ring.store': 'consul', - 'compactor.ring.consul.hostname': 'consul.%s.svc.cluster.local:8500' % $._config.namespace, - 'compactor.ring.prefix': '', - - // Limits config. - 'runtime-config.file': '/etc/cortex/overrides.yaml', - }, - - compactor_ports:: $.util.defaultPorts, - - compactor_container:: - container.new('compactor', $._images.compactor) + - container.withPorts($.compactor_ports) + - container.withArgsMixin($.util.mapToFlags($.compactor_args)) + - container.withVolumeMountsMixin([volumeMount.new('compactor-data', '/data')]) + - // Do not limit compactor CPU and request enough cores to honor configured max concurrency. - $.util.resourcesRequests($._config.cortex_compactor_max_concurrency, '6Gi') + - $.util.resourcesLimits(null, '6Gi') + - $.util.readinessProbe + - $.jaeger_mixin, - - newCompactorStatefulSet(name, container):: - statefulSet.new(name, 1, [container], compactor_data_pvc) + - statefulSet.mixin.spec.withServiceName(name) + - statefulSet.mixin.metadata.withNamespace($._config.namespace) + - statefulSet.mixin.metadata.withLabels({ name: name }) + - statefulSet.mixin.spec.template.metadata.withLabels({ name: name }) + - statefulSet.mixin.spec.selector.withMatchLabels({ name: name }) + - statefulSet.mixin.spec.template.spec.securityContext.withRunAsUser(0) + - statefulSet.mixin.spec.updateStrategy.withType('RollingUpdate') + - statefulSet.mixin.spec.template.spec.withTerminationGracePeriodSeconds(900) + - // Parallelly scale up/down compactor instances instead of starting them - // one by one. This does NOT affect rolling updates: they will continue to be - // rolled out one by one (the next pod will be rolled out once the previous is - // ready). - statefulSet.mixin.spec.withPodManagementPolicy('Parallel') + - $.util.configVolumeMount($._config.overrides_configmap, '/etc/cortex'), - - compactor_statefulset: - $.newCompactorStatefulSet('compactor', $.compactor_container), - - // The store-gateway runs a statefulset. 
- local store_gateway_data_pvc = - pvc.new() + - pvc.mixin.spec.resources.withRequests({ storage: $._config.cortex_store_gateway_data_disk_size }) + - pvc.mixin.spec.withAccessModes(['ReadWriteOnce']) + - pvc.mixin.spec.withStorageClassName($._config.cortex_store_gateway_data_disk_class) + - pvc.mixin.metadata.withName('store-gateway-data'), - - store_gateway_args:: - $._config.grpcConfig + - $._config.storageConfig + - $._config.blocksStorageConfig + - $._config.queryBlocksStorageConfig + - { - target: 'store-gateway', - 'runtime-config.file': '/etc/cortex/overrides.yaml', - - // Persist ring tokens so that when the store-gateway will be restarted - // it will pick the same tokens - 'store-gateway.sharding-ring.tokens-file-path': '/data/tokens', - - // Block index-headers are pre-downloaded but lazy mmaped and loaded at query time. - 'blocks-storage.bucket-store.index-header-lazy-loading-enabled': 'true', - 'blocks-storage.bucket-store.index-header-lazy-loading-idle-timeout': '60m', - - 'blocks-storage.bucket-store.max-chunk-pool-bytes': 12 * 1024 * 1024 * 1024, - - // We should keep a number of idle connections equal to the max "get" concurrency, - // in order to avoid re-opening connections continuously (this would be slower - // and fill up the conntrack table too). - // - // The downside of this approach is that we'll end up with an higher number of - // active connections to memcached, so we have to make sure connections limit - // set in memcached is high enough. - 'blocks-storage.bucket-store.index-cache.memcached.max-get-multi-concurrency': 100, - 'blocks-storage.bucket-store.chunks-cache.memcached.max-get-multi-concurrency': 100, - 'blocks-storage.bucket-store.metadata-cache.memcached.max-get-multi-concurrency': 100, - 'blocks-storage.bucket-store.index-cache.memcached.max-idle-connections': $.store_gateway_args['blocks-storage.bucket-store.index-cache.memcached.max-get-multi-concurrency'], - 'blocks-storage.bucket-store.chunks-cache.memcached.max-idle-connections': $.store_gateway_args['blocks-storage.bucket-store.chunks-cache.memcached.max-get-multi-concurrency'], - 'blocks-storage.bucket-store.metadata-cache.memcached.max-idle-connections': $.store_gateway_args['blocks-storage.bucket-store.metadata-cache.memcached.max-get-multi-concurrency'], - } + - $.blocks_chunks_caching_config + - $.blocks_metadata_caching_config + - $.bucket_index_config, - - store_gateway_ports:: $.util.defaultPorts, - - store_gateway_container:: - container.new('store-gateway', $._images.store_gateway) + - container.withPorts($.store_gateway_ports) + - container.withArgsMixin($.util.mapToFlags($.store_gateway_args)) + - container.withVolumeMountsMixin([volumeMount.new('store-gateway-data', '/data')]) + - $.util.resourcesRequests('1', '12Gi') + - $.util.resourcesLimits(null, '18Gi') + - $.util.readinessProbe + - $.jaeger_mixin, - - newStoreGatewayStatefulSet(name, container):: - statefulSet.new(name, 3, [container], store_gateway_data_pvc) + - statefulSet.mixin.spec.withServiceName(name) + - statefulSet.mixin.metadata.withNamespace($._config.namespace) + - statefulSet.mixin.metadata.withLabels({ name: name }) + - statefulSet.mixin.spec.template.metadata.withLabels({ name: name }) + - statefulSet.mixin.spec.selector.withMatchLabels({ name: name }) + - statefulSet.mixin.spec.template.spec.securityContext.withRunAsUser(0) + - statefulSet.mixin.spec.updateStrategy.withType('RollingUpdate') + - statefulSet.mixin.spec.template.spec.withTerminationGracePeriodSeconds(120) + - // Parallelly scale up/down 
store-gateway instances instead of starting them - one by one. This does NOT affect rolling updates: they will continue to be - rolled out one by one (the next pod will be rolled out once the previous is - ready). - statefulSet.mixin.spec.withPodManagementPolicy('Parallel') + - $.util.configVolumeMount($._config.overrides_configmap, '/etc/cortex'), - - store_gateway_statefulset: self.newStoreGatewayStatefulSet('store-gateway', $.store_gateway_container), - - store_gateway_service: - $.util.serviceFor($.store_gateway_statefulset), - - local podDisruptionBudget = $.policy.v1beta1.podDisruptionBudget, - - store_gateway_pdb: - podDisruptionBudget.new() + - podDisruptionBudget.mixin.metadata.withName('store-gateway-pdb') + - podDisruptionBudget.mixin.metadata.withLabels({ name: 'store-gateway-pdb' }) + - podDisruptionBudget.mixin.spec.selector.withMatchLabels({ name: 'store-gateway' }) + - // To avoid any disruption in the read path we need at least 1 replica of each - // block available, so the disruption budget depends on the blocks replication factor. - podDisruptionBudget.mixin.spec.withMaxUnavailable(if $._config.store_gateway_replication_factor > 1 then $._config.store_gateway_replication_factor - 1 else 1), -} diff --git a/scripts/test-readme.sh b/scripts/test-readme.sh new file mode 100755 index 00000000..55120b06 --- /dev/null +++ b/scripts/test-readme.sh @@ -0,0 +1,11 @@ +#!/bin/sh +set -xe +rm -rf $1 +mkdir -p $1 +cd $1 +tk init --k8s=1.30 +jb install github.com/cortexproject/cortex-jsonnet/cortex@main +rm -fr ./vendor/cortex +cp -r ../../cortex ./vendor/ +cp vendor/cortex/$(basename $1)/main.jsonnet.example environments/default/main.jsonnet +PAGER=cat tk show --dangerous-allow-redirect environments/default
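For context, this script is what the test-readme make targets exercised in CI drive: the basename of the directory argument selects which main.jsonnet.example gets rendered. A hypothetical manual invocation, assuming tk and jb are available on PATH as in the build image:

  ./scripts/test-readme.sh test-readme/s3

This scaffolds a fresh tanka environment under test-readme/s3, vendors the local cortex/ tree over the jb-installed copy, and renders the s3 example.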
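Several components in this diff also gain an empty *_env_map hook that is wired into the container through container.withEnvMap. A minimal sketch of a downstream override, assuming the mixin is vendored as in the s3 example above; the GOGC value is purely illustrative, not a recommendation:

  local cortex = import 'cortex/cortex.libsonnet';

  cortex {
    // Keys/values here are rendered as environment variables
    // on the ruler container via container.withEnvMap.
    ruler_env_map+:: {
      GOGC: '50',  // illustrative value only
    },
  }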
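Likewise, the knobs centralised in cortex/tsdb-config.libsonnet are meant to be tuned through _config rather than by editing the library. A sketch under the defaults above; the disk size is an arbitrary example:

  cortex {
    _config+:: {
      // Example only: give the compactor a larger disk.
      cortex_compactor_data_disk_size: '1Ti',

      // This value is also passed to -blocks-storage.bucket-store.sync-interval
      // through bucket_index_config, so the two stay in sync.
      cortex_compactor_cleanup_interval: '15m',
    },
  }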
