From 86ff71d1fa137eef6fbed4c78c2bb4be478e1f16 Mon Sep 17 00:00:00 2001 From: Mathias Fredriksson Date: Mon, 25 Sep 2023 12:16:16 +0000 Subject: [PATCH 1/7] feat(scaletest): add grafana annotations --- scaletest/templates/scaletest-runner/main.tf | 24 +++- .../templates/scaletest-runner/scripts/lib.sh | 130 +++++++++++++++++- .../scaletest-runner/scripts/prepare.sh | 7 - .../templates/scaletest-runner/shutdown.sh | 4 + .../templates/scaletest-runner/startup.sh | 18 ++- 5 files changed, 168 insertions(+), 15 deletions(-) diff --git a/scaletest/templates/scaletest-runner/main.tf b/scaletest/templates/scaletest-runner/main.tf index 4802c9887793d..b4e80749f0e1a 100644 --- a/scaletest/templates/scaletest-runner/main.tf +++ b/scaletest/templates/scaletest-runner/main.tf @@ -43,6 +43,9 @@ locals { home_disk_size = 10 scaletest_run_id = "scaletest-${time_static.start_time.rfc3339}" scaletest_run_dir = "/home/coder/${local.scaletest_run_id}" + grafana_url = "https://stats.dev.c8s.io" + grafana_dashboard_uid = "qLVSTR-Vz" + grafana_dashboard_name = "coderv2-loadtest-dashboard" } data "coder_provisioner" "me" { @@ -237,6 +240,9 @@ resource "coder_agent" "main" { SCALETEST_CREATE_CONCURRENCY : "${data.coder_parameter.create_concurrency.value}", SCALETEST_CLEANUP_STRATEGY : data.coder_parameter.cleanup_strategy.value, + GRAFANA_URL : local.grafana_url, + # GRAFANA_DASHBOARD_UID : local.grafana_dashboard_uid, + SCRIPTS_ZIP : filebase64(data.archive_file.scripts_zip.output_path), SCRIPTS_DIR : "/tmp/scripts", } @@ -332,7 +338,7 @@ resource "coder_app" "grafana" { agent_id = coder_agent.main.id slug = "00-grafana" display_name = "Grafana" - url = "https://stats.dev.c8s.io/d/qLVSTR-Vz/coderv2-loadtest-dashboard?orgId=1&from=${time_static.start_time.unix * 1000}&to=now" + url = "${local.grafana_url}/d/${local.grafana_dashboard_uid}/${local.grafana_dashboard_name}?orgId=1&from=${time_static.start_time.unix * 1000}&to=now" icon = "https://grafana.com/static/assets/img/fav32.png" 
external = true } @@ -440,6 +446,15 @@ resource "kubernetes_pod" "main" { name = "CODER_AGENT_LOG_DIR" value = "${local.scaletest_run_dir}/logs" } + env { + name = "GRAFANA_API_TOKEN" + value_from { + secret_key_ref { + name = data.kubernetes_secret.grafana_editor_api_token.metadata[0].name + key = "token" + } + } + } resources { # Set requests and limits values such that we can do performant # execution of `coder scaletest` commands. @@ -505,6 +520,13 @@ resource "kubernetes_pod" "main" { } } +data "kubernetes_secret" "grafana_editor_api_token" { + metadata { + name = "grafana-editor-api-token" + namespace = data.coder_parameter.namespace.value + } +} + resource "kubernetes_manifest" "pod_monitor" { count = data.coder_workspace.me.start_count manifest = { diff --git a/scaletest/templates/scaletest-runner/scripts/lib.sh b/scaletest/templates/scaletest-runner/scripts/lib.sh index d392d09681f0a..0982eb01429ef 100644 --- a/scaletest/templates/scaletest-runner/scripts/lib.sh +++ b/scaletest/templates/scaletest-runner/scripts/lib.sh @@ -33,7 +33,13 @@ set_status() { if [[ ${DRY_RUN} == 1 ]]; then dry_run=" (dry-ryn)" fi + prev_status=$(get_status) + if [[ ${prev_status} != *"Not started"* ]]; then + annotate_grafana_end "status" "Status: ${prev_status}" + fi echo "$(date -Ins) ${*}${dry_run}" >>"${SCALETEST_STATE_DIR}/status" + + annotate_grafana "status" "Status: ${*}" } lock_status() { chmod 0440 "${SCALETEST_STATE_DIR}/status" @@ -51,25 +57,29 @@ phase_num=0 start_phase() { # This may be incremented from another script, so we read it every time. 
if [[ -f "${SCALETEST_PHASE_FILE}" ]]; then - phase_num="$(grep -c START: "${SCALETEST_PHASE_FILE}")" + phase_num=$(grep -c START: "${SCALETEST_PHASE_FILE}") fi phase_num=$((phase_num + 1)) log "Start phase ${phase_num}: ${*}" echo "$(date -Ins) START:${phase_num}: ${*}" >>"${SCALETEST_PHASE_FILE}" + + GRAFANA_EXTRA_TAGS="${PHASE_TYPE:-phase-default}" annotate_grafana "phase" "Phase ${phase_num}: ${*}" } end_phase() { - phase="$(tail -n 1 "${SCALETEST_PHASE_FILE}" | grep "START:${phase_num}:" | cut -d' ' -f3-)" + phase=$(tail -n 1 "${SCALETEST_PHASE_FILE}" | grep "START:${phase_num}:" | cut -d' ' -f3-) if [[ -z ${phase} ]]; then log "BUG: Could not find start phase ${phase_num} in ${SCALETEST_PHASE_FILE}" exit 1 fi log "End phase ${phase_num}: ${phase}" echo "$(date -Ins) END:${phase_num}: ${phase}" >>"${SCALETEST_PHASE_FILE}" + + GRAFANA_EXTRA_TAGS="${PHASE_TYPE:-phase-default}" annotate_grafana_end "phase" "Phase ${phase_num}: ${phase}" } get_phase() { if [[ -f "${SCALETEST_PHASE_FILE}" ]]; then - phase_raw="$(tail -n1 "${SCALETEST_PHASE_FILE}")" - phase="$(echo "${phase_raw}" | cut -d' ' -f3-)" + phase_raw=$(tail -n1 "${SCALETEST_PHASE_FILE}") + phase=$(echo "${phase_raw}" | cut -d' ' -f3-) if [[ ${phase_raw} == *"END:"* ]]; then phase+=" [done]" fi @@ -86,9 +96,117 @@ get_previous_phase() { fi } +annotate_grafana() { + local tags=${1} text=${2} start=${3:-$(($(date +%s) * 1000))} + local json resp id + + if [[ -z $tags ]]; then + tags="scaletest,runner" + else + tags="scaletest,runner,${tags}" + fi + if [[ -n ${GRAFANA_EXTRA_TAGS:-} ]]; then + tags="${tags},${GRAFANA_EXTRA_TAGS}" + fi + + log "Annotating Grafana (start=${start}): ${text} [${tags}]" + + json="$( + jq \ + --argjson time "${start}" \ + --arg text "${text}" \ + --arg tags "${tags}" \ + '{time: $time, tags: $tags | split(","), text: $text}' <<<'{}' + )" + if [[ ${DRY_RUN} == 1 ]]; then + log "Would have annotated Grafana, data=${json}" + return 0 + fi + if ! 
resp="$( + curl -sSL \ + --insecure \ + -H "Authorization: Bearer ${GRAFANA_API_TOKEN}" \ + -H "Content-Type: application/json" \ + -d "${json}" \ + "${GRAFANA_URL}/api/annotations" + )"; then + # Don't abort scaletest just because we couldn't annotate Grafana. + log "Failed to annotate Grafana: ${resp}" + return 0 + fi + + if [[ $(jq -r '.message' <<<"${resp}") != "Annotation added" ]]; then + log "Failed to annotate Grafana: ${resp}" + return 0 + fi + + log "Grafana annotation added!" + + if [[ ! -f "${SCALETEST_STATE_DIR}" ]]; then + mkdir -p "${SCALETEST_STATE_DIR}" + fi + id="$(jq -r '.id' <<<"${resp}")" + echo "${id}:${tags}:${text}:${start}" >>"${SCALETEST_STATE_DIR}/grafana-annotations" +} +annotate_grafana_end() { + local tags=${1} text=${2} start=${3:-} end=${4:-$(($(date +%s) * 1000))} + local id json resp + + if [[ -z $tags ]]; then + tags="scaletest,runner" + else + tags="scaletest,runner,${tags}" + fi + if [[ -n ${GRAFANA_EXTRA_TAGS:-} ]]; then + tags="${tags},${GRAFANA_EXTRA_TAGS}" + fi + + if [[ ${DRY_RUN} == 1 ]]; then + log "Would have updated Grafana annotation (end=${end}): ${text} [${tags}]" + return 0 + fi + + if ! id=$(grep ":${tags}:${text}:${start}" "${SCALETEST_STATE_DIR}/grafana-annotations" | sort -n | tail -n1 | cut -d: -f1); then + log "NOTICE: Could not find Grafana annotation to end: '${tags}:${text}:${start}', skipping..." + return 0 + fi + + log "Annotating Grafana (end=${end}): ${text} [${tags}]" + + json="$( + jq \ + --argjson timeEnd "${end}" \ + '{timeEnd: $timeEnd}' <<<'{}' + )" + if [[ ${DRY_RUN} == 1 ]]; then + log "Would have patched Grafana annotation: id=${id}, data=${json}" + return 0 + fi + if ! resp="$( + curl -sSL \ + --insecure \ + -H "Authorization: Bearer ${GRAFANA_API_TOKEN}" \ + -H "Content-Type: application/json" \ + -X PATCH \ + -d "${json}" \ + "${GRAFANA_URL}/api/annotations/${id}" + )"; then + # Don't abort scaletest just because we couldn't annotate Grafana. 
+ log "Failed to annotate Grafana end: ${resp}" + return 0 + fi + + if [[ $(jq -r '.message' <<<"${resp}") != "Annotation patched" ]]; then + log "Failed to annotate Grafana end: ${resp}" + return 0 + fi + + log "Grafana annotation patched!" +} + wait_baseline() { s=${1:-2} - start_phase "Waiting ${s}m to establish baseline" + PHASE_TYPE="phase-wait" start_phase "Waiting ${s}m to establish baseline" maybedryrun "$DRY_RUN" sleep $((s * 60)) - end_phase + PHASE_TYPE="phase-wait" end_phase } diff --git a/scaletest/templates/scaletest-runner/scripts/prepare.sh b/scaletest/templates/scaletest-runner/scripts/prepare.sh index f6fbcb7dd3227..2c20ace6f9cd6 100755 --- a/scaletest/templates/scaletest-runner/scripts/prepare.sh +++ b/scaletest/templates/scaletest-runner/scripts/prepare.sh @@ -28,13 +28,6 @@ for dir in "${HOME}/scaletest-"*; do fi done -log "Cloning coder/coder repo..." - -if [[ ! -d "${HOME}/coder" ]]; then - git clone https://github.com/coder/coder.git "${HOME}/coder" -fi -(cd "${HOME}/coder" && git pull) - log "Creating coder CLI token (needed for cleanup during shutdown)..." mkdir -p "${CODER_CONFIG_DIR}" diff --git a/scaletest/templates/scaletest-runner/shutdown.sh b/scaletest/templates/scaletest-runner/shutdown.sh index fe621afe4c6c4..14d6023aaaa62 100755 --- a/scaletest/templates/scaletest-runner/shutdown.sh +++ b/scaletest/templates/scaletest-runner/shutdown.sh @@ -11,4 +11,8 @@ cleanup() { } trap cleanup EXIT +annotate_grafana "workspace" "Agent stopping..." + "${SCRIPTS_DIR}/cleanup.sh" shutdown + +annotate_grafana_end "workspace" "Agent running" diff --git a/scaletest/templates/scaletest-runner/startup.sh b/scaletest/templates/scaletest-runner/startup.sh index 0d7c8fb144324..57151fcf2e1a4 100755 --- a/scaletest/templates/scaletest-runner/startup.sh +++ b/scaletest/templates/scaletest-runner/startup.sh @@ -12,9 +12,17 @@ mkdir -p "${SCRIPTS_DIR}" unzip -o /tmp/scripts.zip -d "${SCRIPTS_DIR}" rm /tmp/scripts.zip +echo "Cloning coder/coder repo..." 
+if [[ ! -d "${HOME}/coder" ]]; then + git clone https://github.com/coder/coder.git "${HOME}/coder" +fi +(cd "${HOME}/coder" && git pull) + # shellcheck disable=SC2153 source=scaletest/templates/scaletest-runner/scripts/lib.sh . "${SCRIPTS_DIR}/lib.sh" +annotate_grafana "workspace" "Agent running" # Ended in shutdown.sh. + # Show failure in the UI if script exits with error. failed_status=Failed on_exit() { @@ -38,15 +46,23 @@ on_exit() { "${SCRIPTS_DIR}/cleanup.sh" "${SCALETEST_CLEANUP_STRATEGY}" ;; esac + + annotate_grafana_end "" "Start scaletest" } trap on_exit EXIT on_err() { + code=${?} + trap - ERR + log "Scaletest failed!" - set_status "${failed_status}" + GRAFANA_EXTRA_TAGS=error set_status "${failed_status} (exit=${code})" lock_status # Ensure we never rewrite the status after a failure. } trap on_err ERR +annotate_grafana "" "Start scaletest" + "${SCRIPTS_DIR}/prepare.sh" + "${SCRIPTS_DIR}/run.sh" From 2deffab519ade1c7d09012431e9e661bbe6b64d5 Mon Sep 17 00:00:00 2001 From: Mathias Fredriksson Date: Mon, 25 Sep 2023 12:16:16 +0000 Subject: [PATCH 2/7] feat(scaletest): add slack reporting --- scaletest/templates/scaletest-runner/main.tf | 126 +++++++++++++++--- .../scaletest-runner/scripts/cleanup.sh | 2 +- .../scaletest-runner/scripts/report.sh | 103 ++++++++++++++ .../templates/scaletest-runner/scripts/run.sh | 79 ++++++----- .../templates/scaletest-runner/startup.sh | 14 +- 5 files changed, 267 insertions(+), 57 deletions(-) create mode 100755 scaletest/templates/scaletest-runner/scripts/report.sh diff --git a/scaletest/templates/scaletest-runner/main.tf b/scaletest/templates/scaletest-runner/main.tf index b4e80749f0e1a..a1b3b52735770 100644 --- a/scaletest/templates/scaletest-runner/main.tf +++ b/scaletest/templates/scaletest-runner/main.tf @@ -38,8 +38,8 @@ locals { workspace_pod_name = "coder-scaletest-runner-${lower(data.coder_workspace.me.owner)}-${lower(data.coder_workspace.me.name)}" workspace_pod_instance = 
"coder-workspace-${lower(data.coder_workspace.me.owner)}-${lower(data.coder_workspace.me.name)}" service_account_name = "scaletest-sa" - cpu = 2 - memory = 2 + cpu = 16 + memory = 64 home_disk_size = 10 scaletest_run_id = "scaletest-${time_static.start_time.rfc3339}" scaletest_run_dir = "/home/coder/${local.scaletest_run_id}" @@ -94,15 +94,14 @@ data "coder_parameter" "job_concurrency" { order = 11 type = "number" name = "Job concurrency" - default = 10 + default = 0 description = "The number of concurrent jobs (e.g. when producing workspace traffic)." mutable = true # Setting zero = unlimited, but perhaps not a good idea, # we can raise this limit instead. validation { - min = 1 - max = 100 + min = 0 } } @@ -200,6 +199,73 @@ data "coder_parameter" "num_workspaces" { } } + +data "coder_parameter" "load_scenarios" { + order = 22 + name = "Load Scenarios" + type = "list(string)" + description = "The load scenarios to run." + mutable = true + ephemeral = true + default = jsonencode([ + "SSH Traffic", + "Web Terminal Traffic", + "Dashboard Traffic", + ]) +} + +data "coder_parameter" "load_scenario_ssh_traffic_duration" { + order = 23 + name = "SSH Traffic Duration" + type = "number" + description = "The duration of the SSH traffic load scenario in minutes." + mutable = true + default = 30 + validation { + min = 1 + max = 1440 // 24 hours. + } +} + +data "coder_parameter" "load_scenario_web_terminal_traffic_duration" { + order = 24 + name = "Web Terminal Traffic Duration" + type = "number" + description = "The duration of the web terminal traffic load scenario in minutes." + mutable = true + default = 30 + validation { + min = 1 + max = 1440 // 24 hours. + } +} + +data "coder_parameter" "load_scenario_dashboard_traffic_duration" { + order = 25 + name = "Dashboard Traffic Duration" + type = "number" + description = "The duration of the dashboard traffic load scenario in minutes." + mutable = true + default = 30 + validation { + min = 1 + max = 1440 // 24 hours. 
+ } +} + +data "coder_parameter" "load_scenario_baseline_duration" { + order = 26 + name = "Baseline Wait Duration" + type = "number" + description = "The duration to wait before starting a load scenario in minutes." + mutable = true + default = 5 + validation { + min = 0 + max = 60 + } +} + data "coder_parameter" "namespace" { order = 999 type = "string" @@ -224,6 +290,8 @@ resource "coder_agent" "main" { CODER_CONFIG_DIR : "/home/coder/.config/coderv2", CODER_USER_TOKEN : data.coder_workspace.me.owner_session_token, CODER_URL : data.coder_workspace.me.access_url, + CODER_USER : data.coder_workspace.me.owner, + CODER_WORKSPACE : data.coder_workspace.me.name, # Global scaletest envs that may affect each `coder exp scaletest` invocation. CODER_SCALETEST_PROMETHEUS_ADDRESS : "0.0.0.0:21112", @@ -231,14 +299,23 @@ resource "coder_agent" "main" { CODER_SCALETEST_CONCURRENCY : "${data.coder_parameter.job_concurrency.value}", CODER_SCALETEST_CLEANUP_CONCURRENCY : "${data.coder_parameter.cleanup_concurrency.value}", + # Expose as params as well, for reporting (TODO(mafredri): refactor, only have one). + SCALETEST_PARAM_SCALETEST_CONCURRENCY : "${data.coder_parameter.job_concurrency.value}", + SCALETEST_PARAM_SCALETEST_CLEANUP_CONCURRENCY : "${data.coder_parameter.cleanup_concurrency.value}", + # Local envs passed as arguments to `coder exp scaletest` invocations. 
SCALETEST_RUN_ID : local.scaletest_run_id, SCALETEST_RUN_DIR : local.scaletest_run_dir, - SCALETEST_TEMPLATE : data.coder_parameter.workspace_template.value, - SCALETEST_SKIP_CLEANUP : "1", - SCALETEST_NUM_WORKSPACES : data.coder_parameter.num_workspaces.value, - SCALETEST_CREATE_CONCURRENCY : "${data.coder_parameter.create_concurrency.value}", - SCALETEST_CLEANUP_STRATEGY : data.coder_parameter.cleanup_strategy.value, + + SCALETEST_PARAM_TEMPLATE : data.coder_parameter.workspace_template.value, + SCALETEST_PARAM_NUM_WORKSPACES : data.coder_parameter.num_workspaces.value, + SCALETEST_PARAM_CREATE_CONCURRENCY : "${data.coder_parameter.create_concurrency.value}", + SCALETEST_PARAM_CLEANUP_STRATEGY : data.coder_parameter.cleanup_strategy.value, + SCALETEST_PARAM_LOAD_SCENARIOS : data.coder_parameter.load_scenarios.value, + SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_DURATION : "${data.coder_parameter.load_scenario_ssh_traffic_duration.value}", + SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_DURATION : "${data.coder_parameter.load_scenario_web_terminal_traffic_duration.value}", + SCALETEST_PARAM_LOAD_SCENARIO_DASHBOARD_TRAFFIC_DURATION : "${data.coder_parameter.load_scenario_dashboard_traffic_duration.value}", + SCALETEST_PARAM_LOAD_SCENARIO_BASELINE_DURATION : "${data.coder_parameter.load_scenario_baseline_duration.value}", GRAFANA_URL : local.grafana_url, # GRAFANA_DASHBOARD_UID : local.grafana_dashboard_uid, @@ -250,12 +327,13 @@ resource "coder_agent" "main" { vscode = false ssh_helper = false } - startup_script_timeout = 3600 - shutdown_script_timeout = 1800 + startup_script_timeout = 86400 + shutdown_script_timeout = 7200 startup_script_behavior = "blocking" startup_script = file("startup.sh") shutdown_script = file("shutdown.sh") + # IDEA(mafredri): It would be pretty cool to define metadata to expect JSON output, each field/item could become a separate metadata item. # Scaletest metadata. 
metadata { display_name = "Scaletest status" @@ -415,7 +493,7 @@ resource "kubernetes_pod" "main" { } # Set the pod delete timeout to termination_grace_period_seconds + 1m. timeouts { - delete = "32m" + delete = "122m" } spec { security_context { @@ -427,8 +505,8 @@ resource "kubernetes_pod" "main" { service_account_name = local.service_account_name # Allow the coder agent to perform graceful shutdown and cleanup of - # scaletest resources, 30 minutes (cleanup timeout) + 1 minute. - termination_grace_period_seconds = 1860 + # scaletest resources, 2 hours (cleanup timeout) + 1 minute. + termination_grace_period_seconds = 7260 container { name = "dev" @@ -455,6 +533,15 @@ resource "kubernetes_pod" "main" { } } } + env { + name = "SLACK_WEBHOOK_URL" + value_from { + secret_key_ref { + name = data.kubernetes_secret.slack_scaletest_notifications_webhook_url.metadata[0].name + key = "url" + } + } + } resources { # Set requests and limits values such that we can do performant # execution of `coder scaletest` commands. @@ -511,7 +598,7 @@ resource "kubernetes_pod" "main" { match_expressions { key = "cloud.google.com/gke-nodepool" operator = "In" - values = ["big-misc"] # Avoid placing on the same nodes as scaletest workspaces. + values = ["big-workspacetraffic"] # Avoid placing on the same nodes as scaletest workspaces. 
} } } @@ -527,6 +614,13 @@ data "kubernetes_secret" "grafana_editor_api_token" { } } +data "kubernetes_secret" "slack_scaletest_notifications_webhook_url" { + metadata { + name = "slack-scaletest-notifications-webhook-url" + namespace = data.coder_parameter.namespace.value + } +} + resource "kubernetes_manifest" "pod_monitor" { count = data.coder_workspace.me.start_count manifest = { diff --git a/scaletest/templates/scaletest-runner/scripts/cleanup.sh b/scaletest/templates/scaletest-runner/scripts/cleanup.sh index a6d29211a080b..9d2c23463249e 100755 --- a/scaletest/templates/scaletest-runner/scripts/cleanup.sh +++ b/scaletest/templates/scaletest-runner/scripts/cleanup.sh @@ -24,7 +24,7 @@ fi start_phase "Cleanup (${event})" coder exp scaletest cleanup \ --cleanup-job-timeout 15m \ - --cleanup-timeout 30m | + --cleanup-timeout 2h | tee "${SCALETEST_RESULTS_DIR}/cleanup-${event}.txt" end_phase diff --git a/scaletest/templates/scaletest-runner/scripts/report.sh b/scaletest/templates/scaletest-runner/scripts/report.sh new file mode 100755 index 0000000000000..bb7cc393ef8ba --- /dev/null +++ b/scaletest/templates/scaletest-runner/scripts/report.sh @@ -0,0 +1,103 @@ +#!/bin/bash +set -euo pipefail + +[[ $VERBOSE == 1 ]] && set -x + +status=$1 +shift + +case "${status}" in +started) ;; +completed) ;; +failed) ;; +*) + echo "Unknown status: ${status}" >&2 + exit 1 + ;; +esac + +# shellcheck disable=SC2153 source=scaletest/templates/scaletest-runner/scripts/lib.sh +. "${SCRIPTS_DIR}/lib.sh" + +# NOTE(mafredri): API returns HTML if we accidentally use `...//api` vs `.../api`. +CODER_URL="${CODER_URL%/}" +buildinfo="$(curl -sSL "${CODER_URL}/api/v2/buildinfo")" +server_version="$(jq -r '.version' <<<"${buildinfo}")" +server_version_commit="$(jq -r '.external_url' <<<"${buildinfo}")" + +# Since `coder show` doesn't support JSON output, we list the workspaces instead. 
+workspace_json="$(DRYRUN=0 coder list --all --output json | jq --arg workspace "${CODER_WORKSPACE}" --arg user "${CODER_USER}" 'map(select(.name == $workspace) | select(.owner_name == $user)) | .[0]')" +owner_name="$(jq -r '.latest_build.workspace_owner_name' <<<"${workspace_json}")" +workspace_name="$(jq -r '.latest_build.workspace_name' <<<"${workspace_json}")" +initiator_name="$(jq -r '.latest_build.initiator_name' <<<"${workspace_json}")" + +bullet='•' +app_urls_raw="$(jq -r '.latest_build.resources[].agents[]?.apps | map(select(.external == true)) | .[] | .display_name, .url' <<<"${workspace_json}")" +app_urls=() +while read -r app_name; do + read -r app_url + bold= + if [[ ${status} != started ]] && [[ ${app_url} = *to=now* ]]; then + # Update Grafana URL with end stamp and make bold. + app_url="${app_url//to=now/to=$(($(date +%s) * 1000))}" + bold='**' + fi + app_urls+=("${bullet} ${bold}${app_name}: ${app_url}${bold}") +done <<<"${app_urls_raw}" + +params=() +header= + +case "${status}" in +started) + created_at="$(jq -r '.latest_build.created_at' <<<"${workspace_json}")" + params=("${bullet} Options:") + while read -r param; do + params+=(" ${bullet} ${param}") + done <<<"$(jq -r '.latest_build.resources[].agents[]?.environment_variables | to_entries | map(select(.key | startswith("SCALETEST_PARAM_"))) | .[] | "`\(.key)`: `\(.value)`"' <<<"${workspace_json}")" + + header="New scaletest started at \`${created_at}\` by \`${initiator_name}\` on ${CODER_URL} (<${server_version_commit}|\`${server_version}\`>)." + ;; +completed) + completed_at=$(date -Iseconds) + header="Scaletest completed at \`${completed_at}\` (started by \`${initiator_name}\`) on ${CODER_URL} (<${server_version_commit}|\`${server_version}\`>)." + ;; +failed) + failed_at=$(date -Iseconds) + header="Scaletest failed at \`${failed_at}\` (started by \`${initiator_name}\`) on ${CODER_URL} (<${server_version_commit}|\`${server_version}\`>)." 
+ ;; +*) + echo "Unknown status: ${status}" >&2 + exit 1 + ;; +esac + +text_arr=( + "${header}" + "" + "${bullet} Workspace (runner): ${CODER_URL}@${owner_name}/${workspace_name}" + "${bullet} Run ID: ${SCALETEST_RUN_ID}" + "${app_urls[@]}" + "${params[@]}" +) + +text= +for field in "${text_arr[@]}"; do + text+="${field}"$'\n' +done + +json=$( + jq -n --arg text "${text}" '{ + blocks: [ + { + "type": "section", + "text": { + "type": "mrkdwn", + "text": $text + } + } + ] + }' +) + +maybedryrun "${DRY_RUN}" curl -X POST -H 'Content-type: application/json' --data "${json}" "${SLACK_WEBHOOK_URL}" diff --git a/scaletest/templates/scaletest-runner/scripts/run.sh b/scaletest/templates/scaletest-runner/scripts/run.sh index 7ebf8c4310593..294cc956cc4ab 100755 --- a/scaletest/templates/scaletest-runner/scripts/run.sh +++ b/scaletest/templates/scaletest-runner/scripts/run.sh @@ -6,54 +6,61 @@ set -euo pipefail # shellcheck disable=SC2153 source=scaletest/templates/scaletest-runner/scripts/lib.sh . "${SCRIPTS_DIR}/lib.sh" +mapfile -t scaletest_load_scenarios < <(jq -r '. | join ("\n")' <<<"${SCALETEST_PARAM_LOAD_SCENARIOS}") +export SCALETEST_PARAM_LOAD_SCENARIOS=("${scaletest_load_scenarios[@]}") + log "Running scaletest..." 
set_status Running start_phase "Creating workspaces" coder exp scaletest create-workspaces \ - --count "${SCALETEST_NUM_WORKSPACES}" \ - --template "${SCALETEST_TEMPLATE}" \ - --concurrency "${SCALETEST_CREATE_CONCURRENCY}" \ - --job-timeout 15m \ + --count "${SCALETEST_PARAM_NUM_WORKSPACES}" \ + --template "${SCALETEST_PARAM_TEMPLATE}" \ + --concurrency "${SCALETEST_PARAM_CREATE_CONCURRENCY}" \ + --job-timeout 2h \ --no-cleanup \ --output json:"${SCALETEST_RESULTS_DIR}/create-workspaces.json" show_json "${SCALETEST_RESULTS_DIR}/create-workspaces.json" end_phase -wait_baseline 5 - -start_phase "SSH traffic" -coder exp scaletest workspace-traffic \ - --ssh \ - --bytes-per-tick 10240 \ - --tick-interval 1s \ - --timeout 5m \ - --output json:"${SCALETEST_RESULTS_DIR}/traffic-ssh.json" -show_json "${SCALETEST_RESULTS_DIR}/traffic-ssh.json" -end_phase - -wait_baseline 5 +wait_baseline "${SCALETEST_PARAM_LOAD_SCENARIO_BASELINE_DURATION}" -start_phase "ReconnectingPTY traffic" -coder exp scaletest workspace-traffic \ - --bytes-per-tick 10240 \ - --tick-interval 1s \ - --timeout 5m \ - --output json:"${SCALETEST_RESULTS_DIR}/traffic-reconnectingpty.json" -show_json "${SCALETEST_RESULTS_DIR}/traffic-reconnectingpty.json" -end_phase - -wait_baseline 5 - -start_phase "Dashboard traffic" -coder exp scaletest dashboard \ - --count "${SCALETEST_NUM_WORKSPACES}" \ - --job-timeout 5m \ - --output json:"${SCALETEST_RESULTS_DIR}/traffic-dashboard.json" -show_json "${SCALETEST_RESULTS_DIR}/traffic-dashboard.json" -end_phase +for scenario in "${SCALETEST_PARAM_LOAD_SCENARIOS[@]}"; do + start_phase "Load scenario: ${scenario}" + case "${scenario}" in + "SSH Traffic") + coder exp scaletest workspace-traffic \ + --ssh \ + --bytes-per-tick 1024 \ + --tick-interval 100ms \ + --timeout "${SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_DURATION}m" \ + --job-timeout "${SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_DURATION}m30s" \ + --output json:"${SCALETEST_RESULTS_DIR}/traffic-ssh.json" + show_json 
"${SCALETEST_RESULTS_DIR}/traffic-ssh.json" + ;; + "Web Terminal Traffic") + coder exp scaletest workspace-traffic \ + --bytes-per-tick 1024 \ + --tick-interval 100ms \ + --timeout "${SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_DURATION}m" \ + --job-timeout "${SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_DURATION}m30s" \ + --output json:"${SCALETEST_RESULTS_DIR}/traffic-web-terminal.json" + show_json "${SCALETEST_RESULTS_DIR}/traffic-web-terminal.json" + ;; + "Dashboard Traffic") + coder exp scaletest dashboard \ + --count "${SCALETEST_PARAM_NUM_WORKSPACES}" \ + --timeout "${SCALETEST_PARAM_LOAD_SCENARIO_DASHBOARD_TRAFFIC_DURATION}m" \ + --job-timeout "${SCALETEST_PARAM_LOAD_SCENARIO_DASHBOARD_TRAFFIC_DURATION}m30s" \ + --output json:"${SCALETEST_RESULTS_DIR}/traffic-dashboard.json" \ + >"${SCALETEST_RESULTS_DIR}/traffic-dashboard-output.log" + show_json "${SCALETEST_RESULTS_DIR}/traffic-dashboard.json" + ;; + esac + end_phase -wait_baseline 5 + wait_baseline "${SCALETEST_PARAM_LOAD_SCENARIO_BASELINE_DURATION}" +done log "Scaletest complete!" set_status Complete diff --git a/scaletest/templates/scaletest-runner/startup.sh b/scaletest/templates/scaletest-runner/startup.sh index 57151fcf2e1a4..e0ea9316a9be8 100755 --- a/scaletest/templates/scaletest-runner/startup.sh +++ b/scaletest/templates/scaletest-runner/startup.sh @@ -28,22 +28,22 @@ failed_status=Failed on_exit() { trap - ERR EXIT - case "${SCALETEST_CLEANUP_STRATEGY}" in + case "${SCALETEST_PARAM_CLEANUP_STRATEGY}" in on_stop) # Handled by shutdown script. 
;; on_success) if [[ $(get_status) != "${failed_status}" ]]; then - "${SCRIPTS_DIR}/cleanup.sh" "${SCALETEST_CLEANUP_STRATEGY}" + "${SCRIPTS_DIR}/cleanup.sh" "${SCALETEST_PARAM_CLEANUP_STRATEGY}" fi ;; on_error) if [[ $(get_status) = "${failed_status}" ]]; then - "${SCRIPTS_DIR}/cleanup.sh" "${SCALETEST_CLEANUP_STRATEGY}" + "${SCRIPTS_DIR}/cleanup.sh" "${SCALETEST_PARAM_CLEANUP_STRATEGY}" fi ;; *) - "${SCRIPTS_DIR}/cleanup.sh" "${SCALETEST_CLEANUP_STRATEGY}" + "${SCRIPTS_DIR}/cleanup.sh" "${SCALETEST_PARAM_CLEANUP_STRATEGY}" ;; esac @@ -54,15 +54,21 @@ trap on_exit EXIT on_err() { code=${?} trap - ERR + set +e log "Scaletest failed!" GRAFANA_EXTRA_TAGS=error set_status "${failed_status} (exit=${code})" + "${SCRIPTS_DIR}/report.sh" failed lock_status # Ensure we never rewrite the status after a failure. } trap on_err ERR +# Pass session token since `prepare.sh` has not yet run. +CODER_SESSION_TOKEN=$CODER_USER_TOKEN "${SCRIPTS_DIR}/report.sh" started annotate_grafana "" "Start scaletest" "${SCRIPTS_DIR}/prepare.sh" "${SCRIPTS_DIR}/run.sh" + +"${SCRIPTS_DIR}/report.sh" completed From a9beb89a7ef348c4df970cdd1d57ca6638987c28 Mon Sep 17 00:00:00 2001 From: Mathias Fredriksson Date: Mon, 25 Sep 2023 12:23:22 +0000 Subject: [PATCH 3/7] fix url --- scaletest/templates/scaletest-runner/scripts/report.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scaletest/templates/scaletest-runner/scripts/report.sh b/scaletest/templates/scaletest-runner/scripts/report.sh index bb7cc393ef8ba..d9bcaa2e08258 100755 --- a/scaletest/templates/scaletest-runner/scripts/report.sh +++ b/scaletest/templates/scaletest-runner/scripts/report.sh @@ -75,7 +75,7 @@ esac text_arr=( "${header}" "" - "${bullet} Workspace (runner): ${CODER_URL}@${owner_name}/${workspace_name}" + "${bullet} Workspace (runner): ${CODER_URL}/@${owner_name}/${workspace_name}" "${bullet} Run ID: ${SCALETEST_RUN_ID}" "${app_urls[@]}" "${params[@]}" From 2ef0e8fe0dd06f8d0b78cb3cde0cedbb2621263f Mon Sep 17 
00:00:00 2001 From: Mathias Fredriksson Date: Mon, 25 Sep 2023 13:51:45 +0000 Subject: [PATCH 4/7] try single * for bold slack message --- scaletest/templates/scaletest-runner/scripts/report.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scaletest/templates/scaletest-runner/scripts/report.sh b/scaletest/templates/scaletest-runner/scripts/report.sh index d9bcaa2e08258..a6f11318ab6c4 100755 --- a/scaletest/templates/scaletest-runner/scripts/report.sh +++ b/scaletest/templates/scaletest-runner/scripts/report.sh @@ -40,7 +40,7 @@ while read -r app_name; do if [[ ${status} != started ]] && [[ ${app_url} = *to=now* ]]; then # Update Grafana URL with end stamp and make bold. app_url="${app_url//to=now/to=$(($(date +%s) * 1000))}" - bold='**' + bold='*' fi app_urls+=("${bullet} ${bold}${app_name}: ${app_url}${bold}") done <<<"${app_urls_raw}" From 1414f739a003727fd0c8fdc903b482e74a393e56 Mon Sep 17 00:00:00 2001 From: Mathias Fredriksson Date: Tue, 26 Sep 2023 17:43:24 +0000 Subject: [PATCH 5/7] non-hardcoded grace period --- scaletest/templates/scaletest-runner/main.tf | 31 ++++++++++---------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/scaletest/templates/scaletest-runner/main.tf b/scaletest/templates/scaletest-runner/main.tf index a1b3b52735770..b3b74cb54af6a 100644 --- a/scaletest/templates/scaletest-runner/main.tf +++ b/scaletest/templates/scaletest-runner/main.tf @@ -35,17 +35,18 @@ resource "null_resource" "permission_check" { } locals { - workspace_pod_name = "coder-scaletest-runner-${lower(data.coder_workspace.me.owner)}-${lower(data.coder_workspace.me.name)}" - workspace_pod_instance = "coder-workspace-${lower(data.coder_workspace.me.owner)}-${lower(data.coder_workspace.me.name)}" - service_account_name = "scaletest-sa" - cpu = 16 - memory = 64 - home_disk_size = 10 - scaletest_run_id = "scaletest-${time_static.start_time.rfc3339}" - scaletest_run_dir = "/home/coder/${local.scaletest_run_id}" - grafana_url =
"https://stats.dev.c8s.io" - grafana_dashboard_uid = "qLVSTR-Vz" - grafana_dashboard_name = "coderv2-loadtest-dashboard" + workspace_pod_name = "coder-scaletest-runner-${lower(data.coder_workspace.me.owner)}-${lower(data.coder_workspace.me.name)}" + workspace_pod_instance = "coder-workspace-${lower(data.coder_workspace.me.owner)}-${lower(data.coder_workspace.me.name)}" + workspace_pod_termination_grace_period_seconds = 7200 # 2 hours (cleanup timeout). + service_account_name = "scaletest-sa" + cpu = 16 + memory = 64 + home_disk_size = 10 + scaletest_run_id = "scaletest-${time_static.start_time.rfc3339}" + scaletest_run_dir = "/home/coder/${local.scaletest_run_id}" + grafana_url = "https://stats.dev.c8s.io" + grafana_dashboard_uid = "qLVSTR-Vz" + grafana_dashboard_name = "coderv2-loadtest-dashboard" } data "coder_provisioner" "me" { @@ -318,7 +319,6 @@ resource "coder_agent" "main" { SCALETEST_PARAM_LOAD_SCENARIO_BASELINE_DURATION : "${data.coder_parameter.load_scenario_baseline_duration.value}", GRAFANA_URL : local.grafana_url, - # GRAFANA_DASHBOARD_UID : local.grafana_dashboard_uid, SCRIPTS_ZIP : filebase64(data.archive_file.scripts_zip.output_path), SCRIPTS_DIR : "/tmp/scripts", @@ -493,7 +493,7 @@ resource "kubernetes_pod" "main" { } # Set the pod delete timeout to termination_grace_period_seconds + 1m. timeouts { - delete = "122m" + delete = "${(local.workspace_pod_termination_grace_period_seconds + 120) / 60}s" } spec { security_context { @@ -505,8 +505,9 @@ resource "kubernetes_pod" "main" { service_account_name = local.service_account_name # Allow the coder agent to perform graceful shutdown and cleanup of - # scaletest resources, 2 hours (cleanup timeout) + 1 minute. - termination_grace_period_seconds = 7260 + # scaletest resources. We add an extra minute so ensure work + # completion is prioritized over timeout. 
+ termination_grace_period_seconds = local.workspace_pod_termination_grace_period_seconds + 60 container { name = "dev" From 73dfa18950f74d371fdc621b649e90f9a837b15a Mon Sep 17 00:00:00 2001 From: Mathias Fredriksson Date: Tue, 26 Sep 2023 17:51:13 +0000 Subject: [PATCH 6/7] add params for tick bytes/interval --- scaletest/templates/scaletest-runner/main.tf | 56 ++++++++++++++++++- .../templates/scaletest-runner/scripts/run.sh | 8 +-- 2 files changed, 58 insertions(+), 6 deletions(-) diff --git a/scaletest/templates/scaletest-runner/main.tf b/scaletest/templates/scaletest-runner/main.tf index b3b74cb54af6a..c5b93d00978e1 100644 --- a/scaletest/templates/scaletest-runner/main.tf +++ b/scaletest/templates/scaletest-runner/main.tf @@ -228,8 +228,32 @@ data "coder_parameter" "load_scenario_ssh_traffic_duration" { } } -data "coder_parameter" "load_scenario_web_terminal_traffic_duration" { +data "coder_parameter" "load_scenario_ssh_bytes_per_tick" { order = 24 + name = "SSH Bytes Per Tick" + type = "number" + description = "The number of bytes to send per tick in the SSH traffic load scenario." + mutable = true + default = 1024 + validation { + min = 1 + } +} + +data "coder_parameter" "load_scenario_ssh_tick_interval" { + order = 25 + name = "SSH Tick Interval" + type = "number" + description = "The number of milliseconds between each tick in the SSH traffic load scenario." + mutable = true + default = 100 + validation { + min = 1 + } +} + +data "coder_parameter" "load_scenario_web_terminal_traffic_duration" { + order = 26 name = "Web Terminal Traffic Duration" type = "number" description = "The duration of the web terminal traffic load scenario in minutes." 
@@ -241,8 +265,32 @@ data "coder_parameter" "load_scenario_web_terminal_traffic_duration" { } } +data "coder_parameter" "load_scenario_web_terminal_bytes_per_tick" { + order = 27 + name = "Web Terminal Bytes Per Tick" + type = "number" + description = "The number of bytes to send per tick in the web terminal traffic load scenario." + mutable = true + default = 1024 + validation { + min = 1 + } +} + +data "coder_parameter" "load_scenario_web_terminal_tick_interval" { + order = 28 + name = "Web Terminal Tick Interval" + type = "number" + description = "The number of milliseconds between each tick in the web terminal traffic load scenario." + mutable = true + default = 100 + validation { + min = 1 + } +} + data "coder_parameter" "load_scenario_dashboard_traffic_duration" { - order = 25 + order = 29 name = "Dashboard Traffic Duration" type = "number" description = "The duration of the dashboard traffic load scenario in minutes." @@ -314,7 +362,11 @@ resource "coder_agent" "main" { SCALETEST_PARAM_CLEANUP_STRATEGY : data.coder_parameter.cleanup_strategy.value, SCALETEST_PARAM_LOAD_SCENARIOS : data.coder_parameter.load_scenarios.value, SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_DURATION : "${data.coder_parameter.load_scenario_ssh_traffic_duration.value}", + SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_BYTES_PER_TICK : "${data.coder_parameter.load_scenario_ssh_bytes_per_tick.value}", + SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_TICK_INTERVAL : "${data.coder_parameter.load_scenario_ssh_tick_interval.value}", SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_DURATION : "${data.coder_parameter.load_scenario_web_terminal_traffic_duration.value}", + SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_BYTES_PER_TICK : "${data.coder_parameter.load_scenario_web_terminal_bytes_per_tick.value}", + SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_TICK_INTERVAL : "${data.coder_parameter.load_scenario_web_terminal_tick_interval.value}", 
SCALETEST_PARAM_LOAD_SCENARIO_DASHBOARD_TRAFFIC_DURATION : "${data.coder_parameter.load_scenario_dashboard_traffic_duration.value}", SCALETEST_PARAM_LOAD_SCENARIO_BASELINE_DURATION : "${data.coder_parameter.load_scenario_baseline_duration.value}", diff --git a/scaletest/templates/scaletest-runner/scripts/run.sh b/scaletest/templates/scaletest-runner/scripts/run.sh index 294cc956cc4ab..1197283f82b8d 100755 --- a/scaletest/templates/scaletest-runner/scripts/run.sh +++ b/scaletest/templates/scaletest-runner/scripts/run.sh @@ -31,8 +31,8 @@ for scenario in "${SCALETEST_PARAM_LOAD_SCENARIOS[@]}"; do "SSH Traffic") coder exp scaletest workspace-traffic \ --ssh \ - --bytes-per-tick 1024 \ - --tick-interval 100ms \ + --bytes-per-tick "${SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_BYTES_PER_TICK}" \ + --tick-interval "${SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_TICK_INTERVAL}ms" \ --timeout "${SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_DURATION}m" \ --job-timeout "${SCALETEST_PARAM_LOAD_SCENARIO_SSH_TRAFFIC_DURATION}m30s" \ --output json:"${SCALETEST_RESULTS_DIR}/traffic-ssh.json" @@ -40,8 +40,8 @@ for scenario in "${SCALETEST_PARAM_LOAD_SCENARIOS[@]}"; do ;; "Web Terminal Traffic") coder exp scaletest workspace-traffic \ - --bytes-per-tick 1024 \ - --tick-interval 100ms \ + --bytes-per-tick "${SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_BYTES_PER_TICK}" \ + --tick-interval "${SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_TICK_INTERVAL}ms" \ --timeout "${SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_DURATION}m" \ --job-timeout "${SCALETEST_PARAM_LOAD_SCENARIO_WEB_TERMINAL_TRAFFIC_DURATION}m30s" \ --output json:"${SCALETEST_RESULTS_DIR}/traffic-web-terminal.json" From b92ead16bce5b6413a752635e7825d426baf5d95 Mon Sep 17 00:00:00 2001 From: Mathias Fredriksson Date: Tue, 26 Sep 2023 21:25:31 +0300 Subject: [PATCH 7/7] Update scaletest/templates/scaletest-runner/scripts/report.sh --- scaletest/templates/scaletest-runner/scripts/report.sh | 1 + 1 file changed, 1 
insertion(+) diff --git a/scaletest/templates/scaletest-runner/scripts/report.sh b/scaletest/templates/scaletest-runner/scripts/report.sh index a6f11318ab6c4..453d4e53c6e16 100755 --- a/scaletest/templates/scaletest-runner/scripts/report.sh +++ b/scaletest/templates/scaletest-runner/scripts/report.sh @@ -20,6 +20,7 @@ esac . "${SCRIPTS_DIR}/lib.sh" # NOTE(mafredri): API returns HTML if we accidentally use `...//api` vs `.../api`. +# https://github.com/coder/coder/issues/9877 CODER_URL="${CODER_URL%/}" buildinfo="$(curl -sSL "${CODER_URL}/api/v2/buildinfo")" server_version="$(jq -r '.version' <<<"${buildinfo}")" pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy