From 03b8e1217b08e6580e08129fddd64dbc20265fd4 Mon Sep 17 00:00:00 2001 From: grinko Date: Wed, 8 Sep 2021 17:02:02 +0300 Subject: [PATCH] =?UTF-8?q?=D0=94=D0=BE=D1=80=D0=B0=D0=B1=D0=BE=D1=82?= =?UTF-8?q?=D0=B0=D0=BB=20=D0=BF=D0=BB=D0=B0=D0=B3=D0=B8=D0=BD=20pg=5Fprob?= =?UTF-8?q?ackup.py?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- documentation/configuration_file.md | 12 +- documentation/metrics.md | 162 +++++++------ documentation/metrics.ru.md | 82 +++++++ mamonsu/plugins/system/linux/pg_probackup.py | 231 ++++++++++++++++--- packaging/conf/example.conf | 6 +- 5 files changed, 383 insertions(+), 110 deletions(-) create mode 100644 documentation/metrics.ru.md diff --git a/documentation/configuration_file.md b/documentation/configuration_file.md index a7bd93d7..5cc47750 100644 --- a/documentation/configuration_file.md +++ b/documentation/configuration_file.md @@ -190,9 +190,11 @@ interval = 60 [pgprobackup] enabled = false -interval = 300 +interval = 900 backup_dirs = /backup_dir1,/backup_dir2 -pg_probackup_path = /usr/bin/pg_probackup-11 +pg_probackup_path = /usr/bin/pg_probackup-13 +max_time_run_backup2alert_in_sec = 21600 +max_time_lack_backup2alert_in_sec = 100800 ``` **[preparedtransaction]** @@ -219,6 +221,10 @@ The *interval* parameter allows you to change the metrics collection interval. By default this plugin is disabled. To enable it set the enabled parameter to True. -This plugin collects two metrics: *pg_probackup.dir.size[#backup_directory]* (the size of the target directory) and *pg_probackup.dir.error[#backup_directory]* (backup errors) for each specified *backup_directory*. +This plugin collects several metrics: +- *pg_probackup.dir.size[#backup_directory]* (the size of the target directory) +- *pg_probackup.dir.error[#backup_directory]* (backup errors) +- other metrics for each specified *backup_directory*. 
+See file metrics.md If any generated backup has bad status, like ERROR, CORRUPT, ORPHAN, а trigger is fired. diff --git a/documentation/metrics.md b/documentation/metrics.md index e3049162..949e1121 100644 --- a/documentation/metrics.md +++ b/documentation/metrics.md @@ -1,20 +1,21 @@ # Mamonsu: metrics **Metrics:** -- [Mamonsu health metrics](#mamonsu-health-metrics) - - [Items](#items) - - [Triggers](#triggers) -- [System metrics](#system-metrics) - - [*nix](#nix) +- [Mamonsu: metrics](#mamonsu-metrics) + - [Mamonsu Health metrics](#mamonsu-health-metrics) + - [Items](#items) + - [Triggers](#triggers) + - [System metrics](#system-metrics) + - [*nix](#nix) - [Items](#items-1) - [Discovery Rules](#discovery-rules) - [Graphs](#graphs) - [Triggers](#triggers-1) - - [Windows](#windows) + - [Windows](#windows) - [Items](#items-2) - [Discovery Rules](#discovery-rules-1) -- [PostgreSQL metrics](#postgresql-metrics) - - [Archiving](#archiving) + - [PostgreSQL metrics](#postgresql-metrics) + - [Archiving](#archiving) - [Items](#items-3) - [Graphs](#graphs-1) - [Triggers](#triggers-2) @@ -23,63 +24,63 @@ - [Background Writer](#background-writer) - [Items](#items-5) - [Graphs](#graphs-2) - - [Blocks](#blocks) + - [Blocks](#blocks) - [Items](#items-6) - [Graphs](#graphs-3) - - [Checkpoints](#checkpoints) + - [Checkpoints](#checkpoints) - [Items](#items-7) - [Graphs](#graphs-4) - [Triggers](#triggers-3) - - [Connections](#connections) + - [Connections](#connections) - [Items](#items-8) - [Graphs](#graphs-5) - [Triggers](#triggers-4) - - [Databases](#databases) + - [Databases](#databases) - [Discovery Rules](#discovery-rules-2) - - [Events](#events) + - [Events](#events) - [Items](#items-9) - [Graphs](#graphs-6) - - [Health](#health) + - [Health](#health) - [Items](#items-10) - [Triggers](#triggers-5) - - [Memory Leak](#memory-leak) + - [Memory Leak](#memory-leak) - [Items](#items-11) - [Triggers](#triggers-6) - - [pg_buffercache](#pg_buffercache) + - 
[pg_buffercache](#pg_buffercache) - [Items](#items-12) - [Graphs](#graphs-7) - - [pg_locks](#pg_locks) + - [pg_locks](#pg_locks) - [Items](#items-13) - [Graphs](#graphs-8) - - [pg_stat_statements](#pg_stat_statements) + - [pg_stat_statements](#pg_stat_statements) - [Items](#items-14) - [Graphs](#graphs-9) - - [Prepared Transactions](#prepared-transactions) + - [Prepared Transactions](#prepared-transactions) - [Items](#items-15) - [Graphs](#graphs-10) - [Triggers](#triggers-7) - - [Relations](#relations) + - [Relations](#relations) - [Discovery Rules](#discovery-rules-3) - - [Replication](#replication) + - [Replication](#replication) - [Items](#items-16) - [Discovery Rules](#discovery-rules-4) - [Triggers](#triggers-8) - - [Temp Files](#temp-files) + - [Temp Files](#temp-files) - [Items](#items-17) - [Graphs](#graphs-11) - - [Transactions](#transactions) + - [Transactions](#transactions) - [Items](#items-18) - [Triggers](#triggers-9) - - [Tuples](#tuples) + - [Tuples](#tuples) - [Items](#items-19) - [Graphs](#graphs-12) - - [WAL](#wal) + - [WAL](#wal) - [Items](#items-20) -- [Postgres Pro metrics](#postgres-pro-metrics) - - [Compressed File System](#compressed-file-system) + - [Postgres Pro metrics](#postgres-pro-metrics) + - [Compressed File System](#compressed-file-system) - [Items](#items-21) - [Discovery Rules](#discovery-rules-5) - - [pg_wait_sampling](#pg_wait_sampling) + - [pg_wait_sampling](#pg_wait_sampling) - [Items](#items-22) - [Graphs](#graphs-13) @@ -1203,57 +1204,68 @@ Default config: 4. **pg_probackup Discovery** Items: - - - - - - - - - - - - - - - - - - - - - - - - - - -
NamePg_probackup dir {#BACKUPDIR}: errorPg_probackup dir {#BACKUPDIR}: size
Keypg_probackup.dir.error[{#BACKUPDIR}]pg_probackup.dir.size[{#BACKUPDIR}]
TypeTextNumeric (float)
UnitsBytes
DeltaAs IsAs Is
+ +| Name | Key | Storage | Description | +| ---------------------------------------------------------- | ------------------------------------------------ | ------- | ---------------------------------------------------------- | +| Pg_probackup dir {#BACKUPDIR}: size | pg_probackup.dir.size[{#BACKUPDIR}] | 31d | Total catalog size: /backups + /wal | +| Pg_probackup dir {#BACKUPDIR}/backups: size | pg_probackup.dir.size[{#BACKUPDIR}/backups] | 31d | Size of the /backups subdirectory | +| Pg_probackup dir {#BACKUPDIR}/wal: size | pg_probackup.dir.size[{#BACKUPDIR}/wal] | 31d | Size of the /wal subdirectory | +| Pg_probackup dir {#BACKUPDIR}: duration full backup | pg_probackup.dir.duration_full[{#BACKUPDIR}] | 31d | Duration in seconds of creating a full backup | +| Pg_probackup dir {#BACKUPDIR}: duration incremental backup | pg_probackup.dir.duration_inc[{#BACKUPDIR}] | 31d | Duration in seconds of creating an incremental backup | +| Pg_probackup dir {#BACKUPDIR}: start time backup | pg_probackup.dir.start_time_backup[{#BACKUPDIR}] | | Time (unixtime) when backup creation started | +| Pg_probackup dir {#BACKUPDIR}: end time backup | pg_probackup.dir.end_time_backup[{#BACKUPDIR}] | | Time (unixtime) when backup creation completed | +| Pg_probackup dir {#BACKUPDIR}: mode | pg_probackup.dir.mode_backup[{#BACKUPDIR}] | | Current backup mode | +| Pg_probackup dir {#BACKUPDIR}: status | pg_probackup.dir.status_backup[{#BACKUPDIR}] | | Current backup status | +| Pg_probackup dir {#BACKUPDIR}: error | pg_probackup.dir.error[{#BACKUPDIR}] | | A sign of an erroneous state or "ok" if everything is fine | Graphs: - - - - - - - - - -
NamePg_probackup: backup dir: {#BACKUPDIR} size
MetricsPg_probackup dir {#BACKUPDIR}: size
+ +1. Pg_probackup: backup dir: {#BACKUPDIR} size + +Shows 3 metrics with information about the size of directories with archival copies: + +| Key | Side graphs | Description | +| ------------------------------------------- | ------------ | ------------------------------------ | +| pg_probackup.dir.size[{#BACKUPDIR}] | (Left Side) | Total Directory Size /backups + /wal | +| pg_probackup.dir.size[{#BACKUPDIR}/backups] | (Left Side) | Subdirectory size /backups | +| pg_probackup.dir.size[{#BACKUPDIR}/wal] | (Right Side) | Subdirectory size /wal | + +2. Pg_probackup: backup dir: {#BACKUPDIR} duration + +Shows 2 metrics with a duration of creating archive copies: + +| Key | Side graphs | Description | +| -------------------------------------------- | ------------ | ----------------------------------------------------- | +| pg_probackup.dir.duration_full[{#BACKUPDIR}] | (Left Side) | Duration in seconds of creating a complete backup | +| pg_probackup.dir.duration_inc[{#BACKUPDIR}] | (Right Side) | Duration in seconds of creating an incremental backup | Triggers: - - - - - - - - - -
NameError in pg_probackup dir {#BACKUPDIR} (hostname={HOSTNAME} value={ITEM.LASTVALUE})
ExpressionTriggers if pg_probackup status is not OK.
+ +The following alerts have been created that allow you to monitor the status of archive directories: + +* The alert triggers if a backup has been running longer than specified in the configuration parameter `max_time_run_backup2alert_in_sec`. Time is specified in seconds and the default value is 21600 (6 hours). The current state of the backup creation process is monitored. + +| Category | Details | +| ----------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| Importance: | Warning | +| Name: | Backup runs too long on {HOSTNAME} in pg_probackup dir {#BACKUPDIR} (RUNNING) | +| Expression: | {PostgresPro-Linux:pg_probackup.dir.status_backup[{#BACKUPDIR}].last()}="RUNNING" and ( {PostgresPro-Linux:pg_probackup.dir.start_time_backup[{#BACKUPDIR}].now()}-{PostgresPro-Linux:pg_probackup.dir.start_time_backup[{#BACKUPDIR}].last()}) > max_time_run_backup2alert_in_sec | + +* The alert triggers if no new backup has been created for longer than specified in the configuration parameter `max_time_lack_backup2alert_in_sec`. Time is specified in seconds and the default value is 100800 (28 hours). It checks that the next backup (of any type) is created no later than the time specified in the parameter.
+ +| Category | Details | +| ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| Importance: | Warning | +| Name: | Long time no backups on {HOSTNAME} in pg_probackup dir {#BACKUPDIR} | +| Expression: | ( {PostgresPro-Linux:pg_probackup.dir.end_time_backup[{#BACKUPDIR}].now()} -{PostgresPro-Linux:pg_probackup.dir.end_time_backup[{#BACKUPDIR}].last()}) > max_time_lack_backup2alert_in_sec | + +* The alert triggers if an error occurred when creating a backup - 'ERROR', 'CORRUPT', 'ORPHAN'. It monitors the state of every archive copy, not only the latest one. It stays active as long as any historical copy has an erroneous status. + +| Category | Details | +| ----------- | ----------------------------------------------------------------------------------- | +| Importance: | Average | +| Name: | Error in pg_probackup dir {#BACKUPDIR} (hostname={HOSTNAME} value={ITEM.LASTVALUE}) | +| Expression: | {PostgresPro-Linux:pg_probackup.dir.error[{#BACKUPDIR}].str(ok)}<>1 | ### Graphs diff --git a/documentation/metrics.ru.md b/documentation/metrics.ru.md new file mode 100644 index 00000000..1cab5da6 --- /dev/null +++ b/documentation/metrics.ru.md @@ -0,0 +1,82 @@ +# Описания плагинов + +## pg_probackup.py +Предназначен для контроля за состоянием каталогов бэкапов создаваемых утилитой [pg_probackup](https://postgrespro.ru/docs/postgrespro/current/app-pgprobackup). +Плагин адаптирован для контроля нескольких инстансов в одном каталоге. Имя инстанса указывается в ключе метрики как подкаталог. + +### Настройки в секции [pgprobackup] + +| Наименование | Ключ | Описание | +| --------------------------------- | ------------------------- | ------------------------------------------------------------------ | +| enabled | False | По умолчанию плагин отключен. Укажите True для включения | +| interval | 900 | Как часто опрашивать состояние каталогов.
Указано в секундах | +| backup_dirs | /backup_dir1,/backup_dir2 | Список каталогов бэкапов утилиты pg_probackup | +| pg_probackup_path | /usr/bin/pg_probackup-13 | Полный путь к утилите создания бэкапов pg_probackup | +| max_time_run_backup2alert_in_sec | 21600 | Время срабатывания алерта "Backup runs too long on..." в секундах. | +| max_time_lack_backup2alert_in_sec | 100800 | Время срабатывания алерта "Long time no backups on..." в секундах. | + + +### Текущие метрики в Discovery правиле: + +| Наименование | Ключ | Хранить | Описание | +| ---------------------------------------------------------- | ------------------------------------------------ | ------- | -------------------------------------------------------- | +| Pg_probackup dir {#BACKUPDIR}: size | pg_probackup.dir.size[{#BACKUPDIR}] | 31d | Общий размер каталога: /backups + /wal | +| Pg_probackup dir {#BACKUPDIR}/backups: size | pg_probackup.dir.size[{#BACKUPDIR}/backups] | 31d | Размер подкаталога /backups | +| Pg_probackup dir {#BACKUPDIR}/wal: size | pg_probackup.dir.size[{#BACKUPDIR}/wal] | 31d | Размер подкаталога /wal | +| Pg_probackup dir {#BACKUPDIR}: duration full backup | pg_probackup.dir.duration_full[{#BACKUPDIR}] | 31d | Длительность в секундах создания полного бэкапа | +| Pg_probackup dir {#BACKUPDIR}: duration incremental backup | pg_probackup.dir.duration_inc[{#BACKUPDIR}] | 31d | Длительность в секундах создания инкрементального бэкапа | +| Pg_probackup dir {#BACKUPDIR}: start time backup | pg_probackup.dir.start_time_backup[{#BACKUPDIR}] | | Время (UNIXTIME) старта создания бэкапа | +| Pg_probackup dir {#BACKUPDIR}: end time backup | pg_probackup.dir.end_time_backup[{#BACKUPDIR}] | | Время (UNIXTIME) завершения создания бэкапа | +| Pg_probackup dir {#BACKUPDIR}: mode | pg_probackup.dir.mode_backup[{#BACKUPDIR}] | | Текущий режим бэкапа | +| Pg_probackup dir {#BACKUPDIR}: status | pg_probackup.dir.status_backup[{#BACKUPDIR}] | | Текущий статус бэкапа | +| Pg_probackup dir {#BACKUPDIR}: 
error | pg_probackup.dir.error[{#BACKUPDIR}] | | Признак ошибочного состояния или "ok" если всё хорошо | + + +### Текущие алерты в Discovery правиле: +Созданы следующие алерты, позволящие контролировать состояние архивных каталогов: + +* Алерт срабатывает если создание бэкапа выполняется дольше, чем указано в настроечном параметре `max_time_run_backup2alert_in_sec`. Время задаётся в секундах и значение по умолчанию = 21600 (6 часов). Контролируется текущее состояние в котором находится процесс создания бэкапной копии. + +| Категория | Детали | +| ------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| Важность: | Warning | +| Наименование: | Backup runs too long on {HOSTNAME} in pg_probackup dir {#BACKUPDIR} (RUNNING) | +| Выражение: | {PostgresPro-Linux:pg_probackup.dir.status_backup[{#BACKUPDIR}].last()}="RUNNING" and ( {PostgresPro-Linux:pg_probackup.dir.start_time_backup[{#BACKUPDIR}].now()}-{PostgresPro-Linux:pg_probackup.dir.start_time_backup[{#BACKUPDIR}].last()}) > max_time_run_backup2alert_in_sec | + +* Алерт срабатывает если не выполняется создание нового бэкапа дольше, чем указано в настроечном параметре `max_time_lack_backup2alert_in_sec`. Время задаётся в секундах и значение по умолчанию = 100800 (28 часов). Контролируется, что очередной бэкап (тип бэкапа любой) будет создан не позже, чем указано в параметре. 
+ +| Категория | Детали | +| ------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| Важность: | Warning | +| Наименование: | Long time no backups on {HOSTNAME} in pg_probackup dir {#BACKUPDIR} | +| Выражение: | ( {PostgresPro-Linux:pg_probackup.dir.end_time_backup[{#BACKUPDIR}].now()} -{PostgresPro-Linux:pg_probackup.dir.end_time_backup[{#BACKUPDIR}].last()}) > max_time_lack_backup2alert_in_sec | + +* Алерт срабатывает если при создании бэкапа произошла ошибка - 'ERROR', 'CORRUPT', 'ORPHAN'. Контролирует состояние любой архивной копии, не только последней. Активен всё время пока есть любая архивная копия с ошибочным состоянием. + +| Категория | Детали | +| ------------- | ----------------------------------------------------------------------------------- | +| Важность: | Average | +| Наименование: | Error in pg_probackup dir {#BACKUPDIR} (hostname={HOSTNAME} value={ITEM.LASTVALUE}) | +| Выражение: | {PostgresPro-Linux:pg_probackup.dir.error[{#BACKUPDIR}].str(ok)}<>1 | + + +### Текущие графики в Discovery правиле: + +1. Pg_probackup: backup dir: {#BACKUPDIR} size + +Показывает 3 метрики с информацией о размерах каталогов с архивными копиями: + +| Метрика | Сторона графика | Описание | +| ------------------------------------------- | --------------- | -------------------------------------- | +| pg_probackup.dir.size[{#BACKUPDIR}] | (Left Side) | Общий размер каталогов /backups + /wal | +| pg_probackup.dir.size[{#BACKUPDIR}/backups] | (Left Side) | размер подкаталога /backups | +| pg_probackup.dir.size[{#BACKUPDIR}/wal] | (Right Side) | размер подкаталога /wal | + +2. 
Pg_probackup: backup dir: {#BACKUPDIR} duration + +Показывает 2 метрики с длительностью создания архивных копий: + +| Метрика | Сторона графика | Описание | +| -------------------------------------------- | --------------- | -------------------------------------------------------- | +| pg_probackup.dir.duration_full[{#BACKUPDIR}] | (Left Side) | Длительность в секундах создания полного бэкапа | +| pg_probackup.dir.duration_inc[{#BACKUPDIR}] | (Right Side) | Длительность в секундах создания инкрементального бэкапа | diff --git a/mamonsu/plugins/system/linux/pg_probackup.py b/mamonsu/plugins/system/linux/pg_probackup.py index 2472aa52..1f937436 100644 --- a/mamonsu/plugins/system/linux/pg_probackup.py +++ b/mamonsu/plugins/system/linux/pg_probackup.py @@ -3,17 +3,28 @@ import json import os import subprocess - +from datetime import datetime class PgProbackup(Plugin): os_walk_error = None block_size = 4096 - Interval = 5 * 60 + Interval = 15 * 60 key_main = 'pg_probackup.discovery{0}' key_dir_size = 'pg_probackup.dir.size{0}' key_dir_error = 'pg_probackup.dir.error{0}' + key_dir_duration_full = 'pg_probackup.dir.duration_full{0}' + key_dir_duration_inc = 'pg_probackup.dir.duration_inc{0}' + key_dir_endtime_backup = 'pg_probackup.dir.end_time_backup{0}' + key_dir_starttime_backup = 'pg_probackup.dir.start_time_backup{0}' + key_dir_status_backup = 'pg_probackup.dir.status_backup{0}' + key_dir_mode_backup = 'pg_probackup.dir.mode_backup{0}' AgentPluginType = 'pg' Type = "mamonsu" + + DEFAULT_CONFIG = { + 'max_time_run_backup2alert_in_sec': str(21600), # The maximum time of running time of backup to Alert in seconds (6 hours) + 'max_time_lack_backup2alert_in_sec': str(100800), # The maximum time of lack of backup to Alert (28 hours) + } def set_os_walk_error(self, e): self.os_walk_error = e @@ -56,50 +67,102 @@ def run(self, zbx): """Disable plugin and exit, because the parameter 'backup_dirs' in section [pgprobackup] is not set. 
Set this parameter if needed and restart.""") + fmt_data = '%Y-%m-%d %H:%M:%S+03' backup_dirs = config_backup_dirs.split(',') dirs = [] - for _dir in backup_dirs: - dirs.append({'{#BACKUPDIR}': _dir}) - - dir_size = self.dir_size(_dir) - if self.os_walk_error: - self.log.error( - "Error in count size pg_probackup dir: {backup_catalog}. Error: {error}".format( - backup_catalog=_dir, error=str(self.os_walk_error))) - else: - zbx.send(self.key_dir_size.format('[' + _dir + ']'), dir_size) + for _dir_top in backup_dirs: # Search for backups with bad status is done by running # "pg_probackup show -B backup_dir" command - command = [config_pg_probackup_path, 'show', '-B', _dir, '--format=json'] + command = [config_pg_probackup_path, 'show', '-B', _dir_top, '--format=json'] process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE) stdout, stderr = process.communicate() return_code = process.returncode if return_code != 0: self.log.error( "The command: {command} return code {return_code}. Error: {error}".format(command=command, - return_code=return_code, - error=stderr)) + return_code=return_code, + error=stderr)) continue try: result = json.loads(stdout.decode('utf-8')) except Exception as e: self.log.error('Error in convert data: {stdout} \n {e}'.format(stdout=stdout, e=e)) continue + no_error= True + for instance in result: - for backup in instance.get('backups', []): + # We consider the sizes of each instance + instance_name = instance['instance'] + _dir = _dir_top + '/' + instance_name + dirs.append({'{#BACKUPDIR}': _dir}) + + # sud-directory backups + dir_size_backups = self.dir_size(_dir_top + '/backups/' + instance_name) + if self.os_walk_error: + self.log.error( + "Error in count size pg_probackup dir: {backup_catalog}. 
Error: {error}".format( + backup_catalog=(_dir_top + '/backups/' + instance_name), error=str(self.os_walk_error))) + else: + # We consider the size of the predefined directories - backups + zbx.send(self.key_dir_size.format('[' + _dir + '/backups]'), dir_size_backups) + + # sud-directory wal + dir_size_wal = self.dir_size(_dir_top + '/wal/' + instance_name) + if self.os_walk_error: + self.log.error( + "Error in count size pg_probackup dir: {backup_catalog}. Error: {error}".format( + backup_catalog=(_dir_top + '/wal/' + instance_name), error=str(self.os_walk_error))) + else: + # We consider the size of the predefined directories - wal + zbx.send(self.key_dir_size.format('[' + _dir + '/wal]'), dir_size_wal) + + # We consider the size of the predefined directories - backups and wal + zbx.send(self.key_dir_size.format('[' + _dir + ']'), dir_size_backups+dir_size_wal) + + full_send = 0 + for idx, backup in enumerate(instance.get('backups', [])): status = backup['status'] + mode = backup['backup-mode'] + if idx == 0: + # Status of the last backup + zbx.send(self.key_dir_status_backup.format('[' + _dir + ']'), status) + # Backup Creation Mode Full, Page, Delta and Ptrack of the last backup + zbx.send(self.key_dir_mode_backup.format('[' + _dir + ']'), mode) if status in ['ERROR', 'CORRUPT', 'ORPHAN']: error = 'Backup with id: {backup_id} in instance: {instance_name} in pg_probackup dir: ' \ - '{backup_catalog} has status: {status}.'.format(backup_id=backup['id'], - instance_name=instance['instance'], - status=status, backup_catalog=_dir) + '{backup_catalog} has status: {status}.'.format(backup_id=backup['id'], + instance_name=instance_name, + status=status, backup_catalog=_dir) self.log.info(error) no_error = False zbx.send(self.key_dir_error.format('[' + _dir + ']'), error) - if no_error: - zbx.send(self.key_dir_error.format('[' + _dir + ']'), 'ok') + if idx == 0: + # the start time of the last backup at unixtime + start = datetime.strptime(backup['start-time'], fmt_data) 
+ zbx.send(self.key_dir_starttime_backup.format('[' + _dir + ']'), start.timestamp()) + # check end-time and calculate duration + if 'end-time' in backup: + end = datetime.strptime(backup['end-time'], fmt_data) + delta = (end - start).total_seconds() + # the end time of the last backup at unixtime + zbx.send(self.key_dir_endtime_backup.format('[' + _dir + ']'), end.timestamp()) + # duration full or incremental of the last backup + if backup['backup-mode'] == "FULL": + zbx.send(self.key_dir_duration_full.format('[' + _dir + ']'), delta) + full_send = 1 + else: + zbx.send(self.key_dir_duration_inc.format('[' + _dir + ']'), delta) + if full_send == 0 and 'end-time' in backup and backup['backup-mode'] == "FULL": + start = datetime.strptime(backup['start-time'], fmt_data) + end = datetime.strptime(backup['end-time'], fmt_data) + delta = (end - start).total_seconds() + zbx.send(self.key_dir_duration_full.format('[' + _dir + ']'), delta) + full_send = 1 + + if no_error: + zbx.send(self.key_dir_error.format('[' + _dir + ']'), 'ok') zbx.send(self.key_main.format('[]'), zbx.json({'data': dirs})) del dirs @@ -128,24 +191,132 @@ def discovery_rules(self, template, dashboard=False): 'name': 'Pg_probackup dir {#BACKUPDIR}: size', 'units': Plugin.UNITS.bytes, 'value_type': Plugin.VALUE_TYPE.numeric_unsigned, - 'delay': self.plugin_config('interval')}, + 'history': '31', + 'delay': self.plugin_config('interval'), + 'description': "Size of the entire catalog with backups"}, + {'key': self.right_type(self.key_dir_size, var_discovery="{#BACKUPDIR}/backups,"), + 'name': 'Pg_probackup dir {#BACKUPDIR}/backups: size', + 'units': Plugin.UNITS.bytes, + 'value_type': Plugin.VALUE_TYPE.numeric_unsigned, + 'history': '31', + 'delay': self.plugin_config('interval'), + 'description': "The size of the entire subdirectory /backups"}, + {'key': self.right_type(self.key_dir_size, var_discovery="{#BACKUPDIR}/wal,"), + 'name': 'Pg_probackup dir {#BACKUPDIR}/wal: size', + 'units': Plugin.UNITS.bytes, 
+ 'value_type': Plugin.VALUE_TYPE.numeric_unsigned, + 'history': '31', + 'delay': self.plugin_config('interval'), + 'description': "The size of the entire subdirectory /wal"}, {'key': self.right_type(self.key_dir_error, var_discovery="{#BACKUPDIR},"), 'name': 'Pg_probackup dir {#BACKUPDIR}: error', 'value_type': Plugin.VALUE_TYPE.text, - 'delay': self.plugin_config('interval')}, + 'delay': self.plugin_config('interval'), + 'description': "Sign of the erroneous completion of the backup: 'ERROR', 'CORRUPT', 'ORPHAN'"}, + {'key': self.right_type(self.key_dir_duration_full, var_discovery="{#BACKUPDIR},"), + 'name': 'Pg_probackup dir {#BACKUPDIR}: duration full backup', + 'units': Plugin.UNITS.s, + 'value_type': Plugin.VALUE_TYPE.numeric_unsigned, + 'history': '31', + 'delay': self.plugin_config('interval'), + 'description': "The duration of the last full backup"}, + {'key': self.right_type(self.key_dir_duration_inc, var_discovery="{#BACKUPDIR},"), + 'name': 'Pg_probackup dir {#BACKUPDIR}: duration incremental backup', + 'units': Plugin.UNITS.s, + 'value_type': Plugin.VALUE_TYPE.numeric_unsigned, + 'history': '31', + 'delay': self.plugin_config('interval'), + 'description': "The duration of the last incremental backup"}, + {'key': self.right_type(self.key_dir_endtime_backup, var_discovery="{#BACKUPDIR},"), + 'name': 'Pg_probackup dir {#BACKUPDIR}: end time backup', + 'units': Plugin.UNITS.unixtime, + 'value_type': Plugin.VALUE_TYPE.numeric_unsigned, + 'delay': self.plugin_config('interval'), + 'description': "The end time of the last any backup"}, + {'key': self.right_type(self.key_dir_starttime_backup, var_discovery="{#BACKUPDIR},"), + 'name': 'Pg_probackup dir {#BACKUPDIR}: start time backup', + 'units': Plugin.UNITS.unixtime, + 'value_type': Plugin.VALUE_TYPE.numeric_unsigned, + 'delay': self.plugin_config('interval'), + 'description': "The start time of the last any backup"}, + {'key': self.right_type(self.key_dir_status_backup, var_discovery="{#BACKUPDIR},"), + 
'name': 'Pg_probackup dir {#BACKUPDIR}: status', + 'value_type': Plugin.VALUE_TYPE.text, + 'delay': self.plugin_config('interval'), + 'description': "Sign of the status completion of the last backup:\n\n" + "OK — the backup is complete and valid.\n" + "DONE — the backup is complete, but was not validated.\n" + "RUNNING — the backup is in progress.\n" + "MERGING — the backup is being merged.\n" + "MERGED — the backup data files were successfully merged, but its metadata is in the process of being updated. Only full backups can have this status.\n" + "DELETING — the backup files are being deleted.\n" + "CORRUPT — some of the backup files are corrupt.\n" + "ERROR — the backup was aborted because of an unexpected error.\n" + "ORPHAN — the backup is invalid because one of its parent backups is corrupt or missing.\n\n" + "https://postgrespro.ru/docs/postgrespro/current/app-pgprobackup" + }, + {'key': self.right_type(self.key_dir_mode_backup, var_discovery="{#BACKUPDIR},"), + 'name': 'Pg_probackup dir {#BACKUPDIR}: mode', + 'value_type': Plugin.VALUE_TYPE.text, + 'delay': self.plugin_config('interval'), + 'description': "Backup Creation Mode:\n\n" + "FULL — creates a full backup that contains all the data files of the cluster to be restored.\n" + "DELTA — reads all data files in the data directory and creates an incremental backup for pages that have changed since the previous backup.\n" + "PAGE — creates an incremental backup based on the WAL files that have been generated since the previous full or incremental backup was taken. 
Only changed blocks are read from data files.\n" + "PTRACK — creates an incremental backup tracking page changes on the fly.\n\n" + "https://postgrespro.ru/docs/postgrespro/current/app-pgprobackup" + }, ] graphs = [ + { + 'name': 'Pg_probackup: backup dir: {#BACKUPDIR} duration', + 'type': 0, + 'items': [ + {'color': '00897B', + 'drawtype': 2, + 'key': self.right_type(self.key_dir_duration_full, var_discovery="{#BACKUPDIR},")}, + {'color': '66BB6A', + 'drawtype': 2, + 'key': self.right_type(self.key_dir_duration_inc, var_discovery="{#BACKUPDIR},"), + 'yaxisside': 1} + ] + }, { 'name': 'Pg_probackup: backup dir: {#BACKUPDIR} size', - 'type': 1, + 'type': 0, 'items': [ - {'color': '00CC00', - 'key': self.right_type(self.key_dir_size, var_discovery="{#BACKUPDIR},")}] + {'color': 'C8E6C9', + 'drawtype': 1, + 'key': self.right_type(self.key_dir_size, var_discovery="{#BACKUPDIR},")}, + {'color': '00897B', + 'drawtype': 2, + 'key': self.right_type(self.key_dir_size, var_discovery="{#BACKUPDIR}/backups,")}, + {'color': '66BB6A', + 'drawtype': 2, + 'key': self.right_type(self.key_dir_size, var_discovery="{#BACKUPDIR}/wal,"), + 'yaxisside': 1} + ] }, ] - triggers = [{ - 'name': 'Error in pg_probackup dir ' - '{#BACKUPDIR} (hostname={HOSTNAME} value={ITEM.LASTVALUE})', - 'expression': '{#TEMPLATE:pg_probackup.dir.error[{#BACKUPDIR}].str(ok)}<>1'} + triggers = [ + {'name': 'Error in pg_probackup dir {#BACKUPDIR} (hostname={HOSTNAME} value={ITEM.LASTVALUE})', + 'expression': '{#TEMPLATE:pg_probackup.dir.error[{#BACKUPDIR}].str(ok)}<>1', + 'priority': 3, + 'description': 'Backup status: CORRUPT / ERROR / ORPHAN'}, + {'name': 'Long time no backups on {HOSTNAME} in pg_probackup dir {#BACKUPDIR}', + 'expression': '({#TEMPLATE:pg_probackup.dir.end_time_backup[{#BACKUPDIR}].now()}-{#TEMPLATE:pg_probackup.dir.end_time_backup[{#BACKUPDIR}].last()})>' + + self.plugin_config('max_time_lack_backup2alert_in_sec'), + 'priority': 2, + 'description': 'From the moment of completion of the 
backup passed more than ' + + str(int(int(self.plugin_config('max_time_lack_backup2alert_in_sec'))/3600)) + ' hours (' + + self.plugin_config('max_time_lack_backup2alert_in_sec') + ' seconds)'}, + {'name': 'Backup runs too long on {HOSTNAME} in pg_probackup dir {#BACKUPDIR} (RUNNING)', + 'expression': '{#TEMPLATE:pg_probackup.dir.status_backup[{#BACKUPDIR}].last()}="RUNNING"' + ' and ({#TEMPLATE:pg_probackup.dir.start_time_backup[{#BACKUPDIR}].now()}-{#TEMPLATE:pg_probackup.dir.start_time_backup[{#BACKUPDIR}].last()})>' + + self.plugin_config('max_time_run_backup2alert_in_sec'), + 'priority': 2, + 'description': 'From the moment of start of the backup passed more than ' + + str(int(int(self.plugin_config('max_time_run_backup2alert_in_sec'))/3600)) + ' hours (' + + self.plugin_config('max_time_run_backup2alert_in_sec') + ' seconds)'}, ] return template.discovery_rule(rule=rule, conditions=conditions, items=items, graphs=graphs, triggers=triggers) diff --git a/packaging/conf/example.conf b/packaging/conf/example.conf index b313b6c3..14fe99fc 100644 --- a/packaging/conf/example.conf +++ b/packaging/conf/example.conf @@ -183,9 +183,11 @@ interval = 60 # Trigger fires if some backup has bad status e.g. (ERROR,CORRUPT,ORPHAN). [pgprobackup] enabled = False -interval = 300 +interval = 900 backup_dirs = /backup_dir1,/backup_dir2 -pg_probackup_path = /usr/bin/pg_probackup-11 +pg_probackup_path = /usr/bin/pg_probackup-13 +max_time_run_backup2alert_in_sec = 21600 +max_time_lack_backup2alert_in_sec = 100800 # Get size of relations defined in this section # Relations - comma separated list of objects - tables and endexes (database_name.schema.relation) used to calculate relations size. pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy