feat: add basic pgo

This commit is contained in:
2023-04-06 02:21:56 +02:00
commit ebd0fcf700
128 changed files with 61656 additions and 0 deletions

View File

@@ -0,0 +1,5 @@
To deploy monitoring:
1. Verify the namespace is correct in kustomization.yaml.
2. If you are deploying in OpenShift, edit deploy*.yaml and comment out the fsGroup line under securityContext.
3. kubectl apply -k .

View File

@@ -0,0 +1,87 @@
apiVersion: v1
data:
alertmanager.yml: |
###
#
# Copyright © 2017-2023 Crunchy Data Solutions, Inc. All Rights Reserved.
#
###
# Based on upstream example file found here: https://github.com/prometheus/alertmanager/blob/master/doc/examples/simple.yml
global:
smtp_smarthost: 'localhost:25'
smtp_require_tls: false
smtp_from: 'Alertmanager <abc@yahoo.com>'
# smtp_smarthost: 'smtp.example.com:587'
# smtp_from: 'Alertmanager <abc@yahoo.com>'
# smtp_auth_username: '<username>'
# smtp_auth_password: '<password>'
# templates:
# - '/etc/alertmanager/template/*.tmpl'
inhibit_rules:
# Apply inhibition of warning if the alertname for the same system and service is already critical
- source_match:
severity: 'critical'
target_match:
severity: 'warning'
equal: ['alertname', 'job', 'service']
receivers:
- name: 'default-receiver'
email_configs:
- to: 'example@crunchydata.com'
send_resolved: true
## Examples of alternative alert receivers. See documentation for more info on how to configure these fully
#- name: 'pagerduty-dba'
# pagerduty_configs:
# - service_key: <RANDOMKEYSTUFF>
#- name: 'pagerduty-sre'
# pagerduty_configs:
# - service_key: <RANDOMKEYSTUFF>
#- name: 'dba-team'
# email_configs:
# - to: 'example-dba-team@crunchydata.com'
# send_resolved: true
#- name: 'sre-team'
# email_configs:
# - to: 'example-sre-team@crunchydata.com'
# send_resolved: true
route:
receiver: default-receiver
group_by: [severity, service, job, alertname]
group_wait: 30s
group_interval: 5m
repeat_interval: 24h
## Example routes to show how to route outgoing alerts based on the content of that alert
# routes:
# - match_re:
# service: ^(postgresql|mysql|oracle)$
# receiver: dba-team
# # sub route to send critical dba alerts to pagerduty
# routes:
# - match:
# severity: critical
# receiver: pagerduty-dba
#
# - match:
# service: system
# receiver: sre-team
# # sub route to send critical sre alerts to pagerduty
# routes:
# - match:
# severity: critical
# receiver: pagerduty-sre
kind: ConfigMap
metadata:
labels:
app.kubernetes.io/name: postgres-operator-monitoring
vendor: crunchydata
name: alertmanager-config

View File

@@ -0,0 +1,428 @@
apiVersion: v1
data:
crunchy-alert-rules-pg.yml: |
###
#
# Copyright 2017-2023 Crunchy Data Solutions, Inc. All Rights Reserved.
#
###
groups:
- name: alert-rules
rules:
########## EXPORTER RULES ##########
- alert: PGExporterScrapeError
expr: pg_exporter_last_scrape_error > 0
for: 60s
labels:
service: postgresql
severity: critical
severity_num: 300
annotations:
summary: 'Postgres Exporter running on {{ $labels.job }} (instance: {{ $labels.instance }}) is encountering scrape errors processing queries. Error count: ( {{ $value }} )'
########## SYSTEM RULES ##########
- alert: ExporterDown
expr: avg_over_time(up[5m]) < 0.5
for: 10s
labels:
service: system
severity: critical
severity_num: 300
annotations:
description: 'Metrics exporter service for {{ $labels.job }} running on {{ $labels.instance }} has been down at least 50% of the time for the last 5 minutes. Service may be flapping or down.'
summary: 'Prometheus Exporter Service Down'
########## POSTGRESQL RULES ##########
- alert: PGIsUp
expr: pg_up < 1
for: 60s
labels:
service: postgresql
severity: critical
severity_num: 300
annotations:
summary: 'postgres_exporter running on {{ $labels.job }} is unable to communicate with the configured database'
# Example to check for current version of PostgreSQL. Metric returns the version that the exporter is running on, so you can set a rule to check for the minimum version you'd like all systems to be on. Number returned is the 6 digit integer representation contained in the setting "server_version_num".
#
# - alert: PGMinimumVersion
# expr: ccp_postgresql_version_current < 110005
# for: 60s
# labels:
# service: postgresql
# severity: critical
# severity_num: 300
# annotations:
# summary: '{{ $labels.job }} is not running at least version 11.5 of PostgreSQL'
# Whether a system switches from primary to replica or vice versa must be configured per named job.
# No way to tell what value a system is supposed to be without a rule expression for that specific system
# 2 to 1 means it changed from primary to replica. 1 to 2 means it changed from replica to primary
# Set this alert for each system that you want to monitor a recovery status change
# Below is an example for a target job called "Replica" that watches for the value to rise above 1, which means it is no longer a replica
#
# - alert: PGRecoveryStatusSwitch_Replica
# expr: ccp_is_in_recovery_status{job="Replica"} > 1
# for: 60s
# labels:
# service: postgresql
# severity: critical
# severity_num: 300
# annotations:
# summary: '{{ $labels.job }} has changed from replica to primary'
# Absence alerts must be configured per named job, otherwise there's no way to know which job is down
# Below is an example for a target job called "Prod"
# - alert: PGConnectionAbsent_Prod
# expr: absent(ccp_connection_stats_max_connections{job="Prod"})
# for: 10s
# labels:
# service: postgresql
# severity: critical
# severity_num: 300
# annotations:
# description: 'Connection metric is absent from target (Prod). Check that postgres_exporter can connect to PostgreSQL.'
# Optional monitor for changes to pg_settings (postgresql.conf) system catalog.
# A similar metric is available for monitoring pg_hba.conf. See ccp_hba_settings_checksum().
# If metric returns 0, then NO settings have changed in pg_settings since last known valid state
# If metric returns 1, then pg_settings have changed since last known valid state
# To see what may have changed, check the monitor.pg_settings_checksum table for a history of config state.
# - alert: PGSettingsChecksum
# expr: ccp_pg_settings_checksum > 0
# for: 60s
# labels:
# service: postgresql
# severity: critical
# severity_num: 300
# annotations:
# description: 'Configuration settings on {{ $labels.job }} have changed from previously known valid state. To reset current config to a valid state after alert fires, run monitor.pg_settings_checksum_set_valid().'
# summary: 'PGSQL Instance settings checksum'
# Monitor for data block checksum failures. Only works in PG12+
# - alert: PGDataChecksum
# expr: ccp_data_checksum_failure > 0
# for: 60s
# labels:
# service: postgresql
# severity: critical
# severity_num: 300
# annotations:
# description: '{{ $labels.job }} has at least one data checksum failure in database {{ $labels.dbname }}. See pg_stat_database system catalog for more information.'
# summary: 'PGSQL Data Checksum failure'
- alert: PGIdleTxn
expr: ccp_connection_stats_max_idle_in_txn_time > 300
for: 60s
labels:
service: postgresql
severity: warning
severity_num: 200
annotations:
description: '{{ $labels.job }} has at least one session idle in transaction for over 5 minutes.'
summary: 'PGSQL Instance idle transactions'
- alert: PGIdleTxn
expr: ccp_connection_stats_max_idle_in_txn_time > 900
for: 60s
labels:
service: postgresql
severity: critical
severity_num: 300
annotations:
description: '{{ $labels.job }} has at least one session idle in transaction for over 15 minutes.'
summary: 'PGSQL Instance idle transactions'
- alert: PGQueryTime
expr: ccp_connection_stats_max_query_time > 43200
for: 60s
labels:
service: postgresql
severity: warning
severity_num: 200
annotations:
description: '{{ $labels.job }} has at least one query running for over 12 hours.'
summary: 'PGSQL Max Query Runtime'
- alert: PGQueryTime
expr: ccp_connection_stats_max_query_time > 86400
for: 60s
labels:
service: postgresql
severity: critical
severity_num: 300
annotations:
description: '{{ $labels.job }} has at least one query running for over 1 day.'
summary: 'PGSQL Max Query Runtime'
- alert: PGConnPerc
expr: 100 * (ccp_connection_stats_total / ccp_connection_stats_max_connections) > 75
for: 60s
labels:
service: postgresql
severity: warning
severity_num: 200
annotations:
description: '{{ $labels.job }} is using 75% or more of available connections ({{ $value }}%)'
summary: 'PGSQL Instance connections'
- alert: PGConnPerc
expr: 100 * (ccp_connection_stats_total / ccp_connection_stats_max_connections) > 90
for: 60s
labels:
service: postgresql
severity: critical
severity_num: 300
annotations:
description: '{{ $labels.job }} is using 90% or more of available connections ({{ $value }}%)'
summary: 'PGSQL Instance connections'
- alert: DiskFillPredict
expr: predict_linear(ccp_nodemx_data_disk_available_bytes{mount_point!~"tmpfs"}[1h], 24 * 3600) < 0 and 100 * ((ccp_nodemx_data_disk_total_bytes - ccp_nodemx_data_disk_available_bytes) / ccp_nodemx_data_disk_total_bytes) > 70
for: 5m
labels:
service: postgresql
severity: warning
severity_num: 200
annotations:
summary: 'Disk predicted to be full in 24 hours'
description: 'Disk on {{ $labels.pg_cluster }}:{{ $labels.kubernetes_pod_name }} is predicted to fill in 24 hrs based on current usage'
- alert: PGClusterRoleChange
expr: count by (pg_cluster) (ccp_is_in_recovery_status != ignoring(instance,ip,pod,role) (ccp_is_in_recovery_status offset 5m)) >= 1
for: 60s
labels:
service: postgresql
severity: critical
severity_num: 300
annotations:
summary: '{{ $labels.pg_cluster }} has had a switchover/failover event. Please check this cluster for more details'
- alert: PGDiskSize
expr: 100 * ((ccp_nodemx_data_disk_total_bytes - ccp_nodemx_data_disk_available_bytes) / ccp_nodemx_data_disk_total_bytes) > 75
for: 60s
labels:
service: postgresql
severity: warning
severity_num: 200
annotations:
description: 'PGSQL Instance {{ $labels.deployment }} over 75% disk usage at mount point "{{ $labels.mount_point }}": {{ $value }}%'
summary: PGSQL Instance usage warning
- alert: PGDiskSize
expr: 100 * ((ccp_nodemx_data_disk_total_bytes - ccp_nodemx_data_disk_available_bytes) / ccp_nodemx_data_disk_total_bytes) > 90
for: 60s
labels:
service: postgresql
severity: critical
severity_num: 300
annotations:
description: 'PGSQL Instance {{ $labels.deployment }} over 90% disk usage at mount point "{{ $labels.mount_point }}": {{ $value }}%'
summary: 'PGSQL Instance size critical'
- alert: PGReplicationByteLag
expr: ccp_replication_lag_size_bytes > 5.24288e+07
for: 60s
labels:
service: postgresql
severity: warning
severity_num: 200
annotations:
description: 'PGSQL Instance {{ $labels.job }} has at least one replica lagging over 50MB behind.'
summary: 'PGSQL Instance replica lag warning'
- alert: PGReplicationByteLag
expr: ccp_replication_lag_size_bytes > 1.048576e+08
for: 60s
labels:
service: postgresql
severity: critical
severity_num: 300
annotations:
description: 'PGSQL Instance {{ $labels.job }} has at least one replica lagging over 100MB behind.'
summary: 'PGSQL Instance replica lag critical'
- alert: PGReplicationSlotsInactive
expr: ccp_replication_slots_active == 0
for: 60s
labels:
service: postgresql
severity: critical
severity_num: 300
annotations:
description: 'PGSQL Instance {{ $labels.job }} has one or more inactive replication slots'
summary: 'PGSQL Instance inactive replication slot'
- alert: PGXIDWraparound
expr: ccp_transaction_wraparound_percent_towards_wraparound > 50
for: 60s
labels:
service: postgresql
severity: warning
severity_num: 200
annotations:
description: 'PGSQL Instance {{ $labels.job }} is over 50% towards transaction id wraparound.'
summary: 'PGSQL Instance {{ $labels.job }} transaction id wraparound imminent'
- alert: PGXIDWraparound
expr: ccp_transaction_wraparound_percent_towards_wraparound > 75
for: 60s
labels:
service: postgresql
severity: critical
severity_num: 300
annotations:
description: 'PGSQL Instance {{ $labels.job }} is over 75% towards transaction id wraparound.'
summary: 'PGSQL Instance transaction id wraparound imminent'
- alert: PGEmergencyVacuum
expr: ccp_transaction_wraparound_percent_towards_emergency_autovac > 110
for: 60s
labels:
service: postgresql
severity: warning
severity_num: 200
annotations:
description: 'PGSQL Instance {{ $labels.job }} is over 110% beyond autovacuum_freeze_max_age value. Autovacuum may need tuning to better keep up.'
summary: 'PGSQL Instance emergency vacuum imminent'
- alert: PGEmergencyVacuum
expr: ccp_transaction_wraparound_percent_towards_emergency_autovac > 125
for: 60s
labels:
service: postgresql
severity: critical
severity_num: 300
annotations:
description: 'PGSQL Instance {{ $labels.job }} is over 125% beyond autovacuum_freeze_max_age value. Autovacuum needs tuning to better keep up.'
summary: 'PGSQL Instance emergency vacuum imminent'
- alert: PGArchiveCommandStatus
expr: ccp_archive_command_status_seconds_since_last_fail > 300
for: 60s
labels:
service: postgresql
severity: critical
severity_num: 300
annotations:
description: 'PGSQL Instance {{ $labels.job }} has a recent failing archive command'
summary: 'Seconds since the last recorded failure of the archive_command'
- alert: PGSequenceExhaustion
expr: ccp_sequence_exhaustion_count > 0
for: 60s
labels:
service: postgresql
severity: critical
severity_num: 300
annotations:
description: 'Count of sequences on instance {{ $labels.job }} at over 75% usage: {{ $value }}. Run following query to see full sequence status: SELECT * FROM monitor.sequence_status() WHERE percent >= 75'
- alert: PGSettingsPendingRestart
expr: ccp_settings_pending_restart_count > 0
for: 60s
labels:
service: postgresql
severity: critical
severity_num: 300
annotations:
description: 'One or more settings in the pg_settings system catalog on system {{ $labels.job }} are in a pending_restart state. Check the system catalog for which settings are pending and review postgresql.conf for changes.'
########## PGBACKREST RULES ##########
#
# Uncomment and customize one or more of these rules to monitor your pgbackrest backups.
# Full backups are considered the equivalent of both differentials and incrementals since both are based on the last full
# And differentials are considered incrementals since incrementals will be based off the last diff if one exists
# This avoids false alerts, for example when you don't run diff/incr backups on the days that you run a full
# Stanza should also be set if different intervals are expected for each stanza.
# Otherwise rule will be applied to all stanzas returned on target system if not set.
#
# Relevant metric names are:
# ccp_backrest_last_full_backup_time_since_completion_seconds
# ccp_backrest_last_incr_backup_time_since_completion_seconds
# ccp_backrest_last_diff_backup_time_since_completion_seconds
#
# To avoid false positives on backup time alerts, 12 hours are added onto each threshold to allow a buffer if the backup runtime varies from day to day.
# Further adjustment may be needed depending on your backup runtimes/schedule.
#
# - alert: PGBackRestLastCompletedFull_main
# expr: ccp_backrest_last_full_backup_time_since_completion_seconds{stanza="main"} > 648000
# for: 60s
# labels:
# service: postgresql
# severity: critical
# severity_num: 300
# annotations:
# summary: 'Full backup for stanza [main] on system {{ $labels.job }} has not completed in the last week.'
#
# - alert: PGBackRestLastCompletedIncr_main
# expr: ccp_backrest_last_incr_backup_time_since_completion_seconds{stanza="main"} > 129600
# for: 60s
# labels:
# service: postgresql
# severity: critical
# severity_num: 300
# annotations:
# summary: 'Incremental backup for stanza [main] on system {{ $labels.job }} has not completed in the last 24 hours.'
#
#
# Runtime monitoring is handled with a single metric:
#
# ccp_backrest_last_info_backup_runtime_seconds
#
# Runtime monitoring should have the "backup_type" label set.
# Otherwise the rule will apply to the last run of all backup types returned (full, diff, incr)
# Stanza should also be set if runtimes per stanza have different expected times
#
# - alert: PGBackRestLastRuntimeFull_main
# expr: ccp_backrest_last_info_backup_runtime_seconds{backup_type="full", stanza="main"} > 14400
# for: 60s
# labels:
# service: postgresql
# severity: critical
# severity_num: 300
# annotations:
# summary: 'Expected runtime of full backup for stanza [main] has exceeded 4 hours'
#
# - alert: PGBackRestLastRuntimeDiff_main
# expr: ccp_backrest_last_info_backup_runtime_seconds{backup_type="diff", stanza="main"} > 3600
# for: 60s
# labels:
# service: postgresql
# severity: critical
# severity_num: 300
# annotations:
# summary: 'Expected runtime of diff backup for stanza [main] has exceeded 1 hour'
##
#
## If the pgbackrest command fails to run, the metric disappears from the exporter output and the alert never fires.
## An absence alert must be configured explicitly for each target (job) that backups are being monitored.
## Checking for absence of just the full backup type should be sufficient (no need for diff/incr).
## Note that while the backrest check command failing will likely also cause a scrape error alert, the addition of this
## check gives a clearer answer as to what is causing it and that something is wrong with the backups.
#
# - alert: PGBackrestAbsentFull_Prod
# expr: absent(ccp_backrest_last_full_backup_time_since_completion_seconds{job="Prod"})
# for: 10s
# labels:
# service: postgresql
# severity: critical
# severity_num: 300
# annotations:
# description: 'Backup Full status missing for Prod. Check that pgbackrest info command is working on target system.'
kind: ConfigMap
metadata:
labels:
app.kubernetes.io/name: postgres-operator-monitoring
vendor: crunchydata
name: alertmanager-rules-config

View File

@@ -0,0 +1,16 @@
###
#
# Copyright © 2017-2023 Crunchy Data Solutions, Inc. All Rights Reserved.
#
###
apiVersion: 1
providers:
- name: 'crunchy_dashboards'
orgId: 1
folder: ''
type: file
disableDeletion: false
updateIntervalSeconds: 3 #how often Grafana will scan for changed dashboards
options:
path: /etc/grafana/provisioning/dashboards

View File

@@ -0,0 +1,331 @@
{
"__inputs": [
{
"name": "DS_PROMETHEUS",
"label": "PROMETHEUS",
"description": "",
"type": "datasource",
"pluginId": "prometheus",
"pluginName": "Prometheus"
}
],
"__requires": [
{
"type": "grafana",
"id": "grafana",
"name": "Grafana",
"version": "6.7.4"
},
{
"type": "panel",
"id": "graph",
"name": "Graph",
"version": ""
},
{
"type": "datasource",
"id": "prometheus",
"name": "Prometheus",
"version": "1.0.0"
}
],
"annotations": {
"list": [
{
"$$hashKey": "object:111",
"builtIn": 1,
"datasource": "-- Grafana --",
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"type": "dashboard"
}
]
},
"editable": false,
"gnetId": null,
"graphTooltip": 0,
"id": null,
"iteration": 1596817489973,
"links": [
{
"icon": "external link",
"includeVars": true,
"keepTime": true,
"tags": [],
"type": "dashboards"
}
],
"panels": [
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "PROMETHEUS",
"fill": 1,
"fillGradient": 0,
"gridPos": {
"h": 12,
"w": 24,
"x": 0,
"y": 0
},
"height": "480",
"hiddenSeries": false,
"id": 1,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": true,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"links": [],
"maxPerRow": 2,
"nullPointMode": "null",
"options": {
"dataLinks": []
},
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"repeat": null,
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "sum(rate(ccp_stat_user_tables_n_tup_ins{pg_cluster=\"[[cluster]]\", pod=~\"[[pod]]\", dbname=~\"[[dbname]]\", schemaname=~\"[[schemaname]]\", relname=~\"[[tablename]]\"}[60s]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "inserts - [[dbname]].[[schemaname]].[[tablename]]",
"refId": "A",
"step": 60
},
{
"expr": "sum(rate(ccp_stat_user_tables_n_tup_upd{pg_cluster=\"[[cluster]]\", pod=~\"[[pod]]\", dbname=~\"[[dbname]]\", schemaname=~\"[[schemaname]]\", relname=~\"[[tablename]]\"}[60s]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "Updates - [[dbname]].[[schemaname]].[[tablename]]",
"refId": "B",
"step": 60
},
{
"expr": "sum(rate(ccp_stat_user_tables_n_tup_del{pg_cluster=\"[[cluster]]\", pod=~\"[[pod]]\", dbname=~\"[[dbname]]\", schemaname=~\"[[schemaname]]\", relname=~\"[[tablename]]\"}[60s]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "Deletes - [[dbname]].[[schemaname]].[[tablename]]",
"refId": "C",
"step": 60
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "CRUD",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
}
],
"refresh": "30s",
"schemaVersion": 22,
"style": "dark",
"tags": [],
"templating": {
"list": [
{
"allValue": null,
"current": {},
"datasource": "PROMETHEUS",
"definition": "",
"hide": 0,
"includeAll": false,
"index": -1,
"label": null,
"multi": false,
"name": "cluster",
"options": [],
"query": "label_values(pg_cluster)",
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"sort": 1,
"tagValuesQuery": "",
"tags": [],
"tagsQuery": "",
"type": "query",
"useTags": false
},
{
"allValue": ".*",
"current": {},
"datasource": "PROMETHEUS",
"definition": "label_values({pg_cluster=\"[[cluster]]\"},pod)",
"hide": 0,
"includeAll": true,
"index": -1,
"label": "pod",
"multi": true,
"name": "pod",
"options": [],
"query": "label_values({pg_cluster=\"[[cluster]]\"},pod)",
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"sort": 1,
"tagValuesQuery": "",
"tags": [],
"tagsQuery": "",
"type": "query",
"useTags": false
},
{
"allValue": ".*",
"current": {},
"datasource": "PROMETHEUS",
"definition": "label_values(ccp_database_size_bytes{pg_cluster=\"[[cluster]]\"},dbname)",
"hide": 0,
"includeAll": true,
"index": -1,
"label": "dbname",
"multi": true,
"name": "dbname",
"options": [],
"query": "label_values(ccp_database_size_bytes{pg_cluster=\"[[cluster]]\"},dbname)",
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"sort": 1,
"tagValuesQuery": "",
"tags": [],
"tagsQuery": "",
"type": "query",
"useTags": false
},
{
"allValue": ".*",
"current": {},
"datasource": "PROMETHEUS",
"definition": "",
"hide": 0,
"includeAll": true,
"index": -1,
"label": "schemaname",
"multi": true,
"name": "schemaname",
"options": [],
"query": "label_values(ccp_stat_user_tables_n_tup_ins{pg_cluster=\"[[cluster]]\",dbname=~\"[[dbname]]\"},schemaname)",
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"sort": 1,
"tagValuesQuery": "",
"tags": [],
"tagsQuery": "",
"type": "query",
"useTags": false
},
{
"allValue": ".*",
"current": {},
"datasource": "PROMETHEUS",
"definition": "",
"hide": 0,
"includeAll": true,
"index": -1,
"label": null,
"multi": true,
"name": "tablename",
"options": [],
"query": "label_values(ccp_stat_user_tables_n_tup_ins{pg_cluster=\"[[cluster]]\",dbname=~\"[[dbname]]\",schemaname=~\"[[schemaname]]\"},relname)",
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"sort": 1,
"tagValuesQuery": "",
"tags": [],
"tagsQuery": "",
"type": "query",
"useTags": false
}
]
},
"time": {
"from": "now-5m",
"to": "now"
},
"timepicker": {
"refresh_intervals": [
"5s",
"10s",
"30s",
"1m",
"5m",
"15m",
"30m",
"1h",
"2h",
"1d"
],
"time_options": [
"5m",
"15m",
"1h",
"6h",
"12h",
"24h",
"2d",
"7d",
"30d"
]
},
"timezone": "browser",
"title": "CRUD_Details",
"uid": "cruddetails",
"variables": {
"list": []
},
"version": 2
}

View File

@@ -0,0 +1,14 @@
kind: Kustomization
apiVersion: kustomize.config.k8s.io/v1beta1
configMapGenerator:
- name: grafana-dashboards
files:
- pgbackrest.json
- pod_details.json
- postgres_overview.json
- postgresql_details.json
- postgresql_service_health.json
- prometheus_alerts.json
- query_statistics.json
generatorOptions:
disableNameSuffixHash: true

View File

@@ -0,0 +1,687 @@
{
"__inputs": [
{
"name": "DS_PROMETHEUS",
"label": "PROMETHEUS",
"description": "",
"type": "datasource",
"pluginId": "prometheus",
"pluginName": "Prometheus"
}
],
"__requires": [
{
"type": "grafana",
"id": "grafana",
"name": "Grafana",
"version": "7.4.5"
},
{
"type": "panel",
"id": "graph",
"name": "Graph",
"version": ""
},
{
"type": "datasource",
"id": "prometheus",
"name": "Prometheus",
"version": "1.0.0"
},
{
"type": "panel",
"id": "stat",
"name": "Stat",
"version": ""
}
],
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": "-- Grafana --",
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"type": "dashboard"
}
]
},
"editable": false,
"gnetId": null,
"graphTooltip": 0,
"id": null,
"iteration": 1625069660860,
"links": [
{
"asDropdown": false,
"icon": "external link",
"includeVars": true,
"keepTime": true,
"tags": [
"vendor=crunchydata"
],
"title": "",
"type": "dashboards"
}
],
"panels": [
{
"datasource": "PROMETHEUS",
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"custom": {},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "semi-dark-blue",
"value": null
}
]
},
"unit": "dtdhms"
},
"overrides": []
},
"gridPos": {
"h": 3,
"w": 24,
"x": 0,
"y": 0
},
"id": 8,
"options": {
"colorMode": "background",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"reduceOptions": {
"calcs": [
"last"
],
"fields": "/^Value$/",
"values": false
},
"text": {
"valueSize": 45
},
"textMode": "auto"
},
"pluginVersion": "7.4.5",
"targets": [
{
"expr": "time()-ccp_backrest_oldest_full_backup_time_seconds{pg_cluster=\"[[cluster]]\", role=\"master\"}",
"format": "table",
"instant": true,
"interval": "",
"legendFormat": "Recovery window",
"refId": "A"
}
],
"title": "Recovery Window",
"type": "stat"
},
{
"aliasColors": {
"Differential": "dark-blue",
"Differential Backup": "dark-blue",
"Full": "dark-green",
"Full Backup": "dark-green",
"Incremental": "light-blue",
"Incremental Backup": "light-blue"
},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "PROMETHEUS",
"fieldConfig": {
"defaults": {
"custom": {},
"links": []
},
"overrides": []
},
"fill": 1,
"fillGradient": 0,
"gridPos": {
"h": 7,
"w": 12,
"x": 0,
"y": 3
},
"hiddenSeries": false,
"id": 2,
"legend": {
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"sideWidth": 150,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null",
"options": {
"alertThreshold": false
},
"percentage": false,
"pluginVersion": "7.4.5",
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "min(ccp_backrest_last_incr_backup_time_since_completion_seconds{pg_cluster=\"[[cluster]]\", role=\"master\"}) without(deployment,instance,ip,pod)",
"format": "time_series",
"instant": false,
"interval": "",
"intervalFactor": 1,
"legendFormat": "Incremental Backup",
"refId": "A"
},
{
"expr": "min(ccp_backrest_last_diff_backup_time_since_completion_seconds{pg_cluster=\"[[cluster]]\", role=\"master\"}) without(deployment, instance,ip,pod)",
"hide": false,
"interval": "",
"legendFormat": "Differential Backup",
"refId": "B"
},
{
"expr": "min(ccp_backrest_last_full_backup_time_since_completion_seconds{pg_cluster=\"[[cluster]]\", role=\"master\"}) without(deployment, instance,ip,pod)",
"hide": false,
"interval": "",
"legendFormat": "Full Backup",
"refId": "C"
},
{
"expr": "min(ccp_archive_command_status_seconds_since_last_archive{pg_cluster=\"[[cluster]]\", role=\"master\"}) without(deployment, instance,ip,pod)",
"hide": false,
"interval": "",
"legendFormat": "WAL Archive",
"refId": "D"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "Time Since",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "s",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": false
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"aliasColors": {
"Differential": "dark-blue",
"Full": "dark-green",
"Incremental": "light-blue"
},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "PROMETHEUS",
"fieldConfig": {
"defaults": {
"custom": {},
"links": []
},
"overrides": []
},
"fill": 1,
"fillGradient": 0,
"gridPos": {
"h": 7,
"w": 12,
"x": 12,
"y": 3
},
"hiddenSeries": false,
"id": 4,
"legend": {
"alignAsTable": true,
"avg": false,
"current": false,
"hideEmpty": false,
"hideZero": false,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"sideWidth": 150,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null",
"options": {
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "7.4.5",
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "min(ccp_backrest_last_info_backup_runtime_seconds{pg_cluster=\"[[cluster]]\", role=\"master\", backup_type=\"incr\"}) without (deployment,instance,pod,ip)",
"format": "time_series",
"instant": false,
"interval": "",
"intervalFactor": 1,
"legendFormat": "Incremental",
"refId": "A"
},
{
"expr": "min(ccp_backrest_last_info_backup_runtime_seconds{pg_cluster=\"[[cluster]]\", role=\"master\", backup_type=\"diff\"}) without (deployment,instance,pod,ip)",
"hide": false,
"interval": "",
"legendFormat": "Differential",
"refId": "B"
},
{
"expr": "min(ccp_backrest_last_info_backup_runtime_seconds{pg_cluster=\"[[cluster]]\", role=\"master\", backup_type=\"full\"}) without (deployment,instance,pod,ip)",
"hide": false,
"interval": "",
"legendFormat": "Full",
"refId": "C"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "Backup Runtimes",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "s",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 2,
"max": null,
"min": null,
"show": false
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"aliasColors": {
"Differential": "dark-blue",
"Full": "dark-green",
"Incremental": "light-blue"
},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "PROMETHEUS",
"description": "",
"fieldConfig": {
"defaults": {
"custom": {},
"links": []
},
"overrides": []
},
"fill": 1,
"fillGradient": 0,
"gridPos": {
"h": 7,
"w": 12,
"x": 0,
"y": 10
},
"hiddenSeries": false,
"id": 5,
"legend": {
"alignAsTable": true,
"avg": false,
"current": false,
"hideEmpty": false,
"hideZero": false,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"sideWidth": 150,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null",
"options": {
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "7.4.5",
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "min(ccp_backrest_last_info_repo_backup_size_bytes{pg_cluster=\"[[cluster]]\", role=\"master\", backup_type=\"incr\"}) without (deployment, instance,pod,ip)",
"format": "time_series",
"instant": false,
"interval": "",
"intervalFactor": 1,
"legendFormat": "Incremental",
"refId": "A"
},
{
"expr": "min(ccp_backrest_last_info_repo_backup_size_bytes{pg_cluster=\"[[cluster]]\", role=\"master\", backup_type=\"diff\"}) without (deployment,instance,pod,ip)",
"hide": false,
"interval": "",
"legendFormat": "Differential",
"refId": "B"
},
{
"expr": "min(ccp_backrest_last_info_repo_backup_size_bytes{pg_cluster=\"[[cluster]]\", role=\"master\", backup_type=\"full\"}) without (deployment,instance,pod,ip)",
"hide": false,
"interval": "",
"legendFormat": "Full",
"refId": "C"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "Backup Size",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "bytes",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 2,
"max": null,
"min": null,
"show": false
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"aliasColors": {
"Archive age": "blue",
"Archive count": "green",
"Differential": "dark-blue",
"Failed count": "red",
"Full": "dark-green",
"Incremental": "light-blue"
},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "PROMETHEUS",
"description": "",
"fieldConfig": {
"defaults": {
"custom": {},
"links": []
},
"overrides": []
},
"fill": 3,
"fillGradient": 0,
"gridPos": {
"h": 7,
"w": 12,
"x": 12,
"y": 10
},
"hiddenSeries": false,
"id": 6,
"legend": {
"alignAsTable": true,
"avg": false,
"current": false,
"hideEmpty": false,
"hideZero": false,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"sideWidth": 150,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null",
"options": {
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "7.4.5",
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "avg(idelta(ccp_archive_command_status_failed_count{pg_cluster=\"[[cluster]]\", role=\"master\"}[1m])) without (instance,ip)",
"format": "time_series",
"instant": false,
"interval": "",
"intervalFactor": 1,
"legendFormat": "Failed count",
"refId": "A"
},
{
"expr": "avg(idelta(ccp_archive_command_status_archived_count{pg_cluster=\"[[cluster]]\", role=\"master\"}[1m])) without (instance,pod, ip)",
"hide": false,
"interval": "",
"legendFormat": "Archive count",
"refId": "B"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "WAL Stats",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "short",
"label": "",
"logBase": 1,
"max": null,
"min": "0",
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": "0",
"show": false
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
}
],
"refresh": "5m",
"schemaVersion": 27,
"style": "dark",
"tags": [
"vendor=crunchydata"
],
"templating": {
"list": [
{
"allValue": null,
"current": {},
"datasource": "PROMETHEUS",
"definition": "label_values(pg_cluster)",
"description": null,
"error": null,
"hide": 0,
"includeAll": false,
"label": "cluster",
"multi": false,
"name": "cluster",
"options": [],
"query": {
"query": "label_values(pg_cluster)",
"refId": "PROMETHEUS-cluster-Variable-Query"
},
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"sort": 1,
"tagValuesQuery": "",
"tags": [],
"tagsQuery": "",
"type": "query",
"useTags": false
}
]
},
"time": {
"from": "now-2w",
"to": "now"
},
"timepicker": {
"time_options": [
"5m",
"15m",
"1h",
"6h",
"12h",
"24h",
"2d",
"7d",
"30d"
]
},
"timezone": "browser",
"title": "pgBackRest",
"uid": "2fcFZ6PGk",
"version": 1
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,237 @@
{
"__inputs": [
{
"name": "DS_PROMETHEUS",
"label": "PROMETHEUS",
"description": "",
"type": "datasource",
"pluginId": "prometheus",
"pluginName": "Prometheus"
}
],
"__requires": [
{
"type": "grafana",
"id": "grafana",
"name": "Grafana",
"version": "7.4.5"
},
{
"type": "datasource",
"id": "prometheus",
"name": "Prometheus",
"version": "1.0.0"
},
{
"type": "panel",
"id": "stat",
"name": "Stat",
"version": ""
}
],
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": "-- Grafana --",
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"type": "dashboard"
}
]
},
"editable": false,
"gnetId": null,
"graphTooltip": 0,
"id": null,
"iteration": 1625069480601,
"links": [],
"panels": [
{
"cacheTimeout": null,
"datasource": "PROMETHEUS",
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"custom": {},
"links": [
{
"targetBlank": true,
"title": "Cluster Details",
"url": "dashboard/db/postgresqldetails?$__all_variables"
},
{
"targetBlank": true,
"title": "Backup Details",
"url": "dashboard/db/pgbackrest?$__all_variables"
},
{
"targetBlank": true,
"title": "POD Details",
"url": "dashboard/db/pod-details?$__all_variables"
},
{
"targetBlank": true,
"title": "Query Statistics",
"url": "dashboard/db/query-statistics?$__all_variables"
},
{
"targetBlank": true,
"title": "Service Health",
"url": "dashboard/db/postgresql-service-health?$__all_variables"
}
],
"mappings": [
{
"from": "0",
"id": 0,
"text": "DOWN",
"to": "99",
"type": 2
},
{
"from": "100",
"id": 1,
"text": "Standalone Cluster",
"to": "199",
"type": 2
},
{
"from": "200",
"id": 2,
"text": "HA CLUSTER",
"to": "1000",
"type": 2
}
],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#bf1b00",
"value": null
},
{
"color": "#eab839",
"value": 10
},
{
"color": "#56A64B",
"value": 100
}
]
},
"unit": "short"
},
"overrides": []
},
"gridPos": {
"h": 2,
"w": 12,
"x": 0,
"y": 0
},
"id": 1,
"interval": null,
"links": [],
"maxDataPoints": 100,
"maxPerRow": 2,
"options": {
"colorMode": "background",
"graphMode": "none",
"justifyMode": "auto",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"lastNotNull"
],
"fields": "",
"values": false
},
"text": {
"valueSize": 30
},
"textMode": "auto"
},
"pluginVersion": "7.4.5",
"repeat": "cluster",
"repeatDirection": "h",
"targets": [
{
"$hashKey": "object:243",
"expr": "sum(pg_up{pg_cluster=~\"$cluster\"})*100+sum(ccp_is_in_recovery_status{pg_cluster=~\"$cluster\"})",
"format": "time_series",
"interval": "",
"intervalFactor": 1,
"legendFormat": "{{cluster}}",
"metric": "up",
"refId": "A",
"step": 2
}
],
"title": "$cluster - Overview",
"type": "stat"
}
],
"refresh": "5m",
"schemaVersion": 27,
"style": "dark",
"tags": [],
"templating": {
"list": [
{
"allFormat": "glob",
"allValue": null,
"current": {},
"datasource": "PROMETHEUS",
"definition": "label_values(pg_cluster)",
"description": null,
"error": null,
"hide": 1,
"includeAll": true,
"label": "cluster",
"multi": true,
"name": "cluster",
"options": [],
"query": {
"query": "label_values(pg_cluster)",
"refId": "PROMETHEUS-cluster-Variable-Query"
},
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"sort": 0,
"tagValuesQuery": "",
"tags": [],
"tagsQuery": "",
"type": "query",
"useTags": false
}
]
},
"time": {
"from": "now-5m",
"to": "now"
},
"timepicker": {
"time_options": [
"5m",
"15m",
"1h",
"6h",
"12h",
"24h",
"2d",
"7d",
"30d"
]
},
"timezone": "browser",
"title": "PostgreSQL Overview",
"uid": "D2X39SlGk",
"version": 1
}

View File

@@ -0,0 +1,649 @@
{
"__inputs": [
{
"name": "DS_PROMETHEUS",
"label": "PROMETHEUS",
"description": "",
"type": "datasource",
"pluginId": "prometheus",
"pluginName": "Prometheus"
}
],
"__requires": [
{
"type": "grafana",
"id": "grafana",
"name": "Grafana",
"version": "7.4.5"
},
{
"type": "panel",
"id": "graph",
"name": "Graph",
"version": ""
},
{
"type": "datasource",
"id": "prometheus",
"name": "Prometheus",
"version": "1.0.0"
}
],
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": "-- Grafana --",
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"type": "dashboard"
}
]
},
"editable": false,
"gnetId": null,
"graphTooltip": 0,
"id": null,
"iteration": 1625069909806,
"links": [
{
"asDropdown": false,
"icon": "external link",
"includeVars": true,
"keepTime": true,
"tags": [
"vendor=crunchydata"
],
"title": "",
"type": "dashboards"
}
],
"panels": [
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "PROMETHEUS",
"fieldConfig": {
"defaults": {
"custom": {},
"links": []
},
"overrides": []
},
"fill": 1,
"fillGradient": 5,
"gridPos": {
"h": 7,
"w": 12,
"x": 0,
"y": 0
},
"hiddenSeries": false,
"id": 6,
"legend": {
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"sideWidth": 150,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null",
"options": {
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "7.4.5",
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "sum(ccp_connection_stats_total{pg_cluster=\"[[cluster]]\",role=\"[[role]]\"}) without (pod,instance,ip) / sum(ccp_connection_stats_max_connections{pg_cluster=\"[[cluster]]\",role=\"[[role]]\"}) without (pod,instance,ip)",
"format": "time_series",
"instant": false,
"interval": "",
"intervalFactor": 1,
"legendFormat": "Connections",
"refId": "C"
},
{
"expr": "100 - 100 * avg(ccp_nodemx_data_disk_available_bytes{pg_cluster=\"[[cluster]]\",role=\"[[role]]\"}) without (pod,instance,ip) / avg(ccp_nodemx_data_disk_total_bytes{pg_cluster=\"[[cluster]]\",role=\"[[role]]\"}) without (pod,instance,ip)",
"format": "time_series",
"interval": "",
"intervalFactor": 1,
"legendFormat": "Mount:{{mount_point}}",
"refId": "A"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "Saturation (pct used)",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"decimals": null,
"format": "percent",
"label": null,
"logBase": 1,
"max": "100",
"min": "0",
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": false
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"aliasColors": {},
"bars": false,
"cacheTimeout": null,
"dashLength": 10,
"dashes": false,
"datasource": "PROMETHEUS",
"fieldConfig": {
"defaults": {
"custom": {},
"links": []
},
"overrides": []
},
"fill": 1,
"fillGradient": 5,
"gridPos": {
"h": 7,
"w": 12,
"x": 12,
"y": 0
},
"hiddenSeries": false,
"id": 18,
"legend": {
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"sideWidth": 150,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null",
"options": {
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "7.4.5",
"pointradius": 2,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"exemplar": false,
"expr": " sum(irate(ccp_stat_database_xact_commit{pg_cluster=\"[[cluster]]\",role=\"[[role]]\"}[1m])) \n+ sum(irate(ccp_stat_database_xact_rollback{pg_cluster=\"[[cluster]]\",role=\"[[role]]\"}[1m]))",
"format": "time_series",
"interval": "",
"intervalFactor": 1,
"legendFormat": "Transactions",
"refId": "A"
},
{
"expr": "max(ccp_connection_stats_active{pg_cluster=\"[[cluster]]\",role=\"[[role]]\"}) without (pod,instance,ip,dbname)",
"format": "time_series",
"interval": "",
"intervalFactor": 1,
"legendFormat": "Active connections",
"refId": "C"
},
{
"expr": "sum(irate(ccp_pg_stat_statements_total_calls_count{pg_cluster=\"[[cluster]]\",role=\"[[role]]\"}[1m]))",
"format": "time_series",
"hide": false,
"interval": "",
"intervalFactor": 1,
"legendFormat": "Queries",
"refId": "B"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "Traffic",
"tooltip": {
"shared": true,
"sort": 2,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"format": "short",
"label": "",
"logBase": 1,
"max": null,
"min": "0.001",
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": false
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "PROMETHEUS",
"description": "Errors",
"fieldConfig": {
"defaults": {
"custom": {},
"links": []
},
"overrides": []
},
"fill": 1,
"fillGradient": 5,
"gridPos": {
"h": 7,
"w": 12,
"x": 0,
"y": 7
},
"hiddenSeries": false,
"id": 4,
"legend": {
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"sideWidth": 150,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null",
"options": {
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "7.4.5",
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "sum(irate(ccp_stat_database_xact_rollback{pg_cluster=\"[[cluster]]\",role=\"[[role]]\"}[1m])) without(pod,instance,ip)",
"format": "time_series",
"hide": true,
"interval": "",
"intervalFactor": 1,
"legendFormat": "Rollbacks",
"refId": "A"
},
{
"expr": "sum(irate(ccp_stat_database_deadlocks{pg_cluster=\"[[cluster]]\",role=\"[[role]]\"}[1m])) without(pod,instance,ip,dbname)",
"format": "time_series",
"hide": false,
"interval": "",
"intervalFactor": 1,
"legendFormat": "Deadlocks",
"refId": "D"
},
{
"expr": "sum(irate(ccp_stat_database_conflicts{pg_cluster=\"[[cluster]]\",role=\"[[role]]\"}[1m])) without(pod,instance,ip,dbname)",
"format": "time_series",
"hide": false,
"interval": "",
"intervalFactor": 1,
"legendFormat": "Conflicts",
"refId": "B"
},
{
"expr": "max(pg_exporter_last_scrape_error{pg_cluster=\"[[cluster]]\",role=\"[[role]]\"}) without(pod,instance,ip,dbname)",
"format": "time_series",
"hide": false,
"interval": "",
"intervalFactor": 1,
"legendFormat": "scrape error",
"refId": "C"
},
{
"expr": "max(clamp_max(ccp_archive_command_status_seconds_since_last_fail{pg_cluster=\"[[cluster]]\",role=\"[[role]]\"},1)) without (instance,pod,ip)",
"format": "time_series",
"hide": false,
"interval": "",
"intervalFactor": 1,
"legendFormat": "archive error",
"refId": "E"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "Errors",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"decimals": null,
"format": "short",
"label": "",
"logBase": 2,
"max": null,
"min": null,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": false
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "PROMETHEUS",
"fieldConfig": {
"defaults": {
"custom": {},
"links": []
},
"overrides": []
},
"fill": 1,
"fillGradient": 1,
"gridPos": {
"h": 7,
"w": 12,
"x": 12,
"y": 7
},
"hiddenSeries": false,
"id": 10,
"legend": {
"alignAsTable": true,
"avg": false,
"current": false,
"max": false,
"min": false,
"rightSide": true,
"show": true,
"sideWidth": 150,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null",
"options": {
"alertThreshold": true
},
"percentage": false,
"pluginVersion": "7.4.5",
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [
{
"alias": "/Max:/",
"color": "#E02F44",
"nullPointMode": "null as zero"
},
{
"alias": "/Avg:/",
"color": "#8AB8FF"
}
],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "max(ccp_pg_stat_statements_total_mean_exec_time_ms{pg_cluster=\"[[cluster]]\",role=\"[[role]]\"}) without (pod,instance,ip)",
"format": "time_series",
"hide": false,
"instant": false,
"interval": "",
"intervalFactor": 1,
"legendFormat": "Avg: {{exported_role}}({{dbname}})",
"refId": "A"
},
{
"expr": "max(ccp_pg_stat_statements_top_max_exec_time_ms{pg_cluster=\"[[cluster]]\",role=\"[[role]]\"}) without (pod,instance,ip,query,queryid)",
"format": "time_series",
"hide": false,
"instant": false,
"interval": "",
"intervalFactor": 1,
"legendFormat": "Max: {{exported_role}}({{dbname}})",
"refId": "B"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "Query Duration",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"decimals": null,
"format": "ms",
"label": null,
"logBase": 2,
"max": null,
"min": "0",
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": false
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
}
],
"refresh": "5m",
"schemaVersion": 27,
"style": "dark",
"tags": [
"vendor=crunchydata"
],
"templating": {
"list": [
{
"allValue": null,
"current": {},
"datasource": "PROMETHEUS",
"definition": "label_values(pg_cluster)",
"description": null,
"error": null,
"hide": 0,
"includeAll": false,
"label": null,
"multi": false,
"name": "cluster",
"options": [],
"query": {
"query": "label_values(pg_cluster)",
"refId": "PROMETHEUS-cluster-Variable-Query"
},
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"sort": 0,
"tagValuesQuery": "",
"tags": [],
"tagsQuery": "",
"type": "query",
"useTags": false
},
{
"allValue": null,
"current": {},
"datasource": "PROMETHEUS",
"definition": "label_values({pg_cluster=\"[[cluster]]\"},role)",
"description": null,
"error": null,
"hide": 0,
"includeAll": false,
"label": null,
"multi": false,
"name": "role",
"options": [],
"query": {
"query": "label_values({pg_cluster=\"[[cluster]]\"},role)",
"refId": "PROMETHEUS-role-Variable-Query"
},
"refresh": 1,
"regex": "",
"skipUrlSync": false,
"sort": 0,
"tagValuesQuery": "",
"tags": [],
"tagsQuery": "",
"type": "query",
"useTags": false
}
]
},
"time": {
"from": "now-1h",
"to": "now"
},
"timepicker": {
"time_options": [
"5m",
"15m",
"1h",
"6h",
"12h",
"24h",
"2d",
"7d",
"30d"
]
},
"timezone": "browser",
"title": "PostgreSQL Service Health",
"uid": "dhG1wgsMz",
"version": 1
}

View File

@@ -0,0 +1,961 @@
{
"__inputs": [
{
"name": "DS_PROMETHEUS",
"label": "PROMETHEUS",
"description": "",
"type": "datasource",
"pluginId": "prometheus",
"pluginName": "Prometheus"
}
],
"__requires": [
{
"type": "grafana",
"id": "grafana",
"name": "Grafana",
"version": "7.4.5"
},
{
"type": "datasource",
"id": "prometheus",
"name": "Prometheus",
"version": "1.0.0"
},
{
"type": "panel",
"id": "stat",
"name": "Stat",
"version": ""
},
{
"type": "panel",
"id": "table",
"name": "Table",
"version": ""
}
],
"annotations": {
"list": [
{
"builtIn": 1,
"datasource": "-- Grafana --",
"enable": true,
"hide": true,
"iconColor": "rgba(0, 211, 255, 1)",
"name": "Annotations & Alerts",
"type": "dashboard"
}
]
},
"description": "Show current firing and pending alerts, and severity alert counts.",
"editable": false,
"gnetId": 4181,
"graphTooltip": 0,
"id": null,
"links": [
{
"icon": "external link",
"tags": [
"vendor=crunchydata"
],
"type": "dashboards"
}
],
"panels": [
{
"collapsed": false,
"datasource": "PROMETHEUS",
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 0
},
"id": 10,
"panels": [],
"repeat": null,
"title": "Environment Summary",
"type": "row"
},
{
"cacheTimeout": null,
"datasource": "PROMETHEUS",
"description": "",
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"custom": {},
"mappings": [
{
"id": 0,
"op": "=",
"text": "N/A",
"type": 1,
"value": "null"
}
],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "semi-dark-blue",
"value": null
}
]
},
"unit": "none"
},
"overrides": []
},
"gridPos": {
"h": 2,
"w": 4,
"x": 0,
"y": 1
},
"id": 6,
"interval": null,
"links": [],
"maxDataPoints": 100,
"options": {
"colorMode": "background",
"graphMode": "none",
"justifyMode": "auto",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [],
"fields": "",
"values": false
},
"text": {},
"textMode": "auto"
},
"pluginVersion": "7.4.5",
"targets": [
{
"expr": "count(count by (kubernetes_namespace) (pg_up))",
"format": "time_series",
"instant": true,
"interval": "",
"intervalFactor": 2,
"legendFormat": "Namespaces",
"refId": "A"
}
],
"title": "Namespaces",
"type": "stat"
},
{
"cacheTimeout": null,
"datasource": "PROMETHEUS",
"description": "",
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"custom": {},
"mappings": [
{
"id": 0,
"op": "=",
"text": "N/A",
"type": 1,
"value": "null"
}
],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "semi-dark-blue",
"value": null
}
]
},
"unit": "none"
},
"overrides": []
},
"gridPos": {
"h": 2,
"w": 4,
"x": 4,
"y": 1
},
"id": 13,
"interval": null,
"links": [],
"maxDataPoints": 100,
"options": {
"colorMode": "background",
"graphMode": "none",
"justifyMode": "auto",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"mean"
],
"fields": "",
"values": false
},
"text": {},
"textMode": "auto"
},
"pluginVersion": "7.4.5",
"targets": [
{
"expr": "count(count by (pg_cluster) (pg_up))",
"format": "time_series",
"instant": true,
"interval": "",
"intervalFactor": 2,
"legendFormat": "PostgreSQL Clusters",
"refId": "A"
}
],
"title": "PG Clusters",
"type": "stat"
},
{
"cacheTimeout": null,
"datasource": "PROMETHEUS",
"description": "",
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"custom": {},
"mappings": [
{
"id": 0,
"op": "=",
"text": "N/A",
"type": 1,
"value": "null"
}
],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "semi-dark-blue",
"value": null
}
]
},
"unit": "none"
},
"overrides": []
},
"gridPos": {
"h": 2,
"w": 4,
"x": 8,
"y": 1
},
"id": 14,
"interval": null,
"links": [],
"maxDataPoints": 100,
"options": {
"colorMode": "background",
"graphMode": "none",
"justifyMode": "auto",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"mean"
],
"fields": "",
"values": false
},
"text": {},
"textMode": "auto"
},
"pluginVersion": "7.4.5",
"targets": [
{
"expr": "count(pg_up)",
"format": "time_series",
"instant": true,
"interval": "",
"intervalFactor": 2,
"legendFormat": "PostgreSQL Instances",
"refId": "A"
}
],
"title": "PG Instances",
"type": "stat"
},
{
"collapsed": false,
"datasource": "PROMETHEUS",
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 3
},
"id": 11,
"panels": [],
"repeat": null,
"title": "Alert Summary",
"type": "row"
},
{
"cacheTimeout": null,
"datasource": "PROMETHEUS",
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"custom": {},
"mappings": [
{
"id": 0,
"op": "=",
"text": "N/A",
"type": 1,
"value": "null"
}
],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "semi-dark-red",
"value": null
},
{
"color": "#F2495C",
"value": 1
},
{
"color": "#F2495C"
}
]
},
"unit": "none"
},
"overrides": []
},
"gridPos": {
"h": 2,
"w": 4,
"x": 0,
"y": 4
},
"id": 2,
"interval": null,
"links": [],
"maxDataPoints": 100,
"options": {
"colorMode": "background",
"graphMode": "none",
"justifyMode": "auto",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"mean"
],
"fields": "",
"values": false
},
"text": {},
"textMode": "auto"
},
"pluginVersion": "7.4.5",
"targets": [
{
"bucketAggs": [
{
"id": "2",
"settings": {
"interval": "auto",
"min_doc_count": 0,
"trimEdges": 0
},
"type": "date_histogram"
}
],
"dsType": "elasticsearch",
"expr": "sum(ALERTS{alertstate=\"firing\",severity=\"critical\"} > 0) OR on() vector(0)",
"format": "time_series",
"instant": true,
"interval": "",
"intervalFactor": 1,
"legendFormat": "Critical",
"metrics": [
{
"field": "select field",
"id": "1",
"type": "count"
}
],
"refId": "A"
}
],
"title": "Critical",
"type": "stat"
},
{
"cacheTimeout": null,
"datasource": "PROMETHEUS",
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"custom": {},
"mappings": [
{
"id": 0,
"op": "=",
"text": "N/A",
"type": 1,
"value": "null"
}
],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "semi-dark-orange",
"value": null
}
]
},
"unit": "none"
},
"overrides": []
},
"gridPos": {
"h": 2,
"w": 4,
"x": 4,
"y": 4
},
"id": 5,
"interval": null,
"links": [],
"maxDataPoints": 100,
"options": {
"colorMode": "background",
"graphMode": "none",
"justifyMode": "auto",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [],
"fields": "",
"values": false
},
"text": {},
"textMode": "auto"
},
"pluginVersion": "7.4.5",
"targets": [
{
"expr": "sum(ALERTS{alertstate=\"firing\",severity=\"warning\"} > 0) OR on() vector(0)",
"format": "time_series",
"instant": true,
"interval": "",
"intervalFactor": 2,
"legendFormat": "",
"refId": "A"
}
],
"title": "Warning",
"type": "stat"
},
{
"cacheTimeout": null,
"datasource": "PROMETHEUS",
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"custom": {},
"mappings": [
{
"id": 0,
"op": "=",
"text": "N/A",
"type": 1,
"value": "null"
}
],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "#299c46",
"value": null
}
]
},
"unit": "none"
},
"overrides": []
},
"gridPos": {
"h": 2,
"w": 4,
"x": 8,
"y": 4
},
"id": 9,
"interval": null,
"links": [],
"maxDataPoints": 100,
"options": {
"colorMode": "background",
"graphMode": "none",
"justifyMode": "auto",
"orientation": "horizontal",
"reduceOptions": {
"calcs": [
"mean"
],
"fields": "",
"values": false
},
"text": {},
"textMode": "auto"
},
"pluginVersion": "7.4.5",
"targets": [
{
"expr": "sum(ALERTS{alertstate=\"firing\",severity=\"info\"} > 0) OR on() vector(0)",
"format": "time_series",
"interval": "",
"intervalFactor": 2,
"legendFormat": "",
"refId": "A"
}
],
"title": "Info",
"type": "stat"
},
{
"collapsed": false,
"datasource": "PROMETHEUS",
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 6
},
"id": 12,
"panels": [],
"repeat": null,
"title": "Alerts",
"type": "row"
},
{
"datasource": "PROMETHEUS",
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"custom": {
"align": null,
"displayMode": "auto",
"filterable": true
},
"decimals": 2,
"displayName": "",
"mappings": [
{
"from": "",
"id": 1,
"text": "",
"to": "",
"type": 1,
"value": ""
}
],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "blue",
"value": 100
},
{
"color": "#EAB839",
"value": 200
},
{
"color": "red",
"value": 300
}
]
},
"unit": "short"
},
"overrides": [
{
"matcher": {
"id": "byName",
"options": "severity_num"
},
"properties": [
{
"id": "custom.displayMode",
"value": "color-background"
},
{
"id": "custom.width",
"value": 124
}
]
},
{
"matcher": {
"id": "byName",
"options": "Time"
},
"properties": [
{
"id": "custom.width",
"value": 170
}
]
},
{
"matcher": {
"id": "byName",
"options": "severity"
},
"properties": [
{
"id": "custom.width",
"value": 119
}
]
},
{
"matcher": {
"id": "byName",
"options": "alertname"
},
"properties": [
{
"id": "custom.width",
"value": 206
}
]
},
{
"matcher": {
"id": "byName",
"options": "alertstate"
},
"properties": [
{
"id": "custom.width",
"value": 128
}
]
}
]
},
"gridPos": {
"h": 5,
"w": 24,
"x": 0,
"y": 7
},
"id": 1,
"links": [],
"options": {
"showHeader": true,
"sortBy": []
},
"pluginVersion": "7.4.5",
"targets": [
{
"expr": "ALERTS{alertstate='firing'} > 0",
"format": "table",
"instant": true,
"interval": "2s",
"intervalFactor": 1,
"legendFormat": "",
"refId": "A"
}
],
"title": "Firing",
"transformations": [
{
"id": "merge",
"options": {
"reducers": []
}
},
{
"id": "organize",
"options": {
"excludeByName": {
"Value": true,
"__name__": true,
"alertstate": false,
"deployment": false,
"exp_type": true,
"fs_type": true,
"instance": true,
"job": true,
"kubernetes_namespace": true,
"mount_point": true,
"server": true,
"service": true,
"severity_num": false
},
"indexByName": {
"Time": 0,
"Value": 16,
"__name__": 3,
"alertname": 4,
"alertstate": 5,
"deployment": 7,
"exp_type": 9,
"instance": 10,
"ip": 11,
"job": 12,
"kubernetes_namespace": 13,
"pg_cluster": 6,
"pod": 8,
"role": 14,
"service": 15,
"severity": 2,
"severity_num": 1
},
"renameByName": {
"Time": "",
"__name__": "",
"severity": "",
"severity_num": ""
}
}
}
],
"type": "table"
},
{
"datasource": "PROMETHEUS",
"fieldConfig": {
"defaults": {
"color": {
"mode": "thresholds"
},
"custom": {
"align": null,
"filterable": true
},
"decimals": 2,
"displayName": "",
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
},
"unit": "short"
},
"overrides": [
{
"matcher": {
"id": "byRegexp",
"options": "/(instance|__name__|Time|alertstate|job|type|Value)/"
},
"properties": [
{
"id": "unit",
"value": "short"
},
{
"id": "decimals",
"value": 2
},
{
"id": "custom.align",
"value": null
}
]
},
{
"matcher": {
"id": "byName",
"options": "Time"
},
"properties": [
{
"id": "custom.width",
"value": null
}
]
},
{
"matcher": {
"id": "byName",
"options": "severity_num"
},
"properties": [
{
"id": "custom.width",
"value": 126
}
]
},
{
"matcher": {
"id": "byName",
"options": "severity"
},
"properties": [
{
"id": "custom.width",
"value": 115
}
]
},
{
"matcher": {
"id": "byName",
"options": "alertname"
},
"properties": [
{
"id": "custom.width",
"value": 207
}
]
},
{
"matcher": {
"id": "byName",
"options": "alertstate"
},
"properties": [
{
"id": "custom.width",
"value": 131
}
]
}
]
},
"gridPos": {
"h": 7,
"w": 24,
"x": 0,
"y": 12
},
"id": 3,
"links": [],
"options": {
"showHeader": true,
"sortBy": []
},
"pluginVersion": "7.4.5",
"targets": [
{
"expr": "ALERTS{alertstate=\"pending\"}",
"format": "table",
"instant": false,
"interval": "",
"intervalFactor": 1,
"legendFormat": "",
"refId": "A"
}
],
"title": "Pending",
"transformations": [
{
"id": "organize",
"options": {
"excludeByName": {
"Value": true,
"__name__": true,
"exp_type": true,
"instance": true,
"job": true,
"kubernetes_namespace": true,
"service": true
},
"indexByName": {
"Time": 0,
"Value": 16,
"__name__": 3,
"alertname": 4,
"alertstate": 5,
"deployment": 7,
"exp_type": 8,
"instance": 9,
"ip": 11,
"job": 12,
"kubernetes_namespace": 13,
"pg_cluster": 6,
"pod": 10,
"role": 14,
"service": 15,
"severity": 2,
"severity_num": 1
},
"renameByName": {}
}
}
],
"type": "table"
}
],
"refresh": "15m",
"schemaVersion": 27,
"style": "dark",
"tags": [
"vendor=crunchydata"
],
"templating": {
"list": []
},
"time": {
"from": "now-1h",
"to": "now"
},
"timepicker": {
"time_options": [
"5m",
"15m",
"1h",
"6h",
"12h",
"24h",
"2d",
"7d",
"30d"
]
},
"timezone": "browser",
"title": "Prometheus Alerts",
"uid": "lwxXsZsMk",
"version": 1
}

View File

@@ -0,0 +1,83 @@
# Deployment for the Alertmanager instance of the postgres-operator monitoring
# stack. Runs a single replica that keeps its state on the `alertmanagerdata`
# PVC and reads its configuration from the `alertmanager-config` ConfigMap.
apiVersion: apps/v1
kind: Deployment
metadata:
  # The `deployment.kubernetes.io/revision` annotation is deliberately not
  # declared: it is maintained by the Deployment controller, not by manifests.
  labels:
    app.kubernetes.io/name: postgres-operator-monitoring
  name: crunchy-alertmanager
spec:
  progressDeadlineSeconds: 600
  replicas: 1
  revisionHistoryLimit: 10
  selector:
    # Must stay in sync with spec.template.metadata.labels below.
    matchLabels:
      app.kubernetes.io/name: postgres-operator-monitoring
      name: crunchy-alertmanager
  strategy:
    rollingUpdate:
      maxSurge: 25%
      maxUnavailable: 25%
    type: RollingUpdate
  template:
    metadata:
      labels:
        app.kubernetes.io/name: postgres-operator-monitoring
        name: crunchy-alertmanager
    spec:
      containers:
        - args:
            # Paths below correspond to the two volumeMounts of this container.
            - --config.file=/etc/alertmanager/alertmanager.yml
            - --storage.path=/alertmanager
            - --log.level=info
            - --cluster.advertise-address=0.0.0.0:9093
          image: prom/alertmanager:v0.22.2
          imagePullPolicy: IfNotPresent
          livenessProbe:
            failureThreshold: 3
            httpGet:
              path: /-/healthy
              port: 9093
              scheme: HTTP
            initialDelaySeconds: 25
            periodSeconds: 20
            successThreshold: 1
            timeoutSeconds: 1
          name: alertmanager
          ports:
            - containerPort: 9093
              protocol: TCP
          readinessProbe:
            failureThreshold: 3
            httpGet:
              path: /-/ready
              port: 9093
              scheme: HTTP
            periodSeconds: 10
            successThreshold: 1
            timeoutSeconds: 1
          terminationMessagePath: /dev/termination-log
          terminationMessagePolicy: File
          volumeMounts:
            - mountPath: /etc/alertmanager
              name: alertmanagerconf
            - mountPath: /alertmanager
              name: alertmanagerdata
      dnsPolicy: ClusterFirst
      restartPolicy: Always
      securityContext:
        # On OpenShift, comment out the fsGroup line (see the README shipped
        # alongside these manifests).
        fsGroup: 26
        # supplementalGroups:
        #   - 65534
      schedulerName: default-scheduler
      # Only serviceAccountName is set; `serviceAccount` is a deprecated alias
      # of the same field.
      serviceAccountName: alertmanager
      terminationGracePeriodSeconds: 30
      volumes:
        - name: alertmanagerdata
          persistentVolumeClaim:
            claimName: alertmanagerdata
        - configMap:
            defaultMode: 420  # decimal form of octal 0644
            name: alertmanager-config
          name: alertmanagerconf

View File

@@ -0,0 +1,102 @@
# Deployment for the Grafana instance of the postgres-operator monitoring
# stack. Runs a single replica with its data directory on the `grafanadata`
# PVC; admin credentials, datasources and dashboards are mounted from the
# `grafana-secret` Secret and the `grafana-datasources` / `grafana-dashboards`
# ConfigMaps.
apiVersion: apps/v1
kind: Deployment
metadata:
  # The `deployment.kubernetes.io/revision` annotation is deliberately not
  # declared: it is maintained by the Deployment controller, not by manifests.
  labels:
    app.kubernetes.io/name: postgres-operator-monitoring
  name: crunchy-grafana
spec:
  progressDeadlineSeconds: 600
  replicas: 1
  revisionHistoryLimit: 10
  selector:
    # Must stay in sync with spec.template.metadata.labels below.
    matchLabels:
      app.kubernetes.io/name: postgres-operator-monitoring
      name: crunchy-grafana
  strategy:
    rollingUpdate:
      maxSurge: 25%
      maxUnavailable: 25%
    type: RollingUpdate
  template:
    metadata:
      labels:
        app.kubernetes.io/name: postgres-operator-monitoring
        name: crunchy-grafana
    spec:
      containers:
        - env:
            # Lives under the /data mount backed by the grafanadata PVC.
            - name: GF_PATHS_DATA
              value: /data/grafana/data
            # The two *__FILE variables point at the files mounted from the
            # grafana-secret volume at /conf/admin.
            - name: GF_SECURITY_ADMIN_USER__FILE
              value: /conf/admin/username
            - name: GF_SECURITY_ADMIN_PASSWORD__FILE
              value: /conf/admin/password
            # Referenced as $PROM_HOST / $PROM_PORT by the datasource URL in
            # the grafana-datasources ConfigMap.
            - name: PROM_HOST
              value: crunchy-prometheus
            - name: PROM_PORT
              value: "9090"
          image: grafana/grafana:8.5.10
          imagePullPolicy: IfNotPresent
          livenessProbe:
            failureThreshold: 3
            httpGet:
              path: /api/health
              port: 3000
              scheme: HTTP
            initialDelaySeconds: 25
            periodSeconds: 20
            successThreshold: 1
            timeoutSeconds: 1
          name: grafana
          ports:
            - containerPort: 3000
              protocol: TCP
          readinessProbe:
            failureThreshold: 3
            httpGet:
              path: /api/health
              port: 3000
              scheme: HTTP
            periodSeconds: 10
            successThreshold: 1
            timeoutSeconds: 1
          terminationMessagePath: /dev/termination-log
          terminationMessagePolicy: File
          volumeMounts:
            - mountPath: /data
              name: grafanadata
            - mountPath: /conf/admin
              name: grafana-secret
            - mountPath: /etc/grafana/provisioning/datasources
              name: grafana-datasources
            - mountPath: /etc/grafana/provisioning/dashboards
              name: grafana-dashboards
      dnsPolicy: ClusterFirst
      restartPolicy: Always
      securityContext:
        # On OpenShift, comment out the fsGroup line (see the README shipped
        # alongside these manifests).
        fsGroup: 26
        # supplementalGroups:
        #   - 65534
      schedulerName: default-scheduler
      # Only serviceAccountName is set; `serviceAccount` is a deprecated alias
      # of the same field.
      serviceAccountName: grafana
      terminationGracePeriodSeconds: 30
      volumes:
        - name: grafanadata
          persistentVolumeClaim:
            claimName: grafanadata
        - name: grafana-secret
          secret:
            defaultMode: 420  # decimal form of octal 0644
            secretName: grafana-secret
        - configMap:
            defaultMode: 420  # decimal form of octal 0644
            name: grafana-datasources
          name: grafana-datasources
        - configMap:
            defaultMode: 420  # decimal form of octal 0644
            name: grafana-dashboards
          name: grafana-dashboards

View File

@@ -0,0 +1,86 @@
# Deployment for the Prometheus instance of the postgres-operator monitoring
# stack. Runs a single replica that stores its TSDB on the `prometheusdata`
# PVC and mounts its configuration and alert rules from the
# `crunchy-prometheus` and `alertmanager-rules-config` ConfigMaps.
apiVersion: apps/v1
kind: Deployment
metadata:
  # The `deployment.kubernetes.io/revision` annotation is deliberately not
  # declared: it is maintained by the Deployment controller, not by manifests.
  labels:
    app.kubernetes.io/name: postgres-operator-monitoring
  name: crunchy-prometheus
spec:
  progressDeadlineSeconds: 600
  replicas: 1
  revisionHistoryLimit: 10
  selector:
    # Must stay in sync with spec.template.metadata.labels below.
    matchLabels:
      app.kubernetes.io/name: postgres-operator-monitoring
      name: crunchy-prometheus
  strategy:
    rollingUpdate:
      maxSurge: 25%
      maxUnavailable: 25%
    type: RollingUpdate
  template:
    metadata:
      # `creationTimestamp: null` (export residue) is intentionally omitted;
      # the field is populated by the API server.
      labels:
        app.kubernetes.io/name: postgres-operator-monitoring
        name: crunchy-prometheus
    spec:
      containers:
        - image: prom/prometheus:v2.33.5
          imagePullPolicy: IfNotPresent
          livenessProbe:
            failureThreshold: 3
            httpGet:
              path: /-/healthy
              port: 9090
              scheme: HTTP
            initialDelaySeconds: 15
            periodSeconds: 20
            successThreshold: 1
            timeoutSeconds: 1
          name: prometheus
          ports:
            - containerPort: 9090
              protocol: TCP
          readinessProbe:
            failureThreshold: 3
            httpGet:
              path: /-/ready
              port: 9090
              scheme: HTTP
            periodSeconds: 10
            successThreshold: 1
            timeoutSeconds: 1
          terminationMessagePath: /dev/termination-log
          terminationMessagePolicy: File
          volumeMounts:
            - mountPath: /etc/prometheus
              name: prometheusconf
            - mountPath: /prometheus
              name: prometheusdata
            # Nested below the /etc/prometheus config mount.
            - mountPath: /etc/prometheus/alert-rules.d
              name: alertmanagerrules
      dnsPolicy: ClusterFirst
      securityContext:
        # On OpenShift, comment out the fsGroup line (see the README shipped
        # alongside these manifests).
        fsGroup: 26
        # supplementalGroups:
        #   - 65534
      restartPolicy: Always
      schedulerName: default-scheduler
      # Only serviceAccountName is set; `serviceAccount` is a deprecated alias
      # of the same field.
      serviceAccountName: prometheus-sa
      terminationGracePeriodSeconds: 30
      volumes:
        - configMap:
            defaultMode: 420  # decimal form of octal 0644
            name: crunchy-prometheus
          name: prometheusconf
        - name: prometheusdata
          persistentVolumeClaim:
            claimName: prometheusdata
        - configMap:
            defaultMode: 420  # decimal form of octal 0644
            name: alertmanager-rules-config
          name: alertmanagerrules

View File

@@ -0,0 +1,50 @@
# ConfigMap "grafana-datasources": carries one key,
# crunchy_grafana_datasource.yml, holding a datasource provisioning file
# (its own "apiVersion: 1" is the config file version, not a Kubernetes one).
#
# The embedded file declares a single datasource:
#   * name PROMETHEUS, type prometheus, access mode proxy, orgId 1;
#   * url http://$PROM_HOST:$PROM_PORT;
#   * password, user, database, basicAuth* and withCredentials left empty;
#   * isDefault true, version 1, editable false.
#
# NOTE(review): $PROM_HOST and $PROM_PORT are presumably expanded from
# environment variables of the consuming container - confirm against the
# Grafana Deployment, which is not part of this manifest.
#
# Everything between the "|" line and "kind: ConfigMap" is the literal file
# content; comments in there are part of the data, not of this manifest.
apiVersion: v1
data:
crunchy_grafana_datasource.yml: |
###
#
# Copyright 2017-2023 Crunchy Data Solutions, Inc. All Rights Reserved.
#
###
# config file version
apiVersion: 1
# list of datasources to insert/update depending
# what's available in the database
datasources:
# <string, required> name of the datasource. Required
- name: PROMETHEUS
# <string, required> datasource type. Required
type: prometheus
# <string, required> access mode. proxy or direct (Server or Browser in the UI). Required
access: proxy
# <int> org id. will default to orgId 1 if not specified
orgId: 1
# <string> url
url: http://$PROM_HOST:$PROM_PORT
# <string> database password, if used
password:
# <string> database user, if used
user:
# <string> database name, if used
database:
# <bool> enable/disable basic auth
basicAuth:
# <string> basic auth username
basicAuthUser:
# <string> basic auth password
basicAuthPassword:
# <bool> enable/disable with credentials headers
withCredentials:
# <bool> mark as default datasource. Max one per org
isDefault: true
version: 1
# <bool> allow users to edit datasources from the UI.
editable: false
kind: ConfigMap
metadata:
labels:
app.kubernetes.io/name: postgres-operator-monitoring
vendor: crunchydata
name: grafana-datasources

View File

@@ -0,0 +1,12 @@
---
# Opaque Secret "grafana-secret" with two base64-encoded entries, `username`
# and `password`. Both values decode to the literal string "admin".
#
# NOTE(review): these look like placeholder credentials committed to version
# control - confirm they are overridden before any non-test deployment.
apiVersion: v1
kind: Secret
type: Opaque
metadata:
  name: grafana-secret
  labels:
    vendor: crunchydata
    app.kubernetes.io/name: postgres-operator-monitoring
data:
  username: YWRtaW4=
  password: YWRtaW4=

View File

@@ -0,0 +1,30 @@
---
# Kustomize entry point for the monitoring stack (`kubectl apply -k .`).
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization

# Namespace set on the resources listed below; verify it before deploying.
namespace: postgres-operator

resources:
  # Storage and dashboards
  - pvcs.yaml
  - ./dashboards
  # Configuration files
  - prometheus-config.yaml
  - alertmanager-config.yaml
  - alertmanager-rules-config.yaml
  - grafana-datasources.yaml
  # Secrets
  - grafana-secret.yaml
  # RBAC
  - rbac-sa.yaml
  - rbac-cr.yaml
  - rbac-crb.yaml
  # Deployments
  - deploy-alertmanager.yaml
  - deploy-grafana.yaml
  - deploy-prometheus.yaml
  # Services
  - service.yaml

# Generated ConfigMaps keep their plain names (no content-hash suffix).
generatorOptions:
  disableNameSuffixHash: true

# Adds crunchy_grafana_dashboards.yml to a grafana-dashboards ConfigMap that
# already exists among the resources.
# NOTE(review): the base ConfigMap presumably comes from ./dashboards -
# confirm, since `behavior: merge` needs one to merge into.
configMapGenerator:
  - name: grafana-dashboards
    behavior: merge
    files:
      - crunchy_grafana_dashboards.yml

View File

@@ -0,0 +1,80 @@
# ConfigMap "crunchy-prometheus": carries the Prometheus server configuration
# under a single key, prometheus.yml. The crunchy-prometheus Deployment mounts
# this ConfigMap at /etc/prometheus.
#
# What the embedded configuration declares:
#   * global: scrape interval 15s, scrape timeout 15s, rule evaluation
#     every 5s.
#   * one scrape job, crunchy-postgres-exporter, discovering targets through
#     kubernetes_sd_configs with role "pod".
#   * relabel_configs, applied in the order written:
#       1. keep only targets where the two exporter pod labels, joined with
#          an empty separator, match "true";
#       2. drop targets whose container port number is 5432, 10000, 8009,
#          2022, or empty;
#       3. copy the namespace into kubernetes_namespace, the pod name into
#          pod and the pod IP into ip;
#       4. build cluster, deployment and role by joining a
#          postgres_operator_crunchydata_com_* pod label with a second pod
#          label using an empty separator (presumably only one of each pair
#          is set on a given pod - confirm);
#       5. build pg_cluster from the namespace and the cluster label joined
#          with ":".
#   * rule_files: every *.yml under /etc/prometheus/alert-rules.d.
#   * alerting: one Alertmanager reached over http at
#     crunchy-alertmanager:9093.
#
# Everything between the "|+" line and "kind: ConfigMap" is the literal file
# content; do not add manifest comments inside it.
apiVersion: v1
data:
prometheus.yml: |+
###
#
# Copyright 2017-2023 Crunchy Data Solutions, Inc. All Rights Reserved.
#
###
---
global:
scrape_interval: 15s
scrape_timeout: 15s
evaluation_interval: 5s
scrape_configs:
- job_name: 'crunchy-postgres-exporter'
kubernetes_sd_configs:
- role: pod
relabel_configs:
- source_labels: [__meta_kubernetes_pod_label_postgres_operator_crunchydata_com_crunchy_postgres_exporter,__meta_kubernetes_pod_label_crunchy_postgres_exporter]
action: keep
regex: true
separator: ""
- source_labels: [__meta_kubernetes_pod_container_port_number]
action: drop
regex: 5432
- source_labels: [__meta_kubernetes_pod_container_port_number]
action: drop
regex: 10000
- source_labels: [__meta_kubernetes_pod_container_port_number]
action: drop
regex: 8009
- source_labels: [__meta_kubernetes_pod_container_port_number]
action: drop
regex: 2022
- source_labels: [__meta_kubernetes_pod_container_port_number]
action: drop
regex: ^$
- source_labels: [__meta_kubernetes_namespace]
action: replace
target_label: kubernetes_namespace
- source_labels: [__meta_kubernetes_pod_name]
target_label: pod
- source_labels: [__meta_kubernetes_pod_label_postgres_operator_crunchydata_com_cluster,__meta_kubernetes_pod_label_pg_cluster]
target_label: cluster
separator: ""
replacement: '$1'
- source_labels: [__meta_kubernetes_namespace,cluster]
target_label: pg_cluster
separator: ":"
replacement: '$1$2'
- source_labels: [__meta_kubernetes_pod_ip]
target_label: ip
replacement: '$1'
- source_labels: [__meta_kubernetes_pod_label_postgres_operator_crunchydata_com_instance,__meta_kubernetes_pod_label_deployment_name]
target_label: deployment
replacement: '$1'
separator: ""
- source_labels: [__meta_kubernetes_pod_label_postgres_operator_crunchydata_com_role,__meta_kubernetes_pod_label_role]
target_label: role
replacement: '$1'
separator: ""
rule_files:
- /etc/prometheus/alert-rules.d/*.yml
alerting:
alertmanagers:
- scheme: http
static_configs:
- targets:
- "crunchy-alertmanager:9093"
kind: ConfigMap
metadata:
labels:
app.kubernetes.io/name: postgres-operator-monitoring
vendor: crunchydata
name: crunchy-prometheus

View File

@@ -0,0 +1,41 @@
---
# Persistent storage for the monitoring stack: one 5Gi ReadWriteOnce claim
# each for Alertmanager, Grafana and Prometheus. No storageClassName is set.
#
# NOTE(review): these claims are labelled app.kubernetes.io/name:
# pgo-monitoring, while the other monitoring manifests use
# postgres-operator-monitoring - confirm whether the difference is intended.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: alertmanagerdata
  labels:
    vendor: crunchydata
    app.kubernetes.io/name: pgo-monitoring
spec:
  resources:
    requests:
      storage: 5Gi
  accessModes:
    - ReadWriteOnce
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: grafanadata
  labels:
    vendor: crunchydata
    app.kubernetes.io/name: pgo-monitoring
spec:
  resources:
    requests:
      storage: 5Gi
  accessModes:
    - ReadWriteOnce
---
# Claimed by the crunchy-prometheus Deployment (volume "prometheusdata").
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: prometheusdata
  labels:
    vendor: crunchydata
    app.kubernetes.io/name: pgo-monitoring
spec:
  resources:
    requests:
      storage: 5Gi
  accessModes:
    - ReadWriteOnce

View File

@@ -0,0 +1,16 @@
---
# Cluster-wide, read-only access to pods (get, list, watch) in the core API
# group. The Prometheus scrape job discovers its targets with role "pod".
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: prometheus-cr
  labels:
    vendor: crunchydata
    app.kubernetes.io/name: postgres-operator-monitoring
rules:
  - apiGroups: [""]
    resources: [pods]
    verbs: [get, list, watch]

View File

@@ -0,0 +1,13 @@
---
# Grants ClusterRole prometheus-cr to ServiceAccount prometheus-sa.
#
# NOTE(review): the subject carries no namespace. Presumably Kustomize fills
# it in from the kustomization's `namespace:` at build time - confirm before
# applying this file on its own with `kubectl apply -f`.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: prometheus-crb
  labels:
    vendor: crunchydata
subjects:
  - kind: ServiceAccount
    name: prometheus-sa
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: prometheus-cr

View File

@@ -0,0 +1,24 @@
---
# Service accounts for the three monitoring components. prometheus-sa is the
# identity used by the crunchy-prometheus Deployment and is the subject of
# the prometheus-crb ClusterRoleBinding.
apiVersion: v1
kind: ServiceAccount
metadata:
  name: prometheus-sa
  labels:
    vendor: crunchydata
---
apiVersion: v1
kind: ServiceAccount
metadata:
  name: alertmanager
  labels:
    vendor: crunchydata
---
apiVersion: v1
kind: ServiceAccount
metadata:
  name: grafana
  labels:
    vendor: crunchydata

View File

@@ -0,0 +1,51 @@
---
# ClusterIP Services for the monitoring stack. Each one selects pods on the
# `name` label and exposes a single named port:
#   crunchy-alertmanager -> 9093
#   crunchy-grafana      -> 3000
#   crunchy-prometheus   -> 9090
#
# The Prometheus configuration sends alerts to "crunchy-alertmanager:9093",
# so that Service's name and port must stay in step with it.
apiVersion: v1
kind: Service
metadata:
  name: crunchy-alertmanager
  labels:
    name: crunchy-alertmanager
    vendor: crunchydata
    app.kubernetes.io/name: postgres-operator-monitoring
spec:
  type: ClusterIP
  selector:
    name: crunchy-alertmanager
  ports:
    - name: alertmanager
      port: 9093
---
apiVersion: v1
kind: Service
metadata:
  name: crunchy-grafana
  labels:
    name: crunchy-grafana
    vendor: crunchydata
    app.kubernetes.io/name: postgres-operator-monitoring
spec:
  type: ClusterIP
  selector:
    name: crunchy-grafana
  ports:
    - name: grafana
      port: 3000
---
apiVersion: v1
kind: Service
metadata:
  name: crunchy-prometheus
  labels:
    name: crunchy-prometheus
    vendor: crunchydata
    app.kubernetes.io/name: postgres-operator-monitoring
spec:
  type: ClusterIP
  selector:
    name: crunchy-prometheus
  ports:
    - name: prometheus
      port: 9090