diff --git a/documentation/sql_exporter.yml b/documentation/sql_exporter.yml
index a22bb9e7..ff56e068 100644
--- a/documentation/sql_exporter.yml
+++ b/documentation/sql_exporter.yml
@@ -35,7 +35,7 @@ target:
   name: mssql_database
   # Data source name always has a URI schema that matches the driver name. In some cases (e.g. MySQL)
   # the schema gets dropped or replaced to match the driver expected DSN format.
-  data_source_name: 'sqlserver://prom_user:prom_password@dbserver1.example.com:1433'
+  data_source_name: 'sqlserver://prom_user:prom_password@dbserver1.example.com:1433?database=master'
 
   # Collectors (referenced by name) to execute on the target.
   collectors: [mssql_standard]
diff --git a/kubedb/mssql_standard.collector.yml b/kubedb/mssql_standard.collector.yml
new file mode 100644
index 00000000..3e687438
--- /dev/null
+++ b/kubedb/mssql_standard.collector.yml
@@ -0,0 +1,212 @@
+# A collector defining standard metrics for Microsoft SQL Server.
+#
+# It is required that the SQL Server user has the following permissions:
+#
+#   GRANT VIEW ANY DEFINITION TO <username>
+#   GRANT VIEW SERVER STATE TO <username>
+#
+collector_name: mssql_standard
+
+# Similar to global.min_interval, but applies to the queries defined by this collector only.
+#min_interval: 0s
+
+metrics:
+  - metric_name: mssql_local_time_seconds
+    type: gauge
+    help: 'Local time in seconds since epoch (Unix time).'
+    values: [unix_time]
+    query: |
+      SELECT DATEDIFF(second, '19700101', GETUTCDATE()) AS unix_time
+
+  - metric_name: mssql_connections
+    type: gauge
+    help: 'Number of active connections.'
+    key_labels:
+      - db
+    values: [count]
+    query: |
+      SELECT DB_NAME(sp.dbid) AS db, COUNT(sp.spid) AS count
+      FROM sys.sysprocesses sp
+      GROUP BY DB_NAME(sp.dbid)
+
+  #
+  # Collected from sys.dm_os_performance_counters
+  #
+  - metric_name: mssql_deadlocks
+    type: counter
+    help: 'Number of lock requests that resulted in a deadlock.'
+    values: [cntr_value]
+    query: |
+      SELECT cntr_value
+      FROM sys.dm_os_performance_counters WITH (NOLOCK)
+      WHERE counter_name = 'Number of Deadlocks/sec' AND instance_name = '_Total'
+
+  - metric_name: mssql_user_errors
+    type: counter
+    help: 'Number of user errors.'
+    values: [cntr_value]
+    query: |
+      SELECT cntr_value
+      FROM sys.dm_os_performance_counters WITH (NOLOCK)
+      WHERE counter_name = 'Errors/sec' AND instance_name = 'User Errors'
+
+  - metric_name: mssql_kill_connection_errors
+    type: counter
+    help: 'Number of severe errors that caused SQL Server to kill the connection.'
+    values: [cntr_value]
+    query: |
+      SELECT cntr_value
+      FROM sys.dm_os_performance_counters WITH (NOLOCK)
+      WHERE counter_name = 'Errors/sec' AND instance_name = 'Kill Connection Errors'
+
+  # NOTE(review): TOP(1) has no ORDER BY, so if several rows share this counter_name the row
+  # returned is arbitrary -- consider filtering on object_name as well.
+  - metric_name: mssql_page_life_expectancy_seconds
+    type: gauge
+    help: 'The minimum number of seconds a page will stay in the buffer pool on this node without references.'
+    values: [cntr_value]
+    query: |
+      SELECT top(1) cntr_value
+      FROM sys.dm_os_performance_counters WITH (NOLOCK)
+      WHERE counter_name = 'Page life expectancy'
+
+  - metric_name: mssql_batch_requests
+    type: counter
+    help: 'Number of command batches received.'
+    values: [cntr_value]
+    query: |
+      SELECT cntr_value
+      FROM sys.dm_os_performance_counters WITH (NOLOCK)
+      WHERE counter_name = 'Batch Requests/sec'
+
+  - metric_name: mssql_log_growths
+    type: counter
+    help: 'Number of times the transaction log has been expanded, per database.'
+    key_labels:
+      - db
+    values: [cntr_value]
+    query: |
+      SELECT rtrim(instance_name) AS db, cntr_value
+      FROM sys.dm_os_performance_counters WITH (NOLOCK)
+      WHERE counter_name = 'Log Growths' AND instance_name <> '_Total'
+
+  # NOTE(review): the raw cntr_value is exported as-is; confirm whether it must be divided by the
+  # 'Buffer cache hit ratio base' counter to obtain an actual ratio.
+  - metric_name: mssql_buffer_cache_hit_ratio
+    type: gauge
+    help: 'Ratio of requests that hit the buffer cache'
+    values: [cntr_value]
+    query: |
+      SELECT cntr_value
+      FROM sys.dm_os_performance_counters
+      WHERE [counter_name] = 'Buffer cache hit ratio'
+
+  - metric_name: mssql_checkpoint_pages_sec
+    type: gauge
+    help: 'Checkpoint Pages Per Second'
+    values: [cntr_value]
+    query: |
+      SELECT cntr_value
+      FROM sys.dm_os_performance_counters
+      WHERE [counter_name] = 'Checkpoint pages/sec'
+
+  #
+  # Collected from sys.dm_io_virtual_file_stats
+  #
+  - metric_name: mssql_io_stall_seconds
+    type: counter
+    help: 'Stall time in seconds per database and I/O operation.'
+    key_labels:
+      - db
+    value_label: operation
+    values:
+      - read
+      - write
+    query_ref: mssql_io_stall
+  - metric_name: mssql_io_stall_total_seconds
+    type: counter
+    help: 'Total stall time in seconds per database.'
+    key_labels:
+      - db
+    values:
+      - io_stall
+    query_ref: mssql_io_stall
+
+  #
+  # Collected from sys.dm_os_process_memory
+  #
+  - metric_name: mssql_resident_memory_bytes
+    type: gauge
+    help: 'SQL Server resident memory size (AKA working set).'
+    values: [resident_memory_bytes]
+    query_ref: mssql_process_memory
+
+  - metric_name: mssql_virtual_memory_bytes
+    type: gauge
+    help: 'SQL Server committed virtual memory size.'
+    values: [virtual_memory_bytes]
+    query_ref: mssql_process_memory
+
+  - metric_name: mssql_memory_utilization_percentage
+    type: gauge
+    help: 'The percentage of committed memory that is in the working set.'
+    values: [memory_utilization_percentage]
+    query_ref: mssql_process_memory
+
+  - metric_name: mssql_page_fault_count
+    type: counter
+    help: 'The number of page faults that were incurred by the SQL Server process.'
+    values: [page_fault_count]
+    query_ref: mssql_process_memory
+
+  #
+  # Collected from sys.dm_os_sys_memory
+  #
+  - metric_name: mssql_os_memory
+    type: gauge
+    help: 'OS physical memory, used and available.'
+    value_label: 'state'
+    values: [used, available]
+    query: |
+      SELECT
+        (total_physical_memory_kb - available_physical_memory_kb) * 1024 AS used,
+        available_physical_memory_kb * 1024 AS available
+      FROM sys.dm_os_sys_memory
+
+  - metric_name: mssql_os_page_file
+    type: gauge
+    help: 'OS page file, used and available.'
+    value_label: 'state'
+    values: [used, available]
+    query: |
+      SELECT
+        (total_page_file_kb - available_page_file_kb) * 1024 AS used,
+        available_page_file_kb * 1024 AS available
+      FROM sys.dm_os_sys_memory
+
+queries:
+  # Populates `mssql_io_stall_seconds` and `mssql_io_stall_total_seconds`.
+  # The cast uses an explicit length because CAST(... AS varchar) without one truncates to 30 characters.
+  - query_name: mssql_io_stall
+    query: |
+      SELECT
+        cast(DB_Name(a.database_id) as nvarchar(128)) AS [db],
+        sum(io_stall_read_ms) / 1000.0 AS [read],
+        sum(io_stall_write_ms) / 1000.0 AS [write],
+        sum(io_stall) / 1000.0 AS io_stall
+      FROM
+        sys.dm_io_virtual_file_stats(null, null) a
+      INNER JOIN sys.master_files b ON a.database_id = b.database_id AND a.file_id = b.file_id
+      GROUP BY a.database_id
+
+  # Populates `mssql_resident_memory_bytes`, `mssql_virtual_memory_bytes`, `mssql_memory_utilization_percentage` and
+  # `mssql_page_fault_count`.
+  - query_name: mssql_process_memory
+    query: |
+      SELECT
+        physical_memory_in_use_kb * 1024 AS resident_memory_bytes,
+        virtual_address_space_committed_kb * 1024 AS virtual_memory_bytes,
+        memory_utilization_percentage,
+        page_fault_count
+      FROM sys.dm_os_process_memory
+
diff --git a/kubedb/sql_exporter.yml b/kubedb/sql_exporter.yml
new file mode 100644
index 00000000..f665f3de
--- /dev/null
+++ b/kubedb/sql_exporter.yml
@@ -0,0 +1,85 @@
+# Global settings and defaults.
+global:
+  scrape_timeout: 10s
+  scrape_timeout_offset: 500ms
+  min_interval: 0s
+  max_connections: 3
+  max_idle_connections: 3
+
+# The target to monitor and the collectors to execute on it.
+target:
+  name: mssql_database
+  # NOTE(review): this DSN embeds the `sa` password in plain text. Do not commit real credentials;
+  # prefer injecting the DSN from a secret at deploy time.
+  # NOTE(review): documentation/sql_exporter.yml says the DSN always has a URI schema matching the
+  # driver name, but this value has none -- confirm the exporter accepts this key=value form.
+  data_source_name: 'server=ms-stand-monitor-0.ms-stand-monitor-pods.demo.svc;user id=sa;password=Pa55w0rd!;database=master;encrypt=true;TrustServerCertificate=true;'
+  collectors: [mssql_standard]
+  enable_ping: true
+
+# Collectors and metrics definitions
+collectors:
+  - collector_name: mssql_standard
+    metrics:
+      - metric_name: mssql_log_growths
+        type: counter
+        help: 'Total number of times the transaction log has been expanded since last restart, per database.'
+        key_labels:
+          - db
+        static_labels:
+          env: dev
+          region: europe
+        values: [counter]
+        query: |
+          SELECT rtrim(instance_name) AS db, cntr_value AS counter
+          FROM sys.dm_os_performance_counters
+          WHERE counter_name = 'Log Growths' AND instance_name <> '_Total'
+
+      - metric_name: mssql_io_stall_seconds
+        type: counter
+        help: 'Stall time in seconds per database and I/O operation.'
+        key_labels:
+          - db
+        value_label: operation
+        values:
+          - io_stall_read
+          - io_stall_write
+        query_ref: io_stall
+
+      - metric_name: mssql_io_stall_total_seconds
+        type: counter
+        help: 'Total stall time in seconds per database.'
+        key_labels:
+          - db
+        values:
+          - io_stall
+        query_ref: io_stall
+
+      - metric_name: mssql_hostname
+        type: gauge
+        help: 'Database server hostname'
+        key_labels:
+          - hostname
+        static_value: 1
+        query: |
+          SELECT @@SERVERNAME AS hostname
+
+    queries:
+      # Feeds `mssql_io_stall_seconds` and `mssql_io_stall_total_seconds`.
+      # The cast uses an explicit length because CAST(... AS varchar) without one truncates to 30 characters.
+      - query_name: io_stall
+        query: |
+          SELECT
+            cast(DB_Name(a.database_id) as nvarchar(128)) AS db,
+            sum(io_stall_read_ms) / 1000.0 AS io_stall_read,
+            sum(io_stall_write_ms) / 1000.0 AS io_stall_write,
+            sum(io_stall) / 1000.0 AS io_stall
+          FROM
+            sys.dm_io_virtual_file_stats(null, null) a
+          INNER JOIN sys.master_files b ON a.database_id = b.database_id AND a.file_id = b.file_id
+          GROUP BY a.database_id
+
+# NOTE(review): this glob appears to also match kubedb/mssql_standard.collector.yml, whose collector is
+# named `mssql_standard` too -- confirm the exporter tolerates two collectors with the same name.
+collector_files:
+  - "*.collector.yml"