Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,4 @@
vendor
jsonnetfile.lock.json
*.zip
.worktrees
3 changes: 3 additions & 0 deletions microsoft-iis-mixin/.lint
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
exclusions:
panel-datasource-rule:
reason: "Modern mixins use signal-based architecture where datasource references are handled by the framework"
panel-units-rule:
reason: "Custom units are used for better user experience in these panels"
entries:
Expand All @@ -16,6 +18,7 @@ exclusions:
- panel: "Worker process startup failures"
- panel: "Worker process shutdown failures"
- panel: "Worker process ping failures"
- panel: "Worker output cache hit ratio"
template-datasource-rule:
reason: "Based on new convention we are using variable names prometheus_datasource and loki_datasource where as linter expects 'datasource'"
template-instance-rule:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,14 +1,14 @@
{
prometheusAlerts+:: {
groups+: [
new(this): {
groups: [
{
name: 'microsoft-iis',
name: this.config.uid + '-alerts',
rules: [
{
alert: 'MicrosoftIISHighNumberOfRejectedAsyncIORequests',
alert: 'IISHighRejectedAsyncIORequests',
expr: |||
increase(windows_iis_rejected_async_io_requests_total[5m]) > %(alertsWarningHighRejectedAsyncIORequests)s
||| % $._config,
||| % this.config,
'for': '5m',
labels: {
severity: 'warning',
Expand All @@ -17,14 +17,14 @@
summary: 'There are a high number of rejected async I/O requests for a site.',
description: |||
The number of rejected async IO requests is {{ printf "%%.0f" $value }} over the last 5m on {{ $labels.instance }} - {{ $labels.site }} which is above the threshold of %(alertsWarningHighRejectedAsyncIORequests)s.
||| % $._config,
||| % this.config,
},
},
{
alert: 'MicrosoftIISHighNumberOf5xxRequestErrors',
alert: 'IISHigh5xxRequestErrors',
expr: |||
sum without (pid, status_code)(increase(windows_iis_worker_request_errors_total{status_code=~"5.*"}[5m])) > %(alertsCriticalHigh5xxRequests)s
||| % $._config,
||| % this.config,
'for': '5m',
labels: {
severity: 'critical',
Expand All @@ -33,14 +33,14 @@
summary: 'There are a high number of 5xx request errors for an application.',
description: |||
The number of 5xx request errors is {{ printf "%%.0f" $value }} over the last 5m on {{ $labels.instance }} - {{ $labels.app }} which is above the threshold of %(alertsCriticalHigh5xxRequests)s.
||| % $._config,
||| % this.config,
},
},
{
alert: 'MicrosoftIISLowSuccessRateForWebsocketConnections',
alert: 'IISLowWebsocketConnectionSuccess',
expr: |||
sum without (pid) (increase(windows_iis_worker_websocket_connection_accepted_total[5m]) / clamp_min(increase(windows_iis_worker_websocket_connection_attempts_total[5m]),1)) * 100 > %(alertsCriticalLowWebsocketConnectionSuccessRate)s
||| % $._config,
||| % this.config,
'for': '5m',
labels: {
severity: 'critical',
Expand All @@ -49,14 +49,14 @@
summary: 'There is a low success rate for websocket connections for an application.',
description: |||
The success rate for websocket connections is {{ printf "%%.0f" $value }} over the last 5m on {{ $labels.instance }} - {{ $labels.app }} which is above the threshold of %(alertsCriticalLowWebsocketConnectionSuccessRate)s.
||| % $._config,
||| % this.config,
},
},
{
alert: 'MicrosoftIISThreadpoolUtilizationNearingMax',
alert: 'IISThreadpoolUtilizationHigh',
expr: |||
sum without (pid, state)(windows_iis_worker_threads / windows_iis_worker_max_threads) * 100 > %(alertsCriticalHighThreadPoolUtilization)s
||| % $._config,
||| % this.config,
'for': '5m',
labels: {
severity: 'critical',
Expand All @@ -65,14 +65,14 @@
summary: 'The thread pool utilization is nearing max capacity.',
description: |||
The threadpool utilization is at {{ printf "%%.0f" $value }} over the last 5m on {{ $labels.instance }} - {{ $labels.app }} which is above the threshold of %(alertsCriticalHighThreadPoolUtilization)s.
||| % $._config,
||| % this.config,
},
},
{
alert: 'MicrosoftIISHighNumberOfWorkerProcessFailures',
alert: 'IISHighWorkerProcessFailures',
expr: |||
increase(windows_iis_total_worker_process_failures[5m]) > %(alertsWarningHighWorkerProcessFailures)s
||| % $._config,
||| % this.config,
'for': '5m',
labels: {
severity: 'warning',
Expand All @@ -81,7 +81,7 @@
summary: 'There are a high number of worker process failures for an application.',
description: |||
The number of worker process failures is at {{ printf "%%.0f" $value }} over the last 5m on {{ $labels.instance }} - {{ $labels.app }} which is above the threshold of %(alertsWarningHighWorkerProcessFailures)s.
||| % $._config,
||| % this.config,
},
},
],
Expand Down
56 changes: 44 additions & 12 deletions microsoft-iis-mixin/config.libsonnet
Original file line number Diff line number Diff line change
@@ -1,17 +1,49 @@
{
_config+:: {
dashboardTags: ['microsoft-iis-mixin'],
dashboardPeriod: 'now-1h',
dashboardTimezone: 'default',
dashboardRefresh: '1m',
local this = self,

// alerts thresholds
alertsWarningHighRejectedAsyncIORequests: 20,
alertsCriticalHigh5xxRequests: 5,
alertsCriticalLowWebsocketConnectionSuccessRate: 80,
alertsCriticalHighThreadPoolUtilization: 90,
alertsWarningHighWorkerProcessFailures: 10,
// Basic filtering
filteringSelector: 'job=~"$job", instance=~"$instance"',
queriesSelector: 'job=~"$job", instance=~"$instance"',
groupLabels: ['job'],
instanceLabels: ['instance'],

enableLokiLogs: true,
// Dashboard settings
dashboardTags: ['microsoft-iis-mixin'],
uid: 'microsoft-iis',
dashboardNamePrefix: 'Microsoft IIS',
dashboardRefresh: '1m',
dashboardPeriod: 'now-1h',
dashboardTimezone: 'default',

// Logs configuration
enableLokiLogs: true,
logLabels: ['job', 'instance', 'level'],
extraLogLabels: [],
logsVolumeGroupBy: 'level',
showLogsVolume: true,

// Alert thresholds. Each value is substituted into the matching alert rule
// expression and description via `%(name)s`.
alertsWarningHighRejectedAsyncIORequests: 20, // count of rejected async I/O requests over 5m
alertsCriticalHigh5xxRequests: 5, // count of 5xx request errors over 5m (not a percentage)
alertsCriticalLowWebsocketConnectionSuccessRate: 80, // % accepted/attempted; NOTE(review): the rule compares with '>' - confirm the intended direction
alertsCriticalHighThreadPoolUtilization: 90, // % of worker threads relative to max threads
alertsWarningHighWorkerProcessFailures: 10, // count of worker process failures over 5m

// Metrics source
metricsSource: 'prometheus',

// Signal definitions.
// Every file under ./signals is imported and immediately called with `this`
// (the enclosing config object), so each signal group can read the settings
// defined above. `signals+:` merges with any `signals` already present on the
// object being extended instead of replacing it.
signals+: {
requests: (import './signals/requests.libsonnet')(this),
connections: (import './signals/connections.libsonnet')(this),
data_transfer: (import './signals/data_transfer.libsonnet')(this),
async_io: (import './signals/async_io.libsonnet')(this),
server_cache: (import './signals/server_cache.libsonnet')(this),
app_pools: (import './signals/app_pools.libsonnet')(this),
worker_processes: (import './signals/worker_processes.libsonnet')(this),
worker_requests: (import './signals/worker_requests.libsonnet')(this),
worker_cache: (import './signals/worker_cache.libsonnet')(this),
worker_threads: (import './signals/worker_threads.libsonnet')(this),
worker_websocket: (import './signals/worker_websocket.libsonnet')(this),
},
}
139 changes: 139 additions & 0 deletions microsoft-iis-mixin/dashboards.libsonnet
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
local g = import './g.libsonnet';
local logslib = import 'logs-lib/logs/main.libsonnet';

{
local root = self,
// Builds the dashboards of the mixin, keyed by output file name.
//
// `this` is the assembled mixin object. The fields read here are:
//   this.config.*             - dashboard and logs settings
//   this.grafana.rows.*       - row definitions placed on each dashboard
//   this.grafana.variables    - shared template variables and datasources
//   this.grafana.links        - inter-dashboard links
//   this.grafana.annotations  - dashboard annotations
//
// Returns an object with 'microsoft-iis-overview.json' and
// 'microsoft-iis-applications.json', plus 'microsoft-iis-logs.json' when
// this.config.enableLokiLogs is true.
new(this)::
  local prefix = this.config.dashboardNamePrefix;
  local links = this.grafana.links;
  local tags = this.config.dashboardTags;
  local uid = g.util.string.slugify(this.config.uid);
  local vars = this.grafana.variables;
  local annotations = this.grafana.annotations;
  local refresh = this.config.dashboardRefresh;
  local period = this.config.dashboardPeriod;
  local timezone = this.config.dashboardTimezone;

  {
    // Microsoft IIS overview dashboard
    'microsoft-iis-overview.json':
      g.dashboard.new(prefix + ' overview')
      + g.dashboard.withDescription('Dashboard providing an overview of Microsoft IIS performance.')
      + g.dashboard.withPanels(
        g.util.panel.resolveCollapsedFlagOnRows(
          g.util.grid.wrapPanels(
            [
              this.grafana.rows.overviewRequests,
              this.grafana.rows.overviewAsyncIO,
              this.grafana.rows.overviewTraffic,
              this.grafana.rows.overviewConnections,
            ]
            // The parentheses are required: `if ... then ... else ...` extends
            // as far to the right as possible, so without them the cache row
            // below would be parsed as part of the `else` branch and would be
            // dropped whenever enableLokiLogs is true.
            + (
              if this.config.enableLokiLogs
              then [this.grafana.rows.overviewLogs]
              else []
            )
            + [
              this.grafana.rows.overviewCache,
            ]
          )
        )
      )
      + root.applyCommon(
        vars.multiInstance + [
          // Extra `site` variable, populated from the `site` label values.
          g.dashboard.variable.query.new('site')
          + g.dashboard.variable.query.withDatasourceFromVariable(vars.datasources.prometheus)
          + g.dashboard.variable.query.queryTypes.withLabelValues('site', 'windows_iis_requests_total{job=~"$job", instance=~"$instance"}')
          + g.dashboard.variable.query.generalOptions.withLabel('Site')
          + g.dashboard.variable.query.selectionOptions.withMulti(true)
          + g.dashboard.variable.query.selectionOptions.withIncludeAll(true, '.+')
          + g.dashboard.variable.query.refresh.onLoad()
          + g.dashboard.variable.query.refresh.onTime(),
        ],
        uid + '_overview',
        tags,
        // Hide this dashboard's own link entry so that applyCommon's
        // std.objectValues(links) does not emit a link to itself.
        links { microsoftIISOverview+:: {} },
        annotations,
        timezone,
        refresh,
        period
      ),

    // Microsoft IIS applications dashboard
    'microsoft-iis-applications.json':
      g.dashboard.new(prefix + ' applications')
      + g.dashboard.withDescription('Dashboard providing detailed application performance metrics for Microsoft IIS.')
      + g.dashboard.withPanels(
        g.util.panel.resolveCollapsedFlagOnRows(
          g.util.grid.wrapPanels(
            [
              this.grafana.rows.applicationsRequests,
              this.grafana.rows.applicationsWebsocket,
              this.grafana.rows.applicationsWorkerProcesses,
              this.grafana.rows.applicationsCache,
            ]
          )
        )
      )
      + root.applyCommon(
        vars.multiInstance + [
          // Extra `application` variable, populated from the `app` label values.
          g.dashboard.variable.query.new('application')
          + g.dashboard.variable.query.withDatasourceFromVariable(vars.datasources.prometheus)
          + g.dashboard.variable.query.queryTypes.withLabelValues('app', 'windows_iis_current_application_pool_state{job=~"$job", instance=~"$instance"}')
          + g.dashboard.variable.query.generalOptions.withLabel('Application')
          + g.dashboard.variable.query.selectionOptions.withMulti(true)
          + g.dashboard.variable.query.selectionOptions.withIncludeAll(true, '.+')
          + g.dashboard.variable.query.refresh.onLoad()
          + g.dashboard.variable.query.refresh.onTime(),
        ],
        uid + '_applications',
        tags,
        links { microsoftIISApplications+:: {} },
        annotations,
        timezone,
        refresh,
        period
      ),
  }
  +
  // This conditional is the last operand of the expression, so its `else`
  // branch cannot swallow anything that follows.
  if this.config.enableLokiLogs then
    {
      'microsoft-iis-logs.json':
        logslib.new(
          prefix + ' logs',
          datasourceName=vars.datasources.loki.name,
          datasourceRegex=vars.datasources.loki.regex,
          filterSelector=this.config.filteringSelector,
          labels=this.config.logLabels + this.config.extraLogLabels,
          formatParser=null,
          showLogsVolume=this.config.showLogsVolume,
          logsVolumeGroupBy=this.config.logsVolumeGroupBy,
          extraFilters=[]
        )
        {
          // Layer the common dashboard settings onto the logs dashboard
          // produced by logslib, then select that dashboard.
          dashboards+:
            {
              logs+:
                root.applyCommon(
                  [],
                  uid + '_logs',
                  tags,
                  links { logs+:: {} },
                  annotations,
                  timezone,
                  refresh,
                  period
                ),
            },
        }.dashboards.logs,
    } else {},

// Returns the dashboard settings shared by every dashboard in this mixin
// (tags, uid, links, timezone, refresh interval, time range start, variables
// and annotations) as one mixin object to be added onto a dashboard.
//
// `links` and `annotations` are objects; only their values are used.
applyCommon(vars, uid, tags, links, annotations, timezone, refresh, period):
  // std.objectValues only returns visible fields, so entries that a caller
  // has hidden with `::` are left out of the resulting lists.
  local linkList = std.objectValues(links);
  local annotationList = std.objectValues(annotations);

  g.dashboard.withTags(tags)
  + g.dashboard.withUid(uid)
  + g.dashboard.withLinks(linkList)
  + g.dashboard.withTimezone(timezone)
  + g.dashboard.withRefresh(refresh)
  + g.dashboard.time.withFrom(period)
  + g.dashboard.withVariables(vars)
  + g.dashboard.withAnnotations(annotationList),
}
2 changes: 0 additions & 2 deletions microsoft-iis-mixin/dashboards/dashboards.libsonnet

This file was deleted.

Loading
Loading