Merge pull request #99888 from lcavalle/TELCODOCS-2171-observability

slovern · web-flow · commit c3a56b5eba59 · 2025-11-06T17:04:05.000Z
TELCODOCS-2171#Generalize Day2Ops Observability
diff --git a/_topic_maps/_topic_map.yml b/_topic_maps/_topic_map.yml
@@ -3624,7 +3624,7 @@ Topics:
     Dir: observability
     Topics:
     - Name: Observability in OpenShift Container Platform
-      File: telco-observability
+      File: observability
   - Name: Security
     Dir: security
     Topics:
diff --git a/edge_computing/day_2_core_cnf_clusters/observability/observability.adoc b/edge_computing/day_2_core_cnf_clusters/observability/observability.adoc
@@ -1,8 +1,8 @@
 :_mod-docs-content-type: ASSEMBLY
-[id="telco-observability"]
-= Observability in telco core CNF clusters
+[id="observability"]
+= Observability in {product-title} clusters
 include::_attributes/common-attributes.adoc[]
-:context: telco-observability
+:context: observability
 :imagesdir: images
 
 toc::[]
@@ -13,7 +13,7 @@ What follows is an outline of best practices for system engineers, architects, a
 
 Unless explicitly stated, the material in this document refers to both Edge and Core deployments.
 
-include::modules/telco-observability-monitoring-stack.adoc[leveloffset=+1]
+include::modules/observability-monitoring-stack.adoc[leveloffset=+1]
 
 [role="_additional-resources"]
 .Additional resources
@@ -22,7 +22,7 @@ include::modules/telco-observability-monitoring-stack.adoc[leveloffset=+1]
 
 * xref:../../../observability/monitoring/getting-started/core-platform-monitoring-first-steps.adoc#core-platform-monitoring-first-steps[Core platform monitoring first steps]
 
-include::modules/telco-observability-key-performance-metrics.adoc[leveloffset=+1]
+include::modules/observability-key-performance-metrics.adoc[leveloffset=+1]
 
 [role="_additional-resources"]
 .Additional resources
@@ -31,16 +31,16 @@ include::modules/telco-observability-key-performance-metrics.adoc[leveloffset=+1
 
 * xref:../../../storage/persistent_storage_local/persistent-storage-local.adoc#local-storage-install_persistent-storage-local[Persistent storage using local volumes]
 
-include::modules/telco-observability-monitoring-the-edge.adoc[leveloffset=+1]
+include::modules/observability-monitoring-the-edge.adoc[leveloffset=+1]
 
-include::modules/telco-observability-alerting.adoc[leveloffset=+1]
+include::modules/observability-alerting.adoc[leveloffset=+1]
 
 [role="_additional-resources"]
 .Additional resources
 
 * xref:../../../observability/monitoring/about-ocp-monitoring/key-concepts.adoc#about-managing-alerts_key-concepts[Managing alerts]
 
-include::modules/telco-observability-workload-monitoring.adoc[leveloffset=+1]
+include::modules/observability-workload-monitoring.adoc[leveloffset=+1]
 
 [role="_additional-resources"]
 .Additional resources
diff --git a/edge_computing/day_2_core_cnf_clusters/telco-day-2-welcome.adoc b/edge_computing/day_2_core_cnf_clusters/telco-day-2-welcome.adoc
@@ -15,7 +15,7 @@ Troubleshooting and maintaining telco core CNF clusters:: To maintain and troubl
 
 Observability in telco core CNF clusters:: {product-title} generates a large amount of data, such as performance metrics and logs from the platform and the workloads running on it.
 As an administrator, you can use tools to collect and analyze the available data.
-For more information, see xref:../day_2_core_cnf_clusters/observability/telco-observability.adoc#telco-observability[Observability in telco core CNF clusters].
+For more information, see xref:../day_2_core_cnf_clusters/observability/observability.adoc#observability[Observability in {product-title} clusters].
 
 Security:: You can enhance security for high-bandwidth network deployments in telco environments by following key security considerations.
 For more information, see xref:../day_2_core_cnf_clusters/security/telco-security-basics.adoc#telco-security-basics[Security basics].
diff --git a/modules/observability-alerting.adoc b/modules/observability-alerting.adoc
@@ -1,30 +1,30 @@
 // Module included in the following assemblies:
 //
-// * edge_computing/day_2_core_cnf_clusters/observability/telco-observability.adoc
+// * edge_computing/day_2_core_cnf_clusters/observability/observability.adoc
 
 :_mod-docs-content-type: PROCEDURE
-[id="telco-observability-alerting_{context}"]
+[id="observability-alerting_{context}"]
 
 = Alerting
 
 {product-title} includes a large number of alert rules, which can change from release to release. 
 
-[id="viewing-default-alerts"]
+[id="viewing-default-alerts_{context}"]
 == Viewing default alerts
 
-Use the following procedure to review all of the alert rules in a cluster.
+Review all of the alert rules in a cluster.
 
 .Procedure
 
-* To review all the alert rules in a cluster, you can run the following command:
+* To review all the alert rules in a cluster, run the following command:
 +
 [source,terminal]
 ----
 $ oc get cm -n openshift-monitoring prometheus-k8s-rulefiles-0 -o yaml
 ----
 +
 Rules can include a description and provide a link to additional information and mitigation steps. 
-For example, this is the rule for `etcdHighFsyncDurations`:
+For example, see the rule for `etcdHighFsyncDurations`:
 +
 [source,terminal]
 ----
@@ -43,11 +43,12 @@ For example, this is the rule for `etcdHighFsyncDurations`:
 ----
 
 [id="alert-notifications"]
-== Alert notifications  
-You can view alerts in the {product-title} console, however an administrator should configure an external receiver to forward the alerts to. 
+== Alert notifications
+
+You can view alerts in the {product-title} console. However, an administrator must configure an external receiver to forward the alerts to. 
 {product-title} supports the following receiver types:
 
-* PagerDuty: a 3rd party incident response platform
-* Webhook: an arbitrary API endpoint that receives an alert via a POST request and can take any necessary action
-* Email: sends an email to designated address
-* Slack: sends a notification to either a slack channel or an individual user
+PagerDuty:: A third-party incident response platform.
+Webhook:: An arbitrary API endpoint that receives an alert through a `POST` request and can take any necessary action.
+Email:: Sends an email to a designated address.
+Slack:: Sends a notification to either a Slack channel or an individual user.
diff --git a/modules/observability-key-performance-metrics.adoc b/modules/observability-key-performance-metrics.adoc
@@ -1,14 +1,14 @@
 // Module included in the following assemblies:
 //
-// * edge_computing/day_2_core_cnf_clusters/observability/telco-observability.adoc
+// * edge_computing/day_2_core_cnf_clusters/observability/observability.adoc
 
 :_mod-docs-content-type: CONCEPT
-[id="telco-observability-key-performance-metrics_{context}"]
+[id="observability-key-performance-metrics_{context}"]
 = Key performance metrics
 
-Depending on your system, there can be hundreds of available measurements.
+Depending on your system, you can have hundreds of available measurements.
 
-Here are some key metrics that you should pay attention to:
+Consider the following key metrics:
 
 * `etcd` response times
 * API response times
@@ -17,26 +17,30 @@ Here are some key metrics that you should pay attention to:
 * OVN health
 * Overall cluster operator health
 
-A good rule to follow is that if you decide that a metric is important, there should be an alert for it.
+If a metric is important, set up an alert for it. 
 
 [NOTE]
 ====
 You can check the available metrics by running the following command:
+
 [source,terminal]
 ----
 $ oc -n openshift-monitoring exec -c prometheus prometheus-k8s-0 -- curl -qsk http://localhost:9090/api/v1/metadata | jq '.data'
 ----
 ====
 
-[id="example-queries-promql"]
+[id="example-queries-promql_{context}"]
 == Example queries in PromQL
 
-The following tables show some queries that you can explore in the metrics query browser using the {product-title} console.
+Using the {product-title} console, you can explore the following queries in the metrics query browser.
 
 [NOTE]
 ====
 The URL for the console is https://<OpenShift Console FQDN>/monitoring/query-browser.
-You can get the OpenShift Console FQDN by running the following command:
+You can get the OpenShift Console FQDN by running the following command:
+
 [source,terminal]
 ----
 $ oc get routes -n openshift-console console -o jsonpath='{.status.ingress[0].host}'
@@ -79,7 +83,7 @@ $ oc get routes -n openshift-console console -o jsonpath='{.status.ingress[0].ho
 |`POST`
 |`histogram_quantile (0.99, sum by (le,managed_cluster) (sum_over_time(apiserver_request_duration_seconds_bucket{apiserver="kube-apiserver\|openshift-apiserver", verb="POST"}[60m])))`
 
-|`LIST`
+|`LIST` 
 |`histogram_quantile (0.99, sum by (le,managed_cluster) (sum_over_time(apiserver_request_duration_seconds_bucket{apiserver="kube-apiserver\|openshift-apiserver", verb="LIST"}[60m])))`
 
 |`PUT`
@@ -127,17 +131,15 @@ $ oc get routes -n openshift-console console -o jsonpath='{.status.ingress[0].ho
 
 |===
 
-[id="recommendations-for-storage-of-metrics"]
+[id="recommendations-for-storage-of-metrics_{context}"]
 == Recommendations for storage of metrics
 
-Out of the box, Prometheus does not back up saved metrics with persistent storage.
-If you restart the Prometheus pods, all metrics data are lost.
-You should configure the monitoring stack to use the back-end storage that is available on the platform.
-To meet the high IO demands of Prometheus you should use local storage.
-
-For Telco core clusters, you can use the Local Storage Operator for persistent storage for Prometheus.
+By default, Prometheus does not back up saved metrics with persistent storage. 
+If you restart the Prometheus pods, all metrics data are lost. 
+You must configure the monitoring stack to use the back-end storage that is available on the platform. 
+To meet the high IO demands of Prometheus, use local storage.
 
-{odf-first}, which deploys a ceph cluster for block, file, and object storage, is also a suitable candidate for a Telco core cluster.
+For smaller clusters, you can use the Local Storage Operator for persistent storage for Prometheus. {odf-first}, which deploys a Ceph cluster for block, file, and object storage, is suitable for larger clusters.
 
-To keep system resource requirements low on a RAN {sno} or far edge cluster, you should not provision backend storage for the monitoring stack.
-Such clusters forward all metrics to the hub cluster where you can provision a third party monitoring platform.
+To keep system resource requirements low on a {sno} cluster, do not provision back-end storage for the monitoring stack. 
+Such clusters forward all metrics to the hub cluster where you can provision a third-party monitoring platform.
diff --git a/modules/observability-monitoring-stack.adoc b/modules/observability-monitoring-stack.adoc
@@ -1,9 +1,9 @@
 // Module included in the following assemblies:
 //
-// * edge_computing/day_2_core_cnf_clusters/observability/telco-observability.adoc
+// * edge_computing/day_2_core_cnf_clusters/observability/observability.adoc
 
 :_mod-docs-content-type: CONCEPT
-[id="telco-observability-monitoring-stack_{context}"]
+[id="observability-monitoring-stack_{context}"]
 = Understanding the monitoring stack
 
 The monitoring stack uses the following components:
@@ -17,5 +17,5 @@ image::monitoring-architecture.png[{product-title} monitoring architecture]
 
 [NOTE]
 ====
-For a {sno} cluster, you should disable Alertmanager and Thanos because the cluster sends all metrics to the hub cluster for analysis and retention.
+For {sno} clusters, disable Alertmanager and Thanos because the clusters send all metrics to the hub cluster for analysis and retention.
 ====
diff --git a/modules/observability-monitoring-the-edge.adoc b/modules/observability-monitoring-the-edge.adoc
@@ -1,14 +1,14 @@
 // Module included in the following assemblies:
 //
-// * edge_computing/day_2_core_cnf_clusters/observability/telco-observability.adoc
+// * edge_computing/day_2_core_cnf_clusters/observability/observability.adoc
 
 :_mod-docs-content-type: PROCEDURE
-[id="telco-observability-monitoring-the-edge_{context}"]
+[id="observability-monitoring-the-edge_{context}"]
 
-= Monitoring the edge
+= Monitoring at the far edge network
 
-{sno-caps} at the edge keeps the footprint of the platform components to a minimum. 
-The following procedure is an example of how you can configure a {sno} node with a small monitoring footprint.
+{product-title} clusters at the edge must keep the footprint of the platform components to a minimum. 
+The following procedure is an example of how to configure a {sno} cluster or a node at the far edge network with a small monitoring footprint.
 
 .Prerequisites
 
@@ -36,14 +36,14 @@ metadata:
       retention: 24h
 ----
 
-. On the {sno}, apply the `ConfigMap` CR by running the following command:
+. Apply the `ConfigMap` CR by running the following command on the {sno} cluster:
 +
 [source,terminal]
 ----
 $ oc apply -f monitoringConfigMap.yaml
 ----
 
-. Create a `NameSpace` CR, and save it as `monitoringNamespace.yaml`, as in the following example:
+. Create a `Namespace` CR, and save it as `monitoringNamespace.yaml`, as in the following example:
 +
 [source,yaml]
 ----
@@ -53,7 +53,7 @@ metadata:
   name: open-cluster-management-observability
 ----
 
-. On the hub cluster, apply the `Namespace` CR on the hub cluster by running the following command:
+. Apply the `Namespace` CR by running the following command on the hub cluster:
 +
 [source,terminal]
 ----
@@ -75,7 +75,7 @@ spec:
   generateBucketName: acm-multi
 ----
 
-. On the hub cluster, apply the `ObjectBucketClaim` CR, by running the following command:
+. Apply the `ObjectBucketClaim` CR by running the following command on the hub cluster:
 +
 [source,terminal]
 ----
@@ -95,14 +95,14 @@ stringData:
   .dockerconfigjson: 'PULL_SECRET'
 ----
 
-. On the hub cluster, apply the `Secret` CR by running the following command:
+. Apply the `Secret` CR by running the following command on the hub cluster:
 +
 [source,terminal]
 ----
 $ oc apply -f monitoringSecret.yaml
 ----
 
-. Get the keys for the NooBaa service and the backend bucket name from the hub cluster by running the following commands:
+. Get the keys for the NooBaa service and the back-end bucket name from the hub cluster by running the following commands:
 +
 [source,terminal]
 ----
@@ -140,7 +140,7 @@ stringData:
       secret_key: ${NOOBAA_SECRET_KEY}
 ----
 
-. On the hub cluster, apply the `Secret` CR by running the following command:
+. Apply the `Secret` CR by running the following command on the hub cluster:
 +
 [source,terminal]
 ----
@@ -177,7 +177,7 @@ spec:
     storeStorageSize: 25Gi
 ----
 
-. On the hub cluster, apply the `MultiClusterObservability` CR by running the following command:
+. Apply the `MultiClusterObservability` CR by running the following command on the hub cluster:
 +
 [source,terminal]
 ----
diff --git a/modules/observability-workload-monitoring.adoc b/modules/observability-workload-monitoring.adoc
@@ -1,9 +1,9 @@
 // Module included in the following assemblies:
 //
-// * edge_computing/day_2_core_cnf_clusters/observability/telco-observability.adoc
+// * edge_computing/day_2_core_cnf_clusters/observability/observability.adoc
 
 :_mod-docs-content-type: PROCEDURE
-[id="telco-observability-workload-monitoring_{context}"]
+[id="observability-workload-monitoring_{context}"]
 = Workload monitoring
 
 By default, {product-title} does not collect metrics for application workloads. You can configure a cluster to collect workload metrics.
@@ -67,8 +67,8 @@ spec:
 $ oc apply -f monitoringServiceMonitor.yaml
 ----
 
-Prometheus scrapes the path `/metrics` by default, however you can define a custom path. 
-It is up to the vendor of the application to expose this endpoint for scraping, with metrics that they deem relevant.
+Prometheus scrapes the `/metrics` path by default. However, you can define a custom path. 
+The vendor of the application must decide whether to expose the endpoint for scraping, with metrics that they deem relevant.
 
 == Creating a workload alert