From abce1942dc8274fa66fa10350f26bad76eb0d9b8 Mon Sep 17 00:00:00 2001 From: Jerry Yang <103920848+jerryyangtg@users.noreply.github.com> Date: Mon, 22 Sep 2025 09:48:08 +0800 Subject: [PATCH 1/6] Publish the user documentation for Operator version 1.7.0; --- k8s/README.md | 6 +- k8s/docs/01-introduction/README.md | 6 +- k8s/docs/02-get-started/get_started.md | 2 +- k8s/docs/03-deploy/configure-nginx-tls.md | 832 +++++ ...ure-ssl-certificate-for-ingress-service.md | 3 +- .../03-deploy/customize-tigergraph-pod.md | 2 +- .../03-deploy/deploy-operator-with-helm.md | 4 +- k8s/docs/03-deploy/tigergraph-on-aks.md | 2 +- k8s/docs/03-deploy/tigergraph-on-eks.md | 2 +- k8s/docs/03-deploy/tigergraph-on-gke.md | 2 +- k8s/docs/03-deploy/tigergraph-on-openshift.md | 2 +- ...koff-retries-for-cluster-job-operations.md | 6 +- .../backup-restore-by-cr.md | 42 + .../backup-restore-by-kubectl-tg.md | 25 + k8s/docs/04-manage/operator-upgrade.md | 30 +- ...custom-bash-scripts-via-kubernetes-jobs.md | 522 ++++ .../tigergraph-metrics-reference.md | 1047 +++++++ ...ergraph-monitor-with-prometheus-grafana.md | 877 +++++- .../06-troubleshoot/cluster-deployment.md | 10 +- .../06-troubleshoot/cluster-management.md | 2 +- k8s/docs/06-troubleshoot/kubectl-tg-plugin.md | 2 +- .../06-troubleshoot/operator-installation.md | 8 +- k8s/docs/08-reference/api-reference.md | 577 +++- .../cluster-status-of-tigergraph.md | 4 +- .../node-repaving-for-tigergraph-on-k8s.md | 3 - k8s/docs/09-release-notes/README.md | 1 + k8s/docs/09-release-notes/operator-1.7.0.md | 87 + .../backup-restore/backup-schedule-local.yaml | 4 +- .../backup-restore/backup-to-local.yaml | 2 +- .../incremental-backup-to-s3.yaml | 61 + .../backup-restore/restore-from-local.yaml | 2 +- .../restore-with-timepoint-to-s3.yaml | 34 + .../deploy/additional-storage-of-sidecar.yaml | 68 +- ...ional-storages-of-kafka-tglogs-backup.yaml | 8 +- ...-between-multiple-tigergraph-clusters.yaml | 76 +- .../deploy/custom-volume-mount-path.yaml | 8 +- ...-multiple-topology-spread-constraints.yaml | 38 +- ...y-spread-constraint-and-node-affinity.yaml | 52 +- ...eness-with-topology-spread-constraint.yaml | 20 +- .../service-of-sidecar-ingress-type.yaml | 33 +- ...service-of-sidecar-loadbanalance-type.yaml | 25 +- .../service-of-sidecar-nodeport-type.yaml | 27 +- .../10-samples/deploy/tigergraph-cluster.yaml | 8 +- .../10-samples/manage/custom-script-job.yaml | 161 + .../monitoring/alertmanager-config-email.yaml | 32 + .../monitoring/alertmanager-config-slack.yaml | 41 + .../monitoring/alertmanager-configs.yaml | 27 + .../monitoring/prometheus-rules.yaml | 346 +++ .../monitoring/tigergraph-alert-rules.yaml | 354 +++ ...gergraph-certificate-with-certmanager.yaml | 51 + .../tigergraph-monitor-grafana-configmap.yaml | 787 ++++- ...raph-monitor-service-monitor-with-ssl.yaml | 35 + .../tigergraph-monitor-with-ssl.yaml | 412 +++ .../monitoring/tigergraph-monitor.yaml | 397 ++- .../tigergraph-operator-alert-rules.yaml | 303 ++ ...-controller-manager-grafana-configmap.yaml | 2723 +++++++++++++++++ ...or-controller-manager-metrics-monitor.yaml | 57 + 57 files changed, 9763 insertions(+), 535 deletions(-) create mode 100644 k8s/docs/03-deploy/configure-nginx-tls.md create mode 100644 k8s/docs/04-manage/running-custom-bash-scripts-via-kubernetes-jobs.md create mode 100644 k8s/docs/05-monitor/tigergraph-metrics-reference.md create mode 100644 k8s/docs/09-release-notes/operator-1.7.0.md create mode 100644 k8s/docs/10-samples/backup-restore/incremental-backup-to-s3.yaml create mode 100644 
k8s/docs/10-samples/backup-restore/restore-with-timepoint-to-s3.yaml create mode 100644 k8s/docs/10-samples/manage/custom-script-job.yaml create mode 100644 k8s/docs/10-samples/monitoring/alertmanager-config-email.yaml create mode 100644 k8s/docs/10-samples/monitoring/alertmanager-config-slack.yaml create mode 100644 k8s/docs/10-samples/monitoring/alertmanager-configs.yaml create mode 100644 k8s/docs/10-samples/monitoring/prometheus-rules.yaml create mode 100644 k8s/docs/10-samples/monitoring/tigergraph-alert-rules.yaml create mode 100644 k8s/docs/10-samples/monitoring/tigergraph-certificate-with-certmanager.yaml create mode 100644 k8s/docs/10-samples/monitoring/tigergraph-monitor-service-monitor-with-ssl.yaml create mode 100644 k8s/docs/10-samples/monitoring/tigergraph-monitor-with-ssl.yaml create mode 100644 k8s/docs/10-samples/monitoring/tigergraph-operator-alert-rules.yaml create mode 100644 k8s/docs/10-samples/monitoring/tigergraph-operator-controller-manager-grafana-configmap.yaml create mode 100644 k8s/docs/10-samples/monitoring/tigergraph-operator-controller-manager-metrics-monitor.yaml diff --git a/k8s/README.md b/k8s/README.md index 9aab85f66..af0c2c352 100644 --- a/k8s/README.md +++ b/k8s/README.md @@ -15,6 +15,7 @@ Understanding the intricate synergy between TigerGraph, TigerGraph Operator, and | TigerGraph Operator version | TigerGraph version | Kubernetes version | |----------|----------|----------| +| 1.7.0 | TigerGraph >= 3.6.0 && TigerGraph <= 4.3.0|1.27, 1.28, 1.29, 1.30, 1.31| | 1.6.0 | TigerGraph >= 3.6.0 && TigerGraph <= 4.2.1|1.27, 1.28, 1.29, 1.30, 1.31| | 1.5.0 | TigerGraph >= 3.6.0 && TigerGraph <= 4.2.0|1.26, 1.27, 1.28, 1.29, 1.30| | 1.4.0 | TigerGraph >= 3.6.0 && TigerGraph <= 4.1.2|1.25, 1.26, 1.27, 1.28, 1.29| @@ -68,9 +69,12 @@ Once your deployment is complete, refer to the following documents for guidance - [Configure Cross-Region Replication on Kubernetes](docs/03-deploy/configure-crr-on-k8s.md) - [Upgrade the TigerGraph Cluster Using the TigerGraph Operator](docs/04-manage/tigergraph-upgrade.md) - [Enable TigerGraph Operator monitoring with Prometheus and Grafana](docs/05-monitor/tigergraph-monitor-with-prometheus-grafana.md) +- [TigerGraph Metrics Reference](docs/05-monitor/tigergraph-metrics-reference.md) - [Customize the backoff retries for cluster job operations](docs/04-manage/backoff-retries-for-cluster-job-operations.md) - [Node Repaving for TigerGraph on Kubernetes](docs/08-reference/node-repaving-for-tigergraph-on-k8s.md) - [Configure SSL Certificate for Ingress Service](docs/03-deploy/configure-ssl-certificate-for-ingress-service.md) +- [Running Custom Bash Scripts in a TigerGraph Cluster via Kubernetes Jobs](docs/04-manage/running-custom-bash-scripts-via-kubernetes-jobs.md) +- [Configure Nginx TLS](docs/03-deploy/configure-nginx-tls.md) In case issues arise and your cluster requires diagnosis, you have two valuable resources: @@ -86,7 +90,7 @@ For detailed information about the features, improvements, and bug fixes introdu When reporting issues, please provide the following details: -- **Setup Information**: Include details as specified in the [issue template](../06-FAQs/issue_report_template.md) +- **Setup Information**: Include details as specified in the [issue template](docs/07-FAQs/issue_report_template.md) - **Reproduction Steps**: Describe the scenario where the issue occurred, along with clear steps to reproduce it. - **Errors and Logs**: Share any relevant error messages or log outputs from the involved software. 
- **Additional Context**: Include any other details that might help in diagnosing the issue. diff --git a/k8s/docs/01-introduction/README.md b/k8s/docs/01-introduction/README.md index 8db0522e1..d2bf325c6 100644 --- a/k8s/docs/01-introduction/README.md +++ b/k8s/docs/01-introduction/README.md @@ -15,6 +15,7 @@ Understanding the intricate synergy between TigerGraph, TigerGraph Operator, and | TigerGraph Operator version | TigerGraph version | Kubernetes version | |----------|----------|----------| +| 1.7.0 | TigerGraph >= 3.6.0 && TigerGraph <= 4.3.0|1.27, 1.28, 1.29, 1.30, 1.31| | 1.6.0 | TigerGraph >= 3.6.0 && TigerGraph <= 4.2.1|1.27, 1.28, 1.29, 1.30, 1.31| | 1.5.0 | TigerGraph >= 3.6.0 && TigerGraph <= 4.2.0|1.26, 1.27, 1.28, 1.29, 1.30| | 1.4.0 | TigerGraph >= 3.6.0 && TigerGraph <= 4.1.2|1.25, 1.26, 1.27, 1.28, 1.29| @@ -69,9 +70,12 @@ Once your deployment is complete, refer to the following documents for guidance - [Configure Cross-Region Replication on Kubernetes](../03-deploy/configure-crr-on-k8s.md) - [Upgrade the TigerGraph Cluster Using the TigerGraph Operator](../04-manage/tigergraph-upgrade.md) - [Enable TigerGraph Operator monitoring with Prometheus and Grafana](../05-monitor/tigergraph-monitor-with-prometheus-grafana.md) +- [TigerGraph Metrics Reference](../05-monitor/tigergraph-metrics-reference.md) - [Customize the backoff retries for cluster job operations](../04-manage/backoff-retries-for-cluster-job-operations.md) - [Node Repaving for TigerGraph on Kubernetes](../08-reference/node-repaving-for-tigergraph-on-k8s.md) - [Configure SSL Certificate for Ingress Service](../03-deploy/configure-ssl-certificate-for-ingress-service.md) +- [Running Custom Bash Scripts in a TigerGraph Cluster via Kubernetes Jobs](../04-manage/running-custom-bash-scripts-via-kubernetes-jobs.md) +- [Configure Nginx TLS](../03-deploy/configure-nginx-tls.md) In case issues arise and your cluster requires diagnosis, you have two valuable resources: @@ -87,7 +91,7 @@ For detailed information about the features, improvements, and bug fixes introdu When reporting issues, please provide the following details: -- **Setup Information**: Include details as specified in the [issue template](../06-FAQs/issue_report_template.md) +- **Setup Information**: Include details as specified in the [issue template](../07-FAQs/issue_report_template.md) - **Reproduction Steps**: Describe the scenario where the issue occurred, along with clear steps to reproduce it. - **Errors and Logs**: Share any relevant error messages or log outputs from the involved software. - **Additional Context**: Include any other details that might help in diagnosing the issue. diff --git a/k8s/docs/02-get-started/get_started.md b/k8s/docs/02-get-started/get_started.md index 526b53829..384f8b799 100644 --- a/k8s/docs/02-get-started/get_started.md +++ b/k8s/docs/02-get-started/get_started.md @@ -178,7 +178,7 @@ Before installing the kubectl-tg plugin, make sure you meet the following requir > If you are using Windows, please run the commands in a WSL environment. > Please refer to [Windows Subsystem for Linux Documentation](https://learn.microsoft.com/en-us/windows/wsl/) for more information. 
-Here's an example of installing the latest kubectl-tg, you can change the latest to your desired version, such as 1.6.0: +Here's an example of installing the latest kubectl-tg, you can change the latest to your desired version, such as 0.0.9: ```bash wget https://dl.tigergraph.com/k8s/latest/kubectl-tg -O kubectl-tg diff --git a/k8s/docs/03-deploy/configure-nginx-tls.md b/k8s/docs/03-deploy/configure-nginx-tls.md new file mode 100644 index 000000000..07f593298 --- /dev/null +++ b/k8s/docs/03-deploy/configure-nginx-tls.md @@ -0,0 +1,832 @@ +# Configure Nginx TLS + +This guide demonstrates how to configure TLS certificates for TigerGraph Nginx services to enable secure HTTPS access to TigerGraph clusters. The `NginxConfig` field allows you to configure TLS for Nginx running in TigerGraph Pods. + +- [Configure Nginx TLS](#configure-nginx-tls) + - [Generate SSL Certificate](#generate-ssl-certificate) + - [Create Kubernetes TLS Secret](#create-kubernetes-tls-secret) + - [Configure TigerGraph Cluster with Nginx TLS](#configure-tigergraph-cluster-with-nginx-tls) + - [LoadBalancer Service Type](#loadbalancer-service-type) + - [NodePort Service Type](#nodeport-service-type) + - [Ingress Service Type](#ingress-service-type) + - [Prerequisites](#prerequisites) + - [Configure Nginx TLS with Ingress Service Type](#configure-nginx-tls-with-ingress-service-type) + - [Update Nginx Config](#update-nginx-config) + - [Configure Nginx TLS by `kubectl-tg`](#configure-nginx-tls-by-kubectl-tg) + - [Configure Nginx mTLS](#configure-nginx-mtls) + - [Generate mTLS Certificates](#generate-mtls-certificates) + - [Create Kubernetes Secrets for mTLS](#create-kubernetes-secrets-for-mtls) + - [For LoadBalancer and NodePort Service Types](#for-loadbalancer-and-nodeport-service-types) + - [For Ingress Service Type](#for-ingress-service-type) + - [Configure TigerGraph Cluster with mTLS](#configure-tigergraph-cluster-with-mtls) + - [LoadBalancer Service Type with mTLS](#loadbalancer-service-type-with-mtls) + - [NodePort Service Type with mTLS](#nodeport-service-type-with-mtls) + - [Ingress Service Type with mTLS](#ingress-service-type-with-mtls) + - [Troubleshooting](#troubleshooting) + - [Self-signed certificate](#self-signed-certificate) + - [No alternative certificate subject name matches target host name](#no-alternative-certificate-subject-name-matches-target-host-name) + - [SAN validation failed - No SAN extension found in client certificate](#san-validation-failed---no-san-extension-found-in-client-certificate) + - [Client certificate verification failed](#client-certificate-verification-failed) + +## Generate SSL Certificate + +To enable TLS for Nginx, you must have a valid SSL certificate. Here we provide some commands to generate a self-signed certificate for Nginx TLS. Please replace the `your-domain.com` with your own domain. 
+ +```bash +# Configuration variables +export NAMESPACE=tigergraph +export CLUSTER_NAME=my-tigergraph-cluster +export DOMAIN=your-domain.com +export SECRET_NAME=nginx-tls-secret +export DAYS=365 +export KEY_FILE="$HOME/nginx_tls_${CLUSTER_NAME}.key" +export CRT_FILE="$HOME/nginx_tls_${CLUSTER_NAME}.crt" + +# Generate OpenSSL configuration +CONFIG_FILE=$(mktemp) +cat > $CONFIG_FILE <<-EOF +[req] +prompt = no +x509_extensions = san_env +distinguished_name = req_distinguished_name + +[req_distinguished_name] +countryName = US +stateOrProvinceName = California +localityName = Palo Alto +organizationName = TigerGraph +organizationalUnitName = Engineering +commonName = tigergraph.dev + +[san_env] +subjectAltName = DNS:$DOMAIN +EOF + +# Generate certificate +openssl req -x509 -nodes -days $DAYS -newkey rsa:2048 \ + -keyout $KEY_FILE -out $CRT_FILE \ + -config $CONFIG_FILE \ + -subj "/CN=tigergraph.dev" + +# Verify certificate +echo -e "\n=== Verifying certificate ===" +SAN_CHECK=$(openssl x509 -in $CRT_FILE -noout -text | grep "DNS:$DOMAIN" || true) + +if [ -n "$SAN_CHECK" ]; then + echo "SAN check passed: $SAN_CHECK" +else + echo "SAN doesn't generate successfully" + exit 1 +fi + +# Clean up temporary files +rm -f $CONFIG_FILE + +echo -e "\n=== Certificate generated successfully! ===" +echo "Key file: $KEY_FILE" +echo "Certificate file: $CRT_FILE" +``` + +## Create Kubernetes TLS Secret + +After generating the SSL certificate, create a Kubernetes TLS secret for Nginx: + +```bash +# Create TLS secret for Nginx +kubectl create secret tls $SECRET_NAME \ + --cert=$CRT_FILE \ + --key=$KEY_FILE \ + --namespace $NAMESPACE +``` + +Verify the secret was created successfully: + +```bash +# List secrets in the namespace +kubectl get secrets -n $NAMESPACE + +# Describe the TLS secret +kubectl describe secret $SECRET_NAME -n $NAMESPACE +``` + +Expected output: +``` +Name: nginx-tls-secret +Namespace: tigergraph +Labels: +Annotations: + +Type: kubernetes.io/tls + +Data +==== +tls.crt: 1476 bytes +tls.key: 1704 bytes +``` + +## Configure TigerGraph Cluster with Nginx TLS + +For different service types, you need to configure the Nginx TLS differently. + +### LoadBalancer Service Type + +When using LoadBalancer service type, the `NginxConfig.SecretName` enables TLS for Nginx. The LoadBalancer service will expose the Nginx service directly, and Nginx will handle TLS termination. 
+ +```yaml +apiVersion: graphdb.tigergraph.com/v1alpha1 +kind: TigerGraph +metadata: + name: test-cluster + namespace: tigergraph +spec: + image: docker.io/tigergraph/tigergraph-k8s:4.3.0 + imagePullPolicy: IfNotPresent + imagePullSecrets: + - name: tigergraph-image-pull-secret + ha: 1 + license: xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx + listener: + type: LoadBalancer + # Configure Nginx TLS + nginxConfig: + secretName: nginx-tls-secret + privateKeyName: ssh-key-secret + replicas: 3 + resources: + limits: + cpu: "6" + memory: 12Gi + requests: + cpu: "6" + memory: 12Gi + storage: + type: persistent-claim + volumeClaimTemplate: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 50G + storageClassName: standard + volumeMode: Filesystem +``` + +**Verify the Access to Nginx** + +First, let's get the address of service: + +```bash +kubectl get svc ${CLUSTER_NAME}-nginx-external-service -n ${NAMESPACE} +``` + +The output will be like: +```text +NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE +test-cluster-nginx-external-service LoadBalancer 34.118.232.158 34.67.253.147 14240:31078/TCP 3m26s +``` + +Then you can verify the access to Nginx by running the following command: + +```bash +export EXTERNAL_IP=34.67.253.147 +curl -k https://${EXTERNAL_IP}:14240/api/ping +``` + +> [!NOTE] +> We are using `-k` option to bypass the certificate verification. To access the service with certificate verification, you need to configure a DNS to point to the Nginx service. For example, you can create a DNS record like: +> ```text +> Type: A +> Host: your-domain.com +> Value: 34.67.253.147 +> ``` +> +> Then you can access the service with certificate verification by running the following command: +> ```bash +> curl https://your-domain.com:14240/api/ping --cacert $CRT_FILE +> ``` + + +### NodePort Service Type + +When using NodePort service type, the `NginxConfig.SecretName` enables TLS for Nginx. The NodePort service will expose the Nginx service directly, and Nginx will handle TLS termination. + +```yaml +apiVersion: graphdb.tigergraph.com/v1alpha1 +kind: TigerGraph +metadata: + name: test-cluster + namespace: tigergraph +spec: + image: docker.io/tigergraph/tigergraph-k8s:4.3.0 + imagePullPolicy: IfNotPresent + imagePullSecrets: + - name: tigergraph-image-pull-secret + ha: 1 + license: xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx + listener: + type: NodePort + # Configure Nginx NodePort + nginxNodePort: 30241 + # Configure Nginx TLS + nginxConfig: + secretName: nginx-tls-secret + privateKeyName: ssh-key-secret + replicas: 3 + resources: + limits: + cpu: "6" + memory: 12Gi + requests: + cpu: "6" + memory: 12Gi + storage: + type: persistent-claim + volumeClaimTemplate: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 50G + storageClassName: standard + volumeMode: Filesystem +``` + +**Verify the Access to Nginx** + +First, let's get the external IP of Nodes: +```bash +kubectl get nodes -o wide +``` + +The output will be like: +```text +NAME STATUS ROLES AGE VERSION INTERNAL-IP EXTERNAL-IP OS-IMAGE KERNEL-VERSION CONTAINER-RUNTIME +gke-tg-gke-1024-default-pool-29f86fa3-3502 Ready 71m v1.33.3-gke.1136000 10.128.0.69 34.61.161.217 Container-Optimized OS from Google 6.6.93+ containerd://2.0.4 +gke-tg-gke-1024-default-pool-29f86fa3-7lqh Ready 71m v1.33.3-gke.1136000 10.128.0.70 34.133.63.247 Container-Optimized OS from Google 6.6.93+ containerd://2.0.4 +``` + +You can use any of the external IP to verify the access to Nginx. 
+ +```bash +export EXTERNAL_IP=34.61.161.217 +export NODE_PORT=30241 # Use the nginxNodePort you configured +curl -k https://${EXTERNAL_IP}:${NODE_PORT}/api/ping +``` + +> [!NOTE] +> We are using `-k` option to bypass the certificate verification. To access the service with certificate verification, you need to configure a DNS to point to the Nginx service. For example, you can create a DNS record like: +> ```text +> Type: A +> Host: your-domain.com +> Value: 34.61.161.217 +> ``` +> +> Then you can access the service with certificate verification by running the following command: +> ```bash +> curl https://your-domain.com:${NODE_PORT}/api/ping --cacert $CRT_FILE +> ``` + +### Ingress Service Type + +#### Prerequisites + +Before using Ingress service type, you need to install an Ingress Controller in the Kubernetes cluster and configure the DNS record to point to the Ingress Controller. Here we take Nginx Ingress Controller as an example. +Please refer to [Nginx Ingress Controller](https://kubernetes.github.io/ingress-nginx/deploy/) for how to install the Nginx Ingress Controller. + +After the Ingress Controller is installed, you can get the address of the Ingress Controller by running the following command: + +```bash +kubectl get svc ingress-nginx-controller -n ingress-nginx +``` + +The output will be like: + +```text +NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE +ingress-nginx-controller LoadBalancer 34.118.236.161 34.31.217.219 80:31346/TCP,443:30511/TCP 89s +``` + +The `EXTERNAL-IP` of service `ingress-nginx-controller` is the address of the Ingress Controller. you need to create a DNS record and resolve the **base domain** to the Ingress Controller. +If the address of the ingress controller is an IP address, please create a DNS record like: + +```text +Type: A +Host: your-domain.com +Value: 34.31.217.219 +``` + +If you are using EKS, the address of the Ingress Controller may be a domain name. In this case, you can create a CNAME record like: + +```text +Type: CNAME +Host: your-domain.com +Value: a1b2c3d4e5f6g7.here.amazonaws.com +``` + +#### Configure Nginx TLS with Ingress Service Type + +When using Ingress service type, you need to configure both Ingress TLS termination and Nginx TLS termination. This creates a double TLS setup where: +1. Ingress terminates TLS for external traffic +2. 
Nginx terminates TLS for internal traffic between Ingress and TigerGraph services + +```yaml +apiVersion: graphdb.tigergraph.com/v1alpha1 +kind: TigerGraph +metadata: + name: test-cluster + namespace: tigergraph +spec: + image: docker.io/tigergraph/tigergraph-k8s:4.3.0 + imagePullPolicy: IfNotPresent + imagePullSecrets: + - name: tigergraph-image-pull-secret + ha: 1 + license: xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx + listener: + type: Ingress + ingressClassName: nginx + nginxHost: your-domain.com + # Required: Set a SecretName for Ingress TLS termination + secretName: nginx-tls-secret + # Required: Nginx TLS secret for internal traffic + nginxConfig: + secretName: nginx-tls-secret + privateKeyName: ssh-key-secret + replicas: 3 + resources: + limits: + cpu: "6" + memory: 12Gi + requests: + cpu: "6" + memory: 12Gi + storage: + type: persistent-claim + volumeClaimTemplate: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 50G + storageClassName: standard + volumeMode: Filesystem +``` + +**Key points for Ingress configuration:** +- Both `listener.secretName` and `nginxConfig.secretName` are required +- `listener.secretName` is used for Ingress TLS termination (external traffic) +- `nginxConfig.secretName` is used for Nginx TLS termination (internal traffic) +- The backend protocol is automatically set to HTTPS when `nginxConfig.secretName` is configured +- You are allowed to use a different certificate for `listener.secretName` and `nginxConfig.secretName`, you need to create two separate TLS secrets. But we recommend you to use the same certificate for both. + +**Verify the Access to Nginx** + +Use the following command to verify the access to Nginx: +```bash +curl https://your-domain.com/api/ping --cacert $CRT_FILE +``` + +### Update Nginx Config + +You may want to update the Nginx Config of a cluster in the following scenarios: + +1. Enable/Disable Nginx TLS for a running cluster. + +2. Change the certificate for Nginx TLS. + +To update the Nginx Config of a cluster, you just need to update the `nginxConfig` field of the CR. If you want to renew the certificate, you just need to change the content of the secret. +After you apply the CR or update the secret, a config-update job will be created to update the configurations of Nginx. + +### Configure Nginx TLS by `kubectl-tg` + +If you are using `kubectl tg` command to deploy/manage your cluster, you can use option `--nginx-secret-name` to set/update the `NginxConfig.SecretName` of the cluster. + +```bash +# create cluster with Nginx TLS +kubectl tg create --cluster-name test-cluster -n tigergraph --nginx-secret-name nginx-tls-secret ${OTHER_OPTIONS} +# update cluster with Nginx TLS +kubectl tg update --cluster-name test-cluster -n tigergraph --nginx-secret-name nginx-tls-secret ${OTHER_OPTIONS} +``` + +## Configure Nginx mTLS + +This section demonstrates how to configure mutual TLS (mTLS) for TigerGraph Nginx services. With mTLS enabled, Nginx will verify client certificates in addition to providing server certificates, ensuring both client and server authenticate each other. + +### Generate mTLS Certificates + +To enable mTLS, you need to generate a root CA certificate and client certificates based on that root CA. 
Here we provide commands to generate self-signed certificates for mTLS: + +```bash +# Configuration variables +export NAMESPACE=tigergraph +export CLUSTER_NAME=my-tigergraph-cluster +export DOMAIN=your-domain.com +export CA_SECRET_NAME=nginx-mtls-ca-secret +export CLIENT_SECRET_NAME=nginx-mtls-client-secret +export DAYS=365 +export CA_KEY_FILE="$HOME/nginx_mtls_ca_${CLUSTER_NAME}.key" +export CA_CRT_FILE="$HOME/nginx_mtls_ca_${CLUSTER_NAME}.crt" +export CLIENT_KEY_FILE="$HOME/nginx_mtls_client_${CLUSTER_NAME}.key" +export CLIENT_CRT_FILE="$HOME/nginx_mtls_client_${CLUSTER_NAME}.crt" + +# Step 1: Generate Root CA certificate +echo "=== Generating Root CA certificate ===" +openssl req -x509 -nodes -days $DAYS -newkey rsa:2048 \ + -keyout $CA_KEY_FILE -out $CA_CRT_FILE \ + -subj "/C=US/ST=California/L=Palo Alto/O=TigerGraph/OU=Engineering/CN=TigerGraph-CA" + +# Step 2: Generate client certificate signing request with SAN extension +echo "=== Generating client certificate signing request ===" +# Create OpenSSL configuration for client certificate with SAN +CLIENT_CONFIG_FILE=$(mktemp) +cat > $CLIENT_CONFIG_FILE <<-EOF +[req] +prompt = no +distinguished_name = req_distinguished_name +req_extensions = v3_req + +[req_distinguished_name] +C = US +ST = California +L = Palo Alto +O = TigerGraph +OU = Engineering +CN = TigerGraph-Client + +[v3_req] +basicConstraints = CA:FALSE +keyUsage = nonRepudiation, digitalSignature, keyEncipherment +subjectAltName = @alt_names + +[alt_names] +DNS.1 = TigerGraph-Client +DNS.2 = localhost +IP.1 = 127.0.0.1 +EOF + +openssl req -new -nodes -newkey rsa:2048 \ + -keyout $CLIENT_KEY_FILE -out $CLIENT_CRT_FILE.csr \ + -config $CLIENT_CONFIG_FILE + +# Step 3: Sign client certificate with Root CA +echo "=== Signing client certificate with Root CA ===" +openssl x509 -req -in $CLIENT_CRT_FILE.csr -CA $CA_CRT_FILE -CAkey $CA_KEY_FILE \ + -CAcreateserial -out $CLIENT_CRT_FILE -days $DAYS \ + -extensions v3_req -extfile $CLIENT_CONFIG_FILE + +# Step 4: Verify certificates +echo "=== Verifying certificates ===" +echo "Root CA certificate:" +openssl x509 -in $CA_CRT_FILE -noout -subject -issuer + +echo "Client certificate:" +openssl x509 -in $CLIENT_CRT_FILE -noout -subject -issuer + +# Clean up temporary files +rm -f $CLIENT_CRT_FILE.csr $CLIENT_CONFIG_FILE + +echo -e "\n=== mTLS certificates generated successfully! ===" +echo "Root CA key file: $CA_KEY_FILE" +echo "Root CA certificate file: $CA_CRT_FILE" +echo "Client key file: $CLIENT_KEY_FILE" +echo "Client certificate file: $CLIENT_CRT_FILE" +``` + +### Create Kubernetes Secrets for mTLS + +After generating the mTLS certificates, create the appropriate Kubernetes secrets based on your service type. + +#### For LoadBalancer and NodePort Service Types + +For LoadBalancer and NodePort service types, you only need to create a secret containing the CA certificate. This certificate will be configured in Nginx to verify client certificates. 
+ +```bash +# Create CA secret for mTLS verification +kubectl create secret generic $CA_SECRET_NAME \ + --from-file=ca.crt=$CA_CRT_FILE \ + --namespace $NAMESPACE +``` + +Verify the secret was created successfully: + +```bash +# List secrets in the namespace +kubectl get secrets -n $NAMESPACE + +# Describe the CA secret +kubectl describe secret $CA_SECRET_NAME -n $NAMESPACE +``` + +Expected output: +``` +Name: nginx-mtls-ca-secret +Namespace: tigergraph +Labels: +Annotations: + +Type: Opaque + +Data +==== +ca.crt: 1476 bytes +``` + +#### For Ingress Service Type + +For Ingress service type, you need to create a single secret containing both the CA certificate and client certificate/key. This secret will be used by both Nginx (for client certificate verification) and the Ingress controller (for presenting client certificates when connecting to Nginx). + +```bash +# Create combined mTLS secret with CA certificate and client certificate/key +kubectl create secret generic $CA_SECRET_NAME \ + --from-file=ca.crt=$CA_CRT_FILE \ + --from-file=tls.crt=$CLIENT_CRT_FILE \ + --from-file=tls.key=$CLIENT_KEY_FILE \ + --namespace $NAMESPACE +``` + +Verify the secret was created successfully: + +```bash +# List secrets in the namespace +kubectl get secrets -n $NAMESPACE + +# Describe the mTLS secret +kubectl describe secret $CA_SECRET_NAME -n $NAMESPACE +``` + +Expected output: +``` +Name: nginx-mtls-ca-secret +Namespace: tigergraph +Labels: +Annotations: + +Type: Opaque + +Data +==== +ca.crt: 1476 bytes +tls.crt: 1476 bytes +tls.key: 1704 bytes +``` + +> [!IMPORTANT] +> The secret must contain the exact keys `ca.crt`, `tls.crt`, and `tls.key`. +> The name of these keys cannot be customized. + +### Configure TigerGraph Cluster with mTLS + +To enable mTLS for your TigerGraph cluster, configure the `ClientCertSecretName` field in the `NginxConfig` section of your TigerGraph custom resource. 
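+Regardless of the service type, enabling mTLS is a small addition on top of the TLS setup: alongside the server certificate secret, you reference the CA secret created above. Below is a minimal sketch of the relevant `nginxConfig` stanza only, assuming the secret names used in the previous steps; complete manifests for each service type follow.
+
+```yaml
+spec:
+  # Sketch of the mTLS-related fields only; see the full examples below
+  nginxConfig:
+    secretName: nginx-tls-secret                 # server certificate for TLS termination
+    clientCertSecretName: nginx-mtls-ca-secret   # CA certificate used to verify client certificates (enables mTLS)
+```
+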
+ +#### LoadBalancer Service Type with mTLS + +```yaml +apiVersion: graphdb.tigergraph.com/v1alpha1 +kind: TigerGraph +metadata: + name: test-cluster + namespace: tigergraph +spec: + image: docker.io/tigergraph/tigergraph-k8s:4.3.0 + imagePullPolicy: IfNotPresent + imagePullSecrets: + - name: tigergraph-image-pull-secret + ha: 1 + license: xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx + listener: + type: LoadBalancer + # Configure Nginx TLS and mTLS + nginxConfig: + secretName: nginx-tls-secret # Server certificate for TLS + clientCertSecretName: nginx-mtls-ca-secret # CA certificate for mTLS verification + privateKeyName: ssh-key-secret + replicas: 3 + resources: + limits: + cpu: "6" + memory: 12Gi + requests: + cpu: "6" + memory: 12Gi + storage: + type: persistent-claim + volumeClaimTemplate: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 50G + storageClassName: standard + volumeMode: Filesystem +``` + +#### NodePort Service Type with mTLS + +```yaml +apiVersion: graphdb.tigergraph.com/v1alpha1 +kind: TigerGraph +metadata: + name: test-cluster + namespace: tigergraph +spec: + image: docker.io/tigergraph/tigergraph-k8s:4.3.0 + imagePullPolicy: IfNotPresent + imagePullSecrets: + - name: tigergraph-image-pull-secret + ha: 1 + license: xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx + listener: + type: NodePort + nginxNodePort: 30241 + # Configure Nginx TLS and mTLS + nginxConfig: + secretName: nginx-tls-secret # Server certificate for TLS + clientCertSecretName: nginx-mtls-ca-secret # CA certificate for mTLS verification + privateKeyName: ssh-key-secret + replicas: 3 + resources: + limits: + cpu: "6" + memory: 12Gi + requests: + cpu: "6" + memory: 12Gi + storage: + type: persistent-claim + volumeClaimTemplate: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 50G + storageClassName: standard + volumeMode: Filesystem +``` + +#### Ingress Service Type with mTLS + +```yaml +apiVersion: graphdb.tigergraph.com/v1alpha1 +kind: TigerGraph +metadata: + name: test-cluster + namespace: tigergraph +spec: + image: docker.io/tigergraph/tigergraph-k8s:4.3.0 + imagePullPolicy: IfNotPresent + imagePullSecrets: + - name: tigergraph-image-pull-secret + ha: 1 + license: xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx + listener: + type: Ingress + ingressClassName: nginx + nginxHost: your-domain.com + # Required: Set a SecretName for Ingress TLS termination + secretName: nginx-tls-secret + # Configure Nginx TLS and mTLS + nginxConfig: + secretName: nginx-tls-secret # Server certificate for TLS + clientCertSecretName: nginx-mtls-ca-secret # CA certificate for mTLS verification + privateKeyName: ssh-key-secret + replicas: 3 + resources: + limits: + cpu: "6" + memory: 12Gi + requests: + cpu: "6" + memory: 12Gi + storage: + type: persistent-claim + volumeClaimTemplate: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 50G + storageClassName: standard + volumeMode: Filesystem +``` + +**Key points for mTLS configuration:** +- `nginxConfig.secretName` is used for server TLS certificate (required for HTTPS) +- `nginxConfig.clientCertSecretName` is used for client certificate verification (enables mTLS) +- For LoadBalancer and NodePort: Only CA certificate is needed in the client secret +- For Ingress: Both CA certificate and client certificate/key are needed in the same secret +- When mTLS is enabled, all client requests must present a valid client certificate signed by the CA + +**Verify mTLS Access** + +To verify mTLS is working correctly, you need to use the client certificate when making 
requests: + +```bash +# For LoadBalancer/NodePort services +curl --cert $CLIENT_CRT_FILE --key $CLIENT_KEY_FILE https://your-service-address:port/api/ping + +# For Ingress services +curl --cert $CLIENT_CRT_FILE --key $CLIENT_KEY_FILE https://your-domain.com/api/ping +``` + +## Troubleshooting + +When you are not able to access the Nginx service with TLS/mTLS enabled, please run `curl` command with `-v` option to get the detailed error message. Here are some common error messages and solutions: + +### Self-signed certificate + +If you are using a self-signed certificate, you may encounter the following error: + +``` +curl: (60) SSL certificate problem: self-signed certificate +``` + +**Solution**: + +1. You can use `-k` option to bypass the certificate verification. + +2. You can use `--cacert` option to specify the certificate file. + +### No alternative certificate subject name matches target host name + +If you use the address of LoadBalancer or NodePort service, you may encounter the following error: + +``` +curl: (60) SSL: no alternative certificate subject name matches target host name '34.61.161.217' +``` + +When you create the certificate, you don't know the IP address of the LoadBalancer or NodePort service, so you can't add the IP address to the certificate. That's why you encounter this error. + +**Solution**: + +1. You can use `-k` option to bypass the certificate verification. + +2. You can configure a DNS record to point to the LoadBalancer or NodePort service. Use the domain name that you configured in the certificate. + +3. Generate a certificate with the IP address of the LoadBalancer or NodePort service. And update the Nginx TLS secret with the new certificate. + +### SAN validation failed - No SAN extension found in client certificate + +If you encounter the following error when using mTLS: + +``` +Forbidden: SAN validation failed - No SAN extension found in client certificate +``` + +This error occurs because the client certificate doesn't have a Subject Alternative Name (SAN) extension, which is required for proper certificate validation. + +**Solution**: + +1. **Regenerate the client certificate with SAN extension**: Use the certificate generation script provided in the [Generate mTLS Certificates](#generate-mtls-certificates) section, which includes proper SAN configuration. + +2. **Verify the certificate has SAN extension**: You can check if your existing client certificate has SAN extension by running: + ```bash + openssl x509 -in $CLIENT_CRT_FILE -noout -text | grep -A 5 "Subject Alternative Name" + ``` + +3. **Update the secret with the new certificate**: After regenerating the certificate with SAN extension, update your Kubernetes secret: + ```bash + kubectl create secret generic $CA_SECRET_NAME \ + --from-file=ca.crt=$CA_CRT_FILE \ + --from-file=tls.crt=$CLIENT_CRT_FILE \ + --from-file=tls.key=$CLIENT_KEY_FILE \ + --namespace $NAMESPACE \ + --dry-run=client -o yaml | kubectl apply -f - + ``` + +### Client certificate verification failed + +If you encounter the following error when using mTLS: + +``` +Forbidden: Client certificate verification failed +``` + +This error occurs when the client certificate cannot be verified by the server. This can happen for several reasons. + +**Common causes and solutions**: + +1. 
**Client certificate is not signed by the trusted CA**: + - **Check**: Verify that your client certificate is signed by the CA certificate configured in the `clientCertSecretName` secret: + ```bash + # Check the issuer of the client certificate + openssl x509 -in $CLIENT_CRT_FILE -noout -issuer + + # Check the subject of the CA certificate + openssl x509 -in $CA_CRT_FILE -noout -subject + ``` + - **Solution**: Regenerate the client certificate using the correct CA certificate. + +2. **Wrong CA certificate in the secret**: + - **Check**: Verify the CA certificate in the Kubernetes secret matches the one used to sign the client certificate: + ```bash + # Extract and check the CA certificate from the secret + kubectl get secret $CA_SECRET_NAME -n $NAMESPACE -o jsonpath='{.data.ca\.crt}' | base64 -d | openssl x509 -noout -subject + ``` + - **Solution**: Update the secret with the correct CA certificate that was used to sign the client certificate. + +3. **Client certificate has expired**: + - **Check**: Verify the certificate validity period: + ```bash + openssl x509 -in $CLIENT_CRT_FILE -noout -dates + ``` + - **Solution**: Generate a new client certificate with a valid expiration date. diff --git a/k8s/docs/03-deploy/configure-ssl-certificate-for-ingress-service.md b/k8s/docs/03-deploy/configure-ssl-certificate-for-ingress-service.md index f574eeaf7..c53beb7da 100644 --- a/k8s/docs/03-deploy/configure-ssl-certificate-for-ingress-service.md +++ b/k8s/docs/03-deploy/configure-ssl-certificate-for-ingress-service.md @@ -16,7 +16,6 @@ - [Ingress Issues](#ingress-issues) - [How to View Ingress Controller Logs](#how-to-view-ingress-controller-logs) - [Common Ingress controller error logs and solutions](#common-ingress-controller-error-logs-and-solutions) - - [Error 1: Invalid TLS Secret (Domain Mismatch)](#error-1-invalid-tls-secret-domain-mismatch) - [See also](#see-also) ## Overview @@ -281,7 +280,7 @@ kubectl logs -n ingress-nginx deployment/ingress-nginx-controller | grep "test-c #### Common Ingress controller error logs and solutions -##### Error 1: Invalid TLS Secret (Domain Mismatch) + **Error 1: Invalid TLS Secret (Domain Mismatch)** ``` W0805 09:17:39.123456 1 controller.go:1234] Error getting SSL certificate "tigergraph/ingress-secret": local SSL certificate "tigergraph/ingress-secret" is invalid: x509: certificate is not valid for any names, but wanted to match "test-nginx.helm.tigergraph.dev" ``` diff --git a/k8s/docs/03-deploy/customize-tigergraph-pod.md b/k8s/docs/03-deploy/customize-tigergraph-pod.md index fb6ebb232..830dc44b9 100644 --- a/k8s/docs/03-deploy/customize-tigergraph-pod.md +++ b/k8s/docs/03-deploy/customize-tigergraph-pod.md @@ -177,7 +177,7 @@ kind: TigerGraph metadata: name: test-cluster spec: - image: docker.io/tigergraph/tigergraph-k8s:4.2.1 + image: docker.io/tigergraph/tigergraph-k8s:4.1.2 imagePullPolicy: IfNotPresent imagePullSecrets: - name: tigergraph-image-pull-secret diff --git a/k8s/docs/03-deploy/deploy-operator-with-helm.md b/k8s/docs/03-deploy/deploy-operator-with-helm.md index 685d44611..de60431b2 100644 --- a/k8s/docs/03-deploy/deploy-operator-with-helm.md +++ b/k8s/docs/03-deploy/deploy-operator-with-helm.md @@ -77,8 +77,8 @@ Install the Operator in a dedicated namespace. 
You can customize the configurati ```yaml # values.yaml example replicas: 3 -image: docker.io/tigergraph/tigergraph-k8s-operator:1.6.0 -jobImage: docker.io/tigergraph/tigergraph-k8s-init:1.6.0 +image: docker.io/tigergraph/tigergraph-k8s-operator:1.7.0 +jobImage: docker.io/tigergraph/tigergraph-k8s-init:1.7.0 pullPolicy: IfNotPresent imagePullSecret: tigergraph-image-pull-secret watchNameSpaces: "" diff --git a/k8s/docs/03-deploy/tigergraph-on-aks.md b/k8s/docs/03-deploy/tigergraph-on-aks.md index 3e509d969..497aed4e9 100644 --- a/k8s/docs/03-deploy/tigergraph-on-aks.md +++ b/k8s/docs/03-deploy/tigergraph-on-aks.md @@ -81,7 +81,7 @@ The `kubectl-tg` plugin simplifies deploying and managing the Operator and Tiger > If you are using Windows, please run the commands in a WSL environment. > Please refer to [Windows Subsystem for Linux Documentation](https://learn.microsoft.com/en-us/windows/wsl/) for more information. -Here's an example of installing the latest kubectl-tg, you can change the latest to your desired version, such as 1.6.0: +Here's an example of installing the latest kubectl-tg, you can change the latest to your desired version, such as 1.2.0: ```bash wget https://dl.tigergraph.com/k8s/latest/kubectl-tg -O kubectl-tg diff --git a/k8s/docs/03-deploy/tigergraph-on-eks.md b/k8s/docs/03-deploy/tigergraph-on-eks.md index b0f4fe392..93a5d1c9a 100644 --- a/k8s/docs/03-deploy/tigergraph-on-eks.md +++ b/k8s/docs/03-deploy/tigergraph-on-eks.md @@ -97,7 +97,7 @@ The `kubectl-tg` plugin allows you to deploy and manage the Operator and TigerGr > If you are using Windows, please run the commands in a WSL environment. > Please refer to [Windows Subsystem for Linux Documentation](https://learn.microsoft.com/en-us/windows/wsl/) for more information. -Here's an example of installing the latest kubectl-tg, you can change the latest to your desired version, such as 1.6.0: +Here's an example of installing the latest kubectl-tg, you can change the latest to your desired version, such as 0.0.9: ```bash wget https://dl.tigergraph.com/k8s/latest/kubectl-tg -O kubectl-tg diff --git a/k8s/docs/03-deploy/tigergraph-on-gke.md b/k8s/docs/03-deploy/tigergraph-on-gke.md index 5cbbf5c0c..2cca9d4f9 100644 --- a/k8s/docs/03-deploy/tigergraph-on-gke.md +++ b/k8s/docs/03-deploy/tigergraph-on-gke.md @@ -81,7 +81,7 @@ The `kubectl-tg` plugin simplifies deploying and managing the Operator and Tiger > If you are using Windows, please run the commands in a WSL environment. > Please refer to [Windows Subsystem for Linux Documentation](https://learn.microsoft.com/en-us/windows/wsl/) for more information. -Here's an example of installing the latest kubectl-tg, you can change the latest to your desired version, such as 1.6.0: +Here's an example of installing the latest kubectl-tg, you can change the latest to your desired version, such as 0.0.9: ```bash wget https://dl.tigergraph.com/k8s/latest/kubectl-tg -O kubectl-tg diff --git a/k8s/docs/03-deploy/tigergraph-on-openshift.md b/k8s/docs/03-deploy/tigergraph-on-openshift.md index 3bb00743f..2b36e3b42 100644 --- a/k8s/docs/03-deploy/tigergraph-on-openshift.md +++ b/k8s/docs/03-deploy/tigergraph-on-openshift.md @@ -333,7 +333,7 @@ kubectl-tg is a plugin for deploying and managing the Operator and TigerGraph cl > If you are using Windows, please run the commands in a WSL environment. > Please refer to [Windows Subsystem for Linux Documentation](https://learn.microsoft.com/en-us/windows/wsl/) for more information. 
-Here's an example of installing the latest kubectl-tg, you can change the latest to your desired version, such as 1.6.0: +Here's an example of installing the latest kubectl-tg, you can change the latest to your desired version, such as 0.0.9: ```bash wget https://dl.tigergraph.com/k8s/latest/kubectl-tg -O kubectl-tg diff --git a/k8s/docs/04-manage/backoff-retries-for-cluster-job-operations.md b/k8s/docs/04-manage/backoff-retries-for-cluster-job-operations.md index ca83b05eb..87ea4ea63 100644 --- a/k8s/docs/04-manage/backoff-retries-for-cluster-job-operations.md +++ b/k8s/docs/04-manage/backoff-retries-for-cluster-job-operations.md @@ -77,7 +77,7 @@ Add three new options in kubectl-tg plugin to support configuring those configur To customize the backoff retries during cluster initialization, run the following command: ```bash -kubectl tg create --cluster-name ${YOUR_CLUSTER_NAME} --private-key-secret ${YOUR_SSH_KEY_SECRET_NAME} --size 4 --ha 2 --version 4.2.1 --license ${LICENSE} \ +kubectl tg create --cluster-name ${YOUR_CLUSTER_NAME} --private-key-secret ${YOUR_SSH_KEY_SECRET_NAME} --size 4 --ha 2 --version 4.1.2 --license ${LICENSE} \ --storage-class standard --storage-size 10G --cpu 6000m --memory 12Gi --listener-type LoadBalancer \ --min-job-retry-duration '1m' --max-job-retry-duration '30m' --max-job-retry-times 'expand-job=6,shrink-pre-job=6,initialize-job=3' --namespace ${YOUR_NAMESPACE} ``` @@ -224,7 +224,7 @@ Status: Reason: ClusterInitializePostFalse Status: False Type: InitializePost - Image: docker.io/tigergraph/tigergraph-k8s:4.2.1 + Image: docker.io/tigergraph/tigergraph-k8s:4.2.1-ubuntu22 Pod Init Labels: tigergraph.com/cluster-name: test-cluster tigergraph.com/cluster-pod: test-cluster @@ -254,7 +254,7 @@ Once the job retry attempts reach the maximum number of retries, the TigerGraph Reason: ClusterInitializePostFalse Status: False Type: InitializePost - Image: docker.io/tigergraph/tigergraph-k8s:4.2.1 + Image: docker.io/tigergraph/tigergraph-k8s:4.2.1-ubuntu22 Job Backoff Times: 3 Pod Init Labels: tigergraph.com/cluster-name: test-cluster diff --git a/k8s/docs/04-manage/backup-and-restore/backup-restore-by-cr.md b/k8s/docs/04-manage/backup-and-restore/backup-restore-by-cr.md index 9321b069c..e247ac2bb 100644 --- a/k8s/docs/04-manage/backup-and-restore/backup-restore-by-cr.md +++ b/k8s/docs/04-manage/backup-and-restore/backup-restore-by-cr.md @@ -31,6 +31,7 @@ - [Use RoleARN instead of access key to access S3 Bucket in TigerGraphRestore](#use-rolearn-instead-of-access-key-to-access-s3-bucket-in-tigergraphrestore) - [Restore from backup in GCS bucket](#restore-from-backup-in-gcs-bucket) - [Restore from backup in ABS container](#restore-from-backup-in-abs-container) + - [Point-in-Time Restore (Supported from TigerGraph 4.2.0)](#point-in-time-restore-supported-from-tigergraph-420) - [Cross-cluster restore in existing cluster](#cross-cluster-restore-in-existing-cluster) - [Cluster version \>=3.9.2](#cluster-version-392) - [Clone a cluster(Create a new cluster and do cross-cluster restore)](#clone-a-clustercreate-a-new-cluster-and-do-cross-cluster-restore) @@ -139,6 +140,8 @@ spec: # Optional: Set the tag of the backup, if not set, the tag will be the name of this CR # Note: this field is Required for TigerGraph Operator < 1.1.0 tag: local + # Optional: Specify the base backup tag for incremental backup (only used when incremental is true) + base: "" # Optional: Set the path for temporary staging files stagingPath: /home/tigergraph/tigergraph/data # Optional: If 'incremental' 
is set to true, incremental backup will be performed @@ -758,6 +761,9 @@ spec: restoreConfig: # We can use tag to restore from backup in the same cluster tag: daily-2021-11-04T120000 + # Optional: Specify timepoint for point-in-time restore in time format (e.g., 2025-01-15T14:30:00Z), requires TigerGraph >= 4.2.0 + # Note: Only one of tag, meta, or timePoint can be specified + timePoint: "" # Optional stagingPath: /home/tigergraph/tigergraph/data/restore-staging # Optional: (TigerGraph Operator>=0.0.9 and TigerGraph>=3.9.3) should be >=0 @@ -906,6 +912,42 @@ spec: # the format is like "5s","10m","1h","1h20m5s" maxRetryDuration: 10s ``` +### Point-in-Time Restore (Supported from TigerGraph 4.2.0) + +> [!IMPORTANT] +> Point-in-time restore is supported from TigerGraph version 4.2.0 and requires backups with time coverage information. + +Point-in-time restore allows you to restore your cluster to a specific timestamp rather than to a specific backup. This is useful when you need to recover data to a precise moment in time. + +```yaml +apiVersion: graphdb.tigergraph.com/v1alpha1 +kind: TigerGraphRestore +metadata: + name: restore-timepoint +spec: + restoreConfig: + # Specify the time point for restore in RFC3339 format, the selected time point must be valid. + timePoint: "2025-01-15T14:30:00Z" + # Optional + stagingPath: /home/tigergraph/tigergraph/data/restore-staging + decompressProcessNumber: 2 + source: + storage: s3Bucket + s3Bucket: + bucketName: operator-backup + secretKeyName: s3-secret + # Specify the name of cluster + clusterName: test-cluster + # Optional: Set the retry policy for restore CR + backoffRetryPolicy: + maxRetryTimes: 3 + minRetryDuration: 5s + maxRetryDuration: 10s +``` + +> [!NOTE] +> The timepoint must be within the coverage range of your available backups. The operator will validate the timepoint format but will not verify coverage - this is handled by the TigerGraph engine during restore. + ### Cross-cluster restore in existing cluster diff --git a/k8s/docs/04-manage/backup-and-restore/backup-restore-by-kubectl-tg.md b/k8s/docs/04-manage/backup-and-restore/backup-restore-by-kubectl-tg.md index 4a7246e60..5fde56a93 100644 --- a/k8s/docs/04-manage/backup-and-restore/backup-restore-by-kubectl-tg.md +++ b/k8s/docs/04-manage/backup-and-restore/backup-restore-by-kubectl-tg.md @@ -49,6 +49,7 @@ If you have experience with Custom Resources in Kubernetes (K8S), you can levera - [Use backup in s3 bucket with RoleARN (Supported from operator 1.2.0 and TigerGraph 4.1.0)](#use-backup-in-s3-bucket-with-rolearn-supported-from-operator-120-and-tigergraph-410) - [Use backup in GCS bucket (Supported from TigerGraph Operator 1.6.0 and TigerGraph 4.2.1)](#use-backup-in-gcs-bucket-supported-from-tigergraph-operator-160-and-tigergraph-421) - [Use backup in ABS container (Supported from TigerGraph Operator 1.6.0 and TigerGraph 4.2.1)](#use-backup-in-abs-container-supported-from-tigergraph-operator-160-and-tigergraph-421) + - [Point-in-Time Restore (Supported from TigerGraph 4.2.0)](#point-in-time-restore-supported-from-tigergraph-420) - [Cross-Cluster Restore from Backup](#cross-cluster-restore-from-backup) - [Clone Cluster from Backup](#clone-cluster-from-backup) - [Cross-Cluster Restore and Cluster Clone (Cluster Version \< 3.9.2)](#cross-cluster-restore-and-cluster-clone-cluster-version--392) @@ -89,6 +90,7 @@ Options: --compress-level : Choose from options: BestSpeed, DefaultCompression, and BestCompression. Only supported for TG clusters >=3.9.3. 
--incremental : Perform incremental backup. + --base : specify the base backup tag for an incremental backup and must be used together with the --incremental flag. --full : Perform a full backup (full backup is the default behavior). --destination : Specify the destination for storing backup files. Currently supports local and S3 storage. @@ -330,6 +332,14 @@ kubectl tg backup create --cluster-name test-cluster -n tigergraph --name increm --local-path /home/tigergraph/mybackup ``` +To initiate an incremental backup with specified base backup, incorporate the `--base` option into the following command: +```bash +kubectl tg backup create --cluster-name test-cluster -n tigergraph --name incremental-backup \ + --incremental --base specified_base_backup_tag \ + --tag testlocal --destination local \ + --local-path /home/tigergraph/mybackup +``` + #### Updating Backup Custom Resources If you have previously created a backup using the `kubectl tg backup create` command, you can modify the backup configuration by employing the `kubectl tg backup update` command. Once the `update` command is executed. @@ -914,6 +924,7 @@ Options: --name: specify name of restore --tag : specify the tag of backup files. you can use kubectl tg backup list to get all existing backups --metadata : specify the metadata file of backup. you should this if you want a cross-cluster restore + --time-point : specify the time point for point-in-time restore in RFC3339 format (e.g., 2025-06-01T14:24:31Z) --cluster-template : configure the cluster you want to create from exported CR --staging-path : specify where to store the temporary files --source : set the source to get backup files, support local and s3 now @@ -1073,6 +1084,20 @@ kubectl tg restore --name restore-from-abs \ Make sure to replace testabs-2025-04-22T091106 with the desired backup tag and adjust tg-backup to your ABS container name. This command will trigger the restore process, bringing your cluster back to the chosen backup's state. +#### Point-in-Time Restore (Supported from TigerGraph 4.2.0) +> [!IMPORTANT] +> Point-in-time restore is supported from TigerGraph version 4.2.0. +To perform a point-in-time restore, you can specify a specific timestamp in RFC3339 format. This allows you to restore your cluster to a specific point in time rather than to a specific backup. Execute the following command to initiate a restore from the S3 bucket. The backup will be automatically selected based on the specified time point. +```bash +kubectl tg restore --name restore-timepoint \ + --namespace tigergraph --cluster-name test-cluster \ + --time-point 2025-01-15T14:30:00Z \ + --source s3Bucket --s3-bucket tg-backup \ + --aws-secret aws-secret +``` +> [!NOTE] +> The timepoint must be within the coverage range of your available backups. The operator will validate the timepoint format but will not verify coverage - this is handled by the TigerGraph engine during restore. + ### Cross-Cluster Restore from Backup > [!NOTE] diff --git a/k8s/docs/04-manage/operator-upgrade.md b/k8s/docs/04-manage/operator-upgrade.md index bfa255593..8ff9130e3 100644 --- a/k8s/docs/04-manage/operator-upgrade.md +++ b/k8s/docs/04-manage/operator-upgrade.md @@ -3,7 +3,7 @@ This document provides step-by-step instructions for upgrading the TigerGraph Kubernetes Operator using the kubectl-tg plugin. 
- [How to upgrade TigerGraph Kubernetes Operator](#how-to-upgrade-tigergraph-kubernetes-operator) - - [Upgrading from TigerGraph Operator 1.0.0 and later versions to version 1.3.0](#upgrading-from-tigergraph-operator-100-and-later-versions-to-version-130) + - [Upgrading from TigerGraph Operator 1.0.0 and later versions to version 1.7.0](#upgrading-from-tigergraph-operator-100-and-later-versions-to-version-170) - [Upgrading kubectl-tg plugin](#upgrading-kubectl-tg-plugin) - [Upgrading TigerGraph Operator](#upgrading-tigergraph-operator) - [Upgrading from TigerGraph Operator versions prior to 1.0.0 to version 1.0.0 and above](#upgrading-from-tigergraph-operator-versions-prior-to-100-to-version-100-and-above) @@ -27,14 +27,14 @@ This document provides step-by-step instructions for upgrading the TigerGraph Ku - [Successfully upgraded the operator from version 0.0.9 to version 1.2.0 and earlier, but still encountered some errors when creating a TigerGraph cluster](#successfully-upgraded-the-operator-from-version-009-to-version-120-and-earlier-but-still-encountered-some-errors-when-creating-a-tigergraph-cluster) - [Failed to upgrade the operator from version 0.0.9 to version 1.3.0 and above](#failed-to-upgrade-the-operator-from-version-009-to-version-130-and-above) -## Upgrading from TigerGraph Operator 1.0.0 and later versions to version 1.3.0 +## Upgrading from TigerGraph Operator 1.0.0 and later versions to version 1.7.0 ### Upgrading kubectl-tg plugin -To upgrade the kubectl-tg plugin for TigerGraph Operator 1.3.0, execute the following command: +To upgrade the kubectl-tg plugin for TigerGraph Operator 1.7.0, execute the following command: ```bash -curl https://dl.tigergraph.com/k8s/1.3.0/kubectl-tg -o kubectl-tg +curl https://dl.tigergraph.com/k8s/latest/kubectl-tg -o kubectl-tg sudo install kubectl-tg /usr/local/bin/ ``` @@ -43,8 +43,8 @@ Ensure you have installed the correct version of kubectl-tg: ```bash kubectl tg version -Version: 1.3.0 -Default version of TigerGraph cluster: 4.1.1 +Version: 1.7.0 +Default version of TigerGraph cluster: 4.3.0 ``` > [!WARNING] @@ -52,10 +52,10 @@ Default version of TigerGraph cluster: 4.1.1 #### Upgrading TigerGraph Operator -There are no breaking changes in the TigerGraph CRDs for version 1.3.0 compared to versions 1.0.0 and above. You can upgrade the TigerGraph Operator by following these steps if you have an older version (1.0.0 or above) installed. +There are no breaking changes in the TigerGraph CRDs for version 1.7.0 compared to versions 1.0.0 and above. You can upgrade the TigerGraph Operator by following these steps if you have an older version (1.0.0 or above) installed. > [!IMPORTANT] -> There is currently no support for upgrading or deleting CRDs when upgrading or uninstalling the TigerGraph Operator due to the risk of unintentional data loss. It is necessary to upgrade TigerGraph CRDs manually for the operator version prior to 1.3.0. However, for operator version 1.3.0, we use [Helm chart’s pre-upgrade hook](https://helm.sh/docs/topics/charts_hooks/) to upgrade the CRDs automatically. You can ignore the first step if you upgrade the operator to version 1.3.0 or above. +> There is currently no support for upgrading or deleting CRDs when upgrading or uninstalling the TigerGraph Operator due to the risk of unintentional data loss. It is necessary to upgrade TigerGraph CRDs manually for the operator version prior to 1.3.0. 
However, starting from operator version 1.3.0, we use [Helm chart’s pre-upgrade hook](https://helm.sh/docs/topics/charts_hooks/) to upgrade the CRDs automatically. You can ignore the first step if you upgrade the operator to version 1.3.0 or above. - Upgrade the TigerGraph CRDs to the latest version(It's required for the operator version prior to 1.3.0) @@ -75,7 +75,7 @@ There are no breaking changes in the TigerGraph CRDs for version 1.3.0 compared helm list -n ${YOUR_NAMESPACE_OF_OPERATOR} NAME NAMESPACE REVISION UPDATED STATUS CHART APP VERSION - tg-operator tigergraph 2 2024-06-24 10:34:23.185036124 +0000 UTC deployed tg-operator-1.2.0 1.2.0 + tg-operator tigergraph 2 2025-09-28 10:34:23.185036124 +0000 UTC deployed tg-operator-1.7.0 1.7.0 ``` ## Upgrading from TigerGraph Operator versions prior to 1.0.0 to version 1.0.0 and above @@ -142,7 +142,7 @@ tg-data-test-cluster-2 Bound pvc-73d58df7-206e-4c58-aa91-702df9761fac 10G ### Install the latest or target version of `kubectl-tg` ```bash -curl https://dl.tigergraph.com/k8s/1.3.0/kubectl-tg -o kubectl-tg +curl https://dl.tigergraph.com/k8s/latest/kubectl-tg -o kubectl-tg sudo install kubectl-tg /usr/local/bin/ ``` @@ -151,8 +151,8 @@ Ensure you have installed the correct version of kubectl-tg: ```bash kubectl tg version -Version: 1.3.0 -Default version of TigerGraph cluster: 4.1.1 +Version: 1.7.0 +Default version of TigerGraph cluster: 4.3.0 ``` ### Uninstall the old version of TigerGraph Operator and TigerGraph CRDs @@ -193,7 +193,7 @@ Ensure TigerGraph Operator has been installed successfully: helm list -n tigergraph NAME NAMESPACE REVISION UPDATED STATUS CHART APP VERSION -tg-operator tigergraph 1 2024-09-10 10:34:23.185036124 +0000 UTC deployed tg-operator-1.3.0 1.3.0 +tg-operator tigergraph 1 2025-09-10 10:34:23.185036124 +0000 UTC deployed tg-operator-1.7.0 1.7.0 ``` ```bash @@ -326,10 +326,10 @@ Best practice for upgrading TigerGraph <=3.9.3 with a single PVC and TigerGraph [Backup & Restore cluster kubectl-tg plugin](../04-manage/backup-and-restore/backup-restore-by-kubectl-tg.md) -- After completing the restore process, upgrade the TigerGraph cluster to version 4.2.1 using the appropriate command. +- After completing the restore process, upgrade the TigerGraph cluster to version 4.1.0 using the appropriate command. ```bash - kubectl tg update --cluster-name $YOUR_CLUSTER_NAME --version 4.2.1 --namespace $YOUR_NAMESPACE + kubectl tg update --cluster-name $YOUR_CLUSTER_NAME --version 4.1.0 --namespace $YOUR_NAMESPACE ``` ## Troubleshooting diff --git a/k8s/docs/04-manage/running-custom-bash-scripts-via-kubernetes-jobs.md b/k8s/docs/04-manage/running-custom-bash-scripts-via-kubernetes-jobs.md new file mode 100644 index 000000000..57cba5725 --- /dev/null +++ b/k8s/docs/04-manage/running-custom-bash-scripts-via-kubernetes-jobs.md @@ -0,0 +1,522 @@ +# Running Custom Bash Scripts in a TigerGraph Cluster via Kubernetes Jobs + +## Overview + +After deploying a TigerGraph cluster on Kubernetes, customers often need to perform operations that cannot be configured through the TigerGraph Custom Resource (CR), such as: + +- Creating GSQL roles and managing user permissions +- Executing custom GSQL scripts for data operations +- Running maintenance tasks and administrative commands +- Performing cluster-specific configurations + +While these tasks can technically be performed by manually logging into the TigerGraph pod, this approach fails to meet automation and operational efficiency standards. 
This document provides a standardized procedure for running custom bash scripts in a TigerGraph cluster using Kubernetes Jobs. + +## Prerequisites + +Before proceeding, ensure you have: + +1. A deployed TigerGraph cluster on Kubernetes +2. `kubectl` configured to access your cluster +3. Appropriate RBAC permissions to create Jobs and ConfigMaps +4. SSH key secret configured for the TigerGraph cluster +5. Knowledge of the cluster's namespace and service names + +## Architecture Overview + +The solution uses Kubernetes Jobs to execute custom scripts within the TigerGraph cluster environment. The approach involves: + +1. **Kubernetes Job**: A one-time execution container that runs your custom script +2. **ConfigMap**: Stores your custom script content +3. **Secret**: Contains SSH credentials for cluster access +4. **Volume Mounts**: Provides access to scripts and credentials + +## Step-by-Step Procedure + +### Step 1: Prepare Your Custom Script + +Create your custom bash script. For example, to create GSQL roles: + +```bash +#!/bin/bash +# create-gsql-roles.sh + +set -eo pipefail + +# Connect to the TigerGraph cluster +export PATH=/home/tigergraph/tigergraph/app/cmd:$PATH + +echo "Creating GSQL roles..." + +# Create read-only role +gsql -c "CREATE ROLE readonly_role" + +# Grant permissions to roles +gsql -c "GRANT READ ON ALL QUERIES IN GRAPH social TO readonly_role" + +echo "GSQL roles created successfully" + +# Drop a role +# gsql -c "DROP ROLE readonly_role" +``` + +### Step 2: Create a ConfigMap for Your Script + +Create a ConfigMap containing your script: + +```yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: custom-script-cm + namespace: tigergraph +data: + custom-script.sh: | + #!/bin/bash + # Your custom script content here + set -eo pipefail + + export PATH=/home/tigergraph/tigergraph/app/cmd:$PATH + + echo "Starting custom script execution..." + + # Your script logic here + gsql -c "SHOW USERS" + + echo "Custom script completed successfully" +``` + +Apply the ConfigMap: + +```bash +kubectl apply -f custom-script-cm.yaml +``` + +### Step 3: Create the Kubernetes Job + +> [!IMPORTANT] +> **​Kubernetes Job Namespace Requirement​** +> +> The Kubernetes Job ​must be created in the same namespace as the TigerGraph cluster. +> +> ​Why?​​ The Job requires access to the Kubernetes Secret containing the TigerGraph cluster's private SSH key. +> +> Secrets are ​namespace-scoped, so Jobs in different namespaces ​cannot access these credentials​ and will fail to authenticate + +Create a Kubernetes Job that will execute your script: + +```yaml +apiVersion: batch/v1 +kind: Job +metadata: + name: custom-script-job + namespace: tigergraph +spec: + template: + metadata: {} + spec: + containers: + - name: script-runner + image: docker.io/tigergraph/tigergraph-k8s-init:1.6.0 + imagePullPolicy: IfNotPresent + command: + - /bin/bash + - -c + - | + set -eo pipefail + PRIVATE_KEY_FILE=/etc/private-key-volume/tigergraph_rsa + SERVICE_NAME=${CLUSTER_NAME}-internal-service + + echo "Copying script to cluster..." + scp -i $PRIVATE_KEY_FILE -o StrictHostKeyChecking=no -P ${SSH_PORT} \ + /tmp/custom-script/custom-script.sh \ + tigergraph@${CLUSTER_NAME}-0.${SERVICE_NAME}.${NAMESPACE}:/home/tigergraph/custom-script.sh > /dev/null + + echo "Making script executable..." + ssh -i $PRIVATE_KEY_FILE -o StrictHostKeyChecking=no \ + -p ${SSH_PORT} tigergraph@${CLUSTER_NAME}-0.${SERVICE_NAME}.${NAMESPACE} \ + "chmod +x /home/tigergraph/custom-script.sh" + + echo "Running script in cluster..." 
+ ssh -i $PRIVATE_KEY_FILE -o StrictHostKeyChecking=no \ + -p ${SSH_PORT} tigergraph@${CLUSTER_NAME}-0.${SERVICE_NAME}.${NAMESPACE} < person.csv + name,gender,age,state + Tom,male,40,ca + Dan,male,34,ny + Jenny,female,25,tx + Kevin,male,28,az + Amily,female,22,ca + Nancy,female,20,ky + Jack,male,26,fl + EOF + + cat << EOF > friendship.csv + person1,person2,date + Tom,Dan,2017-06-03 + Tom,Jenny,2015-01-01 + Dan,Jenny,2016-08-03 + Jenny,Amily,2015-06-08 + Dan,Nancy,2016-01-03 + Nancy,Jack,2017-03-02 + Dan,Kevin,2015-12-30 + EOF + + cat << EOF > gsql101.gsql + BEGIN + CREATE VERTEX person ( + PRIMARY_ID name STRING, + name STRING, + age INT, + gender STRING, + state STRING + ) + END + + CREATE UNDIRECTED EDGE friendship (FROM person, TO person, connect_day DATETIME) + + CREATE GRAPH social (person, friendship) + + USE GRAPH social + BEGIN + CREATE LOADING JOB load_social FOR GRAPH social { + DEFINE FILENAME file1="/home/tigergraph/person.csv"; + DEFINE FILENAME file2="/home/tigergraph/friendship.csv"; + + LOAD file1 TO VERTEX person VALUES ($"name", $"name", $"age", $"gender", $"state") USING header="true", separator=","; + LOAD file2 TO EDGE friendship VALUES (\$0, \$1, \$2) USING header="true", separator=","; + } + END + RUN LOADING JOB load_social + + CREATE QUERY hello(VERTEX p) { + Start = {p}; + Result = SELECT tgt + FROM Start:s-(friendship:e) ->person:tgt; + PRINT Result; + } + + INSTALL QUERY hello + + RUN QUERY hello("Tom") + EOF + + gsql /home/tigergraph/gsql101.gsql + + echo "Custom GSQL operations completed successfully" +``` + +Complete YAML example: + +[custom-script-job.yaml](../10-samples/manage/custom-script-job.yaml) + +### Example 3: Cluster Maintenance Tasks + +```yaml +apiVersion: v1 +kind: ConfigMap +metadata: + name: maintenance-script-cm + namespace: tigergraph +data: + maintenance.sh: | + #!/bin/bash + set -eo pipefail + export PATH=/home/tigergraph/tigergraph/app/cmd:$PATH + + echo "Starting cluster maintenance..." + + # Check cluster status + gadmin status -v + + # Clean up old logs + find /home/tigergraph/tigergraph/log -name "*.log" -mtime +7 -delete + + # Check disk usage + df -h /home/tigergraph/tigergraph/data + + echo "Maintenance completed successfully" +``` + +## Using CronJobs for Scheduled Tasks + +For recurring tasks, use CronJobs instead of Jobs: + +```yaml +apiVersion: batch/v1 +kind: CronJob +metadata: + name: scheduled-maintenance + namespace: tigergraph +spec: + schedule: "0 2 * * *" # Run daily at 2 AM + jobTemplate: + metadata: {} + spec: + template: + spec: + containers: + - name: maintenance-runner + image: docker.io/tigergraph/tigergraph-k8s-init:1.6.0 + imagePullPolicy: IfNotPresent + command: + - /bin/bash + - -c + - | + # Your maintenance script here + set -eo pipefail + PRIVATE_KEY_FILE=/etc/private-key-volume/tigergraph_rsa + SERVICE_NAME=${CLUSTER_NAME}-internal-service + + ssh -i $PRIVATE_KEY_FILE -o StrictHostKeyChecking=no \ + -p ${SSH_PORT} tigergraph@${CLUSTER_NAME}-0.${SERVICE_NAME}.${NAMESPACE} < 0 + +# License expired or invalid +tigergraph_license_days_left <= 0 +``` + +### Performance Queries + +```promql +# Average query latency +avg(tigergraph_endpoint_latency{statistic="average_latency"}) + +# High latency queries (> 1000ms) +tigergraph_endpoint_latency{statistic="average_latency"} > 1000 + +# Request rate per second +rate(tigergraph_endpoint_completed[5m]) +``` + +## Alerting Recommendations + +This section provides comprehensive alerting recommendations based on the predefined Prometheus rules in the TigerGraph Operator. 
These alerts are designed to help you maintain optimal performance and reliability of your TigerGraph clusters. + +### Critical Alerts + +#### Service Health Alerts + +1. **TigerGraph Service Down**: + + ```promql + max(tigergraph_service_status) by (cluster_name, namespace, service_name, host_id) == 27 + ``` + + - **Severity**: Critical + - **Duration**: 1m + - **Description**: Service is completely down and not responding + +2. **TigerGraph Service Offline**: + + ```promql + max(tigergraph_service_status) by (cluster_name, namespace, service_name, host_id) == 24 + ``` + + - **Severity**: Critical + - **Duration**: 1m + - **Description**: Service is offline and not available + +3. **TigerGraph Service Stopping**: + + ```promql + max(tigergraph_service_status) by (cluster_name, namespace, service_name, host_id) == 21 + ``` + + - **Severity**: Critical + - **Duration**: 1m + - **Description**: Service is in the process of stopping + +4. **TigerGraph Service Paused**: + + ```promql + max(tigergraph_service_status) by (cluster_name, namespace, service_name, host_id) == 18 + ``` + + - **Severity**: Critical + - **Duration**: 1m + - **Description**: Service is paused and not processing requests + +5. **TigerGraph Service Unknown Status**: + + ```promql + max(tigergraph_service_status) by (cluster_name, namespace, service_name, host_id) == 3 + ``` + + - **Severity**: Critical + - **Duration**: 1m + - **Description**: Service status is unknown or undetermined + +#### Resource Usage Alerts + +1. **Critical CPU Usage**: + + ```promql + max(tigergraph_cpu_usage{service_name=""}) by (cluster_name, namespace, service_name, host_id) > 90 + ``` + + - **Severity**: Critical + - **Duration**: 3m + - **Description**: CPU usage is critically high (>90%) + +2. **Critical Memory Usage**: + + ```promql + (max(tigergraph_memory_usage{service_name=""}) by (cluster_name, namespace, service_name, host_id) / max(tigergraph_memory_total{service_name=""}) by (cluster_name, namespace, service_name, host_id)) * 100 > 90 + ``` + + - **Severity**: Critical + - **Duration**: 3m + - **Description**: Memory usage is critically high (>90%) + +3. **Critical Disk Usage**: + + ```promql + (max by(host_id, mount_point, path)(tigergraph_diskspace_usage) / (max by(host_id, mount_point, path)(tigergraph_diskspace_usage) + max by(host_id, mount_point, path) (tigergraph_diskspace_free))) * 100 > 90 + ``` + + - **Severity**: Critical + - **Duration**: 3m + - **Description**: Disk usage is critically high (>90%) + +#### License Alerts + +1. **TigerGraph License Expiring Critical**: + + ```promql + min(tigergraph_license_days_left) by (cluster_name, namespace) <= 7 and min(tigergraph_license_days_left) by (cluster_name, namespace) > 0 + ``` + + - **Severity**: Critical + - **Duration**: 1m + - **Description**: License will expire in 7 days or less + +2. **TigerGraph License Expired**: + + ```promql + min(tigergraph_license_days_left) by (cluster_name, namespace) == 0 + ``` + + - **Severity**: Critical + - **Duration**: 1m + - **Description**: License has expired today + +3. **TigerGraph License Invalid**: + + ```promql + min(tigergraph_license_days_left) by (cluster_name, namespace) == -1 + ``` + + - **Severity**: Critical + - **Duration**: 1m + - **Description**: License is invalid or corrupted + +#### Performance Alerts + +1. 
**Critical Endpoint Latency**: + + ```promql + max(tigergraph_endpoint_latency{statistic="average_latency"}) by (namespace,cluster_name, endpoint, exported_endpoint, service_name) > 10000 + ``` + + - **Severity**: Critical + - **Duration**: 3m + - **Description**: Average endpoint latency is critically high (>10 seconds) + +### Warning Alerts + +#### Resource Usage Alerts(Warning) + +1. **High CPU Usage**: + + ```promql + max(tigergraph_cpu_usage{service_name=""}) by (cluster_name, namespace, service_name, host_id) > 80 + ``` + + - **Severity**: Warning + - **Duration**: 5m + - **Description**: CPU usage is high (>80%) + +2. **High Memory Usage**: + + ```promql + (max(tigergraph_memory_usage{service_name=""}) by (cluster_name, namespace, service_name, host_id) / max(tigergraph_memory_total{service_name=""}) by (cluster_name, namespace, service_name, host_id)) * 100 > 80 + ``` + + - **Severity**: Warning + - **Duration**: 5m + - **Description**: Memory usage is high (>80%) + +3. **Low Memory Available**: + + ```promql + max(tigergraph_memory_available{}) by (cluster_name, namespace, host_id) < 1000 + ``` + + - **Severity**: Warning + - **Duration**: 5m + - **Description**: Available memory is low (<1GB) + +4. **High Disk Usage**: + + ```promql + (max by(host_id, mount_point, path)(tigergraph_diskspace_usage) / (max by(host_id, mount_point, path)(tigergraph_diskspace_usage) + max by(host_id, mount_point, path) (tigergraph_diskspace_free))) * 100 > 80 + ``` + + - **Severity**: Warning + - **Duration**: 5m + - **Description**: Disk usage is high (>80%) + +5. **Low Disk Space**: + + ```promql + max(tigergraph_diskspace_free) by (cluster_name, namespace, path_name, host_id, path) < 1000 + ``` + + - **Severity**: Warning + - **Duration**: 5m + - **Description**: Free disk space is low (<1GB) + +6. **Low Disk Inodes**: + + ```promql + max(tigergraph_disk_inode_free) by (cluster_name, namespace, host_id, path_name, path) < 100000 + ``` + + - **Severity**: Warning + - **Duration**: 5m + - **Description**: Free disk inodes are low (<100,000) + +#### Service Health Alerts (Warning) + +1. **TigerGraph Service Starting**: + + ```promql + max(tigergraph_service_status) by (cluster_name, namespace, service_name, host_id) == 15 + ``` + + - **Severity**: Warning + - **Duration**: 2m + - **Description**: Service is starting up + +2. **TigerGraph Service Readonly**: + + ```promql + max(tigergraph_service_status) by (cluster_name, namespace, service_name, host_id) == 12 + ``` + + - **Severity**: Warning + - **Duration**: 1m + - **Description**: Service is in readonly mode + +3. **TigerGraph Service Warmup**: + + ```promql + max(tigergraph_service_status) by (cluster_name, namespace, service_name, host_id) == 9 + ``` + + - **Severity**: Warning + - **Duration**: 2m + - **Description**: Service is in warmup state + +#### Performance Alerts (Warning) + +1. **High Endpoint Latency**: + + ```promql + max(tigergraph_endpoint_latency{statistic="average_latency"}) by (namespace,cluster_name, endpoint, exported_endpoint, service_name) > 5000 + ``` + + - **Severity**: Warning + - **Duration**: 5m + - **Description**: Average endpoint latency is high (>5 seconds) + +2. **High QPS**: + + ```promql + max(tigergraph_qps) by (namespace,cluster_name, endpoint, exported_endpoint, service_name, host_id) > 100 + ``` + + - **Severity**: Warning + - **Duration**: 5m + - **Description**: High queries per second detected + +3. 
**Endpoint Timeout**: + + ```promql + max(tigergraph_endpoint_timeout) by (namespace,cluster_name, endpoint, exported_endpoint, service_name, host_id) > 0 + ``` + + - **Severity**: Warning + - **Duration**: 1m + - **Description**: Endpoint timeouts detected + +#### License Alerts (Warning) + +1. **TigerGraph License Expiring Soon**: + + ```promql + min(tigergraph_license_days_left) by (cluster_name, namespace) <= 30 and min(tigergraph_license_days_left) by (cluster_name, namespace) > 7 + ``` + + - **Severity**: Warning + - **Duration**: 1m + - **Description**: License will expire in 30 days or less + +#### System Alerts + +1. **Low CPU Available**: + + ```promql + max(tigergraph_cpu_available) by (namespace,cluster_name, host_id) < 10 + ``` + + - **Severity**: Warning + - **Duration**: 5m + - **Description**: Available CPU is low (<10%) + +2. **High Network Connections**: + + ```promql + max(tigergraph_network_connections) by (cluster_name, namespace, host_id) > 2000 + ``` + + - **Severity**: Warning + - **Duration**: 5m + - **Description**: Number of open TCP connections is high + +3. **High Disk I/O**: + + ```promql + max(tigergraph_disk_iops) by (namespace,cluster_name, path_name, host_id, path, mount_point) > 1000 + ``` + + - **Severity**: Warning + - **Duration**: 5m + - **Description**: High disk I/O operations per second + +4. **High Disk I/O Time**: + + ```promql + max(tigergraph_disk_io_time) by (namespace,cluster_name, path_name, host_id, path,mount_point) > 0.1 + ``` + + - **Severity**: Warning + - **Duration**: 5m + - **Description**: High disk I/O time (>0.1 hours) + +#### Service-Specific Alerts + +1. **TigerGraph Service High CPU Usage**: + + ```promql + max(tigergraph_cpu_usage{service_name!=""}) by (cluster_name, namespace, service_name, host_id) > 70 + ``` + + - **Severity**: Warning + - **Duration**: 5m + - **Description**: High CPU usage for specific TigerGraph service + +### Alert Configuration Best Practices + +1. **Alert Grouping**: Group related alerts by service type (CPU, Memory, Disk, etc.) for better organization. + +2. **Alert Severity Levels**: + - **Critical**: Immediate action required (service down, license expired) + - **Warning**: Attention needed but not immediately critical + +3. **Alert Duration**: Use appropriate durations to avoid false positives: + - **1m**: For critical service status changes + - **3m**: For critical resource usage + - **5m**: For warning conditions + +4. **Alert Labels**: Include relevant labels in alert descriptions: + - `cluster_name`: Identify which cluster is affected + - `namespace`: Identify the Kubernetes namespace + - `host_id`: Identify the specific host + - `service_name`: Identify the specific service (when applicable) + +5. **Alert Annotations**: Provide clear, actionable descriptions: + - Include current values in descriptions + - Specify which host/cluster is affected + - Provide context about the impact + +6. **Recording Rules**: Use the predefined recording rules for complex calculations: + - `tigergraph:cpu_usage_percentage` + - `tigergraph:memory_usage_percentage` + - `tigergraph:disk_usage_percentage` + - `tigergraph:endpoint_latency_avg` + - `tigergraph:service_online_count` + +This comprehensive alerting strategy ensures proactive monitoring and helps maintain optimal performance and reliability of your TigerGraph clusters. 
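+For reference, here is a minimal sketch showing how the pieces above fit together in a single standalone `PrometheusRule` resource: one recording rule plus one alert that consumes it. The resource name, namespace, and the `prometheus.io/rule: "true"` label are illustrative assumptions (the label must match the `ruleSelector` of your Prometheus instance), and the expression is an approximation derived from the memory alert above rather than the Operator's predefined rule definition.
+
+```yaml
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  name: tigergraph-example-rules      # illustrative name
+  namespace: tigergraph               # namespace watched by your Prometheus instance
+  labels:
+    prometheus.io/rule: "true"        # must match the Prometheus ruleSelector
+spec:
+  groups:
+    - name: tigergraph.recording.rules
+      rules:
+        # Pre-compute per-host memory usage percentage so alerts and dashboards can reuse it
+        - record: tigergraph:memory_usage_percentage
+          expr: |
+            (max(tigergraph_memory_usage{service_name=""}) by (cluster_name, namespace, service_name, host_id)
+              / max(tigergraph_memory_total{service_name=""}) by (cluster_name, namespace, service_name, host_id)) * 100
+    - name: tigergraph.alerting.rules
+      rules:
+        # Warning alert built on top of the recording rule above
+        - alert: TigerGraphHighMemoryUsage
+          expr: tigergraph:memory_usage_percentage > 80
+          for: 5m
+          labels:
+            severity: warning
+            service: tigergraph
+          annotations:
+            summary: "High Memory Usage Detected"
+            description: "Memory usage on host {{ $labels.host_id }} of cluster {{ $labels.cluster_name }} in namespace {{ $labels.namespace }} is high ({{ $value }}%)"
+```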
diff --git a/k8s/docs/05-monitor/tigergraph-monitor-with-prometheus-grafana.md b/k8s/docs/05-monitor/tigergraph-monitor-with-prometheus-grafana.md index 92a7ee892..492bbd5e7 100644 --- a/k8s/docs/05-monitor/tigergraph-monitor-with-prometheus-grafana.md +++ b/k8s/docs/05-monitor/tigergraph-monitor-with-prometheus-grafana.md @@ -1,79 +1,73 @@ -# Enable TigerGraph Operator monitoring with Prometheus and Grafana - -Since version 1.5.0, the TigerGraph Operator includes support for monitoring. This guide demonstrates how to deploy Prometheus and Grafana for observability, as well as how to enable the default TigerGraph Monitor CR to automatically set up the default Grafana dashboard for enhanced visualization. - -- [Enable TigerGraph Operator monitoring with Prometheus and Grafana](#enable-tigergraph-operator-monitoring-with-prometheus-and-grafana) - - [Install Prometheus and Grafana](#install-prometheus-and-grafana) +# TigerGraph Operator Monitoring Guide + +Since version 1.5.0, the TigerGraph Operator includes support for monitoring. This guide will help you deploy Prometheus and Grafana for observability monitoring, and configure the TigerGraph Monitor CR to automatically set up Grafana dashboards. + +Starting from version 1.7.0, TigerGraph Operator supports automatically exposing Operator metrics to Prometheus when the monitoring option is enabled during installation. Additionally, it supports customizing PrometheusRules and AlertManagerConfig through the TigerGraph Monitor CR, ServiceMonitor, PrometheusRule, and AlertManagerConfig selectors. + +## Table of Contents + +- [TigerGraph Operator Monitoring Guide](#tigergraph-operator-monitoring-guide) + - [Table of Contents](#table-of-contents) + - [Install Monitoring Components](#install-monitoring-components) + - [Install Prometheus, Grafana and AlertManager](#install-prometheus-grafana-and-alertmanager) + - [Basic Installation](#basic-installation) + - [Custom Configuration Installation](#custom-configuration-installation) + - [Verify Installation](#verify-installation) + - [Expose TigerGraph Operator Metrics to Prometheus](#expose-tigergraph-operator-metrics-to-prometheus) + - [Enable Operator Metrics During Installation](#enable-operator-metrics-during-installation) + - [Enable Operator Metrics During Upgrade](#enable-operator-metrics-during-upgrade) + - [Verify Operator Metrics Exposure](#verify-operator-metrics-exposure) + - [Grafana Dashboard and Prometheus Alert Rules for TigerGraph Operator​](#grafana-dashboard-and-prometheus-alert-rules-for-tigergraph-operator) - [Manage TigerGraph Monitor](#manage-tigergraph-monitor) + - [Key Configuration Fields](#key-configuration-fields) - [Manage TigerGraph Monitor using kubectl-tg plugin](#manage-tigergraph-monitor-using-kubectl-tg-plugin) - - [Create a TigerGraph monitor](#create-a-tigergraph-monitor) - - [Update a TigerGraph monitor](#update-a-tigergraph-monitor) - - [Delete a TigerGraph monitor](#delete-a-tigergraph-monitor) + - [Basic Monitoring Configuration](#basic-monitoring-configuration) + - [Advanced Monitoring Configuration](#advanced-monitoring-configuration) - [Manage TigerGraph Monitor using CR](#manage-tigergraph-monitor-using-cr) - - [Access Grafana dashboard](#access-grafana-dashboard) - - [Uninstall Prometheus and Grafana](#uninstall-prometheus-and-grafana) + - [Basic Configuration](#basic-configuration) + - [Complete Configuration Example](#complete-configuration-example) + - [Advanced Configuration](#advanced-configuration) + - [Service Monitor 
Labels](#service-monitor-labels) + - [Prometheus Rules Selector](#prometheus-rules-selector) + - [Prometheus Rules](#prometheus-rules) + - [AlertManager Config Selector](#alertmanager-config-selector) + - [AlertManager Configuration](#alertmanager-configuration) + - [TLS Configuration](#tls-configuration) + - [Access Monitoring Interface](#access-monitoring-interface) - [Troubleshooting](#troubleshooting) - [Create TigerGraph monitor CR successfully but with warning events in TigerGraph monitor CR status](#create-tigergraph-monitor-cr-successfully-but-with-warning-events-in-tigergraph-monitor-cr-status) + - [Check the serviceMonitorSelector, ruleSelector, and alertmanagerConfigSelector](#check-the-servicemonitorselector-ruleselector-and-alertmanagerconfigselector) + - [TigerGraph Metrics Reference](#tigergraph-metrics-reference) + - [Uninstall Monitoring Components](#uninstall-monitoring-components) + - [Uninstall Prometheus and Grafana](#uninstall-prometheus-and-grafana) + - [Clean Up CRDs](#clean-up-crds) + - [Clean Up Persistent Storage](#clean-up-persistent-storage) -## Install Prometheus and Grafana - -It is recommended to install Prometheus and Grafana using the [kube-prometheus-stack](https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack). You can achieve this by using the sub command `kubectl tg monitoring-stack` of kubectl-tg plugin. Alternatively, you can install the kube-prometheus-stack using Helm by following the [official documentation](https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack#kube-prometheus-stack). +## Install Monitoring Components -> [!IMPORTANT] -> Currently, it is not possible to install the kube-prometheus-stack Helm chart on OpenShift. As a result, the kubectl-tg plugin cannot be used to install it. -> If your OpenShift cluster does not yet have the kube-prometheus-stack installed, please refer to the [OpenShift official documentation](https://docs.openshift.com/container-platform/4.8/monitoring/configuring-the-monitoring-stack.html) for detailed instructions on configuring the monitoring stack. 
+### Install Prometheus, Grafana and AlertManager -The following example steps will install kube-prometheus-stack using the kubectl-tg plugin: +#### Basic Installation ```bash -kubectl tg monitoring-stack --help -Manage kube-prometheus-stack deployment - -Examples: - # create a kube-prometheus-stack deployment with name and namespace - kubectl tg monitoring-stack create -r monitoring-stack -n monitoring-stack - # create a kube-prometheus-stack deployment with customized values.yaml - kubectl tg monitoring-stack create -r monitoring-stack -f values.yaml -n monitoring-stack - # update a kube-prometheus-stack deployment with values.yaml - kubectl tg monitoring-stack update -r monitoring-stack -f values.yaml -n monitoring-stack - # delete a kube-prometheus-stack deployment with name and namespace - kubectl tg monitoring-stack delete -r monitoring-stack -n monitoring-stack +# Set variables +export MONITORING_NAMESPACE="monitoring-stack" +export RELEASE_NAME="monitoring-stack" -Usage: - kubectl tg monitoring-stack [create|update|delete] [OPTIONS] +# Create namespace +kubectl create namespace $MONITORING_NAMESPACE -Options: - -n|--namespace : set namespace, if not set, use the default namespace in context - -r|--kube-prometheus-stack-release-name : - specify release name of kube-prometheus-stack deployment - -f|--kube-prometheus-stack-values : - specify values.yaml of kube-prometheus-stack deployment. If not set, use the default values.yaml +# Install using default configuration +kubectl tg monitoring-stack create \ + --kube-prometheus-stack-release-name $RELEASE_NAME \ + --namespace $MONITORING_NAMESPACE ``` -> [!NOTE] -> For TigerGraph Operator 1.5.0, the kubectl-tg plugin installs kube-prometheus-stack version 68.2.1 by default. The minimum supported version of kube-prometheus-stack is 45.25.0. -> If you prefer to install a specific version of kube-prometheus-stack between 45.25.0 and 68.2.1, use the Helm command to install it manually by following the [official documentation](https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack#kube-prometheus-stack). - -Using the following command to create a kube-prometheus-stack deployment with release name, namespace and customize configuration of kube-prometheus-stack. +#### Custom Configuration Installation ```bash -export PATH_TO_VALUES_YAML="values.yaml" -export RELEASE_NAME="monitoring-stack" -export MONITORING_NAMESPACE="monitoring-stack" - -kubectl tg monitoring-stack create -f ${PATH_TO_VALUES_YAML} -r ${RELEASE_NAME} -n ${MONITORING_NAMESPACE} -``` - -> [!NOTE] -> It is recommended to install Prometheus and Grafana in a separate namespace from the TigerGraph clusters to ensure cleaner and more organized management. - -The customized configuration of kube-prometheus-stack YAML resource example is following as: - -```YAML -# The values.yaml file for kube-prometheus-stack configures a comprehensive suite of monitoring services -# to deliver observability insights. -# It enables both Prometheus and Grafana, exposing them externally via LoadBalancers. 
- +# Create custom values file +cat > monitoring-values.yaml << EOF prometheus: enabled: true service: @@ -83,13 +77,20 @@ prometheus: portName: prometheus-service prometheusSpec: storageSpec: - volumeClaimTemplate: - spec: - accessModes: - - "ReadWriteOnce" - resources: - requests: - storage: 50Gi + volumeClaimTemplate: + spec: + accessModes: + - "ReadWriteOnce" + resources: + requests: + storage: 50Gi + serviceMonitorSelector: + ## Example which selects ServiceMonitors with label "prometheus" set to "somelabel" + matchLabels: + prometheus.io/monitor: "true" + ruleSelector: + matchLabels: + prometheus.io/rule: "true" grafana: enabled: true @@ -107,42 +108,134 @@ grafana: size: 5Gi alertmanager: - enabled: false + enabled: true + service: + type: LoadBalancer + port: 9093 + targetPort: 9093 + alertmanagerSpec: + storage: + volumeClaimTemplate: + spec: + accessModes: + - "ReadWriteOnce" + resources: + requests: + storage: 5Gi + alertmanagerConfigSelector: + matchLabels: + prometheus.io/alertmanager-config: "true" +EOF + +# Install with custom configuration +kubectl tg monitoring-stack create \ + --kube-prometheus-stack-release-name $RELEASE_NAME \ + --kube-prometheus-stack-values monitoring-values.yaml \ + --namespace $MONITORING_NAMESPACE ``` -> [!NOTE] -> For the external services of Prometheus and Grafana, please configure them according to your specific requirements. Additionally, ensure that you allocate an appropriate storage size to accommodate your monitoring data effectively. +### Verify Installation + +```bash +# Check if all pods are running +kubectl get pods -n $MONITORING_NAMESPACE + +# Check services +kubectl get svc -n $MONITORING_NAMESPACE + +# Verify CRDs are installed +kubectl get crd | grep monitoring.coreos.com +``` + +## Expose TigerGraph Operator Metrics to Prometheus -After successfully installing the kube-prometheus-stack, you can check its status by running the following command: +Starting from version 1.7.0, you can enable TigerGraph Operator metrics exposure to Prometheus during Operator installation. This allows you to monitor the Operator itself alongside your TigerGraph clusters. + +> [!IMPORTANT] +> When enabling monitoring during Operator installation, you must specify the ServiceMonitor selector; otherwise, Prometheus will not detect the ServiceMonitor resources created by the Operator. For guidance on checking Prometheus’ ServiceMonitor selector, see the [Troubleshooting section](#check-the-servicemonitorselector-ruleselector-and-alertmanagerconfigselector). 
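+Before enabling the option, you can look up the selector your Prometheus instance is actually configured with, for example (assuming `yq` is installed; `${PROMETHEUS_INSTANCE_NAME}` is a placeholder for your instance name):
+
+```bash
+# List Prometheus instances managed by the Prometheus Operator
+kubectl get prometheus -n ${MONITORING_NAMESPACE}
+
+# Print the ServiceMonitor selector of a specific instance; the labels shown here are
+# the ones to pass via --monitoring-service-monitor-selector when installing the Operator
+kubectl get prometheus ${PROMETHEUS_INSTANCE_NAME} -n ${MONITORING_NAMESPACE} -o yaml | yq .spec.serviceMonitorSelector
+```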
+ +### Enable Operator Metrics During Installation ```bash -kubectl --namespace ${MONITORING_NAMESPACE} get pods -l "release=${RELEASE_NAME}" +# Install Operator with monitoring enabled +kubectl tg init \ + --namespace tigergraph \ + --monitoring-enabled true \ + --monitoring-service-monitor-selector "prometheus.io/monitor=true" +``` -NAME READY STATUS RESTARTS AGE -monitoring-stack-kube-prom-operator-bcf69c8c9-tx4lw 1/1 Running 0 2m23s -monitoring-stack-kube-state-metrics-6cf9d56576-ckmf5 1/1 Running 0 2m23s -monitoring-stack-prometheus-node-exporter-xhn4p 1/1 Running 0 2m23s -monitoring-stack-prometheus-node-exporter-zc92p 1/1 Running 0 2m23s -monitoring-stack-prometheus-node-exporter-zhfg5 1/1 Running 0 2m23s +### Enable Operator Metrics During Upgrade + +```bash +# Upgrade existing Operator with monitoring enabled +kubectl tg upgrade \ + --namespace tigergraph \ + --monitoring-enabled true \ + --monitoring-service-monitor-selector "prometheus.io/monitor=true" ``` -You can also use the command `kubectl tg monitoring-stack update` or `kubectl tg monitoring-stack delete` to update or delete the kube-prometheus-stack. For detailed usage instructions, please refer to the command's help documentation. +### Verify Operator Metrics Exposure + +After enabling Operator metrics, verify that the Service Monitor is created: + +```bash +# Check if Service Monitor is created +kubectl get servicemonitor -n tigergraph + +# Check Service Monitor details +kubectl describe servicemonitor tigergraph-operator-controller-manager-metrics-monitor -n tigergraph + +# Verify metrics endpoint status in Prometheus +# query the service address of Prometheus +kubectl get svc -n $MONITORING_NAMESPACE +# Then visit http://${PROMETHEUS_LBS_ADDRESS}:9090/targets +``` + +### Grafana Dashboard and Prometheus Alert Rules for TigerGraph Operator​ + +When you enable monitoring during installation, the TigerGraph Operator automatically deploys a default Grafana dashboard named `​TigerGraph Operator Metrics Dashboard`. + +While Prometheus alert rules are not applied automatically during installation, we provide a default set of rules for reference. You can customize the following example configuration to suit your requirements: + +[TigerGraph-Operator-alert-rules](../10-samples/monitoring/tigergraph-operator-alert-rules.yaml) ## Manage TigerGraph Monitor -To support monitoring in the TigerGraph Operator, a new controller, called the TigerGraph Monitoring Controller, has been introduced. This controller manages the lifecycle of all monitoring-related CRs, including `ServiceMonitor`, `ConfigMap`, and others. +### Key Configuration Fields + +- monitoredClusters: Specify the clusters you want to monitor. If left empty, all clusters created in the current namespace will be monitored. + +- serviceMonitorLabels: Define the selector labels for the ServiceMonitor. If not specified, the TigerGraph Operator will attempt to detect them automatically. + +- tlsConfig: TLS configuration to use when scraping the target. + +- ruleSelectorLabels: Define the selector labels for the PrometheusRule. If not specified, the TigerGraph Operator will attempt to detect them automatically. + +- alertmanagerConfigLabels: Define the selector labels for the AlertmanagerConfig. If not specified, the TigerGraph Operator will attempt to detect them automatically. + +- releaseName: Deprecated as of version 1.7.0. Use serviceMonitorLabels instead. + +- prometheusRule: PrometheusRule contains specification parameters for a Rule. 
+ +- alertmanagerConfig: AlertmanagerConfig is a specification of the desired behavior of the Alertmanager configuration. + +For detailed configuration of these sub-fields, please see the [API Reference](../08-reference/api-reference.md). ### Manage TigerGraph Monitor using kubectl-tg plugin -You can manage a TigerGraph Monitor CR by subcommand `kubectl tg monitor` of kubectl-tg plugin. +You can manage a TigerGraph Monitor CR by subcommand kubectl tg monitor of kubectl-tg plugin. ```bash -kubectl tg monitor --help +$ kubectl tg monitor --help Manage TigerGraph monitor Examples: # create a TigerGraph monitor with name and namespace kubectl tg monitor create --name tigergraph-monitor -n tigergraph + # create a TigerGraph monitor with name and namespace and service monitor labels + kubectl tg monitor create --name tigergraph-monitor -n tigergraph --service-monitor-labels prometheus.io/monitor=true,prometheus.io/scrape=true + # create a TigerGraph monitor with TLS configuration + kubectl tg monitor create --name tigergraph-monitor -n tigergraph --tls-config tls-config.yaml # update a TigerGraph monitor kubectl tg monitor update --name tigergraph-monitor -r new-release-name -n tigergraph # delete a TigerGraph monitor with name and namespace @@ -157,30 +250,188 @@ Options: --monitored-clusters: specify the clusters to be monitored, if not set, monitor all clusters. Separate multiple clusters with commas, e.g. cluster1,cluster2. Set it to null if you want to remove it. -r|--kube-prometheus-stack-release-name : - specify release name of kube-prometheus-stack deployment + specify release name of kube-prometheus-stack deployment. Deprecated, please use --service-monitor-labels instead. + --service-monitor-labels : + specify the labels of service monitor, your input should be like 'prometheus.io/monitor=true,prometheus.io/scrape=true'. + Set it to null if you want to empty the labels. + --prometheus-rule-labels : + specify the labels of prometheus rule, your input should be like 'prometheus.io/monitor=true,prometheus.io/scrape=true'. + Set it to null if you want to empty the labels. + --prometheus-rule: + give a YAML file to specify the prometheus rules. + --alertmanager-config-labels : + specify the labels of alertmanager config, your input should be like 'prometheus.io/monitor=true,prometheus.io/scrape=true'. + Set it to null if you want to empty the labels. + --alertmanager-config: + give a YAML file to specify the alertmanager configs. + --tls-config: + give a YAML file to specify the TLS configuration for ServiceMonitor endpoints. ``` -#### Create a TigerGraph monitor +#### Basic Monitoring Configuration ```bash -kubectl tg monitor create --name ${TG_MONITOR_NAME} -n ${NAMESPACE} +export TG_MONITOR_NAME=tigergraph-monitor +export namespace=tigergraph +# Create basic monitoring +kubectl tg monitor create \ + --name ${TG_MONITOR_NAME} \ + --namespace ${NAMESPACE} \ + --service-monitor-labels "prometheus.io/monitor=true" +``` + +#### Advanced Monitoring Configuration + +Prepare a YAML file that includes the definitions for your `prometheusRule`, `alertmanagerConfig` and `tlsConfig`. Those YAML files will be passed to the `--prometheus-rule`, `--alertmanager-config` and `--tls-config` options. 
+ +Below is an illustrative example of a prometheusRule, alertmanagerConfig and tlsConfig YAML files: + +[prometheus-rules.yaml](../10-samples/monitoring/prometheus-rules.yaml) + +```yaml +prometheusRule: + groups: + - name: tigergraph.cpu.alerts + rules: + - alert: TigerGraphHighCPUUsage + expr: max(tigergraph_cpu_usage{service_name=""}) by (cluster_name, namespace, service_name, host_id) > 80 + for: 5m + labels: + severity: warning + service: tigergraph + annotations: + summary: "High CPU Usage Detected" + description: "CPU usage on host {{ $labels.host_id }} of cluster {{ $labels.cluster_name }} in namespace {{ $labels.namespace }} is high ({{ $value }}%)" + + - alert: TigerGraphCriticalCPUUsage + expr: max(tigergraph_cpu_usage{service_name=""}) by (cluster_name, namespace, service_name, host_id) > 90 + for: 3m + labels: + severity: critical + service: tigergraph + annotations: + summary: "Critical CPU Usage Detected" + description: "CPU usage on host {{ $labels.host_id }} of cluster {{ $labels.cluster_name }} in namespace {{ $labels.namespace }} is critically high ({{ $value }}%)" +``` + +alertmanager-configs.yaml + +```yaml +alertmanagerConfig: + route: + groupBy: ["job", "alertname"] + groupWait: 30s + groupInterval: 5m + repeatInterval: 1m + receiver: "slack-receiver" + routes: + - receiver: "slack-receiver" + continue: true + receivers: + - name: "slack-receiver" + slackConfigs: + - sendResolved: true + apiURL: + name: slack-webhook-url + key: webhook-url + channel: "#operator-monitoring-test" + color: '{{ if eq .Status "firing" }}danger{{ else }}good{{ end }}' + text: |- + {{ range .Alerts }} + *Alert:* {{ .Annotations.summary }} - `{{ .Labels.severity }}` + *Description:* {{ .Annotations.description }} + *Details:* + {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}` + {{ end }} + {{ end }} +``` + +tls-config.yaml + +```yaml +tlsConfig: + ca: + secret: + name: tigergraph-metrics-server-cert + key: ca.crt + cert: + secret: + name: tigergraph-metrics-server-cert + key: tls.crt + keySecret: + name: tigergraph-metrics-server-cert + key: tls.key + insecureSkipVerify: false +``` + +> [!IMPORTANT] +> When configuring alertmanagerConfig, you must also ​manually create a Kubernetes Secret​ to enable authentication for external notification services such as ​Email​ or ​Slack. +> +> This Secret typically contains sensitive credentials (e.g., API tokens, SMTP credentials) required for Alertmanager to send notifications to these services. Without the Secret, Alertmanager won't be able to establish connections to external providers. 
+ +Secret for Email: + +```YAML +apiVersion: v1 +kind: Secret +metadata: + name: alertmanager-auth-secret # Name of the Secret +type: Opaque +data: + password: YOUR_PASSWORD_BASE64_ENCODED +``` + +Secret for Slack: + +```YAML +apiVersion: v1 +kind: Secret +metadata: + name: slack-webhook-url +type: Opaque +stringData: + webhook-url: https://hooks.slack.com/services/YOUR_SLACK_WEBHOOK_URL +``` + +Create TigerGraph monitor CR using kubectl tg plugin: + +```bash +kubectl tg monitor create \ + --name ${TG_MONITOR_NAME} \ + --namespace ${NAMESPACE} \ + --monitored-clusters "cluster1,cluster2" \ + --service-monitor-labels "prometheus.io/monitor=true" \ + --prometheus-rule-labels "prometheus.io/rules=true" \ + --prometheus-rule prometheus-rules.yaml \ + --alertmanager-config-labels "alertmanager.io/config=true" \ + --alertmanager-config alertmanager-config.yaml \ + --tls-config tls-config.yaml ``` When the TigerGraph Monitor CR is successfully created, it automatically configures Prometheus to scrape the TigerGraph metrics endpoints specified in the `ServiceMonitor` CR. Additionally, it sets up a default Grafana dashboard based on the configuration defined in a `ConfigMap`. > [!NOTE] -> By default, the TigerGraph Monitor CR monitors all TigerGraph clusters in the current namespace. To monitor specific TigerGraph clusters, specify them by the option `--monitored-clusters` +> By default, the TigerGraph Monitor CR monitors all TigerGraph clusters in the current namespace. To monitor specific TigerGraph clusters, specify them by the option `--monitored-clusters`. > [!WARNING] -> If the TigerGraph Operator is namespace-scoped, you must manually specify the release name of the kube-prometheus-stack by the option `-r|--kube-prometheus-stack-release-name`. This ensures the monitoring controller can correctly identify it, especially when the kube-prometheus-stack is deployed in a different namespace from the TigerGraph clusters. +> If the TigerGraph Operator is namespace-scoped, you must manually specify the options `--service-monitor-labels`, `--prometheus-rule-labels` and `--alertmanager-config`. This ensures the monitoring controller can correctly identify it, especially when the kube-prometheus-stack is deployed in a different namespace from the TigerGraph clusters. 
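+After the monitor CR is created, a quick optional sanity check is to confirm that the generated resources carry the labels you specified, so the selectors configured in Prometheus and Alertmanager will match them. The resource name below is a placeholder; take the real names from the list output:
+
+```bash
+# Resources generated by the TigerGraph Monitoring Controller in the cluster namespace
+kubectl get servicemonitor,prometheusrule,alertmanagerconfig -n ${NAMESPACE}
+
+# Inspect the labels on one of the generated resources
+kubectl get servicemonitor ${GENERATED_SERVICE_MONITOR_NAME} -n ${NAMESPACE} -o yaml | yq .metadata.labels
+```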
-#### Update a TigerGraph monitor +Update TigerGraph monitor CR using kubectl tg plugin: ```bash -kubectl tg monitor update --name ${TG_MONITOR_NAME} --monitored-clusters ${MONITORED_CLUSTERS} -r ${RELEASE_NAME} -n ${NAMESPACE} +kubectl tg monitor update \ + --name ${TG_MONITOR_NAME} \ + --namespace ${NAMESPACE} \ + --monitored-clusters "cluster1,cluster2" \ + --service-monitor-labels "prometheus.io/monitor=true" \ + --prometheus-rule-labels "prometheus.io/rules=true" \ + --prometheus-rule prometheus-rules.yaml \ + --alertmanager-config-labels "alertmanager.io/config=true" \ + --alertmanager-config alertmanager-config.yaml \ + --tls-config tls-config.yaml ``` -#### Delete a TigerGraph monitor +Delete a TigerGraph monitor: ```bash kubectl tg monitor delete --name ${TG_MONITOR_NAME} -n ${NAMESPACE} @@ -188,105 +439,350 @@ kubectl tg monitor delete --name ${TG_MONITOR_NAME} -n ${NAMESPACE} ### Manage TigerGraph Monitor using CR -You can use the following example TigerGraph Monitor CR YAML resource to manage the TigerGraph Monitor: +#### Basic Configuration -```YAML +```yaml apiVersion: graphdb.tigergraph.com/v1alpha1 kind: TigerGraphMonitor metadata: - name: ${TG_MONITOR_NAME} - namespace: ${NAMESPACE} + name: tigergraph-monitor + namespace: tigergraph spec: monitoredClusters: - test-cluster1 - test-cluster2 - releaseName: ${RELEASE_NAME} - + serviceMonitorLabels: + prometheus.io/monitor: "true" ``` > [!NOTE] -> If you don't specify `spec.monitoredClusters`, the TigerGraph Monitor CR monitors all TigerGraph clusters in the current namespace. +> By default, the TigerGraph Monitor CR monitors all TigerGraph clusters in the current namespace. To monitor specific TigerGraph clusters, specify them by the option `--monitored-clusters`. > [!WARNING] -> If the TigerGraph Operator is namespace-scoped, you must manually specify the release name of the kube-prometheus-stack in the field `spec.releaseName`. This ensures the monitoring controller can correctly identify it, especially when the kube-prometheus-stack is deployed in a different namespace from the TigerGraph clusters. +> If the TigerGraph Operator is namespace-scoped, you must manually specify the option `--service-monitor-labels`. This ensures the monitoring controller can correctly identify it, especially when the kube-prometheus-stack is deployed in a different namespace from the TigerGraph clusters. -## Access Grafana dashboard +#### Complete Configuration Example -By default, Prometheus and Grafana provisioned by kube-prometheus-stack are exposed via a LoadBalancer. 
To find the external IPs for Prometheus and Grafana, run the following command: +```yaml +apiVersion: graphdb.tigergraph.com/v1alpha1 +kind: TigerGraphMonitor +metadata: + name: tigergraph-monitor + namespace: tigergraph +spec: + # Specify clusters to monitor (optional, monitors all if not specified) + monitoredClusters: + - test-cluster1 + - test-cluster2 + + # Service Monitor labels for Prometheus discovery + serviceMonitorLabels: + prometheus.io/monitor: "true" + prometheus.io/scrape: "true" + app.kubernetes.io/component: "monitoring" + + # Prometheus Rule labels + ruleSelectorLabels: + prometheus.io/rules: "true" + app.kubernetes.io/component: "monitoring" + + # Prometheus Rules configuration + prometheusRule: + groups: + - name: tigergraph.cpu.alerts + rules: + - alert: TigerGraphHighCPUUsage + expr: max(tigergraph_cpu_usage{service_name=""}) by (cluster_name, namespace, service_name, host_id) > 80 + for: 5m + labels: + severity: warning + service: tigergraph + annotations: + summary: "High CPU Usage Detected" + description: "CPU usage on host {{ $labels.host_id }} of cluster {{ $labels.cluster_name }} in namespace {{ $labels.namespace }} is high ({{ $value }}%)" + + - alert: TigerGraphCriticalCPUUsage + expr: max(tigergraph_cpu_usage{service_name=""}) by (cluster_name, namespace, service_name, host_id) > 90 + for: 3m + labels: + severity: critical + service: tigergraph + annotations: + summary: "Critical CPU Usage Detected" + description: "CPU usage on host {{ $labels.host_id }} of cluster {{ $labels.cluster_name }} in namespace {{ $labels.namespace }} is critically high ({{ $value }}%)" + + + # Alertmanager Configuration labels + alertmanagerConfigLabels: + alertmanager.io/config: "true" + app.kubernetes.io/component: "monitoring" + + # Alertmanager Configuration + alertmanagerConfig: + route: + groupBy: ["job", "alertname"] + groupWait: 30s + groupInterval: 5m + repeatInterval: 1m + receiver: "slack-receiver" + routes: + - receiver: "slack-receiver" + continue: true + receivers: + - name: "slack-receiver" + slackConfigs: + - sendResolved: true + apiURL: + name: slack-webhook-url + key: webhook-url + channel: "#operator-monitoring-test" + color: '{{ if eq .Status "firing" }}danger{{ else }}good{{ end }}' + text: |- + {{ range .Alerts }} + *Alert:* {{ .Annotations.summary }} - `{{ .Labels.severity }}` + *Description:* {{ .Annotations.description }} + *Details:* + {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}` + {{ end }} + {{ end }} + + # TLS Configuration for ServiceMonitor endpoints + tlsConfig: + ca: + secret: + name: tigergraph-metrics-server-cert + key: ca.crt + cert: + secret: + name: tigergraph-metrics-server-cert + key: tls.crt + keySecret: + name: tigergraph-metrics-server-cert + key: tls.key + insecureSkipVerify: false +``` -```bash -kubectl get services -n ${MONITORING_NAMESPACE} +## Advanced Configuration + +### Service Monitor Labels + +Service Monitor labels are used by Prometheus to discover and scrape metrics from TigerGraph clusters. + +```yaml +serviceMonitorLabels: + prometheus.io/monitor: "true" + prometheus.io/scrape: "true" + app.kubernetes.io/component: "monitoring" ``` -Look for the external IP and port number of Prometheus and Grafana in the output. An example output is below: +> [!WARNING] +> If the TigerGraph Operator is namespace-scoped, you must manually specify the field `serviceMonitorLabels`. 
This ensures the monitoring controller can correctly identify it, especially when the kube-prometheus-stack is deployed in a different namespace from the TigerGraph clusters. -```bash -kubectl get services -n ${MONITORING_NAMESPACE} +### Prometheus Rules Selector -NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE -monitoring-stack-grafana LoadBalancer 34.118.225.63 104.198.48.129 8081:32767/TCP 10m -monitoring-stack-kube-prom-operator ClusterIP 34.118.231.29 443/TCP 10m -monitoring-stack-kube-prom-prometheus LoadBalancer 34.118.226.87 34.28.30.58 9090:31174/TCP 10m -monitoring-stack-kube-state-metrics ClusterIP 34.118.231.76 8080/TCP 10m -monitoring-stack-prometheus-node-exporter ClusterIP 34.118.231.219 9100/TCP 10m -prometheus-operated ClusterIP None 9090/TCP 10m +Prometheus Rules Selector Labels: Define the selector labels for the PrometheusRule. If not specified, the TigerGraph Operator will attempt to detect them automatically. + +```yaml +ruleSelectorLabels: + prometheus.io/rules: "true" + app.kubernetes.io/component: "monitoring" ``` -Take the above output as an example: you can access the Grafana dashboard by visiting https://104.198.48.129:8081 and the Prometheus web UI by visiting http://34.28.30.58:9090 in a web browser. +> [!WARNING] +> If the TigerGraph Operator is namespace-scoped, you must manually specify the field `ruleSelectorLabels`. This ensures the monitoring controller can correctly identify it, especially when the kube-prometheus-stack is deployed in a different namespace from the TigerGraph clusters. + +### Prometheus Rules + +Prometheus rules define alerting and recording rules for monitoring TigerGraph clusters. + +```yaml +prometheusRule: + groups: + - name: tigergraph.rules + rules: + # CPU usage alert + - alert: TigerGraphHighCPUUsage + expr: tigergraph_cpu_usage > 80 + for: 5m + labels: + severity: warning + annotations: + summary: "High CPU usage detected" + description: "TigerGraph cluster {{ $labels.cluster }} has high CPU usage" + + # Memory usage alert + - alert: TigerGraphHighMemoryUsage + expr: tigergraph_memory_usage > 85 + for: 5m + labels: + severity: warning + annotations: + summary: "High memory usage detected" + description: "TigerGraph cluster {{ $labels.cluster }} has high memory usage" + + # Query latency alert + - alert: TigerGraphHighQueryLatency + expr: tigergraph_query_latency_seconds > 10 + for: 2m + labels: + severity: critical + annotations: + summary: "High query latency detected" + description: "TigerGraph cluster {{ $labels.cluster }} has high query latency" +``` -The default Grafana dashboard is named `TigerGraph Dashboard` under `General` folder. +A default set of Prometheus alerting rules is provided based on key TigerGraph metrics. You can customize this configuration to meet your specific requirements. -> [!NOTE] -> You can edit the existing dashboard or create your own directly through the Grafana UI. Any changes made via the UI will be persisted by the Grafana service. +[TigerGraph-Prometheus-alert-rules-example](../10-samples/monitoring/tigergraph-alert-rules.yaml) -## Uninstall Prometheus and Grafana +### AlertManager Config Selector -To uninstall Prometheus and Grafana, run the following command: +AlertManager Config Selector Labels: Define the selector labels for the AlertManager Config. If not specified, the TigerGraph Operator will attempt to detect them automatically. 
-```bash -kubectl tg monitoring-stack delete -r ${RELEASE_NAME} -n ${MONITORING_NAMESPACE} +```yaml +alertmanagerConfigLabels: + alertmanager.io/config: "true" + app.kubernetes.io/component: "monitoring" ``` -This removes all the Kubernetes components associated with the chart and deletes the release. +> [!WARNING] +> If the TigerGraph Operator is namespace-scoped, you must manually specify the field `alertmanagerConfigLabels`. This ensures the monitoring controller can correctly identify it, especially when the kube-prometheus-stack is deployed in a different namespace from the TigerGraph clusters. + +### AlertManager Configuration + +AlertManager configuration defines how alerts are routed and sent. + +```yaml +alertmanagerConfig: + route: + groupBy: ["job", "alertname"] + groupWait: 30s + groupInterval: 5m + repeatInterval: 1m + receiver: "slack-receiver" + routes: + - receiver: "slack-receiver" + continue: true + receivers: + - name: "slack-receiver" + slackConfigs: + - sendResolved: true + apiURL: + name: slack-webhook-url + key: webhook-url + channel: "#operator-monitoring-test" + color: '{{ if eq .Status "firing" }}danger{{ else }}good{{ end }}' + text: |- + {{ range .Alerts }} + *Alert:* {{ .Annotations.summary }} - `{{ .Labels.severity }}` + *Description:* {{ .Annotations.description }} + *Details:* + {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}` + {{ end }} + {{ end }} +``` -CRDs created by this chart are not removed by default and should be manually cleaned up: +> [!IMPORTANT] +> When configuring alertmanagerConfig, you must also ​manually create a Kubernetes Secret​ to enable authentication for external notification services such as ​Email​ or ​Slack. +> +> This Secret typically contains sensitive credentials (e.g., API tokens, SMTP credentials) required for Alertmanager to send notifications to these services. Without the Secret, Alertmanager won't be able to establish connections to external providers. -```bash -kubectl delete crd alertmanagerconfigs.monitoring.coreos.com -kubectl delete crd alertmanagers.monitoring.coreos.com -kubectl delete crd podmonitors.monitoring.coreos.com -kubectl delete crd probes.monitoring.coreos.com -kubectl delete crd prometheusagents.monitoring.coreos.com -kubectl delete crd prometheuses.monitoring.coreos.com -kubectl delete crd prometheusrules.monitoring.coreos.com -kubectl delete crd scrapeconfigs.monitoring.coreos.com -kubectl delete crd servicemonitors.monitoring.coreos.com -kubectl delete crd thanosrulers.monitoring.coreos.com +Secret for Email: + +```YAML +apiVersion: v1 +kind: Secret +metadata: + name: alertmanager-auth-secret # Name of the Secret +type: Opaque +data: + password: YOUR_PASSWORD_BASE64_ENCODED ``` -Additional, if you specify the persistent volume claim for the Prometheus and Grafana, you also need to delete the persistent volume manually. 
You can use the following command to figure out the persistent volume claim name: +Secret for Slack: -```bash -kubectl get pvc --namespace ${MONITORING_NAMESPACE} +```YAML +apiVersion: v1 +kind: Secret +metadata: + name: slack-webhook-url +type: Opaque +stringData: + webhook-url: https://hooks.slack.com/services/YOUR_SLACK_WEBHOOK_URL ``` -Example output: +You can use the following example YAML configurations to set up AlertmanagerConfig for Email or Slack alert delivery: + +- [Alertmanager-config-Email](../10-samples/monitoring/alertmanager-config-email.yaml) +- [Alertmanager-config-Slack](../10-samples/monitoring/alertmanager-config-slack.yaml) + +### TLS Configuration + +TLS configuration enables secure communication between Prometheus and TigerGraph clusters. + +```yaml +tlsConfig: + ca: + secret: + name: tigergraph-metrics-server-cert + key: ca.crt + cert: + secret: + name: tigergraph-metrics-server-cert + key: tls.crt + keySecret: + name: tigergraph-metrics-server-cert + key: tls.key + insecureSkipVerify: false +``` -```bash -NAME STATUS VOLUME CAPACITY ACCESS MODES STORAGECLASS VOLUMEATTRIBUTESCLASS AGE -monitoring-stack-grafana Bound pvc-80abe8dc-8925-4e58-80fc-104f1c31f6f9 5Gi RWO standard-rwo 3h31m -prometheus-monitoring-stack-kube-prom-prometheus-db-prometheus-monitoring-stack-kube-prom-prometheus-0 Bound pvc-c60a27e5-b524-494d-b2a5-917bd433147c 50Gi RWO standard-rwo 3h31m +You can easily manage your SSL certificate files with a Kubernetes Secret using cert-manager, the YAML configuration example is below: + +[Generate-SSL-Certificate-with-cert-manager](../10-samples/monitoring/tigergraph-certificate-with-certmanager.yaml) + +To skip SSL verification in test environments, you can apply the following configuration: + +```yaml +tlsConfig: + insecureSkipVerify: true ``` -Delete the persistent volume claim of the Prometheus and Grafana with the following command: +## Access Monitoring Interface -> [!IMPORTANT] -> Please ensure that you no longer need the data from Prometheus and Grafana before deleting the persistent volume claims. +By default, Prometheus and Grafana provisioned by kube-prometheus-stack are exposed via a LoadBalancer. To find the external IPs for Prometheus and Grafana, run the following command: ```bash -kubectl delete pvc ${PVC_NAME} --namespace ${MONITORING_NAMESPACE} +kubectl get services -n ${MONITORING_NAMESPACE} ``` +Look for the external IP and port number of Prometheus and Grafana in the output. 
An example output is below: + +```bash +kubectl get services -n ${MONITORING_NAMESPACE} + +NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE +NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE +alertmanager-operated ClusterIP None 9093/TCP,9094/TCP,9094/UDP 78m +prometheus-operated ClusterIP None 9090/TCP 78m +prometheus-stack-grafana LoadBalancer 34.118.226.123 104.197.160.82 8081:32555/TCP 78m +prometheus-stack-kube-prom-alertmanager LoadBalancer 34.118.234.203 34.136.148.38 9093:32590/TCP,8080:31369/TCP 78m +prometheus-stack-kube-prom-operator ClusterIP 34.118.228.250 443/TCP 78m +prometheus-stack-kube-prom-prometheus LoadBalancer 34.118.239.17 34.10.96.171 9090:32425/TCP,8080:30638/TCP 78m +prometheus-stack-kube-state-metrics ClusterIP 34.118.228.145 8080/TCP 78m +prometheus-stack-prometheus-node-exporter ClusterIP 34.118.227.115 9100/TCP 78m +``` + +Take the above output as an example, to access the monitoring tools, open a web browser and navigate to the following endpoints: + +- ​Grafana Dashboard: http://104.197.160.82:8081 +- Prometheus Web UI: http://34.10.96.171:9090 +- AlertManager UI: http://34.136.148.38:9093 + +The default Grafana dashboard for TigerGraph cluster and TigerGraph Operator is named `TigerGraph Dashboard` and `TigerGraph Kubernetes Controller Runtime Metrics` under `General` folder. + +> [!NOTE] +> You can edit the existing dashboard or create your own directly through the Grafana UI. Any changes made via the UI will be persisted by the Grafana service. + ## Troubleshooting ### Create TigerGraph monitor CR successfully but with warning events in TigerGraph monitor CR status @@ -416,3 +912,94 @@ Events: Normal EnsuringMonitorResources 2s TigerGraphMonitor Start ensuring monitoring resources Normal ReconcileSucceeded 2s TigerGraphMonitor Reconciliation succeeded ``` + +### Check the serviceMonitorSelector, ruleSelector, and alertmanagerConfigSelector + +You may encounter issues where the following Prometheus Custom Resources are created successfully but are not loaded into Prometheus or AlertManager: + +- **ServiceMonitor**: Defines which services to scrape for metrics +- **PrometheusRule**: Defines alerting and recording rules +- **AlertmanagerConfig**: Defines alert routing and notification settings + +To troubleshoot this issue, you can check the selectors of these resources and verify if they are configured correctly: + +```bash +# Check ServiceMonitor selector in Prometheus +kubectl get prometheus ${prometheus-instance-name} -n ${prometheus-installed-namespace} -o yaml | yq .spec.serviceMonitorSelector + +# Check PrometheusRule selector in Prometheus +kubectl get prometheus ${prometheus-instance-name} -n ${prometheus-installed-namespace} -o yaml | yq .spec.ruleSelector + +# Check AlertmanagerConfig selector in Alertmanager +kubectl get alertmanager ${alertmanager-instance-name} -n ${prometheus-installed-namespace} -o yaml | yq .spec.alertmanagerConfigSelector +``` + +**Common Issues and Solutions:** + +1. **Selector Mismatch**: Ensure the labels on your ServiceMonitor, PrometheusRule, or AlertmanagerConfig match the selectors configured in Prometheus/Alertmanager. + +2. **Namespace Issues**: Verify that the resources are in the correct namespace that Prometheus/Alertmanager is configured to watch. 
+ +**Example Verification:** + +```bash +# Check what ServiceMonitors exist +kubectl get servicemonitor --all-namespaces + +# Check the labels on a specific ServiceMonitor +kubectl get servicemonitor my-servicemonitor -n my-namespace -o yaml | yq .metadata.labels + +# check the labels on a specific PrometheusRule + +# check the labels on a specific AlertmanagerConfig + +# Verify Prometheus is watching the correct namespace +kubectl get prometheus my-prometheus -n my-namespace -o yaml | yq .spec.serviceMonitorNamespaceSelector +``` + +## Uninstall Monitoring Components + +### Uninstall Prometheus and Grafana + +```bash +# Uninstall monitoring stack +kubectl tg monitoring-stack delete \ + --kube-prometheus-stack-release-name monitoring-stack \ + --namespace monitoring-stack +``` + +### Clean Up CRDs + +```bash +# Delete monitoring CRDs (optional, affects other monitoring setups) +kubectl get crd -o name | grep monitoring.coreos.com | xargs kubectl delete +``` + +### Clean Up Persistent Storage + +```bash +# Delete PVCs (be careful, this will delete all monitoring data) +kubectl delete pvc -n monitoring-stack --all + +# Delete namespace +kubectl delete namespace monitoring-stack +``` + +> [!WARNING] +> Uninstalling monitoring components will delete all monitoring data and configurations. Make sure to backup any important data before proceeding. + +## TigerGraph Metrics Reference + +For a comprehensive reference of all TigerGraph metrics, including detailed descriptions, labels, and usage examples, see the [TigerGraph Metrics Reference](tigergraph-metrics-reference.md) document. + +This reference covers: + +- **CPU Metrics**: CPU usage, availability, and core counts +- **Memory Metrics**: Memory usage, availability, and utilization percentages +- **Disk Metrics**: Disk usage, I/O operations, and filesystem statistics +- **Network Metrics**: Connection counts and traffic patterns +- **Service Metrics**: Service health and status indicators +- **License Metrics**: License expiration tracking +- **Query Performance Metrics**: Latency, throughput, and completion rates + +The metrics reference also includes Prometheus query examples and alerting recommendations to help you build effective monitoring dashboards and alert rules. 
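
As a quick sanity check that these metrics are actually reaching Prometheus, you can query the Prometheus HTTP API directly. The sketch below is illustrative only: the endpoint is taken from the example service output above (substitute the external IP and port of your own Prometheus service), and the expressions use the `tigergraph_cpu_usage`, `tigergraph_memory_usage`, and `tigergraph_memory_total` metrics that the bundled alert rules are built on; adjust the label filters for your environment.

```bash
# Placeholder endpoint: replace with the external IP and port of your Prometheus service
PROMETHEUS_ENDPOINT=http://34.10.96.171:9090

# List per-host CPU usage reported by the TigerGraph clusters
curl -s "${PROMETHEUS_ENDPOINT}/api/v1/query" \
  --data-urlencode 'query=max(tigergraph_cpu_usage{service_name=""}) by (cluster_name, namespace, host_id)'

# Compute memory utilization (%) per host, similar to the expression used by the bundled memory alert rules
curl -s "${PROMETHEUS_ENDPOINT}/api/v1/query" \
  --data-urlencode 'query=(max(tigergraph_memory_usage{service_name=""}) by (cluster_name, namespace, host_id) / max(tigergraph_memory_total{service_name=""}) by (cluster_name, namespace, host_id)) * 100'
```

If these queries return empty results, revisit the ServiceMonitor selector and namespace checks described in the troubleshooting section above.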
diff --git a/k8s/docs/06-troubleshoot/cluster-deployment.md b/k8s/docs/06-troubleshoot/cluster-deployment.md index 9b7f29a41..fe9a2379c 100644 --- a/k8s/docs/06-troubleshoot/cluster-deployment.md +++ b/k8s/docs/06-troubleshoot/cluster-deployment.md @@ -162,7 +162,7 @@ kubectl describe pod test-cluster-0 -n tigergraph Containers: tg: Container ID: - Image: docker.io/tigergrah/tigergraph-k8s:4.2.11 + Image: docker.io/tigergrah/tigergraph-k8s:4.2.1 Image ID: Ports: 9000/TCP, 14240/TCP, 22/TCP Host Ports: 0/TCP, 0/TCP, 0/TCP @@ -221,11 +221,11 @@ kubectl describe pod test-cluster-0 -n tigergraph Normal Scheduled 2m38s default-scheduler Successfully assigned tigergraph/test-cluster-0 to tg-k8s-openshift-777-rdj74-worker-d-pvrm2 Normal SuccessfulAttachVolume 2m34s attachdetach-controller AttachVolume.Attach succeeded for volume "pvc-96c90faf-3019-416a-ace9-200502f67b65" Normal AddedInterface 2m30s multus Add eth0 [10.130.0.33/23] from openshift-sdn - Normal Pulling 71s (x4 over 2m29s) kubelet Pulling image "docker.io/tigergrah/tigergraph-k8s:4.2.11" - Warning Failed 71s (x4 over 2m29s) kubelet Failed to pull image "docker.io/tigergrah/tigergraph-k8s:4.2.11": rpc error: code = Unknown desc = reading manifest 3.8.5 in docker.io/tigergrah/tigergraph-k8s: manifest unknown: manifest unknown + Normal Pulling 71s (x4 over 2m29s) kubelet Pulling image "docker.io/tigergrah/tigergraph-k8s:4.2.1" + Warning Failed 71s (x4 over 2m29s) kubelet Failed to pull image "docker.io/tigergrah/tigergraph-k8s:4.2.1": rpc error: code = Unknown desc = reading manifest 3.8.5 in docker.io/tigergrah/tigergraph-k8s: manifest unknown: manifest unknown Warning Failed 71s (x4 over 2m29s) kubelet Error: ErrImagePull Warning Failed 59s (x6 over 2m29s) kubelet Error: ImagePullBackOff - Normal BackOff 44s (x7 over 2m29s) kubelet Back-off pulling image "docker.io/tigergrah/tigergraph-k8s:4.2.11" + Normal BackOff 44s (x7 over 2m29s) kubelet Back-off pulling image "docker.io/tigergrah/tigergraph-k8s:4.2.1" ``` Look for messages indicating issues with the image, such as `Error: ErrImagePull` You should correct the image version using the following command: @@ -267,7 +267,7 @@ kubectl describe pod test-cluster-0 -n tigergraph Controlled By: StatefulSet/test-cluster Containers: tg: - Image: docker.io/tigergrah/tigergraph-k8s:4.2.11 + Image: docker.io/tigergrah/tigergraph-k8s:4.2.1 Ports: 9000/TCP, 14240/TCP, 22/TCP Host Ports: 0/TCP, 0/TCP, 0/TCP Requests: diff --git a/k8s/docs/06-troubleshoot/cluster-management.md b/k8s/docs/06-troubleshoot/cluster-management.md index ee0267c47..84975f212 100644 --- a/k8s/docs/06-troubleshoot/cluster-management.md +++ b/k8s/docs/06-troubleshoot/cluster-management.md @@ -67,7 +67,7 @@ This document provides solutions for common issues that may arise during the man Image Pull Secrets: Name: tigergraph-image-pull-secret Init Job: - Image: docker.io/tigergrah/tigergraph-k8s-init:1.6.0 + Image: docker.io/tigergrah/tigergraph-k8s-init:1.7.0 Image Pull Policy: IfNotPresent Image Pull Secrets: Name: tigergraph-image-pull-secret diff --git a/k8s/docs/06-troubleshoot/kubectl-tg-plugin.md b/k8s/docs/06-troubleshoot/kubectl-tg-plugin.md index 80ee956c4..28065807c 100644 --- a/k8s/docs/06-troubleshoot/kubectl-tg-plugin.md +++ b/k8s/docs/06-troubleshoot/kubectl-tg-plugin.md @@ -13,7 +13,7 @@ The builtin commands in the MacOS terminal are not GNU commands, and they often ```bash > kubectl tg create --cluster-name test-cluster --private-key-secret ssh-key-secret \ - --version 4.1.0 --storage-class standard 
--storage-size 10G -n tigergraph \ + --version 4.2.1 --storage-class standard --storage-size 10G -n tigergraph \ --tigergraph-config "System.Backup.TimeoutSec=900,Controller.BasicConfig.LogConfig.LogFileMaxSizeMB=40" grep: invalid option -- P diff --git a/k8s/docs/06-troubleshoot/operator-installation.md b/k8s/docs/06-troubleshoot/operator-installation.md index b80f5aba1..4c5a6df50 100644 --- a/k8s/docs/06-troubleshoot/operator-installation.md +++ b/k8s/docs/06-troubleshoot/operator-installation.md @@ -17,17 +17,17 @@ In the following steps, we assume that the TigerGraph Operator has been installe ```bash kubectl get deployment tigergraph-operator-controller-manager -o wide -n tigergraph - NAME READY UP-TO-DATE AVAILABLE AGE CONTAINERS IMAGES SELECTOR - tigergraph-operator-controller-manager 1/1 1 1 22m manager,kube-rbac-proxy docker.io/tigergrah/tigergraph-k8s-operator:0.0.3,gcr.io/kubebuilder/kube-rbac-proxy:v0.8.0 control-plane=controller-manager + NAME READY UP-TO-DATE AVAILABLE AGE CONTAINERS IMAGES SELECTOR + tigergraph-operator-controller-manager 1/1 1 1 19d manager docker.io/tginternal/tigergraph-k8s-operator:1.7.0 control-plane=controller-manager ``` - From the output of the above command, you can figure out that the operator version is 0.0.3, docker.io/tigergrah/tigergraph-k8s-operator:0.0.3, you can also use the following helm command to get the current version of Operator: + From the output of the above command, you can figure out that the operator version is 1.7.0, docker.io/tigergrah/tigergraph-k8s-operator:1.7.3, you can also use the following helm command to get the current version of Operator: ```bash helm list -n tigergraph NAME NAMESPACE REVISION UPDATED STATUS CHART APP VERSION - tg-operator tigergraph 1 2023-02-26 13:16:15.701059001 +0000 UTC deployed tg-operator-0.0.3 + tg-operator tigergraph 1 2025-09-10 13:16:15.701059001 +0000 UTC deployed tg-operator-1.7.0 1.7.0 ``` diff --git a/k8s/docs/08-reference/api-reference.md b/k8s/docs/08-reference/api-reference.md index adaacd255..7805e2254 100644 --- a/k8s/docs/08-reference/api-reference.md +++ b/k8s/docs/08-reference/api-reference.md @@ -10,6 +10,8 @@ Resource Types:
  • TigerGraphBackupSchedule
  • +TigerGraphMonitor +
  • TigerGraphRestore
  • TigerGraph
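
The TigerGraphMonitor resource listed above is new in this release and can be illustrated with a minimal manifest. The sketch below is not part of the generated reference: the metadata name, namespace, cluster name, and label key/value are placeholders, and the fields correspond to the TigerGraphMonitorSpec documented later on this page; the serviceMonitorLabels value should match whatever your Prometheus serviceMonitorSelector is configured to select.

```yaml
apiVersion: graphdb.tigergraph.com/v1alpha1
kind: TigerGraphMonitor
metadata:
  name: tigergraph-monitor
  namespace: tigergraph
spec:
  # Names of the TigerGraph clusters to be monitored
  monitoredClusters:
    - test-cluster
  # Labels for the generated ServiceMonitors; placeholder value, align with your Prometheus serviceMonitorSelector
  serviceMonitorLabels:
    release: prometheus-stack
```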

    @@ -464,6 +466,20 @@ ClusterJobConfig such as MinRetryDuration, MaxRetryDuration and MaxRetryTimes

    + + +nginxConfig
    + + +NginxConfig + + + + +(Optional) +

    NginxConfig is used for customizing the configurations of Nginx

    + + @@ -752,6 +768,182 @@ TigerGraphBackupScheduleStatus +

    TigerGraphMonitor

    +

    +

    TigerGraphMonitor is the Schema for the tigergraphmonitors API

    +

    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    FieldDescription
    +apiVersion
    +string
    +graphdb.tigergraph.com/v1alpha1 +
    +kind
    +string +
    TigerGraphMonitor
    +metadata
    + + +Kubernetes meta/v1.ObjectMeta + + +
    +Refer to the Kubernetes API documentation for the fields of the +metadata field. +
    +spec
    + + +TigerGraphMonitorSpec + + +
    +
    +
    + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    +monitoredClusters
    + +[]string + +
    +(Optional) +

    MonitoredClusters holds the names of TigerGraph clusters to be monitored

    +
    +releaseName
    + +string + +
    +(Optional) +

    ReleaseName represents the release name of the kube-prometheus-stack deployed in the +Kubernetes cluster. +Deprecated: please use ServiceMonitorSelector instead.

    +
    +tlsConfig
    + + +github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1.TLSConfig + + +
    +(Optional) +

    TLS configuration to use when scraping the target.

    +
    +serviceMonitorLabels
    + +map[string]string + +
    +(Optional) +

    ServiceMonitorLabels select ServiceMonitors to be selected for target discovery by Prometheus.

    +
    +prometheusRule
    + + +github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1.PrometheusRuleSpec + + +
    +(Optional) +

    PrometheusRule contains specification parameters for a Rule.

    +
    +ruleSelectorLabels
    + +map[string]string + +
    +(Optional) +

    RuleSelectorLabels select PrometheusRules to be selected for target discovery by Prometheus.

    +
    +alertmanagerConfig
    + + +github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1alpha1.AlertmanagerConfigSpec + + +
    +(Optional) +

    AlertmanagerConfig is a specification of the desired behavior of the +Alertmanager configuration.

    +
    +alertmanagerConfigLabels
    + +map[string]string + +
    +(Optional) +

    AlertmanagerConfigLabels select AlertmanagerConfigs to be selected for target discovery by Prometheus.

    +
    +
    +status
    + + +TigerGraphMonitorStatus + + +
    +

    TigerGraphRestore

    TigerGraphRestore is the Schema for the tigergraphrestores API

    @@ -1020,9 +1212,29 @@ defined in the TigerGraphMonitor CR. Currently, these values include: -ReleaseName
    +ServiceMonitorSelectorLabels
    -string +map[string]string + + + + + + + +RuleSelectorLabels
    + +map[string]string + + + + + + + +AlertmanagerConfigSelectorLabels
    + +map[string]string @@ -1161,7 +1373,7 @@ bool (Optional) -

    [Preview] Enable incremental backup

    +

    Enable incremental backup

    @@ -1201,6 +1413,18 @@ string Available values are BestSpeed, DefaultCompression, BestCompression

    + + +baseBackup
    + +string + + + +(Optional) +

    Set the base backup tag for incremental backup

    + +

    BackupInfo

    @@ -1276,6 +1500,28 @@ string

    The version of the cluster where the backup package is created

    + + +baseBackup
    + +string + + + +

    The base backup tag of the incremental backup

    + + + + +pointInTimeRestoreStartTime
    + +string + + + +

    The point in time restore start time of the incremental backup

    + +

    BackupRestoreObjectType

    @@ -2222,6 +2468,52 @@ string +

    NginxConfig

    +

    +(Appears on: +TigerGraphSpec, +TigerGraphStatus) +

    +

    +

    + + + + + + + + + + + + + + + + + +
    FieldDescription
    +secretName
    + +string + +
    +(Optional) +

The secretName is the name of the secret used to configure TLS for Nginx. +If not set, SSL will be disabled.

    +
    +clientCertSecretName
    + +string + +
    +(Optional) +

The clientCertSecretName is the name of the secret used to configure the client certificate for Nginx. +If this field is set, mTLS will be enabled, +and Nginx will verify the client certificate of each request. +If this field is not set, mTLS will be disabled.

    +
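
For orientation, the NginxConfig block above is referenced from the TigerGraph spec as shown in this excerpt. It is a sketch, not a complete cluster manifest: the secret names are placeholders, and the referenced secrets must already exist in the cluster.

```yaml
apiVersion: graphdb.tigergraph.com/v1alpha1
kind: TigerGraph
metadata:
  name: test-cluster
  namespace: tigergraph
spec:
  nginxConfig:
    # Placeholder: secret with the TLS certificate and key for Nginx; omit to leave SSL disabled
    secretName: nginx-tls-secret
    # Placeholder: secret used to verify client certificates; setting it enables mTLS
    clientCertSecretName: nginx-client-ca-secret
```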

    RegionAware

    (Appears on: @@ -2291,7 +2583,7 @@ string

    Meta should be read from a file get by gadmin backup list –meta Meta contains the information of the backup package that you want to restore -One of Meta and tag should be specified

    +One of Meta, tag and timePoint should be specified

    @@ -2303,7 +2595,21 @@ string

    Tag is the actual tag of the backup package that you want to restore -One of Meta and tag should be specified

    +One of Meta, tag and timePoint should be specified

    + + + + +timePoint
    + +string + + + +(Optional) +

    TimePoint is the time point of the backup package that you want to restore, +it is represented in RFC3339 format. +One of Meta, tag and timePoint should be specified

    @@ -2517,7 +2823,9 @@ string tgDataSize
    + k8s.io/apimachinery/pkg/api/resource.Quantity + @@ -2528,7 +2836,9 @@ k8s.io/apimachinery/pkg/api/resource.Quantity additionalStorageSize
    + map[string]k8s.io/apimachinery/pkg/api/resource.Quantity + @@ -2986,9 +3296,8 @@ BackupInfo -

    TigerGraphMonitor

    +

    TigerGraphMonitorCondition

    -

    TigerGraphMonitor is the Schema for the tigergraphmonitors API

    @@ -3000,31 +3309,53 @@ BackupInfo + + +
    -metadata
    +ConditionType
    - -Kubernetes meta/v1.ObjectMeta + +TigerGraphMonitorConditionType
    -Refer to the Kubernetes API documentation for the fields of the -metadata field.
    -spec
    +ConditionStatus
    - -TigerGraphMonitorSpec + +Kubernetes meta/v1.ConditionStatus
    -
    -
    +
    +

    TigerGraphMonitorConditionType

    +

    +(Appears on: +TigerGraphMonitorCondition) +

    +

    +

    +

    TigerGraphMonitorSpec

    +

    +(Appears on: +TigerGraphMonitor) +

    +

    +

    TigerGraphMonitorSpec defines the desired state of TigerGraphMonitor

    +

    + + + + + + + - -
    FieldDescription
    monitoredClusters
    @@ -3047,109 +3378,87 @@ string
    (Optional)

    ReleaseName represents the release name of the kube-prometheus-stack deployed in the -Kubernetes cluster.

    -
    +Kubernetes cluster. +Deprecated: please use ServiceMonitorSelector instead.

    -status
    +tlsConfig
    - -TigerGraphMonitorStatus + +github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1.TLSConfig +(Optional) +

    TLS configuration to use when scraping the target.

    - - -

    TigerGraphMonitorCondition

    -

    -

    - - - - - - - - - -
    FieldDescription
    -ConditionType
    +serviceMonitorLabels
    - -TigerGraphMonitorConditionType - +map[string]string
    +(Optional) +

    ServiceMonitorLabels select ServiceMonitors to be selected for target discovery by Prometheus.

    -ConditionStatus
    +prometheusRule
    - -Kubernetes meta/v1.ConditionStatus + +github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1.PrometheusRuleSpec
    +(Optional) +

    PrometheusRule contains specification parameters for a Rule.

    -

    TigerGraphMonitorConditionType

    -

    -(Appears on: -TigerGraphMonitorCondition) -

    -

    -

    -

    TigerGraphMonitorSpec

    -

    -(Appears on: -TigerGraphMonitor) -

    -

    -

    TigerGraphMonitorSpec defines the desired state of TigerGraphMonitor

    -

    - - - - + + - - @@ -3218,6 +3527,88 @@ string

    Dashboard is the MD5 hash of the current Grafana dashboard configuration

    + + + + + + + + + + + + + + + + + + + + + + + +
    FieldDescription +ruleSelectorLabels
    + +map[string]string + +
    +(Optional) +

    RuleSelectorLabels select PrometheusRules to be selected for target discovery by Prometheus.

    +
    -monitoredClusters
    +alertmanagerConfig
    -[]string + +github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1alpha1.AlertmanagerConfigSpec +
    (Optional) -

    MonitoredClusters holds the names of TigerGraph clusters to be monitored

    +

    AlertmanagerConfig is a specification of the desired behavior of the +Alertmanager configuration.

    -releaseName
    +alertmanagerConfigLabels
    -string +map[string]string
    (Optional) -

    ReleaseName represents the release name of the kube-prometheus-stack deployed in the -Kubernetes cluster.

    +

    AlertmanagerConfigLabels select AlertmanagerConfigs to be selected for target discovery by Prometheus.

    +serviceMonitorLabels
    + +map[string]string + +
    +(Optional) +

ServiceMonitorLabels represents the labels used to select ServiceMonitors +for target discovery by Prometheus.

    +
    +tlsConfig
    + + +github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1.TLSConfig + + +
    +(Optional) +

    TLS configuration to use when scraping the target.

    +
    +prometheusRule
    + + +github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1.PrometheusRuleSpec + + +
    +(Optional) +

    PrometheusRule contains specification parameters for a Rule that is actively added.

    +
    +ruleSelectorLabels
    + +map[string]string + +
    +(Optional) +

RuleSelectorLabels represents the labels used to select PrometheusRules +for target discovery by Prometheus.

    +
    +alertmanagerConfig
    + + +github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1alpha1.AlertmanagerConfigSpec + + +
    +(Optional) +

AlertmanagerConfig is a specification of the actual behavior of the +Alertmanager configuration.

    +
    +alertmanagerConfigLabels
    + +map[string]string + +
    +(Optional) +

AlertmanagerConfigLabels represents the labels used to select AlertmanagerConfigs +for target discovery by Prometheus.

    +

    TigerGraphRestoreSpec

    @@ -3801,6 +4192,20 @@ ClusterJobConfig such as MinRetryDuration, MaxRetryDuration and MaxRetryTimes

    + + +nginxConfig
    + + +NginxConfig + + + + +(Optional) +

    NginxConfig is used for customizing the configurations of Nginx

    + +

    TigerGraphStatus

    @@ -4068,6 +4473,30 @@ ClusterJobConfig

    Current configurations of cluster operations job

    + + +nginxConfig
    + + +NginxConfig + + + + +

    Current configurations of Nginx

    + + + + +nginxSecretHash
    + +string + + + +

    MD5 of current NginxConfig secret content used by TigerGraph

    + +

    TigerGraphStorage

    diff --git a/k8s/docs/08-reference/cluster-status-of-tigergraph.md b/k8s/docs/08-reference/cluster-status-of-tigergraph.md index 53baa6f0c..2e8da06b8 100644 --- a/k8s/docs/08-reference/cluster-status-of-tigergraph.md +++ b/k8s/docs/08-reference/cluster-status-of-tigergraph.md @@ -17,8 +17,8 @@ The output will be like this: ```bash NAME REPLICAS CLUSTER-SIZE CLUSTER-HA CLUSTER-VERSION SERVICE-TYPE CONDITION-TYPE CONDITION-STATUS AGE test-cluster0 3 3 2 docker.io/tigergraph/tigergraph-k8s:4.2.1 LoadBalancer Normal True 4d1h -test-cluster1 3 3 1 docker.io/tigergraph/tigergraph-k8s:4.1.3 Ingress InitializePost False 15m -test-cluster2 4 3 1 docker.io/tigergraph/tigergraph-k8s:4.1.3 NodePort ExpandPost Unknown 12h +test-cluster1 3 3 1 docker.io/tigergraph/tigergraph-k8s:4.2.1 Ingress InitializePost False 15m +test-cluster2 4 3 1 docker.io/tigergraph/tigergraph-k8s:4.2.1 NodePort ExpandPost Unknown 12h ``` You can also get the status of a specific cluster by running the following command: diff --git a/k8s/docs/08-reference/node-repaving-for-tigergraph-on-k8s.md b/k8s/docs/08-reference/node-repaving-for-tigergraph-on-k8s.md index be672552d..9355502f0 100644 --- a/k8s/docs/08-reference/node-repaving-for-tigergraph-on-k8s.md +++ b/k8s/docs/08-reference/node-repaving-for-tigergraph-on-k8s.md @@ -32,9 +32,6 @@ TigerGraph generally maintains high availability under both conditions, provided Note that some critical TigerGraph components—such as GSE, GSQL, and Kafka—may perform a leader switch during node repaving. In rare cases, this can cause brief service disruption. Therefore, upstream applications should implement retry logic with a minimum retry interval of 5 seconds. -> [!WARNING] -> When using TigerGraph 4.2.1, you may encounter rare cases where node repaving causes downtime longer than 5 seconds. If your workload requires strict high availability, we suggest trying these features in a test environment first. We plan to address these issues in the next release. - ## Prerequisites Ensure you have the following before proceeding: diff --git a/k8s/docs/09-release-notes/README.md b/k8s/docs/09-release-notes/README.md index 23624f818..5a11d1cc2 100644 --- a/k8s/docs/09-release-notes/README.md +++ b/k8s/docs/09-release-notes/README.md @@ -7,6 +7,7 @@ Those document describes the new features, improvements, bugfixes for all of Tig Please see the detailed documentation of each TigerGraph Operator version release notes as follows: +- [TigerGraph Operator 1.7.0](./operator-1.7.0.md) - [TigerGraph Operator 1.6.0](./operator-1.6.0.md) - [TigerGraph Operator 1.5.0](./operator-1.5.0.md) - [TigerGraph Operator 1.4.0](./operator-1.4.0.md) diff --git a/k8s/docs/09-release-notes/operator-1.7.0.md b/k8s/docs/09-release-notes/operator-1.7.0.md new file mode 100644 index 000000000..21bbe23fa --- /dev/null +++ b/k8s/docs/09-release-notes/operator-1.7.0.md @@ -0,0 +1,87 @@ +# TigerGraph Operator 1.7.0 Release notes + +## Overview + +**TigerGraph Operator 1.7.0** is now available, designed to work seamlessly with **TigerGraph version 4.3.0**. + +This release introduces significant new features, enhancements, and bug fixes, including: + +- Monitoring and Alerting Service Provisioning on the TigerGraph Operator. +- Customizing Prometheus rules and AlertManager configuration through the TigerGraph monitor CR. +- Creating default alerting rules and corresponding Grafana dashboards during Operator installation. +- Support exposing TigerGraph metrics to Prometheus when the SSL of the Nginx service is enabled. 
+- Support point-in-time restore in Kubernetes Operator. +- Optimize the Dependencies Management between the incremental backup and the full backup. +- Add support for enabling mTLS for TigerGraph on Kubernetes. +- Support configuring Nginx SSL by the TigerGraph Kubernetes operator. +- Enhance Error Handling for Cluster Operations with DB Freeze Mode. +- Skip the license status check in the readiness probe for TigerGraph versions that support keeping all services online after the license expires. + +For further details, see the sections below. + +> [!IMPORTANT] +> TigerGraph Operator has had a breaking change since version 1.0.0. If you are still using a version older than 1.0.0, it is strongly recommended that you upgrade to version 1.7.0. Versions older than 1.0.0 have been deprecated. + +### kubectl plugin installation + +To install the kubectl plugin for TigerGraph Operator 1.7.0, execute the following command: + +```bash +curl https://dl.tigergraph.com/k8s/1.7.0/kubectl-TigerGraph -o kubectl-TigerGraph +sudo install kubectl-TigerGraph /usr/local/bin/ +``` + +### TigerGraph Operator upgrading + +#### Upgrading from TigerGraph Operator 1.0.0+ to 1.7.0 + +There are no breaking changes in the Custom Resource Definitions (CRDs) for version 1.7.0 compared to versions 1.0.0 and above. If you are running Operator 1.0.0 or later, upgrade using the following command: + +> [!NOTE] +> There is currently no support for upgrading or deleting CRDs when upgrading or uninstalling the TigerGraph Operator due to the risk of unintentional data loss. It is necessary to upgrade TigerGraph CRDs manually for the operator version prior to 1.3.0. However, starting from Operator version 1.3.0, we use [Helm chart's pre-upgrade hook](https://helm.sh/docs/topics/charts_hooks/) to upgrade the CRDs automatically. You can ignore the first step if you upgrade the operator to version 1.3.0 or above. + +> [!IMPORTANT] +> Please ensure that you have installed the `kubectl-TigerGraph` version 1.7.0 before upgrading TigerGraph Operator to version 1.7.0. + +Ensure you have installed the correct version of kubectl-TigerGraph: + +```bash +kubectl TigerGraph version + +Version: 1.7.0 +Default version of TigerGraph cluster: 4.3.0 +``` + +Upgrade TigerGraph Operator using kubectl-TigerGraph plugin: + +```bash +kubectl TigerGraph upgrade --namespace ${YOUR_NAMESPACE_OF_OPERATOR} --operator-version 1.7.0 +``` + +#### Upgrading from TigerGraph Operator Versions Prior to 1.0.0 + +This TigerGraph Operator version upgrade introduces breaking changes if you are upgrading from TigerGraph Operator versions prior to 1.0.0. You need to upgrade the TigerGraph Operator, CRD, and the TigerGraph cluster following specific steps. + +Refer to the documentation [How to upgrade TigerGraph Kubernetes Operator](../04-manage/operator-upgrade.md) for details. + +## New features + +- Support Point-in-Time Restore in TigerGraph Operator. +- Add support for enabling mutual TLS for TigerGraph on Kubernetes. +- Support configuring Nginx SSL by the Kubernetes operator. +- Customizing Prometheus rules and configuring AlertManager alerts through TigerGraph monitor CR. +- Monitoring and Alerting Service Provisioning on the TigerGraph Operator. + +## Improvements + +- Support exposing TigerGraph metrics to Prometheus when the SSL of the Nginx service is enabled. +- Make sure the Webhook of the namespaced operator only handles requests from a specific namespace. +- Enhance Error Handling for Cluster Operations with DB Freeze Mode. 
+- Skip the license status check in the readiness probe for TigerGraph versions that support keeping all services online after the license expires. +- Optimize the Dependencies Management between the incremental backup and the full backup. +- Added support for installing TigerGraph Operator using Helm charts with a default values.yaml. + +## Bug Fixes + +- Recover the changed backup config before running the cleanup job. +- Remove the restore staging path from the restore job while it is retrying. diff --git a/k8s/docs/10-samples/backup-restore/backup-schedule-local.yaml b/k8s/docs/10-samples/backup-restore/backup-schedule-local.yaml index d3b29ec56..ebe62a849 100644 --- a/k8s/docs/10-samples/backup-restore/backup-schedule-local.yaml +++ b/k8s/docs/10-samples/backup-restore/backup-schedule-local.yaml @@ -23,7 +23,7 @@ spec: storage: local # Use this field if type is local local: - path: /home/tigergraph/tigergraph/data/backup + path: /home/tigergraph/backup # Configure the name of backup files and the path storing temporary files backupConfig: @@ -31,7 +31,7 @@ spec: # Note: this field is Required for TigerGraph Operator < v1.1.0 tag: daily # Optional - stagingPath: /home/tigergraph/tigergraph/data + stagingPath: /home/tigergraph/backup-staging # Optional :if incremental is true, incremental backup will be performed incremental: false # Optional diff --git a/k8s/docs/10-samples/backup-restore/backup-to-local.yaml b/k8s/docs/10-samples/backup-restore/backup-to-local.yaml index ec7482bf3..e0816577e 100644 --- a/k8s/docs/10-samples/backup-restore/backup-to-local.yaml +++ b/k8s/docs/10-samples/backup-restore/backup-to-local.yaml @@ -10,7 +10,7 @@ spec: storage: local # Use this field if type is local local: - path: /home/tigergraph/tigergraph/data/backup + path: /home/tigergraph/backup # Configure the name of backup files and the path storing temporary files backupConfig: diff --git a/k8s/docs/10-samples/backup-restore/incremental-backup-to-s3.yaml b/k8s/docs/10-samples/backup-restore/incremental-backup-to-s3.yaml new file mode 100644 index 000000000..b6f1a36d9 --- /dev/null +++ b/k8s/docs/10-samples/backup-restore/incremental-backup-to-s3.yaml @@ -0,0 +1,61 @@ +apiVersion: graphdb.tigergraph.com/v1alpha1 +kind: TigerGraphBackup +metadata: + name: test-incremental-backup-s3 +spec: + # Specify which cluster to backup in the SAME NAMESPACE as the backup job + clusterName: test-cluster + destination: + storage: s3Bucket + s3Bucket: + # Specify the name of the S3 bucket you want to use + bucketName: operator-backup + # Specify the Secret containing the S3 access key and secret access key + secretKeyName: s3-secret + # Optional: use roleARN instead of secretKeyName + # roleARN: arn:aws:iam::123456789012:role/role-name + + # Configure the name of backup files and the path storing temporary files + backupConfig: + # Optional: Set the tag of the backup, if not set, the tag will be the name of this CR + # Note: this field is Required for TigerGraph Operator < v1.1.0 + tag: s3 + # Optional: Set the path for temporary staging files + stagingPath: /home/tigergraph/tigergraph/data + # Optional: If 'incremental' is set to true, incremental backup will be performed + incremental: true + # Optional: Set the base backup tag for incremental backup, only valid when incremental is true + # Note: this field is supported from TigerGraph 4.2.0 and TigerGraph Operator >= v1.7.0 + base: s3-backup-2025-09-11T002136.959 + # Optional: Set the timeout value for the backup process (default is 18000 seconds) + timeout: 
18000 + # Optional: Specify the number of processes to use for compression (0 uses the number of CPU cores) + compressProcessNumber: 0 + # Optional: (Requires TigerGraph Operator >= v0.0.9 and TigerGraph >= v3.9.3) + # Choose the compression level for the backup: DefaultCompression/BestSpeed/BestCompression + compressLevel: DefaultCompression # Choose from DefaultCompression/BestSpeed/BestCompression + + # Optional: Set the policy for cleaning up backup package when deleting the backup CR + # Choose from Delete/Retain + # The default behavior is to retain the backup package. + # If you want to delete the backup package when deleting the backup CR, + # you can set the cleanPolicy to Delete. + # With Delete policy, + # TigerGraph Operator will create a backup-clean-job when the backup CR is deleted, + # to make sure that the backup package is removed before deleting the backup CR. + cleanPolicy: Delete + + # Optional: Set the retry policy for backup CR + backoffRetryPolicy: + # set maxRetryTimes for backup CR + maxRetryTimes: 3 + # set the min duration between two retries, + # the format is like "5s","10m","1h","1h20m5s" + minRetryDuration: 5s + # set the max duration between two retries, + # the format is like "5s","10m","1h","1h20m5s" + maxRetryDuration: 10s + # If the value is true, the deletion of backup CR won't be blocked by failed backup-clean-job + # that means, when backup-clean-job exceeds the maxRetryTimes + # the backup CR will be deleted directly, the backup package still exists in cluster + forceDeleteAfterMaxRetries: false \ No newline at end of file diff --git a/k8s/docs/10-samples/backup-restore/restore-from-local.yaml b/k8s/docs/10-samples/backup-restore/restore-from-local.yaml index 76fa27b4e..ab4cf33e1 100644 --- a/k8s/docs/10-samples/backup-restore/restore-from-local.yaml +++ b/k8s/docs/10-samples/backup-restore/restore-from-local.yaml @@ -13,7 +13,7 @@ spec: source: storage: local local: - path: /home/tigergraph/tigergraph/data/backup + path: /home/tigergraph/backup # Specify the name of cluster clusterName: test-cluster diff --git a/k8s/docs/10-samples/backup-restore/restore-with-timepoint-to-s3.yaml b/k8s/docs/10-samples/backup-restore/restore-with-timepoint-to-s3.yaml new file mode 100644 index 000000000..673da2b9c --- /dev/null +++ b/k8s/docs/10-samples/backup-restore/restore-with-timepoint-to-s3.yaml @@ -0,0 +1,34 @@ +apiVersion: graphdb.tigergraph.com/v1alpha1 +kind: TigerGraphRestore +metadata: + name: restore-with-timepoint-from-s3 +spec: + restoreConfig: + # Optional: Set the tag of the backup package that you want to restore + # Note: Only one of tag, meta, or timePoint can be specified + tag: '' + # Optional: Set the time point with time format (e.g., 2025-01-15T14:30:00Z) + # Note: requires TigerGraph >= 4.2.0 and TigerGraph Operator >= v1.7.0 + timePoint: '2025-01-15T14:30:00Z' + stagingPath: /home/tigergraph/tigergraph/data/restore-staging + # Optional: (TigerGraph Operator>=v0.0.9 and TigerGraph>=v3.9.3) should be >=0 + decompressProcessNumber: 2 + source: + storage: s3Bucket + s3Bucket: + # specify the bucket you want to use + bucketName: operator-backup + secretKeyName: s3-secret + # Specify the name of cluster + clusterName: test-cluster + + # Optional: Set the retry policy for restore CR + backoffRetryPolicy: + # set maxRetryTimes for restore CR + maxRetryTimes: 3 + # set the min duration between two retries, + # the format is like "5s","10m","1h","1h20m5s" + minRetryDuration: 5s + # set the max duration between two retries, + # the format is like 
"5s","10m","1h","1h20m5s" + maxRetryDuration: 10s \ No newline at end of file diff --git a/k8s/docs/10-samples/deploy/additional-storage-of-sidecar.yaml b/k8s/docs/10-samples/deploy/additional-storage-of-sidecar.yaml index d75564836..d94412a7d 100644 --- a/k8s/docs/10-samples/deploy/additional-storage-of-sidecar.yaml +++ b/k8s/docs/10-samples/deploy/additional-storage-of-sidecar.yaml @@ -5,10 +5,10 @@ metadata: namespace: tigergraph spec: ha: 2 - image: docker.io/tigergraph/tigergraph-k8s:4.2.0 + image: docker.io/tigergraph/tigergraph-k8s:4.1.0 imagePullPolicy: IfNotPresent imagePullSecrets: - - name: tigergraph-image-pull-secret + - name: tigergraph-image-pull-secret license: ${YOUR_LICENSE} listener: type: LoadBalancer @@ -22,7 +22,7 @@ spec: type: persistent-claim volumeClaimTemplate: accessModes: - - ReadWriteOnce + - ReadWriteOnce resources: requests: storage: 100G @@ -35,34 +35,34 @@ spec: accessMode: ReadWriteMany volumeMode: Filesystem sidecarContainers: - - args: # sidecar will execute this - - /bin/sh - - -c - - | - while true; do - echo "$(date) INFO hello from main-container" >> /tg-sidecar/myapp.log ; - sleep 1; - done - image: alpine:3.17.2 - name: sidecar-container # name of sidecar - readinessProbe: # check if the sidecar is ready - exec: - command: - - sh - - -c - - if [[ -f /tg-sidecar/myapp.log ]];then exit 0; else exit 1;fi - initialDelaySeconds: 10 - periodSeconds: 5 - resources: - requests: # request resouces for sidecar - cpu: 500m - memory: 512Mi - limits: # limit resources - cpu: 500m - memory: 512Mi - env: # inject the environment you need - - name: CLUSTER_NAME - value: test-cluster - volumeMounts: - - mountPath: /tg-sidecar - name: tg-sidecar + - args: # sidecar will execute this + - /bin/sh + - -c + - | + while true; do + echo "$(date) INFO hello from main-container" >> /tg-sidecar/myapp.log ; + sleep 1; + done + image: alpine:3.17.2 + name: sidecar-container # name of sidecar + readinessProbe: # check if the sidecar is ready + exec: + command: + - sh + - -c + - if [[ -f /tg-sidecar/myapp.log ]];then exit 0; else exit 1;fi + initialDelaySeconds: 10 + periodSeconds: 5 + resources: + requests: # request resouces for sidecar + cpu: 500m + memory: 512Mi + limits: # limit resources + cpu: 500m + memory: 512Mi + env: # inject the environment you need + - name: CLUSTER_NAME + value: test-cluster + volumeMounts: + - mountPath: /tg-sidecar + name: tg-sidecar \ No newline at end of file diff --git a/k8s/docs/10-samples/deploy/additional-storages-of-kafka-tglogs-backup.yaml b/k8s/docs/10-samples/deploy/additional-storages-of-kafka-tglogs-backup.yaml index 340dd351d..6c0bd04c3 100644 --- a/k8s/docs/10-samples/deploy/additional-storages-of-kafka-tglogs-backup.yaml +++ b/k8s/docs/10-samples/deploy/additional-storages-of-kafka-tglogs-backup.yaml @@ -5,10 +5,10 @@ metadata: namespace: tigergraph spec: ha: 2 - image: docker.io/tigergraph/tigergraph-k8s:4.2.0 + image: docker.io/tigergraph/tigergraph-k8s:4.1.0 imagePullPolicy: IfNotPresent imagePullSecrets: - - name: tigergraph-image-pull-secret + - name: tigergraph-image-pull-secret license: ${YOUR_LICENSE} listener: type: LoadBalancer @@ -22,7 +22,7 @@ spec: type: persistent-claim volumeClaimTemplate: accessModes: - - ReadWriteOnce + - ReadWriteOnce resources: requests: storage: 100G @@ -38,4 +38,4 @@ spec: storageClassName: standard mountPath: /home/tigergraph/backup accessMode: ReadWriteOnce - volumeMode: Filesystem + volumeMode: Filesystem \ No newline at end of file diff --git 
a/k8s/docs/10-samples/deploy/apply-topology-spread-constraint-between-multiple-tigergraph-clusters.yaml b/k8s/docs/10-samples/deploy/apply-topology-spread-constraint-between-multiple-tigergraph-clusters.yaml index a41ec0d89..003b2e2c0 100644 --- a/k8s/docs/10-samples/deploy/apply-topology-spread-constraint-between-multiple-tigergraph-clusters.yaml +++ b/k8s/docs/10-samples/deploy/apply-topology-spread-constraint-between-multiple-tigergraph-clusters.yaml @@ -5,10 +5,10 @@ metadata: namespace: tigergraph spec: ha: 2 - image: docker.io/tigergraph/tigergraph-k8s:4.2.0 + image: docker.io/tigergraph/tigergraph-k8s:4.1.0 imagePullPolicy: IfNotPresent imagePullSecrets: - - name: tigergraph-image-pull-secret + - name: tigergraph-image-pull-secret license: ${YOUR_LICENSE} listener: type: LoadBalancer @@ -22,7 +22,7 @@ spec: type: persistent-claim volumeClaimTemplate: accessModes: - - ReadWriteOnce + - ReadWriteOnce resources: requests: storage: 100G @@ -33,22 +33,22 @@ spec: regionAware: enable: true topologySpreadConstraints: - - maxSkew: 1 - topologyKey: topology.kubernetes.io/zone - whenUnsatisfiable: DoNotSchedule #ScheduleAnyway - labelSelector: - matchLabels: - tigergraph.com/across-cluster-pod: across-tg-cluster - matchLabelKeys: - - pod-template-hash - - maxSkew: 1 - topologyKey: "kubernetes.io/hostname" - whenUnsatisfiable: DoNotSchedule #ScheduleAnyway - labelSelector: - matchLabels: - tigergraph.com/across-cluster-pod: across-tg-cluster - matchLabelKeys: - - pod-template-hash + - maxSkew: 1 + topologyKey: topology.kubernetes.io/zone + whenUnsatisfiable: DoNotSchedule #ScheduleAnyway + labelSelector: + matchLabels: + tigergraph.com/across-cluster-pod: across-tg-cluster + matchLabelKeys: + - pod-template-hash + - maxSkew: 1 + topologyKey: "kubernetes.io/hostname" + whenUnsatisfiable: DoNotSchedule #ScheduleAnyway + labelSelector: + matchLabels: + tigergraph.com/across-cluster-pod: across-tg-cluster + matchLabelKeys: + - pod-template-hash --- apiVersion: graphdb.tigergraph.com/v1alpha1 kind: TigerGraph @@ -57,10 +57,10 @@ metadata: namespace: tigergraph spec: ha: 2 - image: docker.io/tigergraph/tigergraph-k8s:4.2.0 + image: docker.io/tigergraph/tigergraph-k8s:4.1.0 imagePullPolicy: IfNotPresent imagePullSecrets: - - name: tigergraph-image-pull-secret + - name: tigergraph-image-pull-secret license: ${YOUR_LICENSE} listener: type: LoadBalancer @@ -74,7 +74,7 @@ spec: type: persistent-claim volumeClaimTemplate: accessModes: - - ReadWriteOnce + - ReadWriteOnce resources: requests: storage: 100G @@ -85,19 +85,19 @@ spec: regionAware: enable: true topologySpreadConstraints: - - maxSkew: 1 - topologyKey: topology.kubernetes.io/zone - whenUnsatisfiable: DoNotSchedule #ScheduleAnyway - labelSelector: - matchLabels: - tigergraph.com/across-cluster-pod: across-tg-cluster - matchLabelKeys: - - pod-template-hash - - maxSkew: 1 - topologyKey: "kubernetes.io/hostname" - whenUnsatisfiable: DoNotSchedule #ScheduleAnyway - labelSelector: - matchLabels: - tigergraph.com/across-cluster-pod: across-tg-cluster - matchLabelKeys: - - pod-template-hash + - maxSkew: 1 + topologyKey: topology.kubernetes.io/zone + whenUnsatisfiable: DoNotSchedule #ScheduleAnyway + labelSelector: + matchLabels: + tigergraph.com/across-cluster-pod: across-tg-cluster + matchLabelKeys: + - pod-template-hash + - maxSkew: 1 + topologyKey: "kubernetes.io/hostname" + whenUnsatisfiable: DoNotSchedule #ScheduleAnyway + labelSelector: + matchLabels: + tigergraph.com/across-cluster-pod: across-tg-cluster + matchLabelKeys: + - pod-template-hash \ 
No newline at end of file diff --git a/k8s/docs/10-samples/deploy/custom-volume-mount-path.yaml b/k8s/docs/10-samples/deploy/custom-volume-mount-path.yaml index 6d724b1e9..9f4276ef0 100644 --- a/k8s/docs/10-samples/deploy/custom-volume-mount-path.yaml +++ b/k8s/docs/10-samples/deploy/custom-volume-mount-path.yaml @@ -5,10 +5,10 @@ metadata: namespace: tigergraph spec: ha: 2 - image: docker.io/tigergraph/tigergraph-k8s:4.2.0 + image: docker.io/tigergraph/tigergraph-k8s:4.1.0 imagePullPolicy: IfNotPresent imagePullSecrets: - - name: tigergraph-image-pull-secret + - name: tigergraph-image-pull-secret license: ${YOUR_LICENSE} listener: type: LoadBalancer @@ -22,7 +22,7 @@ spec: type: persistent-claim volumeClaimTemplate: accessModes: - - ReadWriteOnce + - ReadWriteOnce resources: requests: storage: 100G @@ -34,4 +34,4 @@ spec: claimName: efs-claim # the pvc is created by the user customVolumeMounts: - name: custom-volume - mountPath: /custom-mount-path # the custom mount path of TigerGraph container + mountPath: /custom-mount-path # the custom mount path of TigerGraph container \ No newline at end of file diff --git a/k8s/docs/10-samples/deploy/region-awareness-with-multiple-topology-spread-constraints.yaml b/k8s/docs/10-samples/deploy/region-awareness-with-multiple-topology-spread-constraints.yaml index 2d72d3d4b..df790e3bf 100644 --- a/k8s/docs/10-samples/deploy/region-awareness-with-multiple-topology-spread-constraints.yaml +++ b/k8s/docs/10-samples/deploy/region-awareness-with-multiple-topology-spread-constraints.yaml @@ -5,10 +5,10 @@ metadata: namespace: tigergraph spec: ha: 2 - image: docker.io/tigergraph/tigergraph-k8s:4.2.0 + image: docker.io/tigergraph/tigergraph-k8s:4.1.0 imagePullPolicy: IfNotPresent imagePullSecrets: - - name: tigergraph-image-pull-secret + - name: tigergraph-image-pull-secret license: ${YOUR_LICENSE} listener: type: LoadBalancer @@ -22,7 +22,7 @@ spec: type: persistent-claim volumeClaimTemplate: accessModes: - - ReadWriteOnce + - ReadWriteOnce resources: requests: storage: 100G @@ -31,19 +31,19 @@ spec: regionAware: enable: true topologySpreadConstraints: - - maxSkew: 1 - topologyKey: topology.kubernetes.io/zone - whenUnsatisfiable: DoNotSchedule #ScheduleAnyway - labelSelector: - matchLabels: - tigergraph.com/cluster-pod: test-tg-cluster - matchLabelKeys: - - pod-template-hash - - maxSkew: 1 - topologyKey: "kubernetes.io/hostname" - whenUnsatisfiable: DoNotSchedule #ScheduleAnyway - labelSelector: - matchLabels: - tigergraph.com/cluster-pod: test-tg-cluster - matchLabelKeys: - - pod-template-hash + - maxSkew: 1 + topologyKey: topology.kubernetes.io/zone + whenUnsatisfiable: DoNotSchedule #ScheduleAnyway + labelSelector: + matchLabels: + tigergraph.com/cluster-pod: test-tg-cluster + matchLabelKeys: + - pod-template-hash + - maxSkew: 1 + topologyKey: "kubernetes.io/hostname" + whenUnsatisfiable: DoNotSchedule #ScheduleAnyway + labelSelector: + matchLabels: + tigergraph.com/cluster-pod: test-tg-cluster + matchLabelKeys: + - pod-template-hash \ No newline at end of file diff --git a/k8s/docs/10-samples/deploy/region-awareness-with-topology-spread-constraint-and-node-affinity.yaml b/k8s/docs/10-samples/deploy/region-awareness-with-topology-spread-constraint-and-node-affinity.yaml index 5e41ca7ce..569af1ff7 100644 --- a/k8s/docs/10-samples/deploy/region-awareness-with-topology-spread-constraint-and-node-affinity.yaml +++ b/k8s/docs/10-samples/deploy/region-awareness-with-topology-spread-constraint-and-node-affinity.yaml @@ -5,10 +5,10 @@ metadata: namespace: 
tigergraph spec: ha: 2 - image: docker.io/tigergraph/tigergraph-k8s:4.2.0 + image: docker.io/tigergraph/tigergraph-k8s:4.1.0 imagePullPolicy: IfNotPresent imagePullSecrets: - - name: tigergraph-image-pull-secret + - name: tigergraph-image-pull-secret license: ${YOUR_LICENSE} listener: type: LoadBalancer @@ -22,7 +22,7 @@ spec: type: persistent-claim volumeClaimTemplate: accessModes: - - ReadWriteOnce + - ReadWriteOnce resources: requests: storage: 100G @@ -31,31 +31,31 @@ spec: regionAware: enable: true topologySpreadConstraints: - - maxSkew: 1 - topologyKey: topology.kubernetes.io/zone - whenUnsatisfiable: DoNotSchedule #ScheduleAnyway - labelSelector: - matchLabels: - tigergraph.com/cluster-pod: test-tg-cluster - matchLabelKeys: - - pod-template-hash - - maxSkew: 1 - topologyKey: "kubernetes.io/hostname" - whenUnsatisfiable: DoNotSchedule #ScheduleAnyway - labelSelector: - matchLabels: - tigergraph.com/cluster-pod: test-tg-cluster - matchLabelKeys: - - pod-template-hash + - maxSkew: 1 + topologyKey: topology.kubernetes.io/zone + whenUnsatisfiable: DoNotSchedule #ScheduleAnyway + labelSelector: + matchLabels: + tigergraph.com/cluster-pod: test-tg-cluster + matchLabelKeys: + - pod-template-hash + - maxSkew: 1 + topologyKey: "kubernetes.io/hostname" + whenUnsatisfiable: DoNotSchedule #ScheduleAnyway + labelSelector: + matchLabels: + tigergraph.com/cluster-pod: test-tg-cluster + matchLabelKeys: + - pod-template-hash affinityConfiguration: affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: nodeSelectorTerms: - - matchExpressions: - - key: topology.kubernetes.io/zone - operator: In - values: - - us-east-2a - - us-east-2b - - us-east-2c + - matchExpressions: + - key: topology.kubernetes.io/zone + operator: In + values: + - us-east-2a + - us-east-2b + - us-east-2c \ No newline at end of file diff --git a/k8s/docs/10-samples/deploy/region-awareness-with-topology-spread-constraint.yaml b/k8s/docs/10-samples/deploy/region-awareness-with-topology-spread-constraint.yaml index d5f6c3431..f53d5d2ec 100644 --- a/k8s/docs/10-samples/deploy/region-awareness-with-topology-spread-constraint.yaml +++ b/k8s/docs/10-samples/deploy/region-awareness-with-topology-spread-constraint.yaml @@ -5,10 +5,10 @@ metadata: namespace: tigergraph spec: ha: 2 - image: docker.io/tigergraph/tigergraph-k8s:4.2.0 + image: docker.io/tigergraph/tigergraph-k8s:4.1.0 imagePullPolicy: IfNotPresent imagePullSecrets: - - name: tigergraph-image-pull-secret + - name: tigergraph-image-pull-secret license: ${YOUR_LICENSE} listener: type: LoadBalancer @@ -29,11 +29,11 @@ spec: enable: true topologyKey: topology.kubernetes.io/zone topologySpreadConstraints: - - maxSkew: 1 - topologyKey: topology.kubernetes.io/zone - whenUnsatisfiable: DoNotSchedule #ScheduleAnyway - labelSelector: - matchLabels: - tigergraph.com/cluster-pod: tg-test-cluster - matchLabelKeys: - - pod-template-hash + - maxSkew: 1 + topologyKey: topology.kubernetes.io/zone + whenUnsatisfiable: DoNotSchedule #ScheduleAnyway + labelSelector: + matchLabels: + tigergraph.com/cluster-pod: tg-test-cluster + matchLabelKeys: + - pod-template-hash \ No newline at end of file diff --git a/k8s/docs/10-samples/deploy/service-of-sidecar-ingress-type.yaml b/k8s/docs/10-samples/deploy/service-of-sidecar-ingress-type.yaml index 28ccb2770..79702603e 100644 --- a/k8s/docs/10-samples/deploy/service-of-sidecar-ingress-type.yaml +++ b/k8s/docs/10-samples/deploy/service-of-sidecar-ingress-type.yaml @@ -4,7 +4,7 @@ metadata: name: ingress-type-service-of-sidecar spec: ha: 2 - 
image: docker.io/tigergraph/tigergraph-k8s:4.2.0 + image: docker.io/tigergraph/tigergraph-k8s:4.1.0 imagePullPolicy: IfNotPresent imagePullSecrets: - name: tigergraph-image-pull-secret @@ -33,19 +33,20 @@ spec: annotations: key2: value2 listenerPorts: - - name: httpd80 - port: 80 - ingressRule: - host: your.domain.hostname - path: / - pathType: Prefix + - name: httpd80 + port: 80 + ingressRule: + host: your.domain.hostname + path: / + pathType: Prefix sidecarContainers: - - image: httpd:2.4 - name: httpd80 - resources: - requests: - cpu: 500m - memory: 512Mi - limits: - cpu: 500m - memory: 512Mi + - image: httpd:2.4 + name: httpd80 + resources: + requests: + cpu: 500m + memory: 512Mi + limits: + cpu: 500m + memory: 512Mi + diff --git a/k8s/docs/10-samples/deploy/service-of-sidecar-loadbanalance-type.yaml b/k8s/docs/10-samples/deploy/service-of-sidecar-loadbanalance-type.yaml index c3b616bdd..8382ee3b1 100644 --- a/k8s/docs/10-samples/deploy/service-of-sidecar-loadbanalance-type.yaml +++ b/k8s/docs/10-samples/deploy/service-of-sidecar-loadbanalance-type.yaml @@ -4,7 +4,7 @@ metadata: name: loadbalancer-type-service-of-sidecar spec: ha: 2 - image: docker.io/tigergraph/tigergraph-k8s:4.2.0 + image: docker.io/tigergraph/tigergraph-k8s:4.1.0 imagePullPolicy: IfNotPresent imagePullSecrets: - name: tigergraph-image-pull-secret @@ -32,15 +32,16 @@ spec: annotations: key2: value2 listenerPorts: - - name: httpd80 - port: 80 + - name: httpd80 + port: 80 sidecarContainers: - - image: httpd:2.4 - name: httpd80 - resources: - requests: - cpu: 500m - memory: 512Mi - limits: - cpu: 500m - memory: 512Mi + - image: httpd:2.4 + name: httpd80 + resources: + requests: + cpu: 500m + memory: 512Mi + limits: + cpu: 500m + memory: 512Mi + diff --git a/k8s/docs/10-samples/deploy/service-of-sidecar-nodeport-type.yaml b/k8s/docs/10-samples/deploy/service-of-sidecar-nodeport-type.yaml index c4e061394..faa995201 100644 --- a/k8s/docs/10-samples/deploy/service-of-sidecar-nodeport-type.yaml +++ b/k8s/docs/10-samples/deploy/service-of-sidecar-nodeport-type.yaml @@ -4,7 +4,7 @@ metadata: name: nodeport-type-service-of-sidecar spec: ha: 2 - image: docker.io/tigergraph/tigergraph-k8s:4.2.0 + image: docker.io/tigergraph/tigergraph-k8s:4.1.0 imagePullPolicy: IfNotPresent imagePullSecrets: - name: tigergraph-image-pull-secret @@ -32,16 +32,17 @@ spec: annotations: key2: value2 listenerPorts: - - name: httpd80 - port: 80 - nodePort: 30080 + - name: httpd80 + port: 80 + nodePort: 30080 sidecarContainers: - - image: httpd:2.4 - name: httpd80 - resources: - requests: - cpu: 500m - memory: 512Mi - limits: - cpu: 500m - memory: 512Mi + - image: httpd:2.4 + name: httpd80 + resources: + requests: + cpu: 500m + memory: 512Mi + limits: + cpu: 500m + memory: 512Mi + diff --git a/k8s/docs/10-samples/deploy/tigergraph-cluster.yaml b/k8s/docs/10-samples/deploy/tigergraph-cluster.yaml index 9774d49bf..f43336d7c 100644 --- a/k8s/docs/10-samples/deploy/tigergraph-cluster.yaml +++ b/k8s/docs/10-samples/deploy/tigergraph-cluster.yaml @@ -4,10 +4,10 @@ metadata: name: test-cluster namespace: tigergraph spec: - image: docker.io/tigergraph/tigergraph-k8s:4.2.0 + image: docker.io/tigergraph/tigergraph-k8s:4.1.0 imagePullPolicy: IfNotPresent imagePullSecrets: - - name: tigergraph-image-pull-secret + - name: tigergraph-image-pull-secret ha: 2 license: ${YOUR_LICENSE} listener: @@ -25,9 +25,9 @@ spec: type: persistent-claim volumeClaimTemplate: accessModes: - - ReadWriteOnce + - ReadWriteOnce resources: requests: storage: 100G storageClassName: standard 
- volumeMode: Filesystem + volumeMode: Filesystem \ No newline at end of file diff --git a/k8s/docs/10-samples/manage/custom-script-job.yaml b/k8s/docs/10-samples/manage/custom-script-job.yaml new file mode 100644 index 000000000..d2592114b --- /dev/null +++ b/k8s/docs/10-samples/manage/custom-script-job.yaml @@ -0,0 +1,161 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: custom-script-job + namespace: tigergraph +spec: + template: + metadata: {} + spec: + containers: + - name: script-runner + image: docker.io/tigergraph/tigergraph-k8s-init:1.6.0 + imagePullPolicy: IfNotPresent + command: + - /bin/bash + - -c + - | + set -eo pipefail + PRIVATE_KEY_FILE=/etc/private-key-volume/tigergraph_rsa + SERVICE_NAME=${CLUSTER_NAME}-internal-service + + echo "Copying script to cluster..." + scp -i $PRIVATE_KEY_FILE -o StrictHostKeyChecking=no -P ${SSH_PORT} \ + /tmp/custom-script/custom-script.sh \ + tigergraph@${CLUSTER_NAME}-0.${SERVICE_NAME}.${NAMESPACE}:/home/tigergraph/custom-script.sh > /dev/null + + echo "Making script executable..." + ssh -i $PRIVATE_KEY_FILE -o StrictHostKeyChecking=no \ + -p ${SSH_PORT} tigergraph@${CLUSTER_NAME}-0.${SERVICE_NAME}.${NAMESPACE} \ + "chmod +x /home/tigergraph/custom-script.sh" + + echo "Running script in cluster..." + ssh -i $PRIVATE_KEY_FILE -o StrictHostKeyChecking=no \ + -p ${SSH_PORT} tigergraph@${CLUSTER_NAME}-0.${SERVICE_NAME}.${NAMESPACE} < person.csv + name,gender,age,state + Tom,male,40,ca + Dan,male,34,ny + Jenny,female,25,tx + Kevin,male,28,az + Amily,female,22,ca + Nancy,female,20,ky + Jack,male,26,fl + EOF + + cat << EOF > friendship.csv + person1,person2,date + Tom,Dan,2017-06-03 + Tom,Jenny,2015-01-01 + Dan,Jenny,2016-08-03 + Jenny,Amily,2015-06-08 + Dan,Nancy,2016-01-03 + Nancy,Jack,2017-03-02 + Dan,Kevin,2015-12-30 + EOF + + cat << EOF > gsql101.gsql + BEGIN + CREATE VERTEX person ( + PRIMARY_ID name STRING, + name STRING, + age INT, + gender STRING, + state STRING + ) + END + + CREATE UNDIRECTED EDGE friendship (FROM person, TO person, connect_day DATETIME) + + CREATE GRAPH social (person, friendship) + + USE GRAPH social + BEGIN + CREATE LOADING JOB load_social FOR GRAPH social { + DEFINE FILENAME file1="/home/tigergraph/person.csv"; + DEFINE FILENAME file2="/home/tigergraph/friendship.csv"; + + LOAD file1 TO VERTEX person VALUES ($"name", $"name", $"age", $"gender", $"state") USING header="true", separator=","; + LOAD file2 TO EDGE friendship VALUES (\$0, \$1, \$2) USING header="true", separator=","; + } + END + RUN LOADING JOB load_social + + CREATE QUERY hello(VERTEX p) { + Start = {p}; + Result = SELECT tgt + FROM Start:s-(friendship:e) ->person:tgt; + PRINT Result; + } + + INSTALL QUERY hello + + RUN QUERY hello("Tom") + EOF + + gsql /home/tigergraph/gsql101.gsql + + echo "Custom GSQL operations completed successfully" \ No newline at end of file diff --git a/k8s/docs/10-samples/monitoring/alertmanager-config-email.yaml b/k8s/docs/10-samples/monitoring/alertmanager-config-email.yaml new file mode 100644 index 000000000..540dbfd0f --- /dev/null +++ b/k8s/docs/10-samples/monitoring/alertmanager-config-email.yaml @@ -0,0 +1,32 @@ +apiVersion: monitoring.coreos.com/v1alpha1 +kind: AlertmanagerConfig +metadata: + name: prometheus-alertmanager-config +spec: + route: + receiver: alert-email-pagerduty-config + groupBy: ['alertname', 'job','severity'] + groupWait: 30s + groupInterval: 5m + repeatInterval: 15m + continue: true + receivers: + - name: alert-email-pagerduty-config + emailConfigs: + - to: your-email@gmail.com + sendResolved: 
true + from: your-email@gmail.com + smarthost: smtp.gmail.com:465 + authUsername: your-email@gmail.com + authPassword: + name: alertmanager-auth-secret + key: password + requireTLS: false +--- +apiVersion: v1 +kind: Secret +metadata: + name: alertmanager-auth-secret # Name of the Secret +type: Opaque +data: + password: YOUR_PASSWORD_BASE64_ENCODED \ No newline at end of file diff --git a/k8s/docs/10-samples/monitoring/alertmanager-config-slack.yaml b/k8s/docs/10-samples/monitoring/alertmanager-config-slack.yaml new file mode 100644 index 000000000..3651b8481 --- /dev/null +++ b/k8s/docs/10-samples/monitoring/alertmanager-config-slack.yaml @@ -0,0 +1,41 @@ +apiVersion: monitoring.coreos.com/v1alpha1 +kind: AlertmanagerConfig +metadata: + name: slack-config + labels: + alertmanager: config +spec: + route: + groupBy: ['alertname', 'job','severity'] + groupWait: 30s + groupInterval: 5m + repeatInterval: 1m + receiver: "slack-receiver" + routes: + - receiver: "slack-receiver" + continue: true + receivers: + - name: "slack-receiver" + slackConfigs: + - sendResolved: true + apiURL: + name: slack-webhook-url + key: webhook-url + channel: "#operator-monitoring-test" + color: '{{ if eq .Status "firing" }}danger{{ else }}good{{ end }}' + text: |- + {{ range .Alerts }} + *Alert:* {{ .Annotations.summary }} - `{{ .Labels.severity }}` + *Description:* {{ .Annotations.description }} + *Details:* + {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}` + {{ end }} + {{ end }} +--- +apiVersion: v1 +kind: Secret +metadata: + name: slack-webhook-url +type: Opaque +stringData: + webhook-url: https://hooks.slack.com/services/YOUR_SLACK_WEBHOOK_URL \ No newline at end of file diff --git a/k8s/docs/10-samples/monitoring/alertmanager-configs.yaml b/k8s/docs/10-samples/monitoring/alertmanager-configs.yaml new file mode 100644 index 000000000..9026e5746 --- /dev/null +++ b/k8s/docs/10-samples/monitoring/alertmanager-configs.yaml @@ -0,0 +1,27 @@ +alertmanagerConfig: + route: + groupBy: ["job", "alertname"] + groupWait: 30s + groupInterval: 5m + repeatInterval: 1m + receiver: "slack-receiver" + routes: + - receiver: "slack-receiver" + continue: true + receivers: + - name: "slack-receiver" + slackConfigs: + - sendResolved: true + apiURL: + name: slack-webhook-url + key: webhook-url + channel: "#operator-monitoring-test" + color: '{{ if eq .Status "firing" }}danger{{ else }}good{{ end }}' + text: |- + {{ range .Alerts }} + *Alert:* {{ .Annotations.summary }} - `{{ .Labels.severity }}` + *Description:* {{ .Annotations.description }} + *Details:* + {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}` + {{ end }} + {{ end }} \ No newline at end of file diff --git a/k8s/docs/10-samples/monitoring/prometheus-rules.yaml b/k8s/docs/10-samples/monitoring/prometheus-rules.yaml new file mode 100644 index 000000000..3ddf688cc --- /dev/null +++ b/k8s/docs/10-samples/monitoring/prometheus-rules.yaml @@ -0,0 +1,346 @@ +prometheusRule: + groups: + - name: tigergraph.cpu.alerts + rules: + - alert: TigerGraphHighCPUUsage + expr: max(tigergraph_cpu_usage{service_name=""}) by (cluster_name, namespace, service_name, host_id) > 80 + for: 5m + labels: + severity: warning + service: tigergraph + annotations: + summary: "High CPU Usage Detected" + description: "CPU usage on host {{ $labels.host_id }} of cluster {{ $labels.cluster_name }} in namespace {{ $labels.namespace }} is high ({{ $value }}%)" + + - alert: TigerGraphCriticalCPUUsage + expr: max(tigergraph_cpu_usage{service_name=""}) by (cluster_name, namespace, 
service_name, host_id) > 90 + for: 3m + labels: + severity: critical + service: tigergraph + annotations: + summary: "Critical CPU Usage Detected" + description: "CPU usage on host {{ $labels.host_id }} of cluster {{ $labels.cluster_name }} in namespace {{ $labels.namespace }} is critically high ({{ $value }}%)" + + - alert: TigerGraphServiceHighCPUUsage + expr: max(tigergraph_cpu_usage{service_name!=""}) by (cluster_name, namespace, service_name, host_id) > 70 + for: 5m + labels: + severity: warning + service: tigergraph + annotations: + summary: "High CPU Usage for {{ $labels.service_name }} Service" + description: "CPU usage for {{ $labels.service_name }} service on host {{ $labels.host_id }} of cluster {{ $labels.cluster_name }} in namespace {{ $labels.namespace }} is high ({{ $value }}%)" + + - name: tigergraph.memory.alerts + rules: + - alert: TigerGraphHighMemoryUsage + expr: (max(tigergraph_memory_usage{service_name=""}) by (cluster_name, namespace, service_name, host_id) / max(tigergraph_memory_total{service_name=""}) by (cluster_name, namespace, service_name, host_id)) * 100 > 80 + for: 5m + labels: + severity: warning + service: tigergraph + annotations: + summary: "High Memory Usage Detected" + description: "Memory usage on host {{ $labels.host_id }} of cluster {{ $labels.cluster_name }} in namespace {{ $labels.namespace }} is high ({{ $value | humanizePercentage }})" + + - alert: TigerGraphCriticalMemoryUsage + expr: (max(tigergraph_memory_usage{service_name=""}) by (cluster_name, namespace, service_name, host_id) / max(tigergraph_memory_total{service_name=""}) by (cluster_name, namespace, service_name, host_id)) * 100 > 90 + for: 3m + labels: + severity: critical + service: tigergraph + annotations: + summary: "Critical Memory Usage Detected" + description: "Memory usage on host {{ $labels.host_id }} of cluster {{ $labels.cluster_name }} in namespace {{ $labels.namespace }} is critically high ({{ $value | humanizePercentage }})" + + - alert: TigerGraphLowMemoryAvailable + expr: max(tigergraph_memory_available{}) by (cluster_name, namespace, host_id) < 1000 + for: 5m + labels: + severity: warning + service: tigergraph + annotations: + summary: "Low Memory Available" + description: "Available memory on host {{ $labels.host_id }} of cluster {{ $labels.cluster_name }} in namespace {{ $labels.namespace }} is low ({{ $value }} MB)" + + - name: tigergraph.disk.alerts + rules: + - alert: TigerGraphHighDiskUsage + expr: (max by(host_id, mount_point, path)(tigergraph_diskspace_usage) / (max by(host_id, mount_point, path)(tigergraph_diskspace_usage) + max by(host_id, mount_point, path) (tigergraph_diskspace_free))) * 100 > 80 + for: 5m + labels: + severity: warning + service: tigergraph + annotations: + summary: "High Disk Usage Detected" + description: "Disk usage for {{ $labels.path_name }}: {{ $labels.path }} on host {{ $labels.host_id }} of cluster {{ $labels.cluster_name }} in namespace {{ $labels.namespace }} is high ({{ $value | humanizePercentage }})" + + - alert: TigerGraphCriticalDiskUsage + expr: (max by(host_id, mount_point, path)(tigergraph_diskspace_usage) / (max by(host_id, mount_point, path)(tigergraph_diskspace_usage) + max by(host_id, mount_point, path) (tigergraph_diskspace_free))) * 100 > 90 + for: 3m + labels: + severity: critical + service: tigergraph + annotations: + summary: "Critical Disk Usage Detected" + description: "Disk usage for {{ $labels.path_name }}: {{ $labels.path }} on host {{ $labels.host_id }} of cluster {{ $labels.cluster_name }} in namespace {{ 
$labels.namespace }} is critically high ({{ $value | humanizePercentage }})" + + - alert: TigerGraphLowDiskSpace + expr: max(tigergraph_diskspace_free) by (cluster_name, namespace, path_name, host_id, path) < 1000 + for: 5m + labels: + severity: warning + service: tigergraph + annotations: + summary: "Low Disk Space" + description: "Free disk space for {{ $labels.path_name }}: {{ $labels.path }} on host {{ $labels.host_id }} of cluster {{ $labels.cluster_name }} in namespace {{ $labels.namespace }} is low ({{ $value }} MB)" + + - alert: TigerGraphLowDiskInodes + expr: max(tigergraph_disk_inode_free) by (cluster_name, namespace, host_id, path_name, path) < 100000 + for: 5m + labels: + severity: warning + service: tigergraph + annotations: + summary: "Low Disk Inodes" + description: "Free disk inodes for {{ $labels.path_name }}: {{ $labels.path }} on host {{ $labels.host_id }} of cluster {{ $labels.cluster_name }} in namespace {{ $labels.namespace }} is low ({{ $value }})" + + - name: tigergraph.network.alerts + rules: + - alert: TigerGraphHighNetworkConnections + expr: max(tigergraph_network_connections) by (cluster_name, namespace, host_id) > 2000 + for: 5m + labels: + severity: warning + service: tigergraph + annotations: + summary: "High Network Connections" + description: "Number of open TCP connections on host {{ $labels.host_id }} of cluster {{ $labels.cluster_name }} in namespace {{ $labels.namespace }} is high ({{ $value }})" + + - name: tigergraph.service.alerts + rules: + - alert: TigerGraphServiceDown + expr: max(tigergraph_service_status) by (cluster_name, namespace, service_name, host_id) == 27 + for: 1m + labels: + severity: critical + service: tigergraph + annotations: + summary: "TigerGraph Service Down" + description: "Service {{ $labels.service_name }} on host {{ $labels.host_id }} of cluster {{ $labels.cluster_name }} in namespace {{ $labels.namespace }} is down" + + - alert: TigerGraphServiceOffline + expr: max(tigergraph_service_status) by (cluster_name, namespace, service_name, host_id) == 24 + for: 1m + labels: + severity: critical + service: tigergraph + annotations: + summary: "TigerGraph Service Offline" + description: "Service {{ $labels.service_name }} on host {{ $labels.host_id }} of cluster {{ $labels.cluster_name }} in namespace {{ $labels.namespace }} is offline" + + - alert: TigerGraphServiceStopping + expr: max(tigergraph_service_status) by (cluster_name, namespace, service_name, host_id) == 21 + for: 1m + labels: + severity: critical + service: tigergraph + annotations: + summary: "TigerGraph Service Stopping" + description: "Service {{ $labels.service_name }} on host {{ $labels.host_id }} of cluster {{ $labels.cluster_name }} in namespace {{ $labels.namespace }} is stopping" + + - alert: TigerGraphServicePaused + expr: max(tigergraph_service_status) by (cluster_name, namespace, service_name, host_id) == 18 + for: 1m + labels: + severity: critical + service: tigergraph + annotations: + summary: "TigerGraph Service Paused" + description: "Service {{ $labels.service_name }} on host {{ $labels.host_id }} of cluster {{ $labels.cluster_name }} in namespace {{ $labels.namespace }} is paused" + + - alert: TigerGraphServiceStarting + expr: max(tigergraph_service_status) by (cluster_name, namespace, service_name, host_id) == 15 + for: 2m + labels: + severity: warning + service: tigergraph + annotations: + summary: "TigerGraph Service Starting" + description: "Service {{ $labels.service_name }} on host {{ $labels.host_id }} of cluster {{ $labels.cluster_name }} in 
namespace {{ $labels.namespace }} is starting" + + - alert: TigerGraphServiceReadonly + expr: max(tigergraph_service_status) by (cluster_name, namespace, service_name, host_id) == 12 + for: 1m + labels: + severity: warning + service: tigergraph + annotations: + summary: "TigerGraph Service Readonly" + description: "Service {{ $labels.service_name }} on host {{ $labels.host_id }} of cluster {{ $labels.cluster_name }} in namespace {{ $labels.namespace }} is in readonly mode" + + - alert: TigerGraphServiceWarmup + expr: max(tigergraph_service_status) by (cluster_name, namespace, service_name, host_id) == 9 + for: 2m + labels: + severity: warning + service: tigergraph + annotations: + summary: "TigerGraph Service Warmup" + description: "Service {{ $labels.service_name }} on host {{ $labels.host_id }} of cluster {{ $labels.cluster_name }} in namespace {{ $labels.namespace }} is in warmup state" + + - alert: TigerGraphServiceUnknown + expr: max(tigergraph_service_status) by (cluster_name, namespace, service_name, host_id) == 3 + for: 1m + labels: + severity: critical + service: tigergraph + annotations: + summary: "TigerGraph Service Unknown Status" + description: "Service {{ $labels.service_name }} on host {{ $labels.host_id }} of cluster {{ $labels.cluster_name }} in namespace {{ $labels.namespace }} has unknown status" + + - name: tigergraph.performance.alerts + rules: + - alert: TigerGraphHighEndpointLatency + expr: max(tigergraph_endpoint_latency{statistic="average_latency"}) by (namespace,cluster_name, endpoint, exported_endpoint, service_name)> 5000 + for: 5m + labels: + severity: warning + service: tigergraph + annotations: + summary: "High Endpoint Latency" + description: "Average latency for exported_endpoint {{ $labels.exported_endpoint }} of endpoint {{ $labels.endpoint }} on host {{ $labels.host_id }} of cluster {{ $labels.cluster_name }} in namespace {{ $labels.namespace }} is high ({{ $value }}ms)" + + - alert: TigerGraphCriticalEndpointLatency + expr: max(tigergraph_endpoint_latency{statistic="average_latency"}) by (namespace,cluster_name, endpoint, exported_endpoint, service_name) > 10000 + for: 3m + labels: + severity: critical + service: tigergraph + annotations: + summary: "Critical Endpoint Latency" + description: "Average latency for exported_endpoint {{ $labels.exported_endpoint }} of endpoint {{ $labels.endpoint }} on host {{ $labels.host_id }} is critically high ({{ $value }}ms)" + + - alert: TigerGraphHighQPS + expr: max(tigergraph_qps) by (namespace,cluster_name, endpoint, exported_endpoint, service_name, host_id) > 100 + for: 5m + labels: + severity: warning + service: tigergraph + annotations: + summary: "High QPS Detected" + description: "High QPS for exported_endpoint {{ $labels.exported_endpoint }} of endpoint {{ $labels.endpoint }} on host {{ $labels.host_id }} of cluster {{ $labels.cluster_name }} in namespace {{ $labels.namespace }} ({{ $value }})" + + - alert: TigerGraphEndpointTimeout + expr: max(tigergraph_endpoint_timeout) by (namespace,cluster_name, endpoint, exported_endpoint, service_name, host_id) > 0 + for: 1m + labels: + severity: warning + service: tigergraph + annotations: + summary: "Endpoint Timeout" + description: "Timeout occurred for exported_endpoint {{ $labels.exported_endpoint }} of endpoint {{ $labels.endpoint }} on host {{ $labels.host_id }} of cluster {{ $labels.cluster_name }} in namespace {{ $labels.namespace }} ({{ $value }} timeouts)" + + - name: tigergraph.system.alerts + rules: + - alert: TigerGraphLowCPUAvailable + expr: 
max(tigergraph_cpu_available) by (namespace,cluster_name, host_id) < 10 + for: 5m + labels: + severity: warning + service: tigergraph + annotations: + summary: "Low CPU Available" + description: "Available CPU on host {{ $labels.host_id }} of cluster {{ $labels.cluster_name }} in namespace {{ $labels.namespace }} is low ({{ $value }}%)" + + - alert: TigerGraphHighDiskIO + expr: max(tigergraph_disk_iops) by (namespace,cluster_name, path_name, host_id, path, mount_point) > 1000 + for: 5m + labels: + severity: warning + service: tigergraph + annotations: + summary: "High Disk I/O" + description: "High disk I/O for {{ $labels.path_name }} of mount point {{ $labels.mount_point }} on host {{ $labels.host_id }} of cluster {{ $labels.cluster_name }} in namespace {{ $labels.namespace }} ({{ $value }} IOPS)" + + - alert: TigerGraphHighDiskIOTime + expr: max(tigergraph_disk_io_time) by (namespace,cluster_name, path_name, host_id, path,mount_point) > 0.1 + for: 5m + labels: + severity: warning + service: tigergraph + annotations: + summary: "High Disk I/O Time" + description: "High disk I/O time for {{ $labels.path_name }} of mount point {{ $labels.mount_point }} on host {{ $labels.host_id }} of cluster {{ $labels.cluster_name }} in namespace {{ $labels.namespace }} ({{ $value }} hours)" + + - name: tigergraph.license.alerts + rules: + - alert: TigerGraphLicenseExpiringSoon + expr: min(tigergraph_license_days_left) by (cluster_name, namespace) <= 30 and min(tigergraph_license_days_left) by (cluster_name, namespace) > 7 + for: 1m + labels: + severity: warning + service: tigergraph + annotations: + summary: "TigerGraph License Expiring Soon" + description: "TigerGraph license for cluster {{ $labels.cluster_name }} in namespace {{ $labels.namespace }} will expire in {{ $value }} days. Please renew the license." + + - alert: TigerGraphLicenseExpiringCritical + expr: min(tigergraph_license_days_left) by (cluster_name, namespace) <= 7 and min(tigergraph_license_days_left) by (cluster_name, namespace) > 0 + for: 1m + labels: + severity: critical + service: tigergraph + annotations: + summary: "TigerGraph License Expiring Critical" + description: "TigerGraph license for cluster {{ $labels.cluster_name }} in namespace {{ $labels.namespace }} will expire in {{ $value }} days. URGENT: License renewal required immediately." + + - alert: TigerGraphLicenseExpired + expr: min(tigergraph_license_days_left) by (cluster_name, namespace) == 0 + for: 1m + labels: + severity: critical + service: tigergraph + annotations: + summary: "TigerGraph License Expired" + description: "TigerGraph license for cluster {{ $labels.cluster_name }} in namespace {{ $labels.namespace }} has expired. Service may be affected." + + - alert: TigerGraphLicenseInvalid + expr: min(tigergraph_license_days_left) by (cluster_name, namespace) == -1 + for: 1m + labels: + severity: critical + service: tigergraph + annotations: + summary: "TigerGraph License Invalid" + description: "TigerGraph license for cluster {{ $labels.cluster_name }} in namespace {{ $labels.namespace }} is invalid or corrupted. Please check license configuration." 
+ + - name: tigergraph.recording.rules + rules: + - record: tigergraph:cpu_usage_percentage + expr: max(tigergraph_cpu_usage{service_name=""}) by (cluster_name, namespace, service_name, host_id) + + - record: tigergraph:memory_usage_percentage + expr: ((max(tigergraph_memory_usage{service_name=""}) by (cluster_name, namespace, service_name, host_id)) / max(tigergraph_memory_total{service_name=""}) by (cluster_name, namespace, service_name, host_id)) * 100 + + - record: tigergraph:disk_usage_percentage + expr: (max(tigergraph_diskspace_usage) by(namespace,cluster_name,mount_point,path, path_name, host_id) / (max(tigergraph_diskspace_usage) by(namespace,cluster_name,mount_point,path, path_name, host_id) + max(tigergraph_diskspace_free) by(namespace,cluster_name,mount_point,path, path_name, host_id))) * 100 + + - record: tigergraph:endpoint_latency_avg + expr: max(tigergraph_endpoint_latency{statistic="average_latency"}) by (namespace,cluster_name, endpoint, exported_endpoint, service_name) + + - record: tigergraph:endpoint_latency_max + expr: max(tigergraph_endpoint_latency{statistic="max_latency"}) by (namespace,cluster_name, endpoint, exported_endpoint, service_name) + + - record: tigergraph:qps_total + expr: sum(tigergraph_qps) by (namespace,cluster_name, endpoint, exported_endpoint, service_name, host_id) + + - record: tigergraph:service_online_count + expr: count(tigergraph_service_status == 6) by (namespace,cluster_name, service_name, host_id) + + - record: tigergraph:service_offline_count + expr: count(tigergraph_service_status != 6) by (namespace,cluster_name, service_name, host_id) + + - record: tigergraph:license_days_remaining + expr: min(tigergraph_license_days_left) by (cluster_name, namespace) diff --git a/k8s/docs/10-samples/monitoring/tigergraph-alert-rules.yaml b/k8s/docs/10-samples/monitoring/tigergraph-alert-rules.yaml new file mode 100644 index 000000000..debefc240 --- /dev/null +++ b/k8s/docs/10-samples/monitoring/tigergraph-alert-rules.yaml @@ -0,0 +1,354 @@ +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: tigergraph-alert-rules + namespace: tigergraph + labels: + app: tigergraph + release: prometheus-stack +spec: + groups: + - name: tigergraph.cpu.alerts + rules: + - alert: TigerGraphHighCPUUsage + expr: max(tigergraph_cpu_usage{service_name=""}) by (cluster_name, namespace, service_name, host_id) > 80 + for: 5m + labels: + severity: warning + service: tigergraph + annotations: + summary: "High CPU Usage Detected" + description: "CPU usage on host {{ $labels.host_id }} of cluster {{ $labels.cluster_name }} in namespace {{ $labels.namespace }} is high ({{ $value }}%)" + + - alert: TigerGraphCriticalCPUUsage + expr: max(tigergraph_cpu_usage{service_name=""}) by (cluster_name, namespace, service_name, host_id) > 90 + for: 3m + labels: + severity: critical + service: tigergraph + annotations: + summary: "Critical CPU Usage Detected" + description: "CPU usage on host {{ $labels.host_id }} of cluster {{ $labels.cluster_name }} in namespace {{ $labels.namespace }} is critically high ({{ $value }}%)" + + - alert: TigerGraphServiceHighCPUUsage + expr: max(tigergraph_cpu_usage{service_name!=""}) by (cluster_name, namespace, service_name, host_id) > 70 + for: 5m + labels: + severity: warning + service: tigergraph + annotations: + summary: "High CPU Usage for {{ $labels.service_name }} Service" + description: "CPU usage for {{ $labels.service_name }} service on host {{ $labels.host_id }} of cluster {{ $labels.cluster_name }} in namespace {{ 
$labels.namespace }} is high ({{ $value }}%)" + + - name: tigergraph.memory.alerts + rules: + - alert: TigerGraphHighMemoryUsage + expr: (max(tigergraph_memory_usage{service_name=""}) by (cluster_name, namespace, service_name, host_id) / max(tigergraph_memory_total{service_name=""}) by (cluster_name, namespace, service_name, host_id)) * 100 > 80 + for: 5m + labels: + severity: warning + service: tigergraph + annotations: + summary: "High Memory Usage Detected" + description: "Memory usage on host {{ $labels.host_id }} of cluster {{ $labels.cluster_name }} in namespace {{ $labels.namespace }} is high ({{ $value | humanizePercentage }})" + + - alert: TigerGraphCriticalMemoryUsage + expr: (max(tigergraph_memory_usage{service_name=""}) by (cluster_name, namespace, service_name, host_id) / max(tigergraph_memory_total{service_name=""}) by (cluster_name, namespace, service_name, host_id)) * 100 > 90 + for: 3m + labels: + severity: critical + service: tigergraph + annotations: + summary: "Critical Memory Usage Detected" + description: "Memory usage on host {{ $labels.host_id }} of cluster {{ $labels.cluster_name }} in namespace {{ $labels.namespace }} is critically high ({{ $value | humanizePercentage }})" + + - alert: TigerGraphLowMemoryAvailable + expr: max(tigergraph_memory_available{}) by (cluster_name, namespace, host_id) < 1000 + for: 5m + labels: + severity: warning + service: tigergraph + annotations: + summary: "Low Memory Available" + description: "Available memory on host {{ $labels.host_id }} of cluster {{ $labels.cluster_name }} in namespace {{ $labels.namespace }} is low ({{ $value }} MB)" + + - name: tigergraph.disk.alerts + rules: + - alert: TigerGraphHighDiskUsage + expr: (max by(host_id, mount_point, path)(tigergraph_diskspace_usage) / (max by(host_id, mount_point, path)(tigergraph_diskspace_usage) + max by(host_id, mount_point, path) (tigergraph_diskspace_free))) * 100 > 80 + for: 5m + labels: + severity: warning + service: tigergraph + annotations: + summary: "High Disk Usage Detected" + description: "Disk usage for {{ $labels.path_name }}: {{ $labels.path }} on host {{ $labels.host_id }} of cluster {{ $labels.cluster_name }} in namespace {{ $labels.namespace }} is high ({{ $value | humanizePercentage }})" + + - alert: TigerGraphCriticalDiskUsage + expr: (max by(host_id, mount_point, path)(tigergraph_diskspace_usage) / (max by(host_id, mount_point, path)(tigergraph_diskspace_usage) + max by(host_id, mount_point, path) (tigergraph_diskspace_free))) * 100 > 90 + for: 3m + labels: + severity: critical + service: tigergraph + annotations: + summary: "Critical Disk Usage Detected" + description: "Disk usage for {{ $labels.path_name }}: {{ $labels.path }} on host {{ $labels.host_id }} of cluster {{ $labels.cluster_name }} in namespace {{ $labels.namespace }} is critically high ({{ $value | humanizePercentage }})" + + - alert: TigerGraphLowDiskSpace + expr: max(tigergraph_diskspace_free) by (cluster_name, namespace, path_name, host_id, path) < 1000 + for: 5m + labels: + severity: warning + service: tigergraph + annotations: + summary: "Low Disk Space" + description: "Free disk space for {{ $labels.path_name }}: {{ $labels.path }} on host {{ $labels.host_id }} of cluster {{ $labels.cluster_name }} in namespace {{ $labels.namespace }} is low ({{ $value }} MB)" + + - alert: TigerGraphLowDiskInodes + expr: max(tigergraph_disk_inode_free) by (cluster_name, namespace, host_id, path_name, path) < 100000 + for: 5m + labels: + severity: warning + service: tigergraph + annotations: + summary: 
"Low Disk Inodes" + description: "Free disk inodes for {{ $labels.path_name }}: {{ $labels.path }} on host {{ $labels.host_id }} of cluster {{ $labels.cluster_name }} in namespace {{ $labels.namespace }} is low ({{ $value }})" + + - name: tigergraph.network.alerts + rules: + - alert: TigerGraphHighNetworkConnections + expr: max(tigergraph_network_connections) by (cluster_name, namespace, host_id) > 2000 + for: 5m + labels: + severity: warning + service: tigergraph + annotations: + summary: "High Network Connections" + description: "Number of open TCP connections on host {{ $labels.host_id }} of cluster {{ $labels.cluster_name }} in namespace {{ $labels.namespace }} is high ({{ $value }})" + + - name: tigergraph.service.alerts + rules: + - alert: TigerGraphServiceDown + expr: max(tigergraph_service_status) by (cluster_name, namespace, service_name, host_id) == 27 + for: 1m + labels: + severity: critical + service: tigergraph + annotations: + summary: "TigerGraph Service Down" + description: "Service {{ $labels.service_name }} on host {{ $labels.host_id }} of cluster {{ $labels.cluster_name }} in namespace {{ $labels.namespace }} is down" + + - alert: TigerGraphServiceOffline + expr: max(tigergraph_service_status) by (cluster_name, namespace, service_name, host_id) == 24 + for: 1m + labels: + severity: critical + service: tigergraph + annotations: + summary: "TigerGraph Service Offline" + description: "Service {{ $labels.service_name }} on host {{ $labels.host_id }} of cluster {{ $labels.cluster_name }} in namespace {{ $labels.namespace }} is offline" + + - alert: TigerGraphServiceStopping + expr: max(tigergraph_service_status) by (cluster_name, namespace, service_name, host_id) == 21 + for: 1m + labels: + severity: critical + service: tigergraph + annotations: + summary: "TigerGraph Service Stopping" + description: "Service {{ $labels.service_name }} on host {{ $labels.host_id }} of cluster {{ $labels.cluster_name }} in namespace {{ $labels.namespace }} is stopping" + + - alert: TigerGraphServicePaused + expr: max(tigergraph_service_status) by (cluster_name, namespace, service_name, host_id) == 18 + for: 1m + labels: + severity: critical + service: tigergraph + annotations: + summary: "TigerGraph Service Paused" + description: "Service {{ $labels.service_name }} on host {{ $labels.host_id }} of cluster {{ $labels.cluster_name }} in namespace {{ $labels.namespace }} is paused" + + - alert: TigerGraphServiceStarting + expr: max(tigergraph_service_status) by (cluster_name, namespace, service_name, host_id) == 15 + for: 2m + labels: + severity: warning + service: tigergraph + annotations: + summary: "TigerGraph Service Starting" + description: "Service {{ $labels.service_name }} on host {{ $labels.host_id }} of cluster {{ $labels.cluster_name }} in namespace {{ $labels.namespace }} is starting" + + - alert: TigerGraphServiceReadonly + expr: max(tigergraph_service_status) by (cluster_name, namespace, service_name, host_id) == 12 + for: 1m + labels: + severity: warning + service: tigergraph + annotations: + summary: "TigerGraph Service Readonly" + description: "Service {{ $labels.service_name }} on host {{ $labels.host_id }} of cluster {{ $labels.cluster_name }} in namespace {{ $labels.namespace }} is in readonly mode" + + - alert: TigerGraphServiceWarmup + expr: max(tigergraph_service_status) by (cluster_name, namespace, service_name, host_id) == 9 + for: 2m + labels: + severity: warning + service: tigergraph + annotations: + summary: "TigerGraph Service Warmup" + description: "Service {{ 
$labels.service_name }} on host {{ $labels.host_id }} of cluster {{ $labels.cluster_name }} in namespace {{ $labels.namespace }} is in warmup state" + + - alert: TigerGraphServiceUnknown + expr: max(tigergraph_service_status) by (cluster_name, namespace, service_name, host_id) == 3 + for: 1m + labels: + severity: critical + service: tigergraph + annotations: + summary: "TigerGraph Service Unknown Status" + description: "Service {{ $labels.service_name }} on host {{ $labels.host_id }} of cluster {{ $labels.cluster_name }} in namespace {{ $labels.namespace }} has unknown status" + + - name: tigergraph.performance.alerts + rules: + - alert: TigerGraphHighEndpointLatency + expr: max(tigergraph_endpoint_latency{statistic="average_latency"}) by (namespace,cluster_name, endpoint, exported_endpoint, service_name)> 5000 + for: 5m + labels: + severity: warning + service: tigergraph + annotations: + summary: "High Endpoint Latency" + description: "Average latency for exported_endpoint {{ $labels.exported_endpoint }} of endpoint {{ $labels.endpoint }} on host {{ $labels.host_id }} of cluster {{ $labels.cluster_name }} in namespace {{ $labels.namespace }} is high ({{ $value }}ms)" + + - alert: TigerGraphCriticalEndpointLatency + expr: max(tigergraph_endpoint_latency{statistic="average_latency"}) by (namespace,cluster_name, endpoint, exported_endpoint, service_name) > 10000 + for: 3m + labels: + severity: critical + service: tigergraph + annotations: + summary: "Critical Endpoint Latency" + description: "Average latency for exported_endpoint {{ $labels.exported_endpoint }} of endpoint {{ $labels.endpoint }} on host {{ $labels.host_id }} is critically high ({{ $value }}ms)" + + - alert: TigerGraphHighQPS + expr: max(tigergraph_qps) by (namespace,cluster_name, endpoint, exported_endpoint, service_name, host_id) > 100 + for: 5m + labels: + severity: warning + service: tigergraph + annotations: + summary: "High QPS Detected" + description: "High QPS for exported_endpoint {{ $labels.exported_endpoint }} of endpoint {{ $labels.endpoint }} on host {{ $labels.host_id }} of cluster {{ $labels.cluster_name }} in namespace {{ $labels.namespace }} ({{ $value }})" + + - alert: TigerGraphEndpointTimeout + expr: max(tigergraph_endpoint_timeout) by (namespace,cluster_name, endpoint, exported_endpoint, service_name, host_id) > 0 + for: 1m + labels: + severity: warning + service: tigergraph + annotations: + summary: "Endpoint Timeout" + description: "Timeout occurred for exported_endpoint {{ $labels.exported_endpoint }} of endpoint {{ $labels.endpoint }} on host {{ $labels.host_id }} of cluster {{ $labels.cluster_name }} in namespace {{ $labels.namespace }} ({{ $value }} timeouts)" + + - name: tigergraph.system.alerts + rules: + - alert: TigerGraphLowCPUAvailable + expr: max(tigergraph_cpu_available) by (namespace,cluster_name, host_id) < 10 + for: 5m + labels: + severity: warning + service: tigergraph + annotations: + summary: "Low CPU Available" + description: "Available CPU on host {{ $labels.host_id }} of cluster {{ $labels.cluster_name }} in namespace {{ $labels.namespace }} is low ({{ $value }}%)" + + - alert: TigerGraphHighDiskIO + expr: max(tigergraph_disk_iops) by (namespace,cluster_name, path_name, host_id, path, mount_point) > 1000 + for: 5m + labels: + severity: warning + service: tigergraph + annotations: + summary: "High Disk I/O" + description: "High disk I/O for {{ $labels.path_name }} of mount point {{ $labels.mount_point }} on host {{ $labels.host_id }} of cluster {{ $labels.cluster_name }} in namespace 
{{ $labels.namespace }} ({{ $value }} IOPS)" + + - alert: TigerGraphHighDiskIOTime + expr: max(tigergraph_disk_io_time) by (namespace,cluster_name, path_name, host_id, path,mount_point) > 0.1 + for: 5m + labels: + severity: warning + service: tigergraph + annotations: + summary: "High Disk I/O Time" + description: "High disk I/O time for {{ $labels.path_name }} of mount point {{ $labels.mount_point }} on host {{ $labels.host_id }} of cluster {{ $labels.cluster_name }} in namespace {{ $labels.namespace }} ({{ $value }} hours)" + + - name: tigergraph.license.alerts + rules: + - alert: TigerGraphLicenseExpiringSoon + expr: min(tigergraph_license_days_left) by (cluster_name, namespace) <= 30 and min(tigergraph_license_days_left) by (cluster_name, namespace) > 7 + for: 1m + labels: + severity: warning + service: tigergraph + annotations: + summary: "TigerGraph License Expiring Soon" + description: "TigerGraph license for cluster {{ $labels.cluster_name }} in namespace {{ $labels.namespace }} will expire in {{ $value }} days. Please renew the license." + + - alert: TigerGraphLicenseExpiringCritical + expr: min(tigergraph_license_days_left) by (cluster_name, namespace) <= 7 and min(tigergraph_license_days_left) by (cluster_name, namespace) > 0 + for: 1m + labels: + severity: critical + service: tigergraph + annotations: + summary: "TigerGraph License Expiring Critical" + description: "TigerGraph license for cluster {{ $labels.cluster_name }} in namespace {{ $labels.namespace }} will expire in {{ $value }} days. URGENT: License renewal required immediately." + + - alert: TigerGraphLicenseExpired + expr: min(tigergraph_license_days_left) by (cluster_name, namespace) == 0 + for: 1m + labels: + severity: critical + service: tigergraph + annotations: + summary: "TigerGraph License Expired" + description: "TigerGraph license for cluster {{ $labels.cluster_name }} in namespace {{ $labels.namespace }} has expired. Service may be affected." + + - alert: TigerGraphLicenseInvalid + expr: min(tigergraph_license_days_left) by (cluster_name, namespace) == -1 + for: 1m + labels: + severity: critical + service: tigergraph + annotations: + summary: "TigerGraph License Invalid" + description: "TigerGraph license for cluster {{ $labels.cluster_name }} in namespace {{ $labels.namespace }} is invalid or corrupted. Please check license configuration." 
+
+  - name: tigergraph.recording.rules
+    rules:
+      - record: tigergraph:cpu_usage_percentage
+        expr: max(tigergraph_cpu_usage{service_name=""}) by (cluster_name, namespace, service_name, host_id)
+
+      - record: tigergraph:memory_usage_percentage
+        expr: ((max(tigergraph_memory_usage{service_name=""}) by (cluster_name, namespace, service_name, host_id)) / max(tigergraph_memory_total{service_name=""}) by (cluster_name, namespace, service_name, host_id)) * 100
+
+      - record: tigergraph:disk_usage_percentage
+        expr: (max(tigergraph_diskspace_usage) by(namespace,cluster_name,mount_point,path, path_name, host_id) / (max(tigergraph_diskspace_usage) by(namespace,cluster_name,mount_point,path, path_name, host_id) + max(tigergraph_diskspace_free) by(namespace,cluster_name,mount_point,path, path_name, host_id))) * 100
+
+      - record: tigergraph:endpoint_latency_avg
+        expr: max(tigergraph_endpoint_latency{statistic="average_latency"}) by (namespace,cluster_name, endpoint, exported_endpoint, service_name)
+
+      - record: tigergraph:endpoint_latency_max
+        expr: max(tigergraph_endpoint_latency{statistic="max_latency"}) by (namespace,cluster_name, endpoint, exported_endpoint, service_name)
+
+      - record: tigergraph:qps_total
+        expr: sum(tigergraph_qps) by (namespace,cluster_name, endpoint, exported_endpoint, service_name, host_id)
+
+      - record: tigergraph:service_online_count
+        expr: count(tigergraph_service_status == 6) by (namespace,cluster_name, service_name, host_id)
+
+      - record: tigergraph:service_offline_count
+        expr: count(tigergraph_service_status != 6) by (namespace,cluster_name, service_name, host_id)
+
+      - record: tigergraph:license_days_remaining
+        expr: min(tigergraph_license_days_left) by (cluster_name, namespace)
diff --git a/k8s/docs/10-samples/monitoring/tigergraph-certificate-with-certmanager.yaml b/k8s/docs/10-samples/monitoring/tigergraph-certificate-with-certmanager.yaml
new file mode 100644
index 000000000..d7dcfd04f
--- /dev/null
+++ b/k8s/docs/10-samples/monitoring/tigergraph-certificate-with-certmanager.yaml
@@ -0,0 +1,51 @@
+# Certificate for TigerGraph metrics endpoint TLS
+apiVersion: cert-manager.io/v1
+kind: Certificate
+metadata:
+  name: tigergraph-metrics-server-cert
+spec:
+  # Secret names are often required, so we will create one
+  secretName: tigergraph-metrics-server-cert
+  duration: 2160h # 90 days
+  renewBefore: 360h # 15 days
+  subject:
+    organizations:
+      - tigergraph
+  commonName: tigergraph-metrics-server
+  # The dnsNames field specifies the DNS names for which the certificate should be valid.
+  # Modify the dnsNames field to match the DNS names of your TigerGraph cluster.
+  # For the tigergraph metrics endpoint, the dnsNames should include:
+  # 1. The internal service name of the TigerGraph cluster. $CLUSTER_NAME-internal-service.$NAMESPACE.svc
+  dnsNames:
+    - test-cluster-internal-service
+    - test-cluster-internal-service.tigergraph
+    - test-cluster-internal-service.tigergraph.svc
+    - test-cluster-internal-service.tigergraph.svc.cluster.local
+    - test-cluster1-internal-service
+    - test-cluster1-internal-service.tigergraph
+    - test-cluster1-internal-service.tigergraph.svc
+    - test-cluster1-internal-service.tigergraph.svc.cluster.local
+  # The use of the common name field has been deprecated since 2000 and has
+  # been explicitly disallowed by CABF since 2014. It should not be used unless
+  # you have a specific need to support a legacy client which requires it.
+ # commonName: example.com + # The issuerRef field specifies which cert-manager Issuer should be used to + # create the Certificate. If the Issuer is namespace-scoped, this field must + # only reference a NamespaceScoped Issuer. If the Issuer is cluster-scoped, + # this field can reference a ClusterIssuer. + issuerRef: + # The name of the Issuer resource + name: selfsigned-issuer + # We can reference ClusterIssuers by changing the kind here. + # The default value is Issuer (i.e. a locally namespaced Issuer) + kind: Issuer + # This is optional since cert-manager will default to this value however + # if you are using an external issuer, change this to that issuer group. + group: cert-manager.io +--- +apiVersion: cert-manager.io/v1 +kind: Issuer +metadata: + name: selfsigned-issuer +spec: + selfSigned: {} diff --git a/k8s/docs/10-samples/monitoring/tigergraph-monitor-grafana-configmap.yaml b/k8s/docs/10-samples/monitoring/tigergraph-monitor-grafana-configmap.yaml index c65e62552..782306958 100644 --- a/k8s/docs/10-samples/monitoring/tigergraph-monitor-grafana-configmap.yaml +++ b/k8s/docs/10-samples/monitoring/tigergraph-monitor-grafana-configmap.yaml @@ -1,10 +1,4 @@ apiVersion: v1 -kind: ConfigMap -metadata: - labels: - grafana_dashboard: "1" - name: tigergraph-monitor-grafana-configmap - namespace: tigergraph data: tigergraph-dashboard.json: |- { @@ -214,7 +208,7 @@ data: "uid": "prometheus" }, "editorMode": "code", - "expr": "clamp_max(avg(count_over_time((sum by (pod) (tigergraph_service_status{cluster_name=\"$ClusterName\",namespace=\"$Namespace\"}))[600s:])/on(pod) (count_over_time(kube_pod_container_info{pod=~\"${ClusterName}-.*\",container=\"tigergraph\"}[600s:])) * 100),100)", + "expr": "clamp_max(avg(count_over_time((sum by (pod) (tigergraph_service_status{cluster_name=\"$ClusterName\",namespace=\"$Namespace\"}))[600s:])/on(pod) (count_over_time((sum by (pod) (kube_pod_container_info{pod=~\"${ClusterName}-[0-9]+\",container=\"tigergraph\"})[600s:]))) * 100),100)", "legendFormat": "__auto", "range": true, "refId": "A" @@ -412,13 +406,83 @@ data: "title": "Endpoint Latency", "type": "timeseries" }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "orange", + "value": 30 + }, + { + "color": "green", + "value": 60 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 18 + }, + "id": 41, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto" + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "min by(cluster_name) (tigergraph_license_days_left{cluster_name=\"$ClusterName\",namespace=\"$Namespace\"})", + "legendFormat": "License Days Remaining (days)", + "instant": true, + "refId": "A" + } + ], + "title": "License Days Remaining", + "type": "gauge" + }, { "collapsed": false, "gridPos": { "h": 1, "w": 24, "x": 0, - "y": 26 + "y": 34 }, "id": 7, "panels": [], @@ -491,7 +555,7 @@ data: "h": 8, "w": 24, "x": 0, - "y": 27 + "y": 35 }, "id": 1, "options": { @@ -588,7 +652,7 @@ data: "h": 8, "w": 24, "x": 0, - "y": 35 + "y": 59 }, "id": 
8, "options": { @@ -612,13 +676,112 @@ data: "uid": "prometheus" }, "editorMode": "code", - "expr": "sum(tigergraph_cpu_usage{service_name=~\"$ServiceName\",cluster_name=\"$ClusterName\",namespace=\"$Namespace\", pod=~\"$pod\"}) by (host_id, service_name)", + "expr": "max(tigergraph_cpu_usage{service_name=~\"$ServiceName\",cluster_name=\"$ClusterName\",namespace=\"$Namespace\", pod=~\"$pod\"}) by (host_id, service_name)", "legendFormat": "__auto", "range": true, "refId": "A" } ], - "title": "CPU Usage", + "title": "CPU Usage Percentage of Service", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "yellow", + "value": 70 + }, + { + "color": "red", + "value": 90 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 59 + }, + "id": 35, + "options": { + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "max(tigergraph_cpu_usage{service_name=\"\",cluster_name=\"$ClusterName\",namespace=\"$Namespace\"}) by (host_id)", + "legendFormat": "{{host_id}}", + "range": true, + "refId": "A" + } + ], + "title": "CPU Usage Percentage of Container", "type": "timeseries" }, { @@ -704,7 +867,7 @@ data: "h": 7, "w": 24, "x": 0, - "y": 43 + "y": 59 }, "id": 32, "options": { @@ -804,7 +967,7 @@ data: "h": 8, "w": 24, "x": 0, - "y": 50 + "y": 98 }, "id": 33, "options": { @@ -828,13 +991,13 @@ data: "uid": "prometheus" }, "editorMode": "code", - "expr": "sum(tigergraph_memory_usage{service_name=~\"$ServiceName\",cluster_name=\"$ClusterName\",namespace=\"$Namespace\", pod=~\"$pod\"}) by (host_id, service_name)", + "expr": "max(tigergraph_memory_usage{service_name=~\"$ServiceName\",cluster_name=\"$ClusterName\",namespace=\"$Namespace\", pod=~\"$pod\"}) by (host_id, service_name)", "legendFormat": "__auto", "range": true, "refId": "A" } ], - "title": "Memory Usage", + "title": "Memory Usage of Service", "type": "timeseries" }, { @@ -891,7 +1054,7 @@ data: } ] }, - "unit": "decmbytes" + "unit": "mbytes" }, "overrides": [] }, @@ -899,9 +1062,9 @@ data: "h": 8, "w": 24, "x": 0, - "y": 58 + "y": 106 }, - "id": 34, + "id": 36, "options": { "legend": { "calcs": [ @@ -923,13 +1086,13 @@ data: "uid": "prometheus" }, "editorMode": "code", - "expr": "sum(tigergraph_diskspace_usage{pod=~\"$pod\",cluster_name=\"$ClusterName\",namespace=\"$Namespace\"}) by (host_id, path)", + "expr": "max(tigergraph_memory_usage{service_name=\"\",cluster_name=\"$ClusterName\",namespace=\"$Namespace\", pod=~\"$pod\"}) by (host_id, 
service_name)", "legendFormat": "__auto", "range": true, "refId": "A" } ], - "title": "Diskspace Usage", + "title": "Memory Usage of Container", "type": "timeseries" }, { @@ -949,20 +1112,20 @@ data: "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", - "fillOpacity": 25, - "gradientMode": "opacity", + "fillOpacity": 10, + "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, - "lineInterpolation": "smooth", - "lineWidth": 2, + "lineInterpolation": "linear", + "lineWidth": 1, "pointSize": 5, "scaleDistribution": { "type": "linear" }, - "showPoints": "never", + "showPoints": "auto", "spanNulls": false, "stacking": { "group": "A", @@ -986,7 +1149,7 @@ data: } ] }, - "unit": "bytes" + "unit": "decmbytes" }, "overrides": [] }, @@ -994,19 +1157,21 @@ data: "h": 8, "w": 24, "x": 0, - "y": 66 + "y": 114 }, - "id": 9, + "id": 34, "options": { "legend": { - "calcs": [], - "displayMode": "list", + "calcs": [ + "lastNotNull" + ], + "displayMode": "table", "placement": "bottom", "showLegend": true }, "tooltip": { - "mode": "multi", - "sort": "desc" + "mode": "single", + "sort": "none" } }, "targets": [ @@ -1016,17 +1181,20 @@ data: "uid": "prometheus" }, "editorMode": "code", - "expr": "sum(rate(container_network_receive_bytes_total{pod=~\"$pod\"}[$__rate_interval])) by (pod)", + "expr": "max(tigergraph_diskspace_usage{pod=~\"$pod\",cluster_name=\"$ClusterName\",namespace=\"$Namespace\"}) by (host_id, mount_point, path)", "legendFormat": "__auto", "range": true, "refId": "A" } ], - "title": "Network - Bandwidth", + "title": "Diskspace Usage", "type": "timeseries" }, { - "datasource": {}, + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, "fieldConfig": { "defaults": { "color": { @@ -1039,20 +1207,20 @@ data: "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", - "fillOpacity": 25, - "gradientMode": "opacity", + "fillOpacity": 10, + "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, - "lineInterpolation": "smooth", - "lineWidth": 2, + "lineInterpolation": "linear", + "lineWidth": 1, "pointSize": 5, "scaleDistribution": { "type": "linear" }, - "showPoints": "never", + "showPoints": "auto", "spanNulls": false, "stacking": { "group": "A", @@ -1076,59 +1244,45 @@ data: } ] }, - "unit": "pps" + "unit": "decmbytes" }, "overrides": [] }, "gridPos": { "h": 8, - "w": 12, + "w": 24, "x": 0, - "y": 74 + "y": 122 }, - "id": 28, + "id": 37, "options": { "legend": { - "calcs": [], - "displayMode": "list", + "calcs": [ + "lastNotNull" + ], + "displayMode": "table", "placement": "bottom", "showLegend": true }, "tooltip": { - "mode": "multi", + "mode": "single", "sort": "none" } }, - "pluginVersion": "8.3.3", "targets": [ { "datasource": { "type": "prometheus", - "uid": "${datasource}" + "uid": "prometheus" }, "editorMode": "code", - "exemplar": true, - "expr": "sum(rate(container_network_receive_packets_total{pod=~\"$pod\"}[$__rate_interval])) by (pod)", - "interval": "30s", - "legendFormat": "{{pod}} Received", + "expr": "max(tigergraph_diskspace_free{pod=~\"$pod\",cluster_name=\"$ClusterName\",namespace=\"$Namespace\"}) by (host_id, mount_point, path)", + "legendFormat": "__auto", "range": true, "refId": "A" - }, - { - "datasource": { - "type": "prometheus", - "uid": "${datasource}" - }, - "editorMode": "code", - "exemplar": true, - "expr": "- sum(rate(container_network_transmit_packets_total{pod=~\"$pod\"}[$__rate_interval])) by (pod)", - "interval": "30s", - "legendFormat": "{{pod}} Transmitted", - 
"range": true, - "refId": "B" } ], - "title": "Network - Packets Rate", + "title": "Diskspace Free", "type": "timeseries" }, { @@ -1148,20 +1302,20 @@ data: "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", - "fillOpacity": 25, - "gradientMode": "opacity", + "fillOpacity": 10, + "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, - "lineInterpolation": "smooth", - "lineWidth": 2, + "lineInterpolation": "linear", + "lineWidth": 1, "pointSize": 5, "scaleDistribution": { "type": "linear" }, - "showPoints": "never", + "showPoints": "auto", "spanNulls": false, "stacking": { "group": "A", @@ -1179,27 +1333,492 @@ data: "color": "green", "value": null }, + { + "color": "yellow", + "value": 70 + }, { "color": "red", - "value": 80 + "value": 90 } ] }, - "unit": "pps" + "unit": "bytes" }, "overrides": [] }, "gridPos": { "h": 8, "w": 12, - "x": 12, - "y": 74 + "x": 0, + "y": 130 }, - "id": 30, + "id": 38, "options": { "legend": { - "calcs": [], - "displayMode": "list", + "calcs": [ + "lastNotNull" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "max by (persistentvolumeclaim) (\n kubelet_volume_stats_capacity_bytes{cluster=\"\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$Namespace\", persistentvolumeclaim=~\"tg-data-$ClusterName-.*\"}\n)\n-\nmax by (persistentvolumeclaim) (\n kubelet_volume_stats_available_bytes{cluster=\"\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$Namespace\", persistentvolumeclaim=~\"tg-data-$ClusterName-.*\"}\n)", + "legendFormat": "{{persistentvolumeclaim}}", + "range": true, + "refId": "A" + } + ], + "title": "Data Volume Used Space", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 130 + }, + "id": 39, + "options": { + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "max by (persistentvolumeclaim) (\n kubelet_volume_stats_available_bytes{cluster=\"\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$Namespace\", persistentvolumeclaim=~\"tg-data-$ClusterName-.*\"}\n)", + "legendFormat": "{{persistentvolumeclaim}}", + "range": true, + "refId": "A" + } + ], + "title": "Data Volume Free Space", + 
"type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "orange", + "value": 80 + }, + { + "color": "red", + "value": 90 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 138 + }, + "id": 40, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto" + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "(\n max by (persistentvolumeclaim) (\n kubelet_volume_stats_capacity_bytes{cluster=\"\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$Namespace\", persistentvolumeclaim=~\"tg-data-$ClusterName-.*\"}\n )\n -\n max by (persistentvolumeclaim) (\n kubelet_volume_stats_available_bytes{cluster=\"\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$Namespace\", persistentvolumeclaim=~\"tg-data-$ClusterName-.*\"}\n )\n)\n/\nmax by (persistentvolumeclaim) (\n kubelet_volume_stats_capacity_bytes{cluster=\"\", job=\"kubelet\", metrics_path=\"/metrics\", namespace=\"$Namespace\", persistentvolumeclaim=~\"tg-data-$ClusterName-.*\"}\n)\n* 100", + "legendFormat": "{{persistentvolumeclaim}}", + "instant": true, + "refId": "A" + } + ], + "title": "Data Volume Usage Percentage", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 146 + }, + "id": 9, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(rate(container_network_receive_bytes_total{pod=~\"$pod\"}[$__rate_interval])) by (pod)", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Network - Bandwidth", + "type": "timeseries" + }, + { + "datasource": {}, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 25, + 
"gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "pps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 154 + }, + "id": 28, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "8.3.3", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(rate(container_network_receive_packets_total{pod=~\"$pod\"}[$__rate_interval])) by (pod)", + "interval": "30s", + "legendFormat": "{{pod}} Received", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "- sum(rate(container_network_transmit_packets_total{pod=~\"$pod\"}[$__rate_interval])) by (pod)", + "interval": "30s", + "legendFormat": "{{pod}} Transmitted", + "range": true, + "refId": "B" + } + ], + "title": "Network - Packets Rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "pps" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 154 + }, + "id": 30, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", "placement": "bottom", "showLegend": true }, @@ -1371,5 +1990,9 @@ data: "version": 1, "weekStart": "" } - - +kind: ConfigMap +metadata: + labels: + grafana_dashboard: "1" + name: tigergraph-monitor-grafana-configmap + namespace: tigergraph diff --git a/k8s/docs/10-samples/monitoring/tigergraph-monitor-service-monitor-with-ssl.yaml b/k8s/docs/10-samples/monitoring/tigergraph-monitor-service-monitor-with-ssl.yaml new file mode 100644 index 000000000..a49ea4708 --- /dev/null +++ b/k8s/docs/10-samples/monitoring/tigergraph-monitor-service-monitor-with-ssl.yaml @@ -0,0 +1,35 @@ +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + labels: + release: prometheus-stack + name: tigergraph-monitor-service-monitor + namespace: tigergraph +spec: + endpoints: + - path: /informant/metrics + port: nginx + scheme: https + tlsConfig: + ca: + secret: + key: ca.crt + name: 
tigergraph-metrics-server-cert + cert: + secret: + key: tls.crt + name: tigergraph-metrics-server-cert + insecureSkipVerify: false + keySecret: + key: tls.key + name: tigergraph-metrics-server-cert + serverName: test-cluster-internal-service.tigergraph.svc + namespaceSelector: + matchNames: + - tigergraph + selector: + matchExpressions: + - key: tigergraph.com/monitoring + operator: In + values: + - test-cluster diff --git a/k8s/docs/10-samples/monitoring/tigergraph-monitor-with-ssl.yaml b/k8s/docs/10-samples/monitoring/tigergraph-monitor-with-ssl.yaml new file mode 100644 index 000000000..cf8dce1ac --- /dev/null +++ b/k8s/docs/10-samples/monitoring/tigergraph-monitor-with-ssl.yaml @@ -0,0 +1,412 @@ +apiVersion: graphdb.tigergraph.com/v1alpha1 +kind: TigerGraphMonitor +metadata: + labels: + tigergraph.com/namespace: tigergraph + name: tigergraph-monitor + namespace: tigergraph +spec: + tlsConfig: + ca: + secret: + name: tigergraph-metrics-server-cert + key: ca.crt + cert: + secret: + name: tigergraph-metrics-server-cert + key: tls.crt + keySecret: + name: tigergraph-metrics-server-cert + key: tls.key + insecureSkipVerify: false + serverName: "tigergraph-cluster-0.tigergraph-cluster-headless.default.svc" + ruleSelectorLabels: + release: prometheus-stack + serviceMonitorLabels: + release: prometheus-stack + alertmanagerConfigLabels: + # remove this if you don't config alertmanagerConfigSelector for Alertmanager + release: prometheus-stack + alertmanagerConfig: + receivers: + - name: slack-receiver + slackConfigs: + - apiURL: + key: webhook-url + name: slack-webhook-url + channel: '#operator-monitoring-test' + color: '{{ if eq .Status "firing" }}danger{{ else }}good{{ end }}' + sendResolved: true + text: |- + {{ range .Alerts }} + *Alert:* {{ .Annotations.summary }} - `{{ .Labels.severity }}` + *Description:* {{ .Annotations.description }} + *Details:* + {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}` + {{ end }} + {{ end }} + route: + groupBy: + - job + - alertname + groupInterval: 5m + groupWait: 30s + receiver: slack-receiver + repeatInterval: 1m + routes: + - continue: true + receiver: slack-receiver + prometheusRule: + groups: + - name: tigergraph.cpu.alerts + rules: + - alert: TigerGraphHighCPUUsage + expr: max(tigergraph_cpu_usage{service_name=""}) by (cluster_name, namespace, service_name, host_id) > 80 + for: 5m + labels: + severity: warning + service: tigergraph + annotations: + summary: "High CPU Usage Detected" + description: "CPU usage on host {{ $labels.host_id }} of cluster {{ $labels.cluster_name }} in namespace {{ $labels.namespace }} is high ({{ $value }}%)" + + - alert: TigerGraphCriticalCPUUsage + expr: max(tigergraph_cpu_usage{service_name=""}) by (cluster_name, namespace, service_name, host_id) > 90 + for: 3m + labels: + severity: critical + service: tigergraph + annotations: + summary: "Critical CPU Usage Detected" + description: "CPU usage on host {{ $labels.host_id }} of cluster {{ $labels.cluster_name }} in namespace {{ $labels.namespace }} is critically high ({{ $value }}%)" + + - alert: TigerGraphServiceHighCPUUsage + expr: max(tigergraph_cpu_usage{service_name!=""}) by (cluster_name, namespace, service_name, host_id) > 70 + for: 5m + labels: + severity: warning + service: tigergraph + annotations: + summary: "High CPU Usage for {{ $labels.service_name }} Service" + description: "CPU usage for {{ $labels.service_name }} service on host {{ $labels.host_id }} of cluster {{ $labels.cluster_name }} in namespace {{ $labels.namespace }} is high ({{ $value 
}}%)" + + - name: tigergraph.memory.alerts + rules: + - alert: TigerGraphHighMemoryUsage + expr: (max(tigergraph_memory_usage{service_name=""}) by (cluster_name, namespace, service_name, host_id) / max(tigergraph_memory_total{service_name=""}) by (cluster_name, namespace, service_name, host_id)) * 100 > 80 + for: 5m + labels: + severity: warning + service: tigergraph + annotations: + summary: "High Memory Usage Detected" + description: "Memory usage on host {{ $labels.host_id }} of cluster {{ $labels.cluster_name }} in namespace {{ $labels.namespace }} is high ({{ $value | humanizePercentage }})" + + - alert: TigerGraphCriticalMemoryUsage + expr: (max(tigergraph_memory_usage{service_name=""}) by (cluster_name, namespace, service_name, host_id) / max(tigergraph_memory_total{service_name=""}) by (cluster_name, namespace, service_name, host_id)) * 100 > 90 + for: 3m + labels: + severity: critical + service: tigergraph + annotations: + summary: "Critical Memory Usage Detected" + description: "Memory usage on host {{ $labels.host_id }} of cluster {{ $labels.cluster_name }} in namespace {{ $labels.namespace }} is critically high ({{ $value | humanizePercentage }})" + + - alert: TigerGraphLowMemoryAvailable + expr: max(tigergraph_memory_available{}) by (cluster_name, namespace, host_id) < 1000 + for: 5m + labels: + severity: warning + service: tigergraph + annotations: + summary: "Low Memory Available" + description: "Available memory on host {{ $labels.host_id }} of cluster {{ $labels.cluster_name }} in namespace {{ $labels.namespace }} is low ({{ $value }} MB)" + + - name: tigergraph.disk.alerts + rules: + - alert: TigerGraphHighDiskUsage + expr: (max by(host_id, mount_point, path)(tigergraph_diskspace_usage) / (max by(host_id, mount_point, path)(tigergraph_diskspace_usage) + max by(host_id, mount_point, path) (tigergraph_diskspace_free))) * 100 > 80 + for: 5m + labels: + severity: warning + service: tigergraph + annotations: + summary: "High Disk Usage Detected" + description: "Disk usage for {{ $labels.path_name }}: {{ $labels.path }} on host {{ $labels.host_id }} of cluster {{ $labels.cluster_name }} in namespace {{ $labels.namespace }} is high ({{ $value | humanizePercentage }})" + + - alert: TigerGraphCriticalDiskUsage + expr: (max by(host_id, mount_point, path)(tigergraph_diskspace_usage) / (max by(host_id, mount_point, path)(tigergraph_diskspace_usage) + max by(host_id, mount_point, path) (tigergraph_diskspace_free))) * 100 > 90 + for: 3m + labels: + severity: critical + service: tigergraph + annotations: + summary: "Critical Disk Usage Detected" + description: "Disk usage for {{ $labels.path_name }}: {{ $labels.path }} on host {{ $labels.host_id }} of cluster {{ $labels.cluster_name }} in namespace {{ $labels.namespace }} is critically high ({{ $value | humanizePercentage }})" + + - alert: TigerGraphLowDiskSpace + expr: max(tigergraph_diskspace_free) by (cluster_name, namespace, path_name, host_id, path) < 1000 + for: 5m + labels: + severity: warning + service: tigergraph + annotations: + summary: "Low Disk Space" + description: "Free disk space for {{ $labels.path_name }}: {{ $labels.path }} on host {{ $labels.host_id }} of cluster {{ $labels.cluster_name }} in namespace {{ $labels.namespace }} is low ({{ $value }} MB)" + + - alert: TigerGraphLowDiskInodes + expr: max(tigergraph_disk_inode_free) by (cluster_name, namespace, host_id, path_name, path) < 100000 + for: 5m + labels: + severity: warning + service: tigergraph + annotations: + summary: "Low Disk Inodes" + description: "Free 
disk inodes for {{ $labels.path_name }}: {{ $labels.path }} on host {{ $labels.host_id }} of cluster {{ $labels.cluster_name }} in namespace {{ $labels.namespace }} is low ({{ $value }})" + + - name: tigergraph.network.alerts + rules: + - alert: TigerGraphHighNetworkConnections + expr: max(tigergraph_network_connections) by (cluster_name, namespace, host_id) > 2000 + for: 5m + labels: + severity: warning + service: tigergraph + annotations: + summary: "High Network Connections" + description: "Number of open TCP connections on host {{ $labels.host_id }} of cluster {{ $labels.cluster_name }} in namespace {{ $labels.namespace }} is high ({{ $value }})" + + - name: tigergraph.service.alerts + rules: + - alert: TigerGraphServiceDown + expr: max(tigergraph_service_status) by (cluster_name, namespace, service_name, host_id) == 27 + for: 1m + labels: + severity: critical + service: tigergraph + annotations: + summary: "TigerGraph Service Down" + description: "Service {{ $labels.service_name }} on host {{ $labels.host_id }} of cluster {{ $labels.cluster_name }} in namespace {{ $labels.namespace }} is down" + + - alert: TigerGraphServiceOffline + expr: max(tigergraph_service_status) by (cluster_name, namespace, service_name, host_id) == 24 + for: 1m + labels: + severity: critical + service: tigergraph + annotations: + summary: "TigerGraph Service Offline" + description: "Service {{ $labels.service_name }} on host {{ $labels.host_id }} of cluster {{ $labels.cluster_name }} in namespace {{ $labels.namespace }} is offline" + + - alert: TigerGraphServiceStopping + expr: max(tigergraph_service_status) by (cluster_name, namespace, service_name, host_id) == 21 + for: 1m + labels: + severity: critical + service: tigergraph + annotations: + summary: "TigerGraph Service Stopping" + description: "Service {{ $labels.service_name }} on host {{ $labels.host_id }} of cluster {{ $labels.cluster_name }} in namespace {{ $labels.namespace }} is stopping" + + - alert: TigerGraphServicePaused + expr: max(tigergraph_service_status) by (cluster_name, namespace, service_name, host_id) == 18 + for: 1m + labels: + severity: critical + service: tigergraph + annotations: + summary: "TigerGraph Service Paused" + description: "Service {{ $labels.service_name }} on host {{ $labels.host_id }} of cluster {{ $labels.cluster_name }} in namespace {{ $labels.namespace }} is paused" + + - alert: TigerGraphServiceStarting + expr: max(tigergraph_service_status) by (cluster_name, namespace, service_name, host_id) == 15 + for: 2m + labels: + severity: warning + service: tigergraph + annotations: + summary: "TigerGraph Service Starting" + description: "Service {{ $labels.service_name }} on host {{ $labels.host_id }} of cluster {{ $labels.cluster_name }} in namespace {{ $labels.namespace }} is starting" + + - alert: TigerGraphServiceReadonly + expr: max(tigergraph_service_status) by (cluster_name, namespace, service_name, host_id) == 12 + for: 1m + labels: + severity: warning + service: tigergraph + annotations: + summary: "TigerGraph Service Readonly" + description: "Service {{ $labels.service_name }} on host {{ $labels.host_id }} of cluster {{ $labels.cluster_name }} in namespace {{ $labels.namespace }} is in readonly mode" + + - alert: TigerGraphServiceWarmup + expr: max(tigergraph_service_status) by (cluster_name, namespace, service_name, host_id) == 9 + for: 2m + labels: + severity: warning + service: tigergraph + annotations: + summary: "TigerGraph Service Warmup" + description: "Service {{ $labels.service_name }} on host {{ 
$labels.host_id }} of cluster {{ $labels.cluster_name }} in namespace {{ $labels.namespace }} is in warmup state" + + - alert: TigerGraphServiceUnknown + expr: max(tigergraph_service_status) by (cluster_name, namespace, service_name, host_id) == 3 + for: 1m + labels: + severity: critical + service: tigergraph + annotations: + summary: "TigerGraph Service Unknown Status" + description: "Service {{ $labels.service_name }} on host {{ $labels.host_id }} of cluster {{ $labels.cluster_name }} in namespace {{ $labels.namespace }} has unknown status" + + - name: tigergraph.performance.alerts + rules: + - alert: TigerGraphHighEndpointLatency + expr: max(tigergraph_endpoint_latency{statistic="average_latency"}) by (namespace,cluster_name, endpoint, exported_endpoint, service_name)> 5000 + for: 5m + labels: + severity: warning + service: tigergraph + annotations: + summary: "High Endpoint Latency" + description: "Average latency for exported_endpoint {{ $labels.exported_endpoint }} of endpoint {{ $labels.endpoint }} on host {{ $labels.host_id }} of cluster {{ $labels.cluster_name }} in namespace {{ $labels.namespace }} is high ({{ $value }}ms)" + + - alert: TigerGraphCriticalEndpointLatency + expr: max(tigergraph_endpoint_latency{statistic="average_latency"}) by (namespace,cluster_name, endpoint, exported_endpoint, service_name) > 10000 + for: 3m + labels: + severity: critical + service: tigergraph + annotations: + summary: "Critical Endpoint Latency" + description: "Average latency for exported_endpoint {{ $labels.exported_endpoint }} of endpoint {{ $labels.endpoint }} on host {{ $labels.host_id }} is critically high ({{ $value }}ms)" + + - alert: TigerGraphHighQPS + expr: max(tigergraph_qps) by (namespace,cluster_name, endpoint, exported_endpoint, service_name, host_id) > 100 + for: 5m + labels: + severity: warning + service: tigergraph + annotations: + summary: "High QPS Detected" + description: "High QPS for exported_endpoint {{ $labels.exported_endpoint }} of endpoint {{ $labels.endpoint }} on host {{ $labels.host_id }} of cluster {{ $labels.cluster_name }} in namespace {{ $labels.namespace }} ({{ $value }})" + + - alert: TigerGraphEndpointTimeout + expr: max(tigergraph_endpoint_timeout) by (namespace,cluster_name, endpoint, exported_endpoint, service_name, host_id) > 0 + for: 1m + labels: + severity: warning + service: tigergraph + annotations: + summary: "Endpoint Timeout" + description: "Timeout occurred for exported_endpoint {{ $labels.exported_endpoint }} of endpoint {{ $labels.endpoint }} on host {{ $labels.host_id }} of cluster {{ $labels.cluster_name }} in namespace {{ $labels.namespace }} ({{ $value }} timeouts)" + + - name: tigergraph.system.alerts + rules: + - alert: TigerGraphLowCPUAvailable + expr: max(tigergraph_cpu_available) by (namespace,cluster_name, host_id) < 10 + for: 5m + labels: + severity: warning + service: tigergraph + annotations: + summary: "Low CPU Available" + description: "Available CPU on host {{ $labels.host_id }} of cluster {{ $labels.cluster_name }} in namespace {{ $labels.namespace }} is low ({{ $value }}%)" + + - alert: TigerGraphHighDiskIO + expr: max(tigergraph_disk_iops) by (namespace,cluster_name, path_name, host_id, path, mount_point) > 1000 + for: 5m + labels: + severity: warning + service: tigergraph + annotations: + summary: "High Disk I/O" + description: "High disk I/O for {{ $labels.path_name }} of mount point {{ $labels.mount_point }} on host {{ $labels.host_id }} of cluster {{ $labels.cluster_name }} in namespace {{ $labels.namespace }} ({{ $value 
}} IOPS)" + + - alert: TigerGraphHighDiskIOTime + expr: max(tigergraph_disk_io_time) by (namespace,cluster_name, path_name, host_id, path,mount_point) > 0.1 + for: 5m + labels: + severity: warning + service: tigergraph + annotations: + summary: "High Disk I/O Time" + description: "High disk I/O time for {{ $labels.path_name }} of mount point {{ $labels.mount_point }} on host {{ $labels.host_id }} of cluster {{ $labels.cluster_name }} in namespace {{ $labels.namespace }} ({{ $value }} hours)" + + - name: tigergraph.license.alerts + rules: + - alert: TigerGraphLicenseExpiringSoon + expr: min(tigergraph_license_days_left) by (cluster_name, namespace) <= 30 and min(tigergraph_license_days_left) by (cluster_name, namespace) > 7 + for: 1m + labels: + severity: warning + service: tigergraph + annotations: + summary: "TigerGraph License Expiring Soon" + description: "TigerGraph license for cluster {{ $labels.cluster_name }} in namespace {{ $labels.namespace }} will expire in {{ $value }} days. Please renew the license." + + - alert: TigerGraphLicenseExpiringCritical + expr: min(tigergraph_license_days_left) by (cluster_name, namespace) <= 7 and min(tigergraph_license_days_left) by (cluster_name, namespace) > 0 + for: 1m + labels: + severity: critical + service: tigergraph + annotations: + summary: "TigerGraph License Expiring Critical" + description: "TigerGraph license for cluster {{ $labels.cluster_name }} in namespace {{ $labels.namespace }} will expire in {{ $value }} days. URGENT: License renewal required immediately." + + - alert: TigerGraphLicenseExpired + expr: min(tigergraph_license_days_left) by (cluster_name, namespace) == 0 + for: 1m + labels: + severity: critical + service: tigergraph + annotations: + summary: "TigerGraph License Expired" + description: "TigerGraph license for cluster {{ $labels.cluster_name }} in namespace {{ $labels.namespace }} has expired. Service may be affected." + + - alert: TigerGraphLicenseInvalid + expr: min(tigergraph_license_days_left) by (cluster_name, namespace) == -1 + for: 1m + labels: + severity: critical + service: tigergraph + annotations: + summary: "TigerGraph License Invalid" + description: "TigerGraph license for cluster {{ $labels.cluster_name }} in namespace {{ $labels.namespace }} is invalid or corrupted. Please check license configuration." 
+ + - name: tigergraph.recording.rules + rules: + - record: tigergraph:cpu_usage_percentage + expr: max(tigergraph_cpu_usage{service_name=""}) by (cluster_name, namespace, service_name, host_id) + + - record: tigergraph:memory_usage_percentage + expr: ((max(tigergraph_memory_usage{service_name=""}) by (cluster_name, namespace, service_name, host_id)) / max(tigergraph_memory_total{service_name=""}) by (cluster_name, namespace, service_name, host_id)) * 100 + + - record: tigergraph:disk_usage_percentage + expr: (max(tigergraph_diskspace_usage) by(namespace,cluster_name,mount_point,path, path_name, host_id) / (max(tigergraph_diskspace_usage) by(namespace,cluster_name,mount_point,path, path_name, host_id) + max(tigergraph_diskspace_free) by(namespace,cluster_name,mount_point,path, path_name, host_id))) * 100 + + - record: tigergraph:endpoint_latency_avg + expr: max(tigergraph_endpoint_latency{statistic="average_latency"}) by (namespace,cluster_name, endpoint, exported_endpoint, service_name) + + - record: tigergraph:endpoint_latency_max + expr: max(tigergraph_endpoint_latency{statistic="max_latency"}) by (namespace,cluster_name, endpoint, exported_endpoint, service_name) + + - record: tigergraph:qps_total + expr: sum(tigergraph_qps) by (namespace,cluster_name, endpoint, exported_endpoint, service_name, host_id) + + - record: tigergraph:service_online_count + expr: count(tigergraph_service_status == 6) by (namespace,cluster_name, service_name, host_id) + + - record: tigergraph:service_offline_count + expr: count(tigergraph_service_status != 6) by (namespace,cluster_name, service_name, host_id) + + - record: tigergraph:license_days_remaining + expr: min(tigergraph_license_days_left) by (cluster_name, namespace) +--- +apiVersion: v1 +kind: Secret +metadata: + name: slack-webhook-url +type: Opaque +stringData: + webhook-url: https://hooks.slack.com/services/YOUR_SLACK_WEBHOOK_URL diff --git a/k8s/docs/10-samples/monitoring/tigergraph-monitor.yaml b/k8s/docs/10-samples/monitoring/tigergraph-monitor.yaml index e1c89e09c..acb420cb9 100644 --- a/k8s/docs/10-samples/monitoring/tigergraph-monitor.yaml +++ b/k8s/docs/10-samples/monitoring/tigergraph-monitor.yaml @@ -1,10 +1,399 @@ apiVersion: graphdb.tigergraph.com/v1alpha1 kind: TigerGraphMonitor metadata: + labels: + tigergraph.com/namespace: tigergraph name: tigergraph-monitor namespace: tigergraph spec: - monitoredClusters: - - test-cluster - - e2e-monitor-test-cluster - releaseName: monitoring-stack + ruleSelectorLabels: + release: prometheus-stack + serviceMonitorLabels: + release: prometheus-stack + alertmanagerConfigLabels: + # remove this if you don't config alertmanagerConfigSelector for Alertmanager + release: prometheus-stack + alertmanagerConfig: + receivers: + - name: slack-receiver + slackConfigs: + - apiURL: + key: webhook-url + name: slack-webhook-url + channel: '#operator-monitoring-test' + color: '{{ if eq .Status "firing" }}danger{{ else }}good{{ end }}' + sendResolved: true + text: |- + {{ range .Alerts }} + *Alert:* {{ .Annotations.summary }} - `{{ .Labels.severity }}` + *Description:* {{ .Annotations.description }} + *Details:* + {{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}` + {{ end }} + {{ end }} + route: + groupBy: + - job + - alertname + groupInterval: 5m + groupWait: 30s + receiver: slack-receiver + repeatInterval: 1m + routes: + - continue: true + receiver: slack-receiver + prometheusRule: + groups: + - name: tigergraph.cpu.alerts + rules: + - alert: TigerGraphHighCPUUsage + expr: 
max(tigergraph_cpu_usage{service_name=""}) by (cluster_name, namespace, service_name, host_id) > 80 + for: 5m + labels: + severity: warning + service: tigergraph + annotations: + summary: "High CPU Usage Detected" + description: "CPU usage on host {{ $labels.host_id }} of cluster {{ $labels.cluster_name }} in namespace {{ $labels.namespace }} is high ({{ $value }}%)" + + - alert: TigerGraphCriticalCPUUsage + expr: max(tigergraph_cpu_usage{service_name=""}) by (cluster_name, namespace, service_name, host_id) > 90 + for: 3m + labels: + severity: critical + service: tigergraph + annotations: + summary: "Critical CPU Usage Detected" + description: "CPU usage on host {{ $labels.host_id }} of cluster {{ $labels.cluster_name }} in namespace {{ $labels.namespace }} is critically high ({{ $value }}%)" + + - alert: TigerGraphServiceHighCPUUsage + expr: max(tigergraph_cpu_usage{service_name!=""}) by (cluster_name, namespace, service_name, host_id) > 70 + for: 5m + labels: + severity: warning + service: tigergraph + annotations: + summary: "High CPU Usage for {{ $labels.service_name }} Service" + description: "CPU usage for {{ $labels.service_name }} service on host {{ $labels.host_id }} of cluster {{ $labels.cluster_name }} in namespace {{ $labels.namespace }} is high ({{ $value }}%)" + + - name: tigergraph.memory.alerts + rules: + - alert: TigerGraphHighMemoryUsage + expr: (max(tigergraph_memory_usage{service_name=""}) by (cluster_name, namespace, service_name, host_id) / max(tigergraph_memory_total{service_name=""}) by (cluster_name, namespace, service_name, host_id)) * 100 > 80 + for: 5m + labels: + severity: warning + service: tigergraph + annotations: + summary: "High Memory Usage Detected" + description: "Memory usage on host {{ $labels.host_id }} of cluster {{ $labels.cluster_name }} in namespace {{ $labels.namespace }} is high ({{ $value | humanizePercentage }})" + + - alert: TigerGraphCriticalMemoryUsage + expr: (max(tigergraph_memory_usage{service_name=""}) by (cluster_name, namespace, service_name, host_id) / max(tigergraph_memory_total{service_name=""}) by (cluster_name, namespace, service_name, host_id)) * 100 > 90 + for: 3m + labels: + severity: critical + service: tigergraph + annotations: + summary: "Critical Memory Usage Detected" + description: "Memory usage on host {{ $labels.host_id }} of cluster {{ $labels.cluster_name }} in namespace {{ $labels.namespace }} is critically high ({{ $value | humanizePercentage }})" + + - alert: TigerGraphLowMemoryAvailable + expr: max(tigergraph_memory_available{}) by (cluster_name, namespace, host_id) < 1000 + for: 5m + labels: + severity: warning + service: tigergraph + annotations: + summary: "Low Memory Available" + description: "Available memory on host {{ $labels.host_id }} of cluster {{ $labels.cluster_name }} in namespace {{ $labels.namespace }} is low ({{ $value }} MB)" + + - name: tigergraph.disk.alerts + rules: + - alert: TigerGraphHighDiskUsage + expr: (max by(host_id, mount_point, path)(tigergraph_diskspace_usage) / (max by(host_id, mount_point, path)(tigergraph_diskspace_usage) + max by(host_id, mount_point, path) (tigergraph_diskspace_free))) * 100 > 80 + for: 5m + labels: + severity: warning + service: tigergraph + annotations: + summary: "High Disk Usage Detected" + description: "Disk usage for {{ $labels.path_name }}: {{ $labels.path }} on host {{ $labels.host_id }} of cluster {{ $labels.cluster_name }} in namespace {{ $labels.namespace }} is high ({{ $value | humanizePercentage }})" + + - alert: TigerGraphCriticalDiskUsage + 
expr: (max by(host_id, mount_point, path)(tigergraph_diskspace_usage) / (max by(host_id, mount_point, path)(tigergraph_diskspace_usage) + max by(host_id, mount_point, path) (tigergraph_diskspace_free))) * 100 > 90 + for: 3m + labels: + severity: critical + service: tigergraph + annotations: + summary: "Critical Disk Usage Detected" + description: "Disk usage for {{ $labels.path_name }}: {{ $labels.path }} on host {{ $labels.host_id }} of cluster {{ $labels.cluster_name }} in namespace {{ $labels.namespace }} is critically high ({{ $value | humanizePercentage }})" + + - alert: TigerGraphLowDiskSpace + expr: max(tigergraph_diskspace_free) by (cluster_name, namespace, path_name, host_id, path) < 1000 + for: 5m + labels: + severity: warning + service: tigergraph + annotations: + summary: "Low Disk Space" + description: "Free disk space for {{ $labels.path_name }}: {{ $labels.path }} on host {{ $labels.host_id }} of cluster {{ $labels.cluster_name }} in namespace {{ $labels.namespace }} is low ({{ $value }} MB)" + + - alert: TigerGraphLowDiskInodes + expr: max(tigergraph_disk_inode_free) by (cluster_name, namespace, host_id, path_name, path) < 100000 + for: 5m + labels: + severity: warning + service: tigergraph + annotations: + summary: "Low Disk Inodes" + description: "Free disk inodes for {{ $labels.path_name }}: {{ $labels.path }} on host {{ $labels.host_id }} of cluster {{ $labels.cluster_name }} in namespace {{ $labels.namespace }} is low ({{ $value }})" + + - name: tigergraph.network.alerts + rules: + - alert: TigerGraphHighNetworkConnections + expr: max(tigergraph_network_connections) by (cluster_name, namespace, host_id) > 2000 + for: 5m + labels: + severity: warning + service: tigergraph + annotations: + summary: "High Network Connections" + description: "Number of open TCP connections on host {{ $labels.host_id }} of cluster {{ $labels.cluster_name }} in namespace {{ $labels.namespace }} is high ({{ $value }})" + + - name: tigergraph.service.alerts + rules: + - alert: TigerGraphServiceDown + expr: max(tigergraph_service_status) by (cluster_name, namespace, service_name, host_id) == 27 + for: 1m + labels: + severity: critical + service: tigergraph + annotations: + summary: "TigerGraph Service Down" + description: "Service {{ $labels.service_name }} on host {{ $labels.host_id }} of cluster {{ $labels.cluster_name }} in namespace {{ $labels.namespace }} is down" + + - alert: TigerGraphServiceOffline + expr: max(tigergraph_service_status) by (cluster_name, namespace, service_name, host_id) == 24 + for: 1m + labels: + severity: critical + service: tigergraph + annotations: + summary: "TigerGraph Service Offline" + description: "Service {{ $labels.service_name }} on host {{ $labels.host_id }} of cluster {{ $labels.cluster_name }} in namespace {{ $labels.namespace }} is offline" + + - alert: TigerGraphServiceStopping + expr: max(tigergraph_service_status) by (cluster_name, namespace, service_name, host_id) == 21 + for: 1m + labels: + severity: critical + service: tigergraph + annotations: + summary: "TigerGraph Service Stopping" + description: "Service {{ $labels.service_name }} on host {{ $labels.host_id }} of cluster {{ $labels.cluster_name }} in namespace {{ $labels.namespace }} is stopping" + + - alert: TigerGraphServicePaused + expr: max(tigergraph_service_status) by (cluster_name, namespace, service_name, host_id) == 18 + for: 1m + labels: + severity: critical + service: tigergraph + annotations: + summary: "TigerGraph Service Paused" + description: "Service {{ $labels.service_name }} 
on host {{ $labels.host_id }} of cluster {{ $labels.cluster_name }} in namespace {{ $labels.namespace }} is paused" + + - alert: TigerGraphServiceStarting + expr: max(tigergraph_service_status) by (cluster_name, namespace, service_name, host_id) == 15 + for: 2m + labels: + severity: warning + service: tigergraph + annotations: + summary: "TigerGraph Service Starting" + description: "Service {{ $labels.service_name }} on host {{ $labels.host_id }} of cluster {{ $labels.cluster_name }} in namespace {{ $labels.namespace }} is starting" + + - alert: TigerGraphServiceReadonly + expr: max(tigergraph_service_status) by (cluster_name, namespace, service_name, host_id) == 12 + for: 1m + labels: + severity: warning + service: tigergraph + annotations: + summary: "TigerGraph Service Readonly" + description: "Service {{ $labels.service_name }} on host {{ $labels.host_id }} of cluster {{ $labels.cluster_name }} in namespace {{ $labels.namespace }} is in readonly mode" + + - alert: TigerGraphServiceWarmup + expr: max(tigergraph_service_status) by (cluster_name, namespace, service_name, host_id) == 9 + for: 2m + labels: + severity: warning + service: tigergraph + annotations: + summary: "TigerGraph Service Warmup" + description: "Service {{ $labels.service_name }} on host {{ $labels.host_id }} of cluster {{ $labels.cluster_name }} in namespace {{ $labels.namespace }} is in warmup state" + + - alert: TigerGraphServiceUnknown + expr: max(tigergraph_service_status) by (cluster_name, namespace, service_name, host_id) == 3 + for: 1m + labels: + severity: critical + service: tigergraph + annotations: + summary: "TigerGraph Service Unknown Status" + description: "Service {{ $labels.service_name }} on host {{ $labels.host_id }} of cluster {{ $labels.cluster_name }} in namespace {{ $labels.namespace }} has unknown status" + + - name: tigergraph.performance.alerts + rules: + - alert: TigerGraphHighEndpointLatency + expr: max(tigergraph_endpoint_latency{statistic="average_latency"}) by (namespace,cluster_name, endpoint, exported_endpoint, service_name)> 5000 + for: 5m + labels: + severity: warning + service: tigergraph + annotations: + summary: "High Endpoint Latency" + description: "Average latency for exported_endpoint {{ $labels.exported_endpoint }} of endpoint {{ $labels.endpoint }} on host {{ $labels.host_id }} of cluster {{ $labels.cluster_name }} in namespace {{ $labels.namespace }} is high ({{ $value }}ms)" + + - alert: TigerGraphCriticalEndpointLatency + expr: max(tigergraph_endpoint_latency{statistic="average_latency"}) by (namespace,cluster_name, endpoint, exported_endpoint, service_name) > 10000 + for: 3m + labels: + severity: critical + service: tigergraph + annotations: + summary: "Critical Endpoint Latency" + description: "Average latency for exported_endpoint {{ $labels.exported_endpoint }} of endpoint {{ $labels.endpoint }} on host {{ $labels.host_id }} is critically high ({{ $value }}ms)" + + - alert: TigerGraphHighQPS + expr: max(tigergraph_qps) by (namespace,cluster_name, endpoint, exported_endpoint, service_name, host_id) > 100 + for: 5m + labels: + severity: warning + service: tigergraph + annotations: + summary: "High QPS Detected" + description: "High QPS for exported_endpoint {{ $labels.exported_endpoint }} of endpoint {{ $labels.endpoint }} on host {{ $labels.host_id }} of cluster {{ $labels.cluster_name }} in namespace {{ $labels.namespace }} ({{ $value }})" + + - alert: TigerGraphEndpointTimeout + expr: max(tigergraph_endpoint_timeout) by (namespace,cluster_name, endpoint, 
exported_endpoint, service_name, host_id) > 0 + for: 1m + labels: + severity: warning + service: tigergraph + annotations: + summary: "Endpoint Timeout" + description: "Timeout occurred for exported_endpoint {{ $labels.exported_endpoint }} of endpoint {{ $labels.endpoint }} on host {{ $labels.host_id }} of cluster {{ $labels.cluster_name }} in namespace {{ $labels.namespace }} ({{ $value }} timeouts)" + + - name: tigergraph.system.alerts + rules: + - alert: TigerGraphLowCPUAvailable + expr: max(tigergraph_cpu_available) by (namespace,cluster_name, host_id) < 10 + for: 5m + labels: + severity: warning + service: tigergraph + annotations: + summary: "Low CPU Available" + description: "Available CPU on host {{ $labels.host_id }} of cluster {{ $labels.cluster_name }} in namespace {{ $labels.namespace }} is low ({{ $value }}%)" + + - alert: TigerGraphHighDiskIO + expr: max(tigergraph_disk_iops) by (namespace,cluster_name, path_name, host_id, path, mount_point) > 1000 + for: 5m + labels: + severity: warning + service: tigergraph + annotations: + summary: "High Disk I/O" + description: "High disk I/O for {{ $labels.path_name }} of mount point {{ $labels.mount_point }} on host {{ $labels.host_id }} of cluster {{ $labels.cluster_name }} in namespace {{ $labels.namespace }} ({{ $value }} IOPS)" + + - alert: TigerGraphHighDiskIOTime + expr: max(tigergraph_disk_io_time) by (namespace,cluster_name, path_name, host_id, path,mount_point) > 0.1 + for: 5m + labels: + severity: warning + service: tigergraph + annotations: + summary: "High Disk I/O Time" + description: "High disk I/O time for {{ $labels.path_name }} of mount point {{ $labels.mount_point }} on host {{ $labels.host_id }} of cluster {{ $labels.cluster_name }} in namespace {{ $labels.namespace }} ({{ $value }} hours)" + + - name: tigergraph.license.alerts + rules: + - alert: TigerGraphLicenseExpiringSoon + expr: min(tigergraph_license_days_left) by (cluster_name, namespace) <= 30 and min(tigergraph_license_days_left) by (cluster_name, namespace) > 7 + for: 1m + labels: + severity: warning + service: tigergraph + annotations: + summary: "TigerGraph License Expiring Soon" + description: "TigerGraph license for cluster {{ $labels.cluster_name }} in namespace {{ $labels.namespace }} will expire in {{ $value }} days. Please renew the license." + + - alert: TigerGraphLicenseExpiringCritical + expr: min(tigergraph_license_days_left) by (cluster_name, namespace) <= 7 and min(tigergraph_license_days_left) by (cluster_name, namespace) > 0 + for: 1m + labels: + severity: critical + service: tigergraph + annotations: + summary: "TigerGraph License Expiring Critical" + description: "TigerGraph license for cluster {{ $labels.cluster_name }} in namespace {{ $labels.namespace }} will expire in {{ $value }} days. URGENT: License renewal required immediately." + + - alert: TigerGraphLicenseExpired + expr: min(tigergraph_license_days_left) by (cluster_name, namespace) == 0 + for: 1m + labels: + severity: critical + service: tigergraph + annotations: + summary: "TigerGraph License Expired" + description: "TigerGraph license for cluster {{ $labels.cluster_name }} in namespace {{ $labels.namespace }} has expired. Service may be affected." 
+ + - alert: TigerGraphLicenseInvalid + expr: min(tigergraph_license_days_left) by (cluster_name, namespace) == -1 + for: 1m + labels: + severity: critical + service: tigergraph + annotations: + summary: "TigerGraph License Invalid" + description: "TigerGraph license for cluster {{ $labels.cluster_name }} in namespace {{ $labels.namespace }} is invalid or corrupted. Please check license configuration." + + - name: tigergraph.recording.rules + rules: + - record: tigergraph:cpu_usage_percentage + expr: max(tigergraph_cpu_usage{service_name=""}) by (cluster_name, namespace, service_name, host_id) + + - record: tigergraph:memory_usage_percentage + expr: ((max(tigergraph_memory_usage{service_name=""}) by (cluster_name, namespace, service_name, host_id)) / max(tigergraph_memory_total{service_name=""}) by (cluster_name, namespace, service_name, host_id)) * 100 + + - record: tigergraph:disk_usage_percentage + expr: (max(tigergraph_diskspace_usage) by(namespace,cluster_name,mount_point,path, path_name, host_id) / (max(tigergraph_diskspace_usage) by(namespace,cluster_name,mount_point,path, path_name, host_id) + max(tigergraph_diskspace_free) by(namespace,cluster_name,mount_point,path, path_name, host_id))) * 100 + + - record: tigergraph:endpoint_latency_avg + expr: max(tigergraph_endpoint_latency{statistic="average_latency"}) by (namespace,cluster_name, endpoint, exported_endpoint, service_name) + + - record: tigergraph:endpoint_latency_max + expr: max(tigergraph_endpoint_latency{statistic="max_latency"}) by (namespace,cluster_name, endpoint, exported_endpoint, service_name) + + - record: tigergraph:qps_total + expr: sum(tigergraph_qps) by (namespace,cluster_name, endpoint, exported_endpoint, service_name, host_id) + + - record: tigergraph:service_online_count + expr: count(tigergraph_service_status == 6) by (namespace,cluster_name, service_name, host_id) + + - record: tigergraph:service_offline_count + expr: count(tigergraph_service_status != 6) by (namespace,cluster_name, service_name, host_id) + + - record: tigergraph:license_days_remaining + expr: min(tigergraph_license_days_left) by (cluster_name, namespace) + +--- +apiVersion: v1 +kind: Secret +metadata: + name: slack-webhook-url +type: Opaque +stringData: + webhook-url: https://hooks.slack.com/services/YOUR_SLACK_WEBHOOK_URL \ No newline at end of file diff --git a/k8s/docs/10-samples/monitoring/tigergraph-operator-alert-rules.yaml b/k8s/docs/10-samples/monitoring/tigergraph-operator-alert-rules.yaml new file mode 100644 index 000000000..0911eb7a1 --- /dev/null +++ b/k8s/docs/10-samples/monitoring/tigergraph-operator-alert-rules.yaml @@ -0,0 +1,303 @@ +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: tigergraph-operator-alert-rules + namespace: tigergraph + labels: + app: tigergraph + release: prometheus-stack +spec: + groups: + - name: tigergraph-operator-controller + rules: + # Controller Status Alerts + - alert: TigerGraphOperatorNoLeader + expr: sum by (namespace) (leader_election_master_status{operator="tigergraph",component="operator"}) == 0 + for: 2m + labels: + severity: critical + operator: tigergraph + component: operator + annotations: + summary: "TigerGraph Operator no leader elected" + description: "No TigerGraph Operator instance is elected as leader in namespace {{ $labels.namespace }} for more than 2 minutes" + + - alert: TigerGraphOperatorMultipleLeaders + expr: sum by (namespace) (leader_election_master_status{operator="tigergraph",component="operator"}) > 1 + for: 1m + labels: + severity: 
critical + operator: tigergraph + component: operator + annotations: + summary: "TigerGraph Operator multiple leaders detected" + description: "Multiple TigerGraph Operator instances are elected as leader in namespace {{ $labels.namespace }} - this indicates a split-brain condition" + + - alert: TigerGraphOperatorLeaderElectionUnhealthy + expr: (sum by (namespace) (leader_election_master_status{operator="tigergraph",component="operator"}) == 0) and (count by (namespace) (up{operator="tigergraph",component="operator"}) > 0) + for: 3m + labels: + severity: critical + operator: tigergraph + component: operator + annotations: + summary: "TigerGraph Operator leader election unhealthy" + description: "TigerGraph Operator instances are running but no leader is elected in namespace {{ $labels.namespace }} for more than 3 minutes" + + + - alert: TigerGraphOperatorMaxConcurrentReconcilesExceeded + expr: controller_runtime_max_concurrent_reconciles{operator="tigergraph",component="operator"} > 10 + for: 5m + labels: + severity: warning + operator: tigergraph + component: operator + annotations: + summary: "TigerGraph Operator max concurrent reconciles exceeded" + description: "Controller {{ $labels.controller }} in namespace {{ $labels.namespace }} has max concurrent reconciles > 10 for more than 5 minutes" + + - name: tigergraph-operator-reconciliation + rules: + # Reconciliation Performance Alerts + - alert: TigerGraphOperatorHighReconciliationErrors + expr: rate(controller_runtime_reconcile_errors_total{operator="tigergraph",component="operator"}[5m]) > 0.1 + for: 5m + labels: + severity: critical + operator: tigergraph + component: operator + annotations: + summary: "TigerGraph Operator high reconciliation error rate" + description: "Controller {{ $labels.controller }} in namespace {{ $labels.namespace }} has reconciliation error rate > 0.1 errors/sec for more than 5 minutes" + + - alert: TigerGraphOperatorReconciliationPanics + expr: rate(controller_runtime_reconcile_panics_total{operator="tigergraph",component="operator"}[5m]) > 0 + for: 1m + labels: + severity: critical + operator: tigergraph + component: operator + annotations: + summary: "TigerGraph Operator reconciliation panics detected" + description: "Controller {{ $labels.controller }} in namespace {{ $labels.namespace }} is experiencing reconciliation panics" + + - alert: TigerGraphOperatorHighTerminalErrors + expr: rate(controller_runtime_terminal_reconcile_errors_total{operator="tigergraph",component="operator"}[5m]) > 0.05 + for: 5m + labels: + severity: critical + operator: tigergraph + component: operator + annotations: + summary: "TigerGraph Operator high terminal reconciliation errors" + description: "Controller {{ $labels.controller }} in namespace {{ $labels.namespace }} has terminal reconciliation error rate > 0.05 errors/sec for more than 5 minutes" + + - name: tigergraph-operator-workqueue + rules: + # Workqueue Metrics Alerts + - alert: TigerGraphOperatorHighWorkqueueDepth + expr: workqueue_depth{operator="tigergraph",component="operator"} > 100 + for: 5m + labels: + severity: warning + operator: tigergraph + component: operator + annotations: + summary: "TigerGraph Operator high workqueue depth" + description: "Controller {{ $labels.controller }} in namespace {{ $labels.namespace }} has workqueue depth > 100 for more than 5 minutes" + + - alert: TigerGraphOperatorCriticalWorkqueueDepth + expr: workqueue_depth{operator="tigergraph",component="operator"} > 1000 + for: 2m + labels: + severity: critical + operator: tigergraph 
+ component: operator + annotations: + summary: "TigerGraph Operator critical workqueue depth" + description: "Controller {{ $labels.controller }} in namespace {{ $labels.namespace }} has critical workqueue depth > 1000 for more than 2 minutes" + + - alert: TigerGraphOperatorLongRunningProcessor + expr: workqueue_longest_running_processor_seconds{operator="tigergraph",component="operator"} > 300 + for: 5m + labels: + severity: warning + operator: tigergraph + component: operator + annotations: + summary: "TigerGraph Operator long running processor" + description: "Controller {{ $labels.controller }} in namespace {{ $labels.namespace }} has processor running > 5 minutes for more than 5 minutes" + + - alert: TigerGraphOperatorStuckWorkqueue + expr: workqueue_unfinished_work_seconds{operator="tigergraph",component="operator"} > 600 + for: 5m + labels: + severity: critical + operator: tigergraph + component: operator + annotations: + summary: "TigerGraph Operator stuck workqueue" + description: "Controller {{ $labels.controller }} in namespace {{ $labels.namespace }} has unfinished work > 10 minutes for more than 5 minutes" + + - alert: TigerGraphOperatorHighWorkqueueRetryRate + expr: rate(workqueue_retries_total{operator="tigergraph",component="operator"}[5m]) > 1 + for: 5m + labels: + severity: warning + operator: tigergraph + component: operator + annotations: + summary: "TigerGraph Operator high workqueue retry rate" + description: "Controller {{ $labels.controller }} in namespace {{ $labels.namespace }} has retry rate > 1 retries/sec for more than 5 minutes" + + - name: tigergraph-operator-system-resources + rules: + # System Resources Alerts + - alert: TigerGraphOperatorHighGoroutines + expr: go_goroutines{operator="tigergraph",component="operator"} > 1000 + for: 5m + labels: + severity: warning + operator: tigergraph + component: operator + annotations: + summary: "TigerGraph Operator high goroutine count" + description: "TigerGraph Operator in namespace {{ $labels.namespace }} has > 1000 goroutines for more than 5 minutes" + + - alert: TigerGraphOperatorCriticalGoroutines + expr: go_goroutines{operator="tigergraph",component="operator"} > 5000 + for: 2m + labels: + severity: critical + operator: tigergraph + component: operator + annotations: + summary: "TigerGraph Operator critical goroutine count" + description: "TigerGraph Operator in namespace {{ $labels.namespace }} has > 5000 goroutines for more than 2 minutes" + + - alert: TigerGraphOperatorHighMemoryUsage + expr: container_memory_working_set_bytes{container="manager", pod=~"tigergraph-operator.*"} / container_spec_memory_limit_bytes{container="manager", pod=~"tigergraph-operator.*"} > 0.8 + for: 5m + labels: + severity: warning + operator: tigergraph + component: operator + annotations: + summary: "TigerGraph Operator high memory usage" + description: "TigerGraph Operator container {{ $labels.pod }} is using > 80% of memory limit for more than 5 minutes" + + - alert: TigerGraphOperatorCriticalMemoryUsage + expr: container_memory_working_set_bytes{container="manager", pod=~"tigergraph-operator.*"} / container_spec_memory_limit_bytes{container="manager", pod=~"tigergraph-operator.*"} > 0.95 + for: 2m + labels: + severity: critical + operator: tigergraph + component: operator + annotations: + summary: "TigerGraph Operator critical memory usage" + description: "TigerGraph Operator container {{ $labels.pod }} is using > 95% of memory limit for more than 2 minutes" + + - alert: TigerGraphOperatorHighCPUUsage + expr: 
rate(container_cpu_usage_seconds_total{container="manager", pod=~"tigergraph-operator.*"}[5m]) > 0.8 + for: 5m + labels: + severity: warning + operator: tigergraph + component: operator + annotations: + summary: "TigerGraph Operator high CPU usage" + description: "TigerGraph Operator container {{ $labels.pod }} is using > 0.8 CPU cores for more than 5 minutes" + + - alert: TigerGraphOperatorHighFileDescriptors + expr: process_open_fds{operator="tigergraph",component="operator"} / process_max_fds{operator="tigergraph",component="operator"} > 0.8 + for: 5m + labels: + severity: warning + operator: tigergraph + component: operator + annotations: + summary: "TigerGraph Operator high file descriptor usage" + description: "TigerGraph Operator in namespace {{ $labels.namespace }} is using > 80% of available file descriptors for more than 5 minutes" + + - alert: TigerGraphOperatorCriticalFileDescriptors + expr: process_open_fds{operator="tigergraph",component="operator"} / process_max_fds{operator="tigergraph",component="operator"} > 0.95 + for: 2m + labels: + severity: critical + operator: tigergraph + component: operator + annotations: + summary: "TigerGraph Operator critical file descriptor usage" + description: "TigerGraph Operator in namespace {{ $labels.namespace }} is using > 95% of available file descriptors for more than 2 minutes" + + - name: tigergraph-operator-webhook + rules: + # Webhook & HTTP Alerts + - alert: TigerGraphOperatorWebhookHighErrorRate + expr: rate(controller_runtime_webhook_requests_total{operator="tigergraph",component="operator",code=~"5.."}[5m]) > 0.1 + for: 5m + labels: + severity: warning + operator: tigergraph + component: operator + annotations: + summary: "TigerGraph Operator webhook high error rate" + description: "Webhook {{ $labels.webhook }} in namespace {{ $labels.namespace }} has 5xx error rate > 0.1 errors/sec for more than 5 minutes" + + - alert: TigerGraphOperatorWebhookHighLatency + expr: histogram_quantile(0.95, rate(controller_runtime_webhook_latency_seconds_bucket{operator="tigergraph",component="operator"}[5m])) > 1 + for: 5m + labels: + severity: warning + operator: tigergraph + component: operator + annotations: + summary: "TigerGraph Operator webhook high latency" + description: "Webhook {{ $labels.webhook }} in namespace {{ $labels.namespace }} has 95th percentile latency > 1 second for more than 5 minutes" + + - alert: TigerGraphOperatorRESTClientHighErrorRate + expr: rate(rest_client_requests_total{operator="tigergraph",component="operator",code=~"5.."}[5m]) > 0.1 + for: 5m + labels: + severity: warning + operator: tigergraph + component: operator + annotations: + summary: "TigerGraph Operator REST client high error rate" + description: "REST client requests in namespace {{ $labels.namespace }} have 5xx error rate > 0.1 errors/sec for more than 5 minutes" + + - name: tigergraph-operator-availability + rules: + # Availability and Health Alerts + - alert: TigerGraphOperatorDown + expr: up{operator="tigergraph",component="operator"} == 0 + for: 1m + labels: + severity: critical + operator: tigergraph + component: operator + annotations: + summary: "TigerGraph Operator is down" + description: "TigerGraph Operator in namespace {{ $labels.namespace }} has been down for more than 1 minute" + + - alert: TigerGraphOperatorNoMetrics + expr: absent(controller_runtime_active_workers{operator="tigergraph",component="operator"}) + for: 5m + labels: + severity: critical + operator: tigergraph + component: operator + annotations: + summary: "TigerGraph 
Operator no metrics available" + description: "No TigerGraph Operator metrics are available in namespace {{ $labels.namespace }} for more than 5 minutes" + + - alert: TigerGraphOperatorNamespaceDown + expr: sum by (namespace) (up{operator="tigergraph",component="operator"}) == 0 + for: 2m + labels: + severity: critical + operator: tigergraph + component: operator + annotations: + summary: "TigerGraph Operator namespace completely down" + description: "All TigerGraph Operator instances in namespace {{ $labels.namespace }} are down for more than 2 minutes" diff --git a/k8s/docs/10-samples/monitoring/tigergraph-operator-controller-manager-grafana-configmap.yaml b/k8s/docs/10-samples/monitoring/tigergraph-operator-controller-manager-grafana-configmap.yaml new file mode 100644 index 000000000..123f9254f --- /dev/null +++ b/k8s/docs/10-samples/monitoring/tigergraph-operator-controller-manager-grafana-configmap.yaml @@ -0,0 +1,2723 @@ +apiVersion: v1 +data: + tigergraph-operator-dashboard.json: |- + { + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "description": "TigerGraph Operator Metrics Dashboard - Comprehensive monitoring for TigerGraph Kubernetes Operator based on actual operator metrics. This dashboard provides insights into controller performance, reconciliation operations, workqueue metrics, and system resource utilization.", + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 2, + "links": [], + "liveNow": false, + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 1, + "panels": [], + "title": "Controller Status", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "fillOpacity": 70, + "lineWidth": 0, + "spanNulls": false + }, + "mappings": [ + { + "options": { + "0": { + "color": "red", + "index": 0, + "text": "Inactive" + }, + "1": { + "color": "green", + "index": 1, + "text": "Active" + } + }, + "type": "value" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 1 + }, + "id": 2, + "options": { + "alignValue": "left", + "legend": { + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "mergeValues": false, + "rowHeight": 0.9, + "showValue": "auto", + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "controller_runtime_active_workers{operator=\"tigergraph\",component=\"operator\",namespace=\"$namespace\",controller=~\"$controller\"}", + "legendFormat": "{{controller}} ({{namespace}})", + "range": true, + "refId": "A" + } + ], + "title": "Controller Active Workers", + "type": "state-timeline" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": 
false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 1 + }, + "id": 3, + "options": { + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "controller_runtime_max_concurrent_reconciles{operator=\"tigergraph\",component=\"operator\",namespace=\"$namespace\",controller=~\"$controller\"}", + "legendFormat": "{{controller}} ({{namespace}})", + "range": true, + "refId": "A" + } + ], + "title": "Max Concurrent Reconciles", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "fillOpacity": 70, + "lineWidth": 0, + "spanNulls": false + }, + "mappings": [ + { + "options": { + "0": { + "color": "red", + "index": 0, + "text": "Not Leader" + }, + "1": { + "color": "green", + "index": 1, + "text": "Leader" + } + }, + "type": "value" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 9 + }, + "id": 32, + "options": { + "alignValue": "left", + "legend": { + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "mergeValues": false, + "rowHeight": 0.9, + "showValue": "auto", + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "leader_election_master_status{operator=\"tigergraph\",component=\"operator\",namespace=\"$namespace\"}", + "legendFormat": "{{name}} ({{namespace}})", + "range": true, + "refId": "A" + } + ], + "title": "Leader Election Master Status", + "type": "state-timeline" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 17 + }, + "id": 4, + "panels": [], + "title": "Reconciliation Performance", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": 
[] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 18 + }, + "id": 5, + "options": { + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "rate(controller_runtime_reconcile_total{operator=\"tigergraph\",component=\"operator\",namespace=\"$namespace\",controller=~\"$controller\"}[5m])", + "legendFormat": "{{controller}} - {{result}} ({{namespace}})", + "range": true, + "refId": "A" + } + ], + "title": "Reconciliation Rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 18 + }, + "id": 6, + "options": { + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "rate(controller_runtime_reconcile_errors_total{operator=\"tigergraph\",component=\"operator\",namespace=\"$namespace\",controller=~\"$controller\"}[5m])", + "legendFormat": "{{controller}} ({{namespace}})", + "range": true, + "refId": "A" + } + ], + "title": "Reconciliation Errors", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 26 + }, + "id": 21, + "options": { + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + 
"uid": "prometheus" + }, + "editorMode": "code", + "expr": "rate(controller_runtime_reconcile_panics_total{operator=\"tigergraph\",component=\"operator\",namespace=\"$namespace\",controller=~\"$controller\"}[5m])", + "legendFormat": "{{controller}} ({{namespace}})", + "range": true, + "refId": "A" + } + ], + "title": "Reconciliation Panics", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 26 + }, + "id": 22, + "options": { + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "rate(controller_runtime_terminal_reconcile_errors_total{operator=\"tigergraph\",component=\"operator\",namespace=\"$namespace\",controller=~\"$controller\"}[5m])", + "legendFormat": "{{controller}} ({{namespace}})", + "range": true, + "refId": "A" + } + ], + "title": "Terminal Reconciliation Errors", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 34 + }, + "id": 23, + "options": { + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "rate(controller_runtime_reconcile_total{result=\"success\",operator=\"tigergraph\",component=\"operator\",namespace=\"$namespace\",controller=~\"$controller\"}[5m])", + "legendFormat": "Success ({{controller}}) ({{namespace}})", + "range": true, + "refId": "A" + }, + { 
+ "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "rate(controller_runtime_reconcile_total{result=\"error\",operator=\"tigergraph\",component=\"operator\",namespace=\"$namespace\",controller=~\"$controller\"}[5m])", + "legendFormat": "Error ({{controller}}) ({{namespace}})", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "rate(controller_runtime_reconcile_total{result=\"requeue\",operator=\"tigergraph\",component=\"operator\",namespace=\"$namespace\",controller=~\"$controller\"}[5m])", + "legendFormat": "Requeue ({{controller}}) ({{namespace}})", + "range": true, + "refId": "C" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "rate(controller_runtime_reconcile_total{result=\"requeue_after\",operator=\"tigergraph\",component=\"operator\",namespace=\"$namespace\",controller=~\"$controller\"}[5m])", + "legendFormat": "Requeue After ({{controller}}) ({{namespace}})", + "range": true, + "refId": "D" + } + ], + "title": "Reconciliation Results Breakdown", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 34 + }, + "id": 24, + "options": { + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "controller_runtime_webhook_requests_in_flight{operator=\"tigergraph\",component=\"operator\",namespace=\"$namespace\"}", + "legendFormat": "{{webhook}} ({{namespace}})", + "range": true, + "refId": "A" + } + ], + "title": "Webhook Requests In Flight", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 42 + }, + "id": 7, + "panels": [], + "title": "Workqueue Metrics", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + 
"stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 43 + }, + "id": 8, + "options": { + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "workqueue_depth{operator=\"tigergraph\",component=\"operator\",namespace=\"$namespace\",controller=~\"$controller\"}", + "legendFormat": "{{controller}} ({{namespace}})", + "range": true, + "refId": "A" + } + ], + "title": "Workqueue Depth", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 43 + }, + "id": 9, + "options": { + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "workqueue_longest_running_processor_seconds{operator=\"tigergraph\",component=\"operator\",namespace=\"$namespace\",controller=~\"$controller\"}", + "legendFormat": "{{controller}} ({{namespace}})", + "range": true, + "refId": "A" + } + ], + "title": "Longest Running Processor", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 43 + }, + "id": 16, + 
"options": { + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "workqueue_unfinished_work_seconds{operator=\"tigergraph\",component=\"operator\",namespace=\"$namespace\",controller=~\"$controller\"}", + "legendFormat": "{{controller}} ({{namespace}})", + "range": true, + "refId": "A" + } + ], + "title": "Unfinished Work (Stuck Threads)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 51 + }, + "id": 17, + "options": { + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "rate(workqueue_adds_total{operator=\"tigergraph\",component=\"operator\",namespace=\"$namespace\",controller=~\"$controller\"}[5m])", + "legendFormat": "{{controller}} ({{namespace}})", + "range": true, + "refId": "A" + } + ], + "title": "Workqueue Add Rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 51 + }, + "id": 18, + "options": { + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": 
"rate(workqueue_retries_total{operator=\"tigergraph\",component=\"operator\",namespace=\"$namespace\",controller=~\"$controller\"}[5m])", + "legendFormat": "{{controller}} ({{namespace}})", + "range": true, + "refId": "A" + } + ], + "title": "Workqueue Retry Rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 59 + }, + "id": 19, + "options": { + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.95, rate(workqueue_queue_duration_seconds_bucket{operator=\"tigergraph\",component=\"operator\",namespace=\"$namespace\",controller=~\"$controller\"}[5m]))", + "legendFormat": "95th percentile ({{controller}}) ({{namespace}})", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.50, rate(workqueue_queue_duration_seconds_bucket{operator=\"tigergraph\",component=\"operator\",namespace=\"$namespace\",controller=~\"$controller\"}[5m]))", + "legendFormat": "50th percentile ({{controller}}) ({{namespace}})", + "range": true, + "refId": "B" + } + ], + "title": "Workqueue Queue Duration (P95/P50)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 59 + }, + "id": 20, + "options": { + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + 
"targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.95, rate(workqueue_work_duration_seconds_bucket{operator=\"tigergraph\",component=\"operator\",namespace=\"$namespace\",controller=~\"$controller\"}[5m]))", + "legendFormat": "95th percentile ({{controller}}) ({{namespace}})", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "histogram_quantile(0.50, rate(workqueue_work_duration_seconds_bucket{operator=\"tigergraph\",component=\"operator\",namespace=\"$namespace\",controller=~\"$controller\"}[5m]))", + "legendFormat": "50th percentile ({{controller}}) ({{namespace}})", + "range": true, + "refId": "B" + } + ], + "title": "Workqueue Processing Duration (P95/P50)", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 67 + }, + "id": 10, + "panels": [], + "title": "System Resources", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 68 + }, + "id": 11, + "options": { + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "go_goroutines{operator=\"tigergraph\",component=\"operator\",namespace=\"$namespace\"}", + "legendFormat": "Goroutines ({{namespace}})", + "range": true, + "refId": "A" + } + ], + "title": "Goroutines Count", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "CPU Cores", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 4, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] 
+ }, + "gridPos": { + "h": 8, + "w": 8, + "x": 8, + "y": 68 + }, + "id": 25, + "options": { + "legend": { + "calcs": [ + "min", + "max", + "mean" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(rate(container_cpu_usage_seconds_total{namespace=\"$namespace\", container=\"manager\", image!=\"\", container!=\"\"}[$__rate_interval])) by (container,pod)", + "interval": "30s", + "legendFormat": "{{pod}} {{container}}", + "range": true, + "refId": "A" + } + ], + "title": "TigerGraph Operator Container CPU Usage", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "Memory", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 2, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "decbytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 16, + "y": 68 + }, + "id": 26, + "options": { + "legend": { + "calcs": [ + "min", + "max", + "mean" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(container_memory_working_set_bytes{namespace=\"$namespace\", container=\"manager\", image!=\"\", container!=\"\"}) by (container,pod)", + "interval": "30s", + "legendFormat": "{{pod}} {{container}}", + "range": true, + "refId": "A" + } + ], + "title": "TigerGraph Operator Container Memory Usage", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "decbytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 76 + }, + "id": 30, + "options": { + "legend": { + "calcs": [ + 
"lastNotNull" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "process_resident_memory_bytes{operator=\"tigergraph\",component=\"operator\",namespace=\"$namespace\"}", + "legendFormat": "RSS Memory ({{namespace}})", + "range": true, + "refId": "A" + } + ], + "title": "Resident Memory (RSS)", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "Memory", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineStyle": { + "fill": "solid" + }, + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 2, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "decbytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 16, + "x": 0, + "y": 84 + }, + "id": 31, + "options": { + "legend": { + "calcs": [ + "min", + "max", + "mean" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(container_memory_working_set_bytes{namespace=\"$namespace\", pod=~\"tigergraph-operator.*\", image!=\"\", container!=\"\"}) by (container,pod)", + "interval": "30s", + "legendFormat": "{{pod}} {{container}}", + "range": true, + "refId": "A" + } + ], + "title": "All TigerGraph Operator Containers Memory Usage", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "opacity", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "smooth", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "decbytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 16, + "x": 8, + "y": 76 + }, + "id": 12, + "options": { + "legend": { + "calcs": [ + "min", + "max", + "mean" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "desc" + } + }, + "targets": [ + { + "datasource": { + "type": 
"prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "go_memstats_heap_alloc_bytes{operator=\"tigergraph\",component=\"operator\",namespace=\"$namespace\"}", + "legendFormat": "Heap Allocated ({{namespace}})", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "go_memstats_heap_inuse_bytes{operator=\"tigergraph\",component=\"operator\",namespace=\"$namespace\"}", + "legendFormat": "Heap In Use ({{namespace}})", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "go_memstats_heap_sys_bytes{operator=\"tigergraph\",component=\"operator\",namespace=\"$namespace\"}", + "legendFormat": "Heap System ({{namespace}})", + "range": true, + "refId": "C" + } + ], + "title": "Go Heap Memory", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 84 + }, + "id": 28, + "options": { + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "process_open_fds{operator=\"tigergraph\",component=\"operator\",namespace=\"$namespace\"}", + "legendFormat": "Open FDs ({{namespace}})", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "process_max_fds{operator=\"tigergraph\",component=\"operator\",namespace=\"$namespace\"}", + "legendFormat": "Max FDs ({{namespace}})", + "range": true, + "refId": "B" + } + ], + "title": "File Descriptors", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + 
{ + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 84 + }, + "id": 29, + "options": { + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "go_threads{operator=\"tigergraph\",component=\"operator\",namespace=\"$namespace\"}", + "legendFormat": "OS Threads ({{namespace}})", + "range": true, + "refId": "A" + } + ], + "title": "OS Threads Count", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 59 + }, + "id": 13, + "panels": [], + "title": "Webhook & HTTP", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 60 + }, + "id": 14, + "options": { + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "rate(controller_runtime_webhook_requests_total{operator=\"tigergraph\",component=\"operator\",namespace=\"$namespace\"}[5m])", + "legendFormat": "{{webhook}} - {{code}} ({{namespace}})", + "range": true, + "refId": "A" + } + ], + "title": "Webhook Request Rate", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 60 + }, + "id": 15, + "options": { + "legend": { + "calcs": [ + "lastNotNull" + ], + "displayMode": "table", + "placement": "bottom", + "showLegend": 
true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "rate(rest_client_requests_total{operator=\"tigergraph\",component=\"operator\",namespace=\"$namespace\"}[5m])", + "legendFormat": "{{method}} {{code}} ({{namespace}})", + "range": true, + "refId": "A" + } + ], + "title": "REST Client Request Rate", + "type": "timeseries" + } + ], + "refresh": "30s", + "schemaVersion": 38, + "style": "dark", + "tags": [ + "tigergraph", + "operator", + "kubernetes" + ], + "templating": { + "list": [ + { + "current": { + "selected": false, + "text": "All", + "value": "$__all" + }, + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "definition": "label_values(controller_runtime_active_workers{operator=\"tigergraph\",component=\"operator\"}, namespace)", + "hide": 0, + "includeAll": true, + "multi": true, + "name": "namespace", + "options": [], + "query": { + "query": "label_values(controller_runtime_active_workers{operator=\"tigergraph\",component=\"operator\"}, namespace)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + }, + { + "current": { + "selected": false, + "text": "All", + "value": "$__all" + }, + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "definition": "label_values(controller_runtime_active_workers{operator=\"tigergraph\",component=\"operator\",namespace=~\"$namespace\"}, controller)", + "hide": 0, + "includeAll": true, + "multi": true, + "name": "controller", + "options": [], + "query": { + "query": "label_values(controller_runtime_active_workers{operator=\"tigergraph\",component=\"operator\",namespace=~\"$namespace\"}, controller)", + "refId": "PrometheusVariableQueryEditor-VariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "TigerGraph Operator Metrics Dashboard", + "uid": "tigergraph-operator-metrics-dashboard", + "version": 1, + "weekStart": "" + } +kind: ConfigMap +metadata: + annotations: + meta.helm.sh/release-name: tg-operator + meta.helm.sh/release-namespace: tigergraph + labels: + app.kubernetes.io/instance: tg-operator + app.kubernetes.io/name: tg-operator + control-plane: controller-manager + grafana_dashboard: "1" + name: tigergraph-operator-controller-manager-grafana-configmap + namespace: tigergraph diff --git a/k8s/docs/10-samples/monitoring/tigergraph-operator-controller-manager-metrics-monitor.yaml b/k8s/docs/10-samples/monitoring/tigergraph-operator-controller-manager-metrics-monitor.yaml new file mode 100644 index 000000000..4867acdde --- /dev/null +++ b/k8s/docs/10-samples/monitoring/tigergraph-operator-controller-manager-metrics-monitor.yaml @@ -0,0 +1,57 @@ +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + labels: + release: prometheus-stack + name: tigergraph-operator-controller-manager-metrics-monitor + namespace: tigergraph +spec: + selector: + matchLabels: + control-plane: controller-manager + endpoints: + - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + path: /metrics + port: https + scheme: https + tlsConfig: + ca: + secret: + key: ca.crt + name: metrics-server-cert + cert: + secret: + key: tls.crt + name: metrics-server-cert + insecureSkipVerify: false + keySecret: + key: 
tls.key + name: metrics-server-cert + serverName: tigergraph-operator-controller-manager-metrics-service.tigergraph.svc + relabelings: + - action: replace + replacement: tigergraph + sourceLabels: + - __meta_kubernetes_service_name + targetLabel: operator + - action: replace + replacement: operator + sourceLabels: + - __meta_kubernetes_service_name + targetLabel: component + - action: replace + sourceLabels: + - __meta_kubernetes_namespace + targetLabel: namespace + - action: replace + sourceLabels: + - __meta_kubernetes_pod_name + targetLabel: pod + - action: replace + sourceLabels: + - __meta_kubernetes_service_label_app_kubernetes_io_name + targetLabel: app_name + - action: replace + sourceLabels: + - __meta_kubernetes_service_label_control_plane + targetLabel: control_plane From 00028c6a08f5a84623f537bd2d2e3392f49a12a9 Mon Sep 17 00:00:00 2001 From: Jerry Yang <103920848+jerryyangtg@users.noreply.github.com> Date: Fri, 26 Sep 2025 16:56:01 +0800 Subject: [PATCH 2/6] Update the user documentation for TigerGraph upgrades and Operator upgrades; --- k8s/docs/04-manage/operator-upgrade.md | 44 ++++++++++++++++++++++++ k8s/docs/04-manage/tigergraph-upgrade.md | 41 ++++++++++++++++++++++ 2 files changed, 85 insertions(+) diff --git a/k8s/docs/04-manage/operator-upgrade.md b/k8s/docs/04-manage/operator-upgrade.md index 8ff9130e3..37a3a493a 100644 --- a/k8s/docs/04-manage/operator-upgrade.md +++ b/k8s/docs/04-manage/operator-upgrade.md @@ -3,6 +3,9 @@ This document provides step-by-step instructions for upgrading the TigerGraph Kubernetes Operator using the kubectl-tg plugin. - [How to upgrade TigerGraph Kubernetes Operator](#how-to-upgrade-tigergraph-kubernetes-operator) + - [Before You begin](#before-you-begin) + - [Release Number Definition](#release-number-definition) + - [Check whether high availability is enabled on the TigerGraph Operator](#check-whether-high-availability-is-enabled-on-the-tigergraph-operator) - [Upgrading from TigerGraph Operator 1.0.0 and later versions to version 1.7.0](#upgrading-from-tigergraph-operator-100-and-later-versions-to-version-170) - [Upgrading kubectl-tg plugin](#upgrading-kubectl-tg-plugin) - [Upgrading TigerGraph Operator](#upgrading-tigergraph-operator) @@ -27,6 +30,47 @@ This document provides step-by-step instructions for upgrading the TigerGraph Ku - [Successfully upgraded the operator from version 0.0.9 to version 1.2.0 and earlier, but still encountered some errors when creating a TigerGraph cluster](#successfully-upgraded-the-operator-from-version-009-to-version-120-and-earlier-but-still-encountered-some-errors-when-creating-a-tigergraph-cluster) - [Failed to upgrade the operator from version 0.0.9 to version 1.3.0 and above](#failed-to-upgrade-the-operator-from-version-009-to-version-130-and-above) +## Before You begin + +### Release Number Definition + +Similar to the TigerGraph release number, the operator's release number consists of three parts, represented as X.Y.Z: + +- **X (MAJOR version)**: Indicates incompatible CRD (Custom Resource Definition) changes. + +- **Y (MINOR version)**: Indicates the addition of functionality in a backward-compatible manner, without breaking changes. + +- **Z (PATCH version)**: Indicates backward-compatible bug fixes, with no changes to MAJOR and MINOR versions. + +Therefore, there are no breaking changes when upgrading the Operator within a MINOR or PATCH version. + +> [!NOTE] +> A backward-compatible Operator upgrade may still introduce changes to the StatefulSet used to manage TigerGraph. 
+> However, these changes will not take effect until you update the TigerGraph CR, at which point a rolling upgrade will be triggered. + +### Check whether high availability is enabled on the TigerGraph Operator + +In production environments, high availability should always be enabled on the TigerGraph Operator to ensure seamless upgrades. + +You can check the replicas of Operator by the following command: + +```bash +kubectl get deployment tigergraph-operator-controller-manager -o jsonpath='{.spec.replicas}' -n ${YOUR_NAMESPACE_OF_OPERATOR} +``` + +Example output: + +```bash +$ kubectl get deployment tigergraph-operator-controller-manager -o jsonpath='{.spec.replicas}' -n tigergraph +1 +``` + +If the output above is 1, scale out the Operator with the following command: + +```bash +kubectl tg upgrade --namespace ${YOUR_NAMESPACE_OF_OPERATOR} --operator-size 3 +``` + ## Upgrading from TigerGraph Operator 1.0.0 and later versions to version 1.7.0 ### Upgrading kubectl-tg plugin diff --git a/k8s/docs/04-manage/tigergraph-upgrade.md b/k8s/docs/04-manage/tigergraph-upgrade.md index 9e581f755..ecec4b61c 100644 --- a/k8s/docs/04-manage/tigergraph-upgrade.md +++ b/k8s/docs/04-manage/tigergraph-upgrade.md @@ -3,6 +3,8 @@ This guide will walk you through upgrading the TigerGraph Cluster using the TigerGraph Operator. - [Upgrade the TigerGraph Cluster Using the TigerGraph Operator](#upgrade-the-tigergraph-cluster-using-the-tigergraph-operator) + - [Before you begin](#before-you-begin) + - [Check the compatibility between TigerGraph and TigerGraph Operator](#check-the-compatibility-between-tigergraph-and-tigergraph-operator) - [Upgrade the TigerGraph Cluster](#upgrade-the-tigergraph-cluster) - [Upgrade pre-check for TigerGraph upgrading](#upgrade-pre-check-for-tigergraph-upgrading) - [Maintenance Release Upgrade support](#maintenance-release-upgrade-support) @@ -10,6 +12,45 @@ This guide will walk you through upgrading the TigerGraph Cluster using the Tige - [How to proceed with the upgrade process if the upgrade pre-check job fails due to incorrect image or downgrade error](#how-to-proceed-with-the-upgrade-process-if-the-upgrade-pre-check-job-fails-due-to-incorrect-image-or-downgrade-error) - [How to proceed with the upgrade process if the upgrade pre-check fails Due to insufficient ephemeral local storage](#how-to-proceed-with-the-upgrade-process-if-the-upgrade-pre-check-fails-due-to-insufficient-ephemeral-local-storage) +## Before you begin + +### Check the compatibility between TigerGraph and TigerGraph Operator + +> [!IMPORTANT] +> Each Operator version defines a maximum supported TG version. If the target TG version is outside the supported range of the current Operator, +> you must first upgrade the Operator to a version that supports it, or simply upgrade to the latest Operator version. 
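+
+To see which TigerGraph version a cluster is currently running, you can read the image tag back from its CR before consulting the table below. This is a minimal check, assuming the cluster CR is exposed under the `tigergraph` resource name and uses the `spec.image` field shown in the deployment examples:
+
+```bash
+# Print the TigerGraph image (including the version tag) configured for the cluster
+kubectl get tigergraph ${YOUR_CLUSTER_NAME} -o jsonpath='{.spec.image}{"\n"}' -n ${YOUR_NAMESPACE}
+```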
+ +The synergy and compatibility between TigerGraph and TigerGraph Operator: + +| TigerGraph Operator version | TigerGraph version | +|----------|----------| +| 1.7.0 | TigerGraph >= 3.6.0 && TigerGraph <= 4.3.0| +| 1.6.0 | TigerGraph >= 3.6.0 && TigerGraph <= 4.2.1| +| 1.5.0 | TigerGraph >= 3.6.0 && TigerGraph <= 4.2.0| +| 1.4.0 | TigerGraph >= 3.6.0 && TigerGraph <= 4.1.2| +| 1.3.0 | TigerGraph >= 3.6.0 && TigerGraph <= 4.1.1| +| 1.2.0 | TigerGraph >= 3.6.0 && TigerGraph <= 4.1.0| +| 1.1.1 | TigerGraph >= 3.6.0 && TigerGraph <= 3.10.2| +| 1.1.0 | TigerGraph >= 3.6.0 && TigerGraph <= 3.10.1| +| 1.0.0 | TigerGraph >= 3.6.0 && TigerGraph <= 3.10.0| + +For example, Operator version 1.5.0 supports up to TigerGraph 4.2.0. To install or upgrade a cluster to TigerGraph 4.2.1, +you must first upgrade the Operator to version 1.6.0 or above. +For detailed steps of Operator upgrading, see the [Operator upgrade guide](../04-manage/operator-upgrade.md). + +You can check the Operator version by the following command: + +```bash +helm ls -A|grep tg-operator +``` + +Example output: + +```bash +$ helm ls -A|grep tg-operator +tg-operator tigergraph 1 2025-09-26 04:53:01.952172143 +0000 UTC deployed tg-operator-1.7.0 1.7.0 +``` + ## Upgrade the TigerGraph Cluster To upgrade the TigerGraph cluster using the TigerGraph Operator, you can use the kubectl-tg plugin or update the TigerGraph Cluster CR to complete the upgrade seamlessly. From 48a92d6f521edce6f44aecc9071c8575325a522f Mon Sep 17 00:00:00 2001 From: Jerry Yang <103920848+jerryyangtg@users.noreply.github.com> Date: Sat, 11 Oct 2025 10:36:37 +0800 Subject: [PATCH 3/6] Fix the incorrect storageclass in the deployment doc for aks and eks; --- k8s/docs/03-deploy/tigergraph-on-aks.md | 2 +- k8s/docs/03-deploy/tigergraph-on-eks.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/k8s/docs/03-deploy/tigergraph-on-aks.md b/k8s/docs/03-deploy/tigergraph-on-aks.md index 497aed4e9..25506e46d 100644 --- a/k8s/docs/03-deploy/tigergraph-on-aks.md +++ b/k8s/docs/03-deploy/tigergraph-on-aks.md @@ -337,7 +337,7 @@ In general, we recommend setting the replication factor (HA) to 2 and using a cl > Then, when creating the TigerGraph cluster, use the `--license-secret` option to set the license: > ```bash > kubectl tg create --cluster-name ${YOUR_CLUSTER_NAME} --private-key-secret ${YOUR_SSH_KEY_SECRET_NAME} --size 4 --ha 2 --version 4.2.1 \ -> --license-secret ${YOUR_CLUSTER_NAME}-license --storage-class standard --storage-size 10G --cpu 2000m --memory 6Gi --namespace ${YOUR_NAMESPACE} +> --license-secret ${YOUR_CLUSTER_NAME}-license --storage-class default --storage-size 10G --cpu 2000m --memory 6Gi --namespace ${YOUR_NAMESPACE} > ``` > [!IMPORTANT] diff --git a/k8s/docs/03-deploy/tigergraph-on-eks.md b/k8s/docs/03-deploy/tigergraph-on-eks.md index 93a5d1c9a..b7977bfe0 100644 --- a/k8s/docs/03-deploy/tigergraph-on-eks.md +++ b/k8s/docs/03-deploy/tigergraph-on-eks.md @@ -363,7 +363,7 @@ In general, we recommend setting the replication factor (HA) to 2 and using a cl > Then, when creating the TigerGraph cluster, use the `--license-secret` option to set the license: > ```bash > kubectl tg create --cluster-name ${YOUR_CLUSTER_NAME} --private-key-secret ${YOUR_SSH_KEY_SECRET_NAME} --size 4 --ha 2 --version 4.2.1 \ -> --license-secret ${YOUR_CLUSTER_NAME}-license --storage-class standard --storage-size 10G --cpu 2000m --memory 6Gi --namespace ${YOUR_NAMESPACE} +> --license-secret ${YOUR_CLUSTER_NAME}-license --storage-class gp2 --storage-size 10G --cpu 
2000m --memory 6Gi --namespace ${YOUR_NAMESPACE} > ``` > [!IMPORTANT] From 9fe0af3973d7c0dc2fad14ffd2e9a02f2078027d Mon Sep 17 00:00:00 2001 From: Jerry Yang <103920848+jerryyangtg@users.noreply.github.com> Date: Tue, 28 Oct 2025 15:53:03 +0800 Subject: [PATCH 4/6] Add detailed user documentation for scale-in and scale-out operations; --- k8s/README.md | 2 + k8s/docs/01-introduction/README.md | 2 + k8s/docs/03-deploy/tigergraph-on-aks.md | 6 + k8s/docs/03-deploy/tigergraph-on-eks.md | 6 + k8s/docs/03-deploy/tigergraph-on-gke.md | 6 + k8s/docs/03-deploy/tigergraph-on-openshift.md | 6 + k8s/docs/04-manage/scale-in-and-out.md | 327 ++++++++++++++++++ 7 files changed, 355 insertions(+) create mode 100644 k8s/docs/04-manage/scale-in-and-out.md diff --git a/k8s/README.md b/k8s/README.md index af0c2c352..afa83b218 100644 --- a/k8s/README.md +++ b/k8s/README.md @@ -75,6 +75,8 @@ Once your deployment is complete, refer to the following documents for guidance - [Configure SSL Certificate for Ingress Service](docs/03-deploy/configure-ssl-certificate-for-ingress-service.md) - [Running Custom Bash Scripts in a TigerGraph Cluster via Kubernetes Jobs](docs/04-manage/running-custom-bash-scripts-via-kubernetes-jobs.md) - [Configure Nginx TLS](docs/03-deploy/configure-nginx-tls.md) +- [Scale Up and Scale Down](docs/04-manage/scale-up-and-down.md) +- [Scale In and Scale Out](docs/04-manage/scale-in-and-out.md) In case issues arise and your cluster requires diagnosis, you have two valuable resources: diff --git a/k8s/docs/01-introduction/README.md b/k8s/docs/01-introduction/README.md index d2bf325c6..fcee6f940 100644 --- a/k8s/docs/01-introduction/README.md +++ b/k8s/docs/01-introduction/README.md @@ -76,6 +76,8 @@ Once your deployment is complete, refer to the following documents for guidance - [Configure SSL Certificate for Ingress Service](../03-deploy/configure-ssl-certificate-for-ingress-service.md) - [Running Custom Bash Scripts in a TigerGraph Cluster via Kubernetes Jobs](../04-manage/running-custom-bash-scripts-via-kubernetes-jobs.md) - [Configure Nginx TLS](../03-deploy/configure-nginx-tls.md) +- [Scale Up and Scale Down](../04-manage/scale-up-and-down.md) +- [Scale In and Scale Out](../04-manage/scale-in-and-out.md) In case issues arise and your cluster requires diagnosis, you have two valuable resources: diff --git a/k8s/docs/03-deploy/tigergraph-on-aks.md b/k8s/docs/03-deploy/tigergraph-on-aks.md index 25506e46d..82a060f26 100644 --- a/k8s/docs/03-deploy/tigergraph-on-aks.md +++ b/k8s/docs/03-deploy/tigergraph-on-aks.md @@ -520,6 +520,8 @@ kubectl rollout status --watch --timeout=900s statefulset/${YOUR_CLUSTER_NAME} - kubectl wait --for=condition=complete --timeout=15m job/${YOUR_CLUSTER_NAME}-upgrade-job --namespace ${YOUR_NAMESPACE} ``` +For detailed information about upgrading TigerGraph clusters, refer to [Upgrading a TigerGraph cluster](../04-manage/tigergraph-upgrade.md). + ## Scale a TigerGraph cluster > [!WARNING] @@ -541,6 +543,8 @@ From Operator version 1.0.0, you can change the HA factor of the TigerGraph clus kubectl tg update --cluster-name ${YOUR_CLUSTER_NAME} --ha ${NEW_HA} --namespace ${YOUR_NAMESPACE} ``` +For detailed information about scaling TigerGraph clusters (scale in/out), refer to [Scale In and Scale Out](../04-manage/scale-in-and-out.md). 
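+
+Before and after changing the cluster size or HA factor, you can read the current values back from the TigerGraph CR. This is a minimal check, assuming the cluster CR is exposed under the `tigergraph` resource name and uses the `spec.replicas` and `spec.ha` fields:
+
+```bash
+# Print the current cluster size (spec.replicas) and replication factor (spec.ha)
+kubectl get tigergraph ${YOUR_CLUSTER_NAME} -o jsonpath='{.spec.replicas}{" "}{.spec.ha}{"\n"}' -n ${YOUR_NAMESPACE}
+```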
+ ## Update the resources(CPU and Memory) of the TigerGraph cluster Modify the CPU and memory resources of your TigerGraph cluster using the following command: @@ -551,6 +555,8 @@ kubectl tg update --cluster-name ${YOUR_CLUSTER_NAME} --cpu 8 --memory 16Gi --c For CR YAML manifests, update the `spec.resources.requests` and `spec.resources.limits` fields and apply the changes. +For detailed information about updating CPU and memory resources, refer to [Scale Up and Scale Down](../04-manage/scale-up-and-down.md). + ## Update system configurations and license of the TigerGraph cluster Use the following command to update the system configurations of the TigerGraph cluster: diff --git a/k8s/docs/03-deploy/tigergraph-on-eks.md b/k8s/docs/03-deploy/tigergraph-on-eks.md index b7977bfe0..d375bcb3f 100644 --- a/k8s/docs/03-deploy/tigergraph-on-eks.md +++ b/k8s/docs/03-deploy/tigergraph-on-eks.md @@ -674,6 +674,8 @@ kubectl rollout status --watch --timeout=900s statefulset/${YOUR_CLUSTER_NAME} - kubectl wait --for=condition=complete --timeout=15m job/${YOUR_CLUSTER_NAME}-upgrade-job --namespace ${YOUR_NAMESPACE} ``` +For detailed information about upgrading TigerGraph clusters, refer to [Upgrading a TigerGraph cluster](../04-manage/tigergraph-upgrade.md). + ## Update system configurations and license of the TigerGraph cluster Use the following command to update the system configurations of the TigerGraph cluster: @@ -715,6 +717,8 @@ From Operator version 1.0.0, you can change the HA factor of the TigerGraph clus kubectl tg update --cluster-name ${YOUR_CLUSTER_NAME} --ha ${NEW_HA} --namespace ${YOUR_NAMESPACE} ``` +For detailed information about scaling TigerGraph clusters (scale in/out), refer to [Scale In and Scale Out](../04-manage/scale-in-and-out.md). + ## Update Resources (CPU and Memory) of the TigerGraph Cluster To update the CPU and memory resources of the TigerGraph cluster, use the following command: @@ -725,6 +729,8 @@ kubectl tg update --cluster-name ${YOUR_CLUSTER_NAME} --cpu 8 --memory 16Gi --c Alternatively, if you want to update the cluster using a CR (Custom Resource) YAML manifest, update the spec.resources.requests and spec.resources.limits fields accordingly. +For detailed information about updating CPU and memory resources, refer to [Scale Up and Scale Down](../04-manage/scale-up-and-down.md). + ## Destroy the TigerGraph Cluster and the Kubernetes Operator ### Destroy the TigerGraph Cluster diff --git a/k8s/docs/03-deploy/tigergraph-on-gke.md b/k8s/docs/03-deploy/tigergraph-on-gke.md index 2cca9d4f9..4983f51c3 100644 --- a/k8s/docs/03-deploy/tigergraph-on-gke.md +++ b/k8s/docs/03-deploy/tigergraph-on-gke.md @@ -514,6 +514,8 @@ kubectl rollout status --watch --timeout=900s statefulset/${YOUR_CLUSTER_NAME} - kubectl wait --for=condition=complete --timeout=15m job/${YOUR_CLUSTER_NAME}-upgrade-job --namespace ${YOUR_NAMESPACE} ``` +For detailed information about upgrading TigerGraph clusters, refer to [Upgrading a TigerGraph cluster](../04-manage/tigergraph-upgrade.md). + ## Scale a TigerGraph cluster > [!WARNING] @@ -535,6 +537,8 @@ From Operator version 1.0.0, you can change the HA factor of the TigerGraph clus kubectl tg update --cluster-name ${YOUR_CLUSTER_NAME} --ha ${NEW_HA} --namespace ${YOUR_NAMESPACE} ``` +For detailed information about scaling TigerGraph clusters (scale in/out), refer to [Scale In and Scale Out](../04-manage/scale-in-and-out.md). 
+ ## Update the resources(CPU and Memory) of the TigerGraph cluster Modify the CPU and memory resources of your TigerGraph cluster using the following command: @@ -545,6 +549,8 @@ kubectl tg update --cluster-name ${YOUR_CLUSTER_NAME} --cpu 8 --memory 16Gi --c For CR YAML manifests, update the `spec.resources.requests` and `spec.resources.limits` fields and apply the changes. +For detailed information about updating CPU and memory resources, refer to [Scale Up and Scale Down](../04-manage/scale-up-and-down.md). + ## Update system configurations and license of the TigerGraph cluster Use the following command to update the system configurations of the TigerGraph cluster: diff --git a/k8s/docs/03-deploy/tigergraph-on-openshift.md b/k8s/docs/03-deploy/tigergraph-on-openshift.md index 2b36e3b42..b1f4ea15c 100644 --- a/k8s/docs/03-deploy/tigergraph-on-openshift.md +++ b/k8s/docs/03-deploy/tigergraph-on-openshift.md @@ -832,6 +832,8 @@ kubectl rollout status --watch --timeout=900s statefulset/${YOUR_CLUSTER_NAME} - kubectl wait --for=condition=complete --timeout=15m job/${YOUR_CLUSTER_NAME}-upgrade-job --namespace ${YOUR_NAMESPACE} ``` +For detailed information about upgrading TigerGraph clusters, refer to [Upgrading a TigerGraph cluster](../04-manage/tigergraph-upgrade.md). + ## Scale a TigerGraph cluster > [!WARNING] @@ -853,6 +855,8 @@ From Operator version 1.0.0, you can change the HA factor of the TigerGraph clus kubectl tg update --cluster-name ${YOUR_CLUSTER_NAME} --ha ${NEW_HA} --namespace ${YOUR_NAMESPACE} ``` +For detailed information about scaling TigerGraph clusters (scale in/out), refer to [Scale In and Scale Out](../04-manage/scale-in-and-out.md). + ## Update the resources(CPU and Memory) of the TigerGraph cluster Modify the CPU and memory resources of your TigerGraph cluster using the following command: @@ -863,6 +867,8 @@ kubectl tg update --cluster-name ${YOUR_CLUSTER_NAME} --cpu 8 --memory 16Gi --c For CR YAML manifests, update the `spec.resources.requests` and `spec.resources.limits` fields and apply the changes. +For detailed information about updating CPU and memory resources, refer to [Scale Up and Scale Down](../04-manage/scale-up-and-down.md). + ## Update system configurations and license of the TigerGraph cluster Use the following command to update the system configurations of the TigerGraph cluster: diff --git a/k8s/docs/04-manage/scale-in-and-out.md b/k8s/docs/04-manage/scale-in-and-out.md new file mode 100644 index 000000000..a040be63c --- /dev/null +++ b/k8s/docs/04-manage/scale-in-and-out.md @@ -0,0 +1,327 @@ +# Scale In and Scale Out + +This guide will walk you through the process of scaling in and scaling out TigerGraph Cluster using the TigerGraph Operator. 
+ +- [Scale In and Scale Out](#scale-in-and-scale-out) +  - [Introduction](#introduction) +  - [Scale Out TigerGraph Cluster (Cluster Expansion)](#scale-out-tigergraph-cluster-cluster-expansion) +    - [By kubectl tg plugin (Scale Out)](#by-kubectl-tg-plugin-scale-out) +    - [By modifying TigerGraph CR (Scale Out)](#by-modifying-tigergraph-cr-scale-out) +  - [Scale In TigerGraph Cluster (Cluster Shrinking)](#scale-in-tigergraph-cluster-cluster-shrinking) +    - [By kubectl tg plugin (Scale In)](#by-kubectl-tg-plugin-scale-in) +    - [By modifying TigerGraph CR (Scale In)](#by-modifying-tigergraph-cr-scale-in) +  - [Troubleshooting](#troubleshooting) +    - [Scale Out (Expansion) Troubleshooting](#scale-out-expansion-troubleshooting) +      - [Check Pod Status During Expansion](#check-pod-status-during-expansion) +      - [Check Expansion Job Status](#check-expansion-job-status) +      - [Verify Expansion Success](#verify-expansion-success) +      - [Common Expansion Issues](#common-expansion-issues) +    - [Scale In (Shrinking) Troubleshooting](#scale-in-shrinking-troubleshooting) +      - [Check Shrinking Job Status](#check-shrinking-job-status) +      - [Verify Shrinking Success](#verify-shrinking-success) +      - [Common Shrinking Issues](#common-shrinking-issues) +    - [General Troubleshooting](#general-troubleshooting) +      - [Pods Stuck in Pending Status](#pods-stuck-in-pending-status) +      - [Cluster Stuck in ExpandRoll State](#cluster-stuck-in-expandroll-state) +      - [Monitoring Scale Operations](#monitoring-scale-operations) + +## Introduction + +Scaling in and scaling out a TigerGraph cluster are fundamental operations for managing cluster capacity in a production environment. + +- **Scale Out (Cluster Expansion)**: Increases the cluster size by adding more replicas (`spec.replicas`) and optionally updating the replication factor (`spec.ha`) configuration to accommodate the larger cluster. +- **Scale In (Cluster Shrinking)**: Decreases the cluster size by reducing the number of replicas (`spec.replicas`) and optionally updating the replication factor (`spec.ha`) configuration to optimize resource usage. + +You can configure cluster size and HA settings by modifying `.spec.replicas` and `.spec.ha` in the TigerGraph CR or by using the `kubectl tg` plugin. + +> [!WARNING] +> Scaling operations affect cluster availability. Currently, TigerGraph does not provide dedicated high-availability support during scaling, and some downtime is involved. + +> [!IMPORTANT] +> +> 1. **Perform a full backup** of your existing system before performing the expansion or shrinking. +> 2. Ensure your TigerGraph license supports the target number of replicas. +> 3. Ensure that no loading jobs, queries, or REST requests are running on the original cluster. +> 4. Obtain a few key measures for the state of your data before the operation, such as vertex counts/edge counts or certain query results. These will be useful in verifying data integrity after the expansion or shrinking completes. +> 5. **For clusters with [Cross-Region Replication (CRR)](../03-deploy/configure-crr-on-k8s.md) enabled**: Cluster scaling changes the number and distribution of topics, while CRR depends on topic replication. Therefore, **the DR cluster must be recreated** to align with the updated topic configuration. +> 6.
**For clusters with [region awareness](../03-deploy/region-awareness-with-pod-topology-spread-constraints.md) enabled**: Ensure that the TigerGraph replication factor (HA) is at least 2, and that the number of topology domains specified by `topologyKey` is at least 3. + +## Scale Out TigerGraph Cluster (Cluster Expansion) + +Scale out operations increase the cluster size by adding more TigerGraph pods to handle increased workload and data volume. + +### By kubectl tg plugin (Scale Out) + +You can use the following command to scale out the TigerGraph cluster by using the `kubectl tg` plugin: + +```bash +# Scale out to 9 replicas with HA=3 +kubectl tg update --cluster-name ${TG_CLUSTER_NAME} --size 9 --ha 3 --namespace ${NAMESPACE} +``` + +### By modifying TigerGraph CR (Scale Out) + +Assuming you have a TigerGraph cluster named `test-cluster` in your namespace with 6 replicas and HA=2, you can modify the `.spec.replicas` and `.spec.ha` in the TigerGraph CR to scale out the cluster: + +```yaml +apiVersion: graphdb.tigergraph.com/v1alpha1 +kind: TigerGraph +metadata: +  name: test-cluster +spec: +  image: docker.io/tigergraph/tigergraph-k8s:4.2.1 +  imagePullPolicy: IfNotPresent +  ha: 3 # Increased from 2 to 3 for better fault tolerance +  license: xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx +  listener: +    type: LoadBalancer +  privateKeyName: ssh-key-secret +  replicas: 9 # Increased from 6 to 9 for more capacity +  resources: +    limits: +      cpu: "4" +      memory: 16Gi +    requests: +      cpu: "4" +      memory: 16Gi +  storage: +    type: persistent-claim +    volumeClaimTemplate: +      resources: +        requests: +          storage: 100G +      storageClassName: standard +``` + +## Scale In TigerGraph Cluster (Cluster Shrinking) + +Scale in operations decrease the cluster size by removing TigerGraph pods to optimize resource usage and reduce operational costs.
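+Before shrinking the cluster, it can help to capture the baseline data measures recommended in the prerequisites above (for example, total vertex and edge counts) so that you can verify data integrity after the operation completes. The sketch below shows one possible way to do this from inside a pod using the REST++ built-in statistics functions; the graph name `MyGraph` is an assumption, and you may need to adjust the port or supply an authentication token depending on your TigerGraph version and security settings. + +```bash +# Hypothetical baseline check before scaling; "MyGraph" is an assumed graph name. +# Record the total vertex count so it can be compared after scaling completes. +kubectl exec -it ${TG_CLUSTER_NAME}-0 -n ${NAMESPACE} -- curl -s -X POST "http://localhost:9000/builtins/MyGraph" -d '{"function":"stat_vertex_number","type":"*"}' + +# Record the total edge count as well. +kubectl exec -it ${TG_CLUSTER_NAME}-0 -n ${NAMESPACE} -- curl -s -X POST "http://localhost:9000/builtins/MyGraph" -d '{"function":"stat_edge_number","type":"*"}' +```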
+ +### By kubectl tg plugin (Scale In) + +You can use the following command to scale in the TigerGraph cluster by using the `kubectl tg` plugin: + +```bash +# Scale in to 4 replicas with HA=2 +kubectl tg update --cluster-name ${TG_CLUSTER_NAME} --size 4 --ha 2 --namespace ${NAMESPACE} +``` + +### By modifying TigerGraph CR (Scale In) + +Assuming you have a TigerGraph cluster named `test-cluster` in your namespace with 8 replicas and HA=3, you can modify the `.spec.replicas` and `.spec.ha` in the TigerGraph CR to scale in the cluster: + +```yaml +apiVersion: graphdb.tigergraph.com/v1alpha1 +kind: TigerGraph +metadata: +  name: test-cluster +spec: +  image: docker.io/tigergraph/tigergraph-k8s:4.2.1 +  imagePullPolicy: IfNotPresent +  ha: 2 # Decreased from 3 to 2 for cost optimization +  license: xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx +  listener: +    type: LoadBalancer +  privateKeyName: ssh-key-secret +  replicas: 4 # Decreased from 8 to 4 for resource optimization +  resources: +    limits: +      cpu: "4" +      memory: 16Gi +    requests: +      cpu: "4" +      memory: 16Gi +  storage: +    type: persistent-claim +    volumeClaimTemplate: +      resources: +        requests: +          storage: 100G +      storageClassName: standard +``` + +## Troubleshooting + +### Scale Out (Expansion) Troubleshooting + +#### Check Pod Status During Expansion + +Ensure that the pods of the cluster have been scaled out to the expected size: + +```bash +# test-cluster is the name of the cluster +# The example below tries to scale the cluster size from 3 to 5 +kubectl get pod -l tigergraph.com/cluster-pod=${TG_CLUSTER_NAME} -n ${NAMESPACE} + +NAME READY STATUS RESTARTS AGE +test-cluster-0 1/1 Running 0 17m +test-cluster-1 1/1 Running 0 17m +test-cluster-2 1/1 Running 0 17m +test-cluster-3 0/1 ContainerCreating 0 8s +test-cluster-4 0/1 ContainerCreating 0 7s +``` + +#### Check Expansion Job Status + +Ensure the expansion job is running or has completed successfully: + +```bash +# replace ${TG_CLUSTER_NAME} with your tigergraph cluster name +kubectl get job -l job-name=${TG_CLUSTER_NAME}-expand-job -n ${NAMESPACE} + +NAME COMPLETIONS DURATION AGE +test-cluster-expand-job 0/1 4m13s 4m13s +``` + +If the expansion job fails, check the logs of the job: + +```bash +# replace ${TG_CLUSTER_NAME} with your tigergraph cluster name +kubectl get pod -l job-name=${TG_CLUSTER_NAME}-expand-job -n ${NAMESPACE} + +NAME READY STATUS RESTARTS AGE +test-cluster-expand-job-6jk42 1/1 Running 0 5m38s + +kubectl logs test-cluster-expand-job-6jk42 -n ${NAMESPACE} +``` + +#### Verify Expansion Success + +- Check the status of the cluster CR: + +  ```bash +  kubectl tg status --cluster-name ${TG_CLUSTER_NAME} -n ${NAMESPACE} +  ``` + +- Check the cluster status by executing gadmin: + +  ```bash +  kubectl exec -it ${TG_CLUSTER_NAME}-0 -n ${NAMESPACE} -- /home/tigergraph/tigergraph/app/cmd/gadmin status -v +  ``` + +#### Common Expansion Issues + +1. **Insufficient Resources**: If the K8s cluster's resources (CPU or Memory) are insufficient to expand the TigerGraph cluster: +   - For Operator versions 0.0.3 and earlier: Recreate the cluster with the same cluster name, which will load the remaining cluster data for recovery. +   - For Operator versions 0.0.4 and higher: Update the size to match the K8s cluster's available resources or reset the cluster to the previous configuration. + +2. **Expansion Job Failures**: If the expansion job fails after retrying three times: +   - If you backed up the cluster before expansion, restore it with the backup package directly.
+   - If there is no backup package, manual recovery is complex, so always back up the cluster before expansion. + +3. **Repeated Pod Restarts**: After expansion ends, services such as GPE may take time to switch from warmup to running. This process may exceed 40 seconds (the health check limit), causing repeated pod restarts (5-12 times depending on the data volume). + +   **Solution**: Wait for all services to return to normal status before taking any action. + +   ```bash +   kubectl describe pods ${TG_CLUSTER_NAME}-1 -n ${NAMESPACE} + +   Events: +   Type Reason Age From Message +   ---- ------ ---- ---- ------- +   Warning Unhealthy 31m kubelet Readiness probe failed: service status of GPE_2#1 is Down, exit error. +   Warning Unhealthy 30m (x9 over 74m) kubelet Readiness probe failed: service status of GPE_2#1 should not be Warmup, exit error +   ``` + +### Scale In (Shrinking) Troubleshooting + +#### Check Shrinking Job Status + +Ensure that the shrinking job is running or has completed successfully: + +```bash +# replace ${TG_CLUSTER_NAME} with your tigergraph cluster name +kubectl get job ${TG_CLUSTER_NAME}-shrink-pre-job -n ${NAMESPACE} + +NAME COMPLETIONS DURATION AGE +test-cluster-shrink-pre-job 0/1 21s 21s +``` + +If the shrinking job fails, check the job logs: + +```bash +kubectl get pod -l job-name=${TG_CLUSTER_NAME}-shrink-pre-job -n ${NAMESPACE} + +NAME READY STATUS RESTARTS AGE +test-cluster-shrink-pre-job-jzlhm 1/1 Running 0 2m11s + +kubectl logs test-cluster-shrink-pre-job-jzlhm -n ${NAMESPACE} +``` + +#### Verify Shrinking Success + +- Check the status of the cluster CR: + +  ```bash +  kubectl tg status --cluster-name ${TG_CLUSTER_NAME} -n ${NAMESPACE} +  ``` + +- Check the cluster status by executing gadmin: + +  ```bash +  kubectl exec -it ${TG_CLUSTER_NAME}-0 -n ${NAMESPACE} -- /home/tigergraph/tigergraph/app/cmd/gadmin status -v +  ``` + +#### Common Shrinking Issues + +1. **Shrinking Job Failures**: If the shrinking job fails after retrying three times: +   - If you have a backup of the cluster before shrinking, restore it directly using the backup package. +   - If there is no backup package, manual recovery is complex, so always back up the cluster before shrinking. + +2. **Data Loss Prevention**: When scaling in, ensure that you: +   - Maintain an adequate HA value to prevent data loss +   - Allow sufficient time for pods to shut down gracefully and transfer data +   - Take a backup before significant scale-in operations + +### General Troubleshooting + +#### Pods Stuck in Pending Status + +When there are not enough resources on your nodes, pods may be stuck in `Pending` status: + +```bash +kubectl get pods -n ${NAMESPACE} | grep ${TG_CLUSTER_NAME} + +test-cluster-0 0/1 Pending 0 3m21s +test-cluster-1 0/1 Pending 0 113s +test-cluster-2 0/1 Pending 0 2m39s +``` + +Check pod details for resource constraints: + +```bash +kubectl describe pod ${TG_CLUSTER_NAME}-0 -n ${NAMESPACE} + +Events: + Type Reason Age From Message + ---- ------ ---- ---- ------- + Warning FailedScheduling 2m40s (x4 over 4m6s) default-scheduler 0/6 nodes are available: 6 Insufficient memory. preemption: 0/6 nodes are available: 6 No preemption victims found for incoming pod. +``` + +#### Cluster Stuck in ExpandRoll State + +If the cluster remains in `ExpandRoll, Unknown` status for an extended period: + +1. **Check Pod Status**: Verify all pods are running and ready +2. **Check Resource Availability**: Ensure sufficient resources are available for the target replica count +3.
**Check Storage**: Verify persistent volume claims are available and properly configured +4. **Check Network**: Ensure network connectivity between pods is working +5. **Check Logs**: Review operator and TigerGraph pod logs for error messages + +#### Monitoring Scale Operations + +Monitor the following during scale operations: + +```bash +# Check cluster status +kubectl get tg ${TG_CLUSTER_NAME} -n ${NAMESPACE} + +# Check pod status +kubectl get pods -n ${NAMESPACE} | grep ${TG_CLUSTER_NAME} + +# Check events +kubectl get events -n ${NAMESPACE} --sort-by='.lastTimestamp' + +# Check resource usage +kubectl top pods -n ${NAMESPACE} | grep ${TG_CLUSTER_NAME} +``` From c00ba65e48046345237fe6e6f14061a05b18e1de Mon Sep 17 00:00:00 2001 From: Jerry Yang <103920848+jerryyangtg@users.noreply.github.com> Date: Thu, 30 Oct 2025 16:40:44 +0800 Subject: [PATCH 5/6] Fix the YAML formatting issue; --- .../node-repaving-for-tigergraph-on-k8s.md | 236 +++++++++--------- 1 file changed, 118 insertions(+), 118 deletions(-) diff --git a/k8s/docs/08-reference/node-repaving-for-tigergraph-on-k8s.md b/k8s/docs/08-reference/node-repaving-for-tigergraph-on-k8s.md index 9355502f0..6b6b899f0 100644 --- a/k8s/docs/08-reference/node-repaving-for-tigergraph-on-k8s.md +++ b/k8s/docs/08-reference/node-repaving-for-tigergraph-on-k8s.md @@ -133,67 +133,67 @@ In this section, we demonstrate how to repave a node in Amazon EKS using a manag - Deploy a TigerGraph Cluster with HA and Topology Spread Constraints Here we skip the TigerGraph Operator installation process, you can refer to the document [Deploy TigerGraph on AWS EKS](../03-deploy/tigergraph-on-eks.md) for the details. - ```bash - export CLUSTER_NAME= - export LICENSE= - export NAMESPACE= - export STORAGE_CLASS= - - cat < +export LICENSE= +export NAMESPACE= +export STORAGE_CLASS= + +cat < - export LICENSE= - export NAMESPACE= - export STORAGE_CLASS= - - cat < +export LICENSE= +export NAMESPACE= +export STORAGE_CLASS= + +cat < Date: Tue, 4 Nov 2025 09:34:08 +0800 Subject: [PATCH 6/6] Update some minor issues in operator docs; --- k8s/docs/02-get-started/get_started.md | 4 ++-- k8s/docs/03-deploy/tigergraph-on-aks.md | 2 +- k8s/docs/03-deploy/tigergraph-on-eks.md | 2 +- k8s/docs/03-deploy/tigergraph-on-gke.md | 2 +- k8s/docs/03-deploy/tigergraph-on-openshift.md | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/k8s/docs/02-get-started/get_started.md b/k8s/docs/02-get-started/get_started.md index 384f8b799..ef001dc17 100644 --- a/k8s/docs/02-get-started/get_started.md +++ b/k8s/docs/02-get-started/get_started.md @@ -204,7 +204,7 @@ This step is optional and can be skipped if you have privileged permissions in y CustomResourceDefinitions (CRDs) are non-namespaced entities accessible across all namespaces. Installing CRDs requires privileged permissions from the Kubernetes cluster. 
If you prefer to install CRDs independently from the Operator installation, use the following commands: ```bash -kubectl apply -f https://dl.tigergraph.com/k8s/latest/tg-operator-crd.yaml +kubectl apply --server-side -f https://dl.tigergraph.com/k8s/latest/tg-operator-crd.yaml ``` ### Install TigerGraph Operator @@ -245,7 +245,7 @@ Examples: # install the operator in the specified namespace, with specified helm repo and image pull secret kubectl tg init --namespace tg-tenant1 --helm-repo https://yourhelmrepo.com --image-pull-secret yoursecret # install the operator in the specified namespace, with specified operator version, watch name namespace, cpu and memory - kubectl tg init --version OPERATOR_VERSION --operator-size 3 --operator-watch-namespace tigergraph --operator-cpu 1000m --operator-memory 1024Mi --namespace tg-tenant1 + kubectl tg init --operator-version OPERATOR_VERSION --operator-size 3 --operator-watch-namespace tigergraph --operator-cpu 1000m --operator-memory 1024Mi --namespace tg-tenant1 Usage: kubectl tg init [options] diff --git a/k8s/docs/03-deploy/tigergraph-on-aks.md b/k8s/docs/03-deploy/tigergraph-on-aks.md index 82a060f26..bed904716 100644 --- a/k8s/docs/03-deploy/tigergraph-on-aks.md +++ b/k8s/docs/03-deploy/tigergraph-on-aks.md @@ -100,7 +100,7 @@ kubectl tg help This step is optional and can be skipped if you have privileged permissions in your Kubernetes environment. The required components will be automatically installed during the Operator installation process. However, if you prefer to install CustomResourceDefinitions (CRDs) independently, you can use the following command: ```bash -kubectl apply -f https://dl.tigergraph.com/k8s/latest/tg-operator-crd.yaml +kubectl apply --server-side -f https://dl.tigergraph.com/k8s/latest/tg-operator-crd.yaml ``` ### Install TigerGraph Operator diff --git a/k8s/docs/03-deploy/tigergraph-on-eks.md b/k8s/docs/03-deploy/tigergraph-on-eks.md index d375bcb3f..d45124ac4 100644 --- a/k8s/docs/03-deploy/tigergraph-on-eks.md +++ b/k8s/docs/03-deploy/tigergraph-on-eks.md @@ -121,7 +121,7 @@ kubectl tg help This step is optional and can be skipped if you have privileged permissions in your Kubernetes environment. The necessary CustomResourceDefinitions (CRDs) are automatically installed during the Operator installation. If you prefer to install CRDs independently, use the following command: ```bash -kubectl apply -f https://dl.tigergraph.com/k8s/latest/tg-operator-crd.yaml +kubectl apply --server-side -f https://dl.tigergraph.com/k8s/latest/tg-operator-crd.yaml ``` ### Install TigerGraph Operator diff --git a/k8s/docs/03-deploy/tigergraph-on-gke.md b/k8s/docs/03-deploy/tigergraph-on-gke.md index 4983f51c3..055f0b66d 100644 --- a/k8s/docs/03-deploy/tigergraph-on-gke.md +++ b/k8s/docs/03-deploy/tigergraph-on-gke.md @@ -100,7 +100,7 @@ kubectl tg help This step is optional and can be skipped if you have privileged permissions in your Kubernetes environment. The required components will be automatically installed during the Operator installation process. 
However, if you prefer to install CustomResourceDefinitions (CRDs) independently, you can use the following command: ```bash -kubectl apply -f https://dl.tigergraph.com/k8s/latest/tg-operator-crd.yaml +kubectl apply --server-side -f https://dl.tigergraph.com/k8s/latest/tg-operator-crd.yaml ``` ### Install TigerGraph Operator diff --git a/k8s/docs/03-deploy/tigergraph-on-openshift.md b/k8s/docs/03-deploy/tigergraph-on-openshift.md index b1f4ea15c..bad5fedd8 100644 --- a/k8s/docs/03-deploy/tigergraph-on-openshift.md +++ b/k8s/docs/03-deploy/tigergraph-on-openshift.md @@ -359,7 +359,7 @@ This step is optional. You can skip it if you have privileged permissions in you CustomResourceDefinitions (CRDs) are non-namespaced entities accessible across all namespaces. Installing CRDs requires privileged permissions from the Kubernetes cluster. You may prefer to install CRDs independently from the Operator installation: ```bash -kubectl apply -f https://dl.tigergraph.com/k8s/latest/tg-operator-crd.yaml +kubectl apply --server-side -f https://dl.tigergraph.com/k8s/latest/tg-operator-crd.yaml ``` ### Install TigerGraph Operator