From d48f8bfa5037509c32f636a0861e557290e9aa5a Mon Sep 17 00:00:00 2001 From: phong-nguyen Date: Thu, 4 May 2023 05:07:16 +0000 Subject: [PATCH 01/15] set up TracerProvider + tracer object --- pkg/otel/tracing.go | 106 +++++++++++++++++++++++++++++++++++ util/telemetry/oteltracer.go | 11 ++++ 2 files changed, 117 insertions(+) create mode 100644 pkg/otel/tracing.go create mode 100644 util/telemetry/oteltracer.go diff --git a/pkg/otel/tracing.go b/pkg/otel/tracing.go new file mode 100644 index 000000000..e2b32fafb --- /dev/null +++ b/pkg/otel/tracing.go @@ -0,0 +1,106 @@ +package otel + +import ( + "context" + "time" + + "github.com/go-logr/logr" + "github.com/pkg/errors" + "google.golang.org/grpc" + "google.golang.org/grpc/credentials/insecure" + "k8s.io/client-go/pkg/version" + + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/exporters/otlp/otlptrace" + "go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc" + "go.opentelemetry.io/otel/propagation" + "go.opentelemetry.io/otel/sdk/resource" + "go.opentelemetry.io/otel/sdk/trace" + semconv "go.opentelemetry.io/otel/semconv/v1.4.0" +) + +func RegisterTracing(ctx context.Context, log logr.Logger) error { + + tracerProvider, err := SetUpTracing(ctx) + if err != nil { + return err + } + + // Safely shut down the tracer provider when context terminates + go func() { + <-ctx.Done() + + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + + if err := tracerProvider.Shutdown(ctx); err != nil { + log.Error(err, "failed to shut down tracer provider") + } + }() + + return nil +} + +func newExporter(ctx context.Context) (*otlptrace.Exporter, error) { + + conn, err := grpc.DialContext(ctx, "opentelemetry-collector:4317", + // Using non-TLS connection for dev environment + grpc.WithTransportCredentials(insecure.NewCredentials()), + grpc.WithBlock(), + ) + + if err != nil { + return nil, errors.Wrap(err, "failed to create gRPC connection 
to collector for opentelemetry") + } + + // Set up a trace exporter + traceExporter, err := otlptracegrpc.New(ctx, otlptracegrpc.WithGRPCConn(conn)) + + if err != nil { + return nil, errors.Wrap(err, "failed to create trace exporter for opentelemetry") + } + + return traceExporter, nil +} + +func SetUpTracing(ctx context.Context) (*trace.TracerProvider, error) { + + traceExporter, err := newExporter(ctx) + + if err != nil { + return nil, err + } + + // labels/tags/res common to all traces + // TODO: consider to add more fields + resource, err := resource.New(ctx, + resource.WithAttributes( + semconv.ServiceNameKey.String("capg"), + attribute.String("exporter", "otlpgrpc"), + attribute.String("version", version.Get().String()), + ), + ) + + if err != nil { + return nil, errors.Wrap(err, "failed to create opentelemetry resource") + } + + traceProvider := trace.NewTracerProvider( + trace.WithBatcher(traceExporter), + trace.WithResource(resource), + // TODO: dynamic sampling rate? + // sampling rate based on parent span = 60% + trace.WithSampler(trace.ParentBased(trace.TraceIDRatioBased(0.6))), + ) + + otel.SetTracerProvider(traceProvider) + + otel.SetTextMapPropagator( + propagation.NewCompositeTextMapPropagator( + propagation.TraceContext{}, + ), + ) + + return traceProvider, nil +} diff --git a/util/telemetry/oteltracer.go b/util/telemetry/oteltracer.go new file mode 100644 index 000000000..69574a3e9 --- /dev/null +++ b/util/telemetry/oteltracer.go @@ -0,0 +1,11 @@ +package telemetry + +import ( + "go.opentelemetry.io/otel" + "go.opentelemetry.io/otel/trace" +) + +// default Tracer +func Tracer() trace.Tracer { + return otel.Tracer("capg") +} From f899af81bb7204fae6f325b747d21cb3f36327fd Mon Sep 17 00:00:00 2001 From: phong-nguyen Date: Thu, 4 May 2023 05:07:44 +0000 Subject: [PATCH 02/15] add enableTracing flag --- main.go | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/main.go b/main.go index e1f1bad51..52459713b 100644 --- a/main.go +++ 
b/main.go @@ -37,6 +37,7 @@ import ( infrav1exp "sigs.k8s.io/cluster-api-provider-gcp/exp/api/v1beta1" expcontrollers "sigs.k8s.io/cluster-api-provider-gcp/exp/controllers" "sigs.k8s.io/cluster-api-provider-gcp/feature" + ot "sigs.k8s.io/cluster-api-provider-gcp/pkg/otel" "sigs.k8s.io/cluster-api-provider-gcp/util/reconciler" "sigs.k8s.io/cluster-api-provider-gcp/version" clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" @@ -82,6 +83,7 @@ var ( leaderElectionLeaseDuration time.Duration leaderElectionRenewDeadline time.Duration leaderElectionRetryPeriod time.Duration + enableTracing bool ) // Add RBAC for the authorized diagnostics endpoint. @@ -162,6 +164,13 @@ func main() { // Setup the context that's going to be used in controllers and for the manager. ctx := ctrl.SetupSignalHandler() + if enableTracing { + if err := ot.RegisterTracing(ctx, setupLog); err != nil { + setupLog.Error(err, "unable to set up tracing") + os.Exit(1) + } + } + if err := setupReconcilers(ctx, mgr); err != nil { setupLog.Error(err, "unable to setup reconcilers") os.Exit(1) @@ -374,6 +383,12 @@ func initFlags(fs *pflag.FlagSet) { "The maximum duration a reconcile loop can run (e.g. 
90m)", ) + fs.BoolVar(&enableTracing, + "enable-tracing", + false, + "Enable collecting and sending traces to opentelemetry-collector service", + ) + flags.AddManagerOptions(fs, &managerOptions) feature.MutableGates.AddFlag(fs) From 22138735134e869f1ad87ec908a4643064e84916 Mon Sep 17 00:00:00 2001 From: phong-nguyen Date: Thu, 4 May 2023 05:08:13 +0000 Subject: [PATCH 03/15] instrument GCP machine + GCPCluster controllers --- controllers/gcpcluster_controller.go | 34 +++++++++++++++++++++++++++ controllers/gcpmachine_controller.go | 35 ++++++++++++++++++++++++++++ 2 files changed, 69 insertions(+) diff --git a/controllers/gcpcluster_controller.go b/controllers/gcpcluster_controller.go index 24be31c1a..f874e4cfb 100644 --- a/controllers/gcpcluster_controller.go +++ b/controllers/gcpcluster_controller.go @@ -23,6 +23,8 @@ import ( "github.com/GoogleCloudPlatform/k8s-cloud-provider/pkg/cloud/filter" "github.com/GoogleCloudPlatform/k8s-cloud-provider/pkg/cloud/meta" "github.com/pkg/errors" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/trace" apierrors "k8s.io/apimachinery/pkg/api/errors" infrav1 "sigs.k8s.io/cluster-api-provider-gcp/api/v1beta1" "sigs.k8s.io/cluster-api-provider-gcp/cloud" @@ -32,6 +34,7 @@ import ( "sigs.k8s.io/cluster-api-provider-gcp/cloud/services/compute/networks" "sigs.k8s.io/cluster-api-provider-gcp/cloud/services/compute/subnets" "sigs.k8s.io/cluster-api-provider-gcp/util/reconciler" + "sigs.k8s.io/cluster-api-provider-gcp/util/telemetry" clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" "sigs.k8s.io/cluster-api/util" "sigs.k8s.io/cluster-api/util/annotations" @@ -62,6 +65,14 @@ type GCPClusterReconciler struct { func (r *GCPClusterReconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager, options controller.Options) error { log := log.FromContext(ctx).WithValues("controller", "GCPCluster") + ctx, span := telemetry.Tracer().Start( + ctx, "controllers.GCPClusterReconciler.SetupWithManager", + trace.WithAttributes( + 
attribute.String("controller", "GCPCluster"), + ), + ) + defer span.End() + c, err := ctrl.NewControllerManagedBy(mgr). WithOptions(options). For(&infrav1.GCPCluster{}). @@ -106,6 +117,17 @@ func (r *GCPClusterReconciler) Reconcile(ctx context.Context, req ctrl.Request) defer cancel() log := log.FromContext(ctx) + + ctx, span := telemetry.Tracer().Start( + ctx, "controllers.GCPClusterReconciler.Reconcile", + trace.WithAttributes( + attribute.String("name", req.Name), + attribute.String("namespace", req.Namespace), + attribute.String("kind", "GCPCluster"), + ), + ) + defer span.End() + gcpCluster := &infrav1.GCPCluster{} err := r.Get(ctx, req.NamespacedName, gcpCluster) if err != nil { @@ -161,6 +183,12 @@ func (r *GCPClusterReconciler) Reconcile(ctx context.Context, req ctrl.Request) func (r *GCPClusterReconciler) reconcile(ctx context.Context, clusterScope *scope.ClusterScope) (ctrl.Result, error) { log := log.FromContext(ctx) + + ctx, span := telemetry.Tracer().Start( + ctx, "controllers.GCPClusterReconciler.reconcile", + ) + defer span.End() + log.Info("Reconciling GCPCluster") controllerutil.AddFinalizer(clusterScope.GCPCluster, infrav1.ClusterFinalizer) @@ -228,6 +256,12 @@ func (r *GCPClusterReconciler) reconcile(ctx context.Context, clusterScope *scop func (r *GCPClusterReconciler) reconcileDelete(ctx context.Context, clusterScope *scope.ClusterScope) error { log := log.FromContext(ctx) + + ctx, span := telemetry.Tracer().Start( + ctx, "controllers.GCPClusterReconciler.reconcileDelete", + ) + defer span.End() + log.Info("Reconciling Delete GCPCluster") reconcilers := []cloud.Reconciler{ diff --git a/controllers/gcpmachine_controller.go b/controllers/gcpmachine_controller.go index f090eea04..2a2823dc4 100644 --- a/controllers/gcpmachine_controller.go +++ b/controllers/gcpmachine_controller.go @@ -22,11 +22,14 @@ import ( "time" "github.com/pkg/errors" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/trace" apierrors 
"k8s.io/apimachinery/pkg/api/errors" infrav1 "sigs.k8s.io/cluster-api-provider-gcp/api/v1beta1" "sigs.k8s.io/cluster-api-provider-gcp/cloud/scope" "sigs.k8s.io/cluster-api-provider-gcp/cloud/services/compute/instances" "sigs.k8s.io/cluster-api-provider-gcp/util/reconciler" + "sigs.k8s.io/cluster-api-provider-gcp/util/telemetry" clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" "sigs.k8s.io/cluster-api/util" "sigs.k8s.io/cluster-api/util/annotations" @@ -56,6 +59,15 @@ type GCPMachineReconciler struct { func (r *GCPMachineReconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager, options controller.Options) error { log := ctrl.LoggerFrom(ctx) + + ctx, span := telemetry.Tracer().Start( + ctx, "controllers.GCPMachineReconciler.SetupWithManager", + trace.WithAttributes( + attribute.String("controller", "GCPMachine"), + ), + ) + defer span.End() + c, err := ctrl.NewControllerManagedBy(mgr). WithOptions(options). For(&infrav1.GCPMachine{}). @@ -135,6 +147,17 @@ func (r *GCPMachineReconciler) Reconcile(ctx context.Context, req ctrl.Request) defer cancel() log := ctrl.LoggerFrom(ctx) + + ctx, span := telemetry.Tracer().Start( + ctx, "controllers.GCPMachineReconciler.Reconcile", + trace.WithAttributes( + attribute.String("name", req.Name), + attribute.String("namespace", req.Namespace), + attribute.String("kind", "GCPMachine"), + ), + ) + defer span.End() + gcpMachine := &infrav1.GCPMachine{} err := r.Get(ctx, req.NamespacedName, gcpMachine) if err != nil { @@ -217,6 +240,12 @@ func (r *GCPMachineReconciler) Reconcile(ctx context.Context, req ctrl.Request) func (r *GCPMachineReconciler) reconcile(ctx context.Context, machineScope *scope.MachineScope) (ctrl.Result, error) { log := log.FromContext(ctx) + + ctx, span := telemetry.Tracer().Start( + ctx, "controllers.GCPMachineReconciler.reconcile", + ) + defer span.End() + log.Info("Reconciling GCPMachine") controllerutil.AddFinalizer(machineScope.GCPMachine, infrav1.MachineFinalizer) @@ -251,6 +280,12 @@ func (r 
*GCPMachineReconciler) reconcile(ctx context.Context, machineScope *scop func (r *GCPMachineReconciler) reconcileDelete(ctx context.Context, machineScope *scope.MachineScope) error { log := log.FromContext(ctx) + + ctx, span := telemetry.Tracer().Start( + ctx, "controllers.GCPMachineReconciler.reconcileDelete", + ) + defer span.End() + log.Info("Reconciling Delete GCPMachine") if err := instances.New(machineScope).Delete(ctx); err != nil { From bce50973856c39853d2cfbcf75c6b572fc71e522 Mon Sep 17 00:00:00 2001 From: phong-nguyen Date: Fri, 5 May 2023 02:10:35 +0000 Subject: [PATCH 04/15] helm charts for jaeger all-in-one + otel collector --- .../jaeger/fetch-jaeger-resources.sh | 31 ++++++++ .../opentelemetry/fetch-otel-resources.sh | 34 +++++++++ hack/observability/opentelemetry/values.yaml | 70 +++++++++++++++++++ 3 files changed, 135 insertions(+) create mode 100755 hack/observability/jaeger/fetch-jaeger-resources.sh create mode 100644 hack/observability/opentelemetry/fetch-otel-resources.sh create mode 100644 hack/observability/opentelemetry/values.yaml diff --git a/hack/observability/jaeger/fetch-jaeger-resources.sh b/hack/observability/jaeger/fetch-jaeger-resources.sh new file mode 100755 index 000000000..6d053a55b --- /dev/null +++ b/hack/observability/jaeger/fetch-jaeger-resources.sh @@ -0,0 +1,31 @@ +#!/usr/bin/env bash + +# Copyright 2021 The Kubernetes Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# "Borrow" with permission from CAPZ team: https://github.com/kubernetes-sigs/cluster-api-provider-azure/blob/main/hack/observability/jaeger/fetch-jaeger-resources.sh + +set -o errexit +set -o nounset +set -o pipefail + +CHART_RELEASE=${CHART_RELEASE:-0.1.8} +JAEGER_ROOT=$(dirname "${BASH_SOURCE[0]}") +CHART_ROOT=$JAEGER_ROOT/chart + +rm -rf "$CHART_ROOT" +# "tar" has no POSIX standard, so use only basic options and test with both BSD and GNU. +wget -qO- https://github.com/hansehe/jaeger-all-in-one/raw/master/helm/charts/jaeger-all-in-one-"$CHART_RELEASE".tgz \ + | tar xvz -C "$JAEGER_ROOT" +mv "$JAEGER_ROOT"/jaeger-all-in-one "$CHART_ROOT" diff --git a/hack/observability/opentelemetry/fetch-otel-resources.sh b/hack/observability/opentelemetry/fetch-otel-resources.sh new file mode 100644 index 000000000..bd346cdea --- /dev/null +++ b/hack/observability/opentelemetry/fetch-otel-resources.sh @@ -0,0 +1,34 @@ +#!/usr/bin/env bash + +# Copyright 2021 The Kubernetes Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# "Borrow" with permission from CAPZ team: https://github.com/kubernetes-sigs/cluster-api-provider-azure/blob/main/hack/observability/opentelemetry/fetch-otel-resources.sh + + +set -o errexit +set -o nounset +set -o pipefail + +CHART_RELEASE=${CHART_RELEASE:-0.53.0} +OTEL_ROOT=$(dirname "${BASH_SOURCE[0]}") +CHART_ROOT=$OTEL_ROOT/chart + + +rm -rf "$CHART_ROOT" +# # "tar" has no POSIX standard, so use only basic options and test with both BSD and GNU. +wget -qO- https://github.com/open-telemetry/opentelemetry-helm-charts/releases/download/opentelemetry-collector-"$CHART_RELEASE"/opentelemetry-collector-"$CHART_RELEASE".tgz \ + | tar xvz -C "$OTEL_ROOT" --exclude "ci" --exclude "examples" +mv "$OTEL_ROOT"/opentelemetry-collector "$CHART_ROOT" +wget -q https://raw.githubusercontent.com/open-telemetry/opentelemetry-helm-charts/main/LICENSE -P "$CHART_ROOT" diff --git a/hack/observability/opentelemetry/values.yaml b/hack/observability/opentelemetry/values.yaml new file mode 100644 index 000000000..67850766f --- /dev/null +++ b/hack/observability/opentelemetry/values.yaml @@ -0,0 +1,70 @@ +mode: "deployment" + +config: + receivers: + jaeger: null # disable Jaeger receiver + #otlp: # using default configs + prometheus: null # disable Prometheus receiver + zipkin: null # disable Zipkin receiver + + # (2) Processors + processors: + memory_limiter: + limit_percentage: 50 + check_interval: 1s + spike_limit_percentage: 30 + batch: + send_batch_size: 8192 + + # (3) exporter + exporters: + jaeger: + endpoint: jaeger-all-in-one:14250 + tls: + insecure: true + #export to Cloud Trace on GCP + googlecloud: + # disable metric + logging collections + metric: null + log: null + # use default configs for trace + #trace: + #use_insecure: true + + # (4) service + service: + # A pipeline = a set of receivers, processors and exporters. 
+ pipelines: + metrics: null + logs: null + traces: + receivers: [otlp] + processors: [memory_limiter, batch] + exporters: [jaeger, googlecloud] + +image: + repository: otel/opentelemetry-collector-contrib + pullPolicy: IfNotPresent + tag: 0.75.0 +command: + name: otelcol-contrib + +# Configuration for connecting to GCP's Cloud Trace +extraEnvs: + - name: GOOGLE_APPLICATION_CREDENTIALS + value: /home/.gcp/credentials.json + +# retrieve GOOGLE_APPLICATION_CREDENTIALS env var from Secret +extraVolumes: + - name: credentials + secret: + secretName: manager-bootstrap-credentials + +extraVolumeMounts: + - name: credentials + mountPath: /home/.gcp + +ports: + jaeger-thrift: null + jaeger-grpc: null + zipkin: null From af8aab26e7f030b1fbcefcbebe783cc064540fdc Mon Sep 17 00:00:00 2001 From: phong-nguyen Date: Fri, 5 May 2023 02:10:49 +0000 Subject: [PATCH 05/15] templates to set up dev env --- hack/observability/kustomization.yaml | 6 ++++++ .../controller-manager-dev-env-otel-patch.yml | 17 +++++++++++++++++ .../secrets-dev-env-otel-patch.yaml | 5 +++++ 3 files changed, 28 insertions(+) create mode 100644 hack/observability/kustomization.yaml create mode 100644 hack/observability/opentelemetry/controller-manager-dev-env-otel-patch.yml create mode 100644 hack/observability/opentelemetry/secrets-dev-env-otel-patch.yaml diff --git a/hack/observability/kustomization.yaml b/hack/observability/kustomization.yaml new file mode 100644 index 000000000..87bd08476 --- /dev/null +++ b/hack/observability/kustomization.yaml @@ -0,0 +1,6 @@ +resources: + - ../../config/default + +patchesStrategicMerge: + - opentelemetry/secrets-dev-env-otel-patch.yaml + - opentelemetry/controller-manager-dev-env-otel-patch.yml diff --git a/hack/observability/opentelemetry/controller-manager-dev-env-otel-patch.yml b/hack/observability/opentelemetry/controller-manager-dev-env-otel-patch.yml new file mode 100644 index 000000000..e6d92601c --- /dev/null +++ 
b/hack/observability/opentelemetry/controller-manager-dev-env-otel-patch.yml @@ -0,0 +1,17 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + namespace: capg-system + name: capg-controller-manager +spec: + template: + spec: + # patch: ../../config/manager/manager.yaml + containers: + - name: manager + args: + - "--leader-elect" + - "--feature-gates=GKE=${EXP_CAPG_GKE:=false}" + - "--metrics-bind-addr=localhost:8080" + - "--v=${CAPG_LOGLEVEL:=0}" + - "--enable-tracing" diff --git a/hack/observability/opentelemetry/secrets-dev-env-otel-patch.yaml b/hack/observability/opentelemetry/secrets-dev-env-otel-patch.yaml new file mode 100644 index 000000000..9cdb836dc --- /dev/null +++ b/hack/observability/opentelemetry/secrets-dev-env-otel-patch.yaml @@ -0,0 +1,5 @@ +apiVersion: v1 +kind: Secret +metadata: + name: manager-bootstrap-credentials + namespace: capg-system From 02380f506818083dfe14b87d42968c0d6f4edc22 Mon Sep 17 00:00:00 2001 From: phong-nguyen Date: Fri, 5 May 2023 02:13:20 +0000 Subject: [PATCH 06/15] instrument cloud/scope --- cloud/scope/clients.go | 21 +++++++++++++++++++++ cloud/scope/cluster.go | 7 +++++++ cloud/scope/managedcluster.go | 7 +++++++ cloud/scope/managedcontrolplane.go | 7 +++++++ cloud/scope/managedmachinepool.go | 7 +++++++ 5 files changed, 49 insertions(+) diff --git a/cloud/scope/clients.go b/cloud/scope/clients.go index 502aea8ed..e52f6a187 100644 --- a/cloud/scope/clients.go +++ b/cloud/scope/clients.go @@ -32,6 +32,7 @@ import ( "k8s.io/client-go/pkg/version" "k8s.io/client-go/util/flowcontrol" infrav1 "sigs.k8s.io/cluster-api-provider-gcp/api/v1beta1" + "sigs.k8s.io/cluster-api-provider-gcp/util/telemetry" "sigs.k8s.io/controller-runtime/pkg/client" ) @@ -90,6 +91,11 @@ func defaultClientOptions(ctx context.Context, credentialsRef *infrav1.ObjectRef } func newComputeService(ctx context.Context, credentialsRef *infrav1.ObjectReference, crClient client.Client, endpoints *infrav1.ServiceEndpoints) (*compute.Service, error) { + ctx, 
span := telemetry.Tracer().Start( + ctx, "cloud.clients.newComputeService", + ) + defer span.End() + opts, err := defaultClientOptions(ctx, credentialsRef, crClient) if err != nil { return nil, fmt.Errorf("getting default gcp client options: %w", err) @@ -108,6 +114,11 @@ func newComputeService(ctx context.Context, credentialsRef *infrav1.ObjectRefere } func newClusterManagerClient(ctx context.Context, credentialsRef *infrav1.ObjectReference, crClient client.Client, endpoints *infrav1.ServiceEndpoints) (*container.ClusterManagerClient, error) { + ctx, span := telemetry.Tracer().Start( + ctx, "cloud.clients.newClusterManagerClient", + ) + defer span.End() + opts, err := defaultClientOptions(ctx, credentialsRef, crClient) if err != nil { return nil, fmt.Errorf("getting default gcp client options: %w", err) @@ -126,6 +137,11 @@ func newClusterManagerClient(ctx context.Context, credentialsRef *infrav1.Object } func newIamCredentialsClient(ctx context.Context, credentialsRef *infrav1.ObjectReference, crClient client.Client, endpoints *infrav1.ServiceEndpoints) (*credentials.IamCredentialsClient, error) { + ctx, span := telemetry.Tracer().Start( + ctx, "cloud.clients.newIamCredentialsClient", + ) + defer span.End() + opts, err := defaultClientOptions(ctx, credentialsRef, crClient) if err != nil { return nil, fmt.Errorf("getting default gcp client options: %w", err) @@ -144,6 +160,11 @@ func newIamCredentialsClient(ctx context.Context, credentialsRef *infrav1.Object } func newInstanceGroupManagerClient(ctx context.Context, credentialsRef *infrav1.ObjectReference, crClient client.Client, endpoints *infrav1.ServiceEndpoints) (*computerest.InstanceGroupManagersClient, error) { + ctx, span := telemetry.Tracer().Start( + ctx, "cloud.clients.newInstanceGroupManagerClient", + ) + defer span.End() + opts, err := defaultClientOptions(ctx, credentialsRef, crClient) if err != nil { return nil, fmt.Errorf("getting default gcp client options: %w", err) diff --git 
a/cloud/scope/cluster.go b/cloud/scope/cluster.go index 1549c68a7..6508ed058 100644 --- a/cloud/scope/cluster.go +++ b/cloud/scope/cluster.go @@ -27,6 +27,7 @@ import ( "k8s.io/utils/ptr" infrav1 "sigs.k8s.io/cluster-api-provider-gcp/api/v1beta1" "sigs.k8s.io/cluster-api-provider-gcp/cloud" + "sigs.k8s.io/cluster-api-provider-gcp/util/telemetry" clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" "sigs.k8s.io/cluster-api/util/patch" "sigs.k8s.io/controller-runtime/pkg/client" @@ -43,6 +44,12 @@ type ClusterScopeParams struct { // NewClusterScope creates a new Scope from the supplied parameters. // This is meant to be called for each reconcile iteration. func NewClusterScope(ctx context.Context, params ClusterScopeParams) (*ClusterScope, error) { + + ctx, span := telemetry.Tracer().Start( + ctx, "cloud.clusterScope.NewClusterScope", + ) + defer span.End() + if params.Cluster == nil { return nil, errors.New("failed to generate new scope from nil Cluster") } diff --git a/cloud/scope/managedcluster.go b/cloud/scope/managedcluster.go index e183f37a5..29facf06d 100644 --- a/cloud/scope/managedcluster.go +++ b/cloud/scope/managedcluster.go @@ -27,6 +27,7 @@ import ( infrav1 "sigs.k8s.io/cluster-api-provider-gcp/api/v1beta1" "sigs.k8s.io/cluster-api-provider-gcp/cloud" infrav1exp "sigs.k8s.io/cluster-api-provider-gcp/exp/api/v1beta1" + "sigs.k8s.io/cluster-api-provider-gcp/util/telemetry" clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" "sigs.k8s.io/cluster-api/util/patch" "sigs.k8s.io/controller-runtime/pkg/client" @@ -44,6 +45,12 @@ type ManagedClusterScopeParams struct { // NewManagedClusterScope creates a new Scope from the supplied parameters. // This is meant to be called for each reconcile iteration. 
func NewManagedClusterScope(ctx context.Context, params ManagedClusterScopeParams) (*ManagedClusterScope, error) { + + ctx, span := telemetry.Tracer().Start( + ctx, "cloud.managedClusterScope.NewManagedClusterScope", + ) + defer span.End() + if params.Cluster == nil { return nil, errors.New("failed to generate new scope from nil Cluster") } diff --git a/cloud/scope/managedcontrolplane.go b/cloud/scope/managedcontrolplane.go index 1587a6aef..a6c05a74c 100644 --- a/cloud/scope/managedcontrolplane.go +++ b/cloud/scope/managedcontrolplane.go @@ -21,6 +21,7 @@ import ( "fmt" "sigs.k8s.io/cluster-api-provider-gcp/util/location" + "sigs.k8s.io/cluster-api-provider-gcp/util/telemetry" "sigs.k8s.io/cluster-api/util/conditions" @@ -54,6 +55,12 @@ type ManagedControlPlaneScopeParams struct { // NewManagedControlPlaneScope creates a new Scope from the supplied parameters. // This is meant to be called for each reconcile iteration. func NewManagedControlPlaneScope(ctx context.Context, params ManagedControlPlaneScopeParams) (*ManagedControlPlaneScope, error) { + + ctx, span := telemetry.Tracer().Start( + ctx, "cloud.managedControlPlaneScope.NewManagedControlPlaneScope", + ) + defer span.End() + if params.Cluster == nil { return nil, errors.New("failed to generate new scope from nil Cluster") } diff --git a/cloud/scope/managedmachinepool.go b/cloud/scope/managedmachinepool.go index e2a4577a4..ae9ea4841 100644 --- a/cloud/scope/managedmachinepool.go +++ b/cloud/scope/managedmachinepool.go @@ -25,6 +25,7 @@ import ( infrav1 "sigs.k8s.io/cluster-api-provider-gcp/api/v1beta1" "sigs.k8s.io/cluster-api-provider-gcp/cloud" "sigs.k8s.io/cluster-api-provider-gcp/util/location" + "sigs.k8s.io/cluster-api-provider-gcp/util/telemetry" "sigs.k8s.io/cluster-api/util/conditions" @@ -54,6 +55,12 @@ type ManagedMachinePoolScopeParams struct { // NewManagedMachinePoolScope creates a new Scope from the supplied parameters. // This is meant to be called for each reconcile iteration. 
func NewManagedMachinePoolScope(ctx context.Context, params ManagedMachinePoolScopeParams) (*ManagedMachinePoolScope, error) { + + ctx, span := telemetry.Tracer().Start( + ctx, "cloud.managedMachinePoolScope.NewManagedMachinePoolScope", + ) + defer span.End() + if params.Cluster == nil { return nil, errors.New("failed to generate new scope from nil Cluster") } From 7e60c0199cf9b3b893b789fb16cc8d770272cc8d Mon Sep 17 00:00:00 2001 From: phong-nguyen Date: Sun, 7 May 2023 02:00:04 +0000 Subject: [PATCH 07/15] instrument exp/controllers --- .../gcpmanagedcluster_controller.go | 33 +++++++++++++++++++ .../gcpmanagedcontrolplane_controller.go | 33 +++++++++++++++++++ .../gcpmanagedmachinepool_controller.go | 33 +++++++++++++++++++ 3 files changed, 99 insertions(+) diff --git a/exp/controllers/gcpmanagedcluster_controller.go b/exp/controllers/gcpmanagedcluster_controller.go index 1fab6c2eb..d6b080217 100644 --- a/exp/controllers/gcpmanagedcluster_controller.go +++ b/exp/controllers/gcpmanagedcluster_controller.go @@ -24,6 +24,8 @@ import ( "github.com/GoogleCloudPlatform/k8s-cloud-provider/pkg/cloud/filter" "github.com/GoogleCloudPlatform/k8s-cloud-provider/pkg/cloud/meta" "github.com/pkg/errors" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/trace" apierrors "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/types" "k8s.io/klog/v2" @@ -33,6 +35,7 @@ import ( "sigs.k8s.io/cluster-api-provider-gcp/cloud/services/compute/subnets" infrav1exp "sigs.k8s.io/cluster-api-provider-gcp/exp/api/v1beta1" "sigs.k8s.io/cluster-api-provider-gcp/util/reconciler" + "sigs.k8s.io/cluster-api-provider-gcp/util/telemetry" clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" "sigs.k8s.io/cluster-api/util" "sigs.k8s.io/cluster-api/util/annotations" @@ -68,6 +71,16 @@ func (r *GCPManagedClusterReconciler) Reconcile(ctx context.Context, req ctrl.Re log := log.FromContext(ctx) + ctx, span := telemetry.Tracer().Start( + ctx, 
"controllers.GCPManagedClusterReconciler.Reconcile", + trace.WithAttributes( + attribute.String("name", req.Name), + attribute.String("namespace", req.Namespace), + attribute.String("kind", "GCPManagedCluster"), + ), + ) + defer span.End() + gcpCluster := &infrav1exp.GCPManagedCluster{} err := r.Get(ctx, req.NamespacedName, gcpCluster) if err != nil { @@ -142,6 +155,14 @@ func (r *GCPManagedClusterReconciler) Reconcile(ctx context.Context, req ctrl.Re func (r *GCPManagedClusterReconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager, options controller.Options) error { log := ctrl.LoggerFrom(ctx) + ctx, span := telemetry.Tracer().Start( + ctx, "controllers.GCPManagedClusterReconciler.SetupWithManager", + trace.WithAttributes( + attribute.String("controller", "GCPManagedClusterReconciler"), + ), + ) + defer span.End() + c, err := ctrl.NewControllerManagedBy(mgr). WithOptions(options). For(&infrav1exp.GCPManagedCluster{}). @@ -169,6 +190,12 @@ func (r *GCPManagedClusterReconciler) SetupWithManager(ctx context.Context, mgr func (r *GCPManagedClusterReconciler) reconcile(ctx context.Context, clusterScope *scope.ManagedClusterScope) error { log := log.FromContext(ctx).WithValues("controller", "gcpmanagedcluster") + + ctx, span := telemetry.Tracer().Start( + ctx, "controllers.GCPManagedClusterReconciler.reconcile", + ) + defer span.End() + log.Info("Reconciling GCPManagedCluster") controllerutil.AddFinalizer(clusterScope.GCPManagedCluster, infrav1exp.ClusterFinalizer) @@ -226,6 +253,12 @@ func (r *GCPManagedClusterReconciler) reconcile(ctx context.Context, clusterScop func (r *GCPManagedClusterReconciler) reconcileDelete(ctx context.Context, clusterScope *scope.ManagedClusterScope) (ctrl.Result, error) { log := log.FromContext(ctx).WithValues("controller", "gcpmanagedcluster", "action", "delete") + + ctx, span := telemetry.Tracer().Start( + ctx, "controllers.GCPManagedClusterReconciler.reconcileDelete", + ) + defer span.End() + log.Info("Reconciling Delete 
GCPManagedCluster") if clusterScope.GCPManagedControlPlane != nil { diff --git a/exp/controllers/gcpmanagedcontrolplane_controller.go b/exp/controllers/gcpmanagedcontrolplane_controller.go index 5725f99e8..6255c4535 100644 --- a/exp/controllers/gcpmanagedcontrolplane_controller.go +++ b/exp/controllers/gcpmanagedcontrolplane_controller.go @@ -21,6 +21,8 @@ import ( "fmt" "time" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/trace" "sigs.k8s.io/cluster-api/util/annotations" "github.com/pkg/errors" @@ -30,6 +32,7 @@ import ( "sigs.k8s.io/cluster-api-provider-gcp/cloud/services/container/clusters" infrav1exp "sigs.k8s.io/cluster-api-provider-gcp/exp/api/v1beta1" "sigs.k8s.io/cluster-api-provider-gcp/util/reconciler" + "sigs.k8s.io/cluster-api-provider-gcp/util/telemetry" clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" "sigs.k8s.io/cluster-api/util" "sigs.k8s.io/cluster-api/util/conditions" @@ -63,6 +66,14 @@ type GCPManagedControlPlaneReconciler struct { func (r *GCPManagedControlPlaneReconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager, options controller.Options) error { log := log.FromContext(ctx).WithValues("controller", "GCPManagedControlPlane") + ctx, span := telemetry.Tracer().Start( + ctx, "controllers.GCPManagedControlPlaneReconciler.SetupWithManager", + trace.WithAttributes( + attribute.String("controller", "GCPManagedControlPlane"), + ), + ) + defer span.End() + gcpManagedControlPlane := &infrav1exp.GCPManagedControlPlane{} c, err := ctrl.NewControllerManagedBy(mgr). WithOptions(options). 
@@ -90,6 +101,16 @@ func (r *GCPManagedControlPlaneReconciler) Reconcile(ctx context.Context, req ct log := ctrl.LoggerFrom(ctx) + ctx, span := telemetry.Tracer().Start( + ctx, "controllers.GCPManagedControlPlaneReconciler.Reconcile", + trace.WithAttributes( + attribute.String("name", req.Name), + attribute.String("namespace", req.Namespace), + attribute.String("kind", "GCPManagedControlPlane"), + ), + ) + defer span.End() + // Get the control plane instance gcpManagedControlPlane := &infrav1exp.GCPManagedControlPlane{} if err := r.Client.Get(ctx, req.NamespacedName, gcpManagedControlPlane); err != nil { @@ -154,6 +175,12 @@ func (r *GCPManagedControlPlaneReconciler) Reconcile(ctx context.Context, req ct func (r *GCPManagedControlPlaneReconciler) reconcile(ctx context.Context, managedControlPlaneScope *scope.ManagedControlPlaneScope) (ctrl.Result, error) { log := log.FromContext(ctx).WithValues("controller", "gcpmanagedcontrolplane") + + ctx, span := telemetry.Tracer().Start( + ctx, "controllers.GCPManagedControlPlaneReconciler.reconcile", + ) + defer span.End() + log.Info("Reconciling GCPManagedControlPlane") controllerutil.AddFinalizer(managedControlPlaneScope.GCPManagedControlPlane, infrav1exp.ManagedControlPlaneFinalizer) @@ -192,6 +219,12 @@ func (r *GCPManagedControlPlaneReconciler) reconcile(ctx context.Context, manage func (r *GCPManagedControlPlaneReconciler) reconcileDelete(ctx context.Context, managedControlPlaneScope *scope.ManagedControlPlaneScope) (ctrl.Result, error) { log := log.FromContext(ctx).WithValues("controller", "gcpmanagedcontrolplane", "action", "delete") + + ctx, span := telemetry.Tracer().Start( + ctx, "controllers.GCPManagedControlPlaneReconciler.reconcileDelete", + ) + defer span.End() + log.Info("Deleting GCPManagedControlPlane") reconcilers := map[string]cloud.ReconcilerWithResult{ diff --git a/exp/controllers/gcpmanagedmachinepool_controller.go b/exp/controllers/gcpmanagedmachinepool_controller.go index 96dc4c0ca..be954e364 100644 
--- a/exp/controllers/gcpmanagedmachinepool_controller.go +++ b/exp/controllers/gcpmanagedmachinepool_controller.go @@ -24,6 +24,8 @@ import ( "github.com/go-logr/logr" "github.com/googleapis/gax-go/v2/apierror" "github.com/pkg/errors" + "go.opentelemetry.io/otel/attribute" + "go.opentelemetry.io/otel/trace" "google.golang.org/grpc/codes" apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -41,6 +43,7 @@ import ( "sigs.k8s.io/cluster-api-provider-gcp/cloud/scope" infrav1exp "sigs.k8s.io/cluster-api-provider-gcp/exp/api/v1beta1" "sigs.k8s.io/cluster-api-provider-gcp/util/reconciler" + "sigs.k8s.io/cluster-api-provider-gcp/util/telemetry" clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" "sigs.k8s.io/cluster-api/util" "sigs.k8s.io/cluster-api/util/predicates" @@ -149,6 +152,14 @@ func managedControlPlaneToManagedMachinePoolMapFunc(c client.Client, gvk schema. func (r *GCPManagedMachinePoolReconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager, options controller.Options) error { log := log.FromContext(ctx).WithValues("controller", "GCPManagedMachinePool") + ctx, span := telemetry.Tracer().Start( + ctx, "controllers.GCPManagedMachinePoolReconciler.SetupWithManager", + trace.WithAttributes( + attribute.String("controller", "GCPManagedMachinePool"), + ), + ) + defer span.End() + gvk, err := apiutil.GVKForObject(new(infrav1exp.GCPManagedMachinePool), mgr.GetScheme()) if err != nil { return errors.Wrapf(err, "failed to find GVK for GCPManagedMachinePool") @@ -227,6 +238,16 @@ func (r *GCPManagedMachinePoolReconciler) Reconcile(ctx context.Context, req ctr ctx, cancel := context.WithTimeout(ctx, reconciler.DefaultedLoopTimeout(r.ReconcileTimeout)) defer cancel() + ctx, span := telemetry.Tracer().Start( + ctx, "controllers.GCPManagedMachinePoolReconciler.Reconcile", + trace.WithAttributes( + attribute.String("name", req.Name), + attribute.String("namespace", req.Namespace), + attribute.String("kind", 
"GCPManagedMachinePool"), + ), + ) + defer span.End() + log := ctrl.LoggerFrom(ctx) // Get the managed machine pool @@ -318,6 +339,12 @@ func (r *GCPManagedMachinePoolReconciler) Reconcile(ctx context.Context, req ctr func (r *GCPManagedMachinePoolReconciler) reconcile(ctx context.Context, managedMachinePoolScope *scope.ManagedMachinePoolScope) (ctrl.Result, error) { log := log.FromContext(ctx).WithValues("controller", "gcpmanagedmachinepool") + + ctx, span := telemetry.Tracer().Start( + ctx, "controllers.GCPManagedMachinePoolReconciler.reconcile", + ) + defer span.End() + log.Info("Reconciling GCPManagedMachinePool") controllerutil.AddFinalizer(managedMachinePoolScope.GCPManagedMachinePool, infrav1exp.ManagedMachinePoolFinalizer) @@ -360,6 +387,12 @@ func (r *GCPManagedMachinePoolReconciler) reconcile(ctx context.Context, managed func (r *GCPManagedMachinePoolReconciler) reconcileDelete(ctx context.Context, managedMachinePoolScope *scope.ManagedMachinePoolScope) (ctrl.Result, error) { log := log.FromContext(ctx).WithValues("controller", "gcpmanagedmachinepool", "action", "delete") + + ctx, span := telemetry.Tracer().Start( + ctx, "controllers.GCPManagedMachinePoolReconciler.reconcileDelete", + ) + defer span.End() + log.Info("Deleting GCPManagedMachinePool") reconcilers := map[string]cloud.ReconcilerWithResult{ From 5172186fa46990aeb7f40fa01279c6d0821dfa7c Mon Sep 17 00:00:00 2001 From: phong-nguyen Date: Sun, 7 May 2023 02:00:39 +0000 Subject: [PATCH 08/15] instrument cloud/services/compute --- cloud/services/compute/firewalls/reconcile.go | 11 +++ cloud/services/compute/instances/reconcile.go | 11 +++ .../compute/loadbalancers/reconcile.go | 71 +++++++++++++++++++ cloud/services/compute/networks/reconcile.go | 21 ++++++ cloud/services/compute/subnets/reconcile.go | 16 +++++ 5 files changed, 130 insertions(+) diff --git a/cloud/services/compute/firewalls/reconcile.go b/cloud/services/compute/firewalls/reconcile.go index 4047f6548..aeb7c662c 100644 --- 
a/cloud/services/compute/firewalls/reconcile.go +++ b/cloud/services/compute/firewalls/reconcile.go @@ -22,11 +22,17 @@ import ( "github.com/GoogleCloudPlatform/k8s-cloud-provider/pkg/cloud/meta" "sigs.k8s.io/cluster-api-provider-gcp/cloud/gcperrors" + "sigs.k8s.io/cluster-api-provider-gcp/util/telemetry" "sigs.k8s.io/controller-runtime/pkg/log" ) // Reconcile reconcile cluster firewall compoenents. func (s *Service) Reconcile(ctx context.Context) error { + ctx, span := telemetry.Tracer().Start( + ctx, "firewalls.Services.Reconcile", + ) + defer span.End() + log := log.FromContext(ctx) if s.scope.IsSharedVpc() { log.V(2).Info("Shared VPC enabled. Ignore Reconciling firewall resources") @@ -53,6 +59,11 @@ func (s *Service) Reconcile(ctx context.Context) error { // Delete delete cluster firewall compoenents. func (s *Service) Delete(ctx context.Context) error { + ctx, span := telemetry.Tracer().Start( + ctx, "firewalls.Services.Delete", + ) + defer span.End() + log := log.FromContext(ctx) if s.scope.IsSharedVpc() { log.V(2).Info("Shared VPC enabled. Ignore Deleting firewall resources") diff --git a/cloud/services/compute/instances/reconcile.go b/cloud/services/compute/instances/reconcile.go index 76277d072..af19fe153 100644 --- a/cloud/services/compute/instances/reconcile.go +++ b/cloud/services/compute/instances/reconcile.go @@ -32,11 +32,17 @@ import ( infrav1 "sigs.k8s.io/cluster-api-provider-gcp/api/v1beta1" "sigs.k8s.io/cluster-api-provider-gcp/cloud/gcperrors" + "sigs.k8s.io/cluster-api-provider-gcp/util/telemetry" "sigs.k8s.io/controller-runtime/pkg/log" ) // Reconcile reconcile machine instance. 
func (s *Service) Reconcile(ctx context.Context) error { + ctx, span := telemetry.Tracer().Start( + ctx, "instances.Services.Reconcile", + ) + defer span.End() + log := log.FromContext(ctx) log.Info("Reconciling instance resources") instance, err := s.createOrGetInstance(ctx) @@ -99,6 +105,11 @@ func (s *Service) Reconcile(ctx context.Context) error { // Delete delete machine instance. func (s *Service) Delete(ctx context.Context) error { + ctx, span := telemetry.Tracer().Start( + ctx, "instances.Services.Delete", + ) + defer span.End() + log := log.FromContext(ctx) log.Info("Deleting instance resources") instanceSpec := s.scope.InstanceSpec(log) diff --git a/cloud/services/compute/loadbalancers/reconcile.go b/cloud/services/compute/loadbalancers/reconcile.go index 9bd356d1f..e3a9e36eb 100644 --- a/cloud/services/compute/loadbalancers/reconcile.go +++ b/cloud/services/compute/loadbalancers/reconcile.go @@ -27,6 +27,7 @@ import ( "k8s.io/utils/ptr" infrav1 "sigs.k8s.io/cluster-api-provider-gcp/api/v1beta1" "sigs.k8s.io/cluster-api-provider-gcp/cloud/gcperrors" + "sigs.k8s.io/cluster-api-provider-gcp/util/telemetry" "sigs.k8s.io/controller-runtime/pkg/log" ) @@ -48,6 +49,11 @@ const ( // Reconcile reconcile cluster control-plane loadbalancer components. func (s *Service) Reconcile(ctx context.Context) error { + ctx, span := telemetry.Tracer().Start( + ctx, "loadbalancers.Services.Reconcile", + ) + defer span.End() + log := log.FromContext(ctx) log.Info("Reconciling loadbalancer resources") @@ -82,6 +88,11 @@ func (s *Service) Reconcile(ctx context.Context) error { // Delete deletes cluster control-plane loadbalancer components. 
func (s *Service) Delete(ctx context.Context) error { + ctx, span := telemetry.Tracer().Start( + ctx, "loadbalancers.Services.Delete", + ) + defer span.End() + log := log.FromContext(ctx) var allErrs []error lbSpec := s.scope.LoadBalancer() @@ -253,6 +264,11 @@ func (s *Service) createInternalLoadBalancer(ctx context.Context, name string, l } func (s *Service) createOrGetInstanceGroups(ctx context.Context) ([]*compute.InstanceGroup, error) { + ctx, span := telemetry.Tracer().Start( + ctx, "loadbalancers.Services.createOrGetInstanceGroups", + ) + defer span.End() + log := log.FromContext(ctx) fd := s.scope.FailureDomains() zones := make([]string, 0, len(fd)) @@ -297,6 +313,11 @@ func (s *Service) createOrGetInstanceGroups(ctx context.Context) ([]*compute.Ins } func (s *Service) createOrGetHealthCheck(ctx context.Context, lbname string) (*compute.HealthCheck, error) { + ctx, span := telemetry.Tracer().Start( + ctx, "loadbalancers.Services.createOrGetHealthCheck", + ) + defer span.End() + log := log.FromContext(ctx) healthcheckSpec := s.scope.HealthCheckSpec(lbname) log.V(2).Info("Looking for healthcheck", "name", healthcheckSpec.Name) @@ -324,6 +345,11 @@ func (s *Service) createOrGetHealthCheck(ctx context.Context, lbname string) (*c } func (s *Service) createOrGetRegionalHealthCheck(ctx context.Context, lbname string) (*compute.HealthCheck, error) { + ctx, span := telemetry.Tracer().Start( + ctx, "loadbalancers.Services.createOrGetRegionalHealthCheck", + ) + defer span.End() + log := log.FromContext(ctx) healthcheckSpec := s.scope.HealthCheckSpec(lbname) healthcheckSpec.Region = s.scope.Region() @@ -460,6 +486,11 @@ func (s *Service) createOrGetRegionalBackendService(ctx context.Context, lbname } func (s *Service) createOrGetTargetTCPProxy(ctx context.Context, service *compute.BackendService) (*compute.TargetTcpProxy, error) { + ctx, span := telemetry.Tracer().Start( + ctx, "loadbalancers.Services.createOrGetTargetTCPProxy", + ) + defer span.End() + log := 
log.FromContext(ctx) targetSpec := s.scope.TargetTCPProxySpec() targetSpec.Service = service.SelfLink @@ -488,6 +519,11 @@ func (s *Service) createOrGetTargetTCPProxy(ctx context.Context, service *comput // createOrGetAddress is used to obtain a Global address. func (s *Service) createOrGetAddress(ctx context.Context, lbname string) (*compute.Address, error) { + ctx, span := telemetry.Tracer().Start( + ctx, "loadbalancers.Services.createOrGetAddress", + ) + defer span.End() + log := log.FromContext(ctx) addrSpec := s.scope.AddressSpec(lbname) log.V(2).Info("Looking for address", "name", addrSpec.Name) @@ -553,6 +589,11 @@ func (s *Service) createOrGetInternalAddress(ctx context.Context, lbname string) // createOrGetForwardingRule is used obtain a Global ForwardingRule. func (s *Service) createOrGetForwardingRule(ctx context.Context, lbname string, target *compute.TargetTcpProxy, addr *compute.Address) (*compute.ForwardingRule, error) { + ctx, span := telemetry.Tracer().Start( + ctx, "loadbalancers.Services.createOrGetForwardingRule", + ) + defer span.End() + log := log.FromContext(ctx) spec := s.scope.ForwardingRuleSpec(lbname) spec.Target = target.SelfLink @@ -655,6 +696,11 @@ func (s *Service) createOrGetRegionalForwardingRule(ctx context.Context, lbname } func (s *Service) deleteForwardingRule(ctx context.Context, lbname string) error { + ctx, span := telemetry.Tracer().Start( + ctx, "loadbalancers.Services.deleteForwardingRule", + ) + defer span.End() + log := log.FromContext(ctx) spec := s.scope.ForwardingRuleSpec(lbname) key := meta.GlobalKey(spec.Name) @@ -681,6 +727,11 @@ func (s *Service) deleteRegionalForwardingRule(ctx context.Context, lbname strin } func (s *Service) deleteAddress(ctx context.Context, lbname string) error { + ctx, span := telemetry.Tracer().Start( + ctx, "loadbalancers.Services.deleteAddress", + ) + defer span.End() + log := log.FromContext(ctx) spec := s.scope.AddressSpec(lbname) key := meta.GlobalKey(spec.Name) @@ -705,6 +756,11 @@ 
func (s *Service) deleteInternalAddress(ctx context.Context, lbname string) erro } func (s *Service) deleteTargetTCPProxy(ctx context.Context) error { + ctx, span := telemetry.Tracer().Start( + ctx, "loadbalancers.Services.deleteTargetTCPProxy", + ) + defer span.End() + log := log.FromContext(ctx) spec := s.scope.TargetTCPProxySpec() key := meta.GlobalKey(spec.Name) @@ -718,6 +774,11 @@ func (s *Service) deleteTargetTCPProxy(ctx context.Context) error { } func (s *Service) deleteBackendService(ctx context.Context, lbname string) error { + ctx, span := telemetry.Tracer().Start( + ctx, "loadbalancers.Services.deleteBackendService", + ) + defer span.End() + log := log.FromContext(ctx) spec := s.scope.BackendServiceSpec(lbname) key := meta.GlobalKey(spec.Name) @@ -744,6 +805,11 @@ func (s *Service) deleteRegionalBackendService(ctx context.Context, lbname strin } func (s *Service) deleteHealthCheck(ctx context.Context, lbname string) error { + ctx, span := telemetry.Tracer().Start( + ctx, "loadbalancers.Services.deleteHealthCheck", + ) + defer span.End() + log := log.FromContext(ctx) spec := s.scope.HealthCheckSpec(lbname) key := meta.GlobalKey(spec.Name) @@ -770,6 +836,11 @@ func (s *Service) deleteRegionalHealthCheck(ctx context.Context, lbname string) } func (s *Service) deleteInstanceGroups(ctx context.Context) error { + ctx, span := telemetry.Tracer().Start( + ctx, "loadbalancers.Services.deleteInstanceGroups", + ) + defer span.End() + log := log.FromContext(ctx) for zone := range s.scope.Network().APIServerInstanceGroups { spec := s.scope.InstanceGroupSpec(zone) diff --git a/cloud/services/compute/networks/reconcile.go b/cloud/services/compute/networks/reconcile.go index daf7ae987..0c407c31b 100644 --- a/cloud/services/compute/networks/reconcile.go +++ b/cloud/services/compute/networks/reconcile.go @@ -26,11 +26,17 @@ import ( infrav1 "sigs.k8s.io/cluster-api-provider-gcp/api/v1beta1" "sigs.k8s.io/cluster-api-provider-gcp/cloud/gcperrors" + 
"sigs.k8s.io/cluster-api-provider-gcp/util/telemetry" "sigs.k8s.io/controller-runtime/pkg/log" ) // Reconcile reconcile cluster network components. func (s *Service) Reconcile(ctx context.Context) error { + ctx, span := telemetry.Tracer().Start( + ctx, "networks.Services.Reconcile", + ) + defer span.End() + log := log.FromContext(ctx) log.Info("Reconciling network resources") network, err := s.createOrGetNetwork(ctx) @@ -53,6 +59,11 @@ func (s *Service) Reconcile(ctx context.Context) error { // Delete delete cluster network components. func (s *Service) Delete(ctx context.Context) error { + ctx, span := telemetry.Tracer().Start( + ctx, "networks.Services.Delete", + ) + defer span.End() + log := log.FromContext(ctx) if s.scope.IsSharedVpc() { log.V(2).Info("Shared VPC enabled. Ignore Deleting network resources") @@ -100,6 +111,11 @@ func (s *Service) Delete(ctx context.Context) error { // createOrGetNetwork creates a network if not exist otherwise return existing network. func (s *Service) createOrGetNetwork(ctx context.Context) (*compute.Network, error) { + ctx, span := telemetry.Tracer().Start( + ctx, "networks.Services.createOrGetNetwork", + ) + defer span.End() + log := log.FromContext(ctx) log.V(2).Info("Looking for network", "name", s.scope.NetworkName()) networkKey := meta.GlobalKey(s.scope.NetworkName()) @@ -132,6 +148,11 @@ func (s *Service) createOrGetNetwork(ctx context.Context) (*compute.Network, err // createOrGetRouter creates a cloudnat router if not exist otherwise return the existing. 
func (s *Service) createOrGetRouter(ctx context.Context, network *compute.Network) (*compute.Router, error) { + ctx, span := telemetry.Tracer().Start( + ctx, "networks.Services.createOrGetRouter", + ) + defer span.End() + log := log.FromContext(ctx) spec := s.scope.NatRouterSpec() log.V(2).Info("Looking for cloudnat router", "name", spec.Name) diff --git a/cloud/services/compute/subnets/reconcile.go b/cloud/services/compute/subnets/reconcile.go index 52ca259d3..a1d8ee97e 100644 --- a/cloud/services/compute/subnets/reconcile.go +++ b/cloud/services/compute/subnets/reconcile.go @@ -25,11 +25,17 @@ import ( "google.golang.org/api/compute/v1" "sigs.k8s.io/cluster-api-provider-gcp/cloud/gcperrors" + "sigs.k8s.io/cluster-api-provider-gcp/util/telemetry" "sigs.k8s.io/controller-runtime/pkg/log" ) // Reconcile reconciles cluster network components. func (s *Service) Reconcile(ctx context.Context) error { + ctx, span := telemetry.Tracer().Start( + ctx, "subnets.Services.Reconcile", + ) + defer span.End() + logger := log.FromContext(ctx) logger.Info("Reconciling subnetwork resources") @@ -43,6 +49,11 @@ func (s *Service) Reconcile(ctx context.Context) error { // Delete deletes cluster subnetwork components. func (s *Service) Delete(ctx context.Context) error { + ctx, span := telemetry.Tracer().Start( + ctx, "subnets.Services.Delete", + ) + defer span.End() + logger := log.FromContext(ctx) if s.scope.IsSharedVpc() { logger.V(2).Info("Shared VPC enabled. Skip deleting subnet resources") @@ -81,6 +92,11 @@ func (s *Service) Delete(ctx context.Context) error { // createOrGetSubnets creates the subnetworks if they don't exist otherwise return the existing ones. 
func (s *Service) createOrGetSubnets(ctx context.Context) ([]*compute.Subnetwork, error) { + ctx, span := telemetry.Tracer().Start( + ctx, "subnets.Services.createOrGetSubnets", + ) + defer span.End() + logger := log.FromContext(ctx) subnets := []*compute.Subnetwork{} for _, subnetSpec := range s.scope.SubnetSpecs() { From dbf66f76e07861593bf01054f2053b1339415e8b Mon Sep 17 00:00:00 2001 From: phong-nguyen Date: Sun, 7 May 2023 02:00:54 +0000 Subject: [PATCH 09/15] instrument cloud/services/container --- .../services/container/clusters/kubeconfig.go | 31 ++++++++++++++ .../services/container/clusters/reconcile.go | 31 ++++++++++++++ .../services/container/nodepools/reconcile.go | 41 +++++++++++++++++++ 3 files changed, 103 insertions(+) diff --git a/cloud/services/container/clusters/kubeconfig.go b/cloud/services/container/clusters/kubeconfig.go index 978ead85c..a3212f41a 100644 --- a/cloud/services/container/clusters/kubeconfig.go +++ b/cloud/services/container/clusters/kubeconfig.go @@ -32,6 +32,7 @@ import ( "k8s.io/client-go/tools/clientcmd" "k8s.io/client-go/tools/clientcmd/api" infrav1exp "sigs.k8s.io/cluster-api-provider-gcp/exp/api/v1beta1" + "sigs.k8s.io/cluster-api-provider-gcp/util/telemetry" "sigs.k8s.io/cluster-api/util/kubeconfig" "sigs.k8s.io/cluster-api/util/secret" ) @@ -42,6 +43,11 @@ const ( ) func (s *Service) reconcileKubeconfig(ctx context.Context, cluster *containerpb.Cluster, log *logr.Logger) error { + ctx, span := telemetry.Tracer().Start( + ctx, "clusters.Services.reconcileKubeconfig", + ) + defer span.End() + log.Info("Reconciling kubeconfig") clusterRef := types.NamespacedName{ Name: s.scope.Cluster.Name, @@ -72,6 +78,11 @@ func (s *Service) reconcileKubeconfig(ctx context.Context, cluster *containerpb. 
} func (s *Service) reconcileAdditionalKubeconfigs(ctx context.Context, cluster *containerpb.Cluster, log *logr.Logger) error { + ctx, span := telemetry.Tracer().Start( + ctx, "clusters.Services.reconcileAdditionalKubeconfigs", + ) + defer span.End() + log.Info("Reconciling additional kubeconfig") clusterRef := types.NamespacedName{ Name: s.scope.Cluster.Name + "-user", @@ -99,6 +110,11 @@ func (s *Service) reconcileAdditionalKubeconfigs(ctx context.Context, cluster *c } func (s *Service) createUserKubeconfigSecret(ctx context.Context, cluster *containerpb.Cluster, clusterRef *types.NamespacedName) error { + ctx, span := telemetry.Tracer().Start( + ctx, "clusters.Services.createUserKubeconfigSecret", + ) + defer span.End() + controllerOwnerRef := *metav1.NewControllerRef(s.scope.GCPManagedControlPlane, infrav1exp.GroupVersion.WithKind("GCPManagedControlPlane")) contextName := s.getKubeConfigContextName(false) @@ -134,6 +150,11 @@ func (s *Service) createUserKubeconfigSecret(ctx context.Context, cluster *conta } func (s *Service) createCAPIKubeconfigSecret(ctx context.Context, cluster *containerpb.Cluster, clusterRef *types.NamespacedName, log *logr.Logger) error { + ctx, span := telemetry.Tracer().Start( + ctx, "clusters.Services.createCAPIKubeconfigSecret", + ) + defer span.End() + controllerOwnerRef := *metav1.NewControllerRef(s.scope.GCPManagedControlPlane, infrav1exp.GroupVersion.WithKind("GCPManagedControlPlane")) contextName := s.getKubeConfigContextName(false) @@ -171,6 +192,11 @@ func (s *Service) createCAPIKubeconfigSecret(ctx context.Context, cluster *conta } func (s *Service) updateCAPIKubeconfigSecret(ctx context.Context, configSecret *corev1.Secret) error { + ctx, span := telemetry.Tracer().Start( + ctx, "clusters.Services.updateCAPIKubeconfigSecret", + ) + defer span.End() + data, ok := configSecret.Data[secret.KubeconfigDataName] if !ok { return errors.Errorf("missing key %q in secret data", secret.KubeconfigDataName) @@ -238,6 +264,11 @@ func (s 
*Service) createBaseKubeConfig(contextName string, cluster *containerpb. } func (s *Service) generateToken(ctx context.Context) (string, error) { + ctx, span := telemetry.Tracer().Start( + ctx, "clusters.Services.generateToken", + ) + defer span.End() + req := &credentialspb.GenerateAccessTokenRequest{ Name: "projects/-/serviceAccounts/" + s.scope.GetCredential().ClientEmail, Scope: []string{ diff --git a/cloud/services/container/clusters/reconcile.go b/cloud/services/container/clusters/reconcile.go index 2c94b35c6..084d71cb8 100644 --- a/cloud/services/container/clusters/reconcile.go +++ b/cloud/services/container/clusters/reconcile.go @@ -33,6 +33,7 @@ import ( "google.golang.org/grpc/codes" infrav1exp "sigs.k8s.io/cluster-api-provider-gcp/exp/api/v1beta1" "sigs.k8s.io/cluster-api-provider-gcp/util/reconciler" + "sigs.k8s.io/cluster-api-provider-gcp/util/telemetry" clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" "sigs.k8s.io/cluster-api/util/conditions" ctrl "sigs.k8s.io/controller-runtime" @@ -41,6 +42,11 @@ import ( // Reconcile reconcile GKE cluster. func (s *Service) Reconcile(ctx context.Context) (ctrl.Result, error) { + ctx, span := telemetry.Tracer().Start( + ctx, "clusters.Services.Reconcile", + ) + defer span.End() + log := log.FromContext(ctx).WithValues("service", "container.clusters") log.Info("Reconciling cluster resources") @@ -182,6 +188,11 @@ func (s *Service) Reconcile(ctx context.Context) (ctrl.Result, error) { // Delete delete GKE cluster. 
func (s *Service) Delete(ctx context.Context) (ctrl.Result, error) { + ctx, span := telemetry.Tracer().Start( + ctx, "clusters.Services.Delete", + ) + defer span.End() + log := log.FromContext(ctx).WithValues("service", "container.clusters") log.Info("Deleting cluster resources") @@ -226,6 +237,11 @@ func (s *Service) Delete(ctx context.Context) (ctrl.Result, error) { } func (s *Service) describeCluster(ctx context.Context, log *logr.Logger) (*containerpb.Cluster, error) { + ctx, span := telemetry.Tracer().Start( + ctx, "clusters.Services.describeCluster", + ) + defer span.End() + getClusterRequest := &containerpb.GetClusterRequest{ Name: s.scope.ClusterFullName(), } @@ -245,6 +261,11 @@ func (s *Service) describeCluster(ctx context.Context, log *logr.Logger) (*conta } func (s *Service) createCluster(ctx context.Context, log *logr.Logger) error { + ctx, span := telemetry.Tracer().Start( + ctx, "clusters.Services.createCluster", + ) + defer span.End() + nodePools, machinePools, _ := s.scope.GetAllNodePools(ctx) log.V(2).Info("Running pre-flight checks on machine pools before cluster creation") @@ -351,6 +372,11 @@ func (s *Service) getSubnetNameInClusterRegion() string { } func (s *Service) updateCluster(ctx context.Context, updateClusterRequest *containerpb.UpdateClusterRequest, log *logr.Logger) error { + ctx, span := telemetry.Tracer().Start( + ctx, "clusters.Services.updateCluster", + ) + defer span.End() + _, err := s.scope.ManagedControlPlaneClient().UpdateCluster(ctx, updateClusterRequest) if err != nil { log.Error(err, "Error updating GKE cluster", "name", s.scope.ClusterName()) @@ -361,6 +387,11 @@ func (s *Service) updateCluster(ctx context.Context, updateClusterRequest *conta } func (s *Service) deleteCluster(ctx context.Context, log *logr.Logger) error { + ctx, span := telemetry.Tracer().Start( + ctx, "clusters.Services.deleteCluster", + ) + defer span.End() + deleteClusterRequest := &containerpb.DeleteClusterRequest{ Name: s.scope.ClusterFullName(), } 
diff --git a/cloud/services/container/nodepools/reconcile.go b/cloud/services/container/nodepools/reconcile.go index a643ac0ec..87d8d5e2b 100644 --- a/cloud/services/container/nodepools/reconcile.go +++ b/cloud/services/container/nodepools/reconcile.go @@ -22,6 +22,7 @@ import ( "strings" "sigs.k8s.io/cluster-api-provider-gcp/util/resourceurl" + "sigs.k8s.io/cluster-api-provider-gcp/util/telemetry" "google.golang.org/api/iterator" "google.golang.org/grpc/codes" @@ -57,6 +58,11 @@ func (s *Service) setReadyStatusFromConditions() { // Reconcile reconcile GKE node pool. func (s *Service) Reconcile(ctx context.Context) (ctrl.Result, error) { + ctx, span := telemetry.Tracer().Start( + ctx, "nodepools.Services.Reconcile", + ) + defer span.End() + log := log.FromContext(ctx) log.Info("Reconciling node pool resources") @@ -195,6 +201,11 @@ func (s *Service) Reconcile(ctx context.Context) (ctrl.Result, error) { // Delete delete GKE node pool. func (s *Service) Delete(ctx context.Context) (ctrl.Result, error) { + ctx, span := telemetry.Tracer().Start( + ctx, "nodepools.Services.Delete", + ) + defer span.End() + log := log.FromContext(ctx) log.Info("Deleting node pool resources") @@ -239,6 +250,11 @@ func (s *Service) Delete(ctx context.Context) (ctrl.Result, error) { } func (s *Service) describeNodePool(ctx context.Context, log *logr.Logger) (*containerpb.NodePool, error) { + ctx, span := telemetry.Tracer().Start( + ctx, "nodepools.Services.describeNodePool", + ) + defer span.End() + getNodePoolRequest := &containerpb.GetNodePoolRequest{ Name: s.scope.NodePoolFullName(), } @@ -258,6 +274,11 @@ func (s *Service) describeNodePool(ctx context.Context, log *logr.Logger) (*cont } func (s *Service) getInstances(ctx context.Context, nodePool *containerpb.NodePool) ([]*computepb.ManagedInstance, error) { + ctx, span := telemetry.Tracer().Start( + ctx, "nodepools.Services.getInstances", + ) + defer span.End() + instances := []*computepb.ManagedInstance{} for _, url := range 
nodePool.GetInstanceGroupUrls() { @@ -287,6 +308,11 @@ func (s *Service) getInstances(ctx context.Context, nodePool *containerpb.NodePo } func (s *Service) createNodePool(ctx context.Context, log *logr.Logger) error { + ctx, span := telemetry.Tracer().Start( + ctx, "nodepools.Services.createNodePool", + ) + defer span.End() + log.V(2).Info("Running pre-flight checks on machine pool before creation") if err := shared.ManagedMachinePoolPreflightCheck(s.scope.GCPManagedMachinePool, s.scope.MachinePool, s.scope.Region()); err != nil { return fmt.Errorf("preflight checks on machine pool before creating: %w", err) } @@ -307,6 +333,11 @@ func (s *Service) createNodePool(ctx context.Context, log *logr.Logger) error { } func (s *Service) updateNodePoolConfig(ctx context.Context, updateNodePoolRequest *containerpb.UpdateNodePoolRequest) error { + ctx, span := telemetry.Tracer().Start( + ctx, "nodepools.Services.updateNodePoolConfig", + ) + defer span.End() + _, err := s.scope.ManagedMachinePoolClient().UpdateNodePool(ctx, updateNodePoolRequest) if err != nil { return err @@ -316,6 +347,11 @@ func (s *Service) updateNodePoolConfig(ctx context.Context, updateNodePoolReques } func (s *Service) updateNodePoolAutoscaling(ctx context.Context, setNodePoolAutoscalingRequest *containerpb.SetNodePoolAutoscalingRequest) error { + ctx, span := telemetry.Tracer().Start( + ctx, "nodepools.Services.updateNodePoolAutoscaling", + ) + defer span.End() + _, err := s.scope.ManagedMachinePoolClient().SetNodePoolAutoscaling(ctx, setNodePoolAutoscalingRequest) if err != nil { return err @@ -325,6 +361,11 @@ func (s *Service) updateNodePoolAutoscaling(ctx context.Context, setNodePoolAuto } func (s *Service) updateNodePoolSize(ctx context.Context, setNodePoolSizeRequest *containerpb.SetNodePoolSizeRequest) error { + ctx, span := telemetry.Tracer().Start( + ctx, "nodepools.Services.updateNodePoolSize", + ) + defer span.End() + _, err := s.scope.ManagedMachinePoolClient().SetNodePoolSize(ctx, 
setNodePoolSizeRequest) if err != nil { return err From 27dd64915a1e258a8b41813586a65753de499447 Mon Sep 17 00:00:00 2001 From: phong-nguyen Date: Fri, 12 May 2023 15:02:33 +0000 Subject: [PATCH 10/15] add sampling rate --- main.go | 9 ++++++++- pkg/otel/tracing.go | 11 +++++------ 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/main.go b/main.go index 52459713b..adee7e94b 100644 --- a/main.go +++ b/main.go @@ -84,6 +84,7 @@ var ( leaderElectionRenewDeadline time.Duration leaderElectionRetryPeriod time.Duration enableTracing bool + samplingRate float64 ) // Add RBAC for the authorized diagnostics endpoint. @@ -165,7 +166,7 @@ func main() { ctx := ctrl.SetupSignalHandler() if enableTracing { - if err := ot.RegisterTracing(ctx, setupLog); err != nil { + if err := ot.RegisterTracing(ctx, samplingRate, setupLog); err != nil { setupLog.Error(err, "unable to set up tracing") os.Exit(1) } @@ -389,6 +390,12 @@ func initFlags(fs *pflag.FlagSet) { "Enable collecting and sending traces to opentelemetry-collector service", ) + fs.Float64Var(&samplingRate, + "trace-sampling-rate", + 0.6, + "The fraction of all traces that will be sampled", + ) + flags.AddManagerOptions(fs, &managerOptions) feature.MutableGates.AddFlag(fs) diff --git a/pkg/otel/tracing.go b/pkg/otel/tracing.go index e2b32fafb..367f16603 100644 --- a/pkg/otel/tracing.go +++ b/pkg/otel/tracing.go @@ -20,9 +20,9 @@ import ( semconv "go.opentelemetry.io/otel/semconv/v1.4.0" ) -func RegisterTracing(ctx context.Context, log logr.Logger) error { +func RegisterTracing(ctx context.Context, samplingRate float64, log logr.Logger) error { - tracerProvider, err := SetUpTracing(ctx) + tracerProvider, err := SetUpTracing(ctx, samplingRate) if err != nil { return err } @@ -64,7 +64,7 @@ func newExporter(ctx context.Context) (*otlptrace.Exporter, error) { return traceExporter, nil } -func SetUpTracing(ctx context.Context) (*trace.TracerProvider, error) { +func SetUpTracing(ctx context.Context, samplingRate float64) 
(*trace.TracerProvider, error) { traceExporter, err := newExporter(ctx) @@ -89,9 +89,8 @@ func SetUpTracing(ctx context.Context) (*trace.TracerProvider, error) { traceProvider := trace.NewTracerProvider( trace.WithBatcher(traceExporter), trace.WithResource(resource), - // TODO: dynamic sampling rate? - // sampling rate based on parent span = 60% - trace.WithSampler(trace.ParentBased(trace.TraceIDRatioBased(0.6))), + // 0 < samplingRate <= 1 (< 0 -> be treated as 0; >= 1 -> always sample) + trace.WithSampler(trace.ParentBased(trace.TraceIDRatioBased(samplingRate))), ) otel.SetTracerProvider(traceProvider) From c086f7a631c2908ee6f777bbff7291bea16f0513 Mon Sep 17 00:00:00 2001 From: phong-nguyen Date: Fri, 12 May 2023 15:02:50 +0000 Subject: [PATCH 11/15] Tilt file --- Tiltfile | 225 +++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 179 insertions(+), 46 deletions(-) diff --git a/Tiltfile b/Tiltfile index a595a6ab8..acdabf651 100644 --- a/Tiltfile +++ b/Tiltfile @@ -5,10 +5,12 @@ tools_bin = "./hack/tools/bin" kubectl_cmd = "./hack/tools/bin/kubectl" kind_cmd = "./hack/tools/bin/kind" -#Add tools to path +# Add tools to path os.putenv("PATH", os.getenv("PATH") + ":" + tools_bin) -update_settings(k8s_upsert_timeout_secs = 60) # on first tilt up, often can take longer than 30 seconds +update_settings( + k8s_upsert_timeout_secs=60 +) # on first tilt up, often can take longer than 30 seconds # set defaults settings = { @@ -26,10 +28,12 @@ settings = { keys = ["GCP_B64ENCODED_CREDENTIALS"] # global settings -settings.update(read_json( - "tilt-settings.json", - default = {}, -)) +settings.update( + read_json( + "tilt-settings.json", + default={}, + ) +) if settings.get("trigger_mode") == "manual": trigger_mode(TRIGGER_MODE_MANUAL) @@ -40,36 +44,61 @@ if "allowed_contexts" in settings: if "default_registry" in settings: default_registry(settings.get("default_registry")) + # deploy CAPI def deploy_capi(): version = settings.get("capi_version") - capi_uri 
= "https://github.com/kubernetes-sigs/cluster-api/releases/download/{}/cluster-api-components.yaml".format(version) - cmd = "curl -sSL {} | {} | {} apply -f -".format(capi_uri, envsubst_cmd, kubectl_cmd) - local(cmd, quiet = True) + capi_uri = "https://github.com/kubernetes-sigs/cluster-api/releases/download/{}/cluster-api-components.yaml".format( + version + ) + cmd = "curl -sSL {} | {} | {} apply -f -".format( + capi_uri, envsubst_cmd, kubectl_cmd + ) + local(cmd, quiet=True) if settings.get("extra_args"): extra_args = settings.get("extra_args") if extra_args.get("core"): core_extra_args = extra_args.get("core") if core_extra_args: for namespace in ["capi-system"]: - patch_args_with_extra_args(namespace, "capi-controller-manager", core_extra_args) + patch_args_with_extra_args( + namespace, "capi-controller-manager", core_extra_args + ) if extra_args.get("kubeadm-bootstrap"): kb_extra_args = extra_args.get("kubeadm-bootstrap") if kb_extra_args: - patch_args_with_extra_args("capi-kubeadm-bootstrap-system", "capi-kubeadm-bootstrap-controller-manager", kb_extra_args) + patch_args_with_extra_args( + "capi-kubeadm-bootstrap-system", + "capi-kubeadm-bootstrap-controller-manager", + kb_extra_args, + ) + def patch_args_with_extra_args(namespace, name, extra_args): - args_str = str(local("{} get deployments {} -n {} -o jsonpath={{.spec.template.spec.containers[0].args}}".format(kubectl_cmd, name, namespace))) + args_str = str( + local( + "{} get deployments {} -n {} -o jsonpath={{.spec.template.spec.containers[0].args}}".format( + kubectl_cmd, name, namespace + ) + ) + ) args_to_add = [arg for arg in extra_args if arg not in args_str] if args_to_add: args = args_str[1:-1].split() args.extend(args_to_add) - patch = [{ - "op": "replace", - "path": "/spec/template/spec/containers/0/args", - "value": args, - }] - local("{} patch deployment {} -n {} --type json -p='{}'".format(kubectl_cmd, name, namespace, str(encode_json(patch)).replace("\n", ""))) + patch = [ + { + "op": 
"replace", + "path": "/spec/template/spec/containers/0/args", + "value": args, + } + ] + local( + "{} patch deployment {} -n {} --type json -p='{}'".format( + kubectl_cmd, name, namespace, str(encode_json(patch)).replace("\n", "") + ) + ) + # Users may define their own Tilt customizations in tilt.d. This directory is excluded from git and these files will # not be checked in to version control. @@ -78,23 +107,37 @@ def include_user_tilt_files(): for f in user_tiltfiles: include(f) -def append_arg_for_container_in_deployment(yaml_stream, name, namespace, contains_image_name, args): + +def append_arg_for_container_in_deployment( + yaml_stream, name, namespace, contains_image_name, args +): for item in yaml_stream: - if item["kind"] == "Deployment" and item.get("metadata").get("name") == name and item.get("metadata").get("namespace") == namespace: + if ( + item["kind"] == "Deployment" + and item.get("metadata").get("name") == name + and item.get("metadata").get("namespace") == namespace + ): containers = item.get("spec").get("template").get("spec").get("containers") for container in containers: if contains_image_name in container.get("image"): container.get("args").extend(args) + def fixup_yaml_empty_arrays(yaml_str): yaml_str = yaml_str.replace("conditions: null", "conditions: []") return yaml_str.replace("storedVersions: null", "storedVersions: []") + def validate_auth(): substitutions = settings.get("kustomize_substitutions", {}) missing = [k for k in keys if k not in substitutions] if missing: - fail("missing kustomize_substitutions keys {} in tilt-settings.json".format(missing)) + fail( + "missing kustomize_substitutions keys {} in tilt-settings.json".format( + missing + ) + ) + tilt_helper_dockerfile_header = """ # Tilt image @@ -118,35 +161,58 @@ COPY --from=tilt-helper /go/bin/dlv . COPY manager . 
""" + # Build CAPG and add feature gates def capg(): # Apply the kustomized yaml for this provider substitutions = settings.get("kustomize_substitutions", {}) os.environ.update(substitutions) - # yaml = str(kustomizesub("./hack/observability")) # build an observable kind deployment by default - yaml = str(kustomizesub("./config/default")) + yaml = str( + kustomizesub("./hack/observability") + ) # build an observable kind deployment by default + # TODO: consider to remove + # yaml = str(kustomizesub("./config/default")) # add extra_args if they are defined if settings.get("extra_args"): gcp_extra_args = settings.get("extra_args").get("gcp") if gcp_extra_args: yaml_dict = decode_yaml_stream(yaml) - append_arg_for_container_in_deployment(yaml_dict, "capg-controller-manager", "capg-system", "cluster-api-gcp-controller", gcp_extra_args) + append_arg_for_container_in_deployment( + yaml_dict, + "capg-controller-manager", + "capg-system", + "cluster-api-gcp-controller", + gcp_extra_args, + ) yaml = str(encode_yaml_stream(yaml_dict)) yaml = fixup_yaml_empty_arrays(yaml) # Set up a local_resource build of the provider's manager binary. 
local_resource( "manager", - cmd = 'mkdir -p .tiltbuild;CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build -ldflags \'-extldflags "-static"\' -o .tiltbuild/manager', - deps = ["api", "cloud", "config", "controllers", "exp", "feature", "pkg", "go.mod", "go.sum", "main.go"], + cmd="mkdir -p .tiltbuild;CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build -ldflags '-extldflags \"-static\"' -o .tiltbuild/manager", + deps=[ + "api", + "cloud", + "config", + "controllers", + "exp", + "feature", + "pkg", + "go.mod", + "go.sum", + "main.go", + ], ) - dockerfile_contents = "\n".join([ - tilt_helper_dockerfile_header, - tilt_dockerfile_header, - ]) + dockerfile_contents = "\n".join( + [ + tilt_helper_dockerfile_header, + tilt_dockerfile_header, + ] + ) entrypoint = ["sh", "/start.sh", "/manager"] extra_args = settings.get("extra_args") @@ -156,45 +222,110 @@ def capg(): # Set up an image build for the provider. The live update configuration syncs the output from the local_resource # build into the container. 
docker_build( - ref = "gcr.io/k8s-staging-cluster-api-gcp/cluster-api-gcp-controller", - context = "./.tiltbuild/", - dockerfile_contents = dockerfile_contents, - target = "tilt", - entrypoint = entrypoint, - only = "manager", - live_update = [ + ref="gcr.io/k8s-staging-cluster-api-gcp/cluster-api-gcp-controller", + context="./.tiltbuild/", + dockerfile_contents=dockerfile_contents, + target="tilt", + entrypoint=entrypoint, + only="manager", + live_update=[ sync(".tiltbuild/manager", "/manager"), run("sh /restart.sh"), ], - ignore = ["templates"], + ignore=["templates"], ) k8s_yaml(blob(yaml)) + +def observability(): + # Install the OpenTelemetry helm chart + gcp_project_id = os.getenv("GCP_PROJECT_ID", "") + + k8s_yaml( + helm( + "./hack/observability/opentelemetry/chart", + name="opentelemetry-collector", + namespace="capg-system", + values=["./hack/observability/opentelemetry/values.yaml"], + # refer https://github.com/helm/helm/issues/1987 + set=[ + "extraEnvs[0].name=GCP_PROJECT_ID", + "extraEnvs[0].value=" + gcp_project_id, + ], + ) + ) + + k8s_yaml( + helm( + "./hack/observability/jaeger/chart", + name="jaeger-all-in-one", + namespace="capg-system", + set=[ + # TODO: consider to remove + # "crd.install=false", + # "rbac.create=false", + "resources.limits.cpu=200m", + "resources.limits.memory=256Mi", + ], + ) + ) + + k8s_resource( + workload="jaeger-all-in-one", + new_name="traces: jaeger-all-in-one", + port_forwards=[ + port_forward(16686, name="View traces", link_path="/search?service=capg") + ], + labels=["observability"], + ) + + k8s_resource(workload="opentelemetry-collector", labels=["observability"]) + + def base64_encode(to_encode): - encode_blob = local("echo '{}' | tr -d '\n' | base64 - | tr -d '\n'".format(to_encode), quiet = True) + encode_blob = local( + "echo '{}' | tr -d '\n' | base64 - | tr -d '\n'".format(to_encode), quiet=True + ) return str(encode_blob) + def base64_encode_file(path_to_encode): - encode_blob = local("cat {} | tr -d '\n' | 
base64 - | tr -d '\n'".format(path_to_encode), quiet = True) + encode_blob = local( + "cat {} | tr -d '\n' | base64 - | tr -d '\n'".format(path_to_encode), quiet=True + ) return str(encode_blob) + def read_file_from_path(path_to_read): - str_blob = local("cat {} | tr -d '\n'".format(path_to_read), quiet = True) + str_blob = local("cat {} | tr -d '\n'".format(path_to_read), quiet=True) return str(str_blob) + def base64_decode(to_decode): - decode_blob = local("echo '{}' | base64 --decode -".format(to_decode), quiet = True) + decode_blob = local("echo '{}' | base64 --decode -".format(to_decode), quiet=True) return str(decode_blob) + def kustomizesub(folder): - yaml = local("hack/kustomize-sub.sh {}".format(folder), quiet = True) + yaml = local("hack/kustomize-sub.sh {}".format(folder), quiet=True) return yaml + def waitforsystem(): - local(kubectl_cmd + " wait --for=condition=ready --timeout=300s pod --all -n capi-kubeadm-bootstrap-system") - local(kubectl_cmd + " wait --for=condition=ready --timeout=300s pod --all -n capi-kubeadm-control-plane-system") - local(kubectl_cmd + " wait --for=condition=ready --timeout=300s pod --all -n capi-system") + local( + kubectl_cmd + + " wait --for=condition=ready --timeout=300s pod --all -n capi-kubeadm-bootstrap-system" + ) + local( + kubectl_cmd + + " wait --for=condition=ready --timeout=300s pod --all -n capi-kubeadm-control-plane-system" + ) + local( + kubectl_cmd + + " wait --for=condition=ready --timeout=300s pod --all -n capi-system" + ) + ############################## # Actual work happens here @@ -213,4 +344,6 @@ deploy_capi() capg() +observability() + waitforsystem() From 0fd7288d56a3e29624db0914bf7805d334a66a58 Mon Sep 17 00:00:00 2001 From: phong-nguyen Date: Mon, 22 May 2023 06:16:37 +0000 Subject: [PATCH 12/15] debug blocking tracing connection --- pkg/otel/tracing.go | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/pkg/otel/tracing.go b/pkg/otel/tracing.go index 367f16603..85336dab4 
100644 --- a/pkg/otel/tracing.go +++ b/pkg/otel/tracing.go @@ -8,7 +8,6 @@ import ( "github.com/pkg/errors" "google.golang.org/grpc" "google.golang.org/grpc/credentials/insecure" - "k8s.io/client-go/pkg/version" "go.opentelemetry.io/otel" "go.opentelemetry.io/otel/attribute" @@ -18,6 +17,7 @@ import ( "go.opentelemetry.io/otel/sdk/resource" "go.opentelemetry.io/otel/sdk/trace" semconv "go.opentelemetry.io/otel/semconv/v1.4.0" + ver "sigs.k8s.io/cluster-api-provider-gcp/version" ) func RegisterTracing(ctx context.Context, samplingRate float64, log logr.Logger) error { @@ -44,10 +44,13 @@ func RegisterTracing(ctx context.Context, samplingRate float64, log logr.Logger) func newExporter(ctx context.Context) (*otlptrace.Exporter, error) { + ctx, cancel := context.WithTimeout(ctx, time.Second) + defer cancel() + conn, err := grpc.DialContext(ctx, "opentelemetry-collector:4317", // Using non-TLS connection for dev environment grpc.WithTransportCredentials(insecure.NewCredentials()), - grpc.WithBlock(), + grpc.WithBlock(), // blocking code ) if err != nil { @@ -73,12 +76,11 @@ func SetUpTracing(ctx context.Context, samplingRate float64) (*trace.TracerProvi } // labels/tags/res common to all traces - // TODO: consider to add more fields resource, err := resource.New(ctx, resource.WithAttributes( semconv.ServiceNameKey.String("capg"), attribute.String("exporter", "otlpgrpc"), - attribute.String("version", version.Get().String()), + attribute.String("version", ver.Get().String()), ), ) From 8a6971d267258320062420f38b1b2248d1bab8f7 Mon Sep 17 00:00:00 2001 From: phong-nguyen Date: Fri, 26 May 2023 18:14:12 +0000 Subject: [PATCH 13/15] helm charts for jaeger + otel collector --- hack/observability/jaeger/chart/.helmignore | 22 + hack/observability/jaeger/chart/Chart.yaml | 16 + hack/observability/jaeger/chart/LICENSE | 21 + hack/observability/jaeger/chart/README.md | 26 + .../jaeger/chart/templates/NOTES.txt | 23 + .../jaeger/chart/templates/_helpers.tpl | 63 ++ 
.../jaeger/chart/templates/ingress.yaml | 64 ++ .../jaeger/chart/templates/jaeger-volume.yaml | 18 + .../chart/templates/service-headless.yaml | 56 ++ .../jaeger/chart/templates/service.yaml | 56 ++ .../chart/templates/serviceaccount.yaml | 11 + .../jaeger/chart/templates/statefulset.yaml | 110 +++ .../templates/tests/test-connection.yaml | 17 + hack/observability/jaeger/chart/values.yaml | 100 +++ .../opentelemetry/chart/.helmignore | 23 + .../opentelemetry/chart/CONTRIBUTING.md | 8 + .../opentelemetry/chart/Chart.yaml | 14 + .../observability/opentelemetry/chart/LICENSE | 201 +++++ .../opentelemetry/chart/README.md | 217 +++++ .../opentelemetry/chart/UPGRADING.md | 289 ++++++ .../opentelemetry/chart/templates/NOTES.txt | 38 + .../opentelemetry/chart/templates/_config.tpl | 329 +++++++ .../chart/templates/_helpers.tpl | 131 +++ .../opentelemetry/chart/templates/_pod.tpl | 194 +++++ .../chart/templates/clusterrole.yaml | 50 ++ .../chart/templates/clusterrolebinding.yaml | 22 + .../chart/templates/configmap-agent.yaml | 11 + .../templates/configmap-statefulset.yaml | 11 + .../chart/templates/configmap.yaml | 11 + .../chart/templates/daemonset.yaml | 44 + .../chart/templates/deployment.yaml | 45 + .../opentelemetry/chart/templates/hpa.yaml | 32 + .../chart/templates/ingress.yaml | 54 ++ .../chart/templates/networkpolicy.yaml | 38 + .../opentelemetry/chart/templates/pdb.yaml | 18 + .../chart/templates/podmonitor.yaml | 18 + .../chart/templates/prometheusrule.yaml | 87 ++ .../chart/templates/service.yaml | 33 + .../chart/templates/serviceaccount.yaml | 14 + .../chart/templates/servicemonitor.yaml | 18 + .../chart/templates/statefulset.yaml | 44 + .../opentelemetry/chart/values.schema.json | 824 ++++++++++++++++++ .../opentelemetry/chart/values.yaml | 486 +++++++++++ .../controller-manager-dev-env-otel-patch.yml | 1 + .../secrets-dev-env-otel-patch.yaml | 5 +- hack/observability/opentelemetry/values.yaml | 9 +- 46 files changed, 3916 insertions(+), 6 deletions(-) create 
mode 100644 hack/observability/jaeger/chart/.helmignore create mode 100644 hack/observability/jaeger/chart/Chart.yaml create mode 100644 hack/observability/jaeger/chart/LICENSE create mode 100644 hack/observability/jaeger/chart/README.md create mode 100644 hack/observability/jaeger/chart/templates/NOTES.txt create mode 100644 hack/observability/jaeger/chart/templates/_helpers.tpl create mode 100644 hack/observability/jaeger/chart/templates/ingress.yaml create mode 100644 hack/observability/jaeger/chart/templates/jaeger-volume.yaml create mode 100644 hack/observability/jaeger/chart/templates/service-headless.yaml create mode 100644 hack/observability/jaeger/chart/templates/service.yaml create mode 100644 hack/observability/jaeger/chart/templates/serviceaccount.yaml create mode 100644 hack/observability/jaeger/chart/templates/statefulset.yaml create mode 100644 hack/observability/jaeger/chart/templates/tests/test-connection.yaml create mode 100644 hack/observability/jaeger/chart/values.yaml create mode 100644 hack/observability/opentelemetry/chart/.helmignore create mode 100644 hack/observability/opentelemetry/chart/CONTRIBUTING.md create mode 100644 hack/observability/opentelemetry/chart/Chart.yaml create mode 100644 hack/observability/opentelemetry/chart/LICENSE create mode 100644 hack/observability/opentelemetry/chart/README.md create mode 100644 hack/observability/opentelemetry/chart/UPGRADING.md create mode 100644 hack/observability/opentelemetry/chart/templates/NOTES.txt create mode 100644 hack/observability/opentelemetry/chart/templates/_config.tpl create mode 100644 hack/observability/opentelemetry/chart/templates/_helpers.tpl create mode 100644 hack/observability/opentelemetry/chart/templates/_pod.tpl create mode 100644 hack/observability/opentelemetry/chart/templates/clusterrole.yaml create mode 100644 hack/observability/opentelemetry/chart/templates/clusterrolebinding.yaml create mode 100644 
hack/observability/opentelemetry/chart/templates/configmap-agent.yaml create mode 100644 hack/observability/opentelemetry/chart/templates/configmap-statefulset.yaml create mode 100644 hack/observability/opentelemetry/chart/templates/configmap.yaml create mode 100644 hack/observability/opentelemetry/chart/templates/daemonset.yaml create mode 100644 hack/observability/opentelemetry/chart/templates/deployment.yaml create mode 100644 hack/observability/opentelemetry/chart/templates/hpa.yaml create mode 100644 hack/observability/opentelemetry/chart/templates/ingress.yaml create mode 100644 hack/observability/opentelemetry/chart/templates/networkpolicy.yaml create mode 100644 hack/observability/opentelemetry/chart/templates/pdb.yaml create mode 100644 hack/observability/opentelemetry/chart/templates/podmonitor.yaml create mode 100644 hack/observability/opentelemetry/chart/templates/prometheusrule.yaml create mode 100644 hack/observability/opentelemetry/chart/templates/service.yaml create mode 100644 hack/observability/opentelemetry/chart/templates/serviceaccount.yaml create mode 100644 hack/observability/opentelemetry/chart/templates/servicemonitor.yaml create mode 100644 hack/observability/opentelemetry/chart/templates/statefulset.yaml create mode 100644 hack/observability/opentelemetry/chart/values.schema.json create mode 100644 hack/observability/opentelemetry/chart/values.yaml diff --git a/hack/observability/jaeger/chart/.helmignore b/hack/observability/jaeger/chart/.helmignore new file mode 100644 index 000000000..50af03172 --- /dev/null +++ b/hack/observability/jaeger/chart/.helmignore @@ -0,0 +1,22 @@ +# Patterns to ignore when building packages. +# This supports shell glob matching, relative path matching, and +# negation (prefixed with !). Only one pattern per line. 
+.DS_Store +# Common VCS dirs +.git/ +.gitignore +.bzr/ +.bzrignore +.hg/ +.hgignore +.svn/ +# Common backup files +*.swp +*.bak +*.tmp +*~ +# Various IDEs +.project +.idea/ +*.tmproj +.vscode/ diff --git a/hack/observability/jaeger/chart/Chart.yaml b/hack/observability/jaeger/chart/Chart.yaml new file mode 100644 index 000000000..a72810422 --- /dev/null +++ b/hack/observability/jaeger/chart/Chart.yaml @@ -0,0 +1,16 @@ +apiVersion: v2 +appVersion: 1.41.0 +description: Jaeger all-in-one helm chart for Kubernetes +home: https://github.com/hansehe/jaeger-all-in-one +icon: https://raw.githubusercontent.com/hansehe/jaeger-all-in-one/master/helm/jaeger.png +keywords: +- jaeger +- jaeger-all-in-one +maintainers: +- email: hans.erik.heggem@gmail.com + name: hansehe +name: jaeger-all-in-one +sources: +- https://github.com/hansehe/jaeger-all-in-one +type: application +version: 0.1.8 diff --git a/hack/observability/jaeger/chart/LICENSE b/hack/observability/jaeger/chart/LICENSE new file mode 100644 index 000000000..2a7cec499 --- /dev/null +++ b/hack/observability/jaeger/chart/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2020 Hans Erik Heggem + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/hack/observability/jaeger/chart/README.md b/hack/observability/jaeger/chart/README.md new file mode 100644 index 000000000..878731bb3 --- /dev/null +++ b/hack/observability/jaeger/chart/README.md @@ -0,0 +1,26 @@ +# Jaeger Tracing - All In One + +## Introduction + +The Jaeger tracing all-in-one service enables jaeger for development purposes, check out: +- https://www.jaegertracing.io/docs/1.18/getting-started/ + +## Installing the Chart + +To install the chart with the release name `jaeger-all-in-one` run: + +```bash +$ helm repo add jaeger-all-in-one https://raw.githubusercontent.com/hansehe/jaeger-all-in-one/master/helm/charts +$ helm install jaeger-all-in-one jaeger-all-in-one/jaeger-all-in-one +``` + +Alternatively, a YAML file that specifies the values for the parameters can be provided while installing the chart. For example, + +```bash +$ helm install jaeger-all-in-one -f values.yaml jaeger-all-in-one/jaeger-all-in-one +``` + +## Configuration + +Find all possible configuration values here: +- https://github.com/hansehe/jaeger-all-in-one/blob/master/helm/jaeger-all-in-one/values.yaml diff --git a/hack/observability/jaeger/chart/templates/NOTES.txt b/hack/observability/jaeger/chart/templates/NOTES.txt new file mode 100644 index 000000000..f05fa1802 --- /dev/null +++ b/hack/observability/jaeger/chart/templates/NOTES.txt @@ -0,0 +1,23 @@ +{{- if .Values.enabled }} +1. Get the application URL by running these commands: +{{- if .Values.ingress.enabled }} +{{- range $host := .Values.ingress.hosts }} + {{- range .paths }} + http{{ if $.Values.ingress.tls }}s{{ end }}://{{ $host.host }}{{ . 
}} + {{- end }} +{{- end }} +{{- else if contains "NodePort" .Values.service.type }} + export NODE_PORT=$(kubectl get --namespace {{ .Release.Namespace }} -o jsonpath="{.spec.ports[0].nodePort}" services {{ include "jaeger-all-in-one.fullname" . }}) + export NODE_IP=$(kubectl get nodes --namespace {{ .Release.Namespace }} -o jsonpath="{.items[0].status.addresses[0].address}") + echo http://$NODE_IP:$NODE_PORT +{{- else if contains "LoadBalancer" .Values.service.type }} + NOTE: It may take a few minutes for the LoadBalancer IP to be available. + You can watch the status of by running 'kubectl get --namespace {{ .Release.Namespace }} svc -w {{ include "jaeger-all-in-one.fullname" . }}' + export SERVICE_IP=$(kubectl get svc --namespace {{ .Release.Namespace }} {{ include "jaeger-all-in-one.fullname" . }} --template "{{"{{ range (index .status.loadBalancer.ingress 0) }}{{.}}{{ end }}"}}") + echo http://$SERVICE_IP:{{ .Values.service.port }} +{{- else if contains "ClusterIP" .Values.service.type }} + export POD_NAME=$(kubectl get pods --namespace {{ .Release.Namespace }} -l "app.kubernetes.io/name={{ include "jaeger-all-in-one.name" . }},app.kubernetes.io/instance={{ .Release.Name }}" -o jsonpath="{.items[0].metadata.name}") + echo "Visit http://127.0.0.1:{{ .Values.service.port }} to use your application" + kubectl --namespace {{ .Release.Namespace }} port-forward $POD_NAME {{ .Values.service.port }}:{{ .Values.service.port }} +{{- end }} +{{- end }} \ No newline at end of file diff --git a/hack/observability/jaeger/chart/templates/_helpers.tpl b/hack/observability/jaeger/chart/templates/_helpers.tpl new file mode 100644 index 000000000..c8f340b9b --- /dev/null +++ b/hack/observability/jaeger/chart/templates/_helpers.tpl @@ -0,0 +1,63 @@ +{{/* vim: set filetype=mustache: */}} +{{/* +Expand the name of the chart. 
+*/}} +{{- define "jaeger-all-in-one.name" -}} +{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" -}} +{{- end -}} + +{{/* +Create a default fully qualified app name. +We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). +If release name contains chart name it will be used as a full name. +*/}} +{{- define "jaeger-all-in-one.fullname" -}} +{{- if .Values.fullnameOverride -}} +{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" -}} +{{- else -}} +{{- $name := default .Chart.Name .Values.nameOverride -}} +{{- if contains $name .Release.Name -}} +{{- .Release.Name | trunc 63 | trimSuffix "-" -}} +{{- else -}} +{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" -}} +{{- end -}} +{{- end -}} +{{- end -}} + +{{/* +Create chart name and version as used by the chart label. +*/}} +{{- define "jaeger-all-in-one.chart" -}} +{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" -}} +{{- end -}} + +{{/* +Common labels +*/}} +{{- define "jaeger-all-in-one.labels" -}} +helm.sh/chart: {{ include "jaeger-all-in-one.chart" . }} +{{ include "jaeger-all-in-one.selectorLabels" . }} +{{- if .Chart.AppVersion }} +app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} +{{- end }} +app.kubernetes.io/managed-by: {{ .Release.Service }} +{{- end -}} + +{{/* +Selector labels +*/}} +{{- define "jaeger-all-in-one.selectorLabels" -}} +app.kubernetes.io/name: {{ include "jaeger-all-in-one.name" . }} +app.kubernetes.io/instance: {{ .Release.Name }} +{{- end -}} + +{{/* +Create the name of the service account to use +*/}} +{{- define "jaeger-all-in-one.serviceAccountName" -}} +{{- if .Values.serviceAccount.create -}} + {{ default (include "jaeger-all-in-one.fullname" .) 
.Values.serviceAccount.name }} +{{- else -}} + {{ default "default" .Values.serviceAccount.name }} +{{- end -}} +{{- end -}} diff --git a/hack/observability/jaeger/chart/templates/ingress.yaml b/hack/observability/jaeger/chart/templates/ingress.yaml new file mode 100644 index 000000000..8998588f5 --- /dev/null +++ b/hack/observability/jaeger/chart/templates/ingress.yaml @@ -0,0 +1,64 @@ +{{- if .Values.enabled }} +{{- if .Values.ingress.enabled -}} +{{- $fullName := include "jaeger-all-in-one.fullname" . -}} +{{- $svcPort := .Values.service.port -}} +{{- if and .Values.ingress.className (not (semverCompare ">=1.18-0" .Capabilities.KubeVersion.GitVersion)) }} + {{- if not (hasKey .Values.ingress.annotations "kubernetes.io/ingress.class") }} + {{- $_ := set .Values.ingress.annotations "kubernetes.io/ingress.class" .Values.ingress.className}} + {{- end }} +{{- end }} +{{- if semverCompare ">=1.19-0" .Capabilities.KubeVersion.GitVersion -}} +apiVersion: networking.k8s.io/v1 +{{- else if semverCompare ">=1.14-0" .Capabilities.KubeVersion.GitVersion -}} +apiVersion: networking.k8s.io/v1beta1 +{{- else -}} +apiVersion: extensions/v1beta1 +{{- end }} +kind: Ingress +metadata: + name: {{ $fullName }} + namespace: {{ .Release.Namespace }} + labels: + {{- include "jaeger-all-in-one.labels" . | nindent 4 }} + {{- with .Values.ingress.annotations }} + annotations: + {{- toYaml . | nindent 4 }} + {{- end }} +spec: + {{- if and .Values.ingress.className (semverCompare ">=1.18-0" .Capabilities.KubeVersion.GitVersion) }} + ingressClassName: {{ .Values.ingress.className }} + {{- end }} + {{- if .Values.ingress.tls }} + tls: + {{- range .Values.ingress.tls }} + - hosts: + {{- range .hosts }} + - {{ . 
| quote }} + {{- end }} + secretName: {{ .secretName }} + {{- end }} + {{- end }} + rules: + {{- range .Values.ingress.hosts }} + - host: {{ .host | quote }} + http: + paths: + {{- range .paths }} + - path: {{ .path }} + {{- if and .pathType (semverCompare ">=1.18-0" $.Capabilities.KubeVersion.GitVersion) }} + pathType: {{ .pathType }} + {{- end }} + backend: + {{- if semverCompare ">=1.19-0" $.Capabilities.KubeVersion.GitVersion }} + service: + name: {{ $fullName }} + port: + number: {{ $svcPort }} + {{- else }} + serviceName: {{ $fullName }} + servicePort: {{ $svcPort }} + {{- end }} + {{- end }} + {{- end }} +{{- end }} +{{- end }} \ No newline at end of file diff --git a/hack/observability/jaeger/chart/templates/jaeger-volume.yaml b/hack/observability/jaeger/chart/templates/jaeger-volume.yaml new file mode 100644 index 000000000..20eedea60 --- /dev/null +++ b/hack/observability/jaeger/chart/templates/jaeger-volume.yaml @@ -0,0 +1,18 @@ +{{- if .Values.enabled }} +{{- if .Values.volume.enabled -}} +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: {{ include "jaeger-all-in-one.fullname" . }} + namespace: {{ .Release.Namespace }} +spec: + {{- if .Values.volume.className }} + storageClassName: {{ .Values.volume.className }} + {{- end }} + accessModes: + - ReadWriteOnce + resources: + requests: + storage: {{ .Values.volume.size }} +{{- end }} +{{- end }} \ No newline at end of file diff --git a/hack/observability/jaeger/chart/templates/service-headless.yaml b/hack/observability/jaeger/chart/templates/service-headless.yaml new file mode 100644 index 000000000..5c10d6fbf --- /dev/null +++ b/hack/observability/jaeger/chart/templates/service-headless.yaml @@ -0,0 +1,56 @@ +{{- if .Values.enabled }} +apiVersion: v1 +kind: Service +metadata: + name: {{ include "jaeger-all-in-one.fullname" . }}-headless + namespace: {{ .Release.Namespace }} + labels: + {{- include "jaeger-all-in-one.labels" . 
| nindent 4 }} + {{- with .Values.service.headless.annotations }} + annotations: + {{- toYaml . | nindent 4 }} + {{- end }} +spec: + clusterIP: None + ports: + - port: 6831 + targetPort: udp-com-thr + protocol: UDP + name: udp-com-thr + - port: 6832 + targetPort: udp-bin-thr + protocol: UDP + name: udp-bin-thr + - port: 5775 + targetPort: udp-bin-thr-o + protocol: UDP + name: udp-bin-thr-o + - port: 5778 + targetPort: http-configs + protocol: TCP + name: http-configs + - port: {{ .Values.service.port }} + targetPort: http-ui + protocol: TCP + name: http-ui + - port: 14250 + targetPort: grpc-proto + protocol: TCP + name: grpc-proto + - port: 14268 + targetPort: http-bin-thr + protocol: TCP + name: http-bin-thr + - port: 14269 + targetPort: http-admin + protocol: TCP + name: http-admin + {{- if .Values.enableHttpZipkinCollector }} + - port: 9411 + targetPort: http-zipkin + protocol: TCP + name: http-zipkin + {{- end }} + selector: + {{- include "jaeger-all-in-one.selectorLabels" . | nindent 4 }} +{{- end }} \ No newline at end of file diff --git a/hack/observability/jaeger/chart/templates/service.yaml b/hack/observability/jaeger/chart/templates/service.yaml new file mode 100644 index 000000000..687a20283 --- /dev/null +++ b/hack/observability/jaeger/chart/templates/service.yaml @@ -0,0 +1,56 @@ +{{- if .Values.enabled }} +apiVersion: v1 +kind: Service +metadata: + name: {{ include "jaeger-all-in-one.fullname" . }} + namespace: {{ .Release.Namespace }} + labels: + {{- include "jaeger-all-in-one.labels" . | nindent 4 }} + {{- with .Values.service.annotations }} + annotations: + {{- toYaml . 
| nindent 4 }} + {{- end }} +spec: + type: {{ .Values.service.type }} + ports: + - port: 6831 + targetPort: udp-com-thr + protocol: UDP + name: udp-com-thr + - port: 6832 + targetPort: udp-bin-thr + protocol: UDP + name: udp-bin-thr + - port: 5775 + targetPort: udp-bin-thr-o + protocol: UDP + name: udp-bin-thr-o + - port: 5778 + targetPort: http-configs + protocol: TCP + name: http-configs + - port: {{ .Values.service.port }} + targetPort: http-ui + protocol: TCP + name: http-ui + - port: 14250 + targetPort: grpc-proto + protocol: TCP + name: grpc-proto + - port: 14268 + targetPort: http-bin-thr + protocol: TCP + name: http-bin-thr + - port: 14269 + targetPort: http-admin + protocol: TCP + name: http-admin + {{- if .Values.enableHttpZipkinCollector }} + - port: 9411 + targetPort: http-zipkin + protocol: TCP + name: http-zipkin + {{- end }} + selector: + {{- include "jaeger-all-in-one.selectorLabels" . | nindent 4 }} +{{- end }} \ No newline at end of file diff --git a/hack/observability/jaeger/chart/templates/serviceaccount.yaml b/hack/observability/jaeger/chart/templates/serviceaccount.yaml new file mode 100644 index 000000000..85ca08c18 --- /dev/null +++ b/hack/observability/jaeger/chart/templates/serviceaccount.yaml @@ -0,0 +1,11 @@ +{{- if .Values.enabled }} +{{- if .Values.serviceAccount.create -}} +apiVersion: v1 +kind: ServiceAccount +metadata: + name: {{ include "jaeger-all-in-one.serviceAccountName" . }} + namespace: {{ .Release.Namespace }} + labels: +{{ include "jaeger-all-in-one.labels" . | nindent 4 }} +{{- end -}} +{{- end }} \ No newline at end of file diff --git a/hack/observability/jaeger/chart/templates/statefulset.yaml b/hack/observability/jaeger/chart/templates/statefulset.yaml new file mode 100644 index 000000000..2458eaad7 --- /dev/null +++ b/hack/observability/jaeger/chart/templates/statefulset.yaml @@ -0,0 +1,110 @@ +{{- if .Values.enabled }} +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: {{ include "jaeger-all-in-one.fullname" . 
}} + namespace: {{ .Release.Namespace }} + labels: + {{- include "jaeger-all-in-one.labels" . | nindent 4 }} +spec: + serviceName: {{ include "jaeger-all-in-one.fullname" . }}-headless + replicas: {{ .Values.replicaCount }} + selector: + matchLabels: + {{- include "jaeger-all-in-one.selectorLabels" . | nindent 6 }} + template: + metadata: + labels: + {{- include "jaeger-all-in-one.selectorLabels" . | nindent 8 }} + {{- with .Values.podAnnotations }} + annotations: + {{- toYaml . | nindent 8 }} + {{- end }} + spec: + terminationGracePeriodSeconds: {{ .Values.terminationGracePeriodSeconds }} + {{- with .Values.imagePullSecrets }} + imagePullSecrets: + {{- toYaml . | nindent 8 }} + {{- end }} + serviceAccountName: {{ include "jaeger-all-in-one.serviceAccountName" . }} + securityContext: + {{- toYaml .Values.podSecurityContext | nindent 8 }} + {{- if .Values.volume.enabled }} + volumes: + - name: jaeger-volume + persistentVolumeClaim: + claimName: {{ include "jaeger-all-in-one.fullname" . }} + {{- end }} + containers: + - name: {{ .Chart.Name }} + securityContext: + {{- toYaml .Values.securityContext | nindent 12 }} + image: "{{ .Values.image.repository }}:{{ .Chart.AppVersion }}" + imagePullPolicy: {{ .Values.image.pullPolicy }} + ports: + - name: udp-com-thr + containerPort: 6831 + protocol: UDP + - name: udp-bin-thr + containerPort: 6832 + protocol: UDP + - name: udp-bin-thr-o + containerPort: 5775 + protocol: UDP + - name: http-configs + containerPort: 5778 + protocol: TCP + - name: http-ui + containerPort: 16686 + protocol: TCP + - name: grpc-proto + containerPort: 14250 + protocol: TCP + - name: http-bin-thr + containerPort: 14268 + protocol: TCP + - name: http-admin + containerPort: 14269 + protocol: TCP + {{- if .Values.enableHttpZipkinCollector }} + - name: http-zipkin + containerPort: 9411 + protocol: TCP + {{- end }} + {{- if .Values.volume.enabled }} + volumeMounts: + - mountPath: "/badger" + name: jaeger-volume + {{- end }} + livenessProbe: + httpGet: + 
path: {{ .Values.healthCheckUrl | quote }} + port: http-admin + readinessProbe: + httpGet: + path: {{ .Values.healthCheckUrl | quote }} + port: http-admin + resources: + {{- toYaml .Values.resources | nindent 12 }} + env: + {{- range $key, $value := .Values.environmentVariables }} + - name: {{ $key }} + value: {{ $value | quote }} + {{- end }} + {{- if .Values.enableHttpZipkinCollector }} + - name: COLLECTOR_ZIPKIN_HOST_PORT + value: "9411" + {{- end }} + {{- with .Values.nodeSelector }} + nodeSelector: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.affinity }} + affinity: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.tolerations }} + tolerations: + {{- toYaml . | nindent 8 }} + {{- end }} +{{- end }} diff --git a/hack/observability/jaeger/chart/templates/tests/test-connection.yaml b/hack/observability/jaeger/chart/templates/tests/test-connection.yaml new file mode 100644 index 000000000..c009bdb60 --- /dev/null +++ b/hack/observability/jaeger/chart/templates/tests/test-connection.yaml @@ -0,0 +1,17 @@ +{{- if .Values.tests.enabled }} +apiVersion: v1 +kind: Pod +metadata: + name: "{{ include "jaeger-all-in-one.fullname" . }}-test-connection" + labels: +{{ include "jaeger-all-in-one.labels" . | nindent 4 }} + annotations: + "helm.sh/hook": test-success +spec: + containers: + - name: wget + image: busybox + command: ['wget'] + args: ['{{ include "jaeger-all-in-one.fullname" . }}:{{ .Values.service.port }}'] + restartPolicy: Never +{{- end }} \ No newline at end of file diff --git a/hack/observability/jaeger/chart/values.yaml b/hack/observability/jaeger/chart/values.yaml new file mode 100644 index 000000000..5026287d3 --- /dev/null +++ b/hack/observability/jaeger/chart/values.yaml @@ -0,0 +1,100 @@ +# Default values for jaeger-all-in-one. +# This is a YAML-formatted file. +# Declare variables to be passed into your templates. 
+ +enabled: true +replicaCount: 1 + +image: + repository: jaegertracing/all-in-one + pullPolicy: IfNotPresent + +healthCheckUrl: / +imagePullSecrets: [] +nameOverride: "" +fullnameOverride: "" +terminationGracePeriodSeconds: 10 +environmentVariables: + MEMORY_MAX_TRACES: 100000 + SPAN_STORAGE_TYPE: badger + BADGER_EPHEMERAL: false + BADGER_DIRECTORY_VALUE: /badger/data + BADGER_DIRECTORY_KEY: /badger/key + +enableHttpZipkinCollector: false + +serviceAccount: + # Specifies whether a service account should be created + create: true + # The name of the service account to use. + # If not set and create is true, a name is generated using the fullname template + name: + +podAnnotations: + prometheus.io/scrape: "true" + prometheus.io/path: "/metrics" + prometheus.io/port: "14269" + +podSecurityContext: {} + # fsGroup: 2000 + +securityContext: {} + # capabilities: + # drop: + # - ALL + # readOnlyRootFilesystem: true + # runAsNonRoot: true + # runAsUser: 1000 + +service: + type: ClusterIP + port: 16686 + annotations: + prometheus.io/probe: "true" + prometheus.io/probe-path: "/" + headless: + annotations: {} + +ingress: + enabled: false + className: "" + annotations: {} + # kubernetes.io/ingress.class: nginx + # cert-manager.io/cluster-issuer: letsencrypt + # nginx.ingress.kubernetes.io/force-ssl-redirect: "true" + # nginx.ingress.kubernetes.io/from-to-www-redirect: "true" + hosts: + - host: jaeger.local + paths: + - path: / + pathType: ImplementationSpecific + tls: [] + # - secretName: jaeger-tls + # hosts: + # - jaeger.local + +resources: {} + # We usually recommend not to specify default resources and to leave this as a conscious + # choice for the user. This also increases chances charts run on environments with little + # resources, such as Minikube. If you do want to specify resources, uncomment the following + # lines, adjust them as necessary, and remove the curly braces after 'resources:'. 
+ # limits: + # cpu: 100m + # memory: 128Mi + # requests: + # cpu: 100m + # memory: 128Mi + +nodeSelector: {} + +tolerations: [] + +affinity: {} + +volume: + enabled: true + className: "" + size: 3Gi + +tests: + enabled: true \ No newline at end of file diff --git a/hack/observability/opentelemetry/chart/.helmignore b/hack/observability/opentelemetry/chart/.helmignore new file mode 100644 index 000000000..0e8a0eb36 --- /dev/null +++ b/hack/observability/opentelemetry/chart/.helmignore @@ -0,0 +1,23 @@ +# Patterns to ignore when building packages. +# This supports shell glob matching, relative path matching, and +# negation (prefixed with !). Only one pattern per line. +.DS_Store +# Common VCS dirs +.git/ +.gitignore +.bzr/ +.bzrignore +.hg/ +.hgignore +.svn/ +# Common backup files +*.swp +*.bak +*.tmp +*.orig +*~ +# Various IDEs +.project +.idea/ +*.tmproj +.vscode/ diff --git a/hack/observability/opentelemetry/chart/CONTRIBUTING.md b/hack/observability/opentelemetry/chart/CONTRIBUTING.md new file mode 100644 index 000000000..e2dd7d2b0 --- /dev/null +++ b/hack/observability/opentelemetry/chart/CONTRIBUTING.md @@ -0,0 +1,8 @@ +# Collector Chart Contributing Guide + +## Bumping Default Collector Version + +1. Increase the minor version of the chart by one and set the patch version to zero. +2. Update the chart's `appVersion` to match the new collector version. This version will be used as the image tag by default. +3. Review the corresponding release notes in [Collector Core](https://github.com/open-telemetry/opentelemetry-collector/releases), [Collector Contrib](https://github.com/open-telemetry/opentelemetry-collector-contrib/releases), and [Collector Releases](https://github.com/open-telemetry/opentelemetry-collector-releases/releases). If any changes affect the helm charts, adjust the helm chart accordingly. +4. Run `make generate-examples`. 
\ No newline at end of file diff --git a/hack/observability/opentelemetry/chart/Chart.yaml b/hack/observability/opentelemetry/chart/Chart.yaml new file mode 100644 index 000000000..c25eb655f --- /dev/null +++ b/hack/observability/opentelemetry/chart/Chart.yaml @@ -0,0 +1,14 @@ +apiVersion: v2 +appVersion: 0.75.0 +description: OpenTelemetry Collector Helm chart for Kubernetes +home: https://opentelemetry.io/ +icon: https://opentelemetry.io/img/logos/opentelemetry-logo-nav.png +maintainers: +- name: dmitryax +- name: TylerHelmuth +name: opentelemetry-collector +sources: +- https://github.com/open-telemetry/opentelemetry-collector +- https://github.com/open-telemetry/opentelemetry-collector-contrib +type: application +version: 0.53.0 diff --git a/hack/observability/opentelemetry/chart/LICENSE b/hack/observability/opentelemetry/chart/LICENSE new file mode 100644 index 000000000..f49a4e16e --- /dev/null +++ b/hack/observability/opentelemetry/chart/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. 
+ + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. \ No newline at end of file diff --git a/hack/observability/opentelemetry/chart/README.md b/hack/observability/opentelemetry/chart/README.md new file mode 100644 index 000000000..a15197fde --- /dev/null +++ b/hack/observability/opentelemetry/chart/README.md @@ -0,0 +1,217 @@ +# OpenTelemetry Collector Helm Chart + +The helm chart installs [OpenTelemetry Collector](https://github.com/open-telemetry/opentelemetry-collector) +in kubernetes cluster. + +## Prerequisites + +- Kubernetes 1.23+ +- Helm 3.9+ + +## Installing the Chart + +Add OpenTelemetry Helm repository: + +```console +helm repo add open-telemetry https://open-telemetry.github.io/opentelemetry-helm-charts +``` + +To install the chart with the release name my-opentelemetry-collector, run the following command: + +```console +helm install my-opentelemetry-collector open-telemetry/opentelemetry-collector +``` + +## Upgrading + +See [UPGRADING.md](UPGRADING.md). + +## Security Considerations + +OpenTelemetry Collector recommends to bind receivers' servers to addresses that limit connections to authorized users. +For this reason, by default the chart binds all the Collector's endpoints to the pod's IP. 
More info is available in the [Security Best Practices documentation]
It has the following requirements: + +- It needs agent collector to be deployed. +- It requires the [contrib](https://github.com/open-telemetry/opentelemetry-collector-contrib) version +of the collector image. + +To enable this feature, set the `presets.logsCollection.enabled` property to `true`. +Here is an example `values.yaml`: + +```yaml +mode: daemonset + +presets: + logsCollection: + enabled: true + includeCollectorLogs: true +``` + +The way this feature works is it adds a `filelog` receiver on the `logs` pipeline. This receiver is preconfigured +to read the files where Kubernetes container runtime writes all containers' console output to. + +#### :warning: Warning: Risk of looping the exported logs back into the receiver, causing "log explosion" + +The container logs pipeline uses the `logging` console exporter by default. +Paired with the default `filelog` receiver that receives all containers' console output, +it is easy to accidentally feed the exported logs back into the receiver. + +Also note that using the `--log-level=debug` option for the `logging` exporter causes it to output +multiple lines per single received log, which when looped, would amplify the logs exponentially. + +To prevent the looping, the default configuration of the receiver excludes logs from the collector's containers. + +If you want to include the collector's logs, make sure to replace the `logging` exporter +with an exporter that does not send logs to collector's standard output. + +Here's an example `values.yaml` file that replaces the default `logging` exporter on the `logs` pipeline +with an `otlphttp` exporter that sends the container logs to `https://example.com:55681` endpoint. +It also clears the `filelog` receiver's `exclude` property, for collector logs to be included in the pipeline. 
+ +```yaml +mode: daemonset + +presets: + logsCollection: + enabled: true + includeCollectorLogs: true + +config: + exporters: + otlphttp: + endpoint: https://example.com:55681 + service: + pipelines: + logs: + exporters: + - otlphttp +``` + +### Configuration for Kubernetes attributes processor + +The collector can be configured to add Kubernetes metadata to logs, metrics and traces. + +This feature is disabled by default. It has the following requirements: + +- It requires [k8sattributesprocessor](https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/processor/k8sattributesprocessor) processor to be included in the collector, such as [contrib](https://github.com/open-telemetry/opentelemetry-collector-contrib) version of the collector image. + +To enable this feature, set the `presets.kubernetesAttributes.enabled` property to `true`. +Here is an example `values.yaml`: + +```yaml +mode: daemonset +presets: + kubernetesAttributes: + enabled: true +``` + +### Configuration for Kubernetes Cluster Metrics + +The collector can be configured to collects cluster-level metrics from the Kubernetes API server. A single instance of this receiver can be used to monitor a cluster. + +This feature is disabled by default. It has the following requirements: + +- It requires [k8sclusterreceiver](https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/receiver/k8sclusterreceiver) to be included in the collector, such as [contrib](https://github.com/open-telemetry/opentelemetry-collector-contrib) version of the collector image. +- It requires statefulset or deployment mode with a signle replica. + +To enable this feature, set the `presets.clusterMetrics.enabled` property to `true`. + +Here is an example `values.yaml`: + +```yaml +mode: deployment +replicaCount: 1 +presets: + clusterMetrics: + enabled: true +``` + +### Configuration for retrieving Kubelet metrics + +The collector can be configured to collect Kubelet metrics. 
+ +This feature is disabled by default. It has the following requirements: + +- It requires [kubeletstats](https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/receiver/kubeletstatsreceiver) receiver to be included in the collector, such as [contrib](https://github.com/open-telemetry/opentelemetry-collector-contrib) version of the collector image. + +To enable this feature, set the `presets.kubeletMetrics.enabled` property to `true`. +Here is an example `values.yaml`: + +```yaml +mode: daemonset +presets: + kubeletMetrics: + enabled: true +``` + +### CRDs + +At this time, Prometheus CRDs are supported but other CRDs are not. + +### Other configuration options + +The [values.yaml](./values.yaml) file contains information about all other configuration +options for this chart. + +For more examples see [Examples](examples). diff --git a/hack/observability/opentelemetry/chart/UPGRADING.md b/hack/observability/opentelemetry/chart/UPGRADING.md new file mode 100644 index 000000000..f182e811f --- /dev/null +++ b/hack/observability/opentelemetry/chart/UPGRADING.md @@ -0,0 +1,289 @@ +# Upgrade guidelines + +## 0.46.0 to 0.47.0 + +[Update Collector Endpoints to use Pod IP Instead of 0.0.0.0](https://github.com/open-telemetry/opentelemetry-helm-charts/pull/603) + +The [Collector's security guidelines were updated](https://github.com/open-telemetry/opentelemetry-collector/pull/6959) to include containerized environments when discussing safeguards against denial of service attacks. +To be in compliance with the Collector's security best practices the chart has been updated to use the Collector's pod IP in place of `0.0.0.0`. + +The chart will continue to allow complete configuration of the Collector via the `config` field in the values.yaml. If pod IP does not suite your needs you can use `config` to set something different. 
See [Security Best Practices documentation]
To escape existing instances of `{{ }}`, use ``` {{` `}} ```. For example, `{{ REDACTED_EMAIL }}` becomes ``` {{` {{ REDACTED_EMAIL }} `}} ```. + +## 0.28.0 to 0.29.0 + +[Reduce requested resources](https://github.com/open-telemetry/opentelemetry-helm-charts/pull/273) + +Resource `limits` have been reduced. Upgrades/installs of chart 0.29.0 will now use fewer resources. In order to set the resources back to what they were, you will need to override the `resources` section in the `values.yaml`. + +*Example*: + +```yaml +resources: + limits: + cpu: 1 + memory: 2Gi +``` + +## 0.23.1 to 0.24.0 + +[Remove containerLogs in favor of presets.logsCollection]() + +The ability to enable logs collection from the collector has been moved from `containerLogs.enabled` to `presets.logsCollection.enabled`. If you are currently using `containerLogs.enabled`, you should instead use the preset: + +```yaml +presets: + logsCollection: + enabled: true +``` + +If you are using `containerLogs.enabled` and also enabling collection of the collector logs you can use `includeCollectorLogs` + +```yaml +presets: + logsCollection: + enabled: true + includeCollectorLogs: true +``` + +You no longer need to update `config.service.pipelines.logs` to include the filelog receiver yourself as the preset will automatically update the logs pipeline to include the filelog receiver. + +The filelog's preset configuration can modified by `config.receivers`, but preset configuration cannot be removed. If you need to remove any filelog receiver configuration generated by the preset you should not use the preset. Instead, configure the filelog receiver manually in `config.receivers` and set any other necessary fields in the values.yaml to modify k8s as needed. + +See the [daemonset-collector-logs example](https://github.com/open-telemetry/opentelemetry-helm-charts/tree/main/charts/opentelemetry-collector/examples/daemonset-collector-logs) to see an example of the preset in action. 
+ +## 0.18.0 to 0.19.0 + +[Remove agentCollector and standaloneCollector settings](https://github.com/open-telemetry/opentelemetry-helm-charts/pull/216) + +The `agentCollector` and `standaloneCollector` config sections have been removed. Upgrades/installs of chart 0.19.0 will fail if `agentCollector` or `standaloneCollector` are in the values.yaml. See the [Migrate to mode](#migrate-to-mode) steps for instructions on how to replace `agentCollector` and `standaloneCollector` with `mode`. + +## 0.13.0 to 0.14.0 + +[Remove two-deployment mode](https://github.com/open-telemetry/opentelemetry-helm-charts/pull/159) + +The ability to install both the agent and standalone collectors simultaneously with the chart has been removed. Installs/upgrades where both `.Values.agentCollector.enabled` and `.Values.standaloneCollector.enabled` are true will fail. `agentCollector` and `standaloneCollector` have also been deprecated, but backward compatibility has been maintained. + +### To run both a deployment and daemonset + +Install a deployment version of the collector. This is done by setting `.Values.mode` to `deployment` + +```yaml +mode: deployment +``` + +Next, install a daemonset version of the collector that is configured to send traffic to the previously installed deployment. This is done by setting `.Values.mode` to `daemonset` and updating `.Values.config` so that data is exported to the deployment. + +```yaml +mode: daemonset + +config: +  exporters: +    otlp: +      endpoint: example-opentelemetry-collector:4317 +      tls: +        insecure: true +  service: +    pipelines: +      logs: +        exporters: +          - otlp +          - logging +      metrics: +        exporters: +          - otlp +          - logging +      traces: +        exporters: +          - otlp +          - logging +``` + +See the [daemonset-and-deployment](examples/daemonset-and-deployment) example to see the rendered config. + +### Migrate to `mode`: + +The `agentCollector` and `standaloneCollector` sections in values.yaml have been deprecated. 
Instead there is a new field, `mode`, that determines if the collector is being installed as a daemonset or deployment. + +```yaml +# Valid values are "daemonset" and "deployment". +# If set, agentCollector and standaloneCollector are ignored. +mode: +``` + +The following fields have also been added to the root-level to replace the deprecated `agentCollector` and `standaloneCollector` settings. + +```yaml +containerLogs: +  enabled: false + +resources: +  limits: +    cpu: 1 +    memory: 2Gi + +podAnnotations: {} + +podLabels: {} + +# Host networking requested for this pod. Use the host's network namespace. +hostNetwork: false + +# only used with deployment mode +replicaCount: 1 + +annotations: {} +``` + +When using `mode`, these settings should be used instead of their counterparts in `agentCollector` and `standaloneCollector`. + +Set `mode` to `daemonset` if `agentCollector` was being used. Move all `agentCollector` settings to the corresponding root-level setting. If `agentCollector.configOverride` was being used, merge the settings with `.Values.config`. + +Example agentCollector values.yaml: + +```yaml +agentCollector: +  resources: +    limits: +      cpu: 3 +      memory: 6Gi +  configOverride: +    receivers: +      hostmetrics: +        scrapers: +          cpu: +          disk: +          filesystem: +    service: +      pipelines: +        metrics: +          receivers: [otlp, prometheus, hostmetrics] +``` + +Example mode values.yaml: + +```yaml +mode: daemonset + +resources: +  limits: +    cpu: 3 +    memory: 6Gi + +config: +  receivers: +    hostmetrics: +      scrapers: +        cpu: +        disk: +        filesystem: +  service: +    pipelines: +      metrics: +        receivers: [otlp, prometheus, hostmetrics] +``` + +Set `mode` to `deployment` if `standaloneCollector` was being used. Move all `standaloneCollector` settings to the corresponding root-level setting. If `standaloneCollector.configOverride` was being used, merge the settings with `.Values.config`. 
+ +Example standaloneCollector values.yaml: + +```yaml +standaloneCollector: +  enabled: true +  replicaCount: 2 +  configOverride: +    receivers: +      podman_stats: +        endpoint: unix://run/podman/podman.sock +        timeout: 10s +        collection_interval: 10s +    service: +      pipelines: +        metrics: +          receivers: [otlp, prometheus, podman_stats] +``` + +Example mode values.yaml: + +```yaml +mode: deployment + +replicaCount: 2 + +config: +  receivers: +    receivers: +      podman_stats: +        endpoint: unix://run/podman/podman.sock +        timeout: 10s +        collection_interval: 10s +  service: +    pipelines: +      metrics: +        receivers: [otlp, prometheus, podman_stats] +``` + +Default configuration in `.Values.config` can now be removed with `null`. When changing a pipeline, you must explicitly list all the components that are in the pipeline, including any default components. + +*Example*: Disable metrics and logging pipelines and non-otlp receivers: + +```yaml +config: +  receivers: +    jaeger: null +    prometheus: null +    zipkin: null +  service: +    pipelines: +      traces: +        receivers: +          - otlp +      metrics: null +      logs: null +``` diff --git a/hack/observability/opentelemetry/chart/templates/NOTES.txt b/hack/observability/opentelemetry/chart/templates/NOTES.txt new file mode 100644 index 000000000..1bebd0e97 --- /dev/null +++ b/hack/observability/opentelemetry/chart/templates/NOTES.txt @@ -0,0 +1,38 @@ +{{- if not (eq (toString .Values.extraConfigMapMounts) "") }} +[WARNING] "extraConfigMapMounts" parameter is deprecated, please use "extraVolumes" or "extraVolumeMounts" instead. +{{ end }} + +{{- if not (eq (toString .Values.extraHostPathMounts) "") }} +[WARNING] "extraHostPathMounts" parameter is deprecated, please use "extraVolumes" or "extraVolumeMounts" instead. +{{ end }} + +{{- if not (eq (toString .Values.secretMounts) "") }} +[WARNING] "secretMounts" parameter is deprecated, please use "extraVolumes" or "extraVolumeMounts" instead. 
+{{ end }} + +{{- if and (not (eq .Values.mode "daemonset")) (not (eq .Values.mode "deployment")) (not (eq .Values.mode "statefulset")) }} +{{ fail "[ERROR] 'mode' must be set. See https://github.com/open-telemetry/opentelemetry-helm-charts/blob/main/charts/opentelemetry-collector/UPGRADING.md for instructions." }} +{{ end }} + +{{- if not .Values.configMap.create }} +[WARNING] "configMap" will not be created and "config" will not take effect. +{{ end }} + +{{- if not (eq (toString .Values.containerLogs) "") }} +[WARNING] 'containerLogs' is deprecated. Use 'presets.logsCollection' instead. See https://github.com/open-telemetry/opentelemetry-helm-charts/blob/main/charts/opentelemetry-collector/UPGRADING.md#0231-to-0240 for instructions on how to migrate. +{{ end }} + +[INFO] as of chart version 0.47.0 the default collector configuration has been updated to use pod IP instead of 0.0.0.0 for its endpoints. See https://github.com/open-telemetry/opentelemetry-helm-charts/blob/main/charts/opentelemetry-collector/UPGRADING.md#0460-to-0470 for details. + +{{- if .Values.presets.clusterMetrics.enabled }} +{{- if eq .Values.mode "daemonset"}} +{{- fail "Cluster Metrics preset is not suitable for daemonset mode. Please use statefulset or deployment mode with replicaCount: 1"}} +{{ end }} +{{- if gt (int .Values.replicaCount) 1 }} +{{- fail "Cluster Metrics preset is not suitable for replicaCount greater than one. Please change replica count to one." }} +{{ end }} +{{ end }} + +{{- if .Values.presets.kubernetesEvents.enabled }} +[WARNING] The 'k8sobjects' is an ALPHA receiver and may be changed anytime. 
+{{ end }} \ No newline at end of file diff --git a/hack/observability/opentelemetry/chart/templates/_config.tpl b/hack/observability/opentelemetry/chart/templates/_config.tpl new file mode 100644 index 000000000..ac805a84f --- /dev/null +++ b/hack/observability/opentelemetry/chart/templates/_config.tpl @@ -0,0 +1,329 @@ +{{/* +Default memory limiter configuration for OpenTelemetry Collector based on k8s resource limits. +*/}} +{{- define "opentelemetry-collector.memoryLimiter" -}} +# check_interval is the time between measurements of memory usage. +check_interval: 5s + +# By default limit_mib is set to 80% of ".Values.resources.limits.memory" +limit_percentage: 80 + +# By default spike_limit_mib is set to 25% of ".Values.resources.limits.memory" +spike_limit_percentage: 25 +{{- end }} + +{{/* +Merge user supplied config into memory limiter config. +*/}} +{{- define "opentelemetry-collector.baseConfig" -}} +{{- $processorsConfig := get .Values.config "processors" }} +{{- if not $processorsConfig.memory_limiter }} +{{- $_ := set $processorsConfig "memory_limiter" (include "opentelemetry-collector.memoryLimiter" . | fromYaml) }} +{{- end }} +{{- $memoryBallastConfig := get .Values.config.extensions "memory_ballast" }} +{{- if or (not $memoryBallastConfig) (not $memoryBallastConfig.size_in_percentage) }} +{{- $_ := set $memoryBallastConfig "size_in_percentage" 40 }} +{{- end }} +{{- .Values.config | toYaml }} +{{- end }} + +{{/* +Build config file for daemonset OpenTelemetry Collector +*/}} +{{- define "opentelemetry-collector.daemonsetConfig" -}} +{{- $values := deepCopy .Values }} +{{- $data := dict "Values" $values | mustMergeOverwrite (deepCopy .) }} +{{- $config := include "opentelemetry-collector.baseConfig" $data | fromYaml }} +{{- if eq (include "opentelemetry-collector.logsCollectionEnabled" .) 
"true" }} +{{- $config = (include "opentelemetry-collector.applyLogsCollectionConfig" (dict "Values" $data "config" $config) | fromYaml) }} +{{- end }} +{{- if .Values.presets.hostMetrics.enabled }} +{{- $config = (include "opentelemetry-collector.applyHostMetricsConfig" (dict "Values" $data "config" $config) | fromYaml) }} +{{- end }} +{{- if .Values.presets.kubeletMetrics.enabled }} +{{- $config = (include "opentelemetry-collector.applyKubeletMetricsConfig" (dict "Values" $data "config" $config) | fromYaml) }} +{{- end }} +{{- if .Values.presets.kubernetesAttributes.enabled }} +{{- $config = (include "opentelemetry-collector.applyKubernetesAttributesConfig" (dict "Values" $data "config" $config) | fromYaml) }} +{{- end }} +{{- if .Values.presets.clusterMetrics.enabled }} +{{- $config = (include "opentelemetry-collector.applyClusterMetricsConfig" (dict "Values" $data "config" $config) | fromYaml) }} +{{- end }} +{{- tpl (toYaml $config) . }} +{{- end }} + +{{/* +Build config file for deployment OpenTelemetry Collector +*/}} +{{- define "opentelemetry-collector.deploymentConfig" -}} +{{- $values := deepCopy .Values }} +{{- $data := dict "Values" $values | mustMergeOverwrite (deepCopy .) }} +{{- $config := include "opentelemetry-collector.baseConfig" $data | fromYaml }} +{{- if eq (include "opentelemetry-collector.logsCollectionEnabled" .) 
"true" }} +{{- $config = (include "opentelemetry-collector.applyLogsCollectionConfig" (dict "Values" $data "config" $config) | fromYaml) }} +{{- end }} +{{- if .Values.presets.hostMetrics.enabled }} +{{- $config = (include "opentelemetry-collector.applyHostMetricsConfig" (dict "Values" $data "config" $config) | fromYaml) }} +{{- end }} +{{- if .Values.presets.kubeletMetrics.enabled }} +{{- $config = (include "opentelemetry-collector.applyKubeletMetricsConfig" (dict "Values" $data "config" $config) | fromYaml) }} +{{- end }} +{{- if .Values.presets.kubernetesAttributes.enabled }} +{{- $config = (include "opentelemetry-collector.applyKubernetesAttributesConfig" (dict "Values" $data "config" $config) | fromYaml) }} +{{- end }} +{{- if .Values.presets.kubernetesEvents.enabled }} +{{- $config = (include "opentelemetry-collector.applyKubernetesEventsConfig" (dict "Values" $data "config" $config) | fromYaml) }} +{{- end }} +{{- if .Values.presets.clusterMetrics.enabled }} +{{- $config = (include "opentelemetry-collector.applyClusterMetricsConfig" (dict "Values" $data "config" $config) | fromYaml) }} +{{- end }} +{{- tpl (toYaml $config) . 
}} +{{- end }} + +{{- define "opentelemetry-collector.applyHostMetricsConfig" -}} +{{- $config := mustMergeOverwrite (include "opentelemetry-collector.hostMetricsConfig" .Values | fromYaml) .config }} +{{- $_ := set $config.service.pipelines.metrics "receivers" (append $config.service.pipelines.metrics.receivers "hostmetrics" | uniq) }} +{{- $config | toYaml }} +{{- end }} + +{{- define "opentelemetry-collector.hostMetricsConfig" -}} +receivers: + hostmetrics: + root_path: /hostfs + collection_interval: 10s + scrapers: + cpu: + load: + memory: + disk: + filesystem: + exclude_mount_points: + mount_points: + - /dev/* + - /proc/* + - /sys/* + - /run/k3s/containerd/* + - /var/lib/docker/* + - /var/lib/kubelet/* + - /snap/* + match_type: regexp + exclude_fs_types: + fs_types: + - autofs + - binfmt_misc + - bpf + - cgroup2 + - configfs + - debugfs + - devpts + - devtmpfs + - fusectl + - hugetlbfs + - iso9660 + - mqueue + - nsfs + - overlay + - proc + - procfs + - pstore + - rpc_pipefs + - securityfs + - selinuxfs + - squashfs + - sysfs + - tracefs + match_type: strict + network: +{{- end }} + +{{- define "opentelemetry-collector.applyClusterMetricsConfig" -}} +{{- $config := mustMergeOverwrite (include "opentelemetry-collector.clusterMetricsConfig" .Values | fromYaml) .config }} +{{- $_ := set $config.service.pipelines.metrics "receivers" (append $config.service.pipelines.metrics.receivers "k8s_cluster" | uniq) }} +{{- $config | toYaml }} +{{- end }} + +{{- define "opentelemetry-collector.clusterMetricsConfig" -}} +receivers: + k8s_cluster: + collection_interval: 10s +{{- end }} + +{{- define "opentelemetry-collector.applyKubeletMetricsConfig" -}} +{{- $config := mustMergeOverwrite (include "opentelemetry-collector.kubeletMetricsConfig" .Values | fromYaml) .config }} +{{- $_ := set $config.service.pipelines.metrics "receivers" (append $config.service.pipelines.metrics.receivers "kubeletstats" | uniq) }} +{{- $config | toYaml }} +{{- end }} + +{{- define 
"opentelemetry-collector.kubeletMetricsConfig" -}} +receivers: + kubeletstats: + collection_interval: 20s + auth_type: "serviceAccount" + endpoint: "${K8S_NODE_NAME}:10250" +{{- end }} + +{{- define "opentelemetry-collector.applyLogsCollectionConfig" -}} +{{- $config := mustMergeOverwrite (include "opentelemetry-collector.logsCollectionConfig" .Values | fromYaml) .config }} +{{- $_ := set $config.service.pipelines.logs "receivers" (append $config.service.pipelines.logs.receivers "filelog" | uniq) }} +{{- if .Values.Values.presets.logsCollection.storeCheckpoints}} +{{- $_ := set $config.service "extensions" (append $config.service.extensions "file_storage" | uniq) }} +{{- end }} +{{- $config | toYaml }} +{{- end }} + +{{- define "opentelemetry-collector.logsCollectionConfig" -}} +{{- if .Values.presets.logsCollection.storeCheckpoints }} +extensions: + file_storage: + directory: /var/lib/otelcol +{{- end }} +receivers: + filelog: + include: [ /var/log/pods/*/*/*.log ] + {{- if .Values.presets.logsCollection.includeCollectorLogs }} + exclude: [] + {{- else }} + # Exclude collector container's logs. The file format is /var/log/pods/__//.log + exclude: [ /var/log/pods/{{ .Release.Namespace }}_{{ include "opentelemetry-collector.fullname" . }}*_*/{{ include "opentelemetry-collector.lowercase_chartname" . }}/*.log ] + {{- end }} + start_at: beginning + {{- if .Values.presets.logsCollection.storeCheckpoints}} + storage: file_storage + {{- end }} + include_file_path: true + include_file_name: false + operators: + # Find out which format is used by kubernetes + - type: router + id: get-format + routes: + - output: parser-docker + expr: 'body matches "^\\{"' + - output: parser-crio + expr: 'body matches "^[^ Z]+ "' + - output: parser-containerd + expr: 'body matches "^[^ Z]+Z"' + # Parse CRI-O format + - type: regex_parser + id: parser-crio + regex: '^(?P