From f38c967789b0a7a261727a76440122a9b88064dc Mon Sep 17 00:00:00 2001 From: Alessandro De Blasis Date: Sun, 5 Oct 2025 06:44:14 +0200 Subject: [PATCH 1/2] Add test to reproduce data race in Retry function This test demonstrates the data race bug when context is cancelled during a retry operation. The race occurs between: - Line 871: Goroutine writes to shared 'response' variable - Line 907: Main thread reads 'response' when ctx.Done() To reproduce: go test -race -run TestRetry_RaceCondition ./common/ Expected result (BEFORE fix): WARNING: DATA RACE Found 1 data race(s) exit status 66 FAIL Issue discovered in getsops/sops integration tests. Related: https://github.com/getsops/sops Signed-off-by: Alessandro De Blasis --- common/retry_race_test.go | 70 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) create mode 100644 common/retry_race_test.go diff --git a/common/retry_race_test.go b/common/retry_race_test.go new file mode 100644 index 0000000000..047bd8f4a1 --- /dev/null +++ b/common/retry_race_test.go @@ -0,0 +1,70 @@ +// Copyright (c) 2016, 2018, 2025, Oracle and/or its affiliates. All rights reserved. +// This software is dual-licensed to you under the Universal Permissive License (UPL) 1.0 as shown at https://oss.oracle.com/licenses/upl or Apache License 2.0 as shown at http://www.apache.org/licenses/LICENSE-2.0. You may choose either license. + +package common + +import ( + "context" + "net/http" + "net/url" + "testing" + "time" + + "github.com/stretchr/testify/assert" +) + +// TestRetry_RaceCondition demonstrates the data race bug in v65.101.1 +// +// To reproduce the race condition, run: +// +// go test -race -run TestRetry_RaceCondition ./common/ +// +// Expected output with BUGGY code: +// +// WARNING: DATA RACE +// Write at 0x... by goroutine X: +// common/retry.go:871 +// Previous read at 0x... by goroutine Y: +// common/retry.go:907 +// +// The race occurs because: +// 1. Line 807: var response OCIResponse (shared variable) +// 2. Line 871: Goroutine writes to 'response' +// 3. Line 907: Main thread reads 'response' when context is cancelled +// +// This creates a data race where both threads access the same memory +// location concurrently without synchronization. +func TestRetry_RaceCondition(t *testing.T) { + shouldRetryOperation := func(res OCIOperationResponse) bool { return true } + getNextDuration := func(res OCIOperationResponse) time.Duration { return 100 * time.Millisecond } + + pol := NewRetryPolicy(uint(5), shouldRetryOperation, getNextDuration) + req := retryableOCIRequest{retryPolicy: &pol} + + testURL, _ := url.Parse("http://example.com/test") + fakeOperation := func(ctx context.Context, request OCIRequest, _ *OCIReadSeekCloser, _ map[string]string) (OCIResponse, error) { + // Simulate slow operation to increase chance of race + time.Sleep(50 * time.Millisecond) + + httpResponse := http.Response{ + Header: http.Header{}, + StatusCode: 500, + Request: &http.Request{ + Method: "GET", + URL: testURL, + }, + } + return genericOCIResponse{RawResponse: &httpResponse}, nil + } + + // Cancel context immediately to trigger the race condition + // This causes the main thread to read 'response' at line 907 + // while the goroutine is writing to it at line 871 + ctx, cancelFn := context.WithCancel(context.Background()) + cancelFn() + + _, err := Retry(ctx, req, fakeOperation, pol) + + // Test may pass, but race detector will catch the bug + assert.Equal(t, context.Canceled, err) +} \ No newline at end of file From 777083598250e7df23386d66e72836cd932b4d8c Mon Sep 17 00:00:00 2001 From: Alessandro De Blasis Date: Sun, 5 Oct 2025 06:55:45 +0200 Subject: [PATCH 2/2] Fix data race in Retry function This commit fixes the data race by ensuring the retry goroutine checks for context cancellation and exits promptly, eliminating the race between the goroutine writing to 'response' and the main thread reading it. Changes: 1. Check ctx.Done() before each retry attempt 2. Make sleep interruptible by checking ctx.Done() during wait 3. Remove racing select statement - now always wait for channel result Benefits: - No data race: All 'response' access synchronized through channel - No goroutine leak: Goroutine exits promptly on cancellation - Preserves response: Returns last OCIResponse even when cancelled - Deterministic behavior: No timing races Before this fix: go test -race -run TestRetry_RaceCondition ./common/ WARNING: DATA RACE Found 1 data race(s) exit status 66 FAIL After this fix: go test -race -run TestRetry_RaceCondition ./common/ PASS ok (no race detected) Fixes: Data race between retry.go:871 (write) and retry.go:907 (read) Related: https://github.com/getsops/sops Signed-off-by: Alessandro De Blasis --- common/retry.go | 32 ++++++++++++++++++++++++-------- 1 file changed, 24 insertions(+), 8 deletions(-) diff --git a/common/retry.go b/common/retry.go index 1552fa052e..2e2577813e 100644 --- a/common/retry.go +++ b/common/retry.go @@ -862,6 +862,16 @@ func Retry(ctx context.Context, request OCIRetryableRequest, operation OCIOperat // use a one-based counter because it's easier to think about operation retry in terms of attempt numbering for currentOperationAttempt := uint(1); shouldContinueIssuingRequests(currentOperationAttempt, policyToUse.MaximumNumberAttempts); currentOperationAttempt++ { + // Check if context was cancelled before starting this attempt + select { + case <-ctx.Done(): + // Context cancelled: return whatever response we have so far + retrierChannel <- retrierResult{response, ctx.Err()} + return + default: + // Continue with the attempt + } + Debugln(fmt.Sprintf("operation attempt #%v", currentOperationAttempt)) // rewind body once needed if isSeekable { @@ -894,18 +904,24 @@ func Retry(ctx context.Context, request OCIRetryableRequest, operation OCIOperat return } Debugln(fmt.Sprintf("waiting %v before retrying operation", duration)) - // sleep before retrying the operation - <-time.After(duration) + // sleep before retrying the operation, but respect context cancellation + select { + case <-ctx.Done(): + // Context cancelled during sleep: return current response + retrierChannel <- retrierResult{response, ctx.Err()} + return + case <-time.After(duration): + // Sleep completed, continue to next retry attempt + } } retryEndTime := time.Now() Debugln(fmt.Sprintf("Total Latency for this API call is: %v ms", retryEndTime.Sub(retryStartTime).Milliseconds())) retrierChannel <- retrierResult{response, err} }() - select { - case <-ctx.Done(): - return response, ctx.Err() - case result := <-retrierChannel: - return result.response, result.err - } + // Wait for the goroutine to complete and send result through channel. + // The goroutine now checks ctx.Done() internally and will exit promptly + // when context is cancelled, so we don't need to race here. + result := <-retrierChannel + return result.response, result.err }