Commit f3ed057

sql/bulksst: coordinate distributed SST metadata merge with CombineFileInfo
Implements CombineFileInfo(), a coordinator utility that aggregates SST file metadata from distributed workers and uses their sampled row keys to split the schema spans into merge task spans. This will be used by the new distributed merge pipeline.

Resolves: #156662
Epic: CRDB-48845
Release note: none
Co-authored-by: @jeffswenson
1 parent 5b5000c commit f3ed057
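To picture where the new utility sits, here is a hypothetical coordinator-side sketch (not part of this commit). The package name, the planMerge helper, the mergeTask type, and the overlap-based SST-to-task assignment are assumptions made for illustration; only CombineFileInfo, bulksst.SSTFiles, roachpb.Span, and execinfrapb.BulkMergeSpec_SST come from the diff below.

// Package bulksstexample is a hypothetical illustration, not part of this commit.
package bulksstexample

import (
    "github.com/cockroachdb/cockroach/pkg/roachpb"
    "github.com/cockroachdb/cockroach/pkg/sql/bulksst"
    "github.com/cockroachdb/cockroach/pkg/sql/execinfrapb"
)

// mergeTask pairs one merge task span with the SSTs whose key ranges overlap
// it. The type and the overlap-based assignment below are assumptions for
// illustration; the real distributed merge pipeline is not shown in this commit.
type mergeTask struct {
    Span roachpb.Span
    SSTs []execinfrapb.BulkMergeSpec_SST
}

// planMerge combines the SST metadata reported by the workers and splits the
// schema spans into merge task spans using the workers' row samples.
func planMerge(
    workerResults []bulksst.SSTFiles, schemaSpans []roachpb.Span,
) ([]mergeTask, error) {
    ssts, mergeSpans, err := bulksst.CombineFileInfo(workerResults, schemaSpans)
    if err != nil {
        return nil, err
    }
    tasks := make([]mergeTask, 0, len(mergeSpans))
    for _, sp := range mergeSpans {
        task := mergeTask{Span: sp}
        for _, sst := range ssts {
            // Keys in BulkMergeSpec_SST are stored as strings, so plain string
            // comparison is byte-wise. This assumes the SST's EndKey is the
            // last key it contains (inclusive).
            if sst.StartKey < string(sp.EndKey) && string(sp.Key) <= sst.EndKey {
                task.SSTs = append(task.SSTs, sst)
            }
        }
        tasks = append(tasks, task)
    }
    return tasks, nil
}

Because CombineFileInfo sorts the samples and splits the validated schema spans at them, the returned merge task spans are non-overlapping and fully cover the schema spans, so each mergeTask could be handed to a separate merge worker.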

4 files changed: +551 −0 lines

pkg/sql/bulksst/BUILD.bazel

Lines changed: 4 additions & 0 deletions
@@ -22,6 +22,7 @@ go_proto_library(
 go_library(
     name = "bulksst",
     srcs = [
+        "combine_file_info.go",
         "sst_file_allocator.go",
         "sst_writer.go",
     ],
@@ -35,11 +36,13 @@ go_library(
         "//pkg/roachpb",
         "//pkg/settings",
         "//pkg/settings/cluster",
+        "//pkg/sql/execinfrapb",
         "//pkg/storage",
         "//pkg/util/hlc",
         "//pkg/util/log",
         "//pkg/util/randutil",
         "//pkg/util/timeutil",
+        "@com_github_cockroachdb_errors//:errors",
         "@com_github_cockroachdb_pebble//objstorage",
         "@com_github_cockroachdb_pebble//objstorage/objstorageprovider",
     ],
@@ -48,6 +51,7 @@ go_library(
 go_test(
     name = "bulksst_test",
     srcs = [
+        "combine_file_info_test.go",
         "main_test.go",
         "sst_file_allocator_test.go",
         "sst_writer_test.go",
pkg/sql/bulksst/combine_file_info.go

Lines changed: 157 additions & 0 deletions

@@ -0,0 +1,157 @@
// Copyright 2025 The Cockroach Authors.
//
// Use of this software is governed by the CockroachDB Software License
// included in the /LICENSE file.

package bulksst

import (
    "bytes"
    "slices"

    "github.com/cockroachdb/cockroach/pkg/roachpb"
    "github.com/cockroachdb/cockroach/pkg/sql/execinfrapb"
    "github.com/cockroachdb/errors"
)

// CombineFileInfo combines SST file metadata and determines merge task spans based on key samples.
func CombineFileInfo(
    files []SSTFiles, schemaSpans []roachpb.Span,
) ([]execinfrapb.BulkMergeSpec_SST, []roachpb.Span, error) {
    // Validate that schema spans are properly ordered. This is a critical
    // precondition for the algorithm to work correctly.
    if err := validateSchemaSpansOrdered(schemaSpans); err != nil {
        return nil, nil, err
    }

    result := make([]execinfrapb.BulkMergeSpec_SST, 0)
    samples := make([]roachpb.Key, 0)
    for _, file := range files {
        for _, sst := range file.SST {
            result = append(result, execinfrapb.BulkMergeSpec_SST{
                StartKey: string(sst.StartKey),
                EndKey:   string(sst.EndKey),
                URI:      sst.URI,
            })
        }
        for _, sample := range file.RowSamples {
            samples = append(samples, roachpb.Key(sample))
        }
    }
    // Sort samples to ensure merge spans are non-overlapping and contiguous.
    // Samples are collected from multiple workers and arrive in arbitrary order.
    // getMergeSpans uses these samples as split points to create merge task spans.
    // If samples are unsorted (e.g., ["k", "d", "a"]), getMergeSpans would create
    // overlapping spans that cause the same keys to be processed multiple times,
    // resulting in duplicate data in the output SSTs.
    slices.SortFunc(samples, func(i, j roachpb.Key) int {
        return bytes.Compare(i, j)
    })

    mergeSpans, err := getMergeSpans(schemaSpans, samples)
    if err != nil {
        return nil, nil, err
    }

    return result, mergeSpans, nil
}

// getMergeSpans determines which spans should be used as merge tasks. The
// output spans must fully cover the input spans. The samples are used to
// determine where schema spans should be split.
//
// Precondition: schemaSpans must be sorted by start key and non-overlapping.
// This precondition is validated in validateSchemaSpansOrdered.
func getMergeSpans(schemaSpans []roachpb.Span, sortedSample []roachpb.Key) ([]roachpb.Span, error) {
    result := make([]roachpb.Span, 0, len(schemaSpans)+len(sortedSample))

    for _, span := range schemaSpans {
        samples, consumed := getCoveredSamples(span, sortedSample)

        // Validate: if we consumed more samples than we returned, it means some
        // samples were outside this span (either before it or in a gap). Since
        // schema spans are processed in order, any sample before this span indicates
        // a sample not covered by the schema spans.
        if consumed > len(samples) {
            // Find the first skipped sample for a clear error message.
            for i := 0; i < consumed; i++ {
                if i >= len(samples) || !sortedSample[i].Equal(samples[i]) {
                    return nil, errors.AssertionFailedf(
                        "sample %q is before schema span [%q, %q); this indicates samples were collected for keys outside schema spans",
                        sortedSample[i], span.Key, span.EndKey)
                }
            }
        }

        sortedSample = sortedSample[consumed:]

        startKey := span.Key
        for _, sample := range samples {
            // Skip samples that would create invalid (zero-length) spans.
            // This handles duplicates and samples at span boundaries.
            if bytes.Compare(sample, startKey) <= 0 {
                continue
            }
            result = append(result, roachpb.Span{
                Key:    startKey,
                EndKey: sample,
            })
            startKey = sample
        }
        result = append(result, roachpb.Span{
            Key:    startKey,
            EndKey: span.EndKey,
        })
    }

    // Validate that all samples were contained within schema spans. Any remaining
    // samples indicate they were collected after the last span.
    if len(sortedSample) > 0 {
        return nil, errors.AssertionFailedf(
            "samples outside schema spans: %d samples remain after processing all spans, first uncovered sample: %q",
            len(sortedSample), sortedSample[0])
    }

    return result, nil
}

// getCoveredSamples returns the samples within the given span and the total
// number of samples consumed (including any that were before the span start).
func getCoveredSamples(schemaSpan roachpb.Span, sortedSamples []roachpb.Key) ([]roachpb.Key, int) {
    // Count how many samples are before the span start.
    // Since sortedSamples are sorted and schema spans are processed in order,
    // samples before this span's start either:
    // 1. Should have been covered by a previous span, or
    // 2. Fall in a gap between spans.
    startIdx := 0
    for startIdx < len(sortedSamples) && bytes.Compare(sortedSamples[startIdx], schemaSpan.Key) < 0 {
        startIdx++
    }

    // Find samples within this span: [schemaSpan.Key, schemaSpan.EndKey)
    endIdx := startIdx
    for endIdx < len(sortedSamples) && bytes.Compare(sortedSamples[endIdx], schemaSpan.EndKey) < 0 {
        endIdx++
    }

    return sortedSamples[startIdx:endIdx], endIdx
}

// validateSchemaSpansOrdered checks that schema spans are sorted by start key
// and non-overlapping. This is a precondition for getMergeSpans to work correctly.
func validateSchemaSpansOrdered(schemaSpans []roachpb.Span) error {
    for i := 1; i < len(schemaSpans); i++ {
        if bytes.Compare(schemaSpans[i-1].Key, schemaSpans[i].Key) >= 0 {
            return errors.AssertionFailedf(
                "schema spans not ordered: span %d [%q, %q) >= span %d [%q, %q)",
                i-1, schemaSpans[i-1].Key, schemaSpans[i-1].EndKey,
                i, schemaSpans[i].Key, schemaSpans[i].EndKey)
        }
        if bytes.Compare(schemaSpans[i-1].EndKey, schemaSpans[i].Key) > 0 {
            return errors.AssertionFailedf(
                "schema spans overlapping: span %d ends at %q but span %d starts at %q",
                i-1, schemaSpans[i-1].EndKey, i, schemaSpans[i].Key)
        }
    }
    return nil
}
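To make the splitting behavior concrete, the following in-package test sketch exercises getMergeSpans directly. The test name, the single-letter keys, and the testify assertions are illustrative assumptions; the commit's own combine_file_info_test.go is not shown on this page.

package bulksst

import (
    "testing"

    "github.com/cockroachdb/cockroach/pkg/roachpb"
    "github.com/stretchr/testify/require"
)

// TestGetMergeSpansSketch is an illustrative sketch, not part of this commit.
func TestGetMergeSpansSketch(t *testing.T) {
    // One schema span ["a", "z") and two sorted samples split it into three
    // contiguous, non-overlapping merge task spans that still cover ["a", "z").
    schemaSpans := []roachpb.Span{
        {Key: roachpb.Key("a"), EndKey: roachpb.Key("z")},
    }
    samples := []roachpb.Key{roachpb.Key("f"), roachpb.Key("m")}

    got, err := getMergeSpans(schemaSpans, samples)
    require.NoError(t, err)
    require.Equal(t, []roachpb.Span{
        {Key: roachpb.Key("a"), EndKey: roachpb.Key("f")},
        {Key: roachpb.Key("f"), EndKey: roachpb.Key("m")},
        {Key: roachpb.Key("m"), EndKey: roachpb.Key("z")},
    }, got)

    // A sample that falls outside every schema span is reported as an
    // assertion failure rather than silently dropped.
    _, err = getMergeSpans(schemaSpans, []roachpb.Key{roachpb.Key("zz")})
    require.Error(t, err)
}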
