From ff6b2be9f293336fa991a994a87043ba898c4252 Mon Sep 17 00:00:00 2001 From: Miles Frankel Date: Fri, 7 Nov 2025 15:15:07 -0500 Subject: [PATCH 01/46] wip --- pkg/server/server_sql.go | 4 ++ pkg/sql/conn_executor.go | 1 + pkg/sql/planner.go | 15 ++++++ pkg/sql/queuefeed/manager.go | 68 +++++++++++++++++++++++++++ pkg/sql/queuefeed/reader.go | 81 ++++++++++++++++++++++++++++++++ pkg/sql/sem/builtins/builtins.go | 58 +++++++++++++++++++++++ pkg/sql/sem/eval/context.go | 2 + pkg/sql/sem/eval/deps.go | 4 ++ 8 files changed, 233 insertions(+) create mode 100644 pkg/sql/queuefeed/manager.go create mode 100644 pkg/sql/queuefeed/reader.go diff --git a/pkg/server/server_sql.go b/pkg/server/server_sql.go index 11cb4cc2cdbd..0224a9285556 100644 --- a/pkg/server/server_sql.go +++ b/pkg/server/server_sql.go @@ -89,6 +89,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/sql/optionalnodeliveness" "github.com/cockroachdb/cockroach/pkg/sql/pgwire" "github.com/cockroachdb/cockroach/pkg/sql/querycache" + "github.com/cockroachdb/cockroach/pkg/sql/queuefeed" "github.com/cockroachdb/cockroach/pkg/sql/rangeprober" "github.com/cockroachdb/cockroach/pkg/sql/regions" "github.com/cockroachdb/cockroach/pkg/sql/rolemembershipcache" @@ -216,6 +217,8 @@ type SQLServer struct { // serviceMode is the service mode this server was started with. serviceMode mtinfopb.TenantServiceMode + + queueManager *queuefeed.Manager } // sqlServerOptionalKVArgs are the arguments supplied to newSQLServer which are @@ -1458,6 +1461,7 @@ func newSQLServer(ctx context.Context, cfg sqlServerArgs) (*SQLServer, error) { internalDBMemMonitor: internalDBMonitor, upgradeManager: upgradeMgr, serviceMode: cfg.serviceMode, + queueManager: queuefeed.NewManager(cfg.internalDB), }, nil } diff --git a/pkg/sql/conn_executor.go b/pkg/sql/conn_executor.go index 5d041d516b31..0176e0f05f9b 100644 --- a/pkg/sql/conn_executor.go +++ b/pkg/sql/conn_executor.go @@ -3984,6 +3984,7 @@ func (ex *connExecutor) initPlanner(ctx context.Context, p *planner) { p.schemaResolver.authAccessor = p p.reducedAuditConfig = &auditlogging.ReducedAuditConfig{} p.datumAlloc = &tree.DatumAlloc{} + p.queueManager = ex.server.queueManager } // maybeAdjustMaxTimestampBound checks diff --git a/pkg/sql/planner.go b/pkg/sql/planner.go index 2bd5d90bcb58..7d4e787cac75 100644 --- a/pkg/sql/planner.go +++ b/pkg/sql/planner.go @@ -42,6 +42,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/sql/prep" "github.com/cockroachdb/cockroach/pkg/sql/privilege" "github.com/cockroachdb/cockroach/pkg/sql/querycache" + "github.com/cockroachdb/cockroach/pkg/sql/queuefeed" "github.com/cockroachdb/cockroach/pkg/sql/regions" "github.com/cockroachdb/cockroach/pkg/sql/sem/catid" "github.com/cockroachdb/cockroach/pkg/sql/sem/eval" @@ -174,6 +175,12 @@ func (evalCtx *extendedEvalContext) QueueJob(record *jobs.Record) jobspb.JobID { type planner struct { schemaResolver + // must clean up on connexecutor.close() + // shouldnt be a single one in reality + queueReader *queuefeed.Reader + + queueManager *queuefeed.Manager + txn *kv.Txn // internalSQLTxn corresponds to the object returned from InternalSQLTxn. @@ -324,6 +331,14 @@ type planner struct { skipUnsafeInternalsCheck bool } +func (p *planner) QueueManager() *queuefeed.Manager { + return p.queueManager +} + +func (p *planner) QueueReader() *queuefeed.Reader { + return p.queueReader +} + // hasFlowForPausablePortal returns true if the planner is for re-executing a // portal. We reuse the flow stored in p.pausablePortal.pauseInfo. 
func (p *planner) hasFlowForPausablePortal() bool { diff --git a/pkg/sql/queuefeed/manager.go b/pkg/sql/queuefeed/manager.go new file mode 100644 index 000000000000..06f1fe16caad --- /dev/null +++ b/pkg/sql/queuefeed/manager.go @@ -0,0 +1,68 @@ +// queuefeed is a somthing +package queuefeed + +import ( + "context" + "fmt" + + "github.com/cockroachdb/cockroach/pkg/sql/isql" +) + +// watch queue partition table +// and create it too?? +type Manager struct { + executor isql.DB +} + +func NewManager(executor isql.DB) *Manager { + // setup rangefeed on partitions table (/poll) + // handle handoff from one server to another + return &Manager{executor: executor} +} + +const createQueuePartitionTableSQL = ` +CREATE TABLE IF NOT EXISTS queue_partition_%s ( + partition_id INT8 PRIMARY KEY, + -- is the sql server assigned dead + sql_liveness_session UUID NOT NULL, + -- pgwire session + user_session UUID NOT NULL, + sql_liveness_session_successor UUID, + user_session_successor UUID, + partition_spec []byte, + updated_at TIMESTAMPZ, +)` + +const createQueueCursorTableSQL = ` +CREATE TABLE IF NOT EXISTS queue_cursor_%s ( + partition_id INT8 PRIMARY KEY, + updated_at TIMESTAMPZ, + cursor []byte, +)` + +// should take a txn +func (m *Manager) CreateQueueTables(ctx context.Context, queueName string) error { + return m.executor.Txn(ctx, func(ctx context.Context, txn isql.Txn) error { + _, err := txn.Exec(ctx, "create_qp", txn.KV(), fmt.Sprintf(createQueuePartitionTableSQL, queueName)) + if err != nil { + return err + } + _, err = txn.Exec(ctx, "create_qc", txn.KV(), fmt.Sprintf(createQueueCursorTableSQL, queueName)) + if err != nil { + return err + } + return nil + }) +} + +func (m *Manager) GetOrInitReader(ctx context.Context, name string) (*Reader, error) { + err := m.CreateQueueTables(ctx, name) + if err != nil { + return nil, err + } + return NewReader(ctx, m.executor, m, name), nil +} + +func (m *Manager) reassessAssignments(ctx context.Context, name string) {} + +type PartitionAssignment struct{} diff --git a/pkg/sql/queuefeed/reader.go b/pkg/sql/queuefeed/reader.go new file mode 100644 index 000000000000..d216783bad61 --- /dev/null +++ b/pkg/sql/queuefeed/reader.go @@ -0,0 +1,81 @@ +package queuefeed + +import ( + "context" + "errors" + + "github.com/cockroachdb/cockroach/pkg/sql/isql" + "github.com/cockroachdb/cockroach/pkg/sql/sem/tree" +) + +type readerState int + +const ( + readerStateIdle readerState = iota + readerStateHasUncommittedBatch +) + +// has rangefeed on data. reads from it. 
handles handoff +// state machine around handing out batches and handing stuff off +type Reader struct { + executor isql.DB + mgr *Manager + name string + + state readerState + + buf []tree.Datums + inflightBuffer []tree.Datums +} + +func NewReader(ctx context.Context, executor isql.DB, mgr *Manager, name string) *Reader { + buf := []tree.Datums{ + {tree.NewDString("1"), tree.NewDString("2"), tree.NewDString("3")}, + } + + r := &Reader{ + executor: executor, + mgr: mgr, + name: name, + buf: buf, + inflightBuffer: make([]tree.Datums, 0), + } + go r.run(ctx) + return r +} + +// setup rangefeed on data +// handle only watching my partitions +// after each batch, ask mgr if i need to change assignments +// buffer rows in the background before being asked for them +// checkpoint frontier if our frontier has advanced and we confirmed receipt + +func (r *Reader) run(ctx context.Context) { + + for { + select { + case <-ctx.Done(): + return + } + } +} + +func (r *Reader) GetRows(ctx context.Context) ([]tree.Datums, error) { + if r.state != readerStateIdle { + return nil, errors.New("reader not idle") + } + r.inflightBuffer = append(r.inflightBuffer, r.buf...) + clear(r.buf) + r.state = readerStateHasUncommittedBatch + return r.inflightBuffer, nil + + // and then trigger the goro to check if m wants us to change assignments + // if it does, handle that stuff before doing a new batch +} + +func (r *Reader) ConfirmReceipt(ctx context.Context) error { + r.state = readerStateIdle + clear(r.inflightBuffer) + + return nil +} diff --git a/pkg/sql/sem/builtins/builtins.go b/pkg/sql/sem/builtins/builtins.go index 6cfd5c22ece8..3c9775998bb3 100644 --- a/pkg/sql/sem/builtins/builtins.go +++ b/pkg/sql/sem/builtins/builtins.go @@ -4642,6 +4642,64 @@ value if you rely on the HLC for accuracy.`, } }()...), + "crdb_internal.create_queue_feed": makeBuiltin(defProps(), tree.Overload{ + Types: tree.ParamTypes{ + {Name: "queue_name", Typ: types.String}, + }, + ReturnType: tree.FixedReturnType(types.Void), + Fn: func(ctx context.Context, evalCtx *eval.Context, args tree.Datums) (tree.Datum, error) { + qn := args[0].(*tree.DString) + return nil, evalCtx.Planner.QueueManager().CreateQueueTables(ctx, string(*qn)) + }, + }), + + "crdb_internal.select_from_queue_feed": makeBuiltin(defProps(), tree.Overload{ + Types: tree.ParamTypes{ + {Name: "queue_name", Typ: types.String}, + {Name: "limit", Typ: types.Int}, + }, + ReturnType: tree.ArrayOfFirstNonNullReturnType(), + Fn: func(ctx context.Context, evalCtx *eval.Context, args tree.Datums) (tree.Datum, error) { + var err error + // ignore queue_name for now; we only support one queue + // ditto limit lol + qr := evalCtx.Planner.QueueReader() + // if not initialized, initialize it + if qr == nil { + qn := args[0].(*tree.DString) + qr, err = evalCtx.Planner.QueueManager().GetOrInitReader(ctx, string(*qn)) + if err != nil { + return nil, err + } + } + // attach commit hook to txn to confirm receipt + txn := evalCtx.Txn + // or something... 
todo on rollback/abort + txn.AddCommitTrigger(func(ctx context.Context) { + qr.ConfirmReceipt(ctx) + }) + + ret := tree.NewDArray(types.Json) + + rows, err := qr.GetRows(ctx) + if err != nil { + return nil, err + } + for _, row := range rows { + obj := json.NewObjectBuilder(len(row)) + for i, d := range row { + j, err := tree.AsJSON(d, evalCtx.SessionData().DataConversionConfig, evalCtx.GetLocation()) + if err != nil { + return nil, err + } + obj.Add(fmt.Sprintf("f%d", i+1), j) + } + ret.Append(tree.NewDJSON(obj.Build())) + } + return ret, nil + }, + }), + "crdb_internal.json_to_pb": makeBuiltin( jsonProps(), tree.Overload{ diff --git a/pkg/sql/sem/eval/context.go b/pkg/sql/sem/eval/context.go index 38109af21cb5..7b8fbc86495e 100644 --- a/pkg/sql/sem/eval/context.go +++ b/pkg/sql/sem/eval/context.go @@ -320,6 +320,8 @@ type Context struct { // ExecutedStatementCounters contains metrics for successfully executed // statements defined within the body of a UDF/SP. ExecutedRoutineStatementCounters RoutineStatementCounters + + QueueSessionMgr any } // RoutineStatementCounters encapsulates metrics for tracking the execution diff --git a/pkg/sql/sem/eval/deps.go b/pkg/sql/sem/eval/deps.go index 34f6de9bd4ba..228cbe302128 100644 --- a/pkg/sql/sem/eval/deps.go +++ b/pkg/sql/sem/eval/deps.go @@ -18,6 +18,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/sql/hintpb" "github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgnotice" "github.com/cockroachdb/cockroach/pkg/sql/privilege" + "github.com/cockroachdb/cockroach/pkg/sql/queuefeed" "github.com/cockroachdb/cockroach/pkg/sql/sem/catid" "github.com/cockroachdb/cockroach/pkg/sql/sem/tree" "github.com/cockroachdb/cockroach/pkg/sql/sessiondata" @@ -477,6 +478,9 @@ type Planner interface { // the system.statement_hints table. It returns the hint ID of the newly // created hint. 
InsertStatementHint(ctx context.Context, statementFingerprint string, hint hintpb.StatementHintUnion) (int64, error) + + QueueReader() *queuefeed.Reader + QueueManager() *queuefeed.Manager } // InternalRows is an iterator interface that's exposed by the internal From c2ba28369938ef984729109bc68396cefa89744b Mon Sep 17 00:00:00 2001 From: Miles Frankel Date: Fri, 7 Nov 2025 16:43:58 -0500 Subject: [PATCH 02/46] fix up and move stuff around --- pkg/BUILD.bazel | 1 + pkg/server/BUILD.bazel | 1 + pkg/sql/BUILD.bazel | 1 + pkg/sql/conn_executor.go | 1 - pkg/sql/exec_util.go | 7 +++++++ pkg/sql/planner.go | 15 --------------- pkg/sql/queuefeed/BUILD.bazel | 15 +++++++++++++++ pkg/sql/queuefeed/manager.go | 1 + pkg/sql/queuefeed/reader.go | 1 + pkg/sql/sem/builtins/BUILD.bazel | 1 + pkg/sql/sem/builtins/builtins.go | 19 ++++++++++--------- pkg/sql/sem/eval/deps.go | 4 ---- 12 files changed, 38 insertions(+), 29 deletions(-) create mode 100644 pkg/sql/queuefeed/BUILD.bazel diff --git a/pkg/BUILD.bazel b/pkg/BUILD.bazel index d25a243f2863..356f53e56013 100644 --- a/pkg/BUILD.bazel +++ b/pkg/BUILD.bazel @@ -2228,6 +2228,7 @@ GO_TARGETS = [ "//pkg/sql/protoreflect:protoreflect_test", "//pkg/sql/querycache:querycache", "//pkg/sql/querycache:querycache_test", + "//pkg/sql/queuefeed:queuefeed", "//pkg/sql/randgen:randgen", "//pkg/sql/randgen:randgen_test", "//pkg/sql/rangeprober:range_prober", diff --git a/pkg/server/BUILD.bazel b/pkg/server/BUILD.bazel index 3d489bd7eac0..b7ff3731dc43 100644 --- a/pkg/server/BUILD.bazel +++ b/pkg/server/BUILD.bazel @@ -259,6 +259,7 @@ go_library( "//pkg/sql/physicalplan", "//pkg/sql/privilege", "//pkg/sql/querycache", + "//pkg/sql/queuefeed", "//pkg/sql/rangeprober", "//pkg/sql/regions", "//pkg/sql/rolemembershipcache", diff --git a/pkg/sql/BUILD.bazel b/pkg/sql/BUILD.bazel index 345ac4338c76..2e1063aede4b 100644 --- a/pkg/sql/BUILD.bazel +++ b/pkg/sql/BUILD.bazel @@ -478,6 +478,7 @@ go_library( "//pkg/sql/privilege", "//pkg/sql/protoreflect", "//pkg/sql/querycache", + "//pkg/sql/queuefeed", "//pkg/sql/regionliveness", "//pkg/sql/regions", "//pkg/sql/rolemembershipcache", diff --git a/pkg/sql/conn_executor.go b/pkg/sql/conn_executor.go index 0176e0f05f9b..5d041d516b31 100644 --- a/pkg/sql/conn_executor.go +++ b/pkg/sql/conn_executor.go @@ -3984,7 +3984,6 @@ func (ex *connExecutor) initPlanner(ctx context.Context, p *planner) { p.schemaResolver.authAccessor = p p.reducedAuditConfig = &auditlogging.ReducedAuditConfig{} p.datumAlloc = &tree.DatumAlloc{} - p.queueManager = ex.server.queueManager } // maybeAdjustMaxTimestampBound checks diff --git a/pkg/sql/exec_util.go b/pkg/sql/exec_util.go index 06e6a3cf216f..d42a998b1545 100644 --- a/pkg/sql/exec_util.go +++ b/pkg/sql/exec_util.go @@ -81,6 +81,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/sql/physicalplan" plpgsqlparser "github.com/cockroachdb/cockroach/pkg/sql/plpgsql/parser" "github.com/cockroachdb/cockroach/pkg/sql/querycache" + "github.com/cockroachdb/cockroach/pkg/sql/queuefeed" "github.com/cockroachdb/cockroach/pkg/sql/rolemembershipcache" "github.com/cockroachdb/cockroach/pkg/sql/rowenc" "github.com/cockroachdb/cockroach/pkg/sql/rowinfra" @@ -1851,6 +1852,12 @@ type ExecutorConfig struct { // LicenseEnforcer is used to enforce the license profiles. 
LicenseEnforcer *license.Enforcer + + QueueManager *queuefeed.Manager +} + +func (cfg *ExecutorConfig) GetQueueManager() *queuefeed.Manager { + return cfg.QueueManager } // UpdateVersionSystemSettingHook provides a callback that allows us diff --git a/pkg/sql/planner.go b/pkg/sql/planner.go index 7d4e787cac75..2bd5d90bcb58 100644 --- a/pkg/sql/planner.go +++ b/pkg/sql/planner.go @@ -42,7 +42,6 @@ import ( "github.com/cockroachdb/cockroach/pkg/sql/prep" "github.com/cockroachdb/cockroach/pkg/sql/privilege" "github.com/cockroachdb/cockroach/pkg/sql/querycache" - "github.com/cockroachdb/cockroach/pkg/sql/queuefeed" "github.com/cockroachdb/cockroach/pkg/sql/regions" "github.com/cockroachdb/cockroach/pkg/sql/sem/catid" "github.com/cockroachdb/cockroach/pkg/sql/sem/eval" @@ -175,12 +174,6 @@ func (evalCtx *extendedEvalContext) QueueJob(record *jobs.Record) jobspb.JobID { type planner struct { schemaResolver - // must clean up on connexecutor.close() - // shouldnt be a single one in reality - queueReader *queuefeed.Reader - - queueManager *queuefeed.Manager - txn *kv.Txn // internalSQLTxn corresponds to the object returned from InternalSQLTxn. @@ -331,14 +324,6 @@ type planner struct { skipUnsafeInternalsCheck bool } -func (p *planner) QueueManager() *queuefeed.Manager { - return p.queueManager -} - -func (p *planner) QueueReader() *queuefeed.Reader { - return p.queueReader -} - // hasFlowForPausablePortal returns true if the planner is for re-executing a // portal. We reuse the flow stored in p.pausablePortal.pauseInfo. func (p *planner) hasFlowForPausablePortal() bool { diff --git a/pkg/sql/queuefeed/BUILD.bazel b/pkg/sql/queuefeed/BUILD.bazel new file mode 100644 index 000000000000..73c7a9226cd4 --- /dev/null +++ b/pkg/sql/queuefeed/BUILD.bazel @@ -0,0 +1,15 @@ +load("@io_bazel_rules_go//go:def.bzl", "go_library") + +go_library( + name = "queuefeed", + srcs = [ + "manager.go", + "reader.go", + ], + importpath = "github.com/cockroachdb/cockroach/pkg/sql/queuefeed", + visibility = ["//visibility:public"], + deps = [ + "//pkg/sql/isql", + "//pkg/sql/sem/tree", + ], +) diff --git a/pkg/sql/queuefeed/manager.go b/pkg/sql/queuefeed/manager.go index 06f1fe16caad..d5bdd5e6f90a 100644 --- a/pkg/sql/queuefeed/manager.go +++ b/pkg/sql/queuefeed/manager.go @@ -14,6 +14,7 @@ type Manager struct { executor isql.DB } + func NewManager(executor isql.DB) *Manager { // setup rangefeed on partitions table (/poll) // handle handoff from one server to another diff --git a/pkg/sql/queuefeed/reader.go b/pkg/sql/queuefeed/reader.go index d216783bad61..1c0b847ad0d7 100644 --- a/pkg/sql/queuefeed/reader.go +++ b/pkg/sql/queuefeed/reader.go @@ -49,6 +49,7 @@ func NewReader(ctx context.Context, executor isql.DB, mgr *Manager, name string) // after each batch, ask mgr if i need to change assignments // buffer rows in the background before being asked for them // checkpoint frontier if our frontier has advanced and we confirmed receipt +// gonna need some way to clean stuff up on conn_executor.close() func (r *Reader) run(ctx context.Context) { diff --git a/pkg/sql/sem/builtins/BUILD.bazel b/pkg/sql/sem/builtins/BUILD.bazel index 06c3e60d572e..3eafd3e78622 100644 --- a/pkg/sql/sem/builtins/BUILD.bazel +++ b/pkg/sql/sem/builtins/BUILD.bazel @@ -78,6 +78,7 @@ go_library( "//pkg/sql/pgwire/pgnotice", "//pkg/sql/privilege", "//pkg/sql/protoreflect", + "//pkg/sql/queuefeed", "//pkg/sql/rowenc", "//pkg/sql/rowenc/keyside", "//pkg/sql/rowenc/valueside", diff --git a/pkg/sql/sem/builtins/builtins.go 
b/pkg/sql/sem/builtins/builtins.go index 3c9775998bb3..cf3783522f43 100644 --- a/pkg/sql/sem/builtins/builtins.go +++ b/pkg/sql/sem/builtins/builtins.go @@ -55,6 +55,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgnotice" "github.com/cockroachdb/cockroach/pkg/sql/privilege" "github.com/cockroachdb/cockroach/pkg/sql/protoreflect" + "github.com/cockroachdb/cockroach/pkg/sql/queuefeed" "github.com/cockroachdb/cockroach/pkg/sql/rowenc" "github.com/cockroachdb/cockroach/pkg/sql/rowenc/keyside" "github.com/cockroachdb/cockroach/pkg/sql/sem/asof" @@ -4649,7 +4650,7 @@ value if you rely on the HLC for accuracy.`, ReturnType: tree.FixedReturnType(types.Void), Fn: func(ctx context.Context, evalCtx *eval.Context, args tree.Datums) (tree.Datum, error) { qn := args[0].(*tree.DString) - return nil, evalCtx.Planner.QueueManager().CreateQueueTables(ctx, string(*qn)) + return nil, getQueueManager(evalCtx).CreateQueueTables(ctx, string(*qn)) }, }), @@ -4663,14 +4664,10 @@ value if you rely on the HLC for accuracy.`, var err error // ignore queue_name for now; we only support one queue // ditto limit lol - qr := evalCtx.Planner.QueueReader() - // if not initialized, initialize it - if qr == nil { - qn := args[0].(*tree.DString) - qr, err = evalCtx.Planner.QueueManager().GetOrInitReader(ctx, string(*qn)) - if err != nil { - return nil, err - } + qn := args[0].(*tree.DString) + qr, err := getQueueManager(evalCtx).GetOrInitReader(ctx, string(*qn)) + if err != nil { + return nil, err } // attach commit hook to txn to confirm receipt txn := evalCtx.Txn @@ -12904,3 +12901,7 @@ func exprSliceToStrSlice(exprs []tree.Expr) []string { } var nilRegionsError = errors.AssertionFailedf("evalCtx.Regions is nil") + +func getQueueManager(evalCtx *eval.Context) *queuefeed.Manager { + return evalCtx.Planner.ExecutorConfig().(interface{ GetQueueManager() *queuefeed.Manager }).GetQueueManager() +} diff --git a/pkg/sql/sem/eval/deps.go b/pkg/sql/sem/eval/deps.go index 228cbe302128..34f6de9bd4ba 100644 --- a/pkg/sql/sem/eval/deps.go +++ b/pkg/sql/sem/eval/deps.go @@ -18,7 +18,6 @@ import ( "github.com/cockroachdb/cockroach/pkg/sql/hintpb" "github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgnotice" "github.com/cockroachdb/cockroach/pkg/sql/privilege" - "github.com/cockroachdb/cockroach/pkg/sql/queuefeed" "github.com/cockroachdb/cockroach/pkg/sql/sem/catid" "github.com/cockroachdb/cockroach/pkg/sql/sem/tree" "github.com/cockroachdb/cockroach/pkg/sql/sessiondata" @@ -478,9 +477,6 @@ type Planner interface { // the system.statement_hints table. It returns the hint ID of the newly // created hint. 
InsertStatementHint(ctx context.Context, statementFingerprint string, hint hintpb.StatementHintUnion) (int64, error) - - QueueReader() *queuefeed.Reader - QueueManager() *queuefeed.Manager } // InternalRows is an iterator interface that's exposed by the internal From 25c08d7a58472fe1875dee56412ae4bae7361be3 Mon Sep 17 00:00:00 2001 From: Miles Frankel Date: Fri, 7 Nov 2025 16:49:47 -0500 Subject: [PATCH 03/46] add oid entry --- pkg/sql/sem/builtins/fixed_oids.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pkg/sql/sem/builtins/fixed_oids.go b/pkg/sql/sem/builtins/fixed_oids.go index 793e0381f80b..c0d5ad4ba1b9 100644 --- a/pkg/sql/sem/builtins/fixed_oids.go +++ b/pkg/sql/sem/builtins/fixed_oids.go @@ -2863,6 +2863,8 @@ var builtinOidsArray = []string{ 2908: `crdb_internal.inject_hint(statement_fingerprint: string, donor_sql: string) -> int`, 2909: `crdb_internal.clear_statement_hints_cache() -> void`, 2910: `crdb_internal.await_statement_hints_cache() -> void`, + 2911: `crdb_internal.create_queue_feed(queue_name: string) -> void`, + 2912: `crdb_internal.select_from_queue_feed(queue_name: string, limit: int) -> anyelement`, } var builtinOidsBySignature map[string]oid.Oid From 98dbecf64eed9b82a7c45b124b84da835a50f930 Mon Sep 17 00:00:00 2001 From: Miles Frankel Date: Fri, 7 Nov 2025 16:52:30 -0500 Subject: [PATCH 04/46] make qm --- pkg/server/server_sql.go | 1 + 1 file changed, 1 insertion(+) diff --git a/pkg/server/server_sql.go b/pkg/server/server_sql.go index 0224a9285556..8b81a1154e84 100644 --- a/pkg/server/server_sql.go +++ b/pkg/server/server_sql.go @@ -1065,6 +1065,7 @@ func newSQLServer(ctx context.Context, cfg sqlServerArgs) (*SQLServer, error) { TenantReadOnly: cfg.SQLConfig.TenantReadOnly, CidrLookup: cfg.BaseConfig.CidrLookup, LicenseEnforcer: cfg.SQLConfig.LicenseEnforcer, + QueueManager: queuefeed.NewManager(cfg.internalDB), } if codec.ForSystemTenant() { From b112ae690c1092a216306562c55e85e8a46f9a75 Mon Sep 17 00:00:00 2001 From: Miles Frankel Date: Sat, 8 Nov 2025 17:13:15 -0500 Subject: [PATCH 05/46] working --- pkg/sql/queuefeed/manager.go | 13 ++++++------- pkg/sql/sem/builtins/builtins.go | 10 ++++++++-- pkg/sql/sem/builtins/fixed_oids.go | 2 +- 3 files changed, 15 insertions(+), 10 deletions(-) diff --git a/pkg/sql/queuefeed/manager.go b/pkg/sql/queuefeed/manager.go index d5bdd5e6f90a..68d19c548349 100644 --- a/pkg/sql/queuefeed/manager.go +++ b/pkg/sql/queuefeed/manager.go @@ -14,7 +14,6 @@ type Manager struct { executor isql.DB } - func NewManager(executor isql.DB) *Manager { // setup rangefeed on partitions table (/poll) // handle handoff from one server to another @@ -22,7 +21,7 @@ func NewManager(executor isql.DB) *Manager { } const createQueuePartitionTableSQL = ` -CREATE TABLE IF NOT EXISTS queue_partition_%s ( +CREATE TABLE IF NOT EXISTS defaultdb.queue_partition_%s ( partition_id INT8 PRIMARY KEY, -- is the sql server assigned dead sql_liveness_session UUID NOT NULL, @@ -30,15 +29,15 @@ CREATE TABLE IF NOT EXISTS queue_partition_%s ( user_session UUID NOT NULL, sql_liveness_session_successor UUID, user_session_successor UUID, - partition_spec []byte, - updated_at TIMESTAMPZ, + partition_spec bytea, + updated_at TIMESTAMPTZ )` const createQueueCursorTableSQL = ` -CREATE TABLE IF NOT EXISTS queue_cursor_%s ( +CREATE TABLE IF NOT EXISTS defaultdb.queue_cursor_%s ( partition_id INT8 PRIMARY KEY, - updated_at TIMESTAMPZ, - cursor []byte, + updated_at TIMESTAMPTZ, + cursor bytea )` // should take a txn diff --git a/pkg/sql/sem/builtins/builtins.go 
b/pkg/sql/sem/builtins/builtins.go index cf3783522f43..da5597874156 100644 --- a/pkg/sql/sem/builtins/builtins.go +++ b/pkg/sql/sem/builtins/builtins.go @@ -4647,10 +4647,15 @@ value if you rely on the HLC for accuracy.`, Types: tree.ParamTypes{ {Name: "queue_name", Typ: types.String}, }, + Volatility: volatility.Volatile, ReturnType: tree.FixedReturnType(types.Void), Fn: func(ctx context.Context, evalCtx *eval.Context, args tree.Datums) (tree.Datum, error) { qn := args[0].(*tree.DString) - return nil, getQueueManager(evalCtx).CreateQueueTables(ctx, string(*qn)) + err := getQueueManager(evalCtx).CreateQueueTables(ctx, string(*qn)) + if err != nil { + return nil, err + } + return tree.DVoidDatum, nil }, }), @@ -4659,7 +4664,8 @@ value if you rely on the HLC for accuracy.`, {Name: "queue_name", Typ: types.String}, {Name: "limit", Typ: types.Int}, }, - ReturnType: tree.ArrayOfFirstNonNullReturnType(), + Volatility: volatility.Volatile, + ReturnType: tree.FixedReturnType(types.MakeArray(types.Json)), Fn: func(ctx context.Context, evalCtx *eval.Context, args tree.Datums) (tree.Datum, error) { var err error // ignore queue_name for now; we only support one queue diff --git a/pkg/sql/sem/builtins/fixed_oids.go b/pkg/sql/sem/builtins/fixed_oids.go index c0d5ad4ba1b9..ef0419c05cf5 100644 --- a/pkg/sql/sem/builtins/fixed_oids.go +++ b/pkg/sql/sem/builtins/fixed_oids.go @@ -2864,7 +2864,7 @@ var builtinOidsArray = []string{ 2909: `crdb_internal.clear_statement_hints_cache() -> void`, 2910: `crdb_internal.await_statement_hints_cache() -> void`, 2911: `crdb_internal.create_queue_feed(queue_name: string) -> void`, - 2912: `crdb_internal.select_from_queue_feed(queue_name: string, limit: int) -> anyelement`, + 2912: `crdb_internal.select_from_queue_feed(queue_name: string, limit: int) -> jsonb[]`, } var builtinOidsBySignature map[string]oid.Oid From c4ccf01621fe3ec0326d3e1ffa49902aed368d32 Mon Sep 17 00:00:00 2001 From: Miles Frankel Date: Sat, 8 Nov 2025 17:47:52 -0500 Subject: [PATCH 06/46] rangefeed --- pkg/sql/queuefeed/reader.go | 129 +++++++++++++++++++++++++------ pkg/sql/sem/builtins/builtins.go | 2 +- 2 files changed, 107 insertions(+), 24 deletions(-) diff --git a/pkg/sql/queuefeed/reader.go b/pkg/sql/queuefeed/reader.go index 1c0b847ad0d7..329c0217eb50 100644 --- a/pkg/sql/queuefeed/reader.go +++ b/pkg/sql/queuefeed/reader.go @@ -2,10 +2,16 @@ package queuefeed import ( "context" - "errors" + "fmt" + "github.com/cockroachdb/cockroach/pkg/kv/kvclient/rangefeed" + "github.com/cockroachdb/cockroach/pkg/kv/kvpb" "github.com/cockroachdb/cockroach/pkg/sql/isql" "github.com/cockroachdb/cockroach/pkg/sql/sem/tree" + "github.com/cockroachdb/cockroach/pkg/util/hlc" + "github.com/cockroachdb/cockroach/pkg/util/syncutil" + "github.com/cockroachdb/cockroach/pkg/util/timeutil" + "github.com/cockroachdb/errors" ) type readerState int @@ -19,13 +25,18 @@ const ( // state machine around handing out batches and handing stuff off type Reader struct { executor isql.DB + rff *rangefeed.Factory mgr *Manager name string - state readerState + mu struct { + syncutil.Mutex + state readerState + buf []tree.Datums + inflightBuffer []tree.Datums + } - buf []tree.Datums - inflightBuffer []tree.Datums + cancel context.CancelFunc } func NewReader(ctx context.Context, executor isql.DB, mgr *Manager, name string) *Reader { @@ -34,24 +45,79 @@ func NewReader(ctx context.Context, executor isql.DB, mgr *Manager, name string) } r := &Reader{ - executor: executor, - mgr: mgr, - name: name, - buf: buf, - inflightBuffer: 
make([]tree.Datums, 0), + executor: executor, + mgr: mgr, + name: name, } + r.mu.state = readerStateIdle + r.mu.buf = buf + r.mu.inflightBuffer = make([]tree.Datums, 0) + + ctx, cancel := context.WithCancel(ctx) + r.cancel = cancel + + r.setupRangefeed(ctx) go r.run(ctx) return r } -// setup rangefeed on data -// handle only watching my partitions -// after each batch, ask mgr if i need to change assignments -// buffer rows in the background before being asked for them -// checkpoint frontier if our frontier has advanced and we confirmed receipt -// gonna need some way to clean stuff up on conn_executor.close() +func (r *Reader) setupRangefeed(ctx context.Context) { + incomingResolveds := make(chan hlc.Timestamp) + setErr := func(err error) { r.cancel() } + + onValue := func(ctx context.Context, value *kvpb.RangeFeedValue) { + r.mu.Lock() + defer r.mu.Unlock() + + if len(r.mu.buf) > 100 { + // TODO: wait for rows to be read before adding more + } + // TODO: decode value.Value + r.mu.buf = append(r.mu.buf, tree.Datums{tree.DVoidDatum}) + } + // setup rangefeed on data + opts := []rangefeed.Option{ + rangefeed.WithPProfLabel("queuefeed.reader", fmt.Sprintf("name=%s", r.name)), + // rangefeed.WithMemoryMonitor(w.mon), + rangefeed.WithOnCheckpoint(func(ctx context.Context, checkpoint *kvpb.RangeFeedCheckpoint) { + // This can happen when done catching up; ignore it. + if checkpoint.ResolvedTS.IsEmpty() { + return + } + select { + case incomingResolveds <- checkpoint.ResolvedTS: + case <-ctx.Done(): + default: // TODO: handle resolveds (dont actually default here) + } + }), + rangefeed.WithOnInternalError(func(ctx context.Context, err error) { setErr(err) }), + rangefeed.WithConsumerID(42), + rangefeed.WithInvoker(func(fn func() error) error { return fn() }), + rangefeed.WithFiltering(false), + } + + // TODO: resume from cursor + initialTS := hlc.Timestamp{WallTime: timeutil.Now().UnixNano()} + rf := r.rff.New( + fmt.Sprintf("queuefeed.reader.name=%s", r.name), initialTS, onValue, opts..., + ) + defer rf.Close() + +} + +// - [x] setup rangefeed on data +// - [ ] handle only watching my partitions +// - [ ] after each batch, ask mgr if i need to change assignments +// - [ ] buffer rows in the background before being asked for them +// - [ ] checkpoint frontier if our frontier has advanced and we confirmed receipt +// - [ ] gonna need some way to clean stuff up on conn_executor.close() func (r *Reader) run(ctx context.Context) { + ctx, cancel := context.WithCancel(ctx) + defer cancel() + + // state machine + // - for { select { @@ -61,22 +127,39 @@ func (r *Reader) run(ctx context.Context) { } } -func (r *Reader) GetRows(ctx context.Context) ([]tree.Datums, error) { - if r.state != readerStateIdle { +func (r *Reader) GetRows(ctx context.Context, limit int) ([]tree.Datums, error) { + r.mu.Lock() + defer r.mu.Unlock() + + if r.mu.state != readerStateIdle { return nil, errors.New("reader not idle") } - r.inflightBuffer = append(r.inflightBuffer, r.buf...) - clear(r.buf) - r.state = readerStateHasUncommittedBatch - return r.inflightBuffer, nil + if len(r.mu.inflightBuffer) > 0 { + return nil, errors.AssertionFailedf("getrows called with nonempty inflight buffer") + } + + if limit > len(r.mu.buf) { + limit = len(r.mu.buf) + } + + r.mu.inflightBuffer = append(r.mu.inflightBuffer, r.mu.buf[0:limit]...) + r.mu.buf = r.mu.buf[limit:] + + r.mu.inflightBuffer = append(r.mu.inflightBuffer, r.mu.buf...) 
+ clear(r.mu.buf) + r.mu.state = readerStateHasUncommittedBatch + return r.mu.inflightBuffer, nil // and then trigger the goro to check if m wants us to change assignments // if it does, handle that stuff before doing a new batch } func (r *Reader) ConfirmReceipt(ctx context.Context) error { - r.state = readerStateIdle - clear(r.inflightBuffer) + r.mu.Lock() + defer r.mu.Unlock() + + r.mu.state = readerStateIdle + clear(r.mu.inflightBuffer) return nil } diff --git a/pkg/sql/sem/builtins/builtins.go b/pkg/sql/sem/builtins/builtins.go index da5597874156..e317074adccc 100644 --- a/pkg/sql/sem/builtins/builtins.go +++ b/pkg/sql/sem/builtins/builtins.go @@ -4684,7 +4684,7 @@ value if you rely on the HLC for accuracy.`, ret := tree.NewDArray(types.Json) - rows, err := qr.GetRows(ctx) + rows, err := qr.GetRows(ctx, int(tree.MustBeDInt(args[1]))) if err != nil { return nil, err } From 9ecedaac1641ac5e63099f924111351f58df8ea5 Mon Sep 17 00:00:00 2001 From: Aerin Freilich Date: Mon, 10 Nov 2025 11:34:34 -0500 Subject: [PATCH 07/46] make sure it doesn't panic --- pkg/sql/queuefeed/BUILD.bazel | 6 ++++++ pkg/sql/queuefeed/reader.go | 3 ++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/pkg/sql/queuefeed/BUILD.bazel b/pkg/sql/queuefeed/BUILD.bazel index 73c7a9226cd4..2c4c2a552578 100644 --- a/pkg/sql/queuefeed/BUILD.bazel +++ b/pkg/sql/queuefeed/BUILD.bazel @@ -9,7 +9,13 @@ go_library( importpath = "github.com/cockroachdb/cockroach/pkg/sql/queuefeed", visibility = ["//visibility:public"], deps = [ + "//pkg/kv/kvclient/rangefeed", + "//pkg/kv/kvpb", "//pkg/sql/isql", "//pkg/sql/sem/tree", + "//pkg/util/hlc", + "//pkg/util/syncutil", + "//pkg/util/timeutil", + "@com_github_cockroachdb_errors//:errors", ], ) diff --git a/pkg/sql/queuefeed/reader.go b/pkg/sql/queuefeed/reader.go index 329c0217eb50..0605a341a4ae 100644 --- a/pkg/sql/queuefeed/reader.go +++ b/pkg/sql/queuefeed/reader.go @@ -56,7 +56,8 @@ func NewReader(ctx context.Context, executor isql.DB, mgr *Manager, name string) ctx, cancel := context.WithCancel(ctx) r.cancel = cancel - r.setupRangefeed(ctx) + // TODO(queuefeed): Re-enable once queue data table and spans are implemented + // r.setupRangefeed(ctx) go r.run(ctx) return r } From 03444e070c58d301c6c00ac1122df8f6c7e7515e Mon Sep 17 00:00:00 2001 From: Aerin Freilich Date: Mon, 10 Nov 2025 13:04:03 -0500 Subject: [PATCH 08/46] queuefeed: have crdb_internal.select_from_queue_feed return rows Instead of returning a single array as json for the events stored in our queue, now return a row for each row so that we get root@127.0.0.1:26257/demoapp/movr> SELECT crdb_internal.select_from_queue_feed('foo', 3); crdb_internal.select_from_queue_feed ---------------------------------------- {"f1": "1", "f2": "2", "f3": "3"} {"f1": "4", "f2": "5", "f3": "6"} {"f1": "7", "f2": "8", "f3": "9"} instead of root@127.0.0.1:26257/demoapp/movr> SELECT crdb_internal.select_from_queue_feed('foo', 3); crdb_internal.select_from_queue_feed -------------------------------------------------------------------------------------------------------------------------------------------------------------- {"{\"f1\": \"1\", \"f2\": \"2\", \"f3\": \"3\"}","{\"f1\": \"11\", \"f2\": \"22\", \"f3\": \"33\"}","{\"f1\": \"111\", \"f2\": \"222\", \"f3\": \"333\"}"} (1 row) --- pkg/sql/queuefeed/reader.go | 2 + pkg/sql/sem/builtins/builtins.go | 44 ----------- pkg/sql/sem/builtins/fixed_oids.go | 2 +- pkg/sql/sem/builtins/generator_builtins.go | 88 ++++++++++++++++++++++ 4 files changed, 91 insertions(+), 45 deletions(-) 
diff --git a/pkg/sql/queuefeed/reader.go b/pkg/sql/queuefeed/reader.go index 0605a341a4ae..da155c668770 100644 --- a/pkg/sql/queuefeed/reader.go +++ b/pkg/sql/queuefeed/reader.go @@ -42,6 +42,8 @@ type Reader struct { func NewReader(ctx context.Context, executor isql.DB, mgr *Manager, name string) *Reader { buf := []tree.Datums{ {tree.NewDString("1"), tree.NewDString("2"), tree.NewDString("3")}, + {tree.NewDString("4"), tree.NewDString("5"), tree.NewDString("6")}, + {tree.NewDString("7"), tree.NewDString("8"), tree.NewDString("9")}, } r := &Reader{ diff --git a/pkg/sql/sem/builtins/builtins.go b/pkg/sql/sem/builtins/builtins.go index e317074adccc..9be6268f81ab 100644 --- a/pkg/sql/sem/builtins/builtins.go +++ b/pkg/sql/sem/builtins/builtins.go @@ -4659,50 +4659,6 @@ value if you rely on the HLC for accuracy.`, }, }), - "crdb_internal.select_from_queue_feed": makeBuiltin(defProps(), tree.Overload{ - Types: tree.ParamTypes{ - {Name: "queue_name", Typ: types.String}, - {Name: "limit", Typ: types.Int}, - }, - Volatility: volatility.Volatile, - ReturnType: tree.FixedReturnType(types.MakeArray(types.Json)), - Fn: func(ctx context.Context, evalCtx *eval.Context, args tree.Datums) (tree.Datum, error) { - var err error - // ignore queue_name for now; we only support one queue - // ditto limit lol - qn := args[0].(*tree.DString) - qr, err := getQueueManager(evalCtx).GetOrInitReader(ctx, string(*qn)) - if err != nil { - return nil, err - } - // attach commit hook to txn to confirm receipt - txn := evalCtx.Txn - // or something... todo on rollback/abort - txn.AddCommitTrigger(func(ctx context.Context) { - qr.ConfirmReceipt(ctx) - }) - - ret := tree.NewDArray(types.Json) - - rows, err := qr.GetRows(ctx, int(tree.MustBeDInt(args[1]))) - if err != nil { - return nil, err - } - for _, row := range rows { - obj := json.NewObjectBuilder(len(row)) - for i, d := range row { - j, err := tree.AsJSON(d, evalCtx.SessionData().DataConversionConfig, evalCtx.GetLocation()) - if err != nil { - return nil, err - } - obj.Add(fmt.Sprintf("f%d", i+1), j) - } - ret.Append(tree.NewDJSON(obj.Build())) - } - return ret, nil - }, - }), - "crdb_internal.json_to_pb": makeBuiltin( jsonProps(), tree.Overload{ diff --git a/pkg/sql/sem/builtins/fixed_oids.go b/pkg/sql/sem/builtins/fixed_oids.go index ef0419c05cf5..17b7aac29027 100644 --- a/pkg/sql/sem/builtins/fixed_oids.go +++ b/pkg/sql/sem/builtins/fixed_oids.go @@ -2864,7 +2864,7 @@ var builtinOidsArray = []string{ 2909: `crdb_internal.clear_statement_hints_cache() -> void`, 2910: `crdb_internal.await_statement_hints_cache() -> void`, 2911: `crdb_internal.create_queue_feed(queue_name: string) -> void`, - 2912: `crdb_internal.select_from_queue_feed(queue_name: string, limit: int) -> jsonb[]`, + 2912: `crdb_internal.select_from_queue_feed(queue_name: string, limit: int) -> jsonb`, } var builtinOidsBySignature map[string]oid.Oid diff --git a/pkg/sql/sem/builtins/generator_builtins.go b/pkg/sql/sem/builtins/generator_builtins.go index 5c45bbc6ad7d..48642ebd64e2 100644 --- a/pkg/sql/sem/builtins/generator_builtins.go +++ b/pkg/sql/sem/builtins/generator_builtins.go @@ -9,6 +9,7 @@ import ( "bytes" "context" gojson "encoding/json" + "fmt" "math/rand" "sort" "strconv" @@ -141,6 +142,19 @@ var generators = map[string]builtinDefinition{ volatility.Stable, ), ), + "crdb_internal.select_from_queue_feed": makeBuiltin( + genProps(), + makeGeneratorOverload( + tree.ParamTypes{ + {Name: "queue_name", Typ: types.String}, + {Name: "limit", Typ: types.Int}, + }, + queueFeedGeneratorType, + 
makeQueueFeedGenerator, + "Returns rows from a queue feed", + volatility.Volatile, + ), + ), "crdb_internal.scan": makeBuiltin( tree.FunctionProperties{ Category: builtinconstants.CategoryGenerator, @@ -4350,3 +4364,77 @@ func (g *txnDiagnosticsRequestGenerator) Values() (tree.Datums, error) { // Close implements the eval.ValueGenerator interface. func (g *txnDiagnosticsRequestGenerator) Close(ctx context.Context) { } + +type queueFeedGenerator struct { + queueName string + limit int + evalCtx *eval.Context + rows []tree.Datums + rowIdx int +} + +var queueFeedGeneratorType = types.Jsonb + +func makeQueueFeedGenerator( + ctx context.Context, evalCtx *eval.Context, args tree.Datums, +) (eval.ValueGenerator, error) { + queueName := string(tree.MustBeDString(args[0])) + limit := int(tree.MustBeDInt(args[1])) + return &queueFeedGenerator{ + queueName: queueName, + limit: limit, + evalCtx: evalCtx, + rowIdx: -1, + }, nil +} + +// ResolvedType implements the eval.ValueGenerator interface. +func (g *queueFeedGenerator) ResolvedType() *types.T { + return queueFeedGeneratorType +} + +// Start implements the eval.ValueGenerator interface. +func (g *queueFeedGenerator) Start(ctx context.Context, txn *kv.Txn) error { + // Ignoring queue_name for now; we only support one queue. Same for limit. + // TODO(queuefeed): support multiple queues and limit. + qr, err := getQueueManager(g.evalCtx).GetOrInitReader(ctx, g.queueName) + if err != nil { + return err + } + + // Attach commit hook to txn to confirm receipt + // or something... todo on rollback/abort. + txn.AddCommitTrigger(func(ctx context.Context) { + qr.ConfirmReceipt(ctx) + }) + + rows, err := qr.GetRows(ctx, g.limit) + if err != nil { + return err + } + g.rows = rows + return nil +} + +// Next implements the eval.ValueGenerator interface. +func (g *queueFeedGenerator) Next(ctx context.Context) (bool, error) { + g.rowIdx++ + return g.rowIdx < len(g.rows), nil +} + +// Values implements the eval.ValueGenerator interface. +func (g *queueFeedGenerator) Values() (tree.Datums, error) { + row := g.rows[g.rowIdx] + obj := json.NewObjectBuilder(len(row)) + for i, d := range row { + j, err := tree.AsJSON(d, g.evalCtx.SessionData().DataConversionConfig, g.evalCtx.GetLocation()) + if err != nil { + return nil, err + } + obj.Add(fmt.Sprintf("f%d", i+1), j) + } + return tree.Datums{tree.NewDJSON(obj.Build())}, nil +} + +// Close implements the eval.ValueGenerator interface. 
+func (g *queueFeedGenerator) Close(ctx context.Context) {} From 492de9ee535da67e739b3b705d2eca29b669f96f Mon Sep 17 00:00:00 2001 From: Jeff Swenson Date: Mon, 10 Nov 2025 11:26:04 -0500 Subject: [PATCH 09/46] queuefeed: add wrapper for partitions table --- pkg/BUILD.bazel | 2 + pkg/sql/queuefeed/BUILD.bazel | 32 +++- pkg/sql/queuefeed/main_test.go | 26 +++ pkg/sql/queuefeed/partitions.go | 196 +++++++++++++++++++++ pkg/sql/queuefeed/partitions_test.go | 176 ++++++++++++++++++ pkg/sql/sem/builtins/generator_builtins.go | 6 +- 6 files changed, 436 insertions(+), 2 deletions(-) create mode 100644 pkg/sql/queuefeed/main_test.go create mode 100644 pkg/sql/queuefeed/partitions.go create mode 100644 pkg/sql/queuefeed/partitions_test.go diff --git a/pkg/BUILD.bazel b/pkg/BUILD.bazel index 356f53e56013..2b88bd4e27ac 100644 --- a/pkg/BUILD.bazel +++ b/pkg/BUILD.bazel @@ -586,6 +586,7 @@ ALL_TESTS = [ "//pkg/sql/privilege:privilege_test", "//pkg/sql/protoreflect:protoreflect_test", "//pkg/sql/querycache:querycache_test", + "//pkg/sql/queuefeed:queuefeed_test", "//pkg/sql/randgen:randgen_test", "//pkg/sql/regions:regions_test", "//pkg/sql/row:row_disallowed_imports_test", @@ -2229,6 +2230,7 @@ GO_TARGETS = [ "//pkg/sql/querycache:querycache", "//pkg/sql/querycache:querycache_test", "//pkg/sql/queuefeed:queuefeed", + "//pkg/sql/queuefeed:queuefeed_test", "//pkg/sql/randgen:randgen", "//pkg/sql/randgen:randgen_test", "//pkg/sql/rangeprober:range_prober", diff --git a/pkg/sql/queuefeed/BUILD.bazel b/pkg/sql/queuefeed/BUILD.bazel index 2c4c2a552578..1c43ee5718e2 100644 --- a/pkg/sql/queuefeed/BUILD.bazel +++ b/pkg/sql/queuefeed/BUILD.bazel @@ -1,9 +1,10 @@ -load("@io_bazel_rules_go//go:def.bzl", "go_library") +load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") go_library( name = "queuefeed", srcs = [ "manager.go", + "partitions.go", "reader.go", ], importpath = "github.com/cockroachdb/cockroach/pkg/sql/queuefeed", @@ -11,11 +12,40 @@ go_library( deps = [ "//pkg/kv/kvclient/rangefeed", "//pkg/kv/kvpb", + "//pkg/roachpb", "//pkg/sql/isql", "//pkg/sql/sem/tree", + "//pkg/sql/sqlliveness", "//pkg/util/hlc", "//pkg/util/syncutil", "//pkg/util/timeutil", + "//pkg/util/uuid", "@com_github_cockroachdb_errors//:errors", ], ) + +go_test( + name = "queuefeed_test", + srcs = [ + "main_test.go", + "partitions_test.go", + ], + embed = [":queuefeed"], + deps = [ + "//pkg/base", + "//pkg/roachpb", + "//pkg/security/securityassets", + "//pkg/security/securitytest", + "//pkg/server", + "//pkg/sql/isql", + "//pkg/sql/sqlliveness", + "//pkg/testutils/serverutils", + "//pkg/testutils/sqlutils", + "//pkg/testutils/testcluster", + "//pkg/util/leaktest", + "//pkg/util/log", + "//pkg/util/randutil", + "//pkg/util/uuid", + "@com_github_stretchr_testify//require", + ], +) diff --git a/pkg/sql/queuefeed/main_test.go b/pkg/sql/queuefeed/main_test.go new file mode 100644 index 000000000000..a0065c2e5693 --- /dev/null +++ b/pkg/sql/queuefeed/main_test.go @@ -0,0 +1,26 @@ +// Copyright 2024 The Cockroach Authors. +// +// Use of this software is governed by the CockroachDB Software License +// included in the /LICENSE file. 
+ +package queuefeed_test + +import ( + "os" + "testing" + + "github.com/cockroachdb/cockroach/pkg/security/securityassets" + "github.com/cockroachdb/cockroach/pkg/security/securitytest" + "github.com/cockroachdb/cockroach/pkg/server" + "github.com/cockroachdb/cockroach/pkg/testutils/serverutils" + "github.com/cockroachdb/cockroach/pkg/testutils/testcluster" + "github.com/cockroachdb/cockroach/pkg/util/randutil" +) + +func TestMain(m *testing.M) { + securityassets.SetLoader(securitytest.EmbeddedAssets) + randutil.SeedForTests() + serverutils.InitTestServerFactory(server.TestServerFactory) + serverutils.InitTestClusterFactory(testcluster.TestClusterFactory) + os.Exit(m.Run()) +} diff --git a/pkg/sql/queuefeed/partitions.go b/pkg/sql/queuefeed/partitions.go new file mode 100644 index 000000000000..a9bc0f0b8e99 --- /dev/null +++ b/pkg/sql/queuefeed/partitions.go @@ -0,0 +1,196 @@ +package queuefeed + +import ( + "context" + "fmt" + + "github.com/cockroachdb/cockroach/pkg/roachpb" + "github.com/cockroachdb/cockroach/pkg/sql/isql" + "github.com/cockroachdb/cockroach/pkg/sql/sem/tree" + "github.com/cockroachdb/cockroach/pkg/sql/sqlliveness" + "github.com/cockroachdb/cockroach/pkg/util/uuid" +) + +type Partition struct { + // ID is the `partition_id` column in the queue partition table. + ID int64 + // Session is the `user_session` and `sql_liveness_session` assigned to this + // partition. + Session Session + // Successor is the `user_session_successor` and + // `sql_liveness_session_successor` assigned to the partition. + Successor Session + // Span is decoded from the `partition_spec` column. + Span *roachpb.Span +} + +type partitionTable struct { + db isql.DB + queueName string +} + +func (p *partitionTable) CreateSchema(ctx context.Context, txn isql.Txn) error { + _, err := txn.Exec(ctx, "create-partition-table", txn.KV(), + fmt.Sprintf(`CREATE TABLE IF NOT EXISTS defaultdb.queue_partition_%s ( + partition_id BIGSERIAL PRIMARY KEY, + sql_liveness_session UUID, + user_session UUID, + sql_liveness_session_successor UUID, + user_session_successor UUID, + partition_spec BYTES + )`, p.queueName)) + return err +} + +func (p *partitionTable) ListPartitions(ctx context.Context, txn isql.Txn) ([]Partition, error) { + rows, err := txn.QueryBuffered(ctx, "list-partitions", txn.KV(), fmt.Sprintf(` + SELECT + partition_id, + sql_liveness_session, + user_session, + sql_liveness_session_successor, + user_session_successor, + partition_spec + FROM defaultdb.queue_partition_%s`, p.queueName)) + if err != nil { + return nil, err + } + + partitions := make([]Partition, len(rows)) + for i, row := range rows { + var session, successor Session + if !(row[1] == tree.DNull || row[2] == tree.DNull) { + session = Session{ + LivenessID: sqlliveness.SessionID(tree.MustBeDUuid(row[1]).UUID.GetBytes()), + ConnectionID: tree.MustBeDUuid(row[2]).UUID, + } + } + if !(row[3] == tree.DNull || row[4] == tree.DNull) { + successor = Session{ + LivenessID: sqlliveness.SessionID(tree.MustBeDUuid(row[3]).UUID.GetBytes()), + ConnectionID: tree.MustBeDUuid(row[4]).UUID, + } + } + + span, err := decodeSpan([]byte(*row[5].(*tree.DBytes))) + if err != nil { + return nil, err + } + + partitions[i] = Partition{ + ID: int64(tree.MustBeDInt(row[0])), + Session: session, + Successor: successor, + Span: span, + } + } + + return partitions, nil +} + +func (p *partitionTable) InsertPartition( + ctx context.Context, txn isql.Txn, partition Partition, +) error { + var sessionLivenessID, sessionConnectionID interface{} + var successorLivenessID, 
successorConnectionID interface{} + + if !partition.Session.Empty() { + sessionLivenessID = []byte(partition.Session.LivenessID) + sessionConnectionID = partition.Session.ConnectionID + } else { + sessionLivenessID = nil + sessionConnectionID = nil + } + + if !partition.Successor.Empty() { + successorLivenessID = []byte(partition.Successor.LivenessID) + successorConnectionID = partition.Successor.ConnectionID + } else { + successorLivenessID = nil + successorConnectionID = nil + } + + spanBytes := encodeSpan(partition.Span) + + _, err := txn.Exec(ctx, "insert-partition", txn.KV(), + fmt.Sprintf(`INSERT INTO defaultdb.queue_partition_%s + (partition_id, sql_liveness_session, user_session, sql_liveness_session_successor, user_session_successor, partition_spec) + VALUES ($1, $2, $3, $4, $5, $6)`, p.queueName), + partition.ID, sessionLivenessID, sessionConnectionID, + successorLivenessID, successorConnectionID, spanBytes) + + return err +} + +func (p *partitionTable) UpdatePartition( + ctx context.Context, txn isql.Txn, partition Partition, +) error { + var sessionLivenessID, sessionConnectionID interface{} + var successorLivenessID, successorConnectionID interface{} + + if !partition.Session.Empty() { + sessionLivenessID = []byte(partition.Session.LivenessID) + sessionConnectionID = partition.Session.ConnectionID + } else { + sessionLivenessID = nil + sessionConnectionID = nil + } + + if !partition.Successor.Empty() { + successorLivenessID = []byte(partition.Successor.LivenessID) + successorConnectionID = partition.Successor.ConnectionID + } else { + successorLivenessID = nil + successorConnectionID = nil + } + + spanBytes := encodeSpan(partition.Span) + + _, err := txn.Exec(ctx, "update-partition", txn.KV(), + fmt.Sprintf(`UPDATE defaultdb.queue_partition_%s + SET sql_liveness_session = $2, + user_session = $3, + sql_liveness_session_successor = $4, + user_session_successor = $5, + partition_spec = $6 + WHERE partition_id = $1`, p.queueName), + partition.ID, sessionLivenessID, sessionConnectionID, + successorLivenessID, successorConnectionID, spanBytes) + + return err +} + +func (p *Partition) Empty() bool { + return p.ID == 0 +} + +type Session struct { + // ConnectionID is the ID of the underlying connection. + ConnectionID uuid.UUID + // LivenessID is the session ID for the server. Its used to identify sessions + // that belong to dead sql servers. 
+ LivenessID sqlliveness.SessionID +} + +func (s *Session) Empty() bool { + return s.ConnectionID == uuid.Nil && s.LivenessID == "" +} + +func decodeSpan(data []byte) (*roachpb.Span, error) { + var span roachpb.Span + if err := span.Unmarshal(data); err != nil { + return nil, err + } + return &span, nil +} + +func encodeSpan(span *roachpb.Span) []byte { + if span == nil { + return nil + } + data, err := span.Marshal() + if err != nil { + return nil + } + return data +} diff --git a/pkg/sql/queuefeed/partitions_test.go b/pkg/sql/queuefeed/partitions_test.go new file mode 100644 index 000000000000..391d250b8738 --- /dev/null +++ b/pkg/sql/queuefeed/partitions_test.go @@ -0,0 +1,176 @@ +package queuefeed + +import ( + "context" + "testing" + + "github.com/cockroachdb/cockroach/pkg/base" + "github.com/cockroachdb/cockroach/pkg/roachpb" + "github.com/cockroachdb/cockroach/pkg/sql/isql" + "github.com/cockroachdb/cockroach/pkg/sql/sqlliveness" + "github.com/cockroachdb/cockroach/pkg/testutils/serverutils" + "github.com/cockroachdb/cockroach/pkg/testutils/sqlutils" + "github.com/cockroachdb/cockroach/pkg/util/leaktest" + "github.com/cockroachdb/cockroach/pkg/util/log" + "github.com/cockroachdb/cockroach/pkg/util/uuid" + "github.com/stretchr/testify/require" +) + +func TestListPartitions(t *testing.T) { + defer leaktest.AfterTest(t)() + defer log.Scope(t).Close(t) + + ctx := context.Background() + srv, sqlDB, _ := serverutils.StartServer(t, base.TestServerArgs{}) + defer srv.Stopper().Stop(ctx) + + db := srv.ApplicationLayer().InternalDB().(isql.DB) + sqlRunner := sqlutils.MakeSQLRunner(sqlDB) + queueName := "test" + + pt := &partitionTable{db: db, queueName: queueName} + + // Create table + err := db.Txn(ctx, func(ctx context.Context, txn isql.Txn) error { + return pt.CreateSchema(ctx, txn) + }) + require.NoError(t, err) + + // Test empty + err = db.Txn(ctx, func(ctx context.Context, txn isql.Txn) error { + partitions, err := pt.ListPartitions(ctx, txn) + require.NoError(t, err) + require.Empty(t, partitions) + return nil + }) + require.NoError(t, err) + + // Insert one partition + sessionID := uuid.MakeV4() + connectionID := uuid.MakeV4() + span := &roachpb.Span{Key: roachpb.Key("a"), EndKey: roachpb.Key("z")} + spanBytes, _ := span.Marshal() + + sqlRunner.Exec(t, ` + INSERT INTO defaultdb.queue_partition_`+queueName+` + (partition_id, sql_liveness_session, user_session, partition_spec) + VALUES (1, $1, $2, $3)`, sessionID, connectionID, spanBytes) + + // Test with data + err = db.Txn(ctx, func(ctx context.Context, txn isql.Txn) error { + partitions, err := pt.ListPartitions(ctx, txn) + require.NoError(t, err) + require.Len(t, partitions, 1) + require.Equal(t, int64(1), partitions[0].ID) + require.Equal(t, connectionID, partitions[0].Session.ConnectionID) + return nil + }) + require.NoError(t, err) +} + +func TestUpdatePartition(t *testing.T) { + defer leaktest.AfterTest(t)() + defer log.Scope(t).Close(t) + + ctx := context.Background() + srv, sqlDB, _ := serverutils.StartServer(t, base.TestServerArgs{}) + defer srv.Stopper().Stop(ctx) + + db := srv.ApplicationLayer().InternalDB().(isql.DB) + sqlRunner := sqlutils.MakeSQLRunner(sqlDB) + queueName := "test" + + pt := &partitionTable{db: db, queueName: queueName} + + // Create table + err := db.Txn(ctx, func(ctx context.Context, txn isql.Txn) error { + return pt.CreateSchema(ctx, txn) + }) + require.NoError(t, err) + + // Insert initial partition + sqlRunner.Exec(t, ` + INSERT INTO defaultdb.queue_partition_`+queueName+` (partition_id) VALUES (1)`) 
+ + // Update the partition + newSessionID := uuid.MakeV4() + newConnectionID := uuid.MakeV4() + span := &roachpb.Span{Key: roachpb.Key("new"), EndKey: roachpb.Key("span")} + + partition := Partition{ + ID: 1, + Session: Session{ + LivenessID: sqlliveness.SessionID(newSessionID.GetBytes()), + ConnectionID: newConnectionID, + }, + Span: span, + } + + err = db.Txn(ctx, func(ctx context.Context, txn isql.Txn) error { + return pt.UpdatePartition(ctx, txn, partition) + }) + require.NoError(t, err) + + // Verify update worked + err = db.Txn(ctx, func(ctx context.Context, txn isql.Txn) error { + partitions, err := pt.ListPartitions(ctx, txn) + require.NoError(t, err) + require.Len(t, partitions, 1) + require.Equal(t, newConnectionID, partitions[0].Session.ConnectionID) + require.Equal(t, span.Key, partitions[0].Span.Key) + return nil + }) + require.NoError(t, err) +} + +func TestInsertPartition(t *testing.T) { + defer leaktest.AfterTest(t)() + defer log.Scope(t).Close(t) + + ctx := context.Background() + srv, _, _ := serverutils.StartServer(t, base.TestServerArgs{}) + defer srv.Stopper().Stop(ctx) + + db := srv.ApplicationLayer().InternalDB().(isql.DB) + queueName := "test" + + pt := &partitionTable{db: db, queueName: queueName} + + // Create table + err := db.Txn(ctx, func(ctx context.Context, txn isql.Txn) error { + return pt.CreateSchema(ctx, txn) + }) + require.NoError(t, err) + + // Insert partition + sessionID := uuid.MakeV4() + connectionID := uuid.MakeV4() + span := &roachpb.Span{Key: roachpb.Key("a"), EndKey: roachpb.Key("z")} + + partition := Partition{ + ID: 1, + Session: Session{ + LivenessID: sqlliveness.SessionID(sessionID.GetBytes()), + ConnectionID: connectionID, + }, + Span: span, + } + + err = db.Txn(ctx, func(ctx context.Context, txn isql.Txn) error { + return pt.InsertPartition(ctx, txn, partition) + }) + require.NoError(t, err) + + // Verify insertion worked + err = db.Txn(ctx, func(ctx context.Context, txn isql.Txn) error { + partitions, err := pt.ListPartitions(ctx, txn) + require.NoError(t, err) + require.Len(t, partitions, 1) + require.Equal(t, int64(1), partitions[0].ID) + require.Equal(t, connectionID, partitions[0].Session.ConnectionID) + require.Equal(t, span.Key, partitions[0].Span.Key) + require.Equal(t, span.EndKey, partitions[0].Span.EndKey) + return nil + }) + require.NoError(t, err) +} diff --git a/pkg/sql/sem/builtins/generator_builtins.go b/pkg/sql/sem/builtins/generator_builtins.go index 48642ebd64e2..3617b7431945 100644 --- a/pkg/sql/sem/builtins/generator_builtins.go +++ b/pkg/sql/sem/builtins/generator_builtins.go @@ -45,6 +45,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/util/envutil" "github.com/cockroachdb/cockroach/pkg/util/json" jsonpath "github.com/cockroachdb/cockroach/pkg/util/jsonpath/eval" + "github.com/cockroachdb/cockroach/pkg/util/log" "github.com/cockroachdb/cockroach/pkg/util/mon" "github.com/cockroachdb/cockroach/pkg/util/randident" "github.com/cockroachdb/cockroach/pkg/util/randident/randidentcfg" @@ -4405,7 +4406,10 @@ func (g *queueFeedGenerator) Start(ctx context.Context, txn *kv.Txn) error { // Attach commit hook to txn to confirm receipt // or something... todo on rollback/abort. txn.AddCommitTrigger(func(ctx context.Context) { - qr.ConfirmReceipt(ctx) + // TODO(queuefeed): handle error properly. 
+ if err := qr.ConfirmReceipt(ctx); err != nil { + log.Dev.Errorf(ctx, "error confirming receipt of queue %s: %v", g.queueName, err) + } }) rows, err := qr.GetRows(ctx, g.limit) From e3703c3a9bcd5c75af294c22c9e7e8671611f7f6 Mon Sep 17 00:00:00 2001 From: Aerin Freilich Date: Mon, 10 Nov 2025 15:34:09 -0500 Subject: [PATCH 10/46] add a table to track existing queues and add test --- pkg/sql/queuefeed/BUILD.bazel | 2 ++ pkg/sql/queuefeed/manager.go | 51 +++++++++++++++++++++++++++--- pkg/sql/queuefeed/manager_test.go | 37 ++++++++++++++++++++++ pkg/sql/queuefeed/reader.go | 7 ++-- pkg/sql/sem/builtins/builtins.go | 6 ++-- pkg/sql/sem/builtins/fixed_oids.go | 2 +- 6 files changed, 95 insertions(+), 10 deletions(-) create mode 100644 pkg/sql/queuefeed/manager_test.go diff --git a/pkg/sql/queuefeed/BUILD.bazel b/pkg/sql/queuefeed/BUILD.bazel index 1c43ee5718e2..de6119a4f586 100644 --- a/pkg/sql/queuefeed/BUILD.bazel +++ b/pkg/sql/queuefeed/BUILD.bazel @@ -15,6 +15,7 @@ go_library( "//pkg/roachpb", "//pkg/sql/isql", "//pkg/sql/sem/tree", + "//pkg/sql/sessiondata", "//pkg/sql/sqlliveness", "//pkg/util/hlc", "//pkg/util/syncutil", @@ -29,6 +30,7 @@ go_test( srcs = [ "main_test.go", "partitions_test.go", + "manager_test.go", ], embed = [":queuefeed"], deps = [ diff --git a/pkg/sql/queuefeed/manager.go b/pkg/sql/queuefeed/manager.go index 68d19c548349..f544a979c7b9 100644 --- a/pkg/sql/queuefeed/manager.go +++ b/pkg/sql/queuefeed/manager.go @@ -6,6 +6,9 @@ import ( "fmt" "github.com/cockroachdb/cockroach/pkg/sql/isql" + "github.com/cockroachdb/cockroach/pkg/sql/sem/tree" + "github.com/cockroachdb/cockroach/pkg/sql/sessiondata" + "github.com/cockroachdb/errors" ) // watch queue partition table @@ -40,10 +43,28 @@ CREATE TABLE IF NOT EXISTS defaultdb.queue_cursor_%s ( cursor bytea )` +const createQueueTableSQL = ` +CREATE TABLE IF NOT EXISTS defaultdb.queue_feeds ( + queue_feed_name STRING PRIMARY KEY, + table_desc_id INT8 NOT NULL +)` + +const insertQueueFeedSQL = ` +INSERT INTO defaultdb.queue_feeds (queue_feed_name, table_desc_id) VALUES ($1, $2) +` + +const fetchQueueFeedSQL = ` +SELECT table_desc_id FROM defaultdb.queue_feeds WHERE queue_feed_name = $1 +` + // should take a txn -func (m *Manager) CreateQueueTables(ctx context.Context, queueName string) error { +func (m *Manager) CreateQueue(ctx context.Context, queueName string, tableDescID int64) error { return m.executor.Txn(ctx, func(ctx context.Context, txn isql.Txn) error { - _, err := txn.Exec(ctx, "create_qp", txn.KV(), fmt.Sprintf(createQueuePartitionTableSQL, queueName)) + _, err := txn.Exec(ctx, "create_q", txn.KV(), createQueueTableSQL) + if err != nil { + return err + } + _, err = txn.Exec(ctx, "create_qp", txn.KV(), fmt.Sprintf(createQueuePartitionTableSQL, queueName)) if err != nil { return err } @@ -51,16 +72,38 @@ func (m *Manager) CreateQueueTables(ctx context.Context, queueName string) error if err != nil { return err } + // TODO(queuefeed): add validation on the table descriptor id + _, err = txn.Exec(ctx, "insert_q", txn.KV(), insertQueueFeedSQL, queueName, tableDescID) + if err != nil { + return err + } return nil }) } func (m *Manager) GetOrInitReader(ctx context.Context, name string) (*Reader, error) { - err := m.CreateQueueTables(ctx, name) + var tableDescID int64 + err := m.executor.Txn(ctx, func(ctx context.Context, txn isql.Txn) error { + _, err := txn.Exec(ctx, "create_q", txn.KV(), createQueueTableSQL) + if err != nil { + return err + } + vals, err := txn.QueryRowEx(ctx, "fetch_q", txn.KV(), + 
sessiondata.NodeUserSessionDataOverride, fetchQueueFeedSQL, name) + if err != nil { + return err + } + if len(vals) == 0 { + return errors.Errorf("queue feed not found") + } + tableDescID = int64(tree.MustBeDInt(vals[0])) + return nil + }) if err != nil { return nil, err } - return NewReader(ctx, m.executor, m, name), nil + reader := NewReader(ctx, m.executor, m, name, tableDescID) + return reader, nil } func (m *Manager) reassessAssignments(ctx context.Context, name string) {} diff --git a/pkg/sql/queuefeed/manager_test.go b/pkg/sql/queuefeed/manager_test.go new file mode 100644 index 000000000000..fcb0502668ff --- /dev/null +++ b/pkg/sql/queuefeed/manager_test.go @@ -0,0 +1,37 @@ +package queuefeed + +import ( + "context" + "testing" + + "github.com/cockroachdb/cockroach/pkg/base" + "github.com/cockroachdb/cockroach/pkg/sql/isql" + "github.com/cockroachdb/cockroach/pkg/testutils/serverutils" + "github.com/cockroachdb/cockroach/pkg/util/leaktest" + "github.com/cockroachdb/cockroach/pkg/util/log" + "github.com/stretchr/testify/require" +) + +func TestFeedCreation(t *testing.T) { + defer leaktest.AfterTest(t)() + defer log.Scope(t).Close(t) + + ctx := context.Background() + srv, _, _ := serverutils.StartServer(t, base.TestServerArgs{}) + defer srv.Stopper().Stop(ctx) + + db := srv.ApplicationLayer().InternalDB().(isql.DB) + // expect an error when trying to read from a queue that doesn't exist + qm := NewManager(db) + _, err := qm.GetOrInitReader(context.Background(), "test") + require.ErrorContains(t, err, "queue feed not found") + + // expect no error when creating a queue + require.NoError(t, qm.CreateQueue(context.Background(), "test", 104)) + + // now we can read from the queue + reader, err := qm.GetOrInitReader(context.Background(), "test") + require.NoError(t, err) + require.NotNil(t, reader) + reader.cancel() +} diff --git a/pkg/sql/queuefeed/reader.go b/pkg/sql/queuefeed/reader.go index da155c668770..f996b61ad227 100644 --- a/pkg/sql/queuefeed/reader.go +++ b/pkg/sql/queuefeed/reader.go @@ -39,7 +39,7 @@ type Reader struct { cancel context.CancelFunc } -func NewReader(ctx context.Context, executor isql.DB, mgr *Manager, name string) *Reader { +func NewReader(ctx context.Context, executor isql.DB, mgr *Manager, name string, tableDescID int64) *Reader { buf := []tree.Datums{ {tree.NewDString("1"), tree.NewDString("2"), tree.NewDString("3")}, {tree.NewDString("4"), tree.NewDString("5"), tree.NewDString("6")}, @@ -58,7 +58,8 @@ func NewReader(ctx context.Context, executor isql.DB, mgr *Manager, name string) ctx, cancel := context.WithCancel(ctx) r.cancel = cancel - // TODO(queuefeed): Re-enable once queue data table and spans are implemented + // TODO(queuefeed): Re-enable once queue data table and spans are implemented. + // We will use the table descriptor id to set up a rangefeed on the table. 
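+	// (Editorial sketch, inferred from a later patch in this series: the
+	// rangefeed span would cover the table's keyspace, i.e.
+	// Key = codec.TablePrefix(tableID) and EndKey = Key.PrefixEnd().)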
// r.setupRangefeed(ctx) go r.run(ctx) return r @@ -119,7 +120,7 @@ func (r *Reader) run(ctx context.Context) { ctx, cancel := context.WithCancel(ctx) defer cancel() - // state machine + // state machine // - for { diff --git a/pkg/sql/sem/builtins/builtins.go b/pkg/sql/sem/builtins/builtins.go index 9be6268f81ab..ca5d744f5bc5 100644 --- a/pkg/sql/sem/builtins/builtins.go +++ b/pkg/sql/sem/builtins/builtins.go @@ -4646,13 +4646,15 @@ value if you rely on the HLC for accuracy.`, "crdb_internal.create_queue_feed": makeBuiltin(defProps(), tree.Overload{ Types: tree.ParamTypes{ {Name: "queue_name", Typ: types.String}, + {Name: "table_descriptor_id", Typ: types.Int}, }, Volatility: volatility.Volatile, ReturnType: tree.FixedReturnType(types.Void), Fn: func(ctx context.Context, evalCtx *eval.Context, args tree.Datums) (tree.Datum, error) { qn := args[0].(*tree.DString) - err := getQueueManager(evalCtx).CreateQueueTables(ctx, string(*qn)) - if err != nil { + qm := getQueueManager(evalCtx) + tID := args[1].(*tree.DInt) + if err := qm.CreateQueue(ctx, string(*qn), int64(*tID)); err != nil { return nil, err } return tree.DVoidDatum, nil diff --git a/pkg/sql/sem/builtins/fixed_oids.go b/pkg/sql/sem/builtins/fixed_oids.go index 17b7aac29027..ad1a3ac8fc62 100644 --- a/pkg/sql/sem/builtins/fixed_oids.go +++ b/pkg/sql/sem/builtins/fixed_oids.go @@ -2863,7 +2863,7 @@ var builtinOidsArray = []string{ 2908: `crdb_internal.inject_hint(statement_fingerprint: string, donor_sql: string) -> int`, 2909: `crdb_internal.clear_statement_hints_cache() -> void`, 2910: `crdb_internal.await_statement_hints_cache() -> void`, - 2911: `crdb_internal.create_queue_feed(queue_name: string) -> void`, + 2911: `crdb_internal.create_queue_feed(queue_name: string, table_descriptor_id: int) -> void`, 2912: `crdb_internal.select_from_queue_feed(queue_name: string, limit: int) -> jsonb`, } From 45e9f7fc89d5550c94b91861d9532545db86545d Mon Sep 17 00:00:00 2001 From: Aerin Freilich Date: Mon, 10 Nov 2025 11:34:34 -0500 Subject: [PATCH 11/46] read from rangefeed --- pkg/BUILD.bazel | 1 + pkg/server/server_sql.go | 4 +- pkg/sql/BUILD.bazel | 1 + pkg/sql/exec_util.go | 3 +- pkg/sql/queuefeed/manager.go | 22 +- pkg/sql/queuefeed/queuebase/BUILD.bazel | 9 + pkg/sql/queuefeed/queuebase/queuebase.go | 17 ++ pkg/sql/queuefeed/reader.go | 315 ++++++++++++++++++++--- pkg/sql/sem/builtins/BUILD.bazel | 2 +- pkg/sql/sem/builtins/builtins.go | 52 +++- 10 files changed, 378 insertions(+), 48 deletions(-) create mode 100644 pkg/sql/queuefeed/queuebase/BUILD.bazel create mode 100644 pkg/sql/queuefeed/queuebase/queuebase.go diff --git a/pkg/BUILD.bazel b/pkg/BUILD.bazel index 2b88bd4e27ac..946325097036 100644 --- a/pkg/BUILD.bazel +++ b/pkg/BUILD.bazel @@ -2229,6 +2229,7 @@ GO_TARGETS = [ "//pkg/sql/protoreflect:protoreflect_test", "//pkg/sql/querycache:querycache", "//pkg/sql/querycache:querycache_test", + "//pkg/sql/queuefeed/queuebase:queuebase", "//pkg/sql/queuefeed:queuefeed", "//pkg/sql/queuefeed:queuefeed_test", "//pkg/sql/randgen:randgen", diff --git a/pkg/server/server_sql.go b/pkg/server/server_sql.go index 8b81a1154e84..3b835783e67c 100644 --- a/pkg/server/server_sql.go +++ b/pkg/server/server_sql.go @@ -1065,7 +1065,7 @@ func newSQLServer(ctx context.Context, cfg sqlServerArgs) (*SQLServer, error) { TenantReadOnly: cfg.SQLConfig.TenantReadOnly, CidrLookup: cfg.BaseConfig.CidrLookup, LicenseEnforcer: cfg.SQLConfig.LicenseEnforcer, - QueueManager: queuefeed.NewManager(cfg.internalDB), + QueueManager: queuefeed.NewManager(cfg.internalDB, 
cfg.rangeFeedFactory, execCfg.Codec, leaseMgr), } if codec.ForSystemTenant() { @@ -1462,7 +1462,7 @@ func newSQLServer(ctx context.Context, cfg sqlServerArgs) (*SQLServer, error) { internalDBMemMonitor: internalDBMonitor, upgradeManager: upgradeMgr, serviceMode: cfg.serviceMode, - queueManager: queuefeed.NewManager(cfg.internalDB), + queueManager: queuefeed.NewManager(cfg.internalDB, cfg.rangeFeedFactory, execCfg.Codec, leaseMgr), }, nil } diff --git a/pkg/sql/BUILD.bazel b/pkg/sql/BUILD.bazel index 2e1063aede4b..018afe3a1e7c 100644 --- a/pkg/sql/BUILD.bazel +++ b/pkg/sql/BUILD.bazel @@ -479,6 +479,7 @@ go_library( "//pkg/sql/protoreflect", "//pkg/sql/querycache", "//pkg/sql/queuefeed", + "//pkg/sql/queuefeed/queuebase", "//pkg/sql/regionliveness", "//pkg/sql/regions", "//pkg/sql/rolemembershipcache", diff --git a/pkg/sql/exec_util.go b/pkg/sql/exec_util.go index d42a998b1545..dc587401fc38 100644 --- a/pkg/sql/exec_util.go +++ b/pkg/sql/exec_util.go @@ -82,6 +82,7 @@ import ( plpgsqlparser "github.com/cockroachdb/cockroach/pkg/sql/plpgsql/parser" "github.com/cockroachdb/cockroach/pkg/sql/querycache" "github.com/cockroachdb/cockroach/pkg/sql/queuefeed" + "github.com/cockroachdb/cockroach/pkg/sql/queuefeed/queuebase" "github.com/cockroachdb/cockroach/pkg/sql/rolemembershipcache" "github.com/cockroachdb/cockroach/pkg/sql/rowenc" "github.com/cockroachdb/cockroach/pkg/sql/rowinfra" @@ -1856,7 +1857,7 @@ type ExecutorConfig struct { QueueManager *queuefeed.Manager } -func (cfg *ExecutorConfig) GetQueueManager() *queuefeed.Manager { +func (cfg *ExecutorConfig) GetQueueManager() queuebase.Manager { return cfg.QueueManager } diff --git a/pkg/sql/queuefeed/manager.go b/pkg/sql/queuefeed/manager.go index f544a979c7b9..863ca64b3803 100644 --- a/pkg/sql/queuefeed/manager.go +++ b/pkg/sql/queuefeed/manager.go @@ -5,7 +5,11 @@ import ( "context" "fmt" + "github.com/cockroachdb/cockroach/pkg/keys" + "github.com/cockroachdb/cockroach/pkg/kv/kvclient/rangefeed" + "github.com/cockroachdb/cockroach/pkg/sql/catalog/lease" "github.com/cockroachdb/cockroach/pkg/sql/isql" + "github.com/cockroachdb/cockroach/pkg/sql/queuefeed/queuebase" "github.com/cockroachdb/cockroach/pkg/sql/sem/tree" "github.com/cockroachdb/cockroach/pkg/sql/sessiondata" "github.com/cockroachdb/errors" @@ -15,12 +19,15 @@ import ( // and create it too?? 
type Manager struct { executor isql.DB + rff *rangefeed.Factory + codec keys.SQLCodec + leaseMgr *lease.Manager } -func NewManager(executor isql.DB) *Manager { +func NewManager(executor isql.DB, rff *rangefeed.Factory, codec keys.SQLCodec, leaseMgr *lease.Manager) *Manager { // setup rangefeed on partitions table (/poll) // handle handoff from one server to another - return &Manager{executor: executor} + return &Manager{executor: executor, rff: rff, codec: codec, leaseMgr: leaseMgr} } const createQueuePartitionTableSQL = ` @@ -81,7 +88,8 @@ func (m *Manager) CreateQueue(ctx context.Context, queueName string, tableDescID }) } -func (m *Manager) GetOrInitReader(ctx context.Context, name string) (*Reader, error) { +func (m *Manager) GetOrInitReader(ctx context.Context, name string) (queuebase.Reader, error) { + // TODO: get if exists already var tableDescID int64 err := m.executor.Txn(ctx, func(ctx context.Context, txn isql.Txn) error { _, err := txn.Exec(ctx, "create_q", txn.KV(), createQueueTableSQL) @@ -102,10 +110,14 @@ func (m *Manager) GetOrInitReader(ctx context.Context, name string) (*Reader, er if err != nil { return nil, err } - reader := NewReader(ctx, m.executor, m, name, tableDescID) + reader := NewReader(ctx, m.executor, m, m.rff, m.codec, m.leaseMgr, name, tableDescID) return reader, nil } -func (m *Manager) reassessAssignments(ctx context.Context, name string) {} +func (m *Manager) reassessAssignments(ctx context.Context, name string) (bool, error) { + return false, nil +} + +var _ queuebase.Manager = &Manager{} type PartitionAssignment struct{} diff --git a/pkg/sql/queuefeed/queuebase/BUILD.bazel b/pkg/sql/queuefeed/queuebase/BUILD.bazel new file mode 100644 index 000000000000..c36da12750bf --- /dev/null +++ b/pkg/sql/queuefeed/queuebase/BUILD.bazel @@ -0,0 +1,9 @@ +load("@io_bazel_rules_go//go:def.bzl", "go_library") + +go_library( + name = "queuebase", + srcs = ["queuebase.go"], + importpath = "github.com/cockroachdb/cockroach/pkg/sql/queuefeed/queuebase", + visibility = ["//visibility:public"], + deps = ["//pkg/sql/sem/tree"], +) diff --git a/pkg/sql/queuefeed/queuebase/queuebase.go b/pkg/sql/queuefeed/queuebase/queuebase.go new file mode 100644 index 000000000000..c4336e37c2d2 --- /dev/null +++ b/pkg/sql/queuefeed/queuebase/queuebase.go @@ -0,0 +1,17 @@ +package queuebase + +import ( + "context" + + "github.com/cockroachdb/cockroach/pkg/sql/sem/tree" +) + +type Manager interface { + GetOrInitReader(ctx context.Context, name string) (Reader, error) + CreateQueue(ctx context.Context, name string, tableID int64) error +} + +type Reader interface { + GetRows(ctx context.Context, limit int) ([]tree.Datums, error) + ConfirmReceipt(ctx context.Context) +} diff --git a/pkg/sql/queuefeed/reader.go b/pkg/sql/queuefeed/reader.go index f996b61ad227..00556aeb7db0 100644 --- a/pkg/sql/queuefeed/reader.go +++ b/pkg/sql/queuefeed/reader.go @@ -3,22 +3,38 @@ package queuefeed import ( "context" "fmt" + "slices" + "sync" + "github.com/cockroachdb/cockroach/pkg/ccl/changefeedccl/changefeedbase" + "github.com/cockroachdb/cockroach/pkg/keys" "github.com/cockroachdb/cockroach/pkg/kv/kvclient/rangefeed" "github.com/cockroachdb/cockroach/pkg/kv/kvpb" + "github.com/cockroachdb/cockroach/pkg/roachpb" + "github.com/cockroachdb/cockroach/pkg/sql/catalog" + "github.com/cockroachdb/cockroach/pkg/sql/catalog/descpb" + "github.com/cockroachdb/cockroach/pkg/sql/catalog/fetchpb" + "github.com/cockroachdb/cockroach/pkg/sql/catalog/lease" "github.com/cockroachdb/cockroach/pkg/sql/isql" + 
"github.com/cockroachdb/cockroach/pkg/sql/queuefeed/queuebase" + "github.com/cockroachdb/cockroach/pkg/sql/row" + "github.com/cockroachdb/cockroach/pkg/sql/rowenc" "github.com/cockroachdb/cockroach/pkg/sql/sem/tree" + "github.com/cockroachdb/cockroach/pkg/util" "github.com/cockroachdb/cockroach/pkg/util/hlc" "github.com/cockroachdb/cockroach/pkg/util/syncutil" "github.com/cockroachdb/cockroach/pkg/util/timeutil" "github.com/cockroachdb/errors" ) +const maxBufSize = 1000 + type readerState int const ( - readerStateIdle readerState = iota + readerStateBatching readerState = iota readerStateHasUncommittedBatch + readerStateCheckingForReassignment ) // has rangefeed on data. reads from it. handles handoff @@ -29,34 +45,47 @@ type Reader struct { mgr *Manager name string + // stuff for decoding data. this is ripped from rowfetcher_cache.go in changefeeds + codec keys.SQLCodec + leaseMgr *lease.Manager + mu struct { syncutil.Mutex state readerState buf []tree.Datums inflightBuffer []tree.Datums + poppedWakeup *sync.Cond + pushedWakeup *sync.Cond } - cancel context.CancelFunc -} + triggerCheckForReassignment chan struct{} -func NewReader(ctx context.Context, executor isql.DB, mgr *Manager, name string, tableDescID int64) *Reader { - buf := []tree.Datums{ - {tree.NewDString("1"), tree.NewDString("2"), tree.NewDString("3")}, - {tree.NewDString("4"), tree.NewDString("5"), tree.NewDString("6")}, - {tree.NewDString("7"), tree.NewDString("8"), tree.NewDString("9")}, - } + cancel context.CancelCauseFunc +} +func NewReader(ctx context.Context, executor isql.DB, mgr *Manager, rff *rangefeed.Factory, codec keys.SQLCodec, leaseMgr *lease.Manager, name string, tableDescID int64) *Reader { r := &Reader{ - executor: executor, - mgr: mgr, - name: name, + executor: executor, + mgr: mgr, + codec: codec, + leaseMgr: leaseMgr, + name: name, + rff: rff, + triggerCheckForReassignment: make(chan struct{}), } - r.mu.state = readerStateIdle - r.mu.buf = buf - r.mu.inflightBuffer = make([]tree.Datums, 0) + r.mu.state = readerStateBatching + r.mu.buf = make([]tree.Datums, 0, maxBufSize) + r.mu.poppedWakeup = sync.NewCond(&r.mu.Mutex) + r.mu.pushedWakeup = sync.NewCond(&r.mu.Mutex) - ctx, cancel := context.WithCancel(ctx) - r.cancel = cancel + ctx = context.TODO() // the context passed in here is canceled while the commit hook is running. so we need a longer-lived one for the persistent stuff. + // TODO: pass in from manager init + ctx, cancel := context.WithCancelCause(ctx) + r.cancel = func(cause error) { + cancel(cause) + fmt.Printf("canceling with cause: %s\n", cause) + r.mu.poppedWakeup.Broadcast() + } // TODO(queuefeed): Re-enable once queue data table and spans are implemented. // We will use the table descriptor id to set up a rangefeed on the table. 
@@ -66,18 +95,31 @@ func NewReader(ctx context.Context, executor isql.DB, mgr *Manager, name string, } func (r *Reader) setupRangefeed(ctx context.Context) { + defer func() { + fmt.Println("setupRangefeed done") + }() + incomingResolveds := make(chan hlc.Timestamp) - setErr := func(err error) { r.cancel() } + setErr := func(err error) { r.cancel(err) } onValue := func(ctx context.Context, value *kvpb.RangeFeedValue) { + fmt.Printf("onValue: %+v\n", value) r.mu.Lock() defer r.mu.Unlock() - if len(r.mu.buf) > 100 { - // TODO: wait for rows to be read before adding more + // wait for rows to be read before adding more, if necessary + for ctx.Err() == nil && len(r.mu.buf) > maxBufSize { + r.mu.poppedWakeup.Wait() + } + + datums, err := r.decodeRangefeedValue(ctx, value) + if err != nil { + setErr(err) + return } - // TODO: decode value.Value - r.mu.buf = append(r.mu.buf, tree.Datums{tree.DVoidDatum}) + r.mu.buf = append(r.mu.buf, datums) + r.mu.pushedWakeup.Broadcast() + fmt.Printf("onValue done with buf len: %d\n", len(r.mu.buf)) } // setup rangefeed on data opts := []rangefeed.Option{ @@ -105,7 +147,25 @@ func (r *Reader) setupRangefeed(ctx context.Context) { rf := r.rff.New( fmt.Sprintf("queuefeed.reader.name=%s", r.name), initialTS, onValue, opts..., ) - defer rf.Close() + + // get desc for table + desc, err := r.leaseMgr.AcquireByName(ctx, lease.TimestampToReadTimestamp(initialTS), 100, 101, "t") + if err != nil { + setErr(err) + return + } + defer desc.Release(ctx) + + // TODO: why are we given a zero codec? + r.codec = keys.MakeSQLCodec(roachpb.SystemTenantID) + + spans := []roachpb.Span{desc.Underlying().(catalog.TableDescriptor).TableSpan(r.codec)} + if err := rf.Start(ctx, spans); err != nil { + setErr(err) + return + } + _ = rf + // TODO: rf.Close() on close } @@ -117,53 +177,236 @@ func (r *Reader) setupRangefeed(ctx context.Context) { // - [ ] gonna need some way to clean stuff up on conn_executor.close() func (r *Reader) run(ctx context.Context) { - ctx, cancel := context.WithCancel(ctx) - defer cancel() - - // state machine - // - + defer func() { + fmt.Println("run done") + }() for { select { case <-ctx.Done(): + fmt.Printf("run: ctx done: %s\n", ctx.Err(), context.Cause(ctx)) return + case <-r.triggerCheckForReassignment: + fmt.Printf("triggerCheckForReassignment\n") + if err := r.checkForReassignment(ctx); err != nil { + r.cancel(err) + return + } } } } func (r *Reader) GetRows(ctx context.Context, limit int) ([]tree.Datums, error) { - r.mu.Lock() + fmt.Printf("GetRows start\n") + + r.mu.Lock() // cant get this lock? defer r.mu.Unlock() - if r.mu.state != readerStateIdle { + if r.mu.state != readerStateBatching { return nil, errors.New("reader not idle") } if len(r.mu.inflightBuffer) > 0 { return nil, errors.AssertionFailedf("getrows called with nonempty inflight buffer") } + if len(r.mu.buf) == 0 { + fmt.Printf("GetRows called with empty buf. waiting for pushedWakeup\n") + for ctx.Err() == nil && len(r.mu.buf) == 0 { + r.mu.pushedWakeup.Wait() + } + } + if limit > len(r.mu.buf) { limit = len(r.mu.buf) } + fmt.Printf("GetRows called with limit: %d, buf len: %d\n", limit, len(r.mu.buf)) + r.mu.inflightBuffer = append(r.mu.inflightBuffer, r.mu.buf[0:limit]...) r.mu.buf = r.mu.buf[limit:] - r.mu.inflightBuffer = append(r.mu.inflightBuffer, r.mu.buf...) 
- clear(r.mu.buf) r.mu.state = readerStateHasUncommittedBatch - return r.mu.inflightBuffer, nil + + r.mu.poppedWakeup.Broadcast() + + fmt.Printf("GetRows done with inflightBuffer len: %d, buf len: %d\n", len(r.mu.inflightBuffer), len(r.mu.buf)) + + return slices.Clone(r.mu.inflightBuffer), nil // and then trigger the goro to check if m wants us to change assignments // if it does, handle that stuff before doing a new batch } -func (r *Reader) ConfirmReceipt(ctx context.Context) error { +func (r *Reader) ConfirmReceipt(ctx context.Context) { + func() { + r.mu.Lock() + defer r.mu.Unlock() + + fmt.Printf("confirming receipt with inflightBuffer len: %d\n", len(r.mu.inflightBuffer)) + + clear(r.mu.inflightBuffer) + r.mu.state = readerStateCheckingForReassignment + }() + + select { + case <-ctx.Done(): + return + case r.triggerCheckForReassignment <- struct{}{}: + } +} + +func (r *Reader) checkForReassignment(ctx context.Context) error { + defer func() { + fmt.Println("checkForReassignment done") + }() + r.mu.Lock() defer r.mu.Unlock() - r.mu.state = readerStateIdle - clear(r.mu.inflightBuffer) + if r.mu.state != readerStateCheckingForReassignment { + return errors.AssertionFailedf("reader not in checking for reassignment state") + } + change, err := r.mgr.reassessAssignments(ctx, r.name) + if err != nil { + return errors.Wrap(err, "reassessing assignments") + } + if change { + fmt.Println("TODO: reassignment detected. lets do something about it") + } + r.mu.state = readerStateBatching return nil } + +// TODO: this is all highly sus +func (r *Reader) decodeRangefeedValue(ctx context.Context, rfv *kvpb.RangeFeedValue) (tree.Datums, error) { + key, value := rfv.Key, rfv.Value + key, err := r.codec.StripTenantPrefix(key) + if err != nil { + return nil, errors.Wrapf(err, "stripping tenant prefix: %s", keys.PrettyPrint(nil, key)) + } + + _, tableID, _, err := rowenc.DecodePartialTableIDIndexID(key) + if err != nil { + return nil, errors.Wrapf(err, "decoding partial table id index id: %s", keys.PrettyPrint(nil, key)) + } + tableDesc, err := r.fetchTableDesc(ctx, tableID, value.Timestamp) + if err != nil { + return nil, errors.Wrapf(err, "fetching table descriptor: %s", keys.PrettyPrint(nil, key)) + } + familyDesc, err := catalog.MustFindFamilyByID(tableDesc, 0) + if err != nil { + return nil, errors.Wrapf(err, "fetching family descriptor: %s", keys.PrettyPrint(nil, key)) + } + cols, err := getRelevantColumnsForFamily(tableDesc, familyDesc) + if err != nil { + return nil, errors.Wrapf(err, "getting relevant columns for family: %s", keys.PrettyPrint(nil, key)) + } + + var spec fetchpb.IndexFetchSpec + if err := rowenc.InitIndexFetchSpec(&spec, r.codec, tableDesc, tableDesc.GetPrimaryIndex(), cols); err != nil { + return nil, errors.Wrapf(err, "initializing index fetch spec: %s", keys.PrettyPrint(nil, key)) + } + rf := row.Fetcher{} + if err := rf.Init(ctx, row.FetcherInitArgs{ + Spec: &spec, + WillUseKVProvider: true, + TraceKV: true, + TraceKVEvery: &util.EveryN{N: 1}, + }); err != nil { + return nil, errors.Wrapf(err, "initializing row fetcher: %s", keys.PrettyPrint(nil, key)) + } + kvProvider := row.KVProvider{KVs: []roachpb.KeyValue{{Key: key, Value: value}}} + if err := rf.ConsumeKVProvider(ctx, &kvProvider); err != nil { + return nil, errors.Wrapf(err, "consuming kv provider: %s", keys.PrettyPrint(nil, key)) + } + encDatums, _, err := rf.NextRow(ctx) + if err != nil { + return nil, errors.Wrapf(err, "fetching next row: %s", keys.PrettyPrint(nil, key)) + } + _ = encDatums + + datums := 
make(tree.Datums, len(cols)) + for i, colID := range cols { + col, err := catalog.MustFindColumnByID(tableDesc, colID) + if err != nil { + return nil, errors.Wrapf(err, "finding column by id: %s", colID) + } + ed := encDatums[i] + if err := ed.EnsureDecoded(col.ColumnDesc().Type, &tree.DatumAlloc{}); err != nil { + return nil, errors.Wrapf(err, "error decoding column %q as type %s", col.ColumnDesc().Name, col.ColumnDesc().Type.String()) + } + datums[i] = ed.Datum + } + return datums, nil +} + +func (r *Reader) fetchTableDesc( + ctx context.Context, tableID descpb.ID, ts hlc.Timestamp, +) (catalog.TableDescriptor, error) { + // Retrieve the target TableDescriptor from the lease manager. No caching + // is attempted because the lease manager does its own caching. + desc, err := r.leaseMgr.Acquire(ctx, lease.TimestampToReadTimestamp(ts), tableID) + if err != nil { + // Manager can return all kinds of errors during chaos, but based on + // its usage, none of them should ever be terminal. + return nil, changefeedbase.MarkRetryableError(err) + } + tableDesc := desc.Underlying().(catalog.TableDescriptor) + // Immediately release the lease, since we only need it for the exact + // timestamp requested. + desc.Release(ctx) + if tableDesc.MaybeRequiresTypeHydration() { + return nil, errors.AssertionFailedf("type hydration not supported yet") + } + return tableDesc, nil +} + +var _ queuebase.Reader = &Reader{} + +func getRelevantColumnsForFamily( + tableDesc catalog.TableDescriptor, familyDesc *descpb.ColumnFamilyDescriptor, +) ([]descpb.ColumnID, error) { + cols := tableDesc.GetPrimaryIndex().CollectKeyColumnIDs() + for _, colID := range familyDesc.ColumnIDs { + cols.Add(colID) + } + + // Maintain the ordering of tableDesc.PublicColumns(), which is + // matches the order of columns in the SQL table. + idx := 0 + result := make([]descpb.ColumnID, cols.Len()) + visibleColumns := tableDesc.PublicColumns() + if tableDesc.GetDeclarativeSchemaChangerState() != nil { + hasMergedIndex := catalog.HasDeclarativeMergedPrimaryIndex(tableDesc) + visibleColumns = make([]catalog.Column, 0, cols.Len()) + for _, col := range tableDesc.AllColumns() { + if col.Adding() { + continue + } + if tableDesc.GetDeclarativeSchemaChangerState() == nil && !col.Public() { + continue + } + if col.Dropped() && (!col.WriteAndDeleteOnly() || hasMergedIndex) { + continue + } + visibleColumns = append(visibleColumns, col) + } + // Recover the order of the original columns. + slices.SortStableFunc(visibleColumns, func(a, b catalog.Column) int { + return int(a.GetPGAttributeNum()) - int(b.GetPGAttributeNum()) + }) + } + for _, col := range visibleColumns { + colID := col.GetID() + if cols.Contains(colID) { + result[idx] = colID + idx++ + } + } + + // Some columns in familyDesc.ColumnIDs may not be public, so + // result may contain fewer columns than cols. 
+ result = result[:idx] + return result, nil +} diff --git a/pkg/sql/sem/builtins/BUILD.bazel b/pkg/sql/sem/builtins/BUILD.bazel index 3eafd3e78622..98598ebe3ba2 100644 --- a/pkg/sql/sem/builtins/BUILD.bazel +++ b/pkg/sql/sem/builtins/BUILD.bazel @@ -78,7 +78,7 @@ go_library( "//pkg/sql/pgwire/pgnotice", "//pkg/sql/privilege", "//pkg/sql/protoreflect", - "//pkg/sql/queuefeed", + "//pkg/sql/queuefeed/queuebase", "//pkg/sql/rowenc", "//pkg/sql/rowenc/keyside", "//pkg/sql/rowenc/valueside", diff --git a/pkg/sql/sem/builtins/builtins.go b/pkg/sql/sem/builtins/builtins.go index ca5d744f5bc5..982bb2c639f0 100644 --- a/pkg/sql/sem/builtins/builtins.go +++ b/pkg/sql/sem/builtins/builtins.go @@ -55,7 +55,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgnotice" "github.com/cockroachdb/cockroach/pkg/sql/privilege" "github.com/cockroachdb/cockroach/pkg/sql/protoreflect" - "github.com/cockroachdb/cockroach/pkg/sql/queuefeed" + "github.com/cockroachdb/cockroach/pkg/sql/queuefeed/queuebase" "github.com/cockroachdb/cockroach/pkg/sql/rowenc" "github.com/cockroachdb/cockroach/pkg/sql/rowenc/keyside" "github.com/cockroachdb/cockroach/pkg/sql/sem/asof" @@ -4661,6 +4661,52 @@ value if you rely on the HLC for accuracy.`, }, }), + "crdb_internal.select_from_queue_feed": makeBuiltin(defProps(), tree.Overload{ + Types: tree.ParamTypes{ + {Name: "queue_name", Typ: types.String}, + {Name: "limit", Typ: types.Int}, + }, + Volatility: volatility.Volatile, + ReturnType: tree.FixedReturnType(types.MakeArray(types.Json)), + Fn: func(ctx context.Context, evalCtx *eval.Context, args tree.Datums) (tree.Datum, error) { + var err error + // ignore queue_name for now; we only support one queue + // ditto limit lol + qn := args[0].(*tree.DString) + qr, err := getQueueManager(evalCtx).GetOrInitReader(ctx, string(*qn)) + if err != nil { + return nil, err + } + // attach commit hook to txn to confirm receipt + txn := evalCtx.Txn + + ret := tree.NewDArray(types.Json) + + rowResult, err := qr.GetRows(ctx, int(tree.MustBeDInt(args[1]))) + if err != nil { + return nil, err + } + // or something... 
todo on rollback/abort + txn.AddCommitTrigger(func(ctx context.Context) { + qr.ConfirmReceipt(ctx) + }) + + for _, row := range rowResult { + obj := json.NewObjectBuilder(len(row)) + for i, d := range row { + fmt.Printf("d: %#+v\n", d) + j, err := tree.AsJSON(d, evalCtx.SessionData().DataConversionConfig, evalCtx.GetLocation()) + if err != nil { + return nil, err + } + obj.Add(fmt.Sprintf("f%d", i+1), j) + } + ret.Append(tree.NewDJSON(obj.Build())) + } + return ret, nil + }, + }), + "crdb_internal.json_to_pb": makeBuiltin( jsonProps(), tree.Overload{ @@ -12866,6 +12912,6 @@ func exprSliceToStrSlice(exprs []tree.Expr) []string { var nilRegionsError = errors.AssertionFailedf("evalCtx.Regions is nil") -func getQueueManager(evalCtx *eval.Context) *queuefeed.Manager { - return evalCtx.Planner.ExecutorConfig().(interface{ GetQueueManager() *queuefeed.Manager }).GetQueueManager() +func getQueueManager(evalCtx *eval.Context) queuebase.Manager { + return evalCtx.Planner.ExecutorConfig().(interface{ GetQueueManager() queuebase.Manager }).GetQueueManager() } From 1cba809e38850d0e952cea19aaac9743de95e766 Mon Sep 17 00:00:00 2001 From: Miles Frankel Date: Tue, 11 Nov 2025 03:01:31 +0000 Subject: [PATCH 12/46] fix build --- pkg/sql/queuefeed/BUILD.bazel | 12 +++++++++++- pkg/sql/sem/builtins/builtins.go | 2 +- pkg/sql/sem/builtins/fixed_oids.go | 2 +- pkg/sql/sem/builtins/generator_builtins.go | 5 +---- 4 files changed, 14 insertions(+), 7 deletions(-) diff --git a/pkg/sql/queuefeed/BUILD.bazel b/pkg/sql/queuefeed/BUILD.bazel index de6119a4f586..b597dba856f6 100644 --- a/pkg/sql/queuefeed/BUILD.bazel +++ b/pkg/sql/queuefeed/BUILD.bazel @@ -10,13 +10,23 @@ go_library( importpath = "github.com/cockroachdb/cockroach/pkg/sql/queuefeed", visibility = ["//visibility:public"], deps = [ + "//pkg/ccl/changefeedccl/changefeedbase", + "//pkg/keys", "//pkg/kv/kvclient/rangefeed", "//pkg/kv/kvpb", "//pkg/roachpb", + "//pkg/sql/catalog", + "//pkg/sql/catalog/descpb", + "//pkg/sql/catalog/fetchpb", + "//pkg/sql/catalog/lease", "//pkg/sql/isql", + "//pkg/sql/queuefeed/queuebase", + "//pkg/sql/row", + "//pkg/sql/rowenc", "//pkg/sql/sem/tree", "//pkg/sql/sessiondata", "//pkg/sql/sqlliveness", + "//pkg/util", "//pkg/util/hlc", "//pkg/util/syncutil", "//pkg/util/timeutil", @@ -29,8 +39,8 @@ go_test( name = "queuefeed_test", srcs = [ "main_test.go", - "partitions_test.go", "manager_test.go", + "partitions_test.go", ], embed = [":queuefeed"], deps = [ diff --git a/pkg/sql/sem/builtins/builtins.go b/pkg/sql/sem/builtins/builtins.go index 982bb2c639f0..2ed6a50806d5 100644 --- a/pkg/sql/sem/builtins/builtins.go +++ b/pkg/sql/sem/builtins/builtins.go @@ -4661,7 +4661,7 @@ value if you rely on the HLC for accuracy.`, }, }), - "crdb_internal.select_from_queue_feed": makeBuiltin(defProps(), tree.Overload{ + "crdb_internal.select_array_from_queue_feed": makeBuiltin(defProps(), tree.Overload{ Types: tree.ParamTypes{ {Name: "queue_name", Typ: types.String}, {Name: "limit", Typ: types.Int}, diff --git a/pkg/sql/sem/builtins/fixed_oids.go b/pkg/sql/sem/builtins/fixed_oids.go index ad1a3ac8fc62..a5c3071f8078 100644 --- a/pkg/sql/sem/builtins/fixed_oids.go +++ b/pkg/sql/sem/builtins/fixed_oids.go @@ -2864,7 +2864,7 @@ var builtinOidsArray = []string{ 2909: `crdb_internal.clear_statement_hints_cache() -> void`, 2910: `crdb_internal.await_statement_hints_cache() -> void`, 2911: `crdb_internal.create_queue_feed(queue_name: string, table_descriptor_id: int) -> void`, - 2912: `crdb_internal.select_from_queue_feed(queue_name: string, limit: 
int) -> jsonb`, + 2912: `crdb_internal.select_array_from_queue_feed(queue_name: string, limit: int) -> jsonb`, } var builtinOidsBySignature map[string]oid.Oid diff --git a/pkg/sql/sem/builtins/generator_builtins.go b/pkg/sql/sem/builtins/generator_builtins.go index 3617b7431945..aa7cc3ec6f15 100644 --- a/pkg/sql/sem/builtins/generator_builtins.go +++ b/pkg/sql/sem/builtins/generator_builtins.go @@ -45,7 +45,6 @@ import ( "github.com/cockroachdb/cockroach/pkg/util/envutil" "github.com/cockroachdb/cockroach/pkg/util/json" jsonpath "github.com/cockroachdb/cockroach/pkg/util/jsonpath/eval" - "github.com/cockroachdb/cockroach/pkg/util/log" "github.com/cockroachdb/cockroach/pkg/util/mon" "github.com/cockroachdb/cockroach/pkg/util/randident" "github.com/cockroachdb/cockroach/pkg/util/randident/randidentcfg" @@ -4407,9 +4406,7 @@ func (g *queueFeedGenerator) Start(ctx context.Context, txn *kv.Txn) error { // or something... todo on rollback/abort. txn.AddCommitTrigger(func(ctx context.Context) { // TODO(queuefeed): handle error properly. - if err := qr.ConfirmReceipt(ctx); err != nil { - log.Dev.Errorf(ctx, "error confirming receipt of queue %s: %v", g.queueName, err) - } + qr.ConfirmReceipt(ctx) }) rows, err := qr.GetRows(ctx, g.limit) From 0824bcb7b3b500be0c7933bb309e520da25887e1 Mon Sep 17 00:00:00 2001 From: Miles Frankel Date: Tue, 11 Nov 2025 03:18:19 +0000 Subject: [PATCH 13/46] restore functionality --- pkg/sql/queuefeed/manager.go | 1 + pkg/sql/queuefeed/reader.go | 13 +++++++++++-- pkg/sql/sem/builtins/builtins.go | 3 +-- pkg/sql/sem/builtins/fixed_oids.go | 3 ++- 4 files changed, 15 insertions(+), 5 deletions(-) diff --git a/pkg/sql/queuefeed/manager.go b/pkg/sql/queuefeed/manager.go index 863ca64b3803..a9cfd8056ad0 100644 --- a/pkg/sql/queuefeed/manager.go +++ b/pkg/sql/queuefeed/manager.go @@ -110,6 +110,7 @@ func (m *Manager) GetOrInitReader(ctx context.Context, name string) (queuebase.R if err != nil { return nil, err } + fmt.Printf("get or init reader for queue %s with table desc id: %d\n", name, tableDescID) reader := NewReader(ctx, m.executor, m, m.rff, m.codec, m.leaseMgr, name, tableDescID) return reader, nil } diff --git a/pkg/sql/queuefeed/reader.go b/pkg/sql/queuefeed/reader.go index 00556aeb7db0..8ba906602a69 100644 --- a/pkg/sql/queuefeed/reader.go +++ b/pkg/sql/queuefeed/reader.go @@ -44,6 +44,7 @@ type Reader struct { rff *rangefeed.Factory mgr *Manager name string + tableID descpb.ID // stuff for decoding data. this is ripped from rowfetcher_cache.go in changefeeds codec keys.SQLCodec @@ -71,6 +72,7 @@ func NewReader(ctx context.Context, executor isql.DB, mgr *Manager, rff *rangefe leaseMgr: leaseMgr, name: name, rff: rff, + tableID: descpb.ID(tableDescID), triggerCheckForReassignment: make(chan struct{}), } r.mu.state = readerStateBatching @@ -89,7 +91,7 @@ func NewReader(ctx context.Context, executor isql.DB, mgr *Manager, rff *rangefe // TODO(queuefeed): Re-enable once queue data table and spans are implemented. // We will use the table descriptor id to set up a rangefeed on the table. - // r.setupRangefeed(ctx) + r.setupRangefeed(ctx) go r.run(ctx) return r } @@ -159,7 +161,14 @@ func (r *Reader) setupRangefeed(ctx context.Context) { // TODO: why are we given a zero codec? 
r.codec = keys.MakeSQLCodec(roachpb.SystemTenantID) - spans := []roachpb.Span{desc.Underlying().(catalog.TableDescriptor).TableSpan(r.codec)} + tk := roachpb.Span{ + Key: r.codec.TablePrefix(uint32(r.tableID)), + } + tk.EndKey = tk.Key.PrefixEnd() + spans := []roachpb.Span{tk} + + fmt.Printf("starting rangefeed with spans: %+v\n", spans) + if err := rf.Start(ctx, spans); err != nil { setErr(err) return diff --git a/pkg/sql/sem/builtins/builtins.go b/pkg/sql/sem/builtins/builtins.go index 2ed6a50806d5..b40e45b3c138 100644 --- a/pkg/sql/sem/builtins/builtins.go +++ b/pkg/sql/sem/builtins/builtins.go @@ -4678,7 +4678,6 @@ value if you rely on the HLC for accuracy.`, return nil, err } // attach commit hook to txn to confirm receipt - txn := evalCtx.Txn ret := tree.NewDArray(types.Json) @@ -4687,7 +4686,7 @@ value if you rely on the HLC for accuracy.`, return nil, err } // or something... todo on rollback/abort - txn.AddCommitTrigger(func(ctx context.Context) { + evalCtx.Txn.AddCommitTrigger(func(ctx context.Context) { qr.ConfirmReceipt(ctx) }) diff --git a/pkg/sql/sem/builtins/fixed_oids.go b/pkg/sql/sem/builtins/fixed_oids.go index a5c3071f8078..ccaf8e3e2fe7 100644 --- a/pkg/sql/sem/builtins/fixed_oids.go +++ b/pkg/sql/sem/builtins/fixed_oids.go @@ -2864,7 +2864,8 @@ var builtinOidsArray = []string{ 2909: `crdb_internal.clear_statement_hints_cache() -> void`, 2910: `crdb_internal.await_statement_hints_cache() -> void`, 2911: `crdb_internal.create_queue_feed(queue_name: string, table_descriptor_id: int) -> void`, - 2912: `crdb_internal.select_array_from_queue_feed(queue_name: string, limit: int) -> jsonb`, + 2912: `crdb_internal.select_from_queue_feed(queue_name: string, limit: int) -> jsonb`, + 2913: `crdb_internal.select_array_from_queue_feed(queue_name: string, limit: int) -> jsonb[]`, } var builtinOidsBySignature map[string]oid.Oid From 4cb10debdc3634d536ff219f895277d43a9a5fc7 Mon Sep 17 00:00:00 2001 From: Miles Frankel Date: Tue, 11 Nov 2025 03:21:21 +0000 Subject: [PATCH 14/46] fix test --- pkg/sql/queuefeed/manager_test.go | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/pkg/sql/queuefeed/manager_test.go b/pkg/sql/queuefeed/manager_test.go index fcb0502668ff..639c4c514b92 100644 --- a/pkg/sql/queuefeed/manager_test.go +++ b/pkg/sql/queuefeed/manager_test.go @@ -5,10 +5,13 @@ import ( "testing" "github.com/cockroachdb/cockroach/pkg/base" + "github.com/cockroachdb/cockroach/pkg/kv/kvclient/rangefeed" + "github.com/cockroachdb/cockroach/pkg/sql/catalog/lease" "github.com/cockroachdb/cockroach/pkg/sql/isql" "github.com/cockroachdb/cockroach/pkg/testutils/serverutils" "github.com/cockroachdb/cockroach/pkg/util/leaktest" "github.com/cockroachdb/cockroach/pkg/util/log" + "github.com/cockroachdb/errors" "github.com/stretchr/testify/require" ) @@ -22,7 +25,7 @@ func TestFeedCreation(t *testing.T) { db := srv.ApplicationLayer().InternalDB().(isql.DB) // expect an error when trying to read from a queue that doesn't exist - qm := NewManager(db) + qm := NewManager(db, srv.RangeFeedFactory().(*rangefeed.Factory), srv.Codec(), srv.ApplicationLayer().LeaseManager().(*lease.Manager)) _, err := qm.GetOrInitReader(context.Background(), "test") require.ErrorContains(t, err, "queue feed not found") @@ -33,5 +36,5 @@ func TestFeedCreation(t *testing.T) { reader, err := qm.GetOrInitReader(context.Background(), "test") require.NoError(t, err) require.NotNil(t, reader) - reader.cancel() + reader.(*Reader).cancel(errors.New("test shutdown")) } From 
cfc38aa2debbbd6b67e54e21872285a1539d08cc Mon Sep 17 00:00:00 2001 From: Miles Frankel Date: Tue, 11 Nov 2025 03:24:46 +0000 Subject: [PATCH 15/46] cleanup --- pkg/sql/queuefeed/reader.go | 2 -- 1 file changed, 2 deletions(-) diff --git a/pkg/sql/queuefeed/reader.go b/pkg/sql/queuefeed/reader.go index 8ba906602a69..c05e87fd0f3e 100644 --- a/pkg/sql/queuefeed/reader.go +++ b/pkg/sql/queuefeed/reader.go @@ -89,8 +89,6 @@ func NewReader(ctx context.Context, executor isql.DB, mgr *Manager, rff *rangefe r.mu.poppedWakeup.Broadcast() } - // TODO(queuefeed): Re-enable once queue data table and spans are implemented. - // We will use the table descriptor id to set up a rangefeed on the table. r.setupRangefeed(ctx) go r.run(ctx) return r From d110c5d8bb8b0272c3a227ac3876d3831fa4a813 Mon Sep 17 00:00:00 2001 From: Miles Frankel Date: Tue, 11 Nov 2025 04:09:20 +0000 Subject: [PATCH 16/46] wip --- pkg/server/server_sql.go | 4 ++-- pkg/sql/exec_util.go | 8 +++++++ pkg/sql/queuefeed/manager.go | 2 +- pkg/sql/queuefeed/reader.go | 39 +++++++++++++++++++++++++------- pkg/sql/sem/builtins/builtins.go | 11 ++++----- 5 files changed, 47 insertions(+), 17 deletions(-) diff --git a/pkg/server/server_sql.go b/pkg/server/server_sql.go index 3b835783e67c..013c04890978 100644 --- a/pkg/server/server_sql.go +++ b/pkg/server/server_sql.go @@ -1065,7 +1065,7 @@ func newSQLServer(ctx context.Context, cfg sqlServerArgs) (*SQLServer, error) { TenantReadOnly: cfg.SQLConfig.TenantReadOnly, CidrLookup: cfg.BaseConfig.CidrLookup, LicenseEnforcer: cfg.SQLConfig.LicenseEnforcer, - QueueManager: queuefeed.NewManager(cfg.internalDB, cfg.rangeFeedFactory, execCfg.Codec, leaseMgr), + QueueManager: queuefeed.NewManager(ctx, cfg.internalDB, cfg.rangeFeedFactory, execCfg.Codec, leaseMgr), } if codec.ForSystemTenant() { @@ -1462,7 +1462,7 @@ func newSQLServer(ctx context.Context, cfg sqlServerArgs) (*SQLServer, error) { internalDBMemMonitor: internalDBMonitor, upgradeManager: upgradeMgr, serviceMode: cfg.serviceMode, - queueManager: queuefeed.NewManager(cfg.internalDB, cfg.rangeFeedFactory, execCfg.Codec, leaseMgr), + queueManager: queuefeed.NewManager(ctx, cfg.internalDB, cfg.rangeFeedFactory, execCfg.Codec, leaseMgr), }, nil } diff --git a/pkg/sql/exec_util.go b/pkg/sql/exec_util.go index dc587401fc38..2837a4450866 100644 --- a/pkg/sql/exec_util.go +++ b/pkg/sql/exec_util.go @@ -1861,6 +1861,14 @@ func (cfg *ExecutorConfig) GetQueueManager() queuebase.Manager { return cfg.QueueManager } +func (cfg *ExecutorConfig) GetStopper() *stop.Stopper { + return cfg.Stopper +} + +func (cfg *ExecutorConfig) GetAmbientCtx() log.AmbientContext { + return cfg.AmbientCtx +} + // UpdateVersionSystemSettingHook provides a callback that allows us // update the cluster version inside the system.settings table. 
This hook // is aimed at mainly updating tenant pods, which will currently skip over diff --git a/pkg/sql/queuefeed/manager.go b/pkg/sql/queuefeed/manager.go index a9cfd8056ad0..4f4ac6b30c1d 100644 --- a/pkg/sql/queuefeed/manager.go +++ b/pkg/sql/queuefeed/manager.go @@ -24,7 +24,7 @@ type Manager struct { leaseMgr *lease.Manager } -func NewManager(executor isql.DB, rff *rangefeed.Factory, codec keys.SQLCodec, leaseMgr *lease.Manager) *Manager { +func NewManager(_ context.Context, executor isql.DB, rff *rangefeed.Factory, codec keys.SQLCodec, leaseMgr *lease.Manager) *Manager { // setup rangefeed on partitions table (/poll) // handle handoff from one server to another return &Manager{executor: executor, rff: rff, codec: codec, leaseMgr: leaseMgr} diff --git a/pkg/sql/queuefeed/reader.go b/pkg/sql/queuefeed/reader.go index c05e87fd0f3e..c9c168bf460f 100644 --- a/pkg/sql/queuefeed/reader.go +++ b/pkg/sql/queuefeed/reader.go @@ -61,7 +61,8 @@ type Reader struct { triggerCheckForReassignment chan struct{} - cancel context.CancelCauseFunc + cancel context.CancelCauseFunc + goroCtx context.Context } func NewReader(ctx context.Context, executor isql.DB, mgr *Manager, rff *rangefeed.Factory, codec keys.SQLCodec, leaseMgr *lease.Manager, name string, tableDescID int64) *Reader { @@ -74,19 +75,20 @@ func NewReader(ctx context.Context, executor isql.DB, mgr *Manager, rff *rangefe rff: rff, tableID: descpb.ID(tableDescID), triggerCheckForReassignment: make(chan struct{}), + // stored so we can use it in methods using a different context than the main goro ie GetRows and ConfirmReceipt + goroCtx: ctx, } r.mu.state = readerStateBatching r.mu.buf = make([]tree.Datums, 0, maxBufSize) r.mu.poppedWakeup = sync.NewCond(&r.mu.Mutex) r.mu.pushedWakeup = sync.NewCond(&r.mu.Mutex) - ctx = context.TODO() // the context passed in here is canceled while the commit hook is running. so we need a longer-lived one for the persistent stuff. - // TODO: pass in from manager init ctx, cancel := context.WithCancelCause(ctx) r.cancel = func(cause error) { - cancel(cause) fmt.Printf("canceling with cause: %s\n", cause) + cancel(cause) r.mu.poppedWakeup.Broadcast() + r.mu.pushedWakeup.Broadcast() } r.setupRangefeed(ctx) @@ -114,7 +116,7 @@ func (r *Reader) setupRangefeed(ctx context.Context) { datums, err := r.decodeRangefeedValue(ctx, value) if err != nil { - setErr(err) + setErr(errors.Wrapf(err, "decoding rangefeed value: %+v", value)) return } r.mu.buf = append(r.mu.buf, datums) @@ -183,6 +185,8 @@ func (r *Reader) setupRangefeed(ctx context.Context) { // - [ ] checkpoint frontier if our frontier has advanced and we confirmed receipt // - [ ] gonna need some way to clean stuff up on conn_executor.close() +// TODO: run still shuts down with context canceled after getting rows. why? + func (r *Reader) run(ctx context.Context) { defer func() { fmt.Println("run done") @@ -191,7 +195,7 @@ func (r *Reader) run(ctx context.Context) { for { select { case <-ctx.Done(): - fmt.Printf("run: ctx done: %s\n", ctx.Err(), context.Cause(ctx)) + fmt.Printf("run: ctx done: %s; cause: %s\n", ctx.Err(), context.Cause(ctx)) return case <-r.triggerCheckForReassignment: fmt.Printf("triggerCheckForReassignment\n") @@ -206,7 +210,7 @@ func (r *Reader) run(ctx context.Context) { func (r *Reader) GetRows(ctx context.Context, limit int) ([]tree.Datums, error) { fmt.Printf("GetRows start\n") - r.mu.Lock() // cant get this lock? 
+ r.mu.Lock() defer r.mu.Unlock() if r.mu.state != readerStateBatching { @@ -218,9 +222,26 @@ func (r *Reader) GetRows(ctx context.Context, limit int) ([]tree.Datums, error) if len(r.mu.buf) == 0 { fmt.Printf("GetRows called with empty buf. waiting for pushedWakeup\n") - for ctx.Err() == nil && len(r.mu.buf) == 0 { + // shut down the reader if this ctx (which is distinct from the goro ctx) is canceled + done := make(chan struct{}) + defer close(done) + go func() { + select { + case <-ctx.Done(): + r.cancel(errors.Wrapf(context.Cause(ctx), "GetRows canceled")) + case <-done: + return + } + }() + for ctx.Err() == nil && r.goroCtx.Err() == nil && len(r.mu.buf) == 0 { r.mu.pushedWakeup.Wait() } + if ctx.Err() != nil { + return nil, errors.Wrapf(context.Cause(ctx), "GetRows canceled") + } + if r.goroCtx.Err() != nil { + return nil, errors.Wrapf(context.Cause(r.goroCtx), "reader shutting down") + } } if limit > len(r.mu.buf) { @@ -258,6 +279,8 @@ func (r *Reader) ConfirmReceipt(ctx context.Context) { select { case <-ctx.Done(): return + case <-r.goroCtx.Done(): + return case r.triggerCheckForReassignment <- struct{}{}: } } diff --git a/pkg/sql/sem/builtins/builtins.go b/pkg/sql/sem/builtins/builtins.go index b40e45b3c138..2df6bd5d15b4 100644 --- a/pkg/sql/sem/builtins/builtins.go +++ b/pkg/sql/sem/builtins/builtins.go @@ -4669,15 +4669,13 @@ value if you rely on the HLC for accuracy.`, Volatility: volatility.Volatile, ReturnType: tree.FixedReturnType(types.MakeArray(types.Json)), Fn: func(ctx context.Context, evalCtx *eval.Context, args tree.Datums) (tree.Datum, error) { - var err error - // ignore queue_name for now; we only support one queue - // ditto limit lol qn := args[0].(*tree.DString) - qr, err := getQueueManager(evalCtx).GetOrInitReader(ctx, string(*qn)) + ambientCtx := evalCtx.Planner.ExecutorConfig().(interface{ GetAmbientCtx() log.AmbientContext }).GetAmbientCtx() + bgCtx := ambientCtx.AnnotateCtx(context.Background()) + qr, err := getQueueManager(evalCtx).GetOrInitReader(bgCtx, string(*qn)) if err != nil { - return nil, err + return nil, errors.Wrapf(err, "get or init reader for queue %s", string(*qn)) } - // attach commit hook to txn to confirm receipt ret := tree.NewDArray(types.Json) @@ -4685,6 +4683,7 @@ value if you rely on the HLC for accuracy.`, if err != nil { return nil, err } + // attach commit hook to txn to confirm receipt // or something... todo on rollback/abort evalCtx.Txn.AddCommitTrigger(func(ctx context.Context) { qr.ConfirmReceipt(ctx) From 2740ef1b492d2747a25286a790c75928f712fcf5 Mon Sep 17 00:00:00 2001 From: Miles Frankel Date: Tue, 11 Nov 2025 04:21:06 +0000 Subject: [PATCH 17/46] use session context --- pkg/sql/conn_executor.go | 3 +++ pkg/sql/queuefeed/manager_test.go | 2 +- pkg/sql/sem/builtins/builtins.go | 4 +--- pkg/sql/sem/eval/context.go | 7 +++++++ 4 files changed, 12 insertions(+), 4 deletions(-) diff --git a/pkg/sql/conn_executor.go b/pkg/sql/conn_executor.go index 5d041d516b31..0a6a5b399f97 100644 --- a/pkg/sql/conn_executor.go +++ b/pkg/sql/conn_executor.go @@ -3815,6 +3815,7 @@ func bufferedWritesIsAllowedForIsolationLevel( func (ex *connExecutor) initEvalCtx(ctx context.Context, evalCtx *extendedEvalContext, p *planner) { *evalCtx = extendedEvalContext{ Context: eval.Context{ + SessionCtx: ex.ctxHolder.ctx(), Planner: p, StreamManagerFactory: p, PrivilegedAccessor: p, @@ -3910,6 +3911,8 @@ func (ex *connExecutor) GetPCRReaderTimestamp() hlc.Timestamp { // Safe for concurrent use. 
func (ex *connExecutor) resetEvalCtx(evalCtx *extendedEvalContext, txn *kv.Txn, stmtTS time.Time) { newTxn := txn == nil || evalCtx.Txn != txn + // Keep the session context up to date (accounts for session tracing hijack). + evalCtx.SessionCtx = ex.ctxHolder.ctx() evalCtx.TxnState = ex.getTransactionState() evalCtx.TxnReadOnly = ex.state.readOnly.Load() evalCtx.TxnImplicit = ex.implicitTxn() diff --git a/pkg/sql/queuefeed/manager_test.go b/pkg/sql/queuefeed/manager_test.go index 639c4c514b92..1b492e66e594 100644 --- a/pkg/sql/queuefeed/manager_test.go +++ b/pkg/sql/queuefeed/manager_test.go @@ -25,7 +25,7 @@ func TestFeedCreation(t *testing.T) { db := srv.ApplicationLayer().InternalDB().(isql.DB) // expect an error when trying to read from a queue that doesn't exist - qm := NewManager(db, srv.RangeFeedFactory().(*rangefeed.Factory), srv.Codec(), srv.ApplicationLayer().LeaseManager().(*lease.Manager)) + qm := NewManager(ctx, db, srv.RangeFeedFactory().(*rangefeed.Factory), srv.Codec(), srv.ApplicationLayer().LeaseManager().(*lease.Manager)) _, err := qm.GetOrInitReader(context.Background(), "test") require.ErrorContains(t, err, "queue feed not found") diff --git a/pkg/sql/sem/builtins/builtins.go b/pkg/sql/sem/builtins/builtins.go index 2df6bd5d15b4..c0d70bef6184 100644 --- a/pkg/sql/sem/builtins/builtins.go +++ b/pkg/sql/sem/builtins/builtins.go @@ -4670,9 +4670,7 @@ value if you rely on the HLC for accuracy.`, ReturnType: tree.FixedReturnType(types.MakeArray(types.Json)), Fn: func(ctx context.Context, evalCtx *eval.Context, args tree.Datums) (tree.Datum, error) { qn := args[0].(*tree.DString) - ambientCtx := evalCtx.Planner.ExecutorConfig().(interface{ GetAmbientCtx() log.AmbientContext }).GetAmbientCtx() - bgCtx := ambientCtx.AnnotateCtx(context.Background()) - qr, err := getQueueManager(evalCtx).GetOrInitReader(bgCtx, string(*qn)) + qr, err := getQueueManager(evalCtx).GetOrInitReader(evalCtx.SessionCtx, string(*qn)) if err != nil { return nil, errors.Wrapf(err, "get or init reader for queue %s", string(*qn)) } diff --git a/pkg/sql/sem/eval/context.go b/pkg/sql/sem/eval/context.go index 7b8fbc86495e..bcd33dc58ade 100644 --- a/pkg/sql/sem/eval/context.go +++ b/pkg/sql/sem/eval/context.go @@ -67,6 +67,13 @@ var ErrNilTxnInClusterContext = errors.New("nil txn in cluster context") // more fields from the sql package. Through that extendedEvalContext, this // struct now generally used by planNodes. type Context struct { + // SessionCtx is the session-lifetime context for the current SQL connection. + // It reflects the session's context (or the session tracing context if + // tracing is enabled) and is not cancelled at statement end. Prefer the + // statement/transaction-scoped ctx passed to functions when work must obey + // statement/txn cancellation; use SessionCtx only for work that must outlive + // the statement/txn but remain tied to the SQL session. + SessionCtx context.Context // SessionDataStack stores the session variables accessible by the correct // context. Each element on the stack represents the beginning of a new // transaction or nested transaction (savepoints). 
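Editorial aside (not part of the patch series): the patch above gives builtins a session-lifetime context (SessionCtx) alongside the usual statement-scoped context, so long-lived queue readers are not torn down when a single statement finishes. The following is a minimal, self-contained Go sketch of that pattern under assumed, made-up names (startReader and the main driver are illustrative only and are not CockroachDB APIs).

package main

import (
	"context"
	"errors"
	"fmt"
	"time"
)

// startReader launches background work bound to the session-lifetime context,
// so it keeps running after an individual statement's context is cancelled.
func startReader(sessionCtx context.Context) {
	go func() {
		<-sessionCtx.Done()
		fmt.Println("reader shut down:", context.Cause(sessionCtx))
	}()
}

func main() {
	// Session-lifetime context: cancelled only when the SQL session ends.
	sessionCtx, endSession := context.WithCancelCause(context.Background())

	// Statement-scoped context: cancelled as soon as the statement finishes.
	stmtCtx, endStmt := context.WithCancel(context.Background())
	startReader(sessionCtx) // the reader uses sessionCtx, not stmtCtx
	endStmt()               // statement ends; the reader is unaffected
	_ = stmtCtx

	endSession(errors.New("session closed"))
	time.Sleep(10 * time.Millisecond) // let the goroutine observe cancellation
}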
From 7b63782c84c702cea936c88ae7cf4c41c0dea779 Mon Sep 17 00:00:00 2001 From: Miles Frankel Date: Tue, 11 Nov 2025 04:49:07 +0000 Subject: [PATCH 18/46] fix --- pkg/sql/queuefeed/manager.go | 1 + pkg/sql/queuefeed/reader.go | 6 +++--- pkg/sql/sem/builtins/generator_builtins.go | 4 +--- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/pkg/sql/queuefeed/manager.go b/pkg/sql/queuefeed/manager.go index 4f4ac6b30c1d..232bac21acd2 100644 --- a/pkg/sql/queuefeed/manager.go +++ b/pkg/sql/queuefeed/manager.go @@ -91,6 +91,7 @@ func (m *Manager) CreateQueue(ctx context.Context, queueName string, tableDescID func (m *Manager) GetOrInitReader(ctx context.Context, name string) (queuebase.Reader, error) { // TODO: get if exists already var tableDescID int64 + // TODO: this ctx on the other hand should be stmt scoped err := m.executor.Txn(ctx, func(ctx context.Context, txn isql.Txn) error { _, err := txn.Exec(ctx, "create_q", txn.KV(), createQueueTableSQL) if err != nil { diff --git a/pkg/sql/queuefeed/reader.go b/pkg/sql/queuefeed/reader.go index c9c168bf460f..5fd68c21dd59 100644 --- a/pkg/sql/queuefeed/reader.go +++ b/pkg/sql/queuefeed/reader.go @@ -228,7 +228,7 @@ func (r *Reader) GetRows(ctx context.Context, limit int) ([]tree.Datums, error) go func() { select { case <-ctx.Done(): - r.cancel(errors.Wrapf(context.Cause(ctx), "GetRows canceled")) + r.cancel(errors.Wrapf(ctx.Err(), "GetRows canceled")) case <-done: return } @@ -237,10 +237,10 @@ func (r *Reader) GetRows(ctx context.Context, limit int) ([]tree.Datums, error) r.mu.pushedWakeup.Wait() } if ctx.Err() != nil { - return nil, errors.Wrapf(context.Cause(ctx), "GetRows canceled") + return nil, errors.Wrapf(ctx.Err(), "GetRows canceled") } if r.goroCtx.Err() != nil { - return nil, errors.Wrapf(context.Cause(r.goroCtx), "reader shutting down") + return nil, errors.Wrapf(r.goroCtx.Err(), "reader shutting down") } } diff --git a/pkg/sql/sem/builtins/generator_builtins.go b/pkg/sql/sem/builtins/generator_builtins.go index aa7cc3ec6f15..42da4d7bea4c 100644 --- a/pkg/sql/sem/builtins/generator_builtins.go +++ b/pkg/sql/sem/builtins/generator_builtins.go @@ -4395,9 +4395,7 @@ func (g *queueFeedGenerator) ResolvedType() *types.T { // Start implements the eval.ValueGenerator interface. func (g *queueFeedGenerator) Start(ctx context.Context, txn *kv.Txn) error { - // Ignoring queue_name for now; we only support one queue. Same for limit. - // TODO(queuefeed): support multiple queues and limit. 
- qr, err := getQueueManager(g.evalCtx).GetOrInitReader(ctx, g.queueName) + qr, err := getQueueManager(g.evalCtx).GetOrInitReader(g.evalCtx.SessionCtx, g.queueName) if err != nil { return err } From 4309a88ef012e23acb3b2bd09581dc17299db486 Mon Sep 17 00:00:00 2001 From: Miles Frankel Date: Tue, 11 Nov 2025 05:09:46 +0000 Subject: [PATCH 19/46] add test --- pkg/sql/queuefeed/manager_test.go | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/pkg/sql/queuefeed/manager_test.go b/pkg/sql/queuefeed/manager_test.go index 1b492e66e594..1419010eccc7 100644 --- a/pkg/sql/queuefeed/manager_test.go +++ b/pkg/sql/queuefeed/manager_test.go @@ -3,12 +3,14 @@ package queuefeed import ( "context" "testing" + "time" "github.com/cockroachdb/cockroach/pkg/base" "github.com/cockroachdb/cockroach/pkg/kv/kvclient/rangefeed" "github.com/cockroachdb/cockroach/pkg/sql/catalog/lease" "github.com/cockroachdb/cockroach/pkg/sql/isql" "github.com/cockroachdb/cockroach/pkg/testutils/serverutils" + "github.com/cockroachdb/cockroach/pkg/testutils/sqlutils" "github.com/cockroachdb/cockroach/pkg/util/leaktest" "github.com/cockroachdb/cockroach/pkg/util/log" "github.com/cockroachdb/errors" @@ -38,3 +40,26 @@ func TestFeedCreation(t *testing.T) { require.NotNil(t, reader) reader.(*Reader).cancel(errors.New("test shutdown")) } + +func TestQueuefeedCtxCancel(t *testing.T) { + defer leaktest.AfterTest(t)() + defer log.Scope(t).Close(t) + + ctx := context.Background() + srv, _, _ := serverutils.StartServer(t, base.TestServerArgs{}) + defer srv.Stopper().Stop(ctx) + + db := sqlutils.MakeSQLRunner(srv.ApplicationLayer().SQLConn(t)) + + db.Exec(t, `CREATE TABLE t (a string)`) + // get table id + var tableID int64 + db.QueryRow(t, "SELECT id FROM system.namespace where name = 't'").Scan(&tableID) + db.Exec(t, `SELECT crdb_internal.create_queue_feed('hi', $1)`, tableID) + + ctx, cancel := context.WithTimeout(ctx, 1*time.Second) + defer cancel() + _, err := db.DB.QueryContext(ctx, `SELECT crdb_internal.select_from_queue_feed('hi', 1)`) + require.Error(t, err) + +} From 798fe0a65b90e24dde3f4c7b1d96782053fa9cb9 Mon Sep 17 00:00:00 2001 From: Miles Frankel Date: Tue, 11 Nov 2025 05:16:53 +0000 Subject: [PATCH 20/46] clean --- pkg/sql/queuefeed/reader.go | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/pkg/sql/queuefeed/reader.go b/pkg/sql/queuefeed/reader.go index 5fd68c21dd59..9556cfd08ff3 100644 --- a/pkg/sql/queuefeed/reader.go +++ b/pkg/sql/queuefeed/reader.go @@ -150,20 +150,10 @@ func (r *Reader) setupRangefeed(ctx context.Context) { fmt.Sprintf("queuefeed.reader.name=%s", r.name), initialTS, onValue, opts..., ) - // get desc for table - desc, err := r.leaseMgr.AcquireByName(ctx, lease.TimestampToReadTimestamp(initialTS), 100, 101, "t") - if err != nil { - setErr(err) - return - } - defer desc.Release(ctx) - // TODO: why are we given a zero codec? r.codec = keys.MakeSQLCodec(roachpb.SystemTenantID) - tk := roachpb.Span{ - Key: r.codec.TablePrefix(uint32(r.tableID)), - } + tk := roachpb.Span{Key: r.codec.TablePrefix(uint32(r.tableID))} tk.EndKey = tk.Key.PrefixEnd() spans := []roachpb.Span{tk} @@ -185,8 +175,6 @@ func (r *Reader) setupRangefeed(ctx context.Context) { // - [ ] checkpoint frontier if our frontier has advanced and we confirmed receipt // - [ ] gonna need some way to clean stuff up on conn_executor.close() -// TODO: run still shuts down with context canceled after getting rows. why? 
- func (r *Reader) run(ctx context.Context) { defer func() { fmt.Println("run done") From 35a189a506d7989ab7b4fd867406287184a45d8b Mon Sep 17 00:00:00 2001 From: Miles Frankel Date: Tue, 11 Nov 2025 18:35:03 +0000 Subject: [PATCH 21/46] cleanup --- pkg/sql/exec_util.go | 8 -------- pkg/sql/queuefeed/reader.go | 13 +++---------- 2 files changed, 3 insertions(+), 18 deletions(-) diff --git a/pkg/sql/exec_util.go b/pkg/sql/exec_util.go index 2837a4450866..dc587401fc38 100644 --- a/pkg/sql/exec_util.go +++ b/pkg/sql/exec_util.go @@ -1861,14 +1861,6 @@ func (cfg *ExecutorConfig) GetQueueManager() queuebase.Manager { return cfg.QueueManager } -func (cfg *ExecutorConfig) GetStopper() *stop.Stopper { - return cfg.Stopper -} - -func (cfg *ExecutorConfig) GetAmbientCtx() log.AmbientContext { - return cfg.AmbientCtx -} - // UpdateVersionSystemSettingHook provides a callback that allows us // update the cluster version inside the system.settings table. This hook // is aimed at mainly updating tenant pods, which will currently skip over diff --git a/pkg/sql/queuefeed/reader.go b/pkg/sql/queuefeed/reader.go index 9556cfd08ff3..8fec63b4d9b6 100644 --- a/pkg/sql/queuefeed/reader.go +++ b/pkg/sql/queuefeed/reader.go @@ -211,16 +211,9 @@ func (r *Reader) GetRows(ctx context.Context, limit int) ([]tree.Datums, error) if len(r.mu.buf) == 0 { fmt.Printf("GetRows called with empty buf. waiting for pushedWakeup\n") // shut down the reader if this ctx (which is distinct from the goro ctx) is canceled - done := make(chan struct{}) - defer close(done) - go func() { - select { - case <-ctx.Done(): - r.cancel(errors.Wrapf(ctx.Err(), "GetRows canceled")) - case <-done: - return - } - }() + defer context.AfterFunc(ctx, func() { + r.cancel(errors.Wrapf(ctx.Err(), "GetRows canceled")) + })() for ctx.Err() == nil && r.goroCtx.Err() == nil && len(r.mu.buf) == 0 { r.mu.pushedWakeup.Wait() } From a3870cd7c940bf4eca558e1fda6ad27b92aecccf Mon Sep 17 00:00:00 2001 From: Miles Frankel Date: Tue, 11 Nov 2025 05:29:55 +0000 Subject: [PATCH 22/46] store readers --- pkg/sql/queuefeed/manager.go | 38 ++++++++++++++++++++++++++++++++++-- pkg/sql/queuefeed/reader.go | 28 +++++++++++++++++++++++--- 2 files changed, 61 insertions(+), 5 deletions(-) diff --git a/pkg/sql/queuefeed/manager.go b/pkg/sql/queuefeed/manager.go index 232bac21acd2..ba1643487899 100644 --- a/pkg/sql/queuefeed/manager.go +++ b/pkg/sql/queuefeed/manager.go @@ -12,6 +12,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/sql/queuefeed/queuebase" "github.com/cockroachdb/cockroach/pkg/sql/sem/tree" "github.com/cockroachdb/cockroach/pkg/sql/sessiondata" + "github.com/cockroachdb/cockroach/pkg/util/syncutil" "github.com/cockroachdb/errors" ) @@ -22,12 +23,21 @@ type Manager struct { rff *rangefeed.Factory codec keys.SQLCodec leaseMgr *lease.Manager + + mu struct { + syncutil.Mutex + // name -> reader + // TODO: this should actually be a map of (session id, name) -> reader, or smth + readers map[string]*Reader + } } func NewManager(_ context.Context, executor isql.DB, rff *rangefeed.Factory, codec keys.SQLCodec, leaseMgr *lease.Manager) *Manager { // setup rangefeed on partitions table (/poll) // handle handoff from one server to another - return &Manager{executor: executor, rff: rff, codec: codec, leaseMgr: leaseMgr} + m := &Manager{executor: executor, rff: rff, codec: codec, leaseMgr: leaseMgr} + m.mu.readers = make(map[string]*Reader) + return m } const createQueuePartitionTableSQL = ` @@ -89,7 +99,23 @@ func (m *Manager) CreateQueue(ctx context.Context, queueName 
string, tableDescID } func (m *Manager) GetOrInitReader(ctx context.Context, name string) (queuebase.Reader, error) { - // TODO: get if exists already + m.mu.Lock() + defer m.mu.Unlock() + reader, ok := m.mu.readers[name] + if ok && reader.IsAlive() { + fmt.Printf("get or init reader for queue %s found in cache\n", name) + return reader, nil + } + fmt.Printf("get or init reader for queue %s not found in cache\n", name) + reader, err := m.getOrInitReaderUncached(ctx, name) + if err != nil { + return nil, err + } + m.mu.readers[name] = reader + return reader, nil +} + +func (m *Manager) getOrInitReaderUncached(ctx context.Context, name string) (*Reader, error) { var tableDescID int64 // TODO: this ctx on the other hand should be stmt scoped err := m.executor.Txn(ctx, func(ctx context.Context, txn isql.Txn) error { @@ -120,6 +146,14 @@ func (m *Manager) reassessAssignments(ctx context.Context, name string) (bool, e return false, nil } +func (m *Manager) forgetReader(name string) { + func() { + m.mu.Lock() + defer m.mu.Unlock() + delete(m.mu.readers, name) + }() +} + var _ queuebase.Manager = &Manager{} type PartitionAssignment struct{} diff --git a/pkg/sql/queuefeed/reader.go b/pkg/sql/queuefeed/reader.go index 8fec63b4d9b6..91d288a8878b 100644 --- a/pkg/sql/queuefeed/reader.go +++ b/pkg/sql/queuefeed/reader.go @@ -5,6 +5,7 @@ import ( "fmt" "slices" "sync" + "sync/atomic" "github.com/cockroachdb/cockroach/pkg/ccl/changefeedccl/changefeedbase" "github.com/cockroachdb/cockroach/pkg/keys" @@ -35,6 +36,7 @@ const ( readerStateBatching readerState = iota readerStateHasUncommittedBatch readerStateCheckingForReassignment + readerStateDead ) // has rangefeed on data. reads from it. handles handoff @@ -61,8 +63,9 @@ type Reader struct { triggerCheckForReassignment chan struct{} - cancel context.CancelCauseFunc - goroCtx context.Context + cancel context.CancelCauseFunc + goroCtx context.Context + isShutdown atomic.Bool } func NewReader(ctx context.Context, executor isql.DB, mgr *Manager, rff *rangefeed.Factory, codec keys.SQLCodec, leaseMgr *lease.Manager, name string, tableDescID int64) *Reader { @@ -178,6 +181,8 @@ func (r *Reader) setupRangefeed(ctx context.Context) { func (r *Reader) run(ctx context.Context) { defer func() { fmt.Println("run done") + r.isShutdown.Store(true) + r.mgr.forgetReader(r.name) }() for { @@ -198,6 +203,10 @@ func (r *Reader) run(ctx context.Context) { func (r *Reader) GetRows(ctx context.Context, limit int) ([]tree.Datums, error) { fmt.Printf("GetRows start\n") + if r.isShutdown.Load() { + return nil, errors.New("reader is shutting down") + } + r.mu.Lock() defer r.mu.Unlock() @@ -247,13 +256,17 @@ func (r *Reader) GetRows(ctx context.Context, limit int) ([]tree.Datums, error) } func (r *Reader) ConfirmReceipt(ctx context.Context) { + if r.isShutdown.Load() { + return + } + func() { r.mu.Lock() defer r.mu.Unlock() fmt.Printf("confirming receipt with inflightBuffer len: %d\n", len(r.mu.inflightBuffer)) - clear(r.mu.inflightBuffer) + r.mu.inflightBuffer = r.mu.inflightBuffer[:0] r.mu.state = readerStateCheckingForReassignment }() @@ -266,6 +279,15 @@ func (r *Reader) ConfirmReceipt(ctx context.Context) { } } +func (r *Reader) IsAlive() bool { + return !r.isShutdown.Load() +} + +func (r *Reader) Close() error { + r.cancel(errors.New("reader closing")) + return nil +} + func (r *Reader) checkForReassignment(ctx context.Context) error { defer func() { fmt.Println("checkForReassignment done") From 187991cbc4531bd12418d4b8d5837b590c4546ce Mon Sep 17 00:00:00 2001 From: Jeff 
Swenson Date: Mon, 10 Nov 2025 11:26:04 -0500 Subject: [PATCH 23/46] queuefeed: implement simple assignment algorithm This implements a simple assignment algorithm where there is one partition that is assigned on a first come first serve basis. --- pkg/server/server_sql.go | 2 +- pkg/sql/queuefeed/BUILD.bazel | 6 ++ pkg/sql/queuefeed/assignments.go | 121 ++++++++++++++++++++++++++ pkg/sql/queuefeed/assignments_test.go | 60 +++++++++++++ pkg/sql/queuefeed/manager.go | 64 ++++++++++---- pkg/sql/queuefeed/manager_test.go | 26 ++++-- pkg/sql/queuefeed/partitions.go | 9 +- pkg/sql/queuefeed/partitions_test.go | 6 +- pkg/sql/queuefeed/reader.go | 17 +++- pkg/sql/sem/builtins/builtins.go | 4 +- 10 files changed, 276 insertions(+), 39 deletions(-) create mode 100644 pkg/sql/queuefeed/assignments.go create mode 100644 pkg/sql/queuefeed/assignments_test.go diff --git a/pkg/server/server_sql.go b/pkg/server/server_sql.go index 013c04890978..a40470b4d9fa 100644 --- a/pkg/server/server_sql.go +++ b/pkg/server/server_sql.go @@ -1065,7 +1065,7 @@ func newSQLServer(ctx context.Context, cfg sqlServerArgs) (*SQLServer, error) { TenantReadOnly: cfg.SQLConfig.TenantReadOnly, CidrLookup: cfg.BaseConfig.CidrLookup, LicenseEnforcer: cfg.SQLConfig.LicenseEnforcer, - QueueManager: queuefeed.NewManager(ctx, cfg.internalDB, cfg.rangeFeedFactory, execCfg.Codec, leaseMgr), + QueueManager: queuefeed.NewManager(ctx, cfg.internalDB, cfg.rangeFeedFactory, codec, leaseMgr), } if codec.ForSystemTenant() { diff --git a/pkg/sql/queuefeed/BUILD.bazel b/pkg/sql/queuefeed/BUILD.bazel index b597dba856f6..adc872f25b73 100644 --- a/pkg/sql/queuefeed/BUILD.bazel +++ b/pkg/sql/queuefeed/BUILD.bazel @@ -3,6 +3,7 @@ load("@io_bazel_rules_go//go:def.bzl", "go_library", "go_test") go_library( name = "queuefeed", srcs = [ + "assignments.go", "manager.go", "partitions.go", "reader.go", @@ -38,6 +39,7 @@ go_library( go_test( name = "queuefeed_test", srcs = [ + "assignments_test.go", "main_test.go", "manager_test.go", "partitions_test.go", @@ -45,10 +47,13 @@ go_test( embed = [":queuefeed"], deps = [ "//pkg/base", + "//pkg/kv/kvclient/rangefeed", "//pkg/roachpb", "//pkg/security/securityassets", "//pkg/security/securitytest", "//pkg/server", + "//pkg/sql", + "//pkg/sql/catalog/lease", "//pkg/sql/isql", "//pkg/sql/sqlliveness", "//pkg/testutils/serverutils", @@ -58,6 +63,7 @@ go_test( "//pkg/util/log", "//pkg/util/randutil", "//pkg/util/uuid", + "@com_github_cockroachdb_errors//:errors", "@com_github_stretchr_testify//require", ], ) diff --git a/pkg/sql/queuefeed/assignments.go b/pkg/sql/queuefeed/assignments.go new file mode 100644 index 000000000000..7e7524e2a009 --- /dev/null +++ b/pkg/sql/queuefeed/assignments.go @@ -0,0 +1,121 @@ +package queuefeed + +import ( + "context" + + "github.com/cockroachdb/cockroach/pkg/sql/isql" + "github.com/cockroachdb/errors" +) + +type Assignment struct { + // Version is unique per process level and can be used to efficiently detect + // assignment changes. + Version int64 + Session Session + // Partitions is the list of partitions assigned to the session. It is sorted + // by ID. + Partitions []Partition +} + +type PartitionAssignments struct { + db isql.DB + partitionTable *partitionTable +} + +func NewPartitionAssignments(db isql.DB, queueName string) *PartitionAssignments { + return &PartitionAssignments{ + db: db, + partitionTable: &partitionTable{queueName: queueName}, + } +} + +// RefreshAssignment refreshes the assignment for the given session. 
It returns
+// nil if the assignment has not changed.
+//
+// If the session is caught up (i.e. it has processed up to a recent timestamp
+// for all assigned partitions), then it may be assigned new partitions.
+//
+// If a partition has a successor session, then calling RefreshAssignment will
+// return an assignment that does not include that partition.
+func (p *PartitionAssignments) RefreshAssignment(
+	session Session, caughtUp bool,
+) (updatedAssignment *Assignment, err error) {
+	// This is a stub implementation that assumes there is a single partition.
+	return nil, nil
+}
+
+// RegisterSession registers a new session. The session may be assigned zero
+// partitions if there are no unassigned partitions. If it is assigned no
+// partitions, the caller can periodically call RefreshAssignment to claim
+// partitions if they become available.
+func (p *PartitionAssignments) RegisterSession(
+	ctx context.Context, session Session,
+) (*Assignment, error) {
+	// TODO(jeffswenson): this is a stub implementation that simply assigns all
+	// unclaimed partitions to the current session.
+
+	var result *Assignment
+	err := p.db.Txn(ctx, func(ctx context.Context, txn isql.Txn) error {
+		result = &Assignment{Session: session}
+
+		partitions, err := p.partitionTable.ListPartitions(ctx, txn)
+		if err != nil {
+			return err
+		}
+		for _, partition := range partitions {
+			if !partition.Session.Empty() {
+				continue
+			}
+			partition.Session = session
+			if err := p.partitionTable.UpdatePartition(ctx, txn, partition); err != nil {
+				return errors.Wrapf(err, "updating partition %d for session %s", partition.ID, session.ConnectionID)
+			}
+			result.Partitions = append(result.Partitions, partition)
+		}
+
+		return nil
+	})
+	if err != nil {
+		return nil, errors.Wrap(err, "registering session")
+	}
+	return result, nil
+}
+
+func (p *PartitionAssignments) UnregisterSession(ctx context.Context, session Session) error {
+	return p.db.Txn(ctx, func(ctx context.Context, txn isql.Txn) error {
+		partitions, err := p.partitionTable.ListPartitions(ctx, txn)
+		if err != nil {
+			return err
+		}
+		for _, partition := range partitions {
+			if partition.Session == session {
+				partition.Session = partition.Successor
+				if err := p.partitionTable.UpdatePartition(ctx, txn, partition); err != nil {
+					return errors.Wrapf(err, "updating partition %d for session %s", partition.ID, session.ConnectionID)
+				}
+			}
+			if partition.Successor == session {
+				partition.Successor = Session{}
+				if err := p.partitionTable.UpdatePartition(ctx, txn, partition); err != nil {
+					return errors.Wrapf(err, "updating partition %d for session %s", partition.ID, session.ConnectionID)
+				}
+			}
+		}
+		return nil
+	})
+}
+
+func (p *PartitionAssignments) constructAssignment(session Session) (*Assignment, error) {
+	// Build an assignment for the given session from the partition cache.
+	return nil, errors.New("not implemented")
+}
+
+func (p *PartitionAssignments) tryClaim(session Session, partition *Partition) (Partition, error) {
+	// Try to claim an unassigned partition for the given session.
+	return Partition{}, nil
+}
+
+func (p *PartitionAssignments) tryRelease(session Session, toRelease []Partition) error {
+	// Release the given partitions from the session.
+ return nil +} diff --git a/pkg/sql/queuefeed/assignments_test.go b/pkg/sql/queuefeed/assignments_test.go new file mode 100644 index 000000000000..b3a60d581328 --- /dev/null +++ b/pkg/sql/queuefeed/assignments_test.go @@ -0,0 +1,60 @@ +package queuefeed_test + +import ( + "context" + "testing" + + "github.com/cockroachdb/cockroach/pkg/base" + "github.com/cockroachdb/cockroach/pkg/sql" + "github.com/cockroachdb/cockroach/pkg/sql/queuefeed" + "github.com/cockroachdb/cockroach/pkg/sql/sqlliveness" + "github.com/cockroachdb/cockroach/pkg/testutils/serverutils" + "github.com/cockroachdb/cockroach/pkg/testutils/sqlutils" + "github.com/cockroachdb/cockroach/pkg/util/leaktest" + "github.com/cockroachdb/cockroach/pkg/util/log" + "github.com/cockroachdb/cockroach/pkg/util/uuid" + "github.com/stretchr/testify/require" +) + +func TestPartitionAssignments(t *testing.T) { + defer leaktest.AfterTest(t)() + defer log.Scope(t).Close(t) + + ctx := context.Background() + s, sqlDB, _ := serverutils.StartServer(t, base.TestServerArgs{}) + defer s.Stopper().Stop(ctx) + + tdb := sqlutils.MakeSQLRunner(sqlDB) + tdb.Exec(t, "CREATE TABLE test_table (id INT PRIMARY KEY, data TEXT)") + + var tableDescID int64 + tdb.QueryRow(t, "SELECT id FROM system.namespace WHERE name = 'test_table'").Scan(&tableDescID) + + // Create queue using QueueManager + manager := queuefeed.NewTestManager(t, s.ApplicationLayer()) + queueName := "test_queue" + err := manager.CreateQueue(ctx, queueName, tableDescID) + require.NoError(t, err) + + pa := queuefeed.NewPartitionAssignments(s.ExecutorConfig().(sql.ExecutorConfig).InternalDB, queueName) + + session := queuefeed.Session{ + ConnectionID: uuid.MakeV4(), + LivenessID: sqlliveness.SessionID("1"), + } + + assignment, err := pa.RegisterSession(ctx, session) + require.NoError(t, err) + require.Len(t, assignment.Partitions, 1) + require.Equal(t, session, assignment.Partitions[0].Session, "partition: %+v", assignment.Partitions[0]) + + tdb.CheckQueryResults(t, + "SELECT sql_liveness_session, user_session FROM defaultdb.queue_partition_"+queueName, + [][]string{{"1", session.ConnectionID.String()}}) + + require.NoError(t, pa.UnregisterSession(ctx, session)) + + tdb.CheckQueryResults(t, + "SELECT sql_liveness_session, user_session FROM defaultdb.queue_partition_"+queueName, + [][]string{{"NULL", "NULL"}}) +} diff --git a/pkg/sql/queuefeed/manager.go b/pkg/sql/queuefeed/manager.go index 232bac21acd2..8d5ba22e5b51 100644 --- a/pkg/sql/queuefeed/manager.go +++ b/pkg/sql/queuefeed/manager.go @@ -7,6 +7,9 @@ import ( "github.com/cockroachdb/cockroach/pkg/keys" "github.com/cockroachdb/cockroach/pkg/kv/kvclient/rangefeed" + "github.com/cockroachdb/cockroach/pkg/roachpb" + "github.com/cockroachdb/cockroach/pkg/sql/catalog" + "github.com/cockroachdb/cockroach/pkg/sql/catalog/descpb" "github.com/cockroachdb/cockroach/pkg/sql/catalog/lease" "github.com/cockroachdb/cockroach/pkg/sql/isql" "github.com/cockroachdb/cockroach/pkg/sql/queuefeed/queuebase" @@ -24,25 +27,18 @@ type Manager struct { leaseMgr *lease.Manager } -func NewManager(_ context.Context, executor isql.DB, rff *rangefeed.Factory, codec keys.SQLCodec, leaseMgr *lease.Manager) *Manager { +func NewManager( + _ context.Context, + executor isql.DB, + rff *rangefeed.Factory, + codec keys.SQLCodec, + leaseMgr *lease.Manager, +) *Manager { // setup rangefeed on partitions table (/poll) // handle handoff from one server to another return &Manager{executor: executor, rff: rff, codec: codec, leaseMgr: leaseMgr} } -const createQueuePartitionTableSQL = ` 
-CREATE TABLE IF NOT EXISTS defaultdb.queue_partition_%s ( - partition_id INT8 PRIMARY KEY, - -- is the sql server assigned dead - sql_liveness_session UUID NOT NULL, - -- pgwire session - user_session UUID NOT NULL, - sql_liveness_session_successor UUID, - user_session_successor UUID, - partition_spec bytea, - updated_at TIMESTAMPTZ -)` - const createQueueCursorTableSQL = ` CREATE TABLE IF NOT EXISTS defaultdb.queue_cursor_%s ( partition_id INT8 PRIMARY KEY, @@ -66,25 +62,56 @@ SELECT table_desc_id FROM defaultdb.queue_feeds WHERE queue_feed_name = $1 // should take a txn func (m *Manager) CreateQueue(ctx context.Context, queueName string, tableDescID int64) error { - return m.executor.Txn(ctx, func(ctx context.Context, txn isql.Txn) error { + err := m.executor.Txn(ctx, func(ctx context.Context, txn isql.Txn) error { _, err := txn.Exec(ctx, "create_q", txn.KV(), createQueueTableSQL) if err != nil { return err } - _, err = txn.Exec(ctx, "create_qp", txn.KV(), fmt.Sprintf(createQueuePartitionTableSQL, queueName)) + + pt := &partitionTable{queueName: queueName} + err = pt.CreateSchema(ctx, txn) if err != nil { return err } + _, err = txn.Exec(ctx, "create_qc", txn.KV(), fmt.Sprintf(createQueueCursorTableSQL, queueName)) if err != nil { return err } - // TODO(queuefeed): add validation on the table descriptor id + return nil + }) + if err != nil { + return errors.Wrapf(err, "creating queue tables for %s", queueName) + } + + return m.executor.Txn(ctx, func(ctx context.Context, txn isql.Txn) error { + // TODO(queuefeed): figure out how we want to integrate with schema changes. + descriptor, err := m.leaseMgr.Acquire(ctx, lease.TimestampToReadTimestamp(txn.KV().ReadTimestamp()), descpb.ID(tableDescID)) + if err != nil { + return err + } + tableDesc := descriptor.Underlying().(catalog.TableDescriptor) + defer descriptor.Release(ctx) + _, err = txn.Exec(ctx, "insert_q", txn.KV(), insertQueueFeedSQL, queueName, tableDescID) if err != nil { return err } - return nil + + pt := &partitionTable{queueName: queueName} + + // Create a single initial partition that covers the table's primary key. 
+ primaryIndexPrefix := m.codec.IndexPrefix(uint32(tableDesc.GetID()), uint32(tableDesc.GetPrimaryIndexID())) + primaryKeySpan := roachpb.Span{ + Key: primaryIndexPrefix, + EndKey: primaryIndexPrefix.PrefixEnd(), + } + partition := Partition{ + ID: 1, + Span: &primaryKeySpan, + } + + return pt.InsertPartition(ctx, txn, partition) }) } @@ -97,6 +124,7 @@ func (m *Manager) GetOrInitReader(ctx context.Context, name string) (queuebase.R if err != nil { return err } + vals, err := txn.QueryRowEx(ctx, "fetch_q", txn.KV(), sessiondata.NodeUserSessionDataOverride, fetchQueueFeedSQL, name) if err != nil { diff --git a/pkg/sql/queuefeed/manager_test.go b/pkg/sql/queuefeed/manager_test.go index 1419010eccc7..d4640d20504d 100644 --- a/pkg/sql/queuefeed/manager_test.go +++ b/pkg/sql/queuefeed/manager_test.go @@ -17,22 +17,34 @@ import ( "github.com/stretchr/testify/require" ) +func NewTestManager(t *testing.T, a serverutils.ApplicationLayerInterface) *Manager { + ctx := context.Background() + db := a.InternalDB().(isql.DB) + m := NewManager(ctx, db, a.RangeFeedFactory().(*rangefeed.Factory), a.Codec(), a.LeaseManager().(*lease.Manager)) + require.NotNil(t, m.codec) + return m +} + func TestFeedCreation(t *testing.T) { defer leaktest.AfterTest(t)() defer log.Scope(t).Close(t) ctx := context.Background() - srv, _, _ := serverutils.StartServer(t, base.TestServerArgs{}) + srv, conn, _ := serverutils.StartServer(t, base.TestServerArgs{}) defer srv.Stopper().Stop(ctx) - db := srv.ApplicationLayer().InternalDB().(isql.DB) // expect an error when trying to read from a queue that doesn't exist - qm := NewManager(ctx, db, srv.RangeFeedFactory().(*rangefeed.Factory), srv.Codec(), srv.ApplicationLayer().LeaseManager().(*lease.Manager)) + qm := NewTestManager(t, srv.ApplicationLayer()) _, err := qm.GetOrInitReader(context.Background(), "test") require.ErrorContains(t, err, "queue feed not found") // expect no error when creating a queue - require.NoError(t, qm.CreateQueue(context.Background(), "test", 104)) + db := sqlutils.MakeSQLRunner(conn) + db.Exec(t, `CREATE TABLE t (a string)`) + // get table id + var tableID int64 + db.QueryRow(t, "SELECT id FROM system.namespace where name = 't'").Scan(&tableID) + require.NoError(t, qm.CreateQueue(context.Background(), "test", tableID)) // now we can read from the queue reader, err := qm.GetOrInitReader(context.Background(), "test") @@ -46,11 +58,10 @@ func TestQueuefeedCtxCancel(t *testing.T) { defer log.Scope(t).Close(t) ctx := context.Background() - srv, _, _ := serverutils.StartServer(t, base.TestServerArgs{}) + srv, conn, _ := serverutils.StartServer(t, base.TestServerArgs{}) defer srv.Stopper().Stop(ctx) - db := sqlutils.MakeSQLRunner(srv.ApplicationLayer().SQLConn(t)) - + db := sqlutils.MakeSQLRunner(conn) db.Exec(t, `CREATE TABLE t (a string)`) // get table id var tableID int64 @@ -61,5 +72,4 @@ func TestQueuefeedCtxCancel(t *testing.T) { defer cancel() _, err := db.DB.QueryContext(ctx, `SELECT crdb_internal.select_from_queue_feed('hi', 1)`) require.Error(t, err) - } diff --git a/pkg/sql/queuefeed/partitions.go b/pkg/sql/queuefeed/partitions.go index a9bc0f0b8e99..d4a2e8af5bac 100644 --- a/pkg/sql/queuefeed/partitions.go +++ b/pkg/sql/queuefeed/partitions.go @@ -25,7 +25,6 @@ type Partition struct { } type partitionTable struct { - db isql.DB queueName string } @@ -33,9 +32,9 @@ func (p *partitionTable) CreateSchema(ctx context.Context, txn isql.Txn) error { _, err := txn.Exec(ctx, "create-partition-table", txn.KV(), fmt.Sprintf(`CREATE TABLE IF NOT EXISTS 
defaultdb.queue_partition_%s ( partition_id BIGSERIAL PRIMARY KEY, - sql_liveness_session UUID, + sql_liveness_session BYTES, user_session UUID, - sql_liveness_session_successor UUID, + sql_liveness_session_successor BYTES, user_session_successor UUID, partition_spec BYTES )`, p.queueName)) @@ -61,13 +60,13 @@ func (p *partitionTable) ListPartitions(ctx context.Context, txn isql.Txn) ([]Pa var session, successor Session if !(row[1] == tree.DNull || row[2] == tree.DNull) { session = Session{ - LivenessID: sqlliveness.SessionID(tree.MustBeDUuid(row[1]).UUID.GetBytes()), + LivenessID: sqlliveness.SessionID(tree.MustBeDBytes(row[1])), ConnectionID: tree.MustBeDUuid(row[2]).UUID, } } if !(row[3] == tree.DNull || row[4] == tree.DNull) { successor = Session{ - LivenessID: sqlliveness.SessionID(tree.MustBeDUuid(row[3]).UUID.GetBytes()), + LivenessID: sqlliveness.SessionID(tree.MustBeDBytes(row[3])), ConnectionID: tree.MustBeDUuid(row[4]).UUID, } } diff --git a/pkg/sql/queuefeed/partitions_test.go b/pkg/sql/queuefeed/partitions_test.go index 391d250b8738..5f13296b146b 100644 --- a/pkg/sql/queuefeed/partitions_test.go +++ b/pkg/sql/queuefeed/partitions_test.go @@ -28,7 +28,7 @@ func TestListPartitions(t *testing.T) { sqlRunner := sqlutils.MakeSQLRunner(sqlDB) queueName := "test" - pt := &partitionTable{db: db, queueName: queueName} + pt := &partitionTable{queueName: queueName} // Create table err := db.Txn(ctx, func(ctx context.Context, txn isql.Txn) error { @@ -80,7 +80,7 @@ func TestUpdatePartition(t *testing.T) { sqlRunner := sqlutils.MakeSQLRunner(sqlDB) queueName := "test" - pt := &partitionTable{db: db, queueName: queueName} + pt := &partitionTable{queueName: queueName} // Create table err := db.Txn(ctx, func(ctx context.Context, txn isql.Txn) error { @@ -134,7 +134,7 @@ func TestInsertPartition(t *testing.T) { db := srv.ApplicationLayer().InternalDB().(isql.DB) queueName := "test" - pt := &partitionTable{db: db, queueName: queueName} + pt := &partitionTable{queueName: queueName} // Create table err := db.Txn(ctx, func(ctx context.Context, txn isql.Txn) error { diff --git a/pkg/sql/queuefeed/reader.go b/pkg/sql/queuefeed/reader.go index 8fec63b4d9b6..8b4af5ffa6eb 100644 --- a/pkg/sql/queuefeed/reader.go +++ b/pkg/sql/queuefeed/reader.go @@ -65,7 +65,16 @@ type Reader struct { goroCtx context.Context } -func NewReader(ctx context.Context, executor isql.DB, mgr *Manager, rff *rangefeed.Factory, codec keys.SQLCodec, leaseMgr *lease.Manager, name string, tableDescID int64) *Reader { +func NewReader( + ctx context.Context, + executor isql.DB, + mgr *Manager, + rff *rangefeed.Factory, + codec keys.SQLCodec, + leaseMgr *lease.Manager, + name string, + tableDescID int64, +) *Reader { r := &Reader{ executor: executor, mgr: mgr, @@ -290,7 +299,9 @@ func (r *Reader) checkForReassignment(ctx context.Context) error { } // TODO: this is all highly sus -func (r *Reader) decodeRangefeedValue(ctx context.Context, rfv *kvpb.RangeFeedValue) (tree.Datums, error) { +func (r *Reader) decodeRangefeedValue( + ctx context.Context, rfv *kvpb.RangeFeedValue, +) (tree.Datums, error) { key, value := rfv.Key, rfv.Value key, err := r.codec.StripTenantPrefix(key) if err != nil { @@ -341,7 +352,7 @@ func (r *Reader) decodeRangefeedValue(ctx context.Context, rfv *kvpb.RangeFeedVa for i, colID := range cols { col, err := catalog.MustFindColumnByID(tableDesc, colID) if err != nil { - return nil, errors.Wrapf(err, "finding column by id: %s", colID) + return nil, errors.Wrapf(err, "finding column by id: %d", colID) } ed := 
encDatums[i] if err := ed.EnsureDecoded(col.ColumnDesc().Type, &tree.DatumAlloc{}); err != nil { diff --git a/pkg/sql/sem/builtins/builtins.go b/pkg/sql/sem/builtins/builtins.go index c0d70bef6184..3aa6e7f66b89 100644 --- a/pkg/sql/sem/builtins/builtins.go +++ b/pkg/sql/sem/builtins/builtins.go @@ -4697,7 +4697,9 @@ value if you rely on the HLC for accuracy.`, } obj.Add(fmt.Sprintf("f%d", i+1), j) } - ret.Append(tree.NewDJSON(obj.Build())) + if err := ret.Append(tree.NewDJSON(obj.Build())); err != nil { + return nil, err + } } return ret, nil }, From 6fd7c9780e0785efb0e40b90d55d29ee81e56377 Mon Sep 17 00:00:00 2001 From: Miles Frankel Date: Wed, 12 Nov 2025 17:06:09 +0000 Subject: [PATCH 24/46] rollback support --- pkg/kv/txn.go | 30 +++++++++++++++++++--- pkg/sql/queuefeed/queuebase/queuebase.go | 1 + pkg/sql/queuefeed/reader.go | 19 ++++++++++++++ pkg/sql/sem/builtins/generator_builtins.go | 9 ++++--- 4 files changed, 53 insertions(+), 6 deletions(-) diff --git a/pkg/kv/txn.go b/pkg/kv/txn.go index c1f6bd824293..c3978bfc8797 100644 --- a/pkg/kv/txn.go +++ b/pkg/kv/txn.go @@ -90,6 +90,9 @@ type Txn struct { // commitTriggers are run upon successful commit. commitTriggers []func(ctx context.Context) + // rollbackTriggers are run upon rollback/abort. + rollbackTriggers []func(ctx context.Context) + // mu holds fields that need to be synchronized for concurrent request execution. mu struct { syncutil.Mutex @@ -1093,6 +1096,16 @@ func (txn *Txn) AddCommitTrigger(trigger func(ctx context.Context)) { txn.commitTriggers = append(txn.commitTriggers, trigger) } +// AddRollbackTrigger adds a closure to be executed on rollback/abort +// of the transaction. +func (txn *Txn) AddRollbackTrigger(trigger func(ctx context.Context)) { + if txn.typ != RootTxn { + panic(errors.AssertionFailedf("AddRollbackTrigger() called on leaf txn")) + } + + txn.rollbackTriggers = append(txn.rollbackTriggers, trigger) +} + // endTxnReqAlloc is used to batch the heap allocations of an EndTxn request. type endTxnReqAlloc struct { req kvpb.EndTxnRequest @@ -1243,6 +1256,9 @@ func (txn *Txn) PrepareForRetry(ctx context.Context) error { // Reset commit triggers. These must be reconfigured by the client during the // next retry. txn.commitTriggers = nil + // Reset rollback triggers. These must be reconfigured by the client during the + // next retry. + txn.rollbackTriggers = nil txn.mu.Lock() defer txn.mu.Unlock() @@ -1383,9 +1399,17 @@ func (txn *Txn) Send( if pErr == nil { // Invoking the commit triggers here ensures they run even in the case when a // commit request is issued manually (not via Commit). - if et, ok := ba.GetArg(kvpb.EndTxn); ok && et.(*kvpb.EndTxnRequest).Commit { - for _, t := range txn.commitTriggers { - t(ctx) + if et, ok := ba.GetArg(kvpb.EndTxn); ok { + if et.(*kvpb.EndTxnRequest).Commit { + for _, t := range txn.commitTriggers { + t(ctx) + } + } else { + // Invoking the rollback triggers here ensures they run even in the case when a + // rollback request is issued manually (not via Rollback). 
+ for _, t := range txn.rollbackTriggers { + t(ctx) + } } } return br, nil diff --git a/pkg/sql/queuefeed/queuebase/queuebase.go b/pkg/sql/queuefeed/queuebase/queuebase.go index c4336e37c2d2..827f7bbd3e61 100644 --- a/pkg/sql/queuefeed/queuebase/queuebase.go +++ b/pkg/sql/queuefeed/queuebase/queuebase.go @@ -14,4 +14,5 @@ type Manager interface { type Reader interface { GetRows(ctx context.Context, limit int) ([]tree.Datums, error) ConfirmReceipt(ctx context.Context) + RollbackBatch(ctx context.Context) } diff --git a/pkg/sql/queuefeed/reader.go b/pkg/sql/queuefeed/reader.go index 7d76e770cc8a..9580635e9558 100644 --- a/pkg/sql/queuefeed/reader.go +++ b/pkg/sql/queuefeed/reader.go @@ -288,6 +288,25 @@ func (r *Reader) ConfirmReceipt(ctx context.Context) { } } +func (r *Reader) RollbackBatch(ctx context.Context) { + if r.isShutdown.Load() { + return + } + + r.mu.Lock() + defer r.mu.Unlock() + + fmt.Printf("rolling back batch with inflightBuffer len: %d\n", len(r.mu.inflightBuffer)) + + newBuf := make([]tree.Datums, 0, len(r.mu.inflightBuffer)+len(r.mu.buf)) + newBuf = append(newBuf, r.mu.inflightBuffer...) + newBuf = append(newBuf, r.mu.buf...) + r.mu.buf = newBuf + r.mu.inflightBuffer = r.mu.inflightBuffer[:0] + + r.mu.state = readerStateBatching +} + func (r *Reader) IsAlive() bool { return !r.isShutdown.Load() } diff --git a/pkg/sql/sem/builtins/generator_builtins.go b/pkg/sql/sem/builtins/generator_builtins.go index 42da4d7bea4c..f1ea20223d92 100644 --- a/pkg/sql/sem/builtins/generator_builtins.go +++ b/pkg/sql/sem/builtins/generator_builtins.go @@ -4400,12 +4400,15 @@ func (g *queueFeedGenerator) Start(ctx context.Context, txn *kv.Txn) error { return err } - // Attach commit hook to txn to confirm receipt - // or something... todo on rollback/abort. + // Attach commit hook to txn to confirm receipt on successful commit. txn.AddCommitTrigger(func(ctx context.Context) { - // TODO(queuefeed): handle error properly. qr.ConfirmReceipt(ctx) }) + // On rollback, we don't confirm receipt since the transaction didn't commit + // and the rows shouldn't be considered consumed. 
+ txn.AddRollbackTrigger(func(ctx context.Context) { + qr.RollbackBatch(ctx) + }) rows, err := qr.GetRows(ctx, g.limit) if err != nil { From 26c65e74aaa2b641213d57a827e64863ca8f5fb4 Mon Sep 17 00:00:00 2001 From: Miles Frankel Date: Wed, 12 Nov 2025 21:10:06 +0000 Subject: [PATCH 25/46] fix context bug --- pkg/sql/queuefeed/reader.go | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/pkg/sql/queuefeed/reader.go b/pkg/sql/queuefeed/reader.go index 9580635e9558..9fbaba9cd6bd 100644 --- a/pkg/sql/queuefeed/reader.go +++ b/pkg/sql/queuefeed/reader.go @@ -87,8 +87,6 @@ func NewReader( rff: rff, tableID: descpb.ID(tableDescID), triggerCheckForReassignment: make(chan struct{}), - // stored so we can use it in methods using a different context than the main goro ie GetRows and ConfirmReceipt - goroCtx: ctx, } r.mu.state = readerStateBatching r.mu.buf = make([]tree.Datums, 0, maxBufSize) @@ -102,6 +100,7 @@ func NewReader( r.mu.poppedWakeup.Broadcast() r.mu.pushedWakeup.Broadcast() } + r.goroCtx = ctx r.setupRangefeed(ctx) go r.run(ctx) @@ -175,15 +174,16 @@ func (r *Reader) setupRangefeed(ctx context.Context) { setErr(err) return } - _ = rf - // TODO: rf.Close() on close - + _ = context.AfterFunc(r.goroCtx, func() { + fmt.Println("closing rangefeed") + rf.Close() + })() } // - [x] setup rangefeed on data // - [ ] handle only watching my partitions // - [ ] after each batch, ask mgr if i need to change assignments -// - [ ] buffer rows in the background before being asked for them +// - [X] buffer rows in the background before being asked for them // - [ ] checkpoint frontier if our frontier has advanced and we confirmed receipt // - [ ] gonna need some way to clean stuff up on conn_executor.close() @@ -259,9 +259,6 @@ func (r *Reader) GetRows(ctx context.Context, limit int) ([]tree.Datums, error) fmt.Printf("GetRows done with inflightBuffer len: %d, buf len: %d\n", len(r.mu.inflightBuffer), len(r.mu.buf)) return slices.Clone(r.mu.inflightBuffer), nil - - // and then trigger the goro to check if m wants us to change assignments - // if it does, handle that stuff before doing a new batch } func (r *Reader) ConfirmReceipt(ctx context.Context) { @@ -339,7 +336,6 @@ func (r *Reader) checkForReassignment(ctx context.Context) error { return nil } -// TODO: this is all highly sus func (r *Reader) decodeRangefeedValue( ctx context.Context, rfv *kvpb.RangeFeedValue, ) (tree.Datums, error) { From b91617cf1460cbb3704f50850ac7e2bbefa9a846 Mon Sep 17 00:00:00 2001 From: Jeff Swenson Date: Wed, 12 Nov 2025 15:40:56 -0500 Subject: [PATCH 26/46] queuefeed: add a smoke test --- pkg/sql/queuefeed/BUILD.bazel | 2 + pkg/sql/queuefeed/reader.go | 3 -- pkg/sql/queuefeed/smoke_test.go | 73 +++++++++++++++++++++++++++++++++ 3 files changed, 75 insertions(+), 3 deletions(-) create mode 100644 pkg/sql/queuefeed/smoke_test.go diff --git a/pkg/sql/queuefeed/BUILD.bazel b/pkg/sql/queuefeed/BUILD.bazel index adc872f25b73..2f65d87b04c6 100644 --- a/pkg/sql/queuefeed/BUILD.bazel +++ b/pkg/sql/queuefeed/BUILD.bazel @@ -43,6 +43,7 @@ go_test( "main_test.go", "manager_test.go", "partitions_test.go", + "smoke_test.go", ], embed = [":queuefeed"], deps = [ @@ -65,5 +66,6 @@ go_test( "//pkg/util/uuid", "@com_github_cockroachdb_errors//:errors", "@com_github_stretchr_testify//require", + "@org_golang_x_sync//errgroup", ], ) diff --git a/pkg/sql/queuefeed/reader.go b/pkg/sql/queuefeed/reader.go index 9580635e9558..1e34e748b448 100644 --- a/pkg/sql/queuefeed/reader.go +++ b/pkg/sql/queuefeed/reader.go @@ 
-162,9 +162,6 @@ func (r *Reader) setupRangefeed(ctx context.Context) { fmt.Sprintf("queuefeed.reader.name=%s", r.name), initialTS, onValue, opts..., ) - // TODO: why are we given a zero codec? - r.codec = keys.MakeSQLCodec(roachpb.SystemTenantID) - tk := roachpb.Span{Key: r.codec.TablePrefix(uint32(r.tableID))} tk.EndKey = tk.Key.PrefixEnd() spans := []roachpb.Span{tk} diff --git a/pkg/sql/queuefeed/smoke_test.go b/pkg/sql/queuefeed/smoke_test.go new file mode 100644 index 000000000000..1ed10c50e45e --- /dev/null +++ b/pkg/sql/queuefeed/smoke_test.go @@ -0,0 +1,73 @@ +package queuefeed + +import ( + "context" + "testing" + "time" + + "github.com/cockroachdb/cockroach/pkg/base" + "github.com/cockroachdb/cockroach/pkg/testutils/serverutils" + "github.com/cockroachdb/cockroach/pkg/testutils/sqlutils" + "github.com/cockroachdb/cockroach/pkg/util/leaktest" + "github.com/cockroachdb/cockroach/pkg/util/log" + "github.com/stretchr/testify/require" + "golang.org/x/sync/errgroup" +) + +func TestQueuefeedSmoketest(t *testing.T) { + defer leaktest.AfterTest(t)() + defer log.Scope(t).Close(t) + + ctx := context.Background() + srv, sqlDB, _ := serverutils.StartServer(t, base.TestServerArgs{}) + defer srv.Stopper().Stop(ctx) + + db := sqlutils.MakeSQLRunner(sqlDB) + db.Exec(t, `CREATE TABLE t (k string primary key)`) + _, err := srv.SystemLayer().SQLConn(t).Exec(`SET CLUSTER SETTING kv.rangefeed.enabled = true`) + require.NoError(t, err) + + var tableID int64 + db.QueryRow(t, "SELECT id FROM system.namespace where name = 't'").Scan(&tableID) + db.Exec(t, `SELECT crdb_internal.create_queue_feed('test_queue', $1)`, tableID) + + // TODO improve this test once creating the queue sets an accurate cursor. We + // should be able to read an expected set of rows. + ctx, cancel := context.WithCancel(ctx) + group, ctx := errgroup.WithContext(ctx) + group.Go(func() error { + for i := 0; ctx.Err() == nil; i++ { + t.Log("inserting row", i) + db.Exec(t, `INSERT INTO t VALUES ($1::STRING)`, i) + time.Sleep(100 * time.Millisecond) + } + return nil + }) + + conn, err := srv.SQLConn(t).Conn(context.Background()) + require.NoError(t, err) + defer func() { _ = conn.Close() }() + + // Try to read from the queue until we observe some data. The queue doesn't + // currently track the frontier, so we need to keep inserting data because + // there is a race between inserting and reading from the queue. + found := 0 + for found < 1 { + t.Log("reading from queue feed") + + cursor, err := conn.QueryContext(ctx, "SELECT * FROM crdb_internal.select_from_queue_feed('test_queue', 1)") + require.NoError(t, err) + + for cursor.Next() { + var k string + require.NoError(t, cursor.Scan(&k)) + found++ + } + + require.NoError(t, cursor.Err()) + require.NoError(t, cursor.Close()) + } + + cancel() + require.NoError(t, group.Wait()) +} From a8af8561e1699b1468674d7f2fb3d303d12e3cc9 Mon Sep 17 00:00:00 2001 From: Jeff Swenson Date: Wed, 12 Nov 2025 14:31:03 -0500 Subject: [PATCH 27/46] queuefeed: create partition cache This change includes the partition cache. The idea is this will be refreshed based on the rangefeed and local transactions that update the table. The core assignment algorithms were implemented against the cache which allows us to quickly test a large number of random scenarios. The production assigner will basically mimic the simulation assigner, but it will use txns to update rows in the db before applying the updates to the cache. 
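As a rough sketch of that direction (not part of this patch): the production
refresh path could plan against the stale cache, persist the planned rows in a
transaction, and only then fold the written rows back into the cache. The
snippet below assumes it lives in package queuefeed next to this code and
reuses the planAssignment, UpdatePartition, and Update signatures introduced
here; the wrapper name refreshOnce and its error handling are illustrative
only.

    // refreshOnce plans an assignment change against the (possibly stale)
    // cache, persists the planned partition rows in a transaction, and only
    // then applies the same rows to the cache.
    func refreshOnce(
        ctx context.Context,
        db isql.DB,
        pt *partitionTable,
        cache *partitionCache,
        session Session,
    ) error {
        tryRelease, tryClaim, trySteal := cache.planAssignment(session, true /* caughtUp */, *cache)
        updates := make(map[int64]Partition)
        for _, partition := range tryRelease {
            // Hand the partition over to its successor.
            partition.Session, partition.Successor = partition.Successor, Session{}
            updates[partition.ID] = partition
        }
        if !tryClaim.Empty() {
            tryClaim.Session = session
            updates[tryClaim.ID] = tryClaim
        }
        if !trySteal.Empty() {
            trySteal.Successor = session
            updates[trySteal.ID] = trySteal
        }
        if len(updates) == 0 {
            return nil
        }
        // Persist the planned changes before exposing them to the cache.
        if err := db.Txn(ctx, func(ctx context.Context, txn isql.Txn) error {
            for _, partition := range updates {
                if err := pt.UpdatePartition(ctx, txn, partition); err != nil {
                    return err
                }
            }
            return nil
        }); err != nil {
            return err
        }
        cache.Update(updates)
        return nil
    }
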
--- pkg/sql/queuefeed/BUILD.bazel | 2 + pkg/sql/queuefeed/partition_cache.go | 275 ++++++++++++++++++++++ pkg/sql/queuefeed/partition_cache_test.go | 211 +++++++++++++++++ pkg/sql/queuefeed/partitions.go | 2 +- 4 files changed, 489 insertions(+), 1 deletion(-) create mode 100644 pkg/sql/queuefeed/partition_cache.go create mode 100644 pkg/sql/queuefeed/partition_cache_test.go diff --git a/pkg/sql/queuefeed/BUILD.bazel b/pkg/sql/queuefeed/BUILD.bazel index 2f65d87b04c6..e80085d77e43 100644 --- a/pkg/sql/queuefeed/BUILD.bazel +++ b/pkg/sql/queuefeed/BUILD.bazel @@ -5,6 +5,7 @@ go_library( srcs = [ "assignments.go", "manager.go", + "partition_cache.go", "partitions.go", "reader.go", ], @@ -42,6 +43,7 @@ go_test( "assignments_test.go", "main_test.go", "manager_test.go", + "partition_cache_test.go", "partitions_test.go", "smoke_test.go", ], diff --git a/pkg/sql/queuefeed/partition_cache.go b/pkg/sql/queuefeed/partition_cache.go new file mode 100644 index 000000000000..f19be3fca26f --- /dev/null +++ b/pkg/sql/queuefeed/partition_cache.go @@ -0,0 +1,275 @@ +package queuefeed + +import ( + "fmt" + "math/rand" + "sort" + "strings" +) + +type partitionCache struct { + // partitions is a stale cache of the current state of the partition's + // table. Any assignment decisions and updates should be made using + // transactions. + partitions map[int64]Partition + + // assignmentIndex is a map of sessions to assigned partitions. + assignmentIndex map[Session]map[int64]struct{} + + sessions map[Session]struct{} +} + +func (p *partitionCache) DebugString() string { + var result strings.Builder + + result.WriteString("PartitionCache Debug:\n") + result.WriteString("===================\n\n") + + // Print partitions + result.WriteString("Partitions:\n") + if len(p.partitions) == 0 { + result.WriteString(" (none)\n") + } else { + for id, partition := range p.partitions { + result.WriteString(fmt.Sprintf(" ID: %d", id)) + if !partition.Session.Empty() { + result.WriteString(fmt.Sprintf(" | Session: %s", partition.Session.ConnectionID.String()[:8])) + } else { + result.WriteString(" | Session: (unassigned)") + } + result.WriteString("\n") + } + } + + // Print assignment index + result.WriteString("\nAssignment Index (session -> partitions):\n") + if len(p.assignmentIndex) == 0 { + result.WriteString(" (none)\n") + } else { + for session, partitions := range p.assignmentIndex { + result.WriteString(fmt.Sprintf(" %s: [", session.ConnectionID.String()[:8])) + partitionIDs := make([]int64, 0, len(partitions)) + for id := range partitions { + partitionIDs = append(partitionIDs, id) + } + sort.Slice(partitionIDs, func(i, j int) bool { + return partitionIDs[i] < partitionIDs[j] + }) + for i, id := range partitionIDs { + if i > 0 { + result.WriteString(", ") + } + result.WriteString(fmt.Sprintf("%d", id)) + } + result.WriteString("]\n") + } + } + + return result.String() +} + +func (p *partitionCache) Init(partitions []Partition) { + p.partitions = make(map[int64]Partition) + p.assignmentIndex = make(map[Session]map[int64]struct{}) + + for _, partition := range partitions { + p.addPartition(partition) + } +} + +func (p *partitionCache) Update(partitions map[int64]Partition) { + // TODO(queuefeed): When we introduce rangefeeds we probably need to add mvcc + // version to Partition to make sure updates from sql statements are kept + // coherent with updates from the rangefeed. 
+ for id, newPartition := range partitions { + oldPartition := p.partitions[id] + switch { + case newPartition.Empty(): + p.removePartition(id) + case oldPartition.Empty(): + p.addPartition(newPartition) + default: + p.updatePartition(oldPartition, newPartition) + } + } +} + +func (p *partitionCache) removePartition(partitionID int64) { + partition, exists := p.partitions[partitionID] + if !exists { + return + } + + delete(p.partitions, partitionID) + + // Remove from session index + if !partition.Session.Empty() { + if sessions, ok := p.assignmentIndex[partition.Session]; ok { + delete(sessions, partitionID) + if len(sessions) == 0 { + delete(p.assignmentIndex, partition.Session) + } + } + } +} + +func (p *partitionCache) addPartition(partition Partition) { + // Add to main partition map + p.partitions[partition.ID] = partition + + // Add to session index and partition index for assigned session + if !partition.Session.Empty() { + if _, ok := p.assignmentIndex[partition.Session]; !ok { + p.assignmentIndex[partition.Session] = make(map[int64]struct{}) + } + p.assignmentIndex[partition.Session][partition.ID] = struct{}{} + } + +} + +func (p *partitionCache) updatePartition(oldPartition, newPartition Partition) { + // Update main partition map + p.partitions[newPartition.ID] = newPartition + + // Remove old session assignments + if !oldPartition.Session.Empty() { + if sessions, ok := p.assignmentIndex[oldPartition.Session]; ok { + delete(sessions, oldPartition.ID) + if len(sessions) == 0 { + delete(p.assignmentIndex, oldPartition.Session) + } + } + } + + // Add new session assignments + if !newPartition.Session.Empty() { + if _, ok := p.assignmentIndex[newPartition.Session]; !ok { + p.assignmentIndex[newPartition.Session] = make(map[int64]struct{}) + } + p.assignmentIndex[newPartition.Session][newPartition.ID] = struct{}{} + } +} + +func (p *partitionCache) isStale(assignment *Assignment) bool { + cachedAssignment := p.assignmentIndex[assignment.Session] + if len(assignment.Partitions) != len(cachedAssignment) { + return true + } + for _, partition := range assignment.Partitions { + if _, ok := cachedAssignment[partition.ID]; !ok { + return true + } + } + return false +} + +func (p *partitionCache) constructAssignment(session Session) *Assignment { + assignment := &Assignment{ + Session: session, + Partitions: make([]Partition, 0, len(p.assignmentIndex[session])), + } + for partitionID := range p.assignmentIndex[session] { + assignment.Partitions = append(assignment.Partitions, p.partitions[partitionID]) + } + sort.Slice(assignment.Partitions, func(i, j int) bool { + return assignment.Partitions[i].ID < assignment.Partitions[j].ID + }) + return assignment +} + +func (p *partitionCache) planRegister( + session Session, cache partitionCache, +) (tryClaim Partition, trySteal Partition) { + // Check to see if there is an an unassigned partition that can be claimed. 
+	for _, partition := range cache.partitions {
+		if partition.Session.Empty() {
+			return partition, Partition{}
+		}
+	}
+	maxPartitions := (len(p.partitions) + len(p.assignmentIndex) - 1) / len(p.assignmentIndex)
+	return Partition{}, p.planTheft(1, maxPartitions)
+}
+
+func (p *partitionCache) planAssignment(
+	session Session, caughtUp bool, cache partitionCache,
+) (tryRelease []Partition, tryClaim Partition, trySteal Partition) {
+
+	for partitionId := range p.assignmentIndex[session] {
+		partition := p.partitions[partitionId]
+		if !partition.Successor.Empty() {
+			tryRelease = append(tryRelease, partition)
+		}
+	}
+	if len(tryRelease) != 0 {
+		return tryRelease, Partition{}, Partition{}
+	}
+
+	// If we aren't caught up, we should not try to claim any new partitions.
+	if !caughtUp {
+		return nil, Partition{}, Partition{}
+	}
+
+	// Check to see if there is an unassigned partition that can be claimed.
+	for _, partition := range p.partitions {
+		// TODO(jeffswenson): we should really try to claim a random partition to
+		// avoid contention.
+		if partition.Session.Empty() {
+			return nil, partition, Partition{}
+		}
+	}
+
+	// maxPartitions is the maximum number of partitions we would expect to be
+	// assigned to this session.
+	maxPartitions := len(p.partitions)
+	if len(p.assignmentIndex) != 0 {
+		maxPartitions = (len(p.partitions) + len(p.assignmentIndex) - 1) / len(p.assignmentIndex)
+	}
+	assignedPartitions := len(p.assignmentIndex[session])
+	if maxPartitions <= assignedPartitions {
+		return nil, Partition{}, Partition{}
+	}
+
+	// NOTE: planTheft may return an empty partition. E.g. consider the case
+	// where there are two sessions and three partitions. In that case the
+	// maximum partition assignment is 2, but one session will end up with
+	// only 1 partition. That session will consider stealing even though the
+	// partitions are balanced.
+	//
+	// We prioritize stealing partitions from any client that has more than the
+	// maximum expected number of partitions. But we are willing to steal from
+	// any client that has two more partitions than this client currently has.
+	// Stealing from someone with less than the maximum expected number of
+	// partitions is needed to handle distributions like:
+	// a -> 3 partitions
+	// b -> 3 partitions
+	// c -> 1 partition
+	return nil, Partition{}, p.planTheft(assignedPartitions+1, maxPartitions)
+}
+
+// planTheft selects a partition to steal from a session that has more
+// partitions assigned to it than expected. Sessions holding more than the
+// maximum expected number of partitions are preferred; failing that, any
+// session holding more than the minimum expected number is a candidate for
+// work stealing.
+func (p *partitionCache) planTheft(minumExpected, maximumExpected int) Partition { + richCandidates, eligibleCandidates := []Partition{}, []Partition{} + for _, session := range p.assignmentIndex { + assignedPartitions := len(session) + if maximumExpected < assignedPartitions { + for partitionID := range session { + richCandidates = append(richCandidates, p.partitions[partitionID]) + } + } + if minumExpected < assignedPartitions { + for partitionID := range session { + eligibleCandidates = append(eligibleCandidates, p.partitions[partitionID]) + } + } + } + + if len(richCandidates) != 0 { + return richCandidates[rand.Intn(len(richCandidates))] + } + if len(eligibleCandidates) != 0 { + return eligibleCandidates[rand.Intn(len(eligibleCandidates))] + } + return Partition{} +} diff --git a/pkg/sql/queuefeed/partition_cache_test.go b/pkg/sql/queuefeed/partition_cache_test.go new file mode 100644 index 000000000000..4973e4f4d345 --- /dev/null +++ b/pkg/sql/queuefeed/partition_cache_test.go @@ -0,0 +1,211 @@ +package queuefeed + +import ( + "math/rand" + "testing" + + "github.com/cockroachdb/cockroach/pkg/sql/sqlliveness" + "github.com/cockroachdb/cockroach/pkg/util/uuid" + "github.com/stretchr/testify/require" +) + +type assignmentSimulator struct { + t *testing.T + sessions []Session + cache partitionCache +} + +func newAssignmentSimulator(t *testing.T, partitionCount int) *assignmentSimulator { + partitions := make([]Partition, partitionCount) + for i := range partitions { + partitions[i] = Partition{ + ID: int64(i + 1), + Session: Session{}, // Unassigned + Successor: Session{}, + } + } + sim := &assignmentSimulator{t: t} + sim.cache.Init(partitions) + return sim +} + +// refreshAssignment returns true if refreshing the assignment took any action. +func (a *assignmentSimulator) refreshAssignment(session Session) bool { + tryRelease, tryClaim, trySecede := a.cache.planAssignment(session, true, a.cache) + + updates := make(map[int64]Partition) + + // This is simulating the production implementation. The prod implementation + // would use a txn to apply these changes to the DB, then update the cache + // with the latest version of the rows. + for _, partition := range tryRelease { + updates[partition.ID] = Partition{ + ID: partition.ID, + Session: partition.Successor, + Successor: Session{}, + Span: partition.Span, + } + } + if !tryClaim.Empty() { + updates[tryClaim.ID] = Partition{ + ID: tryClaim.ID, + Session: session, + Successor: Session{}, + Span: tryClaim.Span, + } + } + if !trySecede.Empty() { + updates[trySecede.ID] = Partition{ + ID: trySecede.ID, + Session: trySecede.Session, + Successor: session, + Span: trySecede.Span, + } + } + + a.cache.Update(updates) + + return len(updates) != 0 +} + +func (a *assignmentSimulator) createSession() Session { + session := Session{ + ConnectionID: uuid.MakeV4(), + LivenessID: sqlliveness.SessionID(uuid.MakeV4().String()), + } + + a.sessions = append(a.sessions, session) + + // This is simulating the production implementation. The prod implementation + // would use a txn to apply these changes to the DB, then update the cache + // with the latest version of the rows. 
+ tryClaim, trySecede := a.cache.planRegister(session, a.cache) + + updates := make(map[int64]Partition) + if !tryClaim.Empty() { + tryClaim.Session = session + updates[tryClaim.ID] = tryClaim + } + if !trySecede.Empty() { + trySecede.Successor = session + updates[trySecede.ID] = trySecede + } + a.cache.Update(updates) + + return session +} + +func (a *assignmentSimulator) removeSession(session Session) { + // Remove session from sessions list + for i, s := range a.sessions { + if s == session { + a.sessions = append(a.sessions[:i], a.sessions[i+1:]...) + break + } + } + + assignment := a.cache.constructAssignment(session) + updates := make(map[int64]Partition) + + for _, partition := range assignment.Partitions { + // For each partition assigned to this session, make the successor (if any) + // the owner. + updates[partition.ID] = Partition{ + ID: partition.ID, + Session: partition.Successor, + Successor: Session{}, // Clear successor + Span: partition.Span, + } + } + // Any partitions where this session is the successor should have + // the successor cleared. + for id := range a.cache.partitions { + if a.cache.partitions[id].Successor != session { + continue + } + partition := a.cache.partitions[id] + updates[partition.ID] = Partition{ + ID: partition.ID, + Session: partition.Session, // Keep current owner + Successor: Session{}, // Clear successor + Span: partition.Span, + } + } + + a.cache.Update(updates) +} + +func (a *assignmentSimulator) runToStable() { + maxIterations := 100000 // Prevent infinite loops + + for i := 0; i < maxIterations; i++ { + actionTaken := false + + // Process each session + for _, session := range a.sessions { + if a.refreshAssignment(session) { + actionTaken = true + } + } + + // If no action was taken in this round, we're stable + if !actionTaken { + return + } + } + + a.t.Fatalf("runToStable exceeded maximum iterations (%d sessions, %d partitions): %s ", len(a.sessions), len(a.cache.partitions), a.cache.DebugString()) +} + +func TestPartitionCacheSimple(t *testing.T) { + sim := newAssignmentSimulator(t, 2) + + // Create two sessions. + session1 := sim.createSession() + sim.runToStable() + session2 := sim.createSession() + sim.runToStable() + + // Each session should have one partition. + assignment1 := sim.cache.constructAssignment(session1) + assignment2 := sim.cache.constructAssignment(session2) + require.Len(t, assignment1.Partitions, 1) + require.Len(t, assignment2.Partitions, 1) + + // After removing one session, the other session should have both partitions. + sim.removeSession(session1) + sim.runToStable() + assignment2 = sim.cache.constructAssignment(session2) + require.Len(t, assignment2.Partitions, 2) +} + +func TestPartitionCacheRandom(t *testing.T) { + partitions := rand.Intn(1000) + 1 + sessions := make([]Session, rand.Intn(100)+1) + + sim := newAssignmentSimulator(t, partitions) + + for i := range sessions { + if rand.Int()%2 == 0 { + sim.runToStable() + } + sessions[i] = sim.createSession() + } + sim.runToStable() + + t.Logf("%d partitions, %d sessions", partitions, len(sessions)) + t.Log(sim.cache.DebugString()) + + // Verify all partitions are assigned + for _, partition := range sim.cache.partitions { + require.False(t, partition.Session.Empty()) + } + + // Verify that no session has more than ceil(partitions / len(sessions)) + // partitions. 
+ maxPerSession := (partitions + len(sessions) - 1) / len(sessions) + for _, session := range sessions { + assignment := sim.cache.constructAssignment(session) + require.LessOrEqual(t, len(assignment.Partitions), maxPerSession) + } +} diff --git a/pkg/sql/queuefeed/partitions.go b/pkg/sql/queuefeed/partitions.go index d4a2e8af5bac..c079159cf200 100644 --- a/pkg/sql/queuefeed/partitions.go +++ b/pkg/sql/queuefeed/partitions.go @@ -171,7 +171,7 @@ type Session struct { LivenessID sqlliveness.SessionID } -func (s *Session) Empty() bool { +func (s Session) Empty() bool { return s.ConnectionID == uuid.Nil && s.LivenessID == "" } From 17758ef61a80560eebd13436f97b96454149a83d Mon Sep 17 00:00:00 2001 From: Jeff Swenson Date: Wed, 12 Nov 2025 17:26:51 -0500 Subject: [PATCH 28/46] queuefeed: wire up the assigner --- pkg/sql/queuefeed/assignments.go | 11 +++- pkg/sql/queuefeed/manager.go | 29 +++++++--- pkg/sql/queuefeed/partitions.go | 13 ++--- pkg/sql/queuefeed/partitions_test.go | 4 +- pkg/sql/queuefeed/reader.go | 82 ++++++++++++++++++---------- 5 files changed, 91 insertions(+), 48 deletions(-) diff --git a/pkg/sql/queuefeed/assignments.go b/pkg/sql/queuefeed/assignments.go index 7e7524e2a009..5303c7c75343 100644 --- a/pkg/sql/queuefeed/assignments.go +++ b/pkg/sql/queuefeed/assignments.go @@ -63,9 +63,11 @@ func (p *PartitionAssignments) RegisterSession( return err } for _, partition := range partitions { - if !partition.Session.Empty() { - continue - } + // TODO we really shouldn't force assign partitions, but we are not watch + // sql liveness so we can't detect dead sessions yet. + //if !partition.Session.Empty() { + // continue + //} partition.Session = session if err := p.partitionTable.UpdatePartition(ctx, txn, partition); err != nil { return errors.Wrapf(err, "updating partition %d for session %s", partition.ID, session.ConnectionID) @@ -82,6 +84,9 @@ func (p *PartitionAssignments) RegisterSession( } func (p *PartitionAssignments) UnregisterSession(ctx context.Context, session Session) error { + // TODO: this should probably be pushed onto some task queue that is + // independent of the pgwire session so we can retry without block connection + // cleanup. 
return p.db.Txn(ctx, func(ctx context.Context, txn isql.Txn) error { partitions, err := p.partitionTable.ListPartitions(ctx, txn) if err != nil { diff --git a/pkg/sql/queuefeed/manager.go b/pkg/sql/queuefeed/manager.go index ff0f752c3d27..e5bd5a5c7195 100644 --- a/pkg/sql/queuefeed/manager.go +++ b/pkg/sql/queuefeed/manager.go @@ -15,7 +15,9 @@ import ( "github.com/cockroachdb/cockroach/pkg/sql/queuefeed/queuebase" "github.com/cockroachdb/cockroach/pkg/sql/sem/tree" "github.com/cockroachdb/cockroach/pkg/sql/sessiondata" + "github.com/cockroachdb/cockroach/pkg/sql/sqlliveness" "github.com/cockroachdb/cockroach/pkg/util/syncutil" + "github.com/cockroachdb/cockroach/pkg/util/uuid" "github.com/cockroachdb/errors" ) @@ -32,6 +34,8 @@ type Manager struct { // name -> reader // TODO: this should actually be a map of (session id, name) -> reader, or smth readers map[string]*Reader + + queueAssignment map[string]*PartitionAssignments } } @@ -46,6 +50,7 @@ func NewManager( // handle handoff from one server to another m := &Manager{executor: executor, rff: rff, codec: codec, leaseMgr: leaseMgr} m.mu.readers = make(map[string]*Reader) + m.mu.queueAssignment = make(map[string]*PartitionAssignments) return m } @@ -118,7 +123,7 @@ func (m *Manager) CreateQueue(ctx context.Context, queueName string, tableDescID } partition := Partition{ ID: 1, - Span: &primaryKeySpan, + Span: primaryKeySpan, } return pt.InsertPartition(ctx, txn, partition) @@ -134,7 +139,11 @@ func (m *Manager) GetOrInitReader(ctx context.Context, name string) (queuebase.R return reader, nil } fmt.Printf("get or init reader for queue %s not found in cache\n", name) - reader, err := m.getOrInitReaderUncached(ctx, name) + reader, err := m.newReaderLocked(ctx, name, Session{ + // TODO(queuefeed): get a real session here. 
+ ConnectionID: uuid.MakeV4(), + LivenessID: sqlliveness.SessionID("1"), + }) if err != nil { return nil, err } @@ -142,8 +151,17 @@ func (m *Manager) GetOrInitReader(ctx context.Context, name string) (queuebase.R return reader, nil } -func (m *Manager) getOrInitReaderUncached(ctx context.Context, name string) (*Reader, error) { +func (m *Manager) newReaderLocked( + ctx context.Context, name string, session Session, +) (*Reader, error) { var tableDescID int64 + + assigner, ok := m.mu.queueAssignment[name] + if !ok { + assigner = NewPartitionAssignments(m.executor, name) + m.mu.queueAssignment[name] = assigner + } + // TODO: this ctx on the other hand should be stmt scoped err := m.executor.Txn(ctx, func(ctx context.Context, txn isql.Txn) error { _, err := txn.Exec(ctx, "create_q", txn.KV(), createQueueTableSQL) @@ -166,8 +184,7 @@ func (m *Manager) getOrInitReaderUncached(ctx context.Context, name string) (*Re return nil, err } fmt.Printf("get or init reader for queue %s with table desc id: %d\n", name, tableDescID) - reader := NewReader(ctx, m.executor, m, m.rff, m.codec, m.leaseMgr, name, tableDescID) - return reader, nil + return NewReader(ctx, m.executor, m, m.rff, m.codec, m.leaseMgr, session, assigner, name) } func (m *Manager) reassessAssignments(ctx context.Context, name string) (bool, error) { @@ -183,5 +200,3 @@ func (m *Manager) forgetReader(name string) { } var _ queuebase.Manager = &Manager{} - -type PartitionAssignment struct{} diff --git a/pkg/sql/queuefeed/partitions.go b/pkg/sql/queuefeed/partitions.go index c079159cf200..57467acc970f 100644 --- a/pkg/sql/queuefeed/partitions.go +++ b/pkg/sql/queuefeed/partitions.go @@ -21,7 +21,7 @@ type Partition struct { // `sql_liveness_session_successor` assigned to the partition. Successor Session // Span is decoded from the `partition_spec` column. 
- Span *roachpb.Span + Span roachpb.Span } type partitionTable struct { @@ -175,18 +175,15 @@ func (s Session) Empty() bool { return s.ConnectionID == uuid.Nil && s.LivenessID == "" } -func decodeSpan(data []byte) (*roachpb.Span, error) { +func decodeSpan(data []byte) (roachpb.Span, error) { var span roachpb.Span if err := span.Unmarshal(data); err != nil { - return nil, err + return roachpb.Span{}, err } - return &span, nil + return span, nil } -func encodeSpan(span *roachpb.Span) []byte { - if span == nil { - return nil - } +func encodeSpan(span roachpb.Span) []byte { data, err := span.Marshal() if err != nil { return nil diff --git a/pkg/sql/queuefeed/partitions_test.go b/pkg/sql/queuefeed/partitions_test.go index 5f13296b146b..65391597c03c 100644 --- a/pkg/sql/queuefeed/partitions_test.go +++ b/pkg/sql/queuefeed/partitions_test.go @@ -95,7 +95,7 @@ func TestUpdatePartition(t *testing.T) { // Update the partition newSessionID := uuid.MakeV4() newConnectionID := uuid.MakeV4() - span := &roachpb.Span{Key: roachpb.Key("new"), EndKey: roachpb.Key("span")} + span := roachpb.Span{Key: roachpb.Key("new"), EndKey: roachpb.Key("span")} partition := Partition{ ID: 1, @@ -145,7 +145,7 @@ func TestInsertPartition(t *testing.T) { // Insert partition sessionID := uuid.MakeV4() connectionID := uuid.MakeV4() - span := &roachpb.Span{Key: roachpb.Key("a"), EndKey: roachpb.Key("z")} + span := roachpb.Span{Key: roachpb.Key("a"), EndKey: roachpb.Key("z")} partition := Partition{ ID: 1, diff --git a/pkg/sql/queuefeed/reader.go b/pkg/sql/queuefeed/reader.go index 60b3e6f9f147..419e0a26bf0e 100644 --- a/pkg/sql/queuefeed/reader.go +++ b/pkg/sql/queuefeed/reader.go @@ -46,7 +46,7 @@ type Reader struct { rff *rangefeed.Factory mgr *Manager name string - tableID descpb.ID + assigner *PartitionAssignments // stuff for decoding data. this is ripped from rowfetcher_cache.go in changefeeds codec keys.SQLCodec @@ -61,7 +61,10 @@ type Reader struct { pushedWakeup *sync.Cond } - triggerCheckForReassignment chan struct{} + // TODO: handle the case where an assignment can change. 
+ session Session + assignment *Assignment + rangefeed *rangefeed.RangeFeed cancel context.CancelCauseFunc goroCtx context.Context @@ -75,18 +78,21 @@ func NewReader( rff *rangefeed.Factory, codec keys.SQLCodec, leaseMgr *lease.Manager, + session Session, + assigner *PartitionAssignments, name string, - tableDescID int64, -) *Reader { +) (*Reader, error) { r := &Reader{ - executor: executor, - mgr: mgr, - codec: codec, - leaseMgr: leaseMgr, - name: name, - rff: rff, - tableID: descpb.ID(tableDescID), - triggerCheckForReassignment: make(chan struct{}), + executor: executor, + mgr: mgr, + codec: codec, + leaseMgr: leaseMgr, + name: name, + rff: rff, + // stored so we can use it in methods using a different context than the main goro ie GetRows and ConfirmReceipt + goroCtx: ctx, + assigner: assigner, + session: session, } r.mu.state = readerStateBatching r.mu.buf = make([]tree.Datums, 0, maxBufSize) @@ -100,14 +106,21 @@ func NewReader( r.mu.poppedWakeup.Broadcast() r.mu.pushedWakeup.Broadcast() } - r.goroCtx = ctx - r.setupRangefeed(ctx) + assignment, err := assigner.RegisterSession(ctx, session) + if err != nil { + return nil, errors.Wrap(err, "registering session for reader") + } + if len(assignment.Partitions) == 0 { + return nil, errors.New("no partitions assigned to reader: todo support this case by polling for assignment") + } + + r.setupRangefeed(ctx, assignment) go r.run(ctx) - return r + return r, nil } -func (r *Reader) setupRangefeed(ctx context.Context) { +func (r *Reader) setupRangefeed(ctx context.Context, assignment *Assignment) { defer func() { fmt.Println("setupRangefeed done") }() @@ -161,9 +174,10 @@ func (r *Reader) setupRangefeed(ctx context.Context) { fmt.Sprintf("queuefeed.reader.name=%s", r.name), initialTS, onValue, opts..., ) - tk := roachpb.Span{Key: r.codec.TablePrefix(uint32(r.tableID))} - tk.EndKey = tk.Key.PrefixEnd() - spans := []roachpb.Span{tk} + // TODO: handle the case where there are no partitions in the assignment. In + // that case we should poll `RefreshAssignment` until we get one. This would + // only occur if every assignment was handed out already. + spans := []roachpb.Span{assignment.Partitions[0].Span} fmt.Printf("starting rangefeed with spans: %+v\n", spans) @@ -171,10 +185,16 @@ func (r *Reader) setupRangefeed(ctx context.Context) { setErr(err) return } - _ = context.AfterFunc(r.goroCtx, func() { + + _ = context.AfterFunc(ctx, func() { + // TODO(queuefeed): move this to Close and hook Close into + // connExecutor.Close(). 
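+		// Until then, the rangefeed is torn down when the reader's context is
+		// canceled.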
fmt.Println("closing rangefeed") rf.Close() })() + + r.rangefeed = rf + r.assignment = assignment } // - [x] setup rangefeed on data @@ -196,12 +216,6 @@ func (r *Reader) run(ctx context.Context) { case <-ctx.Done(): fmt.Printf("run: ctx done: %s; cause: %s\n", ctx.Err(), context.Cause(ctx)) return - case <-r.triggerCheckForReassignment: - fmt.Printf("triggerCheckForReassignment\n") - if err := r.checkForReassignment(ctx); err != nil { - r.cancel(err) - return - } } } } @@ -256,6 +270,9 @@ func (r *Reader) GetRows(ctx context.Context, limit int) ([]tree.Datums, error) fmt.Printf("GetRows done with inflightBuffer len: %d, buf len: %d\n", len(r.mu.inflightBuffer), len(r.mu.buf)) return slices.Clone(r.mu.inflightBuffer), nil + + // and then trigger the goro to check if m wants us to change assignments + // if it does, handle that stuff before doing a new batch } func (r *Reader) ConfirmReceipt(ctx context.Context) { @@ -278,7 +295,15 @@ func (r *Reader) ConfirmReceipt(ctx context.Context) { return case <-r.goroCtx.Done(): return - case r.triggerCheckForReassignment <- struct{}{}: + default: + // TODO only set caughtUp to true if our frontier is near the current time. + newAssignment, err := r.assigner.RefreshAssignment(r.session /*caughtUp=*/, true) + if err != nil { + r.cancel(errors.Wrap(err, "refreshing assignment")) + } + if newAssignment != nil { + // TODO restart the rangefeed with the new partitions + } } } @@ -306,8 +331,9 @@ func (r *Reader) IsAlive() bool { } func (r *Reader) Close() error { + err := r.assigner.UnregisterSession(r.goroCtx, r.session) r.cancel(errors.New("reader closing")) - return nil + return err } func (r *Reader) checkForReassignment(ctx context.Context) error { From b5efef8729c156e6fcfcf134ef4942e44e652bb0 Mon Sep 17 00:00:00 2001 From: Miles Frankel Date: Thu, 13 Nov 2025 04:17:25 +0000 Subject: [PATCH 29/46] rudimentary reassignments --- pkg/sql/queuefeed/assignments.go | 52 +++++++++++++++----- pkg/sql/queuefeed/assignments_test.go | 65 +++++++++++++++++++++++++ pkg/sql/queuefeed/partitions.go | 10 ++-- pkg/sql/queuefeed/reader.go | 70 +++++++++++++++++++-------- 4 files changed, 164 insertions(+), 33 deletions(-) diff --git a/pkg/sql/queuefeed/assignments.go b/pkg/sql/queuefeed/assignments.go index 5303c7c75343..6b1dfeae4d88 100644 --- a/pkg/sql/queuefeed/assignments.go +++ b/pkg/sql/queuefeed/assignments.go @@ -1,7 +1,9 @@ package queuefeed import ( + "cmp" "context" + "slices" "github.com/cockroachdb/cockroach/pkg/sql/isql" "github.com/cockroachdb/errors" @@ -38,10 +40,36 @@ func NewPartitionAssignments(db isql.DB, queueName string) *PartitionAssignments // If a partition has a successor session, then calling RefreshAssignment will // return an assignment that does not include that partition. func (p *PartitionAssignments) RefreshAssignment( - session Session, caughtUp bool, + ctx context.Context, session Session, caughtUp bool, ) (updatedAssignment *Assignment, err error) { - // This is a stub implementation that assumes there is a single partition. - return nil, nil + // find my assignments and see if any of them have a successor session. return the ones that don't. + // TODO: this should be done in sql + // TODO: this handles partition handoff, but not hand...on... (?) 
+ var myPartitions []Partition + anyChanged := false + err = p.db.Txn(ctx, func(ctx context.Context, txn isql.Txn) error { + partitions, err := p.partitionTable.ListPartitions(ctx, txn) + if err != nil { + return err + } + for _, partition := range partitions { + if partition.Session != session { + continue + } + if !partition.Successor.Empty() { + anyChanged = true + continue + } + myPartitions = append(myPartitions, partition) + } + return nil + }) + if !anyChanged { + return nil, nil + } + + slices.SortFunc(myPartitions, func(a, b Partition) int { return cmp.Compare(a.ID, b.ID) }) + return &Assignment{Session: session, Partitions: myPartitions}, nil } // RegisterSession registers a new session. The session may be assigned zero @@ -110,14 +138,16 @@ func (p *PartitionAssignments) UnregisterSession(ctx context.Context, session Se }) } -func (p *PartitionAssignments) constructAssignment(session Session) (*Assignment, error) { - // Build an assignment for the given session from the partition cache. - return nil, errors.New("not implemented") -} - -func (p *PartitionAssignments) tryClaim(session Session, partition *Partition) (Partition, error) { - // Try to claim an unassigned partition for the given session. - return Partition{}, nil +// Try to claim a partition for the given session. +func (p *PartitionAssignments) TryClaim(ctx context.Context, session Session, partition Partition) (Partition, error) { + partition.Successor = session + err := p.db.Txn(ctx, func(ctx context.Context, txn isql.Txn) error { + return p.partitionTable.UpdatePartition(ctx, txn, partition) + }) + if err != nil { + return Partition{}, errors.Wrapf(err, "updating partition %d for session %s", partition.ID, session.ConnectionID) + } + return partition, nil } func (p *PartitionAssignments) tryRelease(session Session, toRelease []Partition) error { diff --git a/pkg/sql/queuefeed/assignments_test.go b/pkg/sql/queuefeed/assignments_test.go index b3a60d581328..c0150c378927 100644 --- a/pkg/sql/queuefeed/assignments_test.go +++ b/pkg/sql/queuefeed/assignments_test.go @@ -6,6 +6,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/base" "github.com/cockroachdb/cockroach/pkg/sql" + "github.com/cockroachdb/cockroach/pkg/sql/isql" "github.com/cockroachdb/cockroach/pkg/sql/queuefeed" "github.com/cockroachdb/cockroach/pkg/sql/sqlliveness" "github.com/cockroachdb/cockroach/pkg/testutils/serverutils" @@ -58,3 +59,67 @@ func TestPartitionAssignments(t *testing.T) { "SELECT sql_liveness_session, user_session FROM defaultdb.queue_partition_"+queueName, [][]string{{"NULL", "NULL"}}) } + +func TestPartitionReassignments(t *testing.T) { + defer leaktest.AfterTest(t)() + defer log.Scope(t).Close(t) + + ctx := context.Background() + s, sqlDB, _ := serverutils.StartServer(t, base.TestServerArgs{}) + defer s.Stopper().Stop(ctx) + + tdb := sqlutils.MakeSQLRunner(sqlDB) + // tdb.Exec(t, "CREATE TABLE test_table (id INT PRIMARY KEY, data TEXT)") // TODO: why does this fail with "empty encoded value"? 
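+	// Scenario: a reader owns the queue's single partition, a second session
+	// claims it as successor, and after confirming a batch the reader should
+	// notice the handoff, so subsequent reads fail.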
+ tdb.Exec(t, "CREATE TABLE test_table (a string)") + + var tableDescID int64 + tdb.QueryRow(t, "SELECT id FROM system.namespace WHERE name = 'test_table'").Scan(&tableDescID) + + // Create queue using QueueManager + manager := queuefeed.NewTestManager(t, s.ApplicationLayer()) + queueName := "test_queue" + err := manager.CreateQueue(ctx, queueName, tableDescID) + require.NoError(t, err) + + pa := queuefeed.NewPartitionAssignments(s.ExecutorConfig().(sql.ExecutorConfig).InternalDB, queueName) + + reader, err := manager.GetOrInitReader(ctx, "test_queue") + require.NoError(t, err) + + // get the session the reader is using + var partition queuefeed.Partition + pt := queuefeed.TestNewPartitionsTable(queueName) + err = s.ExecutorConfig().(sql.ExecutorConfig).InternalDB.Txn(ctx, func(ctx context.Context, txn isql.Txn) error { + partitions, err := pt.ListPartitions(ctx, txn) + require.NoError(t, err) + require.Len(t, partitions, 1) + partition = partitions[0] + return nil + }) + require.NoError(t, err) + + // some other session tries to claim the partition + someOtherSession := queuefeed.Session{ + ConnectionID: uuid.MakeV4(), + LivenessID: sqlliveness.SessionID("2"), + } + partition, err = pa.TryClaim(ctx, someOtherSession, partition) + require.NoError(t, err) + require.Equal(t, someOtherSession, partition.Successor) + + // do a read from the queue so it checks for a reassignment + tdb.Exec(t, "INSERT INTO test_table (a) VALUES ('test'), ('test2')") + rows, err := reader.GetRows(ctx, 1) + require.NoError(t, err) + require.Len(t, rows, 1) + // confirm receipt. it should then check for a reassignment and see that we disowned it + reader.ConfirmReceipt(ctx) + + // try to read again to see that it failed + // NOTE: we want this to not fail in the future but to sleep & poll instead. + // sooo maybe this isnt the best way to test this. but i cant think of a + // better way at this exact moment. 
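+	// Once readers poll for a new assignment instead of erroring, this should
+	// become a check that GetRows blocks (or times out) rather than fails.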
+ + rows, err = reader.GetRows(ctx, 1) + require.Error(t, err) +} diff --git a/pkg/sql/queuefeed/partitions.go b/pkg/sql/queuefeed/partitions.go index 57467acc970f..91d51593ea84 100644 --- a/pkg/sql/queuefeed/partitions.go +++ b/pkg/sql/queuefeed/partitions.go @@ -43,7 +43,7 @@ func (p *partitionTable) CreateSchema(ctx context.Context, txn isql.Txn) error { func (p *partitionTable) ListPartitions(ctx context.Context, txn isql.Txn) ([]Partition, error) { rows, err := txn.QueryBuffered(ctx, "list-partitions", txn.KV(), fmt.Sprintf(` - SELECT + SELECT partition_id, sql_liveness_session, user_session, @@ -112,7 +112,7 @@ func (p *partitionTable) InsertPartition( spanBytes := encodeSpan(partition.Span) _, err := txn.Exec(ctx, "insert-partition", txn.KV(), - fmt.Sprintf(`INSERT INTO defaultdb.queue_partition_%s + fmt.Sprintf(`INSERT INTO defaultdb.queue_partition_%s (partition_id, sql_liveness_session, user_session, sql_liveness_session_successor, user_session_successor, partition_spec) VALUES ($1, $2, $3, $4, $5, $6)`, p.queueName), partition.ID, sessionLivenessID, sessionConnectionID, @@ -146,7 +146,7 @@ func (p *partitionTable) UpdatePartition( spanBytes := encodeSpan(partition.Span) _, err := txn.Exec(ctx, "update-partition", txn.KV(), - fmt.Sprintf(`UPDATE defaultdb.queue_partition_%s + fmt.Sprintf(`UPDATE defaultdb.queue_partition_%s SET sql_liveness_session = $2, user_session = $3, sql_liveness_session_successor = $4, @@ -190,3 +190,7 @@ func encodeSpan(span roachpb.Span) []byte { } return data } + +func TestNewPartitionsTable(queueName string) *partitionTable { + return &partitionTable{queueName: queueName} +} diff --git a/pkg/sql/queuefeed/reader.go b/pkg/sql/queuefeed/reader.go index 419e0a26bf0e..802dfebef1a9 100644 --- a/pkg/sql/queuefeed/reader.go +++ b/pkg/sql/queuefeed/reader.go @@ -111,22 +111,28 @@ func NewReader( if err != nil { return nil, errors.Wrap(err, "registering session for reader") } - if len(assignment.Partitions) == 0 { - return nil, errors.New("no partitions assigned to reader: todo support this case by polling for assignment") + if err := r.setupRangefeed(ctx, assignment); err != nil { + return nil, errors.Wrap(err, "setting up rangefeed") } - - r.setupRangefeed(ctx, assignment) go r.run(ctx) return r, nil } -func (r *Reader) setupRangefeed(ctx context.Context, assignment *Assignment) { +var ErrNoPartitionsAssigned = errors.New("no partitions assigned to reader: todo support this case by polling for assignment") + +func (r *Reader) setupRangefeed(ctx context.Context, assignment *Assignment) error { defer func() { fmt.Println("setupRangefeed done") }() + // TODO: handle the case where there are no partitions in the assignment. In + // that case we should poll `RefreshAssignment` until we get one. This would + // only occur if every assignment was handed out already. 
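+	// Until then, fail fast so the caller sees ErrNoPartitionsAssigned rather
+	// than a reader that silently serves nothing.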
+ if len(assignment.Partitions) == 0 { + return errors.Wrap(ErrNoPartitionsAssigned, "setting up rangefeed") + } + incomingResolveds := make(chan hlc.Timestamp) - setErr := func(err error) { r.cancel(err) } onValue := func(ctx context.Context, value *kvpb.RangeFeedValue) { fmt.Printf("onValue: %+v\n", value) @@ -138,9 +144,13 @@ func (r *Reader) setupRangefeed(ctx context.Context, assignment *Assignment) { r.mu.poppedWakeup.Wait() } + if !value.Value.IsPresent() { + // not handling diffs/deletes rn + return + } datums, err := r.decodeRangefeedValue(ctx, value) if err != nil { - setErr(errors.Wrapf(err, "decoding rangefeed value: %+v", value)) + r.cancel(errors.Wrapf(err, "decoding rangefeed value: %+v", value)) return } r.mu.buf = append(r.mu.buf, datums) @@ -162,7 +172,7 @@ func (r *Reader) setupRangefeed(ctx context.Context, assignment *Assignment) { default: // TODO: handle resolveds (dont actually default here) } }), - rangefeed.WithOnInternalError(func(ctx context.Context, err error) { setErr(err) }), + rangefeed.WithOnInternalError(func(ctx context.Context, err error) { r.cancel(err) }), rangefeed.WithConsumerID(42), rangefeed.WithInvoker(func(fn func() error) error { return fn() }), rangefeed.WithFiltering(false), @@ -174,16 +184,12 @@ func (r *Reader) setupRangefeed(ctx context.Context, assignment *Assignment) { fmt.Sprintf("queuefeed.reader.name=%s", r.name), initialTS, onValue, opts..., ) - // TODO: handle the case where there are no partitions in the assignment. In - // that case we should poll `RefreshAssignment` until we get one. This would - // only occur if every assignment was handed out already. spans := []roachpb.Span{assignment.Partitions[0].Span} fmt.Printf("starting rangefeed with spans: %+v\n", spans) if err := rf.Start(ctx, spans); err != nil { - setErr(err) - return + return errors.Wrap(err, "starting rangefeed") } _ = context.AfterFunc(ctx, func() { @@ -195,15 +201,17 @@ func (r *Reader) setupRangefeed(ctx context.Context, assignment *Assignment) { r.rangefeed = rf r.assignment = assignment + return nil } // - [x] setup rangefeed on data // - [ ] handle only watching my partitions -// - [ ] after each batch, ask mgr if i need to change assignments +// - [X] after each batch, ask mgr if i need to assignments // - [X] buffer rows in the background before being asked for them // - [ ] checkpoint frontier if our frontier has advanced and we confirmed receipt // - [ ] gonna need some way to clean stuff up on conn_executor.close() +// TODO: this loop isnt doing much anymore. if we dont need it for anything else, let's remove it func (r *Reader) run(ctx context.Context) { defer func() { fmt.Println("run done") @@ -270,9 +278,6 @@ func (r *Reader) GetRows(ctx context.Context, limit int) ([]tree.Datums, error) fmt.Printf("GetRows done with inflightBuffer len: %d, buf len: %d\n", len(r.mu.inflightBuffer), len(r.mu.buf)) return slices.Clone(r.mu.inflightBuffer), nil - - // and then trigger the goro to check if m wants us to change assignments - // if it does, handle that stuff before doing a new batch } func (r *Reader) ConfirmReceipt(ctx context.Context) { @@ -297,14 +302,23 @@ func (r *Reader) ConfirmReceipt(ctx context.Context) { return default: // TODO only set caughtUp to true if our frontier is near the current time. 
- newAssignment, err := r.assigner.RefreshAssignment(r.session /*caughtUp=*/, true) + newAssignment, err := r.assigner.RefreshAssignment(ctx, r.session /*caughtUp=*/, true) if err != nil { r.cancel(errors.Wrap(err, "refreshing assignment")) + return } if newAssignment != nil { - // TODO restart the rangefeed with the new partitions + if err := r.updateAssignment(newAssignment); err != nil { + r.cancel(errors.Wrap(err, "updating assignment")) + return + } } } + func() { + r.mu.Lock() + defer r.mu.Unlock() + r.mu.state = readerStateBatching + }() } func (r *Reader) RollbackBatch(ctx context.Context) { @@ -336,6 +350,24 @@ func (r *Reader) Close() error { return err } +func (r *Reader) updateAssignment(assignment *Assignment) error { + defer func() { + fmt.Printf("updateAssignment done with assignment: %+v\n", assignment) + }() + + r.mu.Lock() + defer r.mu.Unlock() + + r.assignment = assignment + r.rangefeed.Close() + r.mu.buf = r.mu.buf[:0] + + if err := r.setupRangefeed(r.goroCtx, assignment); err != nil { + return errors.Wrapf(err, "setting up rangefeed for new assignment: %+v", assignment) + } + return nil +} + func (r *Reader) checkForReassignment(ctx context.Context) error { defer func() { fmt.Println("checkForReassignment done") From d4da0611d1b45a9ade72f0d0d9c11aad26011abf Mon Sep 17 00:00:00 2001 From: Miles Frankel Date: Thu, 13 Nov 2025 17:15:40 +0000 Subject: [PATCH 30/46] Create a partition per range --- pkg/sql/queuefeed/BUILD.bazel | 5 ++ pkg/sql/queuefeed/manager.go | 62 ++++++++++++-- pkg/sql/queuefeed/manager_test.go | 103 +++++++++++++++++++++++ pkg/sql/queuefeed/queuebase/queuebase.go | 1 + 4 files changed, 166 insertions(+), 5 deletions(-) diff --git a/pkg/sql/queuefeed/BUILD.bazel b/pkg/sql/queuefeed/BUILD.bazel index e80085d77e43..6a43bd7edd97 100644 --- a/pkg/sql/queuefeed/BUILD.bazel +++ b/pkg/sql/queuefeed/BUILD.bazel @@ -14,6 +14,8 @@ go_library( deps = [ "//pkg/ccl/changefeedccl/changefeedbase", "//pkg/keys", + "//pkg/kv", + "//pkg/kv/kvclient/kvcoord", "//pkg/kv/kvclient/rangefeed", "//pkg/kv/kvpb", "//pkg/roachpb", @@ -56,6 +58,8 @@ go_test( "//pkg/security/securitytest", "//pkg/server", "//pkg/sql", + "//pkg/sql/catalog", + "//pkg/sql/catalog/descpb", "//pkg/sql/catalog/lease", "//pkg/sql/isql", "//pkg/sql/sqlliveness", @@ -67,6 +71,7 @@ go_test( "//pkg/util/randutil", "//pkg/util/uuid", "@com_github_cockroachdb_errors//:errors", + "@com_github_stretchr_testify//assert", "@com_github_stretchr_testify//require", "@org_golang_x_sync//errgroup", ], diff --git a/pkg/sql/queuefeed/manager.go b/pkg/sql/queuefeed/manager.go index e5bd5a5c7195..8cb64df7578a 100644 --- a/pkg/sql/queuefeed/manager.go +++ b/pkg/sql/queuefeed/manager.go @@ -6,6 +6,8 @@ import ( "fmt" "github.com/cockroachdb/cockroach/pkg/keys" + "github.com/cockroachdb/cockroach/pkg/kv" + "github.com/cockroachdb/cockroach/pkg/kv/kvclient/kvcoord" "github.com/cockroachdb/cockroach/pkg/kv/kvclient/rangefeed" "github.com/cockroachdb/cockroach/pkg/roachpb" "github.com/cockroachdb/cockroach/pkg/sql/catalog" @@ -115,18 +117,68 @@ func (m *Manager) CreateQueue(ctx context.Context, queueName string, tableDescID pt := &partitionTable{queueName: queueName} - // Create a single initial partition that covers the table's primary key. + // Make a partition for each range of the table's primary span, covering the span of that range. 
primaryIndexPrefix := m.codec.IndexPrefix(uint32(tableDesc.GetID()), uint32(tableDesc.GetPrimaryIndexID())) primaryKeySpan := roachpb.Span{ Key: primaryIndexPrefix, EndKey: primaryIndexPrefix.PrefixEnd(), } - partition := Partition{ - ID: 1, - Span: primaryKeySpan, + + // Extract DistSender from the executor's KV DB to iterate over ranges. + txnWrapperSender, ok := m.executor.KV().NonTransactionalSender().(*kv.CrossRangeTxnWrapperSender) + if !ok { + return errors.Errorf("failed to extract a %T from %T", + (*kv.CrossRangeTxnWrapperSender)(nil), m.executor.KV().NonTransactionalSender()) + } + distSender, ok := txnWrapperSender.Wrapped().(*kvcoord.DistSender) + if !ok { + return errors.Errorf("failed to extract a %T from %T", + (*kvcoord.DistSender)(nil), txnWrapperSender.Wrapped()) + } + + // Convert the span to an RSpan for range iteration. + rSpan, err := keys.SpanAddr(primaryKeySpan) + if err != nil { + return errors.Wrapf(err, "converting primary key span to address span") + } + + // Iterate over all ranges covering the primary key span. + it := kvcoord.MakeRangeIterator(distSender) + partitionID := int64(1) + for it.Seek(ctx, rSpan.Key, kvcoord.Ascending); ; it.Next(ctx) { + if !it.Valid() { + return errors.Wrapf(it.Error(), "iterating ranges for primary key span") + } + + // Get the range descriptor and trim its span to the primary key span boundaries. + desc := it.Desc() + startKey := desc.StartKey + if startKey.Compare(rSpan.Key) < 0 { + startKey = rSpan.Key + } + endKey := desc.EndKey + if endKey.Compare(rSpan.EndKey) > 0 { + endKey = rSpan.EndKey + } + + partition := Partition{ + ID: partitionID, + Span: roachpb.Span{Key: startKey.AsRawKey(), EndKey: endKey.AsRawKey()}, + } + + if err := pt.InsertPartition(ctx, txn, partition); err != nil { + return errors.Wrapf(err, "inserting partition %d for range", partitionID) + } + + partitionID++ + + // Check if we need to continue to the next range. 
+ if !it.NeedAnother(rSpan) { + break + } } - return pt.InsertPartition(ctx, txn, partition) + return nil }) } diff --git a/pkg/sql/queuefeed/manager_test.go b/pkg/sql/queuefeed/manager_test.go index d4640d20504d..3a32d801ad61 100644 --- a/pkg/sql/queuefeed/manager_test.go +++ b/pkg/sql/queuefeed/manager_test.go @@ -7,6 +7,9 @@ import ( "github.com/cockroachdb/cockroach/pkg/base" "github.com/cockroachdb/cockroach/pkg/kv/kvclient/rangefeed" + "github.com/cockroachdb/cockroach/pkg/roachpb" + "github.com/cockroachdb/cockroach/pkg/sql/catalog" + "github.com/cockroachdb/cockroach/pkg/sql/catalog/descpb" "github.com/cockroachdb/cockroach/pkg/sql/catalog/lease" "github.com/cockroachdb/cockroach/pkg/sql/isql" "github.com/cockroachdb/cockroach/pkg/testutils/serverutils" @@ -14,6 +17,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/util/leaktest" "github.com/cockroachdb/cockroach/pkg/util/log" "github.com/cockroachdb/errors" + "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" ) @@ -73,3 +77,102 @@ func TestQueuefeedCtxCancel(t *testing.T) { _, err := db.DB.QueryContext(ctx, `SELECT crdb_internal.select_from_queue_feed('hi', 1)`) require.Error(t, err) } + +func TestFeedCreationPartitions(t *testing.T) { + defer leaktest.AfterTest(t)() + defer log.Scope(t).Close(t) + + ctx := context.Background() + srv, conn, _ := serverutils.StartServer(t, base.TestServerArgs{}) + defer srv.Stopper().Stop(ctx) + + // expect no error when creating a queue + db := sqlutils.MakeSQLRunner(conn) + db.Exec(t, `CREATE TABLE t (a string)`) + // split into 1k ranges + db.Exec(t, `INSERT INTO t (a) SELECT generate_series(1, 10000)`) + db.Exec(t, `ALTER TABLE t SPLIT AT (SELECT (i/10)::int FROM generate_series(1, 10000) AS g(i))`) + db.Exec(t, `ALTER TABLE t SCATTER`) + + // get table id + var tableID int64 + db.QueryRow(t, "SELECT id FROM system.namespace where name = 't'").Scan(&tableID) + qm := NewTestManager(t, srv.ApplicationLayer()) + require.NoError(t, qm.CreateQueue(ctx, "test", tableID)) + + // Get the table descriptor to determine the primary index span. + leaseMgr := srv.ApplicationLayer().LeaseManager().(*lease.Manager) + descriptor, err := leaseMgr.Acquire(ctx, lease.TimestampToReadTimestamp(srv.ApplicationLayer().Clock().Now()), descpb.ID(tableID)) + require.NoError(t, err) + defer descriptor.Release(ctx) + tableDesc := descriptor.Underlying().(catalog.TableDescriptor) + primaryIndexSpan := tableDesc.PrimaryIndexSpan(qm.codec) + + // Count the number of partitions. + pt := &partitionTable{queueName: "test"} + err = srv.ApplicationLayer().InternalDB().(isql.DB).Txn(ctx, func(ctx context.Context, txn isql.Txn) error { + partitions, err := pt.ListPartitions(ctx, txn) + require.NoError(t, err) + require.GreaterOrEqual(t, len(partitions), 1000, "expected at least 1000 partitions") // It could be a bit more than 1k. + + partitionIDs := make(map[int64]bool) + var partitionSpans []roachpb.Span + for _, partition := range partitions { + // There should be no duplicate partition IDs. + assert.NotZero(t, partition.ID) + _, ok := partitionIDs[partition.ID] + assert.False(t, ok, "duplicate partition ID: %d", partition.ID) + partitionIDs[partition.ID] = true + + // The spans should be primary index only and not overlap and cover the entire primary index span. 
+ partitionSpan := partition.Span + assert.True(t, partitionSpan.Valid()) + assert.True(t, primaryIndexSpan.Contains(partitionSpan)) + partitionSpans = append(partitionSpans, partitionSpan) + + assert.True(t, partition.Session.Empty(), "partition %d should not be assigned to a session", partition.ID) + assert.True(t, partition.Successor.Empty(), "partition %d should not have a successor", partition.ID) + } + + // Verify spans don't overlap by checking each pair. + for i, span1 := range partitionSpans { + for j, span2 := range partitionSpans { + if i < j { + // Spans should not overlap (they can be adjacent). + assert.False(t, span1.Overlaps(span2), + "partition spans should not overlap: span1=%v, span2=%v", span1, span2) + } + } + } + + // Verify spans cover the entire primary index span + var spanGroup roachpb.SpanGroup + spanGroup.Add(partitionSpans...) + mergedSpans := spanGroup.Slice() // should be a single span covering the entire primary index span + assert.Equal(t, 1, len(mergedSpans)) + assert.True(t, mergedSpans[0].Equal(primaryIndexSpan)) + + return nil + }) + require.NoError(t, err) + + // Start a reader and verify it reads all the partitions. + reader, err := qm.GetOrInitReader(ctx, "test") + require.NoError(t, err) + require.NotNil(t, reader) + defer func() { _ = reader.Close() }() + + err = srv.ApplicationLayer().InternalDB().(isql.DB).Txn(ctx, func(ctx context.Context, txn isql.Txn) error { + partitions, err := pt.ListPartitions(ctx, txn) + require.NoError(t, err) + + session := reader.(*Reader).session + for _, partition := range partitions { + assert.Equal(t, session, partition.Session) + assert.True(t, partition.Successor.Empty(), "partition %d should not have a successor", partition.ID) + } + + return nil + }) + require.NoError(t, err) +} diff --git a/pkg/sql/queuefeed/queuebase/queuebase.go b/pkg/sql/queuefeed/queuebase/queuebase.go index 827f7bbd3e61..e0ab808f53b9 100644 --- a/pkg/sql/queuefeed/queuebase/queuebase.go +++ b/pkg/sql/queuefeed/queuebase/queuebase.go @@ -15,4 +15,5 @@ type Reader interface { GetRows(ctx context.Context, limit int) ([]tree.Datums, error) ConfirmReceipt(ctx context.Context) RollbackBatch(ctx context.Context) + Close() error } From 9188113a3dff96e26b1a6c47342bf949cf8192e1 Mon Sep 17 00:00:00 2001 From: Aerin Freilich Date: Wed, 12 Nov 2025 15:16:34 -0500 Subject: [PATCH 31/46] queuefeed: Add checkpointing logic Epic: none --- pkg/sql/queuefeed/BUILD.bazel | 3 + pkg/sql/queuefeed/manager.go | 52 +++++++++ pkg/sql/queuefeed/reader.go | 130 ++++++++++++++++------ pkg/sql/queuefeed/reader_test.go | 178 +++++++++++++++++++++++++++++++ 4 files changed, 332 insertions(+), 31 deletions(-) create mode 100644 pkg/sql/queuefeed/reader_test.go diff --git a/pkg/sql/queuefeed/BUILD.bazel b/pkg/sql/queuefeed/BUILD.bazel index 6a43bd7edd97..5a307da93949 100644 --- a/pkg/sql/queuefeed/BUILD.bazel +++ b/pkg/sql/queuefeed/BUILD.bazel @@ -47,6 +47,7 @@ go_test( "manager_test.go", "partition_cache_test.go", "partitions_test.go", + "reader_test.go", "smoke_test.go", ], embed = [":queuefeed"], @@ -62,6 +63,8 @@ go_test( "//pkg/sql/catalog/descpb", "//pkg/sql/catalog/lease", "//pkg/sql/isql", + "//pkg/sql/queuefeed/queuebase", + "//pkg/sql/sem/tree", "//pkg/sql/sqlliveness", "//pkg/testutils/serverutils", "//pkg/testutils/sqlutils", diff --git a/pkg/sql/queuefeed/manager.go b/pkg/sql/queuefeed/manager.go index 8cb64df7578a..9b47e962e390 100644 --- a/pkg/sql/queuefeed/manager.go +++ b/pkg/sql/queuefeed/manager.go @@ -18,6 +18,7 @@ import ( 
"github.com/cockroachdb/cockroach/pkg/sql/sem/tree" "github.com/cockroachdb/cockroach/pkg/sql/sessiondata" "github.com/cockroachdb/cockroach/pkg/sql/sqlliveness" + "github.com/cockroachdb/cockroach/pkg/util/hlc" "github.com/cockroachdb/cockroach/pkg/util/syncutil" "github.com/cockroachdb/cockroach/pkg/util/uuid" "github.com/cockroachdb/errors" @@ -77,6 +78,15 @@ const fetchQueueFeedSQL = ` SELECT table_desc_id FROM defaultdb.queue_feeds WHERE queue_feed_name = $1 ` +const updateCheckpointSQL = ` +UPSERT INTO defaultdb.queue_cursor_%s (partition_id, updated_at, cursor) +VALUES ($1, now(), $2) +` + +const readCheckpointSQL = ` +SELECT cursor FROM defaultdb.queue_cursor_%s WHERE partition_id = $1 +` + // should take a txn func (m *Manager) CreateQueue(ctx context.Context, queueName string, tableDescID int64) error { err := m.executor.Txn(ctx, func(ctx context.Context, txn isql.Txn) error { @@ -251,4 +261,46 @@ func (m *Manager) forgetReader(name string) { }() } +func (m *Manager) WriteCheckpoint( + ctx context.Context, queueName string, partitionID int64, ts hlc.Timestamp, +) error { + // Serialize the timestamp as bytes + cursorBytes, err := ts.Marshal() + if err != nil { + return errors.Wrap(err, "marshaling checkpoint timestamp") + } + + sql := fmt.Sprintf(updateCheckpointSQL, queueName) + return m.executor.Txn(ctx, func(ctx context.Context, txn isql.Txn) error { + _, err := txn.Exec(ctx, "write_checkpoint", txn.KV(), sql, partitionID, cursorBytes) + return err + }) +} + +func (m *Manager) ReadCheckpoint( + ctx context.Context, queueName string, partitionID int64, +) (hlc.Timestamp, error) { + var ts hlc.Timestamp + sql := fmt.Sprintf(readCheckpointSQL, queueName) + + err := m.executor.Txn(ctx, func(ctx context.Context, txn isql.Txn) error { + row, err := txn.QueryRowEx(ctx, "read_checkpoint", txn.KV(), + sessiondata.NodeUserSessionDataOverride, sql, partitionID) + if err != nil { + return err + } + if row == nil { + return nil + } + + cursorBytes := []byte(*row[0].(*tree.DBytes)) + if err := ts.Unmarshal(cursorBytes); err != nil { + return errors.Wrap(err, "unmarshaling checkpoint timestamp") + } + return nil + }) + + return ts, err +} + var _ queuebase.Manager = &Manager{} diff --git a/pkg/sql/queuefeed/reader.go b/pkg/sql/queuefeed/reader.go index 802dfebef1a9..e1da24279908 100644 --- a/pkg/sql/queuefeed/reader.go +++ b/pkg/sql/queuefeed/reader.go @@ -39,6 +39,15 @@ const ( readerStateDead ) +// bufferedEvent represents either a data row or a checkpoint timestamp +// in the reader's buffer. Exactly one of row or resolved will be set. +type bufferedEvent struct { + // row is set for data events. nil for checkpoint events. + row tree.Datums + // resolved is set for checkpoint events. Empty for data events. + resolved hlc.Timestamp +} + // has rangefeed on data. reads from it. 
handles handoff // state machine around handing out batches and handing stuff off type Reader struct { @@ -55,8 +64,8 @@ type Reader struct { mu struct { syncutil.Mutex state readerState - buf []tree.Datums - inflightBuffer []tree.Datums + buf []bufferedEvent + inflightBuffer []bufferedEvent poppedWakeup *sync.Cond pushedWakeup *sync.Cond } @@ -95,7 +104,7 @@ func NewReader( session: session, } r.mu.state = readerStateBatching - r.mu.buf = make([]tree.Datums, 0, maxBufSize) + r.mu.buf = make([]bufferedEvent, 0, maxBufSize) r.mu.poppedWakeup = sync.NewCond(&r.mu.Mutex) r.mu.pushedWakeup = sync.NewCond(&r.mu.Mutex) @@ -132,8 +141,6 @@ func (r *Reader) setupRangefeed(ctx context.Context, assignment *Assignment) err return errors.Wrap(ErrNoPartitionsAssigned, "setting up rangefeed") } - incomingResolveds := make(chan hlc.Timestamp) - onValue := func(ctx context.Context, value *kvpb.RangeFeedValue) { fmt.Printf("onValue: %+v\n", value) r.mu.Lock() @@ -153,7 +160,7 @@ func (r *Reader) setupRangefeed(ctx context.Context, assignment *Assignment) err r.cancel(errors.Wrapf(err, "decoding rangefeed value: %+v", value)) return } - r.mu.buf = append(r.mu.buf, datums) + r.mu.buf = append(r.mu.buf, bufferedEvent{row: datums}) r.mu.pushedWakeup.Broadcast() fmt.Printf("onValue done with buf len: %d\n", len(r.mu.buf)) } @@ -166,11 +173,20 @@ func (r *Reader) setupRangefeed(ctx context.Context, assignment *Assignment) err if checkpoint.ResolvedTS.IsEmpty() { return } - select { - case incomingResolveds <- checkpoint.ResolvedTS: - case <-ctx.Done(): - default: // TODO: handle resolveds (dont actually default here) + + r.mu.Lock() + defer r.mu.Unlock() + + // Wait for rows to be read before adding more, if necessary. + for ctx.Err() == nil && len(r.mu.buf) > maxBufSize { + r.mu.poppedWakeup.Wait() + } + + if ctx.Err() != nil { + return } + + r.mu.buf = append(r.mu.buf, bufferedEvent{resolved: checkpoint.ResolvedTS}) }), rangefeed.WithOnInternalError(func(ctx context.Context, err error) { r.cancel(err) }), rangefeed.WithConsumerID(42), @@ -178,8 +194,18 @@ func (r *Reader) setupRangefeed(ctx context.Context, assignment *Assignment) err rangefeed.WithFiltering(false), } - // TODO: resume from cursor - initialTS := hlc.Timestamp{WallTime: timeutil.Now().UnixNano()} + // Resume from checkpoint if available + // TODO: Support multiple partitions + partitionID := int64(1) + initialTS, err := r.mgr.ReadCheckpoint(ctx, r.name, partitionID) + if err != nil { + return errors.Wrap(err, "reading checkpoint") + } + if initialTS.IsEmpty() { + // No checkpoint found, start from now + initialTS = hlc.Timestamp{WallTime: timeutil.Now().UnixNano()} + } + rf := r.rff.New( fmt.Sprintf("queuefeed.reader.name=%s", r.name), initialTS, onValue, opts..., ) @@ -245,56 +271,100 @@ func (r *Reader) GetRows(ctx context.Context, limit int) ([]tree.Datums, error) return nil, errors.AssertionFailedf("getrows called with nonempty inflight buffer") } - if len(r.mu.buf) == 0 { - fmt.Printf("GetRows called with empty buf. 
waiting for pushedWakeup\n") + // Helper to count data events (not checkpoints) in buffer + hasDataEvents := func() bool { + for _, event := range r.mu.buf { + if event.resolved.IsEmpty() { + return true + } + } + return false + } + + // Wait until we have at least one data event (not just checkpoints) + if !hasDataEvents() { // shut down the reader if this ctx (which is distinct from the goro ctx) is canceled defer context.AfterFunc(ctx, func() { r.cancel(errors.Wrapf(ctx.Err(), "GetRows canceled")) })() - for ctx.Err() == nil && r.goroCtx.Err() == nil && len(r.mu.buf) == 0 { + for ctx.Err() == nil && r.goroCtx.Err() == nil && !hasDataEvents() { r.mu.pushedWakeup.Wait() } if ctx.Err() != nil { return nil, errors.Wrapf(ctx.Err(), "GetRows canceled") } - if r.goroCtx.Err() != nil { - return nil, errors.Wrapf(r.goroCtx.Err(), "reader shutting down") - } } - if limit > len(r.mu.buf) { - limit = len(r.mu.buf) + // Find the position of the (limit+1)th data event (not checkpoint) + // We'll take everything up to that point, which gives us up to `limit` data rows + // plus any checkpoints that came before/between them. + bufferEndIdx := len(r.mu.buf) + + // Optimization: if the entire buffer is smaller than limit, take it all + if len(r.mu.buf) > limit { + dataCount := 0 + for i, event := range r.mu.buf { + if event.resolved.IsEmpty() { + dataCount++ + if dataCount > limit { + bufferEndIdx = i + break + } + } + } } - fmt.Printf("GetRows called with limit: %d, buf len: %d\n", limit, len(r.mu.buf)) - - r.mu.inflightBuffer = append(r.mu.inflightBuffer, r.mu.buf[0:limit]...) - r.mu.buf = r.mu.buf[limit:] + r.mu.inflightBuffer = append(r.mu.inflightBuffer, r.mu.buf[0:bufferEndIdx]...) + r.mu.buf = r.mu.buf[bufferEndIdx:] r.mu.state = readerStateHasUncommittedBatch - r.mu.poppedWakeup.Broadcast() - fmt.Printf("GetRows done with inflightBuffer len: %d, buf len: %d\n", len(r.mu.inflightBuffer), len(r.mu.buf)) + // Here we filter to return only data events to the user. + result := make([]tree.Datums, 0, limit) + for _, event := range r.mu.inflightBuffer { + if event.resolved.IsEmpty() { + result = append(result, event.row) + } + } - return slices.Clone(r.mu.inflightBuffer), nil + return result, nil } +// ConfirmReceipt is called when we commit a transaction that reads from the queue. +// We will checkpoint if we have checkpoint events in our inflightBuffer. func (r *Reader) ConfirmReceipt(ctx context.Context) { if r.isShutdown.Load() { return } + var checkpointToWrite hlc.Timestamp func() { r.mu.Lock() defer r.mu.Unlock() - fmt.Printf("confirming receipt with inflightBuffer len: %d\n", len(r.mu.inflightBuffer)) + // Find the last checkpoint in inflightBuffer + for _, event := range r.mu.inflightBuffer { + if !event.resolved.IsEmpty() { + checkpointToWrite = event.resolved + } + } r.mu.inflightBuffer = r.mu.inflightBuffer[:0] r.mu.state = readerStateCheckingForReassignment }() + // Persist the checkpoint if we have one. + if !checkpointToWrite.IsEmpty() { + // TODO: Support multiple partitions - for now we only have partition 1. + partitionID := int64(1) + if err := r.mgr.WriteCheckpoint(ctx, r.name, partitionID, checkpointToWrite); err != nil { + fmt.Printf("error writing checkpoint: %s\n", err) + // TODO: decide how to handle checkpoint write errors. Since the txn + // has already committed, I don't think we can really fail at this point. 
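+			// The cursor simply stays behind, so the worst case is re-delivering
+			// rows after the next reader resumes from the older checkpoint.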
+ } + } + select { case <-ctx.Done(): return @@ -329,9 +399,7 @@ func (r *Reader) RollbackBatch(ctx context.Context) { r.mu.Lock() defer r.mu.Unlock() - fmt.Printf("rolling back batch with inflightBuffer len: %d\n", len(r.mu.inflightBuffer)) - - newBuf := make([]tree.Datums, 0, len(r.mu.inflightBuffer)+len(r.mu.buf)) + newBuf := make([]bufferedEvent, 0, len(r.mu.inflightBuffer)+len(r.mu.buf)) newBuf = append(newBuf, r.mu.inflightBuffer...) newBuf = append(newBuf, r.mu.buf...) r.mu.buf = newBuf diff --git a/pkg/sql/queuefeed/reader_test.go b/pkg/sql/queuefeed/reader_test.go new file mode 100644 index 000000000000..5743176a24da --- /dev/null +++ b/pkg/sql/queuefeed/reader_test.go @@ -0,0 +1,178 @@ +package queuefeed + +import ( + "context" + "testing" + "time" + + "github.com/cockroachdb/cockroach/pkg/base" + "github.com/cockroachdb/cockroach/pkg/sql/queuefeed/queuebase" + "github.com/cockroachdb/cockroach/pkg/sql/sem/tree" + "github.com/cockroachdb/cockroach/pkg/testutils/serverutils" + "github.com/cockroachdb/cockroach/pkg/testutils/sqlutils" + "github.com/cockroachdb/cockroach/pkg/util/leaktest" + "github.com/cockroachdb/cockroach/pkg/util/log" + "github.com/stretchr/testify/require" +) + +func TestReaderBasic(t *testing.T) { + defer leaktest.AfterTest(t)() + defer log.Scope(t).Close(t) + + ctx := context.Background() + srv, conn, _ := serverutils.StartServer(t, base.TestServerArgs{}) + defer srv.Stopper().Stop(ctx) + + db := sqlutils.MakeSQLRunner(conn) + db.Exec(t, `CREATE TABLE t (a STRING, b INT)`) + + var tableID int64 + db.QueryRow(t, "SELECT id FROM system.namespace WHERE name = 't'").Scan(&tableID) + + qm := NewTestManager(t, srv.ApplicationLayer()) + require.NoError(t, qm.CreateQueue(ctx, "test_queue", tableID)) + + reader, err := qm.GetOrInitReader(ctx, "test_queue") + require.NoError(t, err) + defer reader.(*Reader).Close() + + db.Exec(t, `INSERT INTO t VALUES ('row1', 10), ('row2', 20), ('row3', 30)`) + + rows := pollForRows(t, ctx, reader, 3) + + requireRow(t, rows[0], "row1", 10) + requireRow(t, rows[1], "row2", 20) + requireRow(t, rows[2], "row3", 30) + reader.ConfirmReceipt(ctx) +} + +func TestReaderRollback(t *testing.T) { + defer leaktest.AfterTest(t)() + defer log.Scope(t).Close(t) + + ctx := context.Background() + srv, conn, _ := serverutils.StartServer(t, base.TestServerArgs{}) + defer srv.Stopper().Stop(ctx) + + db := sqlutils.MakeSQLRunner(conn) + db.Exec(t, `CREATE TABLE t (a STRING, b INT)`) + + var tableID int64 + db.QueryRow(t, "SELECT id FROM system.namespace WHERE name = 't'").Scan(&tableID) + + qm := NewTestManager(t, srv.ApplicationLayer()) + require.NoError(t, qm.CreateQueue(ctx, "rollback_test", tableID)) + + reader, err := qm.GetOrInitReader(ctx, "rollback_test") + require.NoError(t, err) + defer reader.(*Reader).Close() + + db.Exec(t, `INSERT INTO t VALUES ('row1', 100), ('row2', 200)`) + + rows1 := pollForRows(t, ctx, reader, 2) + + requireRow(t, rows1[0], "row1", 100) + requireRow(t, rows1[1], "row2", 200) + + reader.RollbackBatch(ctx) + + rows2, err := reader.GetRows(ctx, 10) + require.NoError(t, err) + require.Len(t, rows2, 2, "should get same 2 rows after rollback") + + requireRow(t, rows2[0], "row1", 100) + requireRow(t, rows2[1], "row2", 200) + + reader.ConfirmReceipt(ctx) + + db.Exec(t, `INSERT INTO t VALUES ('row3', 300), ('row4', 400)`) + + // Verify we got the NEW data (row3, row4), NOT the old data (row1, row2). 
+ rows3 := pollForRows(t, ctx, reader, 2) + + requireRow(t, rows3[0], "row3", 300) + requireRow(t, rows3[1], "row4", 400) + + reader.ConfirmReceipt(ctx) +} + +func TestCheckpointRestoration(t *testing.T) { + defer leaktest.AfterTest(t)() + defer log.Scope(t).Close(t) + + ctx := context.Background() + srv, conn, _ := serverutils.StartServer(t, base.TestServerArgs{}) + defer srv.Stopper().Stop(ctx) + + db := sqlutils.MakeSQLRunner(conn) + db.Exec(t, `CREATE TABLE t (a STRING, b INT)`) + + var tableID int64 + db.QueryRow(t, "SELECT id FROM system.namespace WHERE name = 't'").Scan(&tableID) + + qm := NewTestManager(t, srv.ApplicationLayer()) + require.NoError(t, qm.CreateQueue(ctx, "checkpoint_test", tableID)) + + reader1, err := qm.GetOrInitReader(ctx, "checkpoint_test") + require.NoError(t, err) + + db.Exec(t, `INSERT INTO t VALUES ('batch1_row1', 1), ('batch1_row2', 2)`) + + // Sleep to let the rangefeed checkpoint advance past the data timestamps. + // This is really ugly but with 3 seconds the test failed in 100% of runs. + time.Sleep(10 * time.Second) + + _ = pollForRows(t, ctx, reader1, 2) + + reader1.ConfirmReceipt(ctx) + require.NoError(t, reader1.(*Reader).Close()) + + db.Exec(t, `INSERT INTO t VALUES ('batch2_row1', 3), ('batch2_row2', 4)`) + + reader2, err := qm.GetOrInitReader(ctx, "checkpoint_test") + require.NoError(t, err) + defer reader2.(*Reader).Close() + + rows2 := pollForRows(t, ctx, reader2, 2) + + // Verify we got ONLY the new data, not the old data. + // Check that none of the rows are from batch1. + for _, row := range rows2 { + val := getString(row[0]) + require.NotContains(t, val, "batch1", "should not see batch1 data after checkpoint") + require.Contains(t, val, "batch2", "should see batch2 data") + } +} + +// pollForRows waits for the reader to return expectedCount rows. +func pollForRows( + t *testing.T, ctx context.Context, reader queuebase.Reader, expectedCount int, +) []tree.Datums { + var rows []tree.Datums + require.Eventually(t, func() bool { + var err error + rows, err = reader.GetRows(ctx, 10) + require.NoError(t, err) + if len(rows) < expectedCount { + reader.RollbackBatch(ctx) + } + return len(rows) == expectedCount + }, 5*time.Second, 50*time.Millisecond, "expected %d rows", expectedCount) + return rows +} + +// getString extracts a string from a tree.Datum. +func getString(d tree.Datum) string { + return string(*d.(*tree.DString)) +} + +// getInt extracts an int64 from a tree.Datum. +func getInt(d tree.Datum) int64 { + return int64(*d.(*tree.DInt)) +} + +// requireRow asserts that a row matches the expected string and int values. 
+func requireRow(t *testing.T, row tree.Datums, expectedStr string, expectedInt int64) { + require.Equal(t, expectedStr, getString(row[0])) + require.Equal(t, expectedInt, getInt(row[1])) +} From d5ab9c344b52d8dca2cd25db9177e674c22d7af7 Mon Sep 17 00:00:00 2001 From: Miles Frankel Date: Thu, 13 Nov 2025 19:04:56 +0000 Subject: [PATCH 32/46] move reader management to conn executor --- pkg/sql/conn_executor.go | 69 ++++++++++++++++++++++ pkg/sql/faketreeeval/BUILD.bazel | 1 + pkg/sql/faketreeeval/evalctx.go | 6 ++ pkg/sql/planner.go | 9 +++ pkg/sql/queuefeed/assignments_test.go | 6 +- pkg/sql/queuefeed/manager.go | 42 +++---------- pkg/sql/queuefeed/manager_test.go | 21 +++++-- pkg/sql/queuefeed/queuebase/queuebase.go | 7 ++- pkg/sql/queuefeed/reader.go | 1 - pkg/sql/sem/builtins/builtins.go | 6 +- pkg/sql/sem/builtins/generator_builtins.go | 2 +- pkg/sql/sem/eval/BUILD.bazel | 1 + pkg/sql/sem/eval/deps.go | 5 ++ 13 files changed, 131 insertions(+), 45 deletions(-) diff --git a/pkg/sql/conn_executor.go b/pkg/sql/conn_executor.go index 0a6a5b399f97..cedc31bcaea6 100644 --- a/pkg/sql/conn_executor.go +++ b/pkg/sql/conn_executor.go @@ -54,6 +54,8 @@ import ( "github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgnotice" "github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgwirecancel" "github.com/cockroachdb/cockroach/pkg/sql/prep" + "github.com/cockroachdb/cockroach/pkg/sql/queuefeed" + "github.com/cockroachdb/cockroach/pkg/sql/queuefeed/queuebase" "github.com/cockroachdb/cockroach/pkg/sql/regions" "github.com/cockroachdb/cockroach/pkg/sql/schemachanger/scerrors" "github.com/cockroachdb/cockroach/pkg/sql/sem/asof" @@ -65,6 +67,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/sql/sessionmutator" "github.com/cockroachdb/cockroach/pkg/sql/sessionphase" "github.com/cockroachdb/cockroach/pkg/sql/sqlerrors" + "github.com/cockroachdb/cockroach/pkg/sql/sqlliveness" "github.com/cockroachdb/cockroach/pkg/sql/sqlstats" "github.com/cockroachdb/cockroach/pkg/sql/sqlstats/insights" "github.com/cockroachdb/cockroach/pkg/sql/sqlstats/persistedsqlstats" @@ -91,6 +94,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/util/syncutil" "github.com/cockroachdb/cockroach/pkg/util/timeutil" "github.com/cockroachdb/cockroach/pkg/util/tochar" + "github.com/cockroachdb/cockroach/pkg/util/uuid" "github.com/cockroachdb/crlib/crtime" "github.com/cockroachdb/errors" "github.com/cockroachdb/logtags" @@ -1206,6 +1210,7 @@ func (s *Server) newConnExecutor( totalActiveTimeStopWatch: timeutil.NewStopWatch(), txnFingerprintIDCache: NewTxnFingerprintIDCache(ctx, s.cfg.Settings, &txnFingerprintIDCacheAcc), txnFingerprintIDAcc: &txnFingerprintIDCacheAcc, + queuefeedReaders: make(map[string]*queuefeed.Reader), } ex.rng.internal = rand.New(rand.NewSource(timeutil.Now().UnixNano())) @@ -1409,6 +1414,14 @@ func (ex *connExecutor) close(ctx context.Context, closeType closeType) { ex.state.finishExternalTxn() } + // Close all queuefeed readers + for name, reader := range ex.queuefeedReaders { + if err := reader.Close(); err != nil { + log.Dev.Warningf(ctx, "error closing queuefeed reader %s: %v", name, err) + } + } + ex.queuefeedReaders = nil + ex.resetExtraTxnState(ctx, txnEvent{eventType: txnEvType}, payloadErr) if ex.hasCreatedTemporarySchema && !ex.server.cfg.TestingKnobs.DisableTempObjectsCleanupOnSessionExit { err := cleanupSessionTempObjects( @@ -1902,6 +1915,10 @@ type connExecutor struct { // PCR reader catalog, which is done by checking for the ReplicatedPCRVersion // field on the system database (which is set during tenant 
bootstrap). isPCRReaderCatalog bool + + // queuefeedReaders stores queuefeed readers created for this connection. + // Readers are closed when the connection closes. + queuefeedReaders map[string]*queuefeed.Reader } // ctxHolder contains a connection's context and, while session tracing is @@ -3858,6 +3875,7 @@ func (ex *connExecutor) initEvalCtx(ctx context.Context, evalCtx *extendedEvalCo localSQLStats: ex.server.localSqlStats, indexUsageStats: ex.indexUsageStats, statementPreparer: ex, + QueueReaderProvider: ex, } evalCtx.copyFromExecCfg(ex.server.cfg) } @@ -4578,6 +4596,57 @@ func (ex *connExecutor) getCreatedSequencesAccessor() createdSequences { } } +// GetOrInitReader gets or creates a queuefeed reader for the given queue name. +// Readers are stored per-connection and closed when the connection closes. +func (ex *connExecutor) GetOrInitReader(ctx context.Context, name string) (queuebase.Reader, error) { + // Check if reader already exists and is alive + if reader, ok := ex.queuefeedReaders[name]; ok && reader.IsAlive() { + return reader, nil + } + + // Need to create a new reader + if ex.server.cfg.QueueManager == nil { + return nil, errors.New("queue manager not configured") + } + mgr := ex.server.cfg.QueueManager + + // Construct Session from connExecutor data + sessionID := ex.planner.extendedEvalCtx.SessionID + connectionIDBytes := sessionID.GetBytes() + connectionID, err := uuid.FromBytes(connectionIDBytes) + if err != nil { + return nil, errors.Wrapf(err, "converting session ID to UUID") + } + + // Get sqlliveness session ID + var livenessID sqlliveness.SessionID + if ex.server.cfg.SQLLiveness != nil { + session, err := ex.server.cfg.SQLLiveness.Session(ex.Ctx()) + if err != nil { + // If we can't get sqlliveness session, we'll use empty string + // This might happen in some environments + livenessID = "" + } else if session != nil { + livenessID = session.ID() + } + } + + session := queuefeed.Session{ + ConnectionID: connectionID, + LivenessID: livenessID, + } + + // Create reader using Manager's helper method + reader, err := mgr.CreateReaderForSession(ctx, name, session) + if err != nil { + return nil, errors.Wrapf(err, "creating reader for queue %s", name) + } + + // Store reader + ex.queuefeedReaders[name] = reader + return reader, nil +} + // sessionEventf logs a message to the session event log (if any). 
func (ex *connExecutor) sessionEventf(ctx context.Context, format string, args ...interface{}) { if log.ExpensiveLogEnabled(ctx, 2) { diff --git a/pkg/sql/faketreeeval/BUILD.bazel b/pkg/sql/faketreeeval/BUILD.bazel index d8a4c36dade1..4f65a18a7150 100644 --- a/pkg/sql/faketreeeval/BUILD.bazel +++ b/pkg/sql/faketreeeval/BUILD.bazel @@ -17,6 +17,7 @@ go_library( "//pkg/sql/pgwire/pgerror", "//pkg/sql/pgwire/pgnotice", "//pkg/sql/privilege", + "//pkg/sql/queuefeed/queuebase", "//pkg/sql/sem/eval", "//pkg/sql/sem/tree", "//pkg/sql/sessiondata", diff --git a/pkg/sql/faketreeeval/evalctx.go b/pkg/sql/faketreeeval/evalctx.go index 5795d935ee71..4094fafb148e 100644 --- a/pkg/sql/faketreeeval/evalctx.go +++ b/pkg/sql/faketreeeval/evalctx.go @@ -21,6 +21,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgerror" "github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgnotice" "github.com/cockroachdb/cockroach/pkg/sql/privilege" + "github.com/cockroachdb/cockroach/pkg/sql/queuefeed/queuebase" "github.com/cockroachdb/cockroach/pkg/sql/sem/eval" "github.com/cockroachdb/cockroach/pkg/sql/sem/tree" "github.com/cockroachdb/cockroach/pkg/sql/sessiondata" @@ -365,6 +366,11 @@ func (p *DummyEvalPlanner) ExtendHistoryRetention(ctx context.Context, id jobspb return errors.WithStack(errEvalPlanner) } +// GetQueueReaderProvider is part of the eval.Planner interface. +func (*DummyEvalPlanner) GetQueueReaderProvider() queuebase.ReaderProvider { + return nil +} + var _ eval.Planner = &DummyEvalPlanner{} var errEvalPlanner = pgerror.New(pgcode.ScalarOperationCannotRunWithoutFullSessionContext, diff --git a/pkg/sql/planner.go b/pkg/sql/planner.go index 2bd5d90bcb58..a15543327853 100644 --- a/pkg/sql/planner.go +++ b/pkg/sql/planner.go @@ -42,6 +42,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/sql/prep" "github.com/cockroachdb/cockroach/pkg/sql/privilege" "github.com/cockroachdb/cockroach/pkg/sql/querycache" + "github.com/cockroachdb/cockroach/pkg/sql/queuefeed/queuebase" "github.com/cockroachdb/cockroach/pkg/sql/regions" "github.com/cockroachdb/cockroach/pkg/sql/sem/catid" "github.com/cockroachdb/cockroach/pkg/sql/sem/eval" @@ -117,6 +118,9 @@ type extendedEvalContext struct { // validateDbZoneConfig should the DB zone config on commit. validateDbZoneConfig *bool + + // QueueReaderProvider provides access to queuefeed readers for this session. + QueueReaderProvider queuebase.ReaderProvider } // copyFromExecCfg copies relevant fields from an ExecutorConfig. @@ -613,6 +617,11 @@ func (p *planner) Mon() *mon.BytesMonitor { return p.monitor } +// GetQueueReaderProvider is part of the eval.Planner interface. +func (p *planner) GetQueueReaderProvider() queuebase.ReaderProvider { + return p.extendedEvalCtx.QueueReaderProvider +} + // ExecCfg implements the PlanHookState interface. 
func (p *planner) ExecCfg() *ExecutorConfig { return p.extendedEvalCtx.ExecCfg diff --git a/pkg/sql/queuefeed/assignments_test.go b/pkg/sql/queuefeed/assignments_test.go index c0150c378927..38a1c9727690 100644 --- a/pkg/sql/queuefeed/assignments_test.go +++ b/pkg/sql/queuefeed/assignments_test.go @@ -83,7 +83,11 @@ func TestPartitionReassignments(t *testing.T) { pa := queuefeed.NewPartitionAssignments(s.ExecutorConfig().(sql.ExecutorConfig).InternalDB, queueName) - reader, err := manager.GetOrInitReader(ctx, "test_queue") + session := queuefeed.Session{ + ConnectionID: uuid.MakeV4(), + LivenessID: sqlliveness.SessionID("1"), + } + reader, err := manager.CreateReaderForSession(ctx, "test_queue", session) require.NoError(t, err) // get the session the reader is using diff --git a/pkg/sql/queuefeed/manager.go b/pkg/sql/queuefeed/manager.go index 9b47e962e390..147443af62f0 100644 --- a/pkg/sql/queuefeed/manager.go +++ b/pkg/sql/queuefeed/manager.go @@ -17,10 +17,8 @@ import ( "github.com/cockroachdb/cockroach/pkg/sql/queuefeed/queuebase" "github.com/cockroachdb/cockroach/pkg/sql/sem/tree" "github.com/cockroachdb/cockroach/pkg/sql/sessiondata" - "github.com/cockroachdb/cockroach/pkg/sql/sqlliveness" "github.com/cockroachdb/cockroach/pkg/util/hlc" "github.com/cockroachdb/cockroach/pkg/util/syncutil" - "github.com/cockroachdb/cockroach/pkg/util/uuid" "github.com/cockroachdb/errors" ) @@ -34,10 +32,6 @@ type Manager struct { mu struct { syncutil.Mutex - // name -> reader - // TODO: this should actually be a map of (session id, name) -> reader, or smth - readers map[string]*Reader - queueAssignment map[string]*PartitionAssignments } } @@ -52,7 +46,6 @@ func NewManager( // setup rangefeed on partitions table (/poll) // handle handoff from one server to another m := &Manager{executor: executor, rff: rff, codec: codec, leaseMgr: leaseMgr} - m.mu.readers = make(map[string]*Reader) m.mu.queueAssignment = make(map[string]*PartitionAssignments) return m } @@ -192,27 +185,6 @@ func (m *Manager) CreateQueue(ctx context.Context, queueName string, tableDescID }) } -func (m *Manager) GetOrInitReader(ctx context.Context, name string) (queuebase.Reader, error) { - m.mu.Lock() - defer m.mu.Unlock() - reader, ok := m.mu.readers[name] - if ok && reader.IsAlive() { - fmt.Printf("get or init reader for queue %s found in cache\n", name) - return reader, nil - } - fmt.Printf("get or init reader for queue %s not found in cache\n", name) - reader, err := m.newReaderLocked(ctx, name, Session{ - // TODO(queuefeed): get a real session here. - ConnectionID: uuid.MakeV4(), - LivenessID: sqlliveness.SessionID("1"), - }) - if err != nil { - return nil, err - } - m.mu.readers[name] = reader - return reader, nil -} - func (m *Manager) newReaderLocked( ctx context.Context, name string, session Session, ) (*Reader, error) { @@ -253,12 +225,14 @@ func (m *Manager) reassessAssignments(ctx context.Context, name string) (bool, e return false, nil } -func (m *Manager) forgetReader(name string) { - func() { - m.mu.Lock() - defer m.mu.Unlock() - delete(m.mu.readers, name) - }() +// CreateReaderForSession creates a new reader for the given queue name and session. +// This method handles locking and partition assignment lookup internally. 
+func (m *Manager) CreateReaderForSession( + ctx context.Context, name string, session Session, +) (*Reader, error) { + m.mu.Lock() + defer m.mu.Unlock() + return m.newReaderLocked(ctx, name, session) } func (m *Manager) WriteCheckpoint( diff --git a/pkg/sql/queuefeed/manager_test.go b/pkg/sql/queuefeed/manager_test.go index 3a32d801ad61..62cd5a0c6af9 100644 --- a/pkg/sql/queuefeed/manager_test.go +++ b/pkg/sql/queuefeed/manager_test.go @@ -16,7 +16,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/testutils/sqlutils" "github.com/cockroachdb/cockroach/pkg/util/leaktest" "github.com/cockroachdb/cockroach/pkg/util/log" - "github.com/cockroachdb/errors" + "github.com/cockroachdb/cockroach/pkg/util/uuid" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" ) @@ -39,7 +39,10 @@ func TestFeedCreation(t *testing.T) { // expect an error when trying to read from a queue that doesn't exist qm := NewTestManager(t, srv.ApplicationLayer()) - _, err := qm.GetOrInitReader(context.Background(), "test") + _, err := qm.CreateReaderForSession(context.Background(), "test", Session{ + ConnectionID: uuid.MakeV4(), + LivenessID: "", + }) require.ErrorContains(t, err, "queue feed not found") // expect no error when creating a queue @@ -51,10 +54,13 @@ func TestFeedCreation(t *testing.T) { require.NoError(t, qm.CreateQueue(context.Background(), "test", tableID)) // now we can read from the queue - reader, err := qm.GetOrInitReader(context.Background(), "test") + reader, err := qm.CreateReaderForSession(context.Background(), "test", Session{ + ConnectionID: uuid.MakeV4(), + LivenessID: "", + }) require.NoError(t, err) require.NotNil(t, reader) - reader.(*Reader).cancel(errors.New("test shutdown")) + reader.Close() } func TestQueuefeedCtxCancel(t *testing.T) { @@ -157,7 +163,10 @@ func TestFeedCreationPartitions(t *testing.T) { require.NoError(t, err) // Start a reader and verify it reads all the partitions. - reader, err := qm.GetOrInitReader(ctx, "test") + reader, err := qm.CreateReaderForSession(ctx, "test", Session{ + ConnectionID: uuid.MakeV4(), + LivenessID: "", + }) require.NoError(t, err) require.NotNil(t, reader) defer func() { _ = reader.Close() }() @@ -166,7 +175,7 @@ func TestFeedCreationPartitions(t *testing.T) { partitions, err := pt.ListPartitions(ctx, txn) require.NoError(t, err) - session := reader.(*Reader).session + session := reader.session for _, partition := range partitions { assert.Equal(t, session, partition.Session) assert.True(t, partition.Successor.Empty(), "partition %d should not have a successor", partition.ID) diff --git a/pkg/sql/queuefeed/queuebase/queuebase.go b/pkg/sql/queuefeed/queuebase/queuebase.go index e0ab808f53b9..7e4ffeb1ce02 100644 --- a/pkg/sql/queuefeed/queuebase/queuebase.go +++ b/pkg/sql/queuefeed/queuebase/queuebase.go @@ -7,10 +7,15 @@ import ( ) type Manager interface { - GetOrInitReader(ctx context.Context, name string) (Reader, error) CreateQueue(ctx context.Context, name string, tableID int64) error } +// ReaderProvider provides access to queuefeed readers. This interface allows +// connExecutor to provide readers without creating circular dependencies. 
+type ReaderProvider interface { + GetOrInitReader(ctx context.Context, name string) (Reader, error) +} + type Reader interface { GetRows(ctx context.Context, limit int) ([]tree.Datums, error) ConfirmReceipt(ctx context.Context) diff --git a/pkg/sql/queuefeed/reader.go b/pkg/sql/queuefeed/reader.go index e1da24279908..d8308e8d5117 100644 --- a/pkg/sql/queuefeed/reader.go +++ b/pkg/sql/queuefeed/reader.go @@ -242,7 +242,6 @@ func (r *Reader) run(ctx context.Context) { defer func() { fmt.Println("run done") r.isShutdown.Store(true) - r.mgr.forgetReader(r.name) }() for { diff --git a/pkg/sql/sem/builtins/builtins.go b/pkg/sql/sem/builtins/builtins.go index 3aa6e7f66b89..7b78acdc1326 100644 --- a/pkg/sql/sem/builtins/builtins.go +++ b/pkg/sql/sem/builtins/builtins.go @@ -4670,7 +4670,7 @@ value if you rely on the HLC for accuracy.`, ReturnType: tree.FixedReturnType(types.MakeArray(types.Json)), Fn: func(ctx context.Context, evalCtx *eval.Context, args tree.Datums) (tree.Datum, error) { qn := args[0].(*tree.DString) - qr, err := getQueueManager(evalCtx).GetOrInitReader(evalCtx.SessionCtx, string(*qn)) + qr, err := getQueueReaderProvider(evalCtx).GetOrInitReader(evalCtx.SessionCtx, string(*qn)) if err != nil { return nil, errors.Wrapf(err, "get or init reader for queue %s", string(*qn)) } @@ -12913,3 +12913,7 @@ var nilRegionsError = errors.AssertionFailedf("evalCtx.Regions is nil") func getQueueManager(evalCtx *eval.Context) queuebase.Manager { return evalCtx.Planner.ExecutorConfig().(interface{ GetQueueManager() queuebase.Manager }).GetQueueManager() } + +func getQueueReaderProvider(evalCtx *eval.Context) queuebase.ReaderProvider { + return evalCtx.Planner.GetQueueReaderProvider() +} diff --git a/pkg/sql/sem/builtins/generator_builtins.go b/pkg/sql/sem/builtins/generator_builtins.go index f1ea20223d92..536579d57e8a 100644 --- a/pkg/sql/sem/builtins/generator_builtins.go +++ b/pkg/sql/sem/builtins/generator_builtins.go @@ -4395,7 +4395,7 @@ func (g *queueFeedGenerator) ResolvedType() *types.T { // Start implements the eval.ValueGenerator interface. 
func (g *queueFeedGenerator) Start(ctx context.Context, txn *kv.Txn) error { - qr, err := getQueueManager(g.evalCtx).GetOrInitReader(g.evalCtx.SessionCtx, g.queueName) + qr, err := getQueueReaderProvider(g.evalCtx).GetOrInitReader(g.evalCtx.SessionCtx, g.queueName) if err != nil { return err } diff --git a/pkg/sql/sem/eval/BUILD.bazel b/pkg/sql/sem/eval/BUILD.bazel index e007a41e4805..df59d0f62dfb 100644 --- a/pkg/sql/sem/eval/BUILD.bazel +++ b/pkg/sql/sem/eval/BUILD.bazel @@ -65,6 +65,7 @@ go_library( "//pkg/sql/pgwire/pgnotice", "//pkg/sql/pgwire/pgwirecancel", "//pkg/sql/privilege", + "//pkg/sql/queuefeed/queuebase", "//pkg/sql/sem/builtins/builtinsregistry", "//pkg/sql/sem/cast", "//pkg/sql/sem/catid", diff --git a/pkg/sql/sem/eval/deps.go b/pkg/sql/sem/eval/deps.go index 34f6de9bd4ba..ce43fb06300d 100644 --- a/pkg/sql/sem/eval/deps.go +++ b/pkg/sql/sem/eval/deps.go @@ -18,6 +18,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/sql/hintpb" "github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgnotice" "github.com/cockroachdb/cockroach/pkg/sql/privilege" + "github.com/cockroachdb/cockroach/pkg/sql/queuefeed/queuebase" "github.com/cockroachdb/cockroach/pkg/sql/sem/catid" "github.com/cockroachdb/cockroach/pkg/sql/sem/tree" "github.com/cockroachdb/cockroach/pkg/sql/sessiondata" @@ -297,6 +298,10 @@ type Planner interface { // the `system.users` table UserHasAdminRole(ctx context.Context, user username.SQLUsername) (bool, error) + // GetQueueReaderProvider returns the ReaderProvider for queuefeed readers, + // or nil if not available. + GetQueueReaderProvider() queuebase.ReaderProvider + // MemberOfWithAdminOption is used to collect a list of roles (direct and // indirect) that the member is part of. See the comment on the planner // implementation in authorization.go From 229ee22da72e18c1cf3589d4c3a6fc8e4fff3aab Mon Sep 17 00:00:00 2001 From: Miles Frankel Date: Thu, 13 Nov 2025 19:14:24 +0000 Subject: [PATCH 33/46] cleanup --- pkg/sql/conn_executor.go | 22 ++++++---------- pkg/sql/queuefeed/manager.go | 1 - pkg/sql/queuefeed/manager_test.go | 2 +- pkg/sql/queuefeed/queuebase/queuebase.go | 3 +-- pkg/sql/queuefeed/reader.go | 12 +++------ pkg/sql/queuefeed/reader_test.go | 32 ++++++++++++++++++------ pkg/sql/sem/eval/deps.go | 3 +-- 7 files changed, 37 insertions(+), 38 deletions(-) diff --git a/pkg/sql/conn_executor.go b/pkg/sql/conn_executor.go index cedc31bcaea6..a8d8f3f656b9 100644 --- a/pkg/sql/conn_executor.go +++ b/pkg/sql/conn_executor.go @@ -4599,18 +4599,16 @@ func (ex *connExecutor) getCreatedSequencesAccessor() createdSequences { // GetOrInitReader gets or creates a queuefeed reader for the given queue name. // Readers are stored per-connection and closed when the connection closes. func (ex *connExecutor) GetOrInitReader(ctx context.Context, name string) (queuebase.Reader, error) { - // Check if reader already exists and is alive if reader, ok := ex.queuefeedReaders[name]; ok && reader.IsAlive() { return reader, nil } - // Need to create a new reader if ex.server.cfg.QueueManager == nil { return nil, errors.New("queue manager not configured") } mgr := ex.server.cfg.QueueManager - // Construct Session from connExecutor data + // Construct Session. 
sessionID := ex.planner.extendedEvalCtx.SessionID connectionIDBytes := sessionID.GetBytes() connectionID, err := uuid.FromBytes(connectionIDBytes) @@ -4618,31 +4616,25 @@ func (ex *connExecutor) GetOrInitReader(ctx context.Context, name string) (queue return nil, errors.Wrapf(err, "converting session ID to UUID") } - // Get sqlliveness session ID var livenessID sqlliveness.SessionID if ex.server.cfg.SQLLiveness != nil { session, err := ex.server.cfg.SQLLiveness.Session(ex.Ctx()) if err != nil { - // If we can't get sqlliveness session, we'll use empty string - // This might happen in some environments - livenessID = "" - } else if session != nil { - livenessID = session.ID() + return nil, errors.Wrapf(err, "getting sqlliveness session") } + if session == nil { + return nil, errors.New("sqlliveness session is nil") + } + livenessID = session.ID() } - session := queuefeed.Session{ - ConnectionID: connectionID, - LivenessID: livenessID, - } + session := queuefeed.Session{ConnectionID: connectionID, LivenessID: livenessID} - // Create reader using Manager's helper method reader, err := mgr.CreateReaderForSession(ctx, name, session) if err != nil { return nil, errors.Wrapf(err, "creating reader for queue %s", name) } - // Store reader ex.queuefeedReaders[name] = reader return reader, nil } diff --git a/pkg/sql/queuefeed/manager.go b/pkg/sql/queuefeed/manager.go index 147443af62f0..2459b41119c8 100644 --- a/pkg/sql/queuefeed/manager.go +++ b/pkg/sql/queuefeed/manager.go @@ -226,7 +226,6 @@ func (m *Manager) reassessAssignments(ctx context.Context, name string) (bool, e } // CreateReaderForSession creates a new reader for the given queue name and session. -// This method handles locking and partition assignment lookup internally. func (m *Manager) CreateReaderForSession( ctx context.Context, name string, session Session, ) (*Reader, error) { diff --git a/pkg/sql/queuefeed/manager_test.go b/pkg/sql/queuefeed/manager_test.go index 62cd5a0c6af9..6fe9b5b8b91f 100644 --- a/pkg/sql/queuefeed/manager_test.go +++ b/pkg/sql/queuefeed/manager_test.go @@ -60,7 +60,7 @@ func TestFeedCreation(t *testing.T) { }) require.NoError(t, err) require.NotNil(t, reader) - reader.Close() + _ = reader.Close() } func TestQueuefeedCtxCancel(t *testing.T) { diff --git a/pkg/sql/queuefeed/queuebase/queuebase.go b/pkg/sql/queuefeed/queuebase/queuebase.go index 7e4ffeb1ce02..b98858e1bb7b 100644 --- a/pkg/sql/queuefeed/queuebase/queuebase.go +++ b/pkg/sql/queuefeed/queuebase/queuebase.go @@ -10,8 +10,7 @@ type Manager interface { CreateQueue(ctx context.Context, name string, tableID int64) error } -// ReaderProvider provides access to queuefeed readers. This interface allows -// connExecutor to provide readers without creating circular dependencies. +// Implemented by the conn executor in reality type ReaderProvider interface { GetOrInitReader(ctx context.Context, name string) (Reader, error) } diff --git a/pkg/sql/queuefeed/reader.go b/pkg/sql/queuefeed/reader.go index d8308e8d5117..00298aaffacb 100644 --- a/pkg/sql/queuefeed/reader.go +++ b/pkg/sql/queuefeed/reader.go @@ -218,24 +218,17 @@ func (r *Reader) setupRangefeed(ctx context.Context, assignment *Assignment) err return errors.Wrap(err, "starting rangefeed") } - _ = context.AfterFunc(ctx, func() { - // TODO(queuefeed): move this to Close and hook Close into - // connExecutor.Close(). 
- fmt.Println("closing rangefeed") - rf.Close() - })() - r.rangefeed = rf r.assignment = assignment return nil } // - [x] setup rangefeed on data -// - [ ] handle only watching my partitions +// - [X] handle only watching my partitions // - [X] after each batch, ask mgr if i need to assignments // - [X] buffer rows in the background before being asked for them // - [ ] checkpoint frontier if our frontier has advanced and we confirmed receipt -// - [ ] gonna need some way to clean stuff up on conn_executor.close() +// - [X] gonna need some way to clean stuff up on conn_executor.close() // TODO: this loop isnt doing much anymore. if we dont need it for anything else, let's remove it func (r *Reader) run(ctx context.Context) { @@ -414,6 +407,7 @@ func (r *Reader) IsAlive() bool { func (r *Reader) Close() error { err := r.assigner.UnregisterSession(r.goroCtx, r.session) r.cancel(errors.New("reader closing")) + r.rangefeed.Close() return err } diff --git a/pkg/sql/queuefeed/reader_test.go b/pkg/sql/queuefeed/reader_test.go index 5743176a24da..6d23aa290010 100644 --- a/pkg/sql/queuefeed/reader_test.go +++ b/pkg/sql/queuefeed/reader_test.go @@ -8,10 +8,12 @@ import ( "github.com/cockroachdb/cockroach/pkg/base" "github.com/cockroachdb/cockroach/pkg/sql/queuefeed/queuebase" "github.com/cockroachdb/cockroach/pkg/sql/sem/tree" + "github.com/cockroachdb/cockroach/pkg/sql/sqlliveness" "github.com/cockroachdb/cockroach/pkg/testutils/serverutils" "github.com/cockroachdb/cockroach/pkg/testutils/sqlutils" "github.com/cockroachdb/cockroach/pkg/util/leaktest" "github.com/cockroachdb/cockroach/pkg/util/log" + "github.com/cockroachdb/cockroach/pkg/util/uuid" "github.com/stretchr/testify/require" ) @@ -32,9 +34,12 @@ func TestReaderBasic(t *testing.T) { qm := NewTestManager(t, srv.ApplicationLayer()) require.NoError(t, qm.CreateQueue(ctx, "test_queue", tableID)) - reader, err := qm.GetOrInitReader(ctx, "test_queue") + reader, err := qm.CreateReaderForSession(ctx, "test_queue", Session{ + ConnectionID: uuid.MakeV4(), + LivenessID: sqlliveness.SessionID("1"), + }) require.NoError(t, err) - defer reader.(*Reader).Close() + defer reader.Close() db.Exec(t, `INSERT INTO t VALUES ('row1', 10), ('row2', 20), ('row3', 30)`) @@ -63,9 +68,12 @@ func TestReaderRollback(t *testing.T) { qm := NewTestManager(t, srv.ApplicationLayer()) require.NoError(t, qm.CreateQueue(ctx, "rollback_test", tableID)) - reader, err := qm.GetOrInitReader(ctx, "rollback_test") + reader, err := qm.CreateReaderForSession(ctx, "rollback_test", Session{ + ConnectionID: uuid.MakeV4(), + LivenessID: sqlliveness.SessionID("1"), + }) require.NoError(t, err) - defer reader.(*Reader).Close() + defer reader.Close() db.Exec(t, `INSERT INTO t VALUES ('row1', 100), ('row2', 200)`) @@ -113,7 +121,11 @@ func TestCheckpointRestoration(t *testing.T) { qm := NewTestManager(t, srv.ApplicationLayer()) require.NoError(t, qm.CreateQueue(ctx, "checkpoint_test", tableID)) - reader1, err := qm.GetOrInitReader(ctx, "checkpoint_test") + session1 := Session{ + ConnectionID: uuid.MakeV4(), + LivenessID: sqlliveness.SessionID("1"), + } + reader1, err := qm.CreateReaderForSession(ctx, "checkpoint_test", session1) require.NoError(t, err) db.Exec(t, `INSERT INTO t VALUES ('batch1_row1', 1), ('batch1_row2', 2)`) @@ -125,13 +137,17 @@ func TestCheckpointRestoration(t *testing.T) { _ = pollForRows(t, ctx, reader1, 2) reader1.ConfirmReceipt(ctx) - require.NoError(t, reader1.(*Reader).Close()) + require.NoError(t, reader1.Close()) db.Exec(t, `INSERT INTO t VALUES ('batch2_row1', 3), 
('batch2_row2', 4)`) - reader2, err := qm.GetOrInitReader(ctx, "checkpoint_test") + session2 := Session{ + ConnectionID: uuid.MakeV4(), + LivenessID: sqlliveness.SessionID("2"), + } + reader2, err := qm.CreateReaderForSession(ctx, "checkpoint_test", session2) require.NoError(t, err) - defer reader2.(*Reader).Close() + defer reader2.Close() rows2 := pollForRows(t, ctx, reader2, 2) diff --git a/pkg/sql/sem/eval/deps.go b/pkg/sql/sem/eval/deps.go index ce43fb06300d..573217051f24 100644 --- a/pkg/sql/sem/eval/deps.go +++ b/pkg/sql/sem/eval/deps.go @@ -298,8 +298,7 @@ type Planner interface { // the `system.users` table UserHasAdminRole(ctx context.Context, user username.SQLUsername) (bool, error) - // GetQueueReaderProvider returns the ReaderProvider for queuefeed readers, - // or nil if not available. + // GetQueueReaderProvider returns the ReaderProvider for queuefeed readers. GetQueueReaderProvider() queuebase.ReaderProvider // MemberOfWithAdminOption is used to collect a list of roles (direct and From e08acd2659e53a6e3b5752d9b40766cb08ddae0f Mon Sep 17 00:00:00 2001 From: Miles Frankel Date: Thu, 13 Nov 2025 19:35:40 +0000 Subject: [PATCH 34/46] fix test --- pkg/sql/queuefeed/reader_test.go | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/pkg/sql/queuefeed/reader_test.go b/pkg/sql/queuefeed/reader_test.go index 6d23aa290010..d62c71007298 100644 --- a/pkg/sql/queuefeed/reader_test.go +++ b/pkg/sql/queuefeed/reader_test.go @@ -121,11 +121,11 @@ func TestCheckpointRestoration(t *testing.T) { qm := NewTestManager(t, srv.ApplicationLayer()) require.NoError(t, qm.CreateQueue(ctx, "checkpoint_test", tableID)) - session1 := Session{ + session := Session{ ConnectionID: uuid.MakeV4(), LivenessID: sqlliveness.SessionID("1"), } - reader1, err := qm.CreateReaderForSession(ctx, "checkpoint_test", session1) + reader1, err := qm.CreateReaderForSession(ctx, "checkpoint_test", session) require.NoError(t, err) db.Exec(t, `INSERT INTO t VALUES ('batch1_row1', 1), ('batch1_row2', 2)`) @@ -141,11 +141,7 @@ func TestCheckpointRestoration(t *testing.T) { db.Exec(t, `INSERT INTO t VALUES ('batch2_row1', 3), ('batch2_row2', 4)`) - session2 := Session{ - ConnectionID: uuid.MakeV4(), - LivenessID: sqlliveness.SessionID("2"), - } - reader2, err := qm.CreateReaderForSession(ctx, "checkpoint_test", session2) + reader2, err := qm.CreateReaderForSession(ctx, "checkpoint_test", session) require.NoError(t, err) defer reader2.Close() From 8e41ad4d34dadddf30a1910652b55031ecff5f8a Mon Sep 17 00:00:00 2001 From: Miles Frankel Date: Thu, 13 Nov 2025 19:52:44 +0000 Subject: [PATCH 35/46] watch all my partitions --- pkg/sql/queuefeed/BUILD.bazel | 1 - pkg/sql/queuefeed/assignments.go | 9 +++++++++ pkg/sql/queuefeed/reader.go | 2 +- 3 files changed, 10 insertions(+), 2 deletions(-) diff --git a/pkg/sql/queuefeed/BUILD.bazel b/pkg/sql/queuefeed/BUILD.bazel index 5a307da93949..af501332e2f4 100644 --- a/pkg/sql/queuefeed/BUILD.bazel +++ b/pkg/sql/queuefeed/BUILD.bazel @@ -73,7 +73,6 @@ go_test( "//pkg/util/log", "//pkg/util/randutil", "//pkg/util/uuid", - "@com_github_cockroachdb_errors//:errors", "@com_github_stretchr_testify//assert", "@com_github_stretchr_testify//require", "@org_golang_x_sync//errgroup", diff --git a/pkg/sql/queuefeed/assignments.go b/pkg/sql/queuefeed/assignments.go index 6b1dfeae4d88..96ef9c462be1 100644 --- a/pkg/sql/queuefeed/assignments.go +++ b/pkg/sql/queuefeed/assignments.go @@ -5,6 +5,7 @@ import ( "context" "slices" + "github.com/cockroachdb/cockroach/pkg/roachpb" 
"github.com/cockroachdb/cockroach/pkg/sql/isql" "github.com/cockroachdb/errors" ) @@ -19,6 +20,14 @@ type Assignment struct { Partitions []Partition } +func (a *Assignment) Spans() []roachpb.Span { + sg := roachpb.SpanGroup{} + for _, partition := range a.Partitions { + sg.Add(partition.Span) + } + return sg.Slice() +} + type PartitionAssignments struct { db isql.DB partitionTable *partitionTable diff --git a/pkg/sql/queuefeed/reader.go b/pkg/sql/queuefeed/reader.go index 00298aaffacb..8d07d325028a 100644 --- a/pkg/sql/queuefeed/reader.go +++ b/pkg/sql/queuefeed/reader.go @@ -210,7 +210,7 @@ func (r *Reader) setupRangefeed(ctx context.Context, assignment *Assignment) err fmt.Sprintf("queuefeed.reader.name=%s", r.name), initialTS, onValue, opts..., ) - spans := []roachpb.Span{assignment.Partitions[0].Span} + spans := assignment.Spans() fmt.Printf("starting rangefeed with spans: %+v\n", spans) From 4f5a5b7d8f2ea224f3ed746dab5755d1ca410106 Mon Sep 17 00:00:00 2001 From: Jeff Swenson Date: Thu, 13 Nov 2025 14:42:43 -0500 Subject: [PATCH 36/46] queuefeed: add smoke test with multiple partitions Add a test for multiple ranges. Rewrite the span splitting logic so that it follows the pattern established by backup. --- pkg/server/server_sql.go | 4 +- pkg/sql/queuefeed/BUILD.bazel | 3 ++ pkg/sql/queuefeed/manager.go | 84 +++++++++++++++---------------- pkg/sql/queuefeed/manager_test.go | 3 +- pkg/sql/queuefeed/reader.go | 36 +++++++------ pkg/sql/queuefeed/reader_test.go | 6 +-- pkg/sql/queuefeed/smoke_test.go | 65 ++++++++++++++++++++++++ 7 files changed, 138 insertions(+), 63 deletions(-) diff --git a/pkg/server/server_sql.go b/pkg/server/server_sql.go index a40470b4d9fa..08183083c19e 100644 --- a/pkg/server/server_sql.go +++ b/pkg/server/server_sql.go @@ -1065,7 +1065,7 @@ func newSQLServer(ctx context.Context, cfg sqlServerArgs) (*SQLServer, error) { TenantReadOnly: cfg.SQLConfig.TenantReadOnly, CidrLookup: cfg.BaseConfig.CidrLookup, LicenseEnforcer: cfg.SQLConfig.LicenseEnforcer, - QueueManager: queuefeed.NewManager(ctx, cfg.internalDB, cfg.rangeFeedFactory, codec, leaseMgr), + QueueManager: queuefeed.NewManager(ctx, cfg.internalDB, cfg.rangeFeedFactory, cfg.rangeDescIteratorFactory, codec, leaseMgr), } if codec.ForSystemTenant() { @@ -1462,7 +1462,7 @@ func newSQLServer(ctx context.Context, cfg sqlServerArgs) (*SQLServer, error) { internalDBMemMonitor: internalDBMonitor, upgradeManager: upgradeMgr, serviceMode: cfg.serviceMode, - queueManager: queuefeed.NewManager(ctx, cfg.internalDB, cfg.rangeFeedFactory, execCfg.Codec, leaseMgr), + queueManager: queuefeed.NewManager(ctx, cfg.internalDB, cfg.rangeFeedFactory, cfg.rangeDescIteratorFactory, execCfg.Codec, leaseMgr), }, nil } diff --git a/pkg/sql/queuefeed/BUILD.bazel b/pkg/sql/queuefeed/BUILD.bazel index af501332e2f4..f4ee09cb6cbd 100644 --- a/pkg/sql/queuefeed/BUILD.bazel +++ b/pkg/sql/queuefeed/BUILD.bazel @@ -16,6 +16,7 @@ go_library( "//pkg/keys", "//pkg/kv", "//pkg/kv/kvclient/kvcoord", + "//pkg/kv/kvclient/rangecache", "//pkg/kv/kvclient/rangefeed", "//pkg/kv/kvpb", "//pkg/roachpb", @@ -32,6 +33,7 @@ go_library( "//pkg/sql/sqlliveness", "//pkg/util", "//pkg/util/hlc", + "//pkg/util/rangedesc", "//pkg/util/syncutil", "//pkg/util/timeutil", "//pkg/util/uuid", @@ -51,6 +53,7 @@ go_test( "smoke_test.go", ], embed = [":queuefeed"], + shard_count = 4, deps = [ "//pkg/base", "//pkg/kv/kvclient/rangefeed", diff --git a/pkg/sql/queuefeed/manager.go b/pkg/sql/queuefeed/manager.go index 2459b41119c8..fc7fd95070fc 100644 --- 
a/pkg/sql/queuefeed/manager.go +++ b/pkg/sql/queuefeed/manager.go @@ -6,8 +6,7 @@ import ( "fmt" "github.com/cockroachdb/cockroach/pkg/keys" - "github.com/cockroachdb/cockroach/pkg/kv" - "github.com/cockroachdb/cockroach/pkg/kv/kvclient/kvcoord" + "github.com/cockroachdb/cockroach/pkg/kv/kvclient/rangecache" "github.com/cockroachdb/cockroach/pkg/kv/kvclient/rangefeed" "github.com/cockroachdb/cockroach/pkg/roachpb" "github.com/cockroachdb/cockroach/pkg/sql/catalog" @@ -18,6 +17,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/sql/sem/tree" "github.com/cockroachdb/cockroach/pkg/sql/sessiondata" "github.com/cockroachdb/cockroach/pkg/util/hlc" + "github.com/cockroachdb/cockroach/pkg/util/rangedesc" "github.com/cockroachdb/cockroach/pkg/util/syncutil" "github.com/cockroachdb/errors" ) @@ -27,6 +27,8 @@ import ( type Manager struct { executor isql.DB rff *rangefeed.Factory + rdi rangedesc.IteratorFactory + rc *rangecache.RangeCache codec keys.SQLCodec leaseMgr *lease.Manager @@ -40,12 +42,13 @@ func NewManager( _ context.Context, executor isql.DB, rff *rangefeed.Factory, + rdi rangedesc.IteratorFactory, codec keys.SQLCodec, leaseMgr *lease.Manager, ) *Manager { // setup rangefeed on partitions table (/poll) // handle handoff from one server to another - m := &Manager{executor: executor, rff: rff, codec: codec, leaseMgr: leaseMgr} + m := &Manager{executor: executor, rff: rff, rdi: rdi, codec: codec, leaseMgr: leaseMgr} m.mu.queueAssignment = make(map[string]*PartitionAssignments) return m } @@ -127,46 +130,16 @@ func (m *Manager) CreateQueue(ctx context.Context, queueName string, tableDescID EndKey: primaryIndexPrefix.PrefixEnd(), } - // Extract DistSender from the executor's KV DB to iterate over ranges. - txnWrapperSender, ok := m.executor.KV().NonTransactionalSender().(*kv.CrossRangeTxnWrapperSender) - if !ok { - return errors.Errorf("failed to extract a %T from %T", - (*kv.CrossRangeTxnWrapperSender)(nil), m.executor.KV().NonTransactionalSender()) - } - distSender, ok := txnWrapperSender.Wrapped().(*kvcoord.DistSender) - if !ok { - return errors.Errorf("failed to extract a %T from %T", - (*kvcoord.DistSender)(nil), txnWrapperSender.Wrapped()) - } - - // Convert the span to an RSpan for range iteration. - rSpan, err := keys.SpanAddr(primaryKeySpan) + spans, err := m.splitOnRanges(ctx, primaryKeySpan) if err != nil { - return errors.Wrapf(err, "converting primary key span to address span") + return err } - // Iterate over all ranges covering the primary key span. - it := kvcoord.MakeRangeIterator(distSender) partitionID := int64(1) - for it.Seek(ctx, rSpan.Key, kvcoord.Ascending); ; it.Next(ctx) { - if !it.Valid() { - return errors.Wrapf(it.Error(), "iterating ranges for primary key span") - } - - // Get the range descriptor and trim its span to the primary key span boundaries. - desc := it.Desc() - startKey := desc.StartKey - if startKey.Compare(rSpan.Key) < 0 { - startKey = rSpan.Key - } - endKey := desc.EndKey - if endKey.Compare(rSpan.EndKey) > 0 { - endKey = rSpan.EndKey - } - + for _, span := range spans { partition := Partition{ ID: partitionID, - Span: roachpb.Span{Key: startKey.AsRawKey(), EndKey: endKey.AsRawKey()}, + Span: span, } if err := pt.InsertPartition(ctx, txn, partition); err != nil { @@ -174,11 +147,6 @@ func (m *Manager) CreateQueue(ctx context.Context, queueName string, tableDescID } partitionID++ - - // Check if we need to continue to the next range. 
- if !it.NeedAnother(rSpan) { - break - } } return nil @@ -276,4 +244,36 @@ func (m *Manager) ReadCheckpoint( return ts, err } +func (m *Manager) splitOnRanges(ctx context.Context, span roachpb.Span) ([]roachpb.Span, error) { + const pageSize = 100 + rdi, err := m.rdi.NewLazyIterator(ctx, span, pageSize) + if err != nil { + return nil, err + } + + var spans []roachpb.Span + remainingSpan := span + + for ; rdi.Valid(); rdi.Next() { + rangeDesc := rdi.CurRangeDescriptor() + rangeSpan := roachpb.Span{Key: rangeDesc.StartKey.AsRawKey(), EndKey: rangeDesc.EndKey.AsRawKey()} + subspan := remainingSpan.Intersect(rangeSpan) + if !subspan.Valid() { + return nil, errors.AssertionFailedf("%s not in %s of %s", rangeSpan, remainingSpan, span) + } + spans = append(spans, subspan) + remainingSpan.Key = subspan.EndKey + } + + if err := rdi.Error(); err != nil { + return nil, err + } + + if remainingSpan.Valid() { + spans = append(spans, remainingSpan) + } + + return spans, nil +} + var _ queuebase.Manager = &Manager{} diff --git a/pkg/sql/queuefeed/manager_test.go b/pkg/sql/queuefeed/manager_test.go index 6fe9b5b8b91f..1d05a298b399 100644 --- a/pkg/sql/queuefeed/manager_test.go +++ b/pkg/sql/queuefeed/manager_test.go @@ -16,6 +16,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/testutils/sqlutils" "github.com/cockroachdb/cockroach/pkg/util/leaktest" "github.com/cockroachdb/cockroach/pkg/util/log" + "github.com/cockroachdb/cockroach/pkg/util/rangedesc" "github.com/cockroachdb/cockroach/pkg/util/uuid" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" @@ -24,7 +25,7 @@ import ( func NewTestManager(t *testing.T, a serverutils.ApplicationLayerInterface) *Manager { ctx := context.Background() db := a.InternalDB().(isql.DB) - m := NewManager(ctx, db, a.RangeFeedFactory().(*rangefeed.Factory), a.Codec(), a.LeaseManager().(*lease.Manager)) + m := NewManager(ctx, db, a.RangeFeedFactory().(*rangefeed.Factory), a.RangeDescIteratorFactory().(rangedesc.IteratorFactory), a.Codec(), a.LeaseManager().(*lease.Manager)) require.NotNil(t, m.codec) return m } diff --git a/pkg/sql/queuefeed/reader.go b/pkg/sql/queuefeed/reader.go index 8d07d325028a..1437a11bab8d 100644 --- a/pkg/sql/queuefeed/reader.go +++ b/pkg/sql/queuefeed/reader.go @@ -455,32 +455,38 @@ func (r *Reader) checkForReassignment(ctx context.Context) error { func (r *Reader) decodeRangefeedValue( ctx context.Context, rfv *kvpb.RangeFeedValue, ) (tree.Datums, error) { - key, value := rfv.Key, rfv.Value - key, err := r.codec.StripTenantPrefix(key) + partialKey := rfv.Key + partialKey, err := r.codec.StripTenantPrefix(partialKey) if err != nil { - return nil, errors.Wrapf(err, "stripping tenant prefix: %s", keys.PrettyPrint(nil, key)) + return nil, errors.Wrapf(err, "stripping tenant prefix: %s", keys.PrettyPrint(nil, partialKey)) } - _, tableID, _, err := rowenc.DecodePartialTableIDIndexID(key) + _, tableID, _, err := rowenc.DecodePartialTableIDIndexID(partialKey) if err != nil { - return nil, errors.Wrapf(err, "decoding partial table id index id: %s", keys.PrettyPrint(nil, key)) + return nil, errors.Wrapf(err, "decoding partial table id index id: %s", keys.PrettyPrint(nil, partialKey)) } - tableDesc, err := r.fetchTableDesc(ctx, tableID, value.Timestamp) + + familyID, err := keys.DecodeFamilyKey(partialKey) + if err != nil { + return nil, errors.Wrapf(err, "decoding family key: %s", keys.PrettyPrint(nil, partialKey)) + } + + tableDesc, err := r.fetchTableDesc(ctx, tableID, rfv.Value.Timestamp) if err != nil { - return nil, 
errors.Wrapf(err, "fetching table descriptor: %s", keys.PrettyPrint(nil, key)) + return nil, errors.Wrapf(err, "fetching table descriptor: %s", keys.PrettyPrint(nil, partialKey)) } - familyDesc, err := catalog.MustFindFamilyByID(tableDesc, 0) + familyDesc, err := catalog.MustFindFamilyByID(tableDesc, descpb.FamilyID(familyID)) if err != nil { - return nil, errors.Wrapf(err, "fetching family descriptor: %s", keys.PrettyPrint(nil, key)) + return nil, errors.Wrapf(err, "fetching family descriptor: %s", keys.PrettyPrint(nil, partialKey)) } cols, err := getRelevantColumnsForFamily(tableDesc, familyDesc) if err != nil { - return nil, errors.Wrapf(err, "getting relevant columns for family: %s", keys.PrettyPrint(nil, key)) + return nil, errors.Wrapf(err, "getting relevant columns for family: %s", keys.PrettyPrint(nil, partialKey)) } var spec fetchpb.IndexFetchSpec if err := rowenc.InitIndexFetchSpec(&spec, r.codec, tableDesc, tableDesc.GetPrimaryIndex(), cols); err != nil { - return nil, errors.Wrapf(err, "initializing index fetch spec: %s", keys.PrettyPrint(nil, key)) + return nil, errors.Wrapf(err, "initializing index fetch spec: %s", keys.PrettyPrint(nil, partialKey)) } rf := row.Fetcher{} if err := rf.Init(ctx, row.FetcherInitArgs{ @@ -489,15 +495,15 @@ func (r *Reader) decodeRangefeedValue( TraceKV: true, TraceKVEvery: &util.EveryN{N: 1}, }); err != nil { - return nil, errors.Wrapf(err, "initializing row fetcher: %s", keys.PrettyPrint(nil, key)) + return nil, errors.Wrapf(err, "initializing row fetcher: %s", keys.PrettyPrint(nil, partialKey)) } - kvProvider := row.KVProvider{KVs: []roachpb.KeyValue{{Key: key, Value: value}}} + kvProvider := row.KVProvider{KVs: []roachpb.KeyValue{{Key: rfv.Key, Value: rfv.Value}}} if err := rf.ConsumeKVProvider(ctx, &kvProvider); err != nil { - return nil, errors.Wrapf(err, "consuming kv provider: %s", keys.PrettyPrint(nil, key)) + return nil, errors.Wrapf(err, "consuming kv provider: %s", keys.PrettyPrint(nil, partialKey)) } encDatums, _, err := rf.NextRow(ctx) if err != nil { - return nil, errors.Wrapf(err, "fetching next row: %s", keys.PrettyPrint(nil, key)) + return nil, errors.Wrapf(err, "fetching next row: %s", keys.PrettyPrint(nil, partialKey)) } _ = encDatums diff --git a/pkg/sql/queuefeed/reader_test.go b/pkg/sql/queuefeed/reader_test.go index d62c71007298..7f22d0475f80 100644 --- a/pkg/sql/queuefeed/reader_test.go +++ b/pkg/sql/queuefeed/reader_test.go @@ -39,7 +39,7 @@ func TestReaderBasic(t *testing.T) { LivenessID: sqlliveness.SessionID("1"), }) require.NoError(t, err) - defer reader.Close() + defer func() { _ = reader.Close() }() db.Exec(t, `INSERT INTO t VALUES ('row1', 10), ('row2', 20), ('row3', 30)`) @@ -73,7 +73,7 @@ func TestReaderRollback(t *testing.T) { LivenessID: sqlliveness.SessionID("1"), }) require.NoError(t, err) - defer reader.Close() + defer func() { _ = reader.Close() }() db.Exec(t, `INSERT INTO t VALUES ('row1', 100), ('row2', 200)`) @@ -143,7 +143,7 @@ func TestCheckpointRestoration(t *testing.T) { reader2, err := qm.CreateReaderForSession(ctx, "checkpoint_test", session) require.NoError(t, err) - defer reader2.Close() + defer func() { _ = reader2.Close() }() rows2 := pollForRows(t, ctx, reader2, 2) diff --git a/pkg/sql/queuefeed/smoke_test.go b/pkg/sql/queuefeed/smoke_test.go index 1ed10c50e45e..6f0dfc2c9e76 100644 --- a/pkg/sql/queuefeed/smoke_test.go +++ b/pkg/sql/queuefeed/smoke_test.go @@ -2,14 +2,17 @@ package queuefeed import ( "context" + "math/rand" "testing" "time" "github.com/cockroachdb/cockroach/pkg/base" + 
"github.com/cockroachdb/cockroach/pkg/sql/sqlliveness" "github.com/cockroachdb/cockroach/pkg/testutils/serverutils" "github.com/cockroachdb/cockroach/pkg/testutils/sqlutils" "github.com/cockroachdb/cockroach/pkg/util/leaktest" "github.com/cockroachdb/cockroach/pkg/util/log" + "github.com/cockroachdb/cockroach/pkg/util/uuid" "github.com/stretchr/testify/require" "golang.org/x/sync/errgroup" ) @@ -71,3 +74,65 @@ func TestQueuefeedSmoketest(t *testing.T) { cancel() require.NoError(t, group.Wait()) } + +func TestQueuefeedSmoketestMultipleRanges(t *testing.T) { + defer leaktest.AfterTest(t)() + defer log.Scope(t).Close(t) + + ctx := context.Background() + srv, sqlDB, _ := serverutils.StartServer(t, base.TestServerArgs{}) + defer srv.Stopper().Stop(ctx) + + db := sqlutils.MakeSQLRunner(sqlDB) + _, err := srv.SystemLayer().SQLConn(t).Exec(`SET CLUSTER SETTING kv.rangefeed.enabled = true`) + require.NoError(t, err) + + // Create table with composite primary key and split it + db.Exec(t, `CREATE TABLE t (k1 INT, k2 INT, v string, PRIMARY KEY (k1, k2))`) + db.Exec(t, `ALTER TABLE t SPLIT AT VALUES (1)`) + + var tableID int64 + db.QueryRow(t, "SELECT id FROM system.namespace where name = 't'").Scan(&tableID) + db.Exec(t, `SELECT crdb_internal.create_queue_feed('test_multi', $1)`, tableID) + + // Create two managers for separate readers + qm := NewTestManager(t, srv.ApplicationLayer()) + newReader := func(session uuid.UUID) *Reader { + qm.mu.Lock() + defer qm.mu.Unlock() + + // TODO: use the built ins once readers are properly assigned to a session. + reader, err := qm.newReaderLocked(ctx, "test_multi", Session{ + ConnectionID: uuid.NewV4(), + LivenessID: sqlliveness.SessionID("1"), + }) + require.NoError(t, err) + return reader + } + + ctx, cancel := context.WithCancel(ctx) + group, ctx := errgroup.WithContext(ctx) + + group.Go(func() error { + for ctx.Err() == nil { + db.Exec(t, `INSERT INTO t VALUES ($1, $2, 'foo')`, rand.Intn(3), rand.Int()) + time.Sleep(10 * time.Millisecond) + } + t.Log("inserter stopping") + return nil + }) + + readRows := 0 + reader := newReader(uuid.NewV4()) + for readRows < 100 { + rows, err := reader.GetRows(ctx, 10) + require.NoError(t, err) + reader.ConfirmReceipt(ctx) + t.Log("reader read", len(rows), "rows") + readRows += len(rows) + require.NoError(t, err) + } + + cancel() + _ = group.Wait() +} From 623d77427450ce08c0cffab6cca128723648b5c3 Mon Sep 17 00:00:00 2001 From: Miles Frankel Date: Thu, 13 Nov 2025 22:22:16 +0000 Subject: [PATCH 37/46] unassign dead sessions --- pkg/server/server_sql.go | 10 +- pkg/sql/queuefeed/BUILD.bazel | 7 +- pkg/sql/queuefeed/manager.go | 181 ++++++++++++++++++++++++++++-- pkg/sql/queuefeed/manager_test.go | 163 ++++++++++++++++++++++++++- 4 files changed, 345 insertions(+), 16 deletions(-) diff --git a/pkg/server/server_sql.go b/pkg/server/server_sql.go index 08183083c19e..daa2f120a613 100644 --- a/pkg/server/server_sql.go +++ b/pkg/server/server_sql.go @@ -217,8 +217,6 @@ type SQLServer struct { // serviceMode is the service mode this server was started with. 
serviceMode mtinfopb.TenantServiceMode - - queueManager *queuefeed.Manager } // sqlServerOptionalKVArgs are the arguments supplied to newSQLServer which are @@ -1065,7 +1063,7 @@ func newSQLServer(ctx context.Context, cfg sqlServerArgs) (*SQLServer, error) { TenantReadOnly: cfg.SQLConfig.TenantReadOnly, CidrLookup: cfg.BaseConfig.CidrLookup, LicenseEnforcer: cfg.SQLConfig.LicenseEnforcer, - QueueManager: queuefeed.NewManager(ctx, cfg.internalDB, cfg.rangeFeedFactory, cfg.rangeDescIteratorFactory, codec, leaseMgr), + QueueManager: queuefeed.NewManager(ctx, cfg.internalDB, cfg.rangeFeedFactory, cfg.rangeDescIteratorFactory, codec, leaseMgr, cfg.sqlLivenessProvider.CachedReader()), } if codec.ForSystemTenant() { @@ -1462,7 +1460,6 @@ func newSQLServer(ctx context.Context, cfg sqlServerArgs) (*SQLServer, error) { internalDBMemMonitor: internalDBMonitor, upgradeManager: upgradeMgr, serviceMode: cfg.serviceMode, - queueManager: queuefeed.NewManager(ctx, cfg.internalDB, cfg.rangeFeedFactory, cfg.rangeDescIteratorFactory, execCfg.Codec, leaseMgr), }, nil } @@ -1796,6 +1793,11 @@ func (s *SQLServer) preStart( s.startLicenseEnforcer(ctx, knobs) + // Close queue manager when the stopper stops. + stopper.AddCloser(stop.CloserFn(func() { + s.execCfg.QueueManager.Close() + })) + // Report a warning if the server is being shut down via the stopper // before it was gracefully drained. This warning may be innocuous // in tests where there is no use of the test server/cluster after diff --git a/pkg/sql/queuefeed/BUILD.bazel b/pkg/sql/queuefeed/BUILD.bazel index f4ee09cb6cbd..e89ed932ed76 100644 --- a/pkg/sql/queuefeed/BUILD.bazel +++ b/pkg/sql/queuefeed/BUILD.bazel @@ -14,8 +14,6 @@ go_library( deps = [ "//pkg/ccl/changefeedccl/changefeedbase", "//pkg/keys", - "//pkg/kv", - "//pkg/kv/kvclient/kvcoord", "//pkg/kv/kvclient/rangecache", "//pkg/kv/kvclient/rangefeed", "//pkg/kv/kvpb", @@ -33,6 +31,7 @@ go_library( "//pkg/sql/sqlliveness", "//pkg/util", "//pkg/util/hlc", + "//pkg/util/log", "//pkg/util/rangedesc", "//pkg/util/syncutil", "//pkg/util/timeutil", @@ -53,7 +52,7 @@ go_test( "smoke_test.go", ], embed = [":queuefeed"], - shard_count = 4, + shard_count = 4, deps = [ "//pkg/base", "//pkg/kv/kvclient/rangefeed", @@ -69,12 +68,14 @@ go_test( "//pkg/sql/queuefeed/queuebase", "//pkg/sql/sem/tree", "//pkg/sql/sqlliveness", + "//pkg/sql/sqlliveness/slstorage", "//pkg/testutils/serverutils", "//pkg/testutils/sqlutils", "//pkg/testutils/testcluster", "//pkg/util/leaktest", "//pkg/util/log", "//pkg/util/randutil", + "//pkg/util/rangedesc", "//pkg/util/uuid", "@com_github_stretchr_testify//assert", "@com_github_stretchr_testify//require", diff --git a/pkg/sql/queuefeed/manager.go b/pkg/sql/queuefeed/manager.go index fc7fd95070fc..a10623796624 100644 --- a/pkg/sql/queuefeed/manager.go +++ b/pkg/sql/queuefeed/manager.go @@ -4,6 +4,8 @@ package queuefeed import ( "context" "fmt" + "sync" + "time" "github.com/cockroachdb/cockroach/pkg/keys" "github.com/cockroachdb/cockroach/pkg/kv/kvclient/rangecache" @@ -16,7 +18,9 @@ import ( "github.com/cockroachdb/cockroach/pkg/sql/queuefeed/queuebase" "github.com/cockroachdb/cockroach/pkg/sql/sem/tree" "github.com/cockroachdb/cockroach/pkg/sql/sessiondata" + "github.com/cockroachdb/cockroach/pkg/sql/sqlliveness" "github.com/cockroachdb/cockroach/pkg/util/hlc" + "github.com/cockroachdb/cockroach/pkg/util/log" "github.com/cockroachdb/cockroach/pkg/util/rangedesc" "github.com/cockroachdb/cockroach/pkg/util/syncutil" "github.com/cockroachdb/errors" @@ -25,31 +29,55 @@ import ( // 
watch queue partition table // and create it too?? type Manager struct { - executor isql.DB - rff *rangefeed.Factory - rdi rangedesc.IteratorFactory - rc *rangecache.RangeCache - codec keys.SQLCodec - leaseMgr *lease.Manager + executor isql.DB + rff *rangefeed.Factory + rdi rangedesc.IteratorFactory + rc *rangecache.RangeCache + codec keys.SQLCodec + leaseMgr *lease.Manager + sqlLivenessReader sqlliveness.Reader mu struct { syncutil.Mutex queueAssignment map[string]*PartitionAssignments } + + // watchCtx and watchCancel are used to control the watchForDeadSessions goroutine. + watchCtx context.Context + watchCancel context.CancelFunc + watchWg sync.WaitGroup } func NewManager( - _ context.Context, + ctx context.Context, executor isql.DB, rff *rangefeed.Factory, rdi rangedesc.IteratorFactory, codec keys.SQLCodec, leaseMgr *lease.Manager, + sqlLivenessReader sqlliveness.Reader, ) *Manager { // setup rangefeed on partitions table (/poll) // handle handoff from one server to another - m := &Manager{executor: executor, rff: rff, rdi: rdi, codec: codec, leaseMgr: leaseMgr} + watchCtx, watchCancel := context.WithCancel(ctx) + m := &Manager{ + executor: executor, + rff: rff, + rdi: rdi, + codec: codec, + leaseMgr: leaseMgr, + sqlLivenessReader: sqlLivenessReader, + watchCtx: watchCtx, + watchCancel: watchCancel, + } m.mu.queueAssignment = make(map[string]*PartitionAssignments) + + m.watchWg.Add(1) + go func() { + defer m.watchWg.Done() + m.watchForDeadSessions(watchCtx) + }() + return m } @@ -276,4 +304,141 @@ func (m *Manager) splitOnRanges(ctx context.Context, span roachpb.Span) ([]roach return spans, nil } +// A loop that looks for partitions that are assigned to sql liveness sessions +// that are no longer alive and removes all of their partition claims. (see the +// IsAlive method in the sqlliveness packages) +func (m *Manager) watchForDeadSessions(ctx context.Context) { + // Check for dead sessions every 10 seconds. + ticker := time.NewTicker(10 * time.Second) + defer ticker.Stop() + + for { + select { + case <-ctx.Done(): + return + case <-ticker.C: + if err := m.checkAndClearDeadSessions(ctx); err != nil { + log.Dev.Warningf(ctx, "error checking for dead sessions: %v", err) + } + } + } +} + +const listQueueFeedsSQL = `SELECT queue_feed_name FROM defaultdb.queue_feeds` + +// checkAndClearDeadSessions checks all partitions across all queues for dead sessions +// and clears their claims. +func (m *Manager) checkAndClearDeadSessions(ctx context.Context) error { + // Get all queue names. + var queueNames []string + err := m.executor.Txn(ctx, func(ctx context.Context, txn isql.Txn) error { + rows, err := txn.QueryBuffered(ctx, "list-queue-feeds", txn.KV(), listQueueFeedsSQL) + if err != nil { + return err + } + queueNames = make([]string, 0, len(rows)) + for _, row := range rows { + queueNames = append(queueNames, string(tree.MustBeDString(row[0]))) + } + return nil + }) + if err != nil { + return errors.Wrap(err, "listing queue feeds") + } + + // Check each queue for dead sessions. + for _, queueName := range queueNames { + if err := m.checkQueueForDeadSessions(ctx, queueName); err != nil { + log.Dev.Warningf(ctx, "error checking queue %s for dead sessions: %v", queueName, err) + // Continue checking other queues even if one fails. + } + } + + return nil +} + +// checkQueueForDeadSessions checks all partitions in a queue for dead sessions +// and clears their claims. 
+func (m *Manager) checkQueueForDeadSessions(ctx context.Context, queueName string) error { + pt := &partitionTable{queueName: queueName} + var partitionsToUpdate []Partition + + err := m.executor.Txn(ctx, func(ctx context.Context, txn isql.Txn) error { + partitions, err := pt.ListPartitions(ctx, txn) + if err != nil { + return err + } + + for _, partition := range partitions { + needsUpdate := false + updatedPartition := partition + + // Check if the Session is assigned to a dead session. + if partition.Session.LivenessID != "" { + alive, err := m.sqlLivenessReader.IsAlive(ctx, partition.Session.LivenessID) + if err != nil { + // If we can't determine liveness, err on the side of caution and don't clear. + log.Dev.Warningf(ctx, "error checking liveness for session %s: %v", partition.Session.LivenessID, err) + continue + } + if !alive { + // Session is dead. Clear the claim. + // If there's a successor, promote it to Session. + if !partition.Successor.Empty() { + updatedPartition.Session = partition.Successor + updatedPartition.Successor = Session{} + } else { + updatedPartition.Session = Session{} + } + needsUpdate = true + } + } + + // Check if the Successor is assigned to a dead session. + if partition.Successor.LivenessID != "" { + alive, err := m.sqlLivenessReader.IsAlive(ctx, partition.Successor.LivenessID) + if err != nil { + log.Dev.Warningf(ctx, "error checking liveness for successor session %s: %v", partition.Successor.LivenessID, err) + continue + } + if !alive { + // Successor session is dead. Clear it. + updatedPartition.Successor = Session{} + needsUpdate = true + } + } + + if needsUpdate { + partitionsToUpdate = append(partitionsToUpdate, updatedPartition) + } + } + + return nil + }) + if err != nil { + return errors.Wrapf(err, "listing partitions for queue %s", queueName) + } + + // Update partitions that need to be cleared. + if len(partitionsToUpdate) > 0 { + return m.executor.Txn(ctx, func(ctx context.Context, txn isql.Txn) error { + for _, partition := range partitionsToUpdate { + if err := pt.UpdatePartition(ctx, txn, partition); err != nil { + return errors.Wrapf(err, "updating partition %d for queue %s", partition.ID, queueName) + } + fmt.Printf("pruning dead sessions: updated partition %d for queue %s\n", partition.ID, queueName) + } + return nil + }) + } + + return nil +} + +// Close stops the Manager and waits for all background goroutines to exit. 
+func (m *Manager) Close() { + m.watchCancel() + m.watchWg.Wait() +} + var _ queuebase.Manager = &Manager{} diff --git a/pkg/sql/queuefeed/manager_test.go b/pkg/sql/queuefeed/manager_test.go index 1d05a298b399..273771efb226 100644 --- a/pkg/sql/queuefeed/manager_test.go +++ b/pkg/sql/queuefeed/manager_test.go @@ -12,6 +12,8 @@ import ( "github.com/cockroachdb/cockroach/pkg/sql/catalog/descpb" "github.com/cockroachdb/cockroach/pkg/sql/catalog/lease" "github.com/cockroachdb/cockroach/pkg/sql/isql" + "github.com/cockroachdb/cockroach/pkg/sql/sqlliveness" + "github.com/cockroachdb/cockroach/pkg/sql/sqlliveness/slstorage" "github.com/cockroachdb/cockroach/pkg/testutils/serverutils" "github.com/cockroachdb/cockroach/pkg/testutils/sqlutils" "github.com/cockroachdb/cockroach/pkg/util/leaktest" @@ -25,7 +27,7 @@ import ( func NewTestManager(t *testing.T, a serverutils.ApplicationLayerInterface) *Manager { ctx := context.Background() db := a.InternalDB().(isql.DB) - m := NewManager(ctx, db, a.RangeFeedFactory().(*rangefeed.Factory), a.RangeDescIteratorFactory().(rangedesc.IteratorFactory), a.Codec(), a.LeaseManager().(*lease.Manager)) + m := NewManager(ctx, db, a.RangeFeedFactory().(*rangefeed.Factory), a.RangeDescIteratorFactory().(rangedesc.IteratorFactory), a.Codec(), a.LeaseManager().(*lease.Manager), nil) require.NotNil(t, m.codec) return m } @@ -186,3 +188,162 @@ func TestFeedCreationPartitions(t *testing.T) { }) require.NoError(t, err) } + +func TestWatchForDeadSessions(t *testing.T) { + defer leaktest.AfterTest(t)() + defer log.Scope(t).Close(t) + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + srv, conn, _ := serverutils.StartServer(t, base.TestServerArgs{}) + defer srv.Stopper().Stop(ctx) + + // Create a fake SQL liveness storage for testing + fakeStorage := slstorage.NewFakeStorage() + + // Create a manager with the fake storage + db := srv.ApplicationLayer().InternalDB().(isql.DB) + m := NewManager( + ctx, db, srv.ApplicationLayer().RangeFeedFactory().(*rangefeed.Factory), + srv.ApplicationLayer().RangeDescIteratorFactory().(rangedesc.IteratorFactory), srv.ApplicationLayer().Codec(), srv.ApplicationLayer().LeaseManager().(*lease.Manager), + fakeStorage, + ) + + // Create a queue + sqlDB := sqlutils.MakeSQLRunner(conn) + sqlDB.Exec(t, `CREATE TABLE t (a string PRIMARY KEY)`) + var tableID int64 + sqlDB.QueryRow(t, "SELECT id FROM system.namespace where name = 't'").Scan(&tableID) + + // Create multiple ranges BEFORE creating the queue to ensure we have enough partitions + sqlDB.Exec(t, `INSERT INTO t (a) SELECT generate_series(1, 100)`) + sqlDB.Exec(t, `ALTER TABLE t SPLIT AT VALUES ('10'), ('20'), ('30'), ('40'), ('50'), ('60'), ('70'), ('80'), ('90')`) + sqlDB.Exec(t, `ALTER TABLE t SCATTER`) + + // Now create the queue - it will create partitions for all the ranges + require.NoError(t, m.CreateQueue(ctx, "test", tableID)) + + // Get partitions + pt := &partitionTable{queueName: "test"} + var partitions []Partition + err := db.Txn(ctx, func(ctx context.Context, txn isql.Txn) error { + var err error + partitions, err = pt.ListPartitions(ctx, txn) + return err + }) + require.NoError(t, err) + require.GreaterOrEqual(t, len(partitions), 4, "should have at least 4 partitions for this test") + + // Create two sessions with liveness IDs + deadSessionID := sqlliveness.SessionID("dead-session") + aliveSessionID := sqlliveness.SessionID("alive-session") + successorSessionID := sqlliveness.SessionID("successor-session") + + deadSession := Session{ + ConnectionID: 
uuid.MakeV4(), + LivenessID: deadSessionID, + } + aliveSession := Session{ + ConnectionID: uuid.MakeV4(), + LivenessID: aliveSessionID, + } + successorSession := Session{ + ConnectionID: uuid.MakeV4(), + LivenessID: successorSessionID, + } + + // Mark sessions as alive in fake storage + clock := srv.ApplicationLayer().Clock() + expiration := clock.Now().Add(10*time.Second.Nanoseconds(), 0) + require.NoError(t, fakeStorage.Insert(ctx, deadSessionID, expiration)) + require.NoError(t, fakeStorage.Insert(ctx, aliveSessionID, expiration)) + require.NoError(t, fakeStorage.Insert(ctx, successorSessionID, expiration)) + + // Assign some partitions to the dead session + deadPartition := partitions[0] + deadPartition.Session = deadSession + err = db.Txn(ctx, func(ctx context.Context, txn isql.Txn) error { + return pt.UpdatePartition(ctx, txn, deadPartition) + }) + require.NoError(t, err) + + // Assign a partition with a dead session and a successor + deadWithSuccessorPartition := partitions[1] + deadWithSuccessorPartition.Session = deadSession + deadWithSuccessorPartition.Successor = successorSession + err = db.Txn(ctx, func(ctx context.Context, txn isql.Txn) error { + return pt.UpdatePartition(ctx, txn, deadWithSuccessorPartition) + }) + require.NoError(t, err) + + // Assign a partition with a dead successor + deadSuccessorPartition := partitions[2] + deadSuccessorPartition.Session = aliveSession + deadSuccessorPartition.Successor = deadSession + err = db.Txn(ctx, func(ctx context.Context, txn isql.Txn) error { + return pt.UpdatePartition(ctx, txn, deadSuccessorPartition) + }) + require.NoError(t, err) + + // Assign a partition to an alive session (should remain unchanged) + alivePartition := partitions[3] + alivePartition.Session = aliveSession + err = db.Txn(ctx, func(ctx context.Context, txn isql.Txn) error { + return pt.UpdatePartition(ctx, txn, alivePartition) + }) + require.NoError(t, err) + + // Mark the dead session as dead by deleting it from fake storage + require.NoError(t, fakeStorage.Delete(ctx, deadSessionID)) + + // Check for dead sessions + require.NoError(t, m.checkQueueForDeadSessions(ctx, "test")) + + // Verify results + err = db.Txn(ctx, func(ctx context.Context, txn isql.Txn) error { + updatedPartitions, err := pt.ListPartitions(ctx, txn) + require.NoError(t, err) + + // Find the partitions we updated + var foundDeadPartition, foundDeadWithSuccessorPartition, foundDeadSuccessorPartition, foundAlivePartition *Partition + for i := range updatedPartitions { + p := &updatedPartitions[i] + if p.ID == deadPartition.ID { + foundDeadPartition = p + } else if p.ID == deadWithSuccessorPartition.ID { + foundDeadWithSuccessorPartition = p + } else if p.ID == deadSuccessorPartition.ID { + foundDeadSuccessorPartition = p + } else if p.ID == alivePartition.ID { + foundAlivePartition = p + } + } + + // Dead session partition should be cleared + require.NotNil(t, foundDeadPartition, "should find dead partition") + assert.True(t, foundDeadPartition.Session.Empty(), "dead session partition should be cleared") + assert.True(t, foundDeadPartition.Successor.Empty(), "dead session partition should have no successor") + + // Dead session with successor should promote successor to session + require.NotNil(t, foundDeadWithSuccessorPartition, "should find dead with successor partition") + assert.Equal(t, successorSession, foundDeadWithSuccessorPartition.Session, "successor should be promoted to session") + assert.True(t, foundDeadWithSuccessorPartition.Successor.Empty(), "successor should be cleared") + 
+ // Dead successor should be cleared + require.NotNil(t, foundDeadSuccessorPartition, "should find dead successor partition") + assert.Equal(t, aliveSession, foundDeadSuccessorPartition.Session, "alive session should remain") + assert.True(t, foundDeadSuccessorPartition.Successor.Empty(), "dead successor should be cleared") + + // Alive session partition should remain unchanged + require.NotNil(t, foundAlivePartition, "should find alive partition") + assert.Equal(t, aliveSession, foundAlivePartition.Session, "alive session should remain unchanged") + assert.True(t, foundAlivePartition.Successor.Empty(), "alive partition should have no successor") + + return nil + }) + require.NoError(t, err) + + // Close the manager to wait for the watchForDeadSessions goroutine to exit + m.Close() +} From cd24183f34372654ec56182f98a44ae478548427 Mon Sep 17 00:00:00 2001 From: Jeff Swenson Date: Thu, 13 Nov 2025 20:09:45 -0500 Subject: [PATCH 38/46] queuefeed: call close on manager in tests --- pkg/sql/queuefeed/assignments_test.go | 2 ++ pkg/sql/queuefeed/manager_test.go | 6 ++++-- pkg/sql/queuefeed/reader_test.go | 3 +++ pkg/sql/queuefeed/smoke_test.go | 1 + 4 files changed, 10 insertions(+), 2 deletions(-) diff --git a/pkg/sql/queuefeed/assignments_test.go b/pkg/sql/queuefeed/assignments_test.go index 38a1c9727690..f0b3088d589d 100644 --- a/pkg/sql/queuefeed/assignments_test.go +++ b/pkg/sql/queuefeed/assignments_test.go @@ -33,6 +33,7 @@ func TestPartitionAssignments(t *testing.T) { // Create queue using QueueManager manager := queuefeed.NewTestManager(t, s.ApplicationLayer()) + defer manager.Close() queueName := "test_queue" err := manager.CreateQueue(ctx, queueName, tableDescID) require.NoError(t, err) @@ -77,6 +78,7 @@ func TestPartitionReassignments(t *testing.T) { // Create queue using QueueManager manager := queuefeed.NewTestManager(t, s.ApplicationLayer()) + defer manager.Close() queueName := "test_queue" err := manager.CreateQueue(ctx, queueName, tableDescID) require.NoError(t, err) diff --git a/pkg/sql/queuefeed/manager_test.go b/pkg/sql/queuefeed/manager_test.go index 273771efb226..376a5a3b7662 100644 --- a/pkg/sql/queuefeed/manager_test.go +++ b/pkg/sql/queuefeed/manager_test.go @@ -25,10 +25,10 @@ import ( ) func NewTestManager(t *testing.T, a serverutils.ApplicationLayerInterface) *Manager { - ctx := context.Background() db := a.InternalDB().(isql.DB) - m := NewManager(ctx, db, a.RangeFeedFactory().(*rangefeed.Factory), a.RangeDescIteratorFactory().(rangedesc.IteratorFactory), a.Codec(), a.LeaseManager().(*lease.Manager), nil) + m := NewManager(context.Background(), db, a.RangeFeedFactory().(*rangefeed.Factory), a.RangeDescIteratorFactory().(rangedesc.IteratorFactory), a.Codec(), a.LeaseManager().(*lease.Manager), nil) require.NotNil(t, m.codec) + t.Cleanup(m.Close) return m } @@ -42,6 +42,7 @@ func TestFeedCreation(t *testing.T) { // expect an error when trying to read from a queue that doesn't exist qm := NewTestManager(t, srv.ApplicationLayer()) + defer qm.Close() _, err := qm.CreateReaderForSession(context.Background(), "test", Session{ ConnectionID: uuid.MakeV4(), LivenessID: "", @@ -107,6 +108,7 @@ func TestFeedCreationPartitions(t *testing.T) { var tableID int64 db.QueryRow(t, "SELECT id FROM system.namespace where name = 't'").Scan(&tableID) qm := NewTestManager(t, srv.ApplicationLayer()) + defer qm.Close() require.NoError(t, qm.CreateQueue(ctx, "test", tableID)) // Get the table descriptor to determine the primary index span. 
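
The defer and t.Cleanup calls added in this patch lean on Manager.Close from the previous commit: Close cancels the watch context and then blocks on the WaitGroup until watchForDeadSessions has returned, so leaktest does not flag the ticker goroutine. A minimal standalone sketch of that shutdown pattern follows; watcher, newWatcher, and check are illustrative names for this sketch, not the queuefeed API.

package main

import (
	"context"
	"fmt"
	"sync"
	"time"
)

// watcher mirrors the Manager's shutdown shape: a ticker-driven loop on a
// cancelable context, plus a WaitGroup so Close blocks until the loop exits.
type watcher struct {
	cancel context.CancelFunc
	wg     sync.WaitGroup
}

func newWatcher(ctx context.Context, interval time.Duration, check func(context.Context) error) *watcher {
	ctx, cancel := context.WithCancel(ctx)
	w := &watcher{cancel: cancel}
	w.wg.Add(1)
	go func() {
		defer w.wg.Done()
		ticker := time.NewTicker(interval)
		defer ticker.Stop()
		for {
			select {
			case <-ctx.Done():
				return
			case <-ticker.C:
				if err := check(ctx); err != nil {
					// Like the real loop, log and keep polling instead of exiting.
					fmt.Println("check failed:", err)
				}
			}
		}
	}()
	return w
}

// Close cancels the watch context and waits for the goroutine to finish,
// which is what makes registering it with defer or t.Cleanup in tests safe.
func (w *watcher) Close() {
	w.cancel()
	w.wg.Wait()
}

func main() {
	w := newWatcher(context.Background(), 10*time.Millisecond, func(context.Context) error {
		fmt.Println("tick")
		return nil
	})
	time.Sleep(35 * time.Millisecond)
	w.Close()
}
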
diff --git a/pkg/sql/queuefeed/reader_test.go b/pkg/sql/queuefeed/reader_test.go index 7f22d0475f80..c2c908d3e74e 100644 --- a/pkg/sql/queuefeed/reader_test.go +++ b/pkg/sql/queuefeed/reader_test.go @@ -32,6 +32,7 @@ func TestReaderBasic(t *testing.T) { db.QueryRow(t, "SELECT id FROM system.namespace WHERE name = 't'").Scan(&tableID) qm := NewTestManager(t, srv.ApplicationLayer()) + defer qm.Close() require.NoError(t, qm.CreateQueue(ctx, "test_queue", tableID)) reader, err := qm.CreateReaderForSession(ctx, "test_queue", Session{ @@ -66,6 +67,7 @@ func TestReaderRollback(t *testing.T) { db.QueryRow(t, "SELECT id FROM system.namespace WHERE name = 't'").Scan(&tableID) qm := NewTestManager(t, srv.ApplicationLayer()) + defer qm.Close() require.NoError(t, qm.CreateQueue(ctx, "rollback_test", tableID)) reader, err := qm.CreateReaderForSession(ctx, "rollback_test", Session{ @@ -119,6 +121,7 @@ func TestCheckpointRestoration(t *testing.T) { db.QueryRow(t, "SELECT id FROM system.namespace WHERE name = 't'").Scan(&tableID) qm := NewTestManager(t, srv.ApplicationLayer()) + defer qm.Close() require.NoError(t, qm.CreateQueue(ctx, "checkpoint_test", tableID)) session := Session{ diff --git a/pkg/sql/queuefeed/smoke_test.go b/pkg/sql/queuefeed/smoke_test.go index 6f0dfc2c9e76..219d61d39618 100644 --- a/pkg/sql/queuefeed/smoke_test.go +++ b/pkg/sql/queuefeed/smoke_test.go @@ -97,6 +97,7 @@ func TestQueuefeedSmoketestMultipleRanges(t *testing.T) { // Create two managers for separate readers qm := NewTestManager(t, srv.ApplicationLayer()) + defer qm.Close() newReader := func(session uuid.UUID) *Reader { qm.mu.Lock() defer qm.mu.Unlock() From b36402a2786e582d8d1186730b02a3eb185f2aad Mon Sep 17 00:00:00 2001 From: Jeff Swenson Date: Thu, 13 Nov 2025 17:40:43 -0500 Subject: [PATCH 39/46] queuefeed: wire up partition cache to the assigner --- pkg/sql/queuefeed/BUILD.bazel | 5 +- pkg/sql/queuefeed/assignments.go | 363 ++++++++++++++++++++------ pkg/sql/queuefeed/assignments_test.go | 77 +----- pkg/sql/queuefeed/manager.go | 17 +- pkg/sql/queuefeed/manager_test.go | 106 -------- pkg/sql/queuefeed/partitions.go | 193 ++++++++++++-- pkg/sql/queuefeed/partitions_test.go | 130 +++++++++ pkg/sql/queuefeed/reader.go | 34 ++- pkg/sql/queuefeed/smoke_test.go | 94 ++++++- 9 files changed, 712 insertions(+), 307 deletions(-) diff --git a/pkg/sql/queuefeed/BUILD.bazel b/pkg/sql/queuefeed/BUILD.bazel index e89ed932ed76..4c3ee5d79d0d 100644 --- a/pkg/sql/queuefeed/BUILD.bazel +++ b/pkg/sql/queuefeed/BUILD.bazel @@ -29,6 +29,7 @@ go_library( "//pkg/sql/sem/tree", "//pkg/sql/sessiondata", "//pkg/sql/sqlliveness", + "//pkg/sql/types", "//pkg/util", "//pkg/util/hlc", "//pkg/util/log", @@ -61,14 +62,13 @@ go_test( "//pkg/security/securitytest", "//pkg/server", "//pkg/sql", - "//pkg/sql/catalog", - "//pkg/sql/catalog/descpb", "//pkg/sql/catalog/lease", "//pkg/sql/isql", "//pkg/sql/queuefeed/queuebase", "//pkg/sql/sem/tree", "//pkg/sql/sqlliveness", "//pkg/sql/sqlliveness/slstorage", + "//pkg/testutils", "//pkg/testutils/serverutils", "//pkg/testutils/sqlutils", "//pkg/testutils/testcluster", @@ -77,6 +77,7 @@ go_test( "//pkg/util/randutil", "//pkg/util/rangedesc", "//pkg/util/uuid", + "@com_github_cockroachdb_errors//:errors", "@com_github_stretchr_testify//assert", "@com_github_stretchr_testify//require", "@org_golang_x_sync//errgroup", diff --git a/pkg/sql/queuefeed/assignments.go b/pkg/sql/queuefeed/assignments.go index 96ef9c462be1..ca302392637c 100644 --- a/pkg/sql/queuefeed/assignments.go +++ 
b/pkg/sql/queuefeed/assignments.go @@ -1,12 +1,12 @@ package queuefeed import ( - "cmp" "context" - "slices" + "time" "github.com/cockroachdb/cockroach/pkg/roachpb" "github.com/cockroachdb/cockroach/pkg/sql/isql" + "github.com/cockroachdb/cockroach/pkg/util/syncutil" "github.com/cockroachdb/errors" ) @@ -31,54 +31,73 @@ func (a *Assignment) Spans() []roachpb.Span { type PartitionAssignments struct { db isql.DB partitionTable *partitionTable + + refresh struct { + // Lock ordering: refresh may be locked before mu + lastRefresh time.Time + syncutil.Mutex + } + + mu struct { + syncutil.Mutex + cache partitionCache + } } -func NewPartitionAssignments(db isql.DB, queueName string) *PartitionAssignments { - return &PartitionAssignments{ +func NewPartitionAssignments(db isql.DB, queueName string) (*PartitionAssignments, error) { + pa := &PartitionAssignments{ db: db, partitionTable: &partitionTable{queueName: queueName}, } + + var partitions []Partition + err := db.Txn(context.Background(), func(ctx context.Context, txn isql.Txn) error { + var err error + partitions, err = pa.partitionTable.ListPartitions(ctx, txn) + return err + }) + if err != nil { + return nil, errors.Wrap(err, "unable to load initial partitions") + } + + pa.mu.cache.Init(partitions) + pa.refresh.lastRefresh = time.Now() + + return pa, nil } -// RefreshAssignment refreshes the assignment for the given session. It returnrns -// nil if the assignment has not changed. -// -// If the session is caught up (i.e. it has proceessed up to a recent timestamp -// for all assigned partitions), then it may be assigned new partitions. -// -// If a partition has a successor session, then calling RefreshAssignment will -// return an assignment that does not include that partition. -func (p *PartitionAssignments) RefreshAssignment( - ctx context.Context, session Session, caughtUp bool, -) (updatedAssignment *Assignment, err error) { - // find my assignments and see if any of them have a successor session. return the ones that don't. - // TODO: this should be done in sql - // TODO: this handles partition handoff, but not hand...on... (?) 
- var myPartitions []Partition - anyChanged := false - err = p.db.Txn(ctx, func(ctx context.Context, txn isql.Txn) error { - partitions, err := p.partitionTable.ListPartitions(ctx, txn) - if err != nil { - return err - } - for _, partition := range partitions { - if partition.Session != session { - continue - } - if !partition.Successor.Empty() { - anyChanged = true - continue - } - myPartitions = append(myPartitions, partition) - } +func (p *PartitionAssignments) maybeRefreshCache() error { + // TODO handle deletions + // TODO add a version mechanism to avoid races between write through updates and refereshes + // TODO use a rangefeed instead of polling + + p.refresh.Lock() + defer p.refresh.Unlock() + + if time.Since(p.refresh.lastRefresh) < 5*time.Second { return nil + } + + var partitions []Partition + err := p.db.Txn(context.Background(), func(ctx context.Context, txn isql.Txn) error { + var err error + partitions, err = p.partitionTable.ListPartitions(ctx, txn) + return err }) - if !anyChanged { - return nil, nil + if err != nil { + return err + } + + updates := make(map[int64]Partition) + for _, partition := range partitions { + updates[partition.ID] = partition } - slices.SortFunc(myPartitions, func(a, b Partition) int { return cmp.Compare(a.ID, b.ID) }) - return &Assignment{Session: session, Partitions: myPartitions}, nil + p.mu.Lock() + defer p.mu.Unlock() + p.mu.cache.Update(updates) + p.refresh.lastRefresh = time.Now() + return nil } // RegisterSession registers a new session. The session may be assigned zero @@ -88,78 +107,250 @@ func (p *PartitionAssignments) RefreshAssignment( func (p *PartitionAssignments) RegisterSession( ctx context.Context, session Session, ) (*Assignment, error) { - // TODO(jeffswenson): this is a stub implementation that simply assigns all - // unclaimed partitions to the current session. + if err := p.maybeRefreshCache(); err != nil { + return nil, errors.Wrap(err, "refreshing partition cache") + } - var result *Assignment - err := p.db.Txn(ctx, func(ctx context.Context, txn isql.Txn) error { - result = &Assignment{Session: session} + var err error + var done bool + for !done { + tryClaim, trySteal := func() (Partition, Partition) { + p.mu.Lock() + defer p.mu.Unlock() + return p.mu.cache.planRegister(session, p.mu.cache) + }() + switch { + case !tryClaim.Empty(): + err, done = p.tryClaim(session, tryClaim) + if err != nil { + return nil, errors.Wrap(err, "claiming partition") + } + case !trySteal.Empty(): + err, done = p.trySteal(session, trySteal) + if err != nil { + return nil, errors.Wrap(err, "stealing partition") + } + default: + done = true + } + } + return p.constructAssignment(session), nil +} + +func (p *PartitionAssignments) tryClaim(session Session, toClaim Partition) (error, bool) { + var updates map[int64]Partition + var done bool + err := p.db.Txn(context.Background(), func(ctx context.Context, txn isql.Txn) error { + done, updates = false, nil - partitions, err := p.partitionTable.ListPartitions(ctx, txn) - if err != nil { + var err error + updates, err = p.anyStale(ctx, txn, []Partition{toClaim}) + if err != nil || len(updates) != 0 { return err } - for _, partition := range partitions { - // TODO we really shouldn't force assign partitions, but we are not watch - // sql liveness so we can't detect dead sessions yet. 
- //if !partition.Session.Empty() { - // continue - //} - partition.Session = session - if err := p.partitionTable.UpdatePartition(ctx, txn, partition); err != nil { - return errors.Wrapf(err, "updating partition %d for session %s", partition.ID, session.ConnectionID) - } - result.Partitions = append(result.Partitions, partition) + + updates = make(map[int64]Partition) + toClaim.Session = session + updates[toClaim.ID] = toClaim + if err := p.partitionTable.UpdatePartition(ctx, txn, toClaim); err != nil { + return err } + done = true return nil }) if err != nil { - return nil, errors.Wrap(err, "registering session") + return err, false } - return result, nil + + p.mu.Lock() + defer p.mu.Unlock() + p.mu.cache.Update(updates) + return nil, done } -func (p *PartitionAssignments) UnregisterSession(ctx context.Context, session Session) error { - // TODO: this should probably be pushed onto some task queue that is - // independent of the pgwire session so we can retry without block connection - // cleanup. - return p.db.Txn(ctx, func(ctx context.Context, txn isql.Txn) error { - partitions, err := p.partitionTable.ListPartitions(ctx, txn) - if err != nil { +func (p *PartitionAssignments) trySteal(session Session, toSteal Partition) (error, bool) { + var updates map[int64]Partition + var done bool + err := p.db.Txn(context.Background(), func(ctx context.Context, txn isql.Txn) error { + done, updates = false, nil + + var err error + updates, err = p.anyStale(ctx, txn, []Partition{toSteal}) + if err != nil || len(updates) != 0 { return err } - for _, partition := range partitions { - if partition.Session == session { - partition.Session = partition.Successor - if err := p.partitionTable.UpdatePartition(ctx, txn, partition); err != nil { - return errors.Wrapf(err, "updating partition %d for session %s", partition.ID, session.ConnectionID) - } + + updates = make(map[int64]Partition) + toSteal.Successor = session + updates[toSteal.ID] = toSteal + if err := p.partitionTable.UpdatePartition(ctx, txn, toSteal); err != nil { + return err + } + + done = true + return nil + }) + if err != nil { + return err, false + } + + p.mu.Lock() + defer p.mu.Unlock() + p.mu.cache.Update(updates) + return nil, done +} + +// RefreshAssignment refreshes the assignment for the given session. It returnrns +// nil if the assignment has not changed. +// +// If the session is caught up (i.e. it has proceessed up to a recent timestamp +// for all assigned partitions), then it may be assigned new partitions. +// +// If a partition has a successor session, then calling RefreshAssignment will +// return an assignment that does not include that partition. 
+func (p *PartitionAssignments) RefreshAssignment( + ctx context.Context, assignment *Assignment, caughtUp bool, +) (*Assignment, error) { + if err := p.maybeRefreshCache(); err != nil { + return nil, errors.Wrap(err, "refreshing partition cache") + } + + var done bool + var err error + for !done { + tryRelease, tryClaim, trySteal := func() ([]Partition, Partition, Partition) { + p.mu.Lock() + defer p.mu.Unlock() + return p.mu.cache.planAssignment(assignment.Session, caughtUp, p.mu.cache) + }() + switch { + case len(tryRelease) != 0: + err, done = p.tryRelease(assignment.Session, tryRelease) + if err != nil { + return nil, errors.Wrap(err, "releasing partition") + } + case !tryClaim.Empty(): + err, done = p.tryClaim(assignment.Session, tryClaim) + if err != nil { + return nil, errors.Wrap(err, "claiming partition") } - if partition.Successor == session { - partition.Successor = Session{} - if err := p.partitionTable.UpdatePartition(ctx, txn, partition); err != nil { - return errors.Wrapf(err, "updating partition %d for session %s", partition.ID, session.ConnectionID) - } + case !trySteal.Empty(): + err, done = p.trySteal(assignment.Session, trySteal) + if err != nil { + return nil, errors.Wrap(err, "stealing partition") } + default: + stale := func() bool { + p.mu.Lock() + defer p.mu.Unlock() + return p.mu.cache.isStale(assignment) + }() + if !stale { + return nil, nil + } + done = true } + } + if err != nil { + return nil, err + } + return p.constructAssignment(assignment.Session), nil +} + +// anyStale checks if any of the provided partitions have become stale by +// comparing them with the current state in the database. Returns a map of +// partition ID to the updated partition that can be applied to the cache. +func (p *PartitionAssignments) anyStale( + ctx context.Context, txn isql.Txn, partitions []Partition, +) (map[int64]Partition, error) { + if len(partitions) == 0 { + return make(map[int64]Partition), nil + } + + // Extract partition IDs + partitionIDs := make([]int64, len(partitions)) + for i, partition := range partitions { + partitionIDs[i] = partition.ID + } + + // Fetch current state from database + currentPartitions, err := p.partitionTable.FetchPartitions(ctx, txn, partitionIDs) + if err != nil { + return nil, err + } + + // Compare cached vs current state and collect stale partitions + stalePartitions := make(map[int64]Partition) + for _, cachedPartition := range partitions { + currentPartition := currentPartitions[cachedPartition.ID] + + // If partition was deleted from database, mark it as empty in updates + if currentPartition.Empty() { + stalePartitions[cachedPartition.ID] = Partition{} + } else if !cachedPartition.Equal(currentPartition) { + // If partition has changed, include the updated version + stalePartitions[cachedPartition.ID] = currentPartition + } + } + + return stalePartitions, nil +} + +func (p *PartitionAssignments) tryRelease(session Session, toRelease []Partition) (error, bool) { + var updates map[int64]Partition + var done bool + err := p.db.Txn(context.Background(), func(ctx context.Context, txn isql.Txn) error { + done, updates = false, nil + + var err error + updates, err = p.anyStale(ctx, txn, toRelease) + if err != nil || len(updates) != 0 { + return err + } + + updates = make(map[int64]Partition) + for _, partition := range toRelease { + partition.Session = partition.Successor + updates[partition.ID] = partition + if err := p.partitionTable.UpdatePartition(ctx, txn, partition); err != nil { + return err + } + } + + done = true return nil }) + if 
err != nil { + return err, false + } + + p.mu.Lock() + defer p.mu.Unlock() + p.mu.cache.Update(updates) + return nil, done +} + +func (p *PartitionAssignments) constructAssignment(session Session) *Assignment { + p.mu.Lock() + defer p.mu.Unlock() + return p.mu.cache.constructAssignment(session) } -// Try to claim a partition for the given session. -func (p *PartitionAssignments) TryClaim(ctx context.Context, session Session, partition Partition) (Partition, error) { - partition.Successor = session +func (p *PartitionAssignments) UnregisterSession(ctx context.Context, session Session) error { + var updates map[int64]Partition err := p.db.Txn(ctx, func(ctx context.Context, txn isql.Txn) error { - return p.partitionTable.UpdatePartition(ctx, txn, partition) + var err error + updates, err = p.partitionTable.UnregisterSession(ctx, txn, session) + return err }) if err != nil { - return Partition{}, errors.Wrapf(err, "updating partition %d for session %s", partition.ID, session.ConnectionID) + return err } - return partition, nil -} -func (p *PartitionAssignments) tryRelease(session Session, toRelease []Partition) error { - // Release the given partitions from the session. + p.mu.Lock() + defer p.mu.Unlock() + p.mu.cache.Update(updates) + return nil } diff --git a/pkg/sql/queuefeed/assignments_test.go b/pkg/sql/queuefeed/assignments_test.go index f0b3088d589d..9655f46f00db 100644 --- a/pkg/sql/queuefeed/assignments_test.go +++ b/pkg/sql/queuefeed/assignments_test.go @@ -6,7 +6,6 @@ import ( "github.com/cockroachdb/cockroach/pkg/base" "github.com/cockroachdb/cockroach/pkg/sql" - "github.com/cockroachdb/cockroach/pkg/sql/isql" "github.com/cockroachdb/cockroach/pkg/sql/queuefeed" "github.com/cockroachdb/cockroach/pkg/sql/sqlliveness" "github.com/cockroachdb/cockroach/pkg/testutils/serverutils" @@ -38,7 +37,8 @@ func TestPartitionAssignments(t *testing.T) { err := manager.CreateQueue(ctx, queueName, tableDescID) require.NoError(t, err) - pa := queuefeed.NewPartitionAssignments(s.ExecutorConfig().(sql.ExecutorConfig).InternalDB, queueName) + pa, err := queuefeed.NewPartitionAssignments(s.ExecutorConfig().(sql.ExecutorConfig).InternalDB, queueName) + require.NoError(t, err) session := queuefeed.Session{ ConnectionID: uuid.MakeV4(), @@ -54,78 +54,13 @@ func TestPartitionAssignments(t *testing.T) { "SELECT sql_liveness_session, user_session FROM defaultdb.queue_partition_"+queueName, [][]string{{"1", session.ConnectionID.String()}}) + newAssignment, err := pa.RefreshAssignment(context.Background(), assignment, true) + require.NoError(t, err) + require.Nil(t, newAssignment) + require.NoError(t, pa.UnregisterSession(ctx, session)) tdb.CheckQueryResults(t, "SELECT sql_liveness_session, user_session FROM defaultdb.queue_partition_"+queueName, [][]string{{"NULL", "NULL"}}) } - -func TestPartitionReassignments(t *testing.T) { - defer leaktest.AfterTest(t)() - defer log.Scope(t).Close(t) - - ctx := context.Background() - s, sqlDB, _ := serverutils.StartServer(t, base.TestServerArgs{}) - defer s.Stopper().Stop(ctx) - - tdb := sqlutils.MakeSQLRunner(sqlDB) - // tdb.Exec(t, "CREATE TABLE test_table (id INT PRIMARY KEY, data TEXT)") // TODO: why does this fail with "empty encoded value"? 
- tdb.Exec(t, "CREATE TABLE test_table (a string)") - - var tableDescID int64 - tdb.QueryRow(t, "SELECT id FROM system.namespace WHERE name = 'test_table'").Scan(&tableDescID) - - // Create queue using QueueManager - manager := queuefeed.NewTestManager(t, s.ApplicationLayer()) - defer manager.Close() - queueName := "test_queue" - err := manager.CreateQueue(ctx, queueName, tableDescID) - require.NoError(t, err) - - pa := queuefeed.NewPartitionAssignments(s.ExecutorConfig().(sql.ExecutorConfig).InternalDB, queueName) - - session := queuefeed.Session{ - ConnectionID: uuid.MakeV4(), - LivenessID: sqlliveness.SessionID("1"), - } - reader, err := manager.CreateReaderForSession(ctx, "test_queue", session) - require.NoError(t, err) - - // get the session the reader is using - var partition queuefeed.Partition - pt := queuefeed.TestNewPartitionsTable(queueName) - err = s.ExecutorConfig().(sql.ExecutorConfig).InternalDB.Txn(ctx, func(ctx context.Context, txn isql.Txn) error { - partitions, err := pt.ListPartitions(ctx, txn) - require.NoError(t, err) - require.Len(t, partitions, 1) - partition = partitions[0] - return nil - }) - require.NoError(t, err) - - // some other session tries to claim the partition - someOtherSession := queuefeed.Session{ - ConnectionID: uuid.MakeV4(), - LivenessID: sqlliveness.SessionID("2"), - } - partition, err = pa.TryClaim(ctx, someOtherSession, partition) - require.NoError(t, err) - require.Equal(t, someOtherSession, partition.Successor) - - // do a read from the queue so it checks for a reassignment - tdb.Exec(t, "INSERT INTO test_table (a) VALUES ('test'), ('test2')") - rows, err := reader.GetRows(ctx, 1) - require.NoError(t, err) - require.Len(t, rows, 1) - // confirm receipt. it should then check for a reassignment and see that we disowned it - reader.ConfirmReceipt(ctx) - - // try to read again to see that it failed - // NOTE: we want this to not fail in the future but to sleep & poll instead. - // sooo maybe this isnt the best way to test this. but i cant think of a - // better way at this exact moment. 
- - rows, err = reader.GetRows(ctx, 1) - require.Error(t, err) -} diff --git a/pkg/sql/queuefeed/manager.go b/pkg/sql/queuefeed/manager.go index a10623796624..cf5d0f9d63ab 100644 --- a/pkg/sql/queuefeed/manager.go +++ b/pkg/sql/queuefeed/manager.go @@ -186,12 +186,6 @@ func (m *Manager) newReaderLocked( ) (*Reader, error) { var tableDescID int64 - assigner, ok := m.mu.queueAssignment[name] - if !ok { - assigner = NewPartitionAssignments(m.executor, name) - m.mu.queueAssignment[name] = assigner - } - // TODO: this ctx on the other hand should be stmt scoped err := m.executor.Txn(ctx, func(ctx context.Context, txn isql.Txn) error { _, err := txn.Exec(ctx, "create_q", txn.KV(), createQueueTableSQL) @@ -213,6 +207,17 @@ func (m *Manager) newReaderLocked( if err != nil { return nil, err } + + assigner, ok := m.mu.queueAssignment[name] + if !ok { + var err error + assigner, err = NewPartitionAssignments(m.executor, name) + if err != nil { + return nil, err + } + m.mu.queueAssignment[name] = assigner + } + fmt.Printf("get or init reader for queue %s with table desc id: %d\n", name, tableDescID) return NewReader(ctx, m.executor, m, m.rff, m.codec, m.leaseMgr, session, assigner, name) } diff --git a/pkg/sql/queuefeed/manager_test.go b/pkg/sql/queuefeed/manager_test.go index 376a5a3b7662..0090fab42e32 100644 --- a/pkg/sql/queuefeed/manager_test.go +++ b/pkg/sql/queuefeed/manager_test.go @@ -7,9 +7,6 @@ import ( "github.com/cockroachdb/cockroach/pkg/base" "github.com/cockroachdb/cockroach/pkg/kv/kvclient/rangefeed" - "github.com/cockroachdb/cockroach/pkg/roachpb" - "github.com/cockroachdb/cockroach/pkg/sql/catalog" - "github.com/cockroachdb/cockroach/pkg/sql/catalog/descpb" "github.com/cockroachdb/cockroach/pkg/sql/catalog/lease" "github.com/cockroachdb/cockroach/pkg/sql/isql" "github.com/cockroachdb/cockroach/pkg/sql/sqlliveness" @@ -88,109 +85,6 @@ func TestQueuefeedCtxCancel(t *testing.T) { require.Error(t, err) } -func TestFeedCreationPartitions(t *testing.T) { - defer leaktest.AfterTest(t)() - defer log.Scope(t).Close(t) - - ctx := context.Background() - srv, conn, _ := serverutils.StartServer(t, base.TestServerArgs{}) - defer srv.Stopper().Stop(ctx) - - // expect no error when creating a queue - db := sqlutils.MakeSQLRunner(conn) - db.Exec(t, `CREATE TABLE t (a string)`) - // split into 1k ranges - db.Exec(t, `INSERT INTO t (a) SELECT generate_series(1, 10000)`) - db.Exec(t, `ALTER TABLE t SPLIT AT (SELECT (i/10)::int FROM generate_series(1, 10000) AS g(i))`) - db.Exec(t, `ALTER TABLE t SCATTER`) - - // get table id - var tableID int64 - db.QueryRow(t, "SELECT id FROM system.namespace where name = 't'").Scan(&tableID) - qm := NewTestManager(t, srv.ApplicationLayer()) - defer qm.Close() - require.NoError(t, qm.CreateQueue(ctx, "test", tableID)) - - // Get the table descriptor to determine the primary index span. - leaseMgr := srv.ApplicationLayer().LeaseManager().(*lease.Manager) - descriptor, err := leaseMgr.Acquire(ctx, lease.TimestampToReadTimestamp(srv.ApplicationLayer().Clock().Now()), descpb.ID(tableID)) - require.NoError(t, err) - defer descriptor.Release(ctx) - tableDesc := descriptor.Underlying().(catalog.TableDescriptor) - primaryIndexSpan := tableDesc.PrimaryIndexSpan(qm.codec) - - // Count the number of partitions. 
- pt := &partitionTable{queueName: "test"} - err = srv.ApplicationLayer().InternalDB().(isql.DB).Txn(ctx, func(ctx context.Context, txn isql.Txn) error { - partitions, err := pt.ListPartitions(ctx, txn) - require.NoError(t, err) - require.GreaterOrEqual(t, len(partitions), 1000, "expected at least 1000 partitions") // It could be a bit more than 1k. - - partitionIDs := make(map[int64]bool) - var partitionSpans []roachpb.Span - for _, partition := range partitions { - // There should be no duplicate partition IDs. - assert.NotZero(t, partition.ID) - _, ok := partitionIDs[partition.ID] - assert.False(t, ok, "duplicate partition ID: %d", partition.ID) - partitionIDs[partition.ID] = true - - // The spans should be primary index only and not overlap and cover the entire primary index span. - partitionSpan := partition.Span - assert.True(t, partitionSpan.Valid()) - assert.True(t, primaryIndexSpan.Contains(partitionSpan)) - partitionSpans = append(partitionSpans, partitionSpan) - - assert.True(t, partition.Session.Empty(), "partition %d should not be assigned to a session", partition.ID) - assert.True(t, partition.Successor.Empty(), "partition %d should not have a successor", partition.ID) - } - - // Verify spans don't overlap by checking each pair. - for i, span1 := range partitionSpans { - for j, span2 := range partitionSpans { - if i < j { - // Spans should not overlap (they can be adjacent). - assert.False(t, span1.Overlaps(span2), - "partition spans should not overlap: span1=%v, span2=%v", span1, span2) - } - } - } - - // Verify spans cover the entire primary index span - var spanGroup roachpb.SpanGroup - spanGroup.Add(partitionSpans...) - mergedSpans := spanGroup.Slice() // should be a single span covering the entire primary index span - assert.Equal(t, 1, len(mergedSpans)) - assert.True(t, mergedSpans[0].Equal(primaryIndexSpan)) - - return nil - }) - require.NoError(t, err) - - // Start a reader and verify it reads all the partitions. 
- reader, err := qm.CreateReaderForSession(ctx, "test", Session{ - ConnectionID: uuid.MakeV4(), - LivenessID: "", - }) - require.NoError(t, err) - require.NotNil(t, reader) - defer func() { _ = reader.Close() }() - - err = srv.ApplicationLayer().InternalDB().(isql.DB).Txn(ctx, func(ctx context.Context, txn isql.Txn) error { - partitions, err := pt.ListPartitions(ctx, txn) - require.NoError(t, err) - - session := reader.session - for _, partition := range partitions { - assert.Equal(t, session, partition.Session) - assert.True(t, partition.Successor.Empty(), "partition %d should not have a successor", partition.ID) - } - - return nil - }) - require.NoError(t, err) -} - func TestWatchForDeadSessions(t *testing.T) { defer leaktest.AfterTest(t)() defer log.Scope(t).Close(t) diff --git a/pkg/sql/queuefeed/partitions.go b/pkg/sql/queuefeed/partitions.go index 91d51593ea84..8f2b24afd4b8 100644 --- a/pkg/sql/queuefeed/partitions.go +++ b/pkg/sql/queuefeed/partitions.go @@ -8,7 +8,9 @@ import ( "github.com/cockroachdb/cockroach/pkg/sql/isql" "github.com/cockroachdb/cockroach/pkg/sql/sem/tree" "github.com/cockroachdb/cockroach/pkg/sql/sqlliveness" + "github.com/cockroachdb/cockroach/pkg/sql/types" "github.com/cockroachdb/cockroach/pkg/util/uuid" + "github.com/cockroachdb/errors" ) type Partition struct { @@ -24,6 +26,38 @@ type Partition struct { Span roachpb.Span } +func PartitionFromDatums(row tree.Datums) (Partition, error) { + var session, successor Session + if !(row[1] == tree.DNull || row[2] == tree.DNull) { + session = Session{ + LivenessID: sqlliveness.SessionID(tree.MustBeDBytes(row[1])), + ConnectionID: tree.MustBeDUuid(row[2]).UUID, + } + } + if !(row[3] == tree.DNull || row[4] == tree.DNull) { + successor = Session{ + LivenessID: sqlliveness.SessionID(tree.MustBeDBytes(row[3])), + ConnectionID: tree.MustBeDUuid(row[4]).UUID, + } + } + + var span roachpb.Span + if row[5] != tree.DNull { + var err error + span, err = decodeSpan([]byte(*row[5].(*tree.DBytes))) + if err != nil { + return Partition{}, err + } + } + + return Partition{ + ID: int64(tree.MustBeDInt(row[0])), + Session: session, + Successor: successor, + Span: span, + }, nil +} + type partitionTable struct { queueName string } @@ -54,37 +88,97 @@ func (p *partitionTable) ListPartitions(ctx context.Context, txn isql.Txn) ([]Pa if err != nil { return nil, err } - partitions := make([]Partition, len(rows)) for i, row := range rows { - var session, successor Session - if !(row[1] == tree.DNull || row[2] == tree.DNull) { - session = Session{ - LivenessID: sqlliveness.SessionID(tree.MustBeDBytes(row[1])), - ConnectionID: tree.MustBeDUuid(row[2]).UUID, - } + var err error + partitions[i], err = PartitionFromDatums(row) + if err != nil { + return nil, err } - if !(row[3] == tree.DNull || row[4] == tree.DNull) { - successor = Session{ - LivenessID: sqlliveness.SessionID(tree.MustBeDBytes(row[3])), - ConnectionID: tree.MustBeDUuid(row[4]).UUID, - } + } + return partitions, nil +} + +// FetchPartitions fetches all of the partitions with the given IDs. The len of +// the returned map is eqaul to the number of unique partitionIDs passed in. If +// a partition id is not found, it will be present in the map with a zero-value +// Partition. 
+func (p *partitionTable) FetchPartitions( + ctx context.Context, txn isql.Txn, partitionIDs []int64, +) (map[int64]Partition, error) { + if len(partitionIDs) == 0 { + return make(map[int64]Partition), nil + } + + // Initialize result map with zero-value partitions for all unique IDs + result := make(map[int64]Partition) + for _, id := range partitionIDs { + result[id] = Partition{} // Zero-value partition as placeholder + } + + datumArray := tree.NewDArray(types.Int) + for _, id := range partitionIDs { + if err := datumArray.Append(tree.NewDInt(tree.DInt(id))); err != nil { + return nil, err } + } + + rows, err := txn.QueryBuffered(ctx, "fetch-partitions", txn.KV(), fmt.Sprintf(` + SELECT + partition_id, + sql_liveness_session, + user_session, + sql_liveness_session_successor, + user_session_successor, + partition_spec + FROM defaultdb.queue_partition_%s + WHERE partition_id = ANY($1)`, p.queueName), datumArray) + if err != nil { + return nil, err + } - span, err := decodeSpan([]byte(*row[5].(*tree.DBytes))) + // Process found partitions + for _, row := range rows { + partition, err := PartitionFromDatums(row) if err != nil { return nil, err } + result[partition.ID] = partition + } - partitions[i] = Partition{ - ID: int64(tree.MustBeDInt(row[0])), - Session: session, - Successor: successor, - Span: span, - } + return result, nil +} + +// Get retrieves a single partition by ID. Returns an error if the partition +// is not found. +func (p *partitionTable) Get( + ctx context.Context, txn isql.Txn, partitionID int64, +) (Partition, error) { + row, err := txn.QueryRow(ctx, "get-partition", txn.KV(), + fmt.Sprintf(` + SELECT + partition_id, + sql_liveness_session, + user_session, + sql_liveness_session_successor, + user_session_successor, + partition_spec + FROM defaultdb.queue_partition_%s + WHERE partition_id = $1`, p.queueName), partitionID) + if err != nil { + return Partition{}, err } - return partitions, nil + if row == nil { + return Partition{}, errors.Newf("no partition found with id %d", partitionID) + } + + partition, err := PartitionFromDatums(row) + if err != nil { + return Partition{}, err + } + + return partition, nil } func (p *partitionTable) InsertPartition( @@ -159,10 +253,67 @@ func (p *partitionTable) UpdatePartition( return err } -func (p *Partition) Empty() bool { +// UnregisterSession removes the given session from all assignments and +// partition claims, it returns the updated partitions. 
+func (p *partitionTable) UnregisterSession( + ctx context.Context, txn isql.Txn, session Session, +) (updates map[int64]Partition, err error) { + sessionLivenessID := []byte(session.LivenessID) + sessionConnectionID := session.ConnectionID + + rows, err := txn.QueryBuffered(ctx, "unregister-session", txn.KV(), fmt.Sprintf(` + UPDATE defaultdb.queue_partition_%s + SET + sql_liveness_session = CASE + WHEN sql_liveness_session = $1 AND user_session = $2 THEN sql_liveness_session_successor + ELSE sql_liveness_session + END, + user_session = CASE + WHEN sql_liveness_session = $1 AND user_session = $2 THEN user_session_successor + ELSE user_session + END, + sql_liveness_session_successor = CASE + WHEN sql_liveness_session = $1 AND user_session = $2 THEN NULL + WHEN sql_liveness_session_successor = $1 AND user_session_successor = $2 THEN NULL + ELSE sql_liveness_session_successor + END, + user_session_successor = CASE + WHEN sql_liveness_session = $1 AND user_session = $2 THEN NULL + WHEN sql_liveness_session_successor = $1 AND user_session_successor = $2 THEN NULL + ELSE user_session_successor + END + WHERE (sql_liveness_session = $1 AND user_session = $2) + OR (sql_liveness_session_successor = $1 AND user_session_successor = $2) + RETURNING partition_id, sql_liveness_session, user_session, + sql_liveness_session_successor, user_session_successor, partition_spec`, p.queueName), sessionLivenessID, sessionConnectionID) + if err != nil { + return nil, err + } + + updates = make(map[int64]Partition) + for _, row := range rows { + partition, err := PartitionFromDatums(row) + if err != nil { + return nil, err + } + updates[partition.ID] = partition + } + + return updates, nil +} + +func (p Partition) Empty() bool { return p.ID == 0 } +// Equal returns true if two partitions are equal in all fields. +func (p Partition) Equal(other Partition) bool { + return p.ID == other.ID && + p.Session == other.Session && + p.Successor == other.Successor && + p.Span.Equal(other.Span) +} + type Session struct { // ConnectionID is the ID of the underlying connection. 
ConnectionID uuid.UUID diff --git a/pkg/sql/queuefeed/partitions_test.go b/pkg/sql/queuefeed/partitions_test.go index 65391597c03c..9dffd1fa989c 100644 --- a/pkg/sql/queuefeed/partitions_test.go +++ b/pkg/sql/queuefeed/partitions_test.go @@ -174,3 +174,133 @@ func TestInsertPartition(t *testing.T) { }) require.NoError(t, err) } + +func TestFetchPartitions(t *testing.T) { + defer leaktest.AfterTest(t)() + defer log.Scope(t).Close(t) + + ctx := context.Background() + srv, sqlDB, _ := serverutils.StartServer(t, base.TestServerArgs{}) + defer srv.Stopper().Stop(ctx) + + db := srv.ApplicationLayer().InternalDB().(isql.DB) + sqlRunner := sqlutils.MakeSQLRunner(sqlDB) + queueName := "test" + pt := &partitionTable{queueName: queueName} + + err := db.Txn(ctx, func(ctx context.Context, txn isql.Txn) error { + return pt.CreateSchema(ctx, txn) + }) + require.NoError(t, err) + + // Insert some test data + sqlRunner.Exec(t, `INSERT INTO defaultdb.queue_partition_test (partition_id) VALUES (1), (3)`) + + err = db.Txn(ctx, func(ctx context.Context, txn isql.Txn) error { + result, err := pt.FetchPartitions(ctx, txn, []int64{1, 2, 3}) + require.NoError(t, err) + require.Len(t, result, 3) + require.Equal(t, int64(1), result[1].ID) + require.True(t, result[2].Empty()) + require.Equal(t, int64(3), result[3].ID) + return nil + }) + require.NoError(t, err) +} + +func TestGetPartition(t *testing.T) { + defer leaktest.AfterTest(t)() + defer log.Scope(t).Close(t) + + ctx := context.Background() + srv, sqlDB, _ := serverutils.StartServer(t, base.TestServerArgs{}) + defer srv.Stopper().Stop(ctx) + + db := srv.ApplicationLayer().InternalDB().(isql.DB) + sqlRunner := sqlutils.MakeSQLRunner(sqlDB) + pt := &partitionTable{queueName: "test"} + + // Create table + err := db.Txn(ctx, func(ctx context.Context, txn isql.Txn) error { + return pt.CreateSchema(ctx, txn) + }) + require.NoError(t, err) + + // Test not found + err = db.Txn(ctx, func(ctx context.Context, txn isql.Txn) error { + _, err := pt.Get(ctx, txn, 999) + require.Error(t, err) + return nil + }) + require.NoError(t, err) + + // Insert test partition + connectionID := uuid.MakeV4() + span := roachpb.Span{Key: roachpb.Key("test"), EndKey: roachpb.Key("testend")} + spanBytes, _ := span.Marshal() + sqlRunner.Exec(t, `INSERT INTO defaultdb.queue_partition_test (partition_id, sql_liveness_session, user_session, partition_spec) VALUES (1, $1, $2, $3)`, []byte("test-session"), connectionID, spanBytes) + + // Test found + err = db.Txn(ctx, func(ctx context.Context, txn isql.Txn) error { + partition, err := pt.Get(ctx, txn, 1) + require.NoError(t, err) + require.Equal(t, int64(1), partition.ID) + require.Equal(t, connectionID, partition.Session.ConnectionID) + return nil + }) + require.NoError(t, err) +} + +func TestUnregisterSession(t *testing.T) { + defer leaktest.AfterTest(t)() + defer log.Scope(t).Close(t) + + ctx := context.Background() + srv, sqlDB, _ := serverutils.StartServer(t, base.TestServerArgs{}) + defer srv.Stopper().Stop(ctx) + + db := srv.ApplicationLayer().InternalDB().(isql.DB) + sqlRunner := sqlutils.MakeSQLRunner(sqlDB) + pt := &partitionTable{queueName: "test"} + + // Create table + err := db.Txn(ctx, func(ctx context.Context, txn isql.Txn) error { + return pt.CreateSchema(ctx, txn) + }) + require.NoError(t, err) + + // Create sessions + session1 := Session{LivenessID: "session1", ConnectionID: uuid.MakeV4()} + session2 := Session{LivenessID: "session2", ConnectionID: uuid.MakeV4()} + + // Insert partition with session1 as owner, session2 as 
successor + span1 := roachpb.Span{Key: roachpb.Key("a"), EndKey: roachpb.Key("b")} + span1Bytes, _ := span1.Marshal() + sqlRunner.Exec(t, `INSERT INTO defaultdb.queue_partition_test (partition_id, sql_liveness_session, user_session, sql_liveness_session_successor, user_session_successor, partition_spec) VALUES (1, $1, $2, $3, $4, $5)`, + []byte(session1.LivenessID), session1.ConnectionID, []byte(session2.LivenessID), session2.ConnectionID, span1Bytes) + + // Insert partition with session1 as owner, no successor + span2 := roachpb.Span{Key: roachpb.Key("c"), EndKey: roachpb.Key("d")} + span2Bytes, _ := span2.Marshal() + sqlRunner.Exec(t, `INSERT INTO defaultdb.queue_partition_test (partition_id, sql_liveness_session, user_session, partition_spec) VALUES (2, $1, $2, $3)`, + []byte(session1.LivenessID), session1.ConnectionID, span2Bytes) + + // Unregister session1 + err = db.Txn(ctx, func(ctx context.Context, txn isql.Txn) error { + updates, err := pt.UnregisterSession(ctx, txn, session1) + require.NoError(t, err) + require.Len(t, updates, 2) + + // Partition 1: session2 should now be the owner + partition1 := updates[1] + require.Equal(t, session2.ConnectionID, partition1.Session.ConnectionID) + require.True(t, partition1.Successor.Empty()) + + // Partition 2: should be unassigned (no successor to promote) + partition2 := updates[2] + require.True(t, partition2.Session.Empty()) + require.True(t, partition2.Successor.Empty()) + return nil + }) + require.NoError(t, err) +} diff --git a/pkg/sql/queuefeed/reader.go b/pkg/sql/queuefeed/reader.go index 1437a11bab8d..2eb9541fb776 100644 --- a/pkg/sql/queuefeed/reader.go +++ b/pkg/sql/queuefeed/reader.go @@ -6,6 +6,7 @@ import ( "slices" "sync" "sync/atomic" + "time" "github.com/cockroachdb/cockroach/pkg/ccl/changefeedccl/changefeedbase" "github.com/cockroachdb/cockroach/pkg/keys" @@ -110,15 +111,15 @@ func NewReader( ctx, cancel := context.WithCancelCause(ctx) r.cancel = func(cause error) { - fmt.Printf("canceling with cause: %s\n", cause) + fmt.Printf("canceling with cause: %+v\n", cause) cancel(cause) r.mu.poppedWakeup.Broadcast() r.mu.pushedWakeup.Broadcast() } - assignment, err := assigner.RegisterSession(ctx, session) + assignment, err := r.waitForAssignment(ctx, session) if err != nil { - return nil, errors.Wrap(err, "registering session for reader") + return nil, errors.Wrap(err, "waiting for assignment") } if err := r.setupRangefeed(ctx, assignment); err != nil { return nil, errors.Wrap(err, "setting up rangefeed") @@ -129,6 +130,31 @@ func NewReader( var ErrNoPartitionsAssigned = errors.New("no partitions assigned to reader: todo support this case by polling for assignment") +func (r *Reader) waitForAssignment(ctx context.Context, session Session) (*Assignment, error) { + // We can rapidly poll this because the assigner has an in-memory cache of + // assignments. + // + // TODO: should this retry loop be in RegisterSession instead? 
+ timer := time.NewTicker(100 * time.Millisecond) + defer timer.Stop() + for { + assignment, err := r.assigner.RegisterSession(ctx, session) + if err != nil { + return nil, errors.Wrap(err, "registering session for reader") + } + if len(assignment.Partitions) != 0 { + return assignment, nil + } + + select { + case <-ctx.Done(): + return nil, errors.Wrap(ctx.Err(), "waiting for assignment") + case <-timer.C: + // continue + } + } +} + func (r *Reader) setupRangefeed(ctx context.Context, assignment *Assignment) error { defer func() { fmt.Println("setupRangefeed done") @@ -364,7 +390,7 @@ func (r *Reader) ConfirmReceipt(ctx context.Context) { return default: // TODO only set caughtUp to true if our frontier is near the current time. - newAssignment, err := r.assigner.RefreshAssignment(ctx, r.session /*caughtUp=*/, true) + newAssignment, err := r.assigner.RefreshAssignment(ctx, r.assignment, true /*=caughtUp*/) if err != nil { r.cancel(errors.Wrap(err, "refreshing assignment")) return diff --git a/pkg/sql/queuefeed/smoke_test.go b/pkg/sql/queuefeed/smoke_test.go index 219d61d39618..d8a5eeffd861 100644 --- a/pkg/sql/queuefeed/smoke_test.go +++ b/pkg/sql/queuefeed/smoke_test.go @@ -3,16 +3,19 @@ package queuefeed import ( "context" "math/rand" + "sync/atomic" "testing" "time" "github.com/cockroachdb/cockroach/pkg/base" "github.com/cockroachdb/cockroach/pkg/sql/sqlliveness" + "github.com/cockroachdb/cockroach/pkg/testutils" "github.com/cockroachdb/cockroach/pkg/testutils/serverutils" "github.com/cockroachdb/cockroach/pkg/testutils/sqlutils" "github.com/cockroachdb/cockroach/pkg/util/leaktest" "github.com/cockroachdb/cockroach/pkg/util/log" "github.com/cockroachdb/cockroach/pkg/util/uuid" + "github.com/cockroachdb/errors" "github.com/stretchr/testify/require" "golang.org/x/sync/errgroup" ) @@ -79,6 +82,8 @@ func TestQueuefeedSmoketestMultipleRanges(t *testing.T) { defer leaktest.AfterTest(t)() defer log.Scope(t).Close(t) + // TODO(jeffswenson): rewrite this test to use normal sessions. + ctx := context.Background() srv, sqlDB, _ := serverutils.StartServer(t, base.TestServerArgs{}) defer srv.Stopper().Stop(ctx) @@ -104,13 +109,16 @@ func TestQueuefeedSmoketestMultipleRanges(t *testing.T) { // TODO: use the built ins once readers are properly assigned to a session. 
reader, err := qm.newReaderLocked(ctx, "test_multi", Session{ - ConnectionID: uuid.NewV4(), + ConnectionID: session, LivenessID: sqlliveness.SessionID("1"), }) require.NoError(t, err) return reader } + var rowsRead1, rowsRead2 atomic.Int64 + session1, session2 := uuid.NewV4(), uuid.NewV4() + ctx, cancel := context.WithCancel(ctx) group, ctx := errgroup.WithContext(ctx) @@ -123,17 +131,81 @@ func TestQueuefeedSmoketestMultipleRanges(t *testing.T) { return nil }) - readRows := 0 - reader := newReader(uuid.NewV4()) - for readRows < 100 { - rows, err := reader.GetRows(ctx, 10) - require.NoError(t, err) - reader.ConfirmReceipt(ctx) - t.Log("reader read", len(rows), "rows") - readRows += len(rows) - require.NoError(t, err) + reader1 := newReader(session1) + group.Go(func() error { + for ctx.Err() == nil { + rows, err := reader1.GetRows(ctx, 10) + if err != nil { + t.Log("reader 1 got error:", err) + return err + } + reader1.ConfirmReceipt(ctx) + t.Log("reader 1 read", len(rows), "rows") + rowsRead1.Add(int64(len(rows))) + } + t.Log("reader 1 stopping") + return nil + }) + + countPartitions := func() map[string]int { + partitions := map[string]int{} + rows := db.Query(t, `SELECT COALESCE(user_session::STRING, ''), COUNT(*) FROM defaultdb.queue_partition_test_multi GROUP BY 1`) + defer rows.Close() + for rows.Next() { + var sessionID string + var count int + require.NoError(t, rows.Scan(&sessionID, &count)) + partitions[sessionID] = count + } + return partitions } + testutils.SucceedsWithin(t, func() error { + partitions := countPartitions() + if partitions[session1.String()] != 2 { + //t.Logf("expected reader '%s' to have 2 partitions, got %+v", session1.String(), partitions) + return errors.Newf("expected reader '%s' to have 2 partitions, got %+v", session1.String(), partitions) + } + t.Log("reader 1 has 2 partitions:", partitions) + return nil + }, 10*time.Second) + t.Log("reader 1 has 2 partitions") + + reader2 := newReader(session2) + group.Go(func() error { + t.Log("reader 2 started") + for ctx.Err() == nil { + rows, err := reader2.GetRows(ctx, 10) + if err != nil { + t.Log("reader 2 got error:", err) + return err + + } + reader2.ConfirmReceipt(ctx) + rowsRead2.Add(int64(len(rows))) + } + return nil + }) + + testutils.SucceedsWithin(t, func() error { + partitions := countPartitions() + if partitions[session1.String()] != 1 && partitions[session2.String()] != 1 { + return errors.Newf("expected each reader to have 1 partition found: %+v", partitions) + } + if rowsRead1.Load() == 0 { + return errors.New("expected reader 1 to have read some rows") + } + if rowsRead2.Load() == 0 { + return errors.New("epected reader 2 to have read some rows") + } + return nil + }, 10*time.Second) + t.Log("partitions are balanced and both readers have read some rows") + cancel() - _ = group.Wait() + err = group.Wait() + if err != nil { + t.Log("group finished with error:", err) + require.ErrorIs(t, err, context.Canceled) + } } From 3d5df162903d176b85b2b14ceaf98721f82055c2 Mon Sep 17 00:00:00 2001 From: Aerin Freilich Date: Fri, 14 Nov 2025 09:39:15 -0500 Subject: [PATCH 40/46] queuefeed: support partitions in checkpointing Epic: none --- pkg/sql/queuefeed/reader.go | 36 ++++++++++++++++++++++++------------ 1 file changed, 24 insertions(+), 12 deletions(-) diff --git a/pkg/sql/queuefeed/reader.go b/pkg/sql/queuefeed/reader.go index 2eb9541fb776..7008b1d63f34 100644 --- a/pkg/sql/queuefeed/reader.go +++ b/pkg/sql/queuefeed/reader.go @@ -220,12 +220,21 @@ func (r *Reader) setupRangefeed(ctx context.Context, 
assignment *Assignment) err rangefeed.WithFiltering(false), } - // Resume from checkpoint if available - // TODO: Support multiple partitions - partitionID := int64(1) - initialTS, err := r.mgr.ReadCheckpoint(ctx, r.name, partitionID) - if err != nil { - return errors.Wrap(err, "reading checkpoint") + var initialTS hlc.Timestamp + fmt.Printf("reading checkpoints for %d partitions\n", len(assignment.Partitions)) + for _, partition := range assignment.Partitions { + fmt.Printf("reading checkpoint for partition %d\n", partition.ID) + checkpointTS, err := r.mgr.ReadCheckpoint(ctx, r.name, partition.ID) + if err != nil { + return errors.Wrapf(err, "reading checkpoint for partition %d", partition.ID) + } + fmt.Printf("checkpoint for partition %d: %+v\n", partition.ID, checkpointTS) + if !checkpointTS.IsEmpty() { + if initialTS.IsEmpty() || checkpointTS.Less(initialTS) { + initialTS = checkpointTS + } + } + fmt.Printf("initialTS: %+v\n", initialTS) } if initialTS.IsEmpty() { // No checkpoint found, start from now @@ -373,13 +382,16 @@ func (r *Reader) ConfirmReceipt(ctx context.Context) { }() // Persist the checkpoint if we have one. + fmt.Printf("persisting checkpoint: %+v\n", checkpointToWrite) if !checkpointToWrite.IsEmpty() { - // TODO: Support multiple partitions - for now we only have partition 1. - partitionID := int64(1) - if err := r.mgr.WriteCheckpoint(ctx, r.name, partitionID, checkpointToWrite); err != nil { - fmt.Printf("error writing checkpoint: %s\n", err) - // TODO: decide how to handle checkpoint write errors. Since the txn - // has already committed, I don't think we can really fail at this point. + fmt.Printf("persisting checkpoint to %d partitions\n", len(r.assignment.Partitions)) + for _, partition := range r.assignment.Partitions { + fmt.Printf("persisting checkpoint to partition %d\n", partition.ID) + if err := r.mgr.WriteCheckpoint(ctx, r.name, partition.ID, checkpointToWrite); err != nil { + fmt.Printf("error writing checkpoint for partition %d: %s\n", partition.ID, err) + // TODO: decide how to handle checkpoint write errors. Since the txn + // has already committed, I don't think we can really fail at this point. 
+ } } } From f84b7f7a748428e49d76f761b07b477151be2276 Mon Sep 17 00:00:00 2001 From: Aerin Freilich Date: Fri, 14 Nov 2025 09:53:32 -0500 Subject: [PATCH 41/46] queuefeed: use frontier for tracking partition progress Epic: none --- pkg/sql/queuefeed/BUILD.bazel | 1 + pkg/sql/queuefeed/reader.go | 31 ++++++++++++++----------------- 2 files changed, 15 insertions(+), 17 deletions(-) diff --git a/pkg/sql/queuefeed/BUILD.bazel b/pkg/sql/queuefeed/BUILD.bazel index 4c3ee5d79d0d..30b15267885f 100644 --- a/pkg/sql/queuefeed/BUILD.bazel +++ b/pkg/sql/queuefeed/BUILD.bazel @@ -34,6 +34,7 @@ go_library( "//pkg/util/hlc", "//pkg/util/log", "//pkg/util/rangedesc", + "//pkg/util/span", "//pkg/util/syncutil", "//pkg/util/timeutil", "//pkg/util/uuid", diff --git a/pkg/sql/queuefeed/reader.go b/pkg/sql/queuefeed/reader.go index 7008b1d63f34..40e08e9f8ce5 100644 --- a/pkg/sql/queuefeed/reader.go +++ b/pkg/sql/queuefeed/reader.go @@ -24,6 +24,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/sql/sem/tree" "github.com/cockroachdb/cockroach/pkg/util" "github.com/cockroachdb/cockroach/pkg/util/hlc" + "github.com/cockroachdb/cockroach/pkg/util/span" "github.com/cockroachdb/cockroach/pkg/util/syncutil" "github.com/cockroachdb/cockroach/pkg/util/timeutil" "github.com/cockroachdb/errors" @@ -220,36 +221,32 @@ func (r *Reader) setupRangefeed(ctx context.Context, assignment *Assignment) err rangefeed.WithFiltering(false), } - var initialTS hlc.Timestamp - fmt.Printf("reading checkpoints for %d partitions\n", len(assignment.Partitions)) + frontier, err := span.MakeFrontier(assignment.Spans()...) + if err != nil { + return errors.Wrap(err, "creating frontier") + } + for _, partition := range assignment.Partitions { - fmt.Printf("reading checkpoint for partition %d\n", partition.ID) checkpointTS, err := r.mgr.ReadCheckpoint(ctx, r.name, partition.ID) if err != nil { return errors.Wrapf(err, "reading checkpoint for partition %d", partition.ID) } - fmt.Printf("checkpoint for partition %d: %+v\n", partition.ID, checkpointTS) if !checkpointTS.IsEmpty() { - if initialTS.IsEmpty() || checkpointTS.Less(initialTS) { - initialTS = checkpointTS - } + frontier.Forward(partition.Span, checkpointTS) + } else { + frontier.Forward(partition.Span, hlc.Timestamp{WallTime: timeutil.Now().UnixNano()}) } - fmt.Printf("initialTS: %+v\n", initialTS) } - if initialTS.IsEmpty() { - // No checkpoint found, start from now - initialTS = hlc.Timestamp{WallTime: timeutil.Now().UnixNano()} + + if frontier.Frontier().IsEmpty() { + return errors.New("frontier is empty") } rf := r.rff.New( - fmt.Sprintf("queuefeed.reader.name=%s", r.name), initialTS, onValue, opts..., + fmt.Sprintf("queuefeed.reader.name=%s", r.name), frontier.Frontier(), onValue, opts..., ) - spans := assignment.Spans() - - fmt.Printf("starting rangefeed with spans: %+v\n", spans) - - if err := rf.Start(ctx, spans); err != nil { + if err := rf.StartFromFrontier(ctx, frontier); err != nil { return errors.Wrap(err, "starting rangefeed") } From 516bd9b7f09d93a0ec7f50bcf8685da3edd176c5 Mon Sep 17 00:00:00 2001 From: Aerin Freilich Date: Fri, 14 Nov 2025 10:06:33 -0500 Subject: [PATCH 42/46] queuefeed: set the cursor to the queue creation time on creation Epic: none --- pkg/sql/queuefeed/manager.go | 6 ++++++ pkg/sql/queuefeed/reader.go | 6 +----- pkg/sql/queuefeed/reader_test.go | 5 +++-- 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/pkg/sql/queuefeed/manager.go b/pkg/sql/queuefeed/manager.go index cf5d0f9d63ab..fb1202cce563 100644 --- a/pkg/sql/queuefeed/manager.go 
+++ b/pkg/sql/queuefeed/manager.go @@ -174,6 +174,12 @@ func (m *Manager) CreateQueue(ctx context.Context, queueName string, tableDescID return errors.Wrapf(err, "inserting partition %d for range", partitionID) } + // checkpoint the partition at the transaction timestamp + err = m.WriteCheckpoint(ctx, queueName, partitionID, txn.KV().ReadTimestamp()) + if err != nil { + return errors.Wrapf(err, "writing checkpoint for partition %d", partitionID) + } + partitionID++ } diff --git a/pkg/sql/queuefeed/reader.go b/pkg/sql/queuefeed/reader.go index 40e08e9f8ce5..d619b256b1f6 100644 --- a/pkg/sql/queuefeed/reader.go +++ b/pkg/sql/queuefeed/reader.go @@ -26,7 +26,6 @@ import ( "github.com/cockroachdb/cockroach/pkg/util/hlc" "github.com/cockroachdb/cockroach/pkg/util/span" "github.com/cockroachdb/cockroach/pkg/util/syncutil" - "github.com/cockroachdb/cockroach/pkg/util/timeutil" "github.com/cockroachdb/errors" ) @@ -234,7 +233,7 @@ func (r *Reader) setupRangefeed(ctx context.Context, assignment *Assignment) err if !checkpointTS.IsEmpty() { frontier.Forward(partition.Span, checkpointTS) } else { - frontier.Forward(partition.Span, hlc.Timestamp{WallTime: timeutil.Now().UnixNano()}) + return errors.Errorf("checkpoint is empty for partition %d", partition.ID) } } @@ -379,11 +378,8 @@ func (r *Reader) ConfirmReceipt(ctx context.Context) { }() // Persist the checkpoint if we have one. - fmt.Printf("persisting checkpoint: %+v\n", checkpointToWrite) if !checkpointToWrite.IsEmpty() { - fmt.Printf("persisting checkpoint to %d partitions\n", len(r.assignment.Partitions)) for _, partition := range r.assignment.Partitions { - fmt.Printf("persisting checkpoint to partition %d\n", partition.ID) if err := r.mgr.WriteCheckpoint(ctx, r.name, partition.ID, checkpointToWrite); err != nil { fmt.Printf("error writing checkpoint for partition %d: %s\n", partition.ID, err) // TODO: decide how to handle checkpoint write errors. Since the txn diff --git a/pkg/sql/queuefeed/reader_test.go b/pkg/sql/queuefeed/reader_test.go index c2c908d3e74e..ec622a1fbac6 100644 --- a/pkg/sql/queuefeed/reader_test.go +++ b/pkg/sql/queuefeed/reader_test.go @@ -35,6 +35,9 @@ func TestReaderBasic(t *testing.T) { defer qm.Close() require.NoError(t, qm.CreateQueue(ctx, "test_queue", tableID)) + // These should be readable as long as they were written after the queue was created. 
+ db.Exec(t, `INSERT INTO t VALUES ('row1', 10), ('row2', 20), ('row3', 30)`) + reader, err := qm.CreateReaderForSession(ctx, "test_queue", Session{ ConnectionID: uuid.MakeV4(), LivenessID: sqlliveness.SessionID("1"), @@ -42,8 +45,6 @@ func TestReaderBasic(t *testing.T) { require.NoError(t, err) defer func() { _ = reader.Close() }() - db.Exec(t, `INSERT INTO t VALUES ('row1', 10), ('row2', 20), ('row3', 30)`) - rows := pollForRows(t, ctx, reader, 3) requireRow(t, rows[0], "row1", 10) From 8642c184d368a9b2a8bfe4ec8ab60c04a61c9fda Mon Sep 17 00:00:00 2001 From: Aerin Freilich Date: Fri, 14 Nov 2025 10:30:22 -0500 Subject: [PATCH 43/46] add builtin for starting a changefeed from a cursor --- pkg/sql/queuefeed/BUILD.bazel | 1 - pkg/sql/queuefeed/manager.go | 9 ++++- pkg/sql/queuefeed/queuebase/BUILD.bazel | 5 ++- pkg/sql/queuefeed/queuebase/queuebase.go | 2 ++ pkg/sql/queuefeed/reader_test.go | 45 ++++++++++++++++++++++++ pkg/sql/sem/builtins/builtins.go | 27 ++++++++++++++ pkg/sql/sem/builtins/fixed_oids.go | 1 + 7 files changed, 87 insertions(+), 3 deletions(-) diff --git a/pkg/sql/queuefeed/BUILD.bazel b/pkg/sql/queuefeed/BUILD.bazel index 30b15267885f..b9496c6a1df0 100644 --- a/pkg/sql/queuefeed/BUILD.bazel +++ b/pkg/sql/queuefeed/BUILD.bazel @@ -36,7 +36,6 @@ go_library( "//pkg/util/rangedesc", "//pkg/util/span", "//pkg/util/syncutil", - "//pkg/util/timeutil", "//pkg/util/uuid", "@com_github_cockroachdb_errors//:errors", ], diff --git a/pkg/sql/queuefeed/manager.go b/pkg/sql/queuefeed/manager.go index fb1202cce563..0c87823c9af3 100644 --- a/pkg/sql/queuefeed/manager.go +++ b/pkg/sql/queuefeed/manager.go @@ -113,6 +113,9 @@ SELECT cursor FROM defaultdb.queue_cursor_%s WHERE partition_id = $1 // should take a txn func (m *Manager) CreateQueue(ctx context.Context, queueName string, tableDescID int64) error { + return m.CreateQueueFromCursor(ctx, queueName, tableDescID, hlc.Timestamp{}) +} +func (m *Manager) CreateQueueFromCursor(ctx context.Context, queueName string, tableDescID int64, cursor hlc.Timestamp) error { err := m.executor.Txn(ctx, func(ctx context.Context, txn isql.Txn) error { _, err := txn.Exec(ctx, "create_q", txn.KV(), createQueueTableSQL) if err != nil { @@ -174,8 +177,12 @@ func (m *Manager) CreateQueue(ctx context.Context, queueName string, tableDescID return errors.Wrapf(err, "inserting partition %d for range", partitionID) } + checkpointTS := txn.KV().ReadTimestamp() + if !cursor.IsEmpty() { + checkpointTS = cursor + } // checkpoint the partition at the transaction timestamp - err = m.WriteCheckpoint(ctx, queueName, partitionID, txn.KV().ReadTimestamp()) + err = m.WriteCheckpoint(ctx, queueName, partitionID, checkpointTS) if err != nil { return errors.Wrapf(err, "writing checkpoint for partition %d", partitionID) } diff --git a/pkg/sql/queuefeed/queuebase/BUILD.bazel b/pkg/sql/queuefeed/queuebase/BUILD.bazel index c36da12750bf..afcb02c98b49 100644 --- a/pkg/sql/queuefeed/queuebase/BUILD.bazel +++ b/pkg/sql/queuefeed/queuebase/BUILD.bazel @@ -5,5 +5,8 @@ go_library( srcs = ["queuebase.go"], importpath = "github.com/cockroachdb/cockroach/pkg/sql/queuefeed/queuebase", visibility = ["//visibility:public"], - deps = ["//pkg/sql/sem/tree"], + deps = [ + "//pkg/sql/sem/tree", + "//pkg/util/hlc", + ], ) diff --git a/pkg/sql/queuefeed/queuebase/queuebase.go b/pkg/sql/queuefeed/queuebase/queuebase.go index b98858e1bb7b..da319c0f51b7 100644 --- a/pkg/sql/queuefeed/queuebase/queuebase.go +++ b/pkg/sql/queuefeed/queuebase/queuebase.go @@ -4,10 +4,12 @@ import ( "context" 
"github.com/cockroachdb/cockroach/pkg/sql/sem/tree" + "github.com/cockroachdb/cockroach/pkg/util/hlc" ) type Manager interface { CreateQueue(ctx context.Context, name string, tableID int64) error + CreateQueueFromCursor(ctx context.Context, name string, tableID int64, cursor hlc.Timestamp) error } // Implemented by the conn executor in reality diff --git a/pkg/sql/queuefeed/reader_test.go b/pkg/sql/queuefeed/reader_test.go index ec622a1fbac6..bab1f7f025b3 100644 --- a/pkg/sql/queuefeed/reader_test.go +++ b/pkg/sql/queuefeed/reader_test.go @@ -11,6 +11,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/sql/sqlliveness" "github.com/cockroachdb/cockroach/pkg/testutils/serverutils" "github.com/cockroachdb/cockroach/pkg/testutils/sqlutils" + "github.com/cockroachdb/cockroach/pkg/util/hlc" "github.com/cockroachdb/cockroach/pkg/util/leaktest" "github.com/cockroachdb/cockroach/pkg/util/log" "github.com/cockroachdb/cockroach/pkg/util/uuid" @@ -160,6 +161,50 @@ func TestCheckpointRestoration(t *testing.T) { } } +func TestCreateQueueFeedFromCursor(t *testing.T) { + defer leaktest.AfterTest(t)() + defer log.Scope(t).Close(t) + + ctx := context.Background() + srv, conn, _ := serverutils.StartServer(t, base.TestServerArgs{}) + defer srv.Stopper().Stop(ctx) + + db := sqlutils.MakeSQLRunner(conn) + db.Exec(t, `CREATE TABLE t (a STRING, b INT)`) + + var tableID int64 + db.QueryRow(t, "SELECT id FROM system.namespace WHERE name = 't'").Scan(&tableID) + + // Insert first batch of data (should NOT be read). + db.Exec(t, `INSERT INTO t VALUES ('batch1_row1', 10), ('batch1_row2', 20)`) + + // Get cursor timestamp after first batch. + var cursorStr string + db.QueryRow(t, "SELECT cluster_logical_timestamp()").Scan(&cursorStr) + cursor, err := hlc.ParseHLC(cursorStr) + + // Insert second batch of data (should be read). + db.Exec(t, `INSERT INTO t VALUES ('batch2_row1', 30), ('batch2_row2', 40)`) + + qm := NewTestManager(t, srv.ApplicationLayer()) + defer qm.Close() + require.NoError(t, qm.CreateQueueFromCursor(ctx, "cursor_test", tableID, cursor)) + + reader, err := qm.CreateReaderForSession(ctx, "cursor_test", Session{ + ConnectionID: uuid.MakeV4(), + LivenessID: sqlliveness.SessionID("1"), + }) + require.NoError(t, err) + defer func() { _ = reader.Close() }() + + // Should only get the second batch. + rows := pollForRows(t, ctx, reader, 2) + requireRow(t, rows[0], "batch2_row1", 30) + requireRow(t, rows[1], "batch2_row2", 40) + + reader.ConfirmReceipt(ctx) +} + // pollForRows waits for the reader to return expectedCount rows. 
func pollForRows( t *testing.T, ctx context.Context, reader queuebase.Reader, expectedCount int, diff --git a/pkg/sql/sem/builtins/builtins.go b/pkg/sql/sem/builtins/builtins.go index 7b78acdc1326..0c1c897b94d4 100644 --- a/pkg/sql/sem/builtins/builtins.go +++ b/pkg/sql/sem/builtins/builtins.go @@ -4661,6 +4661,33 @@ value if you rely on the HLC for accuracy.`, }, }), + "crdb_internal.create_queue_feed_from_cursor": makeBuiltin(defProps(), tree.Overload{ + Types: tree.ParamTypes{ + {Name: "queue_name", Typ: types.String}, + {Name: "table_descriptor_id", Typ: types.Int}, + {Name: "cursor", Typ: types.Decimal}, + }, + Volatility: volatility.Volatile, + ReturnType: tree.FixedReturnType(types.Void), + Fn: func(ctx context.Context, evalCtx *eval.Context, args tree.Datums) (tree.Datum, error) { + qn := args[0].(*tree.DString) + qm := getQueueManager(evalCtx) + tID := args[1].(*tree.DInt) + cursorDecimal := tree.MustBeDDecimal(args[2]) + + // Convert the decimal cursor to hlc.Timestamp + cursor, err := hlc.DecimalToHLC(&cursorDecimal.Decimal) + if err != nil { + return nil, errors.Wrap(err, "converting cursor decimal to HLC timestamp") + } + + if err := qm.CreateQueueFromCursor(ctx, string(*qn), int64(*tID), cursor); err != nil { + return nil, err + } + return tree.DVoidDatum, nil + }, + }), + "crdb_internal.select_array_from_queue_feed": makeBuiltin(defProps(), tree.Overload{ Types: tree.ParamTypes{ {Name: "queue_name", Typ: types.String}, diff --git a/pkg/sql/sem/builtins/fixed_oids.go b/pkg/sql/sem/builtins/fixed_oids.go index ccaf8e3e2fe7..132939087c3a 100644 --- a/pkg/sql/sem/builtins/fixed_oids.go +++ b/pkg/sql/sem/builtins/fixed_oids.go @@ -2866,6 +2866,7 @@ var builtinOidsArray = []string{ 2911: `crdb_internal.create_queue_feed(queue_name: string, table_descriptor_id: int) -> void`, 2912: `crdb_internal.select_from_queue_feed(queue_name: string, limit: int) -> jsonb`, 2913: `crdb_internal.select_array_from_queue_feed(queue_name: string, limit: int) -> jsonb[]`, + 2914: `crdb_internal.create_queue_feed_from_cursor(queue_name: string, table_descriptor_id: int, cursor: decimal) -> void`, } var builtinOidsBySignature map[string]oid.Oid From 255443b894c092370b27461c2dd471e4d2cab2f1 Mon Sep 17 00:00:00 2001 From: Jeff Swenson Date: Thu, 13 Nov 2025 17:52:22 -0500 Subject: [PATCH 44/46] queuefeed: create a multi-reader smoke test --- pkg/sql/queuefeed/partition_cache.go | 68 +++++++++++++ pkg/sql/queuefeed/reader.go | 12 ++- pkg/sql/queuefeed/smoke_test.go | 143 ++++++++++----------------- 3 files changed, 128 insertions(+), 95 deletions(-) diff --git a/pkg/sql/queuefeed/partition_cache.go b/pkg/sql/queuefeed/partition_cache.go index f19be3fca26f..774ce0711c95 100644 --- a/pkg/sql/queuefeed/partition_cache.go +++ b/pkg/sql/queuefeed/partition_cache.go @@ -16,6 +16,9 @@ type partitionCache struct { // assignmentIndex is a map of sessions to assigned partitions. assignmentIndex map[Session]map[int64]struct{} + // successorIndex is a map of successor sessions to partitions. 
+ successorIndex map[Session]map[int64]struct{} + sessions map[Session]struct{} } @@ -65,12 +68,37 @@ func (p *partitionCache) DebugString() string { } } + // Print successor index + result.WriteString("\nSuccessor Index (successor session -> partitions):\n") + if len(p.successorIndex) == 0 { + result.WriteString(" (none)\n") + } else { + for session, partitions := range p.successorIndex { + result.WriteString(fmt.Sprintf(" %s: [", session.ConnectionID.String()[:8])) + partitionIDs := make([]int64, 0, len(partitions)) + for id := range partitions { + partitionIDs = append(partitionIDs, id) + } + sort.Slice(partitionIDs, func(i, j int) bool { + return partitionIDs[i] < partitionIDs[j] + }) + for i, id := range partitionIDs { + if i > 0 { + result.WriteString(", ") + } + result.WriteString(fmt.Sprintf("%d", id)) + } + result.WriteString("]\n") + } + } + return result.String() } func (p *partitionCache) Init(partitions []Partition) { p.partitions = make(map[int64]Partition) p.assignmentIndex = make(map[Session]map[int64]struct{}) + p.successorIndex = make(map[Session]map[int64]struct{}) for _, partition := range partitions { p.addPartition(partition) @@ -111,6 +139,16 @@ func (p *partitionCache) removePartition(partitionID int64) { } } } + + // Remove from successor index + if !partition.Successor.Empty() { + if successors, ok := p.successorIndex[partition.Successor]; ok { + delete(successors, partitionID) + if len(successors) == 0 { + delete(p.successorIndex, partition.Successor) + } + } + } } func (p *partitionCache) addPartition(partition Partition) { @@ -125,6 +163,13 @@ func (p *partitionCache) addPartition(partition Partition) { p.assignmentIndex[partition.Session][partition.ID] = struct{}{} } + // Add to successor index for successor session + if !partition.Successor.Empty() { + if _, ok := p.successorIndex[partition.Successor]; !ok { + p.successorIndex[partition.Successor] = make(map[int64]struct{}) + } + p.successorIndex[partition.Successor][partition.ID] = struct{}{} + } } func (p *partitionCache) updatePartition(oldPartition, newPartition Partition) { @@ -148,6 +193,24 @@ func (p *partitionCache) updatePartition(oldPartition, newPartition Partition) { } p.assignmentIndex[newPartition.Session][newPartition.ID] = struct{}{} } + + // Remove old successor assignments + if !oldPartition.Successor.Empty() { + if successors, ok := p.successorIndex[oldPartition.Successor]; ok { + delete(successors, oldPartition.ID) + if len(successors) == 0 { + delete(p.successorIndex, oldPartition.Successor) + } + } + } + + // Add new successor assignments + if !newPartition.Successor.Empty() { + if _, ok := p.successorIndex[newPartition.Successor]; !ok { + p.successorIndex[newPartition.Successor] = make(map[int64]struct{}) + } + p.successorIndex[newPartition.Successor][newPartition.ID] = struct{}{} + } } func (p *partitionCache) isStale(assignment *Assignment) bool { @@ -218,6 +281,11 @@ func (p *partitionCache) planAssignment( } } + if p.successorIndex[session] != nil { + // If the session is trying to steal already, do not steal another session. + return nil, Partition{}, Partition{} + } + // maxPartitions is the maximum number of partitions we would expect to be // assigned to this session. 
maxPartitions := len(p.partitions) diff --git a/pkg/sql/queuefeed/reader.go b/pkg/sql/queuefeed/reader.go index d619b256b1f6..9dcd201b01ba 100644 --- a/pkg/sql/queuefeed/reader.go +++ b/pkg/sql/queuefeed/reader.go @@ -447,12 +447,14 @@ func (r *Reader) updateAssignment(assignment *Assignment) error { fmt.Printf("updateAssignment done with assignment: %+v\n", assignment) }() - r.mu.Lock() - defer r.mu.Unlock() - - r.assignment = assignment r.rangefeed.Close() - r.mu.buf = r.mu.buf[:0] + r.assignment = assignment + + func() { + r.mu.Lock() + defer r.mu.Unlock() + r.mu.buf = r.mu.buf[:0] + }() if err := r.setupRangefeed(r.goroCtx, assignment); err != nil { return errors.Wrapf(err, "setting up rangefeed for new assignment: %+v", assignment) diff --git a/pkg/sql/queuefeed/smoke_test.go b/pkg/sql/queuefeed/smoke_test.go index d8a5eeffd861..b01fe5e862e5 100644 --- a/pkg/sql/queuefeed/smoke_test.go +++ b/pkg/sql/queuefeed/smoke_test.go @@ -8,13 +8,11 @@ import ( "time" "github.com/cockroachdb/cockroach/pkg/base" - "github.com/cockroachdb/cockroach/pkg/sql/sqlliveness" "github.com/cockroachdb/cockroach/pkg/testutils" "github.com/cockroachdb/cockroach/pkg/testutils/serverutils" "github.com/cockroachdb/cockroach/pkg/testutils/sqlutils" "github.com/cockroachdb/cockroach/pkg/util/leaktest" "github.com/cockroachdb/cockroach/pkg/util/log" - "github.com/cockroachdb/cockroach/pkg/util/uuid" "github.com/cockroachdb/errors" "github.com/stretchr/testify/require" "golang.org/x/sync/errgroup" @@ -78,7 +76,7 @@ func TestQueuefeedSmoketest(t *testing.T) { require.NoError(t, group.Wait()) } -func TestQueuefeedSmoketestMultipleRanges(t *testing.T) { +func TestQueuefeedSmoketestMultipleReaders(t *testing.T) { defer leaktest.AfterTest(t)() defer log.Scope(t).Close(t) @@ -94,118 +92,83 @@ func TestQueuefeedSmoketestMultipleRanges(t *testing.T) { // Create table with composite primary key and split it db.Exec(t, `CREATE TABLE t (k1 INT, k2 INT, v string, PRIMARY KEY (k1, k2))`) - db.Exec(t, `ALTER TABLE t SPLIT AT VALUES (1)`) + db.Exec(t, `ALTER TABLE t SPLIT AT VALUES (1), (2), (3), (4), (5), (6), (7), (8), (9)`) var tableID int64 db.QueryRow(t, "SELECT id FROM system.namespace where name = 't'").Scan(&tableID) - db.Exec(t, `SELECT crdb_internal.create_queue_feed('test_multi', $1)`, tableID) - - // Create two managers for separate readers - qm := NewTestManager(t, srv.ApplicationLayer()) - defer qm.Close() - newReader := func(session uuid.UUID) *Reader { - qm.mu.Lock() - defer qm.mu.Unlock() - - // TODO: use the built ins once readers are properly assigned to a session. 
- reader, err := qm.newReaderLocked(ctx, "test_multi", Session{ - ConnectionID: session, - LivenessID: sqlliveness.SessionID("1"), - }) - require.NoError(t, err) - return reader - } - - var rowsRead1, rowsRead2 atomic.Int64 - session1, session2 := uuid.NewV4(), uuid.NewV4() + db.Exec(t, `SELECT crdb_internal.create_queue_feed('t_queue', $1)`, tableID) ctx, cancel := context.WithCancel(ctx) group, ctx := errgroup.WithContext(ctx) group.Go(func() error { - for ctx.Err() == nil { - db.Exec(t, `INSERT INTO t VALUES ($1, $2, 'foo')`, rand.Intn(3), rand.Int()) - time.Sleep(10 * time.Millisecond) + for i := 0; ctx.Err() == nil; i++ { + _, err := sqlDB.ExecContext(ctx, `INSERT INTO t VALUES ($1, $2)`, i%10, rand.Int()) + if err != nil { + return errors.Wrap(err, "inserting a row") + } } - t.Log("inserter stopping") - return nil + return ctx.Err() }) - reader1 := newReader(session1) - group.Go(func() error { - for ctx.Err() == nil { - rows, err := reader1.GetRows(ctx, 10) + numWriters := rand.Intn(10) + 1 // create [1, 10] writers + rowsSeen := make([]atomic.Int64, numWriters) + t.Logf("spawning %d readers", numWriters) + for i := range numWriters { + group.Go(func() error { + time.Sleep(time.Duration(rand.Intn(1000)) * time.Millisecond) + + conn, err := srv.SQLConn(t).Conn(ctx) if err != nil { - t.Log("reader 1 got error:", err) return err } - reader1.ConfirmReceipt(ctx) - t.Log("reader 1 read", len(rows), "rows") - rowsRead1.Add(int64(len(rows))) - } - t.Log("reader 1 stopping") - return nil - }) - countPartitions := func() map[string]int { - partitions := map[string]int{} - rows := db.Query(t, `SELECT COALESCE(user_session::STRING, ''), COUNT(*) FROM defaultdb.queue_partition_test_multi GROUP BY 1`) - defer rows.Close() - for rows.Next() { - var sessionID string - var count int - require.NoError(t, rows.Scan(&sessionID, &count)) - partitions[sessionID] = count - } - return partitions + for ctx.Err() == nil { + cursor, err := conn.QueryContext(ctx, `SELECT * FROM crdb_internal.select_from_queue_feed('t_queue', 1000)`) + if err != nil { + return err + } + for cursor.Next() { + var e string + if err := cursor.Scan(&e); err != nil { + return errors.Wrap(err, "scanning queue feed row") + } + rowsSeen[i].Add(1) + } + require.NoError(t, cursor.Close()) + } + + return ctx.Err() + }) } - testutils.SucceedsWithin(t, func() error { - partitions := countPartitions() - if partitions[session1.String()] != 2 { - //t.Logf("expected reader '%s' to have 2 partitions, got %+v", session1.String(), partitions) - return errors.Newf("expected reader '%s' to have 2 partitions, got %+v", session1.String(), partitions) + // Wait for every reader to observe rows and every partition to be assigned. 
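+	// testutils.SucceedsSoon retries the closure until it stops returning an
+	// error (for roughly 45s by default), so the "reader %d has not seen any
+	// rows yet" errors below are expected to recur harmlessly while partitions
+	// are still being assigned.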
+ testutils.SucceedsSoon(t, func() error { + for _, row := range db.QueryStr(t, "SELECT partition_id, user_session, user_session_successor FROM defaultdb.queue_partition_t_queue") { + t.Logf("partition row: %v", row) } - t.Log("reader 1 has 2 partitions:", partitions) - return nil - }, 10*time.Second) - t.Log("reader 1 has 2 partitions") - reader2 := newReader(session2) - group.Go(func() error { - t.Log("reader 2 started") - for ctx.Err() == nil { - rows, err := reader2.GetRows(ctx, 10) - if err != nil { - t.Log("reader 2 got error:", err) - return err + seen := make([]int64, numWriters) + for i := range numWriters { + seen[i] = rowsSeen[i].Load() + } + t.Logf("row counts %v", seen) + for i := range numWriters { + if seen[i] == 0 { + return errors.Newf("reader %d has not seen any rows yet", i) } - reader2.ConfirmReceipt(ctx) - rowsRead2.Add(int64(len(rows))) } - return nil - }) - testutils.SucceedsWithin(t, func() error { - partitions := countPartitions() - if partitions[session1.String()] != 1 && partitions[session2.String()] != 1 { - return errors.Newf("expected each reader to have 1 partition found: %+v", partitions) - } - if rowsRead1.Load() == 0 { - return errors.New("expected reader 1 to have read some rows") - } - if rowsRead2.Load() == 0 { - return errors.New("epected reader 2 to have read some rows") + var unassignedPartitions int + db.QueryRow(t, "SELECT COUNT(*) FROM defaultdb.queue_partition_t_queue WHERE user_session IS NULL").Scan(&unassignedPartitions) + if unassignedPartitions != 0 { + return errors.Newf("%d unassigned partitions remain", unassignedPartitions) } + return nil - }, 10*time.Second) - t.Log("partitions are balanced and both readers have read some rows") + }) cancel() - err = group.Wait() - if err != nil { - t.Log("group finished with error:", err) - require.ErrorIs(t, err, context.Canceled) - } + _ = group.Wait() } From 9b9c8602b43660e53743f880fd2ad52dcd104d3c Mon Sep 17 00:00:00 2001 From: Aerin Freilich Date: Fri, 14 Nov 2025 11:14:13 -0500 Subject: [PATCH 45/46] queuefeed: create queuefeed from table name Epic: none --- pkg/sql/queuefeed/BUILD.bazel | 1 + pkg/sql/queuefeed/manager_test.go | 5 +---- pkg/sql/queuefeed/smoke_test.go | 8 ++------ pkg/sql/sem/builtins/builtins.go | 27 ++++++++++++++++++--------- pkg/sql/sem/builtins/fixed_oids.go | 4 ++-- 5 files changed, 24 insertions(+), 21 deletions(-) diff --git a/pkg/sql/queuefeed/BUILD.bazel b/pkg/sql/queuefeed/BUILD.bazel index b9496c6a1df0..646a32eb6859 100644 --- a/pkg/sql/queuefeed/BUILD.bazel +++ b/pkg/sql/queuefeed/BUILD.bazel @@ -72,6 +72,7 @@ go_test( "//pkg/testutils/serverutils", "//pkg/testutils/sqlutils", "//pkg/testutils/testcluster", + "//pkg/util/hlc", "//pkg/util/leaktest", "//pkg/util/log", "//pkg/util/randutil", diff --git a/pkg/sql/queuefeed/manager_test.go b/pkg/sql/queuefeed/manager_test.go index 0090fab42e32..2013626f6ef8 100644 --- a/pkg/sql/queuefeed/manager_test.go +++ b/pkg/sql/queuefeed/manager_test.go @@ -74,10 +74,7 @@ func TestQueuefeedCtxCancel(t *testing.T) { db := sqlutils.MakeSQLRunner(conn) db.Exec(t, `CREATE TABLE t (a string)`) - // get table id - var tableID int64 - db.QueryRow(t, "SELECT id FROM system.namespace where name = 't'").Scan(&tableID) - db.Exec(t, `SELECT crdb_internal.create_queue_feed('hi', $1)`, tableID) + db.Exec(t, `SELECT crdb_internal.create_queue_feed('hi', 't')`) ctx, cancel := context.WithTimeout(ctx, 1*time.Second) defer cancel() diff --git a/pkg/sql/queuefeed/smoke_test.go b/pkg/sql/queuefeed/smoke_test.go index b01fe5e862e5..baf43c0cc8d2 100644 
--- a/pkg/sql/queuefeed/smoke_test.go +++ b/pkg/sql/queuefeed/smoke_test.go @@ -31,9 +31,7 @@ func TestQueuefeedSmoketest(t *testing.T) { _, err := srv.SystemLayer().SQLConn(t).Exec(`SET CLUSTER SETTING kv.rangefeed.enabled = true`) require.NoError(t, err) - var tableID int64 - db.QueryRow(t, "SELECT id FROM system.namespace where name = 't'").Scan(&tableID) - db.Exec(t, `SELECT crdb_internal.create_queue_feed('test_queue', $1)`, tableID) + db.Exec(t, `SELECT crdb_internal.create_queue_feed('test_queue', 't')`) // TODO improve this test once creating the queue sets an accurate cursor. We // should be able to read an expected set of rows. @@ -94,9 +92,7 @@ func TestQueuefeedSmoketestMultipleReaders(t *testing.T) { db.Exec(t, `CREATE TABLE t (k1 INT, k2 INT, v string, PRIMARY KEY (k1, k2))`) db.Exec(t, `ALTER TABLE t SPLIT AT VALUES (1), (2), (3), (4), (5), (6), (7), (8), (9)`) - var tableID int64 - db.QueryRow(t, "SELECT id FROM system.namespace where name = 't'").Scan(&tableID) - db.Exec(t, `SELECT crdb_internal.create_queue_feed('t_queue', $1)`, tableID) + db.Exec(t, `SELECT crdb_internal.create_queue_feed('t_queue', 't')`) ctx, cancel := context.WithCancel(ctx) group, ctx := errgroup.WithContext(ctx) diff --git a/pkg/sql/sem/builtins/builtins.go b/pkg/sql/sem/builtins/builtins.go index 0c1c897b94d4..0a94e1bd0ae2 100644 --- a/pkg/sql/sem/builtins/builtins.go +++ b/pkg/sql/sem/builtins/builtins.go @@ -4646,15 +4646,20 @@ value if you rely on the HLC for accuracy.`, "crdb_internal.create_queue_feed": makeBuiltin(defProps(), tree.Overload{ Types: tree.ParamTypes{ {Name: "queue_name", Typ: types.String}, - {Name: "table_descriptor_id", Typ: types.Int}, + {Name: "table_name", Typ: types.String}, }, Volatility: volatility.Volatile, ReturnType: tree.FixedReturnType(types.Void), Fn: func(ctx context.Context, evalCtx *eval.Context, args tree.Datums) (tree.Datum, error) { qn := args[0].(*tree.DString) + tableName := tree.MustBeDString(args[1]) + dOid, err := eval.ParseDOid(ctx, evalCtx, string(tableName), types.RegClass) + if err != nil { + return nil, err + } + qm := getQueueManager(evalCtx) - tID := args[1].(*tree.DInt) - if err := qm.CreateQueue(ctx, string(*qn), int64(*tID)); err != nil { + if err := qm.CreateQueue(ctx, string(*qn), int64(dOid.Oid)); err != nil { return nil, err } return tree.DVoidDatum, nil @@ -4664,24 +4669,28 @@ value if you rely on the HLC for accuracy.`, "crdb_internal.create_queue_feed_from_cursor": makeBuiltin(defProps(), tree.Overload{ Types: tree.ParamTypes{ {Name: "queue_name", Typ: types.String}, - {Name: "table_descriptor_id", Typ: types.Int}, + {Name: "table_name", Typ: types.String}, {Name: "cursor", Typ: types.Decimal}, }, Volatility: volatility.Volatile, ReturnType: tree.FixedReturnType(types.Void), Fn: func(ctx context.Context, evalCtx *eval.Context, args tree.Datums) (tree.Datum, error) { qn := args[0].(*tree.DString) - qm := getQueueManager(evalCtx) - tID := args[1].(*tree.DInt) - cursorDecimal := tree.MustBeDDecimal(args[2]) - // Convert the decimal cursor to hlc.Timestamp + tableName := tree.MustBeDString(args[1]) + dOid, err := eval.ParseDOid(ctx, evalCtx, string(tableName), types.RegClass) + if err != nil { + return nil, err + } + + cursorDecimal := tree.MustBeDDecimal(args[2]) cursor, err := hlc.DecimalToHLC(&cursorDecimal.Decimal) if err != nil { return nil, errors.Wrap(err, "converting cursor decimal to HLC timestamp") } - if err := qm.CreateQueueFromCursor(ctx, string(*qn), int64(*tID), cursor); err != nil { + qm := getQueueManager(evalCtx) + if err 
:= qm.CreateQueueFromCursor(ctx, string(*qn), int64(dOid.Oid), cursor); err != nil { return nil, err } return tree.DVoidDatum, nil diff --git a/pkg/sql/sem/builtins/fixed_oids.go b/pkg/sql/sem/builtins/fixed_oids.go index 132939087c3a..dd1aa3f035c2 100644 --- a/pkg/sql/sem/builtins/fixed_oids.go +++ b/pkg/sql/sem/builtins/fixed_oids.go @@ -2863,10 +2863,10 @@ var builtinOidsArray = []string{ 2908: `crdb_internal.inject_hint(statement_fingerprint: string, donor_sql: string) -> int`, 2909: `crdb_internal.clear_statement_hints_cache() -> void`, 2910: `crdb_internal.await_statement_hints_cache() -> void`, - 2911: `crdb_internal.create_queue_feed(queue_name: string, table_descriptor_id: int) -> void`, + 2911: `crdb_internal.create_queue_feed(queue_name: string, table_name: string) -> void`, 2912: `crdb_internal.select_from_queue_feed(queue_name: string, limit: int) -> jsonb`, 2913: `crdb_internal.select_array_from_queue_feed(queue_name: string, limit: int) -> jsonb[]`, - 2914: `crdb_internal.create_queue_feed_from_cursor(queue_name: string, table_descriptor_id: int, cursor: decimal) -> void`, + 2914: `crdb_internal.create_queue_feed_from_cursor(queue_name: string, table_name: string, cursor: decimal) -> void`, } var builtinOidsBySignature map[string]oid.Oid From 4288fab913d25c3e8056fafda8d86ea00fa18449 Mon Sep 17 00:00:00 2001 From: Jeff Swenson Date: Fri, 14 Nov 2025 11:52:53 -0500 Subject: [PATCH 46/46] roachtest: add queuefeed roachtest --- pkg/cmd/roachtest/tests/BUILD.bazel | 2 + pkg/cmd/roachtest/tests/queuefeed.go | 145 +++++++++++++++++++++++++++ pkg/cmd/roachtest/tests/registry.go | 1 + pkg/sql/queuefeed/reader.go | 5 +- 4 files changed, 152 insertions(+), 1 deletion(-) create mode 100644 pkg/cmd/roachtest/tests/queuefeed.go diff --git a/pkg/cmd/roachtest/tests/BUILD.bazel b/pkg/cmd/roachtest/tests/BUILD.bazel index c51a84bc1d85..1b2bc3980a91 100644 --- a/pkg/cmd/roachtest/tests/BUILD.bazel +++ b/pkg/cmd/roachtest/tests/BUILD.bazel @@ -167,6 +167,7 @@ go_library( "ptp.go", "query_comparison_util.go", "queue.go", + "queuefeed.go", "quit.go", "rapid_restart.go", "rebalance_load.go", @@ -365,6 +366,7 @@ go_library( "@org_golang_google_protobuf//proto", "@org_golang_x_exp//maps", "@org_golang_x_oauth2//clientcredentials", + "@org_golang_x_sync//errgroup", "@org_golang_x_text//cases", "@org_golang_x_text//language", ], diff --git a/pkg/cmd/roachtest/tests/queuefeed.go b/pkg/cmd/roachtest/tests/queuefeed.go new file mode 100644 index 000000000000..a2a2432d9968 --- /dev/null +++ b/pkg/cmd/roachtest/tests/queuefeed.go @@ -0,0 +1,145 @@ +// Copyright 2025 The Cockroach Authors. +// +// Use of this software is governed by the CockroachDB Software License +// included in the /LICENSE file. 
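+
+// This roachtest drives the queue feed end to end on a real cluster: it runs
+// the kv workload against the kv table while ten concurrent readers drain the
+// feed, logging per-reader throughput once a second. The SQL surface it
+// exercises is roughly:
+//
+//	SELECT crdb_internal.create_queue_feed('kv_queue', <table>);
+//	SELECT count(*) FROM crdb_internal.select_from_queue_feed('kv_queue', 10000);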
+ +package tests + +import ( + "context" + "database/sql" + "fmt" + "math/rand" + "strings" + "sync/atomic" + "time" + + "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster" + "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option" + "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/registry" + "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/spec" + "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test" + "github.com/cockroachdb/cockroach/pkg/roachprod/install" + "github.com/cockroachdb/errors" + "github.com/stretchr/testify/require" + "golang.org/x/sync/errgroup" +) + +func registerQueuefeed(r registry.Registry) { + r.Add(registry.TestSpec{ + Name: "queuefeed", + Owner: registry.OwnerCDC, + Cluster: r.MakeClusterSpec(4, spec.WorkloadNode()), + Run: func(ctx context.Context, t test.Test, c cluster.Cluster) { + runQueuefeed(ctx, t, c) + }, + CompatibleClouds: registry.AllClouds, + Suites: registry.Suites(registry.Nightly), + }) +} + +func runQueuefeed(ctx context.Context, t test.Test, c cluster.Cluster) { + c.Start(ctx, t.L(), option.DefaultStartOpts(), install.MakeClusterSettings(), c.CRDBNodes()) + + db := c.Conn(ctx, t.L(), 1) + defer db.Close() + + _, err := db.ExecContext(ctx, "SET CLUSTER SETTING kv.rangefeed.enabled = true") + require.NoError(t, errors.Wrap(err, "enabling rangefeeds")) + + t.Status("initializing kv workload") + c.Run(ctx, option.WithNodes(c.WorkloadNode()), + "./cockroach workload init kv --splits=100 {pgurl:1}") + + var tableID int64 + err = db.QueryRowContext(ctx, "SELECT id FROM system.namespace WHERE name = 'kv' and \"parentSchemaID\" <> 0;").Scan(&tableID) + require.NoError(t, err) + + t.Status("creating kv_queue") + _, err = db.ExecContext(ctx, "SELECT crdb_internal.create_queue_feed('kv_queue', $1)", tableID) + require.NoError(t, err) + + t.Status("running queue feed queries") + + ctx, cancel := context.WithTimeout(ctx, 10*time.Minute) + defer cancel() + + g, ctx := errgroup.WithContext(ctx) + + const numReaders = 10 + counters := make([]*atomic.Int64, numReaders) + for i := range counters { + counters[i] = &atomic.Int64{} + } + + g.Go(func() error { + return c.RunE(ctx, option.WithNodes(c.WorkloadNode()), + "./cockroach workload run kv --duration=10m {pgurl:1}") + }) + + g.Go(func() error { + ticker := time.NewTicker(1 * time.Second) + defer ticker.Stop() + lastCounts := make([]int64, numReaders) + + for { + select { + case <-ctx.Done(): + return ctx.Err() + case <-ticker.C: + qps := make([]string, numReaders) + for i := 0; i < numReaders; i++ { + currentCount := counters[i].Load() + ratePerSec := currentCount - lastCounts[i] + qps[i] = fmt.Sprintf("%d", ratePerSec) + lastCounts[i] = currentCount + } + t.L().Printf("qps: %s", strings.Join(qps, ",")) + } + } + }) + + dbNodes := 1 // TODO fix bug that occurs with 3 + nodePool := make([]*sql.DB, numReaders) + for i := range dbNodes { + nodePool[i] = c.Conn(ctx, t.L(), i+1) + } + defer func() { + for i := range dbNodes { + _ = nodePool[i].Close() + } + }() + + for i := 0; i < numReaders; i++ { + readerIndex := i + g.Go(func() error { + // Stagger the readers a bit. This helps test re-distribution of + // partitions. + // TODO fix bug that occurs with jitter + // time.Sleep(time.Duration(rand.Intn(int(time.Minute)))) + + // Connect to a random node to simulate a tcp load balancer. 
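+			// With dbNodes pinned to 1 for now, every reader actually lands on
+			// node 1; once the multi-node issue noted above is fixed, raising
+			// dbNodes back toward len(c.CRDBNodes()) would restore the random
+			// spread across nodes.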
+ conn, err := nodePool[rand.Intn(dbNodes)].Conn(ctx) + if err != nil { + return errors.Wrap(err, "getting connection for the queuefeed reader") + } + defer func() { _ = conn.Close() }() + + for ctx.Err() == nil { + var count int + err := conn.QueryRowContext(ctx, + "SELECT count(*) FROM crdb_internal.select_from_queue_feed('kv_queue', 10000)").Scan(&count) + if err != nil { + return err + } + counters[readerIndex].Add(int64(count)) + } + return ctx.Err() + }) + } + + err = g.Wait() + if err != nil && ctx.Err() == nil { + t.Fatal(err) + } +} diff --git a/pkg/cmd/roachtest/tests/registry.go b/pkg/cmd/roachtest/tests/registry.go index 400d25fa4234..cec911c3bc66 100644 --- a/pkg/cmd/roachtest/tests/registry.go +++ b/pkg/cmd/roachtest/tests/registry.go @@ -131,6 +131,7 @@ func RegisterTests(r registry.Registry) { registerPruneDanglingSnapshotsAndDisks(r) registerPTP(r) registerQueue(r) + registerQueuefeed(r) registerQuitTransfersLeases(r) registerRebalanceLoad(r) registerReplicaGC(r) diff --git a/pkg/sql/queuefeed/reader.go b/pkg/sql/queuefeed/reader.go index 9dcd201b01ba..35c0d360c765 100644 --- a/pkg/sql/queuefeed/reader.go +++ b/pkg/sql/queuefeed/reader.go @@ -231,7 +231,10 @@ func (r *Reader) setupRangefeed(ctx context.Context, assignment *Assignment) err return errors.Wrapf(err, "reading checkpoint for partition %d", partition.ID) } if !checkpointTS.IsEmpty() { - frontier.Forward(partition.Span, checkpointTS) + _, err := frontier.Forward(partition.Span, checkpointTS) + if err != nil { + return errors.Wrapf(err, "advancing frontier for partition %d to checkpoint %s", partition.ID, checkpointTS) + } } else { return errors.Errorf("checkpoint is empty for partition %d", partition.ID) }