@@ -12,6 +12,7 @@ import (
1212 "net"
1313 "net/url"
1414 "testing"
15+ "time"
1516
1617 "github.com/cockroachdb/cockroach/pkg/base"
1718 _ "github.com/cockroachdb/cockroach/pkg/ccl/changefeedccl"
@@ -742,3 +743,69 @@ func TestPhysicalReplicationGatewayRoute(t *testing.T) {
742743 progress := jobutils .GetJobProgress (t , systemDB , jobspb .JobID (jobID ))
743744 require .Empty (t , progress .Details .(* jobspb.Progress_StreamIngest ).StreamIngest .PartitionConnUris )
744745}
746+
747+ func TestPhysicalReplicationCancelsProducerOnCutoverFromSystem (t * testing.T ) {
748+ defer leaktest .AfterTest (t )()
749+ defer log .Scope (t ).Close (t )
750+
751+ // This test verifies that the span reconciliation job on a promoted standby
752+ // tenant that was streaming from a system tenant is able to make progress.
753+ // See https://github.com/cockroachdb/cockroach/issues/155444 for more
754+ // details.
755+ ctx := context .Background ()
756+ args := replicationtestutils .DefaultTenantStreamingClustersArgs
757+ args .MultitenantSingleClusterNumNodes = 1
758+ args .SrcTenantID = roachpb .SystemTenantID
759+ args .SrcTenantName = "system"
760+
761+ c , cleanup := replicationtestutils .CreateMultiTenantStreamingCluster (ctx , t , args )
762+ defer cleanup ()
763+
764+ producerJobID , consumerJobID := c .StartStreamReplication (ctx )
765+ jobutils .WaitForJobToRun (c .T , c .SrcSysSQL , jobspb .JobID (producerJobID ))
766+ jobutils .WaitForJobToRun (c .T , c .DestSysSQL , jobspb .JobID (consumerJobID ))
767+
768+ replicationtestutils .WaitUntilStartTimeReached (t , c .DestSysSQL , jobspb .JobID (consumerJobID ))
769+ srcTime := c .SrcCluster .Server (0 ).Clock ().Now ()
770+ c .Cutover (ctx , producerJobID , consumerJobID , srcTime .GoTime (), false /* async */ )
771+ destCleanup := c .StartDestTenant (ctx , nil /* withTestingKnobs */ , 0 /* server */ )
772+ defer destCleanup ()
773+
774+ // We speed up the span config reconciliation job manager so that it will
775+ // quickly detect the missing reconciliation job on the tenant (the previous
776+ // one was canceled as it was replicated) and spin up a new one.
777+ c .DestSysSQL .Exec (t , "ALTER TENANT ALL SET CLUSTER SETTING spanconfig.reconciliation_job.check_interval = '10ms'" )
778+
779+ registry := c .DestTenantServer .JobRegistry ().(* jobs.Registry )
780+ // We nudge the adoption queue to pick up the replicated stream producer job
781+ // so that it will be dropped.
782+ registry .TestingNudgeAdoptionQueue ()
783+
784+ now := c .DestSysServer .Clock ().Now ()
785+ testutils .SucceedsWithin (t , func () error {
786+ var spanConfigJobID jobspb.JobID
787+ rows := c .DestTenantSQL .Query (
788+ t , `SELECT id FROM system.jobs WHERE job_type = 'AUTO SPAN CONFIG RECONCILIATION' AND status = 'running'` ,
789+ )
790+ defer rows .Close ()
791+ if ! rows .Next () {
792+ return errors .New ("no running span config reconciliation job found" )
793+ }
794+ if err := rows .Scan (& spanConfigJobID ); err != nil {
795+ return err
796+ }
797+ spanConfigJob , err := registry .LoadJob (ctx , spanConfigJobID )
798+ require .NoError (t , err )
799+ spanConfigProg := spanConfigJob .Progress ().
800+ Details .(* jobspb.Progress_AutoSpanConfigReconciliation ).
801+ AutoSpanConfigReconciliation
802+
803+ if spanConfigProg .Checkpoint .Less (now ) {
804+ return errors .Newf (
805+ "waiting for span config reconciliation job to checkpoint past %v, at %v" ,
806+ now , spanConfigProg .Checkpoint ,
807+ )
808+ }
809+ return nil
810+ }, 2 * time .Minute )
811+ }
0 commit comments