Skip to content

Commit b97d3b3

Browse files
craig[bot]nameisbhaskar
andcommitted
Merge #155081
155081: workload_generator: fix constraint prefix check and computed columns r=shailendra-patel a=nameisbhaskar This PR has the following changes: 1. While extracting the table schemas from the create_statements file, we use a prefix-based check to identify whether a line contains column information or a constraint. This check fails when a column name shares the constraint keyword as a prefix. e.g. a line beginning with "CHECK" is a constraint. But, if the column name is "checksum", this prefix matches and the column is misidentified as a constraint. To fix this, the check is changed to be "CHECK ". A TODO is added to use the sql parser instead of this string-match and regex logic. 2. Computed columns are not supported in import. So, the computed columns are ignored if they are present in the table definition. Release Note: None Epic: None Co-authored-by: Bhaskarjyoti Bora <bhaskar.bora@cockroachlabs.com>
2 parents c4dbc61 + 1eab839 commit b97d3b3

File tree

3 files changed

+207
-4
lines changed

3 files changed

+207
-4
lines changed

pkg/workload/workload_generator/schema_generator.go

Lines changed: 123 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ const (
3434
checkInlinePattern = `(?i)\bCHECK\s*\(`
3535
primaryKeyPattern = `(?i)\bPRIMARY\s+KEY\b`
3636
uniquePattern = `(?i)\bUNIQUE\b`
37+
computedPattern = `(?i)\b(AS)\b`
3738

3839
// Patterns for table-level constraints
3940
tablePrimaryKeyPattern = `\((.*?)\)`
@@ -60,6 +61,7 @@ var (
6061
checkInlineRe = regexp.MustCompile(checkInlinePattern)
6162
primaryKeyRe = regexp.MustCompile(primaryKeyPattern)
6263
uniqueRe = regexp.MustCompile(uniquePattern)
64+
computedRe = regexp.MustCompile(computedPattern)
6365
//following regexes are for table constraint handling
6466
tablePrimaryKeyRe = regexp.MustCompile(tablePrimaryKeyPattern)
6567
tableUniqueRe = regexp.MustCompile(tableUniquePattern)
@@ -352,6 +354,7 @@ func generateDDLFromCSV(
352354
// It parses the table name, columns, and constraints (primary keys, unique constraints,
353355
// foreign keys, and check constraints) from the DDL statement and returns a structured
354356
// representation of the table schema.
357+
// TODO: (@nameisbhaskar) use sql parser instead of regexes - https://github.com/cockroachdb/cockroach/issues/155173
355358
func ParseDDL(ddl string) (*TableSchema, error) {
356359
tableMatch := tablePattern.FindStringSubmatch(ddl)
357360
if tableMatch == nil {
@@ -439,7 +442,8 @@ func hasConstrainingPrefix(up string) bool {
439442
sqlFamily,
440443
}
441444
for _, p := range prefixes {
442-
if strings.HasPrefix(up, p) {
445+
// a space is added after the prefix to avoid matches like "checksum" as a field name.
446+
if strings.HasPrefix(up, p+" ") {
443447
return true
444448
}
445449
}
@@ -453,7 +457,7 @@ func processColumnDefs(table *TableSchema, columnDefs []string) {
453457
// Process each column definition
454458
for _, columnDef := range columnDefs {
455459
colMatch := colPattern.FindStringSubmatch(columnDef)
456-
if colMatch == nil {
460+
if colMatch == nil || computedRe.MatchString(columnDef) {
457461
continue // Skip if pattern doesn't match
458462
}
459463
// Extract column properties from regex matches
@@ -659,6 +663,120 @@ func openCreateStatementsTSV(zipDir string) (*os.File, error) {
659663
return f, nil
660664
}
661665

666+
// removeComputedColumns removes computed columns (columns with AS clause) from a CREATE TABLE statement.
667+
// It parses the statement, identifies computed columns, and reconstructs the statement without them.
668+
// It also removes any indexes or constraints that reference the computed columns.
669+
func removeComputedColumns(stmt string) string {
670+
columnBlockMatch := bodyRe.FindStringSubmatch(stmt)
671+
if columnBlockMatch == nil {
672+
return stmt
673+
}
674+
675+
body := columnBlockMatch[1]
676+
suffix := columnBlockMatch[2]
677+
678+
// column definitions and table constraints are split based on commas
679+
var partsList []string
680+
buf := ""
681+
depth := 0
682+
for _, ch := range body {
683+
switch ch {
684+
case '(':
685+
depth++
686+
buf += string(ch)
687+
case ')':
688+
depth--
689+
buf += string(ch)
690+
case ',':
691+
if depth == 0 {
692+
partsList = append(partsList, strings.TrimSpace(buf))
693+
buf = ""
694+
} else {
695+
buf += string(ch)
696+
}
697+
default:
698+
buf += string(ch)
699+
}
700+
}
701+
if strings.TrimSpace(buf) != "" {
702+
partsList = append(partsList, strings.TrimSpace(buf))
703+
}
704+
705+
// computed column names are collected for reference
706+
computedColumnNames := make(map[string]struct{})
707+
for _, p := range partsList {
708+
// Computed column (contains AS keyword) is identified using the computedRe regex
709+
// Example: "haserror BOOL NULL AS (errordetail != '':::STRING) VIRTUAL"
710+
// This will match because it contains "AS"
711+
// Note: This is a simple heuristic and may need to be improved for complex cases
712+
// where "AS" might appear in other contexts.
713+
// The regex is case-insensitive to match "AS", "as", "As", etc.
714+
// It looks for the word "AS" surrounded by word boundaries to avoid partial matches.
715+
// This approach assumes that computed columns are defined with the "AS" keyword.
716+
// If the SQL dialect changes or has different syntax, this may need to be adjusted.
717+
// The regex is applied to each part of the column block to identify computed columns.
718+
// If a part matches, the column name is extracted and added to the computedColumnNames map.
719+
if computedRe.MatchString(p) {
720+
colMatch := colPattern.FindStringSubmatch(p)
721+
if len(colMatch) > 1 {
722+
colName := strings.Trim(colMatch[1], `"`)
723+
computedColumnNames[colName] = struct{}{}
724+
}
725+
}
726+
}
727+
728+
// Computed columns and constraints/indexes that reference them are filtered out
729+
var filteredParts []string
730+
for _, p := range partsList {
731+
// Skip computed columns
732+
if computedRe.MatchString(p) {
733+
continue
734+
}
735+
736+
// If this is an index or constraint that references a computed column, it is skipped.
737+
shouldSkip := false
738+
pUpper := strings.ToUpper(p)
739+
if strings.HasPrefix(pUpper, "INDEX ") || strings.HasPrefix(pUpper, "UNIQUE INDEX ") {
740+
// Computed column names are checked in the index definition
741+
// If any computed column is found, the index is skipped
742+
// This is a simple substring check and may need to be improved for complex cases
743+
// where column names might be part of other identifiers.
744+
// For example, if a computed column is "col1", an index on "col10" should not be skipped.
745+
for colName := range computedColumnNames {
746+
// A regex pattern to match the column name in the index definition is created.
747+
pattern := regexp.MustCompile(`\b` + regexp.QuoteMeta(colName) + `\b`)
748+
if pattern.MatchString(p) {
749+
shouldSkip = true
750+
break
751+
}
752+
}
753+
}
754+
755+
if !shouldSkip {
756+
filteredParts = append(filteredParts, p)
757+
}
758+
}
759+
760+
// If all parts were filtered out, return original statement
761+
if len(filteredParts) == 0 {
762+
return stmt
763+
}
764+
765+
// The statement is reconstructed without computed columns and related constraints/indexes
766+
// The original formatting (indentation, line breaks) is preserved as much as possible
767+
// by joining the filtered parts with commas and new lines.
768+
tableMatch := tablePattern.FindStringSubmatch(stmt)
769+
if tableMatch == nil {
770+
return stmt
771+
}
772+
773+
// The CREATE TABLE is the part before the column block
774+
createTablePart := stmt[:strings.Index(stmt, "(")]
775+
reconstructed := createTablePart + "(\n\t" + strings.Join(filteredParts, ",\n\t") + "\n)" + suffix
776+
777+
return reconstructed
778+
}
779+
662780
// processDDLRecord inspects one TSV row and, if it represents a public table
663781
// in dbName, normalizes its CREATE TABLE stmt and appends it to order/statements.
664782
// It returns the fully qualified table name and table statements.
@@ -673,6 +791,9 @@ func processDDLRecord(
673791
stmt = createTableRe.ReplaceAllString(stmt, "${1}IF NOT EXISTS ")
674792
}
675793

794+
// 5) Remove computed columns from the statement
795+
stmt = removeComputedColumns(stmt)
796+
676797
return fmt.Sprintf("%s.%s.%s", dbName, schemaName, tableName), stmt
677798
}
678799

pkg/workload/workload_generator/schema_generator_test.go

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,8 +26,11 @@ func TestSplitColumnDefsAndTableConstraints(t *testing.T) {
2626
id INT PRIMARY KEY,
2727
name TEXT NOT NULL,
2828
age INT DEFAULT 30,
29+
checksum BYTES DEFAULT 30,
30+
haserror BOOL NULL AS (errordetail != '':::STRING) VIRTUAL,
2931
CONSTRAINT user_pk PRIMARY KEY (id),
3032
UNIQUE (name),
33+
INDEX emigratingbuckets_haserror_idx (haserror DESC),
3134
FOREIGN KEY (age) REFERENCES other(age),
3235
CHECK (age > 0)
3336
`
@@ -36,16 +39,89 @@ func TestSplitColumnDefsAndTableConstraints(t *testing.T) {
3639
"id INT PRIMARY KEY",
3740
"name TEXT NOT NULL",
3841
"age INT DEFAULT 30",
42+
"checksum BYTES DEFAULT 30",
43+
"haserror BOOL NULL AS (errordetail != '':::STRING) VIRTUAL",
3944
}
4045
wantConstraints := []string{
4146
"CONSTRAINT user_pk PRIMARY KEY (id)",
4247
"UNIQUE (name)",
48+
"INDEX emigratingbuckets_haserror_idx (haserror DESC)",
4349
"FOREIGN KEY (age) REFERENCES other(age)",
4450
"CHECK (age > 0)",
4551
}
4652
assert.Equal(t, wantCols, cols)
4753
assert.Equal(t, wantConstraints, constraints)
4854
}
55+
// TestRemoveComputedColumns verifies that removeComputedColumns drops the
// VIRTUAL computed column (haserror, declared with an AS clause) and the
// index that references it (emigratingbuckets_haserror_idx), while leaving
// every other column, index, and constraint of the statement intact.
// NOTE(review): literal whitespace below reconstructed to match
// removeComputedColumns's ",\n\t" join format — confirm against original.
func TestRemoveComputedColumns(t *testing.T) {
	stmt := `
CREATE TABLE public.emigratingbuckets (
	bucketnum INT8 NOT NULL,
	bucket STRING NOT NULL,
	shardname STRING NOT NULL,
	destcluster STRING NOT NULL,
	starttime TIMESTAMP(0) NULL,
	startbinlogfile STRING NULL,
	startbinlogpos INT4 NULL,
	generation INT4 NOT NULL DEFAULT 0:::INT8,
	state STRING NOT NULL,
	activeon STRING NOT NULL,
	lastheartbeat TIMESTAMP(0) NULL,
	enqd BOOL NULL,
	enqdfirst BOOL NULL,
	enqdtime TIMESTAMP(6) NULL,
	laststatechangetime TIMESTAMP(0) NOT NULL,
	lastbinlogfile STRING NOT NULL,
	lastbinlogpos INT4 NOT NULL,
	laststatusupdate TIMESTAMP(0) NULL,
	errordetail STRING NOT NULL,
	migrationparams JSONB NULL,
	migrationstatus JSONB NULL,
	srcbucketinfo JSONB NULL,
	haserror BOOL NULL AS (errordetail != '':::STRING) VIRTUAL,
	CONSTRAINT ""primary"" PRIMARY KEY (bucketnum ASC),
	INDEX emigratingbuckets_state_idx (state ASC),
	INDEX emigratingbuckets_activeon_idx (activeon DESC),
	INDEX emigratingbuckets_shardname_idx (shardname ASC),
	INDEX emigratingbuckets_destcluster_idx (destcluster ASC),
	INDEX emigratingbuckets_enqd_idx (enqd DESC),
	INDEX emigratingbuckets_enqdfirst_idx (enqdfirst DESC),
	INDEX emigratingbuckets_haserror_idx (haserror DESC),
	CONSTRAINT check_bucketnum CHECK (bucketnum > 0:::INT8)
)`
	// Expected: the computed column and its index are gone; all other parts
	// survive in order, rejoined with ",\n\t" by removeComputedColumns.
	assert.Equal(t, `
CREATE TABLE public.emigratingbuckets (
	bucketnum INT8 NOT NULL,
	bucket STRING NOT NULL,
	shardname STRING NOT NULL,
	destcluster STRING NOT NULL,
	starttime TIMESTAMP(0) NULL,
	startbinlogfile STRING NULL,
	startbinlogpos INT4 NULL,
	generation INT4 NOT NULL DEFAULT 0:::INT8,
	state STRING NOT NULL,
	activeon STRING NOT NULL,
	lastheartbeat TIMESTAMP(0) NULL,
	enqd BOOL NULL,
	enqdfirst BOOL NULL,
	enqdtime TIMESTAMP(6) NULL,
	laststatechangetime TIMESTAMP(0) NOT NULL,
	lastbinlogfile STRING NOT NULL,
	lastbinlogpos INT4 NOT NULL,
	laststatusupdate TIMESTAMP(0) NULL,
	errordetail STRING NOT NULL,
	migrationparams JSONB NULL,
	migrationstatus JSONB NULL,
	srcbucketinfo JSONB NULL,
	CONSTRAINT ""primary"" PRIMARY KEY (bucketnum ASC),
	INDEX emigratingbuckets_state_idx (state ASC),
	INDEX emigratingbuckets_activeon_idx (activeon DESC),
	INDEX emigratingbuckets_shardname_idx (shardname ASC),
	INDEX emigratingbuckets_destcluster_idx (destcluster ASC),
	INDEX emigratingbuckets_enqd_idx (enqd DESC),
	INDEX emigratingbuckets_enqdfirst_idx (enqdfirst DESC),
	CONSTRAINT check_bucketnum CHECK (bucketnum > 0:::INT8)
)`, removeComputedColumns(stmt))
}
49125

50126
func TestParseDDL(t *testing.T) {
51127
t.Run("Basic DDL", func(t *testing.T) {

pkg/workload/workload_generator/schema_utils.go

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -492,11 +492,17 @@ func mapSQLType(sql string, col *Column, rng *rand.Rand) (GeneratorType, map[str
492492
return GenTypeString, args
493493
}
494494

495-
func mapIntegerType(_ string, col *Column, args map[string]any) (GeneratorType, map[string]any) {
495+
func mapIntegerType(sql string, col *Column, args map[string]any) (GeneratorType, map[string]any) {
496496
if col.IsPrimaryKey || col.IsUnique {
497497
return GenTypeSequence, map[string]any{"start": 1, "seed": args["seed"]}
498498
}
499-
setArgsRange(args, -(1 << 31), (1<<31)-1)
499+
if sql == "smallint" || sql == "int2" {
500+
setArgsRange(args, -(1 << 15), (1<<15)-1)
501+
} else {
502+
// default is regular INT
503+
// INT is 32-bit in CockroachDB even on 64-bit platforms
504+
setArgsRange(args, -(1 << 31), (1<<31)-1)
505+
}
500506
return GenTypeInteger, args
501507
}
502508

0 commit comments

Comments
 (0)