Skip to content

Commit 54d7c77

Browse files
authored
feat(compass-collection): Avoid sending sample values for binary fields - CLOUDP-350484 (#7439)
* Skip binary for sample values * Comment * WIP
1 parent dc92e06 commit 54d7c77

File tree

2 files changed

+160
-11
lines changed

2 files changed

+160
-11
lines changed

packages/compass-collection/src/transform-schema-to-field-info.spec.ts

Lines changed: 131 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -148,7 +148,7 @@ describe('processSchema', function () {
148148
expect(result.arrayLengthMap).to.deep.equal({});
149149
});
150150

151-
it('limits sample values to 10', function () {
151+
it('limits sample values to 5', function () {
152152
const manyValues = Array.from({ length: 20 }, (_, i) => `value${i}`);
153153

154154
const schema: Schema = {
@@ -177,9 +177,9 @@ describe('processSchema', function () {
177177

178178
const result = processSchema(schema);
179179

180-
expect(result.fieldInfo.field.sampleValues).to.have.length(10);
180+
expect(result.fieldInfo.field.sampleValues).to.have.length(5);
181181
expect(result.fieldInfo.field.sampleValues).to.deep.equal(
182-
manyValues.slice(0, 10)
182+
manyValues.slice(0, 5)
183183
);
184184
});
185185

@@ -486,7 +486,7 @@ describe('processSchema', function () {
486486
},
487487
binary: {
488488
type: 'Binary',
489-
sampleValues: ['dGVzdA=='],
489+
// sampleValues property should be absent for Binary fields
490490
probability: 1.0,
491491
},
492492
regex: {
@@ -533,6 +533,132 @@ describe('processSchema', function () {
533533
expect(result.arrayLengthMap).to.deep.equal({});
534534
});
535535

536+
it('excludes sample values for Binary fields to avoid massive payloads', function () {
537+
const embedding = new Binary(Buffer.from([1, 2, 3, 4])); // Test Binary field logic
538+
const schema: Schema = {
539+
fields: [
540+
{
541+
name: 'plot_embedding',
542+
path: ['plot_embedding'],
543+
count: 1,
544+
type: ['Binary'],
545+
probability: 1.0,
546+
hasDuplicates: false,
547+
types: [
548+
{
549+
name: 'Binary',
550+
bsonType: 'Binary',
551+
path: ['plot_embedding'],
552+
count: 1,
553+
probability: 1.0,
554+
values: [embedding],
555+
},
556+
],
557+
},
558+
{
559+
name: 'regular_field',
560+
path: ['regular_field'],
561+
count: 1,
562+
type: ['String'],
563+
probability: 1.0,
564+
hasDuplicates: false,
565+
types: [
566+
{
567+
name: 'String',
568+
bsonType: 'String',
569+
path: ['regular_field'],
570+
count: 1,
571+
probability: 1.0,
572+
values: ['test'],
573+
},
574+
],
575+
},
576+
],
577+
count: 1,
578+
};
579+
580+
const result = processSchema(schema);
581+
582+
expect(result.fieldInfo).to.deep.equal({
583+
plot_embedding: {
584+
type: 'Binary',
585+
// sampleValues property should be absent for Binary fields
586+
probability: 1.0,
587+
},
588+
regular_field: {
589+
type: 'String',
590+
sampleValues: ['test'], // Should still have sample values for non-Binary fields
591+
probability: 1.0,
592+
},
593+
});
594+
expect(result.arrayLengthMap).to.deep.equal({});
595+
});
596+
597+
it('truncates very long sample values to prevent massive payloads', function () {
598+
const longText = 'A'.repeat(1000);
599+
const schema: Schema = {
600+
fields: [
601+
{
602+
name: 'longField',
603+
path: ['longField'],
604+
count: 1,
605+
type: ['String'],
606+
probability: 1.0,
607+
hasDuplicates: false,
608+
types: [
609+
{
610+
name: 'String',
611+
bsonType: 'String',
612+
path: ['longField'],
613+
count: 1,
614+
probability: 1.0,
615+
values: [longText, 'short'],
616+
},
617+
],
618+
},
619+
],
620+
count: 1,
621+
};
622+
623+
const result = processSchema(schema);
624+
625+
expect(result.fieldInfo.longField.sampleValues).to.have.length(2);
626+
expect(result.fieldInfo.longField.sampleValues![0]).to.equal(
627+
'A'.repeat(300) + '...'
628+
);
629+
expect(result.fieldInfo.longField.sampleValues![1]).to.equal('short');
630+
});
631+
632+
it('rounds probability to 2 decimal places', function () {
633+
const schema: Schema = {
634+
fields: [
635+
{
636+
name: 'field',
637+
path: ['field'],
638+
count: 1,
639+
type: ['String'],
640+
probability: 0.23076923076923078, // Very precise decimal
641+
hasDuplicates: false,
642+
types: [
643+
{
644+
name: 'String',
645+
bsonType: 'String',
646+
path: ['field'],
647+
count: 1,
648+
probability: 1.0,
649+
values: ['test'],
650+
},
651+
],
652+
},
653+
],
654+
count: 1,
655+
};
656+
657+
const result = processSchema(schema);
658+
659+
expect(result.fieldInfo.field.probability).to.equal(0.23); // Rounded to 2 decimal places
660+
});
661+
536662
it('transforms nested document field', function () {
537663
const schema: Schema = {
538664
fields: [
@@ -908,7 +1034,7 @@ describe('processSchema', function () {
9081034
expect(result.fieldInfo).to.deep.equal({
9091035
'cube[][][]': {
9101036
type: 'Number',
911-
sampleValues: [1, 2, 3, 4, 5, 6, 7, 8],
1037+
sampleValues: [1, 2, 3, 4, 5],
9121038
probability: 1.0,
9131039
},
9141040
});

packages/compass-collection/src/transform-schema-to-field-info.ts

Lines changed: 29 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,14 @@ import {
4343
/**
4444
* Maximum number of sample values to include for each field
4545
*/
46-
const MAX_SAMPLE_VALUES = 10;
46+
const MAX_SAMPLE_VALUES = 5;
47+
48+
/**
49+
* Maximum length for individual sample values (to prevent massive payloads)
50+
* 300 chars allows for meaningful text samples while keeping payloads manageable
51+
*/
52+
const MAX_STRING_SAMPLE_VALUE_LENGTH = 300;
53+
4754
export const FIELD_NAME_SEPARATOR = '.';
4855

4956
/**
@@ -113,7 +120,11 @@ function convertBSONToPrimitive(value: unknown): SampleValue {
113120
return value.toString();
114121
}
115122
if (value instanceof Binary) {
116-
return value.toString('base64');
123+
// For Binary data, provide a descriptive placeholder instead of the actual data
124+
// to avoid massive base64 strings that can break LLM requests
125+
// (Defensive check: should never be called, since sample values for binary are skipped)
126+
const sizeInBytes = value.buffer?.length || 0;
127+
return `<Binary data: ${sizeInBytes} bytes>`;
117128
}
118129
if (value instanceof BSONRegExp) {
119130
return value.pattern;
@@ -146,6 +157,14 @@ function convertBSONToPrimitive(value: unknown): SampleValue {
146157
return result as SampleValue;
147158
}
148159

160+
// Truncate very long strings to prevent massive payloads
161+
if (
162+
typeof value === 'string' &&
163+
value.length > MAX_STRING_SAMPLE_VALUE_LENGTH
164+
) {
165+
return value.substring(0, MAX_STRING_SAMPLE_VALUE_LENGTH) + '...';
166+
}
167+
149168
return value as SampleValue;
150169
}
151170

@@ -229,7 +248,7 @@ function processNamedField(
229248
primaryType,
230249
currentPath,
231250
result,
232-
field.probability,
251+
Math.round(field.probability * 100) / 100, // Round to 2 decimal places
233252
arrayLengthMap
234253
);
235254
}
@@ -281,12 +300,16 @@ function processType(
281300
// Primitive: Create entry
282301
const fieldInfo: FieldInfo = {
283302
type: type.name,
284-
sampleValues: type.values
285-
.slice(0, MAX_SAMPLE_VALUES)
286-
.map(convertBSONToPrimitive),
287303
probability: fieldProbability,
288304
};
289305

306+
// Only add sampleValues if not Binary (to avoid massive payloads from embeddings)
307+
if (type.name !== 'Binary') {
308+
fieldInfo.sampleValues = type.values
309+
.slice(0, MAX_SAMPLE_VALUES)
310+
.map(convertBSONToPrimitive);
311+
}
312+
290313
result[currentPath] = fieldInfo;
291314
}
292315
}

0 commit comments

Comments
 (0)