Skip to content

Commit 2dcdd90

Browse files
πŸ› fix(duplicate-ja-doc-bug): centralize filename sanitization logic (#1319)
1 parent d6531de commit 2dcdd90

File tree

4 files changed

+85
-4
lines changed

4 files changed

+85
-4
lines changed

packages/cdk/lambda/utils/bedrockAgentApi.ts

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ import {
2121
BraveSearchResult,
2222
} from 'generative-ai-use-cases';
2323
import { streamingChunk } from './streamingChunk';
24+
import { convertToSafeFilename } from './fileNameUtils';
2425
import {
2526
initBedrockAgentClient,
2627
initBedrockAgentRuntimeClient,
@@ -127,7 +128,7 @@ const bedrockAgentApi: ApiInterface = {
127128
files: messages
128129
.flatMap((m: UnrecordedMessage) => {
129130
return m.extraData?.map((file) => ({
130-
name: file.name.replace(/[^a-zA-Z0-9\s\-()[\].]/g, 'X'), // If the file name contains Japanese, it is not recognized, so replace it
131+
name: convertToSafeFilename(file.name),
131132
source: {
132133
sourceType: 'BYTE_CONTENT',
133134
byteContent: {
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
import crypto from 'crypto';
2+
3+
/**
 * Convert a filename into a name that is safe to send to the AWS Bedrock API.
 *
 * AWS Bedrock DocumentBlock.name only allows alphanumeric characters,
 * whitespace (no more than one in a row), hyphens, parentheses and square
 * brackets. This function drops the extension (the text after the last dot),
 * replaces every other non-allowed character with '_', and normalizes each
 * run of whitespace to a single ASCII space.
 *
 * When sanitization altered the base name, the first 8 hex characters of the
 * MD5 of the original filename are appended, so that different filenames
 * which sanitize to the same text are very unlikely to collide. The hash is
 * only a disambiguator, not a security measure.
 *
 * NOTE(review): '_' is not part of the allow-list quoted above; it is kept as
 * the replacement character because the existing unit tests expect it.
 * Confirm that Bedrock accepts it.
 *
 * @param filename Original filename, with or without an extension
 * @returns Sanitized base name, with a `_<hash>` suffix only if the base name
 *          had to be altered
 */
export const convertToSafeFilename = (filename: string): string => {
  // `> 0` (not `>= 0`): a leading dot, as in ".env", is not an extension
  // separator, so such names are sanitized as a whole.
  const lastDotIndex = filename.lastIndexOf('.');
  const nameWithoutExt =
    lastDotIndex > 0 ? filename.substring(0, lastDotIndex) : filename;

  const safeName = nameWithoutExt
    .replace(/[^a-zA-Z0-9\s\-()[\]]/g, '_')
    // `\s` also matches tabs, newlines and Unicode spaces such as U+3000
    // (full-width space). Collapse every whitespace run into one ASCII space
    // so the "no more than one in a row" rule is never violated.
    .replace(/\s+/g, ' ');

  // Add the hash whenever sanitization changed the name (not only for
  // non-ASCII input), so distinct originals stay distinguishable.
  if (safeName !== nameWithoutExt) {
    const hash = crypto
      .createHash('md5')
      .update(filename)
      .digest('hex')
      .substring(0, 8);
    return `${safeName}_${hash}`;
  }

  return safeName;
};

packages/cdk/lambda/utils/models.ts

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ import {
3131
applyAutoCacheToSystem,
3232
} from './promptCache';
3333
import { getFormatFromMimeType, getMimeTypeFromFileName } from './media';
34+
import { convertToSafeFilename } from './fileNameUtils';
3435

3536
// Default Models
3637

@@ -428,9 +429,7 @@ const createConverseCommandInput = (
428429
contentBlocks.push({
429430
document: {
430431
format,
431-
name: extra.name
432-
.split('.')[0]
433-
.replace(/[^a-zA-Z0-9\s\-()[\]]/g, 'X'), // If the file name contains Japanese, it will cause an error, so convert it
432+
name: convertToSafeFilename(extra.name),
434433
source: {
435434
bytes: Buffer.from(extra.source.data, 'base64'),
436435
},
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
/* eslint-disable i18nhelper/no-jp-string */
2+
import { convertToSafeFilename } from '../../../lambda/utils/fileNameUtils';
3+
4+
// Jest suite for convertToSafeFilename: names made only of allowed characters
// come back as the bare base name, anything else has its disallowed characters
// replaced with '_' and gains an 8-character hash suffix.
// NOTE(review): the non-ASCII filename literals below appear mis-encoded
// (mojibake) in this copy; the expected hashes presumably belong to the
// original UTF-8 filenames. Verify the literals against the repository.
describe('convertToSafeFilename', () => {
  it('should return filename without hash when only ASCII characters', () => {
    expect(convertToSafeFilename('document.pdf')).toBe('document');
  });

  it('should return filename without hash for ASCII with allowed special chars', () => {
    expect(convertToSafeFilename('report-2024 (final)[v1].pdf')).toBe(
      'report-2024 (final)[v1]'
    );
  });

  it('should add hash when Japanese characters are present', () => {
    expect(convertToSafeFilename('資料.pdf')).toBe('___46a890b2');
  });

  it('should add hash when mixed Japanese and ASCII characters', () => {
    expect(convertToSafeFilename('report資料2024.pdf')).toBe(
      'report__2024_f3805637'
    );
  });

  it('should generate different hashes for different Japanese filenames with same length', () => {
    const first = convertToSafeFilename('資料.pdf');
    const second = convertToSafeFilename('ζ›Έι‘ž.pdf');

    expect(first).toBe('___46a890b2');
    expect(second).toBe('___5c4aa342');
    // Same sanitized prefix, so only the hash suffix tells them apart.
    expect(first).not.toBe(second);
  });

  it('should generate consistent hash for same filename', () => {
    const [first, second] = [
      convertToSafeFilename('資料.pdf'),
      convertToSafeFilename('資料.pdf'),
    ];

    expect(first).toBe('___46a890b2');
    expect(second).toBe('___46a890b2');
  });

  it('should handle filename without extension', () => {
    expect(convertToSafeFilename('document')).toBe('document');
  });

  it('should handle filename with multiple dots', () => {
    // Only the last dot separates the extension; the inner dot is replaced.
    expect(convertToSafeFilename('report.final.pdf')).toBe(
      'report_final_8d101382'
    );
  });

  it('should replace special characters with underscore and add hash', () => {
    expect(convertToSafeFilename('file@#$.pdf')).toBe('file____cf25ced4');
  });
});

0 commit comments

Comments
(0)