Skip to content

Commit af0fee7

Browse files
committed
feat: split text into chunks if too large
1 parent a670da0 commit af0fee7

File tree

4 files changed

+226
-51
lines changed

4 files changed

+226
-51
lines changed

packages/translate/src/chunk.ts

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
// Constants for token estimation.
// The ratios are rough characters-to-tokens heuristics, not tokenizer output.
2+
// NOTE(review): estimateTokens in this file always applies CHAR_TO_TOKEN_RATIO_ZH;
// this English ratio is not referenced by the code visible here.
export const CHAR_TO_TOKEN_RATIO = 0.3; // 1 English character ≈ 0.3 tokens
3+
export const CHAR_TO_TOKEN_RATIO_ZH = 0.5; // 1 Chinese character ≈ 0.5 tokens
4+
export const MAX_INPUT_TOKENS = 64 * 1024; // DeepSeek's 64K context length
5+
// Budget that needsChunking and splitIntoChunks compare estimated sizes against.
export const MAX_OUTPUT_TOKENS = 8 * 1024; // DeepSeek's 8K max output
6+
7+
// Chunk size constants (in estimated tokens)
8+
// NOTE(review): the chunking code visible in this file budgets against
// MAX_OUTPUT_TOKENS, not this value — confirm whether it is meant to be used.
export const MAX_CHUNK_SIZE_TOKENS = 16 * 1024; // Use smaller chunks for better translation quality
9+
10+
/**
 * Roughly estimates how many tokens `content` occupies.
 *
 * The string length (UTF-16 code units) is weighted with
 * `CHAR_TO_TOKEN_RATIO_ZH` for every input, whatever its language, and the
 * result is rounded up to a whole token.
 *
 * @param content Text to measure.
 * @returns The estimated token count; `0` for an empty string.
 */
export function estimateTokens(content: string): number {
  const weightedLength = CHAR_TO_TOKEN_RATIO_ZH * content.length;
  return Math.ceil(weightedLength);
}
15+
16+
/**
 * Tells whether `content` is too large to be handled as a single piece.
 *
 * @param content Text to measure.
 * @returns `true` when the estimated token count is strictly greater than
 *   `MAX_OUTPUT_TOKENS`.
 */
export function needsChunking(content: string): boolean {
  const estimatedTokens = estimateTokens(content);
  return MAX_OUTPUT_TOKENS < estimatedTokens;
}
19+
20+
// Splits after every "\n" while keeping the line break attached to its line.
const LINE_SPLITTER = /(?<=\n)/;
// A level-2+ markdown heading at the start of a line ("## Heading").
const SECTION_HEADING = /^#{2,} /;
// The marker of a fenced code block (``` or ~~~), indented by at most 3 spaces.
const FENCE_MARKER = /^ {0,3}(`{3,}|~{3,})/;

/**
 * Advances the fenced-code-block state by one line.
 *
 * @param openFence Marker of the fence that is currently open, or `null`.
 * @param line The line being scanned (may include its trailing line break).
 * @returns The marker of the fence that is open after `line`, or `null`.
 */
function nextFenceState(openFence: string | null, line: string): string | null {
  const match = FENCE_MARKER.exec(line);
  if (match === null) {
    return openFence;
  }

  const marker = match[1] ?? '';
  const rest = line.slice(match[0].length);

  if (openFence === null) {
    // "```code``` text" is inline code, not a fence opener.
    const isInlineCode = marker.startsWith('`') && rest.includes('`');
    return isInlineCode ? null : marker;
  }

  // A closing fence uses the same character, is at least as long as the
  // opener, and carries no info string.
  const closesFence =
    marker[0] === openFence[0] &&
    marker.length >= openFence.length &&
    rest.trim() === '';
  return closesFence ? null : openFence;
}

/**
 * Groups the lines of `text`, starting a new group at every line for which
 * `isBoundary` returns true, except inside fenced code blocks.
 * Concatenating the returned groups yields `text` again; no group is empty.
 */
function splitOutsideFences(
  text: string,
  isBoundary: (line: string, previousLine: string) => boolean,
): string[] {
  const groups: string[] = [];
  let current = '';
  let previousLine = '';
  let openFence: string | null = null;

  for (const line of text.split(LINE_SPLITTER)) {
    if (openFence === null && current !== '' && isBoundary(line, previousLine)) {
      groups.push(current);
      current = '';
    }
    current += line;
    previousLine = line;
    openFence = nextFenceState(openFence, line);
  }

  if (current !== '') {
    groups.push(current);
  }
  return groups;
}

/**
 * Splits markdown text into chunks whose estimated size stays within
 * `MAX_OUTPUT_TOKENS`.
 *
 * The text is first cut at level-2+ headings (`## ...`). A section that is
 * still over budget is cut at paragraph starts (a non-blank line following a
 * blank one), and a paragraph that is still over budget is cut into lines.
 * Heading and paragraph cuts are never made inside a fenced code block. The
 * resulting pieces are then packed greedily, in order, into chunks.
 *
 * Guarantees:
 * - concatenating the chunks reproduces `content` exactly;
 * - no chunk is an empty string;
 * - a chunk exceeds the budget only when a single line does so by itself.
 *
 * @param content Markdown/MDX source text.
 * @returns The chunks in document order; an empty array for empty input.
 */
export function splitIntoChunks(content: string): string[] {
  const fitsBudget = (text: string): boolean =>
    estimateTokens(text) <= MAX_OUTPUT_TOKENS;

  // Break the document into the smallest pieces that may need to be separated.
  const pieces: string[] = [];
  const sections = splitOutsideFences(content, (line) =>
    SECTION_HEADING.test(line),
  );
  for (const section of sections) {
    if (fitsBudget(section)) {
      pieces.push(section);
      continue;
    }

    const paragraphs = splitOutsideFences(
      section,
      (line, previousLine) =>
        previousLine.trim() === '' && line.trim() !== '',
    );
    for (const paragraph of paragraphs) {
      if (fitsBudget(paragraph)) {
        pieces.push(paragraph);
        continue;
      }
      // Last resort: cut line by line, even inside a code fence.
      for (const line of paragraph.split(LINE_SPLITTER)) {
        pieces.push(line);
      }
    }
  }

  // Pack consecutive pieces into chunks without exceeding the budget.
  const chunks: string[] = [];
  let currentChunk = '';
  let currentTokens = 0;

  for (const piece of pieces) {
    const pieceTokens = estimateTokens(piece);
    const overflows = currentTokens + pieceTokens > MAX_OUTPUT_TOKENS;

    // Only flush when something has been collected, so an over-budget first
    // piece never produces an empty leading chunk.
    if (overflows && currentChunk !== '') {
      chunks.push(currentChunk);
      currentChunk = piece;
      currentTokens = pieceTokens;
    } else {
      currentChunk += piece;
      currentTokens += pieceTokens;
    }
  }

  if (currentChunk !== '') {
    chunks.push(currentChunk);
  }

  return chunks;
}

packages/translate/src/main.ts

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3,16 +3,11 @@ import * as path from 'node:path';
33
import micromatch from 'micromatch';
44
import { executeInBatches } from './batch';
55
import { logger } from './logger';
6-
import { $translateConfig } from './openai';
76
import type { MainConfig } from './types';
87
import {
9-
copyDoc,
10-
extractPathToLabelMap,
118
findDocFiles,
129
getDocUpdateStatus,
13-
getTranslatedConfig,
1410
normalizePatterns,
15-
shouldTranslateConfig,
1611
translateDoc,
1712
} from './utils';
1813

@@ -150,7 +145,7 @@ export async function main({
150145

151146
// Check if this path should be copied without translation
152147

153-
const [shouldUpdate, reason] = await getDocUpdateStatus({
148+
const { shouldUpdate, reason, chunks } = await getDocUpdateStatus({
154149
sourcePath,
155150
targetPath,
156151
});
@@ -159,6 +154,7 @@ export async function main({
159154
Source: sourcePath,
160155
Target: targetPath,
161156
'Should update?': shouldUpdate ? '✅ Yes' : '❌ No',
157+
Chunks: chunks,
162158
Reason: reason,
163159
});
164160

packages/translate/src/openai.ts

Lines changed: 101 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,11 @@
11
import OpenAI from 'openai';
2+
import type { ChatCompletionMessageParam } from 'openai/resources.mjs';
3+
import {
4+
MAX_OUTPUT_TOKENS,
5+
estimateTokens,
6+
needsChunking,
7+
splitIntoChunks,
8+
} from './chunk';
29
import { logger } from './logger';
310

411
interface LangConfig {
@@ -33,17 +40,17 @@ export const model = 'deepseek-chat';
3340
export const systemPrompt =
3441
'You are a professional technical translator specializing in software documentation. You are particularly skilled at translating React, web development, and programming terminology, keeping the translations consistent and readable.';
3542

36-
// Improved $translateDocument function with language-specific prompts
37-
export async function $translateDocument({
38-
content,
39-
langConfig,
40-
context = '',
41-
}: TranslateDocumentParams): Promise<string> {
43+
// Helper function to translate a single chunk
44+
async function translateChunk(
45+
chunk: string,
46+
langConfig: LangConfig,
47+
context: string,
48+
): Promise<string> {
4249
if (!openai) {
4350
throw new Error('OPENAI_API_KEY is not set.');
4451
}
4552

46-
const textLength = content.length;
53+
const textLength = chunk.length;
4754
const prompt = `
4855
Translate the following documentation from English to ${langConfig.name}.
4956
The document being translated is of MDX file type, which means there are components in the text. Please ensure component closing tags are not missing, for example:
@@ -60,33 +67,101 @@ HERE IS THE TEXT TO TRANSLATE:
6067
`;
6168

6269
logger.debug(
63-
`Sending translation request, text length: ${textLength} characters, prompt totoal length: ${
70+
`Sending chunk translation request, text length: ${textLength} characters, prompt total length: ${
6471
prompt.length + textLength
65-
} characters`,
72+
} characters, estimated tokens: ${estimateTokens(prompt + chunk)} tokens`,
6673
);
6774

68-
// logger.debug(`Prompt:\n${prompt}`);
75+
const messages: Array<ChatCompletionMessageParam> = [
76+
{
77+
role: 'system',
78+
content: systemPrompt,
79+
},
80+
{
81+
role: 'user',
82+
content: prompt,
83+
},
84+
{
85+
role: 'user',
86+
content: chunk,
87+
},
88+
];
6989

7090
const response = await openai.chat.completions.create({
7191
model: model,
72-
max_completion_tokens: 8 * 1024,
73-
max_tokens: 8 * 1024,
74-
messages: [
75-
{
76-
role: 'system',
77-
content: systemPrompt,
78-
},
79-
{
80-
role: 'user',
81-
content: prompt + content,
82-
},
83-
],
92+
max_completion_tokens: MAX_OUTPUT_TOKENS,
93+
max_tokens: MAX_OUTPUT_TOKENS,
94+
messages: messages,
8495
});
8596

86-
const translatedContent = response.choices?.[0]?.message?.content;
87-
if (!translatedContent) {
97+
logger.debug(`response.usage:\n${JSON.stringify(response.usage, null, 2)}`);
98+
const message = response.choices?.[0]?.message;
99+
100+
if (!message?.content) {
88101
throw new Error('Failed to get translation response');
89102
}
90-
logger.debug(`Response:\n${response}`);
91-
return translatedContent.trim();
103+
// Return the translated content and the message to add to history
104+
return message.content;
105+
}
106+
107+
// Upper bound on the number of boundary newlines restored per chunk side.
const MAX_RESTORED_NEWLINES = 2;

/** Counts the consecutive "\n" characters at the start of `text`. */
function countLeadingNewlines(text: string): number {
  let count = 0;
  while (count < text.length && text[count] === '\n') {
    count++;
  }
  return count;
}

/** Counts the consecutive "\n" characters at the end of `text`. */
function countTrailingNewlines(text: string): number {
  let count = 0;
  while (count < text.length && text[text.length - 1 - count] === '\n') {
    count++;
  }
  return count;
}

/**
 * Translates a document, splitting it into chunks when it is too large.
 *
 * Content for which `needsChunking` is false is passed to `translateChunk` in
 * one call. Larger content is split with `splitIntoChunks`; each chunk is
 * passed to `translateChunk` separately, in order, with the same `langConfig`
 * and `context`, and the results are concatenated.
 *
 * Because a translated chunk may come back without the blank lines that
 * separated it from its neighbours, up to two leading and two trailing
 * newlines of the source chunk are restored when the translation has fewer.
 * Empty or whitespace-only chunks are copied through without a request.
 *
 * @param content Source document text.
 * @param langConfig Target language configuration forwarded to `translateChunk`.
 * @param context Extra context forwarded to `translateChunk`; defaults to `''`.
 * @returns The translated document.
 * @throws Error when the OpenAI client is not configured; errors thrown by
 *   `translateChunk` propagate unchanged.
 */
export async function $translateDocument({
  content,
  langConfig,
  context = '',
}: TranslateDocumentParams): Promise<string> {
  if (!openai) {
    throw new Error('OPENAI_API_KEY is not set.');
  }

  const textLength = content.length;
  logger.debug(
    `Starting translation of document with length: ${textLength} characters, estimated tokens: ${estimateTokens(content)} tokens`,
  );

  // For small documents, use the direct approach
  if (!needsChunking(content)) {
    return await translateChunk(content, langConfig, context);
  }

  logger.debug(
    'Document is large, splitting into chunks for multi-round translation',
  );
  const chunks = splitIntoChunks(content);
  logger.debug(`Split document into ${chunks.length} chunks`);

  let translatedContent = '';
  for (const [index, chunk] of chunks.entries()) {
    // Nothing to translate: keep the whitespace and skip the request.
    if (chunk.trim() === '') {
      translatedContent += chunk;
      continue;
    }

    logger.debug(`Translating chunk ${index + 1} of ${chunks.length}`);
    const translatedChunk = await translateChunk(chunk, langConfig, context);

    // Restore boundary newlines the translation dropped, counting runs from
    // the edges so a newline in the middle of a chunk is never mistaken for
    // a boundary.
    const missingLeading = Math.max(
      0,
      Math.min(countLeadingNewlines(chunk), MAX_RESTORED_NEWLINES) -
        countLeadingNewlines(translatedChunk),
    );
    const missingTrailing = Math.max(
      0,
      Math.min(countTrailingNewlines(chunk), MAX_RESTORED_NEWLINES) -
        countTrailingNewlines(translatedChunk),
    );

    translatedContent +=
      '\n'.repeat(missingLeading) +
      translatedChunk +
      '\n'.repeat(missingTrailing);
  }

  logger.debug(`Completed translation of all ${chunks.length} chunks`);
  return translatedContent;
}

packages/translate/src/utils.ts

Lines changed: 36 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ import * as fs$ from 'node:fs/promises';
33
import path from 'node:path';
44
import { glob } from 'glob';
55
import matter from 'gray-matter';
6+
import { needsChunking, splitIntoChunks } from './chunk';
67
import { logger } from './logger';
78
import { $translateDocument } from './openai';
89

@@ -57,17 +58,24 @@ export function getLastModifiedTimeFromGit(filePath: string): Date {
5758
export async function getDocUpdateStatus({
5859
sourcePath,
5960
targetPath,
60-
}: CheckFileUpdateParams): Promise<[boolean, string]> {
61+
}: CheckFileUpdateParams): Promise<{
62+
shouldUpdate: boolean;
63+
chunks: 'N/A' | number;
64+
reason: string;
65+
}> {
6166
try {
6267
await fs$.access(sourcePath);
6368
} catch (error) {
6469
logger.error(
6570
`Source file not found: ${sourcePath}, don't need updating, consider removing it`,
6671
);
67-
return [false, 'Source not found'];
72+
return { shouldUpdate: false, chunks: 'N/A', reason: 'Source not found' };
6873
}
6974

7075
const sourceContent = await fs$.readFile(sourcePath, 'utf8');
76+
const chunks = needsChunking(sourceContent)
77+
? splitIntoChunks(sourceContent).length
78+
: 1;
7179
const sourceParsed = matter(sourceContent);
7280

7381
let sourceLastModifiedDate = getLastModifiedTimeFromGit(sourcePath);
@@ -78,7 +86,11 @@ export async function getDocUpdateStatus({
7886
logger.error(
7987
`Referenced file not found: ${sourceParsed.data.ref}, don't need updating, consider REMOVING it`,
8088
);
81-
return [false, 'Referenced file not found'];
89+
return {
90+
shouldUpdate: false,
91+
reason: 'Referenced file not found',
92+
chunks,
93+
};
8294
}
8395

8496
const refLastModifiedDate = getLastModifiedTimeFromGit(
@@ -93,7 +105,7 @@ export async function getDocUpdateStatus({
93105
await fs$.access(targetPath);
94106
} catch (error) {
95107
logger.debug(`Target file not found: ${targetPath}, needs updating`);
96-
return [true, 'Target not found.'];
108+
return { shouldUpdate: true, reason: 'Target not found.', chunks };
97109
}
98110

99111
// Read target file and parse frontmatter
@@ -112,16 +124,28 @@ export async function getDocUpdateStatus({
112124
logger.debug(
113125
`Source file ${sourcePath} has been updated since last translation, needs updating`,
114126
);
115-
return [true, 'Source has been modified. '];
127+
return {
128+
shouldUpdate: true,
129+
reason: 'Source has been modified. ',
130+
chunks,
131+
};
116132
}
117-
return [false, 'Source has not been modified.'];
133+
return {
134+
shouldUpdate: false,
135+
reason: 'Source has not been modified.',
136+
chunks,
137+
};
118138
}
119139

120140
// If there's no source-updated-at in target, it needs to be updated
121141
logger.debug(
122142
`Target file ${targetPath} has no source-updated-at metadata, needs updating`,
123143
);
124-
return [true, 'Target no source-updated-at metadata, needs updating.'];
144+
return {
145+
shouldUpdate: true,
146+
reason: 'Target no source-updated-at metadata, needs updating.',
147+
chunks,
148+
};
125149
}
126150

127151
// New helper function to extract context from overview files
@@ -191,19 +215,12 @@ export async function translateDoc({
191215
const sourceUpdatedAt = getLastModifiedTimeFromGit(sourcePath).toISOString();
192216
const translationUpdatedAt = new Date().toISOString();
193217

194-
const parsed = matter(translatedContent);
195-
196-
// Create frontmatter data object
197-
const frontmatterData = {
198-
'source-updated-at': sourceUpdatedAt,
199-
'translation-updated-at': translationUpdatedAt,
200-
...parsed.data,
201-
};
202-
203-
logger.debug(
204-
`Writing translated content, ${JSON.stringify(frontmatterData)}`,
218+
const newContent = translatedContent.replace(
219+
'---',
220+
`---
221+
source-updated-at: ${sourceUpdatedAt}
222+
translation-updated-at: ${translationUpdatedAt}`,
205223
);
206-
const newContent = matter.stringify(parsed.content, frontmatterData);
207224

208225
logger.debug(`Writing translated content to ${targetPath}`);
209226
await fs$.writeFile(targetPath, newContent, 'utf8');

0 commit comments

Comments
 (0)