55import argparse
66import json
77import math
8- from typing import Dict , Optional
8+ import shutil
99from openai import OpenAI
1010from concurrent .futures import ThreadPoolExecutor
11- import re
12- from pydantic import BaseModel , Field
1311
14-
15- EXCLUDED_FILES = {"about-us/adopters.md" }
16- EXCLUDED_FOLDERS = {"whats-new" , "changelogs" }
12+ TRANSLATE_EXCLUDED_FILES = {"about-us/adopters.md" , "index.md" }
13+ TRANSLATE_EXCLUDED_FOLDERS = {"whats-new" , "changelogs" }
1714
1815client = OpenAI (
1916 api_key = os .environ .get ("OPENAI_API_KEY" ),
@@ -48,7 +45,8 @@ def format_glossary_prompt(glossary):
4845def translate_text (config , text , model = "gpt-4o-mini" ):
4946 language = config ["language" ]
5047 glossary = config ["glossary" ]
51- prompt = config ["prompt" ] if "prompt" in config else f"Translate the following ClickHouse documentation text from English to { language } . This content may be part of a document, so maintain the original html tags and markdown formatting used in Docusaurus, including any headings, code blocks, lists, links, and inline formatting like bold or italic text. Ensure that no content, links, or references are omitted or altered during translation, preserving the same amount of information as the original text. Do not translate code, URLs, or any links within markdown. This translation is intended for users familiar with ClickHouse, databases, and IT terminology, so use technically accurate and context-appropriate language. Keep the translation precise and professional, reflecting the technical nature of the content. Strive to convey the original meaning clearly, adapting phrases where necessary to maintain natural and fluent { language } ."
48+ prompt = config [
49+ "prompt" ] if "prompt" in config else f"Translate the following ClickHouse documentation text from English to { language } . This content may be part of a document, so maintain the original html tags and markdown formatting used in Docusaurus, including any headings, code blocks, lists, links, and inline formatting like bold or italic text. Ensure that no content, links, explicit heading ids (denoted by {{#my-explicit-id}}), or references are omitted or altered during translation, preserving the same amount of information as the original text. Do not translate code, URLs, or any links within markdown. This translation is intended for users familiar with ClickHouse, databases, and IT terminology, so use technically accurate and context-appropriate language. Keep the translation precise and professional, reflecting the technical nature of the content. Strive to convey the original meaning clearly, adapting phrases where necessary to maintain natural and fluent { language } ."
5250 glossary_prompt = format_glossary_prompt (glossary )
5351 prompt_content = f"{ glossary_prompt } \n { prompt } "
5452 try :
@@ -82,24 +80,6 @@ def split_text(text, max_chunk_size):
8280 return chunks
8381
8482
85- def process_page_new_language (content , lang_code , is_intro = False ):
86- replacements = [
87- (r"slug: /en/" , f"slug: /{ lang_code } /" ),
88- (r"slug: '/en/" , f"slug: '/{ lang_code } /" ),
89- (r'slug: "/en/' , f'slug: "/{ lang_code } /' ),
90- (r"\(/docs/en/" , f"(/docs/{ lang_code } /" ),
91- (r"\]\(/en/" , f"](/{ lang_code } /" ),
92- (r"@site/docs/" , f"@site/docs/{ lang_code } /" ),
93- (r'"/docs/en/' , f'"/docs/{ lang_code } /' ),
94- (r"clickhouse.com/docs/en" , f"clickhouse.com/docs/{ lang_code } " ),
95- ]
96- for pattern , replacement in replacements :
97- content = re .sub (pattern , replacement , content )
98- if is_intro :
99- content = re .sub (r"^---$" , f"---\n slug: /{ lang_code } " , content , count = 1 , flags = re .MULTILINE )
100- return content
101-
102-
10383def translate_file (config , input_file_path , output_file_path , model ):
10484 print (f"start translation: input[{ input_file_path } ], output[{ output_file_path } ]" )
10585 start_time = time .time ()
@@ -108,7 +88,6 @@ def translate_file(config, input_file_path, output_file_path, model):
10888 with open (input_file_path , "r" , encoding = "utf-8" ) as input_file :
10989 original_text = input_file .read ()
11090 print (f" - length: { len (original_text )} " )
111- original_text = process_page_new_language (original_text , config ["lang_code" ])
11291 # Split text into chunks and translate
11392 num_chunk = math .ceil (len (original_text ) / MAX_CHUNK_SIZE )
11493 count = 1
@@ -118,19 +97,16 @@ def translate_file(config, input_file_path, output_file_path, model):
11897 translated_chunk = translate_text (config , chunk , model )
11998 if translated_chunk :
12099 translated_text += translated_chunk + "\n "
121- count += 1
100+ count += 1
122101 else :
123102 print (f"failed to translate a chunk: [{ input_file_path } ]" )
124103 return
125104
126105 with open (output_file_path , "w" , encoding = "utf-8" ) as output_file :
127106 output_file .write (translated_text )
128107
129- # Rename input file with .translated suffix
130- translated_file_name = f"{ os .path .basename (input_file_path )} .translated"
131- translated_file_path = os .path .join (os .path .dirname (input_file_path ), translated_file_name )
132-
133- os .rename (input_file_path , translated_file_path )
108+ # Rename output file with .translate suffix to .translated
109+ os .rename (output_file_path , f"{ output_file_path } d" )
134110
135111 except FileNotFoundError :
136112 print (f"no file: { input_file_path } " )
@@ -139,41 +115,48 @@ def translate_file(config, input_file_path, output_file_path, model):
139115
140116 end_time = time .time ()
141117 duration = end_time - start_time
142- print (f"finished translation: input[{ input_file_path } ], output[{ output_file_path } ], duration seconds[{ duration :.2f} ]" )
118+ print (
119+ f"finished translation: input[{ input_file_path } ], output[{ output_file_path } ], duration seconds[{ duration :.2f} ]" )
143120
144121
145- def translate_folder (config , input_folder , output_folder , model = "gpt-4o-mini" ):
122+ def translate_docs_folder (config , input_folder , output_folder , model = "gpt-4o-mini" ):
146123 with ThreadPoolExecutor (max_workers = 5 ) as executor :
147124 futures = []
148125 for root , _ , files in os .walk (input_folder ):
149126 relative_folder_path = os .path .relpath (root , input_folder )
150- if any (excluded in relative_folder_path for excluded in EXCLUDED_FOLDERS ):
151- print (f" - Skipping due to exclusion target: { relative_folder_path } " )
127+ if any (excluded in relative_folder_path for excluded in TRANSLATE_EXCLUDED_FOLDERS ):
128+ print (f"Skipping translation due to excluded folder target: { relative_folder_path } " )
129+ shutil .copytree (os .path .join (input_folder , relative_folder_path ), os .path .join (output_folder , relative_folder_path ),dirs_exist_ok = True )
152130 continue
153131
154132 for file in files :
155133 input_file_path = os .path .join (root , file )
156134 relative_path = os .path .relpath (input_file_path , input_folder )
157- output_file_path = os .path .join (output_folder , relative_path + ".translated" )
158135 if file .endswith ((".md" , ".mdx" )):
159- os .makedirs (os .path .dirname (output_file_path ), exist_ok = True )
160-
161136 # Skip files that are in the excluded files set
162- if relative_path in EXCLUDED_FILES :
163- print (f" - Skipping due to exclusion target: { input_file_path } " )
137+ if relative_path in TRANSLATE_EXCLUDED_FILES :
138+ output_file_path = os .path .join (output_folder , relative_path )
139+ print (f"Skipping translation due to exclusion target: { input_file_path } " )
140+ shutil .copy (input_file_path , output_file_path )
164141 continue
165-
166142 # Skip files that already have the translated suffix - allows continuing from failed point
167- if file .endswith (".translated" ):
143+ if os .path .exists (os .path .join (output_folder , relative_path + ".translated" )):
144+ print (f"Skipping ${ input_file_path } translation due to already translated file" )
168145 continue
169-
146+ # re-do files partially through translation
147+ if os .path .exists (os .path .join (output_folder , relative_path + ".translate" )):
148+ os .remove (os .path .join (output_folder , relative_path + ".translate" ))
149+ output_file_path = os .path .join (output_folder , relative_path + ".translate" )
150+ os .makedirs (os .path .dirname (output_file_path ), exist_ok = True )
170151 # Submit the translation task to be run in parallel
171152 futures .append (executor .submit (translate_file , config , input_file_path , output_file_path , model ))
172153 else :
173154 # symlink these files as we want to update in a single place
174155 try :
156+ output_file_path = os .path .join (output_folder , relative_path )
175157 if os .path .exists (output_file_path ) or os .path .islink (output_file_path ):
176158 os .remove (output_file_path ) # Remove existing file/link before creating symlink
159+ os .makedirs (os .path .dirname (output_file_path ), exist_ok = True )
177160 os .symlink (input_file_path , output_file_path )
178161 print (f" - Created symlink: { output_file_path } -> { input_file_path } " )
179162 except OSError as e :
@@ -183,6 +166,7 @@ def translate_folder(config, input_folder, output_folder, model="gpt-4o-mini"):
183166 for future in futures :
184167 future .result ()
185168
169+
186170def rename_translated_files (output_folder ):
187171 for root , _ , files in os .walk (output_folder ):
188172 for file in files :
@@ -201,7 +185,8 @@ def rename_translated_files(output_folder):
201185
202186
203187def translate_plugin_data (output_folder , config , model = "gpt-4o-mini" ):
204- json_files = glob .glob (os .path .join (output_folder , "*.json" )) + glob .glob (os .path .join (output_folder , "*" , "*.json" ))
188+ json_files = glob .glob (os .path .join (output_folder , "*.json" )) + glob .glob (
189+ os .path .join (output_folder , "*" , "*.json" ))
205190 language = config ["language" ]
206191 glossary = config ["glossary" ]
207192 prompt = f"""
@@ -223,7 +208,7 @@ def translate_plugin_data(output_folder, config, model="gpt-4o-mini"):
223208 {"role" : "system" , "content" : prompt_content },
224209 {"role" : "user" , "content" : json .dumps (text )},
225210 ],
226- response_format = { "type" : "json_object" }
211+ response_format = {"type" : "json_object" }
227212 )
228213 translated_text = completion .choices [0 ].message .content
229214 translated_config = json .loads (translated_text )
@@ -234,6 +219,7 @@ def translate_plugin_data(output_folder, config, model="gpt-4o-mini"):
234219 print (f"failed to translate: { e } " )
235220 raise e
236221
222+
237223script_dir = os .path .dirname (os .path .abspath (__file__ ))
238224default_input_folder = os .path .abspath (os .path .join (script_dir , "../../docs/" ))
239225
@@ -252,8 +238,9 @@ def main():
252238 parser .add_argument ("--model" , default = "gpt-4o-mini" , help = "Specify the OpenAI model to use for translation" )
253239 args = parser .parse_args ()
254240 config = load_config (args .config )
255- translate_plugin_data (args .output_folder , config , model = args .model )
256- translate_folder (config , args .input_folder , args .output_folder , args .model )
241+ # translate_plugin_data(args.output_folder, config, model=args.model)
242+ translate_docs_folder (config , args .input_folder ,
243+ os .path .join (args .output_folder + "/docusaurus-plugin-content-docs/current" ), args .model )
257244 rename_translated_files (args .output_folder )
258245
259246
0 commit comments