@@ -88,14 +88,11 @@ func (t *pdfToMarkdownTransformer) transform() (converterOutput, error) {
8888 }
8989 benchmarkLog = benchmarkLog .With (zap .Time ("repair" , time .Now ()))
9090
91- paramsJSON , err := json . Marshal ( map [string ]interface {}{
91+ params := map [string ]interface {}{
9292 "PDF" : pdfBase64 ,
9393 "display-image-tag" : t .pdfToMarkdownStruct .displayImageTag ,
9494 "display-all-page-image" : t .pdfToMarkdownStruct .displayAllPageImage ,
9595 "resolution" : t .pdfToMarkdownStruct .resolution ,
96- })
97- if err != nil {
98- return output , fmt .Errorf ("marshalling conversion params: %w" , err )
9996 }
10097
10198 var pythonCode string
@@ -105,34 +102,12 @@ func (t *pdfToMarkdownTransformer) transform() (converterOutput, error) {
105102 default :
106103 pythonCode = pageImageProcessor + pdfTransformer + pdfPlumberPDFToMDConverter
107104 }
108- cmdRunner := exec .Command (pythonInterpreter , "-c" , pythonCode )
109- stdin , err := cmdRunner .StdinPipe ()
110- if err != nil {
111- return output , fmt .Errorf ("creating stdin pipe: %w" , err )
112- }
113-
114- errChan := make (chan error , 1 )
115- go func () {
116- defer stdin .Close ()
117- _ , err := stdin .Write (paramsJSON )
118- if err != nil {
119- errChan <- fmt .Errorf ("writing to stdin: %w" , err )
120- return
121- }
122- errChan <- nil
123- }()
124105
125- outputBytes , err := cmdRunner . CombinedOutput ( )
106+ outputBytes , err := util . ExecutePythonCode ( pythonCode , params )
126107
127108 benchmarkLog = benchmarkLog .With (zap .Time ("convert" , time .Now ()))
128109 if err != nil {
129- errorStr := string (outputBytes )
130- return output , fmt .Errorf ("running Python script: %w, %s" , err , errorStr )
131- }
132-
133- err = <- errChan
134- if err != nil {
135- return output , err
110+ return output , fmt .Errorf ("running Python script: %w" , err )
136111 }
137112
138113 err = json .Unmarshal (outputBytes , & output )
@@ -388,27 +363,27 @@ func encodeFileToBase64(inputPath string) (string, error) {
388363 return base64 .StdEncoding .EncodeToString (data ), nil
389364}
390365
391- // ConvertToPDF converts a base64 encoded document to a PDF using LibreOffice.
392- // It uses a mutex to ensure only one LibreOffice process runs at a time, preventing
366+ // ConvertToPDF converts a base64 encoded document to a PDF, using wkhtmltopdf for HTML input when it is installed and LibreOffice otherwise.
367+ // It uses a mutex to ensure only one conversion process runs at a time, preventing
393368// race conditions and permission issues.
394369func ConvertToPDF (base64Encoded , fileExtension string ) (string , error ) {
395- // Serialize LibreOffice operations to prevent race conditions
370+ // Serialize operations to prevent race conditions
396371 libreOfficeMutex .Lock ()
397372 defer libreOfficeMutex .Unlock ()
398373
399- tempPpt , err := os .CreateTemp ("" , "temp_document.*." + fileExtension )
374+ tempFile , err := os .CreateTemp ("" , "temp_document.*." + fileExtension )
400375 if err != nil {
401376 return "" , fmt .Errorf ("failed to create temporary document: %w" , err )
402377 }
403- inputFileName := tempPpt .Name ()
378+ inputFileName := tempFile .Name ()
404379 defer os .Remove (inputFileName )
405380
406- err = writeDecodeToFile (base64Encoded , tempPpt )
381+ err = writeDecodeToFile (base64Encoded , tempFile )
407382 if err != nil {
408383 return "" , fmt .Errorf ("failed to decode base64 to file: %w" , err )
409384 }
410385
411- tempDir , err := os .MkdirTemp ("" , "libreoffice " )
386+ tempDir , err := os .MkdirTemp ("" , "conversion " )
412387 if err != nil {
413388 return "" , fmt .Errorf ("failed to create temporary directory: %w" , err )
414389 }
@@ -419,24 +394,54 @@ func ConvertToPDF(base64Encoded, fileExtension string) (string, error) {
419394 return "" , fmt .Errorf ("failed to set permissions on temporary directory: %w" , err )
420395 }
421396
422- cmd := exec .Command ("libreoffice" , "--headless" , "--convert-to" , "pdf" , "--outdir" , tempDir , inputFileName )
423- cmd .Env = append (os .Environ (), "HOME=" + tempDir )
397+ var cmd * exec.Cmd
398+ var outputFileName string
399+
400+ // Use wkhtmltopdf for HTML files if available, otherwise use LibreOffice
401+ if fileExtension == "html" {
402+ // Try wkhtmltopdf first for HTML files (better HTML rendering)
403+ outputFileName = filepath .Join (tempDir , strings .TrimSuffix (filepath .Base (inputFileName ), ".html" )+ ".pdf" )
404+
405+ // Check if wkhtmltopdf is available
406+ if _ , err := exec .LookPath ("wkhtmltopdf" ); err == nil {
407+ // wkhtmltopdf is available, use it
408+ cmd = exec .Command ("wkhtmltopdf" , inputFileName , outputFileName )
409+ } else {
410+ // wkhtmltopdf not available, use LibreOffice for HTML with specific HTML import filter
411+ cmd = exec .Command ("libreoffice" , "--headless" , "--infilter=HTML" , "--convert-to" , "pdf" , "--outdir" , tempDir , inputFileName )
412+ cmd .Env = append (os .Environ (), "HOME=" + tempDir )
413+ outputFileName = filepath .Join (tempDir , strings .TrimSuffix (filepath .Base (inputFileName ), ".html" )+ ".pdf" )
414+ }
415+ } else {
416+ // Use LibreOffice for all other document types
417+ cmd = exec .Command ("libreoffice" , "--headless" , "--convert-to" , "pdf" , "--outdir" , tempDir , inputFileName )
418+ cmd .Env = append (os .Environ (), "HOME=" + tempDir )
419+ outputFileName = filepath .Join (tempDir , strings .TrimSuffix (filepath .Base (inputFileName ), "." + fileExtension )+ ".pdf" )
420+ }
424421
425422 // Capture both stdout and stderr for better error reporting
426423 output , err := cmd .CombinedOutput ()
427424 if err != nil {
428- return "" , fmt .Errorf ("failed to execute LibreOffice command: %s (output: %s)" , err .Error (), string (output ))
425+ return "" , fmt .Errorf ("failed to execute conversion command: %s (output: %s)" , err .Error (), string (output ))
426+ }
427+
428+ // Check if the output file exists
429+ if _ , err := os .Stat (outputFileName ); os .IsNotExist (err ) {
430+ // For LibreOffice fallback, try the standard LibreOffice output location
431+ noPathFileName := filepath .Base (inputFileName )
432+ standardPDFName := filepath .Join (tempDir , strings .TrimSuffix (noPathFileName , filepath .Ext (inputFileName ))+ ".pdf" )
433+ if _ , err := os .Stat (standardPDFName ); err == nil {
434+ outputFileName = standardPDFName
435+ } else {
436+ return "" , fmt .Errorf ("output PDF file not found at expected location: %s" , outputFileName )
437+ }
429438 }
430439
431- // With --outdir option, the generated PDF will be in the temp directory
432- noPathFileName := filepath .Base (inputFileName )
433- tempPDFName := filepath .Join (tempDir , strings .TrimSuffix (noPathFileName , filepath .Ext (inputFileName ))+ ".pdf" )
434- defer os .Remove (tempPDFName )
440+ defer os .Remove (outputFileName )
435441
436- base64PDF , err := encodeFileToBase64 (tempPDFName )
442+ base64PDF , err := encodeFileToBase64 (outputFileName )
437443 if err != nil {
438- // In the different containers, we have the different versions of LibreOffice, which means the behavior of LibreOffice may be different.
439- // So, we need to handle the case when the generated PDF is not in the temp directory.
444+ // Handle the case when the input is already a PDF
440445 if fileExtension == "pdf" {
441446 base64PDF , err := encodeFileToBase64 (inputFileName )
442447 if err != nil {
0 commit comments