@@ -30,6 +30,10 @@ class Html extends BaseReader
3030 */
3131 const TEST_SAMPLE_SIZE = 2048 ;
3232
// Pattern for input whose leading bytes are FE FF, FF FE, or EF BB BF
// (the UTF-16 big-endian, UTF-16 little-endian, and UTF-8 byte-order marks).
33+ private const STARTS_WITH_BOM = '/^(?:\xfe\xff|\xff\xfe|\xEF\xBB\xBF)/ ' ;
34+
// Case-insensitive pattern looking for a "charset=" declaration in the input.
// NOTE(review): this listing shows extra whitespace inside the literal; confirm the exact pattern against the repository.
35+ private const DECLARES_CHARSET = '/ charset=/i ' ;
36+
3337 /**
3438 * Input encoding.
3539 */
@@ -144,6 +148,9 @@ public function canRead(string $filename): bool
144148 }
145149
146150 $ beginning = $ this ->readBeginning ();
151+ if (preg_match (self ::STARTS_WITH_BOM , $ beginning )) {
152+ return true ;
153+ }
147154 $ startWithTag = self ::startsWithTag ($ beginning );
148155 $ containsTags = self ::containsTags ($ beginning );
149156 $ endsWithTag = self ::endsWithTag ($ this ->readEnding ());
@@ -638,12 +645,7 @@ public function loadIntoExisting(string $filename, Spreadsheet $spreadsheet): Sp
638645 // Reload the HTML file into the DOM object
639646 try {
640647 $ convert = $ this ->getSecurityScannerOrThrow ()->scanFile ($ filename );
641- $ lowend = "\u{80}" ;
642- $ highend = "\u{10ffff}" ;
643- $ regexp = "/[ $ lowend- $ highend]/u " ;
644- /** @var callable $callback */
645- $ callback = [self ::class, 'replaceNonAscii ' ];
646- $ convert = preg_replace_callback ($ regexp , $ callback , $ convert );
648+ $ convert = self ::replaceNonAsciiIfNeeded ($ convert );
647649 $ loaded = ($ convert === null ) ? false : $ dom ->loadHTML ($ convert );
648650 } catch (Throwable $ e ) {
649651 $ loaded = false ;
@@ -736,6 +738,20 @@ private static function replaceNonAscii(array $matches): string
736738 return '&# ' . mb_ord ($ matches [0 ], 'UTF-8 ' ) . '; ' ;
737739 }
738740
/**
 * Convert non-ASCII characters to numeric character references, but only
 * when the content gives no indication of its own encoding.
 *
 * The content is left untouched when it starts with a byte-order mark
 * (STARTS_WITH_BOM) or matches DECLARES_CHARSET. Otherwise every character
 * in the range U+0080..U+10FFFF is passed to self::replaceNonAscii(), which
 * returns the "&#<code point>;" form of the character.
 *
 * Presumably the skip exists so that content which already identifies its
 * encoding is handed to the HTML parser unmodified -- confirm with the
 * change description.
 *
 * @param string $convert HTML content, as returned by the security scanner at the call sites
 *
 * @return null|string the (possibly rewritten) content; callers treat null as a failed conversion
 */
741+ private static function replaceNonAsciiIfNeeded (string $ convert ): ?string
742+ {
// preg_match() returns 1 only on a match, so "!== 1" treats both "no match" and a regex failure as "not detected".
743+ if (preg_match (self ::STARTS_WITH_BOM , $ convert ) !== 1 && preg_match (self ::DECLARES_CHARSET , $ convert ) !== 1 ) {
// Character class bounds: first code point above ASCII through the highest Unicode code point.
744+ $ lowend = "\u{80}" ;
745+ $ highend = "\u{10ffff}" ;
// The /u modifier makes the pattern and subject be treated as UTF-8.
746+ $ regexp = "/[ $ lowend- $ highend]/u " ;
747+ /** @var callable $callback */
748+ $ callback = [self ::class, 'replaceNonAscii ' ];
// The result of preg_replace_callback() is returned as-is, including null if it fails.
749+ $ convert = preg_replace_callback ($ regexp , $ callback , $ convert );
750+ }
751+
752+ return $ convert ;
753+ }
754+
739755 /**
740756 * Spreadsheet from content.
741757 */
@@ -747,12 +763,7 @@ public function loadFromString(string $content, ?Spreadsheet $spreadsheet = null
747763 // Reload the HTML file into the DOM object
748764 try {
749765 $ convert = $ this ->getSecurityScannerOrThrow ()->scan ($ content );
750- $ lowend = "\u{80}" ;
751- $ highend = "\u{10ffff}" ;
752- $ regexp = "/[ $ lowend- $ highend]/u " ;
753- /** @var callable $callback */
754- $ callback = [self ::class, 'replaceNonAscii ' ];
755- $ convert = preg_replace_callback ($ regexp , $ callback , $ convert );
766+ $ convert = self ::replaceNonAsciiIfNeeded ($ convert );
756767 $ loaded = ($ convert === null ) ? false : $ dom ->loadHTML ($ convert );
757768 } catch (Throwable $ e ) {
758769 $ loaded = false ;
0 commit comments