Skip to content

Commit c3f5385

Browse files
authored
Php/iconv Should Not Treat FFFE/FFFF as Valid (#2910)
Fix #2897. We have been relying on iconv/mb_convert_encoding to detect invalid UTF-8, but all techniques designed to validate UTF-8 seem to accept FFFE and FFFF. This PR explicitly converts those characters to FFFD (the Unicode replacement character, called the "substitution character" in the code) before validating the rest of the string. It also substitutes one or more FFFD when it detects invalid UTF-8 character sequences. A comment in the code being changed stated that it doesn't handle surrogates. It is right not to do so. The only case where we should see surrogates is when reading UTF-16. Additional tests are added to an existing test that reads a UTF-16 Csv to demonstrate that surrogates are handled correctly, and that FFFE/FFFF are handled reasonably.
1 parent f90adcf commit c3f5385

File tree

6 files changed

+121
-45
lines changed

6 files changed

+121
-45
lines changed

phpstan-baseline.neon

Lines changed: 0 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -2785,36 +2785,6 @@ parameters:
27852785
count: 1
27862786
path: src/PhpSpreadsheet/Shared/OLERead.php
27872787

2788-
-
2789-
message: "#^Method PhpOffice\\\\PhpSpreadsheet\\\\Shared\\\\StringHelper\\:\\:formatNumber\\(\\) should return string but returns array\\|string\\.$#"
2790-
count: 1
2791-
path: src/PhpSpreadsheet/Shared/StringHelper.php
2792-
2793-
-
2794-
message: "#^Method PhpOffice\\\\PhpSpreadsheet\\\\Shared\\\\StringHelper\\:\\:sanitizeUTF8\\(\\) should return string but returns string\\|false\\.$#"
2795-
count: 1
2796-
path: src/PhpSpreadsheet/Shared/StringHelper.php
2797-
2798-
-
2799-
message: "#^Parameter \\#1 \\$string of function strlen expects string, float given\\.$#"
2800-
count: 1
2801-
path: src/PhpSpreadsheet/Shared/StringHelper.php
2802-
2803-
-
2804-
message: "#^Parameter \\#3 \\$subject of function str_replace expects array\\|string, float given\\.$#"
2805-
count: 1
2806-
path: src/PhpSpreadsheet/Shared/StringHelper.php
2807-
2808-
-
2809-
message: "#^Static property PhpOffice\\\\PhpSpreadsheet\\\\Shared\\\\StringHelper\\:\\:\\$decimalSeparator \\(string\\) in isset\\(\\) is not nullable\\.$#"
2810-
count: 1
2811-
path: src/PhpSpreadsheet/Shared/StringHelper.php
2812-
2813-
-
2814-
message: "#^Static property PhpOffice\\\\PhpSpreadsheet\\\\Shared\\\\StringHelper\\:\\:\\$thousandsSeparator \\(string\\) in isset\\(\\) is not nullable\\.$#"
2815-
count: 1
2816-
path: src/PhpSpreadsheet/Shared/StringHelper.php
2817-
28182788
-
28192789
message: "#^Static method PhpOffice\\\\PhpSpreadsheet\\\\Shared\\\\TimeZone\\:\\:validateTimeZone\\(\\) is unused\\.$#"
28202790
count: 1

src/PhpSpreadsheet/Shared/StringHelper.php

Lines changed: 29 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -3,13 +3,14 @@
33
namespace PhpOffice\PhpSpreadsheet\Shared;
44

55
use PhpOffice\PhpSpreadsheet\Calculation\Calculation;
6+
use UConverter;
67

78
class StringHelper
89
{
910
/** Constants */
1011
/** Regular Expressions */
1112
// Fraction
12-
const STRING_REGEXP_FRACTION = '(-?)(\d+)\s+(\d+\/\d+)';
13+
const STRING_REGEXP_FRACTION = '~^\s*(-?)((\d*)\s+)?(\d+\/\d+)\s*$~';
1314

1415
/**
1516
* Control characters array.
@@ -28,14 +29,14 @@ class StringHelper
2829
/**
2930
* Decimal separator.
3031
*
31-
* @var string
32+
* @var ?string
3233
*/
3334
private static $decimalSeparator;
3435

3536
/**
3637
* Thousands separator.
3738
*
38-
* @var string
39+
* @var ?string
3940
*/
4041
private static $thousandsSeparator;
4142

@@ -328,39 +329,51 @@ public static function controlCharacterPHP2OOXML($textValue)
328329
}
329330

330331
/**
331-
* Try to sanitize UTF8, stripping invalid byte sequences. Not perfect. Does not surrogate characters.
332+
* Try to sanitize UTF8, replacing invalid sequences with Unicode substitution characters.
332333
*/
333334
public static function sanitizeUTF8(string $textValue): string
334335
{
336+
$textValue = str_replace(["\xef\xbf\xbe", "\xef\xbf\xbf"], "\xef\xbf\xbd", $textValue);
337+
if (class_exists(UConverter::class)) {
338+
$returnValue = UConverter::transcode($textValue, 'UTF-8', 'UTF-8');
339+
if ($returnValue !== false) {
340+
return $returnValue;
341+
}
342+
}
343+
// @codeCoverageIgnoreStart
344+
// I don't think any of the code below should ever be executed.
335345
if (self::getIsIconvEnabled()) {
336-
$textValue = @iconv('UTF-8', 'UTF-8', $textValue);
337-
338-
return $textValue;
346+
$returnValue = @iconv('UTF-8', 'UTF-8', $textValue);
347+
if ($returnValue !== false) {
348+
return $returnValue;
349+
}
339350
}
340351

341-
$textValue = mb_convert_encoding($textValue, 'UTF-8', 'UTF-8');
352+
// Phpstan does not think this can return false.
353+
$returnValue = mb_convert_encoding($textValue, 'UTF-8', 'UTF-8');
342354

343-
return $textValue;
355+
return $returnValue;
356+
// @codeCoverageIgnoreEnd
344357
}
345358

346359
/**
347360
* Check if a string contains UTF8 data.
348361
*/
349362
public static function isUTF8(string $textValue): bool
350363
{
351-
return $textValue === '' || preg_match('/^./su', $textValue) === 1;
364+
return $textValue === self::sanitizeUTF8($textValue);
352365
}
353366

354367
/**
355368
* Formats a numeric value as a string for output in various output writers forcing
356369
* point as decimal separator in case locale is other than English.
357370
*
358-
* @param mixed $numericValue
371+
* @param float|int|string $numericValue
359372
*/
360373
public static function formatNumber($numericValue): string
361374
{
362375
if (is_float($numericValue)) {
363-
return str_replace(',', '.', $numericValue);
376+
return str_replace(',', '.', (string) $numericValue);
364377
}
365378

366379
return (string) $numericValue;
@@ -537,9 +550,10 @@ public static function strCaseReverse(string $textValue): string
537550
*/
538551
public static function convertToNumberIfFraction(string &$operand): bool
539552
{
540-
if (preg_match('/^' . self::STRING_REGEXP_FRACTION . '$/i', $operand, $match)) {
553+
if (preg_match(self::STRING_REGEXP_FRACTION, $operand, $match)) {
541554
$sign = ($match[1] == '-') ? '-' : '+';
542-
$fractionFormula = '=' . $sign . $match[2] . $sign . $match[3];
555+
$wholePart = ($match[3] === '') ? '' : ($sign . $match[3]);
556+
$fractionFormula = '=' . $wholePart . $sign . $match[4];
543557
$operand = Calculation::getInstance()->_calculateFormulaValue($fractionFormula);
544558

545559
return true;
@@ -686,6 +700,6 @@ public static function testStringAsNumeric($textValue)
686700
}
687701
$v = (float) $textValue;
688702

689-
return (is_numeric(substr($textValue, 0, strlen($v)))) ? $v : $textValue;
703+
return (is_numeric(substr($textValue, 0, strlen((string) $v)))) ? $v : $textValue;
690704
}
691705
}

tests/PhpSpreadsheetTests/Reader/Csv/CsvEncodingTest.php

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,25 @@ public function testGuessEncoding(string $filename): void
6666
self::assertEquals('sixième', $sheet->getCell('C2')->getValue());
6767
}
6868

69+
public function testSurrogate(): void
70+
{
71+
// Surrogates should occur only in UTF-16, and should
72+
// be properly converted to UTF8 when read.
73+
// FFFE/FFFF are illegal, and should be converted to
74+
// substitution character when read.
75+
// Excel does not handle any of the cells in row 3 well.
76+
// LibreOffice handles A3 fine, and discards B3/C3,
77+
// which is a reasonable action.
78+
$filename = 'tests/data/Reader/CSV/premiere.utf16le.csv';
79+
$reader = new Csv();
80+
$reader->setInputEncoding(Csv::guessEncoding($filename));
81+
$spreadsheet = $reader->load($filename);
82+
$sheet = $spreadsheet->getActiveSheet();
83+
self::assertEquals('𐐀', $sheet->getCell('A3')->getValue());
84+
self::assertEquals('', $sheet->getCell('B3')->getValue());
85+
self::assertEquals('', $sheet->getCell('C3')->getValue());
86+
}
87+
6988
/**
7089
* @dataProvider providerGuessEncoding
7190
*/
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
<?php
2+
3+
namespace PhpOffice\PhpSpreadsheetTests\Shared;
4+
5+
use PhpOffice\PhpSpreadsheet\Shared\StringHelper;
6+
use PhpOffice\PhpSpreadsheet\Spreadsheet;
7+
use PHPUnit\Framework\TestCase;
8+
9+
class StringHelperInvalidCharTest extends TestCase
10+
{
11+
public function testInvalidChar(): void
12+
{
13+
$spreadsheet = new Spreadsheet();
14+
$sheet = $spreadsheet->getActiveSheet();
15+
$substitution = '';
16+
$array = [
17+
['Normal string', 'Hello', 'Hello'],
18+
['integer', 2, 2],
19+
['float', 2.1, 2.1],
20+
['boolean true', true, true],
21+
['illegal FFFE/FFFF', "H\xef\xbf\xbe\xef\xbf\xbfello", "H{$substitution}{$substitution}ello"],
22+
['illegal character', "H\xef\x00\x00ello", "H{$substitution}\x00\x00ello"],
23+
['overlong character', "H\xc0\xa0ello", "H{$substitution}{$substitution}ello"],
24+
['Osmanya as single character', "H\xf0\x90\x90\x80ello", 'H𐐀ello'],
25+
['Osmanya as surrogate pair (x)', "\xed\xa0\x81\xed\xb0\x80", "{$substitution}{$substitution}{$substitution}{$substitution}{$substitution}{$substitution}"],
26+
['Osmanya as surrogate pair (u)', "\u{d801}\u{dc00}", "{$substitution}{$substitution}{$substitution}{$substitution}{$substitution}{$substitution}"],
27+
['Half surrogate pair (u)', "\u{d801}", "{$substitution}{$substitution}{$substitution}"],
28+
['Control character', "\u{7}", "\u{7}"],
29+
];
30+
31+
$sheet->fromArray($array);
32+
$row = 0;
33+
foreach ($array as $value) {
34+
self::assertSame($value[1] === $value[2], StringHelper::isUTF8((string) $value[1]));
35+
++$row;
36+
$expected = $value[2];
37+
self::assertSame(
38+
$expected,
39+
$sheet->getCell("B$row")->getValue(),
40+
$sheet->getCell("A$row")->getValue()
41+
);
42+
}
43+
}
44+
}

tests/PhpSpreadsheetTests/Shared/StringHelperTest.php

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -119,4 +119,33 @@ public function testSYLKtoUTF8(): void
119119

120120
self::assertEquals($expectedResult, $result);
121121
}
122+
123+
/**
124+
* @dataProvider providerFractions
125+
*/
126+
public function testFraction(string $expected, string $value): void
127+
{
128+
$originalValue = $value;
129+
$result = StringHelper::convertToNumberIfFraction($value);
130+
if ($result === false) {
131+
self::assertSame($expected, $originalValue);
132+
self::assertSame($expected, $value);
133+
} else {
134+
self::assertSame($expected, (string) $value);
135+
self::assertNotEquals($value, $originalValue);
136+
}
137+
}
138+
139+
public function providerFractions(): array
140+
{
141+
return [
142+
'non-fraction' => ['1', '1'],
143+
'common fraction' => ['1.5', '1 1/2'],
144+
'fraction between -1 and 0' => ['-0.5', '-1/2'],
145+
'fraction between -1 and 0 with space' => ['-0.5', ' - 1/2'],
146+
'fraction between 0 and 1' => ['0.75', '3/4 '],
147+
'fraction between 0 and 1 with space' => ['0.75', ' 3/4'],
148+
'improper fraction' => ['1.75', '7/4'],
149+
];
150+
}
122151
}
16 Bytes
Binary file not shown.

0 commit comments

Comments
 (0)