@@ -101,6 +101,72 @@ class File extends BinaryStream {
101101 "Ccaron " , "ccaron " , "dmacron "
102102 );
103103
/**
 * Returns the Unicode code point of the first character of a string.
 *
 * Delegates to mb_ord() when that function is available. Otherwise the
 * input is converted to UTF-8 (when another encoding is given) and the
 * first UTF-8 sequence is decoded by hand.
 *
 * @param string      $c        String whose first character is examined.
 * @param string|null $encoding Encoding of $c; null is treated as UTF-8 by
 *                              the hand-written decoder and on PHP < 8.
 *
 * @return int|false The code point, or false when the leading bytes do not
 *                   form a valid UTF-8 sequence (fallback path) or mb_ord()
 *                   reports a failure.
 */
private function uniord(string $c, ?string $encoding = null) {
    if (function_exists("mb_ord")) {
        if (PHP_VERSION_ID < 80000 && $encoding === null) {
            // in PHP < 8 the encoding argument, if supplied, must be a valid encoding
            $encoding = "UTF-8";
        }
        return mb_ord($c, $encoding);
    }

    if ($encoding !== null && $encoding !== "UTF-8") {
        $c = mb_convert_encoding($c, "UTF-8", $encoding);
    }

    // $c is UTF-8 at this point, so measure the first character as UTF-8
    // instead of relying on whatever mb_internal_encoding() happens to be.
    $length = mb_strlen(mb_substr($c, 0, 1, "UTF-8"), '8bit');
    $ord = false;
    $bytes = [];
    $numbytes = 1;
    for ($i = 0; $i < $length; $i++) {
        $o = \ord($c[$i]); // get one byte of the string at a time
        if (\count($bytes) === 0) { // get starting octet
            if ($o <= 0x7F) {
                $ord = $o;
                $numbytes = 1;
            } elseif (($o >> 0x05) === 0x06) { // 2 bytes character (0x06 = 110 BIN)
                $bytes[] = ($o - 0xC0) << 0x06;
                $numbytes = 2;
            } elseif (($o >> 0x04) === 0x0E) { // 3 bytes character (0x0E = 1110 BIN)
                $bytes[] = ($o - 0xE0) << 0x0C;
                $numbytes = 3;
            } elseif (($o >> 0x03) === 0x1E) { // 4 bytes character (0x1E = 11110 BIN)
                $bytes[] = ($o - 0xF0) << 0x12;
                $numbytes = 4;
            } else {
                // not a valid UTF-8 lead byte
                $ord = false;
                break;
            }
        } elseif (($o >> 0x06) === 0x02) { // bytes 2, 3 and 4 must start with 0x02 = 10 BIN
            $bytes[] = $o - 0x80;
            if (\count($bytes) === $numbytes) {
                // compose UTF-8 bytes to a single unicode value
                $o = $bytes[0];
                for ($j = 1; $j < $numbytes; $j++) {
                    $o += ($bytes[$j] << (($numbytes - $j - 1) * 0x06));
                }
                if (($o >= 0xD800 && $o <= 0xDFFF) || $o > 0x10FFFF) {
                    // The definition of UTF-8 prohibits encoding character numbers between
                    // U+D800 and U+DFFF, which are reserved for use with the UTF-16
                    // encoding form (as surrogate pairs) and do not directly represent
                    // characters. Values above U+10FFFF lie outside the Unicode code
                    // space; U+10FFFF itself is a valid code point and is accepted.
                    return false;
                }
                $ord = $o;
                // reset decoder state
                $bytes = [];
                $numbytes = 1;
            }
        } else {
            // expected a continuation byte but found something else
            $ord = false;
            break;
        }
    }

    return $ord;
}
169+
104170 function getTable () {
105171 $ this ->parseTableEntries ();
106172
@@ -157,7 +223,7 @@ function utf8toUnicode($str) {
157223 function getUnicodeCharMap () {
158224 $ subtable = null ;
159225 foreach ($ this ->getData ("cmap " , "subtables " ) as $ _subtable ) {
160- if ($ _subtable ["platformID " ] == 0 || $ _subtable ["platformID " ] == 3 && $ _subtable ["platformSpecificID " ] == 1 ) {
226+ if ($ _subtable ["platformID " ] == 0 || ( $ _subtable ["platformID " ] == 3 && $ _subtable ["platformSpecificID " ] == 1 ) ) {
161227 $ subtable = $ _subtable ;
162228 break ;
163229 }
@@ -167,6 +233,51 @@ function getUnicodeCharMap() {
167233 return $ subtable ["glyphIndexArray " ];
168234 }
169235
236+ $ system_encodings = mb_list_encodings ();
237+ $ system_encodings = array_change_key_case (array_fill_keys ($ system_encodings , true ), CASE_UPPER );
238+ foreach ($ this ->getData ("cmap " , "subtables " ) as $ _subtable ) {
239+ $ encoding = null ;
240+ switch ($ _subtable ["platformID " ]) {
241+ case 3 :
242+ switch ($ _subtable ["platformSpecificID " ]) {
243+ case 2 :
244+ if (\array_key_exists ("SJIS " , $ system_encodings )) {
245+ $ encoding = "SJIS " ;
246+ }
247+ break ;
248+ case 3 :
249+ if (\array_key_exists ("GB18030 " , $ system_encodings )) {
250+ $ encoding = "GB18030 " ;
251+ }
252+ break ;
253+ case 4 :
254+ if (\array_key_exists ("BIG-5 " , $ system_encodings )) {
255+ $ encoding = "BIG-5 " ;
256+ }
257+ break ;
258+ case 5 :
259+ if (\array_key_exists ("UHC " , $ system_encodings )) {
260+ $ encoding = "UHC " ;
261+ }
262+ break ;
263+ }
264+ break ;
265+ }
266+ if ($ encoding ) {
267+ $ glyphIndexArray = array ();
268+ foreach ($ _subtable ["glyphIndexArray " ] as $ c => $ gid ) {
269+ $ str = trim (pack ("N " , $ c ));
270+ if (\strlen ($ str ) > 0 ) {
271+ $ ord = $ this ->uniord ($ str , $ encoding );
272+ if ($ ord > 0 ) {
273+ $ glyphIndexArray [$ ord ] = $ gid ;
274+ }
275+ }
276+ }
277+ return $ glyphIndexArray ;
278+ }
279+ }
280+
170281 return null ;
171282 }
172283
0 commit comments