Commit b4c045e

Added encoder files
1 parent 943f70e commit b4c045e

File tree

4 files changed: +50323 -0 lines changed

characters.json

Lines changed: 1 addition & 0 deletions
{"0":"Ā","1":"ā","2":"Ă","3":"ă","4":"Ą","5":"ą","6":"Ć","7":"ć","8":"Ĉ","9":"ĉ","10":"Ċ","11":"ċ","12":"Č","13":"č","14":"Ď","15":"ď","16":"Đ","17":"đ","18":"Ē","19":"ē","20":"Ĕ","21":"ĕ","22":"Ė","23":"ė","24":"Ę","25":"ę","26":"Ě","27":"ě","28":"Ĝ","29":"ĝ","30":"Ğ","31":"ğ","32":"Ġ","33":"!","34":"\"","35":"#","36":"$","37":"%","38":"&","39":"'","40":"(","41":")","42":"*","43":"+","44":",","45":"-","46":".","47":"/","48":"0","49":"1","50":"2","51":"3","52":"4","53":"5","54":"6","55":"7","56":"8","57":"9","58":":","59":";","60":"<","61":"=","62":">","63":"?","64":"@","65":"A","66":"B","67":"C","68":"D","69":"E","70":"F","71":"G","72":"H","73":"I","74":"J","75":"K","76":"L","77":"M","78":"N","79":"O","80":"P","81":"Q","82":"R","83":"S","84":"T","85":"U","86":"V","87":"W","88":"X","89":"Y","90":"Z","91":"[","92":"\\","93":"]","94":"^","95":"_","96":"`","97":"a","98":"b","99":"c","100":"d","101":"e","102":"f","103":"g","104":"h","105":"i","106":"j","107":"k","108":"l","109":"m","110":"n","111":"o","112":"p","113":"q","114":"r","115":"s","116":"t","117":"u","118":"v","119":"w","120":"x","121":"y","122":"z","123":"{","124":"|","125":"}","126":"~","127":"ġ","128":"Ģ","129":"ģ","130":"Ĥ","131":"ĥ","132":"Ħ","133":"ħ","134":"Ĩ","135":"ĩ","136":"Ī","137":"ī","138":"Ĭ","139":"ĭ","140":"Į","141":"į","142":"İ","143":"ı","144":"IJ","145":"ij","146":"Ĵ","147":"ĵ","148":"Ķ","149":"ķ","150":"ĸ","151":"Ĺ","152":"ĺ","153":"Ļ","154":"ļ","155":"Ľ","156":"ľ","157":"Ŀ","158":"ŀ","159":"Ł","160":"ł","161":"¡","162":"¢","163":"£","164":"¤","165":"¥","166":"¦","167":"§","168":"¨","169":"©","170":"ª","171":"«","172":"¬","173":"Ń","174":"®","175":"¯","176":"°","177":"±","178":"²","179":"³","180":"´","181":"µ","182":"¶","183":"·","184":"¸","185":"¹","186":"º","187":"»","188":"¼","189":"½","190":"¾","191":"¿","192":"À","193":"Á","194":"Â","195":"Ã","196":"Ä","197":"Å","198":"Æ","199":"Ç","200":"È","201":"É","202":"Ê","203":"Ë","204":"Ì","205":"Í","206":"Î","207":"Ï","208":"Ð","209":"Ñ","210":"Ò","211":"Ó","212":"Ô","213":"Õ","214":"Ö","215":"×","216":"Ø","217":"Ù","218":"Ú","219":"Û","220":"Ü","221":"Ý","222":"Þ","223":"ß","224":"à","225":"á","226":"â","227":"ã","228":"ä","229":"å","230":"æ","231":"ç","232":"è","233":"é","234":"ê","235":"ë","236":"ì","237":"í","238":"î","239":"ï","240":"ð","241":"ñ","242":"ò","243":"ó","244":"ô","245":"õ","246":"ö","247":"÷","248":"ø","249":"ù","250":"ú","251":"û","252":"ü","253":"ý","254":"þ","255":"ÿ"}

encoder.json

Lines changed: 1 addition & 0 deletions
Large diffs are not rendered by default.
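
The renderer collapses the file body, but judging by how gpt3-encoder.php consumes it ($encoder[$x] after the merge step), encoder.json is presumably the usual GPT-2/GPT-3 vocabulary map from BPE token strings to integer token ids. A hedged sketch of that final lookup, using the same fallback the committed gpt_encode() applies to pieces missing from the vocabulary:

<?php
// Illustrative only: map merged BPE pieces to token ids; keep the raw piece
// when the vocabulary has no entry for it, as gpt_encode() below does.
$encoder = json_decode(file_get_contents(__DIR__ . '/encoder.json'), true);
$ids = array();
foreach (array('Ġhello', 'Ġworld') as $piece)   // example pieces; output of the merge step
{
    $ids[] = isset($encoder[$piece]) ? $encoder[$piece] : $piece;
}
print_r($ids);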

gpt3-encoder.php

Lines changed: 320 additions & 0 deletions
<?php

// Encode a UTF-8 string into an array of GPT-2/GPT-3 BPE token ids.
function gpt_encode($text)
{
    $bpe_tokens = array();
    if(empty($text))
    {
        return $bpe_tokens;
    }

    // Byte-to-unicode table: maps every byte value 0-255 to a printable character.
    $raw_chars = file_get_contents(dirname(__FILE__) . "/characters.json");
    $byte_encoder = json_decode($raw_chars, true);
    if(empty($byte_encoder))
    {
        error_log('Failed to load characters.json: ' . $raw_chars);
        return $bpe_tokens;
    }

    // Vocabulary: maps BPE token strings to integer token ids.
    $rencoder = file_get_contents(dirname(__FILE__) . "/encoder.json");
    $encoder = json_decode($rencoder, true);
    if(empty($encoder))
    {
        error_log('Failed to load encoder.json: ' . $rencoder);
        return $bpe_tokens;
    }

    // Merge rules: one "first second" pair per line, ordered by merge priority.
    $bpe_file = file_get_contents(dirname(__FILE__) . "/vocab.bpe");
    if(empty($bpe_file))
    {
        error_log('Failed to load vocab.bpe');
        return $bpe_tokens;
    }

    // Pre-tokenize: split the text into contractions, words, numbers, punctuation and whitespace runs.
    preg_match_all("#'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+#u", $text, $matches);
    if(!isset($matches[0]) || count($matches[0]) == 0)
    {
        error_log('Failed to match string: ' . $text);
        return $bpe_tokens;
    }

    // Parse vocab.bpe (skipping its version header line) into an ordered list of merge pairs.
    $lines = preg_split('/\r\n|\r|\n/', $bpe_file);
    $bpe_merges = array();
    $bpe_merges_temp = array_slice($lines, 1, count($lines), true);
    foreach($bpe_merges_temp as $bmt)
    {
        $split_bmt = preg_split('#(\s+)#', $bmt);
        $split_bmt = array_filter($split_bmt, 'gpt_my_filter');
        if(count($split_bmt) > 0)
        {
            $bpe_merges[] = $split_bmt;
        }
    }
    $bpe_ranks = gpt_dictZip($bpe_merges, range(0, count($bpe_merges) - 1));

    $cache = array();
    foreach($matches[0] as $token)
    {
        $new_tokens = array();
        $chars = array();
        // utf8_encode() turns each raw byte into one UTF-8 character (deprecated since PHP 8.2),
        // so the loop below walks the token byte by byte.
        $token = utf8_encode($token);
        if(function_exists('mb_strlen'))
        {
            $len = mb_strlen($token, 'UTF-8');
            for ($i = 0; $i < $len; $i++)
            {
                $chars[] = mb_substr($token, $i, 1, 'UTF-8');
            }
        }
        else
        {
            $chars = str_split($token);
        }

        // Map each byte through the byte encoder to build the word the BPE step operates on.
        $result_word = '';
        foreach($chars as $char)
        {
            if(isset($byte_encoder[gpt_unichr($char)]))
            {
                $result_word .= $byte_encoder[gpt_unichr($char)];
            }
        }

        // Apply the merge rules, then look each resulting piece up in the vocabulary,
        // falling back to the raw piece when it has no id.
        $new_tokens_bpe = gpt_bpe($result_word, $bpe_ranks, $cache);
        $new_tokens_bpe = explode(' ', $new_tokens_bpe);
        foreach($new_tokens_bpe as $x)
        {
            if(isset($encoder[$x]))
            {
                $new_tokens[$x] = $encoder[$x];
            }
            else
            {
                $new_tokens[$x] = $x;
            }
        }
        foreach($new_tokens as $ninx => $nval)
        {
            if(isset($bpe_tokens[$ninx]))
            {
                $bpe_tokens[] = $nval;
            }
            else
            {
                $bpe_tokens[$ninx] = $nval;
            }
        }
    }
    return $bpe_tokens;
}

function gpt_my_filter($var)
{
    return ($var !== NULL && $var !== FALSE && $var !== '');
}

// Decode a single UTF-8 encoded character into its Unicode code point.
function gpt_unichr($c)
{
    if (ord($c[0]) >= 0 && ord($c[0]) <= 127)
    {
        return ord($c[0]);
    }
    if (ord($c[0]) >= 192 && ord($c[0]) <= 223)
    {
        return (ord($c[0]) - 192) * 64 + (ord($c[1]) - 128);
    }
    if (ord($c[0]) >= 224 && ord($c[0]) <= 239)
    {
        return (ord($c[0]) - 224) * 4096 + (ord($c[1]) - 128) * 64 + (ord($c[2]) - 128);
    }
    if (ord($c[0]) >= 240 && ord($c[0]) <= 247)
    {
        return (ord($c[0]) - 240) * 262144 + (ord($c[1]) - 128) * 4096 + (ord($c[2]) - 128) * 64 + (ord($c[3]) - 128);
    }
    if (ord($c[0]) >= 248 && ord($c[0]) <= 251)
    {
        return (ord($c[0]) - 248) * 16777216 + (ord($c[1]) - 128) * 262144 + (ord($c[2]) - 128) * 4096 + (ord($c[3]) - 128) * 64 + (ord($c[4]) - 128);
    }
    if (ord($c[0]) >= 252 && ord($c[0]) <= 253)
    {
        return (ord($c[0]) - 252) * 1073741824 + (ord($c[1]) - 128) * 16777216 + (ord($c[2]) - 128) * 262144 + (ord($c[3]) - 128) * 4096 + (ord($c[4]) - 128) * 64 + (ord($c[5]) - 128);
    }
    // 0xFE and 0xFF never start a valid UTF-8 sequence.
    if (ord($c[0]) >= 254 && ord($c[0]) <= 255)
    {
        return 0;
    }
    return 0;
}

// Build the BPE rank table: "first,second" merge pair => merge priority (lower merges first).
function gpt_dictZip($x, $y)
{
    $result = array();
    $cnt = 0;
    foreach($x as $i)
    {
        if(isset($i[1]) && isset($i[0]))
        {
            $result[$i[0] . ',' . $i[1]] = $cnt;
            $cnt++;
        }
    }
    return $result;
}

// Return every adjacent pair of symbols in a word (given as an array of symbols).
function gpt_get_pairs($word)
{
    $pairs = array();
    $prev_char = $word[0];
    for ($i = 1; $i < count($word); $i++)
    {
        $char = $word[$i];
        $pairs[] = array($prev_char, $char);
        $prev_char = $char;
    }
    return $pairs;
}

function gpt_split($str, $len = 1)
{
    $arr = [];
    if(function_exists('mb_strlen'))
    {
        $length = mb_strlen($str, 'UTF-8');
    }
    else
    {
        $length = strlen($str);
    }

    for ($i = 0; $i < $length; $i += $len)
    {
        if(function_exists('mb_substr'))
        {
            $arr[] = mb_substr($str, $i, $len, 'UTF-8');
        }
        else
        {
            $arr[] = substr($str, $i, $len);
        }
    }
    return $arr;
}

// Apply byte-pair-encoding merges to a single pre-tokenized word.
// Returns the merged symbols joined by spaces; results are memoized in $cache.
function gpt_bpe($token, $bpe_ranks, &$cache)
{
    if(array_key_exists($token, $cache))
    {
        return $cache[$token];
    }
    $word = gpt_split($token);
    $init_len = count($word);
    $pairs = gpt_get_pairs($word);
    if(!$pairs)
    {
        return $token;
    }
    while (true)
    {
        // Find the adjacent pair with the lowest merge rank; unknown pairs get a huge sentinel rank.
        $minPairs = array();
        foreach($pairs as $pair)
        {
            if(array_key_exists($pair[0] . ',' . $pair[1], $bpe_ranks))
            {
                $rank = $bpe_ranks[$pair[0] . ',' . $pair[1]];
                $minPairs[$rank] = $pair;
            }
            else
            {
                $minPairs[10e10] = $pair;
            }
        }
        ksort($minPairs);
        $min_key = array_key_first($minPairs);
        foreach($minPairs as $mpi => $mp)
        {
            if($mpi < $min_key)
            {
                $min_key = $mpi;
            }
        }
        $bigram = $minPairs[$min_key];
        if(!array_key_exists($bigram[0] . ',' . $bigram[1], $bpe_ranks))
        {
            // No remaining pair can be merged.
            break;
        }
        $first = $bigram[0];
        $second = $bigram[1];

        // Rebuild the word, merging every occurrence of ($first, $second) into one symbol.
        $new_word = array();
        $i = 0;
        while ($i < count($word))
        {
            $j = gpt_indexOf($word, $first, $i);
            if ($j === -1)
            {
                $new_word = array_merge($new_word, array_slice($word, $i, null, true));
                break;
            }
            if($i > $j)
            {
                $slicer = array();
            }
            elseif($j == 0)
            {
                $slicer = array();
            }
            else
            {
                $slicer = array_slice($word, $i, $j - $i, true);
            }
            $new_word = array_merge($new_word, $slicer);
            if(count($new_word) > $init_len)
            {
                break;
            }
            $i = $j;
            if ($word[$i] === $first && $i < count($word) - 1 && $word[$i + 1] === $second)
            {
                array_push($new_word, $first . $second);
                $i = $i + 2;
            }
            else
            {
                array_push($new_word, $word[$i]);
                $i = $i + 1;
            }
        }
        if($word == $new_word)
        {
            break;
        }
        $word = $new_word;
        if (count($word) === 1)
        {
            break;
        }
        else
        {
            $pairs = gpt_get_pairs($word);
        }
    }
    $word = implode(' ', $word);
    $cache[$token] = $word;
    return $word;
}

// Return the first key at or after $fromIndex whose value equals $searchElement, or -1 if none.
function gpt_indexOf($array, $searchElement, $fromIndex)
{
    foreach($array as $index => $value)
    {
        if($index < $fromIndex)
        {
            continue;
        }
        if($value == $searchElement)
        {
            return $index;
        }
    }
    return -1;
}

$prompt = "Many words map to one token, but some don't: indivisible. Unicode characters like emojis may be split into many tokens containing the underlying bytes: 🤚🏾 Sequences of characters commonly found next to each other may be grouped together: 1234567890";
$token_array = gpt_encode($prompt);
error_log('Token array: ' . print_r($token_array, true));
error_log('Count: ' . count($token_array));

?>
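
The file ends with its own smoke test (the $prompt block above), so it logs a token array and count as soon as it is executed. Pulling the encoder into another script might look like the sketch below; the require path is an assumption, and note that requiring the file also triggers that built-in demo:

<?php
// Illustrative usage only; gpt3-encoder.php runs its bottom-of-file demo when required.
require __DIR__ . '/gpt3-encoder.php';

$tokens = gpt_encode('The quick brown fox');
echo count($tokens) . ' tokens: ' . implode(', ', $tokens) . PHP_EOL;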
