Commit b4c045e

Added encoder files
1 parent 943f70e commit b4c045e

File tree

4 files changed: +50323 -0 lines changed

characters.json

Lines changed: 1 addition & 0 deletions
{"0":"Ā","1":"ā","2":"Ă","3":"ă","4":"Ą","5":"ą","6":"Ć","7":"ć","8":"Ĉ","9":"ĉ","10":"Ċ","11":"ċ","12":"Č","13":"č","14":"Ď","15":"ď","16":"Đ","17":"đ","18":"Ē","19":"ē","20":"Ĕ","21":"ĕ","22":"Ė","23":"ė","24":"Ę","25":"ę","26":"Ě","27":"ě","28":"Ĝ","29":"ĝ","30":"Ğ","31":"ğ","32":"Ġ","33":"!","34":"\"","35":"#","36":"$","37":"%","38":"&","39":"'","40":"(","41":")","42":"*","43":"+","44":",","45":"-","46":".","47":"/","48":"0","49":"1","50":"2","51":"3","52":"4","53":"5","54":"6","55":"7","56":"8","57":"9","58":":","59":";","60":"<","61":"=","62":">","63":"?","64":"@","65":"A","66":"B","67":"C","68":"D","69":"E","70":"F","71":"G","72":"H","73":"I","74":"J","75":"K","76":"L","77":"M","78":"N","79":"O","80":"P","81":"Q","82":"R","83":"S","84":"T","85":"U","86":"V","87":"W","88":"X","89":"Y","90":"Z","91":"[","92":"\\","93":"]","94":"^","95":"_","96":"`","97":"a","98":"b","99":"c","100":"d","101":"e","102":"f","103":"g","104":"h","105":"i","106":"j","107":"k","108":"l","109":"m","110":"n","111":"o","112":"p","113":"q","114":"r","115":"s","116":"t","117":"u","118":"v","119":"w","120":"x","121":"y","122":"z","123":"{","124":"|","125":"}","126":"~","127":"ġ","128":"Ģ","129":"ģ","130":"Ĥ","131":"ĥ","132":"Ħ","133":"ħ","134":"Ĩ","135":"ĩ","136":"Ī","137":"ī","138":"Ĭ","139":"ĭ","140":"Į","141":"į","142":"İ","143":"ı","144":"IJ","145":"ij","146":"Ĵ","147":"ĵ","148":"Ķ","149":"ķ","150":"ĸ","151":"Ĺ","152":"ĺ","153":"Ļ","154":"ļ","155":"Ľ","156":"ľ","157":"Ŀ","158":"ŀ","159":"Ł","160":"ł","161":"¡","162":"¢","163":"£","164":"¤","165":"¥","166":"¦","167":"§","168":"¨","169":"©","170":"ª","171":"«","172":"¬","173":"Ń","174":"®","175":"¯","176":"°","177":"±","178":"²","179":"³","180":"´","181":"µ","182":"¶","183":"·","184":"¸","185":"¹","186":"º","187":"»","188":"¼","189":"½","190":"¾","191":"¿","192":"À","193":"Á","194":"Â","195":"Ã","196":"Ä","197":"Å","198":"Æ","199":"Ç","200":"È","201":"É","202":"Ê","203":"Ë","204":"Ì","205":"Í","206":"Î","207":"Ï","208":"Ð","209":"Ñ","210":"Ò","211":"Ó","212":"Ô","213":"Õ","214":"Ö","215":"×","216":"Ø","217":"Ù","218":"Ú","219":"Û","220":"Ü","221":"Ý","222":"Þ","223":"ß","224":"à","225":"á","226":"â","227":"ã","228":"ä","229":"å","230":"æ","231":"ç","232":"è","233":"é","234":"ê","235":"ë","236":"ì","237":"í","238":"î","239":"ï","240":"ð","241":"ñ","242":"ò","243":"ó","244":"ô","245":"õ","246":"ö","247":"÷","248":"ø","249":"ù","250":"ú","251":"û","252":"ü","253":"ý","254":"þ","255":"ÿ"}

encoder.json

Lines changed: 1 addition & 0 deletions
Large diffs are not rendered by default.
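
The renderer collapses the file body, but judging by how gpt3-encoder.php consumes it ($encoder[$x] after the merge step), encoder.json is presumably the usual GPT-2/GPT-3 vocabulary map from BPE token strings to integer token ids. A hedged sketch of that final lookup, using the same fallback the committed gpt_encode() applies to pieces missing from the vocabulary:

<?php
// Illustrative only: map merged BPE pieces to token ids; keep the raw piece
// when the vocabulary has no entry for it, as gpt_encode() below does.
$encoder = json_decode(file_get_contents(__DIR__ . '/encoder.json'), true);
$ids = array();
foreach (array('Ġhello', 'Ġworld') as $piece)   // example pieces; output of the merge step
{
    $ids[] = isset($encoder[$piece]) ? $encoder[$piece] : $piece;
}
print_r($ids);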

gpt3-encoder.php

Lines changed: 320 additions & 0 deletions
<?php

// Encode a UTF-8 string into an array of GPT-2/GPT-3 BPE token ids.
function gpt_encode($text)
{
    $bpe_tokens = array();
    if(empty($text))
    {
        return $bpe_tokens;
    }

    // Byte-to-unicode table: maps every byte value 0-255 to a printable character.
    $raw_chars = file_get_contents(dirname(__FILE__) . "/characters.json");
    $byte_encoder = json_decode($raw_chars, true);
    if(empty($byte_encoder))
    {
        error_log('Failed to load characters.json: ' . $raw_chars);
        return $bpe_tokens;
    }

    // Vocabulary: maps BPE token strings to integer token ids.
    $rencoder = file_get_contents(dirname(__FILE__) . "/encoder.json");
    $encoder = json_decode($rencoder, true);
    if(empty($encoder))
    {
        error_log('Failed to load encoder.json: ' . $rencoder);
        return $bpe_tokens;
    }

    // Merge rules: one "first second" pair per line, ordered by merge priority.
    $bpe_file = file_get_contents(dirname(__FILE__) . "/vocab.bpe");
    if(empty($bpe_file))
    {
        error_log('Failed to load vocab.bpe');
        return $bpe_tokens;
    }

    // Pre-tokenize: split the text into contractions, words, numbers, punctuation and whitespace runs.
    preg_match_all("#'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+#u", $text, $matches);
    if(!isset($matches[0]) || count($matches[0]) == 0)
    {
        error_log('Failed to match string: ' . $text);
        return $bpe_tokens;
    }

    // Parse vocab.bpe (skipping its version header line) into an ordered list of merge pairs.
    $lines = preg_split('/\r\n|\r|\n/', $bpe_file);
    $bpe_merges = array();
    $bpe_merges_temp = array_slice($lines, 1, count($lines), true);
    foreach($bpe_merges_temp as $bmt)
    {
        $split_bmt = preg_split('#(\s+)#', $bmt);
        $split_bmt = array_filter($split_bmt, 'gpt_my_filter');
        if(count($split_bmt) > 0)
        {
            $bpe_merges[] = $split_bmt;
        }
    }
    $bpe_ranks = gpt_dictZip($bpe_merges, range(0, count($bpe_merges) - 1));

    $cache = array();
    foreach($matches[0] as $token)
    {
        $new_tokens = array();
        $chars = array();
        // utf8_encode() turns each raw byte into one UTF-8 character (deprecated since PHP 8.2),
        // so the loop below walks the token byte by byte.
        $token = utf8_encode($token);
        if(function_exists('mb_strlen'))
        {
            $len = mb_strlen($token, 'UTF-8');
            for ($i = 0; $i < $len; $i++)
            {
                $chars[] = mb_substr($token, $i, 1, 'UTF-8');
            }
        }
        else
        {
            $chars = str_split($token);
        }

        // Map each byte through the byte encoder to build the word the BPE step operates on.
        $result_word = '';
        foreach($chars as $char)
        {
            if(isset($byte_encoder[gpt_unichr($char)]))
            {
                $result_word .= $byte_encoder[gpt_unichr($char)];
            }
        }

        // Apply the merge rules, then look each resulting piece up in the vocabulary,
        // falling back to the raw piece when it has no id.
        $new_tokens_bpe = gpt_bpe($result_word, $bpe_ranks, $cache);
        $new_tokens_bpe = explode(' ', $new_tokens_bpe);
        foreach($new_tokens_bpe as $x)
        {
            if(isset($encoder[$x]))
            {
                $new_tokens[$x] = $encoder[$x];
            }
            else
            {
                $new_tokens[$x] = $x;
            }
        }
        foreach($new_tokens as $ninx => $nval)
        {
            if(isset($bpe_tokens[$ninx]))
            {
                $bpe_tokens[] = $nval;
            }
            else
            {
                $bpe_tokens[$ninx] = $nval;
            }
        }
    }
    return $bpe_tokens;
}

function gpt_my_filter($var)
{
    return ($var !== NULL && $var !== FALSE && $var !== '');
}

// Decode a single UTF-8 encoded character into its Unicode code point.
function gpt_unichr($c)
{
    if (ord($c[0]) >= 0 && ord($c[0]) <= 127)
    {
        return ord($c[0]);
    }
    if (ord($c[0]) >= 192 && ord($c[0]) <= 223)
    {
        return (ord($c[0]) - 192) * 64 + (ord($c[1]) - 128);
    }
    if (ord($c[0]) >= 224 && ord($c[0]) <= 239)
    {
        return (ord($c[0]) - 224) * 4096 + (ord($c[1]) - 128) * 64 + (ord($c[2]) - 128);
    }
    if (ord($c[0]) >= 240 && ord($c[0]) <= 247)
    {
        return (ord($c[0]) - 240) * 262144 + (ord($c[1]) - 128) * 4096 + (ord($c[2]) - 128) * 64 + (ord($c[3]) - 128);
    }
    if (ord($c[0]) >= 248 && ord($c[0]) <= 251)
    {
        return (ord($c[0]) - 248) * 16777216 + (ord($c[1]) - 128) * 262144 + (ord($c[2]) - 128) * 4096 + (ord($c[3]) - 128) * 64 + (ord($c[4]) - 128);
    }
    if (ord($c[0]) >= 252 && ord($c[0]) <= 253)
    {
        return (ord($c[0]) - 252) * 1073741824 + (ord($c[1]) - 128) * 16777216 + (ord($c[2]) - 128) * 262144 + (ord($c[3]) - 128) * 4096 + (ord($c[4]) - 128) * 64 + (ord($c[5]) - 128);
    }
    // 0xFE and 0xFF never start a valid UTF-8 sequence.
    if (ord($c[0]) >= 254 && ord($c[0]) <= 255)
    {
        return 0;
    }
    return 0;
}

// Build the BPE rank table: "first,second" merge pair => merge priority (lower merges first).
function gpt_dictZip($x, $y)
{
    $result = array();
    $cnt = 0;
    foreach($x as $i)
    {
        if(isset($i[1]) && isset($i[0]))
        {
            $result[$i[0] . ',' . $i[1]] = $cnt;
            $cnt++;
        }
    }
    return $result;
}

// Return every adjacent pair of symbols in a word (given as an array of symbols).
function gpt_get_pairs($word)
{
    $pairs = array();
    $prev_char = $word[0];
    for ($i = 1; $i < count($word); $i++)
    {
        $char = $word[$i];
        $pairs[] = array($prev_char, $char);
        $prev_char = $char;
    }
    return $pairs;
}

function gpt_split($str, $len = 1)
{
    $arr = [];
    if(function_exists('mb_strlen'))
    {
        $length = mb_strlen($str, 'UTF-8');
    }
    else
    {
        $length = strlen($str);
    }

    for ($i = 0; $i < $length; $i += $len)
    {
        if(function_exists('mb_substr'))
        {
            $arr[] = mb_substr($str, $i, $len, 'UTF-8');
        }
        else
        {
            $arr[] = substr($str, $i, $len);
        }
    }
    return $arr;
}

// Apply byte-pair-encoding merges to a single pre-tokenized word.
// Returns the merged symbols joined by spaces; results are memoized in $cache.
function gpt_bpe($token, $bpe_ranks, &$cache)
{
    if(array_key_exists($token, $cache))
    {
        return $cache[$token];
    }
    $word = gpt_split($token);
    $init_len = count($word);
    $pairs = gpt_get_pairs($word);
    if(!$pairs)
    {
        return $token;
    }
    while (true)
    {
        // Find the adjacent pair with the lowest merge rank; unknown pairs get a huge sentinel rank.
        $minPairs = array();
        foreach($pairs as $pair)
        {
            if(array_key_exists($pair[0] . ',' . $pair[1], $bpe_ranks))
            {
                $rank = $bpe_ranks[$pair[0] . ',' . $pair[1]];
                $minPairs[$rank] = $pair;
            }
            else
            {
                $minPairs[10e10] = $pair;
            }
        }
        ksort($minPairs);
        $min_key = array_key_first($minPairs);
        foreach($minPairs as $mpi => $mp)
        {
            if($mpi < $min_key)
            {
                $min_key = $mpi;
            }
        }
        $bigram = $minPairs[$min_key];
        if(!array_key_exists($bigram[0] . ',' . $bigram[1], $bpe_ranks))
        {
            // No remaining pair can be merged.
            break;
        }
        $first = $bigram[0];
        $second = $bigram[1];

        // Rebuild the word, merging every occurrence of ($first, $second) into one symbol.
        $new_word = array();
        $i = 0;
        while ($i < count($word))
        {
            $j = gpt_indexOf($word, $first, $i);
            if ($j === -1)
            {
                $new_word = array_merge($new_word, array_slice($word, $i, null, true));
                break;
            }
            if($i > $j)
            {
                $slicer = array();
            }
            elseif($j == 0)
            {
                $slicer = array();
            }
            else
            {
                $slicer = array_slice($word, $i, $j - $i, true);
            }
            $new_word = array_merge($new_word, $slicer);
            if(count($new_word) > $init_len)
            {
                break;
            }
            $i = $j;
            if ($word[$i] === $first && $i < count($word) - 1 && $word[$i + 1] === $second)
            {
                array_push($new_word, $first . $second);
                $i = $i + 2;
            }
            else
            {
                array_push($new_word, $word[$i]);
                $i = $i + 1;
            }
        }
        if($word == $new_word)
        {
            break;
        }
        $word = $new_word;
        if (count($word) === 1)
        {
            break;
        }
        else
        {
            $pairs = gpt_get_pairs($word);
        }
    }
    $word = implode(' ', $word);
    $cache[$token] = $word;
    return $word;
}

// Return the first key at or after $fromIndex whose value equals $searchElement, or -1 if none.
function gpt_indexOf($array, $searchElement, $fromIndex)
{
    foreach($array as $index => $value)
    {
        if($index < $fromIndex)
        {
            continue;
        }
        if($value == $searchElement)
        {
            return $index;
        }
    }
    return -1;
}

$prompt = "Many words map to one token, but some don't: indivisible. Unicode characters like emojis may be split into many tokens containing the underlying bytes: 🤚🏾 Sequences of characters commonly found next to each other may be grouped together: 1234567890";
$token_array = gpt_encode($prompt);
error_log('Token array: ' . print_r($token_array, true));
error_log('Count: ' . count($token_array));

?>
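
The file ends with its own smoke test (the $prompt block above), so it logs a token array and count as soon as it is executed. Pulling the encoder into another script might look like the sketch below; the require path is an assumption, and note that requiring the file also triggers that built-in demo:

<?php
// Illustrative usage only; gpt3-encoder.php runs its bottom-of-file demo when required.
require __DIR__ . '/gpt3-encoder.php';

$tokens = gpt_encode('The quick brown fox');
echo count($tokens) . ' tokens: ' . implode(', ', $tokens) . PHP_EOL;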
