Skip to content

Commit b85873e

Browse files
author
Anton Kuznetsov
committed
Optimized bpe_merges calculations
1 parent b8c5f2c commit b85873e

File tree

1 file changed

+22
-14
lines changed

1 file changed

+22
-14
lines changed

src/GPT3Encoder/Gpt3Encoder.php

Lines changed: 22 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -29,24 +29,14 @@ public function encode(string $text): array
2929
if (empty($text)) {
3030
return $bpe_tokens;
3131
}
32-
$byte_encoder = $this->byteEncoder();
33-
$encoder = $this->encoder();
34-
$bpe_file = $this->vocabulary();
35-
3632
preg_match_all("#'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+#u", $text, $matches);
3733
if (!isset($matches[0]) || count($matches[0]) === 0) {
3834
throw new \RuntimeException('Failed to match string');
3935
}
40-
$lines = preg_split('/\r\n|\r|\n/', $bpe_file);
41-
$bpe_merges = [];
42-
$bpe_merges_temp = array_slice($lines, 1, count($lines), true);
43-
foreach ($bpe_merges_temp as $bmt) {
44-
$split_bmt = preg_split('#(\s+)#', $bmt);
45-
$split_bmt = array_filter($split_bmt, fn($item) => $this->myFilter($item));
46-
if (count($split_bmt) > 0) {
47-
$bpe_merges[] = $split_bmt;
48-
}
49-
}
36+
$byte_encoder = $this->byteEncoder();
37+
$encoder = $this->encoder();
38+
$bpe_file = $this->vocabulary();
39+
$bpe_merges = $this->merges($bpe_file);
5040
$bpe_ranks = $this->dictZip($bpe_merges);
5141

5242
$cache = [];
@@ -350,4 +340,22 @@ public function decode(array $tokens, bool $throwOnUndefinedChar = false): strin
350340
}
351341
return $output;
352342
}
343+
344+
/**
 * Returns the BPE merge entries parsed from the vocabulary text, using the
 * cache entry 'bpe_merges' when it already holds a truthy value.
 *
 * On a cache miss the text is split into lines, the first line is skipped,
 * and each remaining line is split on whitespace. The resulting pieces are
 * passed through myFilter(); lines that yield no pieces are dropped. The
 * parsed list is stored back under 'bpe_merges' before being returned.
 *
 * NOTE(review): the cache key does not depend on $bpe_file, so this assumes
 * the vocabulary text is the same on every call — confirm against callers.
 *
 * @param string $bpe_file Raw vocabulary/merges text, one entry per line.
 *
 * @return array List of per-line piece arrays. Keys inside each entry are
 *               whatever array_filter() kept, so they are not re-indexed.
 */
private function merges(string $bpe_file): array
{
    // Cache hit: any truthy value is handed back untouched.
    $cached = $this->cache->get('bpe_merges');
    if ($cached) {
        return $cached;
    }

    $rows = preg_split('/\r\n|\r|\n/', $bpe_file);
    $pairs = [];

    // Offset 1 skips the first line of the file.
    foreach (array_slice($rows, 1) as $row) {
        $pieces = array_filter(
            preg_split('#(\s+)#', $row),
            fn($item) => $this->myFilter($item)
        );

        // Lines with nothing left after filtering contribute no entry.
        if ($pieces !== []) {
            $pairs[] = $pieces;
        }
    }

    $this->cache->set('bpe_merges', $pairs);

    return $pairs;
}
353361
}

0 commit comments

Comments
 (0)