@@ -29,24 +29,14 @@ public function encode(string $text): array
2929 if (empty ($ text )) {
3030 return $ bpe_tokens ;
3131 }
32- $ byte_encoder = $ this ->byteEncoder ();
33- $ encoder = $ this ->encoder ();
34- $ bpe_file = $ this ->vocabulary ();
35-
3632 preg_match_all ("#'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+#u " , $ text , $ matches );
3733 if (!isset ($ matches [0 ]) || count ($ matches [0 ]) === 0 ) {
3834 throw new \RuntimeException ('Failed to match string ' );
3935 }
40- $ lines = preg_split ('/\r\n|\r|\n/ ' , $ bpe_file );
41- $ bpe_merges = [];
42- $ bpe_merges_temp = array_slice ($ lines , 1 , count ($ lines ), true );
43- foreach ($ bpe_merges_temp as $ bmt ) {
44- $ split_bmt = preg_split ('#(\s+)# ' , $ bmt );
45- $ split_bmt = array_filter ($ split_bmt , fn ($ item ) => $ this ->myFilter ($ item ));
46- if (count ($ split_bmt ) > 0 ) {
47- $ bpe_merges [] = $ split_bmt ;
48- }
49- }
36+ $ byte_encoder = $ this ->byteEncoder ();
37+ $ encoder = $ this ->encoder ();
38+ $ bpe_file = $ this ->vocabulary ();
39+ $ bpe_merges = $ this ->merges ($ bpe_file );
5040 $ bpe_ranks = $ this ->dictZip ($ bpe_merges );
5141
5242 $ cache = [];
@@ -350,4 +340,22 @@ public function decode(array $tokens, bool $throwOnUndefinedChar = false): strin
350340 }
351341 return $ output ;
352342 }
343+
/**
 * Returns the list of BPE merge entries parsed from the given merges text.
 *
 * The parsed list is stored in $this->cache; when the cache already holds a
 * truthy value for the key, that value is returned and $bpe_file is not parsed.
 *
 * @param string $bpe_file Raw merges text, one entry per line. The first line is skipped.
 *
 * @return array Entries in file order; each entry holds the whitespace-separated
 *               tokens of one line that were accepted by myFilter().
 */
private function merges(string $bpe_file): array
{
    $cached = $this->cache->get('bpe_merges ');
    if ($cached) {
        return $cached;
    }

    $merges = [];
    $rows = preg_split('/\r\n|\r|\n/ ', $bpe_file);

    // The first line is never treated as a merge entry (presumably a header line -- confirm).
    foreach (array_slice($rows, 1) as $row) {
        // array_filter keeps the original keys of the tokens that survive.
        $tokens = array_filter(
            preg_split('#(\s+)# ', $row),
            fn ($token) => $this->myFilter($token)
        );

        // Lines that yield no accepted tokens (e.g. blank lines) are dropped.
        if ($tokens === []) {
            continue;
        }
        $merges[] = $tokens;
    }

    $this->cache->set('bpe_merges ', $merges);

    return $merges;
}
353361}
0 commit comments