@@ -49,7 +49,13 @@ public final class UnigramTokenizer extends Tokenizer {
    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

-    static UnigramTokenizer build(List<String> neverSplit, List<String> dictionary, double[] scores, String unknownToken) {
+    static UnigramTokenizer build(
+        List<String> neverSplit,
+        List<String> dictionary,
+        double[] scores,
+        String unknownToken,
+        boolean byteFallback
+    ) {
        if (dictionary.isEmpty()) {
            throw new IllegalArgumentException("vocab empty");
        }
@@ -84,7 +90,8 @@ static UnigramTokenizer build(List<String> neverSplit, List<String> dictionary,
            Optional.ofNullable(tokenToId.get(new BytesRef(unknownToken)))
                .orElseThrow(
                    () -> new IllegalArgumentException("provided vocabulary does not contain the unknown token of [" + unknownToken + "]")
-                )
+                ),
+            byteFallback
        );
    }

@@ -94,7 +101,7 @@ static UnigramTokenizer build(List<String> neverSplit, List<String> dictionary,

    private final double minScore;
    // This may be configurable in the future
-    private final boolean fuseUnk = true;
+    private boolean fuseUnk = true;
    private final double[] vocabScores;
    private final CharTrie neverSplit;
    private final CharArraySet neverSplitHash;
@@ -104,6 +111,7 @@ static UnigramTokenizer build(List<String> neverSplit, List<String> dictionary,
    // This is a buffer that is reused per token for decoding the normalized char-sequence into utf-8 bytes
    // It's usage is NOT thread safe
    private byte[] normalizedByteBuffer = new byte[128];
+    private boolean byteFallback = false; // If true, decompose unknown pieces into UTF-8 byte pieces

    public UnigramTokenizer(
        double minScore,
@@ -127,6 +135,31 @@ public UnigramTokenizer(
        this.whitespaceTokenizer = new SimpleWhitespaceTokenizer();
    }

+    public UnigramTokenizer(
+        double minScore,
+        double[] vocabScores,
+        CharTrie neverSplit,
+        CharArraySet neverSplitHash,
+        Map<BytesRef, Integer> vocabToId,
+        BytesTrie vocabTrie,
+        int unknownTokenId,
+        boolean byteFallback
+    ) {
+        super();
+        this.tokens = new LinkedList<>();
+        this.tokenizedValues = new ArrayList<>();
+        this.minScore = minScore;
+        this.neverSplit = neverSplit;
+        this.neverSplitHash = neverSplitHash;
+        this.vocabToId = vocabToId;
+        this.vocabTrie = vocabTrie;
+        this.unknownTokenId = unknownTokenId;
+        this.vocabScores = vocabScores;
+        this.whitespaceTokenizer = new SimpleWhitespaceTokenizer();
+        this.byteFallback = byteFallback;
+        this.fuseUnk = byteFallback == false;
+    }
+
    List<DelimitedToken.Encoded> getTokenizedValues() {
        return tokenizedValues;
    }
@@ -231,6 +264,22 @@ public boolean incrementToken() throws IOException {
        return false;
    }

+    private int[] decomposeBytePieces(CharSequence maybeTokenized) {
+        assert this.byteFallback;
+
+        byte[] bytes = maybeTokenized.toString().getBytes(StandardCharsets.UTF_8);
+        int[] pieces = new int[bytes.length];
+        for (int i = 0; i < bytes.length; i++) {
+            BytesRef decomposedToken = new BytesRef(String.format("<0x%02X>", bytes[i]));
+            Integer piece = vocabToId.get(decomposedToken);
+            if (piece == null) {
+                piece = unknownTokenId;
+            }
+            pieces[i] = piece;
+        }
+        return pieces;
+    }
+
    /**
     * This algorithm does the following:
     *
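For reviewers unfamiliar with SentencePiece-style byte fallback: each UTF-8 byte of an unknown span is looked up as a literal piece of the form <0xNN>, and the new constructor above sets fuseUnk to byteFallback == false, so exactly one unknown-handling strategy is active at a time. A minimal self-contained sketch of the lookup follows; the toy VOCAB map and UNKNOWN_ID are illustrative stand-ins, not part of this change (a real model maps all 256 byte pieces):

import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.Map;

class ByteFallbackSketch {
    static final int UNKNOWN_ID = 0; // hypothetical id of "<unk>"
    // Toy vocabulary fragment; a real model carries every "<0x00>".."<0xFF>" piece.
    static final Map<String, Integer> VOCAB = Map.of("<0xC3>", 5, "<0xA9>", 6);

    static int[] decompose(String unknownSpan) {
        byte[] bytes = unknownSpan.getBytes(StandardCharsets.UTF_8);
        int[] pieces = new int[bytes.length];
        for (int i = 0; i < bytes.length; i++) {
            // %X on a boxed negative Byte prints its unsigned two's-complement
            // value within byte width, e.g. (byte) 0xC3 -> "C3".
            String piece = String.format("<0x%02X>", bytes[i]);
            pieces[i] = VOCAB.getOrDefault(piece, UNKNOWN_ID);
        }
        return pieces;
    }

    public static void main(String[] args) {
        // U+00E9 ("é") encodes to UTF-8 bytes 0xC3 0xA9 -> pieces <0xC3>, <0xA9>
        System.out.println(Arrays.toString(decompose("\u00E9"))); // prints [5, 6]
    }
}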
@@ -309,7 +358,21 @@ List<DelimitedToken.Encoded> tokenize(CharSequence inputSequence, IntToIntFuncti
        while (endsAtBytes > 0) {
            BestPathNode node = bestPathNodes[endsAtBytes];
            int startsAtBytes = node.startsAtBytePos;
-            if (node.id == unknownTokenId && fuseUnk) {
+            if (node.id == unknownTokenId && byteFallback) {
+                CharSequence multiByteSequence = inputSequence.subSequence(node.startsAtCharPos, endsAtChars);
+                byte[] bytes = multiByteSequence.toString().getBytes(StandardCharsets.UTF_8);
+                int[] pieces = decomposeBytePieces(multiByteSequence);
+                for (int i = pieces.length - 1; i >= 0; i--) {
+                    results.add(
+                        new DelimitedToken.Encoded(
+                            String.format("<0x%02X>", bytes[i]),
+                            pieces[i],
+                            offsetCorrection.apply(node.startsAtCharPos),
+                            offsetCorrection.apply(startsAtBytes + i)
+                        )
+                    );
+                }
+            } else if (node.id == unknownTokenId && fuseUnk) {
                unknownTokens.add(
                    new DelimitedToken.Encoded(
                        new String(normalizedByteBuffer, startsAtBytes, endsAtBytes - startsAtBytes, StandardCharsets.UTF_8),
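End to end, a hypothetical call site (the vocab and score loading helpers here are assumed, not part of this change) would opt in through the extended build signature:

// Hypothetical wiring; dictionary and scores come from the loaded model and
// must include "<unk>" plus the 256 "<0x00>".."<0xFF>" byte pieces.
List<String> neverSplit = List.of("<s>", "</s>");
List<String> dictionary = loadVocab();   // assumed helper
double[] scores = loadScores();          // assumed helper, parallel to dictionary
UnigramTokenizer tokenizer = UnigramTokenizer.build(neverSplit, dictionary, scores, "<unk>", true);

With byteFallback enabled, an out-of-vocabulary character such as U+00E9 should surface as the two pieces <0xC3> and <0xA9> instead of one fused unknown token. Note that the per-byte pieces are appended last-byte-first in tokenize above, matching the surrounding loop, which walks the best path backwards from the end of the buffer.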