streamich
diff --git a/‎packages/json-joy/src/json-patch-diff/__tests__/JsonPatchDiff-string-emoji-bug.spec.ts‎
Lines changed: 92 additions & 0 deletions b/‎packages/json-joy/src/json-patch-diff/__tests__/JsonPatchDiff-string-emoji-bug.spec.ts‎
Lines changed: 92 additions & 0 deletions
diff --git a/‎packages/json-joy/src/util/diff/str.ts‎
Lines changed: 26 additions & 1 deletion b/‎packages/json-joy/src/util/diff/str.ts‎
Lines changed: 26 additions & 1 deletion
@@ -0,0 +1,92 @@
+import {assertDiff} from './util';
+
+/**
+ * Tests for a specific bug where the string diff algorithm would incorrectly
+ * handle surrogate pairs (emoji characters) when they appeared in different
+ * positions in the source and destination strings.
+ */
+describe('String diff with emoji - specific bug cases', () => {
+ test('emoji at beginning - from error log', () => {
+ // This is the exact case that was failing in the fuzzing tests
+ const str1 = '💚莡韚😻襘😴}诇';
+ const str2 = '😻Ê¯愂H😤副🗶íŋ😒😹Ù';
+ assertDiff(str1, str2);
+ });
+
+ test('simple emoji replacement at start', () => {
+ assertDiff('😻hello', '😤hello');
+ });
+
+ test('emoji deletion at start', () => {
+ assertDiff('😻hello', 'hello');
+ });
+
+ test('emoji insertion at start', () => {
+ assertDiff('hello', '😻hello');
+ });
+
+ test('multiple emojis at start', () => {
+ assertDiff('😻😤hello', '😤😻hello');
+ });
+
+ test('emoji in middle position', () => {
+ assertDiff('hello😻world', 'hello😤world');
+ });
+
+ test('complex emoji sequences', () => {
+ assertDiff('👨‍👩‍👧‍👦test', 'test👨‍👩‍👧‍👦');
+ });
+
+ test('emoji with other unicode', () => {
+ assertDiff('😻你好', 'hello😻');
+ });
+
+ test('surrogate pair handling', () => {
+ // Test surrogate pairs specifically
+ // 😻 is encoded as \uD83D\uDE3B (two UTF-16 code units)
+ const emoji = '\uD83D\uDE3B'; // 😻
+ assertDiff(emoji + 'test', 'test' + emoji);
+ });
+
+ test('mixed emoji and CJK', () => {
+ assertDiff('😻中文😤', '中文😻😤');
+ });
+
+ test('emoji at same position in both strings', () => {
+ // Should recognize emoji as common part
+ assertDiff('😻test', '😻best');
+ });
+
+ test('multiple different emojis starting with same high surrogate', () => {
+ // Many emojis share the same high surrogate (d83d)
+ // 😻 = d83d de3b
+ // 😤 = d83d de24
+ // 😴 = d83d de34
+ // 💚 = d83d dc9a
+ assertDiff('💚😻😤', '😴😻😤');
+ });
+
+ test('emoji moved from middle to beginning', () => {
+ assertDiff('abc😻def', '😻abcdef');
+ });
+
+ test('emoji moved from beginning to middle', () => {
+ assertDiff('😻abcdef', 'abc😻def');
+ });
+
+ test('two identical emojis at different positions', () => {
+ assertDiff('😻abc😻def', 'abc😻def😻');
+ });
+
+ test('long strings with emoji in common', () => {
+ const prefix = 'a'.repeat(100);
+ const suffix = 'b'.repeat(100);
+ assertDiff(prefix + '😻' + suffix, prefix + '😤' + suffix);
+ });
+
+ test('regression: emoji not in common but sharing high surrogate', () => {
+ // This specifically tests the case where emojis share a high surrogate
+ // but are different characters - should NOT be treated as common
+ assertDiff('💚test', '😻test');
+ });
+});
@@ -241,6 +241,31 @@ const cleanupMerge = (diff: Patch, fixUnicode: boolean) => {
  * @return Array of diff tuples.
  */
 const bisectSplit = (text1: string, text2: string, x: number, y: number): Patch => {
+ // Adjust split points to avoid breaking surrogate pairs.
+ // The bisect algorithm uses .charAt() which operates on UTF-16 code units,
+ // so the split points (x, y) might fall in the middle of a surrogate pair.
+ // We need to adjust them to ensure we don't split emoji or other multi-unit characters.
+
+ // If x is in the middle of a surrogate pair in text1, move it back
+ if (x > 0 && x < text1.length) {
+ const code = text1.charCodeAt(x);
+ // If x points to a low surrogate, we're splitting a pair
+ if (code >= 0xdc00 && code <= 0xdfff) {
+ x--;
+ }
+ }
+
+ // If y is in the middle of a surrogate pair in text2, move it back
+ if (y > 0 && y < text2.length) {
+ const code = text2.charCodeAt(y);
+ // If y points to a low surrogate, we're splitting a pair
+ if (code >= 0xdc00 && code <= 0xdfff) {
+ y--;
+ }
+ }
+
+ // Recurse on each half. NOTE(review): `fixUnicode` is passed as `false` below, so the
+ // split-point adjustment above is what keeps surrogate pairs intact; confirm `true` was not intended.
  const diffsA = diff_(text1.slice(0, x), text2.slice(0, y), false);
  const diffsB = diff_(text1.slice(x), text2.slice(y), false);
  return diffsA.concat(diffsB);
@@ -499,7 +524,7 @@ export const overlap = (str1: string, str2: string): number => {
  *
  * @param src Old string to be diffed.
  * @param dst New string to be diffed.
- * @param cleanup Whether to apply semantic cleanup before returning.
+ * @param fixUnicode Whether to keep Unicode surrogate pairs intact at patch operation boundaries.
  * @return A {@link Patch} - an array of patch operations.
  */
 const diff_ = (src: string, dst: string, fixUnicode: boolean): Patch => {