Skip to content

Commit 0f1c1f8

Browse files
authored
Merge pull request #967 from streamich/json-patch-diff-fixes
JSON Patch and string diff fix
2 parents 1aa7532 + 90eab7d commit 0f1c1f8

File tree

2 files changed

+118
-1
lines changed

2 files changed

+118
-1
lines changed
Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
import {assertDiff} from './util';
2+
3+
/**
4+
* Tests for a specific bug where the string diff algorithm would incorrectly
5+
* handle surrogate pairs (emoji characters) when they appeared in different
6+
* positions in the source and destination strings.
7+
*/
8+
describe('String diff with emoji - specific bug cases', () => {
9+
test('emoji at beginning - from error log', () => {
10+
// This is the exact case that was failing in the fuzzing tests
11+
const str1 = '💚莡韚😻襘😴}诇';
12+
const str2 = '😻ʯ愂H😤副🗶íŋ😒😹Ù';
13+
assertDiff(str1, str2);
14+
});
15+
16+
test('simple emoji replacement at start', () => {
17+
assertDiff('😻hello', '😤hello');
18+
});
19+
20+
test('emoji deletion at start', () => {
21+
assertDiff('😻hello', 'hello');
22+
});
23+
24+
test('emoji insertion at start', () => {
25+
assertDiff('hello', '😻hello');
26+
});
27+
28+
test('multiple emojis at start', () => {
29+
assertDiff('😻😤hello', '😤😻hello');
30+
});
31+
32+
test('emoji in middle position', () => {
33+
assertDiff('hello😻world', 'hello😤world');
34+
});
35+
36+
test('complex emoji sequences', () => {
37+
assertDiff('👨‍👩‍👧‍👦test', 'test👨‍👩‍👧‍👦');
38+
});
39+
40+
test('emoji with other unicode', () => {
41+
assertDiff('😻你好', 'hello😻');
42+
});
43+
44+
test('surrogate pair handling', () => {
45+
// Test surrogate pairs specifically
46+
// 😻 is encoded as \uD83D\uDE3B (two UTF-16 code units)
47+
const emoji = '\uD83D\uDE3B'; // 😻
48+
assertDiff(emoji + 'test', 'test' + emoji);
49+
});
50+
51+
test('mixed emoji and CJK', () => {
52+
assertDiff('😻中文😤', '中文😻😤');
53+
});
54+
55+
test('emoji at same position in both strings', () => {
56+
// Should recognize emoji as common part
57+
assertDiff('😻test', '😻best');
58+
});
59+
60+
test('multiple different emojis starting with same high surrogate', () => {
61+
// Many emojis share the same high surrogate (d83d)
62+
// 😻 = d83d de3b
63+
// 😤 = d83d de24
64+
// 😴 = d83d de34
65+
// 💚 = d83d dc9a
66+
assertDiff('💚😻😤', '😴😻😤');
67+
});
68+
69+
test('emoji moved from middle to beginning', () => {
70+
assertDiff('abc😻def', '😻abcdef');
71+
});
72+
73+
test('emoji moved from beginning to middle', () => {
74+
assertDiff('😻abcdef', 'abc😻def');
75+
});
76+
77+
test('two identical emojis at different positions', () => {
78+
assertDiff('😻abc😻def', 'abc😻def😻');
79+
});
80+
81+
test('long strings with emoji in common', () => {
82+
const prefix = 'a'.repeat(100);
83+
const suffix = 'b'.repeat(100);
84+
assertDiff(prefix + '😻' + suffix, prefix + '😤' + suffix);
85+
});
86+
87+
test('regression: emoji not in common but sharing high surrogate', () => {
88+
// This specifically tests the case where emojis share a high surrogate
89+
// but are different characters - should NOT be treated as common
90+
assertDiff('💚test', '😻test');
91+
});
92+
});

packages/json-joy/src/util/diff/str.ts

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -241,6 +241,31 @@ const cleanupMerge = (diff: Patch, fixUnicode: boolean) => {
241241
* @return Array of diff tuples.
242242
*/
243243
const bisectSplit = (text1: string, text2: string, x: number, y: number): Patch => {
244+
// Adjust split points to avoid breaking surrogate pairs.
245+
// The bisect algorithm uses .charAt() which operates on UTF-16 code units,
246+
// so the split points (x, y) might fall in the middle of a surrogate pair.
247+
// We need to adjust them to ensure we don't split emoji or other multi-unit characters.
248+
249+
// If x is in the middle of a surrogate pair in text1, move it back
250+
if (x > 0 && x < text1.length) {
251+
const code = text1.charCodeAt(x);
252+
// If x points to a low surrogate, we're splitting a pair
253+
if (code >= 0xdc00 && code <= 0xdfff) {
254+
x--;
255+
}
256+
}
257+
258+
// If y is in the middle of a surrogate pair in text2, move it back
259+
if (y > 0 && y < text2.length) {
260+
const code = text2.charCodeAt(y);
261+
// If y points to a low surrogate, we're splitting a pair
262+
if (code >= 0xdc00 && code <= 0xdfff) {
263+
y--;
264+
}
265+
}
266+
267+
// The recursive diffs below are invoked with fixUnicode = false. The split
268+
// points were moved off low surrogates above, so this split does not cut a well-formed surrogate pair.
244269
const diffsA = diff_(text1.slice(0, x), text2.slice(0, y), false);
245270
const diffsB = diff_(text1.slice(x), text2.slice(y), false);
246271
return diffsA.concat(diffsB);
@@ -499,7 +524,7 @@ export const overlap = (str1: string, str2: string): number => {
499524
*
500525
* @param src Old string to be diffed.
501526
* @param dst New string to be diffed.
502-
* @param cleanup Whether to apply semantic cleanup before returning.
527+
* @param fixUnicode Whether to keep UTF-16 surrogate pairs intact at the boundaries between patch operations.
503528
* @return A {@link Patch} - an array of patch operations.
504529
*/
505530
const diff_ = (src: string, dst: string, fixUnicode: boolean): Patch => {

0 commit comments

Comments
 (0)