Commit 6becdc3

refactor: split impls into arch
1 parent 1339b96 commit 6becdc3

File tree

10 files changed: +868 additions, −434 deletions

src/lib.rs

Lines changed: 23 additions & 388 deletions
Large diffs are not rendered by default.

src/simd/avx2.rs

Lines changed: 180 additions & 4 deletions
@@ -5,13 +5,19 @@ use std::arch::x86_64::*;
 
 use std::ops::{BitAnd, BitOr, BitOrAssign};
 
-use super::{Mask, Simd};
+use super::{Mask, Simd, traits::BitMask, util::escape_unchecked};
 
-#[derive(Debug)]
+#[cfg(any(target_os = "linux", target_os = "macos"))]
+use super::util::check_cross_page;
+
+const LANES: usize = 32;
+const CHUNK: usize = LANES * 4;
+
+#[derive(Debug, Clone, Copy)]
 #[repr(transparent)]
 pub struct Simd256u(__m256i);
 
-#[derive(Debug)]
+#[derive(Debug, Clone, Copy)]
 #[repr(transparent)]
 pub struct Mask256(__m256i);
 
@@ -51,7 +57,7 @@ impl BitOrAssign<Mask256> for Mask256 {
 }
 
 impl Simd for Simd256u {
-    const LANES: usize = 32;
+    const LANES: usize = LANES;
     type Mask = Mask256;
     type Element = u8;
 
@@ -87,3 +93,173 @@ impl Simd for Simd256u {
         }
     }
 }
+
+#[inline(always)]
+fn escaped_mask(v: Simd256u) -> u32 {
+    let x1f = Simd256u::splat(0x1f); // 0x00 ~ 0x20
+    let blash = Simd256u::splat(b'\\');
+    let quote = Simd256u::splat(b'"');
+    let v = v.le(&x1f) | v.eq(&blash) | v.eq(&quote);
+    v.bitmask()
+}
+
+#[target_feature(enable = "avx2")]
+pub unsafe fn format_string(value: &str, dst: &mut [u8]) -> usize {
+    unsafe {
+        let slice = value.as_bytes();
+        let mut sptr = slice.as_ptr();
+        let mut dptr = dst.as_mut_ptr();
+        let dstart = dptr;
+        let mut nb: usize = slice.len();
+
+        *dptr = b'"';
+        dptr = dptr.add(1);
+
+        // Process CHUNK (4 * LANES = 128 bytes) at a time
+        while nb >= CHUNK {
+            // Load 4 SIMD vectors
+            let v1 = Simd256u::loadu(sptr);
+            let v2 = Simd256u::loadu(sptr.add(LANES));
+            let v3 = Simd256u::loadu(sptr.add(LANES * 2));
+            let v4 = Simd256u::loadu(sptr.add(LANES * 3));
+
+            // Check all 4 masks
+            let mask1 = escaped_mask(v1);
+            let mask2 = escaped_mask(v2);
+            let mask3 = escaped_mask(v3);
+            let mask4 = escaped_mask(v4);
+
+            // Fast path: if all vectors are clean, write the entire chunk
+            if mask1.all_zero() && mask2.all_zero() && mask3.all_zero() && mask4.all_zero() {
+                v1.storeu(dptr);
+                v2.storeu(dptr.add(LANES));
+                v3.storeu(dptr.add(LANES * 2));
+                v4.storeu(dptr.add(LANES * 3));
+                nb -= CHUNK;
+                dptr = dptr.add(CHUNK);
+                sptr = sptr.add(CHUNK);
+            } else {
+                // Slow path: handle escape character
+                // Process v1
+                v1.storeu(dptr);
+                if !mask1.all_zero() {
+                    let cn = mask1.first_offset();
+                    nb -= cn;
+                    dptr = dptr.add(cn);
+                    sptr = sptr.add(cn);
+                    escape_unchecked(&mut sptr, &mut nb, &mut dptr);
+                    continue;
+                }
+                nb -= LANES;
+                dptr = dptr.add(LANES);
+                sptr = sptr.add(LANES);
+
+                // Process v2
+                v2.storeu(dptr);
+                if !mask2.all_zero() {
+                    let cn = mask2.first_offset();
+                    nb -= cn;
+                    dptr = dptr.add(cn);
+                    sptr = sptr.add(cn);
+                    escape_unchecked(&mut sptr, &mut nb, &mut dptr);
+                    continue;
+                }
+                nb -= LANES;
+                dptr = dptr.add(LANES);
+                sptr = sptr.add(LANES);
+
+                // Process v3
+                v3.storeu(dptr);
+                if !mask3.all_zero() {
+                    let cn = mask3.first_offset();
+                    nb -= cn;
+                    dptr = dptr.add(cn);
+                    sptr = sptr.add(cn);
+                    escape_unchecked(&mut sptr, &mut nb, &mut dptr);
+                    continue;
+                }
+                nb -= LANES;
+                dptr = dptr.add(LANES);
+                sptr = sptr.add(LANES);
+
+                // Process v4
+                v4.storeu(dptr);
+                if !mask4.all_zero() {
+                    let cn = mask4.first_offset();
+                    nb -= cn;
+                    dptr = dptr.add(cn);
+                    sptr = sptr.add(cn);
+                    escape_unchecked(&mut sptr, &mut nb, &mut dptr);
+                    continue;
+                }
+                nb -= LANES;
+                dptr = dptr.add(LANES);
+                sptr = sptr.add(LANES);
+            }
+        }
+
+        // Process remaining LANES bytes at a time
+        while nb >= LANES {
+            let v = Simd256u::loadu(sptr);
+            v.storeu(dptr);
+            let mask = escaped_mask(v);
+
+            if mask.all_zero() {
+                nb -= LANES;
+                dptr = dptr.add(LANES);
+                sptr = sptr.add(LANES);
+            } else {
+                let cn = mask.first_offset();
+                nb -= cn;
+                dptr = dptr.add(cn);
+                sptr = sptr.add(cn);
+                escape_unchecked(&mut sptr, &mut nb, &mut dptr);
+            }
+        }
+
+        // Handle remaining bytes
+        let mut placeholder: [u8; LANES] = [0; LANES];
+        while nb > 0 {
+            #[cfg(not(any(target_os = "linux", target_os = "macos")))]
+            let v = {
+                std::ptr::copy_nonoverlapping(sptr, placeholder.as_mut_ptr(), nb);
+                Simd256u::loadu(placeholder.as_ptr())
+            };
+            #[cfg(any(target_os = "linux", target_os = "macos"))]
+            let v = {
+                if check_cross_page(sptr, LANES) {
+                    std::ptr::copy_nonoverlapping(sptr, placeholder.as_mut_ptr(), nb);
+                    Simd256u::loadu(placeholder.as_ptr())
+                } else {
+                    #[cfg(any(debug_assertions, miri))]
+                    {
+                        std::ptr::copy_nonoverlapping(sptr, placeholder.as_mut_ptr(), nb);
+                        Simd256u::loadu(placeholder.as_ptr())
+                    }
+                    #[cfg(not(any(debug_assertions, miri)))]
+                    {
+                        Simd256u::loadu(sptr)
+                    }
+                }
+            };
+
+            v.storeu(dptr);
+            let mask = escaped_mask(v).clear_high_bits(LANES - nb);
+
+            if mask.all_zero() {
+                dptr = dptr.add(nb);
+                break;
+            } else {
+                let cn = mask.first_offset();
+                nb -= cn;
+                dptr = dptr.add(cn);
+                sptr = sptr.add(cn);
+                escape_unchecked(&mut sptr, &mut nb, &mut dptr);
+            }
+        }
+
+        *dptr = b'"';
+        dptr = dptr.add(1);
+        dptr as usize - dstart as usize
+    }
+}
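
The per-byte test that escaped_mask applies to 32 lanes at once is equivalent to the scalar check below. This is a reader's reference sketch, not code from the commit; the needs_escape helper is hypothetical and only the three conditions are taken from the diff.

// Scalar reference for the per-byte predicate behind escaped_mask: control
// characters (0x00..=0x1f), the backslash, and the double quote are the bytes
// that must be escaped inside a JSON string literal.
fn needs_escape(byte: u8) -> bool {
    byte <= 0x1f || byte == b'\\' || byte == b'"'
}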

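A minimal sketch of how a caller might drive the new format_string routine. None of this is part of the commit: the crate::simd::avx2 module path, the runtime feature check, and the worst-case sizing of value.len() * 6 + 2 output bytes (every byte escaped as \u00XX, plus the two enclosing quotes) are assumptions about the surrounding crate, and the scalar fallback is left unimplemented.

// Hypothetical caller, not part of this commit.
fn write_json_string(value: &str, out: &mut Vec<u8>) {
    let start = out.len();
    // Assumed worst case: each input byte becomes a six-byte \u00XX escape,
    // plus the two enclosing quotes.
    out.resize(start + value.len() * 6 + 2, 0);

    let written = if std::is_x86_feature_detected!("avx2") {
        // SAFETY: AVX2 was just detected and the buffer covers the worst case.
        unsafe { crate::simd::avx2::format_string(value, &mut out[start..]) }
    } else {
        todo!("scalar fallback for targets without AVX2")
    };

    out.truncate(start + written);
}
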
0 commit comments