@@ -5,13 +5,19 @@ use std::arch::x86_64::*;
 
 use std::ops::{BitAnd, BitOr, BitOrAssign};
 
-use super::{Mask, Simd};
+use super::{Mask, Simd, traits::BitMask, util::escape_unchecked};
 
-#[derive(Debug)]
+#[cfg(any(target_os = "linux", target_os = "macos"))]
+use super::util::check_cross_page;
+
+const LANES: usize = 32;
+const CHUNK: usize = LANES * 4;
+
+#[derive(Debug, Clone, Copy)]
 #[repr(transparent)]
 pub struct Simd256u(__m256i);
 
-#[derive(Debug)]
+#[derive(Debug, Clone, Copy)]
 #[repr(transparent)]
 pub struct Mask256(__m256i);
 
@@ -51,7 +57,7 @@ impl BitOrAssign<Mask256> for Mask256 {
 }
 
 impl Simd for Simd256u {
-    const LANES: usize = 32;
+    const LANES: usize = LANES;
     type Mask = Mask256;
     type Element = u8;
 
@@ -87,3 +93,173 @@ impl Simd for Simd256u {
         }
     }
 }
+
+#[inline(always)]
+fn escaped_mask(v: Simd256u) -> u32 {
+    let x1f = Simd256u::splat(0x1f); // control characters 0x00..0x20
+    let blash = Simd256u::splat(b'\\');
+    let quote = Simd256u::splat(b'"');
+    let v = v.le(&x1f) | v.eq(&blash) | v.eq(&quote);
+    v.bitmask()
+}
+
+#[target_feature(enable = "avx2")]
+pub unsafe fn format_string(value: &str, dst: &mut [u8]) -> usize {
+    unsafe {
+        let slice = value.as_bytes();
+        let mut sptr = slice.as_ptr();
+        let mut dptr = dst.as_mut_ptr();
+        let dstart = dptr;
+        let mut nb: usize = slice.len();
+
+        *dptr = b'"';
+        dptr = dptr.add(1);
+
+        // Process CHUNK (4 * LANES = 128 bytes) at a time
+        while nb >= CHUNK {
+            // Load 4 SIMD vectors
+            let v1 = Simd256u::loadu(sptr);
+            let v2 = Simd256u::loadu(sptr.add(LANES));
+            let v3 = Simd256u::loadu(sptr.add(LANES * 2));
+            let v4 = Simd256u::loadu(sptr.add(LANES * 3));
+
+            // Check all 4 masks
+            let mask1 = escaped_mask(v1);
+            let mask2 = escaped_mask(v2);
+            let mask3 = escaped_mask(v3);
+            let mask4 = escaped_mask(v4);
+
+            // Fast path: if all vectors are clean, write the entire chunk
+            if mask1.all_zero() && mask2.all_zero() && mask3.all_zero() && mask4.all_zero() {
+                v1.storeu(dptr);
+                v2.storeu(dptr.add(LANES));
+                v3.storeu(dptr.add(LANES * 2));
+                v4.storeu(dptr.add(LANES * 3));
+                nb -= CHUNK;
+                dptr = dptr.add(CHUNK);
+                sptr = sptr.add(CHUNK);
+            } else {
+                // Slow path: handle escape character
+                // Process v1
+                v1.storeu(dptr);
+                if !mask1.all_zero() {
+                    let cn = mask1.first_offset();
+                    nb -= cn;
+                    dptr = dptr.add(cn);
+                    sptr = sptr.add(cn);
+                    escape_unchecked(&mut sptr, &mut nb, &mut dptr);
+                    continue;
+                }
+                nb -= LANES;
+                dptr = dptr.add(LANES);
+                sptr = sptr.add(LANES);
+
+                // Process v2
+                v2.storeu(dptr);
+                if !mask2.all_zero() {
+                    let cn = mask2.first_offset();
+                    nb -= cn;
+                    dptr = dptr.add(cn);
+                    sptr = sptr.add(cn);
+                    escape_unchecked(&mut sptr, &mut nb, &mut dptr);
+                    continue;
+                }
+                nb -= LANES;
+                dptr = dptr.add(LANES);
+                sptr = sptr.add(LANES);
+
+                // Process v3
+                v3.storeu(dptr);
+                if !mask3.all_zero() {
+                    let cn = mask3.first_offset();
+                    nb -= cn;
+                    dptr = dptr.add(cn);
+                    sptr = sptr.add(cn);
+                    escape_unchecked(&mut sptr, &mut nb, &mut dptr);
+                    continue;
+                }
+                nb -= LANES;
+                dptr = dptr.add(LANES);
+                sptr = sptr.add(LANES);
+
+                // Process v4
+                v4.storeu(dptr);
+                if !mask4.all_zero() {
+                    let cn = mask4.first_offset();
+                    nb -= cn;
+                    dptr = dptr.add(cn);
+                    sptr = sptr.add(cn);
+                    escape_unchecked(&mut sptr, &mut nb, &mut dptr);
+                    continue;
+                }
+                nb -= LANES;
+                dptr = dptr.add(LANES);
+                sptr = sptr.add(LANES);
+            }
+        }
+
+        // Process remaining LANES bytes at a time
+        while nb >= LANES {
+            let v = Simd256u::loadu(sptr);
+            v.storeu(dptr);
+            let mask = escaped_mask(v);
+
+            if mask.all_zero() {
+                nb -= LANES;
+                dptr = dptr.add(LANES);
+                sptr = sptr.add(LANES);
+            } else {
+                let cn = mask.first_offset();
+                nb -= cn;
+                dptr = dptr.add(cn);
+                sptr = sptr.add(cn);
+                escape_unchecked(&mut sptr, &mut nb, &mut dptr);
+            }
+        }
+
+        // Handle remaining bytes
+        let mut placeholder: [u8; LANES] = [0; LANES];
+        while nb > 0 {
+            #[cfg(not(any(target_os = "linux", target_os = "macos")))]
+            let v = {
+                std::ptr::copy_nonoverlapping(sptr, placeholder.as_mut_ptr(), nb);
+                Simd256u::loadu(placeholder.as_ptr())
+            };
+            #[cfg(any(target_os = "linux", target_os = "macos"))]
+            let v = {
+                if check_cross_page(sptr, LANES) {
+                    std::ptr::copy_nonoverlapping(sptr, placeholder.as_mut_ptr(), nb);
+                    Simd256u::loadu(placeholder.as_ptr())
+                } else {
+                    #[cfg(any(debug_assertions, miri))]
+                    {
+                        std::ptr::copy_nonoverlapping(sptr, placeholder.as_mut_ptr(), nb);
+                        Simd256u::loadu(placeholder.as_ptr())
+                    }
+                    #[cfg(not(any(debug_assertions, miri)))]
+                    {
+                        Simd256u::loadu(sptr)
+                    }
+                }
+            };
+
+            v.storeu(dptr);
+            let mask = escaped_mask(v).clear_high_bits(LANES - nb);
+
+            if mask.all_zero() {
+                dptr = dptr.add(nb);
+                break;
+            } else {
+                let cn = mask.first_offset();
+                nb -= cn;
+                dptr = dptr.add(cn);
+                sptr = sptr.add(cn);
+                escape_unchecked(&mut sptr, &mut nb, &mut dptr);
+            }
+        }
+
+        *dptr = b'"';
+        dptr = dptr.add(1);
+        dptr as usize - dstart as usize
+    }
+}
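
For reference, `escaped_mask` above flags every byte that must be escaped in a JSON string: anything `<= 0x1f`, the quote `"`, and the backslash `\`. When a flagged byte is found, the routine hands control to `escape_unchecked`, whose exact output format is not shown in this diff. The scalar sketch below mirrors the same byte classification with a conventional JSON escaping scheme (`\"`, `\\`, and `\u00XX` for control characters); treat the escape sequences and the helper name `format_string_scalar` as illustrative assumptions, not the crate's actual behavior.

```rust
// Scalar reference for the byte predicate used by `escaped_mask`
// (b <= 0x1f, b == b'"', b == b'\\'). The escape sequences emitted here
// (`\"`, `\\`, `\u00XX`) are an assumed, conventional JSON scheme; the
// real `escape_unchecked` in the crate may emit shorter forms (e.g. `\n`).
fn format_string_scalar(value: &str, out: &mut Vec<u8>) {
    out.push(b'"');
    for &b in value.as_bytes() {
        match b {
            b'"' => out.extend_from_slice(b"\\\""),
            b'\\' => out.extend_from_slice(b"\\\\"),
            // Same range the SIMD mask catches with `le(&x1f)`.
            0x00..=0x1f => {
                const HEX: &[u8; 16] = b"0123456789abcdef";
                out.extend_from_slice(b"\\u00");
                out.push(HEX[(b >> 4) as usize]);
                out.push(HEX[(b & 0x0f) as usize]);
            }
            _ => out.push(b),
        }
    }
    out.push(b'"');
}

fn main() {
    let mut out = Vec::new();
    format_string_scalar("tab\there \"quoted\" back\\slash", &mut out);
    println!("{}", String::from_utf8(out).unwrap());
    // Prints: "tab\u0009here \"quoted\" back\\slash"
}
```

Note also that the vectorized path stores whole 32-byte vectors even for the tail, so the destination buffer is expected to have slack beyond the worst-case escaped length; that capacity contract is presumably enforced by the caller elsewhere in the crate.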