@@ -305,43 +305,52 @@ fn unwrap_or_0(opt: Option<&u8>) -> u8 {
305305 }
306306}
307307
308+ /// Reads the next code point out of a byte iterator (assuming a
309+ /// UTF-8-like encoding). Returns `None` only when the iterator is already exhausted; otherwise advances it past the bytes consumed and returns the decoded value as a raw `u32` (no check that it is a valid `char` is visible in this function).
310+ #[ unstable]
311+ pub fn next_code_point ( bytes : & mut slice:: Iter < u8 > ) -> Option < u32 > {
312+ // Decode UTF-8: an exhausted iterator yields None, a single byte below 128 (ASCII) is returned directly
313+ let x = match bytes. next ( ) {
314+ None => return None ,
315+ Some ( & next_byte) if next_byte < 128 => return Some ( next_byte as u32 ) ,
316+ Some ( & next_byte) => next_byte,
317+ } ;
318+
319+ // Multibyte case follows: `x` is the leading byte (>= 128) and up to three more bytes are pulled from `bytes`
320+ // Decode from a byte combination out of: [[[x y] z] w]
321+ // NOTE: Performance is sensitive to the exact formulation here, so do not restructure without benchmarking
322+ let init = utf8_first_byte ! ( x, 2 ) ;
323+ let y = unwrap_or_0 ( bytes. next ( ) ) ; // presumably yields 0 if the iterator has run dry (see `unwrap_or_0`) -- TODO confirm
324+ let mut ch = utf8_acc_cont_byte ! ( init, y) ;
325+ if x >= 0xE0 {
326+ // [[x y z] w] case: leading byte announces at least three bytes
327+ // 5th bit in 0xE0 .. 0xEF is always clear, so `init` is still valid
328+ let z = unwrap_or_0 ( bytes. next ( ) ) ;
329+ let y_z = utf8_acc_cont_byte ! ( ( y & CONT_MASK ) as u32 , z) ;
330+ ch = init << 12 | y_z;
331+ if x >= 0xF0 {
332+ // [x y z w] case: leading byte announces four bytes
333+ // use only the lower 3 bits of `init`
334+ let w = unwrap_or_0 ( bytes. next ( ) ) ;
335+ ch = ( init & 7 ) << 18 | utf8_acc_cont_byte ! ( y_z, w) ;
336+ }
337+ }
338+
339+ Some ( ch)
340+ }
341+
308342#[ stable]
309343impl < ' a > Iterator for Chars < ' a > {
310344 type Item = char ;
311345
312346 #[ inline]
313347 fn next ( & mut self ) -> Option < char > {
314- // Decode UTF-8, using the valid UTF-8 invariant
315- let x = match self . iter . next ( ) {
316- None => return None ,
317- Some ( & next_byte) if next_byte < 128 => return Some ( next_byte as char ) ,
318- Some ( & next_byte) => next_byte,
319- } ;
320-
321- // Multibyte case follows
322- // Decode from a byte combination out of: [[[x y] z] w]
323- // NOTE: Performance is sensitive to the exact formulation here
324- let init = utf8_first_byte ! ( x, 2 ) ;
325- let y = unwrap_or_0 ( self . iter . next ( ) ) ;
326- let mut ch = utf8_acc_cont_byte ! ( init, y) ;
327- if x >= 0xE0 {
328- // [[x y z] w] case
329- // 5th bit in 0xE0 .. 0xEF is always clear, so `init` is still valid
330- let z = unwrap_or_0 ( self . iter . next ( ) ) ;
331- let y_z = utf8_acc_cont_byte ! ( ( y & CONT_MASK ) as u32 , z) ;
332- ch = init << 12 | y_z;
333- if x >= 0xF0 {
334- // [x y z w] case
335- // use only the lower 3 bits of `init`
336- let w = unwrap_or_0 ( self . iter . next ( ) ) ;
337- ch = ( init & 7 ) << 18 | utf8_acc_cont_byte ! ( y_z, w) ;
348+ next_code_point ( & mut self . iter ) . map ( |ch| {
349+ // SAFETY: the str invariant says `ch` is a valid Unicode Scalar Value, so reinterpreting the decoded u32 as a char is sound
350+ unsafe {
351+ mem:: transmute ( ch)
338352 }
339- }
340-
341- // str invariant says `ch` is a valid Unicode Scalar Value
342- unsafe {
343- Some ( mem:: transmute ( ch) )
344- }
353+ } )
345354 }
346355
347356 #[ inline]
@@ -1517,25 +1526,8 @@ impl StrExt for str {
15171526
15181527 #[ inline]
15191528 fn char_range_at ( & self , i : uint ) -> CharRange {
1520- if self . as_bytes ( ) [ i] < 128u8 {
1521- return CharRange { ch : self . as_bytes ( ) [ i] as char , next : i + 1 } ;
1522- }
1523-
1524- // Multibyte case is a fn to allow char_range_at to inline cleanly
1525- fn multibyte_char_range_at ( s : & str , i : uint ) -> CharRange {
1526- let mut val = s. as_bytes ( ) [ i] as u32 ;
1527- let w = UTF8_CHAR_WIDTH [ val as uint ] as uint ;
1528- assert ! ( ( w != 0 ) ) ;
1529-
1530- val = utf8_first_byte ! ( val, w) ;
1531- val = utf8_acc_cont_byte ! ( val, s. as_bytes( ) [ i + 1 ] ) ;
1532- if w > 2 { val = utf8_acc_cont_byte ! ( val, s. as_bytes( ) [ i + 2 ] ) ; }
1533- if w > 3 { val = utf8_acc_cont_byte ! ( val, s. as_bytes( ) [ i + 3 ] ) ; }
1534-
1535- return CharRange { ch : unsafe { mem:: transmute ( val) } , next : i + w} ;
1536- }
1537-
1538- return multibyte_char_range_at ( self , i) ;
1529+ let ( c, n) = char_range_at_raw ( self . as_bytes ( ) , i) ;
1530+ CharRange { ch : unsafe { mem:: transmute ( c) } , next : n } // SAFETY (review note): sound only if `c` is a valid Unicode scalar value; presumably guaranteed by the str UTF-8 invariant when `i` is a char boundary -- TODO confirm
15391531 }
15401532
15411533 #[ inline]
@@ -1653,6 +1645,32 @@ impl StrExt for str {
16531645 fn parse < T : FromStr > ( & self ) -> Option < T > { FromStr :: from_str ( self ) }
16541646}
16551647
1648+ /// Pluck a code point out of a UTF-8-like byte slice and return the
1649+ /// index of the next code point. The result is `(code_point, next_index)`; slice indexing panics if `i`, or the position of any continuation byte the leading byte calls for, is out of bounds.
1650+ #[ inline]
1651+ #[ unstable]
1652+ pub fn char_range_at_raw ( bytes : & [ u8 ] , i : uint ) -> ( u32 , usize ) { // NOTE(review): `uint` and `usize` are mixed in this signature; consider using one spelling consistently
1653+ if bytes[ i] < 128u8 {
1654+ return ( bytes[ i] as u32 , i + 1 ) ;
1655+ }
1656+
1657+ // Multibyte case is a separate fn so that the ASCII fast path of char_range_at_raw (and of its caller char_range_at) can inline cleanly
1658+ fn multibyte_char_range_at ( bytes : & [ u8 ] , i : uint ) -> ( u32 , usize ) {
1659+ let mut val = bytes[ i] as u32 ;
1660+ let w = UTF8_CHAR_WIDTH [ val as uint ] as uint ;
1661+ assert ! ( ( w != 0 ) ) ; // a width of 0 presumably marks a byte that cannot start a sequence -- TODO confirm against UTF8_CHAR_WIDTH
1662+
1663+ val = utf8_first_byte ! ( val, w) ;
1664+ val = utf8_acc_cont_byte ! ( val, bytes[ i + 1 ] ) ;
1665+ if w > 2 { val = utf8_acc_cont_byte ! ( val, bytes[ i + 2 ] ) ; }
1666+ if w > 3 { val = utf8_acc_cont_byte ! ( val, bytes[ i + 3 ] ) ; }
1667+
1668+ return ( val, i + w) ;
1669+ }
1670+
1671+ multibyte_char_range_at ( bytes, i)
1672+ }
1673+
16561674#[ stable]
16571675impl < ' a > Default for & ' a str {
16581676 #[ stable]
0 commit comments