v1.patch - Ruby - Ruby Issue Tracking System

Feature #12275 » v1.patch

tad (Tadashi Saito), 11/27/2017 07:56 PM

    NEWS  
     * String#each_grapheme_cluster and String#grapheme_clusters are added to
 
    enumerate grapheme clusters [Feature #13780]
 
    * String#start_with? supports regexp [Feature #13712]
 
    * String#undump is added to unescape a string produced by String#dump [Feature #12275]
 
   * Regexp/String: Update Unicode version from 9.0.0 to 10.0.0 [Feature #13685]
 
    string.c  
    #include "ruby_assert.h"
 
   #include "id.h"
 
   #include "debug_counter.h"
 
   #include "ruby/util.h"
 
   #define BEG(no) (regs->beg[(no)])
 
   #define END(no) (regs->end[(no)])
 
    return rb_str_eql(folded_str1, folded_str2);
 
   }
 
   /*
    * Looks for the sub_len bytes at sub_ptr within the (str_len - offset)
    * bytes that start at str_ptr.  A raw byte match reported by
    * rb_memsearch() is accepted only when rb_enc_right_char_head() confirms
    * that it begins on a character boundary; otherwise the search resumes
    * from the next character head.
    *
    * Returns the match position plus offset (offset grows by every byte that
    * was skipped), or a negative value when nothing is found.
    */
   static long
   strseq_core(const char *str_ptr, const char *str_ptr_end, long str_len,
               const char *sub_ptr, long sub_len, long offset, rb_encoding *enc)
   {
       const char *cursor = str_ptr;
       long remaining = str_len - offset;
       long hit;

       while ((hit = rb_memsearch(sub_ptr, sub_len, cursor, remaining, enc)) >= 0) {
           const char *candidate = cursor + hit;
           const char *head = rb_enc_right_char_head(cursor, candidate, str_ptr_end, enc);
           long skipped;

           /* the byte match starts a character: done */
           if (head == candidate) return hit + offset;

           /* matched in the middle of a character: retry from the next head */
           skipped = head - cursor;
           remaining -= skipped;
           if (remaining <= 0) return -1;
           offset += skipped;
           cursor = head;
       }

       /* rb_memsearch() found nothing; hand its negative result back */
       return hit;
   }
 
   #define rb_str_index(str, sub, offset) rb_strseq_index(str, sub, offset, 0)
 
   static long
 
   rb_strseq_index(VALUE str, VALUE sub, long offset, int in_byte)
 
   {
 
    const char *str_ptr, *str_ptr_end, *sub_ptr, *search_start;
 
    long pos, str_len, sub_len, search_len;
 
    const char *str_ptr, *str_ptr_end, *sub_ptr;
 
    long str_len, sub_len;
 
    int single_byte = single_byte_optimizable(str);
 
    rb_encoding *enc;
 
    if (sub_len == 0) return offset;
 
    /* need proceed one character at a time */
 
    search_start = str_ptr;
 
    search_len = RSTRING_LEN(str) - offset;
 
    for (;;) {
 
   const char *t;
 
   pos = rb_memsearch(sub_ptr, sub_len, search_start, search_len, enc);
 
   if (pos < 0) return pos;
 
   t = rb_enc_right_char_head(search_start, search_start+pos, str_ptr_end, enc);
 
   if (t == search_start + pos) break;
 
   search_len -= t - search_start;
 
   if (search_len <= 0) return -1;
 
   offset += t - search_start;
 
   search_start = t;
 
    }
 
    return pos + offset;
 
    return strseq_core(str_ptr, str_ptr_end, str_len, sub_ptr, sub_len, offset, enc);
 
   }
 
    return result;
 
   }
 
   /*
    * Tells whether the text s..s_end (len bytes long) both starts and ends
    * with a double quote character.
    */
   static int
   is_wrapped(const char *s, const char *s_end, long len, rb_encoding *enc)
   {
       const char *last;

       /* an opening and a closing quote need at least two bytes */
       if (len < 2) return FALSE;

       if (rb_enc_mbc_to_codepoint(s, s_end, enc) != '"') return FALSE;

       /* step back from the end to the head of the final character */
       last = rb_enc_prev_char(s, s_end, s_end, enc);
       return rb_enc_mbc_to_codepoint(last, s_end, enc) == '"';
   }
 
   /*
    * Maps the letter of a one-character escape (the 'n' of "\n", the 't' of
    * "\t", ...) to a one-byte C string holding the control character it
    * denotes.
    *
    * Only the eight letters listed below are handled; any other value falls
    * into UNREACHABLE, so callers have to filter their input first.
    */
   static const char *
   unescape_ascii(unsigned int c)
   {
       switch (c) {
         case 'n':
           return "\n";
         case 'r':
           return "\r";
         case 't':
           return "\t";
         case 'f':
           return "\f";
         case 'v':
           return "\v";
         case 'b':
           return "\b";
         case 'a':
           return "\a";
         case 'e':
           /* "\e" is a compiler extension, not ISO C; spell ESC in octal */
           return "\033";
         default:
           UNREACHABLE;
       }
   }
 
   /*
    * Decodes a single backslash escape and appends the result to undumped.
    *
    * s      points at the character that follows the backslash (the caller
    *        has already checked that s < s_end),
    * s_end  is the end of the text being undumped,
    * enc    is the encoding used to decode characters and to encode \u
    *        code points.
    *
    * Returns the number of bytes consumed counted from the backslash itself,
    * because the caller's cursor still points at the backslash and is
    * advanced by the returned amount.
    *
    * Raises ArgumentError for malformed \u and \x escapes.
    */
   static int
   undump_after_backslash(VALUE undumped, const char *s, const char *s_end, rb_encoding *enc)
   {
       unsigned int c, c2;
       int n, codelen;
       size_t hexlen;
       char buf[6];

       c = rb_enc_codepoint_len(s, s_end, &n, enc);
       switch (c) {
         case '\\':
         case '"':
         case '#':
           /* "\#" always yields a bare '#', whatever follows it */
           rb_str_cat(undumped, s, n); /* cat itself */
           n++;                        /* plus the backslash */
           break;
         case 'n':
         case 'r':
         case 't':
         case 'f':
         case 'v':
         case 'b':
         case 'a':
         case 'e':
           rb_str_cat(undumped, unescape_ascii(c), 1L);
           n++;                        /* plus the backslash */
           break;
         case 'u':
           if (s+1 >= s_end) {
               rb_raise(rb_eArgError, "invalid Unicode escape");
           }
           c2 = rb_enc_codepoint_len(s+1, s_end, NULL, enc);
           if (c2 == '{') { /* handle \u{...} form */
               const char *hexstr = s + 2;
               unsigned int hex;
               static const char* const close_brace = "}";
               long pos;

               if (hexstr >= s_end) {
                   rb_raise(rb_eArgError, "unterminated Unicode escape");
               }
               /* find close brace */
               pos = strseq_core(hexstr, s_end, s_end - hexstr, close_brace, 1, 0, enc);
               if (pos < 0) {
                   rb_raise(rb_eArgError, "unterminated Unicode escape");
               }
               hex = ruby_scan_hex(hexstr, pos, &hexlen);
               /*
                * Everything between the braces has to be hex digits
                * (hexlen == pos); otherwise trailing garbage would be dropped
                * silently and the consumed length below would be wrong.
                */
               if (hexlen == 0 || hexlen > 6 || hexlen != (size_t)pos) {
                   rb_raise(rb_eArgError, "invalid Unicode escape");
               }
               if (hex > 0x10ffffU) {
                   rb_raise(rb_eArgError, "invalid Unicode codepoint (too large)");
               }
               if ((hex & 0xfffff800U) == 0xd800U) {
                   rb_raise(rb_eArgError, "invalid Unicode codepoint");
               }
               codelen = rb_enc_codelen(hex, enc);
               rb_enc_mbcput(hex, buf, enc);
               rb_str_cat(undumped, buf, codelen);
               /* backslash + "u{" + digits + "}" */
               n = (int)(1 + rb_strlen_lit("u{}") + hexlen);
           }
           else { /* handle \uXXXX form */
               unsigned int hex;

               /* never let the scanner look at bytes beyond s_end */
               if (s_end - (s+1) < 4) {
                   rb_raise(rb_eArgError, "invalid Unicode escape");
               }
               hex = ruby_scan_hex(s+1, 4, &hexlen);
               if (hexlen != 4) {
                   rb_raise(rb_eArgError, "invalid Unicode escape");
               }
               /* reject surrogates, exactly as the \u{...} form does */
               if ((hex & 0xfffff800U) == 0xd800U) {
                   rb_raise(rb_eArgError, "invalid Unicode codepoint");
               }
               codelen = rb_enc_codelen(hex, enc);
               rb_enc_mbcput(hex, buf, enc);
               rb_str_cat(undumped, buf, codelen);
               /* backslash + "uXXXX" */
               n = (int)(1 + rb_strlen_lit("uXXXX"));
           }
           break;
         case 'x':
           /* both hex digits must lie inside the text being undumped */
           if (s_end - (s+1) < 2) {
               rb_raise(rb_eArgError, "invalid hex escape");
           }
           c2 = ruby_scan_hex(s+1, 2, &hexlen);
           if (hexlen != 2) {
               rb_raise(rb_eArgError, "invalid hex escape");
           }
           *buf = (char)c2;
           rb_str_cat(undumped, buf, 1L);
           /* backslash + "xXX" */
           n = (int)(1 + rb_strlen_lit("xXX"));
           break;
         default:
           rb_str_cat(undumped, "\\", 1L); /* keep backslash */
           /*
            * Consume the backslash only.  The following character is decoded
            * by the caller on its next iteration; returning its byte length
            * here would leave the caller's cursor in the middle of a
            * multibyte character.
            */
           n = 1;
       }

       return n;
   }
 
   /*
    *  call-seq:
    *     str.undump   -> new_str
    *
    *  Returns a new string in which the escape sequences found in +str+ are
    *  replaced by the characters they stand for.  A pair of enclosing double
    *  quotes, if present, is removed first.
    *  See also String#dump, of which String#undump is the inverse.
    *
    *    "\"hello \\n ''\"".undump #=> "hello \n ''"
    */
   static VALUE
   str_undump(VALUE str)
   {
       const char *cur = RSTRING_PTR(str);
       const char *limit = RSTRING_END(str);
       rb_encoding *enc = rb_enc_get(str);
       /* the result starts out empty and carries the receiver's encoding */
       VALUE undumped = rb_enc_str_new(cur, 0L, enc);

       rb_must_asciicompat(str);

       if (is_wrapped(cur, limit, RSTRING_LEN(str), enc)) {
           /* strip '"' at the begin and the end */
           cur++;
           limit--;
       }

       while (cur < limit) {
           int step;
           unsigned int c = rb_enc_codepoint_len(cur, limit, &step, enc);

           if (c != '\\') {
               /* ordinary character: copy it through unchanged */
               rb_str_cat(undumped, cur, step);
           }
           else if (cur + 1 < limit) {
               /* the helper reports how many bytes the whole escape took */
               step = undump_after_backslash(undumped, cur + 1, limit, enc);
           }
           else {
               /* a backslash with nothing after it */
               rb_raise(rb_eArgError, "invalid escape");
           }
           cur += step;
       }

       OBJ_INFECT(undumped, str);
       return undumped;
   }
 
   static void
 
   rb_str_check_dummy_enc(rb_encoding *enc)
 
    rb_define_method(rb_cString, "to_str", rb_str_to_s, 0);
 
    rb_define_method(rb_cString, "inspect", rb_str_inspect, 0);
 
    rb_define_method(rb_cString, "dump", rb_str_dump, 0);
 
    rb_define_method(rb_cString, "undump", str_undump, 0);
 
    sym_ascii = ID2SYM(rb_intern("ascii"));
 
    sym_turkic = ID2SYM(rb_intern("turkic"));
 
    test/ruby/test_string.rb  
     assert_equal(S('"\\u{10ABCD}"'), b.dump)
 
    end
 
    # Exercises String#undump: control and hex escapes, both \u forms,
    # escapes that stand for themselves, raw multibyte pass-through, and the
    # ArgumentError raised for malformed escapes.
    def test_undump
      a = S("Test") << 1 << 2 << 3 << 9 << 13 << 10
      assert_equal(a, S('"Test\\x01\\x02\\x03\\t\\r\\n"').undump)

      # \xXX and \uXXXX / \u{...} forms
      assert_equal(S("\u{7F}"), S('"\\x7F"').undump)
      assert_equal(S("\u{AB}"), S('"\\u00AB"').undump)
      assert_equal(S("\u{ABC}"), S('"\\u0ABC"').undump)
      assert_equal(S("\uABCD"), S('"\\uABCD"').undump)
      assert_equal(S("\u{ABCDE}"), S('"\\u{ABCDE}"').undump)
      assert_equal(S("\u{10ABCD}"), S('"\\u{10ABCD}"').undump)

      # the same text written as \u escapes, \x escapes
      assert_equal(S("äöü"), S('"\u00E4\u00F6\u00FC"').undump)
      assert_equal(S("äöü"), S('"\xC3\xA4\xC3\xB6\xC3\xBC"').undump)

      # escapes that stand for the escaped character itself
      assert_equal(S('\\'), S('"\\\\"').undump)
      assert_equal(S('"'), S('"\\""').undump)
      assert_equal(S('#{'), S('"\\#{"').undump)

      # malformed escapes are rejected
      assert_raise(ArgumentError) { S('"\\"').undump }
      assert_raise(ArgumentError) { S('"\\u"').undump }
      assert_raise(ArgumentError) { S('"\\u{"').undump }
      assert_raise(ArgumentError) { S('"\\u{110000}"').undump }
      assert_raise(ArgumentError) { S('"\\u{D800}"').undump }
      assert_raise(ArgumentError) { S('"\\x"').undump }
      assert_raise(ArgumentError) { S('"\\x4"').undump }
    end
 
    def test_dup
 
    for taint in [ false, true ]
 
    for frozen in [ false, true ]

(2-2/5)

Project

General

Profile

Ruby

Feature #12275 » v1.patch