Feature #12275 » v1.patch
| NEWS | ||
|---|---|---|
| * String#each_grapheme_cluster and String#grapheme_clusters are added to | ||
| enumerate grapheme clusters [Feature #13780] | ||
| * String#start_with? supports regexp [Feature #13712] | ||
| * String#undump is added to unescape a String#dump'ed string [Feature #12275] | ||
| * Regexp/String: Update Unicode version from 9.0.0 to 10.0.0 [Feature #13685] | ||
| string.c | ||
|---|---|---|
| #include "ruby_assert.h" | ||
| #include "id.h" | ||
| #include "debug_counter.h" | ||
| #include "ruby/util.h" | ||
| #define BEG(no) (regs->beg[(no)]) | ||
| #define END(no) (regs->end[(no)]) | ||
| ... | ... | |
| return rb_str_eql(folded_str1, folded_str2); | ||
| } | ||
| /* | ||
| * Looks for the sub_len bytes at sub_ptr inside the buffer starting at | ||
| * str_ptr, searching (str_len - offset) bytes with rb_memsearch. | ||
| * | ||
| * A hit is accepted only when rb_enc_right_char_head() maps the hit position | ||
| * onto itself (presumably: the hit starts on a character boundary of enc). | ||
| * Otherwise the search restarts from the position that function returned. | ||
| * | ||
| * Returns the accepted hit position plus the accumulated offset, the negative | ||
| * value reported by rb_memsearch when nothing is found, or -1 when no bytes | ||
| * remain to be searched after a rejected hit. | ||
| */ | ||
| static long | ||
| strseq_core(const char *str_ptr, const char *str_ptr_end, long str_len, | ||
| const char *sub_ptr, long sub_len, long offset, rb_encoding *enc) | ||
| { | ||
| const char *cursor = str_ptr; | ||
| long remaining = str_len - offset; | ||
| long hit; | ||
| while ((hit = rb_memsearch(sub_ptr, sub_len, cursor, remaining, enc)) >= 0) { | ||
| const char *head = rb_enc_right_char_head(cursor, cursor + hit, str_ptr_end, enc); | ||
| long skipped; | ||
| if (head == cursor + hit) { | ||
| return hit + offset; /* accepted: report it relative to the original start */ | ||
| } | ||
| /* rejected hit: resume the search from the position returned above */ | ||
| skipped = head - cursor; | ||
| remaining -= skipped; | ||
| if (remaining <= 0) return -1; | ||
| offset += skipped; | ||
| cursor = head; | ||
| } | ||
| return hit; /* negative: rb_memsearch found nothing */ | ||
| } | ||
| #define rb_str_index(str, sub, offset) rb_strseq_index(str, sub, offset, 0) | ||
| static long | ||
| rb_strseq_index(VALUE str, VALUE sub, long offset, int in_byte) | ||
| { | ||
| const char *str_ptr, *str_ptr_end, *sub_ptr, *search_start; | ||
| long pos, str_len, sub_len, search_len; | ||
| const char *str_ptr, *str_ptr_end, *sub_ptr; | ||
| long str_len, sub_len; | ||
| int single_byte = single_byte_optimizable(str); | ||
| rb_encoding *enc; | ||
| ... | ... | |
| if (sub_len == 0) return offset; | ||
| /* need proceed one character at a time */ | ||
| search_start = str_ptr; | ||
| search_len = RSTRING_LEN(str) - offset; | ||
| for (;;) { | ||
| const char *t; | ||
| pos = rb_memsearch(sub_ptr, sub_len, search_start, search_len, enc); | ||
| if (pos < 0) return pos; | ||
| t = rb_enc_right_char_head(search_start, search_start+pos, str_ptr_end, enc); | ||
| if (t == search_start + pos) break; | ||
| search_len -= t - search_start; | ||
| if (search_len <= 0) return -1; | ||
| offset += t - search_start; | ||
| search_start = t; | ||
| } | ||
| return pos + offset; | ||
| return strseq_core(str_ptr, str_ptr_end, str_len, sub_ptr, sub_len, offset, enc); | ||
| } | ||
| ... | ... | |
| return result; | ||
| } | ||
| /* | ||
| * Tells whether the text in [s, s_end) both starts and ends with '"'. | ||
| * len is the length of that text; anything shorter than 2 cannot hold | ||
| * an opening and a closing quote and is reported as not wrapped. | ||
| * Returns non-zero when wrapped, FALSE otherwise. | ||
| */ | ||
| static int | ||
| is_wrapped(const char *s, const char *s_end, long len, rb_encoding *enc) | ||
| { | ||
| const char *last; | ||
| if (len < 2) return FALSE; | ||
| /* first character must be the opening quote */ | ||
| if (rb_enc_mbc_to_codepoint(s, s_end, enc) != '"') return FALSE; | ||
| /* step back from the end to the final character and test it */ | ||
| last = rb_enc_prev_char(s, s_end, s_end, enc); | ||
| return rb_enc_mbc_to_codepoint(last, s_end, enc) == '"'; | ||
| } | ||
| /* | ||
| * Maps the letter of a one-character backslash escape (the 'n' of "\n") | ||
| * to a one-byte string holding the control character it stands for. | ||
| * Only the letters n, r, t, f, v, b, a and e are handled; any other | ||
| * value reaches UNREACHABLE, so callers must filter beforehand. | ||
| */ | ||
| static const char * | ||
| unescape_ascii(unsigned int c) | ||
| { | ||
| switch (c) { | ||
| case 'n': | ||
| return "\n"; | ||
| case 'r': | ||
| return "\r"; | ||
| case 't': | ||
| return "\t"; | ||
| case 'f': | ||
| return "\f"; | ||
| case 'v': | ||
| return "\v"; | ||
| case 'b': | ||
| return "\b"; | ||
| case 'a': | ||
| return "\a"; | ||
| case 'e': | ||
| /* ESC: "\e" is a compiler extension, not ISO C, so spell it in octal */ | ||
| return "\033"; | ||
| default: | ||
| UNREACHABLE; | ||
| } | ||
| } | ||
| /* | ||
| * Decodes the escape sequence that follows a backslash and appends the | ||
| * decoded bytes to undumped. | ||
| * | ||
| * s points at the character right after the backslash and s_end at the | ||
| * end of the input; enc is the encoding used to read and to emit characters. | ||
| * | ||
| * Returns the number of bytes the caller has to skip, counted from the | ||
| * backslash itself (str_undump adds the result to the backslash position). | ||
| * | ||
| * Raises ArgumentError for malformed \u, \u{...} and \x escapes. | ||
| */ | ||
| static int | ||
| undump_after_backslash(VALUE undumped, const char *s, const char *s_end, rb_encoding *enc) | ||
| { | ||
| unsigned int c, c2; | ||
| int n, n2, codelen; | ||
| size_t hexlen; | ||
| char buf[6]; | ||
| c = rb_enc_codepoint_len(s, s_end, &n, enc); | ||
| switch (c) { | ||
| case '\\': | ||
| case '"': | ||
| rb_str_cat(undumped, s, n); /* cat itself */ | ||
| n++; /* the escaped character plus the backslash */ | ||
| break; | ||
| case 'n': | ||
| case 'r': | ||
| case 't': | ||
| case 'f': | ||
| case 'v': | ||
| case 'b': | ||
| case 'a': | ||
| case 'e': | ||
| rb_str_cat(undumped, unescape_ascii(c), n); | ||
| n++; /* the escape letter plus the backslash */ | ||
| break; | ||
| case 'u': | ||
| if (s+1 >= s_end) { | ||
| rb_raise(rb_eArgError, "invalid Unicode escape"); | ||
| } | ||
| c2 = rb_enc_codepoint_len(s+1, s_end, NULL, enc); | ||
| if (c2 == '{') { /* handle \u{...} form */ | ||
| const char *hexstr = s + 2; | ||
| unsigned int hex; | ||
| static const char* const close_brace = "}"; | ||
| long pos; | ||
| if (hexstr >= s_end) { | ||
| rb_raise(rb_eArgError, "unterminated Unicode escape"); | ||
| } | ||
| /* find close brace */ | ||
| pos = strseq_core(hexstr, s_end, s_end - hexstr, close_brace, 1, 0, enc); | ||
| if (pos < 0) { | ||
| rb_raise(rb_eArgError, "unterminated Unicode escape"); | ||
| } | ||
| hex = ruby_scan_hex(hexstr, pos, &hexlen); | ||
| /* | ||
| * Every byte before the brace must be a hex digit: the skip count | ||
| * below assumes the brace sits right after hexlen digits, so a | ||
| * partial scan would leave the caller in the middle of the escape. | ||
| */ | ||
| if (hexlen == 0 || hexlen > 6 || hexlen != (size_t)pos) { | ||
| rb_raise(rb_eArgError, "invalid Unicode escape"); | ||
| } | ||
| if (hex > 0x10ffffU) { | ||
| rb_raise(rb_eArgError, "invalid Unicode codepoint (too large)"); | ||
| } | ||
| if ((hex & 0xfffff800U) == 0xd800U) { /* surrogate range */ | ||
| rb_raise(rb_eArgError, "invalid Unicode codepoint"); | ||
| } | ||
| codelen = rb_enc_codelen(hex, enc); | ||
| rb_enc_mbcput(hex, buf, enc); | ||
| rb_str_cat(undumped, buf, codelen); | ||
| n += rb_strlen_lit("u{}") + hexlen; | ||
| } | ||
| else { /* handle \uXXXX form */ | ||
| unsigned int hex; | ||
| /* all four digits must lie before s_end; never scan beyond the input */ | ||
| if (s_end - (s+1) < 4) { | ||
| rb_raise(rb_eArgError, "invalid Unicode escape"); | ||
| } | ||
| hex = ruby_scan_hex(s+1, 4, &hexlen); | ||
| if (hexlen != 4) { | ||
| rb_raise(rb_eArgError, "invalid Unicode escape"); | ||
| } | ||
| /* reject surrogates here too, as the \u{...} form does */ | ||
| if ((hex & 0xfffff800U) == 0xd800U) { | ||
| rb_raise(rb_eArgError, "invalid Unicode codepoint"); | ||
| } | ||
| codelen = rb_enc_codelen(hex, enc); | ||
| rb_enc_mbcput(hex, buf, enc); | ||
| rb_str_cat(undumped, buf, codelen); | ||
| n += rb_strlen_lit("uXXXX"); | ||
| } | ||
| break; | ||
| case 'x': | ||
| /* both digits must lie before s_end; never scan beyond the input */ | ||
| if (s_end - (s+1) < 2) { | ||
| rb_raise(rb_eArgError, "invalid hex escape"); | ||
| } | ||
| c2 = ruby_scan_hex(s+1, 2, &hexlen); | ||
| if (hexlen != 2) { | ||
| rb_raise(rb_eArgError, "invalid hex escape"); | ||
| } | ||
| *buf = (char)c2; | ||
| rb_str_cat(undumped, buf, 1L); | ||
| n += rb_strlen_lit("xXX"); | ||
| break; | ||
| case '#': | ||
| if (s+1 >= s_end) { | ||
| rb_str_cat(undumped, s, 1L); /* just '#' */ | ||
| n++; | ||
| break; | ||
| } | ||
| n2 = rb_enc_mbclen(s+1, s_end, enc); | ||
| /* NOTE(review): IS_EVSTR is defined elsewhere; presumably it tests for the | ||
| * character that starts an interpolation after '#' -- confirm. */ | ||
| if (n2 == 1 && IS_EVSTR(s+1, s_end)) { | ||
| rb_str_cat(undumped, s, n); | ||
| n += n2; | ||
| } | ||
| /* otherwise nothing is appended and only the backslash is skipped; | ||
| * the caller copies '#' itself on its next iteration */ | ||
| break; | ||
| default: | ||
| rb_str_cat(undumped, "\\", 1L); /* keep backslash */ | ||
| /* | ||
| * Skip the backslash only. The caller copies the following character | ||
| * on its next iteration; returning that character's byte length | ||
| * would leave the caller inside a multibyte character. | ||
| */ | ||
| n = 1; | ||
| } | ||
| return n; | ||
| } | ||
| /* | ||
| * call-seq: | ||
| * str.undump -> new_str | ||
| * | ||
| * Returns a new string in which the escape sequences of +str+ are replaced | ||
| * by the characters they denote. A pair of enclosing double quotes, | ||
| * when present, is removed. | ||
| * This is the inverse of String#dump; see also String#dump. | ||
| * | ||
| * "\"hello \\n ''\"".undump #=> "hello \n ''" | ||
| */ | ||
| static VALUE | ||
| str_undump(VALUE str) | ||
| { | ||
| const char *p = RSTRING_PTR(str); | ||
| const char *pend = RSTRING_END(str); | ||
| rb_encoding *enc = rb_enc_get(str); | ||
| VALUE undumped = rb_enc_str_new(p, 0L, enc); /* empty result in the same encoding */ | ||
| rb_must_asciicompat(str); | ||
| if (is_wrapped(p, pend, RSTRING_LEN(str), enc)) { | ||
| /* drop the enclosing '"' pair */ | ||
| p++; | ||
| pend--; | ||
| } | ||
| while (p < pend) { | ||
| int step; | ||
| unsigned int c = rb_enc_codepoint_len(p, pend, &step, enc); | ||
| if (c != '\\') { | ||
| /* ordinary character: copy it through unchanged */ | ||
| rb_str_cat(undumped, p, step); | ||
| } | ||
| else if (p + 1 < pend) { | ||
| /* the helper appends the decoded text and reports how far to advance */ | ||
| step = undump_after_backslash(undumped, p + 1, pend, enc); | ||
| } | ||
| else { | ||
| /* a backslash with nothing after it */ | ||
| rb_raise(rb_eArgError, "invalid escape"); | ||
| } | ||
| p += step; | ||
| } | ||
| OBJ_INFECT(undumped, str); | ||
| return undumped; | ||
| } | ||
| static void | ||
| rb_str_check_dummy_enc(rb_encoding *enc) | ||
| ... | ... | |
| rb_define_method(rb_cString, "to_str", rb_str_to_s, 0); | ||
| rb_define_method(rb_cString, "inspect", rb_str_inspect, 0); | ||
| rb_define_method(rb_cString, "dump", rb_str_dump, 0); | ||
| rb_define_method(rb_cString, "undump", str_undump, 0); | ||
| sym_ascii = ID2SYM(rb_intern("ascii")); | ||
| sym_turkic = ID2SYM(rb_intern("turkic")); | ||
| test/ruby/test_string.rb | ||
|---|---|---|
| assert_equal(S('"\\u{10ABCD}"'), b.dump) | ||
| end | ||
| def test_undump | ||
| a = S("Test") << 1 << 2 << 3 << 9 << 13 << 10 | ||
| assert_equal(a, S('"Test\\x01\\x02\\x03\\t\\r\\n"').undump) | ||
| assert_equal(S("\u{7F}"), S('"\\x7F"').undump) | ||
| assert_equal(S("\u{AB}"), S('"\\u00AB"').undump) | ||
| assert_equal(S("\u{ABC}"), S('"\\u0ABC"').undump) | ||
| assert_equal(S("\uABCD"), S('"\\uABCD"').undump) | ||
| assert_equal(S("\u{ABCDE}"), S('"\\u{ABCDE}"').undump) | ||
| assert_equal(S("\u{10ABCD}"), S('"\\u{10ABCD}"').undump) | ||
| assert_equal(S("äöü"), S('"\u00E4\u00F6\u00FC"').undump) | ||
| assert_equal(S("äöü"), S('"\xC3\xA4\xC3\xB6\xC3\xBC"').undump) | ||
| end | ||
| def test_dup | ||
| for taint in [ false, true ] | ||
| for frozen in [ false, true ] | ||