Skip to content

Commit c668d2e

Browse files
committed
html: Sync changes from x/net
Before 324513b (2012-01-04) std "html" and what is now "golang.org/x/net/html" were the same. Ever since then (well, since 4e0749a (2012-05-29)) the escape/unescape code that they share has been drifting apart, each receiving separate improvements.

This CL cherry-picks over all of the changes that "x/net/html" has seen.

This is the counterpart to https://golang.org/cl/580855, and so also includes the de-duplication requested at https://go-review.googlesource.com/c/net/+/580855/comment/8f8679a3_12b9ead1/

4e0749a :

  Author: Andrew Balholm <andybalholm@gmail.com>
  Date: Wed May 30 15:50:12 2012 +1000

  exp/html: Convert \r and \r\n to \n when tokenizing

  Also escape "\r" as "&#13;" when rendering HTML.

  Pass 2 additional tests.

  R=nigeltao
  CC=golang-dev
  https://golang.org/cl/6260046

golang/net@3d87fd6 :

  Author: Dmitry Savintsev <dsavints@gmail.com>
  Date: Thu Feb 26 23:44:25 2015 +0100

  x/net/html: Sync the html parser and atom with the current whatwg spec

  The current documentation as well as set of atoms and attributes has gotten slightly out of sync with the current state of the WHATWG html5 specification. The change adds and removes several of the atoms and attributes, updates the documentation (such as steps numbering in inBodyEndTagFormatting) and modifies the spec URLs to https://

  Change-Id: I6dfa52785858c1521301b20b1e585e19a08b1e98
  Reviewed-on: https://go-review.googlesource.com/6173
  Reviewed-by: Nigel Tao <nigeltao@golang.org>
1 parent 6d89b38 commit c668d2e

File tree

1 file changed

+29
-29
lines changed

1 file changed

+29
-29
lines changed

src/html/escape.go

Lines changed: 29 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -53,10 +53,9 @@ var replacementTable = [...]rune{
5353
// unescapeEntity reads an entity like "&lt;" from b[src:] and writes the
5454
// corresponding "<" to b[dst:], returning the incremented dst and src cursors.
5555
// Precondition: b[src] == '&' && dst <= src.
56-
func unescapeEntity(b []byte, dst, src int) (dst1, src1 int) {
57-
const attribute = false
58-
59-
// http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#consume-a-character-reference
56+
// attribute should be true if parsing an attribute value.
57+
func unescapeEntity(b []byte, dst, src int, attribute bool) (dst1, src1 int) {
58+
// https://html.spec.whatwg.org/multipage/syntax.html#consume-a-character-reference
6059

6160
// i starts at 1 because we already know that s[0] == '&'.
6261
i, s := 1, b[src:]
@@ -163,16 +162,38 @@ func unescapeEntity(b []byte, dst, src int) (dst1, src1 int) {
163162
return dst1, src1
164163
}
165164

165+
func unescapeInner(b []byte, i int, attribute bool) []byte {
166+
dst, src := unescapeEntity(b, i, i, attribute)
167+
for len(b[src:]) > 0 {
168+
if b[src] == '&' {
169+
i = 0
170+
} else {
171+
i = bytes.IndexByte(b[src:], '&')
172+
}
173+
if i < 0 {
174+
dst += copy(b[dst:], b[src:])
175+
break
176+
}
177+
178+
if i > 0 {
179+
copy(b[dst:], b[src:src+i])
180+
}
181+
dst, src = unescapeEntity(b, dst+i, src+i, attribute)
182+
}
183+
return b[:dst]
184+
}
185+
166186
var htmlEscaper = strings.NewReplacer(
167187
`&`, "&amp;",
168188
`'`, "&#39;", // "&#39;" is shorter than "&apos;" and apos was not in HTML until HTML5.
169189
`<`, "&lt;",
170190
`>`, "&gt;",
171191
`"`, "&#34;", // "&#34;" is shorter than "&quot;".
192+
"\r", "&#13;",
172193
)
173194

174195
// EscapeString escapes special characters like "<" to become "&lt;". It
175-
// escapes only five such characters: <, >, &, ' and ".
196+
// escapes only six such characters: <, >, &, ', ", and \r.
176197
// UnescapeString(EscapeString(s)) == s always holds, but the converse isn't
177198
// always true.
178199
func EscapeString(s string) string {
@@ -186,29 +207,8 @@ func EscapeString(s string) string {
186207
// always true.
187208
func UnescapeString(s string) string {
188209
populateMapsOnce.Do(populateMaps)
189-
i := strings.IndexByte(s, '&')
190-
191-
if i < 0 {
192-
return s
193-
}
194-
195-
b := []byte(s)
196-
dst, src := unescapeEntity(b, i, i)
197-
for len(s[src:]) > 0 {
198-
if s[src] == '&' {
199-
i = 0
200-
} else {
201-
i = strings.IndexByte(s[src:], '&')
202-
}
203-
if i < 0 {
204-
dst += copy(b[dst:], s[src:])
205-
break
206-
}
207-
208-
if i > 0 {
209-
copy(b[dst:], s[src:src+i])
210-
}
211-
dst, src = unescapeEntity(b, dst+i, src+i)
210+
if i := strings.IndexByte(s, '&'); i >= 0 {
211+
return string(unescapeInner([]byte(s), i, false))
212212
}
213-
return string(b[:dst])
213+
return s
214214
}

0 commit comments

Comments
 (0)