Skip to content

Commit 0496940

Browse files
authored
Optimize the parse_attributes method to use Source#match to parse XML. (#119)
## Why? Improve maintainability by consolidating processing into `Source#match`. ## Benchmark ``` RUBYLIB= BUNDLER_ORIG_RUBYLIB= /Users/naitoh/.rbenv/versions/3.3.0/bin/ruby -v -S benchmark-driver /Users/naitoh/ghq/github.com/naitoh/rexml/benchmark/parse.yaml ruby 3.3.0 (2023-12-25 revision 5124f9ac75) [arm64-darwin22] Calculating ------------------------------------- before after before(YJIT) after(YJIT) dom 10.891 10.622 16.356 17.403 i/s - 100.000 times in 9.182130s 9.414177s 6.113806s 5.746133s sax 30.335 29.845 49.749 54.877 i/s - 100.000 times in 3.296483s 3.350595s 2.010071s 1.822259s pull 35.514 34.801 61.123 66.908 i/s - 100.000 times in 2.815793s 2.873484s 1.636041s 1.494591s stream 35.141 34.475 52.110 56.836 i/s - 100.000 times in 2.845646s 2.900638s 1.919017s 1.759456s Comparison: dom after(YJIT): 17.4 i/s before(YJIT): 16.4 i/s - 1.06x slower before: 10.9 i/s - 1.60x slower after: 10.6 i/s - 1.64x slower sax after(YJIT): 54.9 i/s before(YJIT): 49.7 i/s - 1.10x slower before: 30.3 i/s - 1.81x slower after: 29.8 i/s - 1.84x slower pull after(YJIT): 66.9 i/s before(YJIT): 61.1 i/s - 1.09x slower before: 35.5 i/s - 1.88x slower after: 34.8 i/s - 1.92x slower stream after(YJIT): 56.8 i/s before(YJIT): 52.1 i/s - 1.09x slower before: 35.1 i/s - 1.62x slower after: 34.5 i/s - 1.65x slower ``` - YJIT=ON : 1.06x - 1.10x faster - YJIT=OFF : 0.97x - 0.98x faster
1 parent d4e79f2 commit 0496940

File tree

3 files changed

+64
-76
lines changed

3 files changed

+64
-76
lines changed

lib/rexml/parsers/baseparser.rb

Lines changed: 44 additions & 72 deletions
Original file line numberDiff line numberDiff line change
@@ -114,7 +114,7 @@ class BaseParser
114114

115115
module Private
116116
INSTRUCTION_END = /#{NAME}(\s+.*?)?\?>/um
117-
TAG_PATTERN = /((?>#{QNAME_STR}))/um
117+
TAG_PATTERN = /((?>#{QNAME_STR}))\s*/um
118118
CLOSE_PATTERN = /(#{QNAME_STR})\s*>/um
119119
ATTLISTDECL_END = /\s+#{NAME}(?:#{ATTDEF})*\s*>/um
120120
NAME_PATTERN = /\s*#{NAME}/um
@@ -128,7 +128,6 @@ module Private
128128
def initialize( source )
129129
self.stream = source
130130
@listeners = []
131-
@attributes_scanner = StringScanner.new('')
132131
end
133132

134133
def add_listener( listener )
@@ -614,87 +613,60 @@ def process_instruction(start_position)
614613
def parse_attributes(prefixes, curr_ns)
615614
attributes = {}
616615
closed = false
617-
match_data = @source.match(/^(.*?)(\/)?>/um, true)
618-
if match_data.nil?
619-
message = "Start tag isn't ended"
620-
raise REXML::ParseException.new(message, @source)
621-
end
622-
623-
raw_attributes = match_data[1]
624-
closed = !match_data[2].nil?
625-
return attributes, closed if raw_attributes.nil?
626-
return attributes, closed if raw_attributes.empty?
627-
628-
@attributes_scanner.string = raw_attributes
629-
scanner = @attributes_scanner
630-
until scanner.eos?
631-
if scanner.scan(/\s+/)
632-
break if scanner.eos?
633-
end
634-
635-
start_position = scanner.pos
636-
while true
637-
break if scanner.scan(ATTRIBUTE_PATTERN)
638-
unless scanner.scan(QNAME)
639-
message = "Invalid attribute name: <#{scanner.rest}>"
640-
raise REXML::ParseException.new(message, @source)
641-
end
642-
name = scanner[0]
643-
unless scanner.scan(/\s*=\s*/um)
616+
while true
617+
if @source.match(">", true)
618+
return attributes, closed
619+
elsif @source.match("/>", true)
620+
closed = true
621+
return attributes, closed
622+
elsif match = @source.match(QNAME, true)
623+
name = match[1]
624+
prefix = match[2]
625+
local_part = match[3]
626+
627+
unless @source.match(/\s*=\s*/um, true)
644628
message = "Missing attribute equal: <#{name}>"
645629
raise REXML::ParseException.new(message, @source)
646630
end
647-
quote = scanner.scan(/['"]/)
648-
unless quote
649-
message = "Missing attribute value start quote: <#{name}>"
650-
raise REXML::ParseException.new(message, @source)
651-
end
652-
unless scanner.scan(/.*#{Regexp.escape(quote)}/um)
653-
@source.ensure_buffer
654-
match_data = @source.match(/^(.*?)(\/)?>/um, true)
655-
if match_data
656-
scanner << "/" if closed
657-
scanner << ">"
658-
scanner << match_data[1]
659-
scanner.pos = start_position
660-
closed = !match_data[2].nil?
661-
next
631+
unless match = @source.match(/(['"])(.*?)\1\s*/um, true)
632+
if match = @source.match(/(['"])/, true)
633+
message =
634+
"Missing attribute value end quote: <#{name}>: <#{match[1]}>"
635+
raise REXML::ParseException.new(message, @source)
636+
else
637+
message = "Missing attribute value start quote: <#{name}>"
638+
raise REXML::ParseException.new(message, @source)
662639
end
663-
message =
664-
"Missing attribute value end quote: <#{name}>: <#{quote}>"
665-
raise REXML::ParseException.new(message, @source)
666640
end
667-
end
668-
name = scanner[1]
669-
prefix = scanner[2]
670-
local_part = scanner[3]
671-
# quote = scanner[4]
672-
value = scanner[5]
673-
if prefix == "xmlns"
674-
if local_part == "xml"
675-
if value != "http://www.w3.org/XML/1998/namespace"
676-
msg = "The 'xml' prefix must not be bound to any other namespace "+
641+
value = match[2]
642+
if prefix == "xmlns"
643+
if local_part == "xml"
644+
if value != "http://www.w3.org/XML/1998/namespace"
645+
msg = "The 'xml' prefix must not be bound to any other namespace "+
646+
"(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
647+
raise REXML::ParseException.new( msg, @source, self )
648+
end
649+
elsif local_part == "xmlns"
650+
msg = "The 'xmlns' prefix must not be declared "+
677651
"(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
678-
raise REXML::ParseException.new( msg, @source, self )
652+
raise REXML::ParseException.new( msg, @source, self)
679653
end
680-
elsif local_part == "xmlns"
681-
msg = "The 'xmlns' prefix must not be declared "+
682-
"(http://www.w3.org/TR/REC-xml-names/#ns-decl)"
683-
raise REXML::ParseException.new( msg, @source, self)
654+
curr_ns << local_part
655+
elsif prefix
656+
prefixes << prefix unless prefix == "xml"
684657
end
685-
curr_ns << local_part
686-
elsif prefix
687-
prefixes << prefix unless prefix == "xml"
688-
end
689658

690-
if attributes.has_key?(name)
691-
msg = "Duplicate attribute #{name.inspect}"
692-
raise REXML::ParseException.new(msg, @source, self)
693-
end
659+
if attributes.has_key?(name)
660+
msg = "Duplicate attribute #{name.inspect}"
661+
raise REXML::ParseException.new(msg, @source, self)
662+
end
694663

695-
attributes[name] = value
664+
attributes[name] = value
665+
else
666+
message = "Invalid attribute name: <#{@source.buffer.split(%r{[/>\s]}).first}>"
667+
raise REXML::ParseException.new(message, @source)
668+
end
696669
end
697-
return attributes, closed
698670
end
699671
end
700672
end

test/parse/test_element.rb

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -41,9 +41,9 @@ def test_empty_namespace_attribute_name
4141
assert_equal(<<-DETAIL.chomp, exception.to_s)
4242
Invalid attribute name: <:a="">
4343
Line: 1
44-
Position: 9
44+
Position: 13
4545
Last 80 unconsumed characters:
46-
46+
:a=""></x>
4747
DETAIL
4848
end
4949

test/test_core.rb

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -116,11 +116,12 @@ def test_attribute
116116

117117
def test_attribute_namespace_conflict
118118
# https://www.w3.org/TR/xml-names/#uniqAttrs
119-
message = <<-MESSAGE
119+
message = <<-MESSAGE.chomp
120120
Duplicate attribute "a"
121121
Line: 4
122122
Position: 140
123123
Last 80 unconsumed characters:
124+
/>
124125
MESSAGE
125126
assert_raise(REXML::ParseException.new(message)) do
126127
Document.new(<<-XML)
@@ -1323,11 +1324,26 @@ def test_ticket_21
13231324
exception = assert_raise(ParseException) do
13241325
Document.new(src)
13251326
end
1326-
assert_equal(<<-DETAIL, exception.to_s)
1327+
assert_equal(<<-DETAIL.chomp, exception.to_s)
13271328
Missing attribute value start quote: <bar>
13281329
Line: 1
13291330
Position: 16
13301331
Last 80 unconsumed characters:
1332+
value/>
1333+
DETAIL
1334+
end
1335+
1336+
def test_parse_exception_on_missing_attribute_end_quote
1337+
src = '<foo bar="value/>'
1338+
exception = assert_raise(ParseException) do
1339+
Document.new(src)
1340+
end
1341+
assert_equal(<<-DETAIL.chomp, exception.to_s)
1342+
Missing attribute value end quote: <bar>: <">
1343+
Line: 1
1344+
Position: 17
1345+
Last 80 unconsumed characters:
1346+
value/>
13311347
DETAIL
13321348
end
13331349

0 commit comments

Comments
 (0)