Skip to content

Commit a0d3cf8

Browse files
committed
Merge pull request #7 from tom-lord/OrGroup_random_example_probability_distribution
Or group random example probability distribution
2 parents 0ef73be + 28cbfcb commit a0d3cf8

File tree

6 files changed

+88
-54
lines changed

6 files changed

+88
-54
lines changed

.rubocop.yml

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,2 @@
1+
Metrics/LineLength:
2+
Max: 90

lib/regexp-examples/chargroup_parser.rb

Lines changed: 37 additions & 21 deletions
Original file line number | Diff line number | Diff line change
@@ -25,29 +25,11 @@ def parse
2525
until next_char == ']'
2626
case next_char
2727
when '['
28-
@current_position += 1
29-
sub_group_parser = self.class.new(rest_of_string, is_sub_group: true)
30-
@charset.concat sub_group_parser.result
31-
@current_position += sub_group_parser.length
28+
parse_sub_group_concat
3229
when '-'
33-
if regexp_string[@current_position + 1] == ']' # e.g. /[abc-]/ -- not a range!
34-
@charset << '-'
35-
@current_position += 1
36-
else
37-
@current_position += 1
38-
@charset.concat (@charset.last..parse_checking_backlash.first).to_a
39-
@current_position += 1
40-
end
30+
parse_after_hyphen
4131
when '&'
42-
if regexp_string[@current_position + 1] == '&'
43-
@current_position += 2
44-
sub_group_parser = self.class.new(rest_of_string, is_sub_group: true)
45-
@charset &= sub_group_parser.result
46-
@current_position += (sub_group_parser.length - 1)
47-
else
48-
@charset << '&'
49-
@current_position += 1
50-
end
32+
parse_after_ampersand
5133
else
5234
@charset.concat parse_checking_backlash
5335
@current_position += 1
@@ -116,6 +98,40 @@ def parse_after_backslash
11698
end
11799
end
118100

101+
def parse_sub_group_concat
102+
@current_position += 1
103+
sub_group_parser = self.class.new(rest_of_string, is_sub_group: true)
104+
@charset.concat sub_group_parser.result
105+
@current_position += sub_group_parser.length
106+
end
107+
108+
def parse_after_ampersand
109+
if regexp_string[@current_position + 1] == '&'
110+
parse_sub_group_intersect
111+
else
112+
@charset << '&'
113+
@current_position += 1
114+
end
115+
end
116+
117+
def parse_sub_group_intersect
118+
@current_position += 2
119+
sub_group_parser = self.class.new(rest_of_string, is_sub_group: true)
120+
@charset &= sub_group_parser.result
121+
@current_position += (sub_group_parser.length - 1)
122+
end
123+
124+
def parse_after_hyphen
125+
if regexp_string[@current_position + 1] == ']' # e.g. /[abc-]/ -- not a range!
126+
@charset << '-'
127+
@current_position += 1
128+
else
129+
@current_position += 1
130+
@charset.concat (@charset.last..parse_checking_backlash.first).to_a
131+
@current_position += 1
132+
end
133+
end
134+
119135
def rest_of_string
120136
regexp_string[@current_position..-1]
121137
end

lib/regexp-examples/groups.rb

Lines changed: 24 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -153,38 +153,45 @@ def result_by_method(method)
153153
end
154154

155155
# A boolean "or" group.
156-
# It really is boolean: The implementation is to pass in 2 set of
157-
# (repeaters of) groups. The simplest example is: /a|b/
158-
# If you have more than one boolean "or" operator, then this is
159-
# constructed using multiple *boolean* OrGroups, e.g.
160-
# /a|b|c|d/ is treated like /((a|b)|c)|d/
156+
# The implementation is to pass in 2 sets of (repeaters of) groups.
157+
# The simplest example is: /a|b/
158+
# If you have more than one boolean "or" operator, then this is initially
159+
# parsed as an OrGroup containing another OrGroup. However, in order to avoid
160+
# probability distribution issues in Regexp#random_example, this then gets
161+
# simplified down to one OrGroup containing 3+ repeaters.
161162
class OrGroup
163+
attr_reader :repeaters_list
164+
162165
def initialize(left_repeaters, right_repeaters)
163-
@left_repeaters = left_repeaters
164-
@right_repeaters = right_repeaters
166+
@repeaters_list = [left_repeaters, *merge_if_orgroup(right_repeaters)]
165167
end
166168

167169
def result
168170
result_by_method(:map_results)
169171
end
170172

171173
def random_result
172-
# TODO: This logic is flawed in terms of choosing a truly "random" example! E.g.
173-
# /a|b|c|d/.random_example will choose a letter with the following probabilities:
174-
# a = 50%, b = 25%, c = 12.5%, d = 12.5%
175-
# In order to fix this, I must either apply some weighted selection logic,
176-
# or change how the OrGroup examples are generated
177-
# - i.e. make this class work with >2 repeaters
178174
result_by_method(:map_random_result).sample(1)
179175
end
180176

181177
private
182178

183179
def result_by_method(method)
184-
left_result = RegexpExamples.public_send(method, @left_repeaters)
185-
right_result = RegexpExamples.public_send(method, @right_repeaters)
186-
left_result.concat(right_result).flatten.uniq.map do |result|
187-
GroupResult.new(result)
180+
repeaters_list.map do |repeaters|
181+
RegexpExamples.public_send(method, repeaters)
182+
end
183+
.inject(:concat)
184+
.map do |result|
185+
GroupResult.new(result)
186+
end
187+
.uniq
188+
end
189+
190+
def merge_if_orgroup(repeaters)
191+
if repeaters.size == 1 && repeaters.first.is_a?(OrGroup)
192+
repeaters.first.repeaters_list
193+
else
194+
[repeaters]
188195
end
189196
end
190197
end

lib/regexp-examples/parser.rb

Lines changed: 14 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -15,7 +15,7 @@ def parse
1515
repeaters = []
1616
until end_of_regexp
1717
group = parse_group(repeaters)
18-
return [OneTimeRepeater.new(group)] if group.is_a? OrGroup
18+
return [group] if group.is_a? OrGroup
1919
@current_position += 1
2020
repeaters << parse_repeater(group)
2121
end
@@ -148,7 +148,7 @@ def parse_after_backslash_group
148148
) # Using "\r\n" as one character is a little bit hacky...
149149
when next_char == 'g' # Subexpression call
150150
fail IllegalSyntaxError,
151-
'Subexpression calls (\\g) cannot be supported, as they are not regular'
151+
'Subexpression calls (\\g) cannot be supported, as they are not regular'
152152
when next_char =~ /[bB]/ # Anchors
153153
raise_anchors_exception!
154154
when next_char =~ /[AG]/ # Start of string
@@ -159,6 +159,7 @@ def parse_after_backslash_group
159159
end
160160
when next_char =~ /[zZ]/ # End of string
161161
if @current_position == (regexp_string.length - 1)
162+
# TODO: /\Z/ should be treated as /\n?/
162163
group = PlaceHolderGroup.new
163164
else
164165
raise_anchors_exception!
@@ -212,10 +213,10 @@ def parse_multi_group
212213
end
213214
when %w(! =).include?(match[2]) # e.g. /(?=lookahead)/, /(?!neglookahead)/
214215
fail IllegalSyntaxError,
215-
'Lookaheads are not regular; cannot generate examples'
216+
'Lookaheads are not regular; cannot generate examples'
216217
when %w(! =).include?(match[3]) # e.g. /(?<=lookbehind)/, /(?<!neglookbehind)/
217218
fail IllegalSyntaxError,
218-
'Lookbehinds are not regular; cannot generate examples'
219+
'Lookbehinds are not regular; cannot generate examples'
219220
else # e.g. /(?<name>namedgroup)/
220221
@current_position += (match[3].length + 3)
221222
group_id = match[3]
@@ -237,12 +238,14 @@ def remember_old_regexp_options
237238
end
238239

239240
def regexp_options_toggle(on, off)
240-
@ignorecase = true if on.include? 'i'
241-
@ignorecase = false if off.include? 'i'
242-
@multiline = true if on.include? 'm'
243-
@multiline = false if off.include? 'm'
244-
@extended = true if on.include? 'x'
245-
@extended = false if off.include? 'x'
241+
regexp_option_toggle(on, off, '@ignorecase', 'i')
242+
regexp_option_toggle(on, off, '@multiline', 'm')
243+
regexp_option_toggle(on, off, '@extended', 'x')
244+
end
245+
246+
def regexp_option_toggle(on, off, var, char)
247+
instance_variable_set(var, true) if on.include? char
248+
instance_variable_set(var, false) if off.include? char
246249
end
247250

248251
def parse_char_group
@@ -327,7 +330,7 @@ def parse_reluctant_or_possessive_range_repeater(repeater, min, has_comma, max)
327330

328331
def raise_anchors_exception!
329332
fail IllegalSyntaxError,
330-
"Anchors ('#{next_char}') cannot be supported, as they are not regular"
333+
"Anchors ('#{next_char}') cannot be supported, as they are not regular"
331334
end
332335

333336
def parse_one_time_repeater(group)

lib/regexp-examples/version.rb

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,3 +1,3 @@
11
module RegexpExamples
2-
VERSION = '1.1.2'
2+
VERSION = '1.1.3'
33
end

spec/regexp-examples_spec.rb

Lines changed: 10 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -4,8 +4,9 @@ def self.examples_exist_and_match(*regexps)
44
it "examples for /#{regexp.source}/" do
55
regexp_examples = regexp.examples(max_group_results: 99_999)
66

7-
expect(regexp_examples).not_to be_empty,
8-
"No examples were generated for regexp: /#{regexp.source}/"
7+
expect(regexp_examples)
8+
.not_to be_empty,
9+
"No examples were generated for regexp: /#{regexp.source}/"
910
regexp_examples.each do |example|
1011
expect(example).to match(/\A(?:#{regexp.source})\z/)
1112
end
@@ -205,8 +206,9 @@ def self.examples_are_empty(*regexps)
205206
).each do |property|
206207
it "examples for /\p{#{property}}/" do
207208
regexp_examples = /\p{#{property}}/.examples(max_group_results: 99_999)
208-
expect(regexp_examples).not_to be_empty,
209-
"No examples were generated for regexp: /\p{#{property}}/"
209+
expect(regexp_examples)
210+
.not_to be_empty,
211+
"No examples were generated for regexp: /\p{#{property}}/"
210212
# Just do one big check, for test system performance (~30% faster)
211213
# (Otherwise, we're doing up to 128 checks on 123 properties!!!)
212214
expect(regexp_examples.join('')).to match(/\A\p{#{property}}+\z/)
@@ -301,6 +303,10 @@ def self.examples_are_empty(*regexps)
301303
it { expect(/(a|b){2}/.examples).to match_array %w(aa ab ba bb) }
302304
it { expect(/a+|b?/.examples).to match_array ['a', 'aa', 'aaa', '', 'b'] }
303305

306+
# Only display unique examples:
307+
it { expect(/a|a|b|b/.examples).to match_array ['a', 'b'] }
308+
it { expect(/[ccdd]/.examples).to match_array ['c', 'd'] }
309+
304310
# a{1}? should be equivalent to (?:a{1})?, i.e. NOT a "non-greedy quantifier"
305311
it { expect(/a{1}?/.examples).to match_array ['', 'a'] }
306312
end

0 commit comments

Comments
 (0)