Skip to content

Commit d869727

Browse files
Merge pull request wangzheng0822#185 from jerryderry/bm-python
Boyer-Moore string-search algorithm in python
2 parents fb0c353 + 68b5c02 commit d869727

File tree

1 file changed

+77
-0
lines changed

1 file changed

+77
-0
lines changed

python/33_bm/bm.py

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
"""
2+
Boyer-Moore string-search algorithm.
3+
4+
Author: Wenru Dong
5+
"""
6+
7+
from typing import List, Tuple
8+
9+
SIZE = 256
10+
11+
def _generate_bad_character_table(pattern: str) -> List[int]:
12+
bc = [-1] * SIZE
13+
for i, char in enumerate(pattern):
14+
bc[ord(char)] = i
15+
return bc
16+
17+
18+
def _generate_good_suffix_table(pattern: str) -> Tuple[List[bool], List[int]]:
19+
m = len(pattern)
20+
# prefix[k] records whether the last k-character suffix of pattern
21+
# can match with the first k-character prefix of pattern.
22+
# suffix[k] records the starting index of the last substring of
23+
# pattern that can match with the last k-character suffix of pattern.
24+
prefix, suffix = [False] * m, [-1] * m
25+
# For each substring patter[:i+1], we find the common suffix with
26+
# pattern, and the starting index of this common suffix.
27+
# This way we can re-write previous suffix[k] to record the index
28+
# as large as possible, hence the last substring.
29+
for i in range(m - 1):
30+
j = i # starting index of the common suffix
31+
k = 0 # length of the common suffix
32+
while j >= 0 and pattern[j] == pattern[~k]:
33+
j -= 1
34+
k += 1
35+
suffix[k] = j + 1
36+
if j == -1: prefix[k] = True
37+
return (prefix, suffix)
38+
39+
40+
def _move_by_good_suffix(bad_character_index: int, suffix: List[int], prefix: List[bool]) -> int:
41+
k = len(suffix) - 1 - bad_character_index
42+
if suffix[k] != -1: return bad_character_index - suffix[k] + 1
43+
# Test from k - 1
44+
for r, can_match_prefix in enumerate(reversed(prefix[:k]), bad_character_index + 2):
45+
if can_match_prefix: return r
46+
return len(suffix)
47+
48+
49+
def bm(s: str, pattern: str) -> int:
    """Find ``pattern`` in ``s`` using the Boyer-Moore algorithm.

    At each alignment the pattern is compared with the text from right to
    left. On a mismatch the pattern is shifted by the larger of the
    bad-character shift and the good-suffix shift.

    Args:
        s: The text to search in.
        pattern: The string to search for.

    Returns:
        The index in ``s`` of the first occurrence of ``pattern``, or -1 if
        there is none. An empty ``pattern`` is reported at index 0.
    """
    bc = _generate_bad_character_table(pattern)
    prefix, suffix = _generate_good_suffix_table(pattern)
    n, m = len(s), len(pattern)
    i = 0  # index in s where the pattern is currently aligned
    while i <= n - m:
        # Compare right-to-left; j ends on the mismatching pattern index,
        # or -1 if every character matched.
        j = m - 1
        while j >= 0 and s[i + j] == pattern[j]:
            j -= 1
        if j < 0:
            return i

        # Bad-character shift. A text character whose code point lies beyond
        # the table is treated as absent from the pattern (-1) instead of
        # raising IndexError.
        code_point = ord(s[i + j])
        last_seen = bc[code_point] if code_point < len(bc) else -1
        x = j - last_seen

        # The good-suffix shift only applies when at least one character
        # matched, i.e. the mismatch is not at the last pattern position.
        y = _move_by_good_suffix(j, suffix, prefix) if j < m - 1 else 0

        # x may be zero or negative when the bad character's last occurrence
        # is to the right of j; taking the max with y uses the larger shift.
        i += max(x, y)
    return -1
67+
68+
69+
if __name__ == "__main__":
    # Demo 1: print the index at which the pattern is found in the sentence.
    s, pattern = "Here is a simple example", "example"
    print(bm(s, pattern))

    # Demo 2: print whether bm agrees with the built-in str.find.
    s, pattern = "abcdcccdc", "cccd"
    print(s.find(pattern) == bm(s, pattern))

0 commit comments

Comments
 (0)