|
| 1 | +#!/usr/bin/python |
| 2 | +# -*- coding: UTF-8 -*- |
| 3 | + |
| 4 | + |
def kmp(main, pattern):
    """Find the first occurrence of ``pattern`` in ``main`` using KMP matching.

    :param main: the text to search in; must be exactly of type ``str``.
    :param pattern: the text to search for; must be exactly of type ``str``.
    :return: the index in ``main`` where the first match starts, ``0`` when
        ``pattern`` is empty, or ``-1`` when there is no match.
    """
    assert type(main) is str and type(pattern) is str

    text_len = len(main)
    pat_len = len(pattern)

    # An empty pattern is reported as matching at the very start.
    if pat_len == 0:
        return 0
    # When the text is not longer than the pattern, only an exact match can
    # succeed, so a plain equality check is enough.
    if text_len <= pat_len:
        if main == pattern:
            return 0
        return -1

    # table[k] is the end index of the longest prefix of the pattern that is
    # also a suffix of pattern[:k + 1], or -1 when there is none.
    table = get_next(pattern)

    matched = 0  # number of leading pattern characters currently matched
    for pos, char in enumerate(main):
        # On a mismatch, fall back through ever shorter prefixes of
        # pattern[:matched] that are also suffixes of it. A table value of -1
        # gives matched == 0, i.e. restart from the first pattern character.
        while matched > 0 and char != pattern[matched]:
            matched = table[matched - 1] + 1

        if char != pattern[matched]:
            continue

        matched += 1
        if matched == pat_len:
            # The whole pattern ends at ``pos``; report where it started.
            return pos - pat_len + 1

    return -1
| 36 | + |
| 37 | + |
def get_next(pattern):
    """Build the KMP "next" (failure) table for ``pattern``.

    ``table[i]`` holds the end index of the longest prefix of ``pattern``
    that is also a suffix of ``pattern[:i + 1]`` (excluding the whole
    substring itself), or ``-1`` when no such prefix exists.

    How ``table[i]`` is derived from the earlier entries:

    1. Start from the longest matching prefix found for ``i - 1``; its end
       index is ``table[i - 1]``.
    2. If the character after that prefix equals ``pattern[i]``, the prefix
       simply grows by one, so ``table[i]`` is that end index plus one.
    3. Otherwise fall back to the next-longest matching prefix, whose end
       index is ``table[table[i - 1]]``, and try again.
    4. The fallback stops either when the following character matches or when
       ``-1`` is reached; in the latter case only ``pattern[0]`` is left to
       compare with ``pattern[i]``.

    The last entry (index ``len(pattern) - 1``) is not needed for matching
    and is deliberately left at ``-1``.

    :param pattern: the pattern to preprocess.
    :return: a list of ``len(pattern)`` integers; an empty list for an empty
        pattern.
    """
    size = len(pattern)
    if size == 0:
        # Nothing to preprocess; indexing table[0] here would raise.
        return []

    table = [-1] * size  # table[0] is always -1: one char has no proper prefix

    # The final position is skipped on purpose (see docstring).
    for i in range(1, size - 1):
        j = table[i - 1]  # longest matching prefix for the previous position
        while j != -1 and pattern[j + 1] != pattern[i]:
            j = table[j]  # fall back to the next-longest matching prefix

        # If the loop stopped with j != -1, the characters are known to be
        # equal; if it stopped at j == -1, pattern[0] still has to be checked
        # against the current character.
        if pattern[j + 1] == pattern[i]:
            j += 1
        table[i] = j

    return table
| 75 | + |
| 76 | + |
if __name__ == '__main__':
    # Demo: search the same pattern with the built-in str.find and with kmp
    # so the two results can be compared by eye.
    haystack = "aabbbbaaabbababbabbbabaaabb"
    needle = "abbabbbabaa"

    print('--- search ---')

    builtin_index = haystack.find(needle)
    print('[Built-in Functions] result:', builtin_index)

    kmp_index = kmp(haystack, needle)
    print('[kmp] result:', kmp_index)
0 commit comments