defBF(s, p): ''' Brute-Force算法实现字符串匹配 ''' indies = [] n = len(s) m = len(p) for i inrange(n - m + 1): index = i for j inrange(m): if s[index] == p[j]: index += 1 else: break if index - i == m: indies.append(i) return indies
defaddWord(self, word): ''' 添加关键词到Tire树中 ''' tmp = self.__root for i inrange(0, len(word)): if word[i] notin tmp.next: tmp.next[word[i]] = Node() tmp = tmp.next[word[i]] tmp.isWord = True
defmake(self): ''' 构建自动机,失效函数 ''' tmpQueue = [] tmpQueue.append(self.__root) while(len(tmpQueue) > 0): temp = tmpQueue.pop() p = None for k, v in temp.next.items(): if temp == self.__root: temp.next[k].fail = self.__root else: p = temp.fail while p isnotNone: if k in p.next: temp.next[k].fail = p.next[k] break p = p.fail if p isNone : temp.next[k].fail = self.__root tmpQueue.append(temp.next[k])
defsearch(self, content): p = self.__root result = [] startWordIndex = 0 endWordIndex = -1 currentPosition = 0
while currentPosition < len(content): word = content[currentPosition] # 检索状态机,直到匹配 while word notin p.nextand p != self.__root: p = p.fail
if word in p.next: if p == self.__root: # 若当前节点是根且存在转移状态,则说明是匹配词的开头,记录词的起始位置 startWordIndex = currentPosition # 转移状态机的状态 p = p.next[word] else: p = self.__root
if p.isWord: # 若状态为词的结尾,则把词放进结果集 result.append((startWordIndex, currentPosition))
currentPosition += 1 return result
defreplace(self, content): replacepos = self.search(content) result = content for i in replacepos: result = result[0:i[0]] + (i[1] - i[0] + 1) * u'*' + content[i[1] + 1:] return result
if __name__ == '__main__': ah = Ahocorasick() text = '中国科学技术大学位于安徽合肥,是一所理工科大学,又称为南七技校。' patterns = '科学 大学 理工科 技校' words = patterns.split(' ') for w in words: ah.addWord(w) ah.make() results = ah.search(text) print(results) iflen(results) == 0: print('No find.') else: print(len(results),' matching results are listed below.') print('-------' + '-'*len(text) + '-------') print(text) count = 0 for site in results: w = text[site[0]:site[1]+1] count += 1 print(' '*site[0] + w + ' '*(len(text)-site[1]) + ' ' + str(site[0]) + ' ' + str(count)) print('-------' + '-'*len(text) + '-------')