Last active
November 30, 2015 07:20
-
-
Save ls0f/55cd7a020b928e55920d to your computer and use it in GitHub Desktop.
词频统计&DFA关键词匹配
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# http://vdisk.weibo.com/s/azYuqTtsWXaEc | |
def analysis(fn):
    """Print word-frequency statistics for the file at ``fn``.

    A word is a maximal run of ASCII letters; words are compared
    case-insensitively (they are lower-cased before counting).

    Output, written to stdout:
      * first line:  ``<total words> <distinct words>``
      * then one ``<word> <count>`` line per distinct word, most frequent
        first.

    Args:
        fn: Path of the file to analyse. It is read in binary mode.
    """
    # Function-scope imports: the file has no top-level import block.
    import re
    from collections import Counter

    with open(fn, 'rb') as f:
        raw = f.read()

    # latin-1 maps every byte to exactly one code point, so decoding never
    # fails and ASCII letters come through unchanged. This gives the same
    # text type to work with on Python 2 and Python 3.
    text = raw.decode('latin-1')

    # findall() also yields a word that runs up to end-of-file, which a
    # "flush on the next non-letter" loop would silently drop.
    counts = Counter(word.lower() for word in re.findall(r'[A-Za-z]+', text))
    total_word = sum(counts.values())

    print("%d %d" % (total_word, len(counts)))
    for word, count in counts.most_common():
        print("%s %d" % (word, count))
if __name__ == "__main__":
    import sys

    # Exit with a usage line instead of an IndexError traceback when the
    # file path argument is missing.
    if len(sys.argv) < 2:
        sys.exit("usage: %s FILE" % sys.argv[0])
    analysis(sys.argv[1])
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class Node(object):
    """One cell of a singly linked list; a chain of cells spells a keyword."""

    def __init__(self, p):
        # Payload of this cell (one character of the keyword).
        self.p = p
        # Following cell, or None when this is the last one.
        self.next = None
def init_node_from_str(string):
    """Build a linked list of ``Node`` cells, one per character of ``string``.

    Args:
        string: Non-empty sequence of characters (the keyword).

    Returns:
        The head ``Node``; following ``.next`` visits the characters in
        order and the last cell's ``.next`` is None.

    Raises:
        ValueError: If ``string`` is empty.
    """
    head = None
    tail = None
    for ch in string:
        node = Node(ch)
        if head is None:
            head = node
        else:
            tail.next = node
        tail = node

    if head is None:
        # Raised explicitly rather than via ``assert``: assertions are
        # stripped under ``python -O``, which would let empty input through.
        raise ValueError("init_node_from_str() requires a non-empty string")
    return head
def print_node(root):
    """Return the string spelled by the linked list starting at ``root``.

    Despite the name, nothing is printed: the payloads (``.p``) of the cells
    reached by following ``.next`` are concatenated and returned.

    Args:
        root: Head cell of the list, or None for an empty list.

    Returns:
        The concatenated payloads; ``""`` when ``root`` is None.
    """
    pieces = []
    node = root
    while node:
        pieces.append(node.p)
        node = node.next
    # One join instead of repeated ``+=`` keeps the build linear.
    return "".join(pieces)
def search(content, rules):
    """Count keyword occurrences in ``content`` and print the totals.

    Every start position in ``content`` is tried against every rule, so
    overlapping occurrences are each counted. For each rule that matched at
    least once, a line ``<keyword> <count>`` is printed, in rule order.
    Rules that never match produce no output.

    Args:
        content: Text to scan.
        rules: Sequence of linked-list heads (cells with ``.p`` and
            ``.next``), one list per keyword.
    """
    hit_counts = {}
    content_len = len(content)

    for start in range(content_len):
        for rule_index, rule in enumerate(rules):
            node = rule
            pos = start
            # Walk the keyword and the text in lockstep until they differ,
            # the text ends, or the keyword is exhausted.
            while (node is not None and pos < content_len
                   and node.p == content[pos]):
                node = node.next
                pos += 1
            # Running off the end of the list means every cell matched.
            if node is None:
                hit_counts[rule_index] = hit_counts.get(rule_index, 0) + 1

    # Sorted by rule index so the output order is deterministic.
    for rule_index in sorted(hit_counts):
        print("%s %s" % (print_node(rules[rule_index]),
                         hit_counts[rule_index]))
if __name__ == '__main__':
    # Demo run: each of the two keywords occurs twice in the sample text.
    rules = [init_node_from_str("fuck"), init_node_from_str("porn")]
    search("fuck you, porn, yfuckkkk, pornnnp", rules)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment