Skip to content

Instantly share code, notes, and snippets.

@ls0f
Last active November 30, 2015 07:20
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ls0f/55cd7a020b928e55920d to your computer and use it in GitHub Desktop.
Save ls0f/55cd7a020b928e55920d to your computer and use it in GitHub Desktop.
词频统计&DFA关键词匹配
# http://vdisk.weibo.com/s/azYuqTtsWXaEc
def analysis(fn):
    """Print a word-frequency report for the file at path ``fn``.

    The file is read as raw bytes.  A word is a maximal run of ASCII
    letters, folded to lower case; every other byte is a separator.

    Output: the first line is ``<total words> <distinct words>``; each
    following line is ``<word> <count>``, highest count first.
    Nothing is returned.
    """
    # Function-scope imports keep this block self-contained.
    import re
    from collections import Counter

    with open(fn, 'rb') as f:
        content = f.read()

    # findall() returns every maximal run of letters, including one that
    # reaches the very last byte, so a trailing word with no separator
    # after it is still counted.
    words = [
        raw.decode('ascii')
        for raw in re.findall(b'[A-Za-z]+', content.lower())
    ]
    counts = Counter(words)

    print("%d %d" % (len(words), len(counts)))
    for word, count in counts.most_common():
        print("%s %d" % (word, count))
if __name__ == "__main__":
    import sys

    # Exit with a usage hint rather than an IndexError traceback when the
    # file path argument is missing.
    if len(sys.argv) < 2:
        sys.exit("usage: %s FILE" % sys.argv[0])
    analysis(sys.argv[1])
class Node:
    """A singly linked list cell: a payload ``p`` plus a ``next`` link."""

    def __init__(self, p):
        # A new node starts detached; the caller assigns ``next`` to link it.
        self.p, self.next = p, None
def init_node_from_str(string):
    """Build a linked list of ``Node`` objects, one per element of ``string``.

    ``string`` must be non-empty (checked with an assertion).  Returns the
    head node; the last node's ``next`` is ``None``.
    """
    assert len(string) > 0
    chars = iter(string)
    # The first element becomes the head; the rest are appended at the tail.
    head = tail = Node(next(chars))
    for ch in chars:
        tail.next = Node(ch)
        tail = tail.next
    return head
def print_node(root):
    """Return the concatenated ``p`` values of the list starting at ``root``.

    Despite the name, nothing is printed; the string is only built and
    returned.  A falsy ``root`` (for example ``None``) yields ``""``.
    """
    pieces = []
    node = root
    while node:
        pieces.append(node.p)
        node = node.next
    return "".join(pieces)
def search(content, rules):
    """Count and print the occurrences of each rule in ``content``.

    ``rules`` is a sequence of linked-list head nodes; each node exposes a
    ``p`` value that is compared against one element of ``content`` and a
    ``next`` link.  Every start position in ``content`` is tried against
    every rule, so overlapping occurrences are all counted.

    Output: one line ``<rule text> <count>`` per rule that matched at least
    once, in the order the rules were given.  Rules that never match are
    not printed.  Nothing is returned.
    """
    content_len = len(content)
    # One counter per rule, indexed like ``rules``, so the report order is
    # the rule order rather than dict iteration order.
    counts = [0] * len(rules)

    for start in range(content_len):
        for r_index, rule in enumerate(rules):
            node = rule
            pos = start
            # Walk the rule and the content in lockstep while they agree.
            while (node is not None and pos < content_len
                   and node.p == content[pos]):
                node = node.next
                pos += 1
            # Running off the end of the rule means every node matched.
            if node is None:
                counts[r_index] += 1

    for rule, count in zip(rules, counts):
        if count:
            print("%s %d" % (print_node(rule), count))
if __name__ == '__main__':
    # Demo run: each keyword occurs twice in the sample text.
    rules = list(map(init_node_from_str, ("fuck", "porn")))
    search("fuck you, porn, yfuckkkk, pornnnp", rules)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment