# ls0f/analysis.py

## analysis.py
# http://vdisk.weibo.com/s/azYuqTtsWXaEc

def analysis(fn):
    """Print a word-frequency report for the file at path ``fn``.

    The file is read as raw bytes. A word is a maximal run of ASCII letters,
    lower-cased before counting; every other byte acts as a separator.

    Output goes to stdout: first a line ``<total words> <distinct words>``,
    then one ``<word> <count>`` line per distinct word, most frequent first.
    The relative order of words with equal counts is not specified.

    Args:
        fn: Path of the file to analyse.
    """
    # Function-scope imports: no top-level import block is visible here.
    import re
    from collections import Counter

    with open(fn, 'rb') as f:
        content = f.read()

    # findall also returns a word that runs right up to end-of-file. The
    # earlier char-by-char scan only flushed a word when the *next* separator
    # arrived, so a trailing word was silently dropped.
    # NOTE(review): the earlier scan used str.isalpha(), which is
    # locale-dependent for Python 2 byte strings; ASCII-only matching is
    # assumed to be the intended behaviour -- confirm.
    words = [w.decode('ascii') for w in re.findall(br'[a-z]+', content.lower())]
    counts = Counter(words)

    # Single-argument print(...) renders identically as a Python 2 print
    # statement and as a Python 3 function call.
    print("%d %d" % (len(words), len(counts)))
    for word, count in counts.most_common():
        print("%s %d" % (word, count))

if __name__ == "__main__":
    import sys

    # Command-line entry point: the first argument is the file to analyse.
    # Exit with a usage message instead of an IndexError traceback when the
    # path is missing; any extra arguments are still ignored.
    if len(sys.argv) < 2:
        sys.exit("usage: %s FILE" % sys.argv[0])
    analysis(sys.argv[1])

## dfa.py

class Node:
    """One node of a singly linked list.

    ``p`` holds the value stored in the node; ``next`` refers to the
    following node and is None until a caller links one in.
    """

    def __init__(self, p):
        # A new node always starts unlinked.
        self.p, self.next = p, None


def init_node_from_str(string):
    """Build a linked list with one Node per element of ``string``.

    Elements are linked in iteration order and the head node is returned.
    ``string`` must be non-empty; an empty argument trips the assertion.
    """
    assert len(string) > 0

    symbols = iter(string)
    # The first element becomes the head; every later one is appended to the
    # current tail.
    root = tail = Node(next(symbols))
    for symbol in symbols:
        tail.next = Node(symbol)
        tail = tail.next

    # Explicitly terminate the list at the last node.
    tail.next = None
    return root


def print_node(root):
    """Return the concatenation of every ``p`` value in the list at ``root``.

    Despite the name, nothing is printed: the joined text is returned. A
    falsy ``root`` (such as None) yields the empty string.
    """
    pieces = []
    node = root
    while node:
        pieces.append(node.p)
        node = node.next
    return "".join(pieces)


def search(content, rules):
    """Count and print how often each rule occurs in ``content``.

    Each rule is the head node of a linked list whose nodes expose ``p`` and
    ``next``. Every index of ``content`` is tried as a starting point for
    every rule, so overlapping occurrences are all counted. For each rule
    that matched at least once, one line ``<pattern> <count>`` is printed in
    dict iteration order; rules that never match print nothing.

    Args:
        content: Indexable sequence to scan, e.g. a string.
        rules: Indexable sequence of rule head nodes.
    """
    content_len = len(content)  # loop-invariant, hoisted out of the scan
    rules_dict = {}

    # range/enumerate rather than xrange keeps this runnable on both
    # Python 2 and Python 3.
    for c_index in range(content_len):
        for r_index, r in enumerate(rules):
            cur_node = r
            cur_c_index = c_index
            while (cur_node is not None
                   and cur_c_index < content_len
                   and cur_node.p == content[cur_c_index]):
                cur_node = cur_node.next
                cur_c_index += 1
            # Walking off the end of the list means every node matched.
            if cur_node is None:
                rules_dict[r_index] = rules_dict.get(r_index, 0) + 1

    for index, count in rules_dict.items():
        # Single-argument print(...) gives the same output on Python 2 and 3.
        print("%s %s" % (print_node(rules[index]), count))


if __name__ == '__main__':
    # Demo run: build one linked-list rule per sample word, then scan a fixed
    # sample string and print the per-rule counts.
    rules = list(map(init_node_from_str, ("fuck", "porn")))
    search("fuck you, porn, yfuckkkk, pornnnp", rules)
	# http://vdisk.weibo.com/s/azYuqTtsWXaEc

	def analysis(fn):
		"""Print a word-frequency report for the file at path ``fn``.

		The file is read as raw bytes. A word is a maximal run of ASCII
		letters, lower-cased before counting; every other byte acts as a
		separator.

		Output goes to stdout: first a line ``<total words> <distinct words>``,
		then one ``<word> <count>`` line per distinct word, most frequent
		first. The relative order of words with equal counts is not specified.

		Args:
			fn: Path of the file to analyse.
		"""
		# Function-scope imports: no top-level import block is visible here.
		import re
		from collections import Counter

		with open(fn, 'rb') as f:
			content = f.read()

		# findall also returns a word that runs right up to end-of-file, which
		# a scan that only flushes on the following separator would drop.
		# NOTE(review): ASCII-only matching is assumed to be the intended
		# behaviour (str.isalpha() on Python 2 byte strings is
		# locale-dependent) -- confirm.
		words = [w.decode('ascii') for w in re.findall(br'[a-z]+', content.lower())]
		counts = Counter(words)

		# Single-argument print(...) renders identically as a Python 2 print
		# statement and as a Python 3 function call.
		print("%d %d" % (len(words), len(counts)))
		for word, count in counts.most_common():
			print("%s %d" % (word, count))

	if __name__ == "__main__":
		import sys

		# Command-line entry point: the first argument is the file to analyse.
		# Exit with a usage message instead of an IndexError traceback when
		# the path is missing; any extra arguments are still ignored.
		if len(sys.argv) < 2:
			sys.exit("usage: %s FILE" % sys.argv[0])
		analysis(sys.argv[1])

	class Node:
		"""One node of a singly linked list.

		``p`` holds the value stored in the node; ``next`` refers to the
		following node and is None until a caller links one in.
		"""

		def __init__(self, p):
			self.p = p
			self.next = None


	def init_node_from_str(string):
		"""Build a linked list with one Node per element of ``string``.

		Elements are linked in iteration order and the head node is returned.
		``string`` must be non-empty; an empty argument trips the assertion.
		"""
		assert len(string) > 0
		root = None
		cur = None
		for s in string:
			node = Node(s)
			if root is None:
				root = node
			else:
				cur.next = node
			cur = node

		# No explicit terminator is needed: Node.__init__ already leaves
		# ``next`` as None on the last node.
		return root


	def print_node(root):
		"""Return the concatenation of every ``p`` value in the list at ``root``.

		Despite the name, nothing is printed: the joined text is returned. A
		falsy ``root`` (such as None) yields the empty string.
		"""
		parts = []
		cur = root
		while cur:
			parts.append(cur.p)
			cur = cur.next
		return "".join(parts)


	def search(content, rules):
		"""Count and print how often each rule occurs in ``content``.

		Each rule is the head node of a linked list whose nodes expose ``p``
		and ``next``. Every index of ``content`` is tried as a starting point
		for every rule, so overlapping occurrences are all counted. For each
		rule that matched at least once, one line ``<pattern> <count>`` is
		printed in dict iteration order; rules that never match print nothing.

		Args:
			content: Indexable sequence to scan, e.g. a string.
			rules: Indexable sequence of rule head nodes.
		"""
		content_len = len(content)  # loop-invariant, hoisted out of the scan
		rules_dict = {}

		# range/enumerate rather than xrange keeps this runnable on both
		# Python 2 and Python 3.
		for c_index in range(content_len):
			for r_index, r in enumerate(rules):
				cur_node = r
				cur_c_index = c_index
				while (cur_node is not None
						and cur_c_index < content_len
						and cur_node.p == content[cur_c_index]):
					cur_node = cur_node.next
					cur_c_index += 1
				# Walking off the end of the list means every node matched.
				if cur_node is None:
					rules_dict[r_index] = rules_dict.get(r_index, 0) + 1

		for index, count in rules_dict.items():
			# Single-argument print(...) gives the same output on Python 2 and 3.
			print("%s %s" % (print_node(rules[index]), count))


	if __name__ == '__main__':
		# Demo run: build one linked-list rule per sample word, then scan a
		# fixed sample string and print the per-rule counts.
		rules = [ init_node_from_str("fuck"), init_node_from_str("porn")]
		search("fuck you, porn, yfuckkkk, pornnnp", rules)