"""Extract the base (lemma) form of every English word in a text file,
lowercase it, and count occurrences.

Usage:
    python to_word_list.py path_to_text.txt min_word_count

Example:
    python to_word_list.py path_to_text.txt 2

shows words that appear at least twice.
"""
import os
import re
import sys
from collections import Counter

from nltk.stem import WordNetLemmatizer
def clean1(txt):
    """Collapse each run of characters outside [0-9a-zA-Z] into one space."""
    non_alnum = re.compile('[^0-9a-zA-Z]+')
    return non_alnum.sub(' ', txt)
def clean(txt):
    """Collapse each run of non-alphabetic characters into one space."""
    letters_only = re.compile('[^a-zA-Z]+')
    return letters_only.sub(' ', txt)
def load_data(path):
    """Read the whole text file at *path* and return it as one string.

    The file is decoded as UTF-8.

    Args:
        path: filesystem path to the text file.

    Returns:
        The file's full contents as a str.
    """
    # The original called os.path.join(path) with a single argument,
    # which is a no-op; open the path directly.
    with open(path, 'r', encoding='utf-8') as f:
        return f.read()
def stem_counter(wc):
    """Merge a word-count mapping into lemma counts.

    Each word is lemmatized first as a noun, then — only if that left the
    word unchanged — as a verb, then as an adverb.  Counts of words that
    reduce to the same lemma are summed.

    Args:
        wc: mapping of word -> count (e.g. a collections.Counter).

    Returns:
        dict mapping lowercased lemma -> summed count.
    """
    lemmatizer = WordNetLemmatizer()
    counter = {}
    for word, count in wc.items():
        lemma = lemmatizer.lemmatize(word, 'n')
        # Bug fix: the original tested len(lemma) == len(word) to decide
        # whether lemmatization changed the word.  That throws away a
        # correct same-length lemma (e.g. 'feet' -> 'foot') and retries
        # the raw word as a verb/adverb.  Compare for equality instead.
        if lemma == word:
            lemma = lemmatizer.lemmatize(word, 'v')
        if lemma == word:
            lemma = lemmatizer.lemmatize(word, 'r')
        lemma = lemma.lower()
        counter[lemma] = counter.get(lemma, 0) + count
    return counter
def sort_counter(wc, min_c=3, max_c=500, min_w_len=2):
    """Filter a word-count mapping and order it for display.

    Keeps words strictly longer than *min_w_len* characters whose count
    lies within [min_c, max_c], sorted by descending count and then
    alphabetically within equal counts.

    Args:
        wc: mapping of word -> count.
        min_c: minimum count (inclusive).
        max_c: maximum count (inclusive).
        min_w_len: words with len(word) <= min_w_len are dropped.

    Returns:
        list of (word, count) tuples.
    """
    kept = [
        (w, c)
        for w, c in wc.items()
        if len(w) > min_w_len and min_c <= c <= max_c
    ]
    # The original built per-count buckets and iterated counts in reverse
    # sorted order; a single sort on a composite key produces the exact
    # same ordering with less machinery.
    kept.sort(key=lambda pair: (-pair[1], pair[0]))
    return kept
def main(fname, min_c, max_c):
    """Print lemma frequencies for the text file *fname*, grouped by count.

    Words are cleaned to letters only, lowercased, lemmatized, filtered to
    counts in [min_c, max_c], then printed under a '# <count> 次' header
    for each distinct count.
    """
    raw = load_data(fname)
    tokens = [t.lower() for t in clean(raw).split()]
    lemma_counts = stem_counter(Counter(tokens))
    last_count = 0
    for word, count in sort_counter(lemma_counts, min_c, max_c):
        if count != last_count:
            last_count = count
            print('#', count, '次')
        print(word)
if __name__ == '__main__':
    # Guard so importing this module does not run the CLI.
    fname = sys.argv[1]
    # The original used the fragile `cond and x or y` idiom, which falls
    # through to the default whenever x is falsy (e.g. an explicit "0");
    # a conditional expression is both correct and clearer.
    min_c = int(sys.argv[2]) if len(sys.argv) > 2 else 0
    max_c = int(sys.argv[3]) if len(sys.argv) > 3 else 1000
    main(fname, min_c, max_c)