"""Extract the base (lemma) form of every English word in a text file,
lowercase it, and count occurrences.

Usage:
    python to_word_list.py path_to_text.txt min_word_count

Example:
    python to_word_list.py path_to_text.txt 2

shows words that appear at least twice.
"""
import os
import re
import sys
from collections import Counter

from nltk.stem import WordNetLemmatizer
def clean1(txt):
    """Collapse each run of characters outside [0-9a-zA-Z] into one space."""
    non_alnum = re.compile('[^0-9a-zA-Z]+')
    return non_alnum.sub(' ', txt)
def clean(txt):
    """Collapse each run of non-alphabetic characters into one space."""
    letters_only = re.compile('[^a-zA-Z]+')
    return letters_only.sub(' ', txt)
def load_data(path):
    """Read the whole text file at *path* and return it as one string.

    The file is decoded as UTF-8.

    Args:
        path: filesystem path to the text file.

    Returns:
        The file's full contents as a str.
    """
    # The original called os.path.join(path) with a single argument,
    # which is a no-op; open the path directly.
    with open(path, 'r', encoding='utf-8') as f:
        return f.read()
def stem_counter(wc):
    """Merge a word-count mapping into lemma counts.

    Each word is lemmatized first as a noun, then — only if that left the
    word unchanged — as a verb, then as an adverb.  Counts of words that
    reduce to the same lemma are summed.

    Args:
        wc: mapping of word -> count (e.g. a collections.Counter).

    Returns:
        dict mapping lowercased lemma -> summed count.
    """
    lemmatizer = WordNetLemmatizer()
    counter = {}
    for word, count in wc.items():
        lemma = lemmatizer.lemmatize(word, 'n')
        # Bug fix: the original tested len(lemma) == len(word) to decide
        # whether lemmatization changed the word.  That throws away a
        # correct same-length lemma (e.g. 'feet' -> 'foot') and retries
        # the raw word as a verb/adverb.  Compare for equality instead.
        if lemma == word:
            lemma = lemmatizer.lemmatize(word, 'v')
        if lemma == word:
            lemma = lemmatizer.lemmatize(word, 'r')
        lemma = lemma.lower()
        counter[lemma] = counter.get(lemma, 0) + count
    return counter
def sort_counter(wc, min_c=3, max_c=500, min_w_len=2):
    """Filter a word-count mapping and order it for display.

    Keeps words strictly longer than *min_w_len* characters whose count
    lies within [min_c, max_c], sorted by descending count and then
    alphabetically within equal counts.

    Args:
        wc: mapping of word -> count.
        min_c: minimum count (inclusive).
        max_c: maximum count (inclusive).
        min_w_len: words with len(word) <= min_w_len are dropped.

    Returns:
        list of (word, count) tuples.
    """
    kept = [
        (w, c)
        for w, c in wc.items()
        if len(w) > min_w_len and min_c <= c <= max_c
    ]
    # The original built per-count buckets and iterated counts in reverse
    # sorted order; a single sort on a composite key produces the exact
    # same ordering with less machinery.
    kept.sort(key=lambda pair: (-pair[1], pair[0]))
    return kept
def main(fname, min_c, max_c):
    """Print lemma frequencies for the text file *fname*, grouped by count.

    Words are cleaned to letters only, lowercased, lemmatized, filtered to
    counts in [min_c, max_c], then printed under a '# <count> 次' header
    for each distinct count.
    """
    raw = load_data(fname)
    tokens = [t.lower() for t in clean(raw).split()]
    lemma_counts = stem_counter(Counter(tokens))
    last_count = 0
    for word, count in sort_counter(lemma_counts, min_c, max_c):
        if count != last_count:
            last_count = count
            print('#', count, '次')
        print(word)
if __name__ == '__main__':
    # Guard so importing this module does not run the CLI.
    fname = sys.argv[1]
    # The original used the fragile `cond and x or y` idiom, which falls
    # through to the default whenever x is falsy (e.g. an explicit "0");
    # a conditional expression is both correct and clearer.
    min_c = int(sys.argv[2]) if len(sys.argv) > 2 else 0
    max_c = int(sys.argv[3]) if len(sys.argv) > 3 else 1000
    main(fname, min_c, max_c)