Skip to content

Instantly share code, notes, and snippets.

@guileen
Created September 21, 2017 13:42
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save guileen/9bb846ad7011fa31ab31e68d964a4c6f to your computer and use it in GitHub Desktop.
Save guileen/9bb846ad7011fa31ab31e68d964a4c6f to your computer and use it in GitHub Desktop.
Grab English words into a vocabulary notebook (单词本)

Features

抽取所有英文单词的基本形式,小写,计数。 (Extracts the base form of every English word, lowercases it, and counts its occurrences.)

Usage

python to_word_list.py path_to_text.txt [min_word_count [max_word_count]]

e.g.

python to_word_list.py path_to_text.txt 2 will show the words that appear at least twice.

import os
from collections import Counter
import re
import sys
from nltk.stem import WordNetLemmatizer
def clean1(txt):
    """Replace each run of non-alphanumeric characters with a single space.

    Only ASCII letters and digits are kept; every maximal run of any other
    characters (punctuation, whitespace, non-ASCII text) becomes one space.

    Args:
        txt: The text to clean.

    Returns:
        The cleaned text.
    """
    non_alnum_run = re.compile('[^0-9a-zA-Z]+')
    return non_alnum_run.sub(' ', txt)
def clean(txt):
    """Replace each run of non-letter characters with a single space.

    Only ASCII letters are kept; every maximal run of any other characters
    (digits, punctuation, whitespace, non-ASCII text) becomes one space.

    Args:
        txt: The text to clean.

    Returns:
        The cleaned text.
    """
    non_letter_run = re.compile('[^a-zA-Z]+')
    return non_letter_run.sub(' ', txt)
def load_data(path):
    """Return the entire contents of the UTF-8 text file at *path*.

    Args:
        path: Location of the file to read.

    Returns:
        The decoded file contents as a single string.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # A single-argument os.path.join() returns its argument unchanged, so the
    # path is handed to open() directly.
    with open(path, 'r', encoding='utf-8') as f:
        return f.read()
def stem_counter(wc, lemmatizer=None):
    """Merge word counts under each word's lemma (base form).

    Every word is lemmatized as a noun, then as a verb, then as an adverb;
    the first part of speech that actually changes the word supplies the
    lemma. The lemma is lower-cased and the counts of all words that share
    a lemma are summed.

    Args:
        wc: Mapping of word -> occurrence count.
        lemmatizer: Object exposing ``lemmatize(word, pos)``. When omitted a
            new ``WordNetLemmatizer`` is created.

    Returns:
        A plain dict mapping lower-cased lemma -> summed count.
    """
    if lemmatizer is None:
        lemmatizer = WordNetLemmatizer()
    counter = {}
    for word, count in wc.items():
        lemma = word
        # Noun, verb, adverb -- no adjective pass is attempted.
        for pos in ('n', 'v', 'r'):
            candidate = lemmatizer.lemmatize(word, pos)
            # Compare the strings themselves: a length comparison treats a
            # same-length lemma (feet -> foot, men -> man) as "unchanged"
            # and lets a later part of speech overwrite it.
            if candidate != word:
                lemma = candidate
                break
        lemma = lemma.lower()
        counter[lemma] = counter.get(lemma, 0) + count
    return counter
def sort_counter(wc, min_c=3, max_c=500, min_w_len=2):
    """Filter word counts and order them by frequency.

    Args:
        wc: Mapping of word -> occurrence count.
        min_c: Smallest count to keep (inclusive).
        max_c: Largest count to keep (inclusive).
        min_w_len: Words of this length or shorter are dropped, i.e. a word
            must be strictly longer than ``min_w_len`` to be kept.

    Returns:
        A list of ``(word, count)`` tuples sorted by count, highest first;
        words with the same count are in ascending alphabetical order.
    """
    kept = [
        (word, count)
        for word, count in wc.items()
        if len(word) > min_w_len and min_c <= count <= max_c
    ]
    # One sort on (-count, word) gives "count descending, word ascending"
    # without building per-count buckets by hand.
    kept.sort(key=lambda pair: (-pair[1], pair[0]))
    return kept
def main(fname, min_c, max_c):
    """Print the lemmatized words of a text file, grouped by frequency.

    The file is loaded, stripped down to ASCII letters, split into
    lower-cased words, counted, merged by lemma and filtered/sorted. Each
    distinct count is announced once with a ``# <count> 次`` header line,
    followed by the words that have that count, one per line.

    Args:
        fname: Path of the text file to analyse.
        min_c: Smallest count a word needs in order to be printed.
        max_c: Largest count a word may have in order to be printed.
    """
    text = load_data(fname)
    tokens = clean(text).split()
    raw_counts = Counter(token.lower() for token in tokens)
    lemma_counts = stem_counter(raw_counts)
    last_count = 0
    for word, count in sort_counter(lemma_counts, min_c, max_c):
        # Emit the header only when a new frequency group starts.
        if last_count != count:
            print('#', count, '次')
            last_count = count
        print(word)
# Script entry point: PATH [MIN_COUNT [MAX_COUNT]].
# Guarded so that importing this module does not read sys.argv or run main().
if __name__ == '__main__':
    if len(sys.argv) < 2:
        # Exit with a usage message rather than an IndexError traceback.
        sys.exit('usage: python {} path_to_text.txt '
                 '[min_word_count [max_word_count]]'.format(sys.argv[0]))
    fname = sys.argv[1]
    # Explicit conditional expressions: the `cond and value or default` form
    # falls through to the default whenever the parsed value is 0.
    min_c = int(sys.argv[2]) if len(sys.argv) > 2 else 0
    max_c = int(sys.argv[3]) if len(sys.argv) > 3 else 1000
    main(fname, min_c, max_c)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment