Skip to content

Instantly share code, notes, and snippets.

@huhuhang
Created February 20, 2019 07:04
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save huhuhang/0b4b3de032e51217fc72fb931c751707 to your computer and use it in GitHub Desktop.
Save huhuhang/0b4b3de032e51217fc72fb931c751707 to your computer and use it in GitHub Desktop.
文本预处理依次返回:字到序号映射字典、序号到字映射字典、数据集中全部单字、数据集全文。
# *-* coding:utf-8 *-*
'''
modified by ioiogoo
'''
def preprocess_file(path):
    """Read a corpus file and build character <-> index vocabularies.

    Every line is stripped, suffixed with "]" and split on ":". The second
    field is kept only when it is longer than five characters and its sixth
    character is ",". The kept fields are concatenated into the corpus text.
    Characters occurring at most twice in that text are dropped from the
    vocabulary; the rest are ordered by descending count (ties broken by
    code point) and a trailing " " entry is appended.

    Lines without a ":" separator (for example blank lines) are skipped
    rather than raising ``IndexError``, and an empty vocabulary yields just
    the " " entry instead of raising ``ValueError``.

    Args:
        path: Path of the UTF-8 encoded corpus file.

    Returns:
        A 4-tuple ``(word2num, num2word, words, files_content)``:
        ``word2num`` maps character -> index, ``num2word`` maps
        index -> character, ``words`` is the tuple of vocabulary characters
        (ending with " "), and ``files_content`` is the concatenated text.
    """
    from collections import Counter

    # Corpus text content, gathered piecewise and joined once at the end.
    kept = []
    with open(path, 'r', encoding='UTF-8') as f:
        for line in f:
            fields = (line.strip() + "]").split(":")
            if len(fields) < 2:
                # No ":" separator on this line; nothing to extract.
                continue
            x = fields[1]
            # NOTE(review): the filter compares against an ASCII comma;
            # confirm this matches the punctuation used in the corpus.
            if len(x) > 5 and x[5] == ',':
                kept.append(x)
    files_content = ''.join(kept)

    # Drop low-frequency characters (seen two times or fewer).
    counts = Counter(files_content)
    frequent = [(char, n) for char, n in counts.items() if n > 2]
    # Most frequent first; equal counts fall back to code-point order so
    # the resulting indices are deterministic.
    frequent.sort(key=lambda pair: (-pair[1], pair[0]))

    # NOTE: " " is appended unconditionally, so if a space already made it
    # into the vocabulary it appears twice and word2num keeps the last index.
    words = tuple(char for char, _ in frequent) + (" ",)

    # Mapping from word to id, and the reverse.
    word2num = {c: i for i, c in enumerate(words)}
    num2word = dict(enumerate(words))
    return word2num, num2word, words, files_content
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment