Skip to content

Instantly share code, notes, and snippets.

@ygmpkk
Last active December 20, 2015 05:19
Show Gist options
  • Save ygmpkk/6077939 to your computer and use it in GitHub Desktop.
Save ygmpkk/6077939 to your computer and use it in GitHub Desktop.
提取文本内容的单词
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import sys
import os
import os.path
import re
from collections import OrderedDict
from random import choice
class Words(object):
    """Extract the unique English words contained in a text file.

    Typical use is ``Words().save_file(path)``, which reads ``path``,
    reduces its content to a list of unique lower-case words and writes
    that list, one word per line, to a sibling file whose name is
    prefixed with ``result_``.
    """

    # Matches every single character that is not an ASCII letter.
    _NON_LETTER = re.compile(r'[^a-zA-Z]')

    def __init__(self):
        # Content of the file most recently read by read_file().
        self.file_stream = None
        # The attributes below are initialised here but are not used by
        # any method of this class; they are kept for compatibility.
        self.result = None
        self.result_sort = None
        self.result_dict = None

    def read_file(self, filename):
        """Return the whole content of ``filename``.

        The content is also stored in ``self.file_stream``. If the file
        cannot be opened or read, the error is printed and ``None`` is
        returned (``self.file_stream`` is left unchanged).
        """
        file_stream = None
        try:
            # ``with`` guarantees the handle is closed even if read() fails.
            with open(filename) as fopen:
                file_stream = fopen.read()
            self.file_stream = file_stream
        except IOError as ex:
            print(ex)
        return file_stream

    def write_file(self, filename, stream, append=False):
        """Write ``stream`` to ``filename``.

        ``stream`` may be a string, written as is, or a list/tuple whose
        items are written one per line. Any other type writes nothing.
        When ``append`` is true the data is added to the end of the file,
        otherwise the file is overwritten. I/O errors are printed, not
        raised.
        """
        # 'a' is the append mode; the combined mode 'wa' is not valid.
        mode = 'a' if append else 'w'
        try:
            with open(filename, mode) as fopen:
                if isinstance(stream, str):
                    fopen.write(stream)
                elif isinstance(stream, (list, tuple)):
                    fopen.writelines(str(item) + "\n" for item in stream)
        except IOError as ex:
            print(ex)

    def filter_en(self, stream):
        """Return the unique English words of ``stream``.

        Every character that is not an ASCII letter acts as a separator.
        Words are lower-cased and de-duplicated; the order of first
        appearance is preserved. ``None`` or an empty string yields an
        empty list.
        """
        if not stream:
            return []
        # Replace every non-letter with a space; split() without arguments
        # then drops the empty items produced by consecutive separators.
        tokens = self._NON_LETTER.sub(' ', stream).split()
        # Lower-case before de-duplicating so "Word" and "word" merge.
        lowered = [token.lower() for token in tokens]
        # OrderedDict keeps only the first occurrence of each word.
        return list(OrderedDict.fromkeys(lowered))

    def words_random(self, words):
        """Return one randomly chosen item of the sequence ``words``.

        Raises ``IndexError`` when ``words`` is empty.
        """
        return choice(words)

    def save_file(self, filename, append=False):
        """Extract the words of ``filename`` into a ``result_`` file.

        The output is written next to the input: ``dir/name`` produces
        ``dir/result_name``. Nothing is written when the input file
        cannot be read.
        """
        content = self.read_file(filename)
        if content is None:
            # read_file() has already reported the error.
            return
        stream = self.filter_en(content)
        # Prefix only the base name so paths with directories still work.
        directory, name = os.path.split(filename)
        target = os.path.join(directory, 'result_' + name)
        self.write_file(target, stream, append)
if __name__ == "__main__":
    # Usage: script FILENAME [APPEND]
    word = Words()
    # Test for the optional second argument first: a check for "> 1"
    # placed before it would also match and hide this branch.
    if len(sys.argv) > 2:
        # Command-line arguments are strings, so every non-empty value
        # would be truthy; treat the usual "off" spellings as False.
        append = sys.argv[2].lower() not in ('', '0', 'false', 'no')
        word.save_file(sys.argv[1], append)
    elif len(sys.argv) > 1:
        word.save_file(sys.argv[1])
    else:
        print('args error')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment