Last active
December 20, 2015 05:19
-
-
Save ygmpkk/6077939 to your computer and use it in GitHub Desktop.
提取文本内容的单词
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: utf-8 -*- | |
import sys | |
import os | |
import os.path | |
import re | |
from collections import OrderedDict | |
from random import choice | |
class Words(object):
    """Extract the unique English words contained in a text file.

    Typical use is ``Words().save_file(path)``, which reads ``path``, pulls
    out every run of ASCII letters, lower-cases and de-duplicates them
    (keeping first-seen order) and writes them one per line to a sibling
    file whose name is prefixed with ``result_``.
    """

    # A "word" is a maximal run of ASCII letters; everything else separates.
    _WORD_RE = re.compile(r'[a-zA-Z]+')

    def __init__(self):
        self.file_stream = None  # text returned by the last successful read_file()
        self.result = None
        self.result_sort = None
        self.result_dict = None

    def __del__(self):
        # Drop the held references.  Use pop() rather than ``del self.x`` so
        # that finalising a partially initialised instance cannot raise.
        for name in ('file_stream', 'result', 'result_sort', 'result_dict'):
            self.__dict__.pop(name, None)

    def read_file(self, filename):
        """Return the whole content of ``filename``.

        The content is also stored on ``self.file_stream``.  If the file
        cannot be opened or read, the error is printed and ``None`` is
        returned (``self.file_stream`` is left untouched).
        """
        try:
            # ``with`` guarantees the handle is closed even if read() fails.
            with open(filename) as handle:
                content = handle.read()
        except IOError as ex:
            print(ex)
            return None
        self.file_stream = content
        return content

    def write_file(self, filename, stream, append=False):
        """Write ``stream`` to ``filename``.

        ``stream`` may be a string (written verbatim) or a list (each item
        written on its own line).  Any other type leaves an empty or
        untouched file.  When ``append`` is true the data is added to the
        end of the file instead of replacing it.  I/O errors are printed,
        not raised.
        """
        # 'wa' is not a valid mode; appending needs plain 'a'.
        mode = 'a' if append else 'w'
        try:
            with open(filename, mode) as handle:
                if isinstance(stream, str):
                    handle.write(stream)
                elif isinstance(stream, list):
                    handle.writelines(str(item) + "\n" for item in stream)
        except IOError as ex:
            print(ex)

    def filter_en(self, stream):
        """Return the unique lower-cased English words found in ``stream``.

        Every character outside ``a-z``/``A-Z`` is treated as a separator.
        Words keep the order of their first occurrence.  An empty or
        ``None`` input yields an empty list.
        """
        if not stream:
            return []
        # Keep only letter runs and normalise their case.
        lowered = (word.lower() for word in self._WORD_RE.findall(stream))
        # OrderedDict keys give order-preserving de-duplication.
        return list(OrderedDict.fromkeys(lowered))

    def words_random(self, words):
        """Return one randomly chosen element of the sequence ``words``."""
        return choice(words)

    def save_file(self, filename, append=False):
        """Extract the words of ``filename`` into ``result_<name>``.

        The output file is created next to the input file (same directory,
        base name prefixed with ``result_``).  Nothing is written when the
        input file cannot be read.
        """
        content = self.read_file(filename)
        if content is None:
            # read_file() already reported the error.
            return
        # Prefix only the base name so paths with directories stay valid.
        directory, basename = os.path.split(filename)
        target = os.path.join(directory, 'result_' + basename)
        self.write_file(target, self.filter_en(content), append)
# Command-line entry point:
#   script.py FILE           -> write the words of FILE to result_FILE
#   script.py FILE APPEND    -> same, but append to result_FILE; any
#                               non-empty second argument enables appending
if __name__ == "__main__":
    word = Words()
    arg_count = len(sys.argv)
    # Test the larger count first; otherwise the two-argument form is
    # shadowed by the one-argument check and can never run.
    if arg_count > 2:
        word.save_file(sys.argv[1], sys.argv[2])
    elif arg_count > 1:
        word.save_file(sys.argv[1])
    else:
        print('args error')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment