Skip to content

Instantly share code, notes, and snippets.

Last active November 18, 2018 10:43
Show Gist options
  • Save BrikerMan/996bede6edc41bad070c1d4e07d4d58d to your computer and use it in GitHub Desktop.
Save BrikerMan/996bede6edc41bad070c1d4e07d4d58d to your computer and use it in GitHub Desktop.
# encoding: utf-8
"""
@author: BrikerMan
@version: 1.0
@license: Apache Licence
@file: pre_process
@time: 2018/11/18
"""
import re
import os
import logging
import pathlib
from os import walk
from os.path import splitext
from os.path import join
from typing import List
import pandas as pd
from tabulate import tabulate
import shutil
# Matches a single character in the range U+4E00..U+9FA5; used both to strip
# Chinese characters from text and to test individual characters.
chinese_regex = re.compile('[\u4e00-\u9fa5]')

# Punctuation marks that get_file_info counts as "marks" and that format_line
# uses as labels.
TARGET_CHARS = [
    ',', '。', '】', '【', '、',
    ':', '“', '”', ';', '》',
    '《', '○', ')', '(', '?',
]
def get_all_txt_files(path: str) -> List[str]:
    """Recursively collect every ``.txt`` file below *path*.

    The extension comparison is case-insensitive, so ``A.TXT`` is included.

    :param path: directory to walk.
    :return: sorted list of file paths, each built by joining the walked
        directory with the file name. A missing or empty directory yields
        an empty list.
    """
    text_files = [
        join(root, name)
        for root, _dirs, files in walk(path)
        for name in files
        if splitext(name)[1].lower() == ".txt"
    ]
    # os.walk yields entries in an arbitrary, filesystem-dependent order;
    # sorting keeps everything derived from this list reproducible.
    text_files.sort()
    return text_files
def get_all_non_chinese_tokens(path):
    """Count the non-Chinese characters in all ``.txt`` files under *path*.

    Each file is read as UTF-8, every character matched by ``chinese_regex``
    is removed, and the remaining characters are tallied one by one. Plain
    spaces are ignored; every other leftover character (including line
    breaks) is counted.

    :param path: directory searched via ``get_all_txt_files``.
    :return: dict mapping each character to its number of occurrences.
    """
    tokens2count = {}
    for file in get_all_txt_files(path):
        # Context manager so the handle is closed as soon as the file is read.
        with open(file, 'r', encoding='utf-8') as handle:
            leftovers = chinese_regex.sub('', handle.read())
        for token in leftovers:
            if token != ' ':
                tokens2count[token] = tokens2count.get(token, 0) + 1
    return tokens2count
def get_mark2_count(path='/Users/brikerman/Downloads/殆知阁古代文献藏书/易藏', min_count=7000):
    """List the frequent non-Chinese characters of a corpus.

    Counts the non-Chinese characters under *path* with
    ``get_all_non_chinese_tokens``, orders them by descending count and keeps
    those that occur at least *min_count* times.

    :param path: corpus directory; defaults to the path that was previously
        hard-coded in this function.
    :param min_count: minimum number of occurrences for a character to be
        kept; defaults to the previously hard-coded 7000.
    :return: list of the retained characters, most frequent first.
    """
    t2c = get_all_non_chinese_tokens(path)
    data_list = [{'token': k, 'count': v} for k, v in t2c.items()]
    # Naming the columns keeps the selection below valid for an empty corpus.
    df = pd.DataFrame(data_list, columns=['token', 'count'])
    df = df.sort_values('count', ascending=False)
    df = df[df['count'] >= min_count]
    # NOTE(review): the body of the final ``for v in df.values`` loop was not
    # present in the available source; collecting the token of each row and
    # returning the list is an assumption -- confirm against the original.
    t = [row[0] for row in df.values]
    return t
def get_file_info(path):
    """Collect per-file character statistics for the ``.txt`` files under *path*.

    For every file the characters of each stripped line are classified as
    Chinese (matched by ``chinese_regex``), as a mark (member of
    ``TARGET_CHARS``) or ignored. The resulting table is printed with
    ``tabulate`` and returned.

    :param path: directory searched via ``get_all_txt_files``.
    :return: DataFrame with one row per file and the columns
        ``token_count`` (Chinese + mark characters), ``chinese_count``,
        ``mark_count``, ``mark_list`` (distinct marks, space separated),
        ``mark_rate`` (``mark_count / token_count``, ``0.0`` when the file
        has no counted characters) and ``file`` (the file path with the
        leading *path* removed).
    """
    columns = ['token_count', 'chinese_count', 'mark_count',
               'mark_list', 'mark_rate', 'file']
    fileinfo = []
    for file in get_all_txt_files(path):
        info = {
            'token_count': 0,
            'chinese_count': 0,
            'mark_count': 0,
            'mark_list': [],
        }
        with open(file, 'r', encoding='utf-8') as handle:
            lines = handle.read().splitlines()
        for line in lines:
            for char in line.strip():
                if chinese_regex.match(char):
                    info['chinese_count'] += 1
                elif char in TARGET_CHARS:
                    info['mark_count'] += 1
                    # NOTE(review): the statement filling ``mark_list`` was not
                    # present in the available source; recording each mark
                    # seen is inferred from the set/join below.
                    info['mark_list'].append(char)
        info['token_count'] = info['chinese_count'] + info['mark_count']
        # Sorted so the joined string does not depend on set iteration order.
        info['mark_list'] = ' '.join(sorted(set(info['mark_list'])))
        # A file without any counted character would otherwise divide by zero.
        if info['token_count']:
            info['mark_rate'] = info['mark_count'] / info['token_count']
        else:
            info['mark_rate'] = 0.0
        # Strip only the leading directory; str.replace would also remove
        # later occurrences of the same text inside the path.
        info['file'] = file[len(path):] if file.startswith(path) else file
        fileinfo.append(info)
    # Explicit columns keep the frame well-formed when no file was found.
    df = pd.DataFrame(fileinfo, columns=columns)
    print(tabulate(df, headers='keys', tablefmt='psql'))
    return df
def copy_files(data_path, target_path):
    """Copy the ``.txt`` files of *data_path* into ``marked``/``unmarked`` folders.

    Files whose ``mark_rate`` (as reported by ``get_file_info``) is at least
    0.1 are copied below ``<target_path>/marked``; all others go below
    ``<target_path>/unmarked``. The path of each file relative to *data_path*
    is preserved underneath the chosen folder.

    :param data_path: directory containing the source files.
    :param target_path: directory in which the two output folders are created.
    """
    marked_path = os.path.join(target_path, 'marked')
    unmarked_path = os.path.join(target_path, 'unmarked')
    pathlib.Path(marked_path).mkdir(parents=True, exist_ok=True)
    pathlib.Path(unmarked_path).mkdir(parents=True, exist_ok=True)

    df = get_file_info(data_path)
    if df.empty:
        return

    # When target_path lies inside data_path, an earlier run's output is
    # found by the scan as well; those files must not be copied again.
    output_roots = tuple(
        os.path.abspath(folder) + os.sep
        for folder in (marked_path, unmarked_path)
    )
    for file_name, mark_rate in zip(df['file'], df['mark_rate']):
        origin = data_path + file_name
        if os.path.abspath(origin).startswith(output_roots):
            continue
        destination_root = marked_path if mark_rate >= 0.1 else unmarked_path
        target = destination_root + file_name
        # Files from sub-directories need their parent folder to exist first.
        pathlib.Path(target).parent.mkdir(parents=True, exist_ok=True)
        shutil.copy(origin, target)
def format_line(text, empty_label='O'):
    """Split a line into Chinese characters and one label per character.

    Every character matched by ``chinese_regex`` is emitted together with a
    label. A character from ``TARGET_CHARS`` replaces the label of the
    Chinese character emitted most recently, so of several consecutive marks
    only the last one is kept, and marks before the first Chinese character
    are dropped. All other characters are ignored.

    NOTE(review): the statements of the Chinese-character branch were not
    present in the available source. Appending the character and a
    placeholder label is inferred from the surrounding code; the placeholder
    value itself is an assumption, hence the ``empty_label`` parameter.

    :param text: line of text to convert.
    :param empty_label: label given to a character that is not followed by a
        mark.
    :return: tuple ``(characters, labels)`` of two equally long lists.
    """
    target_x = []
    target_label = []
    for char in text:
        if chinese_regex.match(char):
            target_x.append(char)
            target_label.append(empty_label)
        elif char in TARGET_CHARS and target_label:
            target_label[-1] = char
    return target_x, target_label
def format_all_file(path):
    """Format every ``.txt`` file under *path* and write ``formatted.csv``.

    Each non-empty stripped line is passed to ``format_line``. Lines whose
    character and label lists have equal length are stored as a row; any
    other line is reported through ``logging.error``. The rows are written to
    ``formatted.csv`` inside *path*.

    :param path: directory that is searched for input files and receives the
        CSV file.
    """
    data = []
    for file in get_all_txt_files(path):
        with open(file, 'r', encoding='utf-8') as handle:
            lines = handle.read().splitlines()
        for line in lines:
            line = line.strip()
            if not line:
                continue
            x, y = format_line(line)
            if len(x) == len(y):
                data.append({
                    'raw': line,
                    'x': x,
                    'y': y,
                    'length': len(x),
                })
            else:
                logging.error("格式化失败 {}".format(line))
    # Explicit columns so the CSV still has a header when no row was produced.
    df = pd.DataFrame(data, columns=['raw', 'x', 'y', 'length'])
    df.to_csv(os.path.join(path, 'formatted.csv'))
if __name__ == '__main__':
    # Location of the corpus.
    path = '/Users/brikerman/Downloads/殆知阁古代文献藏书/易藏'

    # Classify the files and separate out the ones that are already punctuated.
    copy_files(path, path)

    # Format the punctuated data.
    marked_dir = path + '/marked'
    format_all_file(marked_dir)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment