BrikerMan/nlp_get_all_non_chinese_tokens.py

## nlp_get_all_non_chinese_tokens.py
# encoding: utf-8
"""
@author: BrikerMan
@contact: eliyar917@gmail.com
@blog: https://eliyar.biz

@version: 1.0
@license: Apache Licence
@file: pre_process
@time: 2018/11/18

"""
import re
import os
import logging
import pathlib
from os import walk
from os.path import splitext
from os.path import join
from typing import List
import pandas as pd
from tabulate import tabulate
import shutil

# Matches one character in the range U+4E00..U+9FA5; used below to decide
# whether a character counts as Chinese text.
chinese_regex = re.compile('[\u4e00-\u9fa5]')

# Punctuation marks that the functions below count per file and use as labels.
TARGET_CHARS = list('，。】【、：“”；》《○）（？')


def get_all_txt_files(path: str) -> List[str]:
    """
    Recursively collect the paths of all ``.txt`` files below a directory.

    The extension check is case-insensitive, so ``.TXT`` files are included.

    :param path: directory to walk
    :return: one path per matching file, each built by joining the directory
             reported by ``os.walk`` with the file name
    """
    return [
        join(folder, name)
        for folder, _subdirs, names in walk(path)
        for name in names
        if splitext(name)[1].lower() == ".txt"
    ]


def get_all_non_chinese_tokens(path):
    """
    Count every non-Chinese character in the ``.txt`` files under a directory.

    Characters matched by ``chinese_regex`` are removed from each file's text;
    every remaining character except the plain space ``' '`` is tallied.
    Other whitespace, such as line breaks, is therefore counted as well.

    :param path: directory that is searched recursively for ``.txt`` files
    :return: dict mapping each character to its number of occurrences
    """
    tokens2count = {}
    for file in get_all_txt_files(path):
        # Context manager so the handle is closed even if decoding fails.
        with open(file, 'r', encoding='utf-8') as handle:
            tokens = chinese_regex.sub('', handle.read())
        for token in tokens:
            if token != ' ':
                tokens2count[token] = tokens2count.get(token, 0) + 1
    return tokens2count


def get_mark2_count(path='/Users/brikerman/Downloads/殆知阁古代文献藏书/易藏', min_count=7000):
    """
    Print the frequency table of non-Chinese characters and the frequent ones.

    Prints the full table sorted by descending count, then the list of
    characters whose count is at least ``min_count``.

    :param path: directory to scan; defaults to the location that was
                 previously hard-coded in this function
    :param min_count: minimum number of occurrences for a character to be
                      included in the second printed list
    :return: None
    """
    t2c = get_all_non_chinese_tokens(path)
    # Naming the columns up front keeps the frame usable when no tokens were
    # found; selecting them afterwards from an empty frame raised KeyError.
    df = pd.DataFrame(list(t2c.items()), columns=['token', 'count'])
    df = df.sort_values('count', ascending=False)
    print(df)

    frequent = df[df['count'] >= min_count]['token'].tolist()
    print(frequent)


def get_file_info(path):
    """
    Build per-file statistics for every ``.txt`` file under a directory.

    For each file the Chinese characters (``chinese_regex``) and punctuation
    marks (``TARGET_CHARS``) are counted; all other characters are ignored.
    The table is written to ``file_info.csv`` in the current working
    directory, printed with ``tabulate``, and returned.

    Columns:
      * ``token_count``   - ``chinese_count + mark_count``
      * ``chinese_count`` - number of Chinese characters
      * ``mark_count``    - number of punctuation marks
      * ``mark_list``     - distinct marks, space separated, in first-seen order
      * ``mark_rate``     - ``mark_count / token_count`` (0.0 if nothing counted)
      * ``file``          - file path with the leading ``path`` removed

    :param path: directory that is searched recursively for ``.txt`` files
    :return: pandas DataFrame with one row per file
    """
    columns = ['token_count', 'chinese_count', 'mark_count',
               'mark_list', 'mark_rate', 'file']
    fileinfo = []
    for file in get_all_txt_files(path):
        # Context manager so the handle is closed even if decoding fails.
        with open(file, 'r', encoding='utf-8') as handle:
            text = handle.read()

        chinese_count = 0
        marks = []
        # Whitespace and line breaks match neither test, so scanning the whole
        # text gives the same counts as scanning stripped lines one by one.
        for char in text:
            if chinese_regex.match(char):
                chinese_count += 1
            elif char in TARGET_CHARS:
                marks.append(char)

        mark_count = len(marks)
        token_count = chinese_count + mark_count
        fileinfo.append({
            'token_count': token_count,
            'chinese_count': chinese_count,
            'mark_count': mark_count,
            # dict.fromkeys de-duplicates while keeping a stable order.
            'mark_list': ' '.join(dict.fromkeys(marks)),
            # A file without any counted character used to raise
            # ZeroDivisionError; report a rate of 0.0 instead.
            'mark_rate': mark_count / token_count if token_count else 0.0,
            # Every walked path starts with `path`; slice off only that prefix
            # rather than replacing every occurrence of the string.
            'file': file[len(path):],
        })
    # Explicit columns keep the schema intact when no files were found.
    df = pd.DataFrame(fileinfo, columns=columns)
    df.to_csv('file_info.csv')

    print(tabulate(df, headers='keys', tablefmt='psql'))
    return df


def copy_files(data_path, target_path):
    """
    Sort the ``.txt`` files under ``data_path`` into two folders by mark rate.

    Files whose ``mark_rate`` (see ``get_file_info``) is at least 0.1 are
    copied to ``<target_path>/marked``; all others go to
    ``<target_path>/unmarked``. The relative directory layout of the source
    tree is kept below those folders.

    :param data_path: directory containing the source ``.txt`` files
    :param target_path: directory in which ``marked`` and ``unmarked`` are created
    :return: None
    """
    marked_path = os.path.join(target_path, 'marked')
    unmarked_path = os.path.join(target_path, 'unmarked')
    pathlib.Path(marked_path).mkdir(parents=True, exist_ok=True)
    pathlib.Path(unmarked_path).mkdir(parents=True, exist_ok=True)

    df = get_file_info(data_path)
    if df.empty:
        # Nothing was found, so there is nothing to copy.
        return

    # When target_path lies inside data_path, a second run would otherwise
    # pick up the previously copied files and nest them again. The trailing
    # separator stops ".../marked_old" from matching ".../marked".
    output_roots = tuple(
        os.path.join(os.path.abspath(folder), '')
        for folder in (marked_path, unmarked_path)
    )
    separators = os.sep + (os.altsep or '')

    for file_name, mark_rate in zip(df['file'], df['mark_rate']):
        # `file_name` is the walked path minus the `data_path` prefix, so plain
        # concatenation restores the exact source path.
        origin = data_path + file_name
        if os.path.abspath(origin).startswith(output_roots):
            continue
        destination_root = marked_path if mark_rate >= 0.1 else unmarked_path
        target = os.path.join(destination_root, file_name.lstrip(separators))
        # Files from sub-directories need their parent folder created first.
        pathlib.Path(target).parent.mkdir(parents=True, exist_ok=True)
        shutil.copy(origin, target)


def format_line(text):
    """
    Turn one line of text into parallel character and label lists.

    Each Chinese character becomes one entry with the label ``'O'``. A mark
    from ``TARGET_CHARS`` overwrites the label of the Chinese character that
    precedes it. Marks seen before any Chinese character, and all other
    characters, are dropped.

    :param text: the line to convert
    :return: tuple ``(characters, labels)`` of two equally long lists
    """
    characters, labels = [], []
    for symbol in text:
        if chinese_regex.match(symbol):
            characters.append(symbol)
            labels.append('O')
            continue
        # A mark is attached to the character just before it, if there is one.
        if labels and symbol in TARGET_CHARS:
            labels[-1] = symbol
    return characters, labels


def format_all_file(path):
    """
    Format every non-blank line of the ``.txt`` files under a directory.

    Each line is stripped and passed to ``format_line``; the results are
    written to ``formatted.csv`` inside ``path`` with the columns ``raw``
    (the stripped line), ``x`` (characters), ``y`` (labels) and ``length``.

    :param path: directory that is searched recursively for ``.txt`` files
    :return: None
    """
    data = []
    for file in get_all_txt_files(path):
        # Context manager so the handle is closed even if decoding fails.
        with open(file, 'r', encoding='utf-8') as handle:
            lines = handle.read().splitlines()
        for line in lines:
            line = line.strip()
            if not line:
                continue
            x, y = format_line(line)
            if len(x) == len(y):
                data.append({
                    'raw': line,
                    'x': x,
                    'y': y,
                    'length': len(x),
                })
            else:
                # Defensive check; the logged text means "formatting failed".
                logging.error("格式化失败 {}".format(line))
    # Explicit columns keep the CSV header intact when no lines were collected.
    df = pd.DataFrame(data, columns=['raw', 'x', 'y', 'length'])
    df.to_csv(os.path.join(path, 'formatted.csv'))


if __name__ == '__main__':
    # Directory that holds the source data.
    path = '/Users/brikerman/Downloads/殆知阁古代文献藏书/易藏'
    # Classify the files and separate out the ones that already carry marks.
    copy_files(path, path)
    # Format the files that were classified as marked.
    format_all_file(path + '/marked')
	# encoding: utf-8
	"""
	@author: BrikerMan
	@contact: eliyar917@gmail.com
	@blog: https://eliyar.biz

	@version: 1.0
	@license: Apache Licence
	@file: pre_process
	@time: 2018/11/18

	"""
	import re
	import os
	import logging
	import pathlib
	from os import walk
	from os.path import splitext
	from os.path import join
	from typing import List
	import pandas as pd
	from tabulate import tabulate
	import shutil

	# Matches one character in the range U+4E00..U+9FA5; used below to decide
	# whether a character counts as Chinese text.
	chinese_regex = re.compile('[\u4e00-\u9fa5]')

	# Punctuation marks that the functions below count per file and use as labels.
	TARGET_CHARS = list('，。】【、：“”；》《○）（？')


	def get_all_txt_files(path: str) -> List[str]:
	    """
	    Recursively collect the paths of all ``.txt`` files below a directory.

	    The extension check is case-insensitive, so ``.TXT`` files are included.

	    :param path: directory to walk
	    :return: one path per matching file, each built by joining the directory
	             reported by ``os.walk`` with the file name
	    """
	    return [
	        join(folder, name)
	        for folder, _subdirs, names in walk(path)
	        for name in names
	        if splitext(name)[1].lower() == ".txt"
	    ]


	def get_all_non_chinese_tokens(path):
	    """
	    Count every non-Chinese character in the ``.txt`` files under a directory.

	    Characters matched by ``chinese_regex`` are removed from each file's text;
	    every remaining character except the plain space ``' '`` is tallied.
	    Other whitespace, such as line breaks, is therefore counted as well.

	    :param path: directory that is searched recursively for ``.txt`` files
	    :return: dict mapping each character to its number of occurrences
	    """
	    tokens2count = {}
	    for file in get_all_txt_files(path):
	        # Context manager so the handle is closed even if decoding fails.
	        with open(file, 'r', encoding='utf-8') as handle:
	            tokens = chinese_regex.sub('', handle.read())
	        for token in tokens:
	            if token != ' ':
	                tokens2count[token] = tokens2count.get(token, 0) + 1
	    return tokens2count


	def get_mark2_count(path='/Users/brikerman/Downloads/殆知阁古代文献藏书/易藏', min_count=7000):
	    """
	    Print the frequency table of non-Chinese characters and the frequent ones.

	    Prints the full table sorted by descending count, then the list of
	    characters whose count is at least ``min_count``.

	    :param path: directory to scan; defaults to the location that was
	                 previously hard-coded in this function
	    :param min_count: minimum number of occurrences for a character to be
	                      included in the second printed list
	    :return: None
	    """
	    t2c = get_all_non_chinese_tokens(path)
	    # Naming the columns up front keeps the frame usable when no tokens were
	    # found; selecting them afterwards from an empty frame raised KeyError.
	    df = pd.DataFrame(list(t2c.items()), columns=['token', 'count'])
	    df = df.sort_values('count', ascending=False)
	    print(df)

	    frequent = df[df['count'] >= min_count]['token'].tolist()
	    print(frequent)


	def get_file_info(path):
	    """
	    Build per-file statistics for every ``.txt`` file under a directory.

	    For each file the Chinese characters (``chinese_regex``) and punctuation
	    marks (``TARGET_CHARS``) are counted; all other characters are ignored.
	    The table is written to ``file_info.csv`` in the current working
	    directory, printed with ``tabulate``, and returned.

	    Columns: ``token_count`` (``chinese_count + mark_count``),
	    ``chinese_count``, ``mark_count``, ``mark_list`` (distinct marks, space
	    separated, in first-seen order), ``mark_rate``
	    (``mark_count / token_count``, 0.0 if nothing was counted) and ``file``
	    (file path with the leading ``path`` removed).

	    :param path: directory that is searched recursively for ``.txt`` files
	    :return: pandas DataFrame with one row per file
	    """
	    columns = ['token_count', 'chinese_count', 'mark_count',
	               'mark_list', 'mark_rate', 'file']
	    fileinfo = []
	    for file in get_all_txt_files(path):
	        # Context manager so the handle is closed even if decoding fails.
	        with open(file, 'r', encoding='utf-8') as handle:
	            text = handle.read()

	        chinese_count = 0
	        marks = []
	        # Whitespace and line breaks match neither test, so scanning the whole
	        # text gives the same counts as scanning stripped lines one by one.
	        for char in text:
	            if chinese_regex.match(char):
	                chinese_count += 1
	            elif char in TARGET_CHARS:
	                marks.append(char)

	        mark_count = len(marks)
	        token_count = chinese_count + mark_count
	        fileinfo.append({
	            'token_count': token_count,
	            'chinese_count': chinese_count,
	            'mark_count': mark_count,
	            # dict.fromkeys de-duplicates while keeping a stable order.
	            'mark_list': ' '.join(dict.fromkeys(marks)),
	            # A file without any counted character used to raise
	            # ZeroDivisionError; report a rate of 0.0 instead.
	            'mark_rate': mark_count / token_count if token_count else 0.0,
	            # Every walked path starts with `path`; slice off only that prefix
	            # rather than replacing every occurrence of the string.
	            'file': file[len(path):],
	        })
	    # Explicit columns keep the schema intact when no files were found.
	    df = pd.DataFrame(fileinfo, columns=columns)
	    df.to_csv('file_info.csv')

	    print(tabulate(df, headers='keys', tablefmt='psql'))
	    return df


	def copy_files(data_path, target_path):
	    """
	    Sort the ``.txt`` files under ``data_path`` into two folders by mark rate.

	    Files whose ``mark_rate`` (see ``get_file_info``) is at least 0.1 are
	    copied to ``<target_path>/marked``; all others go to
	    ``<target_path>/unmarked``. The relative directory layout of the source
	    tree is kept below those folders.

	    :param data_path: directory containing the source ``.txt`` files
	    :param target_path: directory in which ``marked`` and ``unmarked`` are created
	    :return: None
	    """
	    marked_path = os.path.join(target_path, 'marked')
	    unmarked_path = os.path.join(target_path, 'unmarked')
	    pathlib.Path(marked_path).mkdir(parents=True, exist_ok=True)
	    pathlib.Path(unmarked_path).mkdir(parents=True, exist_ok=True)

	    df = get_file_info(data_path)
	    if df.empty:
	        # Nothing was found, so there is nothing to copy.
	        return

	    # When target_path lies inside data_path, a second run would otherwise
	    # pick up the previously copied files and nest them again. The trailing
	    # separator stops ".../marked_old" from matching ".../marked".
	    output_roots = tuple(
	        os.path.join(os.path.abspath(folder), '')
	        for folder in (marked_path, unmarked_path)
	    )
	    separators = os.sep + (os.altsep or '')

	    for file_name, mark_rate in zip(df['file'], df['mark_rate']):
	        # `file_name` is the walked path minus the `data_path` prefix, so plain
	        # concatenation restores the exact source path.
	        origin = data_path + file_name
	        if os.path.abspath(origin).startswith(output_roots):
	            continue
	        destination_root = marked_path if mark_rate >= 0.1 else unmarked_path
	        target = os.path.join(destination_root, file_name.lstrip(separators))
	        # Files from sub-directories need their parent folder created first.
	        pathlib.Path(target).parent.mkdir(parents=True, exist_ok=True)
	        shutil.copy(origin, target)


	def format_line(text):
	    """
	    Turn one line of text into parallel character and label lists.

	    Each Chinese character becomes one entry with the label ``'O'``. A mark
	    from ``TARGET_CHARS`` overwrites the label of the Chinese character that
	    precedes it. Marks seen before any Chinese character, and all other
	    characters, are dropped.

	    :param text: the line to convert
	    :return: tuple ``(characters, labels)`` of two equally long lists
	    """
	    characters, labels = [], []
	    for symbol in text:
	        if chinese_regex.match(symbol):
	            characters.append(symbol)
	            labels.append('O')
	            continue
	        # A mark is attached to the character just before it, if there is one.
	        if labels and symbol in TARGET_CHARS:
	            labels[-1] = symbol
	    return characters, labels


	def format_all_file(path):
	    """
	    Format every non-blank line of the ``.txt`` files under a directory.

	    Each line is stripped and passed to ``format_line``; the results are
	    written to ``formatted.csv`` inside ``path`` with the columns ``raw``
	    (the stripped line), ``x`` (characters), ``y`` (labels) and ``length``.

	    :param path: directory that is searched recursively for ``.txt`` files
	    :return: None
	    """
	    data = []
	    for file in get_all_txt_files(path):
	        # Context manager so the handle is closed even if decoding fails.
	        with open(file, 'r', encoding='utf-8') as handle:
	            lines = handle.read().splitlines()
	        for line in lines:
	            line = line.strip()
	            if not line:
	                continue
	            x, y = format_line(line)
	            if len(x) == len(y):
	                data.append({
	                    'raw': line,
	                    'x': x,
	                    'y': y,
	                    'length': len(x),
	                })
	            else:
	                # Defensive check; the logged text means "formatting failed".
	                logging.error("格式化失败 {}".format(line))
	    # Explicit columns keep the CSV header intact when no lines were collected.
	    df = pd.DataFrame(data, columns=['raw', 'x', 'y', 'length'])
	    df.to_csv(os.path.join(path, 'formatted.csv'))


	if __name__ == '__main__':
	    # Directory that holds the source data.
	    path = '/Users/brikerman/Downloads/殆知阁古代文献藏书/易藏'
	    # Classify the files and separate out the ones that already carry marks.
	    copy_files(path, path)
	    # Format the files that were classified as marked.
	    format_all_file(path + '/marked')