Skip to content

Instantly share code, notes, and snippets.

@hjeffrey
Last active April 7, 2024 12:14
Show Gist options
  • Save hjeffrey/df08677b4d33753510ced204bfc34ed3 to your computer and use it in GitHub Desktop.
Save hjeffrey/df08677b4d33753510ced204bfc34ed3 to your computer and use it in GitHub Desktop.
把康熙部首替换为简体中文
#!/usr/bin/python
# -*- coding:utf-8 -*-
# 把康熙部首替换为简体中文
import os
import sys
import glob
import json
# 查看编码
def encodingValue(key):
    """Print ``key`` followed by ``-`` and its JSON-encoded form.

    ``json.dumps`` is called with its defaults, so non-ASCII characters
    appear as ``\\uXXXX`` escapes, which exposes the code points of
    look-alike characters.

    Args:
        key: The string to inspect.
    """
    print("-".join((key, json.dumps(key))))
# 加载康熙部首的对应字典
def initKangxiDic():
    """Build the Kangxi-radical replacement table.

    Stores the table in the module-level ``kangxi_map`` (radical code point
    -> ordinary simplified Chinese character) and also returns it.

    Returns:
        dict: The mapping that was stored in ``kangxi_map``.
    """
    global kangxi_map
    # Kangxi radicals block: https://unicode-table.com/cn/blocks/kangxi-radicals/
    # Mapping supplemented from
    # https://raw.githubusercontent.com/furuiyang0715/spider_notes/master/codes/kangxi.json
    # Corrected entries: U+2F1E (enclosure) -> 囗, U+2F92 (see) -> 见,
    # U+2FD5 (flute) -> 龠.
    kangxi_map = {
        "⼀": "一", "⼄": "乙", "⼆": "二", "⼈": "人", "⼉": "儿", "⼊": "入",
        "⼋": "八", "⼏": "几", "⼑": "刀", "⼒": "力", "⼔": "匕", "⼗": "十",
        "⼘": "卜", "⼚": "厂", "⼜": "又", "⼝": "口", "⼞": "囗", "⼟": "土",
        "⼠": "士", "⼣": "夕", "⼤": "大", "⼥": "女", "⼦": "子", "⼨": "寸",
        "⼩": "小", "⼫": "尸", "⼭": "山", "⼯": "工", "⼰": "己", "⼲": "干",
        "⼴": "广", "⼸": "弓", "⼼": "心", "⼽": "戈", "⼿": "手", "⽀": "支",
        "⽂": "文", "⽃": "斗", "⽄": "斤", "⽅": "方", "⽆": "无", "⽇": "日",
        "⽈": "曰", "⽉": "月", "⽊": "木", "⽋": "欠", "⽌": "止", "⽍": "歹",
        "⽏": "毋", "⽐": "比", "⽑": "毛", "⽒": "氏", "⽓": "气", "⽔": "水",
        "⽕": "火", "⽖": "爪", "⽗": "父", "⽚": "片", "⽛": "牙", "⽜": "牛",
        "⽝": "犬", "⽞": "玄", "⽟": "玉", "⽠": "瓜", "⽡": "瓦", "⽢": "甘",
        "⽣": "生", "⽤": "用", "⽥": "田", "⽩": "白", "⽪": "皮", "⽫": "皿",
        "⽬": "目", "⽭": "矛", "⽮": "矢", "⽯": "石", "⽰": "示", "⽲": "禾",
        "⽳": "穴", "⽴": "立", "⽵": "竹", "⽶": "米", "⽸": "缶", "⽹": "网",
        "⽺": "羊", "⽻": "羽", "⽼": "老", "⽽": "而", "⽿": "耳", "⾁": "肉",
        "⾂": "臣", "⾃": "自", "⾄": "至", "⾆": "舌", "⾈": "舟", "⾉": "艮",
        "⾊": "色", "⾍": "虫", "⾎": "血", "⾏": "行", "⾐": "衣", "⾒": "见",
        "⾓": "角", "⾔": "言", "⾕": "谷", "⾖": "豆", "⾚": "赤", "⾛": "走",
        "⾜": "足", "⾝": "身", "⾞": "车", "⾟": "辛", "⾠": "辰", "⾢": "邑",
        "⾣": "酉", "⾤": "采", "⾥": "里", "⾦": "金", "⾧": "长", "⾨": "门",
        "⾩": "阜", "⾪": "隶", "⾬": "雨", "⾭": "青", "⾮": "非", "⾯": "面",
        "⾰": "革", "⾲": "韭", "⾳": "音", "⾴": "页", "⾵": "风", "⾶": "飞",
        "⾷": "食", "⾸": "首", "⾹": "香", "⾺": "马", "⾻": "骨", "⾼": "高",
        "⿁": "鬼", "⿂": "鱼", "⿃": "鸟", "⿄": "卤", "⿅": "鹿", "⿇": "麻",
        "⿉": "黍", "⿊": "黑", "⿍": "鼎", "⿎": "鼓", "⿏": "鼠", "⿐": "鼻",
        "⿒": "齿", "⿓": "龙", "⿔": "龟", "⿕": "龠",
    }
    # Debug aid: pass each value to encodingValue() to inspect its code point.
    return kangxi_map
# 替换康熙部首的文字
def repaceKangxi(fileName, newFileName):
    """Copy ``fileName`` to ``newFileName`` with Kangxi radicals replaced.

    Every key of the module-level ``kangxi_map`` found in the input is
    replaced by its mapped value. The file is processed as raw UTF-8 bytes,
    so the same code runs on Python 2 and Python 3, does not depend on the
    locale encoding, and leaves line endings untouched.

    Args:
        fileName: Path of the UTF-8 text file to read.
        newFileName: Path of the file to write (created or overwritten).

    Raises:
        NameError: If ``kangxi_map`` has not been initialised yet.
        IOError/OSError: If either file cannot be opened.
    """
    def _as_bytes(text):
        # Python 2 literals are already bytes; Python 3 str must be encoded.
        return text if isinstance(text, bytes) else text.encode('utf-8')

    replacements = [(_as_bytes(radical), _as_bytes(hanzi))
                    for radical, hanzi in kangxi_map.items()]

    # Read everything before opening the output, so that passing the same
    # path for input and output still works.
    with open(fileName, 'rb') as src:
        data = src.read()

    # UTF-8 is self-synchronising, so a byte-level replace cannot match in
    # the middle of another character.
    for radical, hanzi in replacements:
        data = data.replace(radical, hanzi)

    with open(newFileName, 'wb') as dst:
        dst.write(data)
def buildOutputPath(path):
    """Return an unused output path next to ``path``.

    The first candidate is ``fix_kangxi_<name>`` in the same directory; if
    that exists, ``fix_kangxi2_<name>``, ``fix_kangxi3_<name>``, ... are
    tried until a free name is found.

    Args:
        path: Path of the input file.

    Returns:
        A path in the same directory as ``path`` that does not exist yet.
    """
    # os.path.join keeps a bare file name relative; plain '+ "/"'
    # concatenation would turn it into a path under the filesystem root.
    folder, name = os.path.split(path)
    candidate = os.path.join(folder, 'fix_kangxi_' + name)
    i = 1
    while os.path.exists(candidate):
        i += 1
        candidate = os.path.join(folder, 'fix_kangxi' + str(i) + '_' + name)
    return candidate


if __name__ == '__main__':
    # Usage: script.py <file>. Anything that is not an existing regular file
    # (missing argument, directory, non-existent path) is rejected.
    if len(sys.argv) < 2:
        print("请输入正确的文件路径")
        sys.exit(1)
    path = sys.argv[1]
    if isinstance(path, bytes):
        # Python 2 passes arguments as bytes; Python 3 already gives str.
        path = path.decode('utf-8')
    if not os.path.isfile(path):
        print("请输入正确的文件路径")
        sys.exit(1)
    initKangxiDic()
    newPath = buildOutputPath(path)
    repaceKangxi(path, newPath)
    print(u"转换完成: " + newPath)
@izyForever
Copy link

Thanks for this work 🤗. It helped me deal with the unpleasant words in my textual data.

@johnniswang
Copy link

Uploading image.png…

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment