Skip to content

Instantly share code, notes, and snippets.

@powersee
Created April 24, 2019 10:29
Show Gist options
  • Save powersee/416fe645b61fe86acca8d90a7b51f2c5 to your computer and use it in GitHub Desktop.
Save powersee/416fe645b61fe86acca8d90a7b51f2c5 to your computer and use it in GitHub Desktop.
批量将文本翻译为中文,代码来源:https://blog.csdn.net/Fly_TheWind/article/details/84011981
#本脚本在 Python3 下运行正常,先安装模块 googletrans 和 tqdm ,然后设置好文本目录和生成文件的放置目录即可
#作用:能把文件夹里面的文本批量翻译成中文。
#输入正确的文本编码很重要,我一开始用 utf-8 一直出错,折腾一番后,猜想难道是文本编码不对?于是在 vim 里面输入 :set fileencoding 知道了我要翻译的文本是 utf-16 ……
from googletrans import Translator
from tqdm import tqdm
import os
import random
import time
import re
# Source directory holding the texts to translate, and the directory that
# receives the generated (translated) files. Edit both before running.
# expanduser("~") is used instead of os.environ['HOME'] so the script does not
# die with KeyError on systems where HOME is not set.
home = os.path.expanduser("~")  # not referenced elsewhere in the visible code
path = "/Users/ver/Desktop/wait"
dest = "/Users/ver/Desktop/done"
# Directory listing is taken once, at start-up.
files = os.listdir(path)
s = []  # not referenced elsewhere in the visible code
# Split a long text into shorter chunks. The original author randomised the
# chunk length out of concern that Google might check the text length; a fixed
# length would probably work as well.
def getText(string):
    """Split *string* into a list of chunks, cutting at newlines.

    While more than 1500 characters remain, the text is cut at the first
    newline found at or after a randomised offset in [1000, 1500). The
    newline stays at the start of the following chunk, so joining the
    returned chunks reproduces *string* exactly. If no newline is found,
    the text is cut at 1500 characters instead.

    Args:
        string: The full text to split.

    Returns:
        A list of str chunks; always contains at least one element
        (``[""]`` for empty input).
    """
    max_len = 1500
    chunks = []
    # The offset is drawn once per call, as in the original code.
    search_from = int(random.random() * 500) + 1000
    while len(string) > max_len:
        index = string.find("\n", search_from)
        if index == -1:
            # str.find reports "not found" as -1 (never None). Slicing with
            # -1 would emit one oversized chunk, so fall back to a hard cut.
            index = max_len
        chunks.append(string[:index])
        string = string[index:]
    chunks.append(string)
    return chunks
# Save the translated text to a file.
def save2file(title, result):
    """Write the translated chunks to the file ``dest + "/" + title``.

    Args:
        title: Name of the output file inside the module-level ``dest``
            directory.
        result: Iterable of translated strings, written back to back with
            no separator added.
    """
    # Explicit UTF-8 so writing Chinese text does not depend on the platform's
    # default encoding. The with-statement closes the file; the original's
    # stray `d.close` (no call parentheses) did nothing and is dropped.
    with open(dest + "/" + title, 'w', encoding='utf-8') as d:
        d.writelines(result)
# Emoji that cannot be recognised make the translation fail, so they are
# filtered out of the text before it is sent.
#
# The ranges are written as real code points. The previous pattern matched
# UTF-16 surrogate pairs (e.g. \ud83d followed by \ude00-\ude4f), which never
# occur in a decoded Python 3 str, so no emoji was actually removed.
# Each alternative keeps its own capturing group, as before.
emoji_pattern = re.compile(
    "([\U0001F600-\U0001F64F])|"  # emoticons
    "([\U0001F300-\U0001F3FF])|"  # symbols & pictographs (1 of 2)
    "([\U0001F400-\U0001F5FF])|"  # symbols & pictographs (2 of 2)
    "([\U0001F680-\U0001F6FF])|"  # transport & map symbols
    "([\U0001F1E0-\U0001F1FF])|"  # flags (iOS)
    # The two alternatives below were needed by the original author's project
    # and may be deleted. NOTE(review): the second one also strips every
    # space and the letters R, e, f, n -- confirm this is wanted for your text.
    "((-{0,1}[{}]-{0,1}))|"
    "([R efn]|)"
    "+", flags=re.UNICODE)
def remove_emoji(text):
    """Return *text* with every match of the module-level ``emoji_pattern`` deleted."""
    cleaned = re.sub(emoji_pattern, '', text)
    return cleaned
# Print the translation progress over the chunks of a single text.
def printProcess(cnt, txt_len, tatal_size, error):
    """Print ``file completed <cnt>/<txt_len>`` followed by a carriage return.

    Args:
        cnt: Number of chunks processed so far.
        txt_len: Total number of chunks.
        tatal_size: Accepted but not used by this function.
        error: Accepted but not used by this function.
    """
    # end="\r" puts the cursor back at the line start so the next call
    # overwrites this line.
    print(f"file completed {cnt}/{txt_len}", end="\r")
# When translating a chunk fails, bisect it to locate the failing part and
# drop only the sentences that cannot be translated.
def binarySearch(text):
    """Translate *text* piecewise after a whole-chunk translation failed.

    The text is split at the first full stop ("。") at or after its midpoint;
    the full stop itself is discarded. Each half is translated to zh-CN on
    its own, and a half that fails again is bisected recursively. A piece
    that fails and cannot be split any further is dropped.

    The original called an undefined ``append(...)``; the resulting NameError
    was swallowed by a bare ``except``, so nothing was ever translated here
    and the function always returned an empty list.

    Args:
        text: The chunk that could not be translated in one request.

    Returns:
        A list with the translations of the pieces that succeeded, in
        source order (possibly empty).
    """
    result = []
    mid = len(text) // 2
    splitIndex = text.find("。", mid)
    if splitIndex == -1 or splitIndex == 0:
        # No usable split point: give up on this piece.
        return result
    for part in (text[:splitIndex], text[splitIndex + 1:]):
        if not part:
            # Nothing to send, e.g. when the full stop was the last character.
            continue
        try:
            translated = Translator().translate(text=part, dest='zh-CN').text
        except Exception:
            result = result + binarySearch(part)
            continue
        result.append(translated)
        # Same randomised pause as getTranslateTextList uses between requests.
        time.sleep(1.2 + random.random())
    return result


# Translate the text.
def getTranslateTextList(txt):
    """Translate every chunk in *txt* to Simplified Chinese.

    Each chunk is stripped with ``remove_emoji`` and then translated. When a
    request fails, the chunk is handed to ``binarySearch`` so that only the
    untranslatable sentences are lost.

    Args:
        txt: List of text chunks, e.g. as returned by ``getText``.

    Returns:
        A list of translated strings in chunk order.
    """
    result = []
    time.sleep(1)
    total = len(txt)
    txtsize = 0
    for cnt, text in enumerate(txt, start=1):
        text = remove_emoji(text)
        txtsize += len(text)
        # Only the request itself is guarded. The original try-block also
        # covered a call that passed an undefined name `error`, so a NameError
        # fired after every successful translation, skipping the pause below.
        try:
            # The original author translated to English; this version targets
            # zh-CN (it was used for Japanese -> Chinese).
            en = Translator().translate(text=text, dest='zh-CN').text
        except Exception:
            result = result + binarySearch(text)
            continue
        result.append(en)
        # printProcess does not use its last argument.
        printProcess(cnt, total, txtsize, None)
        # Randomised pause: the author feared a fixed sleep might still be
        # detected by Google.
        time.sleep(1.2 + random.random())
    return result
## Main script: translate every file found in the source directory.
for file in tqdm(files):
    # os.listdir returns bare names, so the directory test must be made
    # relative to `path`; the original tested against the working directory.
    if os.path.isdir(os.path.join(path, file)):
        continue
    try:
        # The file name is translated to English and used as the output name.
        # This now happens inside the try so that one failed request skips
        # the file instead of aborting the whole batch.
        title = Translator().translate(text=file, dest='en').text
        with open(path + "/" + file, 'r', encoding='utf-16', errors='ignore') as f:
            string = f.read()
        txt = getText(string)
        print("analysis:" + title)
        result = getTranslateTextList(txt)
        save2file(title, result)
    except Exception as e:
        print(str(e))
        continue
    # A very long sleep to keep Google from blocking the IP; lower it if your
    # situation allows. The author reports that 3 got the IP blocked quickly,
    # while 10 caused no problems.
    time.sleep(10)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment