相信英语不好的同学一看到全英文资料或英文文献就头疼。很多时候,我们看英文资料都会遇到不认识的单词或句子,于是想到用翻译工具,比如有道词典、百度翻译、谷歌翻译等。我们在看英文Word文档的时候,有时会借助谷歌翻译对一段话进行翻译:复制粘贴一段文字到谷歌翻译,翻译一段,看一段。这样是不是有点麻烦?我们更希望把整篇文档一次性翻译完。
于是我又想到了万能的Python。今天就给大家讲一讲如何用Python对Word文档进行翻译。
1.目录设置
2.定义日志类
3.定义一个基类
4.运用爬虫调用百度翻译,定义百度翻译方法函数baidu_translate
5.运用爬虫调用有道翻译,定义有道翻译方法函数youdao_translate
6.运用爬虫调用谷歌翻译,定义谷歌翻译方法函数google_translate,调用google翻译之前需要用到JavaScript类,这里用Python导入JS类
7.ensemble or consensus:综合方法(或者说一致性方法),即把百度翻译、有道翻译、谷歌翻译集成在一起,依次调用,谁先调用成功就采用谁的结果。俗话说得好,三个臭皮匠顶个诸葛亮(这也是机器学习常用的算法思想,比如随机森林算法根据多棵树的投票结果进行决策)。
8.定义翻译word文档的类,并且继承基类Translate.
代码写好了,接下来就要实战了。这里有两篇英文讲稿或书籍:
运行程序:
源代码:
# -*- coding: utf-8 -*-
"""
Created on Thu Oct 11 17:21:43 2018
@author: weineng.zhou
"""
###############################################################################
# setup
import os

# Root directory; the input, output and log directories below all sit under it.
root = "D:/docx/"
os.chdir(root)  # switch the working directory to the root directory

# Raw strings keep the Windows backslashes literal instead of relying on
# unrecognised escape sequences such as "\d" and "\l".
Doc_In = r"D:\docx\Doc_in"    # directory holding the Word documents to translate
Doc_Out = r"D:\docx\Doc_out"  # directory receiving the translated Word documents
Log_Out = r"D:\docx\log"      # directory receiving the log file
###############################################################################
# Logging class
import datetime


class Log(object):
    """Record and maintain the run log; implemented as a singleton.

    Every ``Log()`` call returns the same instance, but ``__init__`` still
    runs on each call, so it must stay cheap and must not delete anything.
    """

    def __init__(self):
        # Directory this script lives in.
        path = os.path.split(os.path.realpath(__file__))[0]
        # Log_Out is joined onto the script directory; when Log_Out is an
        # absolute path, os.path.join discards the script directory.
        path = os.path.join(path, Log_Out)
        os.makedirs(path, exist_ok=True)
        self.log_path = os.path.join(path, "log.txt")

    def __new__(cls, *args, **kwargs):
        # Singleton: create the shared instance on first use only.
        if not hasattr(Log, "_instance"):
            Log._instance = object.__new__(cls)
        return Log._instance

    def write(self, content):
        """Print *content* with a timestamp prefix and append it to the log file.

        Args:
            content: message text (str) to record.
        """
        # Prefix the message with the current local time.
        time_str = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        content = "[" + time_str + "] -> " + content + "\n"
        # Echo the message to the console.
        print(content)
        # Append the message to the log file.
        with open(self.log_path, "a+", encoding="utf-8") as f:
            f.write(content)

    @staticmethod
    def delete_old_log():
        """Delete the log file left by a previous run, if there is one.

        Declared as a static method so that both ``Log.delete_old_log()`` and
        ``Log().delete_old_log()`` work (the original took no ``self``).
        Only ``log.txt`` is removed; the log directory itself is kept (and
        created when missing).
        """
        path = os.path.split(os.path.realpath(__file__))[0]
        path = os.path.join(path, Log_Out)
        log_file = os.path.join(path, "log.txt")
        if os.path.exists(log_file):
            os.remove(log_file)
        os.makedirs(path, exist_ok=True)


###############################################################################
# Translation base class
class Translate(object):
    """Base class for document translators."""

    def translate(self):
        """Translate the document; the base implementation does nothing."""
        pass
###############################################################################
import urllib.request
import urllib.parse
import requests
import json


# Baidu translation
def baidu_translate(content, type=1):
    """Translate English *content* to Chinese through Baidu's web endpoint.

    Args:
        content: the English text to translate.
        type: unused by the body; kept so existing callers keep working.

    Returns:
        ``(True, text)`` when a non-empty translation was obtained, otherwise
        ``(False, "")``. Each translated entry is followed by a newline.

    Raises:
        Network and parsing errors (e.g. ``urllib.error.URLError``,
        ``KeyError``, ``json.JSONDecodeError``) propagate to the caller.
    """
    baidu_url = "http://fanyi.baidu.com/basetrans"
    data = {
        "from": "en",
        "to": "zh",
        "query": content,
        "transtype": "translang",
        "simple_means_flag": 3,
        # Hard-coded request credentials taken from the original script.
        "sign": "94582.365127",
        "token": "ec980ef090b173ebdff2eea5ffd9a778",
    }
    data = urllib.parse.urlencode(data).encode("utf-8")
    headers = {"User-Agent": "Mozilla/5.0 (Linux; Android 5.1.1; Nexus 6 Build/LYZ28E) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Mobile Safari/537.36"}
    baidu_re = urllib.request.Request(baidu_url, data, headers)
    # The context manager closes the HTTP response even if reading fails.
    with urllib.request.urlopen(baidu_re) as baidu_response:
        baidu_html = baidu_response.read().decode("utf-8")
    target2 = json.loads(baidu_html)
    trans = target2["trans"]
    ret = "".join(item["dst"] + "\n" for item in trans)
    if ret:
        return (True, ret)
    else:
        return (False, ret)
###############################################################################
# Youdao translation
def youdao_translate(content):
    """Translate *content* through Youdao's web endpoint.

    Args:
        content: the text to translate.

    Returns:
        ``(True, text)`` when a non-empty translation was obtained, otherwise
        ``(False, "")``.

    Raises:
        Network and parsing errors propagate to the caller.
    """
    url = "http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule&sessionFrom=https://www.baidu.com/link"
    data = {
        "from": "AUTO",
        "to": "AUTO",
        "smartresult": "dict",
        "client": "fanyideskweb",
        # Hard-coded request credentials taken from the original script.
        "salt": "1500092479607",
        "sign": "d9f9a3aa0a7b34241b3fe30505e5d436",
        "doctype": "json",
        "version": "2.1",
        "keyfrom": "fanyi.web",
        "action": "FY_BY_CL1CKBUTTON",
        "typoResult": "true",
    }
    data["i"] = content
    data = urllib.parse.urlencode(data).encode("utf-8")
    # The context manager closes the HTTP response even if reading fails.
    with urllib.request.urlopen(url, data) as wy:
        html = wy.read().decode("utf-8")
    ta = json.loads(html)
    # The original returned only translateResult[0][0]["tgt"], i.e. the first
    # segment. Join every segment so longer paragraphs are not truncated.
    # NOTE(review): assumes translateResult is a list of paragraphs, each a
    # list of segment dicts, as the original indexing implies - confirm
    # against a live response.
    ret = "".join(
        segment["tgt"]
        for paragraph in ta["translateResult"]
        for segment in paragraph
        if segment.get("tgt")
    )
    if ret:
        return (True, ret)
    else:
        return (False, ret)
###############################################################################
import execjs
# Install with: pip install PyExecJS
# A helper class that compiles the JavaScript used to generate the "tk" token.
class Py4Js():
    """Compile the token-generation JavaScript once and expose it to Python."""

    def __init__(self):
        # TL(a) computes the token for text `a`; RL is its mixing helper.
        self.ctx = execjs.compile("""
        function TL(a) {
            var k = "";
            var b = 406644;
            var b1 = 3293161072;
            var jd = ".";
            var $b = "+-a^+6";
            var Zb = "+-3^+b+-f";
            for (var e = [], f = 0, g = 0; g < a.length; g++) {
                var m = a.charCodeAt(g);
                128 > m ? e[f++] = m : (2048 > m ? e[f++] = m >> 6 | 192 : (55296 == (m & 64512) && g + 1 < a.length && 56320 == (a.charCodeAt(g + 1) & 64512) ? (m = 65536 + ((m & 1023) << 10) + (a.charCodeAt(++g) & 1023),
                e[f++] = m >> 18 | 240, e[f++] = m >> 12 & 63 | 128) : e[f++] = m >> 12 | 224,
                e[f++] = m >> 6 & 63 | 128), e[f++] = m & 63 | 128)
            }
            a = b;
            for (f = 0; f < e.length; f++) a += e[f], a = RL(a, $b);
            a = RL(a, Zb);
            a ^= b1 || 0;
            0 > a && (a = (a & 2147483647) + 2147483648);
            a %= 1E6;
            return a.toString() + jd + (a ^ b)
        };
        function RL(a, b) {
            var t = "a";
            var Yb = "+";
            for (var c = 0; c < b.length - 2; c += 3) {
                var d = b.charAt(c + 2), d = d >= t ? d.charCodeAt(0) - 87 : Number(d), d = b.charAt(c + 1) == Yb ? a >>> d: a << d;
                a = b.charAt(c) == Yb ? a + d & 4294967295 : a ^ d
            }
            return a
        }
        """)

    def getTk(self, text):
        """Return the token string the JavaScript ``TL`` function yields for *text*."""
        return self.ctx.call("TL", text)
###############################################################################
# Google translation
def google_translate(content):
    """Translate English *content* to Chinese through Google's web endpoint.

    Args:
        content: the English text to translate; at most 4891 characters.

    Returns:
        ``(True, text)`` when a non-empty translation was obtained, otherwise
        ``(False, "")``. Over-long input also yields ``(False, "")`` so the
        caller can always index the result (the original returned bare
        ``None`` there).

    Raises:
        Network and parsing errors propagate to the caller.
    """
    # Check the length first so no token is computed for rejected input.
    if len(content) > 4891:
        print("翻译的长度超过限制!!!")
        return (False, "")
    js = Py4Js()
    tk = js.getTk(content)
    param = {"tk": tk, "q": content}
    # Adjacent literals build the URL without the newlines and spaces that a
    # triple-quoted string would embed in the query string.
    url = (
        "http://translate.google.cn/translate_a/single?client=t&sl=en"
        "&tl=zh-CN&hl=zh-CN&dt=at&dt=bd&dt=ex&dt=ld&dt=md&dt=qca&dt=rw&dt=rm&dt=ss"
        "&dt=t&ie=UTF-8&oe=UTF-8&clearbtn=1&otf=1&pc=1&srcrom=0&ssel=0&tsel=0&kc=2"
    )
    result = requests.get(url, params=param)
    # The response is JSON; its first element is a list of segments whose
    # first item is the translated text (or None).
    trans = result.json()[0]
    ret = "".join(segment[0] for segment in trans if segment[0] is not None)
    if ret:
        return (True, ret)
    else:
        return (False, ret)
###############################################################################
def translate_func(content, funcs=None):
    """Translate *content* with the first engine that succeeds.

    The engines are tried in order, round after round, until one returns a
    successful result or nine attempts in total have failed.

    Args:
        content: the text to translate.
        funcs: optional list of engine callables, each taking the text and
            returning ``(ok, text)``. Defaults to Baidu, Google and Youdao.

    Returns:
        The translated text, or ``None`` when every attempt failed (or when
        *funcs* is empty).
    """
    if funcs is None:
        funcs = [baidu_translate, google_translate, youdao_translate]
    if not funcs:
        # Without this guard an empty list would spin in the loop forever.
        return None
    count = 0
    while True:
        for func in funcs:
            ret = (False, "")
            try:
                ret = func(content)
            except Exception:
                # A failing engine must not stop the others from being tried.
                Log().write("调用 %s 方法出现异常" % func.__name__)
            # `ret and ...` tolerates an engine that returns None.
            if ret and ret[0]:
                return ret[1]
            count += 1
            if count >= 9:
                Log().write("以下内容尝试9次仍翻译失败,内容【 %s 】" % content)
                return None
###############################################################################
import docx  # install with: pip install python-docx


class DocxTranslate(Translate):
    """Translate a Word document paragraph by paragraph.

    The output document contains each non-empty source paragraph followed by
    its translation.
    """

    def __init__(self, fileName, fullPath):
        """Store the source location and pick a free output file name.

        Args:
            fileName: name of the source file.
            fullPath: full path of the source file.
        """
        self.fileName = fileName
        self.fullName = fullPath
        self.prepare()

    def translate(self):
        """Translate the document and save the bilingual copy."""
        # Open the source document.
        doc = docx.Document(self.fullName)
        # Create the in-memory output document.
        new_doc = docx.Document()
        i = 0
        for para in doc.paragraphs:
            content = para.text.strip()  # drop surrounding whitespace
            if not content:
                continue
            ret = translate_func(content)
            # translate_func returns a falsy value when every attempt failed.
            trans = ret if ret else "翻译失败"
            new_doc.add_paragraph(content)
            new_doc.add_paragraph(trans)
            i += 1
            # Progress indicator: running count of translated paragraphs.
            print(i, end=" ", flush=True)
        new_doc.save(self.new_fullPath)
        Log().write(self.fileName + "翻译完成,新文档:" + self.new_fullPath)

    def prepare(self):
        """Choose an output name that does not clash with an existing file.

        When the plain name is already taken in Doc_Out, a counter (1, 2, ...)
        is inserted before the extension until a free name is found. Sets
        ``self.new_fileName`` and ``self.new_fullPath``.
        """
        stem, ext = os.path.splitext(self.fileName)
        file_name = stem + ext
        path = self.get_path(Doc_Out, file_name)
        i = 1
        while os.path.exists(path):
            file_name = stem + str(i) + ext
            path = self.get_path(Doc_Out, file_name)
            i = i + 1
        self.new_fileName = file_name
        self.new_fullPath = path

    def get_path(self, *paths):
        """Join *paths* onto the directory containing this script."""
        path = os.path.split(os.path.realpath(__file__))[0]
        return os.path.join(path, *paths)
###############################################################################
def translate_doc(doc):
    """Translate the single document named *doc* located in Doc_In."""
    path = get_path(Doc_In, doc)
    translate = get_translate(doc, path)
    if not translate:
        write_log("根据文件" + doc + "映射翻译对象失败!")
        return
    translate.translate()


def get_translate(doc, path):
    """Return the translator object matching the extension of *doc*.

    Args:
        doc: file name, used only for its extension.
        path: full path handed to the translator.

    Returns:
        A ``DocxTranslate`` for ``.doc`` / ``.docx`` names, otherwise ``None``.
    """
    # NOTE(review): ".doc" is routed to DocxTranslate as well; python-docx
    # documents support for .docx only - confirm legacy .doc files open.
    extend_str = os.path.splitext(doc)[1]
    if extend_str in (".doc", ".docx"):
        return DocxTranslate(doc, path)
    return None


def get_doc():
    """Return the names of the Word files in Doc_In, or None if it is missing."""
    path = get_path(Doc_In)
    if not os.path.exists(path):
        return None
    return [
        name
        for name in os.listdir(path)
        if os.path.splitext(name)[1] in (".doc", ".docx")
    ]


def get_path(*paths):
    """Join *paths* onto the directory containing this script."""
    path = os.path.split(os.path.realpath(__file__))[0]
    return os.path.join(path, *paths)


def write_log(msg):
    """Print *msg* and append it to the log file."""
    Log().write(msg)


def run():
    """Translate every Word document found in Doc_In."""
    # Collect the documents.
    fileList = get_doc()
    if not fileList:
        # Nothing to translate.
        write_log("指定路径文件不存在,不执行翻译,结束程序")
        return
    msg = "提取到" + str(len(fileList)) + "个文档"
    write_log(msg)
    # Create the output directory for the translated documents.
    if not os.path.exists(get_path(Doc_Out)):
        os.makedirs(get_path(Doc_Out))
    for doc in fileList:
        write_log("开始翻译:" + doc)
        # Translate one document.
        translate_doc(doc)
    write_log("翻译完成,请查看Doc_Out文件夹下面的文档")


if __name__ == "__main__":
    t1 = datetime.datetime.now()
    run()
    t2 = datetime.datetime.now()
    print(t1)
    print(t2)
    # total_seconds() covers the whole duration; .seconds drops whole days.
    print("翻译时间:" + str(int((t2 - t1).total_seconds())) + "秒")
翻译前后对比:奥巴马演讲稿:
瓦尔登湖:
我们可以看到,程序会对Word文档进行分段翻译,并且保留原英文,翻译后的文档即是中英文对照,这样看着是不是更舒服呢?有些人或许更想看到PDF,这里刚好有用Python进行Word转PDF的程序:
# -*- coding: utf-8 -*-
"""
Created on Mon Oct 8 11:08:38 2018
@author: weineng.zhou
"""
import os
from win32com.client import gencache, Dispatch, constants

# Raw strings keep the Windows backslashes literal.
infile = r"D:\docx\Doc_Out\瓦尔登湖(英文).docx"
outfile = r"D:\docx\Doc_Out\瓦尔登湖(中英).pdf"

# Load the Word type library so the `constants` used below are available.
gencache.EnsureModule("{00020905-0000-0000-C000-000000000046}", 0, 8, 4)
w = Dispatch("Word.Application")
try:
    doc = w.Documents.Open(infile, ReadOnly=1)
    doc.ExportAsFixedFormat(
        outfile,
        constants.wdExportFormatPDF,
        Item=constants.wdExportDocumentWithMarkup,
        CreateBookmarks=constants.wdExportCreateHeadingBookmarks,
    )
except Exception as exc:
    # Report what actually failed instead of only printing a fixed word.
    print("exception", exc)
finally:
    # Always close Word, even when the export failed.
    w.Quit(constants.wdDoNotSaveChanges)

# The export is considered successful when the PDF file exists.
if os.path.isfile(outfile):
    print("translate success")
else:
    print("translate fail")
# Running this script converts the Word document to PDF automatically.
希望今天讲的东西对大家的学习或工作有所帮助。此篇文章参考了CSDN上一位同学写的文章。原来的同学写了很多模块,新手如果拿到程序直接运行,可能会出问题,并且原文程序我也试过,导入pdf会出错。在这里小编整理成一个程序,便于理解,而且跑此程序没问题。
参考文献:https://www.jianshu.com/p/2d51a87e2926
亲爱的读者们,感谢您花时间阅读本文。如果您对本文有任何疑问或建议,请随时联系我。我非常乐意与您交流。
发表评论:
◎欢迎参与讨论,请在这里发表您的看法、交流您的观点。