Skip to content

Instantly share code, notes, and snippets.

@ctkirkman
Created June 17, 2020 23:58
Show Gist options
  • Save ctkirkman/1fbd9e2ac417d40be8be831474dba921 to your computer and use it in GitHub Desktop.
Save ctkirkman/1fbd9e2ac417d40be8be831474dba921 to your computer and use it in GitHub Desktop.
import binascii
import re
# Windows language IDs (plus a few code pages) mapped to the language tag
# that is reported in the "lang" statistics of rtf_to_text().
_RTF_LANGUAGES = {
    "1078": "af",
    "1025": "ar-sa",
    "1027": "ca",
    "932": "ja",
    "936": "zh-cn",
    "950": "zh-tw",
    "1028": "zh-tw",
    "2052": "zh-cn",
    "1029": "cs",
    "1030": "da",
    "1043": "nl-nl",
    "2067": "nl-be",
    "2057": "en-gb",
    "1033": "en-us",
    "1035": "fi",
    "1036": "fr-fr",
    "2060": "fr-be",
    "3084": "fr-ca",
    "4108": "fr-ch",
    "1031": "de-de",
    "2055": "de-ch",
    "1253": "el",
    "1032": "el",
    "1037": "he",
    "1038": "hu",
    "1057": "id",
    "1040": "it-it",
    "1041": "ja",
    "1042": "ko",
    "1044": "no-no",
    "1045": "pl",
    "2070": "pt-pt",
    "1046": "pt-br",
    "1048": "ro",
    "1049": "ru",
    "2074": "sr-sp",
    "1050": "hr",
    "1051": "sk",
    "1034": "es-es",
    "2058": "es-mx",
    "1053": "sv-se",
    "1054": "th",
    "1055": "tr",
    "1066": "vi",
}

# Font-table \fcharsetN values mapped to the Windows code page they imply.
_RTF_CHARSETS = {
    "0": "1252",
    "128": "932",
    "129": "949",
    "134": "936",
    "136": "950",
    "161": "1253",
    "162": "1254",
    "177": "1255",
    "178": "1256",
    "186": "1257",
    "204": "1251",
    "222": "874",
    "238": "1250",
}

# Code page assumed until the document (or the selected font) says otherwise.
_RTF_DEFAULT_CODEPAGE = "1252"

# Control words that stand for a single whitespace character in the output.
_RTF_WHITESPACE_WORDS = {"line": "\n", "par": "\n", "tab": "\t", "cell": "\t"}

# Characters that extend a control word name / its numeric argument.
_RTF_CMD_CHARS = frozenset(
    "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ-"
)
_RTF_DIGITS = frozenset("0123456789")

# Undecodable \'hh bytes are kept (as hex digits) so a following escape can
# complete a multi-byte character; 8 digits = 4 bytes is the longest
# character of the multi-byte codecs Python ships (e.g. UTF-8, GB18030).
_RTF_MAX_PENDING_HEX = 8


def _rtf_finish_command(state):
    r"""Apply the control word collected in *state*, then reset it.

    ``\fN`` (only while at group depth 1) switches the active code page to
    the one recorded for font ``fN``, ``\ansicpgN`` sets it to ``N``, and
    ``\line``/``\par``/``\tab``/``\cell`` append the whitespace they stand
    for.  Any other control word is ignored.
    """
    cmd = state["cmd"]
    if cmd == "f" and state["lvl"] == 1:
        font = state["fonts"].get(cmd + state["arg"], {})
        state["encoding"] = font.get("encoding", _RTF_DEFAULT_CODEPAGE)
    elif cmd == "ansicpg":
        state["encoding"] = state["arg"]
    elif cmd in _RTF_WHITESPACE_WORDS:
        state["out"].append(_RTF_WHITESPACE_WORDS[cmd])
    state["cmd"] = ""
    state["arg"] = ""


def _rtf_decode_pending_hex(state):
    r"""Decode the accumulated ``\'hh`` bytes with the active code page.

    On success the text is appended to the output, counted in the language
    statistics, and the pending bytes are cleared.
    """
    codepage = state["encoding"]
    pending = state["hex"]
    try:
        # Python names Windows code pages "cpNNN"; the bare number is an
        # alias for only some of them.
        text = binascii.unhexlify(pending).decode("cp" + codepage)
    except UnicodeDecodeError:
        # Most likely the lead byte of a multi-byte character: keep it so
        # the next escape can complete it, unless it is already too long
        # to ever become valid.
        if len(pending) >= _RTF_MAX_PENDING_HEX:
            state["hex"] = ""
        return
    except (ValueError, LookupError):
        # Not hex digits at all, or a code page Python has no codec for.
        # Drop the bytes so they cannot corrupt later escapes.
        state["hex"] = ""
        return

    lang = _RTF_LANGUAGES.get(codepage, "en-us")
    counts = state["languages"]
    counts[lang] = counts.get(lang, 0) + len(text)
    state["out"].append(text)
    state["hex"] = ""


def rtf_to_text(rtf):
    r"""Extract the plain text of an RTF document.

    The input is scanned one character at a time.  Only text at group
    depth 1 (directly inside the outermost ``{...}``) is emitted; text in
    nested groups is skipped, except that a group opened right after
    ``\fldinst`` is emitted as ``(...)`` and one opened right after
    ``\fldrslt`` as ``[...]`` with its control words removed.  ``\'hh``
    escapes are decoded with the code page selected by ``\ansicpgN`` or by
    the current font's ``\fcharsetN`` (code page 1252 when neither applies).

    Args:
        rtf: The RTF source as a ``str``.

    Returns:
        A dict with two keys: ``"text"`` is the extracted text with leading
        and trailing newlines, then spaces, stripped; ``"lang"`` maps a
        language tag to the number of characters emitted while the matching
        code page was active (``"en-us"`` when the code page has no entry
        in the language table).
    """
    out = []
    languages = {}
    state = {
        "lvl": 0,        # current group nesting depth
        "cmd": "",       # control word being collected
        "arg": "",       # numeric argument of that control word
        "mode": "",      # "" (text), "cmd" or "hex"
        "hex": "",       # pending \'hh digits not decoded yet
        "buffer": "",    # raw characters since the last group boundary
        "out": out,
        "fonts": {},     # "fN" -> {"encoding": code page}
        "languages": languages,
        "encoding": _RTF_DEFAULT_CODEPAGE,
    }
    # Depth -> control word that immediately preceded the "{" opened there.
    groups = {}
    last = ""

    for c in rtf:
        state["buffer"] += c
        escaped = last == "\\"
        mode = state["mode"]
        # Normally the next iteration sees this character as its
        # predecessor; an escaped backslash overrides that below.
        next_last = c

        if c == "{" and not escaped:
            if mode == "cmd":
                groups[state["lvl"]] = state["cmd"]
            state["buffer"] = ""
            state["lvl"] += 1
            state["mode"] = ""
            _rtf_finish_command(state)
        elif c == "}" and not escaped:
            state["lvl"] -= 1
            group = groups.get(state["lvl"], "")
            if group == "fonttbl":
                # e.g. "\f0\fswiss\fcharset0 Arial;}" -> ["", "f0", ...]
                font_cmd = state["buffer"].split(" ", 1)[0].split("\\")
                if len(font_cmd) > 1:
                    font_entry = font_cmd[1]
                    for fcmd in font_cmd[2:]:
                        if "fcharset" in fcmd:
                            charset = fcmd.replace("fcharset", "")
                            state["fonts"][font_entry] = {
                                "encoding": _RTF_CHARSETS.get(charset, "1252")
                            }
            elif group == "fldinst":
                out.append("(" + state["buffer"].strip("}") + ")")
            elif group == "fldrslt":
                txt_val = re.sub(
                    r"\\[0-9A-Za-z]+", "", state["buffer"].strip("}")
                )
                out.append("[" + txt_val.strip() + "]")
            state["buffer"] = ""
            state["mode"] = ""
            # The group that just closed no longer has a "current" child.
            if state["lvl"] + 1 in groups:
                groups[state["lvl"] + 1] = ""
            _rtf_finish_command(state)
        elif c == "\\" and not escaped:
            state["mode"] = "cmd"
            _rtf_finish_command(state)
        elif mode == "cmd":
            at_top = state["lvl"] == 1
            if c in _RTF_CMD_CHARS:
                state["cmd"] += c
            elif c in _RTF_DIGITS:
                state["arg"] += c
            elif c == "'" and at_top:
                state["mode"] = "hex"
                _rtf_finish_command(state)
            elif c == " ":
                state["mode"] = ""
                _rtf_finish_command(state)
            elif c == "\n" and at_top:
                _rtf_finish_command(state)
                state["mode"] = ""
            elif c == "\\":
                # "\\" is a literal backslash.
                if at_top:
                    out.append("\\")
                _rtf_finish_command(state)
                state["mode"] = ""
                # This backslash is consumed; it must not escape whatever
                # character comes next.
                next_last = ""
            elif c == "~":
                out.append(" ")
                _rtf_finish_command(state)
                state["mode"] = ""
            elif c == "{":
                out.append("{")
                _rtf_finish_command(state)
                state["mode"] = ""
            elif c == "}":
                out.append("}")
                _rtf_finish_command(state)
                state["mode"] = ""
            elif at_top and not c.isspace():
                # Any other symbol ends the control word (letters and
                # digits were already handled above).
                state["mode"] = ""
                _rtf_finish_command(state)
        elif mode == "hex":
            state["hex"] += c
            if len(state["hex"]) % 2 == 0:
                state["mode"] = ""
                _rtf_decode_pending_hex(state)
        elif mode == "" and state["lvl"] == 1:
            lang = _RTF_LANGUAGES.get(state["encoding"], "en-us")
            languages[lang] = languages.get(lang, 0) + 1
            out.append(c)

        last = next_last

    return {
        "text": "".join(out).strip("\n").strip(" "),
        "lang": languages,
    }
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment