Last active
February 5, 2019 13:38
-
-
Save alexshpilkin/fc5ac7d5aad9332f4471fb2bdf6221a4 to your computer and use it in GitHub Desktop.
Convert between Transtool and GNU gettext files
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/sh -eu
# Summarise what the working PO catalogue adds on top of a baseline.
#
# Arguments: [-d] [BASE]
#   -d    also print the added and changed entries (coloured, indented)
#   BASE  baseline catalogue; an empty input is used when omitted
#
# Always prints one "# N source, M target" line with the word counts of the
# added entries, as measured by po2ox|ox2txt and po2tx|tx2txt respectively.

# Optional leading -d switch.
detail=false
case ${1-} in
-d)
	detail=true
	shift
	;;
esac

# The working catalogue is whatever *.??.po expands to in the current
# directory.
po=$(printf %s *.??.po)
base=${1-/dev/null}

# Scratch files.  The names are initialised empty first so the EXIT trap
# never sees an unset variable under -u.
prev=
next=
diff=
trap 'rm -f "$prev" "$next" "$diff"' EXIT
prev=$(mktemp)
next=$(mktemp)
diff=$(mktemp)

# Keep only the entries whose msgstr matches "." (i.e. is non-empty).
msggrep -T -e . <"$base" >"$prev"
msggrep -T -e . <"$po" >"$next"

# Entries defined exactly twice among "$prev", "$next" and "$next" again
# (fed on stdin): those present in "$next" but missing from "$prev".
fresh() {
	msgcomm '->' 1 '-<' 3 "$prev" "$next" - <"$next"
}

printf '# %s source, %s target\n' \
	$(fresh | po2ox | ox2txt | wc -w) \
	$(fresh | po2tx | tx2txt | wc -w)

# Entries msgcat had to merge with '#-#-#-#-#' markers, re-read from "$next".
# Presumably these are the entries whose translation changed -- confirm.
msgcat "$prev" "$next" |
	msggrep -T -e '#-#-#-#-#' |
	msgcomm "$next" - >"$diff"

if [ "$detail" = true ]; then
	msgcomm '->' 1 '-<' 3 --force-po "$prev" "$next" - <"$next" |
		msgcat -i --color=always - "$diff"
fi
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
from csv import reader as CSVReader | |
from datetime import datetime as DateTime | |
from re import compile as Pattern | |
from sys import stdin, stdout | |
# Characters that need escaping inside a double-quoted PO string, mapped to
# their escaped spelling.
_PO_ESCAPES = {
    '\\': '\\\\',
    '"': '\\"',
    '\n': '\\n',
}

# (pattern, replacement) pairs applied in order with ``pattern.sub``.
#
# A single rule with a callable replacement is used on purpose:
#   * a replacement *string* such as r'\n' or r'\\' is itself interpreted by
#     ``re`` (as a newline / a single backslash), which would turn the rule
#     into a no-op and leave raw newlines and backslashes in the PO output;
#   * doing everything in one pass guarantees that backslashes introduced by
#     one escape are never escaped a second time by another rule.
ESCAPE = [
    (Pattern(r'[\\"\n]'), lambda match: _PO_ESCAPES[match.group()]),
]
# PO header entry; the only placeholder is the output charset.
PO_HEADER = r"""# Converted from CSV by csv2po
#
msgid ""
msgstr ""
"MIME-Version: 1.0\n"
"Content-Type: text/plain; charset={}\n"
"Content-Transfer-Encoding: 8bit\n"
"X-Transtool-Escapes: 1\n"
"""

# One PO entry: optional translator comment, msgctxt, msgid, msgstr.
PO_ENTRY = """
{}#, python-brace-format
msgctxt "{}"
msgid "{}"
msgstr "{}"
"""

print(PO_HEADER.format(stdout.encoding), end='')

# Let the csv module do its own newline handling on the input stream.
stdin.reconfigure(newline='')

for entry in CSVReader(stdin):
    # Every row carries exactly five columns; the second one is not used
    # by this script.
    assert len(entry) == 5
    # FIXME doesn't touch fuzzy flags
    context, _unused, source, target, note = entry
    for pattern, replacement in ESCAPE:
        source = pattern.sub(replacement, source)
        target = pattern.sub(replacement, target)
    # A non-empty fifth column becomes a "# ..." comment line.
    comment = "# {}\n".format(note) if note else ""
    print(PO_ENTRY.format(comment, context, source, target), end='')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/sh -eu
# Snapshot the working PO catalogue and export its delta against a baseline
# as CSV.
#
# Arguments: [-f] [BASE]
#   -f    carry on even when one of the two sanity checks below reports
#         differences
#   BASE  baseline catalogue; an empty input is used when omitted
#
# Produces, next to the working catalogue, timestamped copies:
#   <stem>.<date>.po       verbatim copy of the catalogue
#   <stem>.<date>.tx       po2tx conversion of that copy
#   <stem>.<date>.utf.csv  po2csv export of the added/changed entries
#   <stem>.<date>.win.csv  same, placeholders reduced, recoded to cp1251

# Optional leading -f switch.
force=false
case ${1-} in
-f)
	force=true
	shift
	;;
esac

# The working catalogue is whatever *.??.po expands to in the current
# directory; the snapshot stem appends the current date to minute precision.
po=$(printf %s *.??.po)
pd="${po%.po}.$(date -Imin)"
cp "$po" "$pd.po"
po2tx <"$pd.po" >"$pd.tx"

base=${1-/dev/null}

# Scratch files.  The names are initialised empty first so the EXIT trap
# never sees an unset variable under -u.
prev=
next=
diff=
trap 'rm -f "$prev" "$next" "$diff"' EXIT
prev=$(mktemp)
next=$(mktemp)
diff=$(mktemp)

# Keep only the entries whose msgstr matches "." (i.e. is non-empty).
msggrep -T -e . <"$base" >"$prev"
msggrep -T -e . <"$pd.po" >"$next"

# Check 1: entries defined fewer than twice among "$prev", "$next" and
# "$next" again (stdin), i.e. found in "$prev" only.  Any output makes diff
# fail, which aborts the script unless -f was given.
{
	msgcomm '-<' 2 "$prev" "$next" - <"$next" |
		diff /dev/null -
} || $force

# Check 2: the entries common to both catalogues must reproduce "$prev"
# exactly; again fatal unless -f was given.
{
	msgcomm "$next" "$prev" |
		diff "$prev" -
} || $force

# Entries msgcat had to merge with '#-#-#-#-#' markers, re-read from "$next".
# Presumably these are the entries whose translation changed -- confirm.
msgcat "$prev" "$next" |
	msggrep -T -e '#-#-#-#-#' |
	msgcomm "$next" - >"$diff"

# Added entries (defined exactly twice among the three inputs) plus the
# changed ones, exported as CSV and sorted numerically on the text between
# the first pair of double quotes.
msgcomm '->' 1 '-<' 3 --force-po "$prev" "$next" - <"$next" |
	msgcat - "$diff" |
	po2csv |
	sort -n -t'"' -k2 >"$pd.utf.csv"

# Windows flavour: {LS} becomes <LS>, every other {...} placeholder is
# dropped, and the text is transliterated to cp1251.
sed -E -e 's/\{LS\}/<LS>/g' -e 's/\{[^{}]+\}//g' <"$pd.utf.csv" |
	iconv -t cp1251//TRANSLIT >"$pd.win.csv"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/sh -eu
# Turn an edited <stem>.utf.csv back into <stem>.po on top of a previous
# catalogue.
#
# Arguments: CSV PREV
#   CSV   the .utf.csv file to import (the suffix is optional)
#   PREV  the previous catalogue to concatenate the import with

next=${1%.utf.csv}
prev=$2

# Never clobber an existing catalogue.
[ ! -f "$next.po" ] || {
	echo "file already exists: $next.po" >&2
	exit 1
}

csv2po <"$next.utf.csv" |
	msgcat '->' 0 "$prev" - >"$next.po"

# Print the entries that occur in only one of the two catalogues; any
# output makes diff (and therefore the script) exit non-zero.
msgcomm -u "$prev" "$next" |
	diff /dev/null -
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# ox2pot - convert Transtool to GNU gettext | |
# | |
# Converts UTF-8 Transtool OX on standard input to GNU gettext POT on standard | |
# output. | |
from datetime import datetime as DateTime | |
from re import compile as Pattern | |
from sys import stdin, stdout | |
# Ordered (pattern, replacement) rewrite rules taking one line of Transtool
# text to the body of a PO string.  Order matters: backslashes are doubled
# before any rule introduces new ones, and literal braces are doubled before
# the tag rules start producing {...} placeholders.
ESCAPE = [
    (Pattern(regex), replacement)
    for regex, replacement in (
        # PO string syntax.
        (r'\\', r'\\\\'),
        (r'"', r'\\"'),
        # Literal braces, doubled so they cannot be taken for placeholders.
        (r'{', r'{{'),
        (r'}', r'}}'),
        # U+2028 LINE SEPARATOR.
        ('\u2028', r'{LS}'),
        # Transtool tags, each mapped to a brace placeholder.
        (r'<\*([0-9]+)\*>', r'{as\1}'),
        (r'<cs([0-9]+)>', r'{cs\1}'),
        (r'</cs([0-9]+)>', r'{sc\1}'),
        (r'<U([0-9]+)>', r'{uu\1}'),
        (r'<\?ACE 7\?>', r'{pi}'),
    )
]
# Transtool files are UTF-8 with a BOM and CRLF line ends; only "\r\n"
# terminates a line, and it is kept on the line so it can be checked below.
stdin.reconfigure(encoding='utf-8-sig', newline='\r\n')
lines = iter(stdin)

# POT header entry.  The creation date uses the gettext convention
# "YYYY-MM-DD HH:MM+ZZZZ" (numeric UTC offset without a colon).
print(r'''# Converted from Transtool by ox2pot
#
#, fuzzy
msgid ""
msgstr ""
"POT-Creation-Date: {}\n"
"MIME-Version: 1.0\n"
"Content-Type: text/plain; charset={}\n"
"Content-Transfer-Encoding: 8bit\n"
"X-Transtool-Escapes: 1\n"
'''.format(
    DateTime.now().astimezone().strftime('%Y-%m-%d %H:%M%z'),
    stdout.encoding,
), end='')

# The input is a sequence of line pairs: "(<number>)" followed by the text.
for line in lines:
    assert line.startswith('(') and line.endswith(')\r\n')
    snum = line[1:-3]
    assert snum and snum.isdigit()

    line = next(lines, None)
    # A number line at the very end of the input has no text to go with it.
    assert line is not None, 'string ({}) has no text line'.format(snum)
    assert line.endswith('\r\n')
    stxt = line[:-2]

    # Braces are reserved for the placeholders generated by ESCAPE.
    assert '{' not in stxt and '}' not in stxt
    for pat, rep in ESCAPE:
        stxt = pat.sub(rep, stxt)
    # Any angle bracket left over is a tag ESCAPE does not know about.
    assert '<' not in stxt and '>' not in stxt

    print('''
#, python-brace-format
msgctxt "{}"
msgid "{}"
msgstr ""
'''.format(snum, stxt), end='')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
from csv import writer as CSVWriter, QUOTE_NONNUMERIC | |
from email import parser, policy | |
from re import compile as Pattern | |
from sys import argv, stdin, stdout | |
# Escape sequences understood inside a PO string, keyed by the character
# that follows the backslash.
_PO_UNESCAPES = {
    '"': '"',
    'n': '\n',
    '\\': '\\',
}

# (pattern, replacement) pairs applied in order with ``pattern.sub``.
#
# Everything is decoded by one rule in a single left-to-right pass, so that
# an escaped backslash followed by "n" (``\\n`` in the PO file) comes out as
# a backslash and the letter n rather than being mistaken for a newline
# escape by a later pass.  Other backslash sequences are left untouched.
UNESCAPE = [
    (Pattern(r'\\(["n\\])'), lambda match: _PO_UNESCAPES[match.group(1)]),
]
# PO field selected by the name the script was invoked under: 'msgid' when
# it ends in "2ox", 'msgstr' otherwise.
# NOTE(review): nothing else in this script appears to read OUTPUT -- it
# looks carried over from po2ox; confirm before removing.
if argv[0].lower().endswith('2ox'):
    OUTPUT = 'msgid'
else:
    OUTPUT = 'msgstr'

# UTF-8 with a BOM and CRLF line ends on standard output.
# NOTE(review): the newline setting is replaced again before the CSV is
# written, but the BOM-emitting encoding stays in force -- confirm that a
# leading BOM in the CSV output is intended.
stdout.reconfigure(newline='\r\n', encoding='utf-8-sig')
def readentry():
    """Read the next PO entry from standard input.

    Returns a dict holding

    * one item per keyword line (``msgctxt``, ``msgid``, ``msgstr``, ...)
      whose value is the unescaped concatenation of the quoted strings that
      follow the keyword, and
    * one item per kind of comment, keyed by the first two characters of the
      comment line (``'# '``, ``'#,'``, ...); several comment lines of the
      same kind are joined with a single space.

    Returns ``None`` once the input is exhausted.  Blank lines between
    entries are skipped, any number of trailing blank lines counts as a
    clean end of input, and stanzas made of comments only (for instance
    obsolete ``#~`` entries) are ignored.
    """
    while True:
        # Skip the blank lines separating entries.
        line = next(stdin, None)
        while line is not None and line.isspace():
            line = next(stdin, None)
        if line is None:
            return None

        entry = {}

        # Leading comment lines.
        line = line.strip()
        while line.startswith('#'):
            mark, text = line[:2], line[2:]
            if mark not in entry:
                entry[mark] = text
            else:
                entry[mark] += " " + text
            line = next(stdin, None)
            if line is None:
                # Comments running up to the end of input carry no message.
                return None
            line = line.strip()
        if not line:
            # The stanza ended right after its comments: nothing to return,
            # move on to the next one.
            continue

        # Keyword lines, each followed by any number of continuation strings.
        while True:
            key, line = line.split(maxsplit=1)
            entry[key] = ""
            while True:
                assert (len(line) >= 2 and
                        line.startswith('"') and
                        line.endswith('"'))
                line = line[1:-1]
                for pat, rep in UNESCAPE:
                    line = pat.sub(rep, line)
                entry[key] += line
                line = next(stdin, None)
                if line is None or line.isspace():
                    break
                line = line.strip()
                if not line.startswith('"'):
                    break
            # End of input or a blank line terminates the entry; anything
            # else is the next keyword line.
            if line is None or line.isspace():
                break
        return entry
def readentries():
    """Yield the entries returned by readentry() until it returns None."""
    yield from iter(readentry, None)
entries = readentries()

# The first entry must be the PO header (empty msgid); its msgstr holds
# RFC 822 style header fields.  It is taken from the same iterator that the
# loop below continues with.
entry = next(entries, None)
assert entry is not None, 'empty input: PO header entry missing'
assert not entry['msgid']
headers = parser.Parser(policy=policy.default).parsestr(entry['msgstr'])
assert (headers['MIME-Version'] == '1.0' and
        headers['Content-Type'].params['charset'].lower() == stdin.encoding.lower() and
        headers['Content-Transfer-Encoding'].lower() == '8bit')

# The csv module writes its own line terminators; stop the text layer from
# translating them a second time.
stdout.reconfigure(newline='')
csv = CSVWriter(stdout, quoting=QUOTE_NONNUMERIC)

# Columns: context, fuzzy marker ('?' or empty), source text, translation,
# translator comment.
for entry in entries:
    csv.writerow([
        entry['msgctxt'],
        '?' if 'fuzzy' in entry.get('#,', '') else '',
        entry['msgid'],
        entry['msgstr'],
        entry.get('# ', ''),
    ])
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# po2ox, po2tx - convert GNU gettext to Transtool | |
# | |
# Converts GNU gettext PO on standard input to UTF-8 Transtool OX (if invoked | |
# as po2ox) or TX (otherwise) on standard output. | |
from email import parser, policy | |
from re import compile as Pattern | |
from sys import argv, stdin, stdout | |
# Ordered (pattern, replacement) rewrite rules taking the text of a PO string
# back to Transtool markup.  The placeholder rules run first; doubled braces
# are collapsed to single ones only at the very end.
PO2TT = [
    (Pattern(regex), replacement)
    for regex, replacement in (
        # Brace placeholders, each mapped to a Transtool tag.
        (r'{pi}', r'<?ACE 7?>'),
        (r'{uu([0-9]+)}', r'<U\1>'),
        (r'{sc([0-9]+)}', r'</cs\1>'),
        (r'{cs([0-9]+)}', r'<cs\1>'),
        (r'{as([0-9]+)}', r'<*\1*>'),
        # U+2028 LINE SEPARATOR.
        (r'{LS}', '\u2028'),
        # Doubled braces stand for literal ones.
        (r'}}', r'}'),
        (r'{{', r'{'),
    )
]
# Escape sequences understood inside a PO string, keyed by the character
# that follows the backslash.
_PO_UNESCAPES = {
    '"': '"',
    'n': '\n',
    '\\': '\\',
}

# (pattern, replacement) pairs applied in order with ``pattern.sub``.
#
# Everything is decoded by one rule in a single left-to-right pass, so that
# an escaped backslash followed by "n" (``\\n`` in the PO file) comes out as
# a backslash and the letter n rather than being mistaken for a newline
# escape by a later pass.  Other backslash sequences are left untouched.
UNESCAPE = [
    (Pattern(r'\\(["n\\])'), lambda match: _PO_UNESCAPES[match.group(1)]),
]
# PO field to emit, selected by the name the script was invoked under:
# 'msgid' when it ends in "2ox", 'msgstr' otherwise.
if argv[0].lower().endswith('2ox'):
    OUTPUT = 'msgid'
else:
    OUTPUT = 'msgstr'

# UTF-8 with a BOM and CRLF line ends on standard output.
stdout.reconfigure(newline='\r\n', encoding='utf-8-sig')
def readentry():
    """Read the next PO entry from standard input.

    Returns a dict with one item per keyword line (``msgctxt``, ``msgid``,
    ``msgstr``, ...) whose value is the unescaped concatenation of the
    quoted strings that follow the keyword.  Comment lines are discarded.

    Returns ``None`` once the input is exhausted.  Blank lines between
    entries are skipped, any number of trailing blank lines counts as a
    clean end of input, and stanzas made of comments only (for instance
    obsolete ``#~`` entries) are ignored.
    """
    while True:
        # Skip the blank lines separating entries.
        line = next(stdin, None)
        while line is not None and line.isspace():
            line = next(stdin, None)
        if line is None:
            return None

        # Leading comment lines are of no interest here.
        line = line.strip()
        while line.startswith('#'):
            line = next(stdin, None)
            if line is None:
                # Comments running up to the end of input carry no message.
                return None
            line = line.strip()
        if not line:
            # The stanza ended right after its comments: nothing to return,
            # move on to the next one.
            continue

        # Keyword lines, each followed by any number of continuation strings.
        entry = {}
        while True:
            key, line = line.split(maxsplit=1)
            entry[key] = ""
            while True:
                assert (len(line) >= 2 and
                        line.startswith('"') and
                        line.endswith('"'))
                line = line[1:-1]
                for pat, rep in UNESCAPE:
                    line = pat.sub(rep, line)
                entry[key] += line
                line = next(stdin, None)
                if line is None or line.isspace():
                    break
                line = line.strip()
                if not line.startswith('"'):
                    break
            # End of input or a blank line terminates the entry; anything
            # else is the next keyword line.
            if line is None or line.isspace():
                break
        return entry
def readentries():
    """Yield the entries returned by readentry() until it returns None."""
    for item in iter(readentry, None):
        yield item
entries = readentries()

# The first entry must be the PO header (empty msgid); its msgstr holds
# RFC 822 style header fields.  It is taken from the same iterator that the
# loop below continues with.
entry = next(entries, None)
assert entry is not None, 'empty input: PO header entry missing'
assert not entry['msgid']
headers = parser.Parser(policy=policy.default).parsestr(entry['msgstr'])
assert (headers['MIME-Version'] == '1.0' and
        headers['Content-Type'].params['charset'].lower() == stdin.encoding.lower() and
        headers['Content-Transfer-Encoding'].lower() == '8bit' and
        headers['X-Transtool-Escapes'] == '1')

# Each entry becomes a "(<number>)" line followed by the converted text.
for entry in entries:
    assert entry['msgctxt'].isdigit()
    print('({})'.format(entry['msgctxt']))
    text = entry[OUTPUT]
    # Angle brackets may only come from the placeholder conversion below.
    assert '<' not in text and '>' not in text
    for pat, rep in PO2TT:
        text = pat.sub(rep, text)
    # No brace may survive the conversion.
    assert '{' not in text and '}' not in text
    print(text)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# ox2txt, tx2txt - extract text from Transtool | |
# | |
# Prints to standard output the text from a UTF-8 Transtool OX or TX file on | |
# standard input. The output can then be piped to wc(1) or similar. | |
from sys import stdin | |
from re import compile as Pattern | |
# Anything between angle brackets is markup rather than text.
PH = Pattern(r'<[^>]*>')


def extract_text(lines):
    """Yield the text lines of a Transtool file with markup blanked out.

    *lines* is an iterable of input lines arranged in pairs: a line that is
    skipped, followed by the text line that belongs to it.  Every ``<...>``
    tag in a text line is replaced by a single space; line terminators are
    left as they are.

    Raises AssertionError when the input ends between the two lines of a
    pair.
    """
    lines = iter(lines)
    for _header in lines:
        text = next(lines, None)
        # Check for truncated input *before* touching the line.
        assert text is not None, 'input ends before the text line of a pair'
        yield PH.sub(' ', text)


if __name__ == '__main__':
    # Transtool files are UTF-8 with a BOM and CRLF line ends.
    stdin.reconfigure(encoding='utf-8-sig', newline='\r\n')
    for text in extract_text(stdin):
        print(text)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment