Skip to content

Instantly share code, notes, and snippets.

@alexshpilkin
Last active February 5, 2019 13:38
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save alexshpilkin/fc5ac7d5aad9332f4471fb2bdf6221a4 to your computer and use it in GitHub Desktop.
Save alexshpilkin/fc5ac7d5aad9332f4471fb2bdf6221a4 to your computer and use it in GitHub Desktop.
Convert between Transtool and GNU gettext files
#!/bin/sh -eu
# Report how many words of source and target text are new in the working
# translation compared with an optional base PO file.  With -d, additionally
# print the new entries together with the conflicting ones.
#
# usage: <this script> [-d] [BASE.po]
#
# NOTE(review): the glob below is pasted together with printf, so this
# presumably expects exactly one *.??.po file in the current directory.
po=$(printf %s *.??.po)

case "${1-}" in
-d) detail=true; shift ;;
*)  detail=false ;;
esac
base=${1-/dev/null}

# Temporary files; the variables are set empty first so that the EXIT trap
# can run under `set -u` even if mktemp fails part-way.
old=
new=
conflicts=
trap 'rm -f "$old" "$new" "$conflicts"' EXIT
old=$(mktemp)
new=$(mktemp)
conflicts=$(mktemp)

# Keep only the entries whose msgstr is non-empty.
msggrep -T -e . <"$base" >"$old"
msggrep -T -e . <"$po" >"$new"

# $new is fed to msgcomm twice (as a file and on stdin), so the entries seen
# more than once but fewer than three times are the ones absent from $old.
src_words=$(msgcomm '->' 1 '-<' 3 "$old" "$new" - <"$new" | po2ox | ox2txt | wc -w)
tar_words=$(msgcomm '->' 1 '-<' 3 "$old" "$new" - <"$new" | po2tx | tx2txt | wc -w)
printf '# %s source, %s target\n' $src_words $tar_words

# Entries for which msgcat had to emit its '#-#-#-#-#' conflict marker,
# passed through msgcomm together with $new.
msgcat "$old" "$new" | msggrep -T -e '#-#-#-#-#' | msgcomm "$new" - >"$conflicts"

if [ "$detail" = true ]; then
	msgcomm '->' 1 '-<' 3 --force-po "$old" "$new" - <"$new" |
		msgcat -i --color=always - "$conflicts"
fi
#!/usr/bin/env python3
from csv import reader as CSVReader
from datetime import datetime as DateTime
from re import compile as Pattern
from sys import stdin, stdout
# Characters that must be backslash-escaped inside a double-quoted PO string,
# mapped to their escaped spelling.
ESCAPE = {
    '\\': '\\\\',
    '"': '\\"',
    '\n': '\\n',
}
# One pattern for all of them: escaping in a single pass means a backslash
# introduced for one character is never escaped again by a later rule.
ESCAPE_RE = Pattern(r'[\\"\n]')

# PO header entry; the placeholder receives the output character set.
HEADER = r"""# Converted from CSV by csv2po
#
msgid ""
msgstr ""
"MIME-Version: 1.0\n"
"Content-Type: text/plain; charset={}\n"
"Content-Transfer-Encoding: 8bit\n"
"X-Transtool-Escapes: 1\n"
"""

# One PO entry: optional comment line, context, source, translation.
ENTRY = """
{}#, python-brace-format
msgctxt "{}"
msgid "{}"
msgstr "{}"
"""


def escape(text):
    """Return *text* escaped for use inside a double-quoted PO string.

    Backslashes, double quotes and newlines are replaced by ``\\\\``,
    ``\\"`` and ``\\n`` respectively; everything else is left alone.
    """
    return ESCAPE_RE.sub(lambda match: ESCAPE[match.group()], text)


def format_entry(entry):
    """Return the PO text for one five-column CSV row.

    Column 0 becomes the msgctxt, column 2 the msgid, column 3 the msgstr
    and column 4 (when non-empty) a translator comment.  Column 1 is not
    used.

    Raises AssertionError if the row does not have exactly five columns.
    """
    assert len(entry) == 5
    # FIXME doesn't touch fuzzy flags
    comment = "# {}\n".format(entry[4]) if entry[4] else ""
    return ENTRY.format(comment, entry[0], escape(entry[2]), escape(entry[3]))


def main():
    """Convert CSV on standard input to PO on standard output."""
    print(HEADER.format(stdout.encoding), end='')
    # The csv module wants to do its own newline handling.
    stdin.reconfigure(newline='')
    for entry in CSVReader(stdin):
        print(format_entry(entry), end='')


if __name__ == '__main__':
    main()
#!/bin/sh -eu
# Snapshot the working translation under a timestamped name and export the
# entries that are new or conflicting relative to an optional base PO file
# as CSV, once in UTF-8 and once in cp1251.
#
# usage: <this script> [-f] [BASE.po]
#
# With -f the two consistency checks below only print their findings
# instead of aborting the script.
case "${1-}" in
-f) force=true; shift ;;
*)  force=false ;;
esac

# NOTE(review): the glob is pasted together with printf, so this presumably
# expects exactly one *.??.po file in the current directory.
po=$(printf %s *.??.po)
snap="${po%.po}.$(date -Imin)"
cp "$po" "$snap.po"
po2tx <"$snap.po" >"$snap.tx"

base=${1-/dev/null}

# Temporary files; the variables are set empty first so that the EXIT trap
# can run under `set -u` even if mktemp fails part-way.
old=
new=
conflicts=
trap 'rm -f "$old" "$new" "$conflicts"' EXIT
old=$(mktemp)
new=$(mktemp)
conflicts=$(mktemp)

# Keep only the entries whose msgstr is non-empty.
msggrep -T -e . <"$base" >"$old"
msggrep -T -e . <"$snap.po" >"$new"

# Check 1: $new is counted twice (file and stdin), so entries seen fewer
# than two times exist only in $old.  Any such entry is printed by diff,
# whose non-zero status stops the script unless -f was given.
msgcomm '-<' 2 "$old" "$new" - <"$new" | diff /dev/null - || $force

# Check 2: what msgcomm reports as common to $new and $old has to be
# identical to $old itself.
msgcomm "$new" "$old" | diff "$old" - || $force

# Entries for which msgcat had to emit its '#-#-#-#-#' conflict marker,
# passed through msgcomm together with $new.
msgcat "$old" "$new" | msggrep -T -e '#-#-#-#-#' | msgcomm "$new" - >"$conflicts"

# Entries absent from $old plus the conflicting ones, as CSV sorted
# numerically on the text after the first double quote.
msgcomm '->' 1 '-<' 3 --force-po "$old" "$new" - <"$new" |
	msgcat - "$conflicts" |
	po2csv |
	sort -n -t'"' -k2 >"$snap.utf.csv"

# cp1251 variant: {LS} becomes <LS>, every other {...} placeholder is dropped.
sed -E -e 's/\{LS\}/<LS>/g' -e 's/\{[^{}]+\}//g' <"$snap.utf.csv" |
	iconv -t cp1251//TRANSLIT >"$snap.win.csv"
#!/bin/sh -eu
# Turn an edited CSV export back into a PO file.
#
# usage: <this script> NAME.utf.csv PREV.po
#
# Writes NAME.po, built by concatenating PREV.po with the converted CSV.
# Refuses to overwrite an existing NAME.po.
next=${1%.utf.csv}
prev=$2
if [ -f "$next.po" ]; then
	echo "file already exists: $next.po" >&2
	exit 1
fi
csv2po <"$next.utf.csv" | msgcat '->' 0 "$prev" - >"$next.po"
# Sanity check: list the entries that occur in only one of the two files and
# fail if there are any.  This has to read the file just written, "$next.po";
# the bare "$next" is only the name stem, so msgcomm would fail to open it,
# diff would see empty input, and the check would pass without checking
# anything.
msgcomm -u "$prev" "$next.po" | diff /dev/null -
#!/usr/bin/env python3
# ox2pot - convert Transtool to GNU gettext
#
# Converts UTF-8 Transtool OX on standard input to GNU gettext POT on standard
# output.
from datetime import datetime as DateTime
from re import compile as Pattern
from sys import stdin, stdout
# Ordered rewrite rules that turn one line of Transtool text into the body of
# a PO string in python-brace-format: PO string escapes first, then literal
# braces, then the Transtool markers as {name} placeholders.
ESCAPE = [
    (r'\\', r'\\\\'),
    (r'"', r'\\"'),
    (r'{', r'{{'),
    (r'}', r'}}'),
    ('\u2028', r'{LS}'),
    (r'<\*([0-9]+)\*>', r'{as\1}'),
    (r'<cs([0-9]+)>', r'{cs\1}'),
    (r'</cs([0-9]+)>', r'{sc\1}'),
    (r'<U([0-9]+)>', r'{uu\1}'),
    (r'<\?ACE 7\?>', r'{pi}'),
]
ESCAPE = [(Pattern(x), y) for x, y in ESCAPE]

# POT header entry; placeholders: creation date, output character set.
HEADER = r'''# Converted from Transtool by ox2pot
#
#, fuzzy
msgid ""
msgstr ""
"POT-Creation-Date: {}\n"
"MIME-Version: 1.0\n"
"Content-Type: text/plain; charset={}\n"
"Content-Transfer-Encoding: 8bit\n"
"X-Transtool-Escapes: 1\n"
'''

# One POT entry; placeholders: string number, escaped source text.
ENTRY = '''
#, python-brace-format
msgctxt "{}"
msgid "{}"
msgstr ""
'''


def creation_date(now=None):
    """Return *now* formatted for the POT-Creation-Date header field.

    *now* is a timezone-aware datetime and defaults to the current local
    time.  The result looks like ``2019-02-05 13:38+0300``: gettext writes
    the UTC offset without a colon, which is why isoformat() is not used.
    """
    if now is None:
        now = DateTime.now().astimezone()
    return now.strftime('%Y-%m-%d %H:%M%z')


def escape(text):
    """Apply every ESCAPE rule, in order, to *text* and return the result."""
    for pat, rep in ESCAPE:
        text = pat.sub(rep, text)
    return text


def read_records(lines):
    """Yield ``(number, text)`` for every record found in *lines*.

    A record is two CRLF-terminated lines: ``(NUMBER)`` followed by the
    text.  *number* is the digit string between the parentheses and *text*
    is the second line, without its line ending, after escape().

    Raises AssertionError on malformed input: a bad number line, a record
    whose text line is missing, braces in the text, or angle brackets left
    over after all known markers have been converted.
    """
    lines = iter(lines)
    for line in lines:
        assert line.startswith('(') and line.endswith(')\r\n')
        snum = line[1:-3]
        assert snum and snum.isdigit()
        line = next(lines, None)
        # Input that ends right after a number line is reported as a
        # validation failure, not as an AttributeError on None.
        assert line is not None, 'record ({}) has no text line'.format(snum)
        assert line.endswith('\r\n')
        stxt = line[:-2]
        assert '{' not in stxt and '}' not in stxt
        stxt = escape(stxt)
        assert '<' not in stxt and '>' not in stxt
        yield snum, stxt


def main():
    """Convert Transtool OX on standard input to POT on standard output."""
    # Split lines on CRLF only, and drop a leading byte order mark if any.
    stdin.reconfigure(encoding='utf-8-sig', newline='\r\n')
    print(HEADER.format(creation_date(), stdout.encoding), end='')
    for snum, stxt in read_records(stdin):
        print(ENTRY.format(snum, stxt), end='')


if __name__ == '__main__':
    main()
#!/usr/bin/env python3
from csv import writer as CSVWriter, QUOTE_NONNUMERIC
from email import parser, policy
from re import compile as Pattern
from sys import argv, stdin, stdout
# Escape sequences understood inside a double-quoted PO string, mapped to the
# text each one stands for.
UNESCAPE = {
    '\\"': '"',
    '\\n': '\n',
    '\\\\': '\\',
}
# Decoding happens in one left-to-right pass, so an escaped backslash followed
# by "n" stays a backslash and an "n" and is not mistaken for a newline.
UNESCAPE_RE = Pattern(r'\\["n\\]')


def unescape(text):
    """Return the body of a PO string with its backslash escapes decoded."""
    return UNESCAPE_RE.sub(lambda match: UNESCAPE[match.group()], text)


def readentry(lines=None):
    """Read one PO entry and return it as a dict, or None at end of input.

    *lines* is an iterator over input lines and defaults to standard input.

    Comment lines are collected under their two-character marker (``'# '``,
    ``'#,'`` and so on), repeated markers being joined with a space.  Each
    keyword (msgctxt, msgid, msgstr) maps to its unescaped string, with
    continuation lines concatenated.  A blank line or the end of input
    finishes the entry.

    Raises AssertionError when the input ends inside the comment block or a
    string is not enclosed in double quotes.
    """
    if lines is None:
        lines = stdin
    entry = {}

    # Skip blank lines.  Running out of input here, even after some blank
    # lines, simply means that there are no more entries.
    line = next(lines, None)
    while line is not None and line.isspace():
        line = next(lines, None)
    if line is None:
        return None

    # Comment block.
    line = line.strip()
    while line.startswith('#'):
        mark, text = line[:2], line[2:]
        if mark in entry:
            entry[mark] += " " + text
        else:
            entry[mark] = text
        line = next(lines, None)
        assert line is not None
        line = line.strip()

    # Keyword lines, each possibly followed by continuation strings.
    while True:
        key, line = line.split(maxsplit=1)
        entry[key] = ""
        while True:
            assert (len(line) >= 2 and
                    line.startswith('"') and
                    line.endswith('"'))
            entry[key] += unescape(line[1:-1])
            line = next(lines, None)
            if line is None or line.isspace():
                break
            line = line.strip()
            if not line.startswith('"'):
                break
        if line is None or line.isspace():
            break
    return entry


def readentries(lines=None):
    """Yield the entries readentry() returns until the input is exhausted."""
    while True:
        e = readentry(lines)
        if e is None:
            break
        yield e


def csvrow(entry):
    """Return the CSV columns for *entry*.

    The columns are: context, ``'?'`` if the flags comment mentions fuzzy
    (empty otherwise), source, translation, translator comment.
    """
    return [
        entry['msgctxt'],
        '?' if 'fuzzy' in entry.get('#,', '') else '',
        entry['msgid'],
        entry['msgstr'],
        entry.get('# ', ''),
    ]


def main():
    """Convert PO on standard input to CSV on standard output."""
    entries = readentries()
    # The first entry is the PO header (empty msgid).  It is taken from the
    # same generator that later supplies the remaining entries.
    header = next(entries)
    assert not header['msgid']
    headers = parser.Parser(policy=policy.default).parsestr(header['msgstr'])
    assert (headers['MIME-Version'] == '1.0' and
            headers['Content-Type'].params['charset'].lower() == stdin.encoding.lower() and
            headers['Content-Transfer-Encoding'].lower() == '8bit')
    # The output starts with a byte order mark; line endings are left to
    # the csv module.
    stdout.reconfigure(encoding='utf-8-sig', newline='')
    csv = CSVWriter(stdout, quoting=QUOTE_NONNUMERIC)
    for entry in entries:
        csv.writerow(csvrow(entry))


if __name__ == '__main__':
    main()
#!/usr/bin/env python3
# po2ox, po2tx - convert GNU gettext to Transtool
#
# Converts GNU gettext PO on standard input to UTF-8 Transtool OX (if invoked
# as po2ox) or TX (otherwise) on standard output.
from email import parser, policy
from re import compile as Pattern
from sys import argv, stdin, stdout
# Ordered rewrite rules that turn python-brace-format text back into
# Transtool text: the {name} placeholders first, then doubled braces.
PO2TT = [
    (r'{pi}', r'<?ACE 7?>'),
    (r'{uu([0-9]+)}', r'<U\1>'),
    (r'{sc([0-9]+)}', r'</cs\1>'),
    (r'{cs([0-9]+)}', r'<cs\1>'),
    (r'{as([0-9]+)}', r'<*\1*>'),
    (r'{LS}', '\u2028'),
    (r'}}', r'}'),
    (r'{{', r'{'),
]
PO2TT = [(Pattern(x), y) for x, y in PO2TT]

# Escape sequences understood inside a double-quoted PO string, mapped to the
# text each one stands for.
UNESCAPE = {
    '\\"': '"',
    '\\n': '\n',
    '\\\\': '\\',
}
# Decoding happens in one left-to-right pass, so an escaped backslash followed
# by "n" stays a backslash and an "n" and is not mistaken for a newline.
UNESCAPE_RE = Pattern(r'\\["n\\]')

# Which string of each entry to emit: the source when the program is invoked
# under a name ending in "2ox", the translation otherwise.
OUTPUT = 'msgid' if argv[0].lower().endswith('2ox') else 'msgstr'


def unescape(text):
    """Return the body of a PO string with its backslash escapes decoded."""
    return UNESCAPE_RE.sub(lambda match: UNESCAPE[match.group()], text)


def readentry(lines=None):
    """Read one PO entry and return it as a dict, or None at end of input.

    *lines* is an iterator over input lines and defaults to standard input.

    Comment lines are skipped.  Each keyword (msgctxt, msgid, msgstr) maps
    to its unescaped string, with continuation lines concatenated.  A blank
    line or the end of input finishes the entry.

    Raises AssertionError when the input ends inside the comment block or a
    string is not enclosed in double quotes.
    """
    if lines is None:
        lines = stdin
    entry = {}

    # Skip blank lines.  Running out of input here, even after some blank
    # lines, simply means that there are no more entries.
    line = next(lines, None)
    while line is not None and line.isspace():
        line = next(lines, None)
    if line is None:
        return None

    # Comment block: nothing in it is needed for the conversion.
    while line.strip().startswith('#'):
        line = next(lines, None)
        assert line is not None
    line = line.strip()

    # Keyword lines, each possibly followed by continuation strings.
    while True:
        key, line = line.split(maxsplit=1)
        entry[key] = ""
        while True:
            assert (len(line) >= 2 and
                    line.startswith('"') and
                    line.endswith('"'))
            entry[key] += unescape(line[1:-1])
            line = next(lines, None)
            if line is None or line.isspace():
                break
            line = line.strip()
            if not line.startswith('"'):
                break
        if line is None or line.isspace():
            break
    return entry


def readentries(lines=None):
    """Yield the entries readentry() returns until the input is exhausted."""
    while True:
        e = readentry(lines)
        if e is None:
            break
        yield e


def to_transtool(text):
    """Return *text* with every PO2TT rule applied, in order.

    Raises AssertionError if *text* contains angle brackets before the
    conversion or braces after it.
    """
    assert '<' not in text and '>' not in text
    for pat, rep in PO2TT:
        text = pat.sub(rep, text)
    assert '{' not in text and '}' not in text
    return text


def main():
    """Convert PO on standard input to Transtool on standard output."""
    # The output starts with a byte order mark and uses CRLF line endings.
    stdout.reconfigure(encoding='utf-8-sig', newline='\r\n')
    entries = readentries()
    # The first entry is the PO header (empty msgid).  It is taken from the
    # same generator that later supplies the remaining entries.
    header = next(entries)
    assert not header['msgid']
    headers = parser.Parser(policy=policy.default).parsestr(header['msgstr'])
    assert (headers['MIME-Version'] == '1.0' and
            headers['Content-Type'].params['charset'].lower() == stdin.encoding.lower() and
            headers['Content-Transfer-Encoding'].lower() == '8bit' and
            headers['X-Transtool-Escapes'] == '1')
    for entry in entries:
        assert entry['msgctxt'].isdigit()
        print('({})'.format(entry['msgctxt']))
        print(to_transtool(entry[OUTPUT]))


if __name__ == '__main__':
    main()
#!/usr/bin/env python3
# ox2txt, tx2txt - extract text from Transtool
#
# Prints to standard output the text from a UTF-8 Transtool OX or TX file on
# standard input. The output can then be piped to wc(1) or similar.
from sys import stdin
from re import compile as Pattern
# Matches one Transtool marker, i.e. anything enclosed in angle brackets.
PH = Pattern(r'<[^>]*>')


def extract_text(lines):
    """Yield the text line of every two-line record in *lines*.

    The first line of each record is discarded.  In the second line every
    ``<...>`` marker is replaced by a single space; the line ending is kept
    as it was read.

    Raises AssertionError if the input ends after the first line of a
    record.
    """
    lines = iter(lines)
    for _number in lines:
        text = next(lines, None)
        # Validate before use: substituting on None would raise a TypeError
        # and hide the actual problem, a record without its text line.
        assert text is not None, 'record has no text line'
        yield PH.sub(' ', text)


def main():
    """Print the text of a Transtool file read from standard input."""
    # Split lines on CRLF only, and drop a leading byte order mark if any.
    stdin.reconfigure(encoding='utf-8-sig', newline='\r\n')
    for text in extract_text(stdin):
        print(text)


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment