|
import json |
|
import re |
|
from terminaltables import SingleTable |
|
from collections import Counter |
|
|
|
# Load every post up front. The file is opened with an explicit UTF-8
# encoding because the lyrics contain Japanese/Chinese text and the platform
# default encoding is not guaranteed to decode it; the context manager makes
# sure the handle is closed as soon as parsing is done.
with open("GY_posts.json", encoding="utf-8") as f:
    data = json.load(f)
|
|
|
# Compiled once at import time: lang() is called for every single lyric line.
# Characters from U+3041 (ぁ) to U+30FF (ヿ), i.e. the kana range.
_KANA_RE = re.compile(r"[ぁ-ヿ]")
# CJK radical / ideograph ranges used to recognise Han characters.
_HAN_RE = re.compile(r"[\u2E80-\u2FD5\u3400-\u4DBF\u4E00-\u9FCC]")


def lang(s):
    """Guess the language of *s* from the characters it contains.

    The kana test runs first, so text that mixes kana with Han characters is
    reported as Japanese rather than Chinese.

    ``search`` is used instead of ``match`` with a leading ``.*``: ``.`` does
    not match a newline, so the anchored form only ever inspected the first
    line of a multi-line string.

    Args:
        s: the text to classify.

    Returns:
        'ja' if *s* contains a kana character, otherwise 'zh' if it contains
        a Han character, otherwise 'en' (including for the empty string).
    """
    if _KANA_RE.search(s):
        return 'ja'
    if _HAN_RE.search(s):
        return 'zh'
    return 'en'
|
|
|
def proc_row(row):
    """Build the per-language record for one post.

    The 'lyric', 'origin' and 'translate' fields of *row* are each run through
    ``lang_det`` and the text returned for every language is accumulated,
    newline-separated, under that language's key.

    Args:
        row: mapping that provides 'id', 'lyric', 'origin' and 'translate'.
            'origin' and 'translate' are skipped when falsy.

    Returns:
        dict with keys:
            'main' - language reported for 'lyric';
            'orig' - language reported for 'origin', or the same value as
                     'main' when 'origin' is falsy;
            'ja', 'en', 'zh' - accumulated text per language;
            'id'   - copied from ``row['id']``.
    """
    d = {
        'main': '',
        'orig': '',
        'ja': '',
        'en': '',
        'zh': '',
        'id': row['id'],
    }

    def merge(parts):
        # Append each language's text to what has been collected so far;
        # strip() removes the leading newline left by an empty accumulator.
        for code in parts:
            d[code] = (d[code] + "\n" + parts[code]).strip()

    # Main
    d['main'], dmain = lang_det(row['lyric'], 'main')
    merge(dmain)

    # orig
    if row['origin']:
        d['orig'], dorig = lang_det(row['origin'], 'original')
        # Iterate the dict being merged (the original looped over dmain here,
        # which only worked because both dicts carry the same keys).
        merge(dorig)
    else:
        d['orig'] = d['main']

    # translate: guarded like 'origin' so a None value is skipped instead of
    # crashing inside lang_det; an empty string contributes nothing either way.
    if row['translate']:
        _, dtran = lang_det(row['translate'], 'translate')
        merge(dtran)

    return d
|
|
|
def _ask_operator(ly, lylang, mode):
    """Show the lyric, ask which language it is, and have its lines assigned per language."""
    print('\n'.join(ly))
    print()
    rlang = input("Lang for %s [%s]:" % (mode, ", ".join(lylang)))
    return rlang, lang_split(ly, lylang)


def lang_det(ly: str, mode: str):
    """Determine the language of a lyric and bucket its text per language.

    Every line is classified with ``lang``. Clear-cut lyrics are decided
    automatically; ambiguous ones are shown to the operator, who types the
    language and assigns line numbers to each detected language.

    A lyric is treated as ambiguous when either
      * it has both 'ja' and 'zh' lines, or
      * it has 'ja' or 'zh' lines AND at least one 'en' line, with the 'en'
        lines making up at least ``len(lines) // 3`` of the lyric.

    The "at least one 'en' line" condition matters for lyrics shorter than
    three lines: there ``len(lines) // 3`` is 0, so without it a pure
    Japanese or pure Chinese lyric would always trigger the prompt.

    Args:
        ly: the lyric text, lines separated by '\\n'.
        mode: label shown in the prompt (which field is being processed).

    Returns:
        ``(rlang, d)`` where *rlang* is the language of the lyric ('ja',
        'zh' or 'en' when decided automatically, otherwise whatever the
        operator typed, unvalidated) and *d* maps 'zh', 'ja' and 'en' to the
        text assigned to that language ('' when none).
    """
    ly = ly.split('\n')
    # Classify each line once; the Counter's keys are the detected languages.
    lycount = Counter(lang(i) for i in ly)
    lylang = set(lycount)
    d = {
        'zh': '',
        'ja': '',
        'en': '',
    }

    has_ja = 'ja' in lylang
    has_zh = 'zh' in lylang

    # No kana and no Han characters anywhere: the whole lyric is English.
    if not (has_ja or has_zh):
        d['en'] = '\n'.join(ly)
        return 'en', d

    mixed_cjk = has_ja and has_zh
    heavy_en = 'en' in lylang and lycount['en'] >= (len(ly) // 3)
    if mixed_cjk or heavy_en:
        rlang, lsplit = _ask_operator(ly, lylang, mode)
        for code in lsplit:
            d[code] = (d[code] + "\n" + lsplit[code]).strip()
        return rlang, d

    # Exactly one of ja/zh is present and English is marginal: keep the
    # lyric whole under that language.
    rlang = 'ja' if has_ja else 'zh'
    d[rlang] = '\n'.join(ly)
    return rlang, d
|
|
|
def lang_split(ly: list, lylang: set):
    """Interactively assign lyric lines to languages.

    Prints every line of *ly* prefixed with its 0-based index, then, for each
    language in *lylang*, asks for a whitespace-separated list of line
    numbers. Invalid input (a non-integer token or an index outside the
    lyric) is reported and the question is asked again instead of aborting
    the session. Negative numbers index from the end, as with any list.

    Args:
        ly: the lyric as a list of lines.
        lylang: the language codes to ask about.

    Returns:
        dict mapping each language in *lylang* to the chosen lines joined
        with '\\n', in the order the numbers were entered ('' when the
        answer was empty).
    """
    for numbered in enumerate(ly):
        print("%-3s %s" % numbered)

    d = dict()
    for code in lylang:
        while True:
            answer = input("Line number for [%s]:\n" % code)
            try:
                picked = [ly[int(token)] for token in answer.split()]
            except (ValueError, IndexError) as err:
                # A typo should not throw away the operator's progress.
                print("Invalid line numbers (%s), please try again." % err)
                continue
            break
        d[code] = "\n".join(picked)
    return d
|
|
|
# Resume point: index into `data` of the first row that still has to be
# processed. Read through a context manager so the handle is closed.
with open("start_id") as f:
    start_id = int(f.read())

# Order in which the record's fields are shown in the review table.
_FIELD_ORDER = ['id', 'main', 'orig', 'ja', 'zh', 'en']

for i, d in enumerate(data[start_id:], start_id):
    print("### Proc data", i)
    x = proc_row(d)

    # Show the finished record, one field per row, for visual review.
    t = SingleTable(sorted(x.items(), key=lambda a: _FIELD_ORDER.index(a[0])))
    t.inner_row_border = True
    print(t.table)
    input()  # wait for the operator to press Enter before saving

    # Persist the result BEFORE advancing the checkpoint: if the append
    # fails, the row is retried on the next run instead of being lost.
    with open("res", "a") as f:
        f.write(json.dumps(x) + "\n")
    with open("start_id", 'w') as f:
        f.write(str(i + 1))