Skip to content

Instantly share code, notes, and snippets.

@blueset blueset/lyricproc.py
Last active Jan 7, 2017

Embed
What would you like to do?
The Making of: Lyricova Screensaver v2
import json
import re
from terminaltables import SingleTable
from collections import Counter
# Load the exported posts once at start-up.
# The file is opened in a ``with`` block so the handle is closed promptly, and
# decoded explicitly as UTF-8 (the encoding JSON text is exchanged in) instead
# of relying on the platform default, which breaks on CJK lyrics on some hosts.
# ``f`` stays bound to the raw text, as it was before.
with open("GY_posts.json", encoding="utf-8") as posts_file:
    f = posts_file.read()
data = json.loads(f)
# Any kana character (hiragana or katakana, U+3041-U+30FF) marks text as
# Japanese; it is checked first because Japanese text also contains Han
# characters.
_KANA_RE = re.compile(r"[ぁ-ヿ]")
# Han ranges: CJK radicals, Extension A and the main unified ideograph block.
_HAN_RE = re.compile(r"[\u2E80-\u2FD5\u3400-\u4DBF\u4E00-\u9FCC]")


def lang(s):
    """Guess the language of *s* from the scripts it contains.

    Returns ``'ja'`` if *s* contains any kana, otherwise ``'zh'`` if it
    contains any Han character, otherwise ``'en'`` (this includes the empty
    string).

    ``search`` is used rather than ``match`` with a ``.*`` prefix so that the
    whole string is scanned: ``.`` does not match a newline, which made the
    old pattern ignore everything after the first line of a multi-line string.
    """
    if _KANA_RE.search(s):
        return 'ja'
    if _HAN_RE.search(s):
        return 'zh'
    return 'en'
def proc_row(row):
    """Classify one post and collect its lyric text per language.

    *row* is a mapping with the keys ``'id'``, ``'lyric'``, ``'origin'`` and
    ``'translate'``.

    Returns a dict with:

    * ``'id'``    - copied from ``row['id']``;
    * ``'main'``  - the language code ``lang_det`` reports for the lyric;
    * ``'orig'``  - the language code for ``row['origin']``, or the same as
      ``'main'`` when the post has no origin text;
    * ``'ja'``, ``'en'``, ``'zh'`` - newline-joined text gathered for each
      language from the lyric, origin and translation, in that order.
    """
    d = {
        'main': '',
        'orig': '',
        'ja': '',
        'en': '',
        'zh': '',
        'id': row['id'],
    }

    def merge(per_lang):
        # Append each language's text on a new line; strip() drops the
        # leading newline when the bucket was still empty.
        for code, text in per_lang.items():
            d[code] = (d[code] + "\n" + text).strip()

    # Main
    d['main'], dmain = lang_det(row['lyric'], 'main')
    merge(dmain)

    # orig
    if row['origin']:
        d['orig'], dorig = lang_det(row['origin'], 'original')
        # Iterate the origin result itself (the old loop walked ``dmain``
        # while reading from ``dorig``).
        merge(dorig)
    else:
        d['orig'] = d['main']

    # Translation: skipped when absent. An empty string contributed nothing
    # before either, and None would crash inside lang_det.
    if row['translate']:
        _, dtran = lang_det(row['translate'], 'translate')
        merge(dtran)
    return d
def lang_det(ly: str, mode: str):
    """Decide the language of a lyric and bucket its text per language.

    *ly* is the lyric text (lines separated by ``"\\n"``); *mode* is only used
    as a label in the interactive prompt.

    Returns ``(rlang, d)`` where *rlang* is the language code of the lyric and
    *d* maps ``'zh'``, ``'ja'`` and ``'en'`` to the text assigned to each
    (empty string when none).

    The lyric is resolved automatically when it is unambiguous:

    * only English lines          -> ``'en'``;
    * kana present, little English -> ``'ja'``;
    * Han present, little English  -> ``'zh'``.

    The operator is asked (language via ``input``, line assignment via
    ``lang_split``) when both Japanese and Chinese lines are present, or when
    English lines make up at least ``len(lines) // 3`` of a CJK lyric.
    """
    lines = ly.split('\n')
    # Classify every line once; the set of languages falls out of the counter.
    lycount = Counter(lang(line) for line in lines)
    lylang = set(lycount)
    d = {
        'zh': '',
        'ja': '',
        'en': '',
    }

    # str.split always yields at least one line, so one of these is present.
    if 'ja' in lylang:
        native = 'ja'
    elif 'zh' in lylang:
        native = 'zh'
    else:
        native = 'en'

    mixed_cjk = lylang.issuperset({'ja', 'zh'})
    # Require at least one English line: for lyrics shorter than three lines
    # ``len(lines) // 3`` is 0, and ``0 >= 0`` used to force a prompt even for
    # purely Japanese or purely Chinese text.
    heavy_english = (
        native != 'en'
        and lycount['en'] > 0
        and lycount['en'] >= len(lines) // 3
    )

    if mixed_cjk or heavy_english:
        print('\n'.join(lines))
        print()
        # sorted() keeps the prompt stable between runs (set order is not).
        rlang = input("Lang for %s [%s]:" % (mode, ", ".join(sorted(lylang))))
        for code, text in lang_split(lines, lylang).items():
            d[code] = text.strip()
        return rlang, d

    d[native] = '\n'.join(lines)
    return native, d
def lang_split(ly: list, lylang: set):
    """Ask the operator which lines of *ly* belong to each language.

    Prints every line of *ly* with its zero-based index, then for each code in
    *lylang* reads a whitespace-separated list of indices from ``input``.

    Returns a dict mapping each language code to the chosen lines joined with
    ``"\\n"`` (an empty answer yields an empty string).

    An answer that is not a list of valid indices is rejected and the question
    is asked again, rather than aborting the whole session with a traceback.
    """
    for numbered in enumerate(ly):
        print("%-3s %s" % numbered)
    d = {}
    for code in lylang:
        while True:
            answer = input("Line number for [%s]:\n" % code)
            try:
                picked = [ly[int(token)] for token in answer.split()]
            except (ValueError, IndexError):
                print("Invalid line numbers, please try again.")
            else:
                d[code] = "\n".join(picked)
                break
    return d
# Interactive driver: process posts one at a time, resuming from the index
# stored in the "start_id" checkpoint file.
try:
    with open("start_id") as checkpoint:
        start_id = int(checkpoint.read())
except FileNotFoundError:
    # No checkpoint yet (first run): start from the first post.
    start_id = 0

# Display order of the columns in the review table.
_COLUMN_ORDER = ['id', 'main', 'orig', 'ja', 'zh', 'en']

for i, d in enumerate(data[start_id:], start=start_id):
    print("### Proc data", i)
    x = proc_row(d)
    t = SingleTable(sorted(x.items(), key=lambda a: _COLUMN_ORDER.index(a[0])))
    t.inner_row_border = True
    print(t.table)
    # Pause so the operator can review the table before it is saved.
    input()
    # Append the result before advancing the checkpoint: if the run dies
    # between the two writes the row is redone on resume instead of lost.
    with open("res", "a") as f:
        f.write(json.dumps(x) + "\n")
    with open("start_id", 'w') as f:
        f.write(str(i + 1))
<?php
/**
 * Minimal plain-text key-value endpoint backed by a Flintstone store.
 *
 * - Request with a non-empty query string: the whole query string is used as
 *   the key and its stored value is echoed.
 * - Otherwise, if POST fields are present: every field name/value pair is
 *   written to the store.
 */
header("Content-type: text/plain");

require 'vendor/autoload.php';

use Flintstone\Flintstone;

// NOTE(review): the options argument is a two-element list, not a
// 'dir' => path map -- confirm Flintstone interprets it as intended.
// NOTE(review): writes are accepted from any POST without authentication --
// confirm this endpoint is not publicly reachable.
$db = new Flintstone('db', ['dir', 'db.flintstone']);

$lookupKey = $_SERVER['QUERY_STRING'];

if ($lookupKey != "") {
    // Read path: look up the key given in the query string.
    echo $db->get($lookupKey);
} elseif (!empty($_POST)) {
    // Write path: persist each posted field.
    foreach ($_POST as $field => $value) {
        $db->set($field, $value);
    }
}
#!/usr/local/bin/python3
import MeCab
import jieba
import pypinyin
import sys
import json
def lang_ja(text):
    """Split Japanese text into chunks paired with hiragana readings.

    *text* is split on ``"\\n"``; each line is tokenised with MeCab and the
    tokens are merged into chunks. Returns a list with one entry per line
    that produced tokens (lines that yield no tokens are skipped); each entry
    is a list of ``[surface, reading]`` pairs.

    A new chunk is started at a token when the score MeCab reports for it
    (third output field) is more than 1000 lower than the previous token's
    score and the token is not pure punctuation; otherwise the token is
    appended to the current chunk.
    """
    # NOTE(review): the character literals that bounded these ranges were
    # missing from the file (``ord('')`` raises TypeError). They are
    # reconstructed here as code points -- confirm against the original:
    #   katakana U+30A1 (small a) .. U+30F6 (small ke) maps onto
    #   hiragana U+3041 .. U+3096 by a fixed offset;
    #   kana as a whole spans U+3041 .. U+30FF.
    kata_first, kata_last = 0x30A1, 0x30F6
    kata_to_hira = kata_first - 0x3041
    kana_first, kana_last = 0x3041, 0x30FF

    def hira(w):
        # Convert katakana to hiragana; leave every other character untouched.
        return ''.join(
            chr(ord(ch) - kata_to_hira)
            if kata_first <= ord(ch) <= kata_last else ch
            for ch in w
        )

    def not_punc(w):
        # True when w has at least one letter, kana or Han character.
        return any(
            ch.isalpha()
            or kana_first <= ord(ch) <= kana_last
            or 0x2E80 <= ord(ch) <= 0x2FD5
            or 0x3400 <= ord(ch) <= 0x4DBF
            or 0x4E00 <= ord(ch) <= 0x9FCC
            for ch in w
        )

    # Each node is emitted as "surface<ZWNJ>feature[7]<ZWNJ>score" on its own
    # line; unknown words repeat the surface in place of feature 7.
    # NOTE(review): feature 7 is presumably the reading for the dictionary in
    # use -- confirm.
    t = MeCab.Tagger('--node-format=%M\u200C%f[7]\u200C%pA\\n --unk-format=%M\u200C%M\u200C%pA\\n --eos-format= -l3')

    res = []
    for line in text.split("\n"):
        tokens = [i.split('\u200C') for i in t.parse(line).split('\n') if i]
        if not tokens:  # EOL
            continue

        ln = [["", ""]]
        last_score = None
        for tok in tokens:
            surface, yomi, score = tok[0], tok[1], float(tok[2])
            # The first token always goes into the initial chunk.
            if (last_score is not None
                    and last_score - score - 1000 > 0
                    and not_punc(surface)):
                ln.append(["", ""])
            last_score = score
            ln[-1][0] += surface
            # "*" means no reading is available: fall back to the surface.
            ln[-1][1] += hira(surface if yomi == "*" else yomi)

        if ln[-1] == ['', '']:
            ln.pop(-1)
        res.append(ln)
    return res
def lang_zh(text):
    """Split Chinese text into words paired with their pinyin.

    *text* is split on ``"\\n"``. Returns one list per line; each list holds
    ``[word, pinyin]`` pairs for the words jieba cuts from that line, where
    *pinyin* is the first reading of every character joined with ``"'"``.
    """
    res = []
    for line in text.split('\n'):
        ln = []
        for word in jieba.cut(line):
            # style=0: presumably pypinyin's plain, tone-less style -- confirm.
            readings = pypinyin.pinyin(word, style=0)
            syllables = [candidates[0] for candidates in readings]
            ln.append([word, "'".join(syllables)])
        res.append(ln)
    return res
# Command-line entry point: ``<script> (zh|ja) text`` prints the segmentation
# of *text* as JSON on stdout.
if __name__ == '__main__':
    if len(sys.argv) < 3 or sys.argv[1] not in ('zh', 'ja'):
        print("Usage: %s (zh|ja) text" % sys.argv[0])
        # sys.exit instead of the site-provided exit() helper, with a
        # non-zero status so callers can tell a usage error from success.
        sys.exit(1)
    if sys.argv[1] == 'zh':
        jieba.initialize()
        print(json.dumps(lang_zh(sys.argv[2])))
    elif sys.argv[1] == 'ja':
        print(json.dumps(lang_ja(sys.argv[2])))
else:
    # Imported as a module: load jieba's dictionary once, up front.
    jieba.initialize()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.