Skip to content

Instantly share code, notes, and snippets.

@tsudoko
Last active November 20, 2020 05:32
Show Gist options
  • Save tsudoko/1d3e947aae6a909110b393ebdd8d61c2 to your computer and use it in GitHub Desktop.
Save tsudoko/1d3e947aae6a909110b393ebdd8d61c2 to your computer and use it in GitHub Desktop.
Official kanji list scrapers
#!/usr/bin/env python3
import sys
from pdfminer import layout
import utils
# ref: https://web.archive.org/web/20170708205442/http://www.mext.go.jp:80/component/a_menu/education/micro_detail/__icsFiles/afieldfile/2017/05/12/1384661_4_2.pdf
# Debug aid: for any glyph whose text occurs in this string, pdf_process
# prints the glyph, its bounding box and the result of each bounds comparison.
# Empty by default.
debug_chars = ""
def contains(haystack, needle):
    """Return whether the text *needle* occurs in *haystack*.

    When *haystack* is ``bytes`` the needle is encoded with ``str.encode()``
    before the membership test; otherwise it is tested as given.
    """
    probe = needle.encode() if isinstance(haystack, bytes) else needle
    return probe in haystack
def pdf_process(page):
    """Collect the characters on *page*, grouped by grade index.

    Only ``LTChar`` objects whose font name contains "KyoikuKanji" are
    considered. A glyph is assigned to a region when its bounding box lies
    within the region's x-range (140..510) and y-limits.

    On every page except page id 32 there are six regions, indexed 0..5 in
    the order listed below. On page id 32 a single region spanning y 999..0
    is used and everything in it is filed under index 5.

    Returns a dict mapping region index to the concatenated glyph text, in
    page iteration order. Indices with no glyphs are absent.
    """
    left, right = 140, 510
    if page.pageid == 32:
        regions = [(5, (left, 999, right, 0))]
    else:
        # (upper y limit, lower y limit) of each region on the page.
        row_limits = (
            (720, 690),
            (640, 550),
            (520, 400),
            (380, 270),
            (240, 130),
            (100, 90),
        )
        regions = [
            (index, (left, upper, right, lower))
            for index, (upper, lower) in enumerate(row_limits)
        ]

    by_grade = {}
    for glyph in page:
        if not isinstance(glyph, layout.LTChar):
            continue
        if not contains(glyph.fontname, "KyoikuKanji"):
            continue
        gx1, gy1, gx2, gy2 = glyph.bbox
        for index, (bx1, by1, bx2, by2) in regions:
            past_left = gx1 < bx1
            past_upper = gy1 > by1
            past_right = gx2 > bx2
            past_lower = gy2 < by2
            if glyph.get_text() in debug_chars:
                # Trace output for glyphs listed in debug_chars.
                print(glyph.get_text())
                print(glyph)
                print(glyph.bbox)
                print(past_left)
                print(past_upper)
                print(past_right)
                print(past_lower)
            if past_left or past_upper or past_right or past_lower:
                continue
            by_grade[index] = by_grade.get(index, "") + glyph.get_text()
    return by_grade
def pdf(f):
    """Extract the per-grade character strings from the PDF open as *f*.

    Only pages whose ``pageid`` is 31 or 32 are read. The result of
    ``pdf_process`` for page 31 becomes the working table; the characters
    ``pdf_process`` files under index 5 for page 32 are appended to index 5.

    Returns a list of strings where element ``i`` holds the characters for
    index ``i``. An index for which nothing was found yields an empty string
    instead of raising ``KeyError``; an empty list is returned when no
    characters were found at all.
    """
    kanji = {}
    for p in utils.pages(f):
        if p.pageid < 31 or p.pageid > 32:
            continue
        found = pdf_process(p)
        if p.pageid != 32:
            kanji = found
        else:
            # .get() on both sides: either page may have produced nothing
            # for index 5, which previously raised KeyError.
            kanji[5] = kanji.get(5, "") + found.get(5, "")
    # Diagnostic dump goes to stderr so that stdout carries only the
    # tab-separated result printed by the script entry point.
    print(kanji, file=sys.stderr)
    if not kanji:
        return []
    # Walk up to the highest index present so that gaps become "" rather
    # than a KeyError.
    return [kanji.get(i, "") for i in range(max(kanji) + 1)]
def pdf_ok(out):
    """Check that *out* holds the expected number of characters per grade.

    *out* is indexed 0..5; element ``i`` must have the length listed below.

    Raises:
        AssertionError: a length differs; the message names the grade
            (1-based) together with the expected and actual counts.
        IndexError: *out* has fewer than six elements.
    """
    expected_counts = (80, 160, 200, 202, 193, 191)
    for index, expected in enumerate(expected_counts):
        actual = len(out[index])
        assert actual == expected, (
            f"grade {index + 1}: expected {expected} kanji, got {actual}"
        )
if __name__ == "__main__":
    # Usage: script.py <path-to-pdf>
    # Prints one line per grade: 1-based grade number, a tab, its characters.
    with open(sys.argv[1], "rb") as pdf_file:
        kanji = pdf(pdf_file)
    for grade, chars in enumerate(kanji, start=1):
        print(f"{grade}\t{chars}")
#!/usr/bin/env python3
import collections
import math
import sys
from pdfminer import layout
import utils
# ref: https://web.archive.org/web/20180829192608/http://www.kanken.or.jp/kanken/outline/data/outline_degree_national_list.pdf
# Debug aid: for any glyph whose text occurs in this string, pdf_process
# prints the glyph text, its bounding box and the result of each bounds
# comparison. Empty by default.
debug_chars = ''
def pdf_process(page):
    """Extract the level label and the table characters from one page.

    The page's ``LTChar`` objects are scanned three times:

    1. to pick the horizontal limits of the table from the first '〈' or
       '※' glyph found (a wider default applies when neither occurs);
    2. to assemble the level label from glyphs whose ``bbox[1]`` floors to
       782, stopping once the label ends in '級';
    3. to collect every glyph whose bounding box lies inside the table
       limits, skipping '〔', '〕' and text already collected on this page.

    Returns a ``(level, chars)`` tuple of strings. ``level`` is empty when
    no glyph sits on the label line.
    """
    # Filter once; all three passes look only at LTChar objects.
    glyphs = [o for o in page if isinstance(o, layout.LTChar)]

    # tables on continued pages (4級 その2 etc.) are wider
    bx1, bx2 = 60, 525
    # The vertical limits are the same for every page variant.
    by1, by2 = 741, 84
    for glyph in glyphs:
        marker = glyph.get_text()
        if marker == '〈':
            # regular page
            bx1, bx2 = 49, 488
            break
        if marker == '※':
            # table on the last page is thinner
            bx1, bx2 = 105, 528
            break

    level = ""
    for glyph in glyphs:
        if math.floor(glyph.bbox[1]) == 782:
            level += glyph.get_text()
        if level.endswith('級'):
            break

    chars = ""
    for glyph in glyphs:
        x1, y1, x2, y2 = glyph.bbox
        char = glyph.get_text()
        if char in debug_chars:
            # Trace output for glyphs listed in debug_chars.
            print(char)
            print(glyph.bbox)
            print(x1 < bx1)
            print(y1 > by1)
            print(x2 > bx2)
            print(y2 < by2)
        if x1 < bx1 or y1 > by1 or x2 > bx2 or y2 < by2:
            continue
        # Drop the bracket glyphs and anything already seen on this page.
        if char in "〔〕" or char in chars:
            continue
        chars += char
    return level, chars
def pdf(f):
    """Extract the characters of every level from the PDF open as *f*.

    Each page is passed to ``pdf_process``; pages that yield an empty level
    label are ignored. Characters from pages sharing a label are concatenated
    in page order.

    Returns an ``OrderedDict`` mapping level label to its characters, keyed
    in order of first appearance.
    """
    by_level = collections.OrderedDict()
    for page in utils.pages(f):
        level, chars = pdf_process(page)
        if level:
            by_level[level] = by_level.get(level, '') + chars
    return by_level
def pdf_ok(out):
    """Check that *out* holds the expected number of characters per level.

    *out* maps level label to a string of characters; each label listed
    below must be present with exactly the given length.

    Raises:
        AssertionError: a length differs; the message names the level
            together with the expected and actual counts.
        KeyError: a level label is missing from *out*.
    """
    expected_counts = (
        ('10級', 80),
        ('9級', 160),
        ('8級', 200),
        ('7級', 200),
        ('6級', 185),
        ('5級', 181),
        ('4級', 316),
        ('3級', 285),
        ('準2級', 333),
        ('2級', 196),
    )
    for level, expected in expected_counts:
        actual = len(out[level])
        assert actual == expected, (
            f"{level}: expected {expected} kanji, got {actual}"
        )
if __name__ == "__main__":
    # Usage: script.py <path-to-pdf>
    # Prints one line per level: the level label, a tab, its characters.
    with open(sys.argv[1], "rb") as pdf_file:
        kanken = pdf(pdf_file)
    for level, chars in kanken.items():
        print(f"{level}\t{chars}")
from pdfminer import pdfdocument, pdfparser, pdfinterp, pdfpage, converter, layout
def pages(f):
    """Yield the aggregator result for each page of the PDF open as *f*.

    A single resource manager, ``PDFPageAggregator`` device and page
    interpreter are shared across all pages. Each page is interpreted and
    its ``get_result()`` value is yielded before the next page is processed.
    The aggregator is created without layout parameters.
    """
    resources = pdfinterp.PDFResourceManager()
    aggregator = converter.PDFPageAggregator(resources)
    interpreter = pdfinterp.PDFPageInterpreter(resources, aggregator)
    for raw_page in pdfpage.PDFPage.get_pages(f):
        interpreter.process_page(raw_page)
        yield aggregator.get_result()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment