Skip to content

Instantly share code, notes, and snippets.

@knzm
Created May 8, 2011 14:29
Show Gist options
  • Save knzm/961403 to your computer and use it in GitHub Desktop.
Save knzm/961403 to your computer and use it in GitHub Desktop.
Download the COMITIA (http://www.comitia.co.jp/) circle list.
# -*- coding: utf-8 -*-
import logging
import urllib
import lxml.html
# Module-wide logger handle; stays None until init_log() assigns it.
log = None
# Page that main() downloads and parses for the circle list
# (presumably event no. 96, judging by the file name -- confirm).
list_url = "http://www.comitia.co.jp/history/96list.html"
def init_log():
    """Bind the module-level ``log`` to the root logger, once.

    Calls made after ``log`` has been set leave it untouched.
    """
    global log
    if log is not None:
        return
    log = logging.getLogger()
def fetch(url):
    """Return the raw body found at *url*.

    The handle returned by ``urllib.urlopen`` is closed whether or not the
    read succeeds.
    """
    response = urllib.urlopen(url)
    try:
        body = response.read()
    finally:
        response.close()
    return body
def read_dict(f):
    """Parse a Shift-JIS, tab-separated reading dictionary.

    Every non-blank line must hold at least two tab-separated fields: the
    reading first, the name second.  Any further fields are ignored.

    :param f: iterable of byte strings, e.g. an open file.
    :returns: list of ``{"read": ..., "name": ...}`` dicts, in input order.
    :raises ValueError: if a non-blank line has fewer than two fields.
    :raises UnicodeDecodeError: if a line is not valid Shift-JIS.
    """
    entries = []
    for line in f:
        # Drop the line terminator before splitting: on a two-column line it
        # would otherwise stay glued to the name and defeat every lookup.
        text = line.decode('shift-jis').rstrip(u"\r\n")
        if not text:
            # Blank lines (e.g. a trailing newline at end of file) carry no
            # entry; skipping them avoids an unpacking error below.
            continue
        read, name = text.split(u"\t")[:2]
        entries.append({"read": read, "name": name})
    return entries
# Voiced and semi-voiced hiragana (plus katakana VU), paired position by
# position with the plain kana they are filed under.
_INITIAL_FOLD = dict(zip(
    u"がぎぐげござじずぜぞだぢづでどばびぶべぼぱぴぷぺぽヴ",
    u"かきくけこさしすせそたちつてとはひふへほはひふへほう",
))


def normalize_initial(ch):
    """Fold a kana initial onto its unvoiced base character.

    Voiced (e.g. ``が``) and semi-voiced (e.g. ``ぱ``) hiragana map to the
    plain kana (``か``, ``は``); katakana ``ヴ`` maps to ``う``.  Anything
    else -- including the empty string and multi-character strings -- is
    returned unchanged.

    :param ch: the initial to normalize, normally a single character.
    :returns: the normalized initial.
    """
    # An exact-key lookup, unlike a substring test against the kana string,
    # cannot be satisfied by "" or by a run of several kana.
    return _INITIAL_FOLD.get(ch, ch)
# (source, replacement) pairs applied in order before the second lookup.
# Written as escapes so the exact code points survive any re-encoding.
# NOTE(review): in the listing as received, the replacements for
# * < > ! ? _ mapped each character to itself (no-ops).  The fullwidth
# sources below assume those were fullwidth forms folded to ASCII somewhere
# along the way -- confirm against names on the live page.
_NAME_FOLDS = (
    (u"~", u"\u301c"),       # ASCII tilde -> WAVE DASH (kept as received)
    (u"\uff5e", u"\u301c"),  # FULLWIDTH TILDE -> WAVE DASH
    (u"\uff0a", u"*"),       # FULLWIDTH ASTERISK
    (u"\u201d", u'"'),       # RIGHT DOUBLE QUOTATION MARK
    (u"\uff1c", u"<"),       # FULLWIDTH LESS-THAN SIGN
    (u"\uff1e", u">"),       # FULLWIDTH GREATER-THAN SIGN
    (u"\uff01", u"!"),       # FULLWIDTH EXCLAMATION MARK
    (u"\uff1f", u"?"),       # FULLWIDTH QUESTION MARK
    (u"\uff3f", u"_"),       # FULLWIDTH LOW LINE
)


def find_read(name, read_from_name):
    """Look up the reading recorded for *name*.

    The name is tried verbatim first.  If that misses, punctuation variants
    listed in ``_NAME_FOLDS`` are replaced and the lookup is retried.

    :param name: circle name to look up.
    :param read_from_name: mapping of name -> reading.
    :returns: the reading, or ``None`` if neither form is a key.
    """
    try:
        return read_from_name[name]
    except KeyError:
        pass
    folded = name
    for source, replacement in _NAME_FOLDS:
        folded = folded.replace(source, replacement)
    return read_from_name.get(folded)
def main(dict_=None):
    """Download the circle list and print it as tab-separated UTF-8 lines.

    One line is printed per circle with the columns
    ``initial, position, name, reading, url``.  Lines are ordered by reading;
    circles for which no reading is known come last.

    :param dict_: optional iterable of ``{"read": ..., "name": ...}`` dicts
        (the shape ``read_dict`` returns) used to attach a reading to each
        circle name.  Duplicate names and entries that match no circle are
        reported as warnings.
    """
    init_log()
    content = fetch(list_url)
    doc = lxml.html.fromstring(content)

    # name -> reading; the first entry for a name wins.
    read_from_name = {}
    for entry in dict_ or ():
        name = entry["name"]
        if name in read_from_name:
            log.warning(u"Duplicated: %s", name)
            continue
        read_from_name[name] = entry["read"]

    # reading -> names not matched yet.  Grouping keeps every name even when
    # several share one reading.  A matched reading still clears its whole
    # group, because find_read() does not say which name matched.
    unused = {}
    for name, read in read_from_name.items():
        unused.setdefault(read, []).append(name)

    d = {}
    nodes = doc.cssselect("#circle_list_container div.circle_list")
    # The first matching node is skipped (presumably a header row -- confirm).
    for div in nodes[1:]:
        text = div.text_content()
        fields = text.split(None, 1)
        if len(fields) != 2:
            # No name after the position: skip this entry rather than abort
            # the whole run on an unpacking error.
            log.warning(u"Skipped malformed entry: %r", text)
            continue
        pos, name = fields
        # split(None, 1) leaves trailing whitespace on the remainder, which
        # would break both the dictionary lookup and the output columns.
        name = name.rstrip()
        info = {"name": name}
        d[pos] = info

        anchors = div.cssselect("a")
        if anchors:
            href = anchors[0].attrib.get("href")
            if href is not None:
                # An <a> without href must not end up printed as "None".
                info["url"] = href

        read = find_read(name, read_from_name)
        if read:
            info["read"] = read
            info["initial"] = normalize_initial(read[:1])
            unused.pop(read, None)

    for read, names in sorted(unused.items()):
        for name in sorted(names):
            log.warning(u"Unused: %s (%s)", name, read)

    def order_by_area(pair):
        # Alternative sort key, currently not selected below: Latin-letter
        # areas first, then hiragana areas, then everything else, each
        # ordered by position.
        pos, value = pair
        area = pos[0]
        if u"A" <= area <= u"Z":
            area_code = 0
        elif u"あ" <= area <= u"ん":
            area_code = 1
        else:
            area_code = 2
        return (area_code, pos)

    def order_by_name(pair):
        # Entries lacking an initial sort after all others (True > False),
        # then by reading, then by name.
        pos, value = pair
        initial = value.get("initial", "")
        read = value.get("read", "")
        name = value.get("name", "")
        return (initial == "", read, name)

    for pos, value in sorted(d.items(), key=order_by_name):
        line = u"%s\t%s\t%s\t%s\t%s" % (
            value.get("initial", ""),
            pos,
            value["name"],
            value.get("read", ""),
            value.get("url", ""),
        )
        print(line.encode('utf-8'))
if __name__ == '__main__':
    import sys

    logging.basicConfig(
        level=logging.INFO,
        datefmt="%H:%M:%S",
        format="%(asctime)s %(levelname)s [%(name)s] %(message)s",
    )

    # The dictionary file is optional: with no path argument main() runs
    # without any name -> reading entries.
    dict_ = None
    if sys.argv[1:]:
        dict_file = open(sys.argv[1])
        try:
            dict_ = read_dict(dict_file)
        finally:
            dict_file.close()
    main(dict_)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment