Created
June 29, 2014 02:33
-
-
Save yubessy/83750001019a8a2731be to your computer and use it in GitHub Desktop.
Wikipediaの各年ページの日付をパース
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
# stdlib | |
import re | |
import datetime | |
# thirdlib | |
from dateutil.parser import parse | |
# English month names in calendar order, exactly as they are spelled in the
# text being scanned (matching is case-sensitive).
MONTHS = (
    "January",
    "February",
    "March",
    "April",
    "May",
    "June",
    "July",
    "August",
    "September",
    "October",
    "November",
    "December",
)

# Matches "<Month> <day>" where <day> is 1-31.
#   * The leading \b keeps a month name from matching as the tail of a
#     longer word.
#   * The day alternatives only admit 1-31, so "May 0" or "May 35" are not
#     treated as dates.
#   * The trailing (?!\d) rejects a day that is really the start of a longer
#     number, so "January 2014" is not misread as "January 20".
# Group 0 is the whole "<Month> <day>" text; group 1 is the month name.
MONTH_DAY_RE = re.compile(
    r"\b({0}) {1}(?!\d)".format(
        "|".join(MONTHS),
        r"(?:3[01]|[12][0-9]|[1-9])",
    )
)

# strftime does not support years before 1900, so dates are parsed against an
# arbitrary leap year (a leap year so that "February 29" is accepted) and the
# real year is substituted into the result afterwards.
DEFAULT_DATE = datetime.datetime(2000, 1, 1)
def get_first_date(text, year):
    """Return the first "<Month> <day>" date in ``text`` as ``<year>-MM-DD``.

    The first match of ``MONTH_DAY_RE`` in ``text`` is parsed into a month
    and a day, and ``year`` is then placed in front of the zero-padded
    month and day.

    Args:
        text: Text to scan for a date.
        year: Year to put in the result; it is inserted via ``str(year)``
            without any padding or validation.

    Returns:
        A string such as ``"1850-03-05"``, or ``None`` when ``text`` is not
        a string, contains no match, or the matched text cannot be parsed
        into a date.
    """
    try:
        match = MONTH_DAY_RE.search(text)
    except TypeError:
        # Non-string input (e.g. None) cannot contain a date.
        return None
    if match is None:
        return None

    try:
        # The matched text has no year, so the parser fills the missing
        # fields from DEFAULT_DATE.
        dt = parse(match.group(0), default=DEFAULT_DATE)
    except (ValueError, OverflowError):
        # The parser could not turn the matched text into a date.
        return None

    # Only month and day are formatted with strftime; the caller's year is
    # joined on as plain text so that it never passes through strftime.
    return str(year) + dt.strftime("-%m-%d")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment