Skip to content

Instantly share code, notes, and snippets.

@yubessy
Created June 29, 2014 02:33
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save yubessy/83750001019a8a2731be to your computer and use it in GitHub Desktop.
Save yubessy/83750001019a8a2731be to your computer and use it in GitHub Desktop.
Wikipediaの各年ページの日付をパース
# -*- coding: utf-8 -*-
# stdlib
import re
import datetime
# thirdlib
from dateutil.parser import parse
# English month names in calendar order; they also form the alternation used
# by MONTH_DAY_RE.
MONTHS = (
    "January",
    "February",
    "March",
    "April",
    "May",
    "June",
    "July",
    "August",
    "September",
    "October",
    "November",
    "December",
)

# Matches "<Month> <day>" with a day of 1-31 (an optional leading zero is
# accepted for single-digit days). Group 1 is the month name; group 0 is the
# whole "Month D" expression.
#   * \b keeps the month from matching in the middle of a longer word.
#   * (?!\d) rejects a day that is really the prefix of a longer number, so
#     "May 1945" is not misread as "May 19".
MONTH_DAY_RE = re.compile(
    r"\b({0}) (?:[12][0-9]|3[01]|0?[1-9])(?!\d)".format("|".join(MONTHS))
)
# Placeholder date handed to the parser as its default. Per the original
# author's note, strftime does not support years before 1900, so the date is
# parsed with an arbitrary leap year and the real year is substituted
# afterwards.
DEFAULT_DATE = datetime.datetime(year=2000, month=1, day=1)
def get_first_date(text, year):
    """Return the first "Month D" date in ``text`` as a ``YEAR-MM-DD`` string.

    The month and day are taken from the first ``MONTH_DAY_RE`` match in
    ``text``; the year part of the result is ``year`` inserted as given
    (it is not zero-padded or validated).

    Args:
        text: String to scan for a date expression such as ``"March 5"``.
        year: Year to place in the result.

    Returns:
        A string such as ``"1945-05-08"``, or ``None`` when ``text`` is empty
        or ``None``, contains no month/day expression, or the matched
        expression cannot be parsed as a date.
    """
    # Empty input and "no match" are expected outcomes; handle them directly
    # instead of relying on a caught AttributeError.
    if not text:
        return None
    match = MONTH_DAY_RE.search(text)
    if match is None:
        return None

    try:
        # The matched text has no year, so DEFAULT_DATE fills in the
        # missing fields.
        dt = parse(match.group(0), default=DEFAULT_DATE)
    except (ValueError, OverflowError):
        # The parser rejected the expression; report "no date" to the caller.
        return None

    # Format from the numeric fields so the caller's year can be used
    # directly, without going through strftime.
    return "{0}-{1:02d}-{2:02d}".format(year, dt.month, dt.day)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment