Skip to content

Instantly share code, notes, and snippets.

@lu911
Last active December 17, 2015 03:38
Show Gist options
  • Save lu911/5544443 to your computer and use it in GitHub Desktop.
Save lu911/5544443 to your computer and use it in GitHub Desktop.
identify_timezone
#-*-coding:utf8-*-
from study.models import *
from BeautifulSoup import BeautifulSoup
import re,datetime
# Substrings that disqualify a candidate match: get_time() joins the captured
# groups of a match into one string and rejects it if any of these occurs in
# it.  Glosses: 명 = counter for people, 문장 = sentence, 시간 = hour(s) as a
# duration, 세 = years of age, 일 = day, 월 = month.
# NOTE(review): because u'시간' is listed here, a match whose marker was
# captured as 저녁시간/아침시간 is rejected as well -- confirm that is intended.
exclude_words = [u'1:1', u'명', u'문장', u'시간', u'PT', u'세', u'일', u'월']
# Patterns used to locate a meeting time in free text.  get_time() tries them
# in list order (with re.I) and stops at the first pattern that yields an
# accepted match, so the order of the entries matters.
# Vocabulary used in the patterns: 저녁 = evening, 아침 = morning,
# 오전 = a.m., 오후 = p.m., 시 = o'clock, 분 = minute, 반 = half past,
# 부터 = from, 또는 = or.
# Entries 1-5 describe a range: two times joined by -, –, ~, 부터 or 또는.
# Entry 1: range whose end time must carry 시 or ':'; every day-part marker is
# optional.  The marker slot after the start time also captures 명 so that
# head counts get captured (and then rejected through exclude_words).
time_regex = [ur'((?:저녁|아침)(?:시간)?|AM|PM|오(?:전|후))?\s*([0-2]?\d)\s*([시:])?\s*((?:[0-5]?\d분?|반))?\s*((?:저녁|아침)(?:시간)?|명|AM|PM|오(?:전|후))?\s*((?:-|–|~|부터|또는))\s*((?:저녁|아침)(?:시간?)?|AM|PM|오(?:전|후))?\s*([0-2]?\d)\s*([시:])\s*((?:[0-5]?\d분?|반))?\s*((?:저녁|아침)(?:시간)?|AM|PM|오(?:전|후))?',
# Entry 2: range whose start time is preceded by a mandatory Korean day-part
# marker; the marker before the end time is optional.
ur'((?:저녁|아침)(?:시간)?|오(?:전|후))\s*([0-2]?\d)\s*([시:])?\s*((?:[0-5]?\d분?|반))?\s*((?:-|–|~|부터|또는))\s*((?:저녁|아침)(?:시간?)?|오(?:전|후))?\s*([0-2]?\d)\s*([시:])?\s*((?:[0-5]?\d분?|반))?',
# Entry 3: mirror of entry 2 -- the marker is optional before the start time
# and mandatory before the end time.
ur'((?:저녁|아침)(?:시간)?|오(?:전|후))?\s*([0-2]?\d)\s*([시:])?\s*((?:[0-5]?\d분?|반))?\s*((?:-|–|~|부터|또는))\s*((?:저녁|아침)(?:시간?)?|오(?:전|후))\s*([0-2]?\d)\s*([시:])?\s*((?:[0-5]?\d분?|반))?',
# Entry 4: range with a mandatory AM/PM suffix on the start time and an
# optional one on the end time.
ur'([0-2]?\d)\s*([시:])?\s*((?:[0-5]?\d분?|반))?\s*(AM|PM)\s*((?:-|–|~|부터|또는))\s*([0-2]?\d)\s*([시:])?\s*((?:[0-5]?\d분?|반))?\s*(AM|PM)?',
# Entry 5: mirror of entry 4 -- the AM/PM suffix is optional on the start time
# and mandatory on the end time.
ur'([0-2]?\d)\s*([시:])?\s*((?:[0-5]?\d분?|반))?\s*(AM|PM)?\s*((?:-|–|~|부터|또는))\s*([0-2]?\d)\s*([시:])?\s*((?:[0-5]?\d분?|반))?\s*(AM|PM)',
# Entry 6: single time with a minute part: hour, then 시 or ':', then either
# digits followed by 분 or the word 반; markers on either side are optional.
ur'((?:저녁|아침)(?:시간)?|AM|PM|오(?:전|후))?\s*([0-2]?\d)\s*([시:]\s*(?:[0-5]?\d분|반))\s*((?:저녁|아침)(?:시간)?|AM|PM|오(?:전|후))?',
# Entry 7: single hour written as digits directly followed by 시 (optionally
# 시간, which exclude_words then rejects); markers on either side are optional.
ur'((?:저녁|아침)(?:시간)?|AM|PM|오(?:전|후))?\s*([0-2]?\d시간?)\s*((?:저녁|아침)(?:시간)?|AM|PM|오(?:전|후))?']
def get_clean_content(study):
    """Return the text of ``study.content`` with markup and phone lines removed.

    The HTML in ``study.content`` is processed in these steps:

    1. ``<br>`` variants become newlines so line structure survives.
    2. The remaining tags are stripped with BeautifulSoup.
    3. ``&nbsp;`` entities are dropped and ``&#8211;`` (en dash) becomes ``-``.
    4. Every line that consists solely of a Korean phone number is blanked.

    :param study: object exposing the post body as the ``content`` attribute.
    :returns: the cleaned text.
    """
    html = re.sub(r'<br\s*(>)*(/>)*', '\n', study.content)
    text = BeautifulSoup(html).text
    # Drop the whole entity, including its ';' (the previous plain replace of
    # '&nbsp' left a stray ';' behind that broke up otherwise matchable times).
    text = re.sub(r'&nbsp;?', '', text)
    text = text.replace('&#8211;', '-')
    # ``flags`` must be passed by keyword: the fourth positional argument of
    # re.sub is ``count``, so a positional re.M silently meant "count=8" and
    # '^'/'$' never matched at line boundaries.
    return re.sub(
        r'^(?:01[016789]{1}|02|0[3-9]{1}[0-9]{1})[-.]?[0-9]{3,4}[-.\s]?[0-9]{4}$',
        '', text, flags=re.M)
def get_study_list(write_time=None):
    """Return the ``Study`` rows whose ``write_time`` equals *write_time*.

    :param write_time: value to match ``write_time`` against.  Defaults to
        ``datetime.datetime(2013, 5, 3)``, the date that used to be
        hard-coded here, so existing zero-argument callers are unaffected.
    :returns: the result of ``Study.objects.filter(write_time=...)``.
    """
    if write_time is None:
        # Written as 3 rather than 03: a leading zero is an octal literal on
        # Python 2 and a syntax error on Python 3.
        write_time = datetime.datetime(year=2013, month=5, day=3)
    return Study.objects.filter(write_time=write_time)
# Day-part markers, compared against upper-cased text so that "am"/"Pm" etc.
# are recognised too (the time patterns are matched case-insensitively).
_AM_MARKERS = (u'오전', u'AM', u'아침')
_PM_MARKERS = (u'오후', u'PM', u'저녁')
# Tokens that join the start and end of a time range.
_RANGE_SEPARATOR = re.compile(u'-|–|~|부터|또는')


def _has_marker(text, markers):
    """Return True if any of *markers* occurs in *text*."""
    return any(marker in text for marker in markers)


def identify_time_zone(time):
    """Classify a matched time expression by part of day.

    The first number in *time* is taken as the hour.  A day-part marker
    attached to the start time (the text before the first range separator)
    decides how that hour is read:

    * an AM marker (오전, 아침, AM) -> ``'ante meridiem'``;
    * a PM marker (오후, 저녁, PM) -> the hour is read on the 12-hour clock,
      so "PM 9" means 21.

    When the start time carries no marker, an AM marker anywhere else in the
    text still selects ``'ante meridiem'``; otherwise the bare hour is
    classified by range.

    :param time: text of the matched time, e.g. ``u'오후 2시~4시'``.
    :returns: ``'ante meridiem'`` (8-11), ``'post meridiem'`` (0-5 and
        12-17), ``'evening'`` (6-7 and 18-23) or ``'etc'`` (no hour found, or
        an hour of 24 and above).
    """
    compact = re.sub(r'\s+', '', time).upper()
    found = re.search(r'[0-2]?[0-9]', compact)
    if found is None:
        # No hour at all; previously this reached int([]) and raised TypeError.
        return 'etc'
    hour = int(found.group())

    # Only a marker on the start time may reinterpret the start hour;
    # in u'9시~오후1시' the 오후 belongs to the end of the range.
    separator = _RANGE_SEPARATOR.search(compact)
    start = compact[:separator.start()] if separator else compact

    if _has_marker(start, _AM_MARKERS):
        return 'ante meridiem'
    if _has_marker(start, _PM_MARKERS):
        if hour < 12:
            hour += 12
    elif _has_marker(compact, _AM_MARKERS):
        # Marker only on the range end, e.g. "9~11AM".
        return 'ante meridiem'

    if 8 <= hour < 12:
        return 'ante meridiem'
    if hour < 6 or 12 <= hour < 18:
        # Unmarked 0-5 is treated as afternoon, as before.
        return 'post meridiem'
    if hour < 24:
        # Remaining hours are 6-7 and 18-23.
        return 'evening'
    return 'etc'
def get_time(study):
    """Find the first plausible meeting time in *study* and report it.

    The cleaned post text is searched with each pattern of ``time_regex`` in
    order.  The captured groups of a match are joined into one string, and
    the match is skipped when that string contains any of ``exclude_words``.
    For the first accepted match the global ``study_count`` is incremented,
    the study id, the matched text and its ``identify_time_zone`` label are
    printed, and the search stops.  Nothing is printed if no match is
    accepted.

    :param study: object with ``id`` and ``content`` attributes.
    :returns: None.
    """
    global study_count
    content = get_clean_content(study)
    for regex in time_regex:
        for groups in re.findall(regex, content, re.I):
            # findall yields one tuple of captured groups per match; join it
            # once, up front, instead of inside the exclude-word loop.
            time = ''.join(groups)
            if any(word in time for word in exclude_words):
                continue
            study_count += 1
            time_zone = identify_time_zone(time)
            print('Study ID : %s' % study.id)
            print('Time : %s' % time)
            print('Meridiem : %s' % time_zone)
            return
def test1():
    """Run ``get_time`` over every study and print the hit rate.

    Resets the global ``study_count``, processes each study returned by
    ``get_study_list()`` and prints how many of them produced a time, both as
    a count and as a percentage of the studies processed.
    """
    global study_count
    study_count = 0
    total = 0
    for study in get_study_list():
        get_time(study)
        total += 1
    # Guard against an empty result set, which used to divide by zero.
    if total:
        percent = (float(study_count) / total) * 100.0
    else:
        percent = 0.0
    print("study_count : %d Percent : %.2f" % (study_count, percent))
def test2(start, end):
    """Run ``get_time`` over the studies in ``[start:end]`` and print the hit rate.

    Resets the global ``study_count``, processes the slice ``start:end`` of
    ``get_study_list()`` and prints how many of those studies produced a
    time, both as a count and as a percentage of the studies processed.

    :param start: index of the first study to process.
    :param end: index one past the last study to process.
    """
    global study_count
    study_count = 0
    total = 0
    for study in get_study_list()[start:end]:
        get_time(study)
        total += 1
    # Divide by the number of studies actually processed.  The former
    # denominator (end - 1) - start was one too small, was zero for a
    # single-item slice, and was wrong whenever the slice ran past the end of
    # the list.
    if total:
        percent = (float(study_count) / total) * 100.0
    else:
        percent = 0.0
    print("study_count : %d Percent : %.2f" % (study_count, percent))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment