Last active
December 30, 2015 08:49
-
-
Save palday/7805267 to your computer and use it in GitHub Desktop.
Potential regex bug in python -- why isn't "o'clock" part of the match for re.findall()?
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env python | |
# -*- coding: utf-8 -*- | |
# License: CC-BY-NC-SA 3.0 | |
import re | |
import codecs | |
# download kate_chopin_the_awakening_and_other_short_stories.txt | |
# from Project Gutenberg: | |
# http://www.gutenberg.org/ebooks/160.txt.utf-8 | |
# with wget: | |
# wget http://www.gutenberg.org/ebooks/160.txt.utf-8 -O kate_chopin_the_awakening_and_other_short_stories.txt | |
# match for something o'clock, with valid numerical time or | |
# any English word with proper capitalization | |
oclock = re.compile(r""" | |
( | |
[A-Z]?[a-z]+ # word mit max. 1 capital letter | |
| 1[012] # 10,11,12 | |
| [1-9] # 1,2,3,5,6,7,8,9 | |
) | |
\s | |
o'clock""", | |
re.VERBOSE) | |
path = "kate_chopin_the_awakening_and_other_short_stories.txt" | |
print "re.search()" | |
print u"{:>6} {:>6} {:>6}\t{}".format("Line","Start","End","Match") | |
print u"{:=>6} {:=>6} {:=>6}\t{}".format('','','','=====') | |
with codecs.open(path,mode='r',encoding='utf-8') as f: | |
for lineno, line in enumerate(f): | |
atime = oclock.search(line) | |
if atime: | |
print u"{:>6} {:>6} {:>6}\t{}".format(lineno, | |
atime.start(), | |
atime.end(), | |
atime.group()) | |
print "re.findall()" | |
print u"{:>6} {:>6} {:>6}\t{}".format("Line","Start","End","Match") | |
print u"{:=>6} {:=>6} {:=>6}\t{}".format('','','','=====') | |
with codecs.open(path,mode='r',encoding='utf-8') as f: | |
for lineno, line in enumerate(f): | |
times = oclock.findall(line) | |
if times: | |
print u"{:>6} {:>6} {:>6}\t{}".format(lineno, | |
'', | |
'', | |
' '.join(times)) | |
print "re.finditer()" | |
print u"{:>6} {:>6} {:>6}\t{}".format("Line","Start","End","Match") | |
print u"{:=>6} {:=>6} {:=>6}\t{}".format('','','','=====') | |
with codecs.open(path,mode='r',encoding='utf-8') as f: | |
for lineno, line in enumerate(f): | |
times = oclock.finditer(line) | |
for m in times: | |
print u"{:>6} {:>6} {:>6}\t{}".format(lineno, | |
m.start(), | |
m.end(), | |
m.group()) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Check out the explanation on StackOverflow.