Skip to content

Instantly share code, notes, and snippets.

@palday
Last active December 30, 2015 08:49
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save palday/7805267 to your computer and use it in GitHub Desktop.
Save palday/7805267 to your computer and use it in GitHub Desktop.
Potential regex bug in python -- why isn't "o'clock" part of the match for re.findall()?
#! /usr/bin/env python
# -*- coding: utf-8 -*-
# License: CC-BY-NC-SA 3.0
import re
import codecs
# download kate_chopin_the_awakening_and_other_short_stories.txt
# from Project Gutenberg:
# http://www.gutenberg.org/ebooks/160.txt.utf-8
# with wget:
# wget http://www.gutenberg.org/ebooks/160.txt.utf-8 -O kate_chopin_the_awakening_and_other_short_stories.txt
# match for something o'clock, with valid numerical time or
# any English word with proper capitalization
# Pattern for "<hour> o'clock", where <hour> is a number from 1 to 12 or a
# word with at most one (leading) capital letter.
#
# The alternation is a NON-capturing group, (?:...).  When a pattern contains
# a capturing group, re.findall() returns only the captured text (here: the
# hour) instead of the whole match, which silently drops " o'clock" from its
# results.  search()/finditer() users are unaffected: group() is still the
# whole match.
oclock = re.compile(r"""
    (?:
        [A-Z]?[a-z]+    # word with at most one capital letter
      | 1[012]          # 10, 11, 12
      | [1-9]           # 1 through 9
    )
    \s
    o'clock""",
    re.VERBOSE)
path = "kate_chopin_the_awakening_and_other_short_stories.txt"

# --- re.search(): report the first match found on each line ---------------
# Every print below is called with exactly one argument, so the output is the
# same under Python 2's print statement and Python 3's print() function.
print("")
print("re.search()")
print("")
print(u"{:>6} {:>6} {:>6}\t{}".format("Line", "Start", "End", "Match"))
print(u"{:=>6} {:=>6} {:=>6}\t{}".format('', '', '', '====='))
with codecs.open(path, mode='r', encoding='utf-8') as f:
    # NOTE: lineno is the 0-based index produced by enumerate(), not a
    # 1-based editor line number.
    for lineno, line in enumerate(f):
        atime = oclock.search(line)
        if atime:
            # start()/end() are character offsets within this line.
            print(u"{:>6} {:>6} {:>6}\t{}".format(lineno,
                                                  atime.start(),
                                                  atime.end(),
                                                  atime.group()))
# --- re.findall(): report all matches on each line ------------------------
# Every print below is called with exactly one argument, so the output is the
# same under Python 2's print statement and Python 3's print() function.
print("")
print("re.findall()")
print("")
print(u"{:>6} {:>6} {:>6}\t{}".format("Line", "Start", "End", "Match"))
print(u"{:=>6} {:=>6} {:=>6}\t{}".format('', '', '', '====='))
with codecs.open(path, mode='r', encoding='utf-8') as f:
    # NOTE: lineno is the 0-based index produced by enumerate(), not a
    # 1-based editor line number.
    for lineno, line in enumerate(f):
        # findall() yields plain strings (the text of the capture group when
        # the pattern has exactly one), which carry no offsets -- hence the
        # blank Start/End columns.
        times = oclock.findall(line)
        if times:
            print(u"{:>6} {:>6} {:>6}\t{}".format(lineno,
                                                  '',
                                                  '',
                                                  ' '.join(times)))
# --- re.finditer(): report every match on each line, one row per match ----
# Every print below is called with exactly one argument, so the output is the
# same under Python 2's print statement and Python 3's print() function.
print("")
print("re.finditer()")
print("")
print(u"{:>6} {:>6} {:>6}\t{}".format("Line", "Start", "End", "Match"))
print(u"{:=>6} {:=>6} {:=>6}\t{}".format('', '', '', '====='))
with codecs.open(path, mode='r', encoding='utf-8') as f:
    # NOTE: lineno is the 0-based index produced by enumerate(), not a
    # 1-based editor line number.
    for lineno, line in enumerate(f):
        times = oclock.finditer(line)
        for m in times:
            # Each m is a match object, so offsets and the full matched text
            # are available (unlike with findall()).
            print(u"{:>6} {:>6} {:>6}\t{}".format(lineno,
                                                  m.start(),
                                                  m.end(),
                                                  m.group()))
@palday
Copy link
Author

palday commented Dec 6, 2013

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment