palday/regex.py

## regex.py
#! /usr/bin/env python
# -*- coding: utf-8 -*-

# License: CC-BY-NC-SA 3.0

import re
import codecs

# download kate_chopin_the_awakening_and_other_short_stories.txt
# from Project Gutenberg:
# http://www.gutenberg.org/ebooks/160.txt.utf-8
# with wget:
# wget http://www.gutenberg.org/ebooks/160.txt.utf-8 -O kate_chopin_the_awakening_and_other_short_stories.txt


# Match "<something> o'clock", where <something> is either an hour written
# as a number (1-12) or an English word with at most one (leading) capital
# letter, e.g. "five o'clock" or "Twelve o'clock".
#
# The leading \b anchors the match at the start of a word or number.
# Without it, the tail of an invalid hour such as "13 o'clock" would still
# be reported as "3 o'clock" (and "McDonald o'clock" as "onald o'clock").
#
# Group 1 captures only the hour word/number, without " o'clock".
oclock = re.compile(r"""
                    \b                 # start of a word or number
                    (
                          [A-Z]?[a-z]+ # word with at most one (leading) capital letter
                        | 1[012]       # 10, 11, 12
                        | [1-9]        # 1 through 9
                    )
                    \s
                    o'clock""",
                    re.VERBOSE)

# Input text; a relative path, so it is resolved against the current working
# directory (see the download instructions at the top of the file).
path = "kate_chopin_the_awakening_and_other_short_stories.txt"

# --- re.search(): report only the first match on each line ---------------
# "Line" is the zero-based index produced by enumerate(); "Start" and "End"
# are the match's start()/end() offsets within that line.
#
# print(...) with a single parenthesised argument produces the same output
# under Python 2's print statement and Python 3's print function, so the
# script no longer fails with a SyntaxError on Python 3.
row = u"{:>6} {:>6} {:>6}\t{}"

print("")
print("re.search()")
print("")
print(row.format("Line", "Start", "End", "Match"))
print(u"{:=>6} {:=>6} {:=>6}\t{}".format('', '', '', '====='))

with codecs.open(path, mode='r', encoding='utf-8') as f:
    for lineno, line in enumerate(f):
        atime = oclock.search(line)
        if atime:
            print(row.format(lineno, atime.start(), atime.end(),
                             atime.group()))


# --- re.findall(): report every match on each line -----------------------
# The pattern has exactly one capturing group, so findall() returns the text
# of that group (the hour word/number) rather than the whole match, and it
# carries no position information -- hence the empty "Start"/"End" columns.
# "Line" is the zero-based index produced by enumerate().
#
# print(...) with a single parenthesised argument produces the same output
# under Python 2's print statement and Python 3's print function.
row = u"{:>6} {:>6} {:>6}\t{}"

print("")
print("re.findall()")
print("")
print(row.format("Line", "Start", "End", "Match"))
print(u"{:=>6} {:=>6} {:=>6}\t{}".format('', '', '', '====='))

with codecs.open(path, mode='r', encoding='utf-8') as f:
    for lineno, line in enumerate(f):
        times = oclock.findall(line)
        if times:
            print(row.format(lineno, '', '', ' '.join(times)))


# --- re.finditer(): report every match on each line, with positions ------
# finditer() yields match objects, so each row can show the full matched
# text together with its start()/end() offsets within the line.
# "Line" is the zero-based index produced by enumerate().
#
# print(...) with a single parenthesised argument produces the same output
# under Python 2's print statement and Python 3's print function.
row = u"{:>6} {:>6} {:>6}\t{}"

print("")
print("re.finditer()")
print("")
print(row.format("Line", "Start", "End", "Match"))
print(u"{:=>6} {:=>6} {:=>6}\t{}".format('', '', '', '====='))

with codecs.open(path, mode='r', encoding='utf-8') as f:
    for lineno, line in enumerate(f):
        for m in oclock.finditer(line):
            print(row.format(lineno, m.start(), m.end(), m.group()))
	#! /usr/bin/env python
	# -*- coding: utf-8 -*-

	# License: CC-BY-NC-SA 3.0

	import re
	import codecs

	# download kate_chopin_the_awakening_and_other_short_stories.txt
	# from Project Gutenberg:
	# http://www.gutenberg.org/ebooks/160.txt.utf-8
	# with wget:
	# wget http://www.gutenberg.org/ebooks/160.txt.utf-8 -O kate_chopin_the_awakening_and_other_short_stories.txt


	# Match "<something> o'clock", where <something> is either an hour written
	# as a number (1-12) or an English word with at most one (leading) capital
	# letter, e.g. "five o'clock" or "Twelve o'clock".
	#
	# The alternation uses a plain "|"; an escaped "\|" would match a literal
	# pipe character instead of separating the alternatives.
	#
	# The leading \b anchors the match at the start of a word or number.
	# Without it, the tail of an invalid hour such as "13 o'clock" would still
	# be reported as "3 o'clock".
	#
	# Group 1 captures only the hour word/number, without " o'clock".
	oclock = re.compile(r"""
	                    \b                 # start of a word or number
	                    (
	                          [A-Z]?[a-z]+ # word with at most one (leading) capital letter
	                        | 1[012]       # 10, 11, 12
	                        | [1-9]        # 1 through 9
	                    )
	                    \s
	                    o'clock""",
	                    re.VERBOSE)

	# Input text; a relative path, so it is resolved against the current
	# working directory.
	path = "kate_chopin_the_awakening_and_other_short_stories.txt"

	# --- re.search(): report only the first match on each line -----------
	# "Line" is the zero-based index produced by enumerate(); "Start" and
	# "End" are the match's start()/end() offsets within that line.
	#
	# The with/for/if bodies are indented to restore the nesting; without it
	# the code is an IndentationError. print(...) with a single parenthesised
	# argument produces the same output under Python 2 and Python 3.
	row = u"{:>6} {:>6} {:>6}\t{}"

	print("")
	print("re.search()")
	print("")
	print(row.format("Line", "Start", "End", "Match"))
	print(u"{:=>6} {:=>6} {:=>6}\t{}".format('', '', '', '====='))

	with codecs.open(path, mode='r', encoding='utf-8') as f:
	    for lineno, line in enumerate(f):
	        atime = oclock.search(line)
	        if atime:
	            print(row.format(lineno, atime.start(), atime.end(),
	                             atime.group()))


	# --- re.findall(): report every match on each line --------------------
	# findall() returns plain strings without position information, hence
	# the empty "Start"/"End" columns. "Line" is the zero-based index
	# produced by enumerate().
	#
	# The with/for/if bodies are indented to restore the nesting; without it
	# the code is an IndentationError. print(...) with a single parenthesised
	# argument produces the same output under Python 2 and Python 3.
	row = u"{:>6} {:>6} {:>6}\t{}"

	print("")
	print("re.findall()")
	print("")
	print(row.format("Line", "Start", "End", "Match"))
	print(u"{:=>6} {:=>6} {:=>6}\t{}".format('', '', '', '====='))

	with codecs.open(path, mode='r', encoding='utf-8') as f:
	    for lineno, line in enumerate(f):
	        times = oclock.findall(line)
	        if times:
	            print(row.format(lineno, '', '', ' '.join(times)))


	# --- re.finditer(): report every match on each line, with positions ---
	# finditer() yields match objects, so each row can show the full matched
	# text together with its start()/end() offsets within the line.
	# "Line" is the zero-based index produced by enumerate().
	#
	# The with/for bodies are indented to restore the nesting; without it
	# the code is an IndentationError. print(...) with a single parenthesised
	# argument produces the same output under Python 2 and Python 3.
	row = u"{:>6} {:>6} {:>6}\t{}"

	print("")
	print("re.finditer()")
	print("")
	print(row.format("Line", "Start", "End", "Match"))
	print(u"{:=>6} {:=>6} {:=>6}\t{}".format('', '', '', '====='))

	with codecs.open(path, mode='r', encoding='utf-8') as f:
	    for lineno, line in enumerate(f):
	        for m in oclock.finditer(line):
	            print(row.format(lineno, m.start(), m.end(), m.group()))