# yonglam/regex_notes.py

## regex_notes.py
'''
phone number regex
-------------------'''

def isPhoneNumber(text):
	"""Return True when text has the exact layout DDD-DDD-DDDD.

	text must be 12 characters long, with a hyphen at index 3 and at
	index 7 and a decimal character (per isdecimal()) at every other
	index. Anything else yields False.
	"""
	if len(text) != 12:
		return False  # not phone number-sized
	hyphenSlots = (3, 7)
	# Walk the characters left to right; the first one that breaks the
	# layout settles the answer.
	for position, char in enumerate(text):
		if position in hyphenSlots:
			if char != '-':
				return False
		elif not char.isdecimal():
			return False
	return True

print(isPhoneNumber('415-555-1234'))
message = "Call me at 415-444-1011 or at 903-772-3878"

# Slide a 12-character window over the message and report every window
# that isPhoneNumber() accepts.
foundNumber = False
for i in range(len(message)):
	chunk = message[i:i + 12]
	if not isPhoneNumber(chunk):
		continue
	print('Phone Number Found: ' + chunk)
	foundNumber = True
if not foundNumber:
	print('could not find a phone number')
'''--------------------------------------------------------'''

import re
# re.compile() requires a pattern argument; calling it with none raises
# TypeError, so the phone-number pattern is supplied here.
phoneNumRegex = re.compile(r'\d\d\d-\d\d\d-\d\d\d\d')
message = "Call me at 415-444-1011 or at 903-772-3878"

''' --------------------------------------- '''
'''              Regex Basics               '''
''' --------------------------------------- '''
# Build a regular expression object and store it in phoneNumberRegex.
phoneNumberRegex = re.compile(r'\d\d\d-\d\d\d-\d\d\d\d')

# A regex object has a search() method; it returns a `match object`
# for the first match in the string.
mo = phoneNumberRegex.search(message)
print(mo.group())  # 415-444-1011

# findall() returns a list with every match.
print(phoneNumberRegex.findall(message))

# Groups: each pair of parentheses marks a numbered group.
phonReg = re.compile(r'(\d\d\d)-(\d\d\d)-(\d\d\d\d)')
mo = phonReg.search(message)
print(mo.group())   # the whole match
print(mo.group(1))  # 415
print(mo.group(2))  # 444
print(mo.group(3))  # 1011

# Literal parentheses are matched by escaping them with a backslash.
message = 'my phone number is (813)-255-8812)'
phoneReg = re.compile(r'\(\d\d\d\)-\d\d\d-\d\d\d\d')
mo = phoneReg.search(message)
print(mo.group())  # (813)-255-8812

''' --------------------------------------- '''
'''              Regex Logic                '''
''' --------------------------------------- '''

# | (pipe) matches any one of the alternatives inside the group.
batRegex = re.compile(r'Bat(man|mobile|copter|bat)')
mo = batRegex.search('Batmobile lost a wheel')
print(mo.group())  # Batmobile

# ? - the preceding group matches 0 or 1 times only
batRegex = re.compile(r'Bat(wo)?man')
mo = batRegex.search('The adventures of Batman')
moo = batRegex.search('Batwoman loves batman')
print(mo.group())   # Batman
print(moo.group())  # Batwoman

# Looks for an area code, but still matches when there is none.
# Raw string: '\d' in a plain literal is an invalid escape sequence.
phoneRegex = re.compile(r'(\d\d\d-)?\d\d\d-\d\d\d\d')
mo = phoneRegex.search("My phone number is 555-8821")
print(mo.group())  # 555-8821

# * - 0 or more times
batRegex = re.compile(r'Bat(wo)*man')
mo = batRegex.search('Batwowowowowoman')
print(mo.group())


# + - once or more
batRegex = re.compile(r'Bat(wo)+man')
mo = batRegex.search("Adventures of Batwoman")
print(mo.group())
mo = batRegex.search("Batwowowowoman")
print(mo.group())

''' --------------------------------------- '''
'''              Regex Groups               '''
''' --------------------------------------- '''

# {n} - repeat the preceding group exactly n times
haRegex = re.compile(r'(ha){3}')
mo = haRegex.search("he said 'hahaha'")
print(mo.group())  # hahaha

# Match three phone numbers in a row: each has an optional area code and
# may be followed by an optional comma and an optional space.
PhoneReg = re.compile(r"((\d\d\d-)?\d\d\d-\d\d\d\d(,)?( )?){3}")
mo = PhoneReg.search('phone numbers 888-888-8888, 233-111-2232 113-1212')
print(mo.group())

# {min,max} - repeat between min and max times
hareg = re.compile(r'(ha){3,5}')
mo = hareg.search('hahaha')
print(mo.group())

mo = hareg.search('hahahahaha')
print(mo.group())

hareg = re.compile(r'(ha){,5}') # same as 0-5
hareg = re.compile(r'(ha){3,}') # 3 or more

# Greedy (default): take as many repetitions as allowed.
digitRegex = re.compile(r'(\d){3,5}')
mo = digitRegex.search('1234567890')
print(mo.group()) # returns the maximum number of characters (5)

# Non-greedy: a trailing ? takes as few repetitions as allowed.
digitRegex = re.compile(r'(\d){3,5}?')
mo = digitRegex.search('0123456789')
print(mo.group()) # 012

'''---------------------------------------------'''
'''             Find All                        '''
'''---------------------------------------------'''
# Use the two-number sample here: earlier in this file message is rebound
# to the "(813)-..." example, which this pattern does not match, so
# findall() would return an empty list.
message = "Call me at 415-444-1011 or at 903-772-3878"

# Without groups, findall() returns a list of the matched strings.
phoneReg = re.compile(r'\d\d\d-\d\d\d-\d\d\d\d')
grou = phoneReg.findall(message)  # ['415-444-1011', '903-772-3878']

# With groups, findall() returns a list of tuples, one item per group.
phoneReg = re.compile(r'(\d\d\d-)(\d\d\d-\d\d\d\d)')

# phoneReg.findall(message) -> [('415-', '444-1011'), ('903-', '772-3878')]

'''---------------------------------------------'''
'''             Character Classes               '''
'''---------------------------------------------'''
# \d - matches any character that is a numeric digit
# \D - matches any character that is NOT a numeric digit from 0 to 9
# \w - matches any letter, numeric digit, or the underscore character
# \W - matches any character that is not a letter, number or _
# \s - matches any space, tab, or newline character
# \S - matches any character that is not a space, tab, or newline


christmas = ''' 12 lords leaping
11 ladies dancing
10 pipers piping
9 drummers drumming
8 maids milking
7 swans swimming
6 geese laying
5 gold rings
4 colly birds
3 french hens
2 turtle doves and
1 partridge in a pear tree '''

# One or more digits, then one whitespace character, then one or more
# word characters: picks out "<count> <first word>" from each line.
xmas = re.compile(r'\d+\s\w+')
print(xmas.findall(christmas))

'''---------------------------------------------'''
'''             Custom Character Classes        '''
'''---------------------------------------------'''

# Square brackets define a custom character class; each assignment below
# replaces the previous one, so only the last pattern is used by findall().
regexObj = re.compile(r'[aeiou]') # finds lowercase vowels
regexObj = re.compile(r'[a-z]') # finds all lowercase letters from a-z
regexObj = re.compile(r'[a-fA-F]') # finds a-f, lowercase and capital
regexObj = re.compile(r'[aeiouAEIOU]') # finds lowercase and capital vowels
print(regexObj.findall('robocop eats baby food'))
# ['o', 'o', 'o', 'e', 'a', 'a', 'o', 'o']

regexObj = re.compile('[aeiouAEIOU]{2}') # match 2 vowels in a row
print(regexObj.findall('robocop eats baby food'))
# ['ea', 'oo']


'''---------------------------------------------'''
'''             Negative Character Classes      '''
'''---------------------------------------------'''

# A leading ^ inside the brackets negates the class: match any character
# that is NOT a vowel (spaces included).
contsReg = re.compile(r'[^aeiouAEIOU]')
print(contsReg.findall('robo cop eats babyfood'))
#['r', 'b', ' ', 'c', 'p', ' ', 't', 's', ' ', 'b', 'b', 'y', 'f', 'd']

import re

''' ----------------------------- '''
'''      Regex .* ^ $             '''
''' ----------------------------- '''
# ^ - matches strings that begin with
# $ - matches strings that end with
# . - matches any char except new line
# * - matches zero or more
# .* - matches any character, any number of times, except newline
#      (default is greedy)
# .*? - non greedy dotstar expression
# re.compile(r'.*', re.DOTALL) - matches ALL characters
# re.compile(r'[aeiou]', re.IGNORECASE) - ignores case
# re.compile(r'[aeiou]', re.I) - same as above


beginHelloRegex = re.compile(r'^Hello') # match a string beginning with Hello
mo = beginHelloRegex.search('Hello how are you')
print(mo.group())

endHelloRegex = re.compile(r'world!$') # match a string that ends with `world!`
mo = endHelloRegex.search('hello world!')
print(mo.group())


allDigitsRegex = re.compile(r'^\d+$') # the whole string must be digits
mo = allDigitsRegex.search('651652166262')
print(mo.group())

# . stands for any single character, so this finds every "?at" triple.
atRegex = re.compile(r'.at')
print(atRegex.findall("The cat in the hat sat on the flat mat"))

# (.*) captures everything between the fixed labels.
string = "First Name: John Last Name: Smith"
nameRegex = re.compile(r'First Name: (.*) Last Name: (.*)')
print(nameRegex.findall(string))

# Greedy .* runs to the LAST closing bracket in the string.
serve = "<To serve humans> for dinner.>"
greedy = re.compile(r'<(.*)>')
mo = greedy.search(serve)
print(mo.group())  # <To serve humans> for dinner.>

# Non-greedy .*? stops at the FIRST closing bracket.
nongreedRegex = re.compile('<(.*?)>')
mo = nongreedRegex.search(serve)
print(mo.group())  # <To serve humans>

# .* matches up to the first \n character
string = 'Serve the public trust. \n Protect the innocent \n Upload the law'
newLineRegex = re.compile('.*')
mo = newLineRegex.search(string)
print(mo.group())

# With re.DOTALL the dot matches all characters, even newlines.
string = 'Serve the public trust. \n Protect the innocent \n Upload the law'
allCharRegex = re.compile('.*', re.DOTALL)
mo = allCharRegex.search(string)
print(mo.group())

# Case-sensitive by default: only lowercase vowels are found.
string = "AbcdEfghIjklmnOpqrstUvwxyz"
caseSenReg = re.compile(r'[aeiou]')
print(caseSenReg.findall(string))

# re.IGNORECASE makes the same class match capital vowels too.
caseInSenReg = re.compile(r'[aeiou]', re.IGNORECASE)
print(caseInSenReg.findall(string))
import re

''' -------------------------------- '''
'''            re.sub                '''
''' -------------------------------- '''

# 'Agent ' followed by one or more word characters (the agent's name).
namesRegex = re.compile(r'Agent \w+')
print(namesRegex.findall("Agent Alice gave the secret documents to Agent Bob."))


# sub() replaces every match with the given text.
print(namesRegex.sub('REDACTED', "Agent Alice gave the secret documents to Agent Bob."))


# The group captures only the first character of the name.
namesRegex = re.compile(r'Agent (\w)\w*')
print(namesRegex.findall("Agent Alice gave the secret documents to Agent Bob."))


# \1 in the replacement stands for the text captured by group 1.
print(namesRegex.sub(r'AGENT \1****', "Agent Alice gave the secret documents to Agent Bob."))

''' -------------------------------- '''
'''            re.verbose            '''
''' -------------------------------- '''
# re.VERBOSE allows a pattern to span several lines: whitespace inside the
# pattern is ignored and # starts a comment inside the expression.
re.compile(r'''
        \d\d\d- #area code
        \d\d\d-
        \d\d\d\d''', re.VERBOSE)

''' -------------------------------- '''
'''          Bitwise Comparison      '''
''' -------------------------------- '''

# Several flags are combined with the bitwise OR operator (|).
# Raw string: '\d' in a plain literal is an invalid escape sequence.
re.compile(r'\d\d\d', re.IGNORECASE | re.DOTALL | re.VERBOSE)
	# Hand-written phone-number check (no regular expressions).
'''
phone number regex
-------------------'''

def isPhoneNumber(text):
	"""Return True when text has the exact layout DDD-DDD-DDDD.

	text must be 12 characters long, with a hyphen at index 3 and at
	index 7 and a decimal character (per isdecimal()) at every other
	index. Anything else yields False.
	"""
	if len(text) != 12:
		return False #not phone number-sized
	for i in range(0, 3):
		if not text[i].isdecimal():
			return False
	if text[3] != '-':
		return False
	for i in range(4, 7):
		if not text[i].isdecimal():
			return False
	if text[7] != '-':
		return False
	for i in range(8, 12):
		if not text[i].isdecimal():
			return False
	return True

	# Slide a 12-character window over the message and report every window
	# that isPhoneNumber() accepts.
print(isPhoneNumber('415-555-1234'))
message = "Call me at 415-444-1011 or at 903-772-3878"

foundNumber = False
for i in range(len(message)):
	chunk = message[i:i+12]
	if isPhoneNumber(chunk):
		print('Phone Number Found: ' + chunk)
		foundNumber = True
if not foundNumber:
	print('could not find a phone number')
'''--------------------------------------------------------'''

	# Regex basics: compile, search, findall, groups, escaped parentheses.
import re
# re.compile() requires a pattern argument; calling it with none raises
# TypeError, so the phone-number pattern is supplied here.
phoneNumRegex = re.compile(r'\d\d\d-\d\d\d-\d\d\d\d')
message = "Call me at 415-444-1011 or at 903-772-3878"

''' --------------------------------------- '''
''' Regex Basics '''
''' --------------------------------------- '''
# Build a regular expression object and store it in phoneNumberRegex.
phoneNumberRegex = re.compile(r'\d\d\d-\d\d\d-\d\d\d\d')

# A regex object has a search() method; it returns a `match object`
# for the first match in the string.
mo = phoneNumberRegex.search(message)
print(mo.group())  # 415-444-1011

# findall() returns a list with every match.
print(phoneNumberRegex.findall(message))

# Groups: each pair of parentheses marks a numbered group.
phonReg = re.compile(r'(\d\d\d)-(\d\d\d)-(\d\d\d\d)')
mo = phonReg.search(message)
print(mo.group())   # the whole match
print(mo.group(1))  # 415
print(mo.group(2))  # 444
print(mo.group(3))  # 1011

# Literal parentheses are matched by escaping them with a backslash.
message = 'my phone number is (813)-255-8812)'
phoneReg = re.compile(r'\(\d\d\d\)-\d\d\d-\d\d\d\d')
mo = phoneReg.search(message)
print(mo.group())  # (813)-255-8812

	# Regex logic: alternation and the ?, * and + quantifiers.
''' --------------------------------------- '''
''' Regex Logic '''
''' --------------------------------------- '''

# | (pipe) matches any one of the alternatives inside the group. It must
# not be backslash-escaped: \| would match a literal pipe character.
batRegex = re.compile(r'Bat(man|mobile|copter|bat)')
mo = batRegex.search('Batmobile lost a wheel')
print(mo.group())  # Batmobile

# ? - the preceding group matches 0 or 1 times only
batRegex = re.compile(r'Bat(wo)?man')
mo = batRegex.search('The adventures of Batman')
moo = batRegex.search('Batwoman loves batman')
print(mo.group())   # Batman
print(moo.group())  # Batwoman

# Looks for an area code, but still matches when there is none.
# Raw string: '\d' in a plain literal is an invalid escape sequence.
phoneRegex = re.compile(r'(\d\d\d-)?\d\d\d-\d\d\d\d')
mo = phoneRegex.search("My phone number is 555-8821")
print(mo.group())  # 555-8821

# * - 0 or more times
batRegex = re.compile(r'Bat(wo)*man')
mo = batRegex.search('Batwowowowowoman')
print(mo.group())


# + - once or more
batRegex = re.compile(r'Bat(wo)+man')
mo = batRegex.search("Adventures of Batwoman")
print(mo.group())
mo = batRegex.search("Batwowowowoman")
print(mo.group())

	# Regex groups: fixed and ranged repetition, greedy vs non-greedy.
''' --------------------------------------- '''
''' Regex Groups '''
''' --------------------------------------- '''

# {n} - repeat the preceding group exactly n times
haRegex = re.compile(r'(ha){3}')
mo = haRegex.search("he said 'hahaha'")
print(mo.group())  # hahaha

# Match three phone numbers in a row: each has an optional area code and
# may be followed by an optional comma and an optional space.
PhoneReg = re.compile(r"((\d\d\d-)?\d\d\d-\d\d\d\d(,)?( )?){3}")
mo = PhoneReg.search('phone numbers 888-888-8888, 233-111-2232 113-1212')
print(mo.group())

# {min,max} - repeat between min and max times
hareg = re.compile(r'(ha){3,5}')
mo = hareg.search('hahaha')
print(mo.group())

mo = hareg.search('hahahahaha')
print(mo.group())

hareg = re.compile(r'(ha){,5}') # same as 0-5
hareg = re.compile(r'(ha){3,}') # 3 or more

# Greedy (default): take as many repetitions as allowed.
digitRegex = re.compile(r'(\d){3,5}')
mo = digitRegex.search('1234567890')
print(mo.group()) # returns the maximum number of characters (5)

# Non-greedy: a trailing ? takes as few repetitions as allowed.
digitRegex = re.compile(r'(\d){3,5}?')
mo = digitRegex.search('0123456789')
print(mo.group()) # 012

	# findall() with and without groups.
'''---------------------------------------------'''
''' Find All '''
'''---------------------------------------------'''
# Use the two-number sample here: earlier in this file message is rebound
# to the "(813)-..." example, which this pattern does not match, so
# findall() would return an empty list.
message = "Call me at 415-444-1011 or at 903-772-3878"

# Without groups, findall() returns a list of the matched strings.
phoneReg = re.compile(r'\d\d\d-\d\d\d-\d\d\d\d')
grou = phoneReg.findall(message)  # ['415-444-1011', '903-772-3878']

# With groups, findall() returns a list of tuples, one item per group.
phoneReg = re.compile(r'(\d\d\d-)(\d\d\d-\d\d\d\d)')

# phoneReg.findall(message) -> [('415-', '444-1011'), ('903-', '772-3878')]

	# Shorthand character classes.
'''---------------------------------------------'''
''' Character Classes '''
'''---------------------------------------------'''
# \d - matches any character that is a numeric digit
# \D - matches any character that is NOT a numeric digit from 0 to 9
# \w - matches any letter, numeric digit, or the underscore character
# \W - matches any character that is not a letter, number or _
# \s - matches any space, tab, or newline character
# \S - matches any character that is not a space, tab, or newline


christmas = ''' 12 lords leaping
11 ladies dancing
10 pipers piping
9 drummers drumming
8 maids milking
7 swans swimming
6 geese laying
5 gold rings
4 colly birds
3 french hens
2 turtle doves and
1 partridge in a pear tree '''

# One or more digits, then one whitespace character, then one or more
# word characters: picks out "<count> <first word>" from each line.
xmas = re.compile(r'\d+\s\w+')
print(xmas.findall(christmas))

	# Custom character classes built with square brackets.
'''---------------------------------------------'''
''' Custom Character Classes '''
'''---------------------------------------------'''

# Each assignment below replaces the previous one, so only the last
# pattern is used by findall().
regexObj = re.compile(r'[aeiou]') # finds lowercase vowels
regexObj = re.compile(r'[a-z]') # finds all lowercase letters from a-z
regexObj = re.compile(r'[a-fA-F]') # finds a-f, lowercase and capital
regexObj = re.compile(r'[aeiouAEIOU]') # finds lowercase and capital vowels
print(regexObj.findall('robocop eats baby food'))
# ['o', 'o', 'o', 'e', 'a', 'a', 'o', 'o']

regexObj = re.compile('[aeiouAEIOU]{2}') # match 2 vowels in a row
print(regexObj.findall('robocop eats baby food'))
# ['ea', 'oo']


	# Negated character classes.
'''---------------------------------------------'''
''' Negative Character Classes '''
'''---------------------------------------------'''

# A leading ^ inside the brackets negates the class: match any character
# that is NOT a vowel (spaces included).
contsReg = re.compile(r'[^aeiouAEIOU]')
print(contsReg.findall('robo cop eats babyfood'))
#['r', 'b', ' ', 'c', 'p', ' ', 't', 's', ' ', 'b', 'b', 'y', 'f', 'd']

	# Anchors (^ and $), the dot, and dot-star.
import re

''' ----------------------------- '''
''' Regex .* ^ $ '''
''' ----------------------------- '''
# ^ - matches strings that begin with
# $ - matches strings that end with
# . - matches any char except new line
# * - matches zero or more
# .* - matches any character, any number of times, except newline
#      (default is greedy)
# .*? - non greedy dotstar expression
# re.compile(r'.*', re.DOTALL) - matches ALL characters
# re.compile(r'[aeiou]', re.IGNORECASE) - ignores case
# re.compile(r'[aeiou]', re.I) - same as above


beginHelloRegex = re.compile(r'^Hello') # match a string beginning with Hello
mo = beginHelloRegex.search('Hello how are you')
print(mo.group())

endHelloRegex = re.compile(r'world!$') # match a string that ends with `world!`
mo = endHelloRegex.search('hello world!')
print(mo.group())


allDigitsRegex = re.compile(r'^\d+$') # the whole string must be digits
mo = allDigitsRegex.search('651652166262')
print(mo.group())

# . stands for any single character, so this finds every "?at" triple.
atRegex = re.compile(r'.at')
print(atRegex.findall("The cat in the hat sat on the flat mat"))

# (.*) captures everything between the fixed labels; a bare (.) would
# capture a single character only and never match these names.
string = "First Name: John Last Name: Smith"
nameRegex = re.compile(r'First Name: (.*) Last Name: (.*)')
print(nameRegex.findall(string))

	# Greedy vs non-greedy matching, re.DOTALL and re.IGNORECASE.
# Greedy .* runs to the LAST closing bracket in the string.
serve = "<To serve humans> for dinner.>"
greedy = re.compile(r'<(.*)>')
mo = greedy.search(serve)
print(mo.group())  # <To serve humans> for dinner.>

# Non-greedy .*? stops at the FIRST closing bracket.
nongreedRegex = re.compile('<(.*?)>')
mo = nongreedRegex.search(serve)
print(mo.group())  # <To serve humans>

# .* matches up to the first \n character
string = 'Serve the public trust. \n Protect the innocent \n Upload the law'
newLineRegex = re.compile('.*')
mo = newLineRegex.search(string)
print(mo.group())

# With re.DOTALL the dot matches all characters, even newlines.
string = 'Serve the public trust. \n Protect the innocent \n Upload the law'
allCharRegex = re.compile('.*', re.DOTALL)
mo = allCharRegex.search(string)
print(mo.group())

# Case-sensitive by default: only lowercase vowels are found.
string = "AbcdEfghIjklmnOpqrstUvwxyz"
caseSenReg = re.compile(r'[aeiou]')
print(caseSenReg.findall(string))

# re.IGNORECASE makes the same class match capital vowels too.
caseInSenReg = re.compile(r'[aeiou]', re.IGNORECASE)
print(caseInSenReg.findall(string))
	# Substitution with sub() and group back-references.
import re

''' -------------------------------- '''
''' re.sub '''
''' -------------------------------- '''

# 'Agent ' followed by one or more word characters (the agent's name).
namesRegex = re.compile(r'Agent \w+')
print(namesRegex.findall("Agent Alice gave the secret documents to Agent Bob."))


# sub() replaces every match with the given text.
print(namesRegex.sub('REDACTED', "Agent Alice gave the secret documents to Agent Bob."))


# The group captures only the first character of the name.
namesRegex = re.compile(r'Agent (\w)\w*')
print(namesRegex.findall("Agent Alice gave the secret documents to Agent Bob."))


# \1 in the replacement stands for the text captured by group 1.
print(namesRegex.sub(r'AGENT \1****', "Agent Alice gave the secret documents to Agent Bob."))

	# Verbose patterns and combining several flags.
''' -------------------------------- '''
''' re.verbose '''
''' -------------------------------- '''
# re.VERBOSE allows a pattern to span several lines: whitespace inside the
# pattern is ignored and # starts a comment inside the expression.
re.compile(r'''
        \d\d\d- #area code
        \d\d\d-
        \d\d\d\d''', re.VERBOSE)

''' -------------------------------- '''
''' Bitwise Comparison '''
''' -------------------------------- '''

# Several flags are combined with the bitwise OR operator (|); a
# backslash before the pipe is a syntax error outside a string.
# Raw string: '\d' in a plain literal is an invalid escape sequence.
re.compile(r'\d\d\d', re.IGNORECASE | re.DOTALL | re.VERBOSE)