# exbotanical/py_regex_notes.py

## py_regex_notes.py
# # #   REGEX   # # #

# The ? matches zero or one of the preceding group.
# The * matches zero or more of the preceding group.
# The + matches one or more of the preceding group.
# The {n} matches exactly n of the preceding group.
# The {n,} matches n or more of the preceding group.
# The {,m} matches 0 to m of the preceding group.
# The {n,m} matches at least n and at most m of the preceding group.
# {n,m}? or *? or +? performs a non-greedy match of the preceding group.
# ^spam means the string must begin with spam.
# spam$ means the string must end with spam.
# The . matches any character, except newline characters.
# \d, \w, and \s match a digit, word, or space character, respectively.
# \D, \W, and \S match anything except a digit, word, or space character, respectively.
# [abc] matches any character between the brackets (such as a, b, or c).
# [^abc] matches any character that isn’t between the brackets.


# def is_phone_number(txt):
#   if len(txt) != 12:
#     return False
#   for i in range(0,3):
#     if not txt[i].isdecimal():
#       return False
#     if txt[3] != '-':
#       return False
#   for i in range(4,7):
#     if not txt[i].isdecimal():
#       return False
#     if txt[7] != '-':
#       return False
#   for i in range(8,12):
#     if not txt[i].isdecimal():
#       return False
#   return True

# message = 'Call me at 415-555-1011 tomorrow. 415-555-9999 is my office.'
# for i in range(len(message)):
#   chunk = message[i:i+12]
#   if is_phone_number(chunk):
#     print('Phone number found: ' + chunk)
# print('Done')

# import re

# num_regex = re.compile(r'(\d{3})-(\d{3}-\d{4})')

# match = num_regex.search('Call me at 415-555-1011 tomorrow.')

# area_code = match.group(1)
# number = match.group(2)

# print(f'Phone number found: ({area_code}) {number}')

# ha_regex = re.compile(r'(((Ha){4})+)')

# match_ha = ha_regex.findall('I laughed like HaHaHa HaHaHa HaHa HAHaHaHaHaha HaHahaHaHaHA and else-like.')

# print(match_ha)

# phoneNumRegex = re.compile(r'(\d{3})-(\d{3})-(\d{4})')

# print(phoneNumRegex.findall('Cell: 415-555-9999 Work: 212-555-0000'))

# # create own char class

# custom_char_class_regex = re.compile(r'[RNVE]\w+')

# b = custom_char_class_regex.findall('ReactJS, VueJS, NodeJS, ExpressJS, C++, Java')

# print(b) # oh, it returns only the keywords that match my skillset !

# begins_with_hello = re.compile(r'^Hello')
# print(begins_with_hello.search('Hello, world!'))

# print(begins_with_hello.search('I said hello.') == None)

# ends_with_number = re.compile(r'\d$')
# print(ends_with_number.search('Your number is 42'))

# # I always confuse the meanings of these two symbols, so I use the mnemonic “Carrots cost dollars” to remind myself that the caret comes first and the dollar sign comes last.

# wild_card = re.compile(r'.@gmail.com')
# print(wild_card.search('call that number or email hello@gmail.com'))

# atRegex = re.compile(r'.at')
# print(atRegex.findall('The cat in the hat sat on the flat mat.'))


import re

# One or more word characters, an '@', then one or more word characters.
# \w does not match '.', so every hit stops before the ".com" suffix.
email_regex = re.compile(r'\w+@\w+')

# Collect the text of every non-overlapping match, left to right.
m = [
    hit.group()
    for hit in email_regex.finditer(
        'my email is abc@gmail.com and his is notarealemail@gmail.com '
        'and yours is email@email.com'
    )
]

print(m)

# Greedy: {3,5} takes as many repetitions of the group as it can (here, five).
greedy_regex = re.compile(r'(Ha){3,5}')
mo1 = greedy_regex.search('HaHaHaHaHa')
print(mo1[0])

# Non-greedy: the trailing ? makes {3,5} settle for the fewest repetitions
# that still match (here, three).
non_greedy_regex = re.compile(r'(Ha){3,5}?')
mo2 = non_greedy_regex.search('HaHaHaHaHa')
print(mo2[0])

# Optional parentheses around the area code, then ddd-dddd separated by hyphens.
phone_regex = re.compile(r'\(?\d{3}\)?-\d{3}-\d{4}')

# The pattern has no capture groups, so each match's full text is collected.
l = [
    found.group()
    for found in phone_regex.finditer(
        'The first phone number is (713)-214-5039 and the second is '
        '281-889-2034. The suite number is L-303'
    )
]

print(l)

# With the DOTALL flag (alias re.S) the dot also matches newline characters,
# so '.*' runs across all three lines instead of stopping at the first '\n'.
newline_regex = re.compile('.*', flags=re.S)

o = newline_regex.search(
    'Serve the public trust.\n'
    'Protect the innocent.\n'
    'Uphold the law.'
)[0]

print(o)

# IGNORE CASE
# re.IGNORECASE (alias re.I) makes the lowercase pattern match any casing.

robocop = re.compile(r'robocop', re.IGNORECASE)
print(robocop.search('RoboCop is part man, part machine.')[0])

# sub() swaps every match for the replacement text.
print(robocop.sub(repl='An android', string='RoboCop is part man, part machine.'))

# \1 in the replacement refers to group 1: the first letter of the agent's name.
agent_names_regex = re.compile(r'Agent (\w)\w*')
print(
    agent_names_regex.sub(
        r'\1****',
        'Agent Alice told Agent Carol that Agent Eve knew Agent Bob was a double agent.',
    )
)

# VERBOSE MODE EXAMPLE
# With re.VERBOSE, whitespace inside the pattern is ignored and everything
# after a '#' on a line is a comment, so the pattern can be laid out and
# annotated piece by piece.

verbose_regex = re.compile(r'''(
    (\d{3}|\(\d{3}\))?              # area code, bare or in parentheses
    (\s|-|\.)?                      # separator
    \d{3}                           # first 3 digits
    (\s|-|\.)                       # separator
    \d{4}                           # last 4 digits
    (\s*(ext|x|ext\.)\s*\d{2,5})?   # extension; the dot is escaped so it is literal
    )''', re.VERBOSE)

# Several flags are passed to compile() as one argument by combining them
# with the bitwise OR operator (|). re.I, re.S and re.X are the short aliases
# of re.IGNORECASE, re.DOTALL and re.VERBOSE.

multiple_arg_regex = re.compile('foo', flags=re.I | re.S | re.X)
	# # # REGEX # # #

	# The ? matches zero or one of the preceding group.
	# The * matches zero or more of the preceding group.
	# The + matches one or more of the preceding group.
	# The {n} matches exactly n of the preceding group.
	# The {n,} matches n or more of the preceding group.
	# The {,m} matches 0 to m of the preceding group.
	# The {n,m} matches at least n and at most m of the preceding group.
	# {n,m}? or *? or +? performs a non-greedy match of the preceding group.
	# ^spam means the string must begin with spam.
	# spam$ means the string must end with spam.
	# The . matches any character, except newline characters.
	# \d, \w, and \s match a digit, word, or space character, respectively.
	# \D, \W, and \S match anything except a digit, word, or space character, respectively.
	# [abc] matches any character between the brackets (such as a, b, or c).
	# [^abc] matches any character that isn’t between the brackets.



	# def is_phone_number(txt):
	# if len(txt) != 12:
	# return False
	# for i in range(0,3):
	# if not txt[i].isdecimal:
	# return False
	# if txt[3] != '-':
	# return False
	# for i in range(4,7):
	# if not txt[i].isdecimal:
	# return False
	# if txt[7] != '-':
	# return False
	# for i in range(8,12):
	# if not txt[i].isdecimal:
	# return False
	# return True

	# message = 'Call me at 415-555-1011 tomorrow. 415-555-9999 is my office.'
	# for i in range(len(message)):
	# chunk = message[i:i+12]
	# if is_phone_number(chunk):
	# print('Phone number found: ' + chunk)
	# print('Done')

	# import re

	# num_regex = re.compile(r'(\d{3})-(\d{3}-\d{4})')

	# match = num_regex.search('Call me at 415-555-1011 tomorrow.')

	# area_code = match.group(1)
	# number = match.group(2)

	# print(f'Phone number found: ({area_code}) {number}')

	# ha_regex = re.compile(r'(((Ha){4})+)')

	# match_ha = ha_regex.findall('I laughed like HaHaHa HaHaHa HaHa HAHaHaHaHaha HaHahaHaHaHA and else-like.')

	# print(match_ha)

	# phoneNumRegex = re.compile(r'(\d{3})-(\d{3})-(\d{4})')

	# print(phoneNumRegex.findall('Cell: 415-555-9999 Work: 212-555-0000'))

	# # create own char class

	# custom_char_class_regex = re.compile(r'[RNVE]\w+')

	# b = custom_char_class_regex.findall('ReactJS, VueJS, NodeJS, ExpressJS, C++, Java')

	# print(b) # oh, it returns only the keywords that match my skillset !

	# begins_with_hello = re.compile(r'^Hello')
	# print(begins_with_hello.search('Hello, world!'))

	# print(begins_with_hello.search('I said hello.') == None)

	# ends_with_number = re.compile(r'\d$')
	# print(ends_with_number.search('Your number is 42'))

	# # I always confuse the meanings of these two symbols, so I use the mnemonic “Carrots cost dollars” to remind myself that the caret comes first and the dollar sign comes last.

	# wild_card = re.compile(r'.@gmail.com')
	# print(wild_card.search('call that number or email hello@gmail.com'))

	# atRegex = re.compile(r'.at')
	# print(atRegex.findall('The cat in the hat sat on the flat mat.'))


	# Live examples. The statements below sit at column 0 so the module parses.
import re

# \w does not match '.', so every hit stops before the ".com" suffix.
email_regex = re.compile(r'\w+@\w+')

m = email_regex.findall('my email is abc@gmail.com and his is notarealemail@gmail.com and yours is email@email.com')

print(m)

# Greedy: {3,5} takes as many repetitions as it can.
greedy_regex = re.compile(r'(Ha){3,5}')
mo1 = greedy_regex.search('HaHaHaHaHa')
print(mo1.group())

# Non-greedy: the trailing ? makes {3,5} take as few repetitions as it can.
non_greedy_regex = re.compile(r'(Ha){3,5}?')
mo2 = non_greedy_regex.search('HaHaHaHaHa')
print(mo2.group())

# Optional parentheses around the area code.
phone_regex = re.compile(r'\(?\d{3}\)?-\d{3}-\d{4}')

l = phone_regex.findall('The first phone number is (713)-214-5039 and the second is 281-889-2034. The suite number is L-303')

print(l)

# re.DOTALL lets the dot match newline characters as well.
newline_regex = re.compile('.*', re.DOTALL)

o = newline_regex.search('Serve the public trust.\nProtect the innocent.\nUphold the law.').group()

print(o)

# IGNORE CASE

robocop = re.compile(r'robocop', re.I)
print(robocop.search('RoboCop is part man, part machine.').group())

print(robocop.sub('An android', 'RoboCop is part man, part machine.'))

# \1 in the replacement refers to group 1: the first letter of the name.
agent_names_regex = re.compile(r'Agent (\w)\w*')
print(agent_names_regex.sub(r'\1****', 'Agent Alice told Agent Carol that Agent Eve knew Agent Bob was a double agent.'))

# VERBOSE MODE EXAMPLE
# With re.VERBOSE, whitespace inside the pattern is ignored and everything
# after a '#' on a line is a comment.

verbose_regex = re.compile(r'''(
    (\d{3}|\(\d{3}\))?              # area code, bare or in parentheses
    (\s|-|\.)?                      # separator
    \d{3}                           # first 3 digits
    (\s|-|\.)                       # separator
    \d{4}                           # last 4 digits
    (\s*(ext|x|ext\.)\s*\d{2,5})?   # extension; the dot is escaped so it is literal
    )''', re.VERBOSE)

# Combine several flags into one compile() argument with the pipe | operator.

multiple_arg_regex = re.compile('foo', re.IGNORECASE | re.DOTALL | re.VERBOSE)