# ChillarAnand/regex-demo.py

## regex-demo.py
import re
# Basic matching walkthrough. Each case is (re function, pattern, subject,
# flags); the cases run in order and every result is printed on its own line.
_basic_cases = [
    # match() only tries at the start of the subject; search() scans it.
    (re.match, r'bc', 'abc', 0),
    (re.match, r'abc', 'abc', 0),
    (re.search, r'bc', 'abc', 0),
    # Anchors: ^ is the start of the subject, $ is the end.
    (re.search, r'^bc', 'abc', 0),
    (re.search, r'^ab$', 'abc', 0),
    (re.search, r'^abc$', 'abc', 0),
    # Quantifiers: ? (zero or one), * (zero or more), + (one or more).
    (re.search, r'a?bc', 'bc', 0),
    (re.search, r'a?bc', 'abc', 0),
    (re.search, r'a?bc', 'aabc', 0),
    (re.search, r'a*bc', 'aabc', 0),
    (re.search, r'a+bc', 'aabc', 0),
    (re.search, r'a+bc', 'bc', 0),
    (re.search, r'a*bc', 'bc', 0),
    # Matching is case-sensitive unless re.I is passed.
    (re.search, r'a*bc', 'BC', 0),
    (re.search, r'a*bc', 'BC', re.I),
    # Greedy .* versus lazy .*? versus a negated character class.
    (re.search, r'a.*bc', 'aabbcabcd', 0),
    (re.search, r'a.*?bc', 'aabbcabcd', 0),
    (re.search, r'a.*?d', 'ababdbabd', 0),
    (re.search, r'a[^d]*d', 'ababdbabd', 0),
    (re.findall, r'a[^d]*d', 'ababdbabd', 0),
    (re.search, r'\d+', 'The year is 2015.', 0),
    # Lookahead: the word must be followed by " Khan", which is not consumed.
    (re.findall, r'\w+(?=\s+Khan)',
     'Salman Khan, Shahrukh Khan, Aamir Khan and Fardeen Khan are famous actors.', 0),
]
for _func, _pattern, _subject, _flags in _basic_cases:
    print(_func(_pattern, _subject, _flags))


#======================================================================
s = 'Я люблю мороженое'
# The encoded (bytes) form of the sample text
_encoded = s.encode()
print(_encoded)
# str pattern and str replacement applied to the str form
print(re.sub('л', 'ѫ', s))
# bytes pattern and bytes replacement applied to the bytes form, decoded for display
_swapped = re.sub(b'\xd0\xbb', b'\xd1\xab', _encoded)
print(_swapped.decode())
# Mixing a bytes pattern with a str replacement is an error:
#print(re.sub(b'\xd0\xbb', 'ѫ', s.encode()).decode()) # error


#======================================================================
# Pull the year, month and day out of a dated article URL
_date_in_path = re.compile(r'/(\d{4})/(\d{1,2})/(\d{1,2})/')
for url in ('http://techcrunch.com/2015/08/15/the-future-of-marketplace-lending-lessons-from-an-advertising-past/',
            'http://techcrunch.com/2015/8/15/the-future-of-marketplace-lending-lessons-from-an-advertising-past/'):
    print(_date_in_path.findall(url))

# Pull the listing kind (category or tag) and its slug out of a blog URL
_blog_listing = re.compile(r'/blog/(category|tag)/(.*)')
for url in ('http://iedf.in/index.php/blog/category/electrical-electronics',
            'http://iedf.in/index.php/blog/tag/innovation'):
    print(_blog_listing.findall(url))

# Articles dated 2013 or earlier move under /archives/ with the year and month
# folded into the file name; URLs with a later year are left unchanged.
urls = ['http://www.example.io/articles/2013/jan/how-to-format-hdd.htm',
        'http://www.example.io/articles/2001/dec/how-to-format-hdd.htm',
        'http://www.example.io/articles/1994/apr/how-to-format-hdd.htm',
        'http://www.example.io/articles/2014/oct/how-to-format-hdd.htm']
_old_article = re.compile(r'/articles/(?P<year>1\d{3}|200\d|201[0-3])/(\w+)/')
print([_old_article.sub(r'/archives/\g<year>-\2-', link) for link in urls])

# Everything under the images/ and icons/ folders is redirected to the CDN
# sub-domain, keeping the path below the folder.
urls = ['http://www.example.io/images/12343341.jpg',
        'http://www.example.io/images/thumbs/798788324.png',
        'https://www.example.io/images/events/photos/hackathon-2015/345145.png',
        'http://www.example.io/icons/animated/aasd.gif']
_static_asset = re.compile(r'^https?://www\.example\.io/(?:images|icons)/(.*\.(?:jpg|png|gif))')
print([_static_asset.sub(r'https://cdn.example.io/imgs/\1', link) for link in urls])


#======================================================================
import re

s = """
The original document was signed on 1998-1-31.
An ammendment was approved on 2015-02-24. It is expected to be signed on
2015-10-1
"""

# Rewrite every YYYY-MM-DD date as DD-MM-YYYY. The replacement template is
# spelled two ways (doubled backslashes, then a raw string) to show that both
# spellings produce the same result.
_iso_date = re.compile(r'(\d{4})-(\d{1,2})-(\d{1,2})')
for _template in ('\\3-\\2-\\1', r'\3-\2-\1'):
    print(_iso_date.sub(_template, s))

# Fields of the first date only, read from the positional groups
m = _iso_date.search(s)
print(m)
_fields = m.groups()
print(_fields)
year, month, day = _fields
print(*_fields)

# Fields of the first date only, read from named groups
_named_date = re.compile(r'(?P<year>\d{4})-(?P<month>\d{1,2})-(?P<day>\d{1,2})')
m = _named_date.search(s)
print(*m.group('day', 'month', 'year'))


#======================================================================
import re
runs = """Dhawan    23     101      44       11       76
Kohli    111,23,12,  58,   90
Saha      45:    8  :   37   :   65  :  121"""
# Split every line into its fields. A field separator is either a comma or
# colon with optional surrounding whitespace, or a run of whitespace alone.
# The separator must consume at least one character: a pattern that can match
# the empty string (such as r'\s*[,:]?\s*') makes re.split() break between
# every character on Python 3.7+.
scores = [re.split(r'\s*[,:]\s*|\s+', line) for line in re.split(r'\n+', runs)]
print(scores)


#======================================================================
import re
s = "My name is Raju. I was born in Delhi.  I came to Mumbai in 1994.    Though I've lived here ever since, I miss Delhi."
# A run of two or more spaces that follows a full stop and precedes a capital
# letter. Both neighbours are lookarounds, so only the spaces are matched.
_extra_gap = re.compile(r'(?<=\.) {2,}(?=[A-Z])')
print(_extra_gap.findall(s))    # the runs themselves
print(_extra_gap.sub(' ', s))   # each run collapsed to one space
print(_extra_gap.subn(' ', s))  # same, plus the number of replacements


#======================================================================
import re
L = ['Hindi', 'English', 'Kannada', 'Urdu', 'Punjabi', 'Tamil', 'Assamese']
# Map each name to (number of letters, number of vowels); vowels are counted
# case-insensitively.
D = {}
for _name in L:
    _vowels = re.findall('[aeiou]', _name, re.I)
    D[_name] = (len(_name), len(_vowels))
print(D)


#======================================================================
import re
s = "'Well, I've tried to say \"How Doth the Little Busy Bee,\" but it all came different!' Alice replied in a very melancholy voice."
# Find each opening quote mark and the text it encloses.
#   group 1: the quote character (' or ")
#   group 2: everything up to the LAST later occurrence of that same character
# The negative lookahead skips apostrophes of contractions ('ve, 'm, 're, 's,
# 't, 'd, 'll). The trailing \b is required: without it a quote in front of any
# word that merely starts with one of those letters (e.g. "the) is skipped too.
# Note that a backreference cannot be used inside a character class -- [^\1]
# means "anything but chr(1)" -- so the enclosed text is matched with a plain
# greedy .* followed by the backreference.
quote_patt = re.compile(r'([\'"])(?!(?:ve|m|re|s|t|d|ll)\b)(?=(.*)\1)')
for i,m in enumerate(quote_patt.finditer(s)):
    print("Group {:d}: ".format(i+1))
    for g in m.groups():
        print('  '+g)


#======================================================================
import re

ip_str = """
Remote address 11.242.97.38
111.200.251.63 was deleted
222.97.98.180 access denied
Unknown address:71.7.287.38
Something123.63.97.29
26.98.73.262
99.125.34.153
121.226.291.143
122.215.259.80
88.32.172.106
"""

# Alternatives for one octet in the range 0-255
_OCTET = r'25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?'

# Dotted-quad shape only: four groups of 1-3 digits, no range check
patt = re.compile(r'\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b')
print(patt.findall(ip_str))

# Addresses whose four octets are all in range; the groups are non-capturing,
# so findall() returns the whole address
patt = re.compile(r'\b(?:(?:' + _OCTET + r')\.){3}(?:' + _OCTET + r')\b')
print(patt.findall(ip_str))

# Same check with one capturing group per octet, so findall() returns a
# 4-tuple of fields for every valid address
patt = re.compile(r'\b' + r'\.'.join(['(' + _OCTET + ')'] * 4) + r'\b')
print(patt.findall(ip_str))


#======================================================================
import re
def get_inr(number):
    """Return str(number) with Indian-style digit grouping.

    The last three digits of the integer part form one group and the digits
    before them are grouped in twos, e.g. 10000000 -> '1,00,00,000'.
    A leading sign is kept as is. Only the part before the first '.' is
    grouped, so fractional digits are never split by commas.
    """
    whole, point, fraction = str(number).partition('.')
    # A digit gets a comma after it when it is followed by zero or more pairs
    # of digits and then exactly three final digits.
    grouped = re.sub(r"\d(?=(?:\d{2})*\d{3}(?!\d))", r"\g<0>,", whole)
    return grouped + point + fraction

def get_inr_x(number):
    """Return str(number) with Indian-style digit grouping (verbose-regex variant).

    The last three digits of the integer part form one group and the digits
    before them are grouped in twos, e.g. 10000000 -> '1,00,00,000'.
    A leading sign is kept as is. Only the part before the first '.' is
    grouped, so fractional digits are never split by commas.
    """
    patt = re.compile(r"""\d(?=                     # a digit that is followed by ...
                              (?:\d{2})+\d{3}(?!\d) |   # pairs of digits, then the last three
                              \d{3}(?!\d)               # or just the last three digits
                            )""", re.X)
    whole, point, fraction = str(number).partition('.')
    return patt.sub(r"\g<0>,", whole) + point + fraction

# Format 10**0 .. 10**9 four ways: as is, negated, with a fractional part
# added, and negated with a fractional part subtracted. One line per variant.
_powers_of_ten = [10**_exp for _exp in range(10)]
print(*[get_inr(_p) for _p in _powers_of_ten])
print(*[get_inr(-_p) for _p in _powers_of_ten])
print(*[get_inr(_p + 0.45) for _p in _powers_of_ten])
print(*[get_inr(-_p - 0.45) for _p in _powers_of_ten])


#======================================================================
import re

content = """
When Raju is released from prison after serving two years
for forgery and for embezzlement, Raju goes to the temple
located on the Sarayu River in his hometown of Malgudi,
which is far from prison. He thinks prison is not too bad
a place, and he is wondering what to do next with his life.
Then a villager named Velan shows up and, taking Raju for
a holy wise man or guru, consults with him about his sister,
who refuses to marry as the family wishes. Well aware that
he is not a guru, Raju is evasive, but Velan brings his
sister anyway, and after their meeting she conforms to her
family’s wishes. So begins Raju’s life as a holy man.
"""

# Words that occur again later without a full stop in between
# (case-insensitive). The three patterns are tried in order.
_repeat_checks = (
    r'\b(\w+)\b(?=[^\.]+\b\1\b)',    # correct
    r'\b(\w+)\b(?=([^\.]+)\b\1\b)',  # for debugging
    r'\b(\w+)\b[^\.]+\b\1\b',        # nested/overlapping occurrences are not matched
)
for _check in _repeat_checks:
    print(re.findall(_check, content, re.I))
	# Second pass of the regex walkthrough, repaired so that it parses and runs:
# top-level statements start at column 0, function and loop bodies are
# indented, alternations use a plain '|', and the '*' / '**' operators and the
# sample whitespace that the demos depend on are in place.
import re
print(re.match(r'bc', 'abc'))
print(re.match(r'abc', 'abc'))
print(re.search(r'bc', 'abc'))
print(re.search(r'^bc', 'abc'))
print(re.search(r'^ab$', 'abc'))
print(re.search(r'^abc$', 'abc'))
print(re.search(r'a?bc', 'bc'))
print(re.search(r'a?bc', 'abc'))
print(re.search(r'a?bc', 'aabc'))
print(re.search(r'a*bc', 'aabc'))
print(re.search(r'a+bc', 'aabc'))
print(re.search(r'a+bc', 'bc'))
print(re.search(r'a*bc', 'bc'))
print(re.search(r'a*bc', 'BC'))
print(re.search(r'a*bc', 'BC', re.I))
print(re.search(r'a.*bc', 'aabbcabcd'))
print(re.search(r'a.*?bc', 'aabbcabcd'))
print(re.search(r'a.*?d', 'ababdbabd'))
print(re.search(r'a[^d]*d', 'ababdbabd'))
print(re.findall(r'a[^d]*d', 'ababdbabd'))
print(re.search(r'\d+', 'The year is 2015.'))
print(re.findall(r'\w+(?=\s+Khan)', 'Salman Khan, Shahrukh Khan, Aamir Khan and Fardeen Khan are famous actors.'))


#======================================================================
s = 'Я люблю мороженое'
print(s.encode())
print(re.sub('л', 'ѫ', s))
print(re.sub(b'\xd0\xbb', b'\xd1\xab', s.encode()).decode())
#print(re.sub(b'\xd0\xbb', 'ѫ', s.encode()).decode()) # error


#======================================================================
# Extract year, month and date
url = 'http://techcrunch.com/2015/08/15/the-future-of-marketplace-lending-lessons-from-an-advertising-past/'
print(re.findall(r'/(\d{4})/(\d{1,2})/(\d{1,2})/', url))
url = 'http://techcrunch.com/2015/8/15/the-future-of-marketplace-lending-lessons-from-an-advertising-past/'
print(re.findall(r'/(\d{4})/(\d{1,2})/(\d{1,2})/', url))

# Extract blog category or tag
url = 'http://iedf.in/index.php/blog/category/electrical-electronics'
print(re.findall(r'/blog/(category|tag)/(.*)', url))
url = 'http://iedf.in/index.php/blog/tag/innovation'
print(re.findall(r'/blog/(category|tag)/(.*)', url))

# Redirect articles of 2013 and older to archives with renamed file
urls = ['http://www.example.io/articles/2013/jan/how-to-format-hdd.htm',
        'http://www.example.io/articles/2001/dec/how-to-format-hdd.htm',
        'http://www.example.io/articles/1994/apr/how-to-format-hdd.htm',
        'http://www.example.io/articles/2014/oct/how-to-format-hdd.htm']
print([re.sub(r'/articles/(?P<year>1\d{3}|200\d|201[0-3])/(\w+)/',
              r'/archives/\g<year>-\2-', url)
              for url in urls])

# Redirect all references from some folders to a sub-domain
urls = ['http://www.example.io/images/12343341.jpg',
        'http://www.example.io/images/thumbs/798788324.png',
        'https://www.example.io/images/events/photos/hackathon-2015/345145.png',
        'http://www.example.io/icons/animated/aasd.gif']
print([re.sub(r'^https?://www\.example\.io/(?:images|icons)/(.*\.(?:jpg|png|gif))',
              r'https://cdn.example.io/imgs/\1', url)
              for url in urls])


#======================================================================
import re

s = """
The original document was signed on 1998-1-31.
An ammendment was approved on 2015-02-24. It is expected to be signed on
2015-10-1
"""

# Convert from YYYY-MM-DD to DD-MM-YYYY date format
print(re.sub(r'(\d{4})-(\d{1,2})-(\d{1,2})', '\\3-\\2-\\1', s))
print(re.sub(r'(\d{4})-(\d{1,2})-(\d{1,2})', r'\3-\2-\1', s))

# Extract individual fields of first match
m = re.search(r'(\d{4})-(\d{1,2})-(\d{1,2})', s)
print(m)
print(m.groups())
year, month, day = m.groups()
print(year,month,day)

# Extract individual fields of first match by named groups
m = re.search(r'(?P<year>\d{4})-(?P<month>\d{1,2})-(?P<day>\d{1,2})', s)
print(m.group('day'), m.group('month'), m.group('year'))


#======================================================================
import re
runs = """Dhawan    23     101      44       11       76
Kohli    111,23,12,  58,   90
Saha      45:    8  :   37   :   65  :  121"""
# A separator is a comma/colon with optional surrounding whitespace, or
# whitespace alone; it must never match the empty string, otherwise
# re.split() breaks between every character on Python 3.7+.
print([re.split(r'\s*[,:]\s*|\s+',s) for s in re.split(r'\n+',runs)])


#======================================================================
import re
s = "My name is Raju. I was born in Delhi.  I came to Mumbai in 1994.    Though I've lived here ever since, I miss Delhi."
print(re.findall(r'(?<=\.) {2,}(?=[A-Z])', s))
print(re.sub(r'(?<=\.) {2,}(?=[A-Z])', ' ', s))
print(re.subn(r'(?<=\.) {2,}(?=[A-Z])', ' ', s))


#======================================================================
import re
L = ['Hindi', 'English', 'Kannada', 'Urdu', 'Punjabi', 'Tamil', 'Assamese']
D = {x:(len(x),len(re.findall('[aeiou]',x,re.I))) for x in L}
print(D)


#======================================================================
import re
s = "'Well, I've tried to say \"How Doth the Little Busy Bee,\" but it all came different!' Alice replied in a very melancholy voice."
# Group 1 is the quote character, group 2 the text up to the last later
# occurrence of that character. The \b keeps the contraction filter from
# rejecting quotes in front of ordinary words that start with t, s, m, d, ...
for i,m in enumerate(re.finditer(r'([\'"])(?!(?:ve|m|re|s|t|d|ll)\b)(?=(.*)\1)', s)):
    print("Group {:d}: ".format(i+1))
    for g in m.groups():
        print('  '+g)


#======================================================================
import re

ip_str = """
Remote address 11.242.97.38
111.200.251.63 was deleted
222.97.98.180 access denied
Unknown address:71.7.287.38
Something123.63.97.29
26.98.73.262
99.125.34.153
121.226.291.143
122.215.259.80
88.32.172.106
"""

# Match IP address format without range checking
patt = re.compile(r'\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b')
print(patt.findall(ip_str))

# Match valid IP addresses
patt = re.compile(r'\b(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}'
                  r'(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\b')
print(patt.findall(ip_str))

# Obtain individual fields from valid IP addresses
patt = re.compile(r'\b(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.'
                  r'(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.'
                  r'(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.'
                  r'(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\b')
print(patt.findall(ip_str))


#======================================================================
import re
def get_inr(number):
    """Return str(number) with Indian-style digit grouping.

    The last three digits of the integer part form one group and the digits
    before them are grouped in twos, e.g. 10000000 -> '1,00,00,000'. Only the
    part before the first '.' is grouped; a leading sign is kept as is.
    """
    whole, point, fraction = str(number).partition('.')
    grouped = re.sub(r"\d(?=(?:\d{2})*\d{3}(?!\d))", r"\g<0>,", whole)
    return grouped + point + fraction

def get_inr_x(number):
    """Verbose-regex variant of get_inr() with the same result."""
    patt = re.compile(r"""\d(?=                     # a digit that is followed by ...
                              (?:\d{2})+\d{3}(?!\d) |   # pairs of digits, then the last three
                              \d{3}(?!\d)               # or just the last three digits
                            )""", re.X)
    whole, point, fraction = str(number).partition('.')
    return patt.sub(r"\g<0>,", whole) + point + fraction

print(*map(get_inr, [10**x for x in range(10)]))
print(*map(get_inr, [-10**x for x in range(10)]))
print(*map(get_inr, [10**x+0.45 for x in range(10)]))
print(*map(get_inr, [-10**x-0.45 for x in range(10)]))


#======================================================================
import re

content = """
When Raju is released from prison after serving two years
for forgery and for embezzlement, Raju goes to the temple
located on the Sarayu River in his hometown of Malgudi,
which is far from prison. He thinks prison is not too bad
a place, and he is wondering what to do next with his life.
Then a villager named Velan shows up and, taking Raju for
a holy wise man or guru, consults with him about his sister,
who refuses to marry as the family wishes. Well aware that
he is not a guru, Raju is evasive, but Velan brings his
sister anyway, and after their meeting she conforms to her
family’s wishes. So begins Raju’s life as a holy man.
"""

print(re.findall(r'\b(\w+)\b(?=[^\.]+\b\1\b)', content, re.I)) # correct
print(re.findall(r'\b(\w+)\b(?=([^\.]+)\b\1\b)', content, re.I)) # for debugging
print(re.findall(r'\b(\w+)\b[^\.]+\b\1\b', content, re.I)) # nested/overlapping occurrences are not matched