Skip to content

Instantly share code, notes, and snippets.

@ChillarAnand
Last active January 21, 2016 14:31
Show Gist options
  • Save ChillarAnand/7c8a05ca69037accbc9b to your computer and use it in GitHub Desktop.
Save ChillarAnand/7c8a05ca69037accbc9b to your computer and use it in GitHub Desktop.
BangPypers talk - Using Regular Expressions by Arvind Padmanabhan
import re
# Basic matching demos. Each entry is (re function, pattern, subject[, flags]);
# the result of every call is printed in order.
_basic_demos = [
    (re.match, r'bc', 'abc'),               # match() only tries at position 0
    (re.match, r'abc', 'abc'),
    (re.search, r'bc', 'abc'),              # search() scans the whole subject
    (re.search, r'^bc', 'abc'),             # ^ anchors to the start
    (re.search, r'^ab$', 'abc'),            # $ anchors to the end
    (re.search, r'^abc$', 'abc'),
    (re.search, r'a?bc', 'bc'),             # ? : zero or one
    (re.search, r'a?bc', 'abc'),
    (re.search, r'a?bc', 'aabc'),
    (re.search, r'a*bc', 'aabc'),           # * : zero or more
    (re.search, r'a+bc', 'aabc'),           # + : one or more
    (re.search, r'a+bc', 'bc'),
    (re.search, r'a*bc', 'bc'),
    (re.search, r'a*bc', 'BC'),             # case-sensitive by default
    (re.search, r'a*bc', 'BC', re.I),       # re.I ignores case
    (re.search, r'a.*bc', 'aabbcabcd'),     # greedy quantifier
    (re.search, r'a.*?bc', 'aabbcabcd'),    # lazy quantifier
    (re.search, r'a.*?d', 'ababdbabd'),
    (re.search, r'a[^d]*d', 'ababdbabd'),   # negated character class
    (re.findall, r'a[^d]*d', 'ababdbabd'),  # findall() returns every match
    (re.search, r'\d+', 'The year is 2015.'),
    # Lookahead: first names that are followed by the surname "Khan".
    (re.findall, r'\w+(?=\s+Khan)', 'Salman Khan, Shahrukh Khan, Aamir Khan and Fardeen Khan are famous actors.'),
]
for _demo_func, *_demo_args in _basic_demos:
    print(_demo_func(*_demo_args))
#======================================================================
# Substitution works on text (str) and on its encoded form (bytes), provided
# pattern, replacement and subject are all of the same kind.
s = 'Я люблю мороженое'
_encoded = s.encode()
print(_encoded)
_el_text = re.compile('л')
print(_el_text.sub('ѫ', s))
_el_bytes = re.compile(b'\xd0\xbb')
print(_el_bytes.sub(b'\xd1\xab', _encoded).decode())
# Mixing a bytes pattern with a str replacement is an error:
# re.sub(b'\xd0\xbb', 'ѫ', s.encode()).decode()
#======================================================================
# Extract year, month and date from the /YYYY/M/D/ part of an article URL.
# The month and day may be written with one or two digits.
_date_in_path = re.compile(r'/(\d{4})/(\d{1,2})/(\d{1,2})/')
for url in (
    'http://techcrunch.com/2015/08/15/the-future-of-marketplace-lending-lessons-from-an-advertising-past/',
    'http://techcrunch.com/2015/8/15/the-future-of-marketplace-lending-lessons-from-an-advertising-past/',
):
    print(_date_in_path.findall(url))
# Extract blog category or tag: group 1 is the literal kind ("category" or
# "tag"), group 2 is everything after it.
_blog_section = re.compile(r'/blog/(category|tag)/(.*)')
for url in (
    'http://iedf.in/index.php/blog/category/electrical-electronics',
    'http://iedf.in/index.php/blog/tag/innovation',
):
    print(_blog_section.findall(url))
# Redirect articles of 2013 and older to archives with renamed file:
# /articles/<year>/<month>/<file> becomes /archives/<year>-<month>-<file>.
# URLs whose year is 2014 or later do not match and are left untouched.
urls = [
    'http://www.example.io/articles/2013/jan/how-to-format-hdd.htm',
    'http://www.example.io/articles/2001/dec/how-to-format-hdd.htm',
    'http://www.example.io/articles/1994/apr/how-to-format-hdd.htm',
    'http://www.example.io/articles/2014/oct/how-to-format-hdd.htm',
]
_old_article = re.compile(r'/articles/(?P<year>1\d{3}|200\d|201[0-3])/(\w+)/')
# The replacement mixes a named reference (year) with a numbered one (month).
_archive_path = r'/archives/\g<year>-\2-'
print([_old_article.sub(_archive_path, _link) for _link in urls])
# Redirect all references from some folders to a sub-domain: image files under
# /images/ or /icons/ (http or https) are served from cdn.example.io/imgs/
# keeping the remainder of their path.
urls = [
    'http://www.example.io/images/12343341.jpg',
    'http://www.example.io/images/thumbs/798788324.png',
    'https://www.example.io/images/events/photos/hackathon-2015/345145.png',
    'http://www.example.io/icons/animated/aasd.gif',
]
_static_asset = re.compile(r'^https?://www\.example\.io/(?:images|icons)/(.*\.(?:jpg|png|gif))')
_cdn_target = r'https://cdn.example.io/imgs/\1'
print([_static_asset.sub(_cdn_target, _link) for _link in urls])
#======================================================================
import re
# Sample text containing dates written as YYYY-M-D.
s = """
The original document was signed on 1998-1-31.
An ammendment was approved on 2015-02-24. It is expected to be signed on
2015-10-1
"""
_ymd = re.compile(r'(\d{4})-(\d{1,2})-(\d{1,2})')

# Convert from YYYY-MM-DD to DD-MM-YYYY date format. The same replacement
# template is spelled twice: with escaped backslashes, then as a raw string.
for _template in ('\\3-\\2-\\1', r'\3-\2-\1'):
    print(_ymd.sub(_template, s))

# Extract individual fields of first match
m = _ymd.search(s)
print(m)
print(m.groups())
year, month, day = m.groups()
print(year, month, day)

# Extract individual fields of first match by named groups
m = re.search(r'(?P<year>\d{4})-(?P<month>\d{1,2})-(?P<day>\d{1,2})', s)
print(*(m.group(_field) for _field in ('day', 'month', 'year')))
#======================================================================
import re
# One line per batsman: a name followed by scores, separated inconsistently
# by spaces, commas or colons.
runs = """Dhawan 23 101 44 11 76
Kohli 111,23,12, 58, 90
Saha 45: 8 : 37 : 65 : 121"""
# A field separator is either a comma/colon with optional surrounding
# whitespace, or a run of whitespace on its own. It must always consume at
# least one character: a separator that can match the empty string makes
# re.split() cut between every character on Python 3.7+.
_FIELD_SEP = re.compile(r'\s*[,:]\s*|\s+')
parsed_runs = [_FIELD_SEP.split(line) for line in re.split(r'\n+', runs)]
print(parsed_runs)
#======================================================================
import re
s = "My name is Raju. I was born in Delhi. I came to Mumbai in 1994. Though I've lived here ever since, I miss Delhi."
# A run of two or more spaces that sits between a full stop (lookbehind) and a
# capital letter (lookahead); only the spaces themselves are matched.
_sentence_gap = re.compile(r'(?<=\.) {2,}(?=[A-Z])')
print(_sentence_gap.findall(s))
# sub() returns the new text; subn() also reports how many replacements it made.
print(_sentence_gap.sub(' ', s))
print(_sentence_gap.subn(' ', s))
#======================================================================
import re
L = ['Hindi', 'English', 'Kannada', 'Urdu', 'Punjabi', 'Tamil', 'Assamese']
# Map each language name to (number of letters, number of vowels); vowels are
# counted case-insensitively.
_vowel = re.compile('[aeiou]', re.I)
D = dict(zip(L, ((len(_name), len(_vowel.findall(_name))) for _name in L)))
print(D)
#======================================================================
import re
s = "'Well, I've tried to say \"How Doth the Little Busy Bee,\" but it all came different!' Alice replied in a very melancholy voice."
# Find opening quotation marks together with the text they enclose.
#   ([\'"])                       group 1: the quote character
#   (?!(?:ve|m|re|s|t|d|ll)\b)    skip apostrophes of contractions ('ve, 'm, ...);
#                                 the \b keeps a real opening quote in front of a
#                                 word such as "so" or "the" from being skipped
#   (?=(.*)\1)                    group 2: everything up to the LAST later
#                                 occurrence of the same quote character
# Inside a character class "\1" is an octal escape, not a backreference, so
# "[^\1]*" effectively meant "any characters"; that is now written as ".*"
# with re.S so the quoted text may still span lines.
QUOTE_RE = re.compile(r'([\'"])(?!(?:ve|m|re|s|t|d|ll)\b)(?=(.*)\1)', re.S)
quotations = QUOTE_RE.findall(s)
for i, groups in enumerate(quotations):
    print("Group {:d}: ".format(i+1))
    for g in groups:
        print(' '+g)
#======================================================================
import re
# Sample log text: a mix of well-formed addresses, addresses with an octet
# above 255, and one address glued to a preceding word.
ip_str = """
Remote address 11.242.97.38
111.200.251.63 was deleted
222.97.98.180 access denied
Unknown address:71.7.287.38
Something123.63.97.29
26.98.73.262
99.125.34.153
121.226.291.143
122.215.259.80
88.32.172.106
"""
# Building blocks: any 1-3 digit run, and the alternation for an octet 0-255.
_any_octet = r'\d{1,3}'
_valid_octet = r'25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?'

# Match IP address format without range checking
patt = re.compile(r'\b' + r'\.'.join([_any_octet] * 4) + r'\b')
print(patt.findall(ip_str))

# Match valid IP addresses (non-capturing groups, so findall() returns the
# whole address as one string)
patt = re.compile(r'\b(?:(?:' + _valid_octet + r')\.){3}(?:' + _valid_octet + r')\b')
print(patt.findall(ip_str))

# Obtain individual fields from valid IP addresses (one capturing group per
# octet, so findall() returns 4-tuples)
patt = re.compile(r'\b' + r'\.'.join(['(' + _valid_octet + ')'] * 4) + r'\b')
print(patt.findall(ip_str))
#======================================================================
import re
def get_inr(number):
    """Return *number* as a string with Indian-style digit grouping.

    The last three digits of the integer part form one group and every pair
    of digits before them forms another, e.g. 1234567 -> '12,34,567'.
    A leading sign and a fractional part are passed through unchanged.

    Args:
        number: an int or float (anything whose ``str()`` is a decimal number).

    Returns:
        The formatted string.
    """
    # Group only the integer part; otherwise a long fraction such as
    # 1.23456 would get commas inserted after the decimal point.
    whole, point, fraction = str(number).partition('.')
    # Put a comma after every digit that is followed by exactly 3 digits, or
    # by 3 digits plus a whole number of digit pairs, up to the end of the
    # digit run. The replacement is a raw string: "\g" is not a valid escape
    # in a normal string literal.
    grouped = re.sub(r"\d(?=(?:\d{2})+\d{3}(?!\d)|\d{3}(?!\d))", r"\g<0>,", whole)
    return grouped + point + fraction
def get_inr_x(number):
    """Return *number* with Indian-style digit grouping (verbose-regex variant).

    Produces strings such as '12,34,567' for 1234567, using a pattern written
    in re.X (verbose) mode so that each alternative can carry a comment.
    A leading sign and a fractional part are passed through unchanged.

    Args:
        number: an int or float (anything whose ``str()`` is a decimal number).

    Returns:
        The formatted string.
    """
    patt = re.compile(r"""\d(?=          # a digit that is followed by ...
        (?:\d{2})+\d{3}(?!\d) |          # pairs of digits, then the last 3 (>=100000)
        \d{3}(?!\d)                      # exactly the last 3 digits (>=1000 && <100000)
        )""", re.X)
    # Group only the integer part; otherwise a long fraction such as
    # 1.23456 would get commas inserted after the decimal point.
    whole, point, fraction = str(number).partition('.')
    # Raw replacement string: "\g" is not a valid escape in a normal literal.
    return patt.sub(r"\g<0>,", whole) + point + fraction
# Show the grouping for powers of ten from 10**0 to 10**9: as positive and
# negative integers, then with 0.45 added to the magnitude. One output line
# per series, values separated by single spaces.
_exponents = range(10)
for _series in (
    [10**x for x in _exponents],
    [-10**x for x in _exponents],
    [10**x+0.45 for x in _exponents],
    [-10**x-0.45 for x in _exponents],
):
    print(' '.join(map(get_inr, _series)))
#======================================================================
import re
content = """
When Raju is released from prison after serving two years
for forgery and for embezzlement, Raju goes to the temple
located on the Sarayu River in his hometown of Malgudi,
which is far from prison. He thinks prison is not too bad
a place, and he is wondering what to do next with his life.
Then a villager named Velan shows up and, taking Raju for
a holy wise man or guru, consults with him about his sister,
who refuses to marry as the family wishes. Well aware that
he is not a guru, Raju is evasive, but Velan brings his
sister anyway, and after their meeting she conforms to her
family’s wishes. So begins Raju’s life as a holy man.
"""
# Words that occur again later in the same sentence (no full stop in between),
# compared case-insensitively. Three variants of the pattern are shown.
_repeat_patterns = (
    # correct: the repeat is only looked ahead for, so nothing past the word
    # itself is consumed
    r'\b(\w+)\b(?=[^\.]+\b\1\b)',
    # for debugging: additionally captures the text between the occurrences
    r'\b(\w+)\b(?=([^\.]+)\b\1\b)',
    # nested/overlapping occurrences are not matched: the text up to the
    # repeat is consumed by the match
    r'\b(\w+)\b[^\.]+\b\1\b',
)
for _pattern in _repeat_patterns:
    print(re.compile(_pattern, re.I).findall(content))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment