# demisjohn/Python RegEx Notes.py

## Python RegEx Notes.py
import re       # regex

#   This will be used to capture a group () within a larger string, and save that group to a variable

# Example string to search.  It is defined BEFORE the search below so that `f1`
# exists when dcpattern.search() runs.
f1 = 'Iinj=1.0mA, Vdc=2.220V - 08 Oct 2013, 1130_28- Optical Spectrum.jpg'

# Set regex pattern to match
dcpattern = re.compile(r'DC=[-]?(\d+\.?\d*|\.\d+)V?', flags=re.IGNORECASE)
# The regex expression is written as a 'raw' python string (r'...'), so that python
# hands the backslashes (\d, \. etc.) to the regex engine untouched instead of
# treating them as string escapes.

# The Regular Expression
#	DC=[-]?(\d+\.?\d*|\.\d+)V?
# searches for exact text match
#	DC=
#   (the pattern is not anchored, so this also matches the "dc=" inside "Vdc=")
# 0 or 1 instances (the '?') of the set of characters
#	-
#   NOTE: the '-' sits outside the capture group, so a leading minus sign is
#   matched but not captured; the extracted value is always unsigned.
# Start a group to capture via () parentheses
#   This is how we extract the part of the string we're looking for.
# Find one or more (+) digits (\d), via
#	\d+
# maybe (?) a period.  The \ escaping IS needed: an unescaped . matches any character.
#	\.?
# And any number (*) of further digits (matches nothing if there are none)
#	\d*
# OR (|), for numbers written without a leading zero such as ".5":
# a period followed by one or more digits
#	\.\d+
#   Requiring at least one digit means the group can never be '' or a lone '.',
#   both of which would make float() raise ValueError.
# This forms the 1st 'group' to capture, since it's enclosed in ()
#   Here we captured only the numbers.
# Maybe (?) has a
#	V
# Ignored case in the whole expression, but we could have also specified
#	[Dd][Cc]
# and
#	[Vv]
# (sets of characters that include both upper & lower case) to accomplish the same thing


# perform the search:
m = dcpattern.search(f1)    # use regex pattern to extract DC value from filename (see above for regex definition, re.compile() )
# m is a match object, or None if the pattern was not found anywhere in f1.
# It contains any 'groups' () defined in the RegEx pattern.
if m:
    Vdc = float(m.group(1))     # grab 1st group from RegEx & convert to float
    # groups() returns a tuple of all captured groups.
    # A single pre-formatted argument prints identically under python 2 and 3.
    print("DC value found: %s  -->  %s (V)" % (m.groups(), Vdc))

# for the example f1 above, after the search we get:
#   m.groups() == ('2.220',)
#   Vdc == 2.22


'''Other useful RegEx tokens
.   - any single character
.*  - any single character, any number of times (eg. any number of characters)
+ - like *, but can't be zero characters (only one or more)

\d - a single decimal digit (0-9)

? - may or may not have the preceding char, eg.
    0?  means maybe has a 0

\s - any whitespace (tab, space, newline etc.)

Capture number with possible decimal point:
(\d+\.?\d*)     - One or more digits (must include left-most 0 then, ie. ".045" won't match, only "0.045"), followed by Maybe a ".", followed by any number of (including none) digits

Match either of two words:
(?:wordone|wordtwo)      (?: ... ) means group but don't capture (the group gets no number, so it is not returned by m.groups() )
'''
	import re # regex

	# This will be used to capture a group () within a larger string, and save that group to a variable

	# Set regex pattern to match
dcpattern = re.compile(r'DC=[-]?(\d+\.?\d*|\.\d+)V?', flags=re.IGNORECASE)
# regex expression within a 'raw' python string (r'...'), so the backslashes
# (\d, \. etc.) reach the regex engine instead of being read as string escapes.

# The Regular Expression
#   DC=[-]?(\d+\.?\d*|\.\d+)V?
# searches for exact text match
#   DC=
# 0 or 1 instances (the '?') of the set of characters
#   -
# (this sits outside the capture group, so a minus sign is matched but not captured)
# Start a group to capture via () parentheses
# This is how we extract the part of the string we're looking for.
# Find one or more (+) digits (\d), via
#   \d+
# maybe (?) a period (the \ escaping is required: a bare . matches any character)
#   \.?
# And any number (*) of further digits (matches nothing if there are none)
#   \d*
# or (|) a period followed by one or more digits, for values like ".5"
#   \.\d+
# At least one digit is always required, so float() never receives '' or '.'.
# This forms the 1st 'group' to capture, since it's enclosed in ()
# Here we captured only the numbers.
# Maybe (?) has a
#   V
# Ignored case in the whole expression, but we could have also specified
#   [Dd][Cc]
# and
#   [Vv]
# (sets of characters that include both upper & lower case) to accomplish the same thing


# for example, if
f1 = 'Iinj=1.0mA, Vdc=2.220V - 08 Oct 2013, 1130_28- Optical Spectrum.jpg'

# perform the search (f1 must already be defined at this point):
m = dcpattern.search(f1)    # use regex pattern to extract DC value from filename (see above for regex definition, re.compile() )
# m is a match object (None when nothing matched) holding any 'groups' () defined in the RegEx pattern.
if m:
    Vdc = float(m.group(1))     # grab 1st group from RegEx & convert to float
    # groups() returns a tuple of all captured groups; one pre-formatted
    # argument keeps the output the same under python 2 and python 3.
    print("DC value found: %s  -->  %s (V)" % (m.groups(), Vdc))

# after the search, we get:
#   Vdc == 2.22




	'''Other useful RegEx tokens
	. - any single character
	.* - any single character, any number of times (eg. any number of characters)
	+ - like *, but can't be zero characters (only one or more)

	\d - a single decimal digit (0-9)

	? - may or may not have the preceding char, eg.
	0? means maybe has a 0

	\s - any whitespace (tab, space, newline etc.)

	Capture number with possible decimal point:
	(\d+\.?\d*) - One or more digits (must include left-most 0 then, ie. ".045" won't match, only "0.045"), followed by Maybe a ".", followed by any number of (including none) digits

	Match either of two words:
	(?:wordone|wordtwo) (?: ... ) means group but don't capture (the group gets no number, so it is not returned by m.groups() )
	'''