Skip to content

Instantly share code, notes, and snippets.

@gibiansky
Last active January 31, 2016 17:57
Show Gist options
  • Save gibiansky/99acba95707f856fa253 to your computer and use it in GitHub Desktop.
Save gibiansky/99acba95707f856fa253 to your computer and use it in GitHub Desktop.
Fancy parsing
# If you use triple quotes, the strings can span multiple lines...
# Let's define our test data...
my_test_data = """
<strong>LHD Contact Information:</strong>
</div>
<div class="right">
<strong>Alameda County Public Health Department</strong> <strong>(NACCHO Member)</strong><br />
1000 Broadway Ste 500<br />
Oakland, CA 94607-4033<br />
Phone: (510)267-8000<br />
Fax: (510)267-3212<br />
<a href="mailto:muntu.davis@acgov.org">E-mail</a>
</div>
"""

# Let's split it into lines like we usually do.
# Note: the text starts and ends with a newline, so the first and last
# entries of 'lines' are empty strings.
lines = [y.strip() for y in my_test_data.split("\n")]

# Old way: construct the list using a list comprehension
new_list = [y.strip() for y in lines]  # just an example, doesn't do anything (lines are already stripped)

# New option: construct the list using a for loop
new_list = []
for item in lines:
    new_list.append(item.strip())  # does the same thing as the previous example, aka nothing

# Now use a 'for' loop to iterate over the lines directly.
# The only list we build here is 'phones': start from an empty list and use .append()
number_of_br_tags_seen = -1  # leave this at -1 until we hit a LHD Contact Information
phones = []  # collect the phone number lines into a list
for line in lines:
    print("Processing " + line)

    # reset number_of_br_tags_seen to 0 when we see LHD Contact Information in a line
    if "LHD Contact Information" in line:  # 'in' checks for the string anywhere in the line
        print("number_of_br_tags_seen = 0")
        number_of_br_tags_seen = 0

    # if number_of_br_tags_seen is <0, then it's -1, so we have not reached
    # the contact section yet and this line means nothing
    if number_of_br_tags_seen < 0:
        print("skipping line")
        continue  # continue means skip the rest of the loop body for this line

    # now if this ends with <br /> add to number_of_br_tags_seen
    if line.endswith("<br />"):
        number_of_br_tags_seen = number_of_br_tags_seen + 1
        print("number_of_br_tags_seen now equal to " + str(number_of_br_tags_seen))

        # The phone number is the line that ends with the 4th <br /> tag.
        # Checking here (only right after the counter changes) means a later
        # line without a <br /> tag can never be mistaken for the phone line
        # while the counter still happens to be 4.
        if number_of_br_tags_seen == 4:
            print("found phone " + line)
            phones.append(line)
# ==== Output =====
# Processing
# skipping line
# Processing <strong>LHD Contact Information:</strong>
# number_of_br_tags_seen = 0
# Processing </div>
# Processing <div class="right">
# Processing <strong>Alameda County Public Health Department</strong> <strong>(NACCHO Member)</strong><br />
# number_of_br_tags_seen now equal to 1
# Processing 1000 Broadway Ste 500<br />
# number_of_br_tags_seen now equal to 2
# Processing Oakland, CA 94607-4033<br />
# number_of_br_tags_seen now equal to 3
# Processing
# Processing Phone: (510)267-8000<br />
# number_of_br_tags_seen now equal to 4
# found phone Phone: (510)267-8000<br />
# Processing Fax: (510)267-3212<br />
# number_of_br_tags_seen now equal to 5
# Processing <a href="mailto:muntu.davis@acgov.org">E-mail</a>
# Processing </div>
# Processing
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment