Skip to content

Instantly share code, notes, and snippets.

@AmeliaMN
Created October 22, 2012 18:25
Show Gist options
  • Save AmeliaMN/3933196 to your computer and use it in GitHub Desktop.
Save AmeliaMN/3933196 to your computer and use it in GitHub Desktop.
Python example: Recipe parsing
#!/usr/local/bin/python
#import the modules we will be using in the code
import sys,re,os
# Recipe parser.
#
# Reads the article files named on the command line, extracts every recipe
# found between the <txt> and </txt> tags, and writes each one as tagged text
# (<recipe>, <title>, <ingredients>, <instructions>) to recipes.txt. Files in
# which no recipe, or only part of a recipe, could be recognised are listed in
# broken.txt.

filename = "recipes.txt"  # What our output file will be named
broken_filename = "broken.txt"  # Log of input files that did not parse cleanly

# A line made only of capital letters and whitespace (three or more characters).
# NOTE(review): a line of three or more whitespace characters also matches;
# confirm whether blank-looking lines should count as titles.
uppercase = r"^(\s*[A-Z\s]){3,}\n"
# A line written in Title Case, allowing the usual lowercase joining words.
# NOTE(review): " for " and " in " carry a trailing space, so they only match
# when followed by a second space; this looks like a typo but is kept as-is
# so that title detection does not change.
titlecase = r"^(\s*[A-Z][a-z]*)( [A-Z][a-z]*| and| an| as| at| a| but| by| en| for | if| in | of| on| or| the| to| via| vs| with)*\s*\n"
# Looking for either uppercase or titlecase at the same time
bothcase = "(" + uppercase + ")|(" + titlecase + ")"
pattern = re.compile(bothcase)

# A numbered step such as "1.", optionally prefixed by '#' or '*' markers.
instruction_start = re.compile(r"^(\s*#*\**[0-9]+\.)")
instruction_split = re.compile(r"(\s*#*\**[0-9]+\.)")
# An ingredient is assumed to begin at a word that starts with a digit.
leading_digit = re.compile("[0-9]")
# Lines carrying any of these tags are left out of the ingredient list.
markup_tags = ("<i>", "<p>", "</i>", "</p>")


def _first_index(lines, predicate, start, stop):
    """Return the first index in [start, stop) whose line satisfies predicate, or -1."""
    for index in range(start, min(stop, len(lines))):
        if predicate(lines[index]):
            return index
    return -1


def _ingredient_items(lines):
    """Split the lines between a title and its first step into ingredients.

    Lines containing <i>/<p> markup are skipped. The remaining words are
    joined and a new ingredient starts at every word beginning with a digit;
    a digit-word that directly follows another digit-word ("1 1/2") stays with
    it so mixed fractions are kept together. Words before the first quantity
    form their own item, and if no word starts with a digit the whole text is
    returned as a single item. Returns a list of strings (possibly empty).
    """
    kept = [line for line in lines
            if not any(tag in line for tag in markup_tags)]
    # Joining with a space keeps the last word of one line from fusing with
    # the first word of the next.
    words = " ".join(kept).split()
    if not words:
        return []
    starts = [
        index for index, word in enumerate(words)
        if leading_digit.match(word)
        and (index == 0 or not leading_digit.match(words[index - 1]))
    ]
    if not starts:
        return [" ".join(words)]
    items = []
    if starts[0] != 0:
        # Text ahead of the first quantity is written as its own ingredient.
        items.append(" ".join(words[:starts[0]]))
    bounds = starts + [len(words)]
    for low, high in zip(bounds, bounds[1:]):
        items.append(" ".join(words[low:high]))
    return items


def _instruction_steps(lines):
    """Split the lines from the first numbered step up to "Yield:" into steps.

    Lines that begin with "<p>" are skipped. The rest is joined into one
    paragraph and cut at every step number; text ahead of the first number is
    discarded. Returns a list of strings, each starting with its number.
    """
    kept = [line.strip() for line in lines if not line.startswith("<p>")]
    # The capturing group makes re.split return [lead, num, text, num, text, ...].
    # NOTE(review): any "digits." inside a sentence (e.g. "350.") also starts
    # a new step, exactly as in the original splitting rule.
    pieces = instruction_split.split(" ".join(kept))
    return [
        (pieces[index] + pieces[index + 1]).strip()
        for index in range(1, len(pieces) - 1, 2)
    ]


def _format_recipe(title, ingredient_lines, instruction_lines):
    """Return the tagged text for one recipe.

    title is the raw title line (including its newline); the other two
    arguments are the raw lines of the ingredient and instruction sections.
    """
    parts = ["<recipe> \n" + "\t<title>\n\t\t" + title + "\t</title> \n"]
    parts.append("\t<ingredients> \n")
    for item in _ingredient_items(ingredient_lines):
        parts.append("\t\t<ingredient>" + item + "</ingredient> \n")
    parts.append("\t</ingredients> \n")
    parts.append("\t<instructions> \n")
    for step in _instruction_steps(instruction_lines):
        parts.append("\t\t<instruction>\n\t\t\t" + step + "\n\t\t</instruction> \n")
    parts.append("\t</instructions> \n" + "</recipe> \n")
    return "".join(parts)


def parse_recipes(raw):
    """Extract every recipe from the lines of one article file.

    Only the region from the line containing "<txt>" to the line containing
    "</txt>" is examined, so headlines and bylines are ignored. Within it a
    recipe is a title line (all caps or Title Case), followed by a numbered
    instruction line, followed by a line containing "Yield:". Scanning stops
    at the first recipe that is missing one of these parts.

    Returns a tuple (text, titlecount, yieldcount): the tagged text of all
    complete recipes, the number of titles seen and the number of "Yield:"
    lines seen. A title without a matching yield is counted but produces no
    output, so the written text never contains an unclosed <recipe>.
    """
    start = _first_index(raw, lambda line: "<txt>" in line, 0, len(raw))
    if start == -1:
        return "", 0, 0
    end = _first_index(raw, lambda line: "</txt>" in line, start, len(raw))
    if end == -1:
        return "", 0, 0

    recipes = []
    titlecount = 0
    yieldcount = 0
    marker = start
    while marker < end:
        title_row = _first_index(raw, pattern.match, marker, end)
        if title_row == -1:
            break  # If no title is found, stop looking in this file
        titlecount += 1
        instruction_row = _first_index(raw, instruction_start.match, title_row, end)
        if instruction_row == -1:
            break
        # The closing </txt> line itself may carry the yield, hence end + 1.
        yield_row = _first_index(
            raw, lambda line: "Yield:" in line, instruction_row, end + 1)
        if yield_row == -1:
            break
        yieldcount += 1
        marker = yield_row + 1  # The next recipe starts after this yield line
        recipes.append(_format_recipe(
            raw[title_row],
            raw[title_row + 1:instruction_row],
            raw[instruction_row:yield_row],
        ))
    return "".join(recipes), titlecount, yieldcount


def main(recipe_files, output_path=filename, broken_path=broken_filename):
    """Parse each file in recipe_files and write the results.

    The tagged recipes go to output_path. For every input file that yielded
    no titles, no yields, or a different number of titles and yields, one line
    is written to broken_path. Both output files are created (or truncated)
    even when recipe_files is empty. An unreadable input file raises the usual
    I/O error rather than being skipped silently.
    """
    with open(output_path, "w") as out, open(broken_path, "w") as broken:
        for recipe_file in recipe_files:
            with open(recipe_file) as source:
                raw = source.readlines()
            text, titlecount, yieldcount = parse_recipes(raw)
            out.write(text)
            if titlecount == 0:
                broken.write(recipe_file + " titlecount=0\n")
            elif yieldcount == 0:
                broken.write(recipe_file + " yieldcount=0\n")
            elif titlecount != yieldcount:
                broken.write(recipe_file + " mismatched title/yield\n")


# This lets you run this as a python script
if __name__ == "__main__":
    main(sys.argv[1:])
@kns003
Copy link

kns003 commented Oct 2, 2014

How do I run this, and where should I enter the URL?
Could you please explain?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment