Created
October 22, 2012 18:25
-
-
Save AmeliaMN/3933196 to your computer and use it in GitHub Desktop.
Python example: Recipe parsing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/local/bin/python
"""Parse recipe text files into one tagged recipe listing.

Usage: run this script with one or more recipe files as arguments.  For each
file, the lines between the ``<txt>`` line and the ``</txt>`` line are scanned
for recipes.  A recipe is a title line (ALL CAPS or Title Case), followed by
ingredient lines, followed by numbered instruction lines, and terminated by a
line containing ``Yield:``.  Every complete recipe is written to
``recipes.txt``; files that produced no title, no yield, or a different
number of titles and yields are listed in ``broken.txt``.
"""
# import the modules we will be using in the code
import os
import re
import sys

# Name of the output file that collects every parsed recipe.
filename = "recipes.txt"
# Name of the log file listing the inputs that did not parse cleanly.
broken_filename = "broken.txt"

# A line consisting only of uppercase letters and whitespace (3+ characters).
# NOTE(review): a line of three or more blanks also satisfies this pattern.
uppercase = r"^(\s*[A-Z\s]){3,}\n"
# A line written in title case: capitalised words plus a few small lowercase
# joining words.
# NOTE(review): " for " and " in " carry a trailing space that the other
# joining words do not; left untouched here -- confirm whether it is intended.
titlecase = r"^(\s*[A-Z][a-z]*)( [A-Z][a-z]*| and| an| as| at| a| but| by| en| for | if| in | of| on| or| the| to| via| vs| with)*\s*\n"
# A title is either kind of line.
bothcase = "(" + uppercase + ")|(" + titlecase + ")"
pattern = re.compile(bothcase)

# A numbered step such as "1." (optionally preceded by blanks, '#' or '*').
_STEP_START = re.compile(r"^(\s*#*\**[0-9]+\.)")
# Same marker, unanchored, used to cut a block of text into individual steps.
_STEP_SPLIT = re.compile(r"(\s*#*\**[0-9]+\.)")
# A word that begins with a digit is taken as the quantity opening an ingredient.
_LEADING_DIGIT = re.compile(r"[0-9]")
# Ingredient lines containing any of these markup tags are skipped.
_MARKUP_TAGS = ("<i>", "<p>", "</i>", "</p>")


def _first_index(lines, begin, stop, predicate):
    """Return the first index in [begin, stop) whose line satisfies predicate, else None."""
    for index in range(begin, stop):
        if predicate(lines[index]):
            return index
    return None


def _locate_recipes(lines):
    """Find the recipes located between the <txt> and </txt> lines.

    Returns a tuple ``(recipes, titlecount, yieldcount)``.  ``recipes`` is a
    list of ``(title_row, instruction_row, yield_row)`` line indices, one per
    complete recipe.  ``titlecount`` counts every title found, even when the
    recipe turned out to be incomplete, so a mismatch with ``yieldcount``
    reveals a recipe that could not be parsed.  When either tag is missing
    the result is ``([], 0, 0)``.
    """
    total = len(lines)
    txt_open = _first_index(lines, 0, total, lambda text: "<txt>" in text)
    txt_close = _first_index(lines, 0, total, lambda text: "</txt>" in text)
    if txt_open is None or txt_close is None:
        return [], 0, 0

    recipes = []
    titlecount = 0
    yieldcount = 0
    marker = txt_open
    while marker < txt_close:
        title_row = _first_index(lines, marker, txt_close, pattern.match)
        if title_row is None:
            break
        titlecount += 1
        instruction_row = _first_index(
            lines, title_row + 1, txt_close, _STEP_START.match)
        if instruction_row is None:
            break
        yield_row = _first_index(
            lines, instruction_row, txt_close, lambda text: "Yield:" in text)
        if yield_row is None:
            break
        yieldcount += 1
        recipes.append((title_row, instruction_row, yield_row))
        # The next recipe, if any, starts after this recipe's Yield: line.
        marker = yield_row + 1
    return recipes, titlecount, yieldcount


def _flatten(lines):
    """Join lines into one string, turning every newline into a space."""
    return "".join(lines).replace("\n", " ")


def _group_ingredients(words):
    """Group words into ingredient strings, each opening at a quantity.

    A word starting with a digit opens a new ingredient unless the word just
    before it also starts with a digit; that keeps mixed fractions such as
    "1 1/2" together.  Words preceding the first quantity form an ingredient
    of their own, and a block without any quantity is returned as a single
    ingredient.
    """
    if not words:
        return []
    starts = [
        position for position, word in enumerate(words)
        if _LEADING_DIGIT.match(word)
        and (position == 0 or not _LEADING_DIGIT.match(words[position - 1]))
    ]
    leading = [] if starts and starts[0] == 0 else [0]
    bounds = leading + starts + [len(words)]
    return [" ".join(words[a:b]) for a, b in zip(bounds, bounds[1:])]


def _write_recipe(out, lines, title_row, instruction_row, yield_row):
    """Write one recipe (title, ingredients, instructions) to the file object out."""
    # The title line keeps its own newline, so the closing tag lands on the next line.
    out.write("<recipe> \n" + "\t<title>\n\t\t" + lines[title_row] + "\t</title> \n")

    # INGREDIENTS: everything between the title and the first numbered step.
    out.write("\t<ingredients> \n")
    ingredient_lines = [
        text for text in lines[title_row + 1:instruction_row]
        if not any(tag in text for tag in _MARKUP_TAGS)
    ]
    for ingredient in _group_ingredients(_flatten(ingredient_lines).split()):
        out.write("\t\t<ingredient>" + ingredient + " </ingredient> \n")
    out.write("\t</ingredients> \n")

    # INSTRUCTIONS: from the first numbered step up to (not including) Yield:.
    out.write("\t<instructions> \n")
    step_lines = [
        text for text in lines[instruction_row:yield_row]
        if not text.startswith("<p>")
    ]
    # Splitting on a capturing group yields [before, marker, text, marker, text, ...],
    # so every odd index is a step marker followed by that step's text.
    parts = _STEP_SPLIT.split(_flatten(step_lines))
    for index in range(1, len(parts), 2):
        step = (parts[index] + parts[index + 1]).strip()
        out.write("\t\t<instruction>\n\t\t\t" + step + "\t\t</instruction> \n")
    out.write("\t</instructions> \n" + "</recipe> \n")


def main(recipe_files, output_path=filename, broken_path=broken_filename):
    """Parse every file in recipe_files and write the results.

    recipe_files -- iterable of paths to the recipe text files to read.
    output_path  -- file that receives the tagged recipes (overwritten).
    broken_path  -- file that receives one line per input that yielded no
                    title, no yield, or unequal numbers of titles and yields.

    An input without <txt>/</txt> lines is reported with "titlecount=0" and
    the remaining inputs are still processed.  Errors raised by open() for an
    unreadable input are not caught.
    """
    with open(output_path, "w") as out, open(broken_path, "w") as broken:
        for recipe_file in recipe_files:
            with open(recipe_file) as source:
                raw = source.readlines()

            recipes, titlecount, yieldcount = _locate_recipes(raw)
            for title_row, instruction_row, yield_row in recipes:
                _write_recipe(out, raw, title_row, instruction_row, yield_row)

            # Record the inputs that produced no titles, no yields, or where
            # the number of titles and yields did not match.
            if titlecount == 0:
                broken.write(recipe_file + " titlecount=0\n")
            elif yieldcount == 0:
                broken.write(recipe_file + " yieldcount=0\n")
            elif titlecount != yieldcount:
                broken.write(recipe_file + " mismatched title/yield\n")


# This lets you run this as a python script
if __name__ == "__main__":
    main(sys.argv[1:])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
How do I run this, and where should I enter the URL?
Could you please explain?