AmeliaMN/recipe.py

## recipe.py
#!/usr/local/bin/python
#import the modules we will be using in the code
import sys,re,os

#Name of the text file that collects every extracted recipe
filename = "recipes.txt"
#Name of the report file listing the inputs that could not be parsed cleanly
broken_filename = "broken.txt"

#uppercase: a line made only of capital letters and whitespace (3+ characters).
#Written without a nested quantifier so that a long run of spaces followed by
#other text cannot trigger exponential backtracking.
uppercase = r"^[A-Z\s]{3,}\n"
#titlecase: a line written in title case; the listed short words may stay lowercase
titlecase = r"^(\s*[A-Z][a-z]*)( [A-Z][a-z]*| and| an| as| at| a| but| by| en| for | if| in | of| on| or| the| to| via| vs| with)*\s*\n"

#A title is either an uppercase line or a titlecase line
bothcase = "(" + uppercase + ")|(" + titlecase + ")"
pattern = re.compile(bothcase)

#A numbered step such as "1." at the start of a line, optionally after "#" or "*"
_instruction_start = re.compile(r"^(\s*#*\**[0-9]+\.)")
#A word that begins with a digit is taken to be the quantity opening an ingredient
_quantity = re.compile(r"[0-9]")
#Lines carrying any of these tags are markup rather than ingredients
_markup_tags = ("<i>", "<p>", "</i>", "</p>")


def _first_index(lines, begin, stop, accept):
    """Return the index of the first line in lines[begin:stop] approved by accept, or -1."""
    for index in range(begin, stop):
        if accept(lines[index]):
            return index
    return -1


def _is_title(line):
    """Tell whether line looks like a recipe title.

    Whitespace-only lines are rejected explicitly because three or more
    blanks would otherwise satisfy the uppercase pattern.
    """
    return bool(line.strip()) and pattern.match(line) is not None


def _split_ingredients(words):
    """Group a flat list of words into one string per ingredient.

    A new ingredient starts at every word beginning with a digit, except when
    the previous word also began with a digit: that keeps mixed fractions
    such as "1 1/2" together. Words found before the first quantity form
    an ingredient of their own, and a list without any quantity is returned
    as a single ingredient. An empty list yields no ingredients.
    """
    ingredients = []
    current = []
    previous_numeric = False
    for word in words:
        numeric = _quantity.match(word) is not None
        if numeric and not previous_numeric and current:
            ingredients.append(" ".join(current))
            current = []
        current.append(word)
        previous_numeric = numeric
    if current:
        ingredients.append(" ".join(current))
    return ingredients


def _group_instructions(lines):
    """Merge instruction lines into one string per numbered step.

    Only a line that starts with a step number opens a new step; any other
    line continues the current one. Splitting per line (instead of on every
    "digits." in the joined text) keeps sentences like "Heat oven to 350."
    inside their step.
    """
    steps = []
    for line in lines:
        text = line.strip()
        if not text:
            continue
        if _instruction_start.match(line) or not steps:
            steps.append(text)
        else:
            steps[-1] += " " + text
    return steps


def _extract_recipes(raw):
    """Parse the lines of one input file.

    Only the region from the first line containing "<txt>" up to (not
    including) the first line containing "</txt>" is examined, so headlines
    and bylines outside the tags are ignored.

    Returns a tuple (recipes, titlecount, yieldcount). recipes is a list of
    (title_line, ingredients, instructions) for every recipe that has a
    title, a numbered instruction and a "Yield:" line; the two counters
    record how many titles and how many yield lines were seen, so the caller
    can detect incomplete recipes.
    """
    recipes = []
    titlecount = 0
    yieldcount = 0
    start = _first_index(raw, 0, len(raw), lambda line: "<txt>" in line)
    end = _first_index(raw, 0, len(raw), lambda line: "</txt>" in line)
    if start == -1 or end == -1:
        #Without both tags there is nothing to parse; the zero counts let the
        #caller report the file instead of aborting the whole batch
        return recipes, titlecount, yieldcount

    marker = start
    while marker < end:
        title_row = _first_index(raw, marker, end, _is_title)
        if title_row == -1:
            break
        titlecount += 1
        instruction_row = _first_index(raw, title_row, end, _instruction_start.match)
        if instruction_row == -1:
            break
        yield_row = _first_index(raw, instruction_row, end, lambda line: "Yield:" in line)
        if yield_row == -1:
            break
        yieldcount += 1
        #The next recipe is searched for after this recipe's yield line
        marker = yield_row + 1

        #Ingredients are every line between the title and the first instruction
        ingredient_lines = [
            line for line in raw[title_row + 1:instruction_row]
            if not any(tag in line for tag in _markup_tags)
        ]
        #Joining with a space keeps words on adjacent lines apart
        ingredient_words = " ".join(ingredient_lines).split()
        #Instructions run from the first numbered line up to the yield line
        instruction_lines = [
            line for line in raw[instruction_row:yield_row]
            if not line.startswith("<p>")
        ]
        recipes.append((
            raw[title_row],
            _split_ingredients(ingredient_words),
            _group_instructions(instruction_lines),
        ))
    return recipes, titlecount, yieldcount


def _write_recipe(out, title, ingredients, instructions):
    """Write one complete <recipe> element to the open file out.

    title is the raw title line (it still ends with its newline).
    """
    out.write("<recipe> \n" + "\t<title>\n\t\t" + title + "\t</title> \n")
    out.write("\t<ingredients> \n")
    for item in ingredients:
        out.write("\t\t<ingredient>" + item + " </ingredient> \n")
    out.write("\t</ingredients> \n")
    out.write("\t<instructions> \n")
    for step in instructions:
        out.write("\t\t<instruction>\n\t\t\t" + step + "\t\t</instruction> \n")
    out.write("\t</instructions> \n" + "</recipe> \n")


def main(argv=None, out_path=filename, broken_path=broken_filename):
    """Convert every recipe file named in argv.

    argv        list of input paths; defaults to the command-line arguments.
    out_path    file receiving the extracted recipes (overwritten).
    broken_path file receiving one line per input that produced no title,
                no yield, or a different number of titles and yields.
    """
    recipe_files = sys.argv[1:] if argv is None else argv
    with open(out_path, "w") as out, open(broken_path, "w") as broken:
        for recipe_file in recipe_files:
            with open(recipe_file) as source:
                raw = source.readlines()
            recipes, titlecount, yieldcount = _extract_recipes(raw)
            #Only complete recipes are written, so the output never contains
            #a <recipe> element that is left open
            for title, ingredients, instructions in recipes:
                _write_recipe(out, title, ingredients, instructions)
            if titlecount == 0:
                broken.write(recipe_file + " titlecount=0\n")
            elif yieldcount == 0:
                broken.write(recipe_file + " yieldcount=0\n")
            elif titlecount != yieldcount:
                broken.write(recipe_file + " mismatched title/yield\n")


#This lets you run this as a python script
if __name__ == "__main__":
    main()
	#!/usr/local/bin/python
	#import the modules we will be using in the code
	import sys,re,os

	#Name of the text file that collects every extracted recipe
	filename = "recipes.txt"
	#Name of the report file listing the inputs that could not be parsed cleanly
	broken_filename = "broken.txt"

	#uppercase: a line made only of capital letters and whitespace (3+ characters).
	#Written without a nested quantifier so that a long run of spaces followed by
	#other text cannot trigger exponential backtracking.
	uppercase = r"^[A-Z\s]{3,}\n"
	#titlecase: a line written in title case; the listed short words may stay lowercase
	titlecase = r"^(\s*[A-Z][a-z]*)( [A-Z][a-z]*| and| an| as| at| a| but| by| en| for | if| in | of| on| or| the| to| via| vs| with)*\s*\n"

	#A title is either an uppercase line or a titlecase line
	bothcase = "(" + uppercase + ")|(" + titlecase + ")"
	pattern = re.compile(bothcase)

	#A numbered step such as "1." at the start of a line, optionally after "#" or "*"
	_instruction_start = re.compile(r"^(\s*#*\**[0-9]+\.)")
	#A word that begins with a digit is taken to be the quantity opening an ingredient
	_quantity = re.compile(r"[0-9]")
	#Lines carrying any of these tags are markup rather than ingredients
	_markup_tags = ("<i>", "<p>", "</i>", "</p>")


	def _first_index(lines, begin, stop, accept):
	    """Return the index of the first line in lines[begin:stop] approved by accept, or -1."""
	    for index in range(begin, stop):
	        if accept(lines[index]):
	            return index
	    return -1


	def _is_title(line):
	    """Tell whether line looks like a recipe title.

	    Whitespace-only lines are rejected explicitly because three or more
	    blanks would otherwise satisfy the uppercase pattern.
	    """
	    return bool(line.strip()) and pattern.match(line) is not None


	def _split_ingredients(words):
	    """Group a flat list of words into one string per ingredient.

	    A new ingredient starts at every word beginning with a digit, except when
	    the previous word also began with a digit: that keeps mixed fractions
	    such as "1 1/2" together. Words found before the first quantity form
	    an ingredient of their own, and a list without any quantity is returned
	    as a single ingredient. An empty list yields no ingredients.
	    """
	    ingredients = []
	    current = []
	    previous_numeric = False
	    for word in words:
	        numeric = _quantity.match(word) is not None
	        if numeric and not previous_numeric and current:
	            ingredients.append(" ".join(current))
	            current = []
	        current.append(word)
	        previous_numeric = numeric
	    if current:
	        ingredients.append(" ".join(current))
	    return ingredients


	def _group_instructions(lines):
	    """Merge instruction lines into one string per numbered step.

	    Only a line that starts with a step number opens a new step; any other
	    line continues the current one. Splitting per line (instead of on every
	    "digits." in the joined text) keeps sentences like "Heat oven to 350."
	    inside their step.
	    """
	    steps = []
	    for line in lines:
	        text = line.strip()
	        if not text:
	            continue
	        if _instruction_start.match(line) or not steps:
	            steps.append(text)
	        else:
	            steps[-1] += " " + text
	    return steps


	def _extract_recipes(raw):
	    """Parse the lines of one input file.

	    Only the region from the first line containing "<txt>" up to (not
	    including) the first line containing "</txt>" is examined, so headlines
	    and bylines outside the tags are ignored.

	    Returns a tuple (recipes, titlecount, yieldcount). recipes is a list of
	    (title_line, ingredients, instructions) for every recipe that has a
	    title, a numbered instruction and a "Yield:" line; the two counters
	    record how many titles and how many yield lines were seen, so the caller
	    can detect incomplete recipes.
	    """
	    recipes = []
	    titlecount = 0
	    yieldcount = 0
	    start = _first_index(raw, 0, len(raw), lambda line: "<txt>" in line)
	    end = _first_index(raw, 0, len(raw), lambda line: "</txt>" in line)
	    if start == -1 or end == -1:
	        #Without both tags there is nothing to parse; the zero counts let the
	        #caller report the file instead of aborting the whole batch
	        return recipes, titlecount, yieldcount

	    marker = start
	    while marker < end:
	        title_row = _first_index(raw, marker, end, _is_title)
	        if title_row == -1:
	            break
	        titlecount += 1
	        instruction_row = _first_index(raw, title_row, end, _instruction_start.match)
	        if instruction_row == -1:
	            break
	        yield_row = _first_index(raw, instruction_row, end, lambda line: "Yield:" in line)
	        if yield_row == -1:
	            break
	        yieldcount += 1
	        #The next recipe is searched for after this recipe's yield line
	        marker = yield_row + 1

	        #Ingredients are every line between the title and the first instruction
	        ingredient_lines = [
	            line for line in raw[title_row + 1:instruction_row]
	            if not any(tag in line for tag in _markup_tags)
	        ]
	        #Joining with a space keeps words on adjacent lines apart
	        ingredient_words = " ".join(ingredient_lines).split()
	        #Instructions run from the first numbered line up to the yield line
	        instruction_lines = [
	            line for line in raw[instruction_row:yield_row]
	            if not line.startswith("<p>")
	        ]
	        recipes.append((
	            raw[title_row],
	            _split_ingredients(ingredient_words),
	            _group_instructions(instruction_lines),
	        ))
	    return recipes, titlecount, yieldcount


	def _write_recipe(out, title, ingredients, instructions):
	    """Write one complete <recipe> element to the open file out.

	    title is the raw title line (it still ends with its newline).
	    """
	    out.write("<recipe> \n" + "\t<title>\n\t\t" + title + "\t</title> \n")
	    out.write("\t<ingredients> \n")
	    for item in ingredients:
	        out.write("\t\t<ingredient>" + item + " </ingredient> \n")
	    out.write("\t</ingredients> \n")
	    out.write("\t<instructions> \n")
	    for step in instructions:
	        out.write("\t\t<instruction>\n\t\t\t" + step + "\t\t</instruction> \n")
	    out.write("\t</instructions> \n" + "</recipe> \n")


	def main(argv=None, out_path=filename, broken_path=broken_filename):
	    """Convert every recipe file named in argv.

	    argv        list of input paths; defaults to the command-line arguments.
	    out_path    file receiving the extracted recipes (overwritten).
	    broken_path file receiving one line per input that produced no title,
	                no yield, or a different number of titles and yields.
	    """
	    recipe_files = sys.argv[1:] if argv is None else argv
	    with open(out_path, "w") as out, open(broken_path, "w") as broken:
	        for recipe_file in recipe_files:
	            with open(recipe_file) as source:
	                raw = source.readlines()
	            recipes, titlecount, yieldcount = _extract_recipes(raw)
	            #Only complete recipes are written, so the output never contains
	            #a <recipe> element that is left open
	            for title, ingredients, instructions in recipes:
	                _write_recipe(out, title, ingredients, instructions)
	            if titlecount == 0:
	                broken.write(recipe_file + " titlecount=0\n")
	            elif yieldcount == 0:
	                broken.write(recipe_file + " yieldcount=0\n")
	            elif titlecount != yieldcount:
	                broken.write(recipe_file + " mismatched title/yield\n")


	#This lets you run this as a python script
	if __name__ == "__main__":
	    main()