Skip to content

Instantly share code, notes, and snippets.

@EmilePW
Created December 5, 2017 22:00
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save EmilePW/8645ba2390a897f189d5415459569b4e to your computer and use it in GitHub Desktop.
Save EmilePW/8645ba2390a897f189d5415459569b4e to your computer and use it in GitHub Desktop.
Parses ingredients from allrecipes.co.uk
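/*
Parses ingredient strings taken from recipes on allrecipes.co.uk.
The standard format for a recipe ingredient is an amount, optionally followed
by a measure, and then the ingredient itself: roughly /(amount)\s(measure)?/
*/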
const WordPOS = require("wordpos");
const wordpos = new WordPOS();
// List of measures to be spotted in text
// Actual measures are to be identified prior to pseduomeasures (like 'jar')
// Descending order so that no measure contained within another can be
// wrongly identified (e.g. detecting 'g' instead of 'gallon')
const measureStrings = [
"fluid ounces",
"cubic inches",
"fluid ounce",
"millilitres",
"cubic inch",
"tablespoons",
"tablespoon",
"teaspoons",
"kilograms",
"teaspoon",
"kilogram",
"gallons",
"stones",
"ounces",
"gallon",
"litres",
"liters",
"fl. oz",
"pounds",
"ounce",
"pound",
"stone",
"liter",
"litre",
"pints",
"grams",
"tbsps",
"pinch",
"lots",
"cups",
"pint",
"gram",
"tbsp",
"tsps",
"cup",
"tsp",
"lbs",
"oz",
"ml",
"lb",
"g",
"l",
"handfuls",
"handful",
"bunches",
"pinches",
"bottles",
"packets",
"sachets",
"sprigs",
"sachet",
"bottle",
"packet",
"sprig",
"bunch",
"clove",
"whole",
"cans",
"tins",
"jars",
"knob",
"tin",
"jar",
"can"
];
// Matches measure and then alternative unit in brackets e.g. '1200ml (2pints) of water'
const measureAndAlternativeRegex = /(\d+(.\d+)?)\s{0,1}(fluid ounces|cubic inches|fluid ounce|millilitres|cubic inch|tablespoons|tablespoon|teaspoons|kilograms|teaspoon|kilogram|gallons|stones|ounces|gallon|litres|liters|fl. oz|pounds|ounce|pound|stone|liter|litre|pints|grams|tbsps|pinch|lots|cups|pint|gram|tbsp|tsps|cup|pts|tsp|lbs|oz|ml|lb|pt|g|l|handfuls|handful|bunches|pinches|bottles|packets|sachets|sprigs|sachet|bottle|packet|sprig|bunch|clove|whole|cans|tins|jars|knob|tin|jar|can)\s{1}\((\d+(.\d+)?)\s{0,1}(fluid ounces|cubic inches|fluid ounce|millilitres|cubic inch|tablespoons|tablespoon|teaspoons|kilograms|teaspoon|kilogram|gallons|stones|ounces|gallon|litres|liters|fl. oz|pounds|ounce|pound|stone|liter|litre|pints|grams|tbsps|pinch|lots|cups|pint|gram|tbsp|tsps|cup|pts|tsp|lbs|oz|ml|lb|pt|g|l|handfuls|handful|bunches|pinches|bottles|packets|sachets|sprigs|sachet|bottle|packet|sprig|bunch|clove|whole|cans|tins|jars|knob|tin|jar|can)\)/;
// Matches number followed by measure
const numOfMeasureRegex = /(\d+(.\d+)?)\s{0,1}(fluid ounces|cubic inches|fluid ounce|millilitres|cubic inch|tablespoons|tablespoon|teaspoons|kilograms|teaspoon|kilogram|gallons|stones|ounces|gallon|litres|liters|fl. oz|pounds|ounce|pound|stone|liter|litre|pints|grams|tbsps|pinch|lots|cups|pint|gram|tbsp|tsps|cup|pts|tsp|lbs|oz|ml|lb|pt|g|l|handfuls|handful|bunches|pinches|bottles|packets|sachets|sprigs|sachet|bottle|packet|sprig|bunch|clove|whole|cans|tins|jars|knob|tin|jar|can)\s+/;
// Matches unit number of a set quantity e.g. '3 (400ml) tins of coconut milk'
const numOfSetQuantityRegex = /(\d+(.\d+)?)\s{1}\(?(\d+(.\d+)?)(fluid ounces|cubic inches|fluid ounce|millilitres|cubic inch|tablespoons|tablespoon|teaspoons|kilograms|teaspoon|kilogram|gallons|stones|ounces|gallon|litres|liters|fl. oz|pounds|ounce|pound|stone|liter|litre|pints|grams|tbsps|pinch|lots|cups|pint|gram|tbsp|tsps|cup|pts|tsp|lbs|oz|ml|lb|pt|g|l)\)?/;
// Matches unit number without measure (only to be used if the above fail)
const numOfUnitRegex = /\s{0}(\d+(.\d+)?)/;
// Matches measure unit only
const measureOnlyRegex = /\s(fluid ounces|cubic inches|fluid ounce|millilitres|cubic inch|tablespoons|tablespoon|teaspoons|kilograms|teaspoon|kilogram|gallons|stones|ounces|gallon|litres|liters|fl. oz|pounds|ounce|pound|stone|liter|litre|pints|grams|tbsps|pinch|lots|cups|pint|gram|tbsp|tsps|cup|pts|tsp|lbs|oz|ml|lb|pt|g|l|handfuls|handful|bunches|pinches|bottles|packets|sachets|sprigs|sachet|bottle|packet|sprig|bunch|clove|whole|cans|tins|jars|knob|tin|jar|can)\s/;
function findMeasure(ingredientStr) {
var measure, i = 0;
while (i < measureStrings.length && !measure) {
if (ingredientStr.includes(measureStrings[i])) {
measure = measureStrings[i];
}
i++;
}
return measure || "unit";
}
function parseNum(str) {
return parseFloat(eval(str));
}
function replaceUnicodeFractions(str) {
return str
.replace("½", "0.5")
.replace("¼", "0.25")
.replace("⅓", "1/3")
.replace("⅔", "2/3")
.replace("¾", "0.75");
}
function findQuantityAndMeasure(ingredientStr, measure) {
ingredientStr = replaceUnicodeFractions(ingredientStr);
var match = ingredientStr.match(numOfSetQuantityRegex);
if (match && match.length > 1) {
var multiple = parseNum(match[1]);
var quantity = parseNum(match[3]);
var measure = match[5];
return {
quantity: quantity * multiple,
measure
};
}
match = ingredientStr.match(measureAndAlternativeRegex);
if (match && match.length > 1) {
var quantity = parseNum(match[1]);
var measure = match[3];
return {
quantity,
measure
};
}
match = ingredientStr.match(numOfMeasureRegex);
if (match && match.length > 1) {
var quantity = parseNum(match[1]);
var measure = match[3];
return {
quantity,
measure
};
}
match = ingredientStr.match(numOfUnitRegex);
if (match && match.length > 1) {
var quantity = parseNum(match[1]);
var measure = "unit";
return {
quantity,
measure
};
}
return { quantity, measure };
}
async function isLastCommaIngredientDescription(str) {
/*
Function to vaguely determine whether the comma is for a description
of an ingredient or a comment e.g. we want to distinguish between
the parts in 'boneless, skinless chicken thighs, diced into cubes'
*/
if (str.includes(",")) {
var adjectives = await wordpos.getAdjectives(str);
var nouns = await wordpos.getNouns(str);
var substr = str.substring(str.lastIndexOf(",") + 1).trim();
var firstWord = substr.split(" ")[0];
var secondWord = substr.split(" ")[1] || "";
return adjectives.includes(firstWord) && nouns.includes(secondWord);
} else {
return true;
}
}
async function findIngredient(ingredientStr) {
ingredientStr = replaceUnicodeFractions(ingredientStr);
var ingredientStrCopy = ingredientStr.slice();
var lastCommaIsDescription = await isLastCommaIngredientDescription(
ingredientStrCopy
);
if (!lastCommaIsDescription) {
ingredientStrCopy = ingredientStrCopy.substring(
0,
ingredientStrCopy.lastIndexOf(",")
);
}
var ingredient = await ingredientStrCopy
.replace(measureAndAlternativeRegex, "")
.replace(numOfSetQuantityRegex, "")
.replace(numOfMeasureRegex, "")
.replace(numOfUnitRegex, "")
.replace(measureOnlyRegex, "")
.trim();
return ingredient;
}
async function parse(ingredientStr) {
var ingredient = await findIngredient(ingredientStr);
var result = findQuantityAndMeasure(ingredientStr);
result.ingredient = ingredient;
return result;
}
// [
// "2 (400ml) tins coconut milk",
// "2 tablespoons green curry paste",
// "160ml chicken stock",
// "1 (220g) tin sliced water chestnuts, drained",
// "1 (225g) tin sliced bamboo shoots, drained",
// "1 green pepper, cut into 2cm pieces",
// "75g sliced fresh mushrooms",
// "3 boneless, skinless chicken breast fillets, diced",
// "3 tablespoons fish sauce",
// "4 tablespoons chopped fresh basil"
// ].map(parse)
module.exports = {
parse
};
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment