Skip to content

Instantly share code, notes, and snippets.

@inkhorn
Last active September 1, 2015 05:11
Show Gist options
  • Star 3 You must be signed in to star a gist
  • Fork 2 You must be signed in to fork a gist
  • Save inkhorn/9044779 to your computer and use it in GitHub Desktop.
Save inkhorn/9044779 to your computer and use it in GitHub Desktop.
# Build a document-term matrix of recipe ingredients: one document per
# line of the input file, one term per ingredient.
library(tm)

recipes = readLines('recipes combined.tsv')
# The fields on each line are separated by tab (\t) characters. Replace
# them with spaces so that tm sees whitespace-separated tokens. gsub() is
# already vectorised over the lines, so no apply() wrapper is needed.
recipes.new = gsub('\t', ' ', recipes, fixed = TRUE)
recipes.corpus = Corpus(VectorSource(recipes.new))
recipes.dtm = DocumentTermMatrix(recipes.corpus)
# Keep only terms with a total count of at least 10 across the corpus
# (presumably equal to "appears in at least 10 recipes", assuming each
# ingredient is listed once per recipe -- TODO confirm against the data).
# Dictionary() no longer exists in current tm releases; the `dictionary`
# control option takes a plain character vector of terms.
recipes.dict = findFreqTerms(recipes.dtm, 10)
recipes.dtm.filtered = DocumentTermMatrix(recipes.corpus, list(dictionary = recipes.dict))
# Count the ingredients left in each document so that documents with no
# remaining ingredients can be dropped.
ingredient.counts = apply(recipes.dtm.filtered, 1, sum)
# The trailing comma matters: it selects rows (documents). Without it the
# logical vector is used as a vector index into the matrix cells.
recipes.dtm.filtered = recipes.dtm.filtered[ingredient.counts > 0, ]
# Compute simple ingredient frequencies (column sums of the filtered
# document-term matrix) and plot the most frequent ones, to help decide
# which ingredients to filter out.
recipes.m = as.matrix(recipes.dtm.filtered)
popularity.of.ingredients = sort(colSums(recipes.m), decreasing = TRUE)
popularity.of.ingredients = data.frame(
  ingredients = names(popularity.of.ingredients),
  num_recipes = popularity.of.ingredients
)
# Order the factor levels by frequency so the plot axis is sorted by count
# rather than alphabetically.
popularity.of.ingredients$ingredients = reorder(
  popularity.of.ingredients$ingredients,
  popularity.of.ingredients$num_recipes
)
library(ggplot2)
# head() rather than [1:30, ]: if fewer than 30 ingredients survive the
# filtering, indexing past the last row would add all-NA rows to the plot.
ggplot(head(popularity.of.ingredients, 30), aes(x = ingredients, y = num_recipes)) +
  geom_point(size = 5, colour = "red") +
  coord_flip() +
  ggtitle("Recipe Popularity of Top 30 Ingredients") +
  theme(
    axis.text.x = element_text(size = 13, face = "bold", colour = "black"),
    axis.text.y = element_text(size = 13, colour = "black", face = "bold"),
    axis.title.x = element_text(size = 14, face = "bold"),
    axis.title.y = element_text(size = 14, face = "bold"),
    plot.title = element_text(size = 24, face = "bold")
  )
library(topicmodels)

# Having found wheat, egg, and butter to be the three most frequent
# ingredients (and not caring too much about them as ingredients in
# general), remove them from the corpus and rebuild the document-term
# matrix from the cleaned corpus.
recipes.corpus = tm_map(recipes.corpus, removeWords, c("wheat", "egg", "butter"))
recipes.dtm.final = DocumentTermMatrix(recipes.corpus, list(dictionary = recipes.dict))
# Removing those words can leave some documents with no terms at all, and
# LDA() refuses a matrix that contains all-zero rows, so drop them here.
final.ingredient.counts = apply(recipes.dtm.final, 1, sum)
recipes.dtm.final = recipes.dtm.final[final.ingredient.counts > 0, ]
# Finally, run the LDA with 50 topics and extract the 5 most
# characteristic ingredients in each topic... yummy!
# The model is fit on recipes.dtm.final (built after the word removal), so
# the script runs top to bottom without manually re-running earlier steps;
# fitting on recipes.dtm.filtered would still include the removed words.
recipes.lda = LDA(recipes.dtm.final, 50)
t = terms(recipes.lda, 5)
Topic 1 Topic 2 Topic 3 Topic 4 Topic 5 Topic 6 Topic 7 Topic 8 Topic 9
[1,] "onion" "pepper" "milk" "tomato" "olive_oil" "milk" "milk" "tomato" "garlic"
[2,] "rice" "vinegar" "vanilla" "garlic" "garlic" "nutmeg" "pepper" "cayenne" "cream"
[3,] "cayenne" "onion" "cocoa" "oregano" "onion" "vanilla" "yeast" "olive_oil" "vegetable_oil"
[4,] "chicken_broth" "tomato" "onion" "onion" "black_pepper" "cinnamon" "potato" "garlic" "pepper"
[5,] "olive_oil" "milk" "cane_molasses" "basil" "vinegar" "cream" "lemon_juice" "pepper" "milk"
Topic 10 Topic 11 Topic 12 Topic 13 Topic 14 Topic 15 Topic 16 Topic 17
[1,] "milk" "soy_sauce" "vegetable_oil" "onion" "milk" "tamarind" "milk" "vegetable_oil"
[2,] "cream" "scallion" "milk" "black_pepper" "cinnamon" "onion" "vanilla" "pepper"
[3,] "vanilla" "sesame_oil" "pepper" "vinegar" "onion" "garlic" "cream" "cream"
[4,] "cane_molasses" "cane_molasses" "cane_molasses" "bell_pepper" "cayenne" "corn" "vegetable_oil" "black_pepper"
[5,] "cinnamon" "roasted_sesame_seed" "cinnamon" "bacon" "olive_oil" "vinegar" "garlic" "mustard"
Topic 18 Topic 19 Topic 20 Topic 21 Topic 22 Topic 23 Topic 24 Topic 25 Topic 26
[1,] "cane_molasses" "vanilla" "onion" "garlic" "onion" "vegetable_oil" "onion" "cream" "cumin"
[2,] "onion" "cream" "black_pepper" "cane_molasses" "garlic" "soy_sauce" "garlic" "tomato" "coriander"
[3,] "vinegar" "almond" "vegetable_oil" "vinegar" "tomato" "sesame_oil" "cane_molasses" "chicken" "turmeric"
[4,] "olive_oil" "coconut" "bell_pepper" "black_pepper" "olive_oil" "fish" "tomato" "lemon_juice" "fenugreek"
[5,] "pepper" "oat" "garlic" "soy_sauce" "basil" "chicken" "vegetable_oil" "black_pepper" "lemongrass"
Topic 27 Topic 28 Topic 29 Topic 30 Topic 31 Topic 32 Topic 33 Topic 34 Topic 35
[1,] "onion" "onion" "onion" "onion" "vanilla" "garlic" "onion" "onion" "garlic"
[2,] "garlic" "vinegar" "celery" "pepper" "milk" "onion" "pepper" "garlic" "basil"
[3,] "black_pepper" "garlic" "chicken" "garlic" "garlic" "vegetable_oil" "garlic" "vegetable_oil" "pepper"
[4,] "tomato" "lemon_juice" "vegetable_oil" "parsley" "cinnamon" "cayenne" "black_pepper" "black_pepper" "tomato"
[5,] "olive_oil" "ginger" "carrot" "olive_oil" "cream" "beef" "beef" "chicken" "olive_oil"
Topic 36 Topic 37 Topic 38 Topic 39 Topic 40 Topic 41 Topic 42 Topic 43 Topic 44
[1,] "onion" "onion" "onion" "cayenne" "garlic" "vanilla" "vanilla" "scallion" "milk"
[2,] "garlic" "garlic" "cream" "garlic" "onion" "cocoa" "cane_molasses" "garlic" "tomato"
[3,] "cayenne" "black_pepper" "tomato" "ginger" "bell_pepper" "milk" "cocoa" "ginger" "garlic"
[4,] "vegetable_oil" "lemon_juice" "cane_molasses" "rice" "olive_oil" "cinnamon" "oat" "soybean" "vegetable_oil"
[5,] "oregano" "scallion" "milk" "onion" "milk" "walnut" "milk" "pepper" "cream"
Topic 45 Topic 46 Topic 47 Topic 48 Topic 49 Topic 50
[1,] "onion" "cream" "pepper" "cream" "milk" "olive_oil"
[2,] "cream" "black_pepper" "vegetable_oil" "tomato" "vanilla" "tomato"
[3,] "black_pepper" "chicken_broth" "garlic" "beef" "lard" "parmesan_cheese"
[4,] "milk" "vegetable_oil" "onion" "garlic" "cocoa" "lemon_juice"
[5,] "cinnamon" "garlic" "olive_oil" "carrot" "cane_molasses" "garlic"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment