Skip to content

Instantly share code, notes, and snippets.

Avatar

Rajesh Manikka rajeshmr

  • factorz.in
  • Chennai
View GitHub Profile
@rajeshmr
rajeshmr / sample.json
Created Feb 17, 2016
Sample ingredient structure
View sample.json
{
"url": "http://allrecipes.co.in/recipe/12227/pakal-fish-curry.aspx",
"ingredients": [
"7-8 pakal fish",
"1 teaspoon turmeric powder",
"as needed salt",
"2 tablespoon mustard oil",
"a pinch black cumin seeds/powder",
"2 tablespoon onion, sliced",
"1/2 teaspoon ginger paste",
@rajeshmr
rajeshmr / crf_input_generator.py
Last active Feb 17, 2016
Structuring text using a Conditional Random Field (CRF): tagging recipe ingredient phrases.
View crf_input_generator.py
import sys
import nltk
import json
for line in sys.stdin:
data = json.loads(line)
for ingredient in data['ingredients']:
tokens = nltk.word_tokenize(ingredient.strip())
tagged_tokens = nltk.pos_tag(tokens)
for token, pos in tagged_tokens:
View chunk.txt
QTY w[0]=7-8 w[1]=pakal w[2]=fish w[0]|w[1]=7-8|pakal pos[0]=JJ pos[1]=NN pos[2]=NN pos[0]|pos[1]=JJ|NN pos[1]|pos[2]=NN|NN pos[0]|pos[1]|pos[2]=JJ|NN|NN __BOS__
NAME w[-1]=7-8 w[0]=pakal w[1]=fish w[-1]|w[0]=7-8|pakal w[0]|w[1]=pakal|fish pos[-1]=JJ pos[0]=NN pos[1]=NN pos[-1]|pos[0]=JJ|NN pos[0]|pos[1]=NN|NN pos[-1]|pos[0]|pos[1]=JJ|NN|NN
NAME w[-2]=7-8 w[-1]=pakal w[0]=fish w[-1]|w[0]=pakal|fish pos[-2]=JJ pos[-1]=NN pos[0]=NN pos[-2]|pos[-1]=JJ|NN pos[-1]|pos[0]=NN|NN pos[-2]|pos[-1]|pos[0]=JJ|NN|NN __EOS__
QTY w[0]=1 w[1]=teaspoon w[2]=turmeric w[0]|w[1]=1|teaspoon pos[0]=CD pos[1]=NN pos[2]=JJ pos[0]|pos[1]=CD|NN pos[1]|pos[2]=NN|JJ pos[0]|pos[1]|pos[2]=CD|NN|JJ __BOS__
UNIT w[-1]=1 w[0]=teaspoon w[1]=turmeric w[2]=powder w[-1]|w[0]=1|teaspoon w[0]|w[1]=teaspoon|turmeric pos[-1]=CD pos[0]=NN pos[1]=JJ pos[2]=NN pos[-1]|pos[0]=CD|NN pos[0]|pos[1]=NN|JJ pos[1]|pos[2]=JJ|NN pos[-1]|pos[0]|pos[1]=CD|NN|JJ pos[0]|pos[1]|pos[2]=NN|JJ|NN
NAME w[-2]=1 w[-1]=teaspoon w[0]=turmeric w[1]=powder w[-1]|w[0]=teaspoo
View token_pos_tagged.tsv
token pos label
7-8 JJ QTY
pakal NN NAME
fish NN NAME
1 CD QTY
teaspoon NN UNIT
turmeric JJ NAME
powder NN NAME
View token_pos.tsv
token pos label
7-8 JJ XXX
pakal NN XXX
fish NN XXX
1 CD XXX
teaspoon NN XXX
turmeric JJ XXX
powder NN XXX
View rit_input.json
[
"8 ounces uncooked elbow macaroni",
"2 cups shredded sharp Cheddar cheese",
"1/2 cup grated Parmesan cheese",
"3 cups milk",
"1/4 cup butter",
"2 1/2 tablespoons all-purpose flour",
"2 tablespoons butter",
"1/2 cup bread crumbs",
"1 pinch paprika"
View rit_output.json
{
"tagged_tokens": [
[
{
"tag": "QTY",
"token": "8"
},
{
"tag": "UNIT",
"token": "ounces"
View api_tag_route.py
@app.route("/tag", methods=['GET', 'POST'])
def tag():
content = request.get_json(silent=True)
if len(content) > 50:
return abort(400)
tokens = map(nltk.word_tokenize, content)
tagged_tokens = map(nltk.pos_tag, tokens)
for_feature = pre_feature(tagged_tokens)
with_feature = map(feature_extractor, for_feature)
flattened_with_feature = [item for sublist in with_feature for item in sublist]