Skip to content

Instantly share code, notes, and snippets.

@fayeip
Last active August 29, 2015 14:11
Show Gist options
  • Save fayeip/f18f7431fc48d542efe7 to your computer and use it in GitHub Desktop.
Save fayeip/f18f7431fc48d542efe7 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"worksheets": [
{
"cells": [
{
"metadata": {},
"cell_type": "code",
"input": "import nltk\nfrom nltk.corpus import PlaintextCorpusReader\nimport re\nfrom itertools import chain\nfrom nltk import tokenize\nfrom nltk.corpus import stopwords\nimport nltk.data\nimport json\nimport pdb\nfrom collections import defaultdict",
"prompt_number": 54,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "#Importing corpus\ncorpus_root = 'data'\nwordlists = PlaintextCorpusReader(corpus_root, '.*\\\\.txt')\nsent_detector = nltk.data.load('tokenizers/punkt/english.pickle')",
"prompt_number": 2,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "#Clean up Process - create date regex parameters \ndate_pattern = '((J(anuary|u(ne|ly))|February|Ma(rch|y)|A(pril|ugust)|(((Sept|Nov|Dec)em)|Octo)ber).*([0-9]))'\nmp = '(J(anuary|u(ne|ly))|February|Ma(rch|y)|A(pril|ugust)|(((Sept|Nov|Dec)em)|Octo)ber)'\nyp = '[0-9]{4}'",
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "# Testing the patterns \ntest = \"This is the month of November 9, 2014\"\ndate = re.search(date_pattern,test)\nm = re.search(mp,date.group(0))\nmonth = m.group(0)\ny = re.search(yp,date.group(0))\nyear = y.group(0)",
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "# start processing\n#Set the dictionaries \ncorpus_dict = {}\n\n#Putting it all together\nfor fileid in wordlists.fileids():\n #Part 1: split of xx of DOCUMENTS \n doc_list = re.split('((?m)^\\\\s+[0-9]+\\\\s*of\\\\s*[0-9]+\\\\s+DOCUMENTS)', wordlists.raw(fileid))\n doc_list.pop(0) #got rid of garbage first empty line\n master_list = list() # put all documents by id, header, footer\n #print len(doc_list) # keep for testing -- how many documents within a single file \n \n #Part 2: split into id, head and footer and create a triple tuple \n for idx in range(0, len(doc_list), 2):\n # add a new tuple of id, header, footer\n # split condition in order of importance\n split_conds = ['words\\r\\n\\r\\n', 'Edition\\r\\n\\r\\n', 'Society Desk\\r\\n\\r\\n','Society Desk\\r\\n\\r\\n\\r\\n','DATELINE: Camden, Me.,\\r\\n\\r\\n\\r\\n']\n doc_split = []\n for cond in split_conds:\n doc_split = re.split(cond,doc_list[idx+1], 1)\n if len(doc_split) == 2:\n break\n #Part 2 contd: Error check to see if any of the splits didn't go through \n if len(doc_split) < 2:\n doc_parts = (doc_list[idx], doc_split)\n print \"too few traces\"\n pdb.set_trace()\n elif len(doc_split) > 2:\n print \"too many splits\"\n else:\n doc_parts = (doc_list[idx], doc_split[0], doc_split[1])\n# print doc_split[0]\n# print '<><><><><><><><><>'\n# print doc_split[1]\n# print \"****************************************\"\n master_list.append(doc_parts) #Create that tuple triple \n \n year_counter = []\n #Part 3: Read the header and extract date \n for doc in master_list:\n #Part 3 a: Header cleaning steps \n clean_header = re.sub(r\"\\b(The New York Times|(DATELINE:.*)|(BYLINE.*)|(.*Correction Appended.*)|(SECTION:.*)|(LENGTH:.*)|(LOAD-DATE:.*)|(http:.*)|(LANGUAGE:.*)|(GRAPHIC:.*)|(Copyright.*)|(Late Edition - Final.*))\\b\", \"\", doc[1])\n clean_header = clean_header.replace(\"\\r\",\"\").strip()\n clean_header = [x for x in clean_header.split('\\n') if any(x.isalnum() for x in x)]\n header_final = ' 
'.join(clean_header)\n\n #Part 3b: Extracting the date\n date = re.search(date_pattern,header_final)\n m = re.search(mp,date.group(0))\n month = m.group(0)\n y = re.search(yp,date.group(0))\n year = y.group(0)\n year_counter.append(year) \n\n if \"Events\" not in header_final:\n body = doc[2]\n clean_sent = re.sub(r\"\\b(The New York Times|(DATELINE:.*)|(SECTION:.*)|(LENGTH:.*)|(LOAD-DATE:.*)|(http:.*)|(URL:.*)|(LANGUAGE:.*)|(PUBLICATION.*)|(GRAPHIC:.*)|(Copyright.*))\\b\", \"\", body)\n body = re.sub('\\r\\n(?!\\r\\n)', ' ',clean_sent)\n\n #Part 4 adding to the dictionary\n corpus_dict.setdefault(year,{}).setdefault(month, []).append((doc[0],header_final,body)) \n \n#Part 5: Write to a JSON file \nwith open('data/dict2014.json', 'wb') as fp:\n json.dump(corpus_dict, fp)\n ",
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "# print corpus_dict['1984']['March']",
"prompt_number": 55,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "#Sent to AWS to tag",
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "#Download Stanford NER taggers\nfrom nltk.tag.stanford import POSTagger\nfrom nltk.tag.stanford import NERTagger\npost = POSTagger('lib/stanford-postagger-2014-10-26/models/english-bidirectional-distsim.tagger',\n 'lib/stanford-postagger-2014-10-26/stanford-postagger.jar', 'utf-8')\n\nnert = NERTagger('lib/stanford-ner-2014-10-26/classifiers/english.all.3class.distsim.crf.ser.gz',\n 'lib/stanford-ner-2014-10-26/stanford-ner.jar', 'utf-8')",
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "#Load the entity tagged file as a tuple of tuples \nfrom ast import literal_eval\n\ntagged_1984 = []\n\nwith open('1984_tagged.txt', 'r') as f:\n for line in f:\n line.split(',')\n tagged_1984.append(literal_eval(line.strip()))\n",
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "#Customizing the tagger\n#assigning a custom tag in the word,tag \n\nfrom ast import literal_eval\n\n# (token, custom tag) pairs. A pair is applied only where the tagger labelled\n# the token 'O'.\n# W = widow, R = religious, D = divorced, B = bride, G = groom\nCUSTOM_TAGS = [\n    ('widow', 'W'), ('widower', 'W'), ('widowed', 'W'),\n    ('Mr.', 'PERSON'), ('Mrs.', 'PERSON'), ('Adm.', 'PERSON'),\n    ('Sgt.', 'PERSON'), ('Dr.', 'PERSON'),\n    ('Rev.', 'R'), ('Rabbi', 'R'), ('priest', 'R'),\n    ('divorce', 'D'), ('divorced', 'D'),\n    ('bride', 'B'),\n    ('bridegroom', 'G'), ('groom', 'G'),\n]\n\ndef alter_source(sourcefile):\n    \"\"\"Return the text of sourcefile with the custom tags substituted.\n\n    Every occurrence of ('<token>', 'O') for a token listed in CUSTOM_TAGS\n    is rewritten to ('<token>', '<custom tag>'). The file itself is not\n    modified; the altered text is returned as one string.\n    \"\"\"\n    with open(sourcefile, 'r') as f_before:\n        tagged_str = f_before.read()\n\n    for word, tag in CUSTOM_TAGS:\n        # Plain string replacement on purpose: as a regex the '.' in 'Mr.'\n        # also matched tokens such as 'Mrs' and rewrote them to 'Mr.'.\n        tagged_str = tagged_str.replace(\"('%s', 'O')\" % word,\n                                        \"('%s', '%s')\" % (word, tag))\n    return tagged_str\n\n\ndef apply_custom_tags (targetfile, custom_tags):\n    \"\"\"Write custom_tags to targetfile and return it parsed line by line.\n\n    Each non-blank line of the written file is evaluated with\n    ast.literal_eval and the results are returned as a list, one entry\n    per line.\n    \"\"\"\n    with open(targetfile, 'w') as f:\n        f.write(custom_tags)\n\n    custom_tag_list = []\n    with open(targetfile, 'r') as g:\n        for line in g:\n            line = line.strip()\n            if line:  # literal_eval raises on an empty string\n                custom_tag_list.append(literal_eval(line))\n    return custom_tag_list",
"prompt_number": 3,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "#Run custom tagger for all 5 years\n\ncustom_tagged_1984 = apply_custom_tags('data_tagged2/1984_tagged_custom.txt',alter_source('data_tagged/1984_tagged.txt'))\ncustom_tagged_1990 = apply_custom_tags('data_tagged2/1990_tagged_custom.txt',alter_source('data_tagged/1990_tagged.txt'))\ncustom_tagged_2000 = apply_custom_tags('data_tagged2/2000_tagged_custom.txt',alter_source('data_tagged/2000_tagged.txt'))\ncustom_tagged_2010 = apply_custom_tags('data_tagged2/2010_tagged_custom.txt',alter_source('data_tagged/2010_tagged.txt'))\ncustom_tagged_2014 = apply_custom_tags('data_tagged2/2014_tagged_custom.txt',alter_source('data_tagged/2014_tagged.txt'))",
"prompt_number": 4,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "# print custom_tagged_1984 ",
"prompt_number": 56,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "# Import the RegexpParser\nfrom nltk.chunk import RegexpParser\n\n# Define your custom tagged data. \n# entities\n\n# Chunk labels produced by the grammar in chunker_rules.\nENTITY_LABELS = ('PERSON', 'ORGANIZATION', 'LOCATION')\n\ndef chunker_rules(values):\n    \"\"\"Parse one tagged document into a chunk tree.\n\n    values is handed straight to RegexpParser.parse; presumably a list of\n    (word, NER tag) pairs -- confirm against the tagged input files.\n    The grammar groups runs of PERSON, ORGANIZATION and LOCATION tags into\n    chunks. The first PERSON rule additionally matches one PERSON token,\n    one 'O' token, then one or more PERSON tokens.\n    \"\"\"\n    # Define custom grammar (modified to be a valid regex).\n    grammar = r'''\n    PERSON:\n        {<PERSON><O><PERSON>+}\n        {<PERSON>+}\n    ORGANIZATION: \n        {<ORGANIZATION>+}\n    LOCATION: \n        {<LOCATION>+} \n\n    '''\n    cp = nltk.RegexpParser(grammar) # Create an instance of your custom parser.\n    return cp.parse(values) # Parse!\n\ndef _subtree_label(subtree):\n    \"\"\"Return a tree's label via label() when available, else the older .node attribute.\"\"\"\n    return subtree.label() if hasattr(subtree, 'label') else subtree.node\n\ndef entity_chunker(tagged_docs):\n    \"\"\"Return the text of every entity chunk found in tagged_docs.\n\n    Each document is chunked with chunker_rules; the words of every\n    PERSON / ORGANIZATION / LOCATION subtree are joined with single spaces\n    and collected into one flat list.\n    \"\"\"\n    chunks = []\n    for doc in tagged_docs:\n        tree = chunker_rules(doc)\n        for subtree in tree.subtrees():\n            # The grammar never emits a 'CHUNK' label, so match the labels\n            # it actually defines.\n            if _subtree_label(subtree) in ENTITY_LABELS:\n                leaflist = [leaf[0] for leaf in subtree.leaves()]\n                chunks.append(' '.join(leaflist))\n    return chunks\n ",
"prompt_number": 33,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "# Creating a dictionary for each wedding announcement",
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "rel_dict = defaultdict(dict)",
"prompt_number": 34,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "def extract_rels(rel_dict, dict_key, relKey, rel1, rel2, tree,regex): \n \"\"\"This function extracts the relationships\n Function Outputs:\n final output = {dict_key: [{relKey:[relationship extracted]}, {relKey:[relationship extracted]}]}\n example = {1:[{bride:'Mary Flyn',groom: 'John Mayer'}]}\n \n Function Inputs:\n 1) rel_dict = This is the default dict that will contain all the patterns in a dictionary per wedding announcement\n 2) dict_key = This is basically a counter per wedding announcment \n 3) relKey = This is the second key i.e. the relationship type you want to get values for \n 4) rel1 , rel2, regex = 'PERSON' [the word \"marries\"] 'PERSON'\n 5) tree = the parsed tree\n \"\"\" \n for rel in nltk.sem.extract_rels(rel1, rel2, tree, pattern = regex):\n# print nltk.sem.relextract.show_raw_rtuple(rel)\n# rels_str = nltk.sem.relextract.show_raw_rtuple(rel) \n dict_values = []\n dict_values.append(nltk.sem.relextract.show_raw_rtuple(rel))\n rel_dict[str(dict_key)][relKey] = dict_values\n \n# if relKey in rel_dict[str(dict_key)].keys():\n# rel_dict[str(dict_key)][relKey].append(dict_values)\n# else:\n# rel_dict[str(dict_key)][relKey] = dict_values",
"prompt_number": 35,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "def make_rels_dict(tagged_data, rel_dict, relKey, rel1,rel2,regex):\n \"\"\"This function makes the dictionary for the relationships you want to extract -- \n read comments in function \"extract_rels\" for more context\"\"\"\n dict_key = 1\n \n for doc in tagged_data:\n tree = chunker_rules(doc)\n extract_rels (rel_dict, dict_key, relKey, rel1, rel2, tree,regex)\n dict_key +=1\n return rel_dict",
"prompt_number": 36,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "DAUGHTER = re.compile(r'.*\\bdaughter\\b')\nmake_rels_dict(custom_tagged_1984,rel_dict,'daughter_relation','PERSON','PERSON',DAUGHTER)\nprint \"DAUGHTER relation done\"",
"prompt_number": 57,
"outputs": [
{
"output_type": "stream",
"text": "DAUGHTER relation done\n",
"stream": "stdout"
}
],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "# regex to lift out names and remove tags \n\ndef lift_names_remove_tags(relation, extracted_relation_str):\n \n if relation == 'DAUGHTER' or relation == 'SON':\n rels_pattern = r\"\\[PER: (.*)\\] (.*) \\[PER: (.*)\\]\"\n name_pattern = re.compile(r\"(\\w*\\.?)\\/PERSON\")\n names_search = re.search(rels_pattern, extracted_relation_str)\n bride_or_groom_name = name_pattern.findall(names_search.group(1))\n parents_name = name_pattern.findall(names_search.group(3))\n bride_or_groom_name_str = ''\n parents_name_str = ''\n for bg in bride_or_groom_name:\n bride_or_groom_name_str += bg + \" \"\n for p in parents_name:\n parents_name_str += p + \" \"\n \n return bride_or_groom_name_str.strip(), parents_name_str.strip()\n",
"prompt_number": 58,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "for key in rel_dict.keys():\n for k in rel_dict[key].keys():\n# print rel_dict[key][k]\n for each in rel_dict[key][k]:\n if k == 'daughter_relation':\n bridename, brideparentsname = lift_names_remove_tags('DAUGHTER',each)\n rel_dict[key]['bride_name'] = bridename\n rel_dict[key]['bride_parents_names'] = brideparentsname ",
"prompt_number": 59,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "# print rel_dict",
"prompt_number": 60,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "SON = re.compile(r'.*\\bson\\b')\nmake_rels_dict(custom_tagged_1984,rel_dict,'son_relation','PERSON','PERSON',SON) \nprint \"Groom key has been added to master dict\"",
"prompt_number": 61,
"outputs": [
{
"output_type": "stream",
"text": "Groom key has been added to master dict\n",
"stream": "stdout"
}
],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "for key in rel_dict.keys():\n for k in rel_dict[key].keys():\n# print rel_dict[key][k]\n for each in rel_dict[key][k]:\n if k == 'son_relation':\n groomname, groomparentsname = lift_names_remove_tags('SON',each)\n rel_dict[key]['groom_name'] = groomname\n rel_dict[key]['groom_parents_names'] = groomparentsname ",
"prompt_number": 62,
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "print rel_dict['347']",
"prompt_number": 63,
"outputs": [
{
"output_type": "stream",
"text": "{'daughter_relation': [\"[PER: 'Leicia/PERSON Sharon/PERSON Osborne/PERSON'] ',/O the/O daughter/O of/O' [PER: 'Mr./PERSON and/O Mrs./PERSON Philip/PERSON Barry/PERSON Osborne/PERSON']\"], 'bride_parents_names': 'Mr. Mrs. Philip Barry Osborne', 'groom_parents_names': 'Mr. Mrs. John A. Milano', 'son_relation': [\"[PER: 'Michael/PERSON Anthony/PERSON Milano/PERSON'] ',/O a/O son/O of/O' [PER: 'Mr./PERSON and/O Mrs./PERSON John/PERSON A./PERSON Milano/PERSON']\"], 'bride_name': 'Leicia Sharon Osborne', 'groom_name': 'Michael Anthony Milano'}\n",
"stream": "stdout"
}
],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "# To test for empty patterns in the rel_dict\n# make_rels_dict numbers documents from 1 and rel_dict only gains a key once a\n# relation is extracted, so walk every document number and look it up with\n# .get() (plain indexing would create empty entries in the defaultdict).\nempty = []\nfor i in range(1, len(custom_tagged_1984) + 1):\n    if not rel_dict.get(str(i)):\n        print i\n        empty.append(i)\nprint \"final len\", len(empty)",
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "# Use functions below to see how well the individual patterns do and then add to the master dictionary",
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "# Tester Function to extract the relationships for individual patterns \ndef test_extract_rels (tagged_data, alist, rel1,rel2,regex):\n for doc in tagged_data:\n tree = chunker_rules(doc)\n for rel in nltk.sem.extract_rels(rel1, rel2, tree, pattern = regex):\n #print nltk.sem.relextract.show_raw_rtuple(rel)\n alist.append(nltk.sem.relextract.show_raw_rtuple(rel)) \n return alist ",
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "#Tester Function to append lists if needed \ndef append_rels(lists_to_append):\n    \"\"\"Flatten a list of relation lists into one list of relations.\n\n    lists_to_append is an iterable of lists; the returned list holds every\n    element of every inner list, in order.\n    \"\"\"\n    master = []\n    for rels in lists_to_append:\n        # extend with the elements themselves; appending the inner list once\n        # per element produced nested, duplicated lists\n        master.extend(rels)\n    return master",
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "#Print function \ndef print_rels(rels):\n print \"length of list: \",len(rels)\n print \"=\" * 125 , \"\\n\"\n for i in rels:\n print i ",
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "#####Testing Individual Regex Patterns to add to master#########",
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "#Task1: Genders of who is being married\n#The bride is being extracted - related regex\nbride = []\nDAUGHTER = re.compile(r'.*\\bdaughter\\b')\n# test_extract_rels is the helper that takes (tagged_data, alist, rel1, rel2, regex)\n# and returns the filled list; extract_rels expects a rel_dict, key and tree instead.\nbride = test_extract_rels(tagged_1984, bride, 'PERSON','PERSON',DAUGHTER)\nprint_rels(bride)",
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "#OPEN QUESTION: Am I supposed to check which ones got picked and then append it to the bride list???\n#Task 1 - continued , extracting the bride \n\n#Define Regex \nMARRIES = re.compile(r'.*\\b[Mm]arrie[sd]\\b')\nWED = re.compile(r'.*\\b[Ww]eds?\\b')\nENAGEGEMENT = re.compile(r'.*\\b([Ee]ngaged|[Ee]ngagement|[Ee]gagment\\b)')\n\n#Run Relationship Extraction Function \n# test_extract_rels is the helper that takes (tagged_data, alist, rel1, rel2, regex).\n# Each call gets its own fresh list: the helper returns the list it was given,\n# so sharing one list would make all four results the same object.\nmarries1 = test_extract_rels(tagged_1984, [], 'PERSON','ORGANIZATION',MARRIES)\nmarries2 = test_extract_rels(tagged_1984, [], 'PERSON','PERSON',MARRIES)\nmarries3 = test_extract_rels(tagged_1984, [], 'PERSON','PERSON',WED)\nmarries4 = test_extract_rels(tagged_1984, [], 'PERSON','PERSON',ENAGEGEMENT)\n\n# Append Lists above \nlists = [marries1,marries2,marries3,marries4]\nmaster_marries = list(chain.from_iterable(lists))\n#Create a list holding every extracted relation once\nmarries = list(master_marries)\nprint len(master_marries)\nprint '=' *100\n#Print Lists\nfor rel in master_marries:\n    print rel , \"\\n\"",
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "#Task1: Genders of who is being married\n#The groom is being extracted - related regex\ngroom = []\nSON = re.compile(r'.*\\bson\\b')\n# test_extract_rels is the helper that takes (tagged_data, alist, rel1, rel2, regex)\n# and returns the filled list; extract_rels expects a rel_dict, key and tree instead.\ngroom = test_extract_rels(tagged_1984, groom, 'PERSON','PERSON',SON)\nprint_rels(groom)",
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "#Task 2 - Hometowns of whom is being married \nhometown = []\nOF = re.compile(r'.*\\bof\\b')\n# test_extract_rels is the helper that takes (tagged_data, alist, rel1, rel2, regex)\n# and returns the filled list; extract_rels expects a rel_dict, key and tree instead.\n# NOTE(review): march1984_tagged is not defined in this notebook - confirm its source.\nhometown = test_extract_rels(march1984_tagged, hometown,'PERSON','LOCATION',OF)\nprint_rels(hometown)\n# strip out the false positives ",
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "profession = []\n# 'head' / 'headmaster': (?:master)? is an optional suffix. The previous\n# [master?] was a character class matching exactly one of those characters.\nPROF = re.compile(r'.*\\b([Pp]rofessor|[Bb]anker|[Pp]rogrammer|[Aa]nalyst|[Aa]ssociate|[Hh]ead(?:master)?|[Cc]onsultant|[Cc]hairman|[Dd]octorate|[Aa]ccountant|[Ff]reelance|[Pp]artner|[Mm]anager|[Tt]eacher|[Ll]awyer|[Pp]resident|[Dd]ean|[Ee]ngineer|[Aa]ssistant|[Dd]irector|[Ee]xecutive)\\b')\n# test_extract_rels is the helper that takes (tagged_data, alist, rel1, rel2, regex)\n# and returns the filled list; extract_rels expects a rel_dict, key and tree instead.\nprofession = test_extract_rels(tagged_1984, profession,'PERSON','ORGANIZATION',PROF)\nprint_rels(profession)",
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "#larger funnel\nprofession_v2 = []\nIS = re.compile(r'.*\\bis\\b')\n# test_extract_rels is the helper that takes (tagged_data, alist, rel1, rel2, regex)\n# and returns the filled list; extract_rels expects a rel_dict, key and tree instead.\nprofession_v2 = test_extract_rels(tagged_1984, profession_v2,'PERSON','ORGANIZATION',IS)\nprint_rels(profession_v2)",
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "education = []\n# The degree abbreviations are matched literally (M.B.A., M.S., M.D.) and the\n# 'complet' endings are alternatives. The previous [M.B.A.] / [ing|ed|e] forms\n# were character classes that matched a single character such as a lone 'A'.\n# The abbreviations end in '.', so they sit outside the trailing \\b.\nEDU = re.compile(r'.*\\b(?:(?:[Dd]egree|[Dd]esigner|[Mm]aster\\'s|[Gg]raduated?|complet(?:ing|ed|e))\\b|M\\.B\\.A\\.|M\\.S\\.|M\\.D\\.)')\n# test_extract_rels is the helper that takes (tagged_data, alist, rel1, rel2, regex)\n# and returns the filled list; extract_rels expects a rel_dict, key and tree instead.\neducation = test_extract_rels(tagged_1984, education,'PERSON','ORGANIZATION',EDU)\nprint_rels(education)",
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "#larger funnel\neducation_v2 = []\nFROM = re.compile(r'.*\\bfrom\\b')\n# test_extract_rels is the helper that takes (tagged_data, alist, rel1, rel2, regex)\n# and returns the filled list; extract_rels expects a rel_dict, key and tree instead.\neducation_v2 = test_extract_rels(tagged_1984, education_v2,'PERSON','ORGANIZATION',FROM)\nprint_rels(education_v2)",
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "#Marries Extractors",
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "marries = []\nMARRIES = re.compile(r'.*\\b[Mm]arrie[sd]\\b')\n\nfor rel in nltk.sem.extract_rels('PERSON','ORGANIZATION', tree, pattern = MARRIES):\n marries.append(nltk.sem.relextract.show_raw_rtuple(rel))",
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "MARRIES = re.compile(r'.*\\b[Mm]arrie[sd]\\b')\n\nfor rel in nltk.sem.extract_rels('PERSON','PERSON', tree, pattern = MARRIES):\n# print nltk.sem.relextract.show_raw_rtuple(rel)\n marries.append(nltk.sem.relextract.show_raw_rtuple(rel))",
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "WED = re.compile(r'.*\\b[Ww]eds?\\b')\n\nfor rel in nltk.sem.extract_rels('PERSON','PERSON', tree, pattern = WED):\n# print nltk.sem.relextract.show_raw_rtuple(rel)\n marries.append(nltk.sem.relextract.show_raw_rtuple(rel))",
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "ENAGEGEMNT = re.compile(r'.*\\b([Ee]ngaged|[Ee]ngagement|[Ee]gagment\\b)')\n\n# Use the engagement pattern compiled above; the loop previously re-ran MARRIES,\n# so engagement relations were never collected here.\nfor rel in nltk.sem.extract_rels('PERSON','PERSON', tree, pattern = ENAGEGEMNT):\n    marries.append(nltk.sem.relextract.show_raw_rtuple(rel))",
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "AT = re.compile(r'.*\\b[Aa]t\\b')\n\nfor rel in nltk.sem.extract_rels('PERSON','LOCATION', tree, pattern = AT):\n marries.append(nltk.sem.relextract.show_raw_rtuple(rel))",
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "#Putting it all together:",
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "#Set variable to collect info\nmarries = []\n\nfor doc in march1984_tagged:\n #Parse every document \n tree = chunker_rules(doc)\n #Relationship Extractors - #1\n MARRIES = re.compile(r'.*\\b[Mm]arrie[sd]\\b')\n for rel in nltk.sem.extract_rels('PERSON','PERSON', tree, pattern = MARRIES):\n marries.append(nltk.sem.relextract.show_raw_rtuple(rel))\n #Relationship Extractors - #2\n MARRIESv2 = re.compile(r'.*\\b[Mm]arrie[sd]\\b')\n for rel in nltk.sem.extract_rels('PERSON','ORGANIZATION', tree, pattern = MARRIESv2):\n marries.append(nltk.sem.relextract.show_raw_rtuple(rel))\n #Relationship Extractors - #3\n WED = re.compile(r'.*\\b[Ww]eds?\\b')\n for rel in nltk.sem.extract_rels('PERSON','PERSON', tree, pattern = WED):\n marries.append(nltk.sem.relextract.show_raw_rtuple(rel))\n #Relationship Extractors - #4\n ENGAGEMENT = re.compile(r'.*\\b([Ee]ngaged|[Ee]ngagement|[Ee]gagment\\b)')\n for rel in nltk.sem.extract_rels('PERSON','PERSON', tree, pattern = ENGAGEMENT):\n marries.append(nltk.sem.relextract.show_raw_rtuple(rel))\n ",
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "print len(marries)\n\nfor i in marries:\n print i + '\\n'",
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "#Set variable to collect info\nmarriage_location = []\n\nfor doc in march1984_tagged:\n #Parse every document \n tree = chunker_rules(doc)\n #Relationship Extractors - #1\n AT = re.compile(r'.*\\b[Aa]t\\b')\n for rel in nltk.sem.extract_rels('PERSON','ORGANIZATION', tree, pattern = AT):\n marriage_location.append(nltk.sem.relextract.show_raw_rtuple(rel))\n for rel in nltk.sem.extract_rels('LOCATION','ORGANIZATION', tree, pattern = AT):\n marriage_location.append(nltk.sem.relextract.show_raw_rtuple(rel))\n for rel in nltk.sem.extract_rels('PERSON','LOCATION', tree, pattern = AT):\n marriage_location.append(nltk.sem.relextract.show_raw_rtuple(rel))\n for rel in nltk.sem.extract_rels('LOCATION','PERSON', tree, pattern = AT):\n marriage_location.append(nltk.sem.relextract.show_raw_rtuple(rel))",
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "for i in marriage_location:\n print i + '\\n'",
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "marriage_location_dict = {'marriage_loc':[]}\nloc = []\nother = []\nfor i in marriage_location:\n if 'performed' in i or 'arrie' in i or 'Weds' in i or 'officiate' in i or 'Temple' in i or 'Church' in i :\n# if 'Church' in i or 'Temple' in i :\n loc.append(i)\n marriage_location_dict['marriage_loc'].append(i)\n else:\n other.append(i)",
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "print len(marries)\nprint len (marriage_location)\nprint len(loc)\nprint len(other)",
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "for i in loc:\n print i + '\\n'",
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "len(marriage_location_dict['marriage_loc'])",
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "print marriage_location_dict",
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
},
{
"metadata": {},
"cell_type": "code",
"input": "",
"outputs": [],
"language": "python",
"trusted": true,
"collapsed": false
}
],
"metadata": {}
}
],
"metadata": {
"name": "",
"signature": "sha256:3ecf8aab785c892aae24f34a1ca226d4b63a481d39098d3c4a3c45581cdca0f3",
"gist_id": "f18f7431fc48d542efe7"
},
"nbformat": 3
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment