Created
September 9, 2018 06:51
-
-
Save RandomForestGump/04e5e0aecf434db40bc045670637fac4 to your computer and use it in GitHub Desktop.
Compare Features using Cosine Similarity
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"#### Collection of novels from two different authors:\n", | |
"- **The Cromptons** by Mary J. Holmes\n", | |
"- **The Red Room** by H.G Wells" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 82, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"works = reader.get_authors_works('Holmes, Mary Jane')\n", | |
"parsed_novel1 = nlp(works[2][\"text\"])\n", | |
"works = reader.get_authors_works('Wells, H. G. (Herbert George)')\n", | |
"parsed_novel2 = nlp(works[10][\"text\"])" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"#### Fetch feature sets of the 2 novels parsed of all the male characters" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 83, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>angel</th>\n", | |
" <th>howard</th>\n", | |
" <th>jack</th>\n", | |
" <th>jake</th>\n", | |
" <th>peter</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>act</th>\n", | |
" <td>1</td>\n", | |
" <td></td>\n", | |
" <td>1</td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>add</th>\n", | |
" <td></td>\n", | |
" <td>1</td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>appear</th>\n", | |
" <td>2</td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>approach</th>\n", | |
" <td>1</td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>argue</th>\n", | |
" <td></td>\n", | |
" <td>1</td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>bow</th>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>breathe</th>\n", | |
" <td>1</td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>cease</th>\n", | |
" <td>1</td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>clench</th>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td>1</td>\n", | |
" <td></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>cling</th>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>construct</th>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td>1</td>\n", | |
" <td></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>consult</th>\n", | |
" <td></td>\n", | |
" <td>1</td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>contend</th>\n", | |
" <td></td>\n", | |
" <td>1</td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>continue</th>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td>4</td>\n", | |
" <td>1</td>\n", | |
" <td></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>curse</th>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td>1</td>\n", | |
" <td></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>dance</th>\n", | |
" <td>1</td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>decide</th>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>define</th>\n", | |
" <td></td>\n", | |
" <td>1</td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>desiccate</th>\n", | |
" <td></td>\n", | |
" <td>1</td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>detect</th>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td>1</td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>deter</th>\n", | |
" <td></td>\n", | |
" <td>1</td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>drive</th>\n", | |
" <td>1</td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>exclaim</th>\n", | |
" <td></td>\n", | |
" <td>2</td>\n", | |
" <td>2</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>eye</th>\n", | |
" <td>1</td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>fail</th>\n", | |
" <td>1</td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>finish</th>\n", | |
" <td>1</td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>forestall</th>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td>2</td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>gather</th>\n", | |
" <td></td>\n", | |
" <td>1</td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>grasp</th>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td>1</td>\n", | |
" <td></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>greet</th>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td>1</td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>...</th>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" <td>...</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>paint</th>\n", | |
" <td>1</td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>plan</th>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td>1</td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>pretend</th>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td>1</td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>propose</th>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td>2</td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>question</th>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td>1</td>\n", | |
" <td></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>rebuke</th>\n", | |
" <td>1</td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>redden</th>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td>1</td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>rejoin</th>\n", | |
" <td></td>\n", | |
" <td>3</td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>relent</th>\n", | |
" <td>1</td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>resist</th>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td>1</td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>respond</th>\n", | |
" <td></td>\n", | |
" <td>2</td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>retreat</th>\n", | |
" <td>1</td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>return</th>\n", | |
" <td>1</td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>saunter</th>\n", | |
" <td>1</td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>shift</th>\n", | |
" <td>1</td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>shrug</th>\n", | |
" <td>3</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>shuffle</th>\n", | |
" <td>1</td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>smile</th>\n", | |
" <td>4</td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>spot</th>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td>1</td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>star</th>\n", | |
" <td>11</td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>stick</th>\n", | |
" <td>1</td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>stoop</th>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td>1</td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>suffer</th>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td>1</td>\n", | |
" <td></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>suggest</th>\n", | |
" <td></td>\n", | |
" <td>3</td>\n", | |
" <td>4</td>\n", | |
" <td></td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>treat</th>\n", | |
" <td>1</td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>untie</th>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td>1</td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>urge</th>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td>1</td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>usher</th>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td>1</td>\n", | |
" <td></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>warn</th>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td>1</td>\n", | |
" <td></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>whisper</th>\n", | |
" <td>1</td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"<p>74 rows × 5 columns</p>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" angel howard jack jake peter\n", | |
"act 1 1 \n", | |
"add 1 \n", | |
"appear 2 \n", | |
"approach 1 \n", | |
"argue 1 \n", | |
"bow 1 1 \n", | |
"breathe 1 \n", | |
"cease 1 \n", | |
"clench 1 \n", | |
"cling 1\n", | |
"construct 1 \n", | |
"consult 1 \n", | |
"contend 1 \n", | |
"continue 4 1 \n", | |
"curse 1 \n", | |
"dance 1 \n", | |
"decide 1\n", | |
"define 1 \n", | |
"desiccate 1 \n", | |
"detect 1 \n", | |
"deter 1 \n", | |
"drive 1 \n", | |
"exclaim 2 2 1 1\n", | |
"eye 1 \n", | |
"fail 1 \n", | |
"finish 1 \n", | |
"forestall 2 \n", | |
"gather 1 \n", | |
"grasp 1 \n", | |
"greet 1 \n", | |
"... ... ... ... ... ...\n", | |
"paint 1 \n", | |
"plan 1 \n", | |
"pretend 1 \n", | |
"propose 2 \n", | |
"question 1 \n", | |
"rebuke 1 \n", | |
"redden 1 \n", | |
"rejoin 3 \n", | |
"relent 1 \n", | |
"resist 1 \n", | |
"respond 2 \n", | |
"retreat 1 \n", | |
"return 1 \n", | |
"saunter 1 \n", | |
"shift 1 \n", | |
"shrug 3 1 1 \n", | |
"shuffle 1 \n", | |
"smile 4 \n", | |
"spot 1 \n", | |
"star 11 \n", | |
"stick 1 \n", | |
"stoop 1 \n", | |
"suffer 1 \n", | |
"suggest 3 4 1\n", | |
"treat 1 \n", | |
"untie 1 \n", | |
"urge 1 \n", | |
"usher 1 \n", | |
"warn 1 \n", | |
"whisper 1 \n", | |
"\n", | |
"[74 rows x 5 columns]" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"def get_interesting_contexts_gender(novels,rels,num_characters,verb_stop,gender):\n", | |
" def of_interest(ent,rels,main_characters,gender):\n", | |
" if gender_guess(ent.text.strip().lower(),gender_map)==gender:\n", | |
" return (ent.text.strip().lower() in main_characters \n", | |
" and ent.label_ == 'PERSON' \n", | |
" and ent.root.head.pos_ == 'VERB'\n", | |
" and ent.root.dep_ in rels) \n", | |
"\n", | |
" contexts = defaultdict(Counter) \n", | |
" for parsed_novel in novels:\n", | |
" main_characters = get_main_characters(parsed_novel,num_characters)\n", | |
" stop_verbs=get_pos_in(parsed_novel,'VERB',verb_stop)\n", | |
" for ent in parsed_novel.ents:\n", | |
" if of_interest(ent,rels,main_characters,gender):\n", | |
" if ent.root.head.lemma_ not in stop_verbs:\n", | |
" contexts[ent.text.strip().lower()][ent.root.head.lemma_] += 1\n", | |
" return contexts\n", | |
"novels = {parsed_novel1,parsed_novel2}\n", | |
"number_of_characters_per_text = 8\n", | |
"target_rels = {'nsubj','dobj'}\n", | |
"verb_stop=1000\n", | |
"target_contexts = get_interesting_contexts_gender(novels,target_rels,number_of_characters_per_text,verb_stop,'male')\n", | |
"display(pd.DataFrame.from_dict(target_contexts).applymap(lambda x: '' if math.isnan(x) else x))\n", | |
"# C=(Counter(get_entities_in(parsed_novel,\"VERB\")).most_common())\n", | |
"# get_pos_in(parsed_novel,'VERB',60)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"#### Fetch feature sets of the 2 novels parsed of all the female characters." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 85, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>amy</th>\n", | |
" <th>eloise</th>\n", | |
" <th>lady hammergallow</th>\n", | |
" <th>mandy ann</th>\n", | |
" <th>mrs jehoram</th>\n", | |
" <th>mrs mendham</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>'</th>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td>1</td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>add</th>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>arrange</th>\n", | |
" <td></td>\n", | |
" <td>1</td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>assent</th>\n", | |
" <td>1</td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>boast</th>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td>1</td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>borrow</th>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td>1</td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>chat</th>\n", | |
" <td></td>\n", | |
" <td>1</td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>chime</th>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td>1</td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>choke</th>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td>1</td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>continue</th>\n", | |
" <td></td>\n", | |
" <td>1</td>\n", | |
" <td></td>\n", | |
" <td>1</td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>decide</th>\n", | |
" <td>1</td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>decline</th>\n", | |
" <td></td>\n", | |
" <td>2</td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>demur</th>\n", | |
" <td></td>\n", | |
" <td>1</td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>design</th>\n", | |
" <td></td>\n", | |
" <td>1</td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>disappear</th>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td>1</td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>drag</th>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td>1</td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>draw</th>\n", | |
" <td>1</td>\n", | |
" <td>3</td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>droop</th>\n", | |
" <td>1</td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>drop</th>\n", | |
" <td></td>\n", | |
" <td>1</td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>exchange</th>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td>1</td>\n", | |
" <td></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>exclaim</th>\n", | |
" <td>1</td>\n", | |
" <td>3</td>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>exercise</th>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td>1</td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>forbear</th>\n", | |
" <td></td>\n", | |
" <td>1</td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>glance</th>\n", | |
" <td></td>\n", | |
" <td>1</td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>hesitate</th>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td>1</td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>hold</th>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td>1</td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>hop</th>\n", | |
" <td></td>\n", | |
" <td>1</td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>insist</th>\n", | |
" <td></td>\n", | |
" <td>2</td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>insult</th>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td>1</td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>intrust</th>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td>1</td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>lead</th>\n", | |
" <td>1</td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>mistake</th>\n", | |
" <td></td>\n", | |
" <td>1</td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>niggers</th>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td>1</td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>nod</th>\n", | |
" <td></td>\n", | |
" <td>1</td>\n", | |
" <td></td>\n", | |
" <td>1</td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>parry</th>\n", | |
" <td></td>\n", | |
" <td>1</td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>persist</th>\n", | |
" <td></td>\n", | |
" <td>1</td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>piece</th>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td>1</td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>place</th>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td>1</td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>recover</th>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td>1</td>\n", | |
" <td></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>reflect</th>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>rejoin</th>\n", | |
" <td>1</td>\n", | |
" <td>2</td>\n", | |
" <td></td>\n", | |
" <td>1</td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>relapse</th>\n", | |
" <td>1</td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>rub</th>\n", | |
" <td></td>\n", | |
" <td>1</td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>rush</th>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td>2</td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>sink</th>\n", | |
" <td></td>\n", | |
" <td>1</td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>slip</th>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td>1</td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>spring</th>\n", | |
" <td></td>\n", | |
" <td>2</td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>stoop</th>\n", | |
" <td>1</td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>suggest</th>\n", | |
" <td></td>\n", | |
" <td>1</td>\n", | |
" <td></td>\n", | |
" <td>1</td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>throw</th>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td>1</td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>tie</th>\n", | |
" <td></td>\n", | |
" <td>1</td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>welcome</th>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td>1</td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>whisper</th>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td>1</td>\n", | |
" <td></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>win</th>\n", | |
" <td></td>\n", | |
" <td>1</td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>wrap</th>\n", | |
" <td></td>\n", | |
" <td>1</td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" <td></td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" amy eloise lady hammergallow mandy ann mrs jehoram mrs mendham\n", | |
"' 1 \n", | |
"add 1\n", | |
"arrange 1 \n", | |
"assent 1 \n", | |
"boast 1 \n", | |
"borrow 1 \n", | |
"chat 1 \n", | |
"chime 1 \n", | |
"choke 1 \n", | |
"continue 1 1 \n", | |
"decide 1 \n", | |
"decline 2 \n", | |
"demur 1 \n", | |
"design 1 \n", | |
"disappear 1 \n", | |
"drag 1 \n", | |
"draw 1 3 \n", | |
"droop 1 \n", | |
"drop 1 \n", | |
"exchange 1 \n", | |
"exclaim 1 3 1 1 \n", | |
"exercise 1 \n", | |
"forbear 1 \n", | |
"glance 1 \n", | |
"hesitate 1 \n", | |
"hold 1 \n", | |
"hop 1 \n", | |
"insist 2 \n", | |
"insult 1 \n", | |
"intrust 1 \n", | |
"lead 1 \n", | |
"mistake 1 \n", | |
"niggers 1 \n", | |
"nod 1 1 \n", | |
"parry 1 \n", | |
"persist 1 \n", | |
"piece 1 \n", | |
"place 1 \n", | |
"recover 1 \n", | |
"reflect 1\n", | |
"rejoin 1 2 1 \n", | |
"relapse 1 \n", | |
"rub 1 \n", | |
"rush 2 \n", | |
"sink 1 \n", | |
"slip 1 \n", | |
"spring 2 \n", | |
"stoop 1 \n", | |
"suggest 1 1 \n", | |
"throw 1 \n", | |
"tie 1 \n", | |
"welcome 1 \n", | |
"whisper 1 \n", | |
"win 1 \n", | |
"wrap 1 " | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"def get_interesting_contexts_gender(novels,rels,num_characters,verb_stop,gender):\n", | |
" def of_interest(ent,rels,main_characters,gender):\n", | |
" if gender_guess(ent.text.strip().lower(),gender_map)==gender:\n", | |
" return (ent.text.strip().lower() in main_characters \n", | |
" and ent.label_ == 'PERSON' \n", | |
" and ent.root.head.pos_ == 'VERB'\n", | |
" and ent.root.dep_ in rels) \n", | |
"\n", | |
" contexts = defaultdict(Counter) \n", | |
" for parsed_novel in novels:\n", | |
" main_characters = get_main_characters(parsed_novel,num_characters)\n", | |
" stop_verbs=get_pos_in(parsed_novel,'VERB',verb_stop)\n", | |
" for ent in parsed_novel.ents:\n", | |
" if of_interest(ent,rels,main_characters,gender):\n", | |
" if ent.root.head.lemma_ not in stop_verbs:\n", | |
" contexts[ent.text.strip().lower()][ent.root.head.lemma_] += 1\n", | |
" return contexts\n", | |
"novels = { parsed_novel1,parsed_novel2}\n", | |
"number_of_characters_per_text = 7\n", | |
"target_rels = {'nsubj','dobj'}\n", | |
"verb_stop=1000\n", | |
"target_contexts = get_interesting_contexts_gender(novels,target_rels,number_of_characters_per_text,verb_stop,'female')\n", | |
"display(pd.DataFrame.from_dict(target_contexts).applymap(lambda x: '' if math.isnan(x) else x))\n", | |
"# C=(Counter(get_entities_in(parsed_novel,\"VERB\")).most_common())\n", | |
"# get_pos_in(parsed_novel,'VERB',60)" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.6.5" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 1 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment