Skip to content

Instantly share code, notes, and snippets.

@RandomForestGump
Created September 9, 2018 06:51
Show Gist options
  • Save RandomForestGump/04e5e0aecf434db40bc045670637fac4 to your computer and use it in GitHub Desktop.
Save RandomForestGump/04e5e0aecf434db40bc045670637fac4 to your computer and use it in GitHub Desktop.
Compare Features using Cosine Similarity
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Collection of novels from two different authors:\n",
"- **The Cromptons** by Mary J. Holmes\n",
"- **The Red Room** by H. G. Wells"
]
},
{
"cell_type": "code",
"execution_count": 82,
"metadata": {},
"outputs": [],
"source": [
"works = reader.get_authors_works('Holmes, Mary Jane')\n",
"parsed_novel1 = nlp(works[2][\"text\"])\n",
"works = reader.get_authors_works('Wells, H. G. (Herbert George)')\n",
"parsed_novel2 = nlp(works[10][\"text\"])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Fetch the feature sets of all the male characters in the two parsed novels"
]
},
{
"cell_type": "code",
"execution_count": 83,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>angel</th>\n",
" <th>howard</th>\n",
" <th>jack</th>\n",
" <th>jake</th>\n",
" <th>peter</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>act</th>\n",
" <td>1</td>\n",
" <td></td>\n",
" <td>1</td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>add</th>\n",
" <td></td>\n",
" <td>1</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>appear</th>\n",
" <td>2</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>approach</th>\n",
" <td>1</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>argue</th>\n",
" <td></td>\n",
" <td>1</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>bow</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>breathe</th>\n",
" <td>1</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>cease</th>\n",
" <td>1</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>clench</th>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td>1</td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>cling</th>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>construct</th>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td>1</td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>consult</th>\n",
" <td></td>\n",
" <td>1</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>contend</th>\n",
" <td></td>\n",
" <td>1</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>continue</th>\n",
" <td></td>\n",
" <td></td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>curse</th>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td>1</td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>dance</th>\n",
" <td>1</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>decide</th>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>define</th>\n",
" <td></td>\n",
" <td>1</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>desiccate</th>\n",
" <td></td>\n",
" <td>1</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>detect</th>\n",
" <td></td>\n",
" <td></td>\n",
" <td>1</td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>deter</th>\n",
" <td></td>\n",
" <td>1</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>drive</th>\n",
" <td>1</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>exclaim</th>\n",
" <td></td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>eye</th>\n",
" <td>1</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>fail</th>\n",
" <td>1</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>finish</th>\n",
" <td>1</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>forestall</th>\n",
" <td></td>\n",
" <td></td>\n",
" <td>2</td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>gather</th>\n",
" <td></td>\n",
" <td>1</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>grasp</th>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td>1</td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>greet</th>\n",
" <td></td>\n",
" <td></td>\n",
" <td>1</td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>paint</th>\n",
" <td>1</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>plan</th>\n",
" <td></td>\n",
" <td></td>\n",
" <td>1</td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>pretend</th>\n",
" <td></td>\n",
" <td></td>\n",
" <td>1</td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>propose</th>\n",
" <td></td>\n",
" <td></td>\n",
" <td>2</td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>question</th>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td>1</td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>rebuke</th>\n",
" <td>1</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>redden</th>\n",
" <td></td>\n",
" <td></td>\n",
" <td>1</td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>rejoin</th>\n",
" <td></td>\n",
" <td>3</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>relent</th>\n",
" <td>1</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>resist</th>\n",
" <td></td>\n",
" <td></td>\n",
" <td>1</td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>respond</th>\n",
" <td></td>\n",
" <td>2</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>retreat</th>\n",
" <td>1</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>return</th>\n",
" <td>1</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>saunter</th>\n",
" <td>1</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>shift</th>\n",
" <td>1</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>shrug</th>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>shuffle</th>\n",
" <td>1</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>smile</th>\n",
" <td>4</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>spot</th>\n",
" <td></td>\n",
" <td></td>\n",
" <td>1</td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>star</th>\n",
" <td>11</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>stick</th>\n",
" <td>1</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>stoop</th>\n",
" <td></td>\n",
" <td></td>\n",
" <td>1</td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>suffer</th>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td>1</td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>suggest</th>\n",
" <td></td>\n",
" <td>3</td>\n",
" <td>4</td>\n",
" <td></td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>treat</th>\n",
" <td>1</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>untie</th>\n",
" <td></td>\n",
" <td></td>\n",
" <td>1</td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>urge</th>\n",
" <td></td>\n",
" <td></td>\n",
" <td>1</td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>usher</th>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td>1</td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>warn</th>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td>1</td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>whisper</th>\n",
" <td>1</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>74 rows × 5 columns</p>\n",
"</div>"
],
"text/plain": [
" angel howard jack jake peter\n",
"act 1 1 \n",
"add 1 \n",
"appear 2 \n",
"approach 1 \n",
"argue 1 \n",
"bow 1 1 \n",
"breathe 1 \n",
"cease 1 \n",
"clench 1 \n",
"cling 1\n",
"construct 1 \n",
"consult 1 \n",
"contend 1 \n",
"continue 4 1 \n",
"curse 1 \n",
"dance 1 \n",
"decide 1\n",
"define 1 \n",
"desiccate 1 \n",
"detect 1 \n",
"deter 1 \n",
"drive 1 \n",
"exclaim 2 2 1 1\n",
"eye 1 \n",
"fail 1 \n",
"finish 1 \n",
"forestall 2 \n",
"gather 1 \n",
"grasp 1 \n",
"greet 1 \n",
"... ... ... ... ... ...\n",
"paint 1 \n",
"plan 1 \n",
"pretend 1 \n",
"propose 2 \n",
"question 1 \n",
"rebuke 1 \n",
"redden 1 \n",
"rejoin 3 \n",
"relent 1 \n",
"resist 1 \n",
"respond 2 \n",
"retreat 1 \n",
"return 1 \n",
"saunter 1 \n",
"shift 1 \n",
"shrug 3 1 1 \n",
"shuffle 1 \n",
"smile 4 \n",
"spot 1 \n",
"star 11 \n",
"stick 1 \n",
"stoop 1 \n",
"suffer 1 \n",
"suggest 3 4 1\n",
"treat 1 \n",
"untie 1 \n",
"urge 1 \n",
"usher 1 \n",
"warn 1 \n",
"whisper 1 \n",
"\n",
"[74 rows x 5 columns]"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"def get_interesting_contexts_gender(novels,rels,num_characters,verb_stop,gender):\n",
"    \"\"\"Count the verbs that govern mentions of main characters of one gender.\n",
"\n",
"    Args:\n",
"        novels: iterable of parsed documents exposing ``.ents``\n",
"            (presumably spaCy ``Doc`` objects -- confirm).\n",
"        rels: collection of dependency labels; a mention is counted only when\n",
"            the ``dep_`` of its root token is in this collection.\n",
"        num_characters: forwarded to ``get_main_characters`` for each novel.\n",
"        verb_stop: forwarded to ``get_pos_in(novel, 'VERB', verb_stop)``; verbs\n",
"            whose lemma is contained in that result are not counted.\n",
"        gender: value compared with ``gender_guess(name, gender_map)``.\n",
"\n",
"    Returns:\n",
"        A ``defaultdict(Counter)`` keyed by the stripped, lower-cased entity\n",
"        text; each Counter maps the lemma of the governing verb to its count.\n",
"    \"\"\"\n",
"    def of_interest(ent,name,rels,main_characters,gender):\n",
"        # Always returns a bool. gender_guess is evaluated last so it only\n",
"        # runs for entities that already passed the cheaper attribute tests.\n",
"        return (ent.label_ == 'PERSON'\n",
"                and name in main_characters\n",
"                and ent.root.head.pos_ == 'VERB'\n",
"                and ent.root.dep_ in rels\n",
"                and gender_guess(name,gender_map) == gender)\n",
"\n",
"    contexts = defaultdict(Counter)\n",
"    for parsed_novel in novels:\n",
"        main_characters = get_main_characters(parsed_novel,num_characters)\n",
"        stop_verbs = get_pos_in(parsed_novel,'VERB',verb_stop)\n",
"        for ent in parsed_novel.ents:\n",
"            # Normalise the mention once: it is both the lookup key and the output key.\n",
"            name = ent.text.strip().lower()\n",
"            if not of_interest(ent,name,rels,main_characters,gender):\n",
"                continue\n",
"            verb = ent.root.head.lemma_\n",
"            if verb not in stop_verbs:\n",
"                contexts[name][verb] += 1\n",
"    return contexts\n",
"# A list (not a set) keeps the novels in a fixed order, so the result is built the same way on every run.\n",
"novels = [parsed_novel1,parsed_novel2]\n",
"number_of_characters_per_text = 8\n",
"target_rels = {'nsubj','dobj'}\n",
"# Handed to get_pos_in as its third argument; presumably the number of most frequent verbs to ignore -- confirm.\n",
"verb_stop=1000\n",
"target_contexts = get_interesting_contexts_gender(novels,target_rels,number_of_characters_per_text,verb_stop,'male')\n",
"# Show an empty cell instead of NaN for (verb, character) pairs that never co-occurred.\n",
"display(pd.DataFrame.from_dict(target_contexts).fillna(''))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Fetch the feature sets of all the female characters in the two parsed novels"
]
},
{
"cell_type": "code",
"execution_count": 85,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>amy</th>\n",
" <th>eloise</th>\n",
" <th>lady hammergallow</th>\n",
" <th>mandy ann</th>\n",
" <th>mrs jehoram</th>\n",
" <th>mrs mendham</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>'</th>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td>1</td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>add</th>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>arrange</th>\n",
" <td></td>\n",
" <td>1</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>assent</th>\n",
" <td>1</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>boast</th>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td>1</td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>borrow</th>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td>1</td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>chat</th>\n",
" <td></td>\n",
" <td>1</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>chime</th>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td>1</td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>choke</th>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td>1</td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>continue</th>\n",
" <td></td>\n",
" <td>1</td>\n",
" <td></td>\n",
" <td>1</td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>decide</th>\n",
" <td>1</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>decline</th>\n",
" <td></td>\n",
" <td>2</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>demur</th>\n",
" <td></td>\n",
" <td>1</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>design</th>\n",
" <td></td>\n",
" <td>1</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>disappear</th>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td>1</td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>drag</th>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td>1</td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>draw</th>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>droop</th>\n",
" <td>1</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>drop</th>\n",
" <td></td>\n",
" <td>1</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>exchange</th>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td>1</td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>exclaim</th>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>exercise</th>\n",
" <td></td>\n",
" <td></td>\n",
" <td>1</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>forbear</th>\n",
" <td></td>\n",
" <td>1</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>glance</th>\n",
" <td></td>\n",
" <td>1</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>hesitate</th>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td>1</td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>hold</th>\n",
" <td></td>\n",
" <td></td>\n",
" <td>1</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>hop</th>\n",
" <td></td>\n",
" <td>1</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>insist</th>\n",
" <td></td>\n",
" <td>2</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>insult</th>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td>1</td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>intrust</th>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td>1</td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>lead</th>\n",
" <td>1</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>mistake</th>\n",
" <td></td>\n",
" <td>1</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>niggers</th>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td>1</td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>nod</th>\n",
" <td></td>\n",
" <td>1</td>\n",
" <td></td>\n",
" <td>1</td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>parry</th>\n",
" <td></td>\n",
" <td>1</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>persist</th>\n",
" <td></td>\n",
" <td>1</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>piece</th>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td>1</td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>place</th>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td>1</td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>recover</th>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td>1</td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>reflect</th>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>rejoin</th>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td></td>\n",
" <td>1</td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>relapse</th>\n",
" <td>1</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>rub</th>\n",
" <td></td>\n",
" <td>1</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>rush</th>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td>2</td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>sink</th>\n",
" <td></td>\n",
" <td>1</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>slip</th>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td>1</td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>spring</th>\n",
" <td></td>\n",
" <td>2</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>stoop</th>\n",
" <td>1</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>suggest</th>\n",
" <td></td>\n",
" <td>1</td>\n",
" <td></td>\n",
" <td>1</td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>throw</th>\n",
" <td></td>\n",
" <td></td>\n",
" <td>1</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>tie</th>\n",
" <td></td>\n",
" <td>1</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>welcome</th>\n",
" <td></td>\n",
" <td></td>\n",
" <td>1</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>whisper</th>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td>1</td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>win</th>\n",
" <td></td>\n",
" <td>1</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>wrap</th>\n",
" <td></td>\n",
" <td>1</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" amy eloise lady hammergallow mandy ann mrs jehoram mrs mendham\n",
"' 1 \n",
"add 1\n",
"arrange 1 \n",
"assent 1 \n",
"boast 1 \n",
"borrow 1 \n",
"chat 1 \n",
"chime 1 \n",
"choke 1 \n",
"continue 1 1 \n",
"decide 1 \n",
"decline 2 \n",
"demur 1 \n",
"design 1 \n",
"disappear 1 \n",
"drag 1 \n",
"draw 1 3 \n",
"droop 1 \n",
"drop 1 \n",
"exchange 1 \n",
"exclaim 1 3 1 1 \n",
"exercise 1 \n",
"forbear 1 \n",
"glance 1 \n",
"hesitate 1 \n",
"hold 1 \n",
"hop 1 \n",
"insist 2 \n",
"insult 1 \n",
"intrust 1 \n",
"lead 1 \n",
"mistake 1 \n",
"niggers 1 \n",
"nod 1 1 \n",
"parry 1 \n",
"persist 1 \n",
"piece 1 \n",
"place 1 \n",
"recover 1 \n",
"reflect 1\n",
"rejoin 1 2 1 \n",
"relapse 1 \n",
"rub 1 \n",
"rush 2 \n",
"sink 1 \n",
"slip 1 \n",
"spring 2 \n",
"stoop 1 \n",
"suggest 1 1 \n",
"throw 1 \n",
"tie 1 \n",
"welcome 1 \n",
"whisper 1 \n",
"win 1 \n",
"wrap 1 "
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# NOTE(review): this re-declares the function already defined in the male-characters cell;\n",
"# the later definition silently shadows the earlier one. Consider defining it once.\n",
"def get_interesting_contexts_gender(novels,rels,num_characters,verb_stop,gender):\n",
"    \"\"\"Tally, per character name, the lemmas of the verbs heading that character's mentions.\n",
"\n",
"    For every document in ``novels`` the entities in ``.ents`` are scanned. A\n",
"    mention is kept when ``gender_guess`` on its stripped, lower-cased text\n",
"    equals ``gender``, the text is in ``get_main_characters(doc, num_characters)``,\n",
"    its label is ``'PERSON'``, the head of its root token is tagged ``'VERB'``,\n",
"    and the root's dependency label is in ``rels``. Verbs whose lemma is in\n",
"    ``get_pos_in(doc, 'VERB', verb_stop)`` are skipped.\n",
"\n",
"    Returns a ``defaultdict(Counter)``: name -> Counter of verb lemmas.\n",
"    \"\"\"\n",
"    verb_counts = defaultdict(Counter)\n",
"    for doc in novels:\n",
"        cast = get_main_characters(doc,num_characters)\n",
"        ignored_verbs = get_pos_in(doc,'VERB',verb_stop)\n",
"        for mention in doc.ents:\n",
"            name = mention.text.strip().lower()\n",
"            # Guard clauses, evaluated in the same order as the original chained test.\n",
"            if gender_guess(name,gender_map) != gender:\n",
"                continue\n",
"            if name not in cast:\n",
"                continue\n",
"            if mention.label_ != 'PERSON':\n",
"                continue\n",
"            governor = mention.root.head\n",
"            if governor.pos_ != 'VERB':\n",
"                continue\n",
"            if mention.root.dep_ not in rels:\n",
"                continue\n",
"            if governor.lemma_ in ignored_verbs:\n",
"                continue\n",
"            verb_counts[name][governor.lemma_] += 1\n",
"    return verb_counts\n",
"# A list (not a set) keeps the novels in a fixed order, so the result is built the same way on every run.\n",
"novels = [parsed_novel1,parsed_novel2]\n",
"number_of_characters_per_text = 7\n",
"target_rels = {'nsubj','dobj'}\n",
"# Handed to get_pos_in as its third argument; presumably the number of most frequent verbs to ignore -- confirm.\n",
"verb_stop=1000\n",
"target_contexts = get_interesting_contexts_gender(novels,target_rels,number_of_characters_per_text,verb_stop,'female')\n",
"# Show an empty cell instead of NaN for (verb, character) pairs that never co-occurred.\n",
"display(pd.DataFrame.from_dict(target_contexts).fillna(''))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 1
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment