Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
newsgrab Example Analysis
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Analyzing newsgrab JSON Data with spaCy and pandas"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"[Data Preparation](#Data-Preparation)\n",
"\n",
"[Separate and Quanitfy Keyword Tokens](#Separate-and-Quantify-Keyword-Tokens)\n",
"\n",
"[Quantify Noun Chunks](#Quantify-Noun-Chunks)\n",
"\n",
"[Quantify Named Entities](#Quantify-Named-Entities)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Data Preparation"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Import the necessary libraries and load in each of the JSON files"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sb\n",
"import re\n",
"import spacy\n",
"from collections import Counter\n",
"\n",
"# Load English language pretrained model from spacy\n",
"nlp = spacy.load(\"en_core_web_sm\")\n",
"\n",
"# Set the dataframe row column widths to show full text\n",
"from pandas.io.json import json_normalize\n",
"pd.set_option('display.max_colwidth', None)\n",
"\n",
"with open('output_1.json',encoding=\"utf8\") as f1, open('output_2.json',encoding=\"utf8\") as f2, open('output_3.json',encoding=\"utf8\") as f3, open('output_special.json',encoding=\"utf8\") as f4:\n",
" list1 = json.load(f1)\n",
" list2 = json.load(f2)\n",
" list3 = json.load(f3)\n",
" list4 = json.load(f4)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Normalize the JSON files with pandas and return dataframe objects."
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"search_data1 = pd.json_normalize(list1, record_path='results', meta='search_term',\n",
" record_prefix='results')\n",
"search_data2 = pd.json_normalize(list2, record_path='results', meta='search_term',\n",
" record_prefix='results')\n",
"search_data3 = pd.json_normalize(list3, record_path='results', meta='search_term',\n",
" record_prefix='results')\n",
"search_data4 = pd.json_normalize(list4, record_path='results', meta='search_term',\n",
" record_prefix='results')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Merge the four dataframes into one using pandas concatenate function."
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>results0</th>\n",
" <th>search_term</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Student from Ozark on Adelphi University Spring 2020 Dean’s List</td>\n",
" <td>adelphi university</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Kaitlyn Grant Named Among Adelphi University Spring 2020 ...</td>\n",
" <td>adelphi university</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Oxford's Emma Fogarty makes Adelphi University dean's list</td>\n",
" <td>adelphi university</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Wilton Resident Sophia Walsh Named to Adelphi University ...</td>\n",
" <td>adelphi university</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Local Students Named to Adelphi University Spring 2020 ...</td>\n",
" <td>adelphi university</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" results0 \\\n",
"0 Student from Ozark on Adelphi University Spring 2020 Dean’s List \n",
"1 Kaitlyn Grant Named Among Adelphi University Spring 2020 ... \n",
"2 Oxford's Emma Fogarty makes Adelphi University dean's list \n",
"3 Wilton Resident Sophia Walsh Named to Adelphi University ... \n",
"4 Local Students Named to Adelphi University Spring 2020 ... \n",
"\n",
" search_term \n",
"0 adelphi university \n",
"1 adelphi university \n",
"2 adelphi university \n",
"3 adelphi university \n",
"4 adelphi university "
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"frames = [search_data1, search_data2, search_data3, search_data4]\n",
"df = pd.concat(frames)\n",
"df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Rename the results column to exclude the 0 and create a column with lowercase results output."
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"df = df.rename(columns = {\"results0\":\"results\"})\n",
"df['results_lower'] = df['results'].apply(lambda x: x.lower())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Use spaCy to get noun chunks, named entities, and tokens from the results text and append these to respective lists. Turn these into new columns."
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"noun_chunks = []\n",
"named_entity = []\n",
"tokens = []\n",
"\n",
"for doc in nlp.pipe(df['results_lower'].astype('unicode').values, batch_size=50,\n",
" n_process=5):\n",
" if doc.is_parsed:\n",
" noun_chunks.append([chunk.text for chunk in doc.noun_chunks])\n",
" named_entity.append([ent.text for ent in doc.ents])\n",
" tokens.append([token.text for token in doc if not token.is_stop and not token.is_punct])\n",
" else:\n",
" noun_chunks.append(None)\n",
" named_entity.append(None) \n",
" tokens.append(None)\n",
" \n",
"df['results_noun_chunks'] = noun_chunks\n",
"df['results_named_entities'] = named_entity\n",
"df['results_tokens_clean'] = tokens"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Separate and Quantify Keyword Tokens"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Clean the output from the tokens column in its entirety by flattening the list, remove special characters, and remove line breaks and the leftovers of ampersands. Then, put in a string list and use the Counter from Python's build-in collections. Get a sample of the 15 most common words."
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[('college', 289), ('new', 159), ('university', 116), ('2020', 93), ('students', 87), ('suny', 86), ('covid19', 67), ('school', 66), ('news', 63), ('fall', 63), ('colleges', 59), ('york', 53), ('president', 46), ('named', 44), ('coronavirus', 41)]\n"
]
}
],
"source": [
"flattened_list = [y for x in tokens for y in x]\n",
"\n",
"#remove special characters from tokens list before analyzing\n",
"list_cleaned = [re.sub(r\"[^a-zA-Z0-9]\", \"\", file) for file in flattened_list]\n",
"\n",
"#remove the remainders of what was ampersands and paragraph breaks: \\n and amp\n",
"list_cleaned2 = [re.sub('\\n', \"\", file) for file in list_cleaned]\n",
"\n",
"list_cleaned3 = [re.sub('amp', \"\", file) for file in list_cleaned2]\n",
"\n",
"str_list = list(filter(None, list_cleaned3))\n",
"\n",
"word_freq = Counter(str_list)\n",
"common_words = word_freq.most_common(15)\n",
"print(common_words)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Create a copy of the token list to use in further processing."
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"new_str_list = str_list.copy()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Compile a list of words to exclude from the tokens list, using the original search document text. Read in the text file, split on the separator, then join on it to return a string, and replace the commas with a space."
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"#compile list of words to exclude from college list\n",
"list_names = open('ny_colleges.txt',encoding=\"utf8\").read().split(',')\n",
"str1 = ','.join(list_names)\n",
"other_str = str1.replace(',', ' ')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Tokenize the list of search terms to exclude using spaCy."
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"#must tokenize the list that was brought in\n",
"new_college_list = []\n",
"doc2 = nlp(other_str)\n",
"for token in doc2:\n",
" new_college_list.append(token.text)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Create a final list of tokens that exclude the search terms."
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"new_words = [word for word in new_str_list if word not in new_college_list]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Use the Counter from collections to quantify the frequency of each keyword."
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"word_freq2 = Counter(new_words)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Create a new dataframe for the keywords and their frequency counts. Sort from high to low count, reset the index, and rename the columns. Save the dataframe as a CSV if desired."
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"results_word_counts = pd.DataFrame.from_dict(word_freq2, orient='index',\n",
" columns=['keyword_count'])\n",
"results_words = results_word_counts.sort_values(by=['keyword_count'],ascending=False)\n",
"results_words.reset_index(inplace=True)\n",
"results_words.rename(columns = {\"index\":\"keyword\",\"keyword_count\":\"count\"}, inplace=True)\n",
"#results_words.to_csv('keyword_counts.csv', index=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Choose all keywords with a frequency greater than or equal to 10."
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>keyword</th>\n",
" <th>count</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>2020</td>\n",
" <td>93</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>students</td>\n",
" <td>87</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>covid19</td>\n",
" <td>67</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>news</td>\n",
" <td>63</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>fall</td>\n",
" <td>63</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" keyword count\n",
"0 2020 93\n",
"1 students 87\n",
"2 covid19 67\n",
"3 news 63\n",
"4 fall 63"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#from the above dataframe, choose all keywords with a frequency greater than or equal to 10\n",
"results_words = results_words.loc[results_words['count'] >= 10]\n",
"results_words.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Removed an unrecognizable word in the dataframe ('cus') after noticing it above."
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"results_words = results_words.loc[~results_words['keyword'].str.contains(\"cus\")]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Create a dataframe for the top 24 keywords and include both original columns."
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"results_words_top = results_words.iloc[0:24, 0:2]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Create a bar plot of the keyword data using seaborn and save the figure image, if desired."
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"ax = sb.barplot(x = results_words_top['count'],y = results_words_top['keyword'],palette=\"Blues_d\")\n",
"\n",
"sb.despine(fig=None, ax=None, top=True, right=True, left=False, trim=False)\n",
"sb.set(rc={'figure.figsize':(6,7.2)})\n",
"\n",
"ax.set_ylabel('') \n",
"ax.set_xlabel('')\n",
"ax.set_title('Keyword Counts for NY 4-Year College Headlines on 7/1/20', fontsize=18, fontweight='heavy')\n",
"sb.set(font_scale = 1.4)\n",
"ax.axes.get_xaxis().set_visible(True)\n",
"ax.set_frame_on(True)\n",
"\n",
"#all data that had over 22 frequency counts \n",
"#plt.savefig('keyword_countplot.jpg', bbox_inches=\"tight\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Quantify Noun Chunks"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Create a dataframe focusing on columns for noun chunks."
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>results</th>\n",
" <th>search_term</th>\n",
" <th>results_noun_chunks</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Student from Ozark on Adelphi University Spring 2020 Dean’s List</td>\n",
" <td>adelphi university</td>\n",
" <td>[student, ozark, adelphi university spring 2020 dean]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Kaitlyn Grant Named Among Adelphi University Spring 2020 ...</td>\n",
" <td>adelphi university</td>\n",
" <td>[kaitlyn grant, adelphi university spring]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Oxford's Emma Fogarty makes Adelphi University dean's list</td>\n",
" <td>adelphi university</td>\n",
" <td>[oxford's emma fogarty, adelphi university dean's list]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Wilton Resident Sophia Walsh Named to Adelphi University ...</td>\n",
" <td>adelphi university</td>\n",
" <td>[wilton resident sophia walsh, university]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Local Students Named to Adelphi University Spring 2020 ...</td>\n",
" <td>adelphi university</td>\n",
" <td>[local students, university spring]</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" results \\\n",
"0 Student from Ozark on Adelphi University Spring 2020 Dean’s List \n",
"1 Kaitlyn Grant Named Among Adelphi University Spring 2020 ... \n",
"2 Oxford's Emma Fogarty makes Adelphi University dean's list \n",
"3 Wilton Resident Sophia Walsh Named to Adelphi University ... \n",
"4 Local Students Named to Adelphi University Spring 2020 ... \n",
"\n",
" search_term results_noun_chunks \n",
"0 adelphi university [student, ozark, adelphi university spring 2020 dean] \n",
"1 adelphi university [kaitlyn grant, adelphi university spring] \n",
"2 adelphi university [oxford's emma fogarty, adelphi university dean's list] \n",
"3 adelphi university [wilton resident sophia walsh, university] \n",
"4 adelphi university [local students, university spring] "
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"nouns = df[['results','search_term','results_noun_chunks']].copy()\n",
"nouns.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Save the noun chunks column to a string type and create a new column that will be separated. Use the explode function from pandas to separate the items in the sep column."
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>results</th>\n",
" <th>search_term</th>\n",
" <th>results_noun_chunks</th>\n",
" <th>sep</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Student from Ozark on Adelphi University Spring 2020 Dean’s List</td>\n",
" <td>adelphi university</td>\n",
" <td>['student', 'ozark', 'adelphi university spring 2020 dean']</td>\n",
" <td>['student'</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Student from Ozark on Adelphi University Spring 2020 Dean’s List</td>\n",
" <td>adelphi university</td>\n",
" <td>['student', 'ozark', 'adelphi university spring 2020 dean']</td>\n",
" <td>'ozark'</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Student from Ozark on Adelphi University Spring 2020 Dean’s List</td>\n",
" <td>adelphi university</td>\n",
" <td>['student', 'ozark', 'adelphi university spring 2020 dean']</td>\n",
" <td>'adelphi university spring 2020 dean']</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Kaitlyn Grant Named Among Adelphi University Spring 2020 ...</td>\n",
" <td>adelphi university</td>\n",
" <td>['kaitlyn grant', 'adelphi university spring']</td>\n",
" <td>['kaitlyn grant'</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Kaitlyn Grant Named Among Adelphi University Spring 2020 ...</td>\n",
" <td>adelphi university</td>\n",
" <td>['kaitlyn grant', 'adelphi university spring']</td>\n",
" <td>'adelphi university spring']</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" results \\\n",
"0 Student from Ozark on Adelphi University Spring 2020 Dean’s List \n",
"1 Student from Ozark on Adelphi University Spring 2020 Dean’s List \n",
"2 Student from Ozark on Adelphi University Spring 2020 Dean’s List \n",
"3 Kaitlyn Grant Named Among Adelphi University Spring 2020 ... \n",
"4 Kaitlyn Grant Named Among Adelphi University Spring 2020 ... \n",
"\n",
" search_term \\\n",
"0 adelphi university \n",
"1 adelphi university \n",
"2 adelphi university \n",
"3 adelphi university \n",
"4 adelphi university \n",
"\n",
" results_noun_chunks \\\n",
"0 ['student', 'ozark', 'adelphi university spring 2020 dean'] \n",
"1 ['student', 'ozark', 'adelphi university spring 2020 dean'] \n",
"2 ['student', 'ozark', 'adelphi university spring 2020 dean'] \n",
"3 ['kaitlyn grant', 'adelphi university spring'] \n",
"4 ['kaitlyn grant', 'adelphi university spring'] \n",
"\n",
" sep \n",
"0 ['student' \n",
"1 'ozark' \n",
"2 'adelphi university spring 2020 dean'] \n",
"3 ['kaitlyn grant' \n",
"4 'adelphi university spring'] "
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"nouns['results_noun_chunks'] = nouns['results_noun_chunks'].astype(str)\n",
"nouns['sep'] = nouns['results_noun_chunks'].str.split(pat=',')\n",
"nouns = nouns.explode('sep').reset_index(drop=True)\n",
"nouns.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Remove bracket and quotation characters from the sep column incrementally, then remove the columns used for cleaning, and rename the final version."
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>results</th>\n",
" <th>search_term</th>\n",
" <th>results_noun_chunks</th>\n",
" <th>noun_segments</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Student from Ozark on Adelphi University Spring 2020 Dean’s List</td>\n",
" <td>adelphi university</td>\n",
" <td>['student', 'ozark', 'adelphi university spring 2020 dean']</td>\n",
" <td>student</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Student from Ozark on Adelphi University Spring 2020 Dean’s List</td>\n",
" <td>adelphi university</td>\n",
" <td>['student', 'ozark', 'adelphi university spring 2020 dean']</td>\n",
" <td>ozark</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Student from Ozark on Adelphi University Spring 2020 Dean’s List</td>\n",
" <td>adelphi university</td>\n",
" <td>['student', 'ozark', 'adelphi university spring 2020 dean']</td>\n",
" <td>adelphi university spring 2020 dean</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Kaitlyn Grant Named Among Adelphi University Spring 2020 ...</td>\n",
" <td>adelphi university</td>\n",
" <td>['kaitlyn grant', 'adelphi university spring']</td>\n",
" <td>kaitlyn grant</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Kaitlyn Grant Named Among Adelphi University Spring 2020 ...</td>\n",
" <td>adelphi university</td>\n",
" <td>['kaitlyn grant', 'adelphi university spring']</td>\n",
" <td>adelphi university spring</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" results \\\n",
"0 Student from Ozark on Adelphi University Spring 2020 Dean’s List \n",
"1 Student from Ozark on Adelphi University Spring 2020 Dean’s List \n",
"2 Student from Ozark on Adelphi University Spring 2020 Dean’s List \n",
"3 Kaitlyn Grant Named Among Adelphi University Spring 2020 ... \n",
"4 Kaitlyn Grant Named Among Adelphi University Spring 2020 ... \n",
"\n",
" search_term \\\n",
"0 adelphi university \n",
"1 adelphi university \n",
"2 adelphi university \n",
"3 adelphi university \n",
"4 adelphi university \n",
"\n",
" results_noun_chunks \\\n",
"0 ['student', 'ozark', 'adelphi university spring 2020 dean'] \n",
"1 ['student', 'ozark', 'adelphi university spring 2020 dean'] \n",
"2 ['student', 'ozark', 'adelphi university spring 2020 dean'] \n",
"3 ['kaitlyn grant', 'adelphi university spring'] \n",
"4 ['kaitlyn grant', 'adelphi university spring'] \n",
"\n",
" noun_segments \n",
"0 student \n",
"1 ozark \n",
"2 adelphi university spring 2020 dean \n",
"3 kaitlyn grant \n",
"4 adelphi university spring "
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"nouns['sep2'] = nouns['sep'].str.replace(\"'\", '')\n",
"nouns['sep3'] = nouns['sep2'].str.replace(\"[\",'')\n",
"nouns['sep4'] = nouns['sep3'].str.replace(\"]\", '')\n",
"nouns.drop(columns = ['sep','sep2','sep3'], inplace=True)\n",
"nouns = nouns.rename(columns = {\"sep4\":\"noun_segments\"})\n",
"nouns.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Strip the noun chunks of any preceding spaces, then return all results not equal to 'amp' from leftover ampersands."
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"nouns['noun_segments'] = nouns['noun_segments'].map(lambda x:x.lstrip(' '))\n",
"nouns = nouns.loc[nouns['noun_segments'] != 'amp']"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Create a variable for the value counts of the noun segments, convert it to a dictionary, and then map a column in the dataframe to specify the frequency of each noun chunk."
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>results</th>\n",
" <th>search_term</th>\n",
" <th>results_noun_chunks</th>\n",
" <th>noun_segments</th>\n",
" <th>noun_count</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Student from Ozark on Adelphi University Spring 2020 Dean’s List</td>\n",
" <td>adelphi university</td>\n",
" <td>['student', 'ozark', 'adelphi university spring 2020 dean']</td>\n",
" <td>student</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Student from Ozark on Adelphi University Spring 2020 Dean’s List</td>\n",
" <td>adelphi university</td>\n",
" <td>['student', 'ozark', 'adelphi university spring 2020 dean']</td>\n",
" <td>ozark</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Student from Ozark on Adelphi University Spring 2020 Dean’s List</td>\n",
" <td>adelphi university</td>\n",
" <td>['student', 'ozark', 'adelphi university spring 2020 dean']</td>\n",
" <td>adelphi university spring 2020 dean</td>\n",
" <td>2</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Kaitlyn Grant Named Among Adelphi University Spring 2020 ...</td>\n",
" <td>adelphi university</td>\n",
" <td>['kaitlyn grant', 'adelphi university spring']</td>\n",
" <td>kaitlyn grant</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Kaitlyn Grant Named Among Adelphi University Spring 2020 ...</td>\n",
" <td>adelphi university</td>\n",
" <td>['kaitlyn grant', 'adelphi university spring']</td>\n",
" <td>adelphi university spring</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" results \\\n",
"0 Student from Ozark on Adelphi University Spring 2020 Dean’s List \n",
"1 Student from Ozark on Adelphi University Spring 2020 Dean’s List \n",
"2 Student from Ozark on Adelphi University Spring 2020 Dean’s List \n",
"3 Kaitlyn Grant Named Among Adelphi University Spring 2020 ... \n",
"4 Kaitlyn Grant Named Among Adelphi University Spring 2020 ... \n",
"\n",
" search_term \\\n",
"0 adelphi university \n",
"1 adelphi university \n",
"2 adelphi university \n",
"3 adelphi university \n",
"4 adelphi university \n",
"\n",
" results_noun_chunks \\\n",
"0 ['student', 'ozark', 'adelphi university spring 2020 dean'] \n",
"1 ['student', 'ozark', 'adelphi university spring 2020 dean'] \n",
"2 ['student', 'ozark', 'adelphi university spring 2020 dean'] \n",
"3 ['kaitlyn grant', 'adelphi university spring'] \n",
"4 ['kaitlyn grant', 'adelphi university spring'] \n",
"\n",
" noun_segments noun_count \n",
"0 student 3 \n",
"1 ozark 1 \n",
"2 adelphi university spring 2020 dean 2 \n",
"3 kaitlyn grant 1 \n",
"4 adelphi university spring 1 "
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"z = nouns['noun_segments'].value_counts()\n",
"z1 = z.to_dict()\n",
"nouns['noun_count'] = nouns['noun_segments'].map(z1)\n",
"nouns.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Create a dataframe with the noun segments sorted by frequency count. Then, create another with only the noun chunks and their respective counts. Drop any duplicates and blanks, then reset the index. Inspect the dataframe and create a CSV, if desired."
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>noun_segments</th>\n",
" <th>noun_count</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>students</td>\n",
" <td>34</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>college</td>\n",
" <td>26</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>colleges</td>\n",
" <td>22</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>what</td>\n",
" <td>20</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>campus</td>\n",
" <td>20</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>coronavirus</td>\n",
" <td>19</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>university</td>\n",
" <td>15</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>sports</td>\n",
" <td>15</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>it</td>\n",
" <td>15</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>covid-19</td>\n",
" <td>15</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>who</td>\n",
" <td>14</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>jobs</td>\n",
" <td>14</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>universities</td>\n",
" <td>12</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>you</td>\n",
" <td>12</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>fall semester</td>\n",
" <td>12</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>cuny</td>\n",
" <td>11</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>the class</td>\n",
" <td>10</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>we</td>\n",
" <td>10</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>new york</td>\n",
" <td>10</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>art</td>\n",
" <td>9</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20</th>\n",
" <td>pandemic</td>\n",
" <td>9</td>\n",
" </tr>\n",
" <tr>\n",
" <th>21</th>\n",
" <td>fall</td>\n",
" <td>9</td>\n",
" </tr>\n",
" <tr>\n",
" <th>22</th>\n",
" <td>diversity</td>\n",
" <td>8</td>\n",
" </tr>\n",
" <tr>\n",
" <th>23</th>\n",
" <td>\"deans list\"</td>\n",
" <td>8</td>\n",
" </tr>\n",
" <tr>\n",
" <th>24</th>\n",
" <td>the fall</td>\n",
" <td>7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25</th>\n",
" <td>i</td>\n",
" <td>7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>26</th>\n",
" <td>equity</td>\n",
" <td>7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>27</th>\n",
" <td>class</td>\n",
" <td>7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>28</th>\n",
" <td>college notes</td>\n",
" <td>7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>29</th>\n",
" <td>person</td>\n",
" <td>7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>30</th>\n",
" <td>june</td>\n",
" <td>7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>31</th>\n",
" <td>| news</td>\n",
" <td>6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>32</th>\n",
" <td>us</td>\n",
" <td>6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>33</th>\n",
" <td>president</td>\n",
" <td>6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>34</th>\n",
" <td>july</td>\n",
" <td>6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>35</th>\n",
" <td>community</td>\n",
" <td>6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>36</th>\n",
" <td>ny</td>\n",
" <td>6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>37</th>\n",
" <td>new york college students</td>\n",
" <td>6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>38</th>\n",
" <td>purchase college</td>\n",
" <td>6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>39</th>\n",
" <td>pace university</td>\n",
" <td>6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>40</th>\n",
" <td>america</td>\n",
" <td>6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>41</th>\n",
" <td>utica college</td>\n",
" <td>6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>42</th>\n",
" <td>petition</td>\n",
" <td>5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>43</th>\n",
" <td>cornell</td>\n",
" <td>5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>44</th>\n",
" <td>covid-19 pandemic</td>\n",
" <td>5</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" noun_segments noun_count\n",
"0 students 34\n",
"1 college 26\n",
"2 colleges 22\n",
"3 what 20\n",
"4 campus 20\n",
"5 coronavirus 19\n",
"6 university 15\n",
"7 sports 15\n",
"8 it 15\n",
"9 covid-19 15\n",
"10 who 14\n",
"11 jobs 14\n",
"12 universities 12\n",
"13 you 12\n",
"14 fall semester 12\n",
"15 cuny 11\n",
"16 the class 10\n",
"17 we 10\n",
"18 new york 10\n",
"19 art 9\n",
"20 pandemic 9\n",
"21 fall 9\n",
"22 diversity 8\n",
"23 \"deans list\" 8\n",
"24 the fall 7\n",
"25 i 7\n",
"26 equity 7\n",
"27 class 7\n",
"28 college notes 7\n",
"29 person 7\n",
"30 june 7\n",
"31 | news 6\n",
"32 us 6\n",
"33 president 6\n",
"34 july 6\n",
"35 community 6\n",
"36 ny 6\n",
"37 new york college students 6\n",
"38 purchase college 6\n",
"39 pace university 6\n",
"40 america 6\n",
"41 utica college 6\n",
"42 petition 5\n",
"43 cornell 5\n",
"44 covid-19 pandemic 5"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Rank noun chunks by frequency, keep one row per chunk, discard blank chunks,\n",
"# and renumber the rows so the index doubles as a zero-based rank.\n",
"sorted_df = nouns.sort_values(by='noun_count', ascending=False)\n",
"nouns_only = (\n",
"    sorted_df[['noun_segments', 'noun_count']]\n",
"    .drop_duplicates('noun_segments')\n",
"    .loc[lambda frame: frame['noun_segments'] != '']\n",
"    .reset_index(drop=True)\n",
")\n",
"nouns_only.head(45)\n",
"# Optional export of the ranked counts:\n",
"#nouns_only.to_csv('noun_counts.csv',index=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Choose the top 20 noun segments to use for a visual."
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"# iloc's end bound is exclusive, so 0:20 selects the 20 highest-ranked rows\n",
"# (0:19 returned only 19).\n",
"nouns_count_top = nouns_only.iloc[0:20, 0:2]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Create a bar plot of the noun chunk data using seaborn and save the figure image, if desired."
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 432x518.4 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Configure the theme first: seaborn settings only apply to figures created\n",
"# afterwards, so setting them after drawing would leave this chart's size and\n",
"# fonts dependent on whatever an earlier cell happened to configure.\n",
"sb.set(font_scale=1.8, rc={'figure.figsize': (6, 7.2)})\n",
"\n",
"ax = sb.barplot(x=nouns_count_top['noun_count'], y=nouns_count_top['noun_segments'], palette=\"Blues_d\")\n",
"\n",
"# Remove the top and right spines from the current axes.\n",
"sb.despine(top=True, right=True, left=False, trim=False)\n",
"\n",
"# Both axis labels are intentionally blank; the title carries the description.\n",
"ax.set_ylabel('')\n",
"ax.set_xlabel('')\n",
"ax.set_title('Noun Chunk Counts for NY 4-Year College Headlines on 7/1/20', fontsize=18, fontweight='heavy')\n",
"ax.axes.get_xaxis().set_visible(True)\n",
"ax.set_frame_on(True)\n",
"\n",
"#plt.savefig('noun_countplot.jpg', bbox_inches=\"tight\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Quantify Named Entities"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Create a dataframe for the named entities."
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>results</th>\n",
" <th>search_term</th>\n",
" <th>results_named_entities</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Student from Ozark on Adelphi University Spring 2020 Dean’s List</td>\n",
" <td>adelphi university</td>\n",
" <td>[adelphi university, 2020, dean]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Kaitlyn Grant Named Among Adelphi University Spring 2020 ...</td>\n",
" <td>adelphi university</td>\n",
" <td>[kaitlyn grant, adelphi university, spring 2020]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Oxford's Emma Fogarty makes Adelphi University dean's list</td>\n",
" <td>adelphi university</td>\n",
" <td>[oxford, fogarty, adelphi university dean's]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Wilton Resident Sophia Walsh Named to Adelphi University ...</td>\n",
" <td>adelphi university</td>\n",
" <td>[wilton, sophia walsh, adelphi university]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Local Students Named to Adelphi University Spring 2020 ...</td>\n",
" <td>adelphi university</td>\n",
" <td>[adelphi university, spring 2020]</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" results \\\n",
"0 Student from Ozark on Adelphi University Spring 2020 Dean’s List \n",
"1 Kaitlyn Grant Named Among Adelphi University Spring 2020 ... \n",
"2 Oxford's Emma Fogarty makes Adelphi University dean's list \n",
"3 Wilton Resident Sophia Walsh Named to Adelphi University ... \n",
"4 Local Students Named to Adelphi University Spring 2020 ... \n",
"\n",
" search_term results_named_entities \n",
"0 adelphi university [adelphi university, 2020, dean] \n",
"1 adelphi university [kaitlyn grant, adelphi university, spring 2020] \n",
"2 adelphi university [oxford, fogarty, adelphi university dean's] \n",
"3 adelphi university [wilton, sophia walsh, adelphi university] \n",
"4 adelphi university [adelphi university, spring 2020] "
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Take an independent copy of the columns needed for entity analysis so the\n",
"# clean-up steps below never modify `df`.\n",
"entity_columns = ['results', 'search_term', 'results_named_entities']\n",
"named = df.loc[:, entity_columns].copy()\n",
"named.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Perform cleaning. Save the named entities column as a string, split based on commas, then use the explode function from pandas and reset the index."
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>results</th>\n",
" <th>search_term</th>\n",
" <th>results_named_entities</th>\n",
" <th>named_ent</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Student from Ozark on Adelphi University Spring 2020 Dean’s List</td>\n",
" <td>adelphi university</td>\n",
" <td>['adelphi university', '2020', 'dean']</td>\n",
" <td>['adelphi university'</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Student from Ozark on Adelphi University Spring 2020 Dean’s List</td>\n",
" <td>adelphi university</td>\n",
" <td>['adelphi university', '2020', 'dean']</td>\n",
" <td>'2020'</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Student from Ozark on Adelphi University Spring 2020 Dean’s List</td>\n",
" <td>adelphi university</td>\n",
" <td>['adelphi university', '2020', 'dean']</td>\n",
" <td>'dean']</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Kaitlyn Grant Named Among Adelphi University Spring 2020 ...</td>\n",
" <td>adelphi university</td>\n",
" <td>['kaitlyn grant', 'adelphi university', 'spring 2020']</td>\n",
" <td>['kaitlyn grant'</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Kaitlyn Grant Named Among Adelphi University Spring 2020 ...</td>\n",
" <td>adelphi university</td>\n",
" <td>['kaitlyn grant', 'adelphi university', 'spring 2020']</td>\n",
" <td>'adelphi university'</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" results \\\n",
"0 Student from Ozark on Adelphi University Spring 2020 Dean’s List \n",
"1 Student from Ozark on Adelphi University Spring 2020 Dean’s List \n",
"2 Student from Ozark on Adelphi University Spring 2020 Dean’s List \n",
"3 Kaitlyn Grant Named Among Adelphi University Spring 2020 ... \n",
"4 Kaitlyn Grant Named Among Adelphi University Spring 2020 ... \n",
"\n",
" search_term results_named_entities \\\n",
"0 adelphi university ['adelphi university', '2020', 'dean'] \n",
"1 adelphi university ['adelphi university', '2020', 'dean'] \n",
"2 adelphi university ['adelphi university', '2020', 'dean'] \n",
"3 adelphi university ['kaitlyn grant', 'adelphi university', 'spring 2020'] \n",
"4 adelphi university ['kaitlyn grant', 'adelphi university', 'spring 2020'] \n",
"\n",
" named_ent \n",
"0 ['adelphi university' \n",
"1 '2020' \n",
"2 'dean'] \n",
"3 ['kaitlyn grant' \n",
"4 'adelphi university' "
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Turn each entity list into its text form, cut that text at every comma, and\n",
"# give each resulting fragment its own row (the other columns are repeated).\n",
"named = (\n",
"    named\n",
"    .assign(\n",
"        results_named_entities=lambda frame: frame['results_named_entities'].astype(str),\n",
"        named_ent=lambda frame: frame['results_named_entities'].str.split(pat=','),\n",
"    )\n",
"    .explode('named_ent')\n",
"    .reset_index(drop=True)\n",
")\n",
"named.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Remove bracket and quotation characters from the named_ent column incrementally, then remove the columns used for cleaning, and rename the final version."
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"# Each exploded fragment still carries list-repr residue: square brackets and\n",
"# the quotes wrapping the entity. Strip those wrappers from the ends only, so\n",
"# apostrophes inside an entity (e.g. dean's) survive and entities that Python\n",
"# rendered with double quotes are unwrapped as well. str.strip takes a plain\n",
"# set of characters, so the result does not depend on the pandas regex default.\n",
"named['named_entity'] = (\n",
"    named['named_ent']\n",
"    .str.strip()\n",
"    .str.strip('[]')\n",
"    .str.strip(\"'\" + '\"')\n",
")\n",
"\n",
"# The raw fragment column was only needed for cleaning.\n",
"named = named.drop(columns=['named_ent'])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Remove any leading spaces from the named entities, then drop every row whose named entity is the text 'nan'."
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"# Trim leading spaces from each entity, then drop rows whose value is the\n",
"# literal text 'nan'. The explicit copy makes `named` an independent frame, so\n",
"# adding columns to it afterwards does not trigger SettingWithCopyWarning.\n",
"named['named_entity'] = named['named_entity'].str.lstrip(' ')\n",
"named = named.loc[named['named_entity'] != 'nan'].copy()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Use spaCy to get the entity types for each named entity. Create a new column in the dataframe for these."
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
"# Label every cleaned entity string with the entity types spaCy finds in it.\n",
"# doc.ents is simply empty when nothing is recognised, so no parser-status\n",
"# check is needed (Doc.is_parsed reports dependency parsing rather than NER and\n",
"# is deprecated in spaCy 3).\n",
"entity_texts = named['named_entity'].astype(str).values\n",
"named_entity_type = [\n",
"    [ent.label_ for ent in doc.ents]\n",
"    for doc in nlp.pipe(entity_texts, batch_size=50, n_process=5)\n",
"]\n",
"\n",
"named['named_entities_type'] = named_entity_type"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>results</th>\n",
" <th>search_term</th>\n",
" <th>results_named_entities</th>\n",
" <th>named_entity</th>\n",
" <th>named_entities_type</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Student from Ozark on Adelphi University Spring 2020 Dean’s List</td>\n",
" <td>adelphi university</td>\n",
" <td>['adelphi university', '2020', 'dean']</td>\n",
" <td>adelphi university</td>\n",
" <td>[ORG]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Student from Ozark on Adelphi University Spring 2020 Dean’s List</td>\n",
" <td>adelphi university</td>\n",
" <td>['adelphi university', '2020', 'dean']</td>\n",
" <td>2020</td>\n",
" <td>[]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Student from Ozark on Adelphi University Spring 2020 Dean’s List</td>\n",
" <td>adelphi university</td>\n",
" <td>['adelphi university', '2020', 'dean']</td>\n",
" <td>dean</td>\n",
" <td>[ORG]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Kaitlyn Grant Named Among Adelphi University Spring 2020 ...</td>\n",
" <td>adelphi university</td>\n",
" <td>['kaitlyn grant', 'adelphi university', 'spring 2020']</td>\n",
" <td>kaitlyn grant</td>\n",
" <td>[ORG]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Kaitlyn Grant Named Among Adelphi University Spring 2020 ...</td>\n",
" <td>adelphi university</td>\n",
" <td>['kaitlyn grant', 'adelphi university', 'spring 2020']</td>\n",
" <td>adelphi university</td>\n",
" <td>[ORG]</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" results \\\n",
"0 Student from Ozark on Adelphi University Spring 2020 Dean’s List \n",
"1 Student from Ozark on Adelphi University Spring 2020 Dean’s List \n",
"2 Student from Ozark on Adelphi University Spring 2020 Dean’s List \n",
"3 Kaitlyn Grant Named Among Adelphi University Spring 2020 ... \n",
"4 Kaitlyn Grant Named Among Adelphi University Spring 2020 ... \n",
"\n",
" search_term results_named_entities \\\n",
"0 adelphi university ['adelphi university', '2020', 'dean'] \n",
"1 adelphi university ['adelphi university', '2020', 'dean'] \n",
"2 adelphi university ['adelphi university', '2020', 'dean'] \n",
"3 adelphi university ['kaitlyn grant', 'adelphi university', 'spring 2020'] \n",
"4 adelphi university ['kaitlyn grant', 'adelphi university', 'spring 2020'] \n",
"\n",
" named_entity named_entities_type \n",
"0 adelphi university [ORG] \n",
"1 2020 [] \n",
"2 dean [ORG] \n",
"3 kaitlyn grant [ORG] \n",
"4 adelphi university [ORG] "
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Preview the first five rows, including the entity-type column.\n",
"named.head(5)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Save the named entity types as strings, then explore the unique types in the dataframe."
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [],
"source": [
"# Store each list of entity types as its text form (e.g. \"['ORG']\").\n",
"named['named_entities_type'] = named['named_entities_type'].map(str)"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([\"['ORG']\", '[]', \"['PERSON']\", \"['ORDINAL']\", \"['DATE']\",\n",
" \"['GPE']\", \"['CARDINAL']\", \"['EVENT']\", \"['LOC']\", \"['MONEY']\",\n",
" \"['NORP']\", \"['PERCENT']\", \"['TIME']\", \"['QUANTITY']\",\n",
" \"['ORG', 'PERSON']\", \"['PERSON', 'PERSON']\", \"['WORK_OF_ART']\",\n",
" \"['ORG', 'GPE']\", \"['PERSON', 'ORG']\"], dtype=object)"
]
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Distinct entity-type strings, in order of first appearance.\n",
"pd.unique(named['named_entities_type'])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Create a variable for the value counts of the named entities, convert it to a dictionary, and then map a column in the dataframe to specify the frequency of each named entity."
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>results</th>\n",
" <th>search_term</th>\n",
" <th>results_named_entities</th>\n",
" <th>named_entity</th>\n",
" <th>named_entities_type</th>\n",
" <th>entity_count</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Student from Ozark on Adelphi University Spring 2020 Dean’s List</td>\n",
" <td>adelphi university</td>\n",
" <td>['adelphi university', '2020', 'dean']</td>\n",
" <td>adelphi university</td>\n",
" <td>['ORG']</td>\n",
" <td>6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Student from Ozark on Adelphi University Spring 2020 Dean’s List</td>\n",
" <td>adelphi university</td>\n",
" <td>['adelphi university', '2020', 'dean']</td>\n",
" <td>2020</td>\n",
" <td>[]</td>\n",
" <td>62</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Student from Ozark on Adelphi University Spring 2020 Dean’s List</td>\n",
" <td>adelphi university</td>\n",
" <td>['adelphi university', '2020', 'dean']</td>\n",
" <td>dean</td>\n",
" <td>['ORG']</td>\n",
" <td>12</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Kaitlyn Grant Named Among Adelphi University Spring 2020 ...</td>\n",
" <td>adelphi university</td>\n",
" <td>['kaitlyn grant', 'adelphi university', 'spring 2020']</td>\n",
" <td>kaitlyn grant</td>\n",
" <td>['ORG']</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Kaitlyn Grant Named Among Adelphi University Spring 2020 ...</td>\n",
" <td>adelphi university</td>\n",
" <td>['kaitlyn grant', 'adelphi university', 'spring 2020']</td>\n",
" <td>adelphi university</td>\n",
" <td>['ORG']</td>\n",
" <td>6</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" results \\\n",
"0 Student from Ozark on Adelphi University Spring 2020 Dean’s List \n",
"1 Student from Ozark on Adelphi University Spring 2020 Dean’s List \n",
"2 Student from Ozark on Adelphi University Spring 2020 Dean’s List \n",
"3 Kaitlyn Grant Named Among Adelphi University Spring 2020 ... \n",
"4 Kaitlyn Grant Named Among Adelphi University Spring 2020 ... \n",
"\n",
" search_term results_named_entities \\\n",
"0 adelphi university ['adelphi university', '2020', 'dean'] \n",
"1 adelphi university ['adelphi university', '2020', 'dean'] \n",
"2 adelphi university ['adelphi university', '2020', 'dean'] \n",
"3 adelphi university ['kaitlyn grant', 'adelphi university', 'spring 2020'] \n",
"4 adelphi university ['kaitlyn grant', 'adelphi university', 'spring 2020'] \n",
"\n",
" named_entity named_entities_type entity_count \n",
"0 adelphi university ['ORG'] 6 \n",
"1 2020 [] 62 \n",
"2 dean ['ORG'] 12 \n",
"3 kaitlyn grant ['ORG'] 1 \n",
"4 adelphi university ['ORG'] 6 "
]
},
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Count how often each entity occurs, then attach that frequency to every row\n",
"# by looking the row's entity up in the counts.\n",
"entity_frequencies = named['named_entity'].value_counts()\n",
"named['entity_count'] = named['named_entity'].map(entity_frequencies)\n",
"named.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Create a dataframe with the named entities sorted by frequency count. Then, create another with only the named entities and their respective counts. Drop any duplicates and blanks, then reset the index. Inspect the dataframe and create a CSV, if desired."
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>named_entity</th>\n",
" <th>named_entities_type</th>\n",
" <th>entity_count</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>2020</td>\n",
" <td>[]</td>\n",
" <td>62</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>suny</td>\n",
" <td>['ORG']</td>\n",
" <td>51</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>covid-19</td>\n",
" <td>[]</td>\n",
" <td>41</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>new york</td>\n",
" <td>['GPE']</td>\n",
" <td>20</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>first</td>\n",
" <td>['ORDINAL']</td>\n",
" <td>17</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" named_entity named_entities_type entity_count\n",
"0 2020 [] 62\n",
"1 suny ['ORG'] 51\n",
"2 covid-19 [] 41\n",
"3 new york ['GPE'] 20\n",
"4 first ['ORDINAL'] 17"
]
},
"execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Rank entities by frequency, keep one row per entity, discard blank entities,\n",
"# and renumber the rows so the index doubles as a zero-based rank.\n",
"sorted_df2 = named.sort_values(by='entity_count', ascending=False)\n",
"entity_only = (\n",
"    sorted_df2[['named_entity', 'named_entities_type', 'entity_count']]\n",
"    .drop_duplicates('named_entity')\n",
"    .loc[lambda frame: frame['named_entity'] != '']\n",
"    .reset_index(drop=True)\n",
")\n",
"entity_only.head()\n",
"# Optional export of the ranked counts:\n",
"#entity_only.to_csv('entity_counts.csv',index=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Create a dataframe with the top 30 named entities."
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [],
"source": [
"# iloc's end bound is exclusive, so 0:30 selects the 30 highest-ranked rows\n",
"# (0:29 returned only 29).\n",
"entity_top = entity_only.iloc[0:30, 0:3]"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>named_entity</th>\n",
" <th>named_entities_type</th>\n",
" <th>entity_count</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>2020</td>\n",
" <td>[]</td>\n",
" <td>62</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>suny</td>\n",
" <td>['ORG']</td>\n",
" <td>51</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>covid-19</td>\n",
" <td>[]</td>\n",
" <td>41</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>new york</td>\n",
" <td>['GPE']</td>\n",
" <td>20</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>first</td>\n",
" <td>['ORDINAL']</td>\n",
" <td>17</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>two</td>\n",
" <td>['CARDINAL']</td>\n",
" <td>13</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>brooklyn</td>\n",
" <td>['GPE']</td>\n",
" <td>12</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>jewish</td>\n",
" <td>['NORP']</td>\n",
" <td>12</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>dean</td>\n",
" <td>['ORG']</td>\n",
" <td>12</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>new york city</td>\n",
" <td>['GPE']</td>\n",
" <td>10</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>summer</td>\n",
" <td>[]</td>\n",
" <td>8</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>5</td>\n",
" <td>['CARDINAL']</td>\n",
" <td>8</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>u.s.</td>\n",
" <td>['GPE']</td>\n",
" <td>8</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>cornell</td>\n",
" <td>['PERSON']</td>\n",
" <td>8</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>manhattan</td>\n",
" <td>['GPE']</td>\n",
" <td>7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>10</td>\n",
" <td>['CARDINAL']</td>\n",
" <td>7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>annual</td>\n",
" <td>['DATE']</td>\n",
" <td>7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>three</td>\n",
" <td>['CARDINAL']</td>\n",
" <td>7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>five</td>\n",
" <td>['CARDINAL']</td>\n",
" <td>7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>utica college</td>\n",
" <td>[]</td>\n",
" <td>7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>20</th>\n",
" <td>niagara university</td>\n",
" <td>['ORG']</td>\n",
" <td>6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>21</th>\n",
" <td>yeshiva university</td>\n",
" <td>['ORG']</td>\n",
" <td>6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>22</th>\n",
" <td>15</td>\n",
" <td>['CARDINAL']</td>\n",
" <td>6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>23</th>\n",
" <td>syracuse university</td>\n",
" <td>['ORG']</td>\n",
" <td>6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>24</th>\n",
" <td>eight</td>\n",
" <td>['CARDINAL']</td>\n",
" <td>6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25</th>\n",
" <td>us</td>\n",
" <td>[]</td>\n",
" <td>6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>26</th>\n",
" <td>oswego county</td>\n",
" <td>['ORG']</td>\n",
" <td>6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>27</th>\n",
" <td>adelphi university</td>\n",
" <td>['ORG']</td>\n",
" <td>6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>28</th>\n",
" <td>clarkson university</td>\n",
" <td>['ORG']</td>\n",
" <td>6</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" named_entity named_entities_type entity_count\n",