aes512/GoogleEntities.ipynb

## GoogleEntities.ipynb
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Google Entity Extractor from Search Results\n",
    "\n",
    "_Extract Entities from Search Results using Google NLP for Keyword Research Opportunities_"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Library imports and initializations"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 322,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from goose3 import Goose\n",
    "import pandas as pd\n",
    "import argparse\n",
    "import os\n",
    "\n",
    "from google.cloud import language\n",
    "from google.cloud.language import enums\n",
    "from google.cloud.language import types\n",
    "\n",
    "client = language.LanguageServiceClient()\n",
    "g = Goose({'browser_user_agent': 'Mozilla'})"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Functions:\n",
    " \n",
    " **clean_text**: extracts non-boilerplate body text\n",
    " \n",
    " **analyze_entities**: extracts entities from a document using Google API\n",
    " \n",
    " **entity_create_list**: creates a list of entity values for use with lambda function\n",
    " \n",
    " **create_score**: creates a simple score for use with lambda function\n",
    " "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 323,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def clean_text(url):\n",
    "    \"\"\"Return the page title and cleaned body text extracted from ``url``.\n",
    "\n",
    "    Extraction is done with the module-level Goose instance ``g``. It is\n",
    "    best-effort: when extraction fails, the failure is reported on stdout and\n",
    "    a single space is returned so the caller can carry on with the other URLs.\n",
    "    \"\"\"\n",
    "    try:\n",
    "        article = g.extract(url=url)\n",
    "        return article.title + \" \" + article.cleaned_text\n",
    "    except Exception as exc:\n",
    "        # Exception (not a bare except) so that Ctrl-C / kernel interrupts still\n",
    "        # stop a long crawl; str(url) keeps the report itself from failing on a\n",
    "        # non-string value.\n",
    "        print(\"Error on \" + str(url) + \": \" + repr(exc))\n",
    "        return \" \"\n",
    "    \n",
    "def analyze_entities(text, encoding='UTF32'):\n",
    "    \"\"\"Run Google Cloud Natural Language entity analysis on ``text``.\n",
    "\n",
    "    text: plain-text content to analyse; the document is declared as English.\n",
    "    encoding: value forwarded to the API as ``encoding_type`` (default 'UTF32').\n",
    "\n",
    "    Returns the response object produced by ``client.analyze_entities``.\n",
    "    \"\"\"\n",
    "    document = language.types.Document(content=text, language='en', type='PLAIN_TEXT')\n",
    "    # Forward the caller's choice; the argument used to be ignored in favour of\n",
    "    # a hard-coded 'UTF32'.\n",
    "    response = client.analyze_entities(document=document, encoding_type=encoding)\n",
    "    return response\n",
    "\n",
    "def entity_create_list(text):\n",
    "    \"\"\"Return the entities found in ``text`` as a list of dicts.\n",
    "\n",
    "    Each dict has the keys 'name', 'salience' and 'entity_type'; the last one\n",
    "    is the bare enum member name (for example 'PERSON').\n",
    "    \"\"\"\n",
    "    entity_list = []\n",
    "    entities = analyze_entities(text)\n",
    "    for x in entities.entities:\n",
    "        # Use the enum's .name instead of str(...).strip('Type.'): strip() removes\n",
    "        # a *set of characters* from both ends, so it also ate the trailing 'T'\n",
    "        # of EVENT and WORK_OF_ART.\n",
    "        entity_type = enums.Entity.Type(x.type).name\n",
    "        entity_list.append({\"name\": x.name, \"salience\": x.salience, \"entity_type\": entity_type})\n",
    "    return entity_list\n",
    "\n",
    "def create_score(x, y):\n",
    "    \"\"\"Return the product of ``x`` and ``y``, used as a simple entity score.\"\"\"\n",
    "    score = x * y\n",
    "    return score"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## User inputs\n",
    "\n",
    "**Note:** *input_csv* represents SERP data and must contain the `URL` and `Page` columns read below (alongside the rank data). It should be replaced by the rank tracking API of your choice."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 329,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Ranking export to read, and destination of the aggregated entity table.\n",
    "input_csv = \"martin-guitars.csv\"\n",
    "output_csv = \"C:/Users/username/Documents/martin-guitars-entities.csv\"\n",
    "# Only fall back to this machine-specific key file when the environment does not\n",
    "# already name one, so credentials configured outside the notebook are respected.\n",
    "os.environ.setdefault(\"GOOGLE_APPLICATION_CREDENTIALS\", \"C:/Users/username/My Project-75bf3613e7b0.json\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Import ranking data from CSV, reduce to page 1 results only, extract body text, and add to dataframe"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 325,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Converting https://www.martinguitar.com/\n",
      "Converting https://en.wikipedia.org/wiki/C._F._Martin_%26_Company\n",
      "Converting https://spinditty.com/instruments-gear/The-Most-Popular-Guitar-Brand-Used-By-Popular-Musicians-And-Performers\n",
      "Converting https://www.musicradar.com/news/the-10-best-high-end-acoustic-guitars-the-best-guitars-for-experts-and-pro-players\n",
      "Converting https://www.martinguitar.com/about/\n",
      "Converting https://en.m.wikipedia.org/wiki/C._F._Martin_%26_Company\n",
      "Converting https://www.guitarcenter.com/Martin/Acoustic-Guitars.gc\n",
      "Converting https://www.guitarcenter.com/Martin/Guitars.gc\n",
      "Adding to dataframe https://www.martinguitar.com/\n",
      "Adding to dataframe https://en.wikipedia.org/wiki/C._F._Martin_%26_Company\n",
      "Adding to dataframe https://spinditty.com/instruments-gear/The-Most-Popular-Guitar-Brand-Used-By-Popular-Musicians-And-Performers\n",
      "Adding to dataframe https://www.musicradar.com/news/the-10-best-high-end-acoustic-guitars-the-best-guitars-for-experts-and-pro-players\n",
      "Adding to dataframe https://www.martinguitar.com/about/\n",
      "Adding to dataframe https://en.m.wikipedia.org/wiki/C._F._Martin_%26_Company\n",
      "Adding to dataframe https://www.guitarcenter.com/Martin/Acoustic-Guitars.gc\n",
      "Adding to dataframe https://www.guitarcenter.com/Martin/Guitars.gc\n"
     ]
    }
   ],
   "source": [
    "# Load the ranking export and keep only first-page results. The explicit copy\n",
    "# makes the filtered frame independent, so adding a column below cannot raise\n",
    "# pandas' SettingWithCopyWarning.\n",
    "df = pd.read_csv(input_csv)\n",
    "df = df.query('Page==1').copy()\n",
    "\n",
    "# Extract the cleaned body text of every ranking URL.\n",
    "text = []\n",
    "for url in df[\"URL\"]:\n",
    "    text.append(clean_text(url))\n",
    "    print(\"Converting \" + url)\n",
    "\n",
    "# Attach the whole column once; it used to be re-assigned on every loop pass.\n",
    "df[\"text\"] = text\n",
    "for x in df[\"URL\"]:\n",
    "    print(\"Adding to dataframe \" + x)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Add entities list to each row of dataframe"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 327,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Extract entities for every page; each row receives a list of entity dicts.\n",
    "df['Entities'] = df.apply(lambda row: entity_create_list(row['text']), axis=1)\n",
    "# Flatten the per-page lists into a single list of entity records.\n",
    "all_entities = [\n",
    "    {\"name\": entity[\"name\"], \"salience\": entity[\"salience\"], \"entity_type\": entity[\"entity_type\"]}\n",
    "    for page_entities in df[\"Entities\"]\n",
    "    for entity in page_entities\n",
    "]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Create new dataframe for final output\n",
    "\n",
    "Creates the dataframe, aggregates the data, computes the score, sorts by score, and writes the result to CSV"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 330,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# One row per extracted entity mention.\n",
    "entity_df = pd.DataFrame.from_dict(all_entities)\n",
    "# Aggregate salience per (name, entity_type) pair.\n",
    "# NOTE(review): passing a dict of new-name -> function to a grouped column's agg()\n",
    "# appears to be deprecated/removed in newer pandas releases - confirm against the\n",
    "# installed pandas version.\n",
    "entity_df = entity_df.groupby([\"name\", 'entity_type'], as_index=False)[\"salience\"].agg({\"standev_salience\": \"std\", \"sum_salience\": \"sum\", \"median_salience\": \"median\", \"avg_salience\": \"mean\", \"count\": \"count\"}).sort_values(by=\"count\", ascending=False).reset_index()\n",
    "entity_df['score'] = entity_df.apply(lambda row: create_score(row['avg_salience'], row['count']), axis=1)\n",
    "# sort_values returns a new frame; keep it so the CSV really is ordered by score.\n",
    "entity_df = entity_df.sort_values(by=\"score\", ascending=False)\n",
    "\n",
    "entity_df.to_csv(output_csv)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
	{
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"# Google Entity Extractor from Search Results\n",
	"\n",
	"_Extract Entities from Search Results using Google NLP for Keyword Research Opportunities_"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Library imports and initializations"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 322,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"from goose3 import Goose\n",
	"import pandas as pd\n",
	"import argparse\n",
	"import os\n",
	"\n",
	"from google.cloud import language\n",
	"from google.cloud.language import enums\n",
	"from google.cloud.language import types\n",
	"\n",
	"client = language.LanguageServiceClient()\n",
	"g = Goose({'browser_user_agent': 'Mozilla'})"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"# Functions:\n",
	" \n",
	" clean_text: extracts non-boilerplate body text\n",
	" \n",
	" analyze_entities: extracts entities from a document using Google API\n",
	" \n",
	" entity_create_list: creates a list of entity values for use with lambda function\n",
	" \n",
	" create_score: creates a simple score for use with lambda function\n",
	" "
	]
	},
	{
	"cell_type": "code",
	"execution_count": 323,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"def clean_text(url):\n",
	"    \"\"\"Return the page title and cleaned body text extracted from ``url``.\n",
	"\n",
	"    Extraction is done with the module-level Goose instance ``g``. It is\n",
	"    best-effort: when extraction fails, the failure is reported on stdout and\n",
	"    a single space is returned so the caller can carry on with the other URLs.\n",
	"    \"\"\"\n",
	"    try:\n",
	"        article = g.extract(url=url)\n",
	"        return article.title + \" \" + article.cleaned_text\n",
	"    except Exception as exc:\n",
	"        # Exception (not a bare except) so that Ctrl-C / kernel interrupts still\n",
	"        # stop a long crawl; str(url) keeps the report itself from failing on a\n",
	"        # non-string value.\n",
	"        print(\"Error on \" + str(url) + \": \" + repr(exc))\n",
	"        return \" \"\n",
	" \n",
	"def analyze_entities(text, encoding='UTF32'):\n",
	"    \"\"\"Run Google Cloud Natural Language entity analysis on ``text``.\n",
	"\n",
	"    text: plain-text content to analyse; the document is declared as English.\n",
	"    encoding: value forwarded to the API as ``encoding_type`` (default 'UTF32').\n",
	"\n",
	"    Returns the response object produced by ``client.analyze_entities``.\n",
	"    \"\"\"\n",
	"    document = language.types.Document(content=text, language='en', type='PLAIN_TEXT')\n",
	"    # Forward the caller's choice; the argument used to be ignored in favour of\n",
	"    # a hard-coded 'UTF32'.\n",
	"    response = client.analyze_entities(document=document, encoding_type=encoding)\n",
	"    return response\n",
	"\n",
	"def entity_create_list(text):\n",
	"    \"\"\"Return the entities found in ``text`` as a list of dicts.\n",
	"\n",
	"    Each dict has the keys 'name', 'salience' and 'entity_type'; the last one\n",
	"    is the bare enum member name (for example 'PERSON').\n",
	"    \"\"\"\n",
	"    entity_list = []\n",
	"    entities = analyze_entities(text)\n",
	"    for x in entities.entities:\n",
	"        # Use the enum's .name instead of str(...).strip('Type.'): strip() removes\n",
	"        # a *set of characters* from both ends, so it also ate the trailing 'T'\n",
	"        # of EVENT and WORK_OF_ART.\n",
	"        entity_type = enums.Entity.Type(x.type).name\n",
	"        entity_list.append({\"name\": x.name, \"salience\": x.salience, \"entity_type\": entity_type})\n",
	"    return entity_list\n",
	"\n",
	"def create_score(x, y):\n",
	"    \"\"\"Return the product of ``x`` and ``y``, used as a simple entity score.\"\"\"\n",
	"    score = x * y\n",
	"    return score"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## User inputs\n",
	"\n",
	"Note: input_csv represents SERP data and must contain the `URL` and `Page` columns read below (alongside the rank data). It should be replaced by the rank tracking API of your choice."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 329,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"# Ranking export to read, and destination of the aggregated entity table.\n",
	"input_csv = \"martin-guitars.csv\"\n",
	"output_csv = \"C:/Users/username/Documents/martin-guitars-entities.csv\"\n",
	"# Only fall back to this machine-specific key file when the environment does not\n",
	"# already name one, so credentials configured outside the notebook are respected.\n",
	"os.environ.setdefault(\"GOOGLE_APPLICATION_CREDENTIALS\", \"C:/Users/username/My Project-75bf3613e7b0.json\")"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Import ranking data from CSV, reduce to page 1 results only, extract body text, and add to dataframe"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 325,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Converting https://www.martinguitar.com/\n",
	"Converting https://en.wikipedia.org/wiki/C._F._Martin_%26_Company\n",
	"Converting https://spinditty.com/instruments-gear/The-Most-Popular-Guitar-Brand-Used-By-Popular-Musicians-And-Performers\n",
	"Converting https://www.musicradar.com/news/the-10-best-high-end-acoustic-guitars-the-best-guitars-for-experts-and-pro-players\n",
	"Converting https://www.martinguitar.com/about/\n",
	"Converting https://en.m.wikipedia.org/wiki/C._F._Martin_%26_Company\n",
	"Converting https://www.guitarcenter.com/Martin/Acoustic-Guitars.gc\n",
	"Converting https://www.guitarcenter.com/Martin/Guitars.gc\n",
	"Adding to dataframe https://www.martinguitar.com/\n",
	"Adding to dataframe https://en.wikipedia.org/wiki/C._F._Martin_%26_Company\n",
	"Adding to dataframe https://spinditty.com/instruments-gear/The-Most-Popular-Guitar-Brand-Used-By-Popular-Musicians-And-Performers\n",
	"Adding to dataframe https://www.musicradar.com/news/the-10-best-high-end-acoustic-guitars-the-best-guitars-for-experts-and-pro-players\n",
	"Adding to dataframe https://www.martinguitar.com/about/\n",
	"Adding to dataframe https://en.m.wikipedia.org/wiki/C._F._Martin_%26_Company\n",
	"Adding to dataframe https://www.guitarcenter.com/Martin/Acoustic-Guitars.gc\n",
	"Adding to dataframe https://www.guitarcenter.com/Martin/Guitars.gc\n"
	]
	}
	],
	"source": [
	"# Load the ranking export and keep only first-page results. The explicit copy\n",
	"# makes the filtered frame independent, so adding a column below cannot raise\n",
	"# pandas' SettingWithCopyWarning.\n",
	"df = pd.read_csv(input_csv)\n",
	"df = df.query('Page==1').copy()\n",
	"\n",
	"# Extract the cleaned body text of every ranking URL.\n",
	"text = []\n",
	"for url in df[\"URL\"]:\n",
	"    text.append(clean_text(url))\n",
	"    print(\"Converting \" + url)\n",
	"\n",
	"# Attach the whole column once; it used to be re-assigned on every loop pass.\n",
	"df[\"text\"] = text\n",
	"for x in df[\"URL\"]:\n",
	"    print(\"Adding to dataframe \" + x)"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Add entities list to each row of dataframe"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 327,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"# Extract entities for every page; each row receives a list of entity dicts.\n",
	"df['Entities'] = df.apply(lambda row: entity_create_list(row['text']), axis=1)\n",
	"# Flatten the per-page lists into a single list of entity records. Written as a\n",
	"# comprehension so the nesting no longer depends on the collapsed indentation\n",
	"# that made the original nested loop a syntax error.\n",
	"all_entities = [\n",
	"    {\"name\": entity[\"name\"], \"salience\": entity[\"salience\"], \"entity_type\": entity[\"entity_type\"]}\n",
	"    for page_entities in df[\"Entities\"]\n",
	"    for entity in page_entities\n",
	"]"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"## Create new dataframe for final output\n",
	"\n",
	"Creates the dataframe, aggregates the data, computes the score, sorts by score, and writes the result to CSV"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 330,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"# One row per extracted entity mention.\n",
	"entity_df = pd.DataFrame.from_dict(all_entities)\n",
	"# Aggregate salience per (name, entity_type) pair.\n",
	"# NOTE(review): passing a dict of new-name -> function to a grouped column's agg()\n",
	"# appears to be deprecated/removed in newer pandas releases - confirm against the\n",
	"# installed pandas version.\n",
	"entity_df = entity_df.groupby([\"name\", 'entity_type'], as_index=False)[\"salience\"].agg({\"standev_salience\": \"std\", \"sum_salience\": \"sum\", \"median_salience\": \"median\", \"avg_salience\": \"mean\", \"count\": \"count\"}).sort_values(by=\"count\", ascending=False).reset_index()\n",
	"entity_df['score'] = entity_df.apply(lambda row: create_score(row['avg_salience'], row['count']), axis=1)\n",
	"# sort_values returns a new frame; keep it so the CSV really is ordered by score.\n",
	"entity_df = entity_df.sort_values(by=\"score\", ascending=False)\n",
	"\n",
	"entity_df.to_csv(output_csv)"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.6.2"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 2
	}