{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "generate_stand_up_comedians_twitter_accounts_list.ipynb",
"provenance": [],
"collapsed_sections": [],
"authorship_tag": "ABX9TyNCAzHmIFPtd/yzTRBQ4qQM",
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/ViksaaSkool/5470e89d04a0dea8c1a9959633afdd2b/generate_stand_up_comedians_twitter_accounts_list.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "UVsdG3mN1ZhN",
"colab_type": "text"
},
"source": [
"**Install libraries** "
]
},
{
"cell_type": "code",
"metadata": {
"id": "iWUHgOMs-ZD8",
"colab_type": "code",
"colab": {}
},
"source": [
"!pip install google\n",
"!pip install beautifulsoup4\n",
"!pip install requests"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "NEF6Uy812gUD",
"colab_type": "text"
},
"source": [
"Import needed libraries"
]
},
{
"cell_type": "code",
"metadata": {
"id": "7moFsClM2WSt",
"colab_type": "code",
"colab": {}
},
"source": [
"from googlesearch import search\n",
"from google.colab import drive\n",
"import re\n",
"import pandas as pd\n",
"from time import sleep\n",
"import json\n",
"import csv\n",
"from bs4 import BeautifulSoup\n",
"import requests\n",
"import re\n",
"from functools import reduce\n"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "GbYtRyGG3_c0",
"colab_type": "text"
},
"source": [
"Mount Google Drive\n",
"\n"
]
},
{
"cell_type": "code",
"metadata": {
"id": "wyR0T5W84FFY",
"colab_type": "code",
"colab": {}
},
"source": [
"GOOGLE_DRIVE_ROOT = \"/content/drive\"\n",
"drive.mount(GOOGLE_DRIVE_ROOT)"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "Y-Dxd8lZ2m2m",
"colab_type": "text"
},
"source": [
"Define constants\n"
]
},
{
"cell_type": "code",
"metadata": {
"id": "mFw7Lw1W3qQX",
"colab_type": "code",
"colab": {}
},
"source": [
"GOOGLE_DRIVE_COLAB_PATH = f\"{GOOGLE_DRIVE_ROOT}/My Drive/Colab Notebooks/\"\n",
"\n",
"READ = 'r'\n",
"WRITE = 'w'\n",
"\n",
"STAND_UP_COMEDIANS_JSON_PATH = f\"{GOOGLE_DRIVE_COLAB_PATH}stand_up_comedians_twitter/comedians.json\"\n",
"STAND_UP_COMEDIANS_TWITTER_ACCOUNTS_PATH = f\"{GOOGLE_DRIVE_COLAB_PATH}stand_up_comedians_twitter/comedians_and_twitter_accounts.csv\"\n",
"\n",
"COMEDIAN_COLUMN = \"stand_up_comedian\"\n",
"COMEDIAN_TWITTER_COLUMN = \"twitter_handle\"\n",
"SLEEP_SEARCH_DELAY = 70 #value in seconds\n",
"USER_AGENT = 'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'\n",
"\n",
"WIKIPEDIA_BASE_URL = \"https://en.wikipedia.org/wiki/\"\n",
"\n",
"#wikipedia articles\n",
"UK = \"List_of_stand-up_comedians_from_the_United_Kingdom\"\n",
"AUSTRALIA = \"List_of_Australian_stand-up_comedians\"\n",
"CANADA = \"List_of_Canadian_stand-up_comedians\"\n",
"USA = \"List_of_United_States_stand-up_comedians\"\n",
"\n",
"COMEDIANS_USA = \"comedians_us\"\n",
"COMEDIANS_UK = \"comedians_uk\"\n",
"COMEDIANS_CA = \"comedians_ca\"\n",
"COMEDIANS_AU = \"comedians_au\"\n",
"\n",
"COUNTRIES = [(COMEDIANS_UK, UK), (COMEDIANS_AU, AUSTRALIA), (COMEDIANS_CA, CANADA), (COMEDIANS_USA, USA)]\n",
"\n",
"list_to_one_line_string = lambda x: str(x).replace(\"[\", \"\").replace(\"]\", \"\").replace(\"'\", \"\")\n"
],
"execution_count": 0,
"outputs": []
},
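{
"cell_type": "markdown",
"metadata": {},
"source": [
"Quick illustration of the `list_to_one_line_string` helper defined above (a minimal sketch; the names are placeholder strings, not scraped data):\n"
]
},
{
"cell_type": "code",
"metadata": {},
"source": [
"# illustrative only: the lambda flattens a Python list into a plain comma-separated string\n",
"print(list_to_one_line_string([\"Example Comedian A\", \"Example Comedian B\"]))  # Example Comedian A, Example Comedian B"
],
"execution_count": 0,
"outputs": []
},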
{
"cell_type": "markdown",
"metadata": {
"id": "hUsHb8dAYbn3",
"colab_type": "text"
},
"source": [
"retrieve comedians form Wikipedia aticles (list of comedians from country) "
]
},
{
"cell_type": "code",
"metadata": {
"id": "btbOyOboYkRv",
"colab_type": "code",
"colab": {}
},
"source": [
"def generate_comedians_json():\n",
" comedians_json = {}\n",
" for key, value_article in COUNTRIES:\n",
" comedians_json[key] = get_comedians_from_wiki(value_article)\n",
"\n",
" print(comedians_json)\n",
"\n",
" with open(STAND_UP_COMEDIANS_JSON_PATH, WRITE) as file:\n",
" json.dump(comedians_json, file)\n",
" \n",
"def get_comedians_from_wiki(comedians_from_article):\n",
" page_html = requests.get(f\"{WIKIPEDIA_BASE_URL}{comedians_from_article}\").text\n",
" wiki_soup = BeautifulSoup(page_html, 'html.parser')\n",
"\n",
" comedians = []\n",
" for a in wiki_soup.find_all('a'):\n",
" if a.get('title') == a.get_text():\n",
" comedians.append(a.get_text())\n",
"\n",
" return list_to_one_line_string(comedians) \n",
"\n",
"#method call to generate the .json file\n",
"generate_comedians_json()"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "wtc7HYMf_7eQ",
"colab_type": "text"
},
"source": [
"comedians.json - json of comedians retrieved form wikipedia in the following format:\n",
"\n"
]
},
{
"cell_type": "code",
"metadata": {
"id": "P43GUUUs6EjO",
"colab_type": "code",
"colab": {}
},
"source": [
"{\n",
" \"comedians_us\": \"list_of_comedians_from_USA_as_a_string\",\n",
" \"comedians_uk\": \"list_of_comedians_from_United_Kingdom_as_a_string\",\n",
" \"comedians_ca\": \"list_of_comedians_from_Canada_as_a_string\",\n",
" \"comedians_au\": \"list_of_comedians_from_Australia_as_a_string\"\n",
"}"
],
"execution_count": 0,
"outputs": []
},
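{
"cell_type": "markdown",
"metadata": {},
"source": [
"Optional sanity check (a minimal sketch; assumes the `generate_comedians_json()` cell above has already been run, so the file exists on Drive):\n"
]
},
{
"cell_type": "code",
"metadata": {},
"source": [
"# illustrative only: peek at the generated comedians.json\n",
"with open(STAND_UP_COMEDIANS_JSON_PATH, READ) as file:\n",
"  comedians_preview = json.load(file)\n",
"print(list(comedians_preview.keys()))  # key order follows COUNTRIES: uk, au, ca, us"
],
"execution_count": 0,
"outputs": []
},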
{
"cell_type": "markdown",
"metadata": {
"id": "ZWq1q5di6DKD",
"colab_type": "text"
},
"source": [
"convert the .json into .csv in with two columns: \"stand_up_comedian\" and \"twitter_handle\"\n"
]
},
{
"cell_type": "code",
"metadata": {
"id": "vuEIwsza4aBl",
"colab_type": "code",
"colab": {}
},
"source": [
"def load_data_file(file_path):\n",
" return open(file_path, READ)\n",
"\n",
"def create_comedians_csv():\n",
" comedians_json = json.load(load_data_file(STAND_UP_COMEDIANS_JSON_PATH)) \n",
" comedians_list = []\n",
"\n",
" for key, value in COUNTRIES:\n",
" comedians_list.append(comedians_json[key]) \n",
"\n",
" comedians_list = list_to_one_line_string(comedians_list).split(',') \n",
" print(comedians_list) \n",
"\n",
" with open(STAND_UP_COMEDIANS_TWITTER_ACCOUNTS_PATH, mode=WRITE) as csv_file:\n",
" fieldnames = [COMEDIAN_COLUMN, COMEDIAN_TWITTER_COLUMN]\n",
" writer = csv.DictWriter(csv_file, fieldnames=fieldnames)\n",
" writer.writeheader()\n",
" for comedian in comedians_list:\n",
" writer.writerow({COMEDIAN_COLUMN: comedian, COMEDIAN_TWITTER_COLUMN: ''})\n",
"\n",
"# method call\n",
"create_comedians_csv()"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "0mmTMeL07pdn",
"colab_type": "text"
},
"source": [
"do a google search in the format \"[comedian_name] [twitter]\" and filter results\n"
]
},
{
"cell_type": "code",
"metadata": {
"id": "7vwm56kR9NXF",
"colab_type": "code",
"colab": {}
},
"source": [
"def populate_stand_up_comedians_twitter_accounts():\n",
" comedians_df = pd.read_csv(STAND_UP_COMEDIANS_TWITTER_ACCOUNTS_PATH)\n",
"\n",
" for index, row in comedians_df.iterrows():\n",
" comedian = comedians_df.at[index, COMEDIAN_COLUMN]\n",
" twitter_account = str(comedians_df.at[index, COMEDIAN_TWITTER_COLUMN])\n",
" print(f\"{comedian}, {twitter_account}, is_blank = {twitter_account.lower() == 'nan'}, \")\n",
"\n",
" if twitter_account.lower() == \"nan\":\n",
" t_a = \"\"\n",
" for url in search(comedian, stop=1, domains=[\"twitter.com\"], user_agent=USER_AGENT):\n",
" print(url)\n",
" t_a = get_twitter_handle_from_url_(url)\n",
" print(f\"comedian = {comedian}, twitter_handle = {t_a}, index = {index}, comedians_df.at = {comedians_df.at[index, COMEDIAN_TWITTER_COLUMN]}\")\n",
" comedians_df.loc[index, COMEDIAN_TWITTER_COLUMN] = t_a\n",
" print(f\"comedians_df.loc[{index}] = {comedians_df.loc[index, COMEDIAN_TWITTER_COLUMN]}\")\n",
" comedians_twitter_accounts = comedians_df[[COMEDIAN_COLUMN, COMEDIAN_TWITTER_COLUMN]]\n",
" comedians_twitter_accounts.to_csv(STAND_UP_COMEDIANS_TWITTER_ACCOUNTS_PATH)\n",
" comedians_twitter_accounts = comedians_twitter_accounts.iloc[0:0]\n",
" sleep(SLEEP_SEARCH_DELAY) #yeah it sucks, but have to avoid \"Too Many Requests\" Exception \n"
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "9BlLviiCk5Yo",
"colab_type": "text"
},
"source": [
"extract user from twitter url ex. https://twitter.com/wil_anderson\n"
]
},
{
"cell_type": "code",
"metadata": {
"id": "0hKqE9A-l9vK",
"colab_type": "code",
"colab": {}
},
"source": [
"def get_twitter_handle_from_url_(url):\n",
" filter_username = lambda x: x if('?' not in x) else x[:index('?')]\n",
" return filter_username(url[len(\"https://twitter.com/\") : len(url)])"
],
"execution_count": 0,
"outputs": []
},
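{
"cell_type": "markdown",
"metadata": {},
"source": [
"Quick usage check for the helper above (the second URL with a query string is a made-up example):\n"
]
},
{
"cell_type": "code",
"metadata": {},
"source": [
"# illustrative only: extract the handle with and without a query string\n",
"print(get_twitter_handle_from_url_(\"https://twitter.com/wil_anderson\"))          # wil_anderson\n",
"print(get_twitter_handle_from_url_(\"https://twitter.com/wil_anderson?lang=en\"))  # wil_anderson"
],
"execution_count": 0,
"outputs": []
},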
{
"cell_type": "markdown",
"metadata": {
"id": "j7HernIu-Ff3",
"colab_type": "text"
},
"source": [
"finally, method call to create the csv (stand_up_comedian - twitter_handle):\n"
]
},
{
"cell_type": "code",
"metadata": {
"id": "RWbTeUil-ISJ",
"colab_type": "code",
"colab": {}
},
"source": [
"populate_stand_up_comedians_twitter_accounts()"
],
"execution_count": 0,
"outputs": []
}
]
}