@Mahelita
Created April 17, 2021 17:52

Scrape tonie tracks
{
"cells": [
{
"cell_type": "markdown",
"id": "nasty-denver",
"metadata": {},
"source": [
"## Get JSON and first part of tonie urls (the series part)"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "strong-louisiana",
"metadata": {},
"outputs": [],
"source": [
"from bs4 import BeautifulSoup\n",
"from fuzzywuzzy import fuzz\n",
"import json\n",
"import numpy as np\n",
"import pandas as pd\n",
"import re\n",
"import requests\n",
"\n",
"url = 'http://gt-blog.de/JSON/tonies.json'\n",
"data = requests.get(url).json()\n",
"\n",
"url_base = 'https://tonies.de'\n",
"special_char_map = {ord('!'): '', ord('?'): '', ord('’'): '', ord('&'): '', ord('.'): '', ord(','): '', ord(' '): '-', ord('ä'):'ae', ord('ü'):'ue', ord('ö'):'oe', ord('ß'):'ss'}\n",
"series_names_url = []\n",
"episode_names = []\n",
"episode_names_url = []\n",
"series_urls = []\n",
"for tonie in data:\n",
" if tonie['language'] == 'de':\n",
" series_names_url.append(re.sub('--+', '-', tonie['series'].translate(special_char_map).lower()))\n",
" episode_names.append(tonie['episodes'])\n",
" episode_names_url.append(re.sub('--+', '-', episode_names[-1].translate(special_char_map).lower()))\n",
" series_urls.append('https://tonies.de/shop/tonies/{series}/'.format(series=series_names_url[-1]))\n",
" #TODO Instead of collecting series_urls, directly get track names and add to tonies, then save json file\n",
"series_urls = np.unique(series_urls)"
]
},
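{
"cell_type": "markdown",
"id": "editor-slug-note",
"metadata": {},
"source": [
"A quick sanity check of the slug construction above (illustration only; the example title is hypothetical but follows the pattern of the scraped data):"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "editor-slug-check",
"metadata": {},
"outputs": [],
"source": [
"# translate() drops '!' and maps ' ' -> '-' and 'ä' -> 'ae';\n",
"# lower() and collapsing '--+' then yield the final URL slug.\n",
"example_title = 'Der Räuber Hotzenplotz!'\n",
"slug = re.sub('--+', '-', example_title.translate(special_char_map).lower())\n",
"print(slug)  # der-raeuber-hotzenplotz"
]
},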
{
"cell_type": "markdown",
"id": "metallic-greek",
"metadata": {},
"source": [
"## Check if the first part of the url is correct and get the listed episodes"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "worldwide-sensitivity",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"https://tonies.de/shop/tonies/der-raeuber-hotzenplotz/\n",
"https://tonies.de/shop/tonies/die-maus/\n",
"https://tonies.de/shop/tonies/heule-eule-und-andere-geschichten/\n",
"https://tonies.de/shop/tonies/kosmo-klax/\n",
"https://tonies.de/shop/tonies/kreativ-tonie/\n",
"https://tonies.de/shop/tonies/nola-note/\n",
"https://tonies.de/shop/tonies/rotzn-roll-radio/\n"
]
}
],
"source": [
"all_episode_urls = []\n",
"for url,series,episode in zip(series_urls, series_names_url, episode_names):\n",
" r = requests.get(url)\n",
" if r.status_code == 200:\n",
" soup = BeautifulSoup(r.content)\n",
" href_all = soup.find_all('a', href=True)\n",
" request_urls = [href['href'] for href in href_all]\n",
" episode_urls = [a.find(url[17:]) for a in request_urls]\n",
" episode_urls = [0 if a == -1 else 1 for a in episode_urls]\n",
" all_episode_urls.append(np.unique(np.array(request_urls)[np.where(episode_urls)[0]]))\n",
" \n",
" else:\n",
" print(url)"
]
},
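{
"cell_type": "markdown",
"id": "editor-status-note",
"metadata": {},
"source": [
"The URLs printed above are the generated slugs that did not return HTTP 200, i.e. series whose shop page could not be found under the guessed URL; these would need manual correction."
]
},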
{
"cell_type": "markdown",
"id": "regulated-cargo",
"metadata": {},
"source": [
"## Get the track names of all episodes"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "special-batch",
"metadata": {},
"outputs": [],
"source": [
"episode_list = []\n",
"for series_array in all_episode_urls:\n",
" for partial_url in series_array:\n",
" full_url = url_base + partial_url\n",
" r = requests.get(full_url)\n",
"\n",
" soup = BeautifulSoup(r.content)\n",
" titlelist = soup.find_all(id=\"tabs--large-up__titelliste\")\n",
" if titlelist:\n",
" titlelist = [a.get_text() for a in titlelist[0].find_all('p')]\n",
" episode_list.append([full_url, partial_url.split('/')[-3], partial_url.split('/')[-2], titlelist])\n",
"df = pd.DataFrame(episode_list, columns=['full_url', 'series_url', 'episode_url', 'tracks'])"
]
},
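{
"cell_type": "markdown",
"id": "editor-selector-note",
"metadata": {},
"source": [
"Note that the `tabs--large-up__titelliste` id is tied to the shop's page layout at the time of writing; if the markup changes, `find_all` returns an empty list and the episode is silently skipped."
]
},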
{
"cell_type": "markdown",
"id": "successful-closure",
"metadata": {},
"source": [
"## Match each episode back to the JSON"
]
},
{
"cell_type": "markdown",
"id": "intensive-native",
"metadata": {},
"source": [
"The nesting order is relevant as otherwise many low-quality matches will happen"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "mature-retreat",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"better match found!\n",
"better match found!\n",
"better match found!\n",
"better match found!\n",
"better match found!\n",
"better match found!\n",
"better match found!\n",
"better match found!\n"
]
}
],
"source": [
"for index, row in df.iterrows():\n",
" fuzzy_ratio = []\n",
" for tonie in data:\n",
"# if tonie['language'] == 'de':\n",
" full_name = ' - '.join(row['full_url'].replace('-', ' ').lower().split('/')[-3:-1])\n",
" fuzzy_ratio.append(fuzz.ratio(full_name, tonie['title'].lower()))\n",
" matching_tonie = np.argsort(fuzzy_ratio)[-1]\n",
" if 'fuzzy_ratio' not in data[matching_tonie]:\n",
" data[matching_tonie]['tracks'] = row['tracks']\n",
" data[matching_tonie]['url'] = row['full_url']\n",
" data[matching_tonie]['fuzzy_ratio'] = str(np.max(fuzzy_ratio))\n",
" elif int(data[matching_tonie]['fuzzy_ratio']) < np.max(fuzzy_ratio):\n",
" print('better match found!')\n",
" data[matching_tonie]['tracks'] = row['tracks']\n",
" data[matching_tonie]['url'] = row['full_url']\n",
" data[matching_tonie]['fuzzy_ratio'] = str(np.max(fuzzy_ratio))\n",
"\n",
"with open('tonies.json', 'w') as f:\n",
" json.dump(data, f)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.1"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
@steve8x8 commented Jan 7, 2023

Unfortunately, retrieving a "series" with requests.get() doesn't return the same information I get in the browser. Example: https://tonies.com/de-de/tonies/?series=anne-kaffeekanne ("tonies.de" has been replaced by "tonies.com/de-de", and "tonies/${series}" became "tonies/?series=${series}"). In the browser I get one hit, while the Python code returns random, unrelated results :(
Any suggestions as to what might be going wrong here?
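
One thing worth checking, assuming the new page still serves its results as static HTML (it may render them client-side with JavaScript, in which case requests alone will never see them), is whether the site answers differently for non-browser clients. A minimal sketch against the URL scheme described above, with a browser-like User-Agent:

```python
import requests

# Hypothetical adaptation of the notebook's URL construction to the new scheme:
#   tonies.de          -> tonies.com/de-de
#   tonies/{series}/   -> tonies/?series={series}
series = 'anne-kaffeekanne'
url = f'https://tonies.com/de-de/tonies/?series={series}'

# Some servers vary their response by User-Agent, so mimic a browser.
headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/109.0'}
r = requests.get(url, headers=headers)
print(r.status_code, len(r.content))
```

If the hit list only appears after client-side rendering, a headless browser (e.g. selenium) would be needed instead of requests.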

Also, I'm still trying to figure out what's happening in the 4th stage... I'm getting some rather bad matches.
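
For the bad matches in the 4th stage, one mitigation (a sketch, not part of the original notebook; the cut-off of 80 is an arbitrary assumption to tune) is to require a minimum fuzz.ratio before writing a match back into the JSON:

```python
from fuzzywuzzy import fuzz
import numpy as np

MIN_RATIO = 80  # assumed threshold; tune against real data

for index, row in df.iterrows():
    full_name = ' - '.join(row['full_url'].replace('-', ' ').lower().split('/')[-3:-1])
    fuzzy_ratio = [fuzz.ratio(full_name, tonie['title'].lower()) for tonie in data]
    best = int(np.argmax(fuzzy_ratio))
    if fuzzy_ratio[best] < MIN_RATIO:
        print('no acceptable match for', row['full_url'])
        continue  # skip instead of attaching tracks to a weak match
    # ...attach tracks/url/fuzzy_ratio to data[best] as in the notebook...
```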

@steve8x8 commented Jan 9, 2023
