@Mahelita
Created April 17, 2021 17:52

Scrape tonie tracks
{
"cells": [
{
"cell_type": "markdown",
"id": "nasty-denver",
"metadata": {},
"source": [
"## Get JSON and first part of tonie urls (the series part)"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "strong-louisiana",
"metadata": {},
"outputs": [],
"source": [
"from bs4 import BeautifulSoup\n",
"from fuzzywuzzy import fuzz\n",
"import json\n",
"import numpy as np\n",
"import pandas as pd\n",
"import re\n",
"import requests\n",
"\n",
"url = 'http://gt-blog.de/JSON/tonies.json'\n",
"data = requests.get(url).json()\n",
"\n",
"url_base = 'https://tonies.de'\n",
"special_char_map = {ord('!'): '', ord('?'): '', ord('’'): '', ord('&'): '', ord('.'): '', ord(','): '', ord(' '): '-', ord('ä'):'ae', ord('ü'):'ue', ord('ö'):'oe', ord('ß'):'ss'}\n",
"series_names_url = []\n",
"episode_names = []\n",
"episode_names_url = []\n",
"series_urls = []\n",
"for tonie in data:\n",
" if tonie['language'] == 'de':\n",
" series_names_url.append(re.sub('--+', '-', tonie['series'].translate(special_char_map).lower()))\n",
" episode_names.append(tonie['episodes'])\n",
" episode_names_url.append(re.sub('--+', '-', episode_names[-1].translate(special_char_map).lower()))\n",
" series_urls.append('https://tonies.de/shop/tonies/{series}/'.format(series=series_names_url[-1]))\n",
" #TODO Instead of collecting series_urls, directly get track names and add to tonies, then save json file\n",
"series_urls = np.unique(series_urls)"
]
},
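{
"cell_type": "markdown",
"id": "editor-slug-note",
"metadata": {},
"source": [
"A quick sanity check of the slug construction above (illustration only; the example title is hypothetical but follows the pattern of the scraped data):"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "editor-slug-check",
"metadata": {},
"outputs": [],
"source": [
"# translate() drops '!' and maps ' ' -> '-' and 'ä' -> 'ae';\n",
"# lower() and collapsing '--+' then yield the final URL slug.\n",
"example_title = 'Der Räuber Hotzenplotz!'\n",
"slug = re.sub('--+', '-', example_title.translate(special_char_map).lower())\n",
"print(slug)  # der-raeuber-hotzenplotz"
]
},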
{
"cell_type": "markdown",
"id": "metallic-greek",
"metadata": {},
"source": [
"## Check if the first part of the url is correct and get the listed episodes"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "worldwide-sensitivity",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"https://tonies.de/shop/tonies/der-raeuber-hotzenplotz/\n",
"https://tonies.de/shop/tonies/die-maus/\n",
"https://tonies.de/shop/tonies/heule-eule-und-andere-geschichten/\n",
"https://tonies.de/shop/tonies/kosmo-klax/\n",
"https://tonies.de/shop/tonies/kreativ-tonie/\n",
"https://tonies.de/shop/tonies/nola-note/\n",
"https://tonies.de/shop/tonies/rotzn-roll-radio/\n"
]
}
],
"source": [
"all_episode_urls = []\n",
"for url,series,episode in zip(series_urls, series_names_url, episode_names):\n",
" r = requests.get(url)\n",
" if r.status_code == 200:\n",
" soup = BeautifulSoup(r.content)\n",
" href_all = soup.find_all('a', href=True)\n",
" request_urls = [href['href'] for href in href_all]\n",
" episode_urls = [a.find(url[17:]) for a in request_urls]\n",
" episode_urls = [0 if a == -1 else 1 for a in episode_urls]\n",
" all_episode_urls.append(np.unique(np.array(request_urls)[np.where(episode_urls)[0]]))\n",
" \n",
" else:\n",
" print(url)"
]
},
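{
"cell_type": "markdown",
"id": "editor-status-note",
"metadata": {},
"source": [
"The URLs printed above are the generated slugs that did not return HTTP 200, i.e. series whose shop page could not be found under the guessed URL; these would need manual correction."
]
},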
{
"cell_type": "markdown",
"id": "regulated-cargo",
"metadata": {},
"source": [
"## Get the track names of all episodes"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "special-batch",
"metadata": {},
"outputs": [],
"source": [
"episode_list = []\n",
"for series_array in all_episode_urls:\n",
" for partial_url in series_array:\n",
" full_url = url_base + partial_url\n",
" r = requests.get(full_url)\n",
"\n",
" soup = BeautifulSoup(r.content)\n",
" titlelist = soup.find_all(id=\"tabs--large-up__titelliste\")\n",
" if titlelist:\n",
" titlelist = [a.get_text() for a in titlelist[0].find_all('p')]\n",
" episode_list.append([full_url, partial_url.split('/')[-3], partial_url.split('/')[-2], titlelist])\n",
"df = pd.DataFrame(episode_list, columns=['full_url', 'series_url', 'episode_url', 'tracks'])"
]
},
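{
"cell_type": "markdown",
"id": "editor-selector-note",
"metadata": {},
"source": [
"Note that the `tabs--large-up__titelliste` id is tied to the shop's page layout at the time of writing; if the markup changes, `find_all` returns an empty list and the episode is silently skipped."
]
},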
{
"cell_type": "markdown",
"id": "successful-closure",
"metadata": {},
"source": [
"## Match each episode back to the JSON"
]
},
{
"cell_type": "markdown",
"id": "intensive-native",
"metadata": {},
"source": [
"The nesting order is relevant as otherwise many low-quality matches will happen"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "mature-retreat",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"better match found!\n",
"better match found!\n",
"better match found!\n",
"better match found!\n",
"better match found!\n",
"better match found!\n",
"better match found!\n",
"better match found!\n"
]
}
],
"source": [
"for index, row in df.iterrows():\n",
" fuzzy_ratio = []\n",
" for tonie in data:\n",
"# if tonie['language'] == 'de':\n",
" full_name = ' - '.join(row['full_url'].replace('-', ' ').lower().split('/')[-3:-1])\n",
" fuzzy_ratio.append(fuzz.ratio(full_name, tonie['title'].lower()))\n",
" matching_tonie = np.argsort(fuzzy_ratio)[-1]\n",
" if 'fuzzy_ratio' not in data[matching_tonie]:\n",
" data[matching_tonie]['tracks'] = row['tracks']\n",
" data[matching_tonie]['url'] = row['full_url']\n",
" data[matching_tonie]['fuzzy_ratio'] = str(np.max(fuzzy_ratio))\n",
" elif int(data[matching_tonie]['fuzzy_ratio']) < np.max(fuzzy_ratio):\n",
" print('better match found!')\n",
" data[matching_tonie]['tracks'] = row['tracks']\n",
" data[matching_tonie]['url'] = row['full_url']\n",
" data[matching_tonie]['fuzzy_ratio'] = str(np.max(fuzzy_ratio))\n",
"\n",
"with open('tonies.json', 'w') as f:\n",
" json.dump(data, f)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.1"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
@steve8x8 commented Jan 7, 2023

Unfortunately, retrieving a "series" with requests.get() doesn't return the same information I get in the browser. Example: https://tonies.com/de-de/tonies/?series=anne-kaffeekanne ("tonies.de" has been replaced by "tonies.com/de-de", and "tonies/${series}" became "tonies/?series=${series}"). In the browser I get one hit, while the Python code returns random, unrelated results :(
Any suggestions as to what might be going wrong here?
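
One thing worth checking, assuming the new page still serves its results as static HTML (it may render them client-side with JavaScript, in which case requests alone will never see them), is whether the site answers differently for non-browser clients. A minimal sketch against the URL scheme described above, with a browser-like User-Agent:

```python
import requests

# Hypothetical adaptation of the notebook's URL construction to the new scheme:
#   tonies.de          -> tonies.com/de-de
#   tonies/{series}/   -> tonies/?series={series}
series = 'anne-kaffeekanne'
url = f'https://tonies.com/de-de/tonies/?series={series}'

# Some servers vary their response by User-Agent, so mimic a browser.
headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/109.0'}
r = requests.get(url, headers=headers)
print(r.status_code, len(r.content))
```

If the hit list only appears after client-side rendering, a headless browser (e.g. selenium) would be needed instead of requests.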

Also, I'm still trying to figure out what's happening in the 4th stage... I'm getting some rather bad matches.
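
For the bad matches in the 4th stage, one mitigation (a sketch, not part of the original notebook; the cut-off of 80 is an arbitrary assumption to tune) is to require a minimum fuzz.ratio before writing a match back into the JSON:

```python
from fuzzywuzzy import fuzz
import numpy as np

MIN_RATIO = 80  # assumed threshold; tune against real data

for index, row in df.iterrows():
    full_name = ' - '.join(row['full_url'].replace('-', ' ').lower().split('/')[-3:-1])
    fuzzy_ratio = [fuzz.ratio(full_name, tonie['title'].lower()) for tonie in data]
    best = int(np.argmax(fuzzy_ratio))
    if fuzzy_ratio[best] < MIN_RATIO:
        print('no acceptable match for', row['full_url'])
        continue  # skip instead of attaching tracks to a weak match
    # ...attach tracks/url/fuzzy_ratio to data[best] as in the notebook...
```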

@steve8x8 commented Jan 9, 2023
