Last active
May 7, 2021 12:04
-
-
Save monk-time/a81e297c94e13513fae11aa7aa30acdd to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from typing import Dict, Iterable, Union | |
import requests | |
from bs4 import BeautifulSoup | |
from bs4.element import Tag | |
# Record built for one movie: string keys mapped either to a plain string
# (e.g. a slug, an id, a runtime) or to a nested str -> str dictionary
# (the 'icm_data' entry produced by get_movie_data).
MovieData = Dict[
    str,
    Union[str, Dict[str, str]],
]
def fetch_pages(url: str, page_start: int, page_end: int,
                timeout: float = 30.0) -> Iterable[Tag]:
    """Download and parse all pages from page_start up to (and including) page_end.

    Pages are fetched lazily, one per iteration, by sending the page number
    as the ``page`` query parameter of a GET request to ``url``.

    Args:
        url: Address of the paginated list.
        page_start: Number of the first page to fetch.
        page_end: Number of the last page to fetch (inclusive). Nothing is
            fetched if it is smaller than page_start.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout requests would wait forever on a stalled connection.

    Yields:
        The parsed document of each page, in ascending page order.

    Raises:
        requests.HTTPError: If a page responds with an error status.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    for page_num in range(page_start, page_end + 1):
        response = requests.get(url, params={'page': page_num}, timeout=timeout)
        response.raise_for_status()
        yield BeautifulSoup(response.text, 'html.parser')
def get_movie_datas_from_page(page: Tag) -> Iterable[MovieData]:
    """Yield a movie data dictionary for every movie on an ICM list page.

    Each DOM element matching the ``.listItemMovie`` selector is handed to
    get_movie_data; the dictionaries are produced lazily, one per element.
    """
    for movie_element in page.select('.listItemMovie'):
        yield get_movie_data(movie_element)
def get_movie_data(t: Tag) -> MovieData:
    """Collect all data needed for further work on one movie into a dictionary.

    The ICM slug and the IMDb id are read directly from the given ICM DOM
    element; the remaining values (ICM movie data and IMDb runtime) are
    fetched through additional requests to ICM and IMDb.

    Args:
        t: DOM element of a single movie on an ICM list page. It must contain
            an ``h2 a`` link to the movie and an ``.optionIMDB`` link to IMDb.

    Returns:
        A dictionary with the keys 'icm_slug', 'icm_data', 'imdb_id'
        and 'imdb_runtime'.
    """
    icm_href = t.select_one('h2 a')['href']
    icm_slug = icm_href.replace('/movies/', '').rstrip('/')

    imdb_href = t.select_one('.optionIMDB')['href']
    # Keep only what follows '/title/' instead of stripping one hard-coded
    # 'http://www.imdb.com/title/' prefix: the result is the same for that
    # prefix, but https:// or non-www links no longer leave the whole URL
    # in place of the id.
    imdb_id = imdb_href.split('/title/', 1)[-1].rstrip('/')

    return {
        'icm_slug': icm_slug,
        'icm_data': get_icm_movie_data(icm_slug),
        'imdb_id': imdb_id,
        'imdb_runtime': get_imdb_runtime(imdb_id),
    }
def get_icm_movie_data(slug: str, timeout: float = 30.0) -> Dict[str, str]:
    """Extract all movie data from an ICM movie's /dialog/ subpage.

    The subpage answers with JSON whose 'html' entry holds an HTML fragment;
    its dt/dd tag pairs are converted into a dictionary. <dt> and <dd> tags
    are paired by their order of appearance.

    Args:
        slug: ICM slug of the movie (the part of the URL after /movies/).
        timeout: Seconds to wait for the server before giving up. Without
            a timeout requests would wait forever on a stalled connection.

    Returns:
        Mapping of lower-cased <dt> texts to whitespace-stripped <dd> texts.

    Raises:
        requests.HTTPError: If the subpage responds with an error status.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    r = requests.get(f'https://www.icheckmovies.com/movies/{slug}/dialog/',
                     timeout=timeout)
    r.raise_for_status()
    soup = BeautifulSoup(r.json()['html'], 'html.parser')
    # Tag.string is None when a <dt> has more than one child (e.g. text mixed
    # with nested markup); fall back to the concatenated text in that case
    # instead of failing with AttributeError.
    dts = [(t.string or t.get_text()).lower() for t in soup.find_all('dt')]
    dds = [t.get_text().strip() for t in soup.find_all('dd')]
    return dict(zip(dts, dds))
def get_imdb_runtime(imdb_id: str, timeout: float = 30.0) -> str:
    """Extract the IMDb runtime from the title bar element of a title page.

    Missing runtimes are replaced with 'unknown'. The same value is returned
    for pages that respond with an error status and for a runtime element
    that contains no text.

    Args:
        imdb_id: IMDb title id, inserted into the title page URL.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout requests would wait forever on a stalled connection.

    Returns:
        The whitespace-stripped runtime text, or 'unknown'.

    Raises:
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    r = requests.get(f'https://www.imdb.com/title/{imdb_id}/', timeout=timeout)
    if not r.ok:  # some movies on ICM link to dead IMDB pages
        return 'unknown'
    page = BeautifulSoup(r.text, 'html.parser')
    runtime_tag = page.select_one('.titleBar time')
    if runtime_tag is None:
        return 'unknown'
    # get_text() instead of .string: .string is None for an empty tag or one
    # with several children, which would make .strip() raise AttributeError.
    runtime = runtime_tag.get_text().strip()
    return runtime or 'unknown'
def main():
    """Print movies whose runtime is unknown on ICM but known on IMDb.

    Walks the ICM movie list sorted by runtime and prints one line per
    matching movie: ICM runtime, IMDb runtime, ICM slug and IMDb id.
    """
    list_url = 'https://www.icheckmovies.com/movies/?sort=runtime'
    # Pages 25524 and 25525, i.e. two pages (at the point of writing 25524
    # was the first page on ICM with missing runtimes).
    first_page, last_page = 25524, 25525

    for page in fetch_pages(list_url, first_page, last_page):
        # Every movie on the page is expanded into its full data dictionary.
        for movie in get_movie_datas_from_page(page):
            icm_runtime = movie['icm_data']['runtime']
            imdb_runtime = movie['imdb_runtime']

            # Only movies with no runtime on ICM but one on IMDb are of
            # interest; comment out these two lines to see the output
            # of the script for every movie.
            if icm_runtime != 'unknown' or imdb_runtime == 'unknown':
                continue

            # Report the collected data we're interested in: both runtimes
            # and the slug/id from ICM/IMDb.
            print(f"{icm_runtime:>7} | {imdb_runtime:>7} | "
                  f"{movie['icm_slug']} | {movie['imdb_id']}")


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment