monk-time/ICM - Compare runtimes.py

## ICM - Compare runtimes.py
from typing import Dict, Iterable, Union

import requests
from bs4 import BeautifulSoup
from bs4.element import Tag

# Data collected for a single movie: plain string fields plus one nested
# string-to-string dictionary (the shape returned by get_movie_data()).
MovieData = Dict[str, Union[str, Dict[str, str]]]


def fetch_pages(url: str, page_start: int, page_end: int,
                timeout: float = 30.0) -> Iterable[Tag]:
    """Download and parse a contiguous range of pages of a paginated listing.

    Pages are fetched lazily, one request per iteration, so a consumer that
    stops early does not trigger the remaining downloads.

    Args:
        url: Address of the listing; the page number is sent as a ``page``
            query parameter.
        page_start: Number of the first page to fetch.
        page_end: Number of the last page to fetch (inclusive).
        timeout: Seconds to wait for the server before giving up on a request.

    Yields:
        The parsed document of each page, in ascending page order.

    Raises:
        requests.HTTPError: If a page responds with an error status.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    for page_num in range(page_start, page_end + 1):
        # Without a timeout a single stalled connection would block the script forever.
        r = requests.get(url, params={'page': page_num}, timeout=timeout)
        r.raise_for_status()
        yield BeautifulSoup(r.text, 'html.parser')


def get_movie_datas_from_page(page: Tag) -> Iterable[MovieData]:
    """Yield a movie data dictionary for every movie found on an ICM list page.

    Movie entries are the DOM elements matching the ``.listItemMovie`` selector.
    Each one is converted with get_movie_data() only when the consumer asks for
    the next item.
    """
    for movie_element in page.select('.listItemMovie'):
        yield get_movie_data(movie_element)


def get_movie_data(t: Tag) -> MovieData:
    """Collect everything the script needs to know about one movie.

    The ICM slug and the IMDb id are read from links inside the element; the
    ICM movie data and the IMDb runtime are fetched through additional requests
    to ICM and IMDb.

    Args:
        t: DOM element of a single movie on an ICM list page. It must contain
            an ``h2 a`` link to the movie and an ``.optionIMDB`` link to IMDb.

    Returns:
        A dictionary with the keys ``icm_slug``, ``icm_data``, ``imdb_id`` and
        ``imdb_runtime``.
    """
    icm_slug = t.select_one('h2 a')['href'].replace('/movies/', '').rstrip('/')

    imdb_href = t.select_one('.optionIMDB')['href']
    # Keep what follows '/title/' rather than stripping a hard-coded
    # 'http://www.imdb.com/title/' prefix: an https (or differently hosted) link
    # would otherwise leave a full URL here and break the IMDb request built from it.
    _, marker, after_title = imdb_href.partition('/title/')
    imdb_id = (after_title if marker else imdb_href).rstrip('/')

    return {
        'icm_slug': icm_slug,
        'icm_data': get_icm_movie_data(icm_slug),
        'imdb_id': imdb_id,
        'imdb_runtime': get_imdb_runtime(imdb_id),
    }


def get_icm_movie_data(slug: str, timeout: float = 30.0) -> Dict[str, str]:
    """Fetch an ICM movie's /dialog/ subpage and return its data as a dictionary.

    The endpoint answers with JSON whose ``html`` field holds an HTML fragment;
    the fragment's dt/dd tag pairs become the keys and values of the result.

    Args:
        slug: ICM slug of the movie, as used in its ``/movies/<slug>/`` URL.
        timeout: Seconds to wait for the server before giving up on the request.

    Returns:
        Lower-cased dt texts mapped to the stripped text of the dd found at the
        same position.

    Raises:
        requests.HTTPError: If ICM responds with an error status.
        requests.Timeout: If ICM does not answer within ``timeout``.
    """
    r = requests.get(f'https://www.icheckmovies.com/movies/{slug}/dialog/',
                     timeout=timeout)
    r.raise_for_status()
    soup = BeautifulSoup(r.json()['html'], 'html.parser')
    # get_text() rather than .string: .string is None for a tag with several
    # children, which would crash on .lower(); it also matches how dd is read.
    dts = [dt.get_text().strip().lower() for dt in soup.find_all('dt')]
    dds = [dd.get_text().strip() for dd in soup.find_all('dd')]
    return dict(zip(dts, dds))


def get_imdb_runtime(imdb_id: str, timeout: float = 30.0) -> str:
    """Extract a movie's runtime from the title bar of its IMDb page.

    Args:
        imdb_id: IMDb title id, inserted into the ``/title/<id>/`` URL.
        timeout: Seconds to wait for the server before giving up on the request.

    Returns:
        The stripped text of the ``.titleBar time`` element, or ``'unknown'``
        when the page is unavailable, the element is missing, or it is empty.

    Raises:
        requests.Timeout: If IMDb does not answer within ``timeout``.
    """
    r = requests.get(f'https://www.imdb.com/title/{imdb_id}/', timeout=timeout)
    if not r.ok:  # some movies on ICM link to dead IMDB pages
        return 'unknown'
    page = BeautifulSoup(r.text, 'html.parser')
    runtime_tag = page.select_one('.titleBar time')
    if runtime_tag is None:
        return 'unknown'
    # get_text() rather than .string: .string is None for a tag with several
    # children, which would crash on .strip(). An empty element counts as missing.
    return runtime_tag.get_text().strip() or 'unknown'


def main():
    """Print the movies whose runtime is missing on ICM but known on IMDb.

    Scans pages 25524 through 25525 of ICM's runtime-sorted movie list and
    prints one line per matching movie: ICM runtime, IMDb runtime, ICM slug
    and IMDb id.
    """
    # Two pages only; at the time of writing, 25524 was the first page on ICM
    # with missing runtimes.
    listing_url = 'https://www.icheckmovies.com/movies/?sort=runtime'
    for page in fetch_pages(listing_url, 25524, 25525):
        # Collect the data of every movie listed on the current page.
        for movie in get_movie_datas_from_page(page):
            icm_runtime = movie['icm_data']['runtime']
            imdb_runtime = movie['imdb_runtime']
            # Skip movies unless ICM lacks the runtime while IMDb has it;
            # comment out these two lines to see the output for every movie.
            if icm_runtime != 'unknown' or imdb_runtime == 'unknown':
                continue
            # The data we are interested in: both runtimes and the ICM/IMDb identifiers.
            print(f"{icm_runtime:>7} | {imdb_runtime:>7} | "
                  f"{movie['icm_slug']} | {movie['imdb_id']}")


# Run the comparison only when this file is executed as a script, not when it is imported.
if __name__ == '__main__':
    main()
	from typing import Dict, Iterable, Union

	import requests
	from bs4 import BeautifulSoup
	from bs4.element import Tag

	# Data collected for a single movie: plain string fields plus one nested
	# string-to-string dictionary (the shape returned by get_movie_data()).
	MovieData = Dict[str, Union[str, Dict[str, str]]]


	def fetch_pages(url: str, page_start: int, page_end: int,
			timeout: float = 30.0) -> Iterable[Tag]:
		"""Download and parse a contiguous range of pages of a paginated listing.

		Pages are fetched lazily, one request per iteration, so a consumer that
		stops early does not trigger the remaining downloads.

		Args:
			url: Address of the listing; the page number is sent as a ``page``
				query parameter.
			page_start: Number of the first page to fetch.
			page_end: Number of the last page to fetch (inclusive).
			timeout: Seconds to wait for the server before giving up on a request.

		Yields:
			The parsed document of each page, in ascending page order.

		Raises:
			requests.HTTPError: If a page responds with an error status.
			requests.Timeout: If the server does not answer within ``timeout``.
		"""
		for page_num in range(page_start, page_end + 1):
			# Without a timeout a single stalled connection would block the script forever.
			r = requests.get(url, params={'page': page_num}, timeout=timeout)
			r.raise_for_status()
			yield BeautifulSoup(r.text, 'html.parser')


	def get_movie_datas_from_page(page: Tag) -> Iterable[MovieData]:
		"""Yield a movie data dictionary for every movie found on an ICM list page.

		Movie entries are the DOM elements matching the ``.listItemMovie`` selector.
		Each one is converted with get_movie_data() only when the consumer asks for
		the next item.
		"""
		yield from map(get_movie_data, page.select('.listItemMovie'))


	def get_movie_data(t: Tag) -> MovieData:
		"""Collect everything the script needs to know about one movie.

		The ICM slug and the IMDb id are read from links inside the element; the
		ICM movie data and the IMDb runtime are fetched through additional requests
		to ICM and IMDb.

		Args:
			t: DOM element of a single movie on an ICM list page. It must contain
				an ``h2 a`` link to the movie and an ``.optionIMDB`` link to IMDb.

		Returns:
			A dictionary with the keys ``icm_slug``, ``icm_data``, ``imdb_id`` and
			``imdb_runtime``.
		"""
		icm_slug = t.select_one('h2 a')['href'].replace('/movies/', '').rstrip('/')

		imdb_href = t.select_one('.optionIMDB')['href']
		# Keep what follows '/title/' rather than stripping a hard-coded
		# 'http://www.imdb.com/title/' prefix: an https (or differently hosted) link
		# would otherwise leave a full URL here and break the IMDb request built from it.
		_, marker, after_title = imdb_href.partition('/title/')
		imdb_id = (after_title if marker else imdb_href).rstrip('/')

		return {
			'icm_slug': icm_slug,
			'icm_data': get_icm_movie_data(icm_slug),
			'imdb_id': imdb_id,
			'imdb_runtime': get_imdb_runtime(imdb_id),
		}


	def get_icm_movie_data(slug: str, timeout: float = 30.0) -> Dict[str, str]:
		"""Fetch an ICM movie's /dialog/ subpage and return its data as a dictionary.

		The endpoint answers with JSON whose ``html`` field holds an HTML fragment;
		the fragment's dt/dd tag pairs become the keys and values of the result.

		Args:
			slug: ICM slug of the movie, as used in its ``/movies/<slug>/`` URL.
			timeout: Seconds to wait for the server before giving up on the request.

		Returns:
			Lower-cased dt texts mapped to the stripped text of the dd found at the
			same position.

		Raises:
			requests.HTTPError: If ICM responds with an error status.
			requests.Timeout: If ICM does not answer within ``timeout``.
		"""
		r = requests.get(f'https://www.icheckmovies.com/movies/{slug}/dialog/',
				timeout=timeout)
		r.raise_for_status()
		soup = BeautifulSoup(r.json()['html'], 'html.parser')
		# get_text() rather than .string: .string is None for a tag with several
		# children, which would crash on .lower(); it also matches how dd is read.
		dts = [dt.get_text().strip().lower() for dt in soup.find_all('dt')]
		dds = [dd.get_text().strip() for dd in soup.find_all('dd')]
		return dict(zip(dts, dds))


	def get_imdb_runtime(imdb_id: str, timeout: float = 30.0) -> str:
		"""Extract a movie's runtime from the title bar of its IMDb page.

		Args:
			imdb_id: IMDb title id, inserted into the ``/title/<id>/`` URL.
			timeout: Seconds to wait for the server before giving up on the request.

		Returns:
			The stripped text of the ``.titleBar time`` element, or ``'unknown'``
			when the page is unavailable, the element is missing, or it is empty.

		Raises:
			requests.Timeout: If IMDb does not answer within ``timeout``.
		"""
		r = requests.get(f'https://www.imdb.com/title/{imdb_id}/', timeout=timeout)
		if not r.ok:  # some movies on ICM link to dead IMDB pages
			return 'unknown'
		page = BeautifulSoup(r.text, 'html.parser')
		runtime_tag = page.select_one('.titleBar time')
		if runtime_tag is None:
			return 'unknown'
		# get_text() rather than .string: .string is None for a tag with several
		# children, which would crash on .strip(). An empty element counts as missing.
		return runtime_tag.get_text().strip() or 'unknown'


	def main():
		"""Print the movies whose runtime is missing on ICM but known on IMDb.

		Scans pages 25524 through 25525 of ICM's runtime-sorted movie list and
		prints one line per matching movie: ICM runtime, IMDb runtime, ICM slug
		and IMDb id.
		"""
		# Two pages only; at the time of writing, 25524 was the first page on ICM
		# with missing runtimes.
		for page in fetch_pages('https://www.icheckmovies.com/movies/?sort=runtime', 25524, 25525):
			# Collect the data of every movie listed on the current page.
			for m in get_movie_datas_from_page(page):
				is_missing_runtime_on_icm = (m['icm_data']['runtime'] == 'unknown'
						and m['imdb_runtime'] != 'unknown')
				# Skip movies that don't satisfy our criteria;
				# comment out these two lines to see the output for every movie.
				if not is_missing_runtime_on_icm:
					continue
				# Plain '|' separators: the escaped form '\|' is an invalid escape
				# sequence and would print literal backslashes.
				print(f"{m['icm_data']['runtime']:>7} | {m['imdb_runtime']:>7} | "
					f"{m['icm_slug']} | {m['imdb_id']}")


	# Run the comparison only when this file is executed as a script, not when it is imported.
	if __name__ == '__main__':
		main()