Skip to content

Instantly share code, notes, and snippets.

@monk-time
Last active May 7, 2021 12:04
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save monk-time/a81e297c94e13513fae11aa7aa30acdd to your computer and use it in GitHub Desktop.
Save monk-time/a81e297c94e13513fae11aa7aa30acdd to your computer and use it in GitHub Desktop.
from typing import Dict, Iterable, Union
import requests
from bs4 import BeautifulSoup
from bs4.element import Tag
# A movie record: string-valued fields plus one nested string-to-string dictionary.
MovieData = Dict[str, Union[str, Dict[str, str]]]
def fetch_pages(url: str, page_start: int, page_end: int) -> Iterable[Tag]:
    """Lazily download and parse a run of consecutive pages.

    Pages are requested one at a time, from page_start through page_end (both
    inclusive); the page number is sent as the 'page' query parameter. Each
    response body is parsed with the 'html.parser' backend and yielded.
    An unsuccessful HTTP status raises through raise_for_status().
    """
    stop = page_end + 1  # range() excludes its upper bound
    for number in range(page_start, stop):
        response = requests.get(url, params={'page': number})
        response.raise_for_status()
        yield BeautifulSoup(response.text, 'html.parser')
def get_movie_datas_from_page(page: Tag) -> Iterable[MovieData]:
    """Yield one movie data dictionary per movie found on an ICM list page.

    Movies are the DOM elements matching the '.listItemMovie' CSS selector;
    each one is converted with get_movie_data as the caller iterates.
    """
    for movie_element in page.select('.listItemMovie'):
        yield get_movie_data(movie_element)
def get_movie_data(t: Tag) -> MovieData:
    """Collect everything needed about one movie from its ICM DOM element.

    The ICM slug and the IMDb id are read directly from links inside the element;
    the rest (ICM movie data and the IMDb runtime) is fetched through additional
    requests to ICM and IMDb.

    Returns a dictionary with the keys 'icm_slug', 'icm_data', 'imdb_id'
    and 'imdb_runtime'.
    """
    icm_href = t.select_one('h2 a')['href']
    icm_slug = icm_href.replace('/movies/', '').rstrip('/')

    imdb_href = t.select_one('.optionIMDB')['href']
    # Keep only what follows '/title/' so the id is extracted whatever the link's
    # scheme or host is (http or https, with or without 'www.'). Stripping one
    # hard-coded 'http://www.imdb.com/title/' prefix would leave the whole URL in
    # place for any other variant, and the IMDb lookup would then silently fail.
    imdb_id = imdb_href.split('/title/', 1)[-1].rstrip('/')

    return {
        'icm_slug': icm_slug,
        'icm_data': get_icm_movie_data(icm_slug),
        'imdb_id': imdb_id,
        'imdb_runtime': get_imdb_runtime(imdb_id),
    }
def get_icm_movie_data(slug: str) -> Dict[str, str]:
    """Extract all movie data from an ICM /dialog/ subpage.

    The subpage responds with JSON whose 'html' field holds a fragment made of
    dt/dd tag pairs. The pairs are converted into a dictionary: lower-cased,
    whitespace-stripped dt text as key, whitespace-stripped dd text as value.

    Raises through raise_for_status() if the request is unsuccessful.
    """
    r = requests.get(f'https://www.icheckmovies.com/movies/{slug}/dialog/')
    r.raise_for_status()
    soup = BeautifulSoup(r.json()['html'], 'html.parser')
    # get_text() rather than .string: .string is None for an empty tag or a tag
    # with more than one child, which would make .lower() raise AttributeError.
    # This also keeps key extraction consistent with how values are read.
    dts = [t.get_text().strip().lower() for t in soup.find_all('dt')]
    dds = [t.get_text().strip() for t in soup.find_all('dd')]
    return dict(zip(dts, dds))
def get_imdb_runtime(imdb_id: str) -> str:
    """Extract the IMDb runtime from the title bar element of a title page.

    Returns the stripped text of the '.titleBar time' element. Returns 'unknown'
    when the page responds with an unsuccessful status, when the element is
    absent, or when the element contains no text.
    """
    r = requests.get(f'https://www.imdb.com/title/{imdb_id}/')
    if not r.ok:  # some movies on ICM link to dead IMDB pages
        return 'unknown'
    page = BeautifulSoup(r.text, 'html.parser')
    runtime_tag = page.select_one('.titleBar time')
    if runtime_tag is None:
        return 'unknown'
    # get_text() rather than .string: .string is None for an empty tag or a tag
    # with nested children, and calling .strip() on it would crash instead of
    # reporting the runtime as missing.
    runtime = runtime_tag.get_text().strip()
    return runtime or 'unknown'
def main():
    """Print movies whose runtime is unknown on ICM but known on IMDb.

    Each printed line holds the ICM runtime, the IMDb runtime, the ICM slug
    and the IMDb id, separated by ' | '.
    """
    list_url = 'https://www.icheckmovies.com/movies/?sort=runtime'
    # Pages 25524 through 25525, so two pages (at the point of writing,
    # 25524 was the first page on ICM with missing runtimes).
    first_page, last_page = 25524, 25525

    for page in fetch_pages(list_url, first_page, last_page):
        # Fetch all necessary data for every movie on the current page
        for movie in get_movie_datas_from_page(page):
            icm_runtime = movie['icm_data']['runtime']
            imdb_runtime = movie['imdb_runtime']

            # Skip movies that don't satisfy our criteria (runtime missing on ICM
            # but present on IMDb); comment out these two lines to see the full
            # output of the script for every movie
            if icm_runtime != 'unknown' or imdb_runtime == 'unknown':
                continue

            # Print the collected data that we're interested in:
            # runtimes and slugs/ids from ICM/IMDb
            print(f"{icm_runtime:>7} | {imdb_runtime:>7} | "
                  f"{movie['icm_slug']} | {movie['imdb_id']}")
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment