Skip to content

Instantly share code, notes, and snippets.

@parnexcodes
Created October 31, 2021 13:37
Show Gist options
  • Save parnexcodes/e9b437f5cb5c09afa235ed8ea0cc68ff to your computer and use it in GitHub Desktop.
Save parnexcodes/e9b437f5cb5c09afa235ed8ea0cc68ff to your computer and use it in GitHub Desktop.
Scraping the IMDb top chart (imdb.com/chart/top)
from typing import final
import requests
import pprint
from bs4 import BeautifulSoup
def _parse_row(row):
    """Convert one chart ``<tr>`` into a movie dict.

    Returns ``None`` when the row lacks any of the poster, title or rating
    cells, so that values from a previous row are never reused for it.
    """
    poster_cell = row.find('td', {'class': 'posterColumn'})
    title_cell = row.find('td', {'class': 'titleColumn'})
    rating_cell = row.find('td', {'class': 'ratingColumn imdbRating'})
    if poster_cell is None or title_cell is None or rating_cell is None:
        return None

    link = title_cell.a.get('href')
    year_text = title_cell.find('span', {'class': 'secondaryInfo'}).text

    return {
        'rank': poster_cell.find('span', {'name': 'rk'})['data-value'],
        'poster': poster_cell.a.img['src'],
        'link': f'https://imdb.com{link}',
        # hrefs are expected to look like "/title/<id>/"; take the path
        # segment after the 7-character "/title/" prefix so a trailing query
        # string or a missing final slash cannot corrupt the id.
        'id': link[7:].split('/', 1)[0],
        'title': title_cell.a.text,
        # The year is rendered as "(1994)"; drop the surrounding punctuation.
        'release_year': year_text.strip('(,)'),
        'rating': rating_cell.text.strip(),
    }


def get_recent():
    """Fetch the IMDb top chart, pretty-print its entries and return them.

    The page is parsed with the ``lxml`` parser and every ``<tr>`` inside a
    ``<tbody class="lister-list">`` is turned into a dict with the keys
    ``rank``, ``poster``, ``link``, ``id``, ``title``, ``release_year`` and
    ``rating`` (all values are strings taken from the markup). If the page
    contains no such table the result is an empty list.

    Returns:
        The list of movie dicts, in page order.

    Raises:
        requests.RequestException: on connection failure, timeout, or a
            non-success HTTP status.
    """
    URL = "https://www.imdb.com/chart/top/"
    # A timeout keeps the call from blocking forever on a stalled connection.
    response = requests.get(URL, timeout=10)
    # Fail loudly instead of parsing an error page as if it were the chart.
    response.raise_for_status()

    soup = BeautifulSoup(response.content, 'lxml')
    final_data = []
    for table_body in soup.find_all('tbody', {'class': 'lister-list'}):
        for row in table_body.find_all('tr'):
            movie = _parse_row(row)
            if movie is not None:
                final_data.append(movie)

    pprint.pprint(final_data)
    return final_data
# Run the scraper as soon as this file is executed; because the call sits at
# module level it also runs whenever the file is imported.
get_recent()
@parnexcodes
Copy link
Author

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment