Skip to content

Instantly share code, notes, and snippets.

@hughlilly
Last active June 1, 2022 06:23
Show Gist options
  • Save hughlilly/83eaad4ca3a12c5ba04a6bb9b3456ec2 to your computer and use it in GitHub Desktop.
Save hughlilly/83eaad4ca3a12c5ba04a6bb9b3456ec2 to your computer and use it in GitHub Desktop.
IMDbExtractor.py
# Based on https://github.com/yogeshp1426/IMDB_Top_50_Web_Scrapper/blob/master/IMDB_csv_out.ipynb
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen
import csv
# Scrape one IMDb advanced-search results page and dump a few fields per
# title into 'imdb_dataout.csv' (also echoing each row to stdout).
#
# The query string asks for feature films from 1950-1960, sorted by number
# of votes in descending order, starting at result 1.
IMDb_url = (
    "https://www.imdb.com/search/title/?sort=num_votes,desc&start=1&title_type=feature&year=1950,1960")


def _text_or_blank(tag):
    """Return ``tag.text``, or an empty string when *tag* is ``None``.

    ``find`` returns ``None`` when an entry lacks the requested element;
    this keeps one incomplete entry from aborting the whole run.
    """
    return tag.text if tag is not None else ''


# Fetch the raw HTML; the context manager closes the connection even if the
# read fails.
with urlopen(IMDb_url) as connection:
    page_html = connection.read()

page_soup = soup(page_html, "html.parser")
# One "lister-item-content" div per search result.
movie_container = page_soup.findAll("div", {"class": "lister-item-content"})

# newline='' is what the csv module requires for files it writes to; UTF-8 is
# set explicitly so non-ASCII titles do not depend on the platform default.
with open('imdb_dataout.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['ID', 'title', 'director', 'releaseYear',
                     'runtimeMinutes', 'IMDbURL'])

    # 'ID' is just the 1-based position of the entry on this results page.
    for count, movie in enumerate(movie_container, start=1):
        title_link = movie.h3.a
        title = title_link.text

        # NOTE(review): the <p class=""> paragraph presumably holds the
        # credits, with the director as its first link - confirm against
        # the current page markup.
        credits_block = movie.find('p', {"class": ""})
        director = _text_or_blank(
            credits_block.a if credits_block is not None else None)

        # Strip the surrounding parentheses from the year text.
        release_year = _text_or_blank(
            movie.find('span', {"class": "lister-item-year"})
        ).replace('(', '').replace(')', '')

        # Drop the " min" suffix from the runtime text.
        runtime = _text_or_blank(
            movie.find('span', {"class": "runtime"})).replace(' min', '')

        # The title link's href is a site-relative path; prefix the host.
        url = "https://www.imdb.com" + str(title_link.attrs['href'])

        row = [count, title, director, release_year,
               runtime, url]
        print(row)
        writer.writerow(row)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment