Skip to content

Instantly share code, notes, and snippets.

@harendra21
Created January 16, 2022 07:33
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save harendra21/0531548699017fb337d7602cee2ecaae to your computer and use it in GitHub Desktop.
Save harendra21/0531548699017fb337d7602cee2ecaae to your computer and use it in GitHub Desktop.
import os
from urllib.parse import quote_plus

import pandas as pd
import requests
from bs4 import BeautifulSoup
# Scrape IMDb ratings and genres for every film found in a local directory.
#
# Flow: ask for a directory, turn each entry name (minus its extension) into
# an IMDb title search, keep every search result whose title contains the
# film name, and write the collected rows to "film_ratings.csv".

# IMDb title-search endpoint; the URL-escaped film name is appended to it.
SEARCH_URL = "https://www.imdb.com/search/title/?title="
# Seconds to wait on a single request before giving up, so one stalled
# connection cannot hang the whole run.
REQUEST_TIMEOUT = 15

# Lists containing the web scraped data. They are always appended to
# together, so index i in each list describes the same search result.
names = []
ratings = []
genres = []

# Define path where your films are present
# For eg: "/Users/utkarsh/Desktop/films"
path = input("Enter the path where your films are: ")

try:
    # Films with extensions
    filmswe = os.listdir(path)
except OSError as err:
    # Missing / unreadable / non-directory path: exit with a readable message
    # instead of a raw traceback.
    raise SystemExit("Cannot read films directory {!r}: {}".format(path, err))

# List containing all the films for which data has to be scraped from IMDb
# (entry names without extensions). Hidden entries such as ".DS_Store" are
# skipped because they are not films.
films = [
    os.path.splitext(film)[0]
    for film in filmswe
    if not film.startswith(".")
]

# Setting up session; the context manager closes its connections when done.
with requests.Session() as s:
    for line in films:
        # Lower-case and collapse runs of whitespace so the same string can be
        # used both for the query and for the substring match below.
        title = " ".join(line.lower().split())
        if not title:
            # An empty title would be "contained" in every result name.
            continue

        # quote_plus escapes characters such as "&" or "#" that would
        # otherwise break the query string, and encodes spaces as "+".
        URL = SEARCH_URL + quote_plus(title)
        print(URL)

        try:
            response = s.get(URL, timeout=REQUEST_TIMEOUT)
            # Treat 4xx/5xx answers as failures rather than parsing an
            # error page as if it were search results.
            response.raise_for_status()
        except requests.RequestException as err:
            print("Could not fetch IMDb results for {!r}: {}".format(line, err))
            continue

        # Getting content from the IMDb website
        soup = BeautifulSoup(response.content, features="html.parser")

        # NOTE(review): the CSS class names below come from the original
        # script; confirm they still match IMDb's current markup.
        matches = 0
        # Searching all film containers found
        for result in soup.find_all("div", class_="lister-item-content"):
            link = result.h3.a if result.h3 is not None else None
            if link is None:
                # Container without a title link: nothing to compare against.
                continue

            name1 = link.text
            # Keep only results whose name contains the film name.
            if title not in name1.lower():
                continue

            # Rating and genre are looked up defensively: a result lacking
            # either one is still recorded (with an empty cell) instead of
            # aborting the remaining results for this film.
            rating_tag = result.find(
                "div", class_="inline-block ratings-imdb-rating"
            )
            genre_tag = result.find("span", class_="genre")

            # Appending name, rating and genre to the individual lists
            names.append(name1)
            ratings.append(
                rating_tag.get("data-value") if rating_tag is not None else None
            )
            # strip=True removes the whitespace surrounding the genre text.
            genres.append(
                genre_tag.get_text(strip=True) if genre_tag is not None else None
            )
            matches += 1

        if not matches:
            print("No IMDb match found for {!r}".format(line))

# Storing in pandas dataframe
df = pd.DataFrame({'Film Name': names, 'Rating': ratings, 'Genre': genres})
# Making csv using pandas
df.to_csv('film_ratings.csv', index=False, encoding='utf-8')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment