Skip to content

Instantly share code, notes, and snippets.

@rafnixg
Forked from eduardogpg/scraper.py
Last active April 29, 2023 01:11
Show Gist options
  • Save rafnixg/32a72280fe199671dbc9f2c5ecd492ed to your computer and use it in GitHub Desktop.
Save rafnixg/32a72280fe199671dbc9f2c5ecd492ed to your computer and use it in GitHub Desktop.
Una clase del BC de Ciencia de datos.
"""Scrapper IMDB Calendar Mexico"""
import csv
import json
import requests
from bs4 import BeautifulSoup
URL = "https://www.imdb.com/calendar/?region=MX"
"""
1.- Obtener el maquetado HTML
- Si el archivo HTML no existe de forma local, crearlo.
- Si el archivo HTML existe de forma local, obtener su contenido.
2.- Obtener la información
- Nombre
- Categorias
- Reparto
3.- Generar un archivo CSV
"""
def get_imdb_content(timeout=10):
    """Download the IMDB calendar page.

    Arguments:
        timeout {float} -- Seconds to wait for the server before giving up
    Returns:
        string -- The HTML of the IMDB calendar page when the server answers 200
        None -- If the request failed or returned any other status code
    """
    # NOTE(review): presumably IMDB rejects the default requests User-Agent,
    # hence the browser-like header -- confirm against the live site.
    headers = {"User-Agent": "Mozilla/5.0"}
    try:
        # Without a timeout requests may block indefinitely on a stalled server.
        response = requests.get(URL, headers=headers, timeout=timeout)
    except requests.RequestException:
        # Connection errors, timeouts, etc. are reported the same way as a
        # non-200 answer so callers only have to check for None.
        return None
    if response.status_code == 200:
        return response.text
    return None
def create_imdb_file_local(content):
    """Cache the content of the IMDB calendar page in the local file imdb.html.

    Writing the cache is best-effort: a failure is reported but never raised.

    Arguments:
        content {string} -- The content of the IMDB calendar page; when None
            nothing is written
    """
    if content is None:
        # Opening the file in "w" mode truncates it before the write fails,
        # which would leave an empty cache file behind.
        return
    try:
        with open("imdb.html", "w", encoding="UTF-8") as file:
            file.write(content)
    except OSError as error:
        # Only filesystem problems are expected here; anything else is a bug
        # and should surface instead of being silently swallowed.
        print(f"No se pudo guardar imdb.html: {error}")
def get_imdb_file_local():
    """Read the cached IMDB calendar page from the local file imdb.html.

    Returns:
        string -- The content of the local file
        None -- If the file does not exist or cannot be read as UTF-8 text
    """
    try:
        with open("imdb.html", "r", encoding="UTF-8") as file:
            return file.read()
    except (OSError, UnicodeDecodeError):
        # A missing, unreadable or corrupted cache simply means "no cache";
        # unrelated errors are no longer hidden by a bare except.
        return None
def get_local_imdb_content():
    """Get the IMDB calendar page, preferring the local cache over the network.

    The page is downloaded only when the local file is missing or empty, and a
    successful download is cached for the next run.

    Returns:
        string -- The content of the IMDB calendar page
        None -- If there is no local copy and the download failed
    """
    content = get_imdb_file_local()
    if content:
        return content
    content = get_imdb_content()
    if content:
        # Only cache a real page; a failed download must not create or
        # overwrite the local file.
        create_imdb_file_local(content)
    return content
def _li_span_texts(ul_tag):
    """Return the text of the <span> inside every <li> of ul_tag ([] if None)."""
    if ul_tag is None:
        return []
    return [item.span.text for item in ul_tag.find_all("li")]


def create_movie(tag):
    """Create a movie tuple from one entry of the IMDB calendar page.

    Arguments:
        tag {bs4.element.Tag} -- A tag of the IMDB calendar page
    Returns:
        tuple -- A tuple with the name, categories and cast of the movie;
            categories and cast are empty lists when their <ul> is missing
    """
    main_div = tag.find("div", {"class": "ipc-metadata-list-summary-item__c"})
    name = main_div.div.a.text
    # The two lists differ only in the "__tl" (categories) / "__stl" (cast)
    # class of the <ul>.
    ul_categories = main_div.find(
        "ul",
        {
            "class": "ipc-inline-list ipc-inline-list--show-dividers ipc-inline-list--no-wrap ipc-inline-list--inline ipc-metadata-list-summary-item__tl base"
        },
    )
    ul_cast = main_div.find(
        "ul",
        {
            "class": "ipc-inline-list ipc-inline-list--show-dividers ipc-inline-list--no-wrap ipc-inline-list--inline ipc-metadata-list-summary-item__stl base"
        },
    )
    # Either list may be absent for an entry; treat both the same way instead
    # of crashing on a missing categories list.
    categories = _li_span_texts(ul_categories)
    cast = _li_span_texts(ul_cast)
    return (name, categories, cast)
def parse_content_html(content):
    """Extract the movie entries from the HTML of the IMDB calendar page.

    Arguments:
        content {string} -- The content of the IMDB calendar page
    Returns:
        list -- The <li> tags found, one per "coming-soon-entry" item
    """
    entry_attrs = {
        "data-testid": "coming-soon-entry",
        "class": "ipc-metadata-list-summary-item ipc-metadata-list-summary-item--click sc-8c2b7f1f-0 bpqYIE",
    }
    document = BeautifulSoup(content, "html.parser")
    return document.find_all("li", entry_attrs)
def create_csv_movies_file(movies):
    """Create the file movies.csv with one row per movie.

    Categories and cast are each flattened into a single comma-joined cell.

    Arguments:
        movies {list} -- A list of (name, categories, cast) tuples
    """
    # newline="" lets the csv module control line endings itself; without it
    # every row is followed by a blank line on Windows.
    with open("movies.csv", "w", encoding="UTF-8", newline="") as file:
        writer = csv.writer(file, delimiter=",")
        writer.writerow(["name", "categories", "cast"])
        writer.writerows(
            [name, ",".join(categories), ",".join(cast)]
            for name, categories, cast in movies
        )
def create_json_movies_file(movies):
    """Create the file movies.json with one object per movie.

    Arguments:
        movies {list} -- A list of (name, categories, cast) tuples
    """
    dict_movies = [
        {"name": name, "categories": categories, "cast": cast}
        for name, categories, cast in movies
    ]
    with open("movies.json", "w", encoding="utf-8") as file:
        # The file is UTF-8, so keep accented characters readable instead of
        # writing them as \uXXXX escapes; the parsed data is identical.
        json.dump(dict_movies, file, indent=4, ensure_ascii=False)
def get_movies_from_tags(li_tags):
    """Build one movie per tag of the IMDB calendar page.

    Arguments:
        li_tags {list} -- A list with the tags of the movies
    Returns:
        list -- The result of create_movie for every tag, in the same order
    """
    return [create_movie(li_tag) for li_tag in li_tags]
def main():
    """Scrape the IMDB calendar and export the movies as CSV and JSON."""
    html = get_local_imdb_content()
    if html:
        movies = get_movies_from_tags(parse_content_html(html))
        create_csv_movies_file(movies)
        create_json_movies_file(movies)
        return None
    # Neither the local cache nor the download produced a page.
    print("No se pudo obtener el contenido de IMDB")
    return None


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment