rafnixg/scraper.py

## scraper.py
"""Scrapper IMDB Calendar Mexico"""
import csv
import json
import requests
from bs4 import BeautifulSoup

# IMDb calendar page to scrape; the query string restricts it to region MX.
URL = "https://www.imdb.com/calendar/?region=MX"

"""
1.- Get the HTML markup
    - If the HTML file does not exist locally, create it.
    - If the HTML file exists locally, read its content.
2.- Extract the information
    - Name
    - Categories
    - Cast
3.- Generate a CSV file and a JSON file
"""


def get_imdb_content(timeout=10):
    """Download the IMDB calendar page.

    Arguments:
        timeout {float} -- Seconds to wait for the server before giving up
            (default: 10). Without a timeout the request could block
            indefinitely.
    Returns:
        string -- The content of the IMDB calendar page
        None -- If the request failed (network error, timeout) or the
            response status code was not 200
    """
    # A browser-like User-Agent is sent with the request.
    headers = {"User-Agent": "Mozilla/5.0"}
    try:
        response = requests.get(URL, headers=headers, timeout=timeout)
    except requests.RequestException:
        # Connection problems and timeouts are reported the same way as a
        # non-200 answer, so callers only have to check for None.
        return None

    if response.status_code == 200:
        return response.text
    return None


def create_imdb_file_local(content):
    """Save the IMDB calendar page to the local file ``imdb.html``.

    Writing is best-effort: a failure is printed but never raised.

    Arguments:
        content {string} -- The content of the IMDB calendar page. When it is
            None or empty nothing is written, so a failed download does not
            leave an empty file behind.
    """
    if not content:
        return

    try:
        with open("imdb.html", "w", encoding="UTF-8") as file:
            file.write(content)
    except (OSError, UnicodeError) as error:
        # The local copy is only a convenience; report the problem and go on.
        print("Could not write imdb.html:", error)


def get_imdb_file_local():
    """Read the locally stored IMDB calendar page from ``imdb.html``.

    Returns:
        string -- The content of the local file
        None -- If the file does not exist, cannot be read, or is not valid
            UTF-8
    """
    try:
        with open("imdb.html", "r", encoding="UTF-8") as file:
            return file.read()
    except (OSError, UnicodeDecodeError):
        # Only I/O and decoding problems mean "no usable local copy";
        # anything else (e.g. KeyboardInterrupt) is allowed to propagate.
        return None


def get_local_imdb_content():
    """Return the IMDB calendar page, preferring the local file.

    The local file is used when it has content; otherwise the page is
    downloaded and, if the download produced content, stored locally.

    Returns:
        string -- The content of the IMDB calendar page
        None -- If there is no local copy and the download failed
    """
    content = get_imdb_file_local()
    if content:
        return content

    content = get_imdb_content()
    if content:
        # Only store real content; a failed download must not be written.
        create_imdb_file_local(content)

    return content


def create_movie(tag):
    """Create a movie tuple from one entry tag of the IMDB calendar page.

    Arguments:
        tag {bs4.element.Tag} -- A tag of the IMDB calendar page
    Returns:
        tuple -- (name, categories, cast); categories and cast are lists of
            strings and are empty when the entry has no such list
    Raises:
        AttributeError -- If the entry has no summary container or no
            title link to read the name from
    """
    main_div = tag.find("div", {"class": "ipc-metadata-list-summary-item__c"})

    name = main_div.div.a.text

    # The two inline lists differ only in the "__tl" (categories) and
    # "__stl" (cast) class.
    ul_categories = main_div.find(
        "ul",
        {
            "class": "ipc-inline-list ipc-inline-list--show-dividers ipc-inline-list--no-wrap ipc-inline-list--inline ipc-metadata-list-summary-item__tl base"
        },
    )
    ul_cast = main_div.find(
        "ul",
        {
            "class": "ipc-inline-list ipc-inline-list--show-dividers ipc-inline-list--no-wrap ipc-inline-list--inline ipc-metadata-list-summary-item__stl base"
        },
    )

    # Either list may be missing from an entry; treat that as "no items"
    # for both of them instead of failing on the categories only.
    categories = (
        [item.span.text for item in ul_categories.find_all("li")]
        if ul_categories is not None
        else []
    )
    cast = (
        [item.span.text for item in ul_cast.find_all("li")]
        if ul_cast is not None
        else []
    )

    return (name, categories, cast)


def parse_content_html(content):
    """Parse the content of the IMDB calendar page.

    Arguments:
        content {string} -- The content of the IMDB calendar page
    Returns:
        list -- A list with the <li> tags of the movies
    """
    soup = BeautifulSoup(content, "html.parser")

    # Entries are selected by their data-testid alone. The class attribute
    # of these items carries generated-looking names ("sc-8c2b7f1f-0 bpqYIE"),
    # and requiring the exact class string would match nothing as soon as
    # those names change.
    return soup.find_all("li", {"data-testid": "coming-soon-entry"})


def create_csv_movies_file(movies):
    """Create the CSV file ``movies.csv`` with the movies.

    The file has a header row followed by one row per movie; categories and
    cast are each joined into a single comma-separated field.

    Arguments:
        movies {list} -- A list of (name, categories, cast) tuples
    """
    # newline="" hands line-ending control to the csv module; without it
    # an extra carriage return is written on platforms that translate "\n".
    with open("movies.csv", "w", encoding="UTF-8", newline="") as file:
        writer = csv.writer(file, delimiter=",")
        writer.writerow(["name", "categories", "cast"])
        writer.writerows(
            [movie[0], ",".join(movie[1]), ",".join(movie[2])] for movie in movies
        )

def create_json_movies_file(movies):
    """Create the JSON file ``movies.json`` with the movies.

    Arguments:
        movies {list} -- A list of (name, categories, cast) tuples
    """
    dict_movies = [
        {"name": movie[0], "categories": movie[1], "cast": movie[2]}
        for movie in movies
    ]
    with open("movies.json", "w", encoding="utf-8") as file:
        # The file is UTF-8, so keep accented characters as they are
        # instead of having them written as \uXXXX escapes.
        json.dump(dict_movies, file, indent=4, ensure_ascii=False)


def get_movies_from_tags(li_tags):
    """Convert the entry tags of the IMDB calendar page into movies.

    Arguments:
        li_tags {list} -- A list with the tags of the movies
    Returns:
        list -- One create_movie() result per tag, in the same order
    """
    return [create_movie(entry) for entry in li_tags]


def main():
    """Run the scraper from start to finish.

    Loads the calendar HTML (local copy or download), extracts the movies
    and writes them to the CSV and JSON files. Prints a message and stops
    when no HTML could be obtained.
    """
    html = get_local_imdb_content()

    if html:
        movies = get_movies_from_tags(parse_content_html(html))
        create_csv_movies_file(movies)
        create_json_movies_file(movies)
        return None

    print("No se pudo obtener el contenido de IMDB")
    return None


# Run the scraper only when this file is executed as a script.
if __name__ == "__main__":
    main()
	"""Scrapper IMDB Calendar Mexico"""
	import csv
	import json
	import requests
	from bs4 import BeautifulSoup

	# IMDb calendar page to scrape; the query string restricts it to region MX.
	URL = "https://www.imdb.com/calendar/?region=MX"

	"""
	1.- Get the HTML markup
	    - If the HTML file does not exist locally, create it.
	    - If the HTML file exists locally, read its content.
	2.- Extract the information
	    - Name
	    - Categories
	    - Cast
	3.- Generate a CSV file and a JSON file
	"""


	def get_imdb_content(timeout=10):
	    """Download the IMDB calendar page.

	    Arguments:
	        timeout {float} -- Seconds to wait for the server before giving up
	            (default: 10). Without a timeout the request could block
	            indefinitely.
	    Returns:
	        string -- The content of the IMDB calendar page
	        None -- If the request failed (network error, timeout) or the
	            response status code was not 200
	    """
	    # A browser-like User-Agent is sent with the request.
	    headers = {"User-Agent": "Mozilla/5.0"}
	    try:
	        response = requests.get(URL, headers=headers, timeout=timeout)
	    except requests.RequestException:
	        # Connection problems and timeouts are reported the same way as a
	        # non-200 answer, so callers only have to check for None.
	        return None

	    if response.status_code == 200:
	        return response.text
	    return None


	def create_imdb_file_local(content):
	    """Save the IMDB calendar page to the local file ``imdb.html``.

	    Writing is best-effort: a failure is printed but never raised.

	    Arguments:
	        content {string} -- The content of the IMDB calendar page. When it is
	            None or empty nothing is written, so a failed download does not
	            leave an empty file behind.
	    """
	    if not content:
	        return

	    try:
	        with open("imdb.html", "w", encoding="UTF-8") as file:
	            file.write(content)
	    except (OSError, UnicodeError) as error:
	        # The local copy is only a convenience; report the problem and go on.
	        print("Could not write imdb.html:", error)


	def get_imdb_file_local():
	    """Read the locally stored IMDB calendar page from ``imdb.html``.

	    Returns:
	        string -- The content of the local file
	        None -- If the file does not exist, cannot be read, or is not valid
	            UTF-8
	    """
	    try:
	        with open("imdb.html", "r", encoding="UTF-8") as file:
	            return file.read()
	    except (OSError, UnicodeDecodeError):
	        # Only I/O and decoding problems mean "no usable local copy";
	        # anything else (e.g. KeyboardInterrupt) is allowed to propagate.
	        return None


	def get_local_imdb_content():
	    """Return the IMDB calendar page, preferring the local file.

	    The local file is used when it has content; otherwise the page is
	    downloaded and, if the download produced content, stored locally.

	    Returns:
	        string -- The content of the IMDB calendar page
	        None -- If there is no local copy and the download failed
	    """
	    content = get_imdb_file_local()
	    if content:
	        return content

	    content = get_imdb_content()
	    if content:
	        # Only store real content; a failed download must not be written.
	        create_imdb_file_local(content)

	    return content


	def create_movie(tag):
	    """Create a movie tuple from one entry tag of the IMDB calendar page.

	    Arguments:
	        tag {bs4.element.Tag} -- A tag of the IMDB calendar page
	    Returns:
	        tuple -- (name, categories, cast); categories and cast are lists of
	            strings and are empty when the entry has no such list
	    Raises:
	        AttributeError -- If the entry has no summary container or no
	            title link to read the name from
	    """
	    main_div = tag.find("div", {"class": "ipc-metadata-list-summary-item__c"})

	    name = main_div.div.a.text

	    # The two inline lists differ only in the "__tl" (categories) and
	    # "__stl" (cast) class.
	    ul_categories = main_div.find(
	        "ul",
	        {
	            "class": "ipc-inline-list ipc-inline-list--show-dividers ipc-inline-list--no-wrap ipc-inline-list--inline ipc-metadata-list-summary-item__tl base"
	        },
	    )
	    ul_cast = main_div.find(
	        "ul",
	        {
	            "class": "ipc-inline-list ipc-inline-list--show-dividers ipc-inline-list--no-wrap ipc-inline-list--inline ipc-metadata-list-summary-item__stl base"
	        },
	    )

	    # Either list may be missing from an entry; treat that as "no items"
	    # for both of them instead of failing on the categories only.
	    categories = (
	        [item.span.text for item in ul_categories.find_all("li")]
	        if ul_categories is not None
	        else []
	    )
	    cast = (
	        [item.span.text for item in ul_cast.find_all("li")]
	        if ul_cast is not None
	        else []
	    )

	    return (name, categories, cast)


	def parse_content_html(content):
	    """Parse the content of the IMDB calendar page.

	    Arguments:
	        content {string} -- The content of the IMDB calendar page
	    Returns:
	        list -- A list with the <li> tags of the movies
	    """
	    soup = BeautifulSoup(content, "html.parser")

	    # Entries are selected by their data-testid alone. The class attribute
	    # of these items carries generated-looking names ("sc-8c2b7f1f-0 bpqYIE"),
	    # and requiring the exact class string would match nothing as soon as
	    # those names change.
	    return soup.find_all("li", {"data-testid": "coming-soon-entry"})


	def create_csv_movies_file(movies):
	    """Create the CSV file ``movies.csv`` with the movies.

	    The file has a header row followed by one row per movie; categories and
	    cast are each joined into a single comma-separated field.

	    Arguments:
	        movies {list} -- A list of (name, categories, cast) tuples
	    """
	    # newline="" hands line-ending control to the csv module; without it
	    # an extra carriage return is written on platforms that translate "\n".
	    with open("movies.csv", "w", encoding="UTF-8", newline="") as file:
	        writer = csv.writer(file, delimiter=",")
	        writer.writerow(["name", "categories", "cast"])
	        writer.writerows(
	            [movie[0], ",".join(movie[1]), ",".join(movie[2])] for movie in movies
	        )

	def create_json_movies_file(movies):
	    """Create the JSON file ``movies.json`` with the movies.

	    Arguments:
	        movies {list} -- A list of (name, categories, cast) tuples
	    """
	    dict_movies = [
	        {"name": movie[0], "categories": movie[1], "cast": movie[2]}
	        for movie in movies
	    ]
	    with open("movies.json", "w", encoding="utf-8") as file:
	        # The file is UTF-8, so keep accented characters as they are
	        # instead of having them written as \uXXXX escapes.
	        json.dump(dict_movies, file, indent=4, ensure_ascii=False)


	def get_movies_from_tags(li_tags):
	    """Convert the entry tags of the IMDB calendar page into movies.

	    Arguments:
	        li_tags {list} -- A list with the tags of the movies
	    Returns:
	        list -- One create_movie() result per tag, in the same order
	    """
	    return [create_movie(entry) for entry in li_tags]


	def main():
	    """Run the scraper from start to finish.

	    Loads the calendar HTML (local copy or download), extracts the movies
	    and writes them to the CSV and JSON files. Prints a message and stops
	    when no HTML could be obtained.
	    """
	    content = get_local_imdb_content()

	    if not content:
	        print("No se pudo obtener el contenido de IMDB")
	        return None

	    li_tags = parse_content_html(content)
	    movies = get_movies_from_tags(li_tags)

	    create_csv_movies_file(movies)
	    create_json_movies_file(movies)
	    return None


	# Run the scraper only when this file is executed as a script.
	if __name__ == "__main__":
	    main()