-
-
Save renefs/a10a3e9f17b30edf431619ddcc629f2e to your computer and use it in GitHub Desktop.
#! /usr/bin/python3 | |
# Author: Pablo Baeyens | |
# Usage: | |
# ./faScrap.py -h | |
# for usage info and options | |
import time | |
import argparse | |
import requests | |
import csv | |
import bs4 | |
from datetime import datetime | |
import platform | |
import locale | |
def set_locale(lang):
    """Set the process-wide locale (LC_ALL) for the requested language.

    Several spellings of the locale name are tried in turn, because the
    accepted names differ between platforms.  A candidate only counts as
    working when ``locale.getlocale`` can parse the result afterwards:
    ``datetime.strptime`` consults ``getlocale`` internally and fails with
    ``ValueError: unknown locale: ...`` when it cannot.

    Args:
        lang: "es" for Spanish; any other value selects US English.

    Raises:
        locale.Error: if the platform is not recognised or none of the
            candidate locales is usable.  The previous locale is restored
            before raising.
    """
    system = platform.system()
    language = "es" if lang == "es" else "en"

    if system in {"Linux", "Darwin"}:
        # The original ".utf8" spelling is kept first; ".UTF-8" is added as
        # an alternative spelling (the form macOS is expected to need).
        candidates = {
            "es": ("es_ES.utf8", "es_ES.UTF-8"),
            "en": ("en_US.utf8", "en_US.UTF-8"),
        }[language]
    elif system == "Windows":
        # The short "xx-YY" names can be set but may not be parseable by
        # locale.getlocale(); the long names are the fallback.
        candidates = {
            "es": ("es-ES", "Spanish_Spain"),
            "en": ("en-US", "English_United States"),
        }[language]
    else:
        raise locale.Error("unsupported platform: {}".format(system))

    previous = locale.setlocale(locale.LC_ALL)
    for candidate in candidates:
        try:
            locale.setlocale(locale.LC_ALL, candidate)
            # Validate now instead of failing later inside strptime().
            locale.getlocale(locale.LC_TIME)
        except (locale.Error, ValueError):
            continue
        return

    # Nothing worked: undo any partially applied candidate.
    locale.setlocale(locale.LC_ALL, previous)
    raise locale.Error(
        "no usable locale among: {}".format(", ".join(candidates)))
# Month names as they appear in the rating-date headers, indexed by month - 1.
_MONTH_NAMES = {
    "en": ("january", "february", "march", "april", "may", "june", "july",
           "august", "september", "october", "november", "december"),
    "es": ("enero", "febrero", "marzo", "abril", "mayo", "junio", "julio",
           "agosto", "septiembre", "octubre", "noviembre", "diciembre"),
}


def get_date(tag, lang):
    """Get the rating date from a header tag, formatted as YYYY-MM-DD.

    The header text is expected to be "Votada el día: <d> de <month> de <Y>"
    for ``lang == "es"`` and "Rated on <Month> <d>, <Y>" otherwise.  Month
    names are resolved from a built-in table so that parsing does not depend
    on the process locale; if that fails, the locale-aware ``strptime``
    format is used as a fallback.

    Args:
        tag: object whose ``string`` attribute holds the header text.
        lang: "es" for the Spanish format; anything else for English.

    Returns:
        The date as a "YYYY-MM-DD" string.

    Raises:
        ValueError: if the text matches neither parsing strategy.
    """
    if lang == "es":
        prefix, fmt, months = ("Votada el día: ", "%d de %B de %Y",
                               _MONTH_NAMES["es"])
    else:
        prefix, fmt, months = "Rated on ", "%B %d, %Y", _MONTH_NAMES["en"]

    date_str = tag.string[len(prefix):].strip()
    try:
        if lang == "es":
            day, month, year = date_str.split(" de ")
        else:
            month, day_and_year = date_str.split(" ", 1)
            day, year = day_and_year.split(",")
        # tuple.index raises ValueError for an unknown month name.
        month_number = months.index(month.strip().lower()) + 1
        fecha = datetime(int(year), month_number, int(day)).date()
    except ValueError:
        # Unexpected layout or month spelling: defer to the locale.
        fecha = datetime.strptime(date_str, fmt).date()
    return fecha.strftime("%Y-%m-%d")
def get_directors(tag):
    """Get the directors of a film as a single comma-separated string.

    Names are read from the ``title`` attribute of the link inside every
    ``nb`` element of the first ``mc-director`` element.  A trailing
    "(Creator)" marker is removed from each name.

    Args:
        tag: the rating entry; must provide ``find_all(class_=...)``.

    Returns:
        The names joined with ", ", or "" when the entry has no
        ``mc-director`` element.
    """
    director_blocks = tag.find_all(class_="mc-director")
    if not director_blocks:
        return ""

    suffix = "(Creator)"
    names = []
    for entry in director_blocks[0].find_all(class_="nb"):
        name = entry.a["title"]
        if name.endswith(suffix):
            # Drop the marker and the whitespace that precedes it.
            name = name[:-len(suffix)].rstrip()
        names.append(name)
    return ", ".join(names)
def is_film(tag, lang):
    """Tell whether a rating entry is a film.

    The title is taken from the link inside the first ``mc-title`` element.
    An entry is rejected when its title ends with one of the language's
    non-film markers: TV series, TV miniseries, TV, or the "(C)"/"(S)"
    marker (presumably short films).

    Args:
        tag: the rating entry; must provide ``find_all(class_=...)``.
        lang: "es" for the Spanish markers; anything else for English.

    Returns:
        True if the title carries none of the markers, False otherwise.
    """
    title_tag = tag.find_all(class_="mc-title")[0]
    title = title_tag.a.string.strip()

    if lang == "es":
        non_film_suffixes = (
            "(Serie de TV)", "(Miniserie de TV)", "(TV)", "(C)")
    else:
        non_film_suffixes = (
            "(TV Series)", "(TV Miniseries)", "(TV)", "(S)")

    for suffix in non_film_suffixes:
        if title.endswith(suffix):
            return False
    return True
def _film_from_tag(tag, watched_date):
    """Build the export row (a dict) for one rated-film entry."""
    title = tag.find_all(class_="mc-title")[0].a
    rating_text = tag.find_all(class_="ur-mr-rat")[0].string
    return {
        "Title": title.string.strip(),
        # The sibling text is the year wrapped in one character per side.
        "Year": title.next_sibling.strip()[1:-1],
        "Directors": get_directors(tag),
        "WatchedDate": watched_date,
        # The site rates out of 10; halve it for the second rating column.
        "Rating": int(rating_text) / 2,
        "Rating10": rating_text,
    }


def get_data(user_id, lang):
    """Get the list of films rated by a FilmAffinity user.

    Result pages are requested one after another, starting at page 1, until
    a request does not return HTTP 200 or a page contains no rating entries.
    Entries that ``is_film`` rejects are skipped.

    Args:
        user_id: the FilmAffinity user id, inserted into the URL.
        lang: site language segment of the URL; also selects how dates and
            titles are interpreted.

    Returns:
        A list of dicts with the keys "Title", "Year", "Directors",
        "WatchedDate", "Rating" and "Rating10".

    Raises:
        ValueError: if a date header or a rating cannot be parsed.
        requests.exceptions.RequestException: on network failure or timeout.
    """
    data = []
    url_template = ("https://www.filmaffinity.com/" + lang +
                    "/userratings.php?user_id={id}&p={n}&orderby=4")
    # Kept across pages so that entries at the top of a page which belong to
    # a date header on the previous page still get a date.
    cur_date = None
    n = 1
    while True:
        # A timeout keeps a stalled connection from hanging the script.
        response = requests.get(
            url_template.format(id=user_id, n=n), timeout=30)
        if response.status_code != 200:
            break
        response.encoding = "utf-8"
        page = bs4.BeautifulSoup(response.text, "lxml")
        tags = page.find_all(
            class_=["user-ratings-header", "user-ratings-movie"])
        if not tags:
            # An empty page with status 200 would otherwise loop forever.
            break
        for tag in tags:
            if "user-ratings-header" in tag["class"]:
                cur_date = get_date(tag, lang)
            elif is_film(tag, lang):
                data.append(_film_from_tag(tag, cur_date))
        print("Página {n}".format(n=n), end="\r")
        n += 1

    print("Página {n}. Download complete!".format(n=n - 1))
    return data
def save_to_csv(data, filename):
    """Save a list of dictionaries to a CSV file encoded as UTF-8.

    The keys of the first dictionary become the header row.  Every row is
    written in header order; a key missing from a later dictionary is
    written as an empty field.

    Args:
        data: list of dicts, one per film.  May be empty, in which case an
            empty file is written.
        filename: path of the file to create or overwrite.
    """
    # An explicit encoding avoids UnicodeEncodeError on platforms whose
    # default text encoding cannot represent every title.
    with open(filename, "w", newline="", encoding="utf-8") as csvfile:
        if not data:
            return
        header = list(data[0])
        writer = csv.writer(csvfile)
        writer.writerow(header)
        writer.writerows(
            [film.get(key, "") for key in header] for film in data)
if __name__ == "__main__":
    # Command-line entry point: parse the options, set the locale, download
    # the user's ratings and write them to a CSV file.
    parser = argparse.ArgumentParser(
        description=
        "Generates csv compatible with LetterBoxd from Filmaffinity user's id.")
    parser.add_argument("id", help="User's id")
    parser.add_argument(
        "--csv", nargs=1, help="Name of export FILE", metavar="FILE")
    parser.add_argument(
        "--lang",
        nargs=1,
        help="Language for exporting",
        metavar="LANG",
        default=["en"],
        choices={"es", "en"})
    args = parser.parse_args()

    # nargs=1 makes argparse store single-element lists.
    lang = args.lang[0]
    if args.csv:
        export_file = args.csv[0]
    else:
        export_file = "filmAffinity_{lang}_{id}.csv".format(
            id=args.id, lang=lang)

    try:
        set_locale(lang)
    except locale.Error:
        print(
            "Could not set locale for '{lang}' and UTF-8 encoding.".format(
                lang=lang))
        manual_locale = input("locale (empty for default): ").strip()
        if manual_locale:
            try:
                locale.setlocale(locale.LC_ALL, manual_locale)
            except locale.Error as e:
                print(e)
                raise SystemExit(1)

    try:
        # Use the language chosen with --lang rather than a fixed "en".
        data = get_data(args.id, lang)
    except ValueError as v:
        print("Error:", v)
        raise SystemExit(1)

    if not data:
        # Nothing to export: avoid writing a file without even a header.
        print("No ratings found for user {id}.".format(id=args.id))
        raise SystemExit(1)

    save_to_csv(data, export_file)
beautifulsoup4==4.7.1 | |
bs4==0.0.1 | |
certifi==2019.3.9 | |
chardet==3.0.4 | |
idna==2.8 | |
lxml==4.3.3 | |
requests==2.21.0 | |
soupsieve==1.9.1 | |
urllib3==1.24.2 |
Title | Year | Directors | WatchedDate | Rating | Rating10 | |
---|---|---|---|---|---|---|
Avengers: Infinity War | 2018 | Anthony Russo, Joe Russo | 2019-04-28 | 4.5 | 9 | |
Avengers: Endgame | 2019 | Anthony Russo, Joe Russo | 2019-04-28 | 4.5 | 9 | |
Saw 2 | 2005 | Darren Lynn Bousman | 2006-03-12 | 3.5 | 7 | |
Munich | 2005 | Steven Spielberg | 2006-03-12 | 2.0 | 4 | |
Jarhead | 2005 | Sam Mendes | 2006-03-12 | 4.0 | 8 | |
The Chronicles of Narnia: The Lion, The Witch and the Wardrobe | 2005 | Andrew Adamson | 2006-03-12 | 3.0 | 6 | |
Torrente 3 | 2005 | Santiago Segura | 2006-03-12 | 2.0 | 4 | |
War of the Worlds | 2005 | Steven Spielberg | 2006-03-12 | 2.5 | 5 | |
Star Wars: Episode III Revenge of the Sith | 2005 | George Lucas | 2006-03-12 | 4.5 | 9 | |
King Kong | 2005 | Peter Jackson | 2006-03-12 | 1.5 | 3 | |
Mars Attacks! | 1996 | Tim Burton | 2006-03-12 | 4.5 | 9 | |
The Matrix Revolutions | 2003 | Lilly Wachowski, Lana Wachowski | 2006-03-12 | 4.0 | 8 | |
Night Watch | 2004 | Timur Bekmambetov | 2006-03-12 | 1.0 | 2 | |
Kill Bill: Volume 1 | 2003 | Quentin Tarantino | 2006-03-12 | 3.5 | 7 | |
The Faculty | 1998 | Robert Rodriguez | 2006-03-12 | 3.0 | 6 | |
The Village | 2004 | M. Night Shyamalan | 2006-03-12 | 2.5 | 5 | |
There's Something About Mary | 1998 | Peter Farrelly, Bobby Farrelly | 2006-03-12 | 3.5 | 7 | |
Twelve Monkeys | 1995 | Terry Gilliam | 2006-03-12 | 3.0 | 6 | |
Dumb and Dumber (Dumb & Dumber) | 1994 | Peter Farrelly, Bobby Farrelly | 2006-03-12 | 3.0 | 6 | |
A Clockwork Orange | 1971 | Stanley Kubrick | 2006-03-12 | 5.0 | 10 | |
Elephant | 2003 | Gus Van Sant | 2006-03-12 | 2.0 | 4 | |
The Blair Witch Project | 1999 | Daniel Myrick, Eduardo Sánchez | 2006-03-12 | 4.0 | 8 | |
Moulin Rouge | 2001 | Baz Luhrmann | 2006-03-12 | 1.5 | 3 |
It throws this error:
Error: unknown locale: en-US
:(
Do you know what is the problem ?
Thanks for this code :)
Which line? Can you print the stack trace?
try:
data = get_data(args.id, "en")
print(data)
except ValueError as v:
167 print("Error:", v)
exit()
Line 167
The code only runs these 2 lines and stops at line 167.
https://www.filmaffinity.com/en/userratings.php?user_id={id}&p={n}&orderby=4
https://www.filmaffinity.com/en/userratings.php?user_id=731957&p=1&orderby=4
Error: unknown locale: en-US
Hi, I just fixed this error, but I have another one:
UnicodeEncodeError: 'charmap' codec can't encode character '\u014d' in position 36: character maps to <undefined>
UnicodeEncodeError: 'charmap' codec can't encode character '\u014c' in position 21: character maps to <undefined>
I don't know why, but the line "locale.setlocale(locale.LC_ALL, loc)" on line 28 wasn't working properly.
Hi, I just fixed the other error. Try changing line 123 to this:
with open(filename, 'w', newline='', encoding='utf8') as csvfile:
Now I have the .csv file, but it doesn't have a table format.
Thanks for your code, man :)
Thanks for the script! It worked like a charm.
I had only an issue when I tried to install the requirements.txt
that you specified. After installing the dependencies manually, though, it worked just fine. I guess that some of the versions are not compatible with my system (I'm on an M1 Pro Mac).
What is the point of asking for language in the parameters if it is then not used?
try:
data = get_data(args.id, "en")
The code only run this 2 lines and stops in line 167 https://www.filmaffinity.com/en/userratings.php?user_id={id}&p={n}&orderby=4 https://www.filmaffinity.com/en/userratings.php?user_id=731957&p=1&orderby=4 Error: unknown locale: en-US
I am getting the exact same error
Run with
python main.py <FILM_AFFINITY_ID> --csv result.csv