Created
May 16, 2017 18:58
-
-
Save siemiatj/7103e91c2a3a50c4516e7f4f2609a232 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv
import re
import urllib.request

import bs4 as bs
import lxml
def get_html():
    """Download the IMDb Top 250 chart page and return its anchor tags.

    Returns:
        The result of ``soup.find_all('a')`` on the parsed page, i.e. every
        ``<a>`` element BeautifulSoup finds in the document.

    Raises:
        urllib.error.URLError: If the page cannot be fetched.
    """
    url = "http://www.imdb.com/chart/top?ref=ft_250"
    # Use the response as a context manager so the connection is closed
    # deterministically instead of lingering until garbage collection.
    with urllib.request.urlopen(url) as response:
        html = response.read()
    soup = bs.BeautifulSoup(html, "lxml")
    return soup.find_all('a')
def find_movies(anchors, limit=100):
    """Collect unique IMDb title IDs from a sequence of anchor elements.

    Args:
        anchors: Iterable of anchor elements exposing ``get('href')``
            (for example the bs4 tags returned by ``get_html``).
        limit: Maximum number of IDs to collect. Defaults to 100, the
            value that used to be hard-coded.

    Returns:
        A list of distinct IDs such as ``"tt0111161"``, in the order in
        which they were first encountered.
    """
    # Read the ID from the href value itself rather than from a fixed
    # character offset in the tag's string form, so attribute order and
    # extra attributes on the tag cannot corrupt the result.
    # Only IDs beginning with "tt0" are accepted, matching the original
    # "title/tt0" filter.
    # NOTE(review): this skips titles whose ID starts with another digit
    # (e.g. "tt1375666") -- confirm that restriction is intended.
    id_pattern = re.compile(r"title/(tt0\d+)")

    id_list = []
    seen = set()  # O(1) duplicate detection; id_list keeps the order.
    for anchor in anchors:
        if len(id_list) >= limit:
            break
        href = anchor.get('href')
        if not href:
            continue
        match = id_pattern.search(href)
        if match is None:
            continue
        movie_id = match.group(1)
        if movie_id not in seen:
            seen.add(movie_id)
            id_list.append(movie_id)
    return id_list
def get_movies_details(id_list):
    """Fetch the OMDb record for every IMDb ID in *id_list*.

    Args:
        id_list: Iterable of IMDb ID strings; each one is appended to the
            OMDb ``?i=`` query URL.

    Returns:
        A list with one string per ID: the ``str()`` of the list of ``<p>``
        elements BeautifulSoup finds in the response body.

    Raises:
        urllib.error.URLError: If a request fails.
    """
    # NOTE(review): the public OMDb API may require an ``apikey`` query
    # parameter -- confirm before relying on this URL.
    address = "http://www.omdbapi.com/?i="
    list_of_movies = []
    for id_number in id_list:
        # The original referenced the misspelled name ``addres`` here, which
        # raised NameError on the first request.
        with urllib.request.urlopen(address + id_number) as response:
            html2 = response.read()
        # NOTE(review): the response is routed through an HTML parser;
        # presumably lxml wraps the raw body in a <p> element, which is why
        # the <p> tags are collected -- confirm.
        soup2 = bs.BeautifulSoup(html2, "lxml")
        list_of_movies.append(str(soup2.find_all("p")))
    return list_of_movies
def get_movies_data(list_of_movies):
    """Extract ``[title, year]`` pairs from raw movie records.

    Args:
        list_of_movies: Iterable of strings, each containing the fragments
            ``"Title":"...","Year":"...","Rated"``.

    Returns:
        A list whose first row is the header ``["title", "year"]``, followed
        by one ``[title, year]`` row per record, sorted by year ascending
        (records with the same year keep their input order).

    Raises:
        ValueError: If a marker is missing from a record, or a year is not
            an integer.
    """
    def _between(text, opening, closing):
        # Text strictly between `opening` and the next `closing` after it.
        begin = text.index(opening) + len(opening)
        finish = text.index(closing, begin)
        return text[begin:finish]

    rows = [
        [
            _between(record, '"Title":"', '","Year":'),
            _between(record, '"Year":"', '","Rated"'),
        ]
        for record in list_of_movies
    ]
    rows.sort(key=lambda row: int(row[1]))
    return [["title", "year"]] + rows
def save_data_to_file(movies_data):
    """Write *movies_data* to ``movies.csv`` in the current directory.

    Args:
        movies_data: Iterable of rows, each an iterable of field values,
            written in order with ``,`` as the delimiter.

    Raises:
        OSError: If the file cannot be opened or written.
    """
    # newline='' leaves line endings to the csv module; the explicit UTF-8
    # encoding keeps non-ASCII titles from failing on platforms whose
    # default encoding cannot represent them.
    with open('movies.csv', 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file, delimiter=',')
        # Write the rows passed in; the original referenced the undefined
        # name ``year_title`` and raised NameError.
        writer.writerows(movies_data)
def main():
    """Run the whole pipeline and report completion.

    Steps: fetch the chart's anchors, pull the movie IDs out of them, fetch
    each movie's details, reduce those to title/year rows, write the rows
    to disk, then print ``Done!``.
    """
    anchors = get_html()
    ids = find_movies(anchors)
    details = get_movies_details(ids)
    rows = get_movies_data(details)
    save_data_to_file(rows)
    print("Done!")


# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment