Skip to content

Instantly share code, notes, and snippets.

@siemiatj
Created May 16, 2017 18:58
Show Gist options
  • Save siemiatj/7103e91c2a3a50c4516e7f4f2609a232 to your computer and use it in GitHub Desktop.
Save siemiatj/7103e91c2a3a50c4516e7f4f2609a232 to your computer and use it in GitHub Desktop.
import bs4 as bs
import lxml
import urllib.request
import csv
def get_html(url="http://www.imdb.com/chart/top?ref=ft_250"):
    """Download a page and return every anchor element it contains.

    Args:
        url: Address of the page to fetch. Defaults to the IMDb Top 250
            chart URL this script was written for.

    Returns:
        The result of ``find_all('a')`` on the page parsed with the
        ``lxml`` parser.

    Raises:
        urllib.error.URLError: If the page cannot be fetched.
    """
    # The context manager closes the HTTP response even if read() raises.
    with urllib.request.urlopen(url) as response:
        html = response.read()
    soup = bs.BeautifulSoup(html, "lxml")
    return soup.find_all('a')
def find_movies(anchors, limit=100):
    """Collect unique IMDb title IDs from anchors, in first-seen order.

    The ID is taken from each anchor's ``href`` value rather than from a
    fixed character offset in the tag's string form, so attribute order
    and ID length do not matter.

    Args:
        anchors: Iterable of objects exposing ``get('href')`` (for
            example the anchor tags returned by ``get_html``).
        limit: Maximum number of IDs to return. Defaults to 100.

    Returns:
        A list of ID strings such as ``"tt0111161"``, without duplicates.
    """
    import re  # local import: the file's import block does not provide re

    title_id = re.compile(r"title/(tt[0-9]+)")
    id_list = []
    seen = set()  # O(1) duplicate check; id_list keeps the order
    for anchor in anchors:
        if len(id_list) >= limit:
            break
        href = anchor.get('href')
        if not href:
            continue
        match = title_id.search(href)
        if match is None:
            continue
        movie_id = match.group(1)
        if movie_id not in seen:
            seen.add(movie_id)
            id_list.append(movie_id)
    return id_list
def get_movies_details(id_list):
    """Fetch one OMDb response per movie ID and return them as strings.

    Each response body is parsed with BeautifulSoup's ``lxml`` parser and
    the string form of its ``find_all("p")`` result is stored.

    Args:
        id_list: Iterable of ID strings appended to the OMDb ``?i=`` query.

    Returns:
        A list with one string per ID, in the same order as ``id_list``.

    Raises:
        urllib.error.URLError: If a request fails.
    """
    # NOTE(review): the OMDb service may require an ``apikey`` query
    # parameter; confirm against the current API documentation.
    address = "http://www.omdbapi.com/?i="
    list_of_movies = []
    for id_number in id_list:
        # Close each response promptly instead of leaking the connection.
        with urllib.request.urlopen(address + id_number) as response:
            html2 = response.read()
        soup2 = bs.BeautifulSoup(html2, "lxml")
        list_of_movies.append(str(soup2.find_all("p")))
    return list_of_movies
def _extract_between(text, first, last):
    """Return the part of ``text`` between ``first`` and the next ``last``."""
    try:
        start = text.index(first) + len(first)
        end = text.index(last, start)
    except ValueError:
        # Same exception type as a bare str.index failure, but the message
        # says which record could not be parsed.
        raise ValueError(
            "marker %r or %r not found in movie record %.80r"
            % (first, last, text)
        ) from None
    return text[start:end]


def _year_sort_key(year):
    """Return the leading integer of ``year`` (handles ranges like '1990–1995')."""
    import re  # local import: the file's import block does not provide re

    match = re.match(r"\s*([0-9]+)", year)
    if match is None:
        raise ValueError("year %r does not start with a number" % (year,))
    return int(match.group(1))


def get_movies_data(list_of_movies):
    """Extract ``[title, year]`` pairs and return them sorted by year.

    The title is the text between ``"Title":"`` and ``","Year":``; the
    year is the text between ``"Year":"`` and ``","Rated"``.

    Args:
        list_of_movies: Iterable of record strings containing those markers.

    Returns:
        A list of ``[title, year]`` lists (both strings) in ascending year
        order, preceded by the header row ``["title", "year"]``. Records
        with the same year keep their input order.

    Raises:
        ValueError: If a record lacks an expected marker or its year does
            not start with a number.
    """
    year_title = []
    for record in list_of_movies:
        title = _extract_between(record, '"Title":"', '","Year":')
        year = _extract_between(record, '"Year":"', '","Rated"')
        year_title.append([title, year])
    # sorted() is stable, so ties stay in input order.
    year_title = sorted(year_title, key=lambda row: _year_sort_key(row[1]))
    year_title.insert(0, ["title", "year"])
    return year_title
def save_data_to_file(movies_data, filename="movies.csv"):
    """Write rows to a comma-separated CSV file.

    Args:
        movies_data: Iterable of rows (each an iterable of fields) passed
            to ``csv.writer.writerows``.
        filename: Path of the file to create or overwrite. Defaults to
            ``movies.csv`` in the current working directory.
    """
    # newline='' lets the csv module control line endings; the explicit
    # encoding keeps non-ASCII titles from depending on the locale default.
    with open(filename, 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file, delimiter=',')
        writer.writerows(movies_data)
def main():
    """Run the pipeline: scrape anchors, look up details, parse, export."""
    anchors = get_html()
    details = get_movies_details(find_movies(anchors))
    save_data_to_file(get_movies_data(details))
    print("Done!")


# Run the pipeline only when executed as a script, not on import.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment