# gopigof/YIFY_Scraper.py

## YIFY_Scraper.py
import requests
import bs4

base_url = 'http://www.yify-movies.com'


def parse_link(soup_object):
    """Collect details for every movie listed on a search-results page.

    Every ``<div>`` whose class list contains ``c6`` is treated as one movie
    entry.  The page behind the entry's second link is downloaded and scraped
    for the magnet link, the genre names, the IMDB link and every ``<b>``
    label together with the node that follows it.

    Args:
        soup_object: Parsed search-results page; only its ``find_all``
            method is used here.

    Returns:
        dict: Maps each movie title to a dict with the keys ``'Name'``,
        ``'Magnet Link'``, ``'IMDB Link'`` plus one key per ``<b>`` label
        found on the detail page.  The value stored under the first label is
        replaced by the list of genre names.  ``'Magnet Link'`` and
        ``'IMDB Link'`` are ``None`` when the corresponding element is
        missing.  The result is empty when no ``c6`` div is present.
    """
    movies = {}

    # Tag.get('class') is None for a <div> without a class attribute, so fall
    # back to an empty list before the membership test.
    movie_entries = [
        div for div in soup_object.find_all('div')
        if 'c6' in (div.get('class') or [])
    ]

    for entry in movie_entries:
        name = entry.find_all('h2')[0].get_text()

        # The second anchor of an entry points at the movie's detail page.
        detail_url = entry.find_all('a')[1].get('href')
        if detail_url.startswith('/'):
            detail_url = base_url + detail_url
        detail_page = bs4.BeautifulSoup(requests.get(detail_url).text, 'lxml')

        # Either lookup may come back empty; report None instead of crashing.
        wish_list = detail_page.find('div', {'class': 'wish-list'})
        magnet_anchor = wish_list.find('a') if wish_list is not None else None
        magnet_link = magnet_anchor.get('href') if magnet_anchor is not None else None

        anchors = detail_page.find_all('a')
        genre = [
            anchor.get_text() for anchor in anchors
            if str(anchor.get('href')).startswith('/genre/')
        ]
        # The IMDB link is taken to be the last anchor on the detail page.
        imdb_link = anchors[-1].get('href') if anchors else None

        # Mutable [label, value] pairs: the first value is swapped for the
        # genre list below, which is impossible with tuples.
        details = [[bold.text, bold.next_sibling] for bold in detail_page.find_all('b')]
        if details:
            details[0][1] = genre

        movie = {'Name': name, 'Magnet Link': magnet_link, 'IMDB Link': imdb_link}
        for label, value in details:
            movie[label] = value
        movies[name] = movie

    return movies


def main():
    """Scrape the 1080p search listing and print the collected movies.

    Downloads ``page_limit`` result pages using the ``'seed/'`` crawl option,
    runs ``parse_link`` on each page and prints the list of per-page result
    dictionaries.
    """
    page_limit = 1
    crawl_option = ['', 'seed/', 'peer/', 'az/', 'za/']  # Time is default if unmentioned
    url = 'http://www.yify-movies.net/search/1080p/'
    listing_url = url + crawl_option[1]
    movie_list = []

    # The first page is requested without a page number; later pages append
    # theirs.  Counting from 1 keeps the first request on the un-numbered URL
    # (counting from 0 asked for a page "0" first).
    # NOTE(review): assumes the site numbers result pages from 1 - confirm.
    for page in range(1, page_limit + 1):
        print('Reached')
        page_url = listing_url if page == 1 else listing_url + str(page)
        soup_object = bs4.BeautifulSoup(requests.get(page_url).text, 'lxml')
        movie_list.append(parse_link(soup_object))

    print(movie_list)


# Run the scraper only when this file is executed as a script.
if __name__ == "__main__":
    main()
	import requests
	import bs4

	base_url = 'http://www.yify-movies.com'


	def parse_link(soup_object):
	    """Collect details for every movie listed on a search-results page.

	    Every ``<div>`` whose class list contains ``c6`` is treated as one movie
	    entry.  The page behind the entry's second link is downloaded and
	    scraped for the magnet link, the genre names, the IMDB link and every
	    ``<b>`` label together with the node that follows it.

	    Args:
	        soup_object: Parsed search-results page; only its ``find_all``
	            method is used here.

	    Returns:
	        dict: Maps each movie title to a dict with the keys ``'Name'``,
	        ``'Magnet Link'``, ``'IMDB Link'`` plus one key per ``<b>`` label
	        found on the detail page.  The value stored under the first label
	        is replaced by the list of genre names.  ``'Magnet Link'`` and
	        ``'IMDB Link'`` are ``None`` when the corresponding element is
	        missing.  The result is empty when no ``c6`` div is present.
	    """
	    movies = {}

	    # Tag.get('class') is None for a <div> without a class attribute, so
	    # fall back to an empty list before the membership test.
	    movie_entries = [
	        div for div in soup_object.find_all('div')
	        if 'c6' in (div.get('class') or [])
	    ]

	    for entry in movie_entries:
	        name = entry.find_all('h2')[0].get_text()

	        # The second anchor of an entry points at the movie's detail page.
	        detail_url = entry.find_all('a')[1].get('href')
	        if detail_url.startswith('/'):
	            detail_url = base_url + detail_url
	        detail_page = bs4.BeautifulSoup(requests.get(detail_url).text, 'lxml')

	        # Either lookup may come back empty; report None instead of crashing.
	        wish_list = detail_page.find('div', {'class': 'wish-list'})
	        magnet_anchor = wish_list.find('a') if wish_list is not None else None
	        magnet_link = magnet_anchor.get('href') if magnet_anchor is not None else None

	        anchors = detail_page.find_all('a')
	        genre = [
	            anchor.get_text() for anchor in anchors
	            if str(anchor.get('href')).startswith('/genre/')
	        ]
	        # The IMDB link is taken to be the last anchor on the detail page.
	        imdb_link = anchors[-1].get('href') if anchors else None

	        # Mutable [label, value] pairs: the first value is swapped for the
	        # genre list below, which is impossible with tuples.
	        details = [[bold.text, bold.next_sibling] for bold in detail_page.find_all('b')]
	        if details:
	            details[0][1] = genre

	        movie = {'Name': name, 'Magnet Link': magnet_link, 'IMDB Link': imdb_link}
	        for label, value in details:
	            movie[label] = value
	        movies[name] = movie

	    return movies


	def main():
	    """Scrape the 1080p search listing and print the collected movies.

	    Downloads ``page_limit`` result pages using the ``'seed/'`` crawl
	    option, runs ``parse_link`` on each page and prints the list of
	    per-page result dictionaries.
	    """
	    page_limit = 1
	    crawl_option = ['', 'seed/', 'peer/', 'az/', 'za/']  # Time is default if unmentioned
	    url = 'http://www.yify-movies.net/search/1080p/'
	    listing_url = url + crawl_option[1]
	    movie_list = []

	    # The first page is requested without a page number; later pages append
	    # theirs.  Counting from 1 keeps the first request on the un-numbered
	    # URL (counting from 0 asked for a page "0" first).
	    # NOTE(review): assumes the site numbers result pages from 1 - confirm.
	    for page in range(1, page_limit + 1):
	        print('Reached')
	        page_url = listing_url if page == 1 else listing_url + str(page)
	        soup_object = bs4.BeautifulSoup(requests.get(page_url).text, 'lxml')
	        movie_list.append(parse_link(soup_object))

	    print(movie_list)


	# Run the scraper only when this file is executed as a script.
	if __name__ == '__main__':
	    main()