# haydarai/imdb.py

## imdb.py
# -*- coding: utf-8 -*-
import os
import scrapy
import unidecode
import re
import uuid
from dotenv import load_dotenv
from elasticsearch import Elasticsearch


class ImdbSpider(scrapy.Spider):
    """Crawl IMDb credit lists and actor pages, indexing 1980-1989 entries.

    ``parse`` handles a title's ``fullcredits/`` page: when the parsed year
    is in 1980-1989 it writes one document per row to the ``imdb_movies``
    index and follows each actor link.  ``parse_actor`` handles an actor
    page: it follows every row whose year is in 1980-1989 back to ``parse``
    and writes the actor to the ``imdb_actors`` index.

    NOTE(review): every XPath expression in this spider is an empty string
    and ``rows`` in ``parse`` is a literal ``[...]`` placeholder;
    presumably the real selectors were removed -- restore them before
    running.
    """

    name = 'imdb'
    allowed_domains = ['www.imdb.com']
    start_urls = ['https://www.imdb.com/title/tt0096463/fullcredits/']

    def __init__(self, *args, **kwargs):
        """Load ``.env`` settings and create the Elasticsearch client.

        Raises:
            KeyError: if any ``ELASTIC_API_*`` environment variable is unset.
        """
        super(ImdbSpider, self).__init__(*args, **kwargs)
        load_dotenv()

        self.es = Elasticsearch(
            host=os.environ['ELASTIC_API_URL_HOST'],
            scheme='https',
            port=os.environ['ELASTIC_API_URL_PORT'],
            http_auth=(os.environ['ELASTIC_API_USERNAME'],
                       os.environ['ELASTIC_API_PASSWORD']),
        )

    def parse(self, response):
        """Index the rows of a 1980-1989 title and follow each actor link.

        Args:
            response: Scrapy response for a title's ``fullcredits/`` page.

        Yields:
            Requests for the actor pages, handled by ``parse_actor``.
        """
        year_text = response.xpath("").extract()[0]
        year_text = year_text.replace(
            '\n', '').strip().replace('(', '').replace(')', '')
        # Only the int() conversion is guarded: the yields below must stay
        # outside any broad handler so GeneratorExit is never swallowed.
        try:
            movie_year = int(year_text[0:4])
        except ValueError:
            print('Failed to parse movie year.')
            return
        if not 1980 <= movie_year < 1990:
            return

        movie_fields = {
            'movie_id': '',
            'movie_name': '',
            'movie_year': movie_year,
        }

        # NOTE(review): literal placeholder kept from the original source;
        # the real row selector is not visible here.
        rows = [...]
        for row in rows:
            try:
                actor_url = self._index_cast_row(row, movie_fields)
            except Exception:
                # A malformed row or a failed write is reported with its
                # traceback and skipped; no request is made for that row.
                self.logger.exception(
                    'Failed to index cast row on %s', response.url)
                continue
            yield response.follow('https://www.imdb.com' + actor_url,
                                  callback=self.parse_actor)

    def _index_cast_row(self, row, movie_fields):
        """Write one row to ``imdb_movies`` and return the actor's link.

        Args:
            row: selector for a single row of the credits table.
            movie_fields: dict of the movie-level fields shared by all rows.

        Returns:
            The actor link exactly as extracted from the row.

        Raises:
            IndexError: if the actor name or link is missing, or the link
                has fewer than three ``/``-separated tokens.
        """
        actor_name = row.xpath('').extract()[0]
        actor_url = row.xpath('').extract()[0]
        # The id is the third '/'-separated token of the link; presumably
        # links look like '/name/<id>/...' -- confirm against the page.
        actor_id = actor_url.split('/')[2]

        role_name = ''
        primary = row.xpath('').extract()
        if primary:
            role_name = primary[0]
        else:
            fallback = row.xpath('').extract()
            if fallback:
                # Collapse runs of whitespace in the fallback text.
                role_name = re.sub(r'\s+', ' ', fallback[0]).strip()

        doc = dict(movie_fields)
        doc['actor_name'] = actor_name
        doc['actor_id'] = actor_id
        doc['role_name'] = role_name
        self.es.index(index='imdb_movies',
                      doc_type='movies',
                      id=uuid.uuid4(),
                      body=doc)
        return actor_url

    def parse_actor(self, response):
        """Follow an actor's 1980-1989 titles and index the actor.

        The actor document is identical for every row, so it is written
        once (after the first qualifying row) instead of once per row.

        Args:
            response: Scrapy response for an actor page.

        Yields:
            Requests for each qualifying title's ``fullcredits/`` page,
            handled by ``parse``.
        """
        actor_indexed = False
        rows = response.xpath("")
        for row in rows:
            year_text = row.xpath('').extract()[0]
            year_text = year_text.replace('\n', '').replace('\xa0', '')
            try:
                movie_year = int(year_text[0:4])
            except ValueError:
                print('Failed to parse movie year.')
                continue
            if not 1980 <= movie_year < 1990:
                continue

            try:
                movie_url = row.xpath('').extract()[0]
            except IndexError:
                self.logger.warning(
                    'Row without a title link on %s', response.url)
                continue
            yield response.follow(
                'https://www.imdb.com' + movie_url + 'fullcredits/',
                callback=self.parse)

            if not actor_indexed:
                # Retried on a later row only if this attempt failed.
                actor_indexed = self._index_actor(response)

    def _index_actor(self, response):
        """Write the actor document to ``imdb_actors``.

        Returns:
            True if the document was written, False if extraction or the
            Elasticsearch call failed (the failure is logged).
        """
        try:
            actor_name = response.xpath("").extract()[0]
            doc = {
                'actor_name': actor_name,
                'actor_horoscope': response.xpath("").extract()[0],
            }
            self.es.index(index='imdb_actors',
                          doc_type='actors',
                          id=actor_name,
                          body=doc)
        except Exception:
            self.logger.exception(
                'Failed to index actor from %s', response.url)
            return False
        return True
	# -*- coding: utf-8 -*-
	import os
	import scrapy
	import unidecode
	import re
	import uuid
	from dotenv import load_dotenv
	from elasticsearch import Elasticsearch


	class ImdbSpider(scrapy.Spider):
		"""Crawl IMDb credit lists and actor pages, indexing 1980-1989 entries.

		``parse`` handles a title's ``fullcredits/`` page: when the parsed year
		is in 1980-1989 it writes one document per row to the ``imdb_movies``
		index and follows each actor link.  ``parse_actor`` handles an actor
		page: it follows every row whose year is in 1980-1989 back to ``parse``
		and writes the actor to the ``imdb_actors`` index.

		NOTE(review): every XPath expression in this spider is an empty string
		and ``rows`` in ``parse`` is a literal ``[...]`` placeholder;
		presumably the real selectors were removed -- restore them before
		running.
		"""

		name = 'imdb'
		allowed_domains = ['www.imdb.com']
		start_urls = ['https://www.imdb.com/title/tt0096463/fullcredits/']

		def __init__(self, *args, **kwargs):
			"""Load ``.env`` settings and create the Elasticsearch client.

			Raises:
				KeyError: if any ``ELASTIC_API_*`` environment variable is unset.
			"""
			super(ImdbSpider, self).__init__(*args, **kwargs)
			load_dotenv()

			self.es = Elasticsearch(
				host=os.environ['ELASTIC_API_URL_HOST'],
				scheme='https',
				port=os.environ['ELASTIC_API_URL_PORT'],
				http_auth=(
					os.environ['ELASTIC_API_USERNAME'],
					os.environ['ELASTIC_API_PASSWORD'],
				),
			)

		def parse(self, response):
			"""Index the rows of a 1980-1989 title and follow each actor link.

			Args:
				response: Scrapy response for a title's ``fullcredits/`` page.

			Yields:
				Requests for the actor pages, handled by ``parse_actor``.
			"""
			year_text = response.xpath("").extract()[0]
			year_text = year_text.replace(
				'\n', '').strip().replace('(', '').replace(')', '')
			# Only the int() conversion is guarded: the yields below must stay
			# outside any broad handler so GeneratorExit is never swallowed.
			try:
				movie_year = int(year_text[0:4])
			except ValueError:
				print('Failed to parse movie year.')
				return
			if not 1980 <= movie_year < 1990:
				return

			movie_fields = {
				'movie_id': '',
				'movie_name': '',
				'movie_year': movie_year,
			}

			# NOTE(review): literal placeholder kept from the original source;
			# the real row selector is not visible here.
			rows = [...]
			for row in rows:
				try:
					actor_url = self._index_cast_row(row, movie_fields)
				except Exception:
					# A malformed row or a failed write is reported with its
					# traceback and skipped; no request is made for that row.
					self.logger.exception(
						'Failed to index cast row on %s', response.url)
					continue
				yield response.follow(
					'https://www.imdb.com' + actor_url,
					callback=self.parse_actor)

		def _index_cast_row(self, row, movie_fields):
			"""Write one row to ``imdb_movies`` and return the actor's link.

			Args:
				row: selector for a single row of the credits table.
				movie_fields: dict of the movie-level fields shared by all rows.

			Returns:
				The actor link exactly as extracted from the row.

			Raises:
				IndexError: if the actor name or link is missing, or the link
					has fewer than three ``/``-separated tokens.
			"""
			actor_name = row.xpath('').extract()[0]
			actor_url = row.xpath('').extract()[0]
			# The id is the third '/'-separated token of the link; presumably
			# links look like '/name/<id>/...' -- confirm against the page.
			actor_id = actor_url.split('/')[2]

			role_name = ''
			primary = row.xpath('').extract()
			if primary:
				role_name = primary[0]
			else:
				fallback = row.xpath('').extract()
				if fallback:
					# Collapse runs of whitespace in the fallback text.
					role_name = re.sub(r'\s+', ' ', fallback[0]).strip()

			doc = dict(movie_fields)
			doc['actor_name'] = actor_name
			doc['actor_id'] = actor_id
			doc['role_name'] = role_name
			self.es.index(
				index='imdb_movies',
				doc_type='movies',
				id=uuid.uuid4(),
				body=doc)
			return actor_url

		def parse_actor(self, response):
			"""Follow an actor's 1980-1989 titles and index the actor.

			The actor document is identical for every row, so it is written
			once (after the first qualifying row) instead of once per row.

			Args:
				response: Scrapy response for an actor page.

			Yields:
				Requests for each qualifying title's ``fullcredits/`` page,
				handled by ``parse``.
			"""
			actor_indexed = False
			rows = response.xpath("")
			for row in rows:
				year_text = row.xpath('').extract()[0]
				year_text = year_text.replace('\n', '').replace('\xa0', '')
				try:
					movie_year = int(year_text[0:4])
				except ValueError:
					print('Failed to parse movie year.')
					continue
				if not 1980 <= movie_year < 1990:
					continue

				try:
					movie_url = row.xpath('').extract()[0]
				except IndexError:
					self.logger.warning(
						'Row without a title link on %s', response.url)
					continue
				yield response.follow(
					'https://www.imdb.com' + movie_url + 'fullcredits/',
					callback=self.parse)

				if not actor_indexed:
					# Retried on a later row only if this attempt failed.
					actor_indexed = self._index_actor(response)

		def _index_actor(self, response):
			"""Write the actor document to ``imdb_actors``.

			Returns:
				True if the document was written, False if extraction or the
				Elasticsearch call failed (the failure is logged).
			"""
			try:
				actor_name = response.xpath("").extract()[0]
				doc = {
					'actor_name': actor_name,
					'actor_horoscope': response.xpath("").extract()[0],
				}
				self.es.index(
					index='imdb_actors',
					doc_type='actors',
					id=actor_name,
					body=doc)
			except Exception:
				self.logger.exception(
					'Failed to index actor from %s', response.url)
				return False
			return True