Created
April 24, 2019 16:09
-
-
Save haydarai/1ec01d6c9e5e666b61ef37acd5aef2c4 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
import os | |
import scrapy | |
import unidecode | |
import re | |
import uuid | |
from dotenv import load_dotenv | |
from elasticsearch import Elasticsearch | |
class ImdbSpider(scrapy.Spider):
    """Spider that walks IMDb cast lists and actor pages for 1980s titles.

    ``parse`` handles a title's full-credits page: when the title's year is
    in [1980, 1990) it indexes one document per cast row into the
    ``imdb_movies`` Elasticsearch index and follows each actor's page.
    ``parse_actor`` handles an actor page: for every listed title whose year
    is in the same range it follows that title's full-credits page, and it
    indexes the actor into ``imdb_actors`` once at least one such title is
    found.

    NOTE(review): every XPath expression in this copy of the file is an
    empty string, ``rows = [...]`` in ``parse`` is a placeholder, and
    ``movie_id`` / ``movie_name`` are empty strings. These look like values
    lost from the original source and have to be restored before the spider
    can extract anything; they are reproduced here unchanged.
    """

    name = 'imdb'
    allowed_domains = ['www.imdb.com']
    start_urls = ['https://www.imdb.com/title/tt0096463/fullcredits/']

    def __init__(self, *args, **kwargs):
        """Load ``.env`` settings and create the Elasticsearch client.

        Raises:
            KeyError: if any of the ``ELASTIC_API_*`` environment variables
                is missing.
        """
        super(ImdbSpider, self).__init__(*args, **kwargs)
        load_dotenv()
        host = os.environ['ELASTIC_API_URL_HOST']
        port = os.environ['ELASTIC_API_URL_PORT']
        username = os.environ['ELASTIC_API_USERNAME']
        password = os.environ['ELASTIC_API_PASSWORD']
        self.es = Elasticsearch(host=host,
                                scheme='https',
                                port=port,
                                http_auth=(username, password))

    def parse(self, response):
        """Index the cast of a 1980s title and follow every actor page.

        Yields one request per successfully indexed cast row, with
        ``parse_actor`` as the callback. Titles outside [1980, 1990), or
        whose year cannot be read, yield nothing.
        """
        year_texts = response.xpath("").extract()
        if not year_texts:
            # Previously an IndexError escaped the callback here.
            self.logger.warning('No movie year found on %s', response.url)
            return
        year_text = year_texts[0].replace(
            '\n', '').strip().replace('(', '').replace(')', '')
        try:
            movie_year = int(year_text[0:4])
        except ValueError:
            self.logger.warning('Failed to parse movie year.')
            return
        if not 1980 <= movie_year < 1990:
            return

        movie = {
            'movie_id': '',
            'movie_name': '',
            'movie_year': movie_year,
        }
        rows = [...]
        for row in rows:
            # Handle failures per row so one malformed cast entry (or one
            # failed index call) does not discard the rest of the cast.
            try:
                # Fresh dict per row: no state leaks between documents.
                data = dict(movie)
                data['actor_name'] = row.xpath('').extract()[0]
                actor_url = row.xpath('').extract()[0]
                data['actor_id'] = actor_url.split('/')[2]
                role_name = ''
                if row.xpath('').extract():
                    role_name = row.xpath('').extract()[0]
                elif row.xpath('').extract():
                    role_name = row.xpath('').extract()[0]
                # Collapse runs of whitespace inside the role text.
                data['role_name'] = re.sub(r'\s+', ' ', role_name).strip()
                self.es.index(index='imdb_movies',
                              doc_type='movies',
                              id=uuid.uuid4(),
                              body=data)
            except Exception:
                self.logger.exception(
                    'Failed to index cast row on %s', response.url)
                continue
            # The yield stays outside the try block so that closing the
            # generator (GeneratorExit) is never intercepted.
            yield response.follow('https://www.imdb.com' + actor_url,
                                  callback=self.parse_actor)

    def parse_actor(self, response):
        """Follow an actor's 1980s titles and index the actor.

        Yields one request per filmography row whose year is in
        [1980, 1990), with ``parse`` as the callback. Rows without a
        readable year or without a title link are skipped.
        """
        actor_indexed = False
        rows = response.xpath("")
        for row in rows:
            year_texts = row.xpath('').extract()
            if not year_texts:
                # Previously an IndexError here aborted the whole callback.
                continue
            year_text = year_texts[0].replace('\n', '').replace('\xa0', '')
            try:
                movie_year = int(year_text[0:4])
            except ValueError:
                self.logger.warning('Failed to parse movie year.')
                continue
            if not 1980 <= movie_year < 1990:
                continue

            movie_links = row.xpath('').extract()
            if not movie_links:
                self.logger.warning(
                    'No movie link found in a row on %s', response.url)
                continue
            movie_url = 'https://www.imdb.com' + movie_links[0] + 'fullcredits/'
            yield response.follow(movie_url, callback=self.parse)

            # The actor document uses the actor name as its id, so sending
            # it again for every matching row only repeats the same write.
            # Index once, retrying on a later row if the attempt failed.
            if not actor_indexed:
                actor_indexed = self._index_actor(response)

    def _index_actor(self, response):
        """Index the actor shown on ``response``; return True on success."""
        try:
            data = {}
            data['actor_name'] = response.xpath("").extract()[0]
            data['actor_horoscope'] = response.xpath("").extract()[0]
            self.es.index(index='imdb_actors',
                          doc_type='actors',
                          id=data['actor_name'],
                          body=data)
        except Exception:
            self.logger.exception(
                'Failed to index actor from %s', response.url)
            return False
        return True
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment