Skip to content

Instantly share code, notes, and snippets.

@haydarai
Created April 24, 2019 16:09
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save haydarai/1ec01d6c9e5e666b61ef37acd5aef2c4 to your computer and use it in GitHub Desktop.
Save haydarai/1ec01d6c9e5e666b61ef37acd5aef2c4 to your computer and use it in GitHub Desktop.
# -*- coding: utf-8 -*-
import os
import scrapy
import unidecode
import re
import uuid
from dotenv import load_dotenv
from elasticsearch import Elasticsearch
class ImdbSpider(scrapy.Spider):
    """Crawl IMDb cast lists and actor pages, indexing 1980s data into Elasticsearch.

    Starting from one title's full-credits page, ``parse`` indexes one
    document per cast row into the ``imdb_movies`` index and follows each
    actor link; ``parse_actor`` indexes the actor into ``imdb_actors`` and
    follows the full-credits page of every filmography row dated 1980-1989.

    NOTE(review): every XPath expression in this class is an empty string and
    the cast-row selector in ``parse`` is the literal ``[...]``. These look
    like placeholders left by whatever produced this copy of the file; the
    real selectors must be restored before the spider can extract anything.
    """

    name = 'imdb'
    allowed_domains = ['www.imdb.com']
    start_urls = ['https://www.imdb.com/title/tt0096463/fullcredits/']

    def __init__(self, *args, **kwargs):
        """Create the Elasticsearch client from environment settings.

        Calls ``load_dotenv()`` and then reads ``ELASTIC_API_URL_HOST``,
        ``ELASTIC_API_URL_PORT``, ``ELASTIC_API_USERNAME`` and
        ``ELASTIC_API_PASSWORD`` from ``os.environ``.

        Raises:
            KeyError: if any of the four variables is not set.
        """
        super(ImdbSpider, self).__init__(*args, **kwargs)
        load_dotenv()
        ELASTIC_API_URL_HOST = os.environ['ELASTIC_API_URL_HOST']
        ELASTIC_API_URL_PORT = os.environ['ELASTIC_API_URL_PORT']
        ELASTIC_API_USERNAME = os.environ['ELASTIC_API_USERNAME']
        ELASTIC_API_PASSWORD = os.environ['ELASTIC_API_PASSWORD']
        self.es = Elasticsearch(host=ELASTIC_API_URL_HOST,
                                scheme='https',
                                port=ELASTIC_API_URL_PORT,
                                http_auth=(ELASTIC_API_USERNAME,
                                           ELASTIC_API_PASSWORD))

    @staticmethod
    def _leading_year(text):
        """Return the first four characters of *text* as an int, or None."""
        try:
            return int(text[0:4])
        except ValueError:
            return None

    @staticmethod
    def _in_target_decade(year):
        """Return True when *year* falls in 1980-1989 inclusive."""
        return 1980 <= year < 1990

    def _index_document(self, index, doc_type, doc_id, body):
        """Index one document, logging (not raising) if the request fails.

        The crawl is best-effort: a failed write must not stop the remaining
        rows of the page from being processed. ``except Exception`` is used
        deliberately so that ``GeneratorExit`` is never swallowed.
        """
        try:
            self.es.index(index=index,
                          doc_type=doc_type,
                          id=doc_id,
                          body=body)
        except Exception:
            self.logger.exception('Failed to index document into %s.', index)

    def parse(self, response):
        """Index the cast of a 1980s title and follow every actor link.

        Pages whose year cannot be read, or whose year is outside 1980-1989,
        produce nothing. Cast rows that lack an actor name or link are
        skipped individually instead of aborting the rest of the page.

        Yields:
            One request per usable cast row, handled by ``parse_actor``.
        """
        year_text = response.xpath("").extract_first()
        if year_text is None:
            self.logger.warning('Failed to parse movie year.')
            return
        year_text = year_text.replace(
            '\n', '').strip().replace('(', '').replace(')', '')
        movie_year = self._leading_year(year_text)
        if movie_year is None:
            self.logger.warning('Failed to parse movie year.')
            return
        if not self._in_target_decade(movie_year):
            return

        # NOTE(review): `[...]` is a placeholder in this copy of the file --
        # presumably a response.xpath(...) selecting the cast rows. Restore it.
        rows = [...]
        for row in rows:
            actor_name = row.xpath('').extract_first()
            actor_url = row.xpath('').extract_first()
            if actor_name is None or actor_url is None:
                # Row carries no actor (e.g. a non-cast row); skip only it.
                continue

            # The actor id is taken from the third '/'-separated URL token.
            url_parts = actor_url.split('/')
            if len(url_parts) < 3:
                self.logger.warning('Unexpected actor URL: %s', actor_url)
                continue

            # Two candidate selectors for the role; fall back to ''.
            role_name = row.xpath('').extract_first()
            if role_name is None:
                role_name = row.xpath('').extract_first(default='')
            role_name = re.sub(r'\s+', ' ', role_name).strip()

            # A fresh dict per row; movie_id / movie_name are stored empty.
            data = {
                'movie_id': '',
                'movie_name': '',
                'movie_year': movie_year,
                'actor_name': actor_name,
                'actor_id': url_parts[2],
                'role_name': role_name,
            }
            self._index_document('imdb_movies', 'movies', uuid.uuid4(), data)

            yield response.follow('https://www.imdb.com' + actor_url,
                                  callback=self.parse_actor)

    def parse_actor(self, response):
        """Follow an actor's 1980s titles and index the actor once.

        Each filmography row dated 1980-1989 yields a request for that
        title's full-credits page. The actor document is written only if at
        least one such row exists, and only once per page, because its
        fields are page-level and do not change between rows.

        Yields:
            One request per matching filmography row, handled by ``parse``.
        """
        actor_handled = False
        for row in response.xpath(""):
            year_text = row.xpath('').extract_first()
            if year_text is None:
                continue
            year_text = year_text.replace('\n', '').replace('\xa0', '')
            movie_year = self._leading_year(year_text)
            if movie_year is None:
                self.logger.warning('Failed to parse movie year.')
                continue
            if not self._in_target_decade(movie_year):
                continue

            movie_url = row.xpath('').extract_first()
            if movie_url is None:
                self.logger.warning('Failed to parse movie URL.')
                continue
            yield response.follow(
                'https://www.imdb.com' + movie_url + 'fullcredits/',
                callback=self.parse)

            if not actor_handled:
                actor_handled = True
                self._index_actor(response)

    def _index_actor(self, response):
        """Extract the actor's name and horoscope and index them by name."""
        actor_name = response.xpath("").extract_first()
        actor_horoscope = response.xpath("").extract_first()
        if actor_name is None or actor_horoscope is None:
            self.logger.warning('Failed to parse actor details.')
            return
        data = {
            'actor_name': actor_name,
            'actor_horoscope': actor_horoscope,
        }
        self._index_document('imdb_actors', 'actors', actor_name, data)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment