Skip to content

Instantly share code, notes, and snippets.

@amarynets
Created December 27, 2016 12:06
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save amarynets/8a04d4d73ff1c9c36be0f1862956dd18 to your computer and use it in GitHub Desktop.
Save amarynets/8a04d4d73ff1c9c36be0f1862956dd18 to your computer and use it in GitHub Desktop.
# -*- coding: utf-8 -*-
from urllib.parse import urljoin
import scrapy
from selenium import webdriver
from scraper.items import AdsItem
from ads.models import URL
class AdsSpider(scrapy.Spider):
    """Spider that collects anchors opening in a new tab (``target="_blank"``).

    Each crawled page is re-fetched through a Selenium-driven Chrome instance
    and the resulting page source replaces the response body before links are
    extracted. One ``AdsItem`` is yielded per matching anchor, carrying the
    anchor's ``href`` and the ``src`` of a directly nested ``<img>``.
    """

    name = "ads"
    # TODO: the start URLs were once meant to come from the ``URL`` model
    # (``URL.objects.all()``, using each row's ``name``); that code was
    # commented out and a single hard-coded URL is used instead.
    start_urls = ['http://www.technologynetworks.com/']

    def parse(self, response):
        """Yield an ``AdsItem`` for every ``target="_blank"`` anchor on the page.

        The response body is replaced with the page source obtained from
        ``chrome_request`` before the anchors are selected.

        Failures are handled best-effort: the error is printed and the
        generator ends without raising.
        """
        try:
            response = response.replace(body=self.chrome_request(response.url))
            # "//a" selects every anchor in the document; the previous
            # "..//a" climbed to the parent of the root just to descend again.
            link = response.xpath("//a")
            print("ALL link")
            external_link = self.get_all_external_href(link)
            print(external_link)
            for i in external_link:
                yield self._build_item(i)
        except Exception as e:
            # Deliberately not BaseException: KeyboardInterrupt, SystemExit and
            # the GeneratorExit raised when this generator is closed early
            # must propagate instead of being reported as a parse failure.
            print(e)
            print("Something wrong in function parse")

    def create(self, data):
        """Yield an ``AdsItem`` for each anchor selector in ``data``."""
        print("In creating")
        for i in data:
            yield self._build_item(i)

    def _build_item(self, a_tag):
        """Build (and print) the ``AdsItem`` for a single anchor selector."""
        print(a_tag.extract())
        item = AdsItem()
        item['href'] = a_tag.xpath("./@href").extract_first()
        # Only an <img> that is a direct child of the anchor is considered.
        item['img'] = a_tag.xpath("./img/@src").extract_first()
        print(item)
        return item

    def get_all_external_href(self, data):
        """Return a list of the anchors in ``data`` for which ``has_target`` is true."""
        return [i for i in data if self.has_target(i)]

    def chrome_request(self, url):
        """Load ``url`` in Chrome and return the page source as UTF-8 bytes.

        The browser is always shut down, even when loading the page fails,
        so a failed request does not leave a Chrome process behind.
        """
        driver = webdriver.Chrome()
        try:
            driver.get(url)
            return driver.page_source.encode('utf-8')
        finally:
            driver.quit()

    def has_target(self, a_tag):
        """Return True when the anchor's ``target`` attribute is ``_blank``."""
        return a_tag.xpath("./@target").extract_first() == '_blank'
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment