-
-
Save amarynets/8a04d4d73ff1c9c36be0f1862956dd18 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
from urllib.parse import urljoin | |
import scrapy | |
from selenium import webdriver | |
from scraper.items import AdsItem | |
from ads.models import URL | |
class AdsSpider(scrapy.Spider):
    """Spider that collects ``target="_blank"`` links from rendered pages.

    Each start URL is re-fetched through a Selenium-driven Chrome instance,
    and the resulting page source replaces the body Scrapy downloaded.
    Anchors whose ``target`` attribute is ``_blank`` are treated as external
    links and emitted as ``AdsItem`` objects carrying the anchor's ``href``
    and the ``src`` of a direct child ``<img>`` (``None`` when absent).
    """

    name = "ads"
    # urls = URL.objects.all()
    start_urls = ['http://www.technologynetworks.com/']
    # # for i in urls:
    # #     start_urls.append(i.name)

    def parse(self, response):
        """Yield an ``AdsItem`` for every ``target="_blank"`` anchor on the page.

        The page is loaded a second time via ``chrome_request`` and the
        response body is swapped for that page source before querying.
        Any ``Exception`` raised along the way is logged with its traceback
        and ends the generator; nothing is re-raised, so one failing page
        does not abort the crawl.
        """
        try:
            response = response.replace(body=self.chrome_request(response.url))
            link = response.xpath("..//a")
            self.logger.debug("ALL link")
            external_link = self.get_all_external_href(link)
            # url = URL.objects.get(name=response.url)
            self.logger.debug("%s", external_link)
            for a_tag in external_link:
                yield self._build_item(a_tag)
        except Exception:
            # Exception rather than BaseException: KeyboardInterrupt,
            # SystemExit and GeneratorExit must propagate normally.
            self.logger.exception("Something wrong in function parse")

    def create(self, data):
        """Yield an ``AdsItem`` for each anchor selector in ``data``."""
        self.logger.debug("In creating")
        for a_tag in data:
            yield self._build_item(a_tag)

    def _build_item(self, a_tag):
        """Build one ``AdsItem`` from an anchor selector.

        ``href`` is the anchor's own attribute; ``img`` is the ``src`` of an
        ``<img>`` that is a direct child of the anchor. Either value is
        ``None`` when the attribute or element is missing.
        """
        self.logger.debug("%s", a_tag.extract())
        item = AdsItem()
        item['href'] = a_tag.xpath("./@href").extract_first()
        item['img'] = a_tag.xpath("./img/@src").extract_first()
        self.logger.debug("%s", item)
        return item

    def get_all_external_href(self, data):
        """Return the list of selectors in ``data`` that pass ``has_target``."""
        return [a_tag for a_tag in data if self.has_target(a_tag)]

    def chrome_request(self, url):
        """Load ``url`` in Chrome and return the page source as UTF-8 bytes.

        A new ``webdriver.Chrome`` instance is started for every call and is
        always shut down, even when navigation or reading the source fails.
        """
        driver = webdriver.Chrome()
        try:
            driver.get(url)
            return driver.page_source.encode('utf-8')
        finally:
            # Guarantee the browser process is released on every path.
            driver.quit()

    def has_target(self, a_tag):
        """Return ``True`` when the anchor's ``target`` attribute is ``_blank``."""
        return a_tag.xpath("./@target").extract_first() == '_blank'
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment