vladimirmyshkovski/freelansim.py

## freelansim.py
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy.utils.markup import remove_tags

from freelance.scraper.scraper.items import OrderItem


class FreelansimSpider(CrawlSpider):
    """Crawl the freelance.habr.com task listing and emit an ``OrderItem`` per task.

    Starting from ``start_urls``, links located under
    ``//div[@class='task__title']//a`` whose URL matches ``tasks/`` are
    followed and passed to :meth:`parse_item`.
    """

    name = "freelansim"
    allowed_domains = ["freelance.habr.com"]
    start_urls = ["https://freelance.habr.com/tasks"]

    rules = [
        Rule(
            link_extractor=LinkExtractor(
                allow=("tasks/",),
                restrict_xpaths="//div[@class='task__title']//a",
                canonicalize=True,
                unique=True,
            ),
            follow=True,
            callback="parse_item",
        )
    ]

    @staticmethod
    def _parse_rub_amount(price_text):
        """Return the integer written before "руб" in *price_text*, or ``None``.

        ``None`` is returned when the text is missing, does not contain
        "руб", or when what precedes "руб" is not a plain run of digits.
        """
        if not price_text:
            return None
        head, marker, _ = price_text.partition("руб")
        if not marker:
            return None
        # str.split() with no argument splits on every Unicode whitespace
        # character, so digit groups separated by non-breaking or thin
        # spaces are joined as well as those separated by plain spaces.
        digits = "".join(head.split())
        return int(digits) if digits.isdecimal() else None

    def parse_item(self, response):
        """Build an ``OrderItem`` from one task page.

        ``amount`` and ``currency`` start as ``1`` / ``"USD"``. They are
        replaced by a rouble price only when the page has no text inside a
        ``negotiated_price`` span and the first text node under
        ``//div[@class='task__finance']//span`` parses as a rouble amount.
        An unparsable or missing price leaves the defaults in place instead
        of raising and losing the whole item.

        Args:
            response: The Scrapy response for the task page.

        Yields:
            The populated ``OrderItem``.
        """
        description_xpath = "//div[@class='task__description']"

        item = OrderItem()
        item["title"] = (
            response.xpath("//title//text()")[0].extract().split(" — ")[0].strip()
        )
        item["text"] = "".join(
            chunk.strip()
            for chunk in response.xpath(description_xpath + "//text()").extract()
        )
        item["amount"] = 1
        item["currency"] = "USD"
        item["original_url"] = response.url
        item["html"] = "".join(
            response.xpath(description_xpath + "//node()").extract()
        )
        stripped_tags = (
            tag.strip()
            for tag in response.xpath(
                "//a[@class='tags__item_link']//text()"
            ).extract()
        )
        item["tags"] = [tag for tag in stripped_tags if tag]

        negotiated = response.xpath(
            "//span[@class='negotiated_price']//text()"
        ).extract()
        if not negotiated:
            # extract_first() returns None when nothing matches, where
            # indexing the extracted list would raise IndexError.
            price_text = response.xpath(
                "//div[@class='task__finance']//span//text()"
            ).extract_first()
            rub_amount = self._parse_rub_amount(price_text)
            if rub_amount is not None:
                item["amount"] = rub_amount
                item["currency"] = "RUB"

        yield item
	from scrapy.linkextractors import LinkExtractor
	from scrapy.spiders import CrawlSpider, Rule
	from scrapy.utils.markup import remove_tags

	from freelance.scraper.scraper.items import OrderItem


	class FreelansimSpider(CrawlSpider):
	    """Crawl the freelance.habr.com task listing and emit an ``OrderItem`` per task.

	    Starting from ``start_urls``, links located under
	    ``//div[@class='task__title']//a`` whose URL matches ``tasks/`` are
	    followed and passed to :meth:`parse_item`.
	    """

	    name = "freelansim"
	    allowed_domains = ["freelance.habr.com"]
	    start_urls = ["https://freelance.habr.com/tasks"]

	    rules = [
	        Rule(
	            link_extractor=LinkExtractor(
	                allow=("tasks/",),
	                restrict_xpaths="//div[@class='task__title']//a",
	                canonicalize=True,
	                unique=True,
	            ),
	            follow=True,
	            callback="parse_item",
	        )
	    ]

	    @staticmethod
	    def _parse_rub_amount(price_text):
	        """Return the integer written before "руб" in *price_text*, or ``None``.

	        ``None`` is returned when the text is missing, does not contain
	        "руб", or when what precedes "руб" is not a plain run of digits.
	        """
	        if not price_text:
	            return None
	        head, marker, _ = price_text.partition("руб")
	        if not marker:
	            return None
	        # str.split() with no argument splits on every Unicode whitespace
	        # character, so digit groups separated by non-breaking or thin
	        # spaces are joined as well as those separated by plain spaces.
	        digits = "".join(head.split())
	        return int(digits) if digits.isdecimal() else None

	    def parse_item(self, response):
	        """Build an ``OrderItem`` from one task page.

	        ``amount`` and ``currency`` start as ``1`` / ``"USD"``. They are
	        replaced by a rouble price only when the page has no text inside a
	        ``negotiated_price`` span and the first text node under
	        ``//div[@class='task__finance']//span`` parses as a rouble amount.
	        An unparsable or missing price leaves the defaults in place instead
	        of raising and losing the whole item.

	        Args:
	            response: The Scrapy response for the task page.

	        Yields:
	            The populated ``OrderItem``.
	        """
	        description_xpath = "//div[@class='task__description']"

	        item = OrderItem()
	        item["title"] = (
	            response.xpath("//title//text()")[0].extract().split(" — ")[0].strip()
	        )
	        item["text"] = "".join(
	            chunk.strip()
	            for chunk in response.xpath(description_xpath + "//text()").extract()
	        )
	        item["amount"] = 1
	        item["currency"] = "USD"
	        item["original_url"] = response.url
	        item["html"] = "".join(
	            response.xpath(description_xpath + "//node()").extract()
	        )
	        stripped_tags = (
	            tag.strip()
	            for tag in response.xpath(
	                "//a[@class='tags__item_link']//text()"
	            ).extract()
	        )
	        item["tags"] = [tag for tag in stripped_tags if tag]

	        negotiated = response.xpath(
	            "//span[@class='negotiated_price']//text()"
	        ).extract()
	        if not negotiated:
	            # extract_first() returns None when nothing matches, where
	            # indexing the extracted list would raise IndexError.
	            price_text = response.xpath(
	                "//div[@class='task__finance']//span//text()"
	            ).extract_first()
	            rub_amount = self._parse_rub_amount(price_text)
	            if rub_amount is not None:
	                item["amount"] = rub_amount
	                item["currency"] = "RUB"

	        yield item