Skip to content

Instantly share code, notes, and snippets.

@vladimirmyshkovski
Created September 13, 2020 09:26
Show Gist options
  • Save vladimirmyshkovski/b9db886bef98a90dfea857727407b78b to your computer and use it in GitHub Desktop.
Save vladimirmyshkovski/b9db886bef98a90dfea857727407b78b to your computer and use it in GitHub Desktop.
Freelansim Scrapy spider
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy.utils.markup import remove_tags
from freelance.scraper.scraper.items import OrderItem
class FreelansimSpider(CrawlSpider):
    """Crawl the freelance.habr.com task listing and yield one item per task.

    Links are taken from the task titles on the listing page; every followed
    task page is handed to :meth:`parse_item`.
    """

    name = "freelansim"
    allowed_domains = ["freelance.habr.com"]
    start_urls = ["https://freelance.habr.com/tasks"]
    rules = [
        Rule(
            link_extractor=LinkExtractor(
                allow=("tasks/",),
                restrict_xpaths="//div[@class='task__title']//a",
                canonicalize=True,
                unique=True,
            ),
            follow=True,
            callback="parse_item",
        )
    ]

    @staticmethod
    def _parse_rub_amount(text):
        """Return the rouble amount found in *text* as an int, or None.

        Only the part before the first "." is inspected, and it must contain
        "руб". Any whitespace inside the number (regular, non-breaking or
        thin spaces used as thousands separators) is ignored. None is
        returned when no plain integer can be read.
        """
        price = text.partition(".")[0]
        if "руб" not in price:
            return None
        # str.split() with no argument splits on every Unicode whitespace
        # character, so this also drops non-breaking spaces.
        digits = "".join(price.split("руб")[0].split())
        return int(digits) if digits.isdigit() else None

    def parse_item(self, response):
        """Build an ``OrderItem`` from a single task page and yield it.

        ``amount``/``currency`` default to ``1``/``"USD"``. They are replaced
        with the parsed value and ``"RUB"`` only when the page has no
        ``negotiated_price`` span and the finance block holds a readable
        rouble price.
        """
        description = "//div[@class='task__description']"

        item = OrderItem()

        # extract_first() avoids an IndexError on a page without <title>.
        page_title = response.xpath("//title//text()").extract_first(default="")
        item["title"] = page_title.split(" — ")[0].strip()

        item["text"] = "".join(
            chunk.strip()
            for chunk in response.xpath(description + "//text()").extract()
        )
        item["amount"] = 1
        item["currency"] = "USD"
        item["original_url"] = response.url

        # Direct children only: each serialized child already contains its
        # own subtree, so selecting all descendants would repeat nested markup.
        item["html"] = "".join(response.xpath(description + "/node()").extract())

        stripped_tags = (
            tag.strip()
            for tag in response.xpath(
                "//a[@class='tags__item_link']//text()"
            ).extract()
        )
        item["tags"] = [tag for tag in stripped_tags if tag]

        negotiated = response.xpath(
            "//span[@class='negotiated_price']//text()"
        ).extract()
        if not negotiated:
            finance = response.xpath(
                "//div[@class='task__finance']//span//text()"
            ).extract_first(default="")
            amount = self._parse_rub_amount(finance)
            if amount is not None:
                item["amount"] = amount
                item["currency"] = "RUB"
            elif "руб" in finance:
                # Keep the defaults, but leave a trace instead of failing.
                self.logger.warning(
                    "Could not parse rouble price %r on %s", finance, response.url
                )

        yield item
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment