Skip to content

Instantly share code, notes, and snippets.

View clemfromspace's full-sized avatar
🎯
Focusing

Clément Denoix clemfromspace

🎯
Focusing
View GitHub Profile
@clemfromspace
clemfromspace / items.py
Last active March 10, 2018 17:02
Scrapy Wikipedia URL
import scrapy
class LinkItem(scrapy.Item):
href = scrapy.Field()
import re
from scrapy import Request
from scrapy.spider import CrawlSpider
from urlparse import urljoin
class TripAdvisorSpider(CrawlSpider):
# -*- coding: utf-8 -*-
import re
import dateparser
from scrapy import Request
from scrapy.spider import CrawlSpider
from urlparse import urljoin
@clemfromspace
clemfromspace / tasks.py
Created November 24, 2017 04:34
Running a scrapy spider from a celery task
from billiard.context import Process
from scrapy.crawler import Crawler
from scrapy import signals
from scrapy.utils.project import get_project_settings
from twisted.internet import reactor
from celery_app import app
class CrawlerProcess(Process):
@clemfromspace
clemfromspace / puppeteer-middleware.py
Last active June 26, 2021 08:02
Scrapy with Puppeteer
"""This module contains the ``SeleniumMiddleware`` scrapy middleware"""
import asyncio
from pyppeteer import launch
from scrapy import signals
from scrapy.http import HtmlResponse
from twisted.internet import defer
from .http import PuppeteerRequest