Skip to content

Instantly share code, notes, and snippets.

View clemfromspace's full-sized avatar
🎯
Focusing

Clément Denoix clemfromspace

🎯
Focusing
View GitHub Profile
@clemfromspace
clemfromspace / puppeteer-middleware.py
Last active June 26, 2021 08:02
Scrapy with Puppeteer
"""This module contains the ``SeleniumMiddleware`` scrapy middleware"""
import asyncio
from pyppeteer import launch
from scrapy import signals
from scrapy.http import HtmlResponse
from twisted.internet import defer
from .http import PuppeteerRequest
@clemfromspace
clemfromspace / tasks.py
Created November 24, 2017 04:34
Running a scrapy spider from a celery task
from billiard.context import Process
from scrapy.crawler import Crawler
from scrapy import signals
from scrapy.utils.project import get_project_settings
from twisted.internet import reactor
from celery_app import app
class CrawlerProcess(Process):
# -*- coding: utf-8 -*-
import re
import dateparser
from scrapy import Request
from scrapy.spider import CrawlSpider
from urlparse import urljoin
import re
from scrapy import Request
from scrapy.spider import CrawlSpider
from urlparse import urljoin
class TripAdvisorSpider(CrawlSpider):
@clemfromspace
clemfromspace / items.py
Last active March 10, 2018 17:02
Scrapy Wikipedia URL
import scrapy
class LinkItem(scrapy.Item):
    """Scrapy item exposing a single ``href`` field."""

    # NOTE(review): presumably the target URL of a scraped link -- confirm
    # against the spider that populates this item.
    href = scrapy.Field()