@jluczak
jluczak / spider-research.py
Created August 23, 2017 08:44
Scrape bios from Facebook Research
import scrapy
from research.items import ResearchItem

class ResearchSpider(scrapy.Spider):
    start_urls = [
        'https://research.fb.com/people/'
    ]
    name = 'research'
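The snippet stops before the parse callback. A minimal sketch of one possible implementation, assuming ResearchItem defines name and bio fields (the CSS selectors are illustrative, not taken from the actual page):

    def parse(self, response):
        # Assumed markup: each researcher sits in its own container element.
        for person in response.css('div.person'):
            item = ResearchItem()
            item['name'] = person.css('h3::text').extract_first()
            item['bio'] = person.css('p::text').extract_first()
            yield item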
@jluczak
jluczak / items.py
Created August 21, 2017 11:27
Scrape companies from Crossweb
import scrapy

class CrosswebItem(scrapy.Item):
    name = scrapy.Field()
    city = scrapy.Field()
    topics = scrapy.Field()
    file_urls = scrapy.Field()
    files = scrapy.Field()
    description = scrapy.Field()
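file_urls and files are the two field names Scrapy's FilesPipeline looks for when downloading attachments. A hedged sketch of a spider populating this item (module path, listing URL and selectors are assumptions, not part of the gist):

import scrapy
from crossweb.items import CrosswebItem  # assumed project module

class CrosswebSpider(scrapy.Spider):
    name = 'crossweb'
    start_urls = ['https://crossweb.pl/wydarzenia/']  # placeholder listing URL

    def parse(self, response):
        for event in response.css('li.event'):  # assumed selector
            item = CrosswebItem()
            item['name'] = event.css('a::attr(title)').extract_first()
            item['city'] = event.css('.city::text').extract_first()
            yield item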
@jluczak
jluczak / spider_pycon.py
Created August 10, 2017 08:25
Parse PyCon speakers
import scrapy
from pycon.items import PyconItem

class HumanSpider(scrapy.Spider):
    start_urls = [
        'http://pyvideo.org/speakers.html'
    ]
    name = 'pycon'
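A possible parse method for the speakers index, assuming PyconItem has a name field; the selector is a guess at pyvideo's markup, not verified against the gist:

    def parse(self, response):
        # Assumed markup: speakers are rendered as plain links in a list.
        for speaker_name in response.css('li a::text').extract():
            item = PyconItem()
            item['name'] = speaker_name
            yield item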
@jluczak
jluczak / spider_human.py
Last active August 7, 2017 19:57
Write a Scrapy parser for Human Talks
import scrapy
from human.items import HumanItem

class HumanSpider(scrapy.Spider):
    start_urls = [
        'http://humantalks.com/talks/'
    ]
    name = 'human'
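A sketch of what the parse callback could look like for the talks listing, assuming HumanItem defines title and speaker fields (container and selectors are placeholders):

    def parse(self, response):
        for talk in response.css('div.talk'):  # assumed container element
            item = HumanItem()
            item['title'] = talk.css('h4 a::text').extract_first()
            item['speaker'] = talk.css('.speaker::text').extract_first()
            yield item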
@jluczak
jluczak / event_script.py
Last active August 3, 2017 09:55
Find header images for external events
import csv
import pycurl
from io import BytesIO
import json
import requests

buffer = BytesIO()
with open('events_2016.csv', newline='') as csvfile:
    spamreader = csv.reader(csvfile, delimiter=',', quotechar='"')
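The snippet ends inside the with block. One plausible continuation that walks the CSV rows and pulls the og:image header from each event page (the column layout and the crude string search are assumptions, not part of the gist):

    # Assumed layout: first column is an event id, second is the event URL.
    for row in spamreader:
        page = requests.get(row[1])
        marker = 'property="og:image" content="'
        start = page.text.find(marker)
        if start != -1:
            start += len(marker)
            end = page.text.find('"', start)
            print(row[0], page.text[start:end])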
@jluczak
jluczak / spider_books.py
Created July 31, 2017 19:30
Write a spider in Scrapy that parses all Romance books and extracts title, price and rating from them; it should output JSON with 35 elements
import scrapy

class QuotesSpider(scrapy.Spider):
    name = "books"
    start_urls = [
        'http://books.toscrape.com/catalogue/category/books/romance_8/index.html',
    ]

    def parse(self, response):
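The parse body is cut off in the preview. A minimal continuation that extracts title, price and rating from each product card and follows pagination so all 35 Romance books land in the JSON output (class names follow books.toscrape.com's public markup, but verify them before relying on this):

        # Each book is an <article class="product_pod"> element.
        for book in response.css('article.product_pod'):
            yield {
                'title': book.css('h3 a::attr(title)').extract_first(),
                'price': book.css('p.price_color::text').extract_first(),
                # The rating is encoded in the class name, e.g. "star-rating Three".
                'rating': book.css('p.star-rating::attr(class)').extract_first(),
            }
        # Romance spans two pages; follow the "next" link to reach all 35 books.
        next_page = response.css('li.next a::attr(href)').extract_first()
        if next_page is not None:
            yield response.follow(next_page, callback=self.parse)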
@jluczak
jluczak / items.py
Last active August 3, 2017 13:37
Scrape Google Experts
import scrapy
from scrapy.item import Item, Field

class ExpertItem(scrapy.Item):
    name = Field()
    tangline = Field()
    file_urls = Field()
    files = Field()
    city = Field()
    gplus = Field()
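The file_urls/files pair matches what Scrapy's built-in FilesPipeline expects; enabling it is a settings.py concern. A short illustrative fragment (the storage path is a placeholder):

ITEM_PIPELINES = {'scrapy.pipelines.files.FilesPipeline': 1}
FILES_STORE = '/tmp/experts'  # placeholder download directory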
@jluczak
jluczak / quickstart.py
Last active July 28, 2017 13:59
Automatically fetching talks for NDC
### START BOILERPLATE CODE
# Sample Python code for user authorization
import httplib2
import os
import sys
import pprint
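The boilerplate above is Google's standard user-authorization scaffolding; assuming the NDC talks are pulled from YouTube, the actual fetching usually reduces to a search call against the YouTube Data API. A minimal sketch that skips the OAuth flow and uses an API key instead (the key, query and field handling are assumptions, not taken from the gist):

from googleapiclient.discovery import build

# Assumed: simple API-key access rather than the user-authorization flow above.
youtube = build('youtube', 'v3', developerKey='YOUR_API_KEY')
results = youtube.search().list(
    q='NDC Oslo 2017', part='snippet', type='video', maxResults=25
).execute()
for video in results['items']:
    print(video['snippet']['title'])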
@jluczak
jluczak / spider_mob.py
Last active July 19, 2017 07:21
Scrapy sample - single & multilink
from scrapy.spiders import CrawlSpider, Rule
from mobile.items import MobileItem
from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor
from scrapy.selector import Selector
class MySpider(CrawlSpider):
    name = "mobile"
    allowed_domains = ["mobiletechcon.de"]
    start_urls = ["https://mobiletechcon.de/speakers-en/"]