Skip to content

Instantly share code, notes, and snippets.

@lobstrio
lobstrio / lacentrale_scraper.py
Created April 15, 2021 18:33
Collect BMW vehicle data on lacentrale.fr
# -*- coding: utf-8 -*-
# Copyright(C) 2021 Sasha Bouloudnine
import requests
from lxml import html
import csv
class CrawlerLaCentrale():
@lobstrio
lobstrio / pagesjaunes_extract.py
Created November 21, 2018 19:05
Extract names and phone numbers from PagesJaunes.fr with Python 3, Requests and lxml
#!/usr/bin/python3
# coding: utf-8
import requests
import csv
from lxml import html
import datetime
import argparse
@lobstrio
lobstrio / download_ebooks_from_onion_link_python3_requests.py
Created March 24, 2023 17:23
Download free anarchist ebooks from an .onion site with Python3 and requests 🧅
@lobstrio
lobstrio / pappers_pdf_parser_with_python_and_tika.py
Last active April 14, 2023 19:41
Scrape PDFs programmatically with Python 3 and the Tika library
from tika import parser
import re
import csv
HEADERS = ['numero_gestion', 'a_jour_au', 'numero_rcs', 'date_immatriculation', 'raison_sociale', 'forme_juridique', 'capital_social', 'adresse_siege', 'activites_principals']
def parse_pdf(filename):
# request
raw = parser.from_file(filename)
@lobstrio
lobstrio / google_maps_scraping_selenium.py
Created August 3, 2021 17:31
Collect all data from a Search URL on Google Maps 👋
# -*- coding: utf-8 -*-
# Copyright(C) 2021 lobstr
from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
import time
import csv
@lobstrio
lobstrio / chatgpt_powered_product_page_universal_scraper.py
Created April 28, 2023 12:48
Scrape price and title from *any* product page, with Python and ChatGPT 🤖
import os
import requests
import html2text
import re
import argparse
OPENAI_API_KEY = 'YOUR_OPEN_AI_API_KEY'
COMPLETION_URL = 'https://api.openai.com/v1/chat/completions'
PROMPT = """Find the main article from this product page, and return from this text content, as JSON format:
from curl_cffi import requests
from lxml import html
import json
import csv
import time
import argparse
HEADERS = {
'authority': 'www.doctolib.fr',
'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
@lobstrio
lobstrio / yelp_scraper_2023.py
Created July 17, 2023 10:47
🍝 Collect all listings from Yelp from a Search URL e.g. https://www.yelp.fr/search?find_desc=Pizza&find_loc=marseille — phones included!
import requests
import csv
from lxml import html
import argparse
import time
class YelpSearchScraper:
def iter_listings(self, url):
response = requests.get(url)
if response.status_code != 200:
@lobstrio
lobstrio / growthhackingfr_scraper.py
Created August 11, 2023 17:43
🧙 Scrape all topics from the famous French GrowthHacking.fr forum — 'scraping' category only!
"""
GrowthHacking.fr Forum Scraper
This script is used to scrape data from the GrowthHacking.fr forum, specifically from the "Scraping" category.
It retrieves information about forum topics and saves it as CSV data.
Usage:
1. Install the required library using the following command:
$ pip install requests
@lobstrio
lobstrio / cdiscount_scraper.py
Last active August 11, 2023 21:57
Scrape all products from a cDiscount hot barbecue category URL 🍖
import requests
import re
import json
from lxml import html
import time
from retry import retry
import csv
URL = 'https://www.cdiscount.com/search/10/barbecue.html'