Skip to content

Instantly share code, notes, and snippets.

@lobstrio
lobstrio / chatgpt_powered_product_page_universal_scraper.py
Created April 28, 2023 12:48
Scrape price and title from *any* product page, with Python and ChatGPT 🤖
import os
import requests
import html2text
import re
import argparse
OPENAI_API_KEY = 'YOUR_OPEN_AI_API_KEY'
COMPLETION_URL = 'https://api.openai.com/v1/chat/completions'
PROMPT = """Find the main article from this product page, and return from this text content, as JSON format:
@lobstrio
lobstrio / google_maps_scraping_selenium.py
Created August 3, 2021 17:31
Collect all data from a Search URL on Google Maps 👋
# -*- coding: utf-8 -*-
# Copyright(C) 2021 lobstr
from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
import time
import csv
@lobstrio
lobstrio / pappers_pdf_parser_with_python_and_tika.py
Last active April 14, 2023 19:41
Scrape PDFs programmatically with Python3 and the Tika library
from tika import parser
import re
import csv
HEADERS = ['numero_gestion', 'a_jour_au', 'numero_rcs', 'date_immatriculation', 'raison_sociale', 'forme_juridique', 'capital_social', 'adresse_siege', 'activites_principals']
def parse_pdf(filename):
# request
raw = parser.from_file(filename)
@lobstrio
lobstrio / download_ebooks_from_onion_link_python3_requests.py
Created March 24, 2023 17:23
Download free anarchist ebooks from an .onion site with Python3 and requests 🧅
@lobstrio
lobstrio / pagesjaunes_extract.py
Created November 21, 2018 19:05
Extract names and phone numbers from PagesJaunes.fr with Python 3, Requests and lxml
#!/usr/bin/python3
# coding: utf-8
import requests
import csv
from lxml import html
import datetime
import argparse
@lobstrio
lobstrio / lacentrale_scraper.py
Created April 15, 2021 18:33
Collect BMW vehicle data on lacentrale.fr
# -*- coding: utf-8 -*-
# Copyright(C) 2021 Sasha Bouloudnine
import requests
from lxml import html
import csv
class CrawlerLaCentrale():
@lobstrio
lobstrio / pdf_parser.py
Created August 16, 2018 16:05
Python 3 script to convert .pdf file into .txt output using PDFMiner
#!/usr/bin/python3
# coding: utf-8
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.pdfpage import PDFPage
from io import BytesIO
import argparse
@lobstrio
lobstrio / amazon_xmas.py
Created December 20, 2018 13:58
Web Scraping Python Script for the Xmas Deals on Amazon using Requests
# -*- coding: utf-8 -*-
# Copyright(C) 2018 Sasha Bouloudnine
import requests
import sys
import re
import ast
import json
import time
@lobstrio
lobstrio / twitter_dtrump.py
Last active January 8, 2021 14:13
Really simple Web Scraping Python Script for the first Tweets of Donald Trump using Requests and lxml
#!/usr/bin/python3
# coding: utf-8
import requests
from lxml import html
def extract():
"""
Export all Tweets from @realDonaldTrump
@lobstrio
lobstrio / leboncoin_chalet.py
Created March 2, 2018 12:50
Python 3 code to scrape leboncoin "chalet" items in Savoie with the Scrapy library
import scrapy
import time
class LbcSpider(scrapy.Spider):
name = "chalet_savoie_lbc"
start_urls = [
'https://www.leboncoin.fr/locations_gites/offres/rhone_alpes/savoie/',
]
def parse(self, response):