lobstrio’s gists

## growthhackingfr_scraper.py
"""
GrowthHacking.fr Forum Scraper

This script is used to scrape data from the GrowthHacking.fr forum, specifically from the "Scraping" category.
It retrieves information about forum topics and saves it as CSV data.

Usage:
1. Install the required library using the following command:
   $ pip install requests

## yelp_scraper_2023.py
import requests
import csv
from lxml import html
import argparse
import time

class YelpSearchScraper:
    def iter_listings(self, url):
        response = requests.get(url)
        if response.status_code != 200:

## 👨‍⚕️ Scrape doctors from Doctolib, starting from any search URL and up to any page (07/2023 version)
from curl_cffi import requests
from lxml import html
import json
import csv
import time
import argparse

HEADERS = {
    'authority': 'www.doctolib.fr',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',

## cdiscount_scraper.py
import requests
import re
import json
from lxml import html
import time
from retry import retry
import csv

URL = 'https://www.cdiscount.com/search/10/barbecue.html'

## chatgpt_powered_product_page_universal_scraper.py
import os
import requests
import html2text
import re
import argparse

OPENAI_API_KEY = 'YOUR_OPEN_AI_API_KEY'
COMPLETION_URL = 'https://api.openai.com/v1/chat/completions'

PROMPT = """Find the main article from this product page, and return from this text content, as JSON format:

## twitter_scraper.py
# =============================================================================
# Title: Twitter Users Tweets Scraper
# Language: Python
# Description: This script scrapes the first 100 tweets
#   of any Twitter user.
# Author: Sasha Bouloudnine
# Date: 2023-08-08
#
# Usage:
# - Make sure you have the required libraries installed by running:

## bypass_simple_captcha_pytesseract.py
import cv2
from pytesseract import image_to_string

# pip3 install opencv-python
# pip3 install pytesseract
# brew install tesseract

filename = 'lobstr.jpeg'
img = cv2.imread(filename)
gry = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

## download_ebooks_from_onion_link_python3_requests.py
import requests
from lxml import html
import time

print('~~ start')

anarchist_library_onion_link = "http://libraryqxxiqakubqv3dc2bend2koqsndbwox2johfywcatxie26bsad.onion/special/index"
latest_books_library_onion_link = "http://libraryqxxiqakubqv3dc2bend2koqsndbwox2johfywcatxie26bsad.onion/latest?bare=1"

session = requests.session()

## pappers_pdf_parser_with_python_and_tika.py
from tika import parser
import re
import csv

HEADERS = ['numero_gestion', 'a_jour_au', 'numero_rcs', 'date_immatriculation', 'raison_sociale', 'forme_juridique', 'capital_social', 'adresse_siege', 'activites_principals']

def parse_pdf(filename):

    # request
    raw = parser.from_file(filename)

## google_maps_scraping_selenium.py
# -*- coding: utf-8 -*-
# Copyright(C) 2021 lobstr

from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
import time
import csv
	"""
	GrowthHacking.fr Forum Scraper

	This script is used to scrape data from the GrowthHacking.fr forum, specifically from the "Scraping" category.
	It retrieves information about forum topics and saves it as CSV data.

	Usage:
	1. Install the required library using the following command:
	$ pip install requests
	import requests
	import csv
	from lxml import html
	import argparse
	import time

	class YelpSearchScraper:
	def iter_listings(self, url):
	response = requests.get(url)
	if response.status_code != 200:
	from curl_cffi import requests
	from lxml import html
	import json
	import csv
	import time
	import argparse

	HEADERS = {
	'authority': 'www.doctolib.fr',
	'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,/;q=0.8,application/signed-exchange;v=b3;q=0.7',
	import requests
	import re
	import json
	from lxml import html
	import time
	from retry import retry
	import csv

	URL = 'https://www.cdiscount.com/search/10/barbecue.html'
	import os
	import requests
	import html2text
	import re
	import argparse

	OPENAI_API_KEY = 'YOUR_OPEN_AI_API_KEY'
	COMPLETION_URL = 'https://api.openai.com/v1/chat/completions'

	PROMPT = """Find the main article from this product page, and return from this text content, as JSON format:
	# =============================================================================
	# Title: Twitter Users Tweets Scraper
	# Language: Python
	# Description: This script scrapes the first 100 tweets
	# of any Twitter user.
	# Author: Sasha Bouloudnine
	# Date: 2023-08-08
	#
	# Usage:
	# - Make sure you have the required libraries installed by running:
	import cv2
	from pytesseract import image_to_string

	# pip3 install opencv-python
	# pip3 install pytesseract
	# brew install tesseract

	filename = 'lobstr.jpeg'
	img = cv2.imread(filename)
	gry = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
	from tika import parser
	import re
	import csv

	HEADERS = ['numero_gestion', 'a_jour_au', 'numero_rcs', 'date_immatriculation', 'raison_sociale', 'forme_juridique', 'capital_social', 'adresse_siege', 'activites_principals']

	def parse_pdf(filename):

	# request
	raw = parser.from_file(filename)
	# -*- coding: utf-8 -*-
	# Copyright(C) 2021 lobstr

	from selenium import webdriver
	from selenium.webdriver.support import expected_conditions as EC
	from selenium.webdriver.support.ui import WebDriverWait
	from selenium.webdriver.common.by import By
	from selenium.common.exceptions import NoSuchElementException
	import time
	import csv