lobstrio’s gists

## chatgpt_powered_product_page_universal_scraper.py
import os
import requests
import html2text
import re
import argparse

OPENAI_API_KEY = 'YOUR_OPEN_AI_API_KEY'
COMPLETION_URL = 'https://api.openai.com/v1/chat/completions'

PROMPT = """Find the main article from this product page, and return from this text content, as JSON format:

## google_maps_scraping_selenium.py
# -*- coding: utf-8 -*-
# Copyright(C) 2021 lobstr

from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
import time
import csv

## pappers_pdf_parser_with_python_and_tika.py
from tika import parser
import re
import csv

HEADERS = ['numero_gestion', 'a_jour_au', 'numero_rcs', 'date_immatriculation', 'raison_sociale', 'forme_juridique', 'capital_social', 'adresse_siege', 'activites_principals']

def parse_pdf(filename):

    # request
    raw = parser.from_file(filename)

## download_ebooks_from_onion_link_python3_requests.py
import requests
from lxml import html
import time

print('~~ start')

anarchist_library_onion_link = "http://libraryqxxiqakubqv3dc2bend2koqsndbwox2johfywcatxie26bsad.onion/special/index"
latest_books_library_onion_link = "http://libraryqxxiqakubqv3dc2bend2koqsndbwox2johfywcatxie26bsad.onion/latest?bare=1"

session = requests.session()

## pagesjaunes_extract.py
#!/usr/bin/python3
# coding: utf-8

import requests
import csv
from lxml import html
import datetime
import argparse


## lacentrale_scraper.py
# -*- coding: utf-8 -*-
# Copyright(C) 2021 Sasha Bouloudnine

import requests
from lxml import html
import csv


class CrawlerLaCentrale():

## pdf_parser.py
#!/usr/bin/python3
# coding: utf-8

from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.pdfpage import PDFPage
from io import BytesIO
import argparse


## amazon_xmas.py
# -*- coding: utf-8 -*-

# Copyright(C) 2018 Sasha Bouloudnine

import requests
import sys
import re
import ast
import json
import time

## twitter_dtrump.py
#!/usr/bin/python3
# coding: utf-8

import requests
from lxml import html

def extract():

    """
    Export all Tweets from @realDonaldTrump

## leboncoin_chalet.py
import scrapy
import time

class LbcSpider(scrapy.Spider):
name = "chalet_savoie_lbc"
start_urls = [
'https://www.leboncoin.fr/locations_gites/offres/rhone_alpes/savoie/',
]

def parse(self, response):
	import os
	import requests
	import html2text
	import re
	import argparse

	OPENAI_API_KEY = 'YOUR_OPEN_AI_API_KEY'
	COMPLETION_URL = 'https://api.openai.com/v1/chat/completions'

	PROMPT = """Find the main article from this product page, and return from this text content, as JSON format:
	# -*- coding: utf-8 -*-
	# Copyright(C) 2021 lobstr

	from selenium import webdriver
	from selenium.webdriver.support import expected_conditions as EC
	from selenium.webdriver.support.ui import WebDriverWait
	from selenium.webdriver.common.by import By
	from selenium.common.exceptions import NoSuchElementException
	import time
	import csv
	from tika import parser
	import re
	import csv

	HEADERS = ['numero_gestion', 'a_jour_au', 'numero_rcs', 'date_immatriculation', 'raison_sociale', 'forme_juridique', 'capital_social', 'adresse_siege', 'activites_principals']

	def parse_pdf(filename):

	# request
	raw = parser.from_file(filename)
	import requests
	from lxml import html
	import time

	print('~~ start')

	anarchist_library_onion_link = "http://libraryqxxiqakubqv3dc2bend2koqsndbwox2johfywcatxie26bsad.onion/special/index"
	latest_books_library_onion_link = "http://libraryqxxiqakubqv3dc2bend2koqsndbwox2johfywcatxie26bsad.onion/latest?bare=1"

	session = requests.session()
	#!/usr/bin/python3
	# coding: utf-8

	import requests
	import csv
	from lxml import html
	import datetime
	import argparse
	# -*- coding: utf-8 -*-
	# Copyright(C) 2021 Sasha Bouloudnine

	import requests
	from lxml import html
	import csv


	class CrawlerLaCentrale():
	#!/usr/bin/python3
	# coding: utf-8

	from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
	from pdfminer.converter import TextConverter
	from pdfminer.pdfpage import PDFPage
	from io import BytesIO
	import argparse
	# -*- coding: utf-8 -*-

	# Copyright(C) 2018 Sasha Bouloudnine

	import requests
	import sys
	import re
	import ast
	import json
	import time
	import scrapy
	import time

	class LbcSpider(scrapy.Spider):
	name = "chalet_savoie_lbc"
	start_urls = [
	'https://www.leboncoin.fr/locations_gites/offres/rhone_alpes/savoie/',
	]

	def parse(self, response):