This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import requests | |
import html2text | |
import re | |
import argparse | |
OPENAI_API_KEY = 'YOUR_OPEN_AI_API_KEY' | |
COMPLETION_URL = 'https://api.openai.com/v1/chat/completions' | |
PROMPT = """Find the main article from this product page, and return from this text content, as JSON format: |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# _*_ coding: utf-8 _*° | |
# Copyright(C) 2021 lobstr | |
from selenium import webdriver | |
from selenium.webdriver.support import expected_conditions as EC | |
from selenium.webdriver.support.ui import WebDriverWait | |
from selenium.webdriver.common.by import By | |
from selenium.common.exceptions import NoSuchElementException | |
import time | |
import csv |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from tika import parser | |
import re | |
import csv | |
HEADERS = ['numero_gestion', 'a_jour_au', 'numero_rcs', 'date_immatriculation', 'raison_sociale', 'forme_juridique', 'capital_social', 'adresse_siege', 'activites_principals'] | |
def parse_pdf(filename): | |
# request | |
raw = parser.from_file(filename) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
from lxml import html | |
import time | |
print('~~ start') | |
anarchist_library_onion_link = "http://libraryqxxiqakubqv3dc2bend2koqsndbwox2johfywcatxie26bsad.onion/special/index" | |
latest_books_library_onion_link = "http://libraryqxxiqakubqv3dc2bend2koqsndbwox2johfywcatxie26bsad.onion/latest?bare=1" | |
session = requests.session() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python3 | |
# coding: utf-8 | |
import requests | |
import csv | |
from lxml import html | |
import datetime | |
import argparse | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
# Copyright(C) 2021 Sasha Bouloudnine | |
import requests | |
from lxml import html | |
import csv | |
class CrawlerLaCentrale(): |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python3 | |
# coding: utf-8 | |
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter | |
from pdfminer.converter import TextConverter | |
from pdfminer.pdfpage import PDFPage | |
from io import BytesIO | |
import argparse | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
# Copyright(C) 2018 Sasha Bouloudnine | |
import requests | |
import sys | |
import re | |
import ast | |
import json | |
import time |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python3 | |
# coding: utf-8 | |
import requests | |
from lxml import html | |
def extract(): | |
""" | |
Export all Tweets from @realDonaldTrump |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import scrapy | |
import time | |
class LbcSpider(scrapy.Spider): | |
name = "chalet_savoie_lbc" | |
start_urls = [ | |
'https://www.leboncoin.fr/locations_gites/offres/rhone_alpes/savoie/', | |
] | |
def parse(self, response): |