Ander Rodriguez (AnderRV)
import requests
from bs4 import BeautifulSoup
import queue
from threading import Thread

starting_url = 'https://scrapeme.live/shop/page/1/'
visited = set()
max_visits = 100  # careful, it will crawl all the pages
num_workers = 5
data = []

def extract_content(soup):
    # store id, name, and price for every product card on the page
    for product in soup.select('.product'):
        data.append({
            'id': product.find('a', attrs={'data-product_id': True})['data-product_id'],
            'name': product.find('h2').text,
            'price': product.find(class_='amount').text,
        })
proxies = {
    'http': 'http://190.64.18.177:80',
    'https': 'http://49.12.2.178:3128',
}
headers = {
    'authority': 'httpbin.org',
    'cache-control': 'max-age=0',
    'sec-ch-ua': '"Chromium";v="92", " Not A;Brand";v="99", "Google Chrome";v="92"',
    'sec-ch-ua-mobile': '?0',
}
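These proxies and headers are presumably passed on every request, although the helper that uses them is not included in the snippets above; a minimal get_html() sketch under that assumption:

def get_html(url):
    try:
        response = requests.get(url, headers=headers, proxies=proxies)
        return response.content
    except Exception as e:
        print(e)
        return ''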
def queue_worker(i, q):
    while True:
        url = q.get()  # blocks until an item is available
        if (len(visited) < max_visits and url not in visited):
            crawl(url)
        q.task_done()

q = queue.Queue()
for i in range(num_workers):
    Thread(target=queue_worker, args=(i, q), daemon=True).start()
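A minimal driver for the worker pool above, assuming the queue-based crawl() shown further down: seed the queue with the starting URL, then block until every enqueued page has been processed.

q.put(starting_url)
q.join()  # blocks until every item put on the queue has been marked done

print('Scraped items:', len(data))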
from threading import Thread

def queue_worker(i, q):
    while True:
        url = q.get()  # Get an item from the queue, blocks until one is available
        print('to process:', url)
        q.task_done()  # Notifies the queue that the item has been processed

q = queue.Queue()
Thread(target=queue_worker, args=(0, q), daemon=True).start()
import queue
q = queue.Queue()
q.put('https://scrapeme.live/shop/page/1/')
def crawl(url):
    # ...
    links = extract_links(soup)
    for link in links:
        if link not in visited:
            q.put(link)  # hand newly discovered links to the worker queue
def crawl(url):
    if not url or url in visited:
        return
    print('Crawl: ', url)
    visited.add(url)
    html = get_html(url)
    soup = BeautifulSoup(html, 'html.parser')
    extract_content(soup)
    links = extract_links(soup)
    to_visit.update(links)
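extract_links() is not defined in these snippets; a plausible sketch that reuses the a.page-numbers selector the simpler crawler below relies on:

def extract_links(soup):
    # return pagination links that have not been crawled yet
    return [a.get('href') for a in soup.select('a.page-numbers')
            if a.get('href') not in visited]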
visited = set()
to_visit = set()
max_visits = 3

def crawl(url):
    print('Crawl: ', url)
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    visited.add(url)
    for a in soup.select('a.page-numbers'):
        to_visit.add(a.get('href'))  # collect pagination links to visit later
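A sequential driver for this simpler version, assuming the starting_url defined at the top: keep popping pages from to_visit until the set is empty or max_visits is reached.

crawl(starting_url)
while to_visit and len(visited) < max_visits:
    url = to_visit.pop()
    if url not in visited:
        crawl(url)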
AnderRV / autoscout24-germany-dataset.json
Created June 8, 2021 14:49
Dataset with more than 1000 cars from Germany, collected from AutoScout24
[
  {
    "date": "08/2020",
    "fuel": "Gasoline",
    "gear": "Manual",
    "link": "/offers/mazda-cx-3-skyactiv-g-121-fwd-6gs-al-edition100-gasoline-0fcc31ce-0548-4ee2-b3c2-a4aba38fe9db",
    "makemodel": "Mazda CX-3",
    "mileage": 546,
    "offerType": "Used",
    "power": "89 kW (121 hp)",