@eupendra
Created May 26, 2021 08:46
import threading
import time
import requests
from bs4 import BeautifulSoup
import csv
from multiprocessing import Pool, cpu_count
from urllib.parse import urljoin
from concurrent.futures import ThreadPoolExecutor
import pandas as pd
from plates_scrapy import PlatesSpider
url = 'https://vplates.com.au/vplatesapi/checkcombo?vehicleType=car&combination={}'
def get_links():
    # Build [word, url] pairs from the 'Word' column of input.csv.
    output = []
    df = pd.read_csv('input.csv')
    for word in df['Word'].tolist():
        output.append([word, url.format(word)])
    return output
def process(link_info: list):
    # Fetch one checkcombo URL and report whether the combination is available.
    response = requests.get(link_info[1])
    data = response.json()
    success = data.get('success')
    item = {
        'Word': link_info[0],
        'Available': 'Yes' if success else 'No'
    }
    # with open("output_requests.csv", "a") as f:
    #     f.write(','.join(item.values()))
    print('.', end="")
if __name__ == '__main__':  # guard required for the multiprocessing option
    # Get all links
    all_links = get_links()
    print(f"Total pages: {len(all_links)}")

    start_time = time.time()
    ########################################
    # Option 1, single thread
    ########################################
    for link_info in all_links:
        process(link_info)
    duration = time.time() - start_time
    print(f"\nNo optimization - {len(all_links)} links in {duration:.2f} seconds")
    ########################################
    # Option 2, concurrency with a thread pool
    ########################################
    start_time = time.time()
    with ThreadPoolExecutor(max_workers=32) as executor:
        executor.map(process, all_links)
    duration = time.time() - start_time
    print(f"\n32 Threads - {len(all_links)} links in {duration:.2f} seconds")
    ########################################
    # Option 3, parallelism using multiprocessing
    ########################################
    start_time = time.time()
    with Pool(cpu_count()) as p:
        p.map(process, all_links)
    duration = time.time() - start_time
    print(f"\nMultiprocessing - {len(all_links)} links in {duration:.2f} seconds")
    ########################################
    # Option 4, Scrapy
    ########################################
    start_time = time.time()
    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings

    settings = get_project_settings()
    settings['LOG_LEVEL'] = 'WARN'
    # Use a distinct name so the process() function above is not shadowed.
    crawler_process = CrawlerProcess(settings)
    crawler_process.crawl(PlatesSpider)
    crawler_process.start()
    duration = time.time() - start_time
    print(f"\nScrapy - {len(all_links)} links in {duration:.2f} seconds")
# Total pages: 85
# No optimization - 85 links in 95.53 seconds
# 32 Threads - 85 links in 16.49 seconds
# Multiprocessing - 85 links in 16.42 seconds
# Scrapy - 85 links in 10.96 seconds
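
The CSV write inside process() is left commented out above; if it were enabled while the ThreadPoolExecutor option runs, multiple threads would append to output_requests.csv at the same time. A minimal sketch of one way to make that write thread-safe, using the threading and csv modules already imported at the top of the script; the write_lock, the save_item helper name, and the writer setup are illustrative assumptions, not part of the original gist:

write_lock = threading.Lock()

def save_item(item: dict):
    # Serialize appends so rows from different threads do not interleave.
    with write_lock:
        with open("output_requests.csv", "a", newline="") as f:
            csv.writer(f).writerow([item['Word'], item['Available']])

Note that a threading.Lock only covers the thread-pool option; the multiprocessing option would need a different approach, such as returning items from process() and letting the parent collect the results of p.map().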
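PlatesSpider is imported from a local plates_scrapy module that this gist does not include. For context, here is a minimal sketch of what such a spider could look like, reusing the same input.csv 'Word' column and checkcombo URL as the script above; the spider name, callback, and item fields are assumptions rather than the author's actual code:

import json
import scrapy
import pandas as pd

class PlatesSpider(scrapy.Spider):
    name = 'plates'
    url_template = 'https://vplates.com.au/vplatesapi/checkcombo?vehicleType=car&combination={}'

    def start_requests(self):
        # One request per word in input.csv, carrying the word along to the callback.
        for word in pd.read_csv('input.csv')['Word'].tolist():
            yield scrapy.Request(
                self.url_template.format(word),
                callback=self.parse,
                cb_kwargs={'word': word},
            )

    def parse(self, response, word):
        # The endpoint returns JSON with a 'success' flag, as in process() above.
        data = json.loads(response.text)
        yield {
            'Word': word,
            'Available': 'Yes' if data.get('success') else 'No',
        }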