Skip to content

Instantly share code, notes, and snippets.

@timster
Created May 2, 2019 13:00
Show Gist options
  • Save timster/6158787be31c1979970b7807f808454c to your computer and use it in GitHub Desktop.
Save timster/6158787be31c1979970b7807f808454c to your computer and use it in GitHub Desktop.
import json
import time
import multiprocessing
import requests
from bs4 import BeautifulSoup
# Number of listing pages to scrape; the script fetches pages 1..PAGES inclusive.
PAGES = 100
def get_page(page, timeout=10):
    """Fetch one wykop.pl listing page and return its article links.

    Args:
        page: Page number substituted into the listing URL.
        timeout: Seconds passed to ``requests.get`` as its timeout. Without
            one, a stalled connection would block this worker (and therefore
            the whole ``pool.map`` call) indefinitely.

    Returns:
        A list of ``{'name': <link text>, 'href': <link target>}`` dicts, one
        per element matched by the ``.article h2 a`` selector, in document
        order. The list is empty when nothing matches.

    Raises:
        requests.exceptions.RequestException: If the request fails or
            times out.
        KeyError: If a matched anchor has no ``href`` attribute.
    """
    url = "https://www.wykop.pl/strona/{}/".format(page)
    print('getting', url)
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.text, 'html.parser')
    return [
        {'name': el.text, 'href': el['href']}
        for el in soup.select(".article h2 a")
    ]
if __name__ == '__main__':
    # perf_counter() is monotonic, so the measured duration cannot be skewed
    # by system clock adjustments the way time.time() can.
    started = time.perf_counter()

    # Fan the page downloads out over 10 worker processes. pool.map returns
    # one result per input, in input order, so `articles` is a list holding
    # one list of link dicts per page.
    with multiprocessing.Pool(processes=10) as pool:
        articles = pool.map(get_page, range(1, PAGES + 1))

    # Elapsed seconds for the download phase only (file writing excluded).
    print(time.perf_counter() - started)

    # ensure_ascii=False keeps non-ASCII characters readable in the output
    # file; json.dump writes straight to the file rather than building the
    # whole document as a string first.
    with open('output.json', 'w', encoding='utf-8') as f:
        json.dump(articles, f, ensure_ascii=False, indent=2)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment