Skip to content

Instantly share code, notes, and snippets.

@karolberezicki
Last active February 10, 2019 12:24
Show Gist options
  • Save karolberezicki/f05cf80ae92953441c69682509e78d9e to your computer and use it in GitHub Desktop.
Save karolberezicki/f05cf80ae92953441c69682509e78d9e to your computer and use it in GitHub Desktop.
This little Python script downloads all images that the user has upvoted ('plus') on wykop.pl.
import requests
import itertools
from bs4 import BeautifulSoup
from lxml import html
def wykop_login(username, password):
    """Log in to wykop.pl and return the authenticated requests session.

    Not required for scraping images, but might be useful when the
    upvoted-entries pages are restricted to logged-in users.

    :param username: wykop.pl account name, sent as ``user[username]``.
    :param password: account password, sent as ``user[password]``.
    :return: the ``requests`` session carrying the login cookies.
    """
    session_requests = requests.session()
    login_url = 'https://www.wykop.pl/zaloguj/'

    # Fetch the login page first so we can scrape the CSRF token out of it.
    result = session_requests.get(login_url)
    tree = html.fromstring(result.text)

    # Take the first #__token match directly. The original wrapped the xpath
    # result in list(set(...)), which destroys document order and could pick
    # an arbitrary element if the page ever contained more than one match.
    # NOTE(review): xpath here yields the element itself, not its value
    # attribute — if login fails, try "//*[@id='__token']/@value"; confirm
    # against the live form markup.
    authenticity_token = tree.xpath("//*[@id='__token']")[0]

    payload = {
        'user[username]': username,
        'user[password]': password,
        'csrfmiddlewaretoken': authenticity_token,
    }
    result = session_requests.post(
        login_url, data=payload, headers=dict(referer=login_url)
    )
    return session_requests
def is_href_image(href):
    """Return True if *href* points at a wykop CDN-hosted .jpg or .png image.

    :param href: anchor href string to classify.
    :return: bool — True only for https://www.wykop.pl/cdn URLs ending
        in ``.jpg`` or ``.png``.
    """
    # str.endswith accepts a tuple of suffixes — one call replaces the
    # original `or` chain.
    return href.startswith('https://www.wykop.pl/cdn') and href.endswith(
        ('.jpg', '.png')
    )
def scrape(session_requests, username):
    """Download every image linked from *username*'s upvoted-entries pages.

    Walks ``/ludzie/plusowane-wpisy/<username>/strona/<i>/`` pages in order,
    collects wykop-CDN image links (see :func:`is_href_image`), and saves
    each image into the current working directory under its URL basename.
    Stops at the first non-OK HTTP response (i.e. past the last page).

    :param session_requests: a ``requests`` session (anonymous or logged in).
    :param username: wykop.pl user whose upvoted entries are scraped.
    """
    for page in itertools.count(start=1):
        url = f'https://www.wykop.pl/ludzie/plusowane-wpisy/{username}/strona/{page}/'
        print(f'GET {url}')
        result = session_requests.get(url, headers=dict(referer=url))
        if not result.ok:
            # Past the last page (or a server error) — stop paginating.
            # The original tracked this in a flag and checked it on the
            # *next* iteration, which meant the error page was scraped too.
            break

        soup = BeautifulSoup(result.content, features='lxml')
        # A set de-duplicates hrefs that appear more than once on the page.
        images = {
            anchor['href']
            for anchor in soup.findAll('a')
            if anchor.has_attr('href') and is_href_image(anchor['href'])
        }
        print(f'Found {len(images)} images')

        for image in images:
            # Last path segment is used as the local file name.
            image_name = image.split('/')[-1]
            with open(image_name, 'wb') as f:
                # Download through the same (possibly authenticated) session
                # rather than a fresh anonymous requests.get, for consistency
                # with the page fetches above.
                f.write(session_requests.get(image).content)
    print('Done, goodbye')
if __name__ == '__main__':
    # An anonymous session is enough for scraping; swap in wykop_login
    # below if authenticated access is ever needed.
    session_requests = requests.session()
    # session_requests = wykop_login('user','pass')
    scrape(session_requests, 'm__b')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment