Last active
February 10, 2019 12:24
-
-
Save karolberezicki/f05cf80ae92953441c69682509e78d9e to your computer and use it in GitHub Desktop.
This little Python script downloads all images linked from the wykop.pl entries that a given user has upvoted ('plus').
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
import itertools | |
from bs4 import BeautifulSoup | |
from lxml import html | |
def wykop_login(username, password):
    """Log in to wykop.pl and return the ``requests`` session that was used.

    Logging in is not required for scraping images, but might be useful.

    Args:
        username: Account name, sent as the ``user[username]`` form field.
        password: Account password, sent as the ``user[password]`` form field.

    Returns:
        The ``requests`` session used for the login attempt. The response to
        the login POST is not inspected, so the session is returned whether
        or not the credentials were accepted.

    Raises:
        IndexError: If the login page has no element with id ``__token``.
    """
    session_requests = requests.session()
    login_url = 'https://www.wykop.pl/zaloguj/'

    # Fetch the login form first so the token embedded in it can be read.
    login_page = session_requests.get(login_url)
    tree = html.fromstring(login_page.text)

    token_elements = tree.xpath("//*[@id='__token']")
    if not token_elements:
        raise IndexError(f"no element with id '__token' found at {login_url}")

    # Post the element's ``value`` attribute rather than the lxml element
    # object itself; take the first match in document order so the choice
    # is deterministic.
    authenticity_token = token_elements[0].get('value')

    payload = {
        'user[username]': username,
        'user[password]': password,
        'csrfmiddlewaretoken': authenticity_token,
    }
    session_requests.post(
        login_url, data=payload, headers={'referer': login_url}
    )
    return session_requests
def is_href_image(href):
    """Return True when *href* is a wykop.pl CDN URL ending in .jpg or .png."""
    # Anything outside the wykop CDN is rejected regardless of extension.
    if not href.startswith('https://www.wykop.pl/cdn'):
        return False
    # The extension match is case-sensitive.
    return href.endswith(('.jpg', '.png'))
def scrape(session_requests, username):
    """Download every image linked from a user's upvoted wykop.pl entries.

    Listing pages ``.../plusowane-wpisy/<username>/strona/<n>/`` are requested
    for n = 1, 2, 3, ... until one of them comes back with a non-OK HTTP
    status. Every anchor on a page whose ``href`` passes ``is_href_image`` is
    downloaded into the current working directory, named after the last path
    segment of its URL (an existing file with that name is overwritten).

    Args:
        session_requests: ``requests`` session used for all HTTP requests.
        username: wykop.pl user name whose upvoted entries are scanned.
    """
    for page_number in itertools.count(start=1):
        url = f'https://www.wykop.pl/ludzie/plusowane-wpisy/{username}/strona/{page_number}/'
        print(f'GET {url}')
        result = session_requests.get(url, headers=dict(referer=url))
        if not result.ok:
            # Stop at the first failed page instead of parsing an error page.
            break

        soup = BeautifulSoup(result.content, features='lxml')
        # A set, so an image linked several times on a page is fetched once.
        images = {
            anchor['href']
            for anchor in soup.find_all('a', href=True)
            if is_href_image(anchor['href'])
        }
        print(f'Found {len(images)} images')

        for image in images:
            # Reuse the session so connections to the host are pooled.
            image_response = session_requests.get(image)
            if not image_response.ok:
                # Do not save an HTTP error body under an image file name.
                print(f'Skipping {image} (HTTP {image_response.status_code})')
                continue
            image_name = image.split('/')[-1]
            with open(image_name, 'wb') as f:
                f.write(image_response.content)

    print('Done, goodbye')
if __name__ == '__main__':
    import sys

    # An optional first command-line argument selects whose upvoted entries
    # are scraped; without it the original hard-coded profile is used.
    target_username = sys.argv[1] if len(sys.argv) > 1 else 'm__b'

    # An anonymous session is used here. To scrape as a logged-in user,
    # obtain the session from wykop_login(username, password) instead.
    session_requests = requests.session()
    scrape(session_requests, target_username)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment