
@cjmcgraw
Last active November 10, 2022 15:07
for you know... big data sets of dick pics!
#!/usr/bin/env python3
import requests
import argparse
import hashlib
import json
import time
import sys
import os
parser = argparse.ArgumentParser()
parser.add_argument('--subreddits', type=str, default='dickpic,penis,softies,balls,ratemycock,cock')
parser.add_argument('--url', type=str, default='https://www.reddit.com/r/{}/.json')
args = parser.parse_args()

subreddits = str(args.subreddits).split(',')
obj = dict()
for subreddit in subreddits:
    directory = './{}-images'.format(subreddit)
    if not os.path.exists(directory):
        os.mkdir(directory)
    with open(directory + '/image_urls.txt', 'w') as image_urls_file:
        last_counts = []
        total_image_count = 0
        total_subreddit_api_calls = 0
        # Reddit listings page via "count"/"after"; "limit" caps each call at 100 posts
        params = {"count": 0, "limit": 100}
        # md5 the user agent so each subreddit gets its own anonymous-looking UA
        headers = {'user-agent': hashlib.md5('{}-dick-scraper'.format(subreddit).encode()).hexdigest()}
        # keep paging until Reddit hands back a null "after" cursor
        while ('after' not in params) or params['after']:
            try:
                url = args.url.format(subreddit)
                r = requests.get(url, params=params, headers=headers)
                print('requesting: ')
                print('< HTTP/1.1 {}'.format(url))
                for k, v in headers.items():
                    print('< {}={}'.format(k, v))
                print('< ')
                print('< {}'.format(json.dumps(params)))
                print('')
                print('')
                print('response:')
                print('> HTTP/1.1 {}'.format(r.status_code))
                print('')
                assert r.status_code == 200, r.text
                obj = r.json()
                assert obj and obj.get('data', {}).get('children'), r.text
                posts = obj.get('data').get('children')
                # each post may carry a "preview" block holding one or more images
                access_images = lambda x: x.get('data', {}).get('preview', {}).get('images', [])
                filtered_data = filter(lambda x: len(x) > 0, map(access_images, posts))
                image_urls = [x[0].get('source', {}).get('url') for x in filtered_data]
                last_counts.append(len(image_urls))
                last_counts = last_counts[-10:]
                print("pulling down images:")
                for i, url in enumerate(image_urls):
                    image_urls_file.write(url + "\n")
                    sys.stdout.write('.')
                    if i and i % 40 == 0:
                        sys.stdout.write('(40 / {})\n'.format(len(image_urls)))
                    sys.stdout.flush()
                    img = requests.get(url).content
                    # name each file after the md5 of its URL to avoid collisions
                    local_file = directory + '/' + hashlib.md5(url.encode()).hexdigest()
                    with open(local_file, 'wb') as f:
                        f.write(img)
                print('')
                print("found {} images".format(len(image_urls)))
                print('last counts: {}'.format(last_counts))
                print('')
                print('head of images:')
                for url in image_urls[:5]:
                    print(url)
                print('')
                total_image_count += len(image_urls)
                total_subreddit_api_calls += 1
                # advance the pagination cursor for the next request
                params['count'] += 100
                params['after'] = obj['data']['after']
            except AssertionError as e:
                # note: a failed call is retried with the same params, so a
                # banned subreddit (404) will loop here indefinitely
                print("failed with assertion error:\n{}".format(e))
            # throttle between API calls
            time.sleep(0.25)
    print("finished processing records for {}".format(subreddit))
    print("calls made: {}".format(total_subreddit_api_calls))
    print("images found: {}".format(total_image_count))
    print('')
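
The heart of the script is Reddit's cursor-based listing pagination: each response carries an "after" token that gets fed back into the next request until it comes back null. A minimal standalone sketch of just that loop (the subreddit name and user agent here are placeholders, not taken from the script above):

#!/usr/bin/env python3
# standalone demo of the "after" cursor pagination used above
import requests

url = 'https://www.reddit.com/r/pics/.json'  # placeholder subreddit
headers = {'user-agent': 'pagination-demo'}  # placeholder user agent
params = {'limit': 100}
for page in range(3):  # a few pages is enough to show the mechanic
    r = requests.get(url, params=params, headers=headers)
    r.raise_for_status()
    data = r.json()['data']
    print('page {}: {} posts'.format(page, len(data['children'])))
    if not data['after']:
        break  # a null cursor means the listing is exhausted
    params['after'] = data['after']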
@cjmcgraw (Author)

You know, for those times you just need to collect a massive number of dick pics from the world's leading dick pic repository (a.k.a. Reddit).


gssci commented Mar 26, 2020

What version of Python did you use to run this? In my environment with 3.7.6 I am getting this:

 failed with assertion error:
{"reason": "banned", "message": "Not Found", "error": 404}
requesting: 
< HTTP/1.1 https://www.reddit.com/r/dickpic/.json
< user-agent=1531be6c629942a6d760cec0db16e02d
< 
< {"count": 0, "limit": 100000}

Edit: oh wait... the subreddits have been banned. Damn you, Reddit, obstructing science.
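
If you want to avoid hammering a dead subreddit, a preflight check against the about.json endpoint should catch this up front. A sketch, assuming the endpoint keeps returning a 404 with a "banned" reason as in the log above (user agent is a placeholder):

import requests

def subreddit_is_available(name):
    # live subreddits return 200; banned or missing ones return 404
    # (the log above shows {"reason": "banned", ..., "error": 404})
    r = requests.get(
        'https://www.reddit.com/r/{}/about.json'.format(name),
        headers={'user-agent': 'preflight-check'},  # placeholder UA
    )
    return r.status_code == 200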


ghost commented Oct 6, 2020
