@nyxiereal
Last active July 4, 2023 07:24
Basic Reddit API scraper for subreddits
import praw
# Initialize PRAW with your Reddit API credentials
# Get your API creds at https://www.reddit.com/prefs/apps
reddit = praw.Reddit(
    client_id='YOUR_CLIENT_ID',
    client_secret='YOUR_CLIENT_SECRET',
    user_agent='web:com.YOUR-SHORT-NAME.fems2:v1 (by u/YOUR-USER-NAME)'
)
# Define the subreddit and time range
subreddit_name = 'subreddit.name'
time_range = 'year' # Options: 'hour', 'day', 'week', 'month', 'year', 'all'
post_score_min = 100
# Get top posts in the given time range
subreddit = reddit.subreddit(subreddit_name)
top_posts = subreddit.top(time_filter=time_range, limit=None)
# Keep only posts scoring above post_score_min
filtered_posts = [post for post in top_posts if post.score > post_score_min]

# Iterate over the filtered posts and write their media URLs to a text file
with open(f"r{subreddit_name}.txt", "a") as file:
    for post in filtered_posts:
        if post.is_self or not post.url:
            continue  # Skip self-posts and posts without URLs
        file_url = post.url
        print(file_url)
        file.write(file_url + "\n")

print(f"Done! URLs saved to r{subreddit_name}.txt")
@nyxiereal (Author):

A downloader for the URL list above; note that it doesn't work for RedGifs or Reddit albums!

import requests
import os
import sys

from imgur_downloader import ImgurDownloader

headers = {'Accept-Encoding': 'gzip, deflate',
           'User-Agent': 'Mozilla/5.0',
           'Cache-Control': 'max-age=600',
           'Connection': 'keep-alive'}

def download_file(url, destination):
    if 'imgur' in url:
        ImgurDownloader(url, dir_download=destination, file_name=url.split('/')[-1], delete_dne=True, debug=True).save_images()

    else:
        response = requests.get(url, stream=True, headers=headers)
        response.raise_for_status()
        
        total_size = int(response.headers.get('content-length', 0))
        downloaded_size = 0

        with open(destination, 'wb') as file:
            for chunk in response.iter_content(chunk_size=8192):
                file.write(chunk)
                downloaded_size += len(chunk)

                # Print a progress bar, but only when the server reports a content
                # length; otherwise the percentage cannot be computed
                if total_size:
                    progress_percent = int(downloaded_size / total_size * 100)
                    sys.stdout.write('\r')
                    sys.stdout.write(f'Downloading: {progress_percent}% [{"=" * (progress_percent // 2)}{" " * (50 - (progress_percent // 2))}]')
                    sys.stdout.flush()

        sys.stdout.write('\n')  # Move to the next line after download completion

def download_images_from_file(file_path, destination_folder):
    with open(file_path, 'r') as file:
        for line in file:
            url = line.strip()
            file_name = url.split('/')[-1]
            destination = os.path.join(destination_folder, file_name)
            
            try:
                print(f'Downloading {url}...')
                download_file(url, destination)
                print(f'{file_name} downloaded successfully!')
            except Exception as e:
                print(f'Error downloading {url}: {e}')

# Example usage: point text_file_path at the URL list produced by the scraper
# and download_folder at the destination directory
text_file_path = ''
download_folder = ''

if not os.path.isdir(download_folder):
    os.mkdir(download_folder)
download_images_from_file(text_file_path, download_folder)
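
One gap called out above is Reddit albums (gallery posts): their post.url points at a reddit.com/gallery/... page rather than a file. A minimal sketch of how the scraper could instead expand a gallery into direct image URLs, using the is_gallery, gallery_data, and media_metadata fields from the Reddit JSON that PRAW exposes on submissions (the helper name gallery_image_urls is mine; treat the field handling as an assumption, not a tested fix):

import html

def gallery_image_urls(post):
    # Return direct image URLs for a gallery submission, or [] for anything else
    if not getattr(post, "is_gallery", False):
        return []
    urls = []
    items = (post.gallery_data or {}).get("items", [])
    metadata = post.media_metadata or {}
    for item in items:
        source = metadata.get(item["media_id"], {}).get("s", {})
        url = source.get("u") or source.get("gif")  # 'u' for images, 'gif' for animated
        if url:
            urls.append(html.unescape(url))  # Reddit HTML-escapes '&' in these URLs
    return urls

The resulting direct links could then be written to the same text file, so the plain requests branch of download_file handles them like any other URL.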
