@maxme
Created April 21, 2015 18:22
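A small imgur scraper: each worker process fetches one page of the Aww topic's all-time top posts, follows every "image-list-link" anchor to its image page, and saves any .jpg or .png it finds into a local DL/ directory. Page fetching is parallelized across 20 processes with multiprocessing.Pool.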
import os

import requests
from urllib import request
from bs4 import BeautifulSoup
from multiprocessing import Pool

base_url = "https://imgur.com"


def web_spider(url):
    """Fetch a topic page and download every image it links to."""
    print("Fetching images from: " + url)
    source_code = requests.get(url)
    # We only need the response body, no headers or anything.
    plain_text = source_code.text
    # BeautifulSoup makes the markup easy to search.
    soup = BeautifulSoup(plain_text, "html.parser")
    for link in soup.find_all('a', {'class': 'image-list-link'}):
        href = link.get('href')
        download_picture(base_url + href)


def download_picture(url):
    """Resolve an image page to its source file and save it under DL/."""
    source_code = requests.get(url)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text, "html.parser")
    for image in soup.find_all('link', {'rel': 'image_src'}):
        image_source = image.get('href')
        filename = "DL/" + str(image_source).split("/")[-1]
        if os.path.isfile(filename):
            print("Skipping... already downloaded:", filename)
            continue
        elif str(image_source).endswith(".jpg") or str(image_source).endswith(".png"):
            request.urlretrieve(image_source, filename)
            print("Successfully downloaded", filename)


def main():
    os.makedirs("DL", exist_ok=True)
    # Scrape the first 1000 topic pages with 20 worker processes.
    with Pool(20) as p:
        p.map(web_spider,
              ("https://imgur.com/topic/Aww/top/all/page/" + str(i) for i in range(1000)))


if __name__ == "__main__":
    main()
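For a quick dry run before launching the full 1000-page scrape, one could fetch a single topic page directly; a minimal sketch, assuming the script above is saved as a module named imgur_scraper.py (a hypothetical filename):

import os
from imgur_scraper import web_spider  # hypothetical module name for the script above

# Create the download directory the functions expect, then scrape one page only.
os.makedirs("DL", exist_ok=True)
web_spider("https://imgur.com/topic/Aww/top/all/page/0")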