# maxme/imgscraper.py

## imgscraper.py
import random
import requests
import os

from urllib import request
from bs4 import BeautifulSoup
from multiprocessing import Pool

# Site root that web_spider combines with the hrefs found on listing pages
# to build the URL of each image page.
base_url = "http://imgur.com"

def web_spider(url):
    """Fetch one listing page and download every image it links to.

    Every ``<a class="image-list-link">`` on the page is resolved against
    ``base_url`` and handed to ``download_picture``. Failures are reported
    on stdout and skipped, so one bad page or picture does not abort the
    rest of the crawl.

    Args:
        url: Address of the listing page to scan.
    """
    # Local import: keeps the block self-contained without touching the
    # file-level import list.
    from urllib.parse import urljoin

    print("Fetching images from: " + url)
    try:
        # A timeout keeps one stalled connection from blocking the worker
        # forever.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
    except requests.RequestException as err:
        # An exception escaping here would propagate out of Pool.map in the
        # caller and cancel every other page.
        print("Failed to fetch", url, "-", err)
        return

    # Name the parser explicitly; otherwise bs4 picks whichever parser
    # happens to be installed and warns about it.
    soup = BeautifulSoup(response.text, "html.parser")
    for link in soup.find_all('a', {'class': 'image-list-link'}):
        href = link.get('href')
        if not href:
            # Anchor without a target: nothing to follow.
            continue
        # urljoin copes with site-relative and already-absolute hrefs alike.
        target = urljoin(base_url, href)
        try:
            download_picture(target)
        except OSError as err:
            # requests.RequestException and urllib.error.URLError are both
            # OSError subclasses, as are filesystem errors.
            print("Failed to download from", target, "-", err)

def download_picture(url, dest_dir="DL"):
    """Download the image(s) advertised by a single image page.

    The page at ``url`` is scanned for ``<link rel="image_src">`` tags. Each
    referenced ``.jpg``/``.png`` file is saved into ``dest_dir`` under the
    last path segment of its URL; files that already exist are skipped.

    Args:
        url: Address of the page describing an image.
        dest_dir: Directory the files are written to. Defaults to ``"DL"``,
            the location that used to be hard-coded.
    """
    # Local import: keeps the block self-contained without touching the
    # file-level import list.
    from urllib.parse import urljoin, urlsplit

    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()
    except requests.RequestException as err:
        print("Failed to fetch", url, "-", err)
        return

    # Name the parser explicitly; otherwise bs4 picks whichever parser
    # happens to be installed and warns about it.
    soup = BeautifulSoup(response.text, "html.parser")
    for image in soup.find_all('link', {'rel': 'image_src'}):
        image_source = image.get('href')
        if not image_source:
            continue
        # Resolve scheme-relative ("//host/x.jpg") or relative references,
        # which urlretrieve cannot open on its own.
        image_url = urljoin(url, image_source)
        # Take the name from the URL path only, so a query string neither
        # ends up in the file name nor hides the extension.
        name = urlsplit(image_url).path.split("/")[-1]
        if not name.lower().endswith((".jpg", ".png")):
            continue
        filename = os.path.join(dest_dir, name)
        if os.path.isfile(filename):
            print("Skipping.. Already downloaded...")
            continue
        # Download to a per-process temporary name and rename afterwards:
        # an interrupted transfer must not leave a truncated file that the
        # isfile() check above would later treat as already downloaded.
        partial = "{}.{}.part".format(filename, os.getpid())
        try:
            request.urlretrieve(image_url, partial)
            os.replace(partial, filename)
        except OSError as err:
            print("Failed to download", image_url, "-", err)
            try:
                os.remove(partial)
            except OSError:
                pass  # nothing was written, so there is nothing to clean up
            continue
        print("Successfully downloaded", filename)

def main(pages=1000, workers=20):
    """Crawl the imgur "Aww" top listing and download its images into DL/.

    Args:
        pages: Number of listing pages to visit, starting at page 0.
        workers: Number of worker processes in the pool.
    """
    # exist_ok replaces the former bare ``except: pass``, which also hid
    # genuine failures such as a permission error.
    os.makedirs("DL", exist_ok=True)
    urls = [
        "https://imgur.com/topic/Aww/top/all/page/" + str(i)
        for i in range(pages)
    ]
    # map() blocks until every page is processed; the context manager then
    # shuts the workers down instead of leaving them to interpreter exit.
    with Pool(workers) as pool:
        pool.map(web_spider, urls)

# Run only when executed as a script. With multiprocessing start methods that
# re-import this module in every worker (spawn), an unguarded call would
# re-run main() inside each child process.
if __name__ == "__main__":
    main()
	import random
	import requests
	import os

	from urllib import request
	from bs4 import BeautifulSoup
	from multiprocessing import Pool

	# Site root that web_spider combines with the hrefs found on listing pages
	# to build the URL of each image page.
	base_url = "http://imgur.com"

	def web_spider(url):
	    """Fetch one listing page and download every image it links to.

	    Every ``<a class="image-list-link">`` on the page is resolved against
	    ``base_url`` and handed to ``download_picture``. Failures are reported
	    on stdout and skipped, so one bad page or picture does not abort the
	    rest of the crawl.

	    Args:
	        url: Address of the listing page to scan.
	    """
	    # Local import: keeps the block self-contained without touching the
	    # file-level import list.
	    from urllib.parse import urljoin

	    print("Fetching images from: " + url)
	    try:
	        # A timeout keeps one stalled connection from blocking the worker
	        # forever.
	        response = requests.get(url, timeout=30)
	        response.raise_for_status()
	    except requests.RequestException as err:
	        # An exception escaping here would propagate out of Pool.map in
	        # the caller and cancel every other page.
	        print("Failed to fetch", url, "-", err)
	        return

	    # Name the parser explicitly; otherwise bs4 picks whichever parser
	    # happens to be installed and warns about it.
	    soup = BeautifulSoup(response.text, "html.parser")
	    for link in soup.find_all('a', {'class': 'image-list-link'}):
	        href = link.get('href')
	        if not href:
	            # Anchor without a target: nothing to follow.
	            continue
	        # urljoin copes with site-relative and already-absolute hrefs alike.
	        target = urljoin(base_url, href)
	        try:
	            download_picture(target)
	        except OSError as err:
	            # requests.RequestException and urllib.error.URLError are both
	            # OSError subclasses, as are filesystem errors.
	            print("Failed to download from", target, "-", err)

	def download_picture(url, dest_dir="DL"):
	    """Download the image(s) advertised by a single image page.

	    The page at ``url`` is scanned for ``<link rel="image_src">`` tags. Each
	    referenced ``.jpg``/``.png`` file is saved into ``dest_dir`` under the
	    last path segment of its URL; files that already exist are skipped.

	    Args:
	        url: Address of the page describing an image.
	        dest_dir: Directory the files are written to. Defaults to ``"DL"``,
	            the location that used to be hard-coded.
	    """
	    # Local import: keeps the block self-contained without touching the
	    # file-level import list.
	    from urllib.parse import urljoin, urlsplit

	    try:
	        response = requests.get(url, timeout=30)
	        response.raise_for_status()
	    except requests.RequestException as err:
	        print("Failed to fetch", url, "-", err)
	        return

	    # Name the parser explicitly; otherwise bs4 picks whichever parser
	    # happens to be installed and warns about it.
	    soup = BeautifulSoup(response.text, "html.parser")
	    for image in soup.find_all('link', {'rel': 'image_src'}):
	        image_source = image.get('href')
	        if not image_source:
	            continue
	        # Resolve scheme-relative ("//host/x.jpg") or relative references,
	        # which urlretrieve cannot open on its own.
	        image_url = urljoin(url, image_source)
	        # Take the name from the URL path only, so a query string neither
	        # ends up in the file name nor hides the extension.
	        name = urlsplit(image_url).path.split("/")[-1]
	        if not name.lower().endswith((".jpg", ".png")):
	            continue
	        filename = os.path.join(dest_dir, name)
	        if os.path.isfile(filename):
	            print("Skipping.. Already downloaded...")
	            continue
	        # Download to a per-process temporary name and rename afterwards:
	        # an interrupted transfer must not leave a truncated file that the
	        # isfile() check above would later treat as already downloaded.
	        partial = "{}.{}.part".format(filename, os.getpid())
	        try:
	            request.urlretrieve(image_url, partial)
	            os.replace(partial, filename)
	        except OSError as err:
	            print("Failed to download", image_url, "-", err)
	            try:
	                os.remove(partial)
	            except OSError:
	                pass  # nothing was written, so there is nothing to clean up
	            continue
	        print("Successfully downloaded", filename)

	def main(pages=1000, workers=20):
	    """Crawl the imgur "Aww" top listing and download its images into DL/.

	    Args:
	        pages: Number of listing pages to visit, starting at page 0.
	        workers: Number of worker processes in the pool.
	    """
	    # exist_ok replaces the former bare ``except: pass``, which also hid
	    # genuine failures such as a permission error.
	    os.makedirs("DL", exist_ok=True)
	    urls = [
	        "https://imgur.com/topic/Aww/top/all/page/" + str(i)
	        for i in range(pages)
	    ]
	    # map() blocks until every page is processed; the context manager then
	    # shuts the workers down instead of leaving them to interpreter exit.
	    with Pool(workers) as pool:
	        pool.map(web_spider, urls)

	# Run only when executed as a script. With multiprocessing start methods
	# that re-import this module in every worker (spawn), an unguarded call
	# would re-run main() inside each child process.
	if __name__ == "__main__":
	    main()