Skip to content

Instantly share code, notes, and snippets.

@geekdinazor
Created March 9, 2015 11:28
Show Gist options
  • Save geekdinazor/0b12904ba7f8492df83b to your computer and use it in GitHub Desktop.
Save geekdinazor/0b12904ba7f8492df83b to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
"""
instaRaider.py
usage: instaRaider.py [-h] -u USER [-c COUNT]
@amirkurtovic
"""
from bs4 import BeautifulSoup
import selenium.webdriver as webdriver
import re
from time import sleep
import urllib
import urllib2
import os
import sys
import argparse
class instaRaider(object):
    '''
    Download the photos shown on one Instagram profile page.

    The profile is opened in a Selenium-driven Firefox window so that the
    lazily loaded photo grid ends up in the page source, which is then
    parsed with BeautifulSoup and saved photo by photo under ./Images/.
    '''

    def __init__(self, userName):
        '''
        userName: Instagram account name; used to build the profile URL
        and the output directory / file names.
        '''
        self.userName = userName
        self.profileUrl = 'http://instagram.com/' + userName + '/'
        # seconds to wait after each browser action and each download
        self.PAUSE = 1
        # location of the "Load more..." label on the profile page
        self.loadLabelXPATH = "/html/body/div/div/div/section/div/div/div/div/div/a/div"

    @staticmethod
    def _parseImageCount(page):
        '''
        Return, as an int, the number that follows counts":{"media": in
        the given page text. Raises ValueError if the marker is missing.
        '''
        found = re.search(r'counts":\{"media":(\d+)', page)
        if found is None:
            raise ValueError("Could not find the media count in the profile page.")
        return int(found.group(1))

    @staticmethod
    def _photoUrlFromStyle(style):
        '''
        Return the address inside the url(...) part of an inline style
        value, without surrounding quotes or the closing ");".
        Falls back to dropping the first 21 characters (the length of
        "background-image:url(") when no url(...) is present.
        '''
        found = re.search(r"""url\(\s*['"]?([^'")]+)""", style)
        if found:
            return found.group(1).strip()
        return style[21:]

    @staticmethod
    def _urlReachable(url):
        '''
        Return True if url can be opened without an error, else False.
        '''
        try:
            urllib2.urlopen(url).close()
        except Exception:
            # best-effort probe: any failure means "not reachable", but
            # KeyboardInterrupt / SystemExit are no longer swallowed
            return False
        return True

    def _scrollToBottom(self, driver):
        '''
        Scroll the browser to the bottom of the page, print one progress
        dot and wait self.PAUSE seconds.
        '''
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        sys.stdout.write('.')
        sys.stdout.flush()
        sleep(self.PAUSE)

    def getImageCount(self, url):
        '''
        Given a url to Instagram profile, return number of photos posted.

        The number is returned as an int so it can be compared directly
        with a requested photo count.
        Raises ValueError if the page does not contain the media count.
        '''
        response = urllib2.urlopen(url)
        try:
            page = response.read()
        finally:
            response.close()
        return self._parseImageCount(page)

    def loadInstagram(self, profileUrl, wanted=None):
        '''
        Using Selenium WebDriver, load Instagram page to get page source.

        profileUrl: kept for interface compatibility; the page that is
            loaded is always self.profileUrl.
        wanted: number of photos the caller intends to download. When
            omitted, the "count" attribute of a module-level "args"
            object is used if one exists (script mode). A missing or zero
            value means "all photos".

        Returns the BeautifulSoup tree of the loaded page.
        Exits the program via sys.exit() if the photo grid is not found.
        '''
        total = int(self.getImageCount(self.profileUrl))
        print(self.userName + " has " + str(total) + " photos on Instagram.")
        print("Loading Selenium WebDriver...")

        if wanted is None:
            # script mode keeps the parsed command line in a global "args";
            # when the class is imported there is none and we load everything
            wanted = getattr(globals().get('args'), 'count', None)

        # Load webdriver and scale window down
        driver = webdriver.Firefox()
        try:
            driver.set_window_size(40, 40)
            driver.set_window_position(20, 20)
            print("Loading Instagram profile...")
            # load Instagram profile and wait for PAUSE
            driver.get(self.profileUrl)
            driver.implicitly_wait(self.PAUSE)

            # A page without the .MediaComp element is treated as a
            # private profile. If so, exit
            try:
                driver.find_element_by_css_selector('.MediaComp')
            except Exception:
                sys.exit("User profile is private. Aborting.")

            for _ in range(3):
                self._scrollToBottom(driver)

            # The formula assumes 60 photos on first load and 20 more per
            # click; floor division keeps the result an integer.
            clicks = (total - 60) // 20 + 1

            # Load full Instagram profile if more than initial 60 photos desired
            if (not wanted or wanted > 60) and clicks > 0:
                # Click on "Load more..." label
                loadMore = driver.find_element_by_xpath(self.loadLabelXPATH)
                for _ in range(clicks):
                    loadMore.click()
                    self._scrollToBottom(driver)

            # After loading the profile photos, return source for getPhotos()
            return BeautifulSoup(driver.page_source)
        finally:
            # close Firefox window, also on sys.exit() or an error
            driver.close()

    def validUser(self, userName):
        '''
        Returns True if the profile page of this instance can be opened.

        userName: kept for interface compatibility; the check always uses
            self.profileUrl.
        '''
        return self._urlReachable(self.profileUrl)

    def photoExists(self, url):
        '''
        Returns true if photo exists
        Used when checking which suffix Instagram used for full-res photo
        url: URL to Instagram photo
        '''
        return self._urlReachable(url)

    def getPhotos(self, source, userName, count):
        '''
        Given source code for loaded Instagram page,
        extract all photo urls and download the photos.

        source: BeautifulSoup tree of the Instagram profile page
        userName: used for the output directory and the file names
        count: maximum number of photos to save (int or digit string)

        Photos are written to ./Images/<userName>/ and every saved url /
        file name pair is appended to <userName>.csv in that directory.
        '''
        count = int(count)

        # directory where photos will be saved
        directory = './Images/' + userName + '/'
        # check if directory exists, if not, make it
        if not os.path.exists(directory):
            os.makedirs(directory)

        # logfile to store urls is csv format
        logfile = directory + userName + '.csv'
        try:
            log = open(logfile, "a")
        except IOError:
            # keep downloading; only the activity log is skipped
            print("\nCould not open log file: " + logfile)
            log = None

        # photo number for file names
        photoNumber = 0
        print("\nRaiding Instagram...")
        print("Saving photos to " + directory)
        print("------")
        # print progress bar
        print("Photos saved so far:")
        print("---------10--------20--------30--------40--------50")

        try:
            for tile in source.findAll('div', {'class': ''}):
                if photoNumber >= count:
                    break

                # the photo address sits in the inline style of the first
                # child div; skip tiles that do not have one
                inner = tile.div
                style = inner.get('style') if inner is not None else None
                if not style:
                    continue

                photoNumber += 1
                photoUrl = self._photoUrlFromStyle(style)
                photoName = directory + userName + "_" + str(photoNumber) + '.jpg'

                # save photo
                urllib.urlretrieve(photoUrl, photoName)

                # save filename and url to CSV file
                if log is not None:
                    log.write(photoUrl + "," + photoName + "\n")

                # print hash to progress bar, 50 hashes per row
                if photoNumber > 1 and photoNumber % 50 == 1:
                    sys.stdout.write('\n')
                sys.stdout.write('#')
                sys.stdout.flush()
                sleep(self.PAUSE)
        finally:
            # close logfile
            if log is not None:
                log.close()

        print("\n------")
        print("Saved " + str(photoNumber) + " images to " + directory)
        if log is not None:
            print("Saved activity in logfile: " + logfile)
if __name__ == '__main__':
    # parse arguments
    parser = argparse.ArgumentParser(description="InstaRaider")
    parser.add_argument('-u', '--user', help="Instagram username", required=True)
    parser.add_argument('-c', '--count', help="# of photos to download", type=int)
    # NOTE: "args" must stay a module-level name; loadInstagram() looks up
    # args.count from the module globals.
    args = parser.parse_args()

    # --user is required, so it is always present here
    userName = args.user
    raider = instaRaider(userName)
    url = raider.profileUrl

    if raider.validUser(userName):
        # Ask Instagram once and reuse the answer. int() makes the value
        # comparable with --count even if it arrives as digit text.
        available = int(raider.getImageCount(url))

        if not args.count:
            # no (or zero) --count: download everything
            count = available
        elif available < args.count:
            print("You want to dowload %r photos." % args.count)
            print("The user only has %r photo." % available)
            print("Downloading all photos.")
            count = available
        else:
            count = args.count

        # Get source code from fully loaded Instagram profile page
        source = raider.loadInstagram(url)
        # Download all photos identified on profile page
        raider.getPhotos(source, userName, count)
    else:
        print("Username " + userName + " is not valid.")
@geekdinazor
Copy link
Author

Doesn't work anymore...

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment