Skip to content

Instantly share code, notes, and snippets.

@samatt
Created November 30, 2014 22:07
Show Gist options
  • Save samatt/954dfb8f981a26093eac to your computer and use it in GitHub Desktop.
Save samatt/954dfb8f981a26093eac to your computer and use it in GitHub Desktop.
zazzleScrapyer.py
from bs4 import BeautifulSoup
import urllib2
import re
import os
import os.path
import pprint
import json
import mechanize
import random
from urlparse import urlparse
def load_JSON(filename):
    """Parse the JSON document stored in ``filename`` and return the result.

    The file is opened in the default text mode and is closed again before
    the parsed object is handed back to the caller.
    """
    with open(filename) as handle:
        return json.load(handle)
def load_file(filename):
    """Open ``filename`` for reading in text mode and return the file object.

    The caller owns the returned handle and is responsible for closing it.
    """
    # Honour the argument: this used to open the hard-coded
    # 'zazzleUrls2.txt' regardless of what was passed in.
    return open(filename, "r")
def save_images(url, image, dir='images/'):
    """Download ``image`` and store it under a file name derived from ``url``.

    url   -- product page URL. The values of the query parameters at
             positions 6 and 8 (zero-based, split on '&') are joined with
             '___' and suffixed with '.jpg' to form the file name.
    image -- URL of the image to download.
    dir   -- prefix prepended verbatim to the file name, so it has to end
             with a path separator to act as a directory.

    Raises IndexError when the query string has fewer than nine parameters
    or one of the two parameters used has no '=' in it.
    """
    args = urlparse(url).query.split('&')
    name = args[6].split('=')[1] + '___' + args[8].split('=')[1] + ".jpg"
    response = urllib2.urlopen(image)
    try:
        # The payload is binary image data, so write it in binary mode;
        # text mode would apply newline translation on some platforms.
        with open(dir + name, 'wb') as output:
            output.write(response.read())
    finally:
        # Release the HTTP connection even when the write fails.
        response.close()
def valid_product(pd):
    """Return False for the product ids known to be bad, True otherwise.

    pd -- product id, compared against string ids.
    """
    # filtering current bad eggs
    bad_ids = (
        '235452859979983154',
        '235477222237641929',
        '256358047748678165',
    )
    return pd not in bad_ids
def option_selector(pd):
    """Return the view-selector option element id to use for product ``pd``.

    The product id is echoed to stdout first. Ids listed in the table below
    get their dedicated option; every other id falls back to option2.
    """
    # need different images for these products
    print(pd)
    overrides = (
        (("256295157103600855", "256155280231202463", '165820488325253739'),
         'page_productUi_view_viewSelector_option4'),
        (("256576786624590790", '168750349767412176'),
         'page_productUi_view_viewSelector_option1'),
        (("256984216551176475",),
         "page_productUi_view_viewSelector_option0"),
        (("235521681047767555", "235831956280979601", '256383619216386932'),
         "page_productUi_view_viewSelector_option3"),
        (('256919511037891195',),
         "page_productUi_view_viewSelector_option5"),
    )
    for product_ids, option_id in overrides:
        if pd in product_ids:
            return option_id
    return 'page_productUi_view_viewSelector_option2'
def get_images(url):
    """Scrape one image for the product referenced by ``url``.

    The product id is the value of the query parameter at position 8
    (zero-based, split on '&'). Products rejected by ``valid_product`` or
    missing from the module-level ``optionData`` mapping are skipped
    silently. Otherwise one of the option numbers listed for the product is
    picked at random and handed to ``parse_mechanize``.
    """
    query_fields = urlparse(url).query.split('&')
    product_id = query_fields[8].split('=')[1]
    if not valid_product(product_id):
        return
    if product_id not in optionData:
        return
    chosen = random.choice(optionData[product_id])
    parse_mechanize(
        url=url,
        scrolly_id="page_productUi_view_viewSelector_option" + str(chosen),
    )
def parse_mechanize(url,
                    scrolly_id="page_productUi_view_viewSelector_option2",
                    main_view_id="page_productUi_view_mainView-realview",
                    pd="None"):
    """Fetch a product page with mechanize and save one of its images.

    url          -- product page URL. The query parameters at positions 0,
                    6 and 8 (zero-based, split on '&') are used to build
                    the file name checked for an earlier download.
    scrolly_id   -- id of the view-selector <div> to prefer; its image is
                    looked up as the <img> with id ``scrolly_id + "-image"``.
    main_view_id -- id of the <img> used when that <div> is not on the page.
    pd           -- not used by this function; kept so existing callers can
                    continue to pass it.

    Returns None. Nothing is fetched when
    ``<cwd>/new_ant/<p0>___<p6>___<p8>.jpg`` already exists.
    Raises TypeError when the selected <img> is missing, because the lookup
    result is subscripted without a None check.
    """
    args = urlparse(url).query.split('&')
    product_id = args[8].split('=')[1]
    name = (args[0].split('=')[1] + '___' + args[6].split('=')[1]
            + '___' + product_id + ".jpg")
    # NOTE(review): the check looks in "new_ant/" for a three-part name,
    # while the download below goes to "new_spy/" -- confirm this is the
    # intended directory and naming scheme.
    if os.path.exists(os.getcwd() + "/new_ant/" + name):
        print(product_id + " exists")
        return
    print(product_id + " not exists")

    browser = mechanize.Browser()
    browser.set_handle_robots(False)
    browser.open(url)
    page = browser.response().get_data()
    soup = BeautifulSoup(page)

    scrolly = soup.find("div", {"id": scrolly_id})
    if scrolly is not None:
        image = soup.find("img", {"id": scrolly_id + "-image"})['src']
        # Rewrite a two-digit '_NN.jpg' suffix to '_512.jpg'. The dot is
        # escaped so that only a literal '.jpg' extension matches.
        image = re.sub(r'_[0-9][0-9]\.jpg', '_512.jpg', image.rstrip())
    else:
        image = soup.find("img", {"id": main_view_id})['src']
    # NOTE(review): assumed to run for both branches; the source this was
    # recovered from had lost its indentation -- confirm.
    print(image)
    save_images(url=url, image=image, dir="new_spy/")
def parse_data(url):
    """Fetch a product page with urllib2 and save one of its images.

    url -- product page URL; it is also passed to ``save_images``, which
           derives the output file name from it.

    When the page contains the <div> with id
    'page_productUi_view_viewSelector_option2', the matching '-image' <img>
    is used (the tag itself is printed first); otherwise the main-view
    <img> is used. The chosen image URL is printed and then downloaded.
    Raises TypeError when the selected <img> is missing, because the lookup
    result is subscripted without a None check.
    """
    response = urllib2.urlopen(url)
    try:
        page = response.read()
    finally:
        # Close the HTTP response explicitly instead of leaving it to the
        # garbage collector.
        response.close()
    soup = BeautifulSoup(page)

    scrolly = soup.find("div", {"id": "page_productUi_view_viewSelector_option2"})
    if scrolly is not None:
        tag = soup.find("img", {"id": "page_productUi_view_viewSelector_option2-image"})
        print(tag)
        image = tag['src']
    else:
        image = soup.find("img", {"id": "page_productUi_view_mainView-realview"})['src']
    # NOTE(review): assumed to run for both branches; the source this was
    # recovered from had lost its indentation -- confirm.
    print(image)
    save_images(url, image)
# Batch driver: load the product list and the per-product option numbers,
# then scrape one image for every product URL.
data = load_JSON('new_ant.json')
optionData = load_JSON('image_options.json')
links = [entry['product_urls'] for entry in data]
for product_urls in links:
    for product_url in product_urls:
        # For manual parsing of a single product, call parse_mechanize
        # directly instead, e.g.:
        # parse_mechanize(product_url, pd="235831956280979601", scrolly_id="page_productUi_view_viewSelector_option3")
        # Batch mode driven by the JSON files:
        get_images(product_url)
# Notes on per-product view-selector options.
# Each entry is: product name / product id / option element id.
# pacifier
# 256295157103600855
# page_productUi_view_viewSelector_option4
# speakers
# 256155280231202463
# page_productUi_view_viewSelector_option2
# paddle
# 256984216551176475
# page_productUi_view_viewSelector_option0
# hoodie
# 235521681047767555
# page_productUi_view_viewSelector_option3
# hoodie
# 235831956280979601
# page_productUi_view_viewSelector_option3
# f = load_file('zazzleUrls2')
# for line in f:
# save_images(url)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment