Created
November 30, 2014 22:07
-
-
Save samatt/954dfb8f981a26093eac to your computer and use it in GitHub Desktop.
zazzleScrapyer.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from bs4 import BeautifulSoup | |
import urllib2 | |
import re | |
import os | |
import os.path | |
import pprint | |
import json | |
import mechanize | |
import random | |
from urlparse import urlparse | |
def load_JSON(filename):
    """Parse *filename* as JSON and return the decoded object."""
    with open(filename) as fp:
        parsed = json.load(fp)
    return parsed
def load_file(filename):
    """Open *filename* for reading and return the open file object.

    Bug fix: the original ignored *filename* entirely and always opened
    the hard-coded path 'zazzleUrls2.txt'.  The caller is responsible
    for closing the returned file.
    """
    return open(filename, "r")
def save_images(url, image, dir='images/'):
    """Download *image* and write it into *dir*.

    The file name is derived from *url*'s query string: the values of the
    7th and 9th '&'-separated query args joined by '___', with a '.jpg'
    extension (matches the naming scheme checked in parse_mechanize).

    Fixes over the original: the output file is opened in binary mode
    ('wb', not text 'w+' — JPEG data would be corrupted on platforms that
    translate newlines), and both the HTTP response and the output file
    are closed even when an error occurs.
    """
    response = urllib2.urlopen(image)
    try:
        args = urlparse(url).query.split('&')
        # "<arg6 value>___<arg8 value>.jpg" — same scheme as parse_mechanize.
        name = args[6].split('=')[1] + '___' + args[8].split('=')[1] + ".jpg"
        with open(dir + name, 'wb') as output:
            output.write(response.read())
    finally:
        response.close()
def valid_product(pd):
    """Return False when *pd* is a known-bad product id, else True.

    Bug fix: the original contained a stray no-op expression statement
    (a bare reference to ``valid_product``) before the test; it has been
    removed.  The chained equality checks are collapsed into a single
    membership test against a tuple of the blacklisted ids.
    """
    # Current "bad eggs" filtered out of the scrape.
    bad_ids = ('235452859979983154', '235477222237641929', '256358047748678165')
    return pd not in bad_ids
def option_selector(pd):
    """Return the view-selector DOM id to scrape for product id *pd*.

    A handful of products need a non-default view; everything else uses
    option 2.  Echoes *pd* to stdout as a progress trace.
    """
    print(pd)  # single-argument print: identical output under Python 2 and 3
    # Product-id -> view option number; products not listed use option 2.
    special_views = {
        "256295157103600855": 4,
        "256155280231202463": 4,
        "256576786624590790": 1,
        "168750349767412176": 1,
        "256984216551176475": 0,
        "235521681047767555": 3,
        "235831956280979601": 3,
        "256383619216386932": 3,
        "165820488325253739": 4,
        "256919511037891195": 5,
    }
    option = special_views.get(pd, 2)
    return 'page_productUi_view_viewSelector_option' + str(option)
def get_images(url):
    """Scrape the image for the product at *url* via a random view option.

    The product id is read from the 9th '&'-separated query argument of
    *url*.  Products that are blacklisted (see valid_product) or that
    have no entry in the module-level ``optionData`` mapping are skipped
    silently.
    """
    args = urlparse(url).query.split('&')
    url_name_pd = args[8].split('=')[1]  # product id lives in query arg 8
    # Membership test directly on the dict — no need for .keys().
    if valid_product(url_name_pd) and url_name_pd in optionData:
        # optionData maps product id -> list of usable view option numbers.
        option = ("page_productUi_view_viewSelector_option"
                  + str(random.choice(optionData[url_name_pd])))
        parse_mechanize(url=url, scrolly_id=option)
def parse_mechanize(url,
                    scrolly_id="page_productUi_view_viewSelector_option2",
                    main_view_id="page_productUi_view_mainView-realview",
                    pd="None"):
    """Fetch the product page at *url* with mechanize and save its image.

    Skips the download when a file named from *url*'s query args already
    exists under ./new_ant/.  If the page has a view selector matching
    *scrolly_id*, that view's image is used (with its size suffix bumped
    to _512); otherwise the *main_view_id* image is used.  The image is
    written via save_images into "new_spy/".

    *pd* is unused but retained for backward compatibility with callers
    that pass it.  (Fixes over the original: the unused ``name_short``
    local and commented-out code are removed; the else-after-return is
    flattened.)
    """
    args = urlparse(url).query.split('&')
    # "<arg0>___<arg6>___<arg8>.jpg" — existence check key for this product.
    name = (args[0].split('=')[1] + '___'
            + args[6].split('=')[1] + '___'
            + args[8].split('=')[1] + ".jpg")
    if os.path.exists(os.getcwd() + "/new_ant/" + name):
        print(args[8].split('=')[1] + " exists")
        return
    print(args[8].split('=')[1] + " not exists")
    browser = mechanize.Browser()
    browser.set_handle_robots(False)  # site blocks bots; ignore robots.txt
    browser.open(url)
    page = browser.response().get_data()
    soup = BeautifulSoup(page)
    scrolly = soup.find("div", {"id": scrolly_id})
    if scrolly is not None:
        # Multi-view product: take the selected view's image and bump the
        # two-digit size suffix (e.g. "_75.jpg") up to "_512.jpg".
        image = soup.find("img", {"id": scrolly_id + "-image"})['src']
        image = re.sub(r'_[0-9][0-9].jpg', '_512.jpg', image.rstrip())
    else:
        # Single-view product: fall back to the main view's image.
        image = soup.find("img", {"id": main_view_id})['src']
    print(image)
    save_images(url=url, image=image, dir="new_spy/")
def parse_data(url):
    """Fetch *url* with urllib2, pick out the product image, and save it.

    Prefers the option-2 view-selector image when the page has one,
    otherwise falls back to the main realview image.  The chosen image
    URL is echoed to stdout and handed to save_images.
    """
    page = urllib2.urlopen(url).read()
    soup = BeautifulSoup(page)
    selector = soup.find("div", {"id": "page_productUi_view_viewSelector_option2"})
    if selector is None:
        # Single-view page: only the main realview image is available.
        image = soup.find("img", {"id": "page_productUi_view_mainView-realview"})['src']
    else:
        # Multi-view page: use the option-2 image (tag echoed for debugging).
        tag = soup.find("img", {"id": "page_productUi_view_viewSelector_option2-image"})
        print(tag)
        image = tag['src']
    print(image)
    save_images(url, image)
# --- script body: batch-download an image for every product url --------------
data = load_JSON('new_ant.json')               # scraped product records
optionData = load_JSON('image_options.json')   # product id -> view option list

# NOTE(review): `links` appears unused below — kept in case another part of
# the file (not visible here) reads it; confirm before deleting.
links = [o['product_urls'] for o in data]

for record in data:
    for product_url in record['product_urls']:
        # For manual one-off scraping, call parse_mechanize directly with an
        # explicit scrolly_id.  For batch JSON processing:
        get_images(product_url)
#pacifier | |
# 256295157103600855 | |
# page_productUi_view_viewSelector_option4 | |
# speakers | |
# 256155280231202463 | |
# page_productUi_view_viewSelector_option2 | |
# paddle | |
# 256984216551176475 | |
# page_productUi_view_viewSelector_option0 | |
#hoodie | |
# 235521681047767555 | |
# page_productUi_view_viewSelector_option3 | |
#hoodie | |
# 235831956280979601 | |
# page_productUi_view_viewSelector_option3 | |
# f = load_file('zazzleUrls2') | |
# for line in f: | |
# save_images(url) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment