-
-
Save buzztiaan/4d6e49352810d65bb299 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import gzip
import os
import random
import re
import time
import urllib
# Browser User-Agent strings; URLOpener picks one of these at random.
user_agents = [
    'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
    'Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)',
    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20100101 Ubuntu/raring Firefox/24.0',
]
class URLOpener(urllib.FancyURLopener):
    """URL opener that presents a browser User-Agent instead of urllib's default."""

    # urllib's opener classes use the `version` attribute as the User-Agent.
    # The choice is made once, when this class body executes, so every
    # request in a single run carries the same randomly selected agent.
    version = random.choice(user_agents)
# Module-wide fetch function: the bound `open` of one shared URLOpener instance.
urlopen = URLOpener().open
#if os.path.exists('randomseed'):
# randomlist = [int(x) for x in open('randomseed','r').read().split(' ')]
#else:
# random.seed()
# randomlist = range(1,100000)
# print('Shuffling...')
# random.shuffle(randomlist)
# print('Shuffling Complete.')
# rfile = open('randomseed','w')
# rfile.write(' '.join([str(x) for x in randomlist]))
# rfile.close()
# Thing id used as the starting point / upper bound when nothing better is known.
_DEFAULT_START = 111000


def _read_last_max(path, default):
    """Return the thing id stored in *path*, or *default* if missing or unusable."""
    try:
        with open(path, 'r') as state:
            text = state.read().strip()
    except IOError:
        # First run: the state file does not exist yet.
        return default
    try:
        return int(text)
    except ValueError:
        # Empty or corrupted state file.
        return default


def _fetch(url):
    """Return the body of *url*, retrying (with a pause) until it is non-empty."""
    while True:
        data = b''
        try:
            response = urlopen(url)
            try:
                data = response.read()
            finally:
                response.close()
        except IOError as e:
            # strerror is None for many urllib errors; show the exception then.
            print(e.strerror or e)
        if data:
            return data
        # Pause after errors AND after empty bodies so a misbehaving server
        # is never hammered in a tight loop.
        time.sleep(random.randrange(2, 5))


def grabber(maxcount=0):
    """Download Thingiverse things into ``thing_<id>.html`` / ``thing_<id>.zip``.

    Walks thing ids upward, skipping ids for which either output file already
    exists in the current directory. For each remaining id the ``/zip`` URL is
    fetched first; if that response is not an HTML page, the thing page is
    fetched too and both bodies are written to disk.

    maxcount -- newest known thing id, or 0 if unknown.
        * non-zero: scanning resumes from the id stored in the ``lastmaxcount``
          file (or 111000 if that file is missing/empty/invalid), ``maxcount``
          is saved to ``lastmaxcount`` for the next run, and the scan stops at
          the first counted miss.
        * zero: scanning starts at 111000 and stops after 30 consecutive
          counted misses.

    A "bad zip" counts as a miss only for ids above ``maxcount`` (111000 when
    ``maxcount`` is 0); an end-of-Thingiverse page counts only for ids above
    100000. A successfully saved thing resets the miss counter.

    NOTE(review): this function was reconstructed from a paste that had lost
    all indentation. Please confirm against the upstream script: (a) that
    ``maxbadcount = 1`` belongs inside the ``maxcount != 0`` branch, (b) that
    the "Bad counter" messages are printed only when the counter changes, and
    (c) that the final 0-1 s sleep follows every download attempt rather than
    only the "Bad zip" branch.
    """
    base_url = 'http://www.thingiverse.com/thing:'
    zip_suffix = '/zip'

    start = _DEFAULT_START
    maxbadcount = 30
    if maxcount != 0:
        start = _read_last_max('lastmaxcount', _DEFAULT_START)
        # 'w' truncates, so a shorter number never leaves stale trailing digits.
        with open('lastmaxcount', 'w') as state:
            state.write(str(maxcount))
        maxbadcount = 1
    # Ids at or below this limit are expected to exist; bad zips there are not misses.
    limit = maxcount if maxcount != 0 else _DEFAULT_START

    badcount = 0
    num = start - 1
    while badcount < maxbadcount:
        num += 1
        html_path = 'thing_' + str(num) + '.html'
        zip_path = 'thing_' + str(num) + '.zip'
        if os.path.exists(html_path) or os.path.exists(zip_path):
            print('We have thing:' + str(num))
            continue

        thing_url = base_url + str(num)
        print('Opening ' + thing_url + zip_suffix + ' ...')
        zipdata = _fetch(thing_url + zip_suffix)

        # An HTML body at the /zip URL means there is no archive to download.
        if b'<!DOCTYPE html>' not in zipdata:
            time.sleep(random.randrange(1, 4))
            print('Opening ' + thing_url + ' ...')
            pagedata = _fetch(thing_url)
            if b'YOU HAVE REACHED THE END OF THE THINGIVERSE' in pagedata:
                print('Thing not found.')
                if num > 100000:
                    badcount += 1
                    print('Bad counter now at ' + str(badcount))
            else:
                print('Page OK.')
                badcount = 0
                print('Writing data.')
                # Binary mode: the payloads are raw bytes and must not be
                # newline-translated or re-encoded on the way to disk.
                with open(html_path, 'wb') as ofile:
                    ofile.write(pagedata)
                with open(zip_path, 'wb') as zfile:
                    zfile.write(zipdata)
        else:
            print('Bad zip. Sorry!')
            if num > limit:
                badcount += 1
                print('Bad counter now at ' + str(badcount))
        # Small random pause between things to pace the crawl.
        time.sleep(random.randrange(0, 2))
def newestfinder():
    """Return the first thing id found on Thingiverse's "newest" page.

    Fetches ``http://www.thingiverse.com/newest`` and looks for the first
    ``thing-name-<digits>`` occurrence. Prints and returns that id as an int.
    If no such marker is present, the raw page body is printed (to help
    diagnose the failure) and 0 is returned.
    """
    response = urlopen('http://www.thingiverse.com/newest')
    try:
        pdata = response.read()
    finally:
        response.close()
    # Raw bytes pattern: `\d` stays a regex escape rather than a string
    # escape, and the search works on the undecoded response body
    # (on Python 2 a bytes literal is a plain str).
    match = re.search(br'thing-name-(\d+)', pdata)
    if match is None:
        print(pdata)
        return 0
    thing_id = int(match.group(1))
    print(thing_id)
    return thing_id
# Script entry: crawl using the id reported by newestfinder() as the known maximum.
grabber(newestfinder())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment