Skip to content

Instantly share code, notes, and snippets.

@buzztiaan
Created March 5, 2016 20:44
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save buzztiaan/4d6e49352810d65bb299 to your computer and use it in GitHub Desktop.
Save buzztiaan/4d6e49352810d65bb299 to your computer and use it in GitHub Desktop.
import urllib
import random
import gzip
import time
import os
import re
# Browser User-Agent strings the scraper can present to the server.
user_agents = [
    'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
    'Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)',
    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20100101 Ubuntu/raring Firefox/24.0',
]
try:
    # Python 3 releases that still ship the legacy opener keep it under
    # urllib.request.
    from urllib.request import FancyURLopener as _FancyURLopener
except ImportError:
    # Python 2: the class lives directly on the already-imported urllib module.
    _FancyURLopener = urllib.FancyURLopener


class URLOpener(_FancyURLopener):
    """URL opener that announces itself with a browser-like User-Agent.

    FancyURLopener sends the class attribute ``version`` as the User-Agent
    header.  The string is drawn from ``user_agents`` once, when this class
    body executes, so every request made during one run uses the same value.
    """
    version = random.choice(user_agents)
# Module-wide fetch function: the bound ``open`` method of one shared
# URLOpener instance, called as urlopen(url).read() by the functions below.
urlopen = URLOpener().open
#if os.path.exists('randomseed'):
# randomlist = [int(x) for x in open('randomseed','r').read().split(' ')]
#else:
# random.seed()
# randomlist = range(1,100000)
# print('Shuffling...')
# random.shuffle(randomlist)
# print('Shuffling Complete.')
# rfile = open('randomseed','w')
# rfile.write(' '.join([str(x) for x in randomlist]))
# rfile.close()
def _fetch_with_retry(url):
    """Return the body served at *url*, retrying until it is non-empty.

    An IOError raised by the opener is reported on stdout and followed by a
    2-4 second pause before the next attempt.  An empty body that arrives
    without an error is retried immediately.
    """
    data = b''
    while not data:
        try:
            data = urlopen(url).read()
        except IOError as e:
            print (e.strerror)
            time.sleep(random.randrange(2, 5))
    return data


def grabber(maxcount=0):
    """Download Thingiverse thing pages and zip archives by numeric id.

    Ids are walked upwards one at a time.  For every id that has neither a
    ``thing_<id>.html`` nor a ``thing_<id>.zip`` file in the current
    directory, the ``/zip`` URL is fetched first and, unless it returned an
    HTML page instead of an archive, the thing page is fetched as well and
    both bodies are written to disk.

    The walk ends when the bad counter reaches its limit.  The counter is
    incremented for an HTML-instead-of-zip answer on an id above *maxcount*
    (above 111000 when *maxcount* is 0) and for a "not found" page on an id
    above 100000; it is reset to 0 whenever a thing is saved.

    maxcount -- newest known thing id, or 0 when it is unknown.
        Non-zero: start at the id stored in the ``lastmaxcount`` file
        (111000 when that file is missing or empty), store *maxcount* in
        the file for the next run, and stop at the first counted miss.
        Zero: start at 111000, leave ``lastmaxcount`` alone, and stop after
        30 counted misses.

    Raises ValueError if ``lastmaxcount`` holds something that is not an
    integer.
    """
    default_start = 111000
    state_path = 'lastmaxcount'
    base_url = 'http://www.thingiverse.com/thing:'
    zip_suffix = '/zip'

    maxbadcount = 30
    oldmax = default_start
    if maxcount != 0:
        # A missing state file just means this is the first run.
        try:
            with open(state_path, 'r') as state:
                saved = state.read().strip()
        except IOError:
            saved = ''
        if saved:
            oldmax = int(saved)
        # Rewrite from scratch so a shorter number never leaves stale
        # trailing digits from the previous value behind.
        with open(state_path, 'w') as state:
            state.write(str(maxcount))
        maxbadcount = 1

    # Threshold used by the bad-zip branch when no newest id was supplied.
    bad_zip_limit = maxcount if maxcount != 0 else default_start

    badcount = 0
    num = oldmax - 1
    while badcount < maxbadcount:
        num += 1
        html_path = 'thing_' + str(num) + '.html'
        zip_path = 'thing_' + str(num) + '.zip'
        if os.path.exists(html_path) or os.path.exists(zip_path):
            print ('We have thing:' + str(num))
            continue

        thing_url = base_url + str(num)
        print ('Opening ' + thing_url + zip_suffix + ' ...')
        zipdata = _fetch_with_retry(thing_url + zip_suffix)

        # Response bodies are byte strings, so they are tested against byte
        # literals (plain str on Python 2, bytes on Python 3).
        if b'<!DOCTYPE html>' in zipdata:
            # The zip URL answered with an HTML page, not an archive.
            print ('Bad zip. Sorry!')
            if num > bad_zip_limit:
                badcount += 1
                print ('Bad counter now at ' + str(badcount))
        else:
            time.sleep(random.randrange(1, 4))
            print ('Opening ' + thing_url + ' ...')
            pagedata = _fetch_with_retry(thing_url)
            if b'YOU HAVE REACHED THE END OF THE THINGIVERSE' in pagedata:
                print ('Thing not found.')
                if num > 100000:
                    badcount += 1
                    print ('Bad counter now at ' + str(badcount))
            else:
                print ('Page OK.')
                badcount = 0
                print ('Writing data.')
                # Binary mode stores the bytes exactly as served; text mode
                # can corrupt the archive through newline translation.
                with open(html_path, 'wb') as ofile:
                    ofile.write(pagedata)
                with open(zip_path, 'wb') as zfile:
                    zfile.write(zipdata)

        # Short random pause before moving on to the next id.
        time.sleep(random.randrange(0, 2))
def newestfinder():
    """Return the first thing id referenced on the Thingiverse /newest page.

    The page body is searched for the first ``thing-name-<digits>``
    occurrence (presumably the most recent thing; this depends on the page
    layout).  The id is printed and returned as an int.  When nothing
    matches, the whole page body is printed for diagnosis and 0 is returned.
    """
    pdata = urlopen('http://www.thingiverse.com/newest').read()
    # Raw byte pattern: keeps ``\d`` out of string-escape processing and
    # matches the byte-string body on both Python 2 and Python 3.
    match = re.search(br'thing-name-(\d+)', pdata)
    if match is None:
        print (pdata)
        return 0
    digits = match.group(1).decode('ascii')
    print (digits)
    return int(digits)
# Script entry point (runs whenever this module is executed or imported):
# pass the id reported by newestfinder() -- 0 if none was found -- to
# grabber() as its maxcount.
grabber(newestfinder())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment