Skip to content

Instantly share code, notes, and snippets.

@MarcoQin
Created February 17, 2016 09:17
Show Gist options
  • Save MarcoQin/3f57b31846495ada2834 to your computer and use it in GitHub Desktop.
Save MarcoQin/3f57b31846495ada2834 to your computer and use it in GitHub Desktop.
Animenzzz sheets spider
#!/usr/bin/env python
# encoding: utf-8
import os

import requests
def extract(begin, end, html):
    """Return the stripped text found between two markers in ``html``.

    The search starts at the first occurrence of ``begin``; the result runs
    from just after that marker up to the next occurrence of ``end``.
    Passing ``end=None`` takes everything that follows ``begin``.

    Returns ``''`` when ``html`` is empty or ``None``, and ``None`` when
    ``begin`` is absent or ``end`` does not occur after it.
    """
    if not html:
        return ''
    start = html.find(begin)
    if start < 0:
        return None
    start += len(begin)
    if end is None:
        # No closing marker requested: take the remainder of the text.
        return html[start:].strip()
    stop = html.find(end, start)
    if stop < 0:
        return None
    return html[start:stop].strip()
def extract_all(begin, end, html):
    """Collect every substring of ``html`` enclosed by ``begin`` and ``end``.

    Public entry point; the scanning itself is done by ``_extract_all`` and
    its result is returned unchanged.
    """
    return _extract_all(begin, end, html)
def _extract_all(begin, end, html):
if not html:
return ''
result = []
from_pos = 0
while True:
start = html.find(begin, from_pos)
if start >= 0:
start += len(begin)
endpos = html.find(end, start)
if endpos >= 0:
result.append(html[start:endpos])
from_pos = endpos + len(end)
continue
break
return result
class Spider(object):
    """Download the .mid/.pdf files linked from a sheethost.com listing page.

    The listing page is fetched when the object is constructed; ``run()``
    then visits every sheet link found in it and saves the linked files
    into the ``sheets`` directory.
    """

    # Only anchors whose href starts with this prefix are followed.
    SHEET_PREFIX = 'http://sheethost.com/sheet/'
    # Directory the downloaded files are written to.
    OUTPUT_DIR = 'sheets'
    # How many characters before a ".mid"/".pdf" hit are searched for href="...".
    LOOKBACK = 100
    # Seconds to wait on each HTTP request so a stalled server cannot hang the run.
    TIMEOUT = 30

    def __init__(self, url):
        """Remember ``url`` and fetch its body text into ``self.r``."""
        self.url = url
        self.r = requests.get(url, timeout=self.TIMEOUT).text  # result
        # Filled in by track_out_all_links(); defined here so it always exists.
        self.all_links = []

    def track_out_all_links(self, html):
        """Store in ``self.all_links`` every sheet link found in ``html``.

        Prints the number of links followed by each link.
        """
        candidates = extract_all('<td width="100%"><a href="', '"', html)
        # A real list (not a lazy filter object) so len() works on Python 3 too.
        self.all_links = [
            href for href in candidates if href.startswith(self.SHEET_PREFIX)
        ]
        print(len(self.all_links))
        for link in self.all_links:
            print(link)

    def track_out_tables(self):
        """Return the part of the listing page between the table comments."""
        return extract('<!-- start table -->', '<!-- end table -->', self.r)

    def run(self):
        """Find all sheet links on the listing page and process each one."""
        table = self.track_out_tables()
        self.track_out_all_links(table)
        for link in self.all_links:
            self.parse_single_link(link)

    def parse_single_link(self, link):
        """Fetch one sheet page, print its title and download its files.

        For each of ``.mid`` and ``.pdf`` the first occurrence in the page is
        located, and the first ``href="..."`` in the ``LOOKBACK`` characters
        before it is downloaded.
        """
        page = requests.get(link, timeout=self.TIMEOUT).text
        title = extract('<title>', ' &ndash;', page)
        print(title)
        for suffix in ('.mid', '.pdf'):
            pos = page.find(suffix)
            if pos == -1:
                continue
            # Clamp at 0: a negative start would index from the end of the page.
            window = page[max(0, pos - self.LOOKBACK):pos]
            file_link = extract('href="', '"', window)
            if file_link:
                self.get_file(file_link)

    @staticmethod
    def _filename_from_disposition(header):
        """Return the bare file name quoted in a Content-Disposition value.

        Returns ``''`` when ``header`` is empty/``None``, has no
        ``filename="..."`` part, the closing quote is missing, or the name
        has no usable final path component.
        """
        marker = 'filename="'
        if not header:
            return ''
        start = header.find(marker)
        if start < 0:
            return ''
        start += len(marker)
        stop = header.find('"', start)
        if stop < 0:
            return ''
        # The header is server-controlled: keep only the last path component
        # so the file cannot be written outside the output directory.
        name = os.path.basename(header[start:stop].replace('\\', '/'))
        if name in ('.', '..'):
            return ''
        return name

    def get_file(self, link):
        """Download ``link`` and save it under ``OUTPUT_DIR``.

        The file name is taken from the response's ``content-disposition``
        header; the download is skipped (with a message) when no name can be
        read from it. Prints the file name before writing.
        """
        if not link:
            return
        rm = requests.get(link, timeout=self.TIMEOUT)
        filename = self._filename_from_disposition(
            rm.headers.get('content-disposition'))
        if not filename:
            print('no filename in response for {}'.format(link))
            return
        print(filename)
        if not os.path.isdir(self.OUTPUT_DIR):
            os.makedirs(self.OUTPUT_DIR)
        with open(os.path.join(self.OUTPUT_DIR, filename), 'wb') as f:
            f.write(rm.content)
# Script entry point: crawl the Animenz sheet listing and download every file.
if __name__ == "__main__":
    Spider('http://sheethost.com/user/animenz/sheets').run()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment