Created
February 17, 2016 09:17
-
-
Save MarcoQin/3f57b31846495ada2834 to your computer and use it in GitHub Desktop.
Animenz sheets spider
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# encoding: utf-8 | |
import requests | |
def extract(begin, end, html):
    """Return the text between *begin* and *end* inside *html*, stripped.

    If *end* is None, return everything after *begin* to the end of the
    string. Always returns a string: '' when *html* is falsy or when the
    delimiters are not found (the original returned an implicit None in
    the not-found cases, inconsistently with the '' empty-input path).
    """
    if not html:
        return ''
    start = html.find(begin)
    if start < 0:
        # begin marker not present
        return ''
    start += len(begin)
    if end is None:
        # open-ended extraction: take the rest of the string
        return html[start:].strip()
    stop = html.find(end, start)
    if stop < 0:
        # end marker not present after begin
        return ''
    return html[start:stop].strip()
def extract_all(begin, end, html):
    """Public wrapper: collect every substring of *html* that appears
    between a *begin* marker and the next *end* marker.

    Delegates all work to the private _extract_all implementation.
    """
    return _extract_all(begin, end, html)
def _extract_all(begin, end, html): | |
if not html: | |
return '' | |
result = [] | |
from_pos = 0 | |
while True: | |
start = html.find(begin, from_pos) | |
if start >= 0: | |
start += len(begin) | |
endpos = html.find(end, start) | |
if endpos >= 0: | |
result.append(html[start:endpos]) | |
from_pos = endpos + len(end) | |
continue | |
break | |
return result | |
class Spider(object):
    """Scrape a sheethost.com user page and download every linked sheet
    file (.mid / .pdf) into the local ``sheets/`` directory.

    Network I/O happens eagerly: the listing page is fetched in
    __init__, and each call to get_file() downloads one file.
    """

    def __init__(self, url):
        self.url = url
        # Raw HTML of the user's sheet listing page.
        self.r = requests.get(url).text  # result

    def track_out_all_links(self, html):
        # list() makes the result sized and re-iterable: on Python 3
        # filter() returns a one-shot iterator, which would make len()
        # fail here and leave nothing for run() to iterate afterwards.
        self.all_links = list(filter(
            lambda x: x.startswith('http://sheethost.com/sheet/'),
            extract_all('<td width="100%"><a href="', '"', html)))
        # Single-argument print() calls behave identically on Py2 and Py3.
        print(len(self.all_links))
        for link in self.all_links:
            print(link)

    def track_out_tables(self):
        # The sheet table is delimited by HTML comment markers on the page.
        table = extract('<!-- start table -->', '<!-- end table -->', self.r)
        return table

    def run(self):
        table = self.track_out_tables()
        self.track_out_all_links(table)
        for index, link in enumerate(self.all_links):  # index for download special index of sheet
            self.parse_single_link(link)

    def parse_single_link(self, link):
        # Fetch one sheet page and download any .mid / .pdf it links to.
        r = requests.get(link).text
        title = extract('<title>', ' –', r)
        print(title)
        pos = r.find(".mid")
        pos1 = r.find(".pdf")
        if pos != -1:
            # The href attribute precedes the extension; search a 100-char
            # window before it for the enclosing href="...".
            link = extract('href="', '"', r[pos - 100:pos])
            self.get_file(link)
        if pos1 != -1:
            link = extract('href="', '"', r[pos1 - 100:pos1])
            self.get_file(link)

    def get_file(self, link):
        # Download one file; the server names it via Content-Disposition.
        rm = requests.get(link)
        tmp = rm.headers['content-disposition']
        pos0 = tmp.find('filename="')
        pos0 = pos0 + len('filename="')
        pos01 = tmp[pos0:].find('"')
        filename = tmp[pos0:pos0 + pos01]
        print(filename)
        # The original crashed with IOError if sheets/ did not exist;
        # create it on demand (exist-check form works on Py2 and Py3).
        import os
        if not os.path.isdir('sheets'):
            os.makedirs('sheets')
        with open('sheets/{}'.format(filename), 'wb') as f:
            f.write(rm.content)
if __name__ == "__main__":
    # Crawl Animenz's sheet listing and download every linked sheet file.
    spider = Spider('http://sheethost.com/user/animenz/sheets')
    spider.run()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment