-
-
Save arowser/4129241 to your computer and use it in GitHub Desktop.
capture for http://simplecd.me/entry/L1Z9ddOT/
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding=utf-8
import gzip
import os
import re
import socket
import urllib.request
from datetime import datetime
from html.parser import HTMLParser
from time import sleep
# Process-wide default timeout (in seconds) for sockets created after this
# point, so a stalled HTTP request cannot block the script forever.
_SOCKET_TIMEOUT_SECONDS = 60
socket.setdefaulttimeout(_SOCKET_TIMEOUT_SECONDS)
class main_parser(HTMLParser):
    """Collect the ``href`` of every ``<a>`` tag that looks like a download link.

    An href is kept when characters 1-8 of its value spell ``download``
    (for example ``/download/?rid=123``); the first character is not checked.
    Matching values are stored unchanged, in document order, in ``link_list``.
    """

    def __init__(self):
        HTMLParser.__init__(self)
        # href values collected so far, in the order they were parsed.
        self.link_list = []

    def get_list(self):
        """Return the collected href values (the live list, not a copy)."""
        return self.link_list

    def print_list(self):
        """Print the collected href values to stdout."""
        print(self.link_list)

    def handle_starttag(self, tag, attrs):
        """Record the href of an ``<a>`` tag when it matches the download pattern."""
        if tag != 'a':
            return
        for name, value in attrs:
            # HTMLParser reports a valueless attribute (``<a href>``) as None;
            # slicing None would raise TypeError, so such attributes are skipped.
            if name == 'href' and value is not None and value[1:9] == 'download':
                self.link_list.append(value)

    def handle_endtag(self, tag):
        """Ignore end tags; only start tags carry the href attribute."""

    def handle_data(self, data):
        """Ignore text content; only attributes are inspected."""
class detail_parser(HTMLParser):
    """Collect the ``href`` of every ``<a>`` tag whose value starts with ``ed2k``.

    Matching values are stored unchanged, in document order, in ``link_list``.
    """

    def __init__(self):
        HTMLParser.__init__(self)
        # ed2k href values collected so far, in the order they were parsed.
        self.link_list = []

    def get_list(self):
        """Return the collected href values (the live list, not a copy)."""
        return self.link_list

    def print_list(self):
        """Print the collected href values to stdout."""
        print(self.link_list)

    def handle_starttag(self, tag, attrs):
        """Record the href of an ``<a>`` tag when it begins with ``ed2k``."""
        if tag != 'a':
            return
        for name, value in attrs:
            # HTMLParser reports a valueless attribute (``<a href>``) as None;
            # slicing None would raise TypeError, so such attributes are skipped.
            if name == 'href' and value is not None and value.startswith('ed2k'):
                self.link_list.append(value)

    def handle_endtag(self, tag):
        """Ignore end tags; only start tags carry the href attribute."""

    def handle_data(self, data):
        """Ignore text content; only attributes are inspected."""
def getUrl(url, coding):
    """Fetch *url* and return its body decoded to text.

    Progress is reported on stdout before and after the download.

    Args:
        url: Address to request; anything ``urllib.request.urlopen`` accepts.
        coding: Name of the character encoding used to decode the body.
            Bytes that are invalid in that encoding are silently dropped.

    Returns:
        The response body as ``str``.

    Raises:
        urllib.error.URLError: If the request cannot be completed.
    """
    print('getUrl: ',url)
    # The context manager closes the response even if read() raises.
    with urllib.request.urlopen(url) as response:
        # Header lookup is case-insensitive, so a single spelling is enough.
        content_encoding = response.info().get('Content-Encoding') or ''
        rawdata = response.read()
    # Gunzip only bodies that are actually labelled gzip; other values such as
    # "identity" are passed through instead of failing inside the gzip reader.
    if content_encoding.strip().lower() in ('gzip', 'x-gzip'):
        rawdata = gzip.decompress(rawdata)
    print('downloaded: ',url)
    return rawdata.decode(coding, 'ignore')
def main():
    """Scrape the ed2k links of one simplecd.me entry into ``ed2k_result.txt``.

    Downloads the entry page, follows every download link found on it,
    collects the ``ed2k`` hrefs from those pages, then prints each link and
    writes it to the result file, one link per line terminated by CRLF.
    """
    page = getUrl('http://simplecd.me/entry/L1Z9ddOT/','utf-8')
    _man_parser = main_parser()
    _man_parser.feed(page)

    ed2k_list = []
    for href in _man_parser.get_list():
        # Download hrefs are site-relative, so prefix the site origin.
        detail_page = getUrl('http://simplecd.me' + href, 'utf-8')
        _detail_parser = detail_parser()
        _detail_parser.feed(detail_page)
        ed2k_list.extend(_detail_parser.get_list())

    # newline='' writes the explicit '\r\n' verbatim on every platform (text
    # mode on Windows would otherwise expand it to '\r\r\n'); UTF-8 keeps
    # non-ASCII file names in the links writable regardless of the locale.
    # The with-block closes the file even if a write fails.
    with open("ed2k_result.txt", "wt", encoding="utf-8", newline="") as f:
        for link in ed2k_list:
            print(link)
            f.write(link)
            f.write('\r\n')
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment