Skip to content

Instantly share code, notes, and snippets.

@flufy3d
Created September 14, 2012 09:54
Show Gist options
  • Save flufy3d/3721095 to your computer and use it in GitHub Desktop.
Save flufy3d/3721095 to your computer and use it in GitHub Desktop.
#coding=utf-8
from datetime import datetime
import os
import re
import urllib.request
from html.parser import HTMLParser
from time import sleep
import socket
# Give every socket created from here on a 60-second default timeout, so a
# stalled HTTP request raises a timeout error instead of blocking indefinitely.
socket.setdefaulttimeout(60)
class main_parser(HTMLParser):
    """Collect the ``href`` of every ``<a>`` tag that looks like a download link.

    An href is kept when its characters 1 through 8 spell ``download``
    (for example ``/download/...``). Matching hrefs are appended to
    ``link_list`` in document order.
    """

    def __init__(self):
        super().__init__()
        self.link_list = []

    def get_list(self):
        """Return the list of collected hrefs (the live list, not a copy)."""
        return self.link_list

    def print_list(self):
        """Print the collected hrefs to stdout."""
        print(self.link_list)

    def handle_starttag(self, tag, attrs):
        """Record the href of an ``<a>`` tag when it matches the download pattern.

        Args:
            tag: Tag name as passed in by HTMLParser.
            attrs: List of ``(name, value)`` pairs for the tag's attributes.
        """
        if tag != 'a':
            return
        for name, value in attrs:
            # HTMLParser reports a valueless attribute (``<a href>``) as None;
            # slicing None would raise TypeError, so such attributes are skipped.
            if name == 'href' and value is not None and value[1:9] == 'download':
                self.link_list.append(value)

    def handle_endtag(self, tag):
        """Ignore end tags; only start tags carry the links of interest."""
        pass

    def handle_data(self, data):
        """Ignore text content."""
        pass
class detail_parser(HTMLParser):
    """Collect the ``href`` of every ``<a>`` tag whose href starts with ``ed2k``.

    Matching hrefs are appended to ``link_list`` in document order.
    """

    def __init__(self):
        super().__init__()
        self.link_list = []

    def get_list(self):
        """Return the list of collected ed2k hrefs (the live list, not a copy)."""
        return self.link_list

    def print_list(self):
        """Print the collected ed2k hrefs to stdout."""
        print(self.link_list)

    def handle_starttag(self, tag, attrs):
        """Record the href of an ``<a>`` tag when it begins with ``ed2k``.

        Args:
            tag: Tag name as passed in by HTMLParser.
            attrs: List of ``(name, value)`` pairs for the tag's attributes.
        """
        if tag != 'a':
            return
        for name, value in attrs:
            # HTMLParser reports a valueless attribute (``<a href>``) as None;
            # it cannot be an ed2k link and must not be sliced or searched.
            if name == 'href' and value is not None and value.startswith('ed2k'):
                self.link_list.append(value)

    def handle_endtag(self, tag):
        """Ignore end tags; only start tags carry the links of interest."""
        pass

    def handle_data(self, data):
        """Ignore text content."""
        pass
def getUrl(url, coding):
    """Fetch *url* and return the response body decoded to ``str``.

    Progress messages are printed to stdout before and after the download.

    Args:
        url: Address to open with ``urllib.request.urlopen``.
        coding: Name of the text encoding used to decode the body.

    Returns:
        The body decoded with *coding*; bytes that cannot be decoded are
        dropped (``errors='ignore'``).

    Raises:
        urllib.error.URLError: If the URL cannot be opened.
        OSError: If the body is declared gzip-encoded but is not valid gzip.
    """
    import gzip

    print('getUrl: ', url)
    # The context manager closes the connection even if read() fails.
    with urllib.request.urlopen(url) as response:
        headers = response.info()
        rawdata = response.read()

    # Header lookup is case-insensitive, so a single lookup is enough.
    # Only gzip bodies are decompressed: other Content-Encoding values
    # (e.g. "identity") are not gzip streams and must be left untouched.
    content_encoding = (headers.get('Content-Encoding') or '').strip().lower()
    if content_encoding in ('gzip', 'x-gzip'):
        rawdata = gzip.decompress(rawdata)

    print('downloaded: ', url)
    return rawdata.decode(coding, 'ignore')
def main(entry_url='http://simplecd.me/entry/L1Z9ddOT/',
         site_root='http://simplecd.me',
         output_path='ed2k_result.txt'):
    """Scrape ed2k links reachable from an entry page and save them to a file.

    The entry page is parsed with ``main_parser`` to find download-page
    hrefs. Each of those pages is fetched and parsed with ``detail_parser``.
    Every ed2k link found is printed and written to *output_path*, one per
    line, terminated by CRLF.

    Args:
        entry_url: Page listing the download links.
        site_root: Prefix prepended to each href found on the entry page.
        output_path: File that receives the collected links (overwritten).
    """
    index_parser = main_parser()
    index_parser.feed(getUrl(entry_url, 'utf-8'))

    ed2k_list = []
    for href in index_parser.get_list():
        page_parser = detail_parser()
        page_parser.feed(getUrl(site_root + href, 'utf-8'))
        ed2k_list.extend(page_parser.get_list())

    # newline='' turns off newline translation, so the explicit '\r\n' is
    # written as-is on every platform (translated text mode would emit
    # '\r\r\n' on Windows). The with-block closes the file even on error.
    with open(output_path, 'wt', newline='') as f:
        for link in ed2k_list:
            print(link)
            f.write(link)
            f.write('\r\n')
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment