Skip to content

Instantly share code, notes, and snippets.

@wafer-li
Created September 21, 2016 20:51
Show Gist options
  • Save wafer-li/b992e616034b4ad05b679e0a0197d8c8 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
import urllib.request
import re
from typing import List, Tuple
class App:
    """Scrape the IPv6 Forum 'IPv6 Enabled' approval list and collect the
    addresses of approved (i.e. not SERVICE-OUT) sites, collapsed to
    ``*.domain`` wildcards and deduplicated."""

    def __init__(self,
                 param_type: str = 'loc',
                 param_content: str = 'CN',
                 start_count: int = 0,
                 base_url: str = 'http://www.ipv6forum.com/ipv6_enabled/approval_list.php'):
        """
        :param param_type: search field name ('loc' = location).
        :param param_content: search value (e.g. 'CN' for China).
        :param start_count: zero-based offset of the first entry to fetch.
        :param base_url: endpoint of the approval-list page.
        """
        self.search_type = param_type
        self.search_content = param_content
        self.start_count = start_count
        self.base_url = base_url
        self.max_pages = None  # lazily-cached total entry count ('of N' on the page)
        self.result = None     # lazily-cached final address list

    def get_url(self) -> str:
        """Build (and remember in ``self.url``) the query URL for the current offset."""
        self.url = '{}?type={}&content={}&start={}'.format(
            self.base_url, self.search_type,
            self.search_content, self.start_count)
        return self.url

    def get_html(self) -> str:
        """Fetch the current result page and decode it as UTF-8."""
        return urllib.request.urlopen(self.get_url()).read().decode('UTF-8')

    def get_max_pages(self, html) -> int:
        """Return the total entry count reported by the page (the N in
        'x to y of N'); cached after the first call.

        NOTE(review): despite the name this is the total number of
        *entries*, not pages — see process_whole_pages.
        """
        if self.max_pages is None:  # fixed: was '== None'
            p = re.compile(r'<div.*?>.*?of (\d+)')
            self.max_pages = int(p.findall(html)[0])
        return self.max_pages

    def shrink_html(self, html) -> str:
        """Cut the page down to the table rows that follow the header row."""
        # Grab everything inside the results <table>.
        table = re.compile(r'<table.*?>(.*)</table>', re.DOTALL).findall(html)[0]
        # Drop everything up to and including the <th> header row (the only
        # row carrying an 'align' attribute on this page).
        # NOTE(review): deliberately no DOTALL here, matching the original —
        # assumes the table markup sits on a single line; verify against the site.
        return re.compile(r'<tr align.*?>(.*)').findall(table)[0]

    def get_status_and_address(self, html) -> List[Tuple[str, str]]:
        """Return (status, address) pairs scraped from the result table."""
        s_html = self.shrink_html(html)
        p = re.compile(r'<b>(.+?)</b>.*?<a href="http.*?">(.+?)<')
        return p.findall(s_html)

    def process_status_addresses(self, html) -> List[str]:
        """Return the addresses of all entries whose status is not SERVICE-OUT.

        BUG FIX: the original removed items from the list while iterating
        over it, which silently skips the element following each removal.
        """
        return [address
                for status, address in self.get_status_and_address(html)
                if status != 'SERVICE-OUT']

    def process_whole_pages(self) -> List[str]:
        """Walk every result page (30 entries each) and collect all addresses.

        BUG FIX: the original computed the loop count as ``total % 30``
        (the remainder), not the number of pages; it also fetched one page
        past the end. Use ceiling division and fetch only needed pages.
        """
        result = []
        html = self.get_html()
        total = self.get_max_pages(html)
        pages = -(-total // 30)  # ceil(total / 30) without importing math
        for page in range(pages):
            if page:  # first page is already in hand
                self.start_count += 30
                html = self.get_html()
            result.extend(self.process_status_addresses(html))
        return result

    def get_result(self) -> List[str]:
        """Return the deduplicated, wildcarded address list (computed once)."""
        if self.result is None:  # fixed: was '== None'
            self.process_address(self.process_whole_pages())
        return self.result

    def process_address(self, result: List[str]):
        """Collapse deep sub-domains to '*.domain' wildcards, dedupe, and
        store the outcome in ``self.result``."""
        self.result = []
        for address in result:
            if address.count('.') > 2:
                # Sites sharing a root domain collapse to one wildcard entry:
                # replace the first label with '*'.
                self.result.append('*' + address[address.index('.'):])
            else:
                self.result.append(address)
        # Remove duplicated items (order is not significant).
        self.result = list(set(self.result))
if __name__ == '__main__':
    app = App()
    # Write the collected addresses five per line, ';'-separated.
    # BUG FIX: the original opened the file without ever closing it, so the
    # final buffer could be lost; 'with' guarantees flush/close even on error.
    with open('ipv6_enable_list_cn.txt', 'w') as f:
        count = 0
        for address in app.get_result():
            count += 1
            f.write(address + ';')
            if count > 4:
                count = 0
                f.write('\n')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment