# xCASx/get-icons-for-site-list.py

## get-icons-for-site-list.py
'''
This program reads a list of URLs from a text file, requests each page, finds its favicons and saves them into the current directory.
'''
import urllib
from urllib.request import urlopen
from urllib.request import urlretrieve
from bs4 import BeautifulSoup

'''had to add this windows specific block to handle this bug in urllib2:
http://bugs.python.org/issue11220
'''
import ssl
# Install a global opener whose HTTPS handler uses the library's default
# client context: certificates and host names are verified and the TLS
# version is negotiated with the server.  The previous context pinned the
# deprecated TLS 1.0 protocol and left certificate verification off.
# NOTE: `ssl_context` actually holds the HTTPSHandler; the name is kept so
# any other code referring to it keeps working.
ssl_context = urllib.request.HTTPSHandler(context=ssl.create_default_context())
opener = urllib.request.build_opener(ssl_context)
urllib.request.install_opener(opener)
#end of urllib workaround

def getDomain(site):
    '''Return the host part of a site URL.

    Every "http://" and "https://" occurrence is removed, the text before
    the first "/" is kept, and trailing whitespace (for example a line
    ending) is stripped.
    '''
    stripped = site
    for scheme in ("http://", "https://"):
        stripped = stripped.replace(scheme, "")
    host, _, _ = stripped.partition('/')
    return host.rstrip()

def getHref(href, domain):
    '''Return an absolute URL for an icon reference found on a page.

    Args:
        href: value of a link tag's href attribute; may be absolute
            ("https://..."), protocol-relative ("//host/x.ico"),
            root-relative ("/x.ico") or relative ("img/x.ico").
        domain: host name of the site the link was found on.

    Returns:
        href resolved against "http://<domain>/".
    '''
    # Imported locally so the function does not depend on the module header.
    from urllib.parse import urljoin

    # urljoin leaves absolute URLs untouched and resolves every other form
    # against the site root; plain concatenation produced
    # "http://host//x.ico" and "http://host///cdn/x.ico".
    return urljoin('http://' + domain + '/', href)

def getFilename(domain):
    '''Build the local icon file name for a site's domain.

    "www." is dropped, the last dot-separated label is cut off and the
    result is wrapped as "icon-<name>.ico".  Names starting with "poker."
    are rewritten so that "poker.example.com" becomes
    "icon-examplepoker.ico".
    '''
    # The parameter used to be misspelled "doamin", so the body read the
    # module-level `domain` instead of its own argument.
    filename = domain.replace("www.", "").rsplit('.', 1)[0]
    if filename.startswith("poker."):
        filename = filename.replace("poker.", "").replace(".", "") + "poker"
    return 'icon-' + filename + '.ico'

def getFile(href, filename):
    '''Download href into filename, reporting failures instead of raising.

    Both arguments are echoed to stdout first.  The download is
    best-effort: on failure the error is printed and the function returns
    normally.
    '''
    print(filename)
    print(href)
    try:
        urlretrieve(href, filename)
    except Exception as exc:
        # Exception rather than a bare except, so Ctrl-C (KeyboardInterrupt)
        # and SystemExit still stop the script.
        print('Error getting url "' + href + '" to file "' + filename + '": ' + str(exc))

# Read the list of site URLs, one per line.
with open("room_links.txt") as f:
    content = f.readlines()

# For every site: fetch the page, download each <link rel="...icon..."> it
# declares, and fall back to /favicon.ico when the page declares none.
for site in content:
    site = site.strip()
    if not site:
        # Blank lines are not URLs; urlopen('') would raise ValueError.
        continue

    try:
        # The with-block closes the HTTP response once the page is parsed.
        with urlopen(site) as html:
            soup = BeautifulSoup(html, "html.parser")
    except Exception as exc:
        # One unreachable or broken site must not abort the remaining ones.
        print('Error getting url "' + site + '": ' + str(exc))
        continue
    # print(soup.prettify())

    # `domain` is assigned before getFilename() is called.
    domain = getDomain(site)
    filename = getFilename(domain)
    processed = False

    for link in soup.find_all('link'):
        href = link.get('href')
        # get() returns None when the attribute is missing from the tag.
        rels = link.get('rel') or []
        if not href:
            continue
        # any() downloads once per tag even if several rel tokens match.
        if any('icon' in rel.lower() for rel in rels):
            processed = True
            getFile(getHref(href, domain), filename)
    if not processed:
        print('There are no link rel icon found. Looks lik site using old style favicon defining.')
        getFile('http://' + domain + "/favicon.ico", filename)
	'''
	This program reads a list of URLs from a text file, requests each page, finds its favicons and saves them into the current directory.
	'''
	import urllib
	from urllib.request import urlopen
	from urllib.request import urlretrieve
	from bs4 import BeautifulSoup

	'''had to add this windows specific block to handle this bug in urllib2:
	http://bugs.python.org/issue11220
	'''
	import ssl
	# Install a global opener whose HTTPS handler uses the library's default
	# client context: certificates and host names are verified and the TLS
	# version is negotiated with the server.  The previous context pinned the
	# deprecated TLS 1.0 protocol and left certificate verification off.
	# NOTE: `ssl_context` actually holds the HTTPSHandler; the name is kept so
	# any other code referring to it keeps working.
	ssl_context = urllib.request.HTTPSHandler(context=ssl.create_default_context())
	opener = urllib.request.build_opener(ssl_context)
	urllib.request.install_opener(opener)
	#end of urllib workaround

	def getDomain(site):
	    '''Return the host part of a site URL.

	    Every "http://" and "https://" occurrence is removed, the text before
	    the first "/" is kept, and trailing whitespace (for example a line
	    ending) is stripped.
	    '''
	    return site.replace("http://", "").replace("https://", "").split('/', 1)[0].rstrip()

	def getHref(href, domain):
	    '''Return an absolute URL for an icon reference found on a page.

	    Args:
	        href: value of a link tag's href attribute; may be absolute
	            ("https://..."), protocol-relative ("//host/x.ico"),
	            root-relative ("/x.ico") or relative ("img/x.ico").
	        domain: host name of the site the link was found on.

	    Returns:
	        href resolved against "http://<domain>/".
	    '''
	    # Imported locally so the function does not depend on the module header.
	    from urllib.parse import urljoin

	    # urljoin leaves absolute URLs untouched and resolves every other form
	    # against the site root; plain concatenation produced
	    # "http://host//x.ico" and "http://host///cdn/x.ico".
	    return urljoin('http://' + domain + '/', href)

	def getFilename(domain):
	    '''Build the local icon file name for a site's domain.

	    "www." is dropped, the last dot-separated label is cut off and the
	    result is wrapped as "icon-<name>.ico".  Names starting with "poker."
	    are rewritten so that "poker.example.com" becomes
	    "icon-examplepoker.ico".
	    '''
	    # The parameter used to be misspelled "doamin", so the body read the
	    # module-level `domain` instead of its own argument.
	    filename = domain.replace("www.", "").rsplit('.', 1)[0]
	    if filename.startswith("poker."):
	        filename = filename.replace("poker.", "").replace(".", "") + "poker"
	    return 'icon-' + filename + '.ico'

	def getFile(href, filename):
	    '''Download href into filename, reporting failures instead of raising.

	    Both arguments are echoed to stdout first.  The download is
	    best-effort: on failure the error is printed and the function returns
	    normally.
	    '''
	    print(filename)
	    print(href)
	    try:
	        urlretrieve(href, filename)
	    except Exception as exc:
	        # Exception rather than a bare except, so Ctrl-C (KeyboardInterrupt)
	        # and SystemExit still stop the script.
	        print('Error getting url "' + href + '" to file "' + filename + '": ' + str(exc))

	# Read the list of site URLs, one per line.
	with open("room_links.txt") as f:
	    content = f.readlines()

	# For every site: fetch the page, download each <link rel="...icon..."> it
	# declares, and fall back to /favicon.ico when the page declares none.
	for site in content:
	    site = site.strip()
	    if not site:
	        # Blank lines are not URLs; urlopen('') would raise ValueError.
	        continue

	    try:
	        # The with-block closes the HTTP response once the page is parsed.
	        with urlopen(site) as html:
	            soup = BeautifulSoup(html, "html.parser")
	    except Exception as exc:
	        # One unreachable or broken site must not abort the remaining ones.
	        print('Error getting url "' + site + '": ' + str(exc))
	        continue
	    # print(soup.prettify())

	    # `domain` is assigned before getFilename() is called.
	    domain = getDomain(site)
	    filename = getFilename(domain)
	    processed = False

	    for link in soup.find_all('link'):
	        href = link.get('href')
	        # get() returns None when the attribute is missing from the tag.
	        rels = link.get('rel') or []
	        if not href:
	            continue
	        # any() downloads once per tag even if several rel tokens match.
	        if any('icon' in rel.lower() for rel in rels):
	            processed = True
	            getFile(getHref(href, domain), filename)
	    if not processed:
	        print('There are no link rel icon found. Looks lik site using old style favicon defining.')
	        getFile('http://' + domain + "/favicon.ico", filename)