Skip to content

Instantly share code, notes, and snippets.

@xCASx
Created April 8, 2014 19:04
Show Gist options
  • Save xCASx/10172270 to your computer and use it in GitHub Desktop.
Save xCASx/10172270 to your computer and use it in GitHub Desktop.
Gets a list of URLs from a text file, requests the pages, finds their favicons and saves them into the current directory
'''
This program reads a list of URLs from a text file, requests each page, finds its favicon and saves it into the current directory.
'''
import urllib
from urllib.request import urlopen
from urllib.request import urlretrieve
from bs4 import BeautifulSoup
# Originally added as a Windows-specific block to work around this urllib bug:
# http://bugs.python.org/issue11220
# A custom HTTPS handler is installed globally, so every later
# urlopen()/urlretrieve() call in this script goes through it.
import ssl

# Use the ssl module's default client context instead of pinning the
# deprecated PROTOCOL_TLSv1: the protocol version is then negotiated with the
# server rather than fixed at TLS 1.0. Note that the default context also
# verifies server certificates and host names.
ssl_context = urllib.request.HTTPSHandler(context=ssl.create_default_context())
opener = urllib.request.build_opener(ssl_context)
urllib.request.install_opener(opener)
# end of urllib workaround
def getDomain(site):
    '''Return the host portion of the URL string *site*.

    Every occurrence of "http://" and "https://" is removed, the text is cut
    at the first remaining "/", and trailing whitespace (such as the newline
    left by readlines()) is stripped. Leading whitespace is kept.
    '''
    remainder = site
    for scheme in ("http://", "https://"):
        remainder = remainder.replace(scheme, "")
    # Everything before the first slash is the host; with no slash at all the
    # whole remainder is returned.
    host, _, _ = remainder.partition('/')
    return host.rstrip()
def getHref(href, domain):
    '''Return an absolute URL for the icon reference *href* found on *domain*.

    href   -- value of a <link> tag's href attribute; may be absolute,
              protocol-relative ("//host/x.ico"), root-relative ("/x.ico"),
              relative ("img/x.ico"), empty or None.
    domain -- bare host name of the page, without scheme or path.

    Absolute http(s) URLs are returned unchanged (apart from surrounding
    whitespace). Everything else is resolved against "http://<domain>/".
    An empty or missing href falls back to "favicon.ico" at the site root.
    '''
    from urllib.parse import urljoin

    href = (href or '').strip()
    # Require a real scheme prefix: a relative path such as
    # "httpdocs/icon.ico" also starts with "http" but is not absolute.
    if href.lower().startswith(('http://', 'https://')):
        return href
    if not href:
        href = 'favicon.ico'
    # urljoin handles leading "/" and "//" correctly, where plain string
    # concatenation would produce "http://domain//..." URLs.
    return urljoin('http://' + domain + '/', href)
def getFilename(domain):
    '''Build the local icon file name for *domain*.

    "www." is removed and the last dot-separated label (the TLD) is dropped.
    Names that then start with "poker." have that prefix removed, their
    remaining dots deleted and "poker" appended, e.g. "poker.bwin.com"
    becomes "icon-bwinpoker.ico". The result is always "icon-<name>.ico".
    '''
    # The parameter used to be misspelled, so the body silently read a
    # module-level variable named "domain" instead of its own argument.
    filename = domain.replace("www.", "").rsplit('.', 1)[0]
    if filename.startswith("poker."):
        filename = filename.replace("poker.", "").replace(".", "") + "poker"
    return 'icon-' + filename + '.ico'
def getFile(href, filename):
    '''Download the resource at *href* into the local file *filename*.

    Both arguments are printed first as progress output. The download is
    best-effort: a failure is reported on stdout, together with its reason,
    and the function returns normally so the caller can carry on with the
    next site. Returns None.
    '''
    print(filename)
    print(href)
    try:
        urlretrieve(href, filename)
    except Exception as err:
        # Catch Exception rather than using a bare except, so Ctrl-C
        # (KeyboardInterrupt) and SystemExit still stop the script.
        print('Error getting url "' + href + '" to file "' + filename + '": ' + str(err))
# Read the pages to scan, one URL per line.
with open("room_links.txt") as f:
    content = f.readlines()

for site in content:
    # readlines() keeps the trailing newline; drop it and skip blank lines.
    site = site.strip()
    if not site:
        continue

    # One unreachable or malformed page must not abort the whole batch.
    try:
        with urlopen(site) as html:
            # Name the parser explicitly so the result does not depend on
            # which optional parsers happen to be installed.
            soup = BeautifulSoup(html, "html.parser")
    except Exception as err:
        print('Error getting page "' + site + '": ' + str(err))
        continue

    # Both values depend only on the site, so compute them once per page.
    # "domain" stays a module-level name assigned before getFilename() runs.
    domain = getDomain(site)
    filename = getFilename(domain)

    processed = False
    for link in soup.find_all('link'):
        href = link.get('href')
        # "rel" is missing on some <link> tags and may be a plain string
        # rather than a list, so normalise it to a list of tokens.
        rels = link.get('rel') or []
        if isinstance(rels, str):
            rels = rels.split()
        # Skip links without a target or without an icon-like rel token;
        # any() also avoids downloading the same link once per matching token.
        if not href or not any('icon' in rel.lower() for rel in rels):
            continue
        processed = True
        href = getHref(href, domain)
        getFile(href, filename)

    if not processed:
        # No usable <link rel="...icon..."> tag: try the conventional
        # /favicon.ico location at the site root.
        print('There are no link rel icon found. Looks lik site using old style favicon defining.')
        href = 'http://' + domain + "/favicon.ico"
        getFile(href, filename)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment