Created
April 8, 2014 19:04
-
-
Save xCASx/10172270 to your computer and use it in GitHub Desktop.
Gets a list of URLs from a text file, requests each page, finds its favicon and saves it into the current directory
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
'''
This program reads a list of URLs from a text file, requests each page,
finds its favicon and saves it into the current directory.
'''
import urllib | |
from urllib.request import urlopen | |
from urllib.request import urlretrieve | |
from bs4 import BeautifulSoup | |
# Install a process-wide HTTPS opener that carries an explicit SSL context.
# This block was originally added as a Windows-specific workaround for
# http://bugs.python.org/issue11220 and pinned the connection to TLS 1.0.
# TLS 1.0 is deprecated and that context verified neither certificates nor
# hostnames, so the default context is used instead: it negotiates the best
# protocol version both sides support and validates the server certificate.
import ssl

# NOTE: despite its name, ``ssl_context`` is the HTTPS *handler* wrapping the
# context; the name is kept so existing references keep working.
ssl_context = urllib.request.HTTPSHandler(context=ssl.create_default_context())
opener = urllib.request.build_opener(ssl_context)
# From here on every urlopen()/urlretrieve() call goes through this opener.
urllib.request.install_opener(opener)
# end of urllib workaround
def getDomain(site):
    """Return the host portion of *site*.

    Every "http://" and "https://" occurrence is removed, the remainder is
    cut at the first "/", and trailing whitespace (such as the newline left
    by ``readlines()``) is stripped from the result.
    """
    without_scheme = site.replace("http://", "")
    without_scheme = without_scheme.replace("https://", "")
    host, _, _ = without_scheme.partition('/')
    return host.rstrip()
def getHref(href, domain):
    """Return an absolute URL for the icon reference *href*.

    *href* is resolved against ``http://<domain>/``:

    * absolute URLs (``http://...``, ``https://...``) and other
      scheme-qualified references are returned unchanged;
    * protocol-relative references (``//cdn.example.com/x.ico``) get the
      ``http`` scheme;
    * root-relative (``/favicon.ico``) and plain relative (``favicon.ico``)
      references are placed under the domain without a doubled slash.
    """
    # Imported locally so the function does not depend on the file's
    # top-level import block.
    from urllib.parse import urljoin

    return urljoin('http://' + domain + '/', href)
def getFilename(doamin):
    """Build the local icon file name for a domain.

    Every "www." occurrence is removed and the last dot-separated label (the
    TLD) is dropped. Names that then start with "poker." have that prefix
    removed, their remaining dots deleted, and "poker" appended instead.
    The result is wrapped as ``icon-<name>.ico``.

    The parameter keeps its original (misspelled) name so the signature is
    unchanged. The body now reads the parameter rather than a global
    ``domain`` variable, which it previously relied on by accident.
    """
    filename = doamin.replace("www.", "").rsplit('.', 1)[0]
    if filename.startswith("poker."):
        filename = filename.replace("poker.", "").replace(".", "") + "poker"
    return 'icon-' + filename + '.ico'
def getFile(href, filename):
    """Download *href* into the local file *filename* (best effort).

    The target file name and the URL are printed first. A failed download is
    reported on stdout together with its reason and does not raise, so the
    caller can carry on with the next site.
    """
    print(filename)
    print(href)
    try:
        urlretrieve(href, filename)
    except (OSError, ValueError) as exc:
        # OSError covers URLError/HTTPError, socket and local file errors;
        # ValueError is raised for malformed or scheme-less URLs.
        # KeyboardInterrupt and SystemExit are deliberately not caught.
        print('Error getting url "' + href + '" to file "' + filename + '": ' + str(exc))
# Main script: read the page URLs (one per line) and fetch a favicon for each.
with open("room_links.txt") as f:
    content = f.readlines()

for site in content:
    # readlines() keeps the trailing newline; strip it before using the line
    # as a URL, and skip blank lines entirely.
    site = site.strip()
    if not site:
        continue

    try:
        # The response is closed as soon as the document has been parsed.
        with urlopen(site) as html:
            # Name the parser explicitly so the result does not depend on
            # which optional parsers happen to be installed.
            soup = BeautifulSoup(html, "html.parser")
    except (OSError, ValueError) as exc:
        # One unreachable or malformed URL must not abort the whole run.
        print('Error opening url "' + site + '": ' + str(exc))
        continue

    processed = False
    for link in soup.find_all('link'):
        href = link.get('href')
        # A <link> may have no rel attribute at all, in which case get()
        # returns None; treat that as "no rel values".
        rels = link.get('rel') or []
        # Only <link> tags that have an href and at least one rel value
        # containing "icon" are of interest; each is downloaded once.
        if not href or not any('icon' in rel.lower() for rel in rels):
            continue
        processed = True
        domain = getDomain(site)
        href = getHref(href, domain)
        filename = getFilename(domain)
        getFile(href, filename)

    if not processed:
        # No usable <link rel="...icon..."> found: fall back to the
        # conventional /favicon.ico location at the site root.
        print('There are no link rel icon found. Looks lik site using old style favicon defining.')
        domain = getDomain(site)
        href = 'http://' + domain + "/favicon.ico"
        filename = getFilename(domain)
        getFile(href, filename)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment