Created
January 2, 2014 19:24
-
-
Save djinn/8224961 to your computer and use it in GitHub Desktop.
This gist was created in response to a request on a mailing list for help with debugging backlinks.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import logging
import re
import sys
from collections import namedtuple, Mapping
from contextlib import closing
from optparse import OptionParser
from urllib2 import urlopen, Request

from bs4 import BeautifulSoup as Soup
# Module metadata.
__author__ = "Supreet Sethi <supreet.sethi@gmail.com>"
__copyright__ = "Copyright (C) 2014 Supreet Sethi"
__license__ = "Public Domain"
__version__ = "1.0"
__doc__ = """ This is a backlink checker. It allows mass checking of a backlinked URL"""

# Module-level logger shared by the functions in this file. No handler is
# attached here; main() adds a file handler when run as a script.
log = logging.getLogger('backlink-checker')
def namedtuple_with_defaults(typename, field_names, default_values=()):
    """Build a namedtuple class in which every field has a default value.

    Args:
        typename: Name of the generated namedtuple class.
        field_names: Field names, in any form ``namedtuple`` accepts.
        default_values: Either a sequence of defaults applied positionally
            to the leading fields, or a mapping of field name to default.
            Fields that receive no explicit default fall back to ``None``.

    Returns:
        The new namedtuple class.

    Note:
        The default objects are stored once on the class constructor, so a
        mutable default (for example a list) is the same object in every
        instance that does not override it.
    """
    T = namedtuple(typename, field_names)
    # First make every field optional (defaulting to None) so that a
    # prototype can be built from a partial set of default values.
    T.__new__.__defaults__ = (None,) * len(T._fields)
    if isinstance(default_values, Mapping):
        prototype = T(**default_values)
    else:
        prototype = T(*default_values)
    # The prototype now carries the complete, ordered list of defaults.
    T.__new__.__defaults__ = tuple(prototype)
    return T
# Result record for one checked page: the page URL, the backlink that was
# searched for, the matching <a> and <img> entries, and an error flag.
# All fields are optional; the link lists default to empty and err to False.
backlink_data = namedtuple_with_defaults(
    typename='Backlink',
    field_names=['url', 'backlink', 'a_links', 'img_links', 'err'],
    default_values=[None, None, [], [], False],
)
def check_url_backlink_exist(url, backlink, a_flag=True, img_flag=False):
    """Fetch ``url`` and collect the tags on it that link to ``backlink``.

    Args:
        url: Address of the page to download and scan.
        backlink: URL prefix to search for. It is matched literally against
            the beginning of each ``href`` / ``src`` attribute value.
        a_flag: When true, collect matching ``<a href=...>`` tags.
        img_flag: When true, collect matching ``<img src=...>`` tags.

    Returns:
        A ``backlink_data`` record. ``a_links`` holds ``(text, href)`` pairs
        and ``img_links`` holds ``(alt, src)`` pairs (``alt`` is an empty
        string when the image has none). ``err`` is True when the page could
        not be fetched, did not answer with status 200, or was not served
        as ``text/html``.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:22.0) djinn Gecko/20100101 Firefox/22.0'}
    try:
        dtfd = urlopen(Request(url, data=None, headers=headers))
    except Exception as e:
        # Nothing was opened on this path, so there is no response to close.
        log.warning("URL: %(url)s could not be fetched: error -> %(error)s",
                    {'url': url, 'error': str(e)})
        return backlink_data(url=url, backlink=backlink, err=True)

    # closing() releases the connection on every exit path below.
    with closing(dtfd):
        # A response without a Content-type header is treated as non-HTML.
        content_type = dtfd.info().get('Content-type') or ''
        if dtfd.getcode() != 200 or not content_type.startswith('text/html'):
            return backlink_data(url=url, backlink=backlink, err=True)
        sp = Soup(dtfd.read())

    # Escape the backlink so that URL characters such as '.' and '?' are
    # matched literally instead of being interpreted as regex syntax.
    prefix = re.compile('^' + re.escape(backlink))
    a_list, img_list = [], []
    if a_flag:
        a_list = [(a.text, a['href'])
                  for a in sp.find_all('a', href=prefix)]
    if img_flag:
        # 'alt' is optional on <img>, so fall back to an empty string.
        img_list = [(img.get('alt', ''), img['src'])
                    for img in sp.find_all('img', src=prefix)]
    return backlink_data(url=url, backlink=backlink,
                         a_links=a_list, img_links=img_list)
def main():
    """Command-line entry point.

    Reads a file containing one URL per line, checks each page for links
    to the backlink given with ``-b`` and writes every match to the log
    file given with ``-l`` (``/tmp/backlink.log`` by default).

    Exits through ``parser.error`` when the URL file or backlink option is
    missing, and with status 1 when the URL file cannot be opened.
    """
    usage = "usage: %prog [options] -b http://<url> --url urls.txt -l backlinks.log"
    parser = OptionParser(usage=usage)
    parser.add_option("-u", "--url", dest="url_filename",
                      help="List of URLS to be checked",
                      metavar="FILE")
    parser.add_option("-l", "--log", dest="log_file",
                      help="Log file where the data will be written. By default writes to /tmp/backlink.log",
                      metavar="FILE", default="/tmp/backlink.log")
    parser.add_option("-b", "--backlink", dest="backlink",
                      help="backlink to be checked",
                      type="str", metavar="URL")
    # parse_args() reads sys.argv[1:] by default, which keeps the program
    # name out of the positional arguments.
    options, _args = parser.parse_args()
    if not options.url_filename:
        parser.error("URL file is needed")
    if not options.backlink:
        parser.error("Backlink is needed")

    hdlr = logging.FileHandler(options.log_file)
    hdlr.setFormatter(logging.Formatter('%(asctime)s %(levelname)s %(message)s'))
    log.addHandler(hdlr)
    log.setLevel(logging.DEBUG)

    try:
        urlfd = open(options.url_filename)
    except (IOError, OSError) as e:
        log.error("URL file is not available %(error)s", {'error': e})
        sys.exit(1)

    with urlfd:
        for row in urlfd:
            row = row.strip()
            if not row:
                # Skip blank lines instead of issuing a pointless request.
                continue
            # There is no easy way to validate url but to check by sending request
            dt = check_url_backlink_exist(row, options.backlink)
            for text, link in dt.a_links:
                log.info("URL: %s with backlink: %s: Found a tag text: %s with url: %s",
                         row, options.backlink, text, link)
            for text, link in dt.img_links:
                log.info("URL: %s with backlink: %s: Found img tag text: %s with url: %s",
                         row, options.backlink, text, link)
# Run the checker only when this file is executed as a script.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Thank you Supreet for writing mature code for me :)