Created
July 21, 2012 07:31
-
-
Save elundmark/3154973 to your computer and use it in GitHub Desktop.
Extract URLs from a webpage as a list with Python
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python -tt | |
# from: http://www.techgaun.com/2012/03/extracting-all-hyperlinks-from-webpages.html | |
import re
import sys
import urllib2

from urlparse import urljoin
def main():
    ''' Usage: url_extractor.py "http://example.com/"
    Fetch the page at the given URL, extract every <a href=...> target,
    resolve relative links ('abc', './abc', '/abc', '../abc') against
    the page URL, and print the resulting links one per line.
    Returns None (output goes to stdout); prints a notice and falls
    back to a default URL when no argument is supplied.
    '''
    if len(sys.argv) != 2:
        print('No URL specified. Taking default URL for link extraction')
        url = 'http://www.example.com/'
    else:
        url = str(sys.argv[1])
    # [^>]* (not .*) keeps the match inside a single <a ...> tag;
    # a greedy .* could swallow a '>' and skip anchors that share a line.
    links_regex = re.compile(r'<a\s+[^>]*href=[\'"]?([^\'" >]+)', re.IGNORECASE)
    try:
        opener = urllib2.build_opener()
        # Some sites refuse Python's default User-Agent, so present a browser one.
        opener.addheaders = [('User-agent', 'Mozilla/5.0 (X11; Linux i686) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.43 Safari/536.11')]
        response = opener.open(url)
        html = response.read()
        # urljoin resolves './', '/', '../' and protocol-relative '//' links
        # per RFC 3986, unlike the former regex substitution, which mangled
        # '//host/...' links and left '../' untranslated.
        fixed_links = [urljoin(url, link) for link in links_regex.findall(html)]
        print('\n'.join(fixed_links))
    except urllib2.URLError:
        print('Can\'t Connect to the website')
# Script entry point: run only when executed directly, not on import.
# main() returns None, so sys.exit() yields exit status 0 on success.
if __name__ == '__main__':
    sys.exit(main())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment