Skip to content

Instantly share code, notes, and snippets.

@daramcq
Created July 15, 2012 19:53
Show Gist options
  • Save daramcq/3118367 to your computer and use it in GitHub Desktop.
Save daramcq/3118367 to your computer and use it in GitHub Desktop.
Program to check for broken links on a webpage
import java.net.*;
import org.htmlparser.beans.LinkBean;
import org.htmlparser.http.HttpHeader;
import java.util.*;
import java.io.*;
/**
 * Command-line tool that probes every link found on a web page and writes an
 * HTML report of the links that did not answer with HTTP 200.
 *
 * <p>Usage: {@code java LinkCheck <page-url>}. The report is written to the
 * working directory as {@code BrokenLinks_<page url without scheme>}, with
 * characters that are awkward in file names replaced by hyphens.
 */
public class LinkCheck
{
    /** Connect and read timeout, in milliseconds, applied to every link probe. */
    private static final int TIMEOUT_MILLIS = 10000;

    /** Code recorded for a link when no HTTP response was obtained at all. */
    private static final int NO_RESPONSE = -1;

    /** File extensions (without the dot) that {@link #isImage(URL)} treats as images. */
    private static final String[] IMAGE_EXTENSIONS = {"jpg", "jpeg", "png", "gif", "tif", "tiff"};

    /** One row of the report: the failing link, its response code and a human-readable reason. */
    private static final class BrokenLink
    {
        final URL url;
        final int code;
        final String reason;

        BrokenLink(URL url, int code, String reason)
        {
            this.url = url;
            this.code = code;
            this.reason = reason;
        }
    }

    /**
     * Extracts the links of a page using HTMLParser's {@code LinkBean}.
     *
     * @param url address of the page to scan
     * @return whatever {@code LinkBean.getLinks()} returns for that page.
     *         NOTE(review): LinkBean appears to return {@code null} when the
     *         page cannot be fetched or parsed - confirm against the
     *         HTMLParser documentation; {@link #main(String[])} guards for it.
     */
    public static URL[] getLinks(String url)
    {
        LinkBean lb = new LinkBean();
        lb.setURL(url);
        return lb.getLinks();
    }

    /**
     * Tells whether a URL points at an image, judged by the file extension of
     * its path. The query string and fragment are ignored, so
     * {@code photo.jpg?size=2} still counts as an image, and a real
     * {@code .ext} suffix on the last path segment is required.
     *
     * @param url link to classify
     * @return {@code true} if the path ends in one of the known image extensions
     *         (case-insensitive), {@code false} otherwise
     */
    public static boolean isImage(URL url)
    {
        String path = url.getPath();
        if (path == null)
        {
            return false;
        }
        int dot = path.lastIndexOf('.');
        // The dot must belong to the last path segment and be followed by something.
        if (dot < 0 || dot < path.lastIndexOf('/') || dot == path.length() - 1)
        {
            return false;
        }
        String ext = path.substring(dot + 1);
        for (String known : IMAGE_EXTENSIONS)
        {
            if (known.equalsIgnoreCase(ext))
            {
                return true;
            }
        }
        return false;
    }

    /**
     * Tells whether a URL is served from a host other than a Google one.
     * Only the host name is inspected (case-insensitively), so a page on
     * another site whose path or query merely mentions "google" is still
     * checked.
     *
     * @param url link to classify
     * @return {@code false} if the host name contains "google", {@code true} otherwise
     */
    public static boolean isNotGoogle(URL url)
    {
        String host = url.getHost();
        if (host == null)
        {
            return true;
        }
        return !host.toLowerCase(Locale.ENGLISH).contains("google");
    }

    /**
     * Entry point: scans the page given as the first argument and writes the
     * broken-link report. Problems are reported on standard error.
     *
     * @param args {@code args[0]} is the URL of the page to check
     */
    public static void main(String[] args)
    {
        if (args == null || args.length == 0 || args[0].trim().length() == 0)
        {
            System.err.println("Usage: java LinkCheck <page-url>");
            return;
        }
        String pageUrl = args[0];
        try
        {
            URL[] urls = getLinks(pageUrl);
            if (urls == null)
            {
                System.err.println("Could not read any links from " + pageUrl);
                return;
            }
            List<BrokenLink> broken = new ArrayList<BrokenLink>();
            for (URL link : urls)
            {
                BrokenLink result = checkLink(link);
                if (result != null)
                {
                    broken.add(result);
                }
            }
            writeReport(reportFileName(pageUrl), buildReport(pageUrl, broken));
        }
        // Top-level boundary: report anything unexpected instead of dying with a stack trace.
        catch (Exception e)
        {
            System.err.println("Link check failed: " + e);
        }
    }

    /**
     * Probes a single link.
     *
     * @return a report row if the link is considered broken, or {@code null}
     *         if it answered 200, is a Google link, or is not an HTTP(S) link
     */
    private static BrokenLink checkLink(URL link)
    {
        // Google links are never reported, so do not spend a request on them.
        if (link == null || !isNotGoogle(link))
        {
            return null;
        }
        HttpURLConnection connection = null;
        try
        {
            URLConnection raw = link.openConnection();
            if (!(raw instanceof HttpURLConnection))
            {
                // e.g. mailto: or ftp: links - there is no HTTP status to inspect.
                return null;
            }
            connection = (HttpURLConnection) raw;
            // Without timeouts a single unresponsive server would stall the whole run.
            connection.setConnectTimeout(TIMEOUT_MILLIS);
            connection.setReadTimeout(TIMEOUT_MILLIS);
            int code = connection.getResponseCode();
            if (code == HttpURLConnection.HTTP_OK)
            {
                return null;
            }
            return new BrokenLink(link, code, describeFailure(link, code));
        }
        // A failure on one link must not abort the remaining checks; a link
        // that cannot be reached at all is reported as broken.
        catch (Exception e)
        {
            String detail = e.getMessage();
            String reason = "Connection failed: " + e.getClass().getSimpleName()
                    + (detail != null ? " - " + detail : "");
            return new BrokenLink(link, NO_RESPONSE, reason);
        }
        finally
        {
            if (connection != null)
            {
                connection.disconnect();
            }
        }
    }

    /** Maps a failing link and its response code to the reason text shown in the report. */
    private static String describeFailure(URL link, int code)
    {
        // The image reason deliberately takes precedence over the status code.
        if (isImage(link))
        {
            return "Is Image - Access not Allowed";
        }
        switch (code)
        {
            case HttpURLConnection.HTTP_NOT_FOUND:
                return "File not Found";
            case HttpURLConnection.HTTP_FORBIDDEN:
                return "Access Forbidden by server";
            case HttpURLConnection.HTTP_MOVED_TEMP:
                return "File temporarily moved";
            default:
                return "Unknown";
        }
    }

    /** Renders the report page; every value taken from the scanned page is HTML-escaped. */
    private static String buildReport(String pageUrl, List<BrokenLink> broken)
    {
        String safePage = escapeHtml(pageUrl);
        StringBuilder html = new StringBuilder();
        html.append("<html><head>")
            .append("<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\">")
            .append("<title>Broken Links - ").append(safePage).append("</title></head><body>")
            .append("<h2> Broken Links for ").append(safePage).append("</h2> <br><table border=1>")
            .append("<tr><td><b>Link</b></td><td><b>Response Code</b></td>")
            .append("<td><b>Reason for Failure</b></td></tr>");
        for (BrokenLink link : broken)
        {
            String safeUrl = escapeHtml(link.url.toString());
            String codeText = link.code < 0 ? "N/A" : String.valueOf(link.code);
            html.append("<tr><td><a href=\"").append(safeUrl).append("\">").append(safeUrl)
                .append("</a></td><td>").append(codeText)
                .append("</td><td>").append(escapeHtml(link.reason)).append("</td> </tr>");
        }
        html.append("</table></body></html>");
        return html.toString();
    }

    /** Escapes the characters that are significant in HTML text and attribute values. */
    private static String escapeHtml(String text)
    {
        StringBuilder escaped = new StringBuilder(text.length() + 16);
        for (int i = 0; i < text.length(); i++)
        {
            char c = text.charAt(i);
            switch (c)
            {
                case '&':
                    escaped.append("&amp;");
                    break;
                case '<':
                    escaped.append("&lt;");
                    break;
                case '>':
                    escaped.append("&gt;");
                    break;
                case '"':
                    escaped.append("&quot;");
                    break;
                case '\'':
                    escaped.append("&#39;");
                    break;
                default:
                    escaped.append(c);
            }
        }
        return escaped.toString();
    }

    /**
     * Derives the report file name from the page URL: the scheme (whatever it
     * is, not just {@code http://}) is dropped and characters that are
     * separators or illegal in common file systems become hyphens.
     */
    private static String reportFileName(String pageUrl)
    {
        String name = pageUrl;
        int schemeEnd = name.indexOf("://");
        if (schemeEnd >= 0)
        {
            name = name.substring(schemeEnd + 3);
        }
        name = name.replaceAll("[\\\\/:*?\"<>|]", "-");
        return "BrokenLinks_" + name;
    }

    /** Writes the report as UTF-8, closing the file even if the write fails. */
    private static void writeReport(String filename, String html) throws IOException
    {
        Writer out = new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(filename), "UTF-8"));
        try
        {
            out.write(html);
        }
        finally
        {
            out.close();
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment