Created
July 15, 2012 19:53
-
-
Save daramcq/3118367 to your computer and use it in GitHub Desktop.
Program to check for broken links on a webpage
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import java.io.*;
import java.net.*;
import java.util.*;

import org.htmlparser.beans.LinkBean;
import org.htmlparser.http.HttpHeader;
public class LinkCheck | |
{ | |
public static URL[] getLinks(String url) | |
{ | |
// Use LinkBean class from HTMLParser to get array of links from a page | |
LinkBean lb = new LinkBean (); | |
lb.setURL (url); | |
URL[] urls = lb.getLinks (); | |
return urls; | |
} | |
public static boolean isImage(URL url) | |
{ | |
String urlString = url.toString(); | |
// Get the last three chars of url | |
int last = urlString.length()-1; | |
int extStart = urlString.length()-3; | |
String ext = urlString.substring(extStart); | |
// If these match standard image formats, it's an image | |
if (ext.equalsIgnoreCase("jpg")||ext.equalsIgnoreCase("png")||ext.equalsIgnoreCase("gif")||ext.equalsIgnoreCase("tif")) | |
return true; | |
// Otherwise, it's not | |
else | |
return false; | |
} | |
public static boolean isNotGoogle(URL url) | |
{ | |
String urlString = url.toString(); | |
if (urlString.contains("google")) | |
return false; | |
else | |
return true; | |
} | |
public static void main (String[] args) | |
{ | |
try | |
{ | |
// Use getLinks to return array of links from website specified at command-line | |
URL[] urls = getLinks(args[0]); | |
// Create an array of same size to store Response Codes | |
int[] codes = new int[urls.length]; | |
// Create ArrayLists to store info from Broken Links | |
ArrayList <URL> brokenLinks = new ArrayList <URL> (); | |
ArrayList <Integer> brokenCodes = new ArrayList <Integer> (); | |
ArrayList <String> brokenReasons = new ArrayList <String> (); | |
// Iterate through the Urls | |
for (int i = 0; i < urls.length; i++) | |
{ | |
try | |
{ | |
// Connect to the URL and add Response Code to Codes Array | |
HttpURLConnection connect = (HttpURLConnection) urls[i].openConnection(); | |
codes[i] = connect.getResponseCode(); | |
// If the link is broken & not a Google link, add its info into ArrayLists | |
if (connect.getResponseCode() != 200 && isNotGoogle(urls[i])) | |
{ | |
brokenLinks.add(urls[i]); | |
brokenCodes.add(codes[i]); | |
// If is an image, this is the Reason | |
if (isImage(urls[i])) | |
brokenReasons.add("Is Image - Access not Allowed"); | |
// If code is 404, Reason is File not Found | |
else if (codes[i] == 404) | |
brokenReasons.add("File not Found"); | |
// If code is 403, Reason is Access Forbidden | |
else if (codes[i] == 403) | |
brokenReasons.add("Access Forbidden by server"); | |
// If code is 302, Reason is file moved | |
else if (codes[i] == 302) | |
brokenReasons.add("File temporarily moved"); | |
//Otherwise reason is unknown | |
else | |
brokenReasons.add("Unknown"); | |
} | |
connect.disconnect(); | |
} | |
// If the connection fails catch exception | |
catch (Exception e) | |
{ | |
} | |
} | |
// Create String to hold output HTML | |
String resultHTML = "<html><title>Broken Links - "+args[0]+"</title><br>" | |
+ "<h2> Broken Links for "+args[0]+ "</h2> <br><table border=1>" | |
+ "<tr><td><b>Link</b></td><td><b>Response Code</b></td><td><b>Reason for Failure</b></td></tr>"; | |
// Iterate through the Array Lists to get the data about Broken Links | |
for (int i= 0; i<brokenLinks.size(); i++) | |
{ | |
resultHTML+= "<tr><td><a href=\"" +brokenLinks.get(i)+ "\">"+brokenLinks.get(i)+"</a></td><td>" | |
+brokenCodes.get(i)+ "</td><td>" +brokenReasons.get(i)+ "</td> </tr>"; | |
} | |
// Close HTML and create custom-named file for output | |
resultHTML += "</html>"; | |
String filename = args[0].substring(7); | |
// Replace any forward slashes with hyphens | |
filename = filename.replace("/","-"); | |
filename = "BrokenLinks_" + filename; | |
FileWriter fstream = new FileWriter(filename); | |
BufferedWriter out = new BufferedWriter(fstream); | |
out.write(resultHTML); | |
out.close(); | |
} | |
catch (Exception e) | |
{ | |
String s = e.getMessage(); | |
if (s != null) | |
{ | |
System.out.println(s); | |
} | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment