Skip to content

Instantly share code, notes, and snippets.

@alexzhan
Created November 6, 2010 09:33
Show Gist options
  • Save alexzhan/665309 to your computer and use it in GitHub Desktop.
Save alexzhan/665309 to your computer and use it in GitHub Desktop.
more precise version of web page url lister with some bugs
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Lists the hyperlinks found on a single HTML page.
 *
 * <p>{@code main} downloads one page, collects the {@code href} targets of its anchor
 * tags, prints them to standard error and saves them, one per line, to
 * {@code OUTPUT_PATH}. Adjust {@code PAGE_CHARSET} to the encoding of the page being
 * fetched, and the URL in {@code main} to target a different page.
 *
 * @author alexzhan
 */
public class YangInitial1 {

    /** Character encoding used to decode the downloaded page. */
    private static final String PAGE_CHARSET = "gb2312";

    /** File that receives the collected URLs, one per line. */
    private static final String OUTPUT_PATH = "/home/alex/Desktop/urlList";

    /** Connect and read timeout, so an unresponsive server cannot block the program forever. */
    private static final int TIMEOUT_MILLIS = 10000;

    /**
     * Matches the opening tag of an anchor that carries a quoted {@code href} attribute.
     * Group 1 holds a double-quoted value, group 2 a single-quoted one. The whitespace
     * required after {@code <a} keeps tags such as {@code <abbr>} out, and the whitespace
     * required before {@code href} keeps attributes such as {@code data-href} out.
     */
    private static final Pattern ANCHOR_HREF = Pattern.compile(
            "<a\\s(?:[^>]*?\\s)?href\\s*=\\s*(?:\"([^\"<>]*)\"|'([^'<>]*)')",
            Pattern.CASE_INSENSITIVE);

    /** URLs collected so far; {@code generateURL} appends to it on every call. */
    private static final List<String> urlList = new ArrayList<String>();

    /**
     * Downloads the document at the given URL and returns its text.
     *
     * <p>This is best effort: a malformed URL or an I/O failure is reported on the console
     * and whatever text was read up to that point (possibly none) is returned.
     *
     * @param urlString address of the page to download
     * @return the page text with every line terminated by {@code '\n'}
     */
    private static String getDocumentAt(String urlString) {
        StringBuilder pageText = new StringBuilder();
        BufferedReader reader = null;
        try {
            URLConnection conn = new URL(urlString).openConnection();
            conn.setConnectTimeout(TIMEOUT_MILLIS);
            conn.setReadTimeout(TIMEOUT_MILLIS);
            reader = new BufferedReader(
                    new InputStreamReader(conn.getInputStream(), PAGE_CHARSET));
            String line;
            while ((line = reader.readLine()) != null) {
                pageText.append(line).append('\n');
            }
        } catch (MalformedURLException e) {
            System.out.println("invalid URL: " + urlString);
        } catch (IOException e) {
            System.err.println("failed to read " + urlString + ": " + e);
        } finally {
            // Close on every path so a failed read does not leak the connection.
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException ignored) {
                    // Nothing useful can be done if closing a reader fails.
                }
            }
        }
        return pageText.toString();
    }

    /**
     * Writes the given URLs to {@code OUTPUT_PATH}, one per line, terminated by CRLF.
     *
     * @param list URLs to write
     * @throws IOException if the file cannot be created or written
     */
    private static void writeToFile(List<String> list) throws IOException {
        BufferedWriter out = new BufferedWriter(new FileWriter(OUTPUT_PATH));
        try {
            for (String url : list) {
                out.write(url + "\r\n");
            }
        } finally {
            out.close();
        }
    }

    /**
     * Returns the scheme-and-host prefix of a URL, e.g. {@code http://host} for
     * {@code http://host/dir/page.html}. A URL without a path is returned unchanged.
     */
    private static String siteRoot(String pageUrl) {
        int schemeEnd = pageUrl.indexOf("://");
        int hostStart = (schemeEnd == -1) ? 0 : schemeEnd + 3;
        int hostEnd = pageUrl.indexOf('/', hostStart);
        return (hostEnd == -1) ? pageUrl : pageUrl.substring(0, hostEnd);
    }

    /**
     * Turns an {@code href} value into an absolute URL, or returns {@code null} when the
     * link is not one this tool lists (values shorter than two characters such as
     * {@code "/"} or {@code "#"}, and anything that is neither absolute nor root-relative).
     */
    private static String toAbsolute(String href, String root) {
        if (href.length() < 2) {
            return null;
        }
        if (href.startsWith("http")) {
            return href;
        }
        if (href.startsWith("//")) {
            // Protocol-relative link: reuse the page's scheme, not its host.
            int schemeEnd = root.indexOf("://");
            return (schemeEnd == -1) ? null : root.substring(0, schemeEnd + 1) + href;
        }
        if (href.startsWith("/")) {
            return root + href;
        }
        return null;
    }

    /**
     * Extracts the link targets of all anchor tags in an HTML document.
     *
     * <p>Links starting with {@code http} are returned as written; root-relative
     * ({@code /path}) and protocol-relative ({@code //host/path}) links are made absolute
     * against {@code pageUrl}. All other links are skipped. Anchors without an
     * {@code href} and unterminated anchors are tolerated.
     *
     * <p>Package-private so it can be exercised without network or file access.
     *
     * @param html    the page markup
     * @param pageUrl the address the markup was loaded from
     * @return the absolute URLs in document order, duplicates included
     */
    static List<String> extractUrls(String html, String pageUrl) {
        List<String> found = new ArrayList<String>();
        String root = siteRoot(pageUrl);
        Matcher anchor = ANCHOR_HREF.matcher(html);
        while (anchor.find()) {
            String href = (anchor.group(1) != null) ? anchor.group(1) : anchor.group(2);
            String absolute = toAbsolute(href, root);
            if (absolute != null) {
                found.add(absolute);
            }
        }
        return found;
    }

    /**
     * Collects the links of a page, prints them to standard error and saves the
     * accumulated list to {@code OUTPUT_PATH}.
     *
     * @param string    the page markup
     * @param urlString the address the markup was loaded from
     * @throws IOException if the output file cannot be written
     */
    private static void generateURL(String string, String urlString) throws IOException {
        System.err.println(siteRoot(urlString));
        urlList.addAll(extractUrls(string, urlString));
        System.err.println("the number of url of this page:" + urlList.size());
        for (String url : urlList) {
            System.err.println(url);
        }
        writeToFile(urlList);
    }

    /**
     * Fetches one hard-coded page, lists its links and reports the elapsed time.
     *
     * @param args ignored
     * @throws IOException if the output file cannot be written
     */
    public static void main(String[] args) throws IOException {
        String url = "http://xkc.swjtu.edu.cn/newsList.do?id=000090000004";
        long startedAt = System.nanoTime();
        String page = getDocumentAt(url);
        generateURL(page, url);
        long processtime = (System.nanoTime() - startedAt) / 1000000L;
        System.out.println("Done with time(" + processtime + ")ms");
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment