Created
November 6, 2010 09:33
-
-
Save alexzhan/665309 to your computer and use it in GitHub Desktop.
more precise version of web page url lister with some bugs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import java.io.BufferedReader; | |
import java.io.BufferedWriter; | |
import java.io.FileWriter; | |
import java.io.IOException; | |
import java.io.InputStreamReader; | |
import java.net.MalformedURLException; | |
import java.net.URL; | |
import java.net.URLConnection; | |
import java.util.ArrayList; | |
import java.util.Date; | |
/**
 * Lists the hyperlinks found on one HTML page.
 *
 * <p>The page is downloaded, every {@code <a ... href=...>} opening tag is scanned, and the
 * absolute and root-relative links are written one per line to {@code OUTPUT_PATH}.
 *
 * <p>Two values are site-specific and may need changing: the page encoding
 * ({@code PAGE_CHARSET}, currently gb2312) and the start URL hard-coded in
 * {@link #main(String[])}.
 *
 * @author alexzhan
 */
public class YangInitial1 {

    /** Encoding used to decode the downloaded page; change it for sites that are not gb2312. */
    private static final String PAGE_CHARSET = "gb2312";

    /** File that receives the collected links. */
    private static final String OUTPUT_PATH = "/home/alex/Desktop/urlList";

    /**
     * Matches the opening tag of an anchor and captures its href value.
     * Group 1 is a double-quoted value, group 2 a single-quoted one, group 3 an unquoted one.
     * The look-ahead after {@code <a} keeps tags such as {@code <abbr>} from matching, and
     * {@code [^>]} keeps the search inside a single tag so an anchor without an href is skipped
     * instead of borrowing a value from elsewhere.
     */
    private static final java.util.regex.Pattern ANCHOR_HREF = java.util.regex.Pattern.compile(
            "<a(?=\\s)[^>]*?\\shref\\s*=\\s*(?:\"([^\"<>]*)\"|'([^'<>]*)'|([^\\s\"'<>]+))",
            java.util.regex.Pattern.CASE_INSENSITIVE);

    /** Links collected so far; {@link #generateURL} appends to it on every call. */
    private static ArrayList<String> urlList = new ArrayList<String>();

    /**
     * Downloads a page and returns its text with lines joined by {@code '\n'}.
     *
     * <p>This is best-effort: a malformed URL or an I/O failure is reported on the console and
     * whatever was read up to that point (possibly nothing) is returned.
     *
     * @param urlString address of the page to download
     * @return the decoded page text, never {@code null}
     */
    private static String getDocumentAt(String urlString) {
        StringBuilder htmlText = new StringBuilder();
        BufferedReader reader = null;
        try {
            URL url = new URL(urlString);
            URLConnection conn = url.openConnection();
            reader = new BufferedReader(new InputStreamReader(conn.getInputStream(), PAGE_CHARSET));
            String line;
            while ((line = reader.readLine()) != null) {
                htmlText.append(line).append('\n');
            }
        } catch (MalformedURLException e) {
            System.out.println("invalid URL: " + urlString);
        } catch (IOException e) {
            System.err.println("failed to download " + urlString + ": " + e);
        } finally {
            // Close in finally so the connection is released even when reading fails midway.
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException ignored) {
                    // Nothing useful can be done if closing a read-only stream fails.
                }
            }
        }
        return htmlText.toString();
    }

    /**
     * Writes every entry of {@code list} to {@code OUTPUT_PATH}, one per line.
     *
     * @param list the links to write
     * @throws IOException if the file cannot be created or written
     */
    private static void writeToFile(ArrayList<String> list) throws IOException {
        BufferedWriter writer = new BufferedWriter(new FileWriter(OUTPUT_PATH));
        try {
            for (String entry : list) {
                writer.write(entry);
                // CRLF is written explicitly so the file has the same line endings on every OS.
                writer.write("\r\n");
            }
        } finally {
            writer.close();
        }
    }

    /**
     * Returns the scheme-and-host prefix of a URL, e.g. {@code http://host} for
     * {@code http://host/a/b?q=1}. A URL with no path, query or fragment is returned unchanged.
     */
    private static String siteRoot(String pageUrl) {
        int schemeSep = pageUrl.indexOf("://");
        int hostStart = schemeSep < 0 ? 0 : schemeSep + 3;
        for (int i = hostStart; i < pageUrl.length(); i++) {
            char c = pageUrl.charAt(i);
            if (c == '/' || c == '?' || c == '#') {
                return pageUrl.substring(0, i);
            }
        }
        return pageUrl;
    }

    /** Returns true when {@code href} starts with {@code http://} or {@code https://} (any case). */
    private static boolean isAbsoluteHttp(String href) {
        return href.regionMatches(true, 0, "http://", 0, 7)
                || href.regionMatches(true, 0, "https://", 0, 8);
    }

    /**
     * Extracts the links of all anchor tags in {@code html}.
     *
     * <p>Absolute http(s) links are returned as-is, root-relative links ({@code /path}) are
     * prefixed with the scheme and host of {@code pageUrl}, and protocol-relative links
     * ({@code //host/path}) receive the scheme of {@code pageUrl}. Everything else (a bare
     * {@code "/"}, document-relative paths, {@code javascript:}, {@code mailto:}, ...) is skipped.
     *
     * <p>Package-private and free of I/O so it can be tested without network or disk access.
     *
     * @param html the page text; {@code null} or empty yields an empty list
     * @param pageUrl the URL the page was loaded from
     * @return the links in document order, duplicates included
     */
    static ArrayList<String> extractUrls(String html, String pageUrl) {
        ArrayList<String> found = new ArrayList<String>();
        if (html == null || html.length() == 0) {
            return found;
        }
        String root = siteRoot(pageUrl);
        int schemeSep = pageUrl.indexOf("://");
        String scheme = schemeSep < 0 ? "http" : pageUrl.substring(0, schemeSep);

        java.util.regex.Matcher matcher = ANCHOR_HREF.matcher(html);
        while (matcher.find()) {
            String raw = matcher.group(1) != null ? matcher.group(1)
                    : (matcher.group(2) != null ? matcher.group(2) : matcher.group(3));
            String href = raw.trim();
            if (href.startsWith("//")) {
                // Protocol-relative: must be tested before the single-slash case below.
                if (href.length() > 2) {
                    found.add(scheme + ":" + href);
                }
            } else if (isAbsoluteHttp(href)) {
                found.add(href);
            } else if (href.startsWith("/") && !href.equals("/")) {
                found.add(root + href);
            }
        }
        return found;
    }

    /**
     * Collects the links of one page into {@code urlList}, prints a summary to stderr and
     * writes the accumulated list to disk.
     *
     * @param string the HTML text of the page
     * @param urlString the URL the page was loaded from
     * @throws IOException if the output file cannot be written
     */
    private static void generateURL(String string, String urlString) throws IOException {
        System.err.println(siteRoot(urlString));
        urlList.addAll(extractUrls(string, urlString));

        System.err.println("the number of url of this page:" + urlList.size());
        for (String collected : urlList) {
            System.err.println(collected);
        }
        writeToFile(urlList);
    }

    /**
     * Downloads the hard-coded page, lists its links and reports the elapsed time.
     *
     * @param args ignored
     * @throws IOException if the output file cannot be written
     */
    public static void main(String[] args) throws IOException {
        String url = "http://xkc.swjtu.edu.cn/newsList.do?id=000090000004";
        long startedAt = System.currentTimeMillis();
        String html = getDocumentAt(url);
        generateURL(html, url);
        long processtime = System.currentTimeMillis() - startedAt;
        System.out.println("Done with time(" + processtime + ")ms");
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment