Created
April 26, 2011 01:47
-
-
Save hkulekci/941655 to your computer and use it in GitHub Desktop.
Get anchors from a web page source
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// ---- Main categories: fetch the top-level page and list its accepted anchors ----
WebPageRequest webpage1 = new WebPageRequest("http://www.dmoz.org/World/T%C3%BCrk%C3%A7e//");
webpage1.getwebpage();            // download the page source
webpage1.setAnchorList("/World"); // collect anchors whose href starts with "/World"
webpage1.setAcceptableLink();     // filter them down to the accepted ones
content = "Main Six Category\n\n"
        + webpage.getAnchorListStr(webpage1.getAcceptableAcnhorArray());

// ---- Sub-categories: repeat the same steps for each accepted main-category link ----
int resultNum = WebPageRequest.mainSixCategory.length;
WebPageRequest[] webpage2 = new WebPageRequest[resultNum];
for (int i = 0; i < resultNum; i++) {
    String subLink = webpage1.getAcceptableAcnhorLink(i, 0);
    if (subLink == null) {
        // No accepted anchor in this slot: report the index and move on.
        System.out.println(i + "\n");
        continue;
    }

    WebPageRequest subPage = new WebPageRequest("http://www.dmoz.org" + subLink, true);
    webpage2[i] = subPage;
    subPage.getwebpage();
    subPage.setAnchorList("/World");
    subPage.setAcceptableLink();

    String heading = "Sub Category[" + webpage1.getAcceptableAcnhorLink(i, 1) + "]\n\n";
    content = content + heading
            + webpage.getAnchorListStr(subPage.getAcceptableAcnhorArray());
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
* To change this template, choose Tools | Templates | |
* and open the template in the editor. | |
*/ | |
package yazlab2_deneme1; | |
import java.io.BufferedReader; | |
import java.io.IOException; | |
import java.io.InputStreamReader; | |
import java.net.URL; | |
import java.util.regex.Matcher; | |
import java.util.regex.Pattern; | |
/**
 * Downloads the source of a web page and extracts its anchor ({@code <a>}) elements.
 *
 * <p>Typical use: construct with a URL, call {@link #getwebpage()} to fetch the
 * source, then {@link #setAnchorList(String)} to collect the anchors whose
 * {@code href} starts with a given prefix.
 *
 * @author kulekci
 */
class webpage {
    /** Fixed capacity of {@link #anchorList}; anchors beyond this are ignored. */
    private static final int MAX_ANCHORS = 100000;

    /**
     * Matches one anchor element: group 1 is the {@code href} value, group 2 the
     * anchor text. Reluctant quantifiers and negated character classes keep each
     * match confined to a single tag, so several anchors on one line (or anchors
     * with extra attributes) are captured individually.
     */
    private static final Pattern ANCHOR_PATTERN =
            Pattern.compile("<a\\b[^>]*?\\shref=\"([^\"]*)\"[^>]*>(.*?)</a>");

    /** Page source; {@code null} until {@link #getwebpage()} has succeeded. */
    private String content = null;
    private final String Url;
    //TODO: this part will be changed
    /** Collected anchors: {@code anchorList[i][0]} is the href, {@code anchorList[i][1]} the text. */
    String[][] anchorList = new String[MAX_ANCHORS][2];
    private int numberOfLink = 0;

    /**
     * @param uri address of the page to download
     */
    public webpage(String uri) {
        Url = uri;
    }

    /**
     * Fetches the page and stores its source, one {@code '\n'} after every line.
     * A repeated call replaces the previously stored source.
     *
     * <p>The stream is decoded as UTF-8 rather than the platform default so that
     * results do not depend on the machine the code runs on.
     *
     * @throws IOException if the URL is malformed or the page cannot be read
     */
    public void getwebpage() throws IOException {
        URL address = new URL(Url);
        StringBuilder page = new StringBuilder();
        BufferedReader source =
                new BufferedReader(new InputStreamReader(address.openStream(), "UTF-8"));
        try {
            String sourceLine;
            while ((sourceLine = source.readLine()) != null) {
                page.append(sourceLine).append('\n');
            }
        } finally {
            // Closing the outermost reader also closes the underlying stream,
            // even when reading fails part-way through.
            source.close();
        }
        content = page.toString();
    }

    /**
     * @return the stored page source, or {@code null} if nothing has been fetched yet
     */
    public String getSaltSource() {
        return content;
    }

    /**
     * Scans the stored source for anchors and keeps those whose href starts with
     * the given prefix. Previously collected anchors are discarded.
     *
     * <p>Result layout: {@code anchorList[i][0]} is the href and
     * {@code anchorList[i][1]} the anchor text.
     *
     * @param StartWith required href prefix; {@code null} matches nothing
     */
    public void setAnchorList(String StartWith) {
        if (content == null) {
            return;
        }
        // Drop the results of an earlier scan so stale entries cannot leak through.
        for (int j = 0; j < numberOfLink; j++) {
            anchorList[j][0] = null;
            anchorList[j][1] = null;
        }

        Matcher regexMatcher = ANCHOR_PATTERN.matcher(content);
        int i = 0;
        System.out.println(i + "\n");
        // Stop at the array capacity instead of overflowing it.
        while (i < anchorList.length && regexMatcher.find()) {
            String href = regexMatcher.group(1);
            String text = regexMatcher.group(2);
            if (href != null && text != null
                    && StartWith != null && href.startsWith(StartWith)) {
                anchorList[i][0] = href;
                anchorList[i][1] = text;
                i++;
            }
        }
        numberOfLink = i;
        System.out.println(i + "\n");
    }

    /**
     * Formats the collected anchors, one per line, as {@code text[href]}.
     *
     * @return the formatted list (empty when no anchors were collected), or
     *         {@code null} if the anchor array itself is {@code null}
     */
    public String getAnchorListStr() {
        if (anchorList == null) {
            return null;
        }
        StringBuilder out = new StringBuilder();
        for (int i = 0; i < numberOfLink; i++) {
            appendAnchor(out, anchorList[i]);
        }
        return out.toString();
    }

    /**
     * Formats the given anchors, one per line, as {@code text[href]}. Rows that
     * are {@code null} or have no text are skipped.
     *
     * @param str rows of {@code {href, text}}
     * @return the formatted list (empty when nothing qualifies), or {@code null}
     *         if {@code str} is {@code null}
     */
    public static String getAnchorListStr(String[][] str) {
        if (str == null) {
            return null;
        }
        StringBuilder out = new StringBuilder();
        for (int i = 0; i < str.length; i++) {
            appendAnchor(out, str[i]);
        }
        return out.toString();
    }

    /** Appends one {@code text[href]} line for the row, if the row has text. */
    private static void appendAnchor(StringBuilder out, String[] row) {
        if (row != null && row[1] != null) {
            out.append(row[1]).append('[').append(row[0]).append(']').append('\n');
        }
    }

    /**
     * @return the backing anchor array (not a copy); unused slots are {@code null}
     */
    public String[][] getAnchorList() {
        return anchorList;
    }

    /**
     * Narrows the stored source to the text found between {@code a} and {@code b}.
     * Both arguments are used as regular-expression fragments. When several
     * matches exist, the last one wins; with no match the source is left
     * untouched. Does nothing if no source has been fetched.
     *
     * @param a regex fragment marking the start
     * @param b regex fragment marking the end
     */
    public void splitSource(String a, String b) {
        if (content == null) {
            return;
        }
        Matcher regexMatcher = Pattern.compile(a + "(.*)?" + b).matcher(content);
        // The matcher keeps scanning the original text even though the field is reassigned.
        while (regexMatcher.find()) {
            String inner = regexMatcher.group(1);
            if (inner != null) {
                content = inner;
            }
        }
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package trying; | |
/** | |
*Trying for dmoz.org links | |
* @author kulekci | |
*/ | |
class WebPageRequest extends webpage{ | |
private String [][] acceptable_anchor = null; | |
boolean subCategory = false; | |
public static String [] mainSixCategory ={ | |
"Alışveriş", "Basın ve Yayın", "Bilgisayar", | |
"Ekonomi ve İş Dünyası", "Sağlık","Spor" | |
}; | |
public WebPageRequest(String uri,boolean sc){ | |
super(uri); | |
subCategory = sc; | |
} | |
public WebPageRequest(String uri){ | |
super(uri); | |
subCategory = false; | |
} | |
public void setAcceptableLink(){ | |
acceptable_anchor = new String[anchorList.length][2]; | |
for (int i = 0;i<anchorList.length;i++){ | |
if (anchorList[i][1] == null) | |
continue; | |
if (in_array(mainSixCategory, anchorList[i][1].trim().toString()) || subCategory){ | |
try{ | |
acceptable_anchor[i][0] = anchorList[i][0]; | |
acceptable_anchor[i][1] = anchorList[i][1]; | |
}catch(NullPointerException e){ | |
} | |
} | |
} | |
} | |
public String[][] getAcceptableAcnhorArray(){ | |
if (acceptable_anchor == null){ | |
setAcceptableLink(); | |
} | |
return acceptable_anchor; | |
} | |
public String getAcceptableAcnhorLink(int i,int j){ | |
if (acceptable_anchor == null){ | |
setAcceptableLink(); | |
} | |
return acceptable_anchor[i][j]; | |
} | |
private static boolean in_array(String [] haystack, String needle) { | |
for(int i=0;i<haystack.length;i++) { | |
if(haystack[i].equals(needle)) { | |
return true; | |
} | |
} | |
return false; | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment