Skip to content

Instantly share code, notes, and snippets.

@hkulekci
Created April 26, 2011 01:47
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save hkulekci/941655 to your computer and use it in GitHub Desktop.
Save hkulekci/941655 to your computer and use it in GitHub Desktop.
Get anchors from a web page source
// Usage example: crawl the Turkish dmoz.org index page, keep the anchors whose
// text is one of WebPageRequest.mainSixCategory, then fetch each linked
// sub-category page and append its anchors to a text report.
// NOTE(review): `content` is not declared in this snippet; presumably it is a
// String declared by the surrounding code - confirm before reuse.

// ---- Main categories ----
WebPageRequest webpage1 = new WebPageRequest("http://www.dmoz.org/World/T%C3%BCrk%C3%A7e//");
webpage1.getwebpage(); // download the page source
webpage1.setAnchorList("/World"); // collect anchors whose href starts with "/World"
webpage1.setAcceptableLink(); // keep only anchors whose text is a main category
// Static overload of getAnchorListStr: formats every non-empty row of the array.
content = "Main Six Category\n\n" + webpage.getAnchorListStr(webpage1.getAcceptableAcnhorArray());
// ---- One page per sub category ----
int resultNum = WebPageRequest.mainSixCategory.length;
WebPageRequest webpage2[] = new WebPageRequest[resultNum];
// NOTE(review): the accepted-anchor table is indexed by the anchor's position
// in the page's anchor list, not by category, so only the first `resultNum`
// collected anchors are examined here - confirm this is intended.
for (int i = 0; i<resultNum;i++){
// A null href means the anchor at position i was not accepted; log the index and skip it.
if (webpage1.getAcceptableAcnhorLink(i, 0) == null){
System.out.println(i+"\n");
continue;
}
// The `true` flag marks the request as a sub-category page, for which
// setAcceptableLink() accepts every collected anchor.
webpage2[i] = new WebPageRequest("http://www.dmoz.org"+webpage1.getAcceptableAcnhorLink(i, 0),true);
webpage2[i].getwebpage(); // download the page source
webpage2[i].setAnchorList("/World"); // collect anchors whose href starts with "/World"
webpage2[i].setAcceptableLink();
// Column 1 of an accepted anchor is its link text, used here as the section title.
content += "Sub Category["+webpage1.getAcceptableAcnhorLink(i, 1)+"]\n\n" +
webpage.getAnchorListStr(webpage2[i].getAcceptableAcnhorArray());
}
/*
* To change this template, choose Tools | Templates
* and open the template in the editor.
*/
package yazlab2_deneme1;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URL;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Downloads the source of a web page and extracts its anchor ({@code <a>}) tags.
 *
 * <p>Typical use: construct with a URL, call {@link #getwebpage()} to fetch the
 * source, then {@link #setAnchorList(String)} to collect the anchors whose
 * {@code href} starts with a given prefix.
 *
 * @author kulekci
 */
class webpage {
    /** Fixed capacity of {@link #anchorList}; anchors beyond it are ignored. */
    private static final int MAX_ANCHORS = 100000;

    /**
     * Anchor matcher: group 1 is the {@code href} value, group 2 the link text.
     * Compiled once because the expression never changes.
     *
     * <p>NOTE(review): every {@code .*} is greedy and {@code .} does not cross
     * line breaks, so a line holding several anchors yields a single, merged
     * match. The expression is deliberately left unchanged because callers
     * compare the captured text against exact category names.
     */
    private static final Pattern ANCHOR_PATTERN =
            Pattern.compile("<a.*href=\"(.*)?\".*>(.*)?</a>");

    /** Page source as loaded by {@link #getwebpage()}; {@code null} until then. */
    private String content = null;
    /** Address of the page to download. */
    private final String Url;
    //TODO: this part will be changed
    /**
     * Collected anchors: {@code anchorList[i][0]} is the href and
     * {@code anchorList[i][1]} the link text. Unused rows hold {@code null}s.
     */
    String [][] anchorList = new String[MAX_ANCHORS][2];
    /** Number of rows of {@link #anchorList} filled by the last scan. */
    private int numberOfLink = 0;

    /**
     * @param uri address of the page to download, in a form accepted by {@link URL}
     */
    public webpage(String uri){
        Url = uri;
    }

    /**
     * Downloads the page and stores its source, one {@code '\n'} after every line.
     *
     * <p>The source replaces any previously loaded one. The stream is decoded as
     * UTF-8 rather than with the platform default charset, so the result does not
     * depend on the machine it runs on.
     *
     * @throws IOException if the URL is malformed or the page cannot be read
     */
    public void getwebpage() throws IOException{
        URL address = new URL(Url);
        StringBuilder page = new StringBuilder();
        BufferedReader source =
                new BufferedReader(new InputStreamReader(address.openStream(), "UTF-8"));
        try {
            String sourceLine;
            while ((sourceLine = source.readLine()) != null) {
                page.append(sourceLine).append('\n');
            }
        } finally {
            // Closing the buffered reader also closes the underlying stream,
            // even when reading fails half way.
            source.close();
        }
        content = page.toString();
    }

    /**
     * @return the loaded page source, or {@code null} if nothing was loaded yet
     */
    public String getSaltSource(){
        return content;
    }

    /**
     * Scans the loaded source for anchors and keeps those whose href starts with
     * the given prefix. Results of an earlier scan are discarded first.
     *
     * <p>Does nothing when no page has been loaded. The number of kept anchors is
     * printed to standard output before (always 0) and after the scan.
     *
     * @param StartWith required href prefix; a {@code null} prefix is reported on
     *                  standard output and matches nothing
     */
    public void setAnchorList(String StartWith){
        if (content == null) {
            return;
        }
        // Forget the previous scan so stale rows cannot leak into the new result.
        for (int row = 0; row < numberOfLink; row++) {
            anchorList[row][0] = null;
            anchorList[row][1] = null;
        }
        numberOfLink = 0;

        int i = 0;
        System.out.println(i+"\n");
        if (StartWith == null) {
            System.out.println("Error: StartWith prefix is null\n");
        } else {
            Matcher regexMatcher = ANCHOR_PATTERN.matcher(content);
            // Stop at capacity instead of overrunning the fixed-size table.
            while (i < anchorList.length && regexMatcher.find()) {
                String href = regexMatcher.group(1);
                String text = regexMatcher.group(2);
                if (href != null && text != null && href.startsWith(StartWith)) {
                    anchorList[i][0] = href;
                    anchorList[i][1] = text;
                    i++;
                }
            }
        }
        numberOfLink = i;
        System.out.println(i+"\n");
    }

    /**
     * Formats the anchors kept by the last scan, one {@code text[href]} per line.
     *
     * @return the formatted list (empty when no anchor was kept), or {@code null}
     *         if the anchor table itself is {@code null}
     */
    public String getAnchorListStr(){
        if (anchorList == null) {
            return null;
        }
        return formatAnchors(anchorList, numberOfLink);
    }

    /**
     * Formats every row of {@code str} that has a link text, one
     * {@code text[href]} per line.
     *
     * @param str rows of {@code {href, text}}; may be {@code null}
     * @return the formatted list (empty when no row qualifies), or {@code null}
     *         if {@code str} is {@code null}
     */
    public static String getAnchorListStr(String [][] str){
        if (str == null) {
            return null;
        }
        return formatAnchors(str, str.length);
    }

    /** Renders the first {@code limit} rows that carry a link text as {@code text[href]} lines. */
    private static String formatAnchors(String[][] anchors, int limit) {
        StringBuilder out = new StringBuilder();
        for (int i = 0; i < limit; i++) {
            String[] anchor = anchors[i];
            if (anchor != null && anchor[1] != null) {
                out.append(anchor[1]).append('[').append(anchor[0]).append("]\n");
            }
        }
        return out.toString();
    }

    /**
     * @return the internal anchor table itself (not a copy); rows past the kept
     *         anchors contain {@code null}s
     */
    public String [][] getAnchorList(){
        return anchorList;
    }

    /**
     * Replaces the loaded source by the text found between {@code a} and
     * {@code b}. When several stretches match, the last one wins; when none
     * matches, or no page has been loaded, the source is left untouched.
     *
     * @param a regular-expression fragment that must precede the wanted text
     * @param b regular-expression fragment that must follow the wanted text
     */
    public void splitSource(String a, String b){
        if (content == null) {
            return;
        }
        Matcher regexMatcher = Pattern.compile(a+"(.*)?"+b).matcher(content);
        String lastHit = null;
        while (regexMatcher.find()) {
            if (regexMatcher.group(1) != null) {
                lastHit = regexMatcher.group(1);
            }
        }
        if (lastHit != null) {
            content = lastHit;
        }
    }
}
package trying;
/**
 * Trying for dmoz.org links: a {@link webpage} that filters its collected
 * anchors down to the "acceptable" ones.
 *
 * <p>For a main page an anchor is acceptable when its trimmed text equals one of
 * {@link #mainSixCategory}; for a sub-category page every collected anchor is
 * acceptable. Accepted anchors keep the position they have in the inherited
 * anchor table, so the accepted table may contain empty rows in between.
 *
 * @author kulekci
 */
class WebPageRequest extends webpage{
    /** Accepted anchors as {@code {href, text}} rows; {@code null} until first computed. */
    private String [][] acceptable_anchor = null;
    /** When {@code true}, every collected anchor is accepted. */
    boolean subCategory = false;
    /** Link texts that identify the main categories of interest. */
    public static String [] mainSixCategory ={
        "Alışveriş", "Basın ve Yayın", "Bilgisayar",
        "Ekonomi ve İş Dünyası", "Sağlık","Spor"
    };

    /**
     * @param uri address of the page to download
     * @param sc  {@code true} if the page is a sub-category page
     */
    public WebPageRequest(String uri,boolean sc){
        super(uri);
        subCategory = sc;
    }

    /**
     * Creates a request for a main (non sub-category) page.
     *
     * @param uri address of the page to download
     */
    public WebPageRequest(String uri){
        super(uri);
        subCategory = false;
    }

    /**
     * Recomputes the accepted-anchor table from the inherited anchor table.
     * Rows that are not accepted stay {@code {null, null}}.
     */
    public void setAcceptableLink(){
        String[][] accepted = new String[anchorList.length][2];
        for (int i = 0; i < anchorList.length; i++) {
            String[] anchor = anchorList[i];
            // Skip unused rows: they carry no link text.
            if (anchor == null || anchor[1] == null) {
                continue;
            }
            if (subCategory || in_array(mainSixCategory, anchor[1].trim())) {
                accepted[i][0] = anchor[0];
                accepted[i][1] = anchor[1];
            }
        }
        acceptable_anchor = accepted;
    }

    /**
     * @return the accepted-anchor table, computing it first if that has not
     *         happened yet; the internal array is returned, not a copy
     */
    public String[][] getAcceptableAcnhorArray(){
        if (acceptable_anchor == null){
            setAcceptableLink();
        }
        return acceptable_anchor;
    }

    /**
     * Looks up one cell of the accepted-anchor table, computing the table first
     * if that has not happened yet.
     *
     * @param i anchor position
     * @param j column: 0 for the href, 1 for the link text
     * @return the requested value, or {@code null} when the anchor at that
     *         position was not accepted or the position/column is out of range
     */
    public String getAcceptableAcnhorLink(int i,int j){
        if (acceptable_anchor == null){
            setAcceptableLink();
        }
        // Callers already treat null as "nothing here", so an out-of-range
        // request is answered the same way instead of throwing.
        if (i < 0 || i >= acceptable_anchor.length) {
            return null;
        }
        String[] row = acceptable_anchor[i];
        if (row == null || j < 0 || j >= row.length) {
            return null;
        }
        return row[j];
    }

    /** Tells whether {@code needle} equals one of the entries of {@code haystack}. */
    private static boolean in_array(String [] haystack, String needle) {
        if (haystack == null || needle == null) {
            return false;
        }
        for (String candidate : haystack) {
            // Called on needle so that a null entry in haystack cannot throw.
            if (needle.equals(candidate)) {
                return true;
            }
        }
        return false;
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment