Skip to content

Instantly share code, notes, and snippets.

@jnioche
Last active April 26, 2017 14:27
Show Gist options
  • Save jnioche/5f595e41867e236e27efb45a90c5062d to your computer and use it in GitHub Desktop.
Save jnioche/5f595e41867e236e27efb45a90c5062d to your computer and use it in GitHub Desktop.
package com.digitalpebble.crawl;
import java.nio.charset.StandardCharsets;
import java.util.List;

import org.openqa.selenium.By;
import org.openqa.selenium.TimeoutException;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.remote.RemoteWebDriver;
import org.openqa.selenium.support.ui.ExpectedConditions;
import org.openqa.selenium.support.ui.WebDriverWait;

import com.digitalpebble.stormcrawler.Metadata;
import com.digitalpebble.stormcrawler.protocol.ProtocolResponse;
import com.digitalpebble.stormcrawler.protocol.selenium.NavigationFilter;
/**
 * Navigation filter for a paginated job-board listing.
 *
 * <p>When the browser is on a listing page, the filter walks through the
 * result pages by clicking the "next" button and gathers every link that
 * points to a job-details page. The links are written into a small synthetic
 * HTML document which is returned as the protocol response.
 */
public class JobBoardNavigationFilter extends NavigationFilter {

    /** URL fragment identifying the listing pages handled by this filter. */
    private static final String LISTING_URL_FRAGMENT = "JobBoard/ListJobs.aspx";

    /** Fragment an href must contain to be emitted as an outlink. */
    private static final String DETAILS_URL_FRAGMENT = "JobDetails.aspx";

    /** DOM id of the pagination "next" button. */
    private static final String NEXT_BUTTON_ID = "__Next";

    /** Maximum time, in seconds, to wait for the "next" button to appear. */
    private static final long NEXT_BUTTON_TIMEOUT_SECS = 10;

    /**
     * Collects the job-details links from all result pages of a listing.
     *
     * @param driver   browser session positioned on the page to inspect
     * @param metadata metadata passed through unchanged to the response
     * @return a response with status 200 whose body is a synthetic HTML
     *         document (UTF-8) containing one anchor per job-details link,
     *         or {@code null} if the current URL is not a listing page
     */
    @Override
    public ProtocolResponse filter(RemoteWebDriver driver, Metadata metadata) {
        // check that we are on the right sort of page
        String currentUrl = driver.getCurrentUrl();
        if (currentUrl == null || !currentUrl.contains(LISTING_URL_FRAGMENT)) {
            return null;
        }

        StringBuilder dummyContent = new StringBuilder("<html>");
        WebDriverWait wait = new WebDriverWait(driver, NEXT_BUTTON_TIMEOUT_SECS);

        // iterate on the result pages
        // NOTE(review): the loop ends only when the button is missing or
        // disabled; a site that keeps it enabled on the last page would loop
        // forever. Confirm the site's behaviour or add a page cap.
        while (true) {
            appendJobLinks(driver, dummyContent);

            // see if there is a clickable 'next page' button
            WebElement nextButton;
            try {
                nextButton = wait.until(ExpectedConditions
                        .presenceOfElementLocated(By.id(NEXT_BUTTON_ID)));
            } catch (TimeoutException e) {
                // no pagination control: keep the links gathered so far
                // instead of failing the whole fetch
                break;
            }
            if (!nextButton.isEnabled()) {
                break;
            }
            nextButton.click();
        }

        dummyContent.append("</html>");
        return new ProtocolResponse(
                dummyContent.toString().getBytes(StandardCharsets.UTF_8), 200,
                metadata);
    }

    /** Appends an anchor for every job-details link on the current page. */
    private static void appendJobLinks(RemoteWebDriver driver,
            StringBuilder out) {
        List<WebElement> anchors = driver.findElements(By.tagName("a"));
        for (WebElement element : anchors) {
            String href = element.getAttribute("href");
            // anchors without an href attribute yield null
            if (href == null || !href.contains(DETAILS_URL_FRAGMENT)) {
                continue;
            }
            // generate an outlink
            out.append("<a href=\"").append(escapeHtml(href)).append("\">");
            out.append(escapeHtml(element.getText())).append("</a>\n");
        }
    }

    /** Escapes the characters that are significant in HTML text and attributes. */
    private static String escapeHtml(String raw) {
        if (raw == null) {
            return "";
        }
        StringBuilder escaped = new StringBuilder(raw.length() + 16);
        for (int i = 0; i < raw.length(); i++) {
            char c = raw.charAt(i);
            switch (c) {
            case '&':
                escaped.append("&amp;");
                break;
            case '<':
                escaped.append("&lt;");
                break;
            case '>':
                escaped.append("&gt;");
                break;
            case '"':
                escaped.append("&quot;");
                break;
            default:
                escaped.append(c);
            }
        }
        return escaped.toString();
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment