Skip to content

Instantly share code, notes, and snippets.

View jnioche's full-sized avatar

Julien Nioche jnioche

View GitHub Profile
protected CrawlController init() throws Exception {
final CrawlConfig config = new CrawlConfig();
config.setCrawlStorageFolder("/tmp");
config.setPolitenessDelay(800);
config.setMaxDepthOfCrawling(3);
config.setIncludeBinaryContentInCrawling(false);
config.setResumableCrawling(true);
config.setHaltOnError(false);
final BasicURLNormalizer normalizer = BasicURLNormalizer.newBuilder().idnNormalization(BasicURLNormalizer.IdnNormalization.NONE).build();
final PageFetcher pageFetcher = new PageFetcher(config, normalizer);
@jnioche
jnioche / es-crawler.flux
Created February 23, 2021 08:31
StormCrawler topology with Tika included
name: "crawler"
includes:
  - resource: true
    file: "/crawler-default.yaml"
    override: false
  - resource: false
    file: "crawler-conf.yaml"
    override: true
@jnioche
jnioche / crawler-conf.yaml
Created October 6, 2017 15:53
# StormCrawler configuration to fetch with ChromeDriver in headless mode via Selenium
selenium.capabilities:
  goog:chromeOptions:
    args:
      - "--headless"
      - "--disable-gpu"
package com.digitalpebble.crawl;
import java.util.List;
import org.openqa.selenium.By;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.remote.RemoteWebDriver;
import org.openqa.selenium.support.ui.ExpectedConditions;
import org.openqa.selenium.support.ui.WebDriverWait;
@jnioche
jnioche / CookieConverter.java
Created March 14, 2017 17:00
Example of code I used for passing cookies to the protocol implementation in Nutch
package org.apache.nutch.protocol.httpclient;
import java.net.URL;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
<document id='xxxx'>
<label>category_of_document</label>
<field name='text'>every document has some text</field>
<field name='title'>some even have a title</field>
<field name='description'>or some meaningful description</field>
</document>
<queryParser name="payload" class="com.digitalpebble.solr.PLDisMaxQParserPlugin" />
package com.digitalpebble.solr;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.search.QParser;
import org.apache.solr.search.QParserPlugin;
public class PLDisMaxQParserPlugin extends QParserPlugin {
public void init(NamedList args) {
package com.digitalpebble.solr;
import org.apache.lucene.analysis.payloads.PayloadHelper;
import org.apache.lucene.search.DefaultSimilarity;
public class PayloadSimilarity extends DefaultSimilarity
{
@Override public float scorePayload(int docId, String fieldName, int start, int end, byte[] payload, int offset, int length)
{
if (length > 0) {
package com.digitalpebble.solr;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.search.BooleanClause;