Skip to content

Instantly share code, notes, and snippets.

View jnioche's full-sized avatar

Julien Nioche jnioche

View GitHub Profile
<fieldtype name="payloads" stored="false" indexed="true" class="solr.TextField" >
<analyzer>
<tokenizer class="solr.WhitespaceTokenizerFactory"/>
<!--
The DelimitedPayloadTokenFilter can put payloads on tokens... for example,
a token of "foo|1.4" would be indexed as "foo" with a payload of 1.4f
Attributes of the DelimitedPayloadTokenFilterFactory :
"delimiter" - a one character delimiter. Default is | (pipe)
"encoder" - how to encode the following value into a payload
float -> org.apache.lucene.analysis.payloads.FloatEncoder,
package com.digitalpebble.solr;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.search.BooleanClause;
package com.digitalpebble.solr;
import org.apache.lucene.analysis.payloads.PayloadHelper;
import org.apache.lucene.search.DefaultSimilarity;
public class PayloadSimilarity extends DefaultSimilarity
{
@Override public float scorePayload(int docId, String fieldName, int start, int end, byte[] payload, int offset, int length)
{
if (length > 0) {
package com.digitalpebble.solr;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.search.QParser;
import org.apache.solr.search.QParserPlugin;
public class PLDisMaxQParserPlugin extends QParserPlugin {
public void init(NamedList args) {
<queryParser name="payload" class="com.digitalpebble.solr.PLDisMaxQParserPlugin" />
<document id='xxxx'>
<label>category_of_document</label>
<field name='text'>every document has some text</field>
<field name='title'>some even have a title</field>
<field name='description'>or some meaningful description</field>
</document>
@jnioche
jnioche / CookieConverter.java
Created March 14, 2017 17:00
Example of code I used for passing cookies to the protocol implementation in Nutch
package org.apache.nutch.protocol.httpclient;
import java.net.URL;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
package com.digitalpebble.crawl;
import java.util.List;
import org.openqa.selenium.By;
import org.openqa.selenium.WebElement;
import org.openqa.selenium.remote.RemoteWebDriver;
import org.openqa.selenium.support.ui.ExpectedConditions;
import org.openqa.selenium.support.ui.WebDriverWait;
@jnioche
jnioche / crawler-conf.yaml
Created October 6, 2017 15:53
#StormCrawler configuration to fetch with ChromeDriver in headless mode via Selenium
selenium.capabilities:
goog:chromeOptions:
args:
- "--headless"
- "--disable-gpu"
@jnioche
jnioche / es-crawler.flux
Created February 23, 2021 08:31
StormCrawler topology with Tika included
name: "crawler"
includes:
- resource: true
file: "/crawler-default.yaml"
override: false
- resource: false
file: "crawler-conf.yaml"
override: true