Skip to content

Instantly share code, notes, and snippets.

@seralf
Last active July 10, 2016 14:07
Show Gist options
  • Save seralf/5558844 to your computer and use it in GitHub Desktop.
Save seralf/5558844 to your computer and use it in GitHub Desktop.
Starter example for a simple deduplication scenario using Solr.
<?xml version="1.0" encoding="UTF-8" ?>
<!--
  Minimal Solr schema for the deduplication example.

  Every field is a plain string. "uid" stores the document signature, a
  catch-all dynamic field accepts any other field name, and everything is
  copied into "fulltext", which is the default search field.
-->
<schema name="simple" version="1.1">
  <types>
    <fieldType name="string" class="solr.StrField" />
    <fieldType name="uuid" class="solr.UUIDField" indexed="true" />
  </types>
  <fields>
    <!-- Signature field written by the deduplication update chain (it is the
         chain's signatureField). Falls back to "NEW" when no value is set. -->
    <field name="uid" type="string" indexed="true" stored="true" default="NEW" multiValued="false" />

    <!-- Catch-all: any field not declared explicitly is accepted as a
         multi-valued, indexed and stored string. -->
    <dynamicField name="*" type="string" multiValued="true" indexed="true" stored="true" />

    <!-- Aggregate every incoming field into "fulltext".
         NOTE(review): "fulltext" is a StrField, so values are presumably
         indexed verbatim (not tokenized); confirm that whole-value matching
         is what is wanted for the default search field. -->
    <copyField source="*" dest="fulltext" />
    <field name="fulltext" type="string" multiValued="true" />
  </fields>

  <!-- Queries without an explicit field are run against "fulltext";
       multiple terms are combined with OR. -->
  <defaultSearchField>fulltext</defaultSearchField>
  <solrQueryParser defaultOperator="OR" />
</schema>
<?xml version="1.0" encoding="UTF-8" ?>
<!--
  Solr configuration for the deduplication example.

  Documents sent to /update or /update/extract are routed through the
  "deduplication" update chain, which computes a signature from the "text"
  field, stores it in "uid" and overwrites earlier documents that carry the
  same signature.
-->
<config>
  <luceneMatchVersion>LUCENE_42</luceneMatchVersion>
  <directoryFactory name="DirectoryFactory"
                    class="${solr.directoryFactory:solr.StandardDirectoryFactory}" />
  <codecFactory name="CodecFactory" class="solr.SchemaCodecFactory" />

  <!-- Libraries needed by the extracting request handler.
       The original author notes that these directories are relative to this
       XML file.
       NOTE(review): the Solr documentation describes relative lib paths as
       resolved against the core's instanceDir; confirm for your layout. -->
  <lib dir="../../../solr/contrib/extraction/lib" />
  <lib dir="../../../solr/dist/" regex="solr-cell-\d.*\.jar" />

  <requestHandler name="standard" class="solr.StandardRequestHandler" default="true" />

  <!-- XML updates always go through the deduplication chain. -->
  <requestHandler name="/update" class="solr.XmlUpdateRequestHandler">
    <lst name="defaults">
      <str name="update.chain">deduplication</str>
    </lst>
  </requestHandler>

  <requestHandler name="/admin/" class="org.apache.solr.handler.admin.AdminHandlers" />
  <admin>
    <defaultQuery>*:*</defaultQuery>
  </admin>

  <!-- Rich-document extraction. The extracted body is mapped to "text",
       the field the signature is computed from, so this handler must use
       the deduplication chain as well; without update.chain here, extracted
       documents would bypass it. -->
  <requestHandler name="/update/extract" class="solr.extraction.ExtractingRequestHandler">
    <lst name="defaults">
      <str name="update.chain">deduplication</str>
      <str name="captureAttr">true</str>
      <str name="fmap.content">text</str>
      <str name="lowernames">true</str>
    </lst>
  </requestHandler>

  <updateRequestProcessorChain name="deduplication">
    <!-- Computes a signature over "text" and writes it to "uid".
         With overwriteDupes enabled, a document whose signature matches an
         existing one replaces it instead of being added next to it. -->
    <processor class="org.apache.solr.update.processor.SignatureUpdateProcessorFactory">
      <bool name="overwriteDupes">true</bool>
      <str name="signatureField">uid</str>
      <bool name="enabled">true</bool>
      <str name="fields">text</str>
      <str name="minTokenLen">3</str>
      <str name="signatureClass">org.apache.solr.update.processor.TextProfileSignature</str>
    </processor>
    <processor class="solr.LogUpdateProcessorFactory" />
    <!-- Must stay last: this processor performs the actual index update. -->
    <processor class="solr.RunUpdateProcessorFactory" />
  </updateRequestProcessorChain>
</config>
# The same test document can be posted to the update handler several times:
curl --request POST \
     --header "Content-Type: text/xml" \
     --data '<add><doc><field name="name">Test name</field><field name="text">Test text contents</field></doc></add>' \
     "http://localhost:8983/solr/pdfs/update?commit=true&wt=json"
# Then run a facet query on uid to check for duplicates (a uid facet count > 1):
# http://localhost:8983/solr/pdfs/select?q=*%3A*&fl=stream_name%2C+uid%2C+text&wt=xml&indent=true&facet=true&facet.field=uid
# Note: no id is supplied in the document above; one can be added as usual,
# because uid is a separate field.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment