Last active
July 10, 2016 14:07
-
-
Save seralf/5558844 to your computer and use it in GitHub Desktop.
Starter example of a simple deduplication scenario using Solr.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?xml version="1.0" encoding="UTF-8"?>
<!--
  Minimal Solr schema for the deduplication example.
  Every declared field is a plain string. The "uid" field is the one the
  deduplication update chain of the accompanying Solr config names as its
  signatureField.
-->
<schema name="simple" version="1.1">
  <types>
    <fieldType name="string" class="solr.StrField" />
    <!-- NOTE(review): no field declared in this schema uses the "uuid" type. -->
    <fieldType name="uuid" class="solr.UUIDField" indexed="true" />
  </types>
  <fields>
    <!--
      Single-valued, indexed and stored field that receives the document signature.
      NOTE(review): default="NEW" is the token solr.UUIDField uses to generate a
      fresh UUID; on this string-typed field it is presumably stored as the
      literal text "NEW" whenever no signature is computed. Confirm this is intended.
    -->
    <field name="uid" type="string" indexed="true" stored="true" default="NEW" multiValued="false" />
    <!-- Catch-all: any field name not declared here is accepted as a multi-valued string. -->
    <dynamicField name="*" type="string" multiValued="true" indexed="true" stored="true" />
    <!-- Copy every source field into "fulltext", which is the default search field below. -->
    <copyField source="*" dest="fulltext" />
    <field name="fulltext" type="string" multiValued="true" />
  </fields>
  <defaultSearchField>fulltext</defaultSearchField>
  <solrQueryParser defaultOperator="OR" />
</schema>
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?xml version="1.0" encoding="UTF-8"?>
<!--
  Solr core configuration for the deduplication example.
  The /update handler routes documents through the "deduplication" update
  chain, which is configured with "uid" as its signature field.
-->
<config>
  <luceneMatchVersion>LUCENE_42</luceneMatchVersion>
  <directoryFactory name="DirectoryFactory" class="${solr.directoryFactory:solr.StandardDirectoryFactory}" />
  <codecFactory name="CodecFactory" class="solr.SchemaCodecFactory" />

  <!--
    Libraries for the extraction handler declared further down.
    The original author notes that these directories are relative to this XML file.
    NOTE(review): Solr documents lib paths as resolved against the core's
    instanceDir; verify against your directory layout.
  -->
  <lib dir="../../../solr/contrib/extraction/lib" />
  <lib dir="../../../solr/dist/" regex="solr-cell-\d.*\.jar" />

  <!-- Default query handler. -->
  <requestHandler name="standard" class="solr.StandardRequestHandler" default="true" />

  <!-- XML updates go through the deduplication chain unless the request overrides update.chain. -->
  <requestHandler name="/update" class="solr.XmlUpdateRequestHandler">
    <lst name="defaults">
      <str name="update.chain">deduplication</str>
    </lst>
  </requestHandler>

  <requestHandler name="/admin/" class="org.apache.solr.handler.admin.AdminHandlers" />
  <admin>
    <defaultQuery>*:*</defaultQuery>
  </admin>

  <!--
    Rich-document extraction. fmap.content maps the extracted "content" field to
    "text", the same field the deduplication chain lists under "fields".
    NOTE(review): unlike /update, this handler sets no update.chain default, so
    extracted documents presumably bypass the deduplication chain unless
    update.chain is passed on the request. Confirm whether that is intended.
  -->
  <requestHandler name="/update/extract" class="solr.extraction.ExtractingRequestHandler">
    <lst name="defaults">
      <str name="captureAttr">true</str>
      <str name="fmap.content">text</str>
      <str name="lowernames">true</str>
    </lst>
  </requestHandler>

  <!--
    Deduplication chain: the signature processor is configured to derive a
    signature from the "text" field with TextProfileSignature and to store it
    in "uid"; overwriteDupes is enabled. The log and run processors follow it.
  -->
  <updateRequestProcessorChain name="deduplication">
    <processor class="org.apache.solr.update.processor.SignatureUpdateProcessorFactory">
      <bool name="overwriteDupes">true</bool>
      <str name="signatureField">uid</str>
      <bool name="enabled">true</bool>
      <str name="fields">text</str>
      <str name="minTokenLen">3</str>
      <str name="signatureClass">org.apache.solr.update.processor.TextProfileSignature</str>
    </processor>
    <processor class="solr.LogUpdateProcessorFactory" />
    <processor class="solr.RunUpdateProcessorFactory" />
  </updateRequestProcessorChain>
</config>
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# The same test document can be posted several times:
curl -X POST "http://localhost:8983/solr/pdfs/update?commit=true&wt=json" -H "Content-Type: text/xml" -d '<add><doc><field name="name">Test name</field><field name="text">Test text contents</field></doc></add>'
# Then run a facet query on uid to check for duplicates (a uid facet count > 1):
# http://localhost:8983/solr/pdfs/select?q=*%3A*&fl=stream_name%2C+uid%2C+text&wt=xml&indent=true&facet=true&facet.field=uid
# Note: no id is supplied here; one can be added as usual, because uid is a separate field.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment