Skip to content

Instantly share code, notes, and snippets.

@Querela
Last active November 17, 2023 12:56
Show Gist options
  • Save Querela/825a084f94b30de88827050eddc8e361 to your computer and use it in GitHub Desktop.
Save Querela/825a084f94b30de88827050eddc8e361 to your computer and use it in GitHub Desktop.
Query Translation: FCS Queries to Solr
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.z3950.zing.cql.CQLAndNode;
import org.z3950.zing.cql.CQLBooleanNode;
import org.z3950.zing.cql.CQLNode;
import org.z3950.zing.cql.CQLOrNode;
import org.z3950.zing.cql.CQLTermNode;
import eu.clarin.sru.server.SRUConstants;
import eu.clarin.sru.server.SRUException;
import eu.clarin.sru.server.fcs.parser.QueryParserException;
/**
 * Converts a parsed CQL query tree into a query string for the Solr standard
 * query parser.
 *
 * <p>
 * Only a small subset of CQL is accepted: plain terms (no index, or the
 * {@code cql.serverChoice} index) and {@code AND}/{@code OR} nodes without
 * modifiers. Every other construct is rejected with an {@link SRUException}.
 */
public class CQLToSolrConverter {
    private static final Logger LOGGER = LogManager.getLogger(CQLToSolrConverter.class);

    /**
     * Converts the given CQL tree into a Solr query string.
     *
     * @param node root of the parsed CQL query
     * @return the Solr query; every term is emitted as a quoted phrase and every
     *         boolean node as a parenthesised {@code AND}/{@code OR} expression
     * @throws QueryParserException declared for callers; not thrown by the
     *                              visible code
     * @throws SRUException         if the tree contains an unsupported construct
     */
    public static String convertCQLtoSolrQuery(final CQLNode node)
            throws QueryParserException, SRUException {
        final StringBuilder sb = new StringBuilder();
        convertCQLtoSolrSingle(node, sb);
        return sb.toString();
    }

    /**
     * Recursively appends the Solr representation of {@code node} to {@code sb}.
     */
    private static void convertCQLtoSolrSingle(final CQLNode node, final StringBuilder sb)
            throws SRUException {
        if (node instanceof CQLTermNode) {
            final CQLTermNode tn = (CQLTermNode) node;
            if (tn.getIndex() != null && !"cql.serverChoice".equalsIgnoreCase(tn.getIndex())) {
                throw new SRUException(SRUConstants.SRU_CANNOT_PROCESS_QUERY_REASON_UNKNOWN,
                        "Queries with queryType 'cql' do not support index/relation on '"
                                + node.getClass().getSimpleName() + "' by this FCS Endpoint.");
            }
            appendQuotedTerm(sb, tn.getTerm());
        } else if (node instanceof CQLOrNode || node instanceof CQLAndNode) {
            final CQLBooleanNode bn = (CQLBooleanNode) node;
            if (!bn.getModifiers().isEmpty()) {
                throw new SRUException(SRUConstants.SRU_CANNOT_PROCESS_QUERY_REASON_UNKNOWN,
                        "Queries with queryType 'cql' do not support modifiers on '"
                                + node.getClass().getSimpleName()
                                + "' by this FCS Endpoint.");
            }
            sb.append("( ");
            convertCQLtoSolrSingle(bn.getLeftOperand(), sb);
            // only OR and AND nodes reach this branch
            sb.append(node instanceof CQLOrNode ? " OR " : " AND ");
            convertCQLtoSolrSingle(bn.getRightOperand(), sb);
            sb.append(" )");
        } else {
            throw new SRUException(SRUConstants.SRU_CANNOT_PROCESS_QUERY_REASON_UNKNOWN,
                    "Queries with queryType 'cql' do not support '" + node.getClass().getSimpleName()
                            + "' by this FCS Endpoint.");
        }
    }

    /**
     * Appends {@code term} as a double-quoted Solr phrase that cannot be
     * terminated early by the term's own content.
     *
     * <p>
     * A backslash followed by another character is copied unchanged, so escape
     * sequences already present in the term keep their meaning. A bare
     * {@code "} is escaped and a dangling trailing backslash is doubled; without
     * this, such a term could close the phrase and inject query syntax. A
     * {@code null} term yields the empty phrase {@code ""}.
     */
    private static void appendQuotedTerm(final StringBuilder sb, final String term) {
        sb.append('"');
        if (term != null) {
            final int len = term.length();
            int i = 0;
            while (i < len) {
                final char c = term.charAt(i);
                if (c == '\\') {
                    sb.append('\\');
                    if (i + 1 < len) {
                        // keep the existing escape pair as-is
                        sb.append(term.charAt(i + 1));
                        i += 2;
                    } else {
                        // dangling backslash would otherwise escape our closing quote
                        sb.append('\\');
                        i++;
                    }
                } else {
                    if (c == '"') {
                        sb.append('\\');
                    }
                    sb.append(c);
                    i++;
                }
            }
        }
        sb.append('"');
    }
}
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.z3950.zing.cql.CQLAndNode;
import org.z3950.zing.cql.CQLBooleanNode;
import org.z3950.zing.cql.CQLNode;
import org.z3950.zing.cql.CQLNotNode;
import org.z3950.zing.cql.CQLOrNode;
import org.z3950.zing.cql.CQLRelation;
import org.z3950.zing.cql.CQLTermNode;
import eu.clarin.sru.server.SRUConstants;
import eu.clarin.sru.server.SRUException;
import eu.clarin.sru.server.fcs.parser.QueryParserException;
/**
 * Converts a parsed LexCQL query tree into a query string for the Solr standard
 * query parser.
 *
 * <p>
 * Accepted constructs: terms with the {@code =} or {@code ==} relation and no
 * relation modifiers, optionally restricted to an index, plus {@code AND} /
 * {@code OR} nodes without modifiers. The {@code lemma} index is mapped to the
 * Solr field {@code lemma4search}; other index names are used as Solr field
 * names unchanged. Everything else is rejected with an {@link SRUException}.
 */
public class LexCQLToSolrConverter {
    private static final Logger LOGGER = LogManager.getLogger(LexCQLToSolrConverter.class);

    /**
     * Converts the given LexCQL tree into a Solr query string.
     *
     * @param node root of the parsed query
     * @return the Solr query string
     * @throws QueryParserException declared for callers; not thrown by the
     *                              visible code
     * @throws SRUException         if the tree contains an unsupported construct
     */
    public static String convertLexCQLtoSolrQuery(final CQLNode node)
            throws QueryParserException, SRUException {
        final StringBuilder sb = new StringBuilder();
        convertLexCQLtoSolrSingle(node, sb);
        return sb.toString();
    }

    /**
     * Recursively appends the Solr representation of {@code node} to {@code sb}.
     */
    private static void convertLexCQLtoSolrSingle(final CQLNode node, final StringBuilder sb)
            throws SRUException {
        if (node instanceof CQLTermNode) {
            final CQLTermNode tn = (CQLTermNode) node;
            final CQLRelation rel = tn.getRelation();
            if (!"=".equals(rel.getBase()) && !"==".equals(rel.getBase())) {
                throw new SRUException(SRUConstants.SRU_CANNOT_PROCESS_QUERY_REASON_UNKNOWN,
                        "Queries with queryType 'cql' do not support '" + rel.getBase()
                                + "' relations by this FCS Endpoint.");
            }
            if (!rel.getModifiers().isEmpty()) {
                throw new SRUException(SRUConstants.SRU_CANNOT_PROCESS_QUERY_REASON_UNKNOWN,
                        "Queries with queryType 'cql' do not support modifiers on '" + rel.getBase()
                                + "' relation by this FCS Endpoint.");
            }
            final String index = tn.getIndex();
            if (index != null && !"cql.serverChoice".equalsIgnoreCase(index)) {
                if ("lemma".equalsIgnoreCase(index)) {
                    sb.append("lemma4search");
                } else {
                    // The index ends up verbatim in front of ':' in the Solr
                    // query, so it must not carry query syntax of its own.
                    if (!isPlainFieldName(index)) {
                        throw new SRUException(SRUConstants.SRU_CANNOT_PROCESS_QUERY_REASON_UNKNOWN,
                                "Queries with queryType 'cql' do not support index '" + index
                                        + "' by this FCS Endpoint.");
                    }
                    sb.append(index);
                }
                sb.append(":");
            }
            appendQuotedTerm(sb, tn.getTerm());
        } else if (node instanceof CQLOrNode || node instanceof CQLAndNode) {
            final CQLBooleanNode bn = (CQLBooleanNode) node;
            if (!bn.getModifiers().isEmpty()) {
                throw new SRUException(SRUConstants.SRU_CANNOT_PROCESS_QUERY_REASON_UNKNOWN,
                        "Queries with queryType 'cql' do not support modifiers on '"
                                + node.getClass().getSimpleName()
                                + "' by this FCS Endpoint.");
            }
            sb.append("( ");
            convertLexCQLtoSolrSingle(bn.getLeftOperand(), sb);
            // only OR and AND nodes reach this branch
            sb.append(node instanceof CQLOrNode ? " OR " : " AND ");
            convertLexCQLtoSolrSingle(bn.getRightOperand(), sb);
            sb.append(" )");
        } else if (node instanceof CQLNotNode) {
            // NOT is recognised but not implemented; it gets its own message
            throw new SRUException(
                    SRUConstants.SRU_CANNOT_PROCESS_QUERY_REASON_UNKNOWN,
                    "Queries with queryType 'cql' do not (yet) support '" + node.getClass().getSimpleName()
                            + "' by this FCS Endpoint.");
        } else {
            throw new SRUException(
                    SRUConstants.SRU_CANNOT_PROCESS_QUERY_REASON_UNKNOWN,
                    "Queries with queryType 'cql' do not support '" + node.getClass().getSimpleName()
                            + "' by this FCS Endpoint.");
        }
    }

    /**
     * Returns whether {@code name} is safe to emit as a Solr field name:
     * non-empty, made of letters, digits, {@code _}, {@code .} and {@code -},
     * and not starting with {@code -} or {@code .} (a leading {@code -} would
     * be read by Solr as the prohibit operator).
     */
    private static boolean isPlainFieldName(final String name) {
        if (name == null || name.isEmpty()) {
            return false;
        }
        final char first = name.charAt(0);
        if (!Character.isLetterOrDigit(first) && first != '_') {
            return false;
        }
        for (int i = 1; i < name.length(); i++) {
            final char c = name.charAt(i);
            if (!Character.isLetterOrDigit(c) && c != '_' && c != '.' && c != '-') {
                return false;
            }
        }
        return true;
    }

    /**
     * Appends {@code term} as a double-quoted Solr phrase that cannot be
     * terminated early by the term's own content.
     *
     * <p>
     * A backslash followed by another character is copied unchanged, so escape
     * sequences already present in the term keep their meaning. A bare
     * {@code "} is escaped and a dangling trailing backslash is doubled; without
     * this, such a term could close the phrase and inject query syntax. A
     * {@code null} term yields the empty phrase {@code ""}.
     */
    private static void appendQuotedTerm(final StringBuilder sb, final String term) {
        sb.append('"');
        if (term != null) {
            final int len = term.length();
            int i = 0;
            while (i < len) {
                final char c = term.charAt(i);
                if (c == '\\') {
                    sb.append('\\');
                    if (i + 1 < len) {
                        // keep the existing escape pair as-is
                        sb.append(term.charAt(i + 1));
                        i += 2;
                    } else {
                        // dangling backslash would otherwise escape our closing quote
                        sb.append('\\');
                        i++;
                    }
                } else {
                    if (c == '"') {
                        sb.append('\\');
                    }
                    sb.append(c);
                    i++;
                }
            }
        }
        sb.append('"');
    }
}
/**
 * String constants shared by the SRU/FCS endpoint classes: query type names,
 * extra request parameter keys, and the identifiers used when writing the
 * hits data view.
 */
public final class SAWSRUConstants {
    /** Query type name checked to select the lexical hits data view. */
    public static final String SRU_QUERY_TYPE_LEX = "lex";

    /** Key and value separator for the {@code x-fcs-context} parameter. */
    public static final String X_FCS_CONTEXT_KEY = "x-fcs-context";
    public static final String X_FCS_CONTEXT_SEPARATOR = ",";

    /** Key and value separator for the {@code x-fcs-dataviews} parameter. */
    public static final String X_FCS_DATAVIEWS_KEY = "x-fcs-dataviews";
    public static final String X_FCS_DATAVIEWS_SEPARATOR = ",";

    /** Record schema identifier used when the request does not name one. */
    public static final String CLARIN_FCS_RECORD_SCHEMA = "http://clarin.eu/fcs/resource";

    // from
    // https://github.com/clarin-eric/fcs-simple-endpoint/blob/main/src/main/java/eu/clarin/sru/server/fcs/XMLStreamWriterHelper.java
    /** MIME type, namespace prefix and namespace URI of the hits data view. */
    public static final String FCS_HITS_MIMETYPE = "application/x-clarin-fcs-hits+xml";
    public static final String FCS_HITS_PREFIX = "hits";
    public static final String FCS_HITS_NS = "http://clarin.eu/fcs/dataview/hits";

    /** Constants holder; not meant to be instantiated. */
    private SAWSRUConstants() {
    }
}
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamWriter;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
import ResultEntry;
import SearchResultSet;
import eu.clarin.sru.server.SRUConstants;
import eu.clarin.sru.server.SRUDiagnostic;
import eu.clarin.sru.server.SRUDiagnosticList;
import eu.clarin.sru.server.SRUException;
import eu.clarin.sru.server.SRURequest;
import eu.clarin.sru.server.SRUSearchResultSet;
import eu.clarin.sru.server.SRUServerConfig;
import eu.clarin.sru.server.fcs.XMLStreamWriterHelper;
/**
 * {@link SRUSearchResultSet} backed by a {@link SearchResultSet} obtained from
 * Solr. Each record is written as an FCS resource with one resource fragment
 * containing a hits data view; the XML fragment stored in the result entry is
 * parsed and replayed onto the response writer.
 */
public class SAWSRUSearchResultSet extends SRUSearchResultSet {
    private static final Logger LOGGER = LogManager.getLogger(SAWSRUSearchResultSet.class);

    /** Shared factory for the parsers that read the stored data view fragments. */
    protected static final SAXParserFactory factory;
    static {
        factory = SAXParserFactory.newInstance();
        factory.setNamespaceAware(true);
        factory.setValidating(false);
        factory.setXIncludeAware(false);
    }

    SRUServerConfig serverConfig = null;
    SRURequest request = null;
    private Set<String> extraDataviews;
    private SearchResultSet results;
    /** Index of the current record in {@code results}; -1 before the first {@link #nextRecord()}. */
    private int currentRecordCursor = 0;

    /**
     * Creates a result set over already fetched search results.
     *
     * @param serverConfig the server configuration (stored, not otherwise used here)
     * @param request      the request being answered; consulted for the record
     *                     schema and the query type
     * @param diagnostics  diagnostics list handed to the superclass
     * @param dataviews    requested extra data views; copied into a set
     * @param results      the search results to serialize
     */
    protected SAWSRUSearchResultSet(SRUServerConfig serverConfig, SRURequest request,
            SRUDiagnosticList diagnostics, List<String> dataviews, SearchResultSet results) {
        super(diagnostics);
        this.serverConfig = serverConfig;
        this.request = request;
        this.results = results;
        currentRecordCursor = -1;
        extraDataviews = new HashSet<>(dataviews);
    }

    /** Always returns {@code null}: no record identifier is provided. */
    @Override
    public String getRecordIdentifier() {
        return null;
    }

    /**
     * Returns the record schema named in the request, or
     * {@link SAWSRUConstants#CLARIN_FCS_RECORD_SCHEMA} when the request names none.
     */
    @Override
    public String getRecordSchemaIdentifier() {
        return request.getRecordSchemaIdentifier() != null ? request.getRecordSchemaIdentifier()
                : SAWSRUConstants.CLARIN_FCS_RECORD_SCHEMA;
    }

    /**
     * Returns a "record not available in this schema" diagnostic when a schema
     * other than the CLARIN-FCS record schema is in effect, otherwise {@code null}.
     */
    @Override
    public SRUDiagnostic getSurrogateDiagnostic() {
        if ((getRecordSchemaIdentifier() != null) &&
                !SAWSRUConstants.CLARIN_FCS_RECORD_SCHEMA.equals(getRecordSchemaIdentifier())) {
            return new SRUDiagnostic(
                    SRUConstants.SRU_RECORD_NOT_AVAILABLE_IN_THIS_SCHEMA,
                    getRecordSchemaIdentifier(),
                    "Record is not available in record schema \"" +
                            getRecordSchemaIdentifier() + "\".");
        }
        return null;
    }

    /**
     * Returns the total hit count reported by the search, capped at
     * {@link Integer#MAX_VALUE} so that a larger total cannot wrap around to a
     * negative number when narrowed to {@code int}.
     */
    @Override
    public int getTotalRecordCount() {
        return (int) Math.min(results.getTotal(), Integer.MAX_VALUE);
    }

    /** Returns the number of result entries held by this result set. */
    @Override
    public int getRecordCount() {
        return results.getResults().size();
    }

    /**
     * Advances the cursor to the next record.
     *
     * @return {@code true} if the cursor now points at a record, {@code false}
     *         if there are no more records
     */
    @Override
    public boolean nextRecord() throws SRUException {
        if (currentRecordCursor < (getRecordCount() - 1)) {
            currentRecordCursor++;
            return true;
        }
        return false;
    }

    /**
     * Writes the record at the current cursor position: a resource (using the
     * result set's PID) with one resource fragment that holds either the lexical
     * hits data view (query type {@code lex}) or the plain hits data view.
     */
    @Override
    public void writeRecord(XMLStreamWriter writer) throws XMLStreamException {
        ResultEntry result = results.getResults().get(currentRecordCursor);
        XMLStreamWriterHelper.writeStartResource(writer, results.getPid(), null);
        XMLStreamWriterHelper.writeStartResourceFragment(writer, result.lemma, result.landingpage);
        if (request != null && request.isQueryType(SAWSRUConstants.SRU_QUERY_TYPE_LEX)) {
            writeLexHitsDataview(writer, result);
        } else {
            writeHitsDataview(writer, result);
        }
        XMLStreamWriterHelper.writeEndResourceFragment(writer);
        XMLStreamWriterHelper.writeEndResource(writer);
    }

    /** Writes a hits data view built from {@code result.dataview_hits}. */
    protected void writeHitsDataview(XMLStreamWriter writer, ResultEntry result) throws XMLStreamException {
        writeHitsResult(writer, result.dataview_hits);
    }

    /** Writes a hits data view built from {@code result.dataview_lexhits}. */
    protected void writeLexHitsDataview(XMLStreamWriter writer, ResultEntry result) throws XMLStreamException {
        writeHitsResult(writer, result.dataview_lexhits);
    }

    /**
     * Writes one hits data view whose {@code Result} element contains the given
     * XML fragment.
     */
    private static void writeHitsResult(XMLStreamWriter writer, String fragment) throws XMLStreamException {
        XMLStreamWriterHelper.writeStartDataView(writer, SAWSRUConstants.FCS_HITS_MIMETYPE);
        writer.setPrefix(SAWSRUConstants.FCS_HITS_PREFIX, SAWSRUConstants.FCS_HITS_NS);
        writer.writeStartElement(SAWSRUConstants.FCS_HITS_NS, "Result");
        writer.writeNamespace(SAWSRUConstants.FCS_HITS_PREFIX, SAWSRUConstants.FCS_HITS_NS);
        // Encode explicitly as UTF-8; the parser side is told the same encoding.
        writeSolrHitsDataviewBytedXMLDoc(writer, fragment.getBytes(StandardCharsets.UTF_8));
        writer.writeEndElement(); // "Result" element
        XMLStreamWriterHelper.writeEndDataView(writer);
    }

    /**
     * Helper method for {@link #writeLexHitsDataview(XMLStreamWriter, ResultEntry)}
     * and {@link #writeHitsDataview(XMLStreamWriter, ResultEntry)} to write an XML
     * fragment to output.
     *
     * <p>
     * The fragment is wrapped in an artificial root element so that it can be
     * parsed as a document; that root is not written to the output. Elements
     * named {@code Hit} are written in the hits namespace, all other elements
     * by their qualified name. Text that consists only of whitespace is dropped.
     *
     * @param writer the writer to replay the fragment onto
     * @param bytes  the UTF-8 encoded XML fragment (mixed text and elements)
     * @throws XMLStreamException if the fragment cannot be parsed or written
     */
    protected static void writeSolrHitsDataviewBytedXMLDoc(XMLStreamWriter writer, byte[] bytes)
            throws XMLStreamException {
        final String marker = "writeSolrHitsDataviewBytedXMLDoc";
        try {
            ByteArrayOutputStream baos = new ByteArrayOutputStream();
            baos.write(("<" + marker + ">").getBytes(StandardCharsets.UTF_8));
            baos.write(bytes);
            baos.write(("</" + marker + ">").getBytes(StandardCharsets.UTF_8));
            final byte[] wrapped = baos.toByteArray();

            InputSource input = new InputSource(new ByteArrayInputStream(wrapped));
            // There is no XML declaration in the wrapped bytes; state the encoding
            // instead of relying on the parser's detection.
            input.setEncoding(StandardCharsets.UTF_8.name());

            SAXParser parser = factory.newSAXParser();
            parser.parse(input, new DefaultHandler() {
                /** Namespace declarations seen since the last written start element. */
                private Map<String, String> prefixes = new HashMap<>();

                public boolean isBlank(final String s) {
                    // from: org.apache.logging.log4j.util.Strings.isBlank()
                    if (s == null || s.isEmpty()) {
                        return true;
                    }
                    for (int i = 0; i < s.length(); i++) {
                        char c = s.charAt(i);
                        if (!Character.isWhitespace(c)) {
                            return false;
                        }
                    }
                    return true;
                }

                @Override
                public void characters(char[] ch, int start, int length) throws SAXException {
                    // strip blanks
                    // TODO: maybe with indent == 0, just check for single line-breaks after element ends?
                    if (isBlank(new String(ch, start, length))) {
                        return;
                    }
                    try {
                        writer.writeCharacters(ch, start, length);
                    } catch (XMLStreamException e) {
                        throw new SAXException(e);
                    }
                }

                @Override
                public void endElement(String uri, String localName, String qName) throws SAXException {
                    if (qName.equals(marker)) {
                        // artificial root, never written
                        return;
                    }
                    try {
                        writer.writeEndElement();
                    } catch (XMLStreamException e) {
                        throw new SAXException(e);
                    }
                }

                @Override
                public void startPrefixMapping(String prefix, String uri) throws SAXException {
                    super.startPrefixMapping(prefix, uri);
                    // remembered here, written with the next start element
                    prefixes.put(prefix, uri);
                }

                @Override
                public void startElement(String uri, String localName, String qName, Attributes attributes)
                        throws SAXException {
                    if (qName.equals(marker)) {
                        // artificial root, never written
                        return;
                    }
                    try {
                        if (qName.equals("Hit")) {
                            writer.writeStartElement(SAWSRUConstants.FCS_HITS_NS, qName);
                        } else {
                            writer.writeStartElement(qName);
                        }
                        if (!prefixes.isEmpty()) {
                            for (Map.Entry<String, String> entry : prefixes.entrySet()) {
                                writer.writeNamespace(entry.getKey(), entry.getValue());
                            }
                            prefixes.clear();
                        }
                        for (int i = 0; i < attributes.getLength(); i++) {
                            writer.writeAttribute(attributes.getQName(i), attributes.getValue(i));
                        }
                    } catch (XMLStreamException e) {
                        throw new SAXException(e);
                    }
                }
            });
        } catch (ParserConfigurationException | SAXException | IOException e) {
            throw new XMLStreamException(e);
        }
    }
}
import java.io.IOException;
import java.util.HashMap;
import java.util.List;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.solr.client.solrj.SolrClient;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.impl.Http2SolrClient;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.params.MapSolrParams;
/**
 * Thin wrapper around a SolrJ HTTP/2 client that runs a query against a
 * collection and returns the matching entries with highlighting applied.
 *
 * <p>
 * The instance owns its {@link SolrClient}; call {@link #close()} when the
 * searcher is no longer needed.
 */
public class Searcher implements AutoCloseable {
    private static final Logger LOGGER = LogManager.getLogger(Searcher.class);

    private final SolrClient solrClient;

    /**
     * Creates a searcher talking to the Solr instance at {@code url} using HTTP
     * basic authentication.
     *
     * @param url      base URL of the Solr server
     * @param user     basic-auth user name
     * @param password basic-auth password
     */
    public Searcher(String url, String user, String password) {
        solrClient = new Http2SolrClient.Builder(url)
                .connectionTimeout(10000)
                .withBasicAuthCredentials(user, password)
                .build();
    }

    /**
     * Runs {@code query} against {@code collection}.
     *
     * <p>
     * Highlighting is requested for the {@code dataview_hits} field with
     * {@code <Hit>}/{@code </Hit>} as markers; when Solr returns a highlighted
     * version for an entry, it replaces that entry's {@code dataview_hits}.
     *
     * @param query      query string in Solr standard query parser syntax
     * @param collection name of the Solr collection to query
     * @param offset     zero-based index of the first result to return
     * @param limit      maximum number of results to return
     * @return the result set, or {@code null} if the Solr request failed (the
     *         error is logged); callers must check for {@code null}
     */
    public SearchResultSet search(String query, String collection, long offset, int limit) {
        final HashMap<String, String> params = new HashMap<>();
        // https://solr.apache.org/guide/solr/latest/query-guide/standard-query-parser.html
        params.put("q", query);
        params.put("df", "lemma4search");
        // pagination
        // https://solr.apache.org/guide/solr/latest/query-guide/pagination-of-results.html
        params.put("start", String.valueOf(offset)); // starts with 0
        params.put("rows", String.valueOf(limit));
        // highlighting
        // https://solr.apache.org/guide/solr/latest/query-guide/highlighting.html
        params.put("hl", "true");
        params.put("hl.fl", "dataview_hits"); // only highlight on "dataview_hits"
        params.put("hl.fragsize", "0"); // whole field
        params.put("hl.simple.pre", "<Hit>"); // no prefix so that we can parse more easily
        params.put("hl.simple.post", "</Hit>");
        final MapSolrParams queryParams = new MapSolrParams(params);

        try {
            final QueryResponse response = solrClient.query(collection, queryParams);
            final long total = response.getResults().getNumFound();
            final List<ResultEntry> entries = response.getBeans(ResultEntry.class);

            // process highlighting (update dataview_hits); tolerate a response
            // that carries no highlighting section at all
            if (response.getHighlighting() != null) {
                for (ResultEntry entry : entries) {
                    if (!response.getHighlighting().containsKey(entry.id)) {
                        continue;
                    }
                    final List<String> highlights = response.getHighlighting().get(entry.id).get("dataview_hits");
                    if (highlights == null || highlights.isEmpty()) {
                        continue;
                    }
                    entry.dataview_hits = highlights.get(0);
                }
            }

            LOGGER.debug("results: {}", entries);
            return new SearchResultSet(collection, query, entries, total, offset);
        } catch (SolrServerException | IOException e) {
            LOGGER.error("Solr request error", e);
        }
        return null;
    }

    /**
     * Closes the underlying Solr client.
     *
     * @throws IOException if closing the client fails
     */
    @Override
    public void close() throws IOException {
        solrClient.close();
    }
}
<?xml version="1.0" encoding="UTF-8" ?>
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<!--
This example schema is the recommended starting point for users.
It should be kept correct and concise, usable out-of-the-box.
For more information, on how to customize this file, please see
https://solr.apache.org/guide/solr/latest/indexing-guide/schema-elements.html
-->
<!--
Solr schema for the lexical entries served by the FCS endpoint.
NOTE(review): the field types "string" and "text_general" are referenced below
but not defined in this excerpt; presumably they come from the default
configset and were omitted here. Confirm before deploying this file as-is.
-->
<schema name="default-config" version="1.6">
<!-- unique document key; also used to look up highlighting results per entry -->
<field name="id" type="string" indexed="true" stored="true" required="true" multiValued="false" />
<!-- FCS fields -->
<field name="lemma" type="string" indexed="true" stored="true" required="true" multiValued="false" />
<field name="lemma_alternative" type="string" indexed="true" stored="true" required="false" multiValued="true" />
<field name="pos_ud17" type="string" indexed="true" stored="true" required="false" multiValued="true" />
<field name="pos" type="string" indexed="true" stored="true" required="false" multiValued="true" />
<!-- text of the hits data view; indexed and stored, the search requests highlighting on this field -->
<field name="dataview_hits" type="text_general" indexed="true" stored="true" required="true" multiValued="false" />
<!-- markup of the lexical hits data view; stored only, not searchable -->
<field name="dataview_lexhits" type="string" indexed="false" stored="true" required="true" multiValued="false" />
<!-- search index fields -->
<!-- index-only field filled from lemma and lemma_alternative; used as default search field ("df") and as target of the "lemma" index -->
<field name="lemma4search" type="text_ascii_lower" indexed="true" stored="false" required="true" multiValued="true" />
<copyField source="lemma" dest="lemma4search" />
<copyField source="lemma_alternative" dest="lemma4search" />
<!-- index-only field filled from pos and pos_ud17 -->
<field name="pos4search" type="text_general" indexed="true" stored="false" required="false" multiValued="true" />
<copyField source="pos" dest="pos4search" />
<copyField source="pos_ud17" dest="pos4search" />
<!-- analysis chain: standard tokenizer, then ASCII folding, then lowercasing -->
<fieldType name="text_ascii_lower" class="solr.TextField" positionIncrementGap="100" multiValued="true">
<analyzer>
<tokenizer name="standard"/>
<filter name="asciiFolding"/>
<filter name="lowercase"/>
</analyzer>
</fieldType>
</schema>
<!-- Example Solr XML update message that adds one document matching the schema fields. -->
<add>
<doc>
<field name="id">PREFIX_Leipzig</field>
<field name="lemma">Leipzig</field>
<field name="pos">PROPN</field>
<field name="pos_ud17">PROPN</field>
<!-- plain text without markup; Hit elements are inserted at query time by Solr highlighting -->
<field name="dataview_hits">Leipzig (Eigenname) ist eine Kreisfreie Stadt sowie mit mehr als 620.000 Einwohnern die größte Stadt im Freistaat Sachsen. Sie ist als neuntgrößte eine der am schnellsten wachsenden Großstädte Deutschlands. Für Mitteldeutschland ist sie ein historisches Zentrum der Wirtschaft, des Handels und Verkehrs, der Verwaltung, Kultur und Bildung sowie für die &#8222;Kreativszene&#8220;.</field>
<!-- same text with pre-annotated Hit elements (kind attribute); the markup is XML-escaped so it is stored as a plain string -->
<field name="dataview_lexhits">&lt;Hit kind=&quot;lex-lemma&quot;&gt;Leipzig&lt;/Hit&gt; (&lt;Hit kind=&quot;lex-pos&quot;&gt;Eigenname&lt;/Hit&gt;) ist eine Kreisfreie Stadt sowie mit mehr als 620.000 Einwohnern die größte Stadt im Freistaat Sachsen. Sie ist als neuntgrößte eine der am schnellsten wachsenden Großstädte Deutschlands. Für Mitteldeutschland ist sie ein historisches Zentrum der Wirtschaft, des Handels und Verkehrs, der Verwaltung, Kultur und Bildung sowie für die &#8222;Kreativszene&#8220;.</field>
</doc>
</add>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment