Various scripts that I used for whatanime.ga
#!/bin/bash
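# Walk the anime library and run index-anime.sh on every .mp4 found.
# $1 is a path under /mnt/data/anime_new/: a season folder, a season/series
# folder, or the library root to (re)index everything.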
tmp_path=/tmp/animehash/
anime_path=/mnt/data/anime_new/
hash_path=/mnt/data/anime_hash/
cd /home/soruly
input_path="$1"
if [[ "$1" != */ ]] ; then
input_path="$1"/
fi
if [[ -d $input_path ]] ; then
if [[ "$input_path" == "$anime_path"* ]] ; then
relative_path="${1//$anime_path/}"
input_season=$(echo ${relative_path} | cut -d'/' -f1 -s)
input_series=$(echo ${relative_path} | cut -d'/' -f2 -s)
if [ "$input_season" ] && [ "$input_series" ] ; then
if [[ -d $input_path ]]; then
for file in "$input_path"*.mp4
do
#echo "$file"
/home/soruly/index-anime.sh "$file"
done
fi
elif [ "$input_season" ] ; then
if [[ -d $input_path ]]; then
for series in "$input_path"*
do
if [[ -d $series ]]; then
for file in "$series"/*.mp4
do
#echo "$file"
/home/soruly/index-anime.sh "$file"
done
fi
done
fi
else
for season in "$anime_path"*
do
if [[ -d $season ]]; then
for series in "$season"/*
do
if [[ -d $series ]]; then
for file in "$series"/*.mp4
do
#echo "$file"
/home/soruly/index-anime.sh "$file"
done
fi
done
fi
done
fi
else
echo Input path is not anime
fi
else
echo Input path is not directory
fi
#!/usr/bin/python
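# Deduplicate <doc> entries in a LIRE extractor XML file: keep a sliding window
# of the 12 most recently kept cl_hi hashes, drop frames whose cl_hi repeats one
# of them, and write the remaining id/cl_hi/cl_ha fields to <path>tmp.xml.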
import os, sys, getopt, collections
from lxml import etree as ET
from operator import itemgetter
from collections import Counter
def getkey(elem):
    return elem.findtext("field[@name='id']")

def main(argv):
    path = ''
    filename = ''
    try:
        opts, args = getopt.getopt(argv, "hi:o:", ["ifile=", "ofile="])
    except getopt.GetoptError:
        print 'test.py -i <path> -o <filename>'
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print 'test.py -i <path> -o <filename>'
            sys.exit()
        elif opt in ("-i", "--ifile"):
            path = os.path.join(arg)
        elif opt in ("-o", "--ofile"):
            filename = arg
    #print 'Working path is ', path
    print 'Working file is ', filename
    tree = ET.parse(filename)
    root = tree.getroot()
    #root[:] = sorted(root, key=getkey)
    new_root = ET.Element("add")
    duplicated_doc_count = 0
    queue = collections.deque('', 12)
    for doc in root.findall("doc"):
        if doc.find("field[@name='cl_hi']").text in queue:
            duplicated_doc_count += 1
        else:
            queue.append(doc.find("field[@name='cl_hi']").text)
            new_doc = ET.SubElement(new_root, "doc")
            ET.SubElement(new_doc, "field", name="id").text = doc.find("field[@name='id']").text
            ET.SubElement(new_doc, "field", name="cl_hi").text = doc.find("field[@name='cl_hi']").text
            ET.SubElement(new_doc, "field", name="cl_ha").text = doc.find("field[@name='cl_ha']").text
    #new_root[:] = sorted(new_root, key=getkey)
    new_tree = ET.ElementTree(new_root)
    new_tree.write(path + "tmp.xml", encoding="UTF-8", xml_declaration=True, pretty_print=True)

if __name__ == "__main__":
    main(sys.argv[1:])
#!/bin/bash
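# Pick up recently added episodes: index any .mp4 modified within the last
# 1800 minutes (30 hours), skipping HKTVBJ2 and Others/, then handle the two
# long-running shows under Others/ separately.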
find -L "/mnt/data/anime_new" \
-type f \
-name "*.mp4" \
-mmin -1800 \
-not \( \
-path "/mnt/data/anime_new/HKTVBJ2/*" \
-prune \
\) \
-not \( \
-path "/mnt/data/anime_new/Others/*" \
-prune \
\) \
-exec /home/soruly/index-anime.sh "{}" \;
find -L "/mnt/data/anime_new/Others/Naruto Shippuuden" \
-type f \
-name "*.mp4" \
-mmin -1800 \
-exec /home/soruly/index-anime.sh "{}" \;
find -L "/mnt/data/anime_new/Others/One Piece" \
-type f \
-name "*.mp4" \
-mmin -1800 \
-exec /home/soruly/index-anime.sh "{}" \;
#!/bin/bash
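# index-anime.sh: hash a single .mp4 under $anime_path (skipped if its hash XML already exists).
# Extracts 12 fps thumbnails with ffmpeg, runs the LIRE extractor over them,
# post-processes the resulting XML with the helper Python scripts, copies it to
# $hash_path, POSTs it to the anime_cl Solr core and notifies the Telegram channel.
# Usage: ./index-anime.sh "/mnt/data/anime_new/<season>/<series>/<episode>.mp4"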
tmp_path=/tmp/animehash/
anime_path=/mnt/data/anime_new/
hash_path=/mnt/data/anime_hash/
cd /home/soruly
#curl -s http://localhost:8983/solr/lireq/update?commit=true -d '<delete><query>*:*</query></delete>' > /dev/null
echo $1
if [[ ! -f index-anime.lock ]] ; then
    touch index-anime.lock
    if [[ "$1" == "$anime_path"*.mp4 ]] ; then
        relative_path="${1//$anime_path/}"
        input_season=$(echo ${relative_path} | cut -d'/' -f1)
        input_series=$(echo ${relative_path} | cut -d'/' -f2)
        file_name=$(basename "$1")
    else
        echo "Invalid input file"
        # release the lock before bailing out, otherwise later runs refuse to start
        rm index-anime.lock
        exit
    fi
    file="$1"
    #echo "${file}"
    #continue
    xmlxxx="${file%.mp4}.xml"
    xmlfile="${xmlxxx//$anime_path/$hash_path}"
    xmlpath=$(dirname "$xmlfile")
    if [ ! -f "$xmlfile" ] ; then
        echo "Removing old files"
        rm -rf "${tmp_path}"
        echo "Creating temp directory"
        mkdir -p "${tmp_path}"
        echo "Extracting thumbnails"
        ffmpeg -i "$file" -q:v 2 -an -vf "fps=12,scale=-1:120,showinfo" "${tmp_path}%08d.jpg" 2>&1 | grep pts_time | awk -F"(pos:|pts_time:)" '{print $2}' | tr -d ' ' > "${tmp_path}pts_time.txt"
        echo "Preparing frame files for analysis"
        find "${tmp_path}" -mindepth 1 -maxdepth 1 -type f -name "*.jpg" -printf "${tmp_path}%f\n" | sort > "${tmp_path}frames.txt"
        echo "Analyzing frames"
        java -jar /var/solr/data/lib/lire-request-handler.jar -i "${tmp_path}frames.txt" -o "${tmp_path}tmp.xml" -n 8 -f
        #java -jar /home/soruly/liresolr/dist/lire-request-handler.jar -i "${tmp_path}frames.txt" -o "${tmp_path}tmp.xml" -n 8 -f
        echo "Parsing output XML"
        python index-anime-parse.py -i "${tmp_path}" -o "${file//$anime_path/}"
        echo "Parsing input XML"
        python index-anime-extract.py -i "${tmp_path}" -o "${tmp_path}analyzed.xml"
        echo "Copying back parsed XML"
        mkdir -p "$xmlpath"
        cp "${tmp_path}tmp.xml" "${xmlfile}"
        echo "Updating Solr"
        curl -s http://192.168.2.11:8983/solr/anime_cl/update -H "Content-Type: text/xml" --data-binary @"${tmp_path}tmp.xml" > /dev/null
        curl -s http://192.168.2.11:8983/solr/anime_cl/update -H "Content-Type: text/xml" --data-binary "<commit/>" > /dev/null
        echo "Removing temp files"
        rm -rf "${tmp_path}"
        echo "Completed"
        curl -X POST "https://api.telegram.org/bot394963134:AAF07epjH1zYTSz7jZaaaaaaaRZDb4RV5ew/sendMessage" -d chat_id="@whatanimeupdates" --data-urlencode text="$input_series"$'\n'"$file_name"
    fi
    rm index-anime.lock
else
    echo "Another process is running"
fi
sudo su
cd ~
wget http://archive.apache.org/dist/lucene/solr/5.3.1/solr-5.3.1.zip
unzip solr-5.3.1.zip
cd solr-5.3.1/bin/
./install_solr_service.sh ../../solr-5.3.1.zip
./solr stop -p 8983
systemctl start solr
cp -rf /opt/solr/server/solr/* /var/solr/data/
mkdir -p /var/solr/data/lire_core/conf
cp -rf /var/solr/data/configsets/basic_configs/conf/* /var/solr/data/lire_core/conf/
cd /var/solr/data/lire_core/conf/
vim solrconfig.xml
vim schema.xml
systemctl restart solr
sudo mkdir -p /var/solr/data/lib
sudo cp -rf dist/* /var/solr/data/lib
chown -R solr:solr /var/solr
#create new core named lire_core
#Change heap memory at /var/solr/solr.in.sh
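# A minimal sketch of the two comments above (the 2g heap is just an example value):
sudo -u solr /opt/solr/bin/solr create -c lire_core
sudo vim /var/solr/solr.in.sh        # e.g. raise the heap here (SOLR_HEAP or SOLR_JAVA_MEM)
sudo systemctl restart solr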
/*
* This file is part of the LIRE project: http://www.semanticmetadata.net/lire
* LIRE is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* LIRE is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with LIRE; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*
* We kindly ask you to refer the any or one of the following publications in
* any publication mentioning or employing Lire:
*
* Lux Mathias, Savvas A. Chatzichristofis. Lire: Lucene Image Retrieval –
* An Extensible Java CBIR Library. In proceedings of the 16th ACM International
* Conference on Multimedia, pp. 1085-1088, Vancouver, Canada, 2008
* URL: http://doi.acm.org/10.1145/1459359.1459577
*
* Lux Mathias. Content Based Image Retrieval with LIRE. In proceedings of the
* 19th ACM International Conference on Multimedia, pp. 735-738, Scottsdale,
* Arizona, USA, 2011
* URL: http://dl.acm.org/citation.cfm?id=2072432
*
* Mathias Lux, Oge Marques. Visual Information Retrieval using Java and LIRE
* Morgan & Claypool, 2013
* URL: http://www.morganclaypool.com/doi/abs/10.2200/S00468ED1V01Y201301ICR025
*
* Copyright statement:
* --------------------
* (c) 2002-2013 by Mathias Lux (mathias@juggle.at)
* http://www.semanticmetadata.net/lire, http://www.lire-project.net
*/
package net.semanticmetadata.lire.solr;
import net.semanticmetadata.lire.imageanalysis.EdgeHistogram;
import net.semanticmetadata.lire.imageanalysis.LireFeature;
import net.semanticmetadata.lire.impl.SimpleResult;
import net.semanticmetadata.lire.indexing.hashing.BitSampling;
import net.semanticmetadata.lire.utils.ImageUtils;
import org.apache.commons.codec.binary.Base64;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.*;
import org.apache.lucene.queries.TermsFilter;
import org.apache.lucene.search.*;
import org.apache.lucene.util.BytesRef;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.handler.RequestHandlerBase;
import org.apache.solr.request.SolrQueryRequest;
import org.apache.solr.response.SolrQueryResponse;
import org.apache.solr.search.SolrIndexSearcher;
import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
import java.io.IOException;
import java.net.URL;
import java.util.*;
import java.io.*;
/**
* This is the main LIRE RequestHandler for the Solr Plugin. It supports query by example using the indexed id,
* a URL, or a feature vector. Furthermore, feature extraction and random selection of images are supported.
*
* @author Mathias Lux, mathias@juggle.at, 07.07.13
*/
public class LireRequestHandler extends RequestHandlerBase {
private HashMap<Integer,Integer> docCount = new HashMap<Integer, Integer>();
// private static HashMap<String, Class> fieldToClass = new HashMap<String, Class>(5);
private long time = 0;
private int countRequests = 0;
private int defaultNumberOfResults = 60;
/**
* Number of candidate results retrieved from the index. The higher this number, the slower
* but the more accurate the retrieval will be. 10k is a good value for starters.
*/
private int numberOfCandidateResults = 10000;
private static final int DEFAULT_NUMBER_OF_CANDIDATES = 10000;
/**
* The number of query terms that go along with the TermsFilter search. We need some to get a
* score; the fewer, the faster. A minimum of three is enforced in the method; this value gives
* the percentage of the overall number used (selected randomly).
*/
private double numberOfQueryTerms = 0.33;
private static final double DEFAULT_NUMBER_OF_QUERY_TERMS = 0.33;
static {
// one time hash function read ...
try {
BitSampling.readHashFunctions();
} catch (IOException e) {
e.printStackTrace();
}
}
@Override
public void init(NamedList args) {
super.init(args);
try{
BufferedReader br = new BufferedReader(new FileReader("/var/solr/data/anime_cl/histogram.csv"));
String line = null;
while((line=br.readLine())!=null){
String str[] = line.split(",");
docCount.put(Integer.parseInt(str[0]), Integer.parseInt(str[1]));
}
} catch(Exception e){
}
}
/**
* Handles three types of requests.
* <ol>
* <li>search by already extracted images.</li>
* <li>search by an image URL.</li>
* <li>Random results.</li>
* </ol>
*
* @param req
* @param rsp
* @throws Exception
*/
@Override
public void handleRequestBody(SolrQueryRequest req, SolrQueryResponse rsp) throws Exception {
// (1) check if the necessary parameters are here
if (req.getParams().get("hashes") != null) { // we are searching for hashes ...
handleHashSearch(req, rsp);
} else if (req.getParams().get("url") != null) { // we are searching for an image based on an URL
handleUrlSearch(req, rsp);
} else if (req.getParams().get("id") != null) { // we are searching for an image based on an URL
handleIdSearch(req, rsp);
} else if (req.getParams().get("extract") != null) { // we are trying to extract from an image URL.
handleExtract(req, rsp);
} else { // lets return random results.
handleRandomSearch(req, rsp);
}
}
/**
* Handles the get parameters id, field and rows.
*
* @param req
* @param rsp
* @throws IOException
* @throws InstantiationException
* @throws IllegalAccessException
*/
private void handleIdSearch(SolrQueryRequest req, SolrQueryResponse rsp) throws IOException, InstantiationException, IllegalAccessException {
SolrIndexSearcher searcher = req.getSearcher();
try {
TopDocs hits = searcher.search(new TermQuery(new Term("id", req.getParams().get("id"))), 1);
String paramField = "cl_ha";
if (req.getParams().get("field") != null)
paramField = req.getParams().get("field");
LireFeature queryFeature = (LireFeature) FeatureRegistry.getClassForHashField(paramField).newInstance();
rsp.add("QueryField", paramField);
rsp.add("QueryFeature", queryFeature.getClass().getName());
numberOfQueryTerms = req.getParams().getDouble("accuracy", DEFAULT_NUMBER_OF_QUERY_TERMS);
numberOfCandidateResults = req.getParams().getInt("candidates", DEFAULT_NUMBER_OF_CANDIDATES);
if (hits.scoreDocs.length > 0) {
// Using DocValues to get the actual data from the index.
BinaryDocValues binaryValues = MultiDocValues.getBinaryValues(searcher.getIndexReader(), FeatureRegistry.getFeatureFieldName(paramField)); // *** #
if (binaryValues == null)
System.err.println("Could not find the DocValues of the query document. Are they in the index?");
BytesRef bytesRef = new BytesRef();
bytesRef = binaryValues.get(hits.scoreDocs[0].doc);
// Document d = searcher.getIndexReader().document(hits.scoreDocs[0].doc);
// String histogramFieldName = paramField.replace("_ha", "_hi");
queryFeature.setByteArrayRepresentation(bytesRef.bytes, bytesRef.offset, bytesRef.length);
int paramRows = defaultNumberOfResults;
if (req.getParams().getInt("rows") != null)
paramRows = req.getParams().getInt("rows");
// Re-generating the hashes to save space (instead of storing them in the index)
int[] hashes = BitSampling.generateHashes(queryFeature.getDoubleHistogram());
List<Term> termFilter = createTermFilter(hashes, paramField);
doSearch(req, rsp, searcher, paramField, paramRows, termFilter, createQuery(hashes, paramField, numberOfQueryTerms, "*"), queryFeature);
} else {
rsp.add("Error", "Did not find an image with the given id " + req.getParams().get("id"));
}
} catch (Exception e) {
rsp.add("Error", "There was an error with your search for the image with the id " + req.getParams().get("id")
+ ": " + e.getMessage());
}
}
/**
* Returns a random set of documents from the index. Mainly for testing purposes.
*
* @param req
* @param rsp
* @throws IOException
*/
private void handleRandomSearch(SolrQueryRequest req, SolrQueryResponse rsp) throws IOException {
SolrIndexSearcher searcher = req.getSearcher();
DirectoryReader indexReader = searcher.getIndexReader();
double maxDoc = indexReader.maxDoc();
int paramRows = defaultNumberOfResults;
if (req.getParams().getInt("rows") != null)
paramRows = req.getParams().getInt("rows");
LinkedList list = new LinkedList();
while (list.size() < paramRows) {
HashMap m = new HashMap(2);
Document d = indexReader.document((int) Math.floor(Math.random() * maxDoc));
m.put("id", d.getValues("id")[0]);
m.put("title", d.getValues("title")[0]);
list.add(m);
}
rsp.add("docs", list);
}
/**
* Searches for an image given by a URL. Note that (i) extracting image features takes time and
* (ii) not every image is readable by Java.
*
* @param req
* @param rsp
* @throws IOException
* @throws InstantiationException
* @throws IllegalAccessException
*/
private void handleUrlSearch(SolrQueryRequest req, SolrQueryResponse rsp) throws IOException, InstantiationException, IllegalAccessException {
SolrParams params = req.getParams();
String paramUrl = params.get("url");
String paramField = "cl_ha";
if (req.getParams().get("field") != null)
paramField = req.getParams().get("field");
int paramRows = defaultNumberOfResults;
if (params.get("rows") != null)
paramRows = params.getInt("rows");
numberOfQueryTerms = req.getParams().getDouble("accuracy", DEFAULT_NUMBER_OF_QUERY_TERMS);
numberOfCandidateResults = req.getParams().getInt("candidates", DEFAULT_NUMBER_OF_CANDIDATES);
LireFeature feat = null;
List<Term> termFilter = null;
int[] hashes = null;
// wrapping the whole part in the try
try {
BufferedImage img = ImageIO.read(new URL(paramUrl).openStream());
img = ImageUtils.trimWhiteSpace(img);
// getting the right feature per field:
if (paramField == null || FeatureRegistry.getClassForHashField(paramField) == null) // if the feature is not registered.
feat = new EdgeHistogram();
else {
feat = (LireFeature) FeatureRegistry.getClassForHashField(paramField).newInstance();
}
feat.extract(img);
hashes = BitSampling.generateHashes(feat.getDoubleHistogram());
termFilter = createTermFilter(hashes, paramField);
ArrayList<String> hashStrings = new ArrayList<String>(hashes.length);
for (int i = 0; i < hashes.length; i++) {
hashStrings.add(Integer.toHexString(hashes[i]));
}
rsp.add("hashes", hashStrings);
} catch (Exception e) {
rsp.add("Error", "Error reading image from URL: " + paramUrl + ": " + e.getMessage());
e.printStackTrace();
}
// search if the feature has been extracted.
if (feat != null)
doSearch(req, rsp, req.getSearcher(), paramField, paramRows, termFilter, createQuery(hashes, paramField, numberOfQueryTerms, "*"), feat);
}
private void handleExtract(SolrQueryRequest req, SolrQueryResponse rsp) throws IOException, InstantiationException, IllegalAccessException {
SolrParams params = req.getParams();
String paramUrl = params.get("extract");
String paramField = "cl_ha";
if (req.getParams().get("field") != null)
paramField = req.getParams().get("field");
// int paramRows = defaultNumberOfResults;
// if (params.get("rows") != null)
// paramRows = params.getInt("rows");
LireFeature feat = null;
// BooleanQuery query = null;
// wrapping the whole part in the try
try {
BufferedImage img = ImageIO.read(new URL(paramUrl).openStream());
img = ImageUtils.trimWhiteSpace(img);
// getting the right feature per field:
if (paramField == null || FeatureRegistry.getClassForHashField(paramField) == null) // if the feature is not registered.
feat = new EdgeHistogram();
else {
feat = (LireFeature) FeatureRegistry.getClassForHashField(paramField).newInstance();
}
feat.extract(img);
rsp.add("histogram", Base64.encodeBase64String(feat.getByteArrayRepresentation()));
int[] hashes = BitSampling.generateHashes(feat.getDoubleHistogram());
ArrayList<String> hashStrings = new ArrayList<String>(hashes.length);
for (int i = 0; i < hashes.length; i++) {
hashStrings.add(Integer.toHexString(hashes[i]));
}
//Collections.shuffle(hashStrings);
rsp.add("hashes", hashStrings);
// just use 50% of the hashes for search ...
// query = createTermFilter(hashes, paramField, 0.5d);
} catch (Exception e) {
// rsp.add("Error", "Error reading image from URL: " + paramUrl + ": " + e.getMessage());
e.printStackTrace();
}
// search if the feature has been extracted.
// if (feat != null) doSearch(rsp, req.getSearcher(), paramField, paramRows, query, feat);
}
/**
* Search based on the given image hashes.
*
* @param req
* @param rsp
* @throws IOException
* @throws IllegalAccessException
* @throws InstantiationException
*/
private void handleHashSearch(SolrQueryRequest req, SolrQueryResponse rsp) throws IOException, IllegalAccessException, InstantiationException {
SolrParams params = req.getParams();
SolrIndexSearcher searcher = req.getSearcher();
// get the params needed:
// hashes=x y z ...
// feature=<base64>
// field=<cl_ha|ph_ha|...>
String[] hashStrings = params.get("hashes").trim().split(",");
int[] hashes = new int[100];
byte[] featureVector = Base64.decodeBase64(params.get("feature"));
String paramField = "cl_ha";
if (req.getParams().get("field") != null)
paramField = req.getParams().get("field");
int paramRows = defaultNumberOfResults;
if (params.getInt("rows") != null)
paramRows = params.getInt("rows");
numberOfQueryTerms = req.getParams().getDouble("accuracy", DEFAULT_NUMBER_OF_QUERY_TERMS);
numberOfCandidateResults = req.getParams().getInt("candidates", DEFAULT_NUMBER_OF_CANDIDATES);
// create boolean query:
// System.out.println("** Creating query.");
LinkedList<Term> termFilter = new LinkedList<Term>();
BooleanQuery query = new BooleanQuery();
for (int i = 0; i < hashStrings.length; i++) {
hashes[i] = Integer.parseInt(hashStrings[i],16);
// be aware that the hashFunctionsFileName of the field must match the one you put the hashes in before.
// hashStrings[i] = hashStrings[i].trim();
// if (hashStrings[i].length() > 0) {
// termFilter.add(new Term(paramField, hashStrings[i].trim()));
// System.out.println("** " + field + ": " + hashes[i].trim());
// }
}
// Collections.shuffle(termFilter);
// for (int k = 0; k < termFilter.size() * numberOfQueryTerms; k++) {
// query.add(new BooleanClause(new TermQuery(termFilter.get(k)), BooleanClause.Occur.SHOULD));
// }
// System.out.println("** Doing search.");
// query feature
LireFeature queryFeature = (LireFeature) FeatureRegistry.getClassForHashField(paramField).newInstance();
queryFeature.setByteArrayRepresentation(featureVector);
// get results:
// doSearch(req, rsp, searcher, paramField, paramRows, termFilter, new MatchAllDocsQuery(), queryFeature);
String idFilter = req.getParams().get("filter");
doSearch(req, rsp, req.getSearcher(), paramField, paramRows, termFilter, createQuery(hashes, paramField, numberOfQueryTerms, idFilter), queryFeature);
}
/**
* Actual search implementation based on (i) hash based retrieval and (ii) feature based re-ranking.
*
* @param rsp
* @param searcher
* @param hashFieldName the hash field name
* @param maximumHits
* @param terms
* @param queryFeature
* @throws IOException
* @throws IllegalAccessException
* @throws InstantiationException
*/
private void doSearch(SolrQueryRequest req, SolrQueryResponse rsp, SolrIndexSearcher searcher, String hashFieldName, int maximumHits, List<Term> terms, Query query, LireFeature queryFeature) throws IOException, IllegalAccessException, InstantiationException {
// temp feature instance
LireFeature tmpFeature = queryFeature.getClass().newInstance();
// Taking the time of search for statistical purposes.
time = System.currentTimeMillis();
Filter filter = null;
// if the request contains a filter:
if (req.getParams().get("fq")!=null) {
// only filters with [<field>:<value> ]+ are supported
StringTokenizer st = new StringTokenizer(req.getParams().get("fq"), " ");
LinkedList<Term> filterTerms = new LinkedList<Term>();
while (st.hasMoreElements()) {
String[] tmpToken = st.nextToken().split(":");
if (tmpToken.length>1) {
filterTerms.add(new Term(tmpToken[0], tmpToken[1]));
}
}
if (filterTerms.size()>0)
filter = new TermsFilter(filterTerms);
}
TopDocs docs; // with query only.
if (filter == null) {
docs = searcher.search(query, numberOfCandidateResults);
} else {
docs = searcher.search(query, filter, numberOfCandidateResults);
}
// TopDocs docs = searcher.search(query, new TermsFilter(terms), numberOfCandidateResults); // with TermsFilter and boosting by simple query
// TopDocs docs = searcher.search(new ConstantScoreQuery(new TermsFilter(terms)), numberOfCandidateResults); // just with TermsFilter
time = time == 0 ? 0 : System.currentTimeMillis() - time;
rsp.add("RawDocsCount", docs.scoreDocs.length + "");
rsp.add("RawDocsSearchTime", time + "");
// re-rank
time = System.currentTimeMillis();
TreeSet<SimpleResult> resultScoreDocs = new TreeSet<SimpleResult>();
float maxDistance = -1f;
float tmpScore;
String featureFieldName = FeatureRegistry.getFeatureFieldName(hashFieldName);
// iterating and re-ranking the documents.
BinaryDocValues binaryValues = MultiDocValues.getBinaryValues(searcher.getIndexReader(), featureFieldName); // *** #
BytesRef bytesRef;// = new BytesRef();
for (int i = 0; i < docs.scoreDocs.length; i++) {
// using DocValues to retrieve the field values ...
bytesRef = binaryValues.get(docs.scoreDocs[i].doc);
tmpFeature.setByteArrayRepresentation(bytesRef.bytes, bytesRef.offset, bytesRef.length);
// Getting the document from the index.
// This is the slow step based on the field compression of stored fields.
// tmpFeature.setByteArrayRepresentation(d.getBinaryValue(name).bytes, d.getBinaryValue(name).offset, d.getBinaryValue(name).length);
tmpScore = queryFeature.getDistance(tmpFeature);
//if(tmpScore > 20) continue;
if (resultScoreDocs.size() < maximumHits) { // todo: There's potential here for a memory saver, think of a clever data structure that can do the trick without creating a new SimpleResult for each result.
resultScoreDocs.add(new SimpleResult(tmpScore, searcher.doc(docs.scoreDocs[i].doc), docs.scoreDocs[i].doc));
maxDistance = resultScoreDocs.last().getDistance();
} else if (tmpScore < maxDistance) {
// if it is nearer to the sample than at least one of the current set:
// remove the last one ...
resultScoreDocs.remove(resultScoreDocs.last());
// add the new one ...
resultScoreDocs.add(new SimpleResult(tmpScore, searcher.doc(docs.scoreDocs[i].doc), docs.scoreDocs[i].doc));
// and set our new distance border ...
maxDistance = resultScoreDocs.last().getDistance();
}
}
// System.out.println("** Creating response.");
time = time == 0 ? 0 : System.currentTimeMillis() - time;
rsp.add("ReRankSearchTime", time + "");
LinkedList list = new LinkedList();
for (Iterator<SimpleResult> it = resultScoreDocs.iterator(); it.hasNext(); ) {
SimpleResult result = it.next();
HashMap m = new HashMap(2);
m.put("d", result.getDistance());
// add fields as requested:
if (req.getParams().get("fl") == null) {
m.put("id", result.getDocument().get("id"));
if (result.getDocument().get("title") != null)
m.put("title", result.getDocument().get("title"));
} else {
String fieldsRequested = req.getParams().get("fl");
if (fieldsRequested.contains("score")) {
m.put("score", result.getDistance());
}
if (fieldsRequested.contains("*")) {
// all fields
for (IndexableField field : result.getDocument().getFields()) {
String tmpField = field.name();
if (result.getDocument().getFields(tmpField).length > 1) {
m.put(result.getDocument().getFields(tmpField)[0].name(), result.getDocument().getValues(tmpField));
} else if (result.getDocument().getFields(tmpField).length > 0) {
m.put(result.getDocument().getFields(tmpField)[0].name(), result.getDocument().getFields(tmpField)[0].stringValue());
}
}
} else {
StringTokenizer st;
if (fieldsRequested.contains(","))
st = new StringTokenizer(fieldsRequested, ",");
else
st = new StringTokenizer(fieldsRequested, " ");
while (st.hasMoreElements()) {
String tmpField = st.nextToken();
if (result.getDocument().getFields(tmpField).length > 1) {
m.put(result.getDocument().getFields(tmpField)[0].name(), result.getDocument().getValues(tmpField));
} else if (result.getDocument().getFields(tmpField).length > 0) {
m.put(result.getDocument().getFields(tmpField)[0].name(), result.getDocument().getFields(tmpField)[0].stringValue());
}
}
}
}
// m.put(field, result.getDocument().get(field));
// m.put(field.replace("_ha", "_hi"), result.getDocument().getBinaryValue(field));
list.add(m);
}
rsp.add("docs", list);
// rsp.add("Test-name", "Test-val");
}
@Override
public String getDescription() {
return "LIRE Request Handler to add images to an index and search them. Search images by id, by url and by extracted features.";
}
@Override
public String getSource() {
return "http://lire-project.net";
}
@Override
public NamedList<Object> getStatistics() {
// Change stats here to get an insight in the admin console.
NamedList<Object> statistics = super.getStatistics();
statistics.add("Number of Requests", countRequests);
return statistics;
}
private BooleanQuery createQuery(int[] hashes, String paramField, double size, String idFilter) {
List<Integer> hList = new ArrayList<Integer>(hashes.length);
try{
PrintWriter writer = new PrintWriter("/tmp/!.txt", "UTF-8");
for (int i = 0; i < hashes.length; i++) {
if(docCount.get(hashes[i]) > 0 && docCount.get(hashes[i]) < 300000000){
writer.println(hashes[i]);
writer.println(docCount.get(hashes[i]));
}
}
writer.close();
} catch (IOException ex) {
}
for (int i = 0; i < hashes.length; i++) {
if(docCount.get(hashes[i]) > 0 && docCount.get(hashes[i]) < 300000000){
hList.add(hashes[i]);
}
}
//remove duplicates
Set<Integer> hs = new HashSet<>();
hs.addAll(hList);
hList.clear();
hList.addAll(hs);
Comparator<Integer> compareByFrequency = new Comparator<Integer>() {
@Override
public int compare(Integer h1, Integer h2) {
int v1 = docCount.get(h1);
int v2 = docCount.get(h2);
return v1 - v2;
}
};
Collections.sort(hList,compareByFrequency);
try{
PrintWriter writer = new PrintWriter("/tmp/!!.txt", "UTF-8");
for (int i = 0; i < hList.size(); i++) {
//writer.println(hList.get(i));
writer.println(docCount.get(hList.get(i)));
}
writer.close();
} catch (IOException ex) {
}
int[] offsets1 = {0,1,2,3,1,2,1,0,0,0};
int[] offsets2 = {1,2,3,4,3,4,4,2,3,4};
BooleanQuery query = new BooleanQuery();
int offset = (int)(size-1);
if(offset < 4){
query.add(new BooleanClause(new TermQuery(new Term(paramField, Integer.toHexString(hList.get(offsets1[offset])))), BooleanClause.Occur.MUST));
query.add(new BooleanClause(new TermQuery(new Term(paramField, Integer.toHexString(hList.get(offsets2[offset])))), BooleanClause.Occur.MUST));
if(!idFilter.equals("*")){
query.add(new BooleanClause(new WildcardQuery(new Term("id", idFilter)), BooleanClause.Occur.MUST));
}
}
else{
offset = offset - 4;
query.add(new BooleanClause(new TermQuery(new Term(paramField, Integer.toHexString(hList.get(offset)))), BooleanClause.Occur.MUST));
if(!idFilter.equals("*")){
query.add(new BooleanClause(new WildcardQuery(new Term("id", idFilter)), BooleanClause.Occur.MUST));
}
}
return query;
}
/**
* This is used to create a TermsFilter ... should be used to select in the index based on many terms.
* We just need to integrate a minimum query too, else we'd not get the appropriate results.
*
* @param hashes
* @param paramField
* @return
*/
private List<Term> createTermFilter(int[] hashes, String paramField) {
LinkedList<Term> termFilter = new LinkedList<Term>();
for (int i = 0; i < hashes.length; i++) {
// be aware that the hashFunctionsFileName of the field must match the one you put the hashes in before.
termFilter.add(new Term(paramField, Integer.toHexString(hashes[i])));
}
return termFilter;
}
}
# Install Solr as service
wget http://archive.apache.org/dist/lucene/solr/6.6.0/solr-6.6.0.tgz
tar xzf solr-6.6.0.tgz solr-6.6.0/bin/install_solr_service.sh --strip-components=2
# The install_solr_service.sh script may detect the wrong version of your Linux distribution
# If that happens, edit the script around line 68
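# One way to inspect that part of the script before editing it (line range is approximate):
sed -n '55,85p' install_solr_service.sh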
sudo bash ./install_solr_service.sh solr-6.6.0.tgz
sudo systemctl start solr
sudo -u solr /opt/solr/bin/solr create -c new_core
# Setup project
sudo dnf install java-1.8.0-openjdk-devel
git clone git@github.com:soruly/liresolr.git
cd liresolr || exit
./gradlew distForSolr
# If gradle fails with org.apache.http.ssl.SSLInitializationException
# /usr/lib/jvm/java-1.8.0-openjdk-1.8.0.131-1.b12.fc25.x86_64/jre/lib/security/cacerts (No such file or directory)
# Link your system cacerts to the missing java lib
# ln -s /etc/pki/java/cacerts /usr/lib/jvm/java-1.8.0-openjdk-1.8.0.131-1.b12.fc25.x86_64/jre/lib/security
# Copy compiled plugin
sudo cp ./dist/*.jar /opt/solr/server/solr-webapp/webapp/WEB-INF/lib/
sudo systemctl restart solr
# Set up the Solr core
sudo -u solr vim /var/solr/data/new_core/conf/solrconfig.xml
sudo -u solr vim /var/solr/data/new_core/conf/managed-schema
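# The LIRE handler from the copied jar has to be registered in solrconfig.xml;
# a minimal sketch (handler name matches the /lireq URL used below, class name
# taken from LireRequestHandler.java above):
#   <requestHandler name="/lireq" class="net.semanticmetadata.lire.solr.LireRequestHandler" />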
# Delete everything in solr core
curl http://localhost:8983/solr/new_core/update -H "Content-Type: text/xml" --data-binary "<delete><query>*:*</query></delete>"
# Load xml files to solr core
curl http://localhost:8983/solr/new_core/update -H "Content-Type: text/xml" --data-binary @outfile.xml
# Commit changes
curl http://localhost:8983/solr/new_core/update -H "Content-Type: text/xml" --data-binary "<commit/>"
# Get random results
http://localhost:8983/solr/new_core/lireq?indent=on&wt=json&rows=10
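# Same query from the shell (quote the URL so the shell does not interpret the &):
curl "http://localhost:8983/solr/new_core/lireq?indent=on&wt=json&rows=10"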