@markharwood
markharwood / ExampleQuery.sh
Last active October 29, 2017 02:35
HelloWorld Graph client example
curl -XGET "http://localhost:9200/meetuprsvps/_xpack/_graph/_explore" -d'
{
"query": {
"term": {
"_all": "elasticsearch"
}
},
"vertices": [
{
"field": "member_id",
@markharwood
markharwood / gist:0cecd5019dbd5c4e90fc
Created September 23, 2014 19:28
Issue 7840 workaround
TransportClient esClient = new TransportClient(ImmutableSettings.settingsBuilder().put("cluster.name", clusterName).build());
// ======= Start hack======
Field f = esClient.getClass().getDeclaredField("injector");
f.setAccessible(true);
Injector injector = (Injector) f.get(esClient);
injector.createChildInjector(new SignificantTermsHeuristicModule());
// ======= End hack======
@markharwood
markharwood / ExampleQueryFileGenerator.py
Last active August 29, 2015 14:05
Timeout benchmarks
import gzip
import csv
import time
import datetime
import json
# This script reads a data file (car test results) and uses it to create a CSV file of query clauses for benchmarking.
# It creates mostly well-behaved queries and randomly throws in "problem" queries of various forms
# Each CSV row in the output is a set of clauses (query/agg/filter) and some metadata for reporting purposes.
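A minimal sketch of the approach the comments above describe (the function, field names, and query shapes here are assumptions for illustration, not the gist's actual code): emit mostly well-behaved term queries, randomly mixing in expensive "problem" queries such as leading-wildcard clauses.

```python
import csv
import random

def write_benchmark_rows(path, terms, rows=100, problem_rate=0.1):
    # Write a CSV of query clauses: mostly cheap term queries, with an
    # occasional deliberately slow "problem" query for timeout testing.
    with open(path, "w", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["query_clause", "query_type"])
        for _ in range(rows):
            term = random.choice(terms)
            if random.random() < problem_rate:
                # Leading wildcards force a full term-dictionary scan
                writer.writerow(['{"wildcard": {"model": "*%s*"}}' % term, "problem"])
            else:
                writer.writerow(['{"term": {"model": "%s"}}' % term, "normal"])
```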
@markharwood
markharwood / LoadMOTs
Created August 11, 2014 10:13
Script to load and geocode MOT data
import gzip
import csv
from elasticsearch import helpers
from elasticsearch.client import Elasticsearch
import time
csv.register_dialect('piper', delimiter='|', quoting=csv.QUOTE_NONE)
# See http://postcodepal.com/dbgen/postcode_areas_true-centroids.zip
pf=open('/Users/Mark/Documents/work/irdata/MOT/postcode_areas.csv')
@markharwood
markharwood / addTagUpdateScript
Created July 24, 2014 09:55
Example Groovy update script
//import org.apache.lucene.codecs.bloom.FuzzySet;
// Extract the doc source into a local variable
doc = ctx._source;
// Convert basic array into map for ease of manipulation
tagMap = doc.tags.collectEntries{[it.tag, it]};
// Patch the new tags into the data structure, adding one to a usercount
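The same list-to-map merge the Groovy script performs can be sketched in Python (field names follow the gist; the function name is an assumption): fold a list of tag entries into a dict keyed by tag, then bump `usercount` for each incoming tag.

```python
def add_tags(doc_tags, new_tags):
    # doc_tags: list of {"tag": ..., "usercount": ...} entries from _source
    tag_map = {t["tag"]: t for t in doc_tags}
    for tag in new_tags:
        # Create a fresh entry for unseen tags, then increment the count
        entry = tag_map.setdefault(tag, {"tag": tag, "usercount": 0})
        entry["usercount"] += 1
    return list(tag_map.values())

print(add_tags([{"tag": "java", "usercount": 2}], ["java", "groovy"]))
```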
@markharwood
markharwood / Kibana dash
Last active August 29, 2015 14:01
Scripts for analyzing Elasticsearch log files
{
"title": "eslogs",
"services": {
"query": {
"list": {
"0": {
"query": "",
"alias": "",
"color": "#7EB26D",
"id": 0,
@markharwood
markharwood / QueryBenchmarker
Created April 3, 2014 09:17
Precision/Recall measures for a query using aggs
//================================
// Here is a script for gathering the precision/recall stats for a query (see http://en.wikipedia.org/wiki/Precision_and_recall)
// A candidate classifier query's effectiveness is determined by counting hits on pre-classified content
// If we compute the F-measure we can potentially use it as the fitness function for a genetic algo that mutates our query
// (introducing phrases, minShouldMatch clauses etc) to move us towards our target goal of balancing precision/recall in our classifier.
//=================================
// Our candidate query for classifying documents in a category
var candidateQuery={ "terms": {"body": ["vs", "shr", "cts", "net", "revs", "note", "loss", "mths", "shrs", "avg", "profit"]}};
// Our filter criteria for identifying documents in our target category
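The F-measure mentioned above, computed from the same counts the script gathers, can be sketched as follows (function and parameter names are illustrative, not the gist's):

```python
def f_measure(true_positives, candidate_hits, category_size, beta=1.0):
    # precision: fraction of the candidate query's hits that are in-category
    precision = true_positives / candidate_hits if candidate_hits else 0.0
    # recall: fraction of the pre-classified category the query found
    recall = true_positives / category_size if category_size else 0.0
    if precision + recall == 0:
        return 0.0
    b2 = beta * beta
    return (1 + b2) * precision * recall / (b2 * precision + recall)

print(f_measure(80, 100, 160))  # precision 0.8, recall 0.5
```

With `beta=1` this is the balanced F1 score, a reasonable fitness function for the genetic-algorithm idea in the comments.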
@markharwood
markharwood / runme.sh
Created April 2, 2014 17:28
Rough Gist to aggregate nested values
#!/bin/sh
es_host="http://localhost:9200"
curl -XDELETE "$es_host/contests"
curl -XPUT "$es_host/contests" -d '
{
"settings": {
"index.number_of_replicas": 0,
"index.number_of_shards": 1,
"index.refresh_interval": -1
},
@markharwood
markharwood / Convert aggs results to KML
Created February 26, 2014 11:33
Crime anomalies using significant terms aggregation (coming in 1.1)
//This function was used in my test rig to convert elasticsearch results to a KML structure which is
// later fed to an iFrame wrapping the GoogleEarth plugin
var data=[];
var buckets=inData.aggregations.map.buckets;
function addCommas(nStr)
{
nStr += '';
//Groups a related set of terms, typically from the results of some auto-expansion,
// and provides the average DocFreq of the set in order to avoid Lucene's IDF ranking
// favouring the rarest interpretation, which is often a poor choice for auto-expanded
// terms e.g. the terms produced by a fuzzy query or trying alternative fields
class CommonIDFContext {
int commonDf = -1;
Map<Term, Integer> balancedDfs;
List<Term> commonTerms = new ArrayList<Term>();
public void add(Term unbalancedQueryTerm) {
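The core idea in `CommonIDFContext` can be sketched outside Lucene (names and the use of a simple mean are assumptions for illustration): give every term in an auto-expanded group the same blended document frequency, so IDF no longer boosts the rarest variant.

```python
def balance_dfs(term_dfs):
    # term_dfs: {term: raw_document_frequency} for one expanded group,
    # e.g. the variants produced by a fuzzy query on a single input term.
    # Returns each term mapped to the group's average df, so all variants
    # score with the same IDF instead of the rare typo winning.
    common_df = sum(term_dfs.values()) // len(term_dfs)
    return {term: common_df for term in term_dfs}

# e.g. a fuzzy expansion of "color": the rare misspelling no longer
# receives a huge IDF boost relative to the common spellings
print(balance_dfs({"color": 9000, "colour": 5000, "collor": 10}))
```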