ElasticSearch - filename search using nGram

# Delete previous tests
 
curl -XDELETE 'http://127.0.0.1:9200/files/?pretty=1'
 
 
# Setup
 
curl -XPUT 'http://127.0.0.1:9200/files/?pretty=1' -d '
{
    "settings" : {
        "analysis" : {
            "analyzer" : {
                "filename_analyzer" : {
                    "tokenizer" : "filename_tokenizer",
                    "filter" : ["lowercase"]
                }
            },
            "tokenizer" : {
                "filename_tokenizer" : {
                    "type" : "nGram",
                    "max_gram" : 100,
                    "min_gram" : 2
                }
            }
        }
    },
    "mappings" : {
        "file" : {
            "properties" : {
                "filename" : {
                    "type" : "string",
                    "analyzer" : "filename_analyzer"
                }
            }
        }
    }
}
'
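
# (Not part of the original test, but as a quick sanity check the analyzer can
# be inspected with the _analyze API, assuming the index above was created
# successfully:)

curl -XGET 'http://127.0.0.1:9200/files/_analyze?pretty=1&text=My_file_2012.01.12.txt&analyzer=filename_analyzer'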
 
 
# Insert some documents:
 
curl -X POST 'http://localhost:9200/files/file' -d '{ "filename" : "My_file_2012.01.12.txt" }'
curl -X POST 'http://localhost:9200/files/file' -d '{ "filename" : "My_file_2012.01.05.txt" }'
curl -X POST 'http://localhost:9200/files/file' -d '{ "filename" : "My_file_2012.05.01.txt" }'
curl -X POST 'http://localhost:9200/files/file' -d '{ "filename" : "My_file_2012.08.27.txt" }'
curl -X POST 'http://localhost:9200/files/file' -d '{ "filename" : "My_file_2012.12.12.txt" }'
curl -X POST 'http://localhost:9200/files/file' -d '{ "filename" : "My_file_2011.12.12.txt" }'
curl -X POST 'http://localhost:9200/files/file' -d '{ "filename" : "file_01_2012.09.09.txt" }'
curl -X POST 'http://localhost:9200/files/_refresh'
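
# (Optional sanity check, not in the original gist: the _count API should
# report the seven documents indexed above.)

curl -XGET 'http://127.0.0.1:9200/files/file/_count?pretty=1'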
 
 
# Find all documents except the last one (which contains "2011"):
 
curl -XGET 'http://127.0.0.1:9200/files/file/_search?pretty=1' -d '
{
    "query" : {
        "span_near" : {
            "clauses" : [
                { "span_term" : { "filename" : "2012" } }
            ],
            "slop" : 100,
            "in_order" : true
        }
    }
}
'
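
# (For comparison only, not part of the original test: since "2012" is itself
# one of the nGram tokens emitted by the analyzer, a plain term query should
# return the same six documents.)

curl -XGET 'http://127.0.0.1:9200/files/file/_search?pretty=1' -d '
{
    "query" : {
        "term" : { "filename" : "2012" }
    }
}
'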
 
 
# Find all documents which contain "12" followed by "01" (the first three
# documents):
 
curl -XGET 'http://127.0.0.1:9200/files/file/_search?pretty=1' -d '
{
    "query" : {
        "span_near" : {
            "clauses" : [
                { "span_term" : { "filename" : "12" } },
                { "span_term" : { "filename" : "01" } }
            ],
            "slop" : 100,
            "in_order" : true
        }
    }
}
'
 
 
# BUT this search does not work:
 
curl -XGET 'http://127.0.0.1:9200/files/file/_search?pretty=1' -d '
{
    "query" : {
        "span_near" : {
            "clauses" : [
                { "span_term" : { "filename" : "2012" } },
                { "span_term" : { "filename" : "01" } }
            ],
            "slop" : 100,
            "in_order" : true
        }
    }
}
'
 
 
# However with "in_order" set to false it works, but it returns the unwanted
# "file_01_2012.09.09.txt":
 
curl -XGET 'http://127.0.0.1:9200/files/file/_search?pretty=1' -d '
{
    "query" : {
        "span_near" : {
            "clauses" : [
                { "span_term" : { "filename" : "2012" } },
                { "span_term" : { "filename" : "01" } }
            ],
            "slop" : 100,
            "in_order" : false
        }
    }
}
'
 
 
# I think the "2012 followed by 01" query does not work because of how the
# nGram tokenizer assigns each token's position, which the "span_near" query
# uses to determine the order of the terms:
 
curl -XGET 'http://127.0.0.1:9200/files/_analyze?pretty=1&text=My_file_2012.01.05.txt&analyzer=filename_analyzer'
...
{
    "token" : "01", ------------> "01" inside "2012"
    "start_offset" : 9,
    "end_offset" : 11,
    "type" : "word",
    "position" : 10
},
{
    "token" : "12",
    "start_offset" : 10,
    "end_offset" : 12,
    "type" : "word",
    "position" : 11
},
...
{
    "token" : "01", ------------> "01" inside ".01."
    "start_offset" : 13,
    "end_offset" : 15,
    "type" : "word",
    "position" : 14
}
...
{
    "token" : "2012",
    "start_offset" : 8,
    "end_offset" : 12,
    "type" : "word",
    "position" : 50
}
...
 
# The nGram tokenizer simply increments the position for each token it emits:
# first it generates the tokens with two characters, which get "position" values
# from 1 to 21, then the tokens with three characters (positions 22 to 41), and
# so on.
# So when searching for "2012" and "01", the position values 50 and 10 are used,
# which are not in order, and the files are not found. But when searching for
# "12" and "01", the values 11 and 14 are used, which are in order.
