Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save eemp/30421b784fc7f761c3e890b3dbd14d46 to your computer and use it in GitHub Desktop.
Save eemp/30421b784fc7f761c3e890b3dbd14d46 to your computer and use it in GitHub Desktop.
Flawed Attempt At Random Weighted Sampling with Elasticsearch (http://eemp.io/2017/07/29/weighted-random-sampling-in-elasticsearch/)
#!/bin/bash
# Weighted random sampling demo against a local Elasticsearch node: rebuilds a
# small index, runs the same randomly-scored search many times and prints how
# often each document came back first.

# Elasticsearch endpoint, plus the index and mapping type the demo works on.
SERVER=http://localhost:9200
INDEX=random-samples
MAPPING=docs
# Options shared by the setup requests: -s silences curl's progress output and
# -w '\n' appends a newline after each response so the log gets one response
# per line.
# The value is a plain string that call sites expand UNQUOTED, relying on word
# splitting to turn it into the two arguments `-sw` and `\n`. It must therefore
# not contain quote characters of its own: quotes stored inside a variable are
# not re-parsed by the shell and would be handed to curl literally (the
# previous value "-sw '\n'" made curl print stray ' characters).
CURL_OPTS='-sw \n'
# File that collects the responses of all index-setup requests.
ACTIONS_LOG=random-samples-index.log
# Number of sample documents to index (ids 1..TOTAL_DOCUMENTS).
TOTAL_DOCUMENTS=10
# function_score weight applied to documents whose featured_flag is true.
FEATURED_FLAG_WEIGHT=1.33
# Rebuild the index from scratch so every run starts from the same state.
# $CURL_OPTS is expanded unquoted on purpose: it packs several curl options
# into one string and relies on word splitting.
# Requests that carry a JSON body declare it via Content-Type; without the
# header curl labels the body as form data, which newer Elasticsearch
# versions refuse.

# drop index (the single '>' also truncates the log left by a previous run)
curl $CURL_OPTS -XDELETE "$SERVER/$INDEX" > "$ACTIONS_LOG"
# create index with one shard and no replicas
curl $CURL_OPTS -XPUT "$SERVER/$INDEX" \
  -H 'Content-Type: application/json' -d '{
"settings": {
"index": {
"number_of_shards": 1,
"number_of_replicas": 0
}
}
}' >> "$ACTIONS_LOG"
# create mapping: a text name plus the two boolean flags the query filters on
curl $CURL_OPTS -XPUT "$SERVER/$INDEX/_mapping/$MAPPING" \
  -H 'Content-Type: application/json' -d '{
"properties": {
"name": {
"type": "text"
},
"featured_flag": {
"type": "boolean"
},
"undesired_flag": {
"type": "boolean"
}
}
}' >> "$ACTIONS_LOG"
# create sample docs
# $CURL_OPTS is expanded unquoted on purpose (one string, several options).
# Each request declares its JSON body via Content-Type; without the header
# curl labels the body as form data, which newer Elasticsearch versions refuse.

# doc 1: featured
curl $CURL_OPTS -XPUT "$SERVER/$INDEX/$MAPPING/1" \
  -H 'Content-Type: application/json' -d '{
"name": "Document 001",
"featured_flag": true
}' >> "$ACTIONS_LOG"
# doc 2: featured as well, but also flagged undesired
curl $CURL_OPTS -XPUT "$SERVER/$INDEX/$MAPPING/2" \
  -H 'Content-Type: application/json' -d '{
"name": "Document 002",
"featured_flag": true,
"undesired_flag": true
}' >> "$ACTIONS_LOG"
# docs 3..TOTAL_DOCUMENTS: plain, non-featured documents.
# The name is the fixed prefix "Document 00" followed by the id (so id 10
# becomes "Document 0010"); the prefix is kept as-is because the reporting
# step at the end of the script looks the names up in exactly this form.
for ((doc_id = 3; doc_id <= TOTAL_DOCUMENTS; doc_id++)); do
  curl $CURL_OPTS -XPUT "$SERVER/$INDEX/$MAPPING/${doc_id}" \
    -H 'Content-Type: application/json' -d '{
"name": "Document 00'"$doc_id"'",
"featured_flag": false
}' >> "$ACTIONS_LOG"
done
# refresh to allow immediate queries
# The refresh API is exposed for POST (and GET), not PUT. Sent as PUT the
# request was not a valid refresh call, which is why a `sleep 1` used to be
# needed here to wait for the periodic background refresh. With the correct
# verb the call makes the freshly indexed documents searchable before it
# returns, so no sleep is required.
# $CURL_OPTS is expanded unquoted on purpose (one string, several options).
curl $CURL_OPTS -XPOST "$SERVER/$INDEX/_refresh" >> "$ACTIONS_LOG"
#######################################
# Print the search request body used for sampling.
# The query excludes documents with undesired_flag=true and scores the rest
# with a function_score that combines a random_score (applied to every
# document) with an extra weight for documents whose featured_flag is true.
# Globals:   FEATURED_FLAG_WEIGHT (read)
# Arguments: $1 - JSON literal for the "explain" option (true or false)
# Outputs:   the JSON request body on stdout
#######################################
search_body() {
  local explain=$1
  # Unquoted heredoc delimiter so the two variables below are expanded; the
  # JSON itself contains nothing else the shell would interpret.
  cat <<EOF
{
"query": {
"bool": {
"must_not": {
"term": {
"undesired_flag": true
}
},
"should": [
{
"function_score": {
"functions": [
{
"filter": {
"match_all": {}
},
"random_score": {}
},
{
"filter": {
"term": {
"featured_flag": true
}
},
"weight": ${FEATURED_FLAG_WEIGHT}
}
]
}
}
]
}
},
"explain": ${explain},
"size": 1
}
EOF
}

# clear results from prev run
rm -f results.json

# sample results: run the query once with explain enabled and keep the whole
# response for inspection
# (use jq ".hits.hits[0]._explanation" instead to keep only the explanation)
curl -s -XPOST "$SERVER/$INDEX/$MAPPING/_search" \
  -H 'Content-Type: application/json' \
  -d "$(search_body true)" | jq "." > explanation.json

# query: repeat the search and record the name of the top hit of every run,
# one JSON-quoted name per line
sample_runs=1000
# The body is identical for every run, so build it once outside the loop.
sampling_query=$(search_body false)
for ((run = 1; run <= sample_runs; run++)); do
  curl -s -XPOST "$SERVER/$INDEX/$MAPPING/_search" \
    -H 'Content-Type: application/json' \
    -d "$sampling_query" | jq ".hits.hits[0]._source.name" >> results.json
done
#######################################
# Print, for every sample document, how many times it was the top hit.
# Arguments: $1 - results file holding one JSON-quoted document name per line
#                 (the form jq prints string values in)
#            $2 - highest document number to report on
# Outputs:   one "Total 00<n> Documents: <count>" line per document on stdout
#######################################
count_document_hits() {
  local results_file=$1
  local total=$2
  local iter count
  for ((iter = 1; iter <= total; iter++)); do
    # Match the complete line as a fixed string, surrounding quotes included.
    # A substring match would let "Document 001" also count every
    # "Document 0010" line and inflate the first document's total.
    count=$(grep -c -x -F -- "\"Document 00${iter}\"" "$results_file")
    # grep prints no count when the file cannot be read; report that as 0.
    echo "Total 00${iter} Documents: ${count:-0}"
  done
}

count_document_hits results.json "${TOTAL_DOCUMENTS:-0}"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment