Flawed Attempt At Random Weighted Sampling with Elasticsearch (http://eemp.io/2017/07/29/weighted-random-sampling-in-elasticsearch/)
#!/bin/bash
#
# Demo of weighted random sampling with an Elasticsearch function_score
# query: build a small index, sample one hit many times, and report how
# often each document was picked.

# Base URL of the Elasticsearch node.
SERVER=http://localhost:9200
# Index that is dropped and recreated on every run.
INDEX=random-samples
# Mapping type used in the document and search URLs.
MAPPING=docs
# curl flags shared by the setup requests: silent mode, plus a newline
# written after each response so log entries do not run together.
# This value is expanded UNQUOTED by its users so that it splits into the
# two words `-sw` and `\n`; it must therefore not contain quote characters
# (they would be handed to curl literally and end up in the log).
CURL_OPTS="-sw \n"
# File collecting the responses of the index-setup requests.
ACTIONS_LOG=random-samples-index.log
# Number of sample documents to index (ids 1..TOTAL_DOCUMENTS).
TOTAL_DOCUMENTS=10
# Weight applied by the function_score query to featured documents.
FEATURED_FLAG_WEIGHT=1.33
# Recreate the demo index from scratch. Every response is collected in
# $ACTIONS_LOG: the first request truncates the log, later ones append.
# $CURL_OPTS is expanded unquoted on purpose (it holds several curl flags).
# Requests that carry a JSON body declare it via Content-Type, which newer
# Elasticsearch versions require.

# drop index
# shellcheck disable=SC2086
curl $CURL_OPTS -XDELETE "$SERVER/$INDEX" > "$ACTIONS_LOG"

# create index
# shellcheck disable=SC2086
curl $CURL_OPTS -XPUT "$SERVER/$INDEX" \
  -H 'Content-Type: application/json' \
  -d '{
  "settings": {
    "index": {
      "number_of_shards": 1,
      "number_of_replicas": 0
    }
  }
}' >> "$ACTIONS_LOG"

# create mapping
# shellcheck disable=SC2086
curl $CURL_OPTS -XPUT "$SERVER/$INDEX/_mapping/$MAPPING" \
  -H 'Content-Type: application/json' \
  -d '{
  "properties": {
    "name": {
      "type": "text"
    },
    "featured_flag": {
      "type": "boolean"
    },
    "undesired_flag": {
      "type": "boolean"
    }
  }
}' >> "$ACTIONS_LOG"
# create sample docs
# All responses are appended to $ACTIONS_LOG. $CURL_OPTS is expanded
# unquoted on purpose (it holds several curl flags).

# feature the first doc
# shellcheck disable=SC2086
curl $CURL_OPTS -XPUT "$SERVER/$INDEX/$MAPPING/1" \
  -H 'Content-Type: application/json' \
  -d '{
  "name": "Document 001",
  "featured_flag": true
}' >> "$ACTIONS_LOG"

# second doc is featured too, but also flagged as undesired
# shellcheck disable=SC2086
curl $CURL_OPTS -XPUT "$SERVER/$INDEX/$MAPPING/2" \
  -H 'Content-Type: application/json' \
  -d '{
  "name": "Document 002",
  "featured_flag": true,
  "undesired_flag": true
}' >> "$ACTIONS_LOG"

# remaining docs are plain (not featured); the id is spliced into the name
for ((doc_id = 3; doc_id <= TOTAL_DOCUMENTS; doc_id++)); do
  # shellcheck disable=SC2086
  curl $CURL_OPTS -XPUT "$SERVER/$INDEX/$MAPPING/${doc_id}" \
    -H 'Content-Type: application/json' \
    -d '{
  "name": "Document 00'"$doc_id"'",
  "featured_flag": false
}' >> "$ACTIONS_LOG"
done

# refresh to allow immediate queries
# The refresh API is documented for POST (and GET). The request used to be
# sent as PUT, which is presumably why an extra `sleep 1` was needed to wait
# for the periodic refresh; with POST the explicit refresh does the job.
# shellcheck disable=SC2086
curl $CURL_OPTS -XPOST "$SERVER/$INDEX/_refresh" >> "$ACTIONS_LOG"
#######################################
# Print the JSON body of the sampling query.
# Documents with undesired_flag=true are excluded; the remaining ones are
# scored by a function_score clause combining random_score (all documents)
# with a fixed weight for documents whose featured_flag is true.
# Globals:   FEATURED_FLAG_WEIGHT (read)
# Arguments: $1 - value for "explain": true or false
# Outputs:   the request body on stdout
#######################################
sampling_query_body() {
  local explain=$1
  cat <<JSON
{
  "query": {
    "bool": {
      "must_not": {
        "term": {
          "undesired_flag": true
        }
      },
      "should": [
        {
          "function_score": {
            "functions": [
              {
                "filter": {
                  "match_all": {}
                },
                "random_score": {}
              },
              {
                "filter": {
                  "term": {
                    "featured_flag": true
                  }
                },
                "weight": ${FEATURED_FLAG_WEIGHT}
              }
            ]
          }
        }
      ]
    }
  },
  "explain": ${explain},
  "size": 1
}
JSON
}

#######################################
# Run the sampling query once and print the raw search response.
# Globals:   SERVER, INDEX, MAPPING (read)
# Arguments: $1 - value for "explain": true or false
# Outputs:   the search response on stdout
#######################################
run_sampling_query() {
  curl -s -XPOST "$SERVER/$INDEX/$MAPPING/_search" \
    -H 'Content-Type: application/json' \
    -d "$(sampling_query_body "$1")"
}

# sample results: one query with scoring explanation, pretty-printed
run_sampling_query true | jq "." > explanation.json
# alternative, keeping only the explanation of the top hit:
# run_sampling_query true | jq ".hits.hits[0]._explanation" > explanation.json

# query: draw 1000 samples and record the name of each top hit, one per
# line. The redirection truncates results.json, which clears the results
# of a previous run.
for ((sample = 1; sample <= 1000; sample++)); do
  run_sampling_query false | jq ".hits.hits[0]._source.name"
done > results.json
#######################################
# Report how many times each sample document was recorded as a top hit.
# Arguments: $1 - file with one JSON-quoted document name per line
#            $2 - number of documents (ids 1..$2)
# Outputs:   one "Total 00<id> Documents: <count>" line per document
#######################################
count_document_hits() {
  local results_file=$1 total=$2
  local doc_id hits
  for ((doc_id = 1; doc_id <= total; doc_id++)); do
    # Match the whole line, quotes included: a substring match would count
    # "Document 0010" as a hit for "Document 001" as well.
    # grep -c exits non-zero when the count is 0; that is not an error here.
    hits=$(grep -cxF -- "\"Document 00${doc_id}\"" "$results_file" || true)
    echo "Total 00${doc_id} Documents: ${hits:-0}"
  done
}

# Nothing is tallied when no document count is configured.
count_document_hits results.json "${TOTAL_DOCUMENTS:-0}"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment