Last active
August 26, 2017 15:37
-
-
Save eemp/30421b784fc7f761c3e890b3dbd14d46 to your computer and use it in GitHub Desktop.
Flawed Attempt At Random Weighted Sampling with Elasticsearch (http://eemp.io/2017/07/29/weighted-random-sampling-in-elasticsearch/)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
#
# Builds a small Elasticsearch index of sample documents and then repeatedly
# runs a randomly scored, weighted search against it to count how often each
# document comes back as the top hit.

# Base URL of the Elasticsearch node.
SERVER=http://localhost:9200
# Index name and mapping type used for every request.
INDEX=random-samples
MAPPING=docs
# Options shared by the setup curl calls: -s hides progress output and -w '\n'
# appends a newline after each response. The value is expanded UNQUOTED at the
# call sites so that it word-splits into "-sw" and "\n"; it must therefore not
# contain literal quote characters, which would be handed to curl verbatim and
# end up in the log.
CURL_OPTS='-sw \n'
# File collecting the responses of the index setup requests.
ACTIONS_LOG=random-samples-index.log
# Number of sample documents to index (ids 1..TOTAL_DOCUMENTS).
TOTAL_DOCUMENTS=10
# Weight applied by the search to documents whose featured_flag is true.
FEATURED_FLAG_WEIGHT=1.33
# Recreate the index from scratch. Every response is written to $ACTIONS_LOG.
# $CURL_OPTS is intentionally left unquoted in each call so that it splits
# into separate curl options.

# drop index (this first write also truncates the log from a previous run)
# shellcheck disable=SC2086
curl $CURL_OPTS -XDELETE "$SERVER/$INDEX" > "$ACTIONS_LOG"

# create index
# The explicit Content-Type keeps the request acceptable to servers that
# reject JSON bodies sent with curl's default form-encoded content type.
# shellcheck disable=SC2086
curl $CURL_OPTS -XPUT -H 'Content-Type: application/json' "$SERVER/$INDEX" -d '{
  "settings": {
    "index": {
      "number_of_shards": 1,
      "number_of_replicas": 0
    }
  }
}' >> "$ACTIONS_LOG"

# create mapping
# shellcheck disable=SC2086
curl $CURL_OPTS -XPUT -H 'Content-Type: application/json' \
  "$SERVER/$INDEX/_mapping/$MAPPING" -d '{
  "properties": {
    "name": {
      "type": "text"
    },
    "featured_flag": {
      "type": "boolean"
    },
    "undesired_flag": {
      "type": "boolean"
    }
  }
}' >> "$ACTIONS_LOG"
# create sample docs
# Documents 1 and 2 are featured; document 2 is additionally flagged as
# undesired. All remaining documents are indexed as not featured.
# $CURL_OPTS is intentionally left unquoted so it splits into curl options.

# shellcheck disable=SC2086
curl $CURL_OPTS -XPUT -H 'Content-Type: application/json' \
  "$SERVER/$INDEX/$MAPPING/1" -d '{
  "name": "Document 001",
  "featured_flag": true
}' >> "$ACTIONS_LOG"

# shellcheck disable=SC2086
curl $CURL_OPTS -XPUT -H 'Content-Type: application/json' \
  "$SERVER/$INDEX/$MAPPING/2" -d '{
  "name": "Document 002",
  "featured_flag": true,
  "undesired_flag": true
}' >> "$ACTIONS_LOG"

for ((iter = 3; iter <= TOTAL_DOCUMENTS; iter++)); do
  # The name is the literal prefix "Document 00" followed by the id, so id 10
  # becomes "Document 0010". The result tally looks names up using the same
  # scheme, so it is kept as is.
  # shellcheck disable=SC2086
  curl $CURL_OPTS -XPUT -H 'Content-Type: application/json' \
    "$SERVER/$INDEX/$MAPPING/${iter}" -d '{
  "name": "Document 00'"$iter"'",
  "featured_flag": false
}' >> "$ACTIONS_LOG"
done
# refresh to allow immediate queries
# The refresh API is served for POST (and GET) requests; issuing it with PUT
# does not trigger a refresh, which is why an extra sleep used to be needed
# here. A successful refresh call returns once the indexed documents are
# searchable, so no additional wait is required.
# $CURL_OPTS is intentionally left unquoted so it splits into curl options.
# shellcheck disable=SC2086
curl $CURL_OPTS -XPOST "$SERVER/$INDEX/_refresh" >> "$ACTIONS_LOG"
#######################################
# Prints the JSON body of the sampling search request: documents with
# undesired_flag=true are excluded, every document gets a random score, and
# featured documents additionally receive the configured weight.
# Globals:   FEATURED_FLAG_WEIGHT (read)
# Arguments: $1 - value for the "explain" field (true or false)
# Outputs:   the request body on stdout
#######################################
build_sample_query() {
  local explain=$1
  cat <<EOF
{
  "query": {
    "bool": {
      "must_not": {
        "term": {
          "undesired_flag": true
        }
      },
      "should": [
        {
          "function_score": {
            "functions": [
              {
                "filter": {
                  "match_all": {}
                },
                "random_score": {}
              },
              {
                "filter": {
                  "term": {
                    "featured_flag": true
                  }
                },
                "weight": ${FEATURED_FLAG_WEIGHT}
              }
            ]
          }
        }
      ]
    }
  },
  "explain": ${explain},
  "size": 1
}
EOF
}

#######################################
# Sends the sampling search request and prints the raw JSON response.
# Globals:   SERVER, INDEX, MAPPING (read)
# Arguments: $1 - value for the "explain" field (true or false)
# Outputs:   the search response on stdout
#######################################
run_sample_query() {
  # The body is streamed on stdin; --data-binary sends it unmodified.
  build_sample_query "$1" \
    | curl -s -XPOST -H 'Content-Type: application/json' \
      --data-binary @- "$SERVER/$INDEX/$MAPPING/_search"
}

# sample results
run_sample_query true | jq "." > explanation.json
# To keep only the score explanation of the top hit, use instead:
# run_sample_query true | jq ".hits.hits[0]._explanation" > explanation.json

# query
# Run the search 1000 times and record the name of the top hit of each run,
# one JSON string per line. Redirecting the whole loop truncates results.json
# first, which also clears the results of a previous run.
for ((run = 1; run <= 1000; run++)); do
  run_sample_query false | jq ".hits.hits[0]._source.name"
done > results.json
#######################################
# Counts how many recorded top hits belong to exactly the given document.
# Each line of the results file is expected to hold one JSON string (the
# document name wrapped in double quotes). The whole line is compared, so
# "Document 001" is not also counted for "Document 0010".
# Arguments: $1 - document name, without the surrounding quotes
#            $2 - results file (default: results.json)
# Outputs:   the number of matching lines on stdout (0 if the file is
#            missing or unreadable)
#######################################
count_document_hits() {
  local doc_name=$1
  local results_file=${2:-results.json}
  if [[ ! -r "$results_file" ]]; then
    printf '0\n'
    return 0
  fi
  # -F: literal match, -x: whole line only, -c: print the count.
  # grep exits non-zero when the count is 0; that is not an error here.
  grep -c -F -x -- "\"${doc_name}\"" "$results_file" || true
}

# Print the number of times each sample document was the top hit.
for ((iter = 1; iter <= TOTAL_DOCUMENTS; iter++)); do
  doc="Document 00${iter}"
  count=$(count_document_hits "$doc")
  echo "Total 00${iter} Documents: $count"
done
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment