Skip to content

Instantly share code, notes, and snippets.

@zxteloiv
Last active December 14, 2015 08:19
Show Gist options
  • Save zxteloiv/5056921 to your computer and use it in GitHub Desktop.
Save zxteloiv/5056921 to your computer and use it in GitHub Desktop.
Sampling data: simple random sampling of lines (Python script) or qv-weighted random sampling of queries (C++ program).
#!/usr/local/bin/python
import sys
import random
def GetRand(src, N, dest):
    """Write N distinct, randomly chosen lines of ``src`` to ``dest``.

    Lines are chosen by position, so two lines with identical text may both
    be selected. Trailing whitespace is stripped from every line and each
    selected line is written followed by a single newline.

    Args:
        src: path of the input text file.
        N: number of lines to sample. Values larger than the number of lines
            in ``src`` are capped to that number; values <= 0 produce an
            empty output file.
        dest: path of the output file (overwritten).
    """
    # load src query
    with open(src, "r") as fp:
        query_list = [line.rstrip() for line in fp]

    # Asking for more lines than exist can never be satisfied without
    # repeats, so cap the request instead of searching forever.
    sample_size = max(0, min(N, len(query_list)))

    # get N random lines and output
    with open(dest, "w") as fp:
        for query in random.sample(query_list, sample_size):
            fp.write(query + "\n")
# Command-line entry point: src_file sample_number_of_lines output_file
if __name__ == "__main__":
    if len(sys.argv) == 4:
        GetRand(sys.argv[1], int(sys.argv[2]), sys.argv[3])
    else:
        # Parenthesised single-argument print behaves the same on Python 2
        # and Python 3.
        print("Usage: %s src_file sample_number_of_lines output_file" % sys.argv[0])
#include <time.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include <vector>
#include <map>
#include <algorithm>
#include <iostream>
using namespace std;
// Splits a "<qv>\t<query>" record at its first tab character.
//
// line   - the record to split.
// qv_num - receives atoi() of the text before the first tab.
// query  - receives everything after the first tab (later tabs included).
//
// Returns true on success. Returns false, leaving qv_num and query
// untouched, when the line contains no tab.
bool GetQvAndQuery(const std::string &line,
                   int32_t &qv_num,
                   std::string &query)
{
    const size_t tab_pos = line.find('\t');
    const bool has_tab = (tab_pos != std::string::npos);
    if (has_tab) {
        const std::string qv_text = line.substr(0, tab_pos);
        qv_num = atoi(qv_text.c_str());
        query = line.substr(tab_pos + 1);
    }
    return has_tab;
}
// One input record: a query string and its qv count.
struct Node
{
    std::string m_query;  // query text
    int32_t m_qv;         // qv count parsed from the input line
    int64_t m_sum;        // running total of m_qv up to and including this node;
                          // filled in by the SelectQueryBy* functions
    // Zero the counters so a default-constructed Node never carries
    // indeterminate values when it is copied into a container.
    Node() : m_query(), m_qv(0), m_sum(0) {}
};
// Ordering predicate on Node::m_query: true exactly when lnode's query
// compares strictly greater than rnode's under strcmp(). The comparison is
// done on C strings, so bytes after an embedded NUL are ignored.
bool NodeCmp(const Node &lnode, const Node &rnode){
    const int order = strcmp(lnode.m_query.c_str(), rnode.m_query.c_str());
    return order > 0;
}
// Loads "<qv>\t<query>" records from file_name and appends them to
// query_vector.
//
// Trailing spaces, tabs and line terminators are stripped from every line.
// A line is skipped when it has no tab, when fewer than two characters
// remain after stripping, or when its query is longer than 25 bytes.
// After reading, the resulting vector size is printed to stdout.
//
// If the file cannot be opened, a message is written to stderr and
// query_vector is left unchanged.
void StatOneFile(const std::string &file_name,
                 std::vector<Node> &query_vector){
    FILE* fr = fopen(file_name.c_str(), "r");
    if (!fr) {
        fprintf(stderr, "StatOneFile: cannot open %s\n", file_name.c_str());
        return ;
    }
    const size_t kMaxQueryLength = 25;
    char chunk[1024];
    while(fgets(chunk, sizeof(chunk), fr)){
        size_t chunk_len = strlen(chunk);
        std::string line(chunk, chunk_len);
        // A chunk that fills the whole buffer without reaching a newline is
        // only the beginning of a longer line. Keep reading so the remainder
        // is not mistaken for a separate record.
        while (chunk_len == sizeof(chunk) - 1 && chunk[chunk_len - 1] != '\n'
               && fgets(chunk, sizeof(chunk), fr)){
            chunk_len = strlen(chunk);
            line.append(chunk, chunk_len);
        }
        const size_t last = line.find_last_not_of(" \t\n\r");
        if (last == std::string::npos || last == 0){
            continue;  // blank line, or too short to hold "<qv>\t<query>"
        }
        line.erase(last + 1);
        int32_t qv_num;
        std::string query;
        if (!GetQvAndQuery(line, qv_num, query) || query.length() > kMaxQueryLength) {
            continue;
        }
        Node node;
        node.m_qv = qv_num;
        node.m_sum = 0;  // computed later by the selection functions
        node.m_query = query;
        query_vector.push_back(node);
    }
    std::cout << "size:" << query_vector.size() << std::endl;
    fclose(fr);
}
int32_t SearchIndex(std::vector<Node> &query_vector,
int64_t num){
int32_t start = 0;
int32_t end = query_vector.size() - 1;
int mid = -1;
while(end >= start ){
mid = (end + start) / 2;
int32_t mid_start = query_vector[mid].m_sum - query_vector[mid].m_qv + 1;
int32_t mid_end = query_vector[mid].m_sum;
if (num < mid_start) end = mid - 1;
else if (num > mid_end) start = mid + 1;
else break;;
}
return mid;
}
void SelectQueryByQV(std::vector<Node> &query_vector,
int32_t qv_num,
std::map<std::string, int32_t> &select_query){
select_query.clear();
if (query_vector.size() == 0){
return ;
}
// set start, end
query_vector[0].m_sum = query_vector[0].m_qv;
for (uint32_t i = 1; i < query_vector.size(); ++i){
query_vector[i].m_sum = query_vector[i-1].m_sum + query_vector[i].m_qv;
}
int64_t total_num = query_vector[query_vector.size() - 1].m_sum;
printf("size:%d total_num:%ld\n", (int)query_vector.size(), total_num);
for (int32_t i = 0; i < qv_num; ++i){
int64_t num = rand() % (total_num + 1);
int32_t index = SearchIndex(query_vector, num);
if (index == -1){
continue;
}
std::string query = query_vector[index].m_query;
std::map<std::string, int32_t>::iterator map_it = select_query.find(query);
if (map_it == select_query.end()){
select_query.insert(std::make_pair(query, 1));
}
else {
select_query[query] += 1;
}
}
}
void SelectQueryByQueryCount(std::vector<Node> &query_vector,
int32_t query_num,
std::map<std::string, int32_t> &select_query){
select_query.clear();
if (query_vector.size() == 0){
return ;
}
// set start, end
query_vector[0].m_sum = query_vector[0].m_qv;
for (uint32_t i = 1; i < query_vector.size(); ++i){
query_vector[i].m_sum = query_vector[i-1].m_sum + query_vector[i].m_qv;
}
int64_t total_num = query_vector[query_vector.size() - 1].m_sum;
printf("size:%d total_num:%ld\n", (int)query_vector.size(), total_num);
while (select_query.size() < query_num) {
int64_t num = rand() % (total_num + 1);
int32_t index = SearchIndex(query_vector, num);
if (index == -1){
continue;
}
std::string query = query_vector[index].m_query;
std::map<std::string, int32_t>::iterator map_it = select_query.find(query);
if (map_it == select_query.end()){
select_query.insert(std::make_pair(query, 1));
}
else {
select_query[query] += 1;
}
}
}
// Writes every entry of select_query to file_name as "<count>\t<query>\n",
// in the map's key order. The file is created or truncated.
//
// If the file cannot be opened, or closing it reports an error, a message is
// written to stderr.
void SaveToFile(std::map<std::string, int32_t> &select_query,
                const std::string &file_name){
    FILE * fw = fopen(file_name.c_str(), "w");
    if (!fw) {
        fprintf(stderr, "SaveToFile: cannot open %s for writing\n", file_name.c_str());
        return ;
    }
    for (std::map<std::string, int32_t>::const_iterator it = select_query.begin();
         it != select_query.end(); ++it){
        fprintf(fw, "%d\t%s\n", it->second, it->first.c_str());
    }
    // Buffered output is flushed by fclose, so a failed write can surface here.
    if (fclose(fw) != 0) {
        fprintf(stderr, "SaveToFile: error while writing %s\n", file_name.c_str());
    }
}
int main(int argc , char* argv []) {
if (5 != argc) {
std::cout << "Usage: "
<< argv[0]
<< " qv_file(qv query) 0|1 sample_qv_number output-file" << std::endl
<< "Notes: 0 means to select by qv count, and 1 by query count" << std::endl;
return 0;
}
srand((unsigned)time(NULL));
std::vector<Node> query_vector;
StatOneFile(argv[1], query_vector);
std::map<std::string, int32_t> select_query;
if (0 == atoi(argv[2])) {
SelectQueryByQV(query_vector, atoi(argv[3]), select_query);
} else if (1 == atoi(argv[2])) {
SelectQueryByQueryCount(query_vector, atoi(argv[3]), select_query);
}
SaveToFile(select_query, argv[4]);
return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment