Last active
December 14, 2015 08:19
-
-
Save zxteloiv/5056921 to your computer and use it in GitHub Desktop.
Sampling data: simple random sampling, or qv-weighted random sampling
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/local/bin/python | |
import sys | |
import random | |
def GetRand(src, N, dest):
    """Write a simple random sample of the lines of ``src`` to ``dest``.

    Every line of ``src`` is loaded with trailing whitespace stripped, then
    ``N`` distinct line positions are drawn without replacement and written
    to ``dest`` one per line, in the order they were drawn.  Lines are picked
    by position, so identical lines at different positions may each appear.

    Args:
        src: path of the input text file.
        N: number of lines to sample.  Values larger than the number of
            lines in ``src`` are clamped to that number (the whole file is
            written in random order); values below zero are treated as zero.
        dest: path of the output file; it is created or truncated.
    """
    # Load the source queries; the context manager closes the file even if
    # reading fails part-way through.
    with open(src, "r") as fp:
        query_list = [line.rstrip() for line in fp]

    # Clamp the request so sampling without replacement is always possible.
    # A rejection loop would spin forever once every position had been used.
    sample_size = max(0, min(N, len(query_list)))
    chosen = random.sample(query_list, sample_size)

    # Emit the sampled lines.
    with open(dest, "w") as fp:
        for query in chosen:
            fp.write(query + "\n")
if __name__ == "__main__":
    # Command line: <src_file> <sample_number_of_lines> <output_file>
    if len(sys.argv) == 4:
        GetRand(sys.argv[1], int(sys.argv[2]), sys.argv[3])
    else:
        # Parenthesised single-argument form prints the same text under both
        # Python 2 and Python 3 (the bare print statement is a SyntaxError on 3).
        print("Usage: %s src_file sample_number_of_lines output_file" % sys.argv[0])
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <time.h> | |
#include <stdlib.h> | |
#include <stdint.h> | |
#include <string.h> | |
#include <stdio.h> | |
#include <vector> | |
#include <map> | |
#include <algorithm> | |
#include <iostream> | |
using namespace std; | |
// Splits a "<qv>\t<query>" record at its first tab character.
//
// line   - the record to split.
// qv_num - receives atoi() of the text before the first tab.
// query  - receives everything after the first tab (later tabs included).
//
// Returns true when a tab was found; returns false and leaves both outputs
// untouched when the line contains no tab.
bool GetQvAndQuery(const std::string &line,
                   int32_t &qv_num,
                   std::string &query)
{
    const std::string::size_type tab_at = line.find('\t');
    if (tab_at != std::string::npos) {
        const std::string count_field = line.substr(0, tab_at);
        qv_num = atoi(count_field.c_str());
        query = line.substr(tab_at + 1);
        return true;
    }
    return false;
}
// One "<qv>\t<query>" record, plus the running total used for weighted sampling.
struct Node
{
    std::string m_query;  // query text (the part of the record after the tab)
    int32_t m_qv;         // count parsed from the record; acts as the sampling weight
    int64_t m_sum;        // inclusive running total of m_qv up to and including this
                          // node; filled in by the SelectQueryBy* functions
    // Zero the numeric members so a default-constructed Node never exposes
    // indeterminate values.
    Node() : m_query(""), m_qv(0), m_sum(0) {}
};
bool NodeCmp(const Node &lnode, const Node &rnode){ | |
if (strcmp(lnode.m_query.c_str(), rnode.m_query.c_str()) <= 0) { | |
return false; | |
} | |
else { | |
return true; | |
} | |
} | |
void StatOneFile(const std::string &file_name, | |
std::vector<Node> &query_vector){ | |
FILE* fr = fopen(file_name.c_str(), "r"); | |
if (!fr) { | |
return ; | |
} | |
char line[1024]; | |
while(fgets(line,1024, fr)){ | |
int32_t len = strlen(line); | |
int32_t j = len - 1; | |
while(j >= 0) { | |
if (line[j] == ' ' || line[j] == '\t' || line[j] == '\n' || line[j] == '\r') { | |
line[j] = '\0'; | |
j --; | |
} | |
else break; | |
} | |
if (j > 0){ | |
int32_t qv_num; | |
std::string query; | |
bool is = GetQvAndQuery(std::string(line, 0, j+1), qv_num, query); | |
if (!is || query.length() > 25) { | |
continue; | |
} | |
Node node; | |
node.m_qv = qv_num; | |
node.m_query = query; | |
query_vector.push_back(node); | |
} | |
} | |
std::cout << "size:" << query_vector.size() << std::endl; | |
fclose(fr); | |
} | |
int32_t SearchIndex(std::vector<Node> &query_vector, | |
int64_t num){ | |
int32_t start = 0; | |
int32_t end = query_vector.size() - 1; | |
int mid = -1; | |
while(end >= start ){ | |
mid = (end + start) / 2; | |
int32_t mid_start = query_vector[mid].m_sum - query_vector[mid].m_qv + 1; | |
int32_t mid_end = query_vector[mid].m_sum; | |
if (num < mid_start) end = mid - 1; | |
else if (num > mid_end) start = mid + 1; | |
else break;; | |
} | |
return mid; | |
} | |
void SelectQueryByQV(std::vector<Node> &query_vector, | |
int32_t qv_num, | |
std::map<std::string, int32_t> &select_query){ | |
select_query.clear(); | |
if (query_vector.size() == 0){ | |
return ; | |
} | |
// set start, end | |
query_vector[0].m_sum = query_vector[0].m_qv; | |
for (uint32_t i = 1; i < query_vector.size(); ++i){ | |
query_vector[i].m_sum = query_vector[i-1].m_sum + query_vector[i].m_qv; | |
} | |
int64_t total_num = query_vector[query_vector.size() - 1].m_sum; | |
printf("size:%d total_num:%ld\n", (int)query_vector.size(), total_num); | |
for (int32_t i = 0; i < qv_num; ++i){ | |
int64_t num = rand() % (total_num + 1); | |
int32_t index = SearchIndex(query_vector, num); | |
if (index == -1){ | |
continue; | |
} | |
std::string query = query_vector[index].m_query; | |
std::map<std::string, int32_t>::iterator map_it = select_query.find(query); | |
if (map_it == select_query.end()){ | |
select_query.insert(std::make_pair(query, 1)); | |
} | |
else { | |
select_query[query] += 1; | |
} | |
} | |
} | |
void SelectQueryByQueryCount(std::vector<Node> &query_vector, | |
int32_t query_num, | |
std::map<std::string, int32_t> &select_query){ | |
select_query.clear(); | |
if (query_vector.size() == 0){ | |
return ; | |
} | |
// set start, end | |
query_vector[0].m_sum = query_vector[0].m_qv; | |
for (uint32_t i = 1; i < query_vector.size(); ++i){ | |
query_vector[i].m_sum = query_vector[i-1].m_sum + query_vector[i].m_qv; | |
} | |
int64_t total_num = query_vector[query_vector.size() - 1].m_sum; | |
printf("size:%d total_num:%ld\n", (int)query_vector.size(), total_num); | |
while (select_query.size() < query_num) { | |
int64_t num = rand() % (total_num + 1); | |
int32_t index = SearchIndex(query_vector, num); | |
if (index == -1){ | |
continue; | |
} | |
std::string query = query_vector[index].m_query; | |
std::map<std::string, int32_t>::iterator map_it = select_query.find(query); | |
if (map_it == select_query.end()){ | |
select_query.insert(std::make_pair(query, 1)); | |
} | |
else { | |
select_query[query] += 1; | |
} | |
} | |
} | |
// Writes select_query to file_name as "<count>\t<query>\n" lines, in the
// map's key order.  The file is created or truncated.
//
// If the file cannot be opened, or closing it reports a write error, a
// message is written to stderr; the function itself returns nothing.
void SaveToFile(std::map<std::string, int32_t> &select_query,
                const std::string &file_name){
    FILE * fw = fopen(file_name.c_str(), "w");
    if (!fw) {
        fprintf(stderr, "SaveToFile: cannot open %s for writing\n", file_name.c_str());
        return ;
    }
    for (std::map<std::string, int32_t>::const_iterator it = select_query.begin();
         it != select_query.end(); ++it) {
        fprintf(fw, "%d\t%s\n", static_cast<int>(it->second), it->first.c_str());
    }
    // fclose() flushes buffered output, so this is where a full disk shows up.
    if (fclose(fw) != 0) {
        fprintf(stderr, "SaveToFile: error while writing %s\n", file_name.c_str());
    }
}
int main(int argc , char* argv []) { | |
if (5 != argc) { | |
std::cout << "Usage: " | |
<< argv[0] | |
<< " qv_file(qv query) 0|1 sample_qv_number output-file" << std::endl | |
<< "Notes: 0 means to select by qv count, and 1 by query count" << std::endl; | |
return 0; | |
} | |
srand((unsigned)time(NULL)); | |
std::vector<Node> query_vector; | |
StatOneFile(argv[1], query_vector); | |
std::map<std::string, int32_t> select_query; | |
if (0 == atoi(argv[2])) { | |
SelectQueryByQV(query_vector, atoi(argv[3]), select_query); | |
} else if (1 == atoi(argv[2])) { | |
SelectQueryByQueryCount(query_vector, atoi(argv[3]), select_query); | |
} | |
SaveToFile(select_query, argv[4]); | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment