Last active
April 13, 2017 06:41
-
-
Save antonmks/2c55230b0cc0785f45da31611774ee31 to your computer and use it in GitHub Desktop.
Query 2 { DISTINCT, AVG, MAX(char3) }
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdint.h>
#include <time.h>

#include <algorithm>
#include <cstddef>
#include <cstdlib>
#include <cstring>
#include <ctime>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <iterator>
#include <limits>
#include <map>
#include <numeric>
#include <sstream>
#include <string>
#include <vector>

#include <thrust/copy.h>
#include <thrust/device_vector.h>
#include <thrust/extrema.h>
#include <thrust/for_each.h>
#include <thrust/functional.h>
#include <thrust/host_vector.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/reduce.h>
#include <thrust/sequence.h>
#include <thrust/sort.h>
#include <thrust/unique.h>
using namespace std; | |
// Returns the size in bytes of the file named `filename`.
//
// The file is opened in binary mode and positioned at its end, so the stream
// position equals the byte count. If the file cannot be opened, or the
// position cannot be queried, a message is printed and the process terminates
// with a non-zero status so that calling scripts can detect the failure.
uint64_t filesize(const char *filename) {
  std::ifstream in(filename, std::ios::binary | std::ios::ate);
  if (!in) {
    std::cout << "Could not open file " << filename << std::endl;
    std::exit(EXIT_FAILURE);
  }
  // tellg() reports failure as -1; converting that straight to uint64_t would
  // silently yield an enormous size.
  const std::streamoff size = in.tellg();
  if (size < 0) {
    std::cout << "Could not determine size of file " << filename << std::endl;
    std::exit(EXIT_FAILURE);
  }
  return static_cast<uint64_t>(size);
}
// Functor that fills a table of record pointers: entry `i` of `char_pos` is
// set to the address of the i-th fixed-width record inside one contiguous
// buffer that starts at `first_pos`.
struct increase_pointers {
  char **char_pos;    // output table; needs one slot per index passed to operator()
  char *first_pos;    // address of record 0
  std::size_t stride; // distance in bytes between the starts of two records

  // `_stride` defaults to 9, the record pitch this program has always used
  // (presumably an 8-character value plus a one-byte line terminator --
  // confirm against the layout of the input file).
  increase_pointers(char **_char_pos, char *_first_pos,
                    std::size_t _stride = 9)
      : char_pos(_char_pos), first_pos(_first_pos), stride(_stride) {}

  template <typename IndexType>
  __host__ __device__ void operator()(const IndexType &i) const {
    // Widen the index before multiplying: with a 32-bit IndexType the product
    // stride * i would wrap once the byte offset exceeds 4 GiB.
    char_pos[i] = first_pos + stride * static_cast<std::size_t>(i);
  }
};
// Row filter implementing the WHERE clause: for index i, res[i] becomes true
// when time6[i] lies in the inclusive range [time6_from[0], time6_to[0]] and
// bool5[i] equals bool5_val[0]; otherwise res[i] becomes false.
struct check_records {
  const unsigned int *time6;
  const char *bool5;
  bool *res;
  const unsigned int *time6_from;
  const unsigned int *time6_to;
  const char *bool5_val;

  check_records(const unsigned int *_time6, const char *_bool5, bool *_res,
                const unsigned int *_time6_from, const unsigned int *_time6_to,
                const char *_bool5_val)
      : time6(_time6), bool5(_bool5), res(_res), time6_from(_time6_from),
        time6_to(_time6_to), bool5_val(_bool5_val) {}

  template <typename IndexType>
  __host__ __device__ void operator()(const IndexType &i) {
    // Bounds are single-element arrays; element 0 holds the limit.
    const bool after_start = time6[i] >= time6_from[0];
    const bool in_window = after_start && time6[i] <= time6_to[0];
    res[i] = in_window && bool5[i] == bool5_val[0];
  }
};
// "Less than" comparator for fixed-length strings that are not necessarily
// NUL-terminated. Exactly `len` bytes of each operand are examined (8 by
// default) and the result is true when t1 orders strictly before t2.
struct sort_str {
  unsigned int len; // number of bytes compared

  __host__ __device__ explicit sort_str(unsigned int _len = 8) : len(_len) {}

  __host__ __device__ bool operator()(const char *t1, const char *t2) const {
    for (unsigned int i = 0; i < len; i++) {
      // Compare as unsigned bytes: the signedness of plain char is
      // implementation-defined, so comparing char values directly would order
      // bytes >= 0x80 differently across platforms and differently from
      // memcmp.
      const unsigned char a = static_cast<unsigned char>(t1[i]);
      const unsigned char b = static_cast<unsigned char>(t2[i]);
      if (a != b)
        return a < b;
    }
    return false; // equal strings are not "less than" each other
  }
};
int main(int ac, char **av) { | |
unsigned int time6_from = 19000101, time6_to = 20300101; | |
char bool5_val = 0; | |
string usage = "Usage : query1 [-time6_from TIME6_FROM] [-time6_to TIME6_TO " | |
"] [-bool5 BOOL5]"; | |
if (ac == 1) { | |
cout << usage << endl; | |
exit(1); | |
}; | |
for (unsigned int i = 1; i < ac; i++) { | |
if (strcmp(av[i], "-time6_from") == 0) { | |
if (i + 1 < ac) { | |
time6_from = atoi(av[i + 1]); | |
i++; | |
} else { | |
cout << usage << endl; | |
exit(1); | |
}; | |
} | |
if (strcmp(av[i], "-time6_to") == 0) { | |
if (i + 1 < ac) { | |
time6_to = atoi(av[i + 1]); | |
i++; | |
} else { | |
cout << usage << endl; | |
exit(1); | |
}; | |
} else if (strcmp(av[i], "-bool5") == 0) { | |
if (i + 1 < ac) { | |
bool5_val = av[i + 1][0]; | |
i++; | |
} else { | |
cout << usage << endl; | |
exit(1); | |
}; | |
} | |
}; | |
string file_name = "col2.txt"; | |
std::fstream f(file_name, std::ios_base::in | ios::binary); | |
uint64_t key; | |
thrust::device_vector<uint64_t> id2; | |
std::vector<uint64_t> keys; | |
thrust::device_vector<bool> res; | |
cout << "Parsing columns " << endl; | |
if (f) { | |
while (f >> key) { | |
keys.push_back(key); | |
}; | |
id2.resize(keys.size()); | |
thrust::copy(keys.begin(), keys.end(), id2.begin()); | |
cout << "read " << file_name << endl; | |
} else { | |
cout << "Could not open file " << file_name << endl; | |
}; | |
file_name = "col6.txt"; | |
std::fstream f1(file_name, std::ios_base::in | ios::binary); | |
string date_string; | |
thrust::device_vector<unsigned int> time6; | |
std::vector<unsigned int> date_keys; | |
if (f1) { | |
while (f1 >> date_string) { | |
date_keys.push_back(std::stoi(date_string.substr(0, 4) + | |
date_string.substr(5, 2) + | |
date_string.substr(8, 2))); | |
}; | |
time6.resize(date_keys.size()); | |
thrust::copy(date_keys.begin(), date_keys.end(), time6.begin()); | |
cout << "read " << file_name << endl; | |
} else { | |
cout << "Could not open file " << file_name << endl; | |
}; | |
file_name = "col5.txt"; | |
std::fstream f2(file_name, std::ios_base::in | ios::binary); | |
char bool_char; | |
thrust::device_vector<char> bool5; | |
std::vector<char> bool_keys; | |
if (f2) { | |
while (f2 >> bool_char) { | |
bool_keys.push_back(bool_char); | |
}; | |
cout << "read " << file_name << endl; | |
bool5.resize(bool_keys.size()); | |
thrust::copy(bool_keys.begin(), bool_keys.end(), bool5.begin()); | |
} else { | |
cout << "Could not open file " << file_name << endl; | |
}; | |
file_name = "col3.txt"; | |
std::fstream f3(file_name, std::ios_base::in | ios::binary); | |
auto file_size = filesize(file_name.c_str()); | |
thrust::host_vector<char> file_buffer(file_size); | |
thrust::device_vector<char> char3; | |
if (f3) { | |
f3.read(file_buffer.data(), file_size); | |
char3.resize(file_size); | |
thrust::copy(file_buffer.begin(), file_buffer.end(), char3.begin()); | |
cout << "read " << file_name << endl; | |
} else { | |
cout << "Could not open file " << file_name << endl; | |
}; | |
// Now we have all 4 arrays in a device memory | |
std::clock_t start1 = std::clock(); | |
res.resize(bool_keys.size()); | |
// SQL WHERE condition check | |
thrust::device_vector<unsigned int> dev_time6_from(1); | |
thrust::device_vector<unsigned int> dev_time6_to(1); | |
thrust::device_vector<char> dev_bool5(1); | |
dev_time6_from[0] = time6_from; | |
dev_time6_to[0] = time6_to; | |
dev_bool5[0] = bool5_val; | |
thrust::counting_iterator<unsigned int> begin(0); | |
check_records ff( | |
(const unsigned int *)thrust::raw_pointer_cast(time6.data()), | |
(const char *)thrust::raw_pointer_cast(bool5.data()), | |
thrust::raw_pointer_cast(res.data()), | |
(const unsigned int *)thrust::raw_pointer_cast(dev_time6_from.data()), | |
(const unsigned int *)thrust::raw_pointer_cast(dev_time6_to.data()), | |
(const char *)thrust::raw_pointer_cast(dev_bool5.data())); | |
thrust::for_each(begin, begin + res.size(), ff); | |
time6.resize(0); | |
time6.shrink_to_fit(); | |
bool5.resize(0); | |
bool5.shrink_to_fit(); | |
// copy_if the results | |
thrust::device_vector<uint64_t> id2_cpy(res.size()); | |
auto w_count = thrust::copy_if(id2.begin(), id2.end(), res.begin(), | |
id2_cpy.begin(), thrust::identity<bool>()) - | |
id2_cpy.begin(); | |
// SQL DISTINCT | |
thrust::sort(id2_cpy.begin(), id2_cpy.begin() + w_count); | |
auto distinct_cnt = | |
thrust::unique(id2_cpy.begin(), id2_cpy.begin() + w_count) - | |
id2_cpy.begin(); | |
cout << "distinct count " << distinct_cnt << endl; | |
// SQL AVG | |
auto sum = thrust::reduce(id2_cpy.begin(), id2_cpy.end()); | |
cout << "AVG(id2) = " << sum / w_count | |
<< endl; // incorrect results because sum of ~100 million 20-digit | |
// integers is greater than a max of uint64_t | |
// please notice that the same overflow occurs with CH | |
// so it would be better to sum something smaller | |
// Not enough memory at this point | |
id2.resize(0); | |
id2.shrink_to_fit(); | |
id2_cpy.resize(0); | |
id2_cpy.shrink_to_fit(); | |
// MAX(char3) | |
thrust::host_vector<char> max_char(8); | |
thrust::device_vector<char *> char_pos(date_keys.size()); | |
increase_pointers func(thrust::raw_pointer_cast(char_pos.data()), | |
thrust::raw_pointer_cast(char3.data())); | |
thrust::for_each(begin, begin + char_pos.size(), func); | |
thrust::device_vector<char *> char_pos_res(w_count); | |
// apply WHERE conditions | |
thrust::copy_if(char_pos.begin(), char_pos.end(), res.begin(), | |
char_pos_res.begin(), thrust::identity<bool>()); | |
// thrust::sort(char_pos_res.begin(), char_pos_res.end(), sort_str()); | |
// //sorting all strings is too slow , so we use max_element | |
auto end = | |
thrust::max_element(char_pos_res.begin(), char_pos_res.end(), sort_str()); | |
char *device_str = *(end); | |
cudaMemcpy(max_char.data(), (void *)device_str, 8, cudaMemcpyDeviceToHost); | |
cout << "MAX(char3) = "; | |
for (int z = 0; z < 8; z++) | |
cout << max_char[z]; | |
cout << endl; | |
std::cout << "time " << ((std::clock() - start1) / (double)CLOCKS_PER_SEC) | |
<< '\n'; | |
return 0; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment