@Riyaaaaa · Last active December 3, 2015
#include <iostream>
#include <amp.h>

using namespace concurrency;

int main() {
    // Each value is one less than the character we want; the kernel adds 1,
    // so the program prints "Hello world".
    int v[11] = {'G', 'd', 'k', 'k', 'n', 31, 'v', 'n', 'q', 'k', 'c'};
    array_view<int> av(11, v);
    parallel_for_each(av.get_extent(), [=](index<1> idx) restrict(amp) {
        av[idx] += 1;
    });
    for (unsigned int i = 0; i < av.get_extent().size(); i++)
        std::cout << static_cast<char>(av(i));
    return 0;
}
#include <amp.h>
#include <iostream>
#include <algorithm>
#include <vector>

using concurrency::accelerator;

std::vector<accelerator> findAccelerators() {
    std::vector<accelerator> accels = accelerator::get_all();
    for (std::size_t i = 0; i < accels.size(); i++) {
        std::wcout << i << "th device = " << accels[i].get_description() << "\n";
    }
    // Remove the emulated accelerators (software adapters and the CPU fallback).
    accels.erase(std::remove_if(accels.begin(), accels.end(),
                     [](accelerator& accel) { return accel.get_is_emulated(); }),
                 accels.end());
    return accels;
}
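As a usage sketch, the filtered list can also drive accelerator selection process-wide. accelerator::set_default and get_device_path are standard C++ AMP calls; the surrounding main and its error handling are assumptions, not part of the original gist.

// Minimal sketch: make the first non-emulated accelerator the default.
int main() {
    auto accels = findAccelerators();
    if (accels.empty()) {
        std::cout << "no hardware accelerator found\n";
        return 1;
    }
    // set_default must run before the default accelerator is first used.
    accelerator::set_default(accels.front().get_device_path());
    return 0;
}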
0th device = NVIDIA GeForce GTX 670
1th device = Microsoft Basic Render Driver
2th device = Software Adapter
3th device = CPU accelerator
accelerator: NVIDIA GeForce GTX 670
version of the accelerator: 720896
memory: 1.98681 [GB]
is supporting double precision: yes
is attached to a display: yes
is supporting cpu shared memory: yes
4
1
5760/5760
1
----------------cpu calculation succeeded---------------
score 1.94933[s]
-------------------parallel calculation----------------
rows/cols 810/1440
----------------gpu calculation succeeded---------------
score 0.0314382[s]
void getAccelDescription(const accelerator& accel) {
    std::wcout << "accelerator: " << accel.get_description() << std::endl;
    // packed major/minor version (720896 == 0xB0000, i.e. 11.0)
    std::cout << "version of the accelerator: " << accel.get_version() << std::endl;
    // get_dedicated_memory() reports kilobytes
    std::cout << "memory: " << accel.get_dedicated_memory() / 1024. / 1000. << " [GB]" << std::endl;
    std::cout << "is supporting double precision: " << (accel.get_supports_double_precision() ? "yes" : "no") << std::endl;
    std::cout << "is attached to a display: " << (accel.get_has_display() ? "yes" : "no") << std::endl;
    std::cout << "is supporting cpu shared memory: " << (accel.get_supports_cpu_shared_memory() ? "yes" : "no") << std::endl;
}
std::vector<accelerator>::iterator getBiggestMemoryAccelerator(std::vector<accelerator>& accels) {
    return std::max_element(accels.begin(), accels.end(),
        [](const accelerator& lhs, const accelerator& rhs) {
            return lhs.get_dedicated_memory() < rhs.get_dedicated_memory();
        });
}
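A short usage sketch tying the three helpers together; this main is an assumption, written to reproduce the accelerator dump shown earlier.

// Sketch: enumerate devices, pick the one with the most dedicated memory,
// and print its properties.
int main() {
    auto accels = findAccelerators(); // keep the vector alive; the iterator points into it
    if (accels.empty()) return 1;
    getAccelDescription(*getBiggestMemoryAccelerator(accels));
    return 0;
}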
// From <amp.h> — the template signatures of array and array_view:
template <typename _Value_type, int _Rank>
class array;

template <typename _Value_type, int _Rank = 1>
class array_view;
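To illustrate the difference between the two (a minimal sketch; the buffer name and sizes are assumptions): concurrency::array owns a copy of the data in accelerator memory, while array_view merely wraps existing host memory and synchronizes on demand.

#include <amp.h>
#include <vector>

int main() {
    std::vector<int> host(8, 1);
    // array copies the data into accelerator memory and owns it.
    concurrency::array<int, 1> gpu_owned(8, host.begin(), host.end());
    // array_view wraps the host buffer; copies happen lazily, on access.
    concurrency::array_view<int, 1> wrapper(8, host);
    return 0;
}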
#include"amp.h"
#include<array>
#include<iostream>
template<class T,int dim,class F>
void accessArray(concurrency::array<T,dim>& vGArray,F&& function){
concurrency::array_view<T,dim> vGArrayView = vGArray; //concurrency::arrayのラッパーを作成
function(vGArrayView); //array_viewはcpu側からアクセス可能
}
template<class T, int dim>
std::unique_ptr<concurrency::array<T, dim>> createArray(const concurrency::accelerator& accel, int size) {
return std::make_unique<concurrency::array<T, dim>>(size, accel.get_default_view());
}
int main() {
    constexpr int dim = 1;
    const int size = 100;
    auto accels = findAccelerators(); // must outlive the iterator returned below
    concurrency::accelerator accel = *getBiggestMemoryAccelerator(accels);
    auto vGArray(createArray<int, dim>(accel, size));
    accessArray(*vGArray, [&](auto& _array) {
        for (int i = 0; i < size; i++) _array[i] = i;
    });
    return 0;
}
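The whole array can also be copied back to the host in one call. concurrency::copy is a standard C++ AMP function; this fragment is a sketch that would sit just before the return in the main above, and the vector name is an assumption.

// Sketch: copy the accelerator-side array back into host memory.
std::vector<int> host(size);
concurrency::copy(*vGArray, host.begin());
for (int value : host) std::cout << value << ",";
std::cout << std::endl;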
int main() {
    constexpr int dim = 1;
    const int size = 100;
    std::array<int, size> arr;
    auto accels = findAccelerators();
    concurrency::accelerator accel = *getBiggestMemoryAccelerator(accels);
    concurrency::extent<dim> ex;
    ex[0] = size;
    // array_view does not take iterators, so pass a raw pointer instead of begin()
    concurrency::array_view<int, dim> view(size, arr.data());
    parallel_for_each(accel.get_default_view(),
        ex,
        [=](concurrency::index<dim> gindex) restrict(amp) {
            view[gindex] = 114514; // an array_view can be captured by copy
        }
    );
    view.synchronize(); // synchronize back to the underlying memory
    for (int i = 0; i < size; i++) {
        std::cout << arr[i] << ",";
    }
    std::cout << std::endl;
    return 0;
}
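Since this kernel overwrites every element, the initial host-to-accelerator copy is wasted work. array_view::discard_data (a real C++ AMP call, used again in the convolution example below) tells the runtime to skip it; a minimal sketch of the same call site:

// Sketch: mark the view's current contents as garbage before the kernel runs,
// so the runtime skips the host-to-device copy.
view.discard_data();
parallel_for_each(accel.get_default_view(), ex,
    [=](concurrency::index<dim> gindex) restrict(amp) {
        view[gindex] = 114514;
    });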
// From <amp.h> — the template signature of index:
template <int _Rank>
class index;
template<class T, int Rank, typename... Args>
T& accessArrayByIndex(const concurrency::array_view<T, Rank>& a, Args... indexes) restrict(cpu, amp)
{
    // restrict(cpu, amp) so the helper is callable from host code and from inside kernels
    static_assert(sizeof...(indexes) == Rank, "number of indices is incorrect");
    concurrency::index<Rank> idx(indexes...);
    return a[idx];
}
int main(void) {
    constexpr int COLS = 6, ROWS = 4;
    std::array<std::array<float, COLS>, ROWS> data = {
        1, 2, 3, 4, 5, 6,
        7, 8, 9, 10, 11, 12,
        1, 2, 3, 4, 5, 6,
        7, 8, 9, 10, 11, 12
    };
    // the nested std::array rows are contiguous, so a pointer to the first element works
    concurrency::array_view<float, 2> data_view(ROWS, COLS, &data[0][0]);
    // print data_view[3][2] via index<2>; prints 9
    std::cout << accessArrayByIndex(data_view, 3, 2);
    std::cout << std::endl;
    return 0;
}
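Because the helper is restrict(cpu, amp), the same code path also works inside a kernel. A sketch reusing data_view from the main above; the doubling kernel itself is an assumption added for illustration.

// Sketch: call the same helper from device code to double every element.
concurrency::parallel_for_each(data_view.get_extent(),
    [=](concurrency::index<2> idx) restrict(amp) {
        accessArrayByIndex(data_view, idx[0], idx[1]) *= 2.0f;
    });
data_view.synchronize();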
#include"opencv_include.h" //これはopencvの環境構築用のヘッダです
#include"amp.h"
#include"index/tiled_index_modules.hpp"
#include<iostream>
#include<chrono>
void image_processing_test(concurrency::accelerator& accel)
{
cv::Mat input;
cv::Mat_<float> gray,gray_cpu;
input = cv::imread("image_middle.jpg",cv::IMREAD_GRAYSCALE);
input.convertTo(gray, CV_32FC1);
input.convertTo(gray_cpu, CV_32FC1);
for (int rows = 0; rows < input.rows; rows++) {
for (int cols = 0; cols < input.cols; cols++) {
gray.at<float>(rows, cols) /= 255.;
}
}
std::cout << gray.elemSize1() << std::endl;
std::cout << gray.channels() << std::endl;
std::cout << gray.step << "/" << gray.elemSize() * gray.cols << std::endl;
std::cout << gray.isContinuous() << std::endl;
constexpr int convolution_size = 15;
{
std::chrono::time_point<std::chrono::system_clock> now = std::chrono::system_clock::now();
for (int rows = 0; rows < gray.rows; rows++) {
for (int cols = 0; cols < gray.cols; cols++) {
int sum;
for (int y = -convolution_size; y <= convolution_size; y++) {
for (int x = -convolution_size; x <= convolution_size; x++) {
if (rows + y >= 0 && rows + y < gray.rows && cols + x >= 0 && cols + x < gray.cols)
sum += gray.data[(rows + y) * gray.step + (cols + x) * gray.elemSize()];
else sum += gray.data[rows * gray.step + cols * gray.elemSize()];
}
}
gray_cpu.data[rows * gray.step + cols * gray.elemSize()] = sum / pow(2 * convolution_size + 1, 2);
}
}
std::chrono::time_point<std::chrono::system_clock> after = std::chrono::system_clock::now();
std::cout << "----------------cpu calculation succeeded---------------" << std::endl;
std::chrono::duration<double> diff = after - now;
std::cout << "score " << diff.count() << "[s]" << std::endl;
}
    {
        std::chrono::time_point<std::chrono::system_clock> now = std::chrono::system_clock::now();
        auto result = convolutionCalculateAverage<float, convolution_size * 2, convolution_size * 2>(
            reinterpret_cast<float*>(gray.data), input.rows, input.cols, accel);
        std::chrono::time_point<std::chrono::system_clock> after = std::chrono::system_clock::now();
        std::cout << "----------------gpu calculation succeeded---------------" << std::endl;
        std::chrono::duration<double> diff = after - now;
        std::cout << "score " << diff.count() << "[s]" << std::endl;
        for (int rows = 0; rows < input.rows; rows++) {
            for (int cols = 0; cols < input.cols; cols++) {
                gray.at<float>(rows, cols) = result[rows * input.cols + cols];
            }
        }
    }
    cv::namedWindow("window", CV_WINDOW_AUTOSIZE);
    cv::namedWindow("window2", CV_WINDOW_AUTOSIZE);
    cv::imshow("window", gray);
    cv::imshow("window2", input);
    cv::waitKey(0);
}
template<typename T, int TILE_ROWS, int TILE_COLS>
std::unique_ptr<T[]> convolutionCalculateAverage(T* data, int rows, int cols, const concurrency::accelerator& accel)
{
    std::unique_ptr<T[]> average(new T[rows * cols]);
    concurrency::array_view<T, 2> data_view(rows, cols, data);
    concurrency::array_view<T, 2> average_view(rows, cols, average.get());
    std::cout << "\n-------------------parallel calculation-----------------" << std::endl;
    std::cout << "rows/cols " << rows << "/" << cols << std::endl;
    average_view.discard_data(); // output only: skip the copy to the accelerator
    parallel_for_each(
        data_view.get_extent().tile<TILE_ROWS, TILE_COLS>(),
        [=](concurrency::tiled_index<TILE_ROWS, TILE_COLS> idx) restrict(amp) {
            // stage the tile in tile_static memory; local[0] is the row, local[1] the column
            tile_static T nums[TILE_ROWS][TILE_COLS];
            nums[idx.local[0]][idx.local[1]] = data_view[idx.global];
            idx.barrier.wait(); // every thread in the tile must finish staging first
            T sum = 0;
            for (int i = 0; i < TILE_ROWS; i++) {
                for (int j = 0; j < TILE_COLS; j++) {
                    sum += nums[i][j];
                }
            }
            average_view[idx.global] = sum / static_cast<T>(TILE_ROWS * TILE_COLS);
        }
    );
    average_view.synchronize(); // copy the result back to host memory
    return average; // moved implicitly; std::move here would inhibit NRVO
}
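The kernel can also be exercised without OpenCV. A self-contained sketch under stated assumptions: the 64x64 size, 16x16 tile, and constant input are invented for the test, and the extent must be divisible by the tile dimensions, as tiling requires.

// Sketch: run the tiled average over a constant image; every output element
// should equal the input constant.
int main() {
    constexpr int tile = 16;        // 64 is divisible by 16
    const int rows = 64, cols = 64;
    std::vector<float> img(rows * cols, 2.0f);
    auto accels = findAccelerators();
    concurrency::accelerator accel = *getBiggestMemoryAccelerator(accels);
    auto avg = convolutionCalculateAverage<float, tile, tile>(img.data(), rows, cols, accel);
    std::cout << avg[0] << std::endl; // expected: 2
    return 0;
}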