@Riyaaaaa · Last active December 3, 2015
#include <iostream>
#include <amp.h>

using namespace concurrency;

int main() {
    // Each value is one less than the character we want; the kernel adds 1,
    // so the program prints "Hello world".
    int v[11] = {'G', 'd', 'k', 'k', 'n', 31, 'v', 'n', 'q', 'k', 'c'};
    array_view<int> av(11, v);
    parallel_for_each(av.get_extent(), [=](index<1> idx) restrict(amp) {
        av[idx] += 1;
    });
    for (unsigned int i = 0; i < av.get_extent().size(); i++)
        std::cout << static_cast<char>(av(i));
    return 0;
}
#include <amp.h>
#include <iostream>
#include <algorithm>
#include <vector>

using concurrency::accelerator;

std::vector<accelerator> findAccelerators() {
    std::vector<accelerator> accels = accelerator::get_all();
    for (std::size_t i = 0; i < accels.size(); i++) {
        std::wcout << i << "th device = " << accels[i].get_description() << "\n";
    }
    // Remove the emulated accelerators (software adapters and the CPU fallback).
    accels.erase(std::remove_if(accels.begin(), accels.end(),
                     [](accelerator& accel) { return accel.get_is_emulated(); }),
                 accels.end());
    return accels;
}
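As a usage sketch, the filtered list can also drive accelerator selection process-wide. accelerator::set_default and get_device_path are standard C++ AMP calls; the surrounding main and its error handling are assumptions, not part of the original gist.

// Minimal sketch: make the first non-emulated accelerator the default.
int main() {
    auto accels = findAccelerators();
    if (accels.empty()) {
        std::cout << "no hardware accelerator found\n";
        return 1;
    }
    // set_default must run before the default accelerator is first used.
    accelerator::set_default(accels.front().get_device_path());
    return 0;
}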
0th device = NVIDIA GeForce GTX 670
1th device = Microsoft Basic Render Driver
2th device = Software Adapter
3th device = CPU accelerator
accelerator: NVIDIA GeForce GTX 670
version of the accelerator: 720896
memory: 1.98681 [GB]
is supporting double precision: yes
is attached to a display: yes
is supporting cpu shared memory: yes
4
1
5760/5760
1
----------------cpu calculation succeeded---------------
score 1.94933[s]
-------------------parallel calculation----------------
rows/cols 810/1440
----------------gpu calculation succeeded---------------
score 0.0314382[s]
void getAccelDescription(const accelerator& accel) {
    std::wcout << "accelerator: " << accel.get_description() << std::endl;
    // packed major/minor version (720896 == 0xB0000, i.e. 11.0)
    std::cout << "version of the accelerator: " << accel.get_version() << std::endl;
    // get_dedicated_memory() reports kilobytes
    std::cout << "memory: " << accel.get_dedicated_memory() / 1024. / 1000. << " [GB]" << std::endl;
    std::cout << "is supporting double precision: " << (accel.get_supports_double_precision() ? "yes" : "no") << std::endl;
    std::cout << "is attached to a display: " << (accel.get_has_display() ? "yes" : "no") << std::endl;
    std::cout << "is supporting cpu shared memory: " << (accel.get_supports_cpu_shared_memory() ? "yes" : "no") << std::endl;
}
std::vector<accelerator>::iterator getBiggestMemoryAccelerator(std::vector<accelerator>& accels) {
    return std::max_element(accels.begin(), accels.end(),
        [](const accelerator& lhs, const accelerator& rhs) {
            return lhs.get_dedicated_memory() < rhs.get_dedicated_memory();
        });
}
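A short usage sketch tying the three helpers together; this main is an assumption, written to reproduce the accelerator dump shown earlier.

// Sketch: enumerate devices, pick the one with the most dedicated memory,
// and print its properties.
int main() {
    auto accels = findAccelerators(); // keep the vector alive; the iterator points into it
    if (accels.empty()) return 1;
    getAccelDescription(*getBiggestMemoryAccelerator(accels));
    return 0;
}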
// From <amp.h> — the template signatures of array and array_view:
template <typename _Value_type, int _Rank>
class array;

template <typename _Value_type, int _Rank = 1>
class array_view;
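To illustrate the difference between the two (a minimal sketch; the buffer name and sizes are assumptions): concurrency::array owns a copy of the data in accelerator memory, while array_view merely wraps existing host memory and synchronizes on demand.

#include <amp.h>
#include <vector>

int main() {
    std::vector<int> host(8, 1);
    // array copies the data into accelerator memory and owns it.
    concurrency::array<int, 1> gpu_owned(8, host.begin(), host.end());
    // array_view wraps the host buffer; copies happen lazily, on access.
    concurrency::array_view<int, 1> wrapper(8, host);
    return 0;
}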
#include"amp.h"
#include<array>
#include<iostream>
template<class T,int dim,class F>
void accessArray(concurrency::array<T,dim>& vGArray,F&& function){
concurrency::array_view<T,dim> vGArrayView = vGArray; //concurrency::arrayのラッパーを作成
function(vGArrayView); //array_viewはcpu側からアクセス可能
}
template<class T, int dim>
std::unique_ptr<concurrency::array<T, dim>> createArray(const concurrency::accelerator& accel, int size) {
return std::make_unique<concurrency::array<T, dim>>(size, accel.get_default_view());
}
int main() {
    constexpr int dim = 1;
    const int size = 100;
    auto accels = findAccelerators(); // must outlive the iterator returned below
    concurrency::accelerator accel = *getBiggestMemoryAccelerator(accels);
    auto vGArray(createArray<int, dim>(accel, size));
    accessArray(*vGArray, [&](auto& _array) {
        for (int i = 0; i < size; i++) _array[i] = i;
    });
    return 0;
}
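The whole array can also be copied back to the host in one call. concurrency::copy is a standard C++ AMP function; this fragment is a sketch that would sit just before the return in the main above, and the vector name is an assumption.

// Sketch: copy the accelerator-side array back into host memory.
std::vector<int> host(size);
concurrency::copy(*vGArray, host.begin());
for (int value : host) std::cout << value << ",";
std::cout << std::endl;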
int main() {
    constexpr int dim = 1;
    const int size = 100;
    std::array<int, size> arr;
    auto accels = findAccelerators();
    concurrency::accelerator accel = *getBiggestMemoryAccelerator(accels);
    concurrency::extent<dim> ex;
    ex[0] = size;
    // array_view does not take iterators, so pass a raw pointer instead of begin()
    concurrency::array_view<int, dim> view(size, arr.data());
    parallel_for_each(accel.get_default_view(),
        ex,
        [=](concurrency::index<dim> gindex) restrict(amp) {
            view[gindex] = 114514; // an array_view can be captured by copy
        }
    );
    view.synchronize(); // synchronize back to the underlying memory
    for (int i = 0; i < size; i++) {
        std::cout << arr[i] << ",";
    }
    std::cout << std::endl;
    return 0;
}
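Since this kernel overwrites every element, the initial host-to-accelerator copy is wasted work. array_view::discard_data (a real C++ AMP call, used again in the convolution example below) tells the runtime to skip it; a minimal sketch of the same call site:

// Sketch: mark the view's current contents as garbage before the kernel runs,
// so the runtime skips the host-to-device copy.
view.discard_data();
parallel_for_each(accel.get_default_view(), ex,
    [=](concurrency::index<dim> gindex) restrict(amp) {
        view[gindex] = 114514;
    });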
// From <amp.h> — the template signature of index:
template <int _Rank>
class index;
template<class T, int Rank, typename... Args>
T& accessArrayByIndex(const concurrency::array_view<T, Rank>& a, Args... indexes) restrict(cpu, amp)
{
    // restrict(cpu, amp) so the helper is callable from host code and from inside kernels
    static_assert(sizeof...(indexes) == Rank, "number of indices is incorrect");
    concurrency::index<Rank> idx(indexes...);
    return a[idx];
}
int main(void) {
    constexpr int COLS = 6, ROWS = 4;
    std::array<std::array<float, COLS>, ROWS> data = {
        1, 2, 3, 4, 5, 6,
        7, 8, 9, 10, 11, 12,
        1, 2, 3, 4, 5, 6,
        7, 8, 9, 10, 11, 12
    };
    // the nested std::array rows are contiguous, so a pointer to the first element works
    concurrency::array_view<float, 2> data_view(ROWS, COLS, &data[0][0]);
    // print data_view[3][2] via index<2>; prints 9
    std::cout << accessArrayByIndex(data_view, 3, 2);
    std::cout << std::endl;
    return 0;
}
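Because the helper is restrict(cpu, amp), the same code path also works inside a kernel. A sketch reusing data_view from the main above; the doubling kernel itself is an assumption added for illustration.

// Sketch: call the same helper from device code to double every element.
concurrency::parallel_for_each(data_view.get_extent(),
    [=](concurrency::index<2> idx) restrict(amp) {
        accessArrayByIndex(data_view, idx[0], idx[1]) *= 2.0f;
    });
data_view.synchronize();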
#include"opencv_include.h" //これはopencvの環境構築用のヘッダです
#include"amp.h"
#include"index/tiled_index_modules.hpp"
#include<iostream>
#include<chrono>
void image_processing_test(concurrency::accelerator& accel)
{
cv::Mat input;
cv::Mat_<float> gray,gray_cpu;
input = cv::imread("image_middle.jpg",cv::IMREAD_GRAYSCALE);
input.convertTo(gray, CV_32FC1);
input.convertTo(gray_cpu, CV_32FC1);
for (int rows = 0; rows < input.rows; rows++) {
for (int cols = 0; cols < input.cols; cols++) {
gray.at<float>(rows, cols) /= 255.;
}
}
std::cout << gray.elemSize1() << std::endl;
std::cout << gray.channels() << std::endl;
std::cout << gray.step << "/" << gray.elemSize() * gray.cols << std::endl;
std::cout << gray.isContinuous() << std::endl;
constexpr int convolution_size = 15;
{
std::chrono::time_point<std::chrono::system_clock> now = std::chrono::system_clock::now();
for (int rows = 0; rows < gray.rows; rows++) {
for (int cols = 0; cols < gray.cols; cols++) {
int sum;
for (int y = -convolution_size; y <= convolution_size; y++) {
for (int x = -convolution_size; x <= convolution_size; x++) {
if (rows + y >= 0 && rows + y < gray.rows && cols + x >= 0 && cols + x < gray.cols)
sum += gray.data[(rows + y) * gray.step + (cols + x) * gray.elemSize()];
else sum += gray.data[rows * gray.step + cols * gray.elemSize()];
}
}
gray_cpu.data[rows * gray.step + cols * gray.elemSize()] = sum / pow(2 * convolution_size + 1, 2);
}
}
std::chrono::time_point<std::chrono::system_clock> after = std::chrono::system_clock::now();
std::cout << "----------------cpu calculation succeeded---------------" << std::endl;
std::chrono::duration<double> diff = after - now;
std::cout << "score " << diff.count() << "[s]" << std::endl;
}
    {
        std::chrono::time_point<std::chrono::system_clock> now = std::chrono::system_clock::now();
        auto result = convolutionCalculateAverage<float, convolution_size * 2, convolution_size * 2>(
            reinterpret_cast<float*>(gray.data), input.rows, input.cols, accel);
        std::chrono::time_point<std::chrono::system_clock> after = std::chrono::system_clock::now();
        std::cout << "----------------gpu calculation succeeded---------------" << std::endl;
        std::chrono::duration<double> diff = after - now;
        std::cout << "score " << diff.count() << "[s]" << std::endl;
        for (int rows = 0; rows < input.rows; rows++) {
            for (int cols = 0; cols < input.cols; cols++) {
                gray.at<float>(rows, cols) = result[rows * input.cols + cols];
            }
        }
    }
    cv::namedWindow("window", CV_WINDOW_AUTOSIZE);
    cv::namedWindow("window2", CV_WINDOW_AUTOSIZE);
    cv::imshow("window", gray);
    cv::imshow("window2", input);
    cv::waitKey(0);
}
template<typename T, int TILE_ROWS, int TILE_COLS>
std::unique_ptr<T[]> convolutionCalculateAverage(T* data, int rows, int cols, const concurrency::accelerator& accel)
{
    std::unique_ptr<T[]> average(new T[rows * cols]);
    concurrency::array_view<T, 2> data_view(rows, cols, data);
    concurrency::array_view<T, 2> average_view(rows, cols, average.get());
    std::cout << "\n-------------------parallel calculation-----------------" << std::endl;
    std::cout << "rows/cols " << rows << "/" << cols << std::endl;
    average_view.discard_data(); // output only: skip the copy to the accelerator
    parallel_for_each(
        data_view.get_extent().tile<TILE_ROWS, TILE_COLS>(),
        [=](concurrency::tiled_index<TILE_ROWS, TILE_COLS> idx) restrict(amp) {
            // stage the tile in tile_static memory; local[0] is the row, local[1] the column
            tile_static T nums[TILE_ROWS][TILE_COLS];
            nums[idx.local[0]][idx.local[1]] = data_view[idx.global];
            idx.barrier.wait(); // every thread in the tile must finish staging first
            T sum = 0;
            for (int i = 0; i < TILE_ROWS; i++) {
                for (int j = 0; j < TILE_COLS; j++) {
                    sum += nums[i][j];
                }
            }
            average_view[idx.global] = sum / static_cast<T>(TILE_ROWS * TILE_COLS);
        }
    );
    average_view.synchronize(); // copy the result back to host memory
    return average; // moved implicitly; std::move here would inhibit NRVO
}
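The kernel can also be exercised without OpenCV. A self-contained sketch under stated assumptions: the 64x64 size, 16x16 tile, and constant input are invented for the test, and the extent must be divisible by the tile dimensions, as tiling requires.

// Sketch: run the tiled average over a constant image; every output element
// should equal the input constant.
int main() {
    constexpr int tile = 16;        // 64 is divisible by 16
    const int rows = 64, cols = 64;
    std::vector<float> img(rows * cols, 2.0f);
    auto accels = findAccelerators();
    concurrency::accelerator accel = *getBiggestMemoryAccelerator(accels);
    auto avg = convolutionCalculateAverage<float, tile, tile>(img.data(), rows, cols, accel);
    std::cout << avg[0] << std::endl; // expected: 2
    return 0;
}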