15
« on: February 12, 2014, 12:28:15 am »
I've now managed to get an OpenCL version of it running, but it seems to run a lot slower than it does on the processor. Is this just not a good problem for GPUs, or is there a bigger problem?
#define __CL_ENABLE_EXCEPTIONS
#include <CL/cl.hpp>
#include <functional>
#include <ctime>
#include <iostream>
#include <fstream>
#include <exception>
#include <cstdlib>
#include <vector>
#include <thread>
#include <cmath>
#include <string>
#include <algorithm>
#include <thread>
#include <cmath>
#include <sstream>
// Status codes returned by the helper functions below (convertToString, init, execute).
#define SUCCESS 0
#define FAILURE 1
// NOTE(review): EXPECTED_FAILURE is not used anywhere in the code shown here.
#define EXPECTED_FAILURE 2
// Number of values processed per batch: the length of the host arrays, the
// size (in elements) of the device buffers, and the global work size.
const int NUM_ELEMENTS = 512;
/**
 * Reads the whole file `filename` into `s`.
 *
 * The file is opened in binary mode and copied byte-for-byte, so embedded
 * NUL characters are preserved.
 *
 * @param filename path of the file to read; a null pointer is rejected.
 * @param s        receives the file contents; left untouched on failure.
 * @return SUCCESS if the file was read completely, FAILURE if it could not
 *         be opened, its size could not be determined, or the read came up
 *         short.
 */
int convertToString(const char *filename, std::string& s)
{
    if (filename == nullptr)
        return FAILURE;

    std::ifstream f(filename, std::ios::in | std::ios::binary);
    if (!f.is_open())
        return FAILURE;

    // Determine the file size by seeking to the end. tellg() reports -1 on
    // failure, which must not be turned into a (huge) allocation size.
    f.seekg(0, std::ios::end);
    const std::streamoff length = f.tellg();
    if (length < 0)
        return FAILURE;
    f.seekg(0, std::ios::beg);

    // Read straight into a string: no manual new[]/delete[] to leak.
    std::string contents(static_cast<size_t>(length), '\0');
    if (length > 0 && !f.read(&contents[0], length))
        return FAILURE;

    s.swap(contents);
    return SUCCESS;
}
/**
 * Prints every non-zero entry of `values` as "<number>,<value>", one per line.
 *
 * @param start  number associated with values[0]; entry k is labelled start + k.
 * @param values array of NUM_ELEMENTS results; zero entries are skipped.
 */
void printOutput(unsigned long long start, unsigned long long *values){
    unsigned long long label = start;
    unsigned long long *const stop = values + NUM_ELEMENTS;
    for (unsigned long long *entry = values; entry != stop; ++entry, ++label){
        if (*entry == 0)
            continue;  // nothing was counted for this number
        std::cout << label << ',' << *entry << std::endl;
    }
}
/**
 * Fills `dataList` with the NUM_ELEMENTS consecutive integers
 * start, start + 1, ..., start + NUM_ELEMENTS - 1.
 *
 * @param start    first value to store.
 * @param dataList destination array with room for NUM_ELEMENTS entries.
 */
void newList(unsigned long long start, unsigned long long *dataList){
    unsigned long long next = start;
    unsigned long long *const stop = dataList + NUM_ELEMENTS;
    for (unsigned long long *slot = dataList; slot != stop; ++slot)
        *slot = next++;
}
// Bring the OpenCL C++ wrapper names (Kernel, Context, CommandQueue, ...) into scope.
using namespace cl;
// OpenCL objects shared between init(), which assigns them, and execute(),
// which uses them to create buffers and launch the kernel.
Kernel kernelA;
Context context;
CommandQueue queue;
int init() {
cl_int status = 0;
const char* buildOption ="-x clc++ ";
std::vector<Platform> platforms;
status = Platform::get(&platforms);
if (status != CL_SUCCESS){
std::cout<<"Error: Getting platforms!"<<std::endl;
return FAILURE;
}
std::vector<cl::Platform>::iterator iter;
for(iter = platforms.begin(); iter != platforms.end(); ++iter)
if(!strcmp((*iter).getInfo<CL_PLATFORM_VENDOR>().c_str(), "Advanced Micro Devices, Inc."))
break;
cl_context_properties cps[3] = {CL_CONTEXT_PLATFORM, (cl_context_properties)(*iter) (), 0};
bool gpuNotFound = false;
try{
context = cl::Context(CL_DEVICE_TYPE_GPU, cps, NULL, NULL, &status);
}
catch(std::exception e){
gpuNotFound = true;
}
if(gpuNotFound){
std::cout<<"GPU not found, falling back to CPU!"<<std::endl;
context = cl::Context(CL_DEVICE_TYPE_CPU, cps, NULL, NULL, &status);
if (status != CL_SUCCESS){
std::cout<<"Error: Creating context!"<<std::endl;
return FAILURE;
}
}
try{
std::vector<Device> devices = context.getInfo<CL_CONTEXT_DEVICES>();
queue = CommandQueue(context, devices[0]);
std::ifstream sourceFile("Rationals.cl");
std::string sourceCode(
std::istreambuf_iterator<char>(sourceFile),
(std::istreambuf_iterator<char>()));
Program::Sources source(1, std::make_pair(sourceCode.c_str(), sourceCode.length()+1));
Program program = Program(context, source);
program.build(devices, buildOption);
kernelA = Kernel(program, "countRationals");
}catch(cl::Error e){
std::cout << "Line "<< __LINE__<<": Error in "<<e.what() <<std::endl;
return FAILURE;
}
return SUCCESS;
}
int execute(unsigned long long* inputList, unsigned long long* outputList) {
try{
Buffer inputBuffer = Buffer(context, CL_MEM_READ_WRITE, NUM_ELEMENTS * sizeof(unsigned long long));
Buffer outputBuffer = Buffer(context, CL_MEM_READ_WRITE, NUM_ELEMENTS * sizeof(unsigned long long));
queue.enqueueWriteBuffer(inputBuffer, CL_TRUE, 0, NUM_ELEMENTS * sizeof(unsigned long long), inputList);
kernelA.setArg(0, inputBuffer);
kernelA.setArg(1, outputBuffer);
NDRange global(NUM_ELEMENTS);
NDRange local(NUM_ELEMENTS/2);
queue.enqueueNDRangeKernel(kernelA, NullRange, global, local);
queue.enqueueReadBuffer(outputBuffer, CL_TRUE, 0, NUM_ELEMENTS * sizeof(unsigned long long), outputList);
}catch(cl::Error e){
std::cout << "Line "<< __LINE__<<": Error in "<<e.what() <<std::endl;
return FAILURE;
}
return SUCCESS;
}
using namespace std;
int main(int argc, char* argv[]){
unsigned long long minNum, maxNum;
if (argc == 2){
minNum = pow(3, atoi(argv[1]));
maxNum = pow(3, atoi(argv[1]) + 1);
}
else if (argc == 3){
minNum = pow(3, atoi(argv[1]));
maxNum = pow(3, atoi(argv[2]));
}
else if (argc == 4){
minNum = pow(3, atoi(argv[1]));
maxNum = pow(3, atoi(argv[2]));
}
else return -1;
unsigned long long *q = nullptr, *result = nullptr, *old = nullptr;
thread workThread, outThread;
q = new unsigned long long[NUM_ELEMENTS];
newList(minNum, q);
result = new unsigned long long[NUM_ELEMENTS];
init();
workThread = thread(execute, q, result);
workThread.join();
for(unsigned long long i = minNum + NUM_ELEMENTS; i < maxNum; i += NUM_ELEMENTS){
old = result;
result = new unsigned long long[NUM_ELEMENTS];
newList(i, q);
workThread = thread(execute, q, result);
outThread = thread(printOutput, i, old);
workThread.join();
outThread.join();
delete[] old;
old = nullptr;
}
delete[] q;
delete[] result;
return 0;
}
It uses this kernel code (Rationals.cl):
/*
 * Judging by the name, decides whether the fraction p/q belongs to the
 * Cantor set, by generating base-3 digits of p/q and rejecting a digit 1.
 *
 * Phase 1 removes the factors of 3 from q one at a time. If the digit
 * produced at a step is 1, the result is true only when p equals the reduced
 * q; otherwise p is reduced modulo q and the loop continues.
 * Phase 2 repeatedly triples p modulo q until p returns to its starting
 * value; any digit 1 along the way gives false.
 *
 * q must be non-zero (q == 0 leads to a division by zero).
 */
bool testCantor(unsigned long p, unsigned long q){
    // Phase 1: strip the powers of three from the denominator.
    while (q % 3 == 0){
        q /= 3;
        const unsigned long digit = p / q;
        if (digit == 1)
            return p == q;
        p -= digit * q;  // same value as p % q
    }

    // Phase 2: walk the cycle generated by multiplying by 3 modulo q.
    const unsigned long first = p;
    for (;;){
        const unsigned long tripled = p * 3;
        const unsigned long digit = tripled / q;
        if (digit == 1)
            return false;
        p = tripled - digit * q;  // same value as tripled % q
        if (p == first)
            return true;
    }
}
/*
 * Returns true when a and b are coprime, i.e. gcd(a, b) == 1.
 *
 * Uses the remainder-based Euclidean algorithm, which needs a number of steps
 * logarithmic in the inputs. Repeated subtraction needs on the order of
 * max(a, b) steps when one argument is small (e.g. a == 1) and never
 * terminates when exactly one argument is 0.
 *
 * gcd(0, 0) is 0, so coprime(0, 0) is false.
 */
bool coprime(unsigned long a, unsigned long b){
    while (b != 0){
        const unsigned long r = a % b;
        a = b;
        b = r;
    }
    return a == 1;
}
/*
 * For each work-item: reads q = input[gid] and writes a count to output[gid].
 *
 * Every p in [1, q/3] that is not a multiple of 3 and passes testCantor(p, q)
 * is expanded to p, 3p, 9p, ... (while <= q/3); each such value that is
 * coprime to q adds 2 to the count.
 *
 * The count is accumulated in a private variable and stored once, instead of
 * doing a global-memory read-modify-write on every hit.
 */
__kernel
void countRationals(__global unsigned long *input, __global unsigned long *output){
    const size_t gid = get_global_id(0);
    const unsigned long q = input[gid];
    const unsigned long limit = q / 3;  // loop bound, hoisted out of both loops
    unsigned long count = 0;

    for (unsigned long p = 1; p <= limit; p++){
        if (p % 3 != 0 && testCantor(p, q)){
            // i <= limit guarantees i * 3 <= q, so the multiplication cannot overflow.
            for (unsigned long i = p; i <= limit; i *= 3){
                if (coprime(i, q))
                    count += 2;
            }
        }
    }

    output[gid] = count;
}