Skip to content

Commit

Permalink
Merge pull request ROCm#117 from bragadeesh/develop
Browse files Browse the repository at this point in the history
bluestein fft initial
  • Loading branch information
bragadeesh authored Feb 22, 2018
2 parents 0f8cd33 + 24e257b commit bcf5286
Show file tree
Hide file tree
Showing 16 changed files with 867 additions and 85 deletions.
7 changes: 4 additions & 3 deletions clients/rider/rider.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -548,6 +548,7 @@ int transform( size_t* lengths, const size_t *inStrides, const size_t *outStride


bool checkflag= false;
double err_ratio = 1E-6;

// Read and check output data
// This check is not valid if the FFT is executed multiple times inplace.
Expand Down Expand Up @@ -576,22 +577,22 @@ int transform( size_t* lengths, const size_t *inStrides, const size_t *outStride
{
if (0 == (i % outfftVectorSizePadded))
{
if (output[i].real() != outfftVectorSize)
if (fabs(output[i].real() - outfftVectorSize)/outfftVectorSize > err_ratio)
{
checkflag = true;
break;
}
}
else
{
if (output[ i ].real() != 0)
if (fabs(output[ i ].real()) > (err_ratio * outfftVectorSize))
{
checkflag = true;
break;
}
}

if (output[ i ].imag() != 0)
if (fabs(output[ i ].imag()) > (err_ratio * outfftVectorSize))
{
checkflag = true;
break;
Expand Down
24 changes: 22 additions & 2 deletions clients/selftest/test_complex.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,19 @@ void ErrorCheck(size_t N, CT *ref, CT *tst, size_t size=0)
size_t maxRefRealIdx = 0, maxRefImagIdx = 0;
size_t maxTstRealIdx = 0, maxTstImagIdx = 0;

#if 0
std::cout << "lib output" << std::endl;
for(size_t i=0; i<size; i++)
{
std::cout << tst[i][0] << ", " << tst[i][1] << std::endl;
}
std::cout << "ref output" << std::endl;
for(size_t i=0; i<size; i++)
{
std::cout << N*ref[i][0] << ", " << N*ref[i][1] << std::endl;
}
#endif

for(size_t i=0; i<size; i++)
{
T refReal = N*fabs(ref[i][0]);
Expand Down Expand Up @@ -257,10 +270,10 @@ class BasicInterfaceBasisTest : public ::testing::Test

virtual void RunBvt(size_t L)
{
if(!SupportedLength(L))
/*if(!SupportedLength(L))
{
return;
}
}*/

void *bufs[1];
bufs[0] = dev;
Expand Down Expand Up @@ -392,6 +405,13 @@ typedef BasicInterface3DBasisTest<float, complex_single, rocfft_precision_single

// complex to complex interface

// primes

TEST_F( BasicInterfaceSingle1DBasisTest, FwdLen504017 ) { TestRoutine(504017, -1); }
TEST_F( BasicInterfaceSingle1DBasisTest, InvLen504017 ) { TestRoutine(504017, 1); }
TEST_F( BasicInterfaceSingle1DBasisTest, FwdLen69 ) { TestRoutine(69, -1); }
TEST_F( BasicInterfaceSingle1DBasisTest, InvLen69 ) { TestRoutine(69, 1); }

// some big tests

/*
Expand Down
1 change: 1 addition & 0 deletions library/src/device/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ add_custom_command(
# The following is a list of implementation files defining the library
set( rocfft_device_source
transpose.cpp
bluestein.cpp
real2complex.cpp
complex2real.cpp
function_pool.cpp
Expand Down
129 changes: 129 additions & 0 deletions library/src/device/bluestein.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@

/*******************************************************************************
* Copyright (C) 2016 Advanced Micro Devices, Inc. All rights reserved.
******************************************************************************/

#include <iostream>
#include "kernel_launch.h"
#include "rocfft_hip.h"
#include "bluestein.h"


template<typename T>
rocfft_status chirp_launch(size_t N, size_t M, T* B, void *twiddles_large, int twl, int dir, hipStream_t rocfft_stream)
{
dim3 grid((M-N)/64 + 1);
dim3 threads(64);

hipLaunchKernelGGL(HIP_KERNEL_NAME(chirp_device<T>), dim3(grid), dim3(threads), 0, rocfft_stream, N, M, B, (T *)twiddles_large, twl, dir);

return rocfft_status_success;
}


void rocfft_internal_chirp(const void *data_p, void *back_p)
{
DeviceCallIn *data = (DeviceCallIn *)data_p;

size_t N = data->node->length[0];
size_t M = data->node->lengthBlue;

int twl = 0;

if(data->node->large1D > (size_t)256*256*256*256) printf("large1D twiddle size too large error");
else if(data->node->large1D > (size_t)256*256*256) twl = 4;
else if(data->node->large1D > (size_t)256*256) twl = 3;
else if(data->node->large1D > (size_t)256) twl = 2;
else twl = 1;

int dir = data->node->direction;

hipStream_t rocfft_stream = data->rocfft_stream;

if( data->node->precision == rocfft_precision_single)
chirp_launch<float2>(N, M, (float2 *)data->bufOut[0], data->node->twiddles_large, twl, dir, rocfft_stream);
else
chirp_launch<double2>(N, M, (double2 *)data->bufOut[0], data->node->twiddles_large, twl, dir, rocfft_stream);
}

template<typename T>
rocfft_status mul_launch(size_t numof, size_t totalWI, size_t N, size_t M, const T* A, T* B, size_t dim, size_t *lengths, size_t *stride_in, size_t *stride_out, int dir, int scheme, hipStream_t rocfft_stream)
{

dim3 grid((totalWI-1)/64 + 1);
dim3 threads(64);

hipLaunchKernelGGL(HIP_KERNEL_NAME(mul_device<T>), dim3(grid), dim3(threads), 0, rocfft_stream,
numof, totalWI, N, M, A, B, dim, lengths, stride_in, stride_out, dir, scheme);

return rocfft_status_success;

}

void rocfft_internal_mul(const void *data_p, void *back_p)
{
DeviceCallIn *data = (DeviceCallIn *)data_p;

size_t N = data->node->length[0];
size_t M = data->node->lengthBlue;


int scheme = 0; // fft mul
if(data->node->scheme == CS_KERNEL_PAD_MUL)
{
scheme = 1; // pad mul
}
else if(data->node->scheme == CS_KERNEL_RES_MUL)
{
scheme = 2; // res mul
}

size_t cBytes;
if(data->node->precision == rocfft_precision_single)
{
cBytes = sizeof(float)*2;
}
else
{
cBytes = sizeof(double)*2;
}

void *bufIn = data->bufIn[0];
void *bufOut = data->bufOut[0];

size_t numof = 0;
if(scheme == 0)
{
bufIn = ((char *)bufIn + M*cBytes);
bufOut = ((char *)bufOut + 2*M*cBytes);

numof = M;
}
else if(scheme == 1)
{
bufOut = ((char *)bufOut + M*cBytes);

numof = M;
}
else if(scheme == 2)
{
numof = N;
}

size_t count = data->node->batch;
for(size_t i=1; i<data->node->length.size(); i++) count *= data->node->length[i];
count *= numof;

int dir = data->node->direction;

hipStream_t rocfft_stream = data->rocfft_stream;


if( data->node->precision == rocfft_precision_single)
mul_launch<float2>(numof, count, N, M, (const float2 *)bufIn, (float2 *)bufOut, data->node->length.size(),
data->node->devKernArg, data->node->devKernArg + 1*KERN_ARGS_ARRAY_WIDTH, data->node->devKernArg + 2*KERN_ARGS_ARRAY_WIDTH, dir, scheme, rocfft_stream);
else
mul_launch<double2>(numof, count, N, M, (const double2 *)bufIn, (double2 *)bufOut, data->node->length.size(),
data->node->devKernArg, data->node->devKernArg + 1*KERN_ARGS_ARRAY_WIDTH, data->node->devKernArg + 2*KERN_ARGS_ARRAY_WIDTH, dir, scheme, rocfft_stream);

}
122 changes: 122 additions & 0 deletions library/src/device/kernels/bluestein.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
#ifndef BLUESTEIN_H
#define BLUESTEIN_H

#include "rocfft_hip.h"
#include "common.h"


template<typename T>
__global__ void
chirp_device(const size_t N, const size_t M, T* output, T *twiddles_large, const int twl, const int dir)
{
size_t tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;

T val = lib_make_vector2<T>(0, 0);

if(twl == 1)
val = TWLstep1(twiddles_large, (tx*tx)%(2*N));
else if(twl == 2)
val = TWLstep2(twiddles_large, (tx*tx)%(2*N));
else if(twl == 3)
val = TWLstep3(twiddles_large, (tx*tx)%(2*N));
else if(twl == 4)
val = TWLstep4(twiddles_large, (tx*tx)%(2*N));

val.y *= (real_type_t<T>)(dir);

if(tx == 0)
{
output[tx] = val;
output[tx + M] = val;
}
else if(tx < N)
{
output[tx] = val;
output[tx + M] = val;

output[M - tx] = val;
output[M - tx + M] = val;
}
else if(tx <= (M-N))
{
output[tx] = lib_make_vector2<T>(0, 0);
output[tx + M] = lib_make_vector2<T>(0, 0);
}
}


template<typename T>
__global__ void
mul_device(const size_t numof, const size_t totalWI, const size_t N, const size_t M, const T* input, T* output, const size_t dim, const size_t *lengths,
const size_t *stride_in, const size_t *stride_out, const int dir, const int scheme)
{
size_t tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;

if(tx >= totalWI)
return;

size_t iOffset = 0;
size_t oOffset = 0;

size_t counter_mod = tx/numof;

for(size_t i = dim; i>1; i--){
size_t currentLength = 1;
for(size_t j=1; j<i; j++){
currentLength *= lengths[j];
}

iOffset += (counter_mod / currentLength)*stride_in[i];
oOffset += (counter_mod / currentLength)*stride_out[i];
counter_mod = counter_mod % currentLength;
}
iOffset += counter_mod * stride_in[1];
oOffset += counter_mod * stride_out[1];

tx = tx%numof;
if(scheme == 0)
{
output += oOffset;

T out = output[tx];
output[tx].x = input[tx].x * out.x - input[tx].y * out.y;
output[tx].y = input[tx].x * out.y + input[tx].y * out.x;
}
else if(scheme == 1)
{
T *chirp = output;

input += iOffset;

output += M;
output += oOffset;

if(tx < N)
{
output[tx].x = input[tx].x * chirp[tx].x + input[tx].y * chirp[tx].y;
output[tx].y = -input[tx].x * chirp[tx].y + input[tx].y * chirp[tx].x;
}
else
{
output[tx] = lib_make_vector2<T>(0, 0);
}
}
else if(scheme == 2)
{
const T *chirp = input;

input += 2*M;
input += iOffset;

output += oOffset;

real_type_t<T> MI = 1.0 / (real_type_t<T>)M;
output[tx].x = MI * ( input[tx].x * chirp[tx].x + input[tx].y * chirp[tx].y);
output[tx].y = MI * (-input[tx].x * chirp[tx].y + input[tx].y * chirp[tx].x);

}
}


#endif // BLUESTEIN_H

9 changes: 9 additions & 0 deletions library/src/device/kernels/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,15 @@ __device__ inline double4 lib_make_vector4(double v0, double v1, double v2, doub
#endif


template <typename T>
__device__ T
TWLstep1(T *twiddles, size_t u)
{
size_t j = u & 255;
T result = twiddles[j];
return result;
}

template <typename T>
__device__ T
TWLstep2(T *twiddles, size_t u)
Expand Down
5 changes: 5 additions & 0 deletions library/src/device/kernels/transpose.h
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
#ifndef TRANSPOSE_H
#define TRANSPOSE_H

#include "rocfft_hip.h"
#include "common.h"

Expand Down Expand Up @@ -165,3 +168,5 @@ transpose_kernel2_scheme(const T* input, T* output, T *twiddles_large, size_t di
transpose_tile_device<T, DIM_X, DIM_Y>(input, output, mm, nn, hipBlockIdx_x * DIM_X, hipBlockIdx_y * DIM_X, ld_in, ld_out, twiddles_large, 0, 0);
}

#endif // TRANSPOSE_H

3 changes: 2 additions & 1 deletion library/src/include/kernel_launch.h
Original file line number Diff line number Diff line change
Expand Up @@ -58,8 +58,9 @@ extern "C"
*/


void rocfft_internal_transpose_var1_sp(const void *data_p, void *back_p);

void rocfft_internal_mul(const void *data_p, void *back_p);
void rocfft_internal_chirp(const void *data_p, void *back_p);
void rocfft_internal_transpose_var2(const void *data_p, void *back_p);

}
Expand Down
7 changes: 7 additions & 0 deletions library/src/include/plan.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,13 @@ inline bool SupportedLength(size_t len)
return false;
}

inline size_t FindBlue(size_t len)
{
size_t p = 1;
while(p < len) p <<= 1;
return 2*p;
}

struct rocfft_plan_description_t
{

Expand Down
Loading

0 comments on commit bcf5286

Please sign in to comment.