Untitled Paste
cpp
Wrap Lines
#ifdef __CLION_IDE__
#include <libgpu/opencl/cl/clion_defines.cl>
#endif
#include "../defines.h"
#define TILE_SIZE 16
__attribute__((reqd_work_group_size(16, 16, 1)))
__kernel void matrix_02_transpose_coalesced_via_local_memory(
__global const float* matrix, // w x h
__global float* transposed_matrix, // h x w
unsigned int w,
unsigned int h)
{
__local float tile[TILE_SIZE][TILE_SIZE + 1];
size_t g_col = get_global_id(0);
size_t g_row = get_global_id(1);
size_t l_col = get_local_id(0);
size_t l_row = get_local_id(1);
size_t block_x = get_group_id(0);
size_t block_y = get_group_id(1);
if (g_col < w && g_row < h) {
tile[l_row][l_col] = matrix[g_row * w + g_col];
}
barrier(CLK_LOCAL_MEM_FENCE);
size_t out_x = block_y * TILE_SIZE + l_col;
size_t out_y = block_x * TILE_SIZE + l_row;
if (out_x < h && out_y < w) {
transposed_matrix[out_y * h + out_x] = tile[l_col][l_row];
}
}
RUSTEBIN