#ifdef __CLION_IDE__
// NOTE(review): the include target was missing in this copy of the file. The path below is the
// presumed IDE-only helper header (it is never seen by the real OpenCL compiler) - confirm
// against the repository.
#include <libgpu/opencl/cl/clion_defines.cl>
#endif

#include "../defines.h"

// Side length of the square tile staged in local memory.
// The kernel indexes the tile with local ids, so the work-group must be TILE_SIZE x TILE_SIZE.
#define TILE_SIZE 16

/**
 * Transposes a row-major matrix by staging one TILE_SIZE x TILE_SIZE tile per work-group
 * in local memory.
 *
 * Each work-item reads one element of the input and writes one element of the output.
 * Both accesses are arranged so that work-items adjacent in dimension 0 touch adjacent
 * global addresses: the row/column swap happens inside the local tile, not in global memory.
 *
 * Launch requirements:
 *   - local size must be exactly (TILE_SIZE, TILE_SIZE, 1) (enforced by reqd_work_group_size);
 *   - global size must cover at least w work-items in dimension 0 and h in dimension 1;
 *     work-items outside the matrix are masked out by the bounds checks below.
 *
 * @param matrix             input matrix, h rows of w elements each (row-major)
 * @param transposed_matrix  output matrix, w rows of h elements each (row-major)
 * @param w                  number of columns of the input
 * @param h                  number of rows of the input
 */
__attribute__((reqd_work_group_size(TILE_SIZE, TILE_SIZE, 1)))
__kernel void matrix_02_transpose_coalesced_via_local_memory(
    __global const float* matrix,       // h rows x w columns
    __global float* transposed_matrix,  // w rows x h columns
    unsigned int w,
    unsigned int h)
{
    // One extra column of padding per row; presumably there to avoid local-memory bank
    // conflicts when the tile is read column-wise below - TODO confirm on target hardware.
    __local float tile[TILE_SIZE][TILE_SIZE + 1];

    const size_t g_col = get_global_id(0);
    const size_t g_row = get_global_id(1);

    const size_t l_col = get_local_id(0);
    const size_t l_row = get_local_id(1);

    const size_t block_x = get_group_id(0);
    const size_t block_y = get_group_id(1);

    // Stage the input tile as-is: tile[row][col] mirrors the input layout.
    // Out-of-range work-items leave their tile cell unwritten; it is never read (see below).
    if (g_col < w && g_row < h) {
        tile[l_row][l_col] = matrix[g_row * w + g_col];
    }

    // Kept outside any conditional so that every work-item of the group reaches it.
    barrier(CLK_LOCAL_MEM_FENCE);

    // Output coordinates: the group's block indices are swapped, while l_col still drives
    // the fastest-varying output index so neighbouring work-items write neighbouring addresses.
    const size_t out_x = block_y * TILE_SIZE + l_col;  // column in the output, i.e. row in the input
    const size_t out_y = block_x * TILE_SIZE + l_row;  // row in the output, i.e. column in the input

    // tile[l_col][l_row] holds input element (row = out_x, column = out_y); this guard is
    // exactly the condition under which that cell was filled by the load above.
    if (out_x < h && out_y < w) {
        transposed_matrix[out_y * h + out_x] = tile[l_col][l_row];
    }
}