Skip to content
This repository was archived by the owner on Aug 11, 2023. It is now read-only.
161 changes: 105 additions & 56 deletions samples/matrix_multiply/matrix_multiply.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -63,17 +63,22 @@ void block_host(float *MA, float *MB, float *MC, int matSize) {
/* We set the block size to 32 for simplicity, though the optimal
* value will depend on the platform this is run on. */
int block_size = 32;
int numBlocks = block_size / matSize;
int extraBlockLength = block_size % matSize;
numBlocks = extraBlockLength ? (numBlocks + 1) : (numBlocks);

#pragma omp parallel for collapse(2)
for (int i = 0; i < matSize; i += block_size)
for (int j = 0; j < matSize; j += block_size)
for (int k = 0; k < matSize; k += block_size) {
for (int bi = 0; bi < (block_size); bi++)
for (int bj = 0; bj < (block_size); bj++)
for (int bk = 0; bk < (block_size); bk++) {
MC[(bi + i) * matSize + (bj + j)] +=
MA[(bi + i) * matSize + (k + bk)] *
MB[(k + bk) * matSize + (bj + j)];
for (int bIndexI = 0; bIndexI < matSize; bIndexI += block_size)
for (int bIndexJ = 0; bIndexJ < matSize; bIndexJ += block_size)
for (int bIndexK = 0; bIndexK < matSize; bIndexK += block_size) {
int i = bIndexI;
int j = bIndexJ;
int k = bIndexK;
for (int bi = i; bi < std::min(i + block_size, matSize); bi++)
for (int bj = j; bj < std::min(j + block_size, matSize); bj++)
for (int bk = k; bk < std::min(k + block_size, matSize); bk++) {
MC[(bi)*matSize + (bj)] +=
MA[(bi)*matSize + (bk)] * MB[(bk)*matSize + (bj)];
}
}
}
Expand All @@ -95,16 +100,31 @@ inline int prevPowerOfTwo(int x) {
return x - (x >> 1);
}

/* Checks if X is a power of two.
* If there are bits sets to one after AND with the
* previous number, then it is not a power of two.
*/
inline bool isPowerOfTwo(int x) { return (x & (x - 1)) == 0; }

/* Function template that performs the matrix * matrix operation. (It is
* a template because only some OpenCL devices support double-precision
* floating-point numbers, but it is interesting to make the comparison
* where available.)
* Broadly, the function chooses an appropriate work size, then enqueues
* the matrix * matrix lambda on the queue provided. Because the queues
* are constructed inside this function, it will block until the work is
* finished. */
* finished.
* Note that this example only works for powers of two.
* */
template <typename T>
void local_mxm(cl::sycl::queue &q, T *MA, T *MB, T *MC, int matSize) {
bool local_mxm(cl::sycl::queue &q, T *MA, T *MB, T *MC, int matSize) {
// Make sure it is power of two before running
if (!isPowerOfTwo(matSize)) {
std::cout << " This example only works with power of two sizes "
<< std::endl;
return true;
}

auto device = q.get_device();
auto maxBlockSize =
device.get_info<cl::sycl::info::device::max_work_group_size>();
Expand All @@ -113,6 +133,9 @@ void local_mxm(cl::sycl::queue &q, T *MA, T *MB, T *MC, int matSize) {
<< std::endl;
std::cout << " The order is : " << matSize << std::endl;
std::cout << " The blockSize is : " << blockSize << std::endl;
// Make sure the block size is not larger than the mat size
blockSize = std::min(matSize, blockSize);

{
range<1> dimensions(matSize * matSize);
/* Buffers optionally take allocators as a template argument. In this
Expand Down Expand Up @@ -184,14 +207,16 @@ void local_mxm(cl::sycl::queue &q, T *MA, T *MB, T *MC, int matSize) {
});
});
}
return false;
}

/* Helper function to indicate the parameters the sample takes. */
void usage(std::string programName) {
std::cout << " Incorrect number of parameters " << std::endl;
std::cout << " Usage: " << std::endl;
std::cout << programName << " [matrix size] [omp|sycl]" << std::endl;
std::cout << "[matrix size] : Size of the matrix to multiply (minimum 32)" << std::endl;
std::cout << "[matrix size] : Size of the matrix to multiply (minimum 32)"
<< std::endl;
std::cout << "[omp|sycl] : Run the OpenMP or the SYCL variant. "
<< " Default is to use both " << std::endl;
}
Expand All @@ -202,6 +227,7 @@ int main(int argc, char *argv[]) {
float *MC;
bool sycl = true;
bool omp = true;
bool error = false;

if (argc != 2 && argc != 3) {
usage(argv[0]);
Expand Down Expand Up @@ -271,65 +297,88 @@ int main(int argc, char *argv[]) {
float flops =
(2.0f * matSize * matSize * matSize / (time / 1000.0f)) * 1.0e-9f;
std::cout << "GFLOPs: " << flops << std::endl;

bool error = false;
// Testing
for (int i = 0; i < matSize; i++)
for (int j = 0; j < matSize; j++) {
if (std::fabs(MC[i * matSize + j] - MB[i * matSize + j]) > 1e-8) {
std::cout << " Position " << i << ", " << j
<< " differs: " << MC[i * matSize + j]
<< " != " << MB[i * matSize + j] << std::endl;
error = true;
}
}
if (!error) {
std::cout << "Success" << std::endl;
} else {
std::cout << " Error in the computation " << std::endl;
}
}
}

if (sycl) {
std::cout << " ***** SYCL " << std::endl;
// Matrix initialization
for (int i = 0; i < matSize; i++)
for (int j = 0; j < matSize; j++) {
MC[i * matSize + j] = 0.0f; // i * matSize + j;
}

{
/* Create the SYCL queue - note that we add an async handler function
* to capture potential asynchronous errors. This function will be
* called every time there is an asynchronous error on the queue (i.e.
* some error occurs while the queue is executing kernels) and one of
* cl::sycl::queue::throw() or cl::sycl::queue::wait_and_throw() is
* called. */
queue q([&](exception_list eL) {
try {
for (auto &e : eL) {
std::rethrow_exception(e);
{
/* Create the SYCL queue - note that we add an async handler function
* to capture potential asynchronous errors. This function will be
* called every time there is an asynchronous error on the queue (i.e.
* some error occurs while the queue is executing kernels) and one of
* cl::sycl::queue::throw() or cl::sycl::queue::wait_and_throw() is
* called. */
queue q([&](exception_list eL) {
try {
for (auto &e : eL) {
std::rethrow_exception(e);
}
} catch (cl::sycl::exception e) {
std::cout << " An exception has been thrown: " << e.what()
<< std::endl;
}
} catch (cl::sycl::exception e) {
std::cout << " An exception has been thrown: " << e.what()
<< std::endl;
}
});

auto start = std::chrono::steady_clock::now();
local_mxm(q, MA, MB, MC, matSize);
auto end = std::chrono::steady_clock::now();
std::cout << "SYCL: ";
auto time = std::chrono::duration_cast<std::chrono::milliseconds>(
end - start).count();
std::cout << "Time: " << time << std::endl;
float flops =
(2.0f * matSize * matSize * matSize / (time / 1000.0f)) * 1.0e-9f;
std::cout << "GFLOPs: " << flops << std::endl;
}
}
});

auto start = std::chrono::steady_clock::now();
error = local_mxm(q, MA, MB, MC, matSize);
q.wait_and_throw();
auto end = std::chrono::steady_clock::now();
auto time = std::chrono::duration_cast<std::chrono::milliseconds>(
end - start).count();
std::cout << "SYCL: ";
std::cout << "Time: " << time << std::endl;
float flops =
(2.0f * matSize * matSize * matSize / (time / 1000.0f)) * 1.0e-9f;
std::cout << "GFLOPs: " << flops << std::endl;
std::cout << " Output " << std::endl;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This will be printed on a line by itself

}

std::cout << " Output " << std::endl;
display_matrix(MC, matSize);
bool error = false;
// Testing
for (int i = 0; i < matSize; i++)
for (int j = 0; j < matSize; j++) {
if (std::fabs(MC[i * matSize + j] - MB[i * matSize + j]) > 1e-8) {
std::cout << " Position " << i << ", " << j
<< " differs: " << MC[i * matSize + j]
<< " != " << MB[i * matSize + j] << std::endl;
error = true;
if (!error) {
display_matrix(MC, matSize);
error = false;
// Testing
for (int i = 0; i < matSize; i++)
for (int j = 0; j < matSize; j++) {
if (std::fabs(MC[i * matSize + j] - MB[i * matSize + j]) > 1e-8) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we replace this with something like the sample from here? Sounds pedantic, but I'm worried that for larger numbers (say ~100) that will be spuriously false, or that for small numbers it will be incorrectly true.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This was there before my latest changes, so I rather not change it in this patch.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh, it was? My bad, I'll add it myself then, rather than be a hypocrite! But in a separate merge request. Thanks very much!

std::cout << " Position " << i << ", " << j
<< " differs: " << MC[i * matSize + j]
<< " != " << MB[i * matSize + j] << std::endl;
error = true;
}
}
if (!error) {
std::cout << "Success" << std::endl;
;
} else {
std::cout << " Error in the computation " << std::endl;
}
}
}
if (!error) {
std::cout << "Success" << std::endl;
;
} else {
std::cout << " Error in the computation " << std::endl;
}

delete[] MA;
Expand Down