diff --git a/docs/source/basic/cheatsheet.rst b/docs/source/basic/cheatsheet.rst index 66f139fe83bb..4d76367f8c5c 100644 --- a/docs/source/basic/cheatsheet.rst +++ b/docs/source/basic/cheatsheet.rst @@ -167,6 +167,16 @@ Enqueue a memory copy from device to host memcpy(queue, bufHost, bufDevice, extent); +Make the memory of bufA available on dev as bufB. If dev shares the memory space of bufA, no copy is performed (zero-copy) and bufB shares the memory of bufA; otherwise bufB is newly allocated on dev and the content of bufA is copied to it. + .. code-block:: c++ + + auto bufB = makeAvailable(queue, dev, bufA); + +Make the memory of bufB available on the device of bufA, as bufA. If both buffers share the same memory space, no copy is performed (zero-copy) and bufA is updated to share the memory of bufB; otherwise the content of bufB is copied into bufA. + .. code-block:: c++ + + makeAvailable(queue, bufA, bufB); + .. raw:: pdf PageBreak diff --git a/example/randomCells2D/src/randomCells2D.cpp b/example/randomCells2D/src/randomCells2D.cpp index 02f67d30f3e7..f0818681710a 100644 --- a/example/randomCells2D/src/randomCells2D.cpp +++ b/example/randomCells2D/src/randomCells2D.cpp @@ -165,7 +165,6 @@ auto main() -> int using BufAcc = alpaka::Buf; using BufHostRand = alpaka::Buf, Dim, Idx>; using BufAccRand = alpaka::Buf, Dim, Idx>; - using BufHostRandVec = alpaka::Buf, Dim, Idx>; using BufAccRandVec = alpaka::Buf, Dim, Idx>; using WorkDiv = alpaka::WorkDivMembers; @@ -187,27 +186,21 @@ auto main() -> int // Setup buffer. 
BufHost bufHostS{alpaka::allocBuf(devHost, extent)}; float* const ptrBufHostS{alpaka::getPtrNative(bufHostS)}; - BufAcc bufAccS{alpaka::allocBuf(devAcc, extent)}; - float* const ptrBufAccS{alpaka::getPtrNative(bufAccS)}; BufHost bufHostV{alpaka::allocBuf(devHost, extent)}; float* const ptrBufHostV{alpaka::getPtrNative(bufHostV)}; - BufAcc bufAccV{alpaka::allocBuf(devAcc, extent)}; - float* const ptrBufAccV{alpaka::getPtrNative(bufAccV)}; - BufHostRand bufHostRandS{alpaka::allocBuf, Idx>(devHost, extent)}; - BufAccRand bufAccRandS{alpaka::allocBuf, Idx>(devAcc, extent)}; - RandomEngineSingle* const ptrBufAccRandS{alpaka::getPtrNative(bufAccRandS)}; - - BufHostRandVec bufHostRandV{alpaka::allocBuf, Idx>(devHost, extent)}; - BufAccRandVec bufAccRandV{alpaka::allocBuf, Idx>(devAcc, extent)}; - RandomEngineVector* const ptrBufAccRandV{alpaka::getPtrNative(bufAccRandV)}; InitRandomKernel initRandomKernel; + + BufAccRand bufAccRandS{alpaka::allocBuf, Idx>(devAcc, extent)}; + RandomEngineSingle* const ptrBufAccRandS{alpaka::getPtrNative(bufAccRandS)}; auto pitchBufAccRandS = alpaka::getPitchBytes<1u>(bufAccRandS); alpaka::exec(queue, workdiv, initRandomKernel, extent, ptrBufAccRandS, pitchBufAccRandS); alpaka::wait(queue); + BufAccRandVec bufAccRandV{alpaka::allocBuf, Idx>(devAcc, extent)}; + RandomEngineVector* const ptrBufAccRandV{alpaka::getPtrNative(bufAccRandV)}; auto pitchBufAccRandV = alpaka::getPitchBytes<1u>(bufAccRandV); alpaka::exec(queue, workdiv, initRandomKernel, extent, ptrBufAccRandV, pitchBufAccRandV); alpaka::wait(queue); @@ -224,8 +217,9 @@ auto main() -> int } } + BufAcc bufAccS{alpaka::makeAvailable(queue, devAcc, bufHostS)}; + float* const ptrBufAccS{alpaka::getPtrNative(bufAccS)}; auto pitchBufAccS = alpaka::getPitchBytes<1u>(bufAccS); - alpaka::memcpy(queue, bufAccS, bufHostS); RunTimestepKernelSingle runTimestepKernelSingle; alpaka::exec( queue, @@ -236,10 +230,11 @@ auto main() -> int ptrBufAccS, pitchBufAccRandS, pitchBufAccS); - 
alpaka::memcpy(queue, bufHostS, bufAccS); + alpaka::makeAvailable(queue, bufHostS, bufAccS); + BufAcc bufAccV{alpaka::makeAvailable(queue, devAcc, bufHostV)}; + float* const ptrBufAccV{alpaka::getPtrNative(bufAccV)}; auto pitchBufAccV = alpaka::getPitchBytes<1u>(bufAccV); - alpaka::memcpy(queue, bufAccV, bufHostV); RunTimestepKernelVector runTimestepKernelVector; alpaka::exec( queue, @@ -250,7 +245,7 @@ auto main() -> int ptrBufAccV, pitchBufAccRandV, pitchBufAccV); - alpaka::memcpy(queue, bufHostV, bufAccV); + alpaka::makeAvailable(queue, bufHostV, bufAccV); alpaka::wait(queue); float avgS = 0; diff --git a/include/alpaka/mem/buf/Traits.hpp b/include/alpaka/mem/buf/Traits.hpp index 56ce12f00b21..c260a5b8d1c9 100644 --- a/include/alpaka/mem/buf/Traits.hpp +++ b/include/alpaka/mem/buf/Traits.hpp @@ -191,4 +191,82 @@ namespace alpaka ALPAKA_UNREACHABLE(allocBuf(host, extent)); } + + namespace detail + { + // TODO(bgruber): very crude + template + auto canZeroCopy(DevDst const& devDst, DevSrc const& devSrc) -> bool + { + if constexpr(std::is_same_v) + if(devSrc == devDst) + return true; + return false; + } + } // namespace detail + + //! Makes the content of the source view available on the device associated with the destination view. If the + //! destination shares the same memory space as the source view, no copy is performed and the destination view is + //! updated to share the same buffer as the source view. Otherwise, a memcpy is performed from source to + //! destination view. 
+ template + ALPAKA_FN_HOST void makeAvailable(TQueue& queue, TViewDst& viewDst, TViewSrc const& viewSrc) + { + ALPAKA_DEBUG_MINIMAL_LOG_SCOPE; + + if constexpr(std::is_same_v) // TODO(bgruber): lift this by converting buffer types + if(detail::canZeroCopy(getDev(viewDst), getDev(viewSrc))) + { +#if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL + std::cout << "zero_memcopy: copy elided\n"; +#endif + viewDst = viewSrc; + return; + } + +#if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL + std::cout << "zero_memcopy: deep copy required\n"; +#endif + memcpy(queue, viewDst, viewSrc); + } + + //! Makes the content of the source view available on the destination device. If the destination shares the same + //! memory space as the source view, no copy is performed and the source view is returned. Otherwise a newly + //! allocated buffer is created on the destination device and the content of the source view copied to it. + template< + typename TQueue, + typename TDevDst, + typename TViewSrc, + std::enable_if_t, int> = 0, + typename TViewDst = Buf, Dim, Idx>> + ALPAKA_FN_HOST auto makeAvailable(TQueue& queue, TDevDst const& dstDev, TViewSrc const& viewSrc) -> TViewDst + { + ALPAKA_DEBUG_MINIMAL_LOG_SCOPE; + + if constexpr(std::is_same_v) // TODO(bgruber): lift this by converting buffer types + if(detail::canZeroCopy(dstDev, getDev(viewSrc))) + { +#if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL + std::cout << "zero_memcopy: shallow copy returned\n"; +#endif + return viewSrc; + } + + using E = Elem; + using I = Idx; + auto const extent = getExtentVec(viewSrc); + TViewDst dst = [&] + { + using TDevQueue = Dev; + if constexpr(std::is_same_v) + if(getDev(queue) == dstDev) + return allocAsyncBufIfSupported(queue, extent); + return allocBuf(dstDev, extent); + }(); + memcpy(queue, dst, viewSrc); +#if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL + std::cout << "zero_memcopy: deep copy returned\n"; +#endif + return dst; + } } // namespace alpaka diff --git a/include/alpaka/test/mem/view/ViewTest.hpp 
b/include/alpaka/test/mem/view/ViewTest.hpp index 9989b04747e5..9b61ba049159 100644 --- a/include/alpaka/test/mem/view/ViewTest.hpp +++ b/include/alpaka/test/mem/view/ViewTest.hpp @@ -202,7 +202,8 @@ namespace alpaka::test REQUIRE(fixture(verifyViewsEqualKernel, test::begin(viewA), test::end(viewA), test::begin(viewB))); } - //! Fills the given view with increasing values starting at 0. + //! Fills the given view with increasing values starting at 0. To verify that a view holds such values, use \ref + //! iotaCheckView. template ALPAKA_FN_HOST auto iotaFillView(TQueue& queue, TView& view) -> void { @@ -210,7 +211,6 @@ namespace alpaka::test using PltfHost = Pltf; using Elem = Elem; - DevHost const devHost = getDevByIdx(0); auto const extent = getExtentVec(view); @@ -226,6 +226,30 @@ namespace alpaka::test wait(queue); } + //! Checks that the given view contains increasing values starting at 0. To initialize such a view, use \ref + //! iotaFillView. + template + ALPAKA_FN_HOST auto iotaCheckView(TQueue& queue, TView& view) -> void + { + using Elem = Elem; + + // prepare a host buffer + auto const devHost = getDevByIdx(0); + auto const extent = getExtentVec(view); + std::vector v(static_cast(extent.prod()), static_cast(0)); + auto hostView = createView(devHost, v, extent); + + // copy data to host + memcpy(queue, hostView, view); + wait(queue); + + // check that content is an iota range + for(std::size_t i = 0; i < v.size(); i++) + { + CHECK(v[i] == static_cast(i)); + } + } + template ALPAKA_FN_HOST auto testViewMutable(TQueue& queue, TView& view) -> void { diff --git a/test/unit/mem/buf/src/BufTest.cpp b/test/unit/mem/buf/src/BufTest.cpp index c13e3d09766b..38da5e41b601 100644 --- a/test/unit/mem/buf/src/BufTest.cpp +++ b/test/unit/mem/buf/src/BufTest.cpp @@ -325,3 +325,55 @@ TEMPLATE_LIST_TEST_CASE("memBufMove", "[memBuf]", alpaka::test::TestAccs) CHECK(read(buf2) == 1); } // both buffers destruct fine here } + + +TEMPLATE_LIST_TEST_CASE("Zerocopy", "[memBuf]", 
alpaka::test::TestAccs) +{ + using Acc = TestType; + using Dim = alpaka::Dim; + using Idx = alpaka::Idx; + using Dev = alpaka::Dev; + using Queue = alpaka::test::DefaultQueue; + using Elem = int; + + constexpr auto accIsHostDev = std::is_same_v; + + auto const extent + = alpaka::createVecFromIndexedFn::template ForExtentBuf>(); + auto const hostDev = alpaka::getDevByIdx(0); + auto const accDev = alpaka::getDevByIdx>(0); + auto queue = Queue(accDev); + + // create and fill host buffer + auto hostBuf = alpaka::allocBuf(hostDev, extent); + alpaka::test::iotaFillView(queue, hostBuf); + { + INFO("hostBuf initially"); + alpaka::test::iotaCheckView(queue, hostBuf); + } + + // zero-copy to device, check it there + auto devBuf = alpaka::makeAvailable(queue, accDev, hostBuf); + if constexpr(accIsHostDev) + CHECK(alpaka::getPtrNative(devBuf) == alpaka::getPtrNative(hostBuf)); + { + INFO("devBuf"); + alpaka::test::iotaCheckView(queue, devBuf); + } + + // case 1: zero-copy back to host into existing buffer, check it there + { + alpaka::makeAvailable(queue, hostBuf, devBuf); + INFO("hostBuf after copying back"); + alpaka::test::iotaCheckView(queue, hostBuf); + } + + // case 2: zero-copy back to host into new buffer, check it there + { + auto dstHostBuf = alpaka::makeAvailable(queue, hostDev, devBuf); + if constexpr(accIsHostDev) + CHECK(alpaka::getPtrNative(dstHostBuf) == alpaka::getPtrNative(devBuf)); + INFO("dstHostBuf after copying back"); + alpaka::test::iotaCheckView(queue, dstHostBuf); + } +}