Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[abandoned] Allowing zerocopy: makeAvailable(queue, dst, srcView) #1820

Draft
wants to merge 4 commits into
base: develop
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions docs/source/basic/cheatsheet.rst
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,16 @@ Enqueue a memory copy from device to host

memcpy(queue, bufHost, bufDevice, extent);

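Makes the content of bufA available on dev through bufB. If dev can access the memory of bufA directly, no copy is performed (zero-copy) and bufA and bufB share the same memory; otherwise bufB is newly allocated on dev and the content is copied.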
.. code-block:: c++

auto bufB = makeAvailable(queue, dev, bufA);

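Makes the content of bufB available on the device of bufA, through bufA. If both buffers live in the same memory space, no copy is performed (zero-copy) and bufA is updated to share the memory of bufB; otherwise the content is copied from bufB to bufA.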
.. code-block:: c++

makeAvailable(queue, bufA, bufB);

.. raw:: pdf

PageBreak
Expand Down
27 changes: 11 additions & 16 deletions example/randomCells2D/src/randomCells2D.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,6 @@ auto main() -> int
using BufAcc = alpaka::Buf<Acc, float, Dim, Idx>;
using BufHostRand = alpaka::Buf<Host, RandomEngineSingle<Acc>, Dim, Idx>;
using BufAccRand = alpaka::Buf<Acc, RandomEngineSingle<Acc>, Dim, Idx>;
using BufHostRandVec = alpaka::Buf<Host, RandomEngineVector<Acc>, Dim, Idx>;
using BufAccRandVec = alpaka::Buf<Acc, RandomEngineVector<Acc>, Dim, Idx>;
using WorkDiv = alpaka::WorkDivMembers<Dim, Idx>;

Expand All @@ -187,27 +186,21 @@ auto main() -> int
// Setup buffer.
BufHost bufHostS{alpaka::allocBuf<float, Idx>(devHost, extent)};
float* const ptrBufHostS{alpaka::getPtrNative(bufHostS)};
BufAcc bufAccS{alpaka::allocBuf<float, Idx>(devAcc, extent)};
float* const ptrBufAccS{alpaka::getPtrNative(bufAccS)};

BufHost bufHostV{alpaka::allocBuf<float, Idx>(devHost, extent)};
float* const ptrBufHostV{alpaka::getPtrNative(bufHostV)};
BufAcc bufAccV{alpaka::allocBuf<float, Idx>(devAcc, extent)};
float* const ptrBufAccV{alpaka::getPtrNative(bufAccV)};

BufHostRand bufHostRandS{alpaka::allocBuf<RandomEngineSingle<Acc>, Idx>(devHost, extent)};
BufAccRand bufAccRandS{alpaka::allocBuf<RandomEngineSingle<Acc>, Idx>(devAcc, extent)};
RandomEngineSingle<Acc>* const ptrBufAccRandS{alpaka::getPtrNative(bufAccRandS)};

BufHostRandVec bufHostRandV{alpaka::allocBuf<RandomEngineVector<Acc>, Idx>(devHost, extent)};
BufAccRandVec bufAccRandV{alpaka::allocBuf<RandomEngineVector<Acc>, Idx>(devAcc, extent)};
RandomEngineVector<Acc>* const ptrBufAccRandV{alpaka::getPtrNative(bufAccRandV)};

InitRandomKernel initRandomKernel;

BufAccRand bufAccRandS{alpaka::allocBuf<RandomEngineSingle<Acc>, Idx>(devAcc, extent)};
RandomEngineSingle<Acc>* const ptrBufAccRandS{alpaka::getPtrNative(bufAccRandS)};
auto pitchBufAccRandS = alpaka::getPitchBytes<1u>(bufAccRandS);
alpaka::exec<Acc>(queue, workdiv, initRandomKernel, extent, ptrBufAccRandS, pitchBufAccRandS);
alpaka::wait(queue);

BufAccRandVec bufAccRandV{alpaka::allocBuf<RandomEngineVector<Acc>, Idx>(devAcc, extent)};
RandomEngineVector<Acc>* const ptrBufAccRandV{alpaka::getPtrNative(bufAccRandV)};
auto pitchBufAccRandV = alpaka::getPitchBytes<1u>(bufAccRandV);
alpaka::exec<Acc>(queue, workdiv, initRandomKernel, extent, ptrBufAccRandV, pitchBufAccRandV);
alpaka::wait(queue);
Expand All @@ -224,8 +217,9 @@ auto main() -> int
}
}

BufAcc bufAccS{alpaka::makeAvailable(queue, devAcc, bufHostS)};
float* const ptrBufAccS{alpaka::getPtrNative(bufAccS)};
auto pitchBufAccS = alpaka::getPitchBytes<1u>(bufAccS);
alpaka::memcpy(queue, bufAccS, bufHostS);
RunTimestepKernelSingle runTimestepKernelSingle;
alpaka::exec<Acc>(
queue,
Expand All @@ -236,10 +230,11 @@ auto main() -> int
ptrBufAccS,
pitchBufAccRandS,
pitchBufAccS);
alpaka::memcpy(queue, bufHostS, bufAccS);
alpaka::makeAvailable(queue, bufHostS, bufAccS);

BufAcc bufAccV{alpaka::makeAvailable(queue, devAcc, bufHostV)};
float* const ptrBufAccV{alpaka::getPtrNative(bufAccV)};
auto pitchBufAccV = alpaka::getPitchBytes<1u>(bufAccV);
alpaka::memcpy(queue, bufAccV, bufHostV);
RunTimestepKernelVector runTimestepKernelVector;
alpaka::exec<Acc>(
queue,
Expand All @@ -250,7 +245,7 @@ auto main() -> int
ptrBufAccV,
pitchBufAccRandV,
pitchBufAccV);
alpaka::memcpy(queue, bufHostV, bufAccV);
alpaka::makeAvailable(queue, bufHostV, bufAccV);
alpaka::wait(queue);

float avgS = 0;
Expand Down
78 changes: 78 additions & 0 deletions include/alpaka/mem/buf/Traits.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -191,4 +191,82 @@ namespace alpaka

ALPAKA_UNREACHABLE(allocBuf<TElem, TIdx>(host, extent));
}

namespace detail
{
    // TODO(bgruber): very crude
    //! Decides whether data residing on \p devSrc can be used from \p devDst without copying.
    //!
    //! Devices of different types are never considered zero-copy compatible. Devices of the same type are
    //! compatible exactly when they compare equal.
    //!
    //! \param devDst The device that should get access to the data.
    //! \param devSrc The device the data currently resides on.
    //! \return true if no copy is needed, false otherwise.
    template<typename DevDst, typename DevSrc>
    auto canZeroCopy(DevDst const& devDst, DevSrc const& devSrc) -> bool
    {
        if constexpr(!std::is_same_v<DevDst, DevSrc>)
        {
            // Distinct device types: the comparison below may not even be well-formed, so it is discarded.
            return false;
        }
        else
        {
            return devSrc == devDst;
        }
    }
} // namespace detail

//! Makes the content of the source view available through the destination view.
//!
//! If source and destination view have the same type and their devices allow a zero-copy (see
//! detail::canZeroCopy), no copy is performed and the source view is assigned to the destination view, so both
//! refer to the same buffer afterwards. In every other case a memcpy from the source view to the destination view
//! is issued on the given queue.
//!
//! \param queue The queue used for the memcpy if a deep copy is required.
//! \param viewDst The destination view; reassigned in the zero-copy case, written to otherwise.
//! \param viewSrc The source view providing the content.
template<typename TQueue, typename TViewDst, typename TViewSrc>
ALPAKA_FN_HOST void makeAvailable(TQueue& queue, TViewDst& viewDst, TViewSrc const& viewSrc)
{
    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;

    // TODO(bgruber): lift this by converting buffer types
    if constexpr(std::is_same_v<TViewSrc, TViewDst>)
    {
        bool const sharesMemory = detail::canZeroCopy(getDev(viewDst), getDev(viewSrc));
        if(sharesMemory)
        {
#if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
            std::cout << "zero_memcopy: copy elided\n";
#endif
            // Shallow assignment: the destination now refers to the source's buffer.
            viewDst = viewSrc;
            return;
        }
    }

    // Fallback for differing view types or devices that cannot share memory.
#if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
    std::cout << "zero_memcopy: deep copy required\n";
#endif
    memcpy(queue, viewDst, viewSrc);
}

//! Makes the content of the source view available on the destination device. If the destination shares the same
//! memory space as the source view, no copy is performed and the source view is returned. Otherwise a newly
//! allocated buffer is created on the destination device and the content of the source view copied to it.
template<
typename TQueue,
typename TDevDst,
typename TViewSrc,
std::enable_if_t<isDevice<TDevDst>, int> = 0,
typename TViewDst = Buf<TDevDst, Elem<TViewSrc>, Dim<TViewSrc>, Idx<TViewSrc>>>
ALPAKA_FN_HOST auto makeAvailable(TQueue& queue, TDevDst const& dstDev, TViewSrc const& viewSrc) -> TViewDst
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you please extend the documentation and point to the fact that the user must take care that all operations on viewSrc are finished.
This was one point where I was always confused in this PR but now I understand that the user must link the queue to avoid data races.

alpaka::enqueue(queueWhereViewSrcIsUsed, event);
wait(destQueue, event);
auto newBuff = alpaka::makeAvailable(destQueue, destDev, viewSrc);

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@bernhardmgruber dstDev is available in the queue parameter, is it really required to have this extra parameter queue?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you please extend the documentation and point to the fact that the user must take care that all operations on viewSrc are finished.

If you really insist, I can do it. It is also not documented for alpaka::memcpy btw. and I think it is kind of obvious. It's also not true. You can have pending operations on the source view:

auto bufHost = alpaka::allocBuf<int, Idx>(devHost, extents);
auto bufAcc = alpaka::makeAvailable(queue, devAcc, bufHost);
alpaka::exec<Acc>(queue, workdiv, Kernel{}, alpaka::getPtrNative(bufAcc));
alpaka::makeAvailable(queue, bufHost, bufAcc); // pending kernel execution on bufAcc

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

dstDev is available in the queue parameter, is it really required to have this extra parameter queue?

yes, because the device of the queue is not always the destination device. See example above, where the destination is the host, but the queue's device is the accelerator.

{
ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;

// Zero-copy path: only considered when the source view type equals the buffer type that would be returned.
// If the destination device and the source view's device allow it, the source view itself is returned and
// nothing is allocated or copied.
if constexpr(std::is_same_v<TViewSrc, TViewDst>) // TODO(bgruber): lift this by converting buffer types
if(detail::canZeroCopy(dstDev, getDev(viewSrc)))
{
#if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
std::cout << "zero_memcopy: shallow copy returned\n";
#endif
return viewSrc;
}

// Deep-copy path: allocate a buffer with the source's element type, index type and extent.
using E = Elem<TViewSrc>;
using I = Idx<TViewSrc>;
auto const extent = getExtentVec(viewSrc);
// Pick the allocation function: if the queue's device type matches the destination device type and the
// queue's device equals dstDev, allocate via allocAsyncBufIfSupported on the queue; otherwise fall back to
// allocBuf on dstDev.
TViewDst dst = [&]
{
using TDevQueue = Dev<TQueue>;
if constexpr(std::is_same_v<TDevQueue, TDevDst>)
if(getDev(queue) == dstDev)
return allocAsyncBufIfSupported<E, I>(queue, extent);
return allocBuf<E, I>(dstDev, extent);
}();
// Copy the source content into the freshly allocated destination buffer using the given queue.
memcpy(queue, dst, viewSrc);
#if ALPAKA_DEBUG >= ALPAKA_DEBUG_MINIMAL
std::cout << "zero_memcopy: deep copy returned\n";
#endif
return dst;
}
} // namespace alpaka
28 changes: 26 additions & 2 deletions include/alpaka/test/mem/view/ViewTest.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -202,15 +202,15 @@ namespace alpaka::test
REQUIRE(fixture(verifyViewsEqualKernel, test::begin(viewA), test::end(viewA), test::begin(viewB)));
}

//! Fills the given view with increasing values starting at 0.
//! Fills the given view with increasing values starting at 0. To verify that a view is like that, use \ref
//! iotaCheckView.
template<typename TView, typename TQueue>
ALPAKA_FN_HOST auto iotaFillView(TQueue& queue, TView& view) -> void
{
using DevHost = DevCpu;
using PltfHost = Pltf<DevHost>;

using Elem = Elem<TView>;

DevHost const devHost = getDevByIdx<PltfHost>(0);

auto const extent = getExtentVec(view);
Expand All @@ -226,6 +226,30 @@ namespace alpaka::test
wait(queue);
}

//! Verifies that the given view holds the sequence 0, 1, 2, ... in linear element order. Such a view can be
//! produced with \ref iotaFillView.
//!
//! The content of the view is copied into a temporary host-side std::vector using the given queue, the queue is
//! waited on, and every element is then compared against its linear index with a Catch2 CHECK.
//!
//! \param queue The queue used to copy the view's content to the host.
//! \param view The view to verify.
template<typename TView, typename TQueue>
ALPAKA_FN_HOST auto iotaCheckView(TQueue& queue, TView& view) -> void
{
    using Elem = Elem<TView>;

    // Zero-initialized host storage with one element per element of the view.
    auto const viewExtent = getExtentVec(view);
    auto const elemCount = static_cast<std::size_t>(viewExtent.prod());
    std::vector<Elem> v(elemCount, static_cast<Elem>(0));

    // Wrap the vector in a host view so it can be used as a memcpy destination.
    auto const hostDev = getDevByIdx<PltfCpu>(0);
    auto stagingView = createView(hostDev, v, viewExtent);

    // Bring the data to the host and make sure the copy has finished before reading it.
    memcpy(queue, stagingView, view);
    wait(queue);

    // Each element must equal its linear index.
    for(std::size_t i = 0; i < v.size(); i++)
    {
        CHECK(v[i] == static_cast<Elem>(i));
    }
}

template<typename TAcc, typename TView, typename TQueue>
ALPAKA_FN_HOST auto testViewMutable(TQueue& queue, TView& view) -> void
{
Expand Down
52 changes: 52 additions & 0 deletions test/unit/mem/buf/src/BufTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -325,3 +325,55 @@ TEMPLATE_LIST_TEST_CASE("memBufMove", "[memBuf]", alpaka::test::TestAccs)
CHECK(read(buf2) == 1);
} // both buffers destruct fine here
}


//! Exercises both makeAvailable overloads for every test accelerator:
//!  - host buffer -> accelerator device (returns a buffer),
//!  - accelerator buffer -> existing host buffer (in-place overload),
//!  - accelerator buffer -> host device (returns a buffer).
//! After each step the content is verified to still be the iota sequence written initially. When the accelerator
//! device is the host CPU device, the returned buffers are additionally required to alias their source, i.e. the
//! copy must have been elided.
TEMPLATE_LIST_TEST_CASE("Zerocopy", "[memBuf]", alpaka::test::TestAccs)
{
    using Acc = TestType;
    using Dim = alpaka::Dim<Acc>;
    using Idx = alpaka::Idx<Acc>;
    using Dev = alpaka::Dev<Acc>;
    using Queue = alpaka::test::DefaultQueue<Dev>;
    using Elem = int;

    // Pointer identity can only be expected when the accelerator runs on the host CPU device type.
    constexpr auto accIsHostDev = std::is_same_v<Dev, alpaka::DevCpu>;

    auto const extent
        = alpaka::createVecFromIndexedFn<Dim, alpaka::test::CreateVecWithIdx<Idx>::template ForExtentBuf>();
    auto const hostDev = alpaka::getDevByIdx<alpaka::PltfCpu>(0);
    auto const accDev = alpaka::getDevByIdx<alpaka::Pltf<Dev>>(0);
    auto queue = Queue(accDev);

    // create and fill host buffer
    auto hostBuf = alpaka::allocBuf<Elem, Idx>(hostDev, extent);
    alpaka::test::iotaFillView(queue, hostBuf);
    {
        INFO("hostBuf initially");
        alpaka::test::iotaCheckView(queue, hostBuf);
    }

    // zero-copy to device, check it there
    auto devBuf = alpaka::makeAvailable(queue, accDev, hostBuf);
    if constexpr(accIsHostDev)
    {
        CHECK(alpaka::getPtrNative(devBuf) == alpaka::getPtrNative(hostBuf));
    }
    {
        INFO("devBuf");
        alpaka::test::iotaCheckView(queue, devBuf);
    }

    // case 1: zero-copy back to host into existing buffer, check it there
    {
        alpaka::makeAvailable(queue, hostBuf, devBuf);
        INFO("hostBuf after copying back");
        alpaka::test::iotaCheckView(queue, hostBuf);
    }

    // case 2: zero-copy back to host into new buffer, check it there
    {
        auto dstHostBuf = alpaka::makeAvailable(queue, hostDev, devBuf);
        if constexpr(accIsHostDev)
        {
            // The buffer returned by this call must alias its source. The previous revision compared devBuf
            // with hostBuf here, which re-tested the first makeAvailable call and never looked at dstHostBuf.
            CHECK(alpaka::getPtrNative(dstHostBuf) == alpaka::getPtrNative(devBuf));
        }
        INFO("dstHostBuf after copying back");
        alpaka::test::iotaCheckView(queue, dstHostBuf);
    }
}