5#ifndef GKO_PUBLIC_CORE_BASE_EXECUTOR_HPP_
6#define GKO_PUBLIC_CORE_BASE_EXECUTOR_HPP_
20#include <ginkgo/core/base/device.hpp>
21#include <ginkgo/core/base/fwd_decls.hpp>
22#include <ginkgo/core/base/machine_topology.hpp>
23#include <ginkgo/core/base/memory.hpp>
24#include <ginkgo/core/base/scoped_device_id_guard.hpp>
25#include <ginkgo/core/base/types.hpp>
26#include <ginkgo/core/log/logger.hpp>
27#include <ginkgo/core/synthesizer/containers.hpp>
68constexpr allocation_mode default_cuda_alloc_mode = allocation_mode::device;
70constexpr allocation_mode default_hip_alloc_mode = allocation_mode::device;
76 allocation_mode::unified_global;
78#if (GINKGO_HIP_PLATFORM_HCC == 1)
81constexpr allocation_mode default_hip_alloc_mode = allocation_mode::device;
87 allocation_mode::unified_global;
101enum class dpcpp_queue_property {
113GKO_ATTRIBUTES GKO_INLINE dpcpp_queue_property operator|(dpcpp_queue_property a,
114 dpcpp_queue_property b)
116 return static_cast<dpcpp_queue_property
>(
static_cast<int>(a) |
117 static_cast<int>(b));
// Short-lived helper macro: expands to `class _type`, i.e. a forward
// declaration of `_type`; any further macro arguments are ignored.
// It is handed to GKO_ENABLE_FOR_ALL_EXECUTORS (defined outside this view;
// presumably it invokes the given macro once per executor type -- confirm)
// and is undefined right afterwards so the name does not leak to includers.
124#define GKO_FORWARD_DECLARE(_type, ...) class _type
126GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_FORWARD_DECLARE);
128#undef GKO_FORWARD_DECLARE
// Short-lived helper macro: expands to the declaration of one virtual, const
// `run` overload taking a `std::shared_ptr<const _type>`; extra macro
// arguments are ignored. It is passed to GKO_ENABLE_FOR_ALL_EXECUTORS
// (defined outside this view; presumably one expansion per executor type --
// confirm) and undefined immediately after use.
260#define GKO_DECLARE_RUN_OVERLOAD(_type, ...) \
261 virtual void run(std::shared_ptr<const _type>) const
263 GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_DECLARE_RUN_OVERLOAD);
265#undef GKO_DECLARE_RUN_OVERLOAD
268 virtual void run(std::shared_ptr<const ReferenceExecutor> executor)
const;
291template <
typename Closure>
292class RegisteredOperation :
public Operation {
300 RegisteredOperation(
const char* name, Closure op)
301 : name_(name), op_(std::move(op))
// Returns the operation name, i.e. the `name_` member that the constructor
// initializes from its `name` argument. The pointer is returned as stored;
// no copy of the characters is made here.
304 const char*
get_name()
const noexcept override {
return name_; }
306 void run(std::shared_ptr<const ReferenceExecutor> exec)
const override
311 void run(std::shared_ptr<const OmpExecutor> exec)
const override
316 void run(std::shared_ptr<const CudaExecutor> exec)
const override
321 void run(std::shared_ptr<const HipExecutor> exec)
const override
326 void run(std::shared_ptr<const DpcppExecutor> exec)
const override
337template <
typename Closure>
338RegisteredOperation<Closure> make_register_operation(
const char* name,
341 return RegisteredOperation<Closure>{name, std::move(op)};
419#define GKO_REGISTER_OPERATION(_name, _kernel) \
420 template <typename... Args> \
421 auto make_##_name(Args&&... args) \
423 return ::gko::detail::make_register_operation( \
424 #_kernel, [&args...](auto exec) { \
425 using exec_type = decltype(exec); \
428 std::shared_ptr<const ::gko::ReferenceExecutor>>:: \
430 ::gko::kernels::reference::_kernel( \
431 std::dynamic_pointer_cast< \
432 const ::gko::ReferenceExecutor>(exec), \
433 std::forward<Args>(args)...); \
434 } else if (std::is_same< \
436 std::shared_ptr<const ::gko::OmpExecutor>>:: \
438 ::gko::kernels::omp::_kernel( \
439 std::dynamic_pointer_cast<const ::gko::OmpExecutor>( \
441 std::forward<Args>(args)...); \
442 } else if (std::is_same< \
444 std::shared_ptr<const ::gko::CudaExecutor>>:: \
446 ::gko::kernels::cuda::_kernel( \
447 std::dynamic_pointer_cast<const ::gko::CudaExecutor>( \
449 std::forward<Args>(args)...); \
450 } else if (std::is_same< \
452 std::shared_ptr<const ::gko::HipExecutor>>:: \
454 ::gko::kernels::hip::_kernel( \
455 std::dynamic_pointer_cast<const ::gko::HipExecutor>( \
457 std::forward<Args>(args)...); \
458 } else if (std::is_same< \
460 std::shared_ptr<const ::gko::DpcppExecutor>>:: \
462 ::gko::kernels::dpcpp::_kernel( \
463 std::dynamic_pointer_cast<const ::gko::DpcppExecutor>( \
465 std::forward<Args>(args)...); \
467 GKO_NOT_IMPLEMENTED; \
471 static_assert(true, \
472 "This assert is used to counter the false positive extra " \
473 "semi-colon warnings")
513#define GKO_REGISTER_HOST_OPERATION(_name, _kernel) \
514 template <typename... Args> \
515 auto make_##_name(Args&&... args) \
517 return ::gko::detail::make_register_operation( \
519 [&args...](auto) { _kernel(std::forward<Args>(args)...); }); \
521 static_assert(true, \
522 "This assert is used to counter the false positive extra " \
523 "semi-colon warnings")
// Helper macro: expands to `friend class _type`; extra macro arguments are
// ignored. It is applied through GKO_ENABLE_FOR_ALL_EXECUTORS further down
// and undefined later in this file via `#undef GKO_DECLARE_EXECUTOR_FRIEND`.
526#define GKO_DECLARE_EXECUTOR_FRIEND(_type, ...) friend class _type
616 template <
typename T>
617 friend class detail::ExecutorBase;
619 GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_DECLARE_EXECUTOR_FRIEND);
620 friend class ReferenceExecutor;
623 virtual ~Executor() =
default;
625 Executor() =
default;
626 Executor(Executor&) =
delete;
627 Executor(Executor&&) =
delete;
628 Executor& operator=(Executor&) =
delete;
629 Executor& operator=(Executor&&) =
delete;
652 template <
typename ClosureOmp,
typename ClosureCuda,
typename ClosureHip,
653 typename ClosureDpcpp>
655 "Please use the overload with std::string as first parameter.")
656 void
run(const ClosureOmp& op_omp, const ClosureCuda& op_cuda,
657 const ClosureHip& op_hip, const ClosureDpcpp& op_dpcpp)
const
659 LambdaOperation<ClosureOmp, ClosureOmp, ClosureCuda, ClosureHip,
661 op(op_omp, op_cuda, op_hip, op_dpcpp);
681 template <
typename ClosureReference,
typename ClosureOmp,
682 typename ClosureCuda,
typename ClosureHip,
typename ClosureDpcpp>
683 void run(std::string name,
const ClosureReference& op_ref,
684 const ClosureOmp& op_omp,
const ClosureCuda& op_cuda,
685 const ClosureHip& op_hip,
const ClosureDpcpp& op_dpcpp)
const
687 LambdaOperation<ClosureReference, ClosureOmp, ClosureCuda, ClosureHip,
689 op(std::move(name), op_ref, op_omp, op_cuda, op_hip, op_dpcpp);
704 template <
typename T>
707 this->
template log<log::Logger::allocation_started>(
708 this, num_elems *
sizeof(T));
709 T* allocated =
static_cast<T*
>(this->raw_alloc(num_elems *
sizeof(T)));
710 this->
template log<log::Logger::allocation_completed>(
711 this, num_elems *
sizeof(T),
reinterpret_cast<uintptr>(allocated));
722 void free(
void* ptr)
const noexcept
724 this->
template log<log::Logger::free_started>(
725 this,
reinterpret_cast<uintptr>(ptr));
727 this->
template log<log::Logger::free_completed>(
728 this,
reinterpret_cast<uintptr>(ptr));
743 template <
typename T>
745 const T* src_ptr, T* dest_ptr)
const
747 const auto src_loc =
reinterpret_cast<uintptr>(src_ptr);
748 const auto dest_loc =
reinterpret_cast<uintptr>(dest_ptr);
749 this->
template log<log::Logger::copy_started>(
750 src_exec.
get(),
this, src_loc, dest_loc, num_elems *
sizeof(T));
751 if (
this != src_exec.
get()) {
752 src_exec->template log<log::Logger::copy_started>(
753 src_exec.
get(),
this, src_loc, dest_loc, num_elems *
sizeof(T));
756 this->raw_copy_from(src_exec.
get(), num_elems *
sizeof(T), src_ptr,
759#if (GKO_VERBOSE_LEVEL >= 1) && !defined(NDEBUG)
762 std::clog <<
"Not direct copy. Try to copy data from the masters."
765 auto src_master = src_exec->get_master().
get();
766 if (num_elems > 0 && src_master != src_exec.
get()) {
767 auto* master_ptr = src_exec->get_master()->alloc<T>(num_elems);
768 src_master->copy_from<T>(src_exec, num_elems, src_ptr,
770 this->
copy_from<T>(src_master, num_elems, master_ptr, dest_ptr);
771 src_master->free(master_ptr);
774 this->
template log<log::Logger::copy_completed>(
775 src_exec.
get(),
this, src_loc, dest_loc, num_elems *
sizeof(T));
776 if (
this != src_exec.
get()) {
777 src_exec->template log<log::Logger::copy_completed>(
778 src_exec.
get(),
this, src_loc, dest_loc, num_elems *
sizeof(T));
793 template <
typename T>
796 this->
copy_from(
this, num_elems, src_ptr, dest_ptr);
808 template <
typename T>
812 this->
get_master()->copy_from(
this, 1, ptr, &out);
840 this->propagating_logger_refcount_.fetch_add(
841 logger->needs_propagation() ? 1 : 0);
842 this->EnableLogging<Executor>::add_logger(logger);
853 this->propagating_logger_refcount_.fetch_sub(
855 this->EnableLogging<Executor>::remove_logger(logger);
858 using EnableLogging<
Executor>::remove_logger;
869 log_propagation_mode_ = mode;
881 return this->propagating_logger_refcount_.load() > 0 &&
894 return this->verify_memory_from(other.get());
916 std::string device_type;
931 int num_computing_units = -1;
944 int num_pu_per_cu = -1;
954 std::vector<int> subgroup_sizes{};
964 int max_subgroup_size = -1;
976 std::vector<int> max_workitem_sizes{};
987 int max_workgroup_size;
1004 std::string pci_bus_id = std::string(13,
'x');
1016 std::vector<int> closest_pu_ids{};
// Read-only accessor: returns a const reference to the `exec_info_` member,
// so no copy of the structure is made.
1024 const exec_info& get_exec_info()
const {
return this->exec_info_; }
1035 virtual void* raw_alloc(size_type size)
const = 0;
1044 virtual void raw_free(
void* ptr)
const noexcept = 0;
1056 virtual void raw_copy_from(
const Executor* src_exec, size_type n_bytes,
1057 const void* src_ptr,
void* dest_ptr)
const = 0;
// Short-lived helper macro: expands to the declaration of a pure virtual,
// const `raw_copy_to` overload whose destination executor is a
// `const _exec_type*`, followed by a byte count and the source/destination
// pointers. It is passed to GKO_ENABLE_FOR_ALL_EXECUTORS (defined outside
// this view; presumably one expansion per executor type -- confirm) and
// undefined immediately after use.
1068#define GKO_ENABLE_RAW_COPY_TO(_exec_type, ...) \
1069 virtual void raw_copy_to(const _exec_type* dest_exec, size_type n_bytes, \
1070 const void* src_ptr, void* dest_ptr) const = 0
1072 GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_ENABLE_RAW_COPY_TO);
1074#undef GKO_ENABLE_RAW_COPY_TO
1083 virtual bool verify_memory_from(
const Executor* src_exec)
const = 0;
// Short-lived helper macro: expands to the declaration of a pure virtual,
// const `verify_memory_to` overload returning bool and taking the destination
// executor as `const _exec_type*`. It is applied through
// GKO_ENABLE_FOR_ALL_EXECUTORS and, in addition, explicitly for
// ReferenceExecutor (presumably because the list macro does not enumerate
// it -- confirm against its definition), then undefined.
1094#define GKO_ENABLE_VERIFY_MEMORY_TO(_exec_type, ...) \
1095 virtual bool verify_memory_to(const _exec_type* dest_exec) const = 0
1097 GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_ENABLE_VERIFY_MEMORY_TO);
1099 GKO_ENABLE_VERIFY_MEMORY_TO(ReferenceExecutor, ref);
1101#undef GKO_ENABLE_VERIFY_MEMORY_TO
1109 virtual void populate_exec_info(
const machine_topology* mach_topo) = 0;
// Mutable accessor: returns a non-const reference to the `exec_info_` member
// so the caller can modify its fields in place (the executor constructors
// visible in this file use it to set device_id and similar fields).
1116 exec_info& get_exec_info() {
return this->exec_info_; }
1118 exec_info exec_info_;
1122 std::atomic<int> propagating_logger_refcount_{};
1139 template <
typename ClosureReference,
typename ClosureOmp,
1140 typename ClosureCuda,
typename ClosureHip,
typename ClosureDpcpp>
1141 class LambdaOperation :
public Operation {
1143 LambdaOperation(std::string name,
const ClosureReference& op_ref,
1144 const ClosureOmp& op_omp,
const ClosureCuda& op_cuda,
1145 const ClosureHip& op_hip,
const ClosureDpcpp& op_dpcpp)
1146 : name_(std::move(name)),
1164 LambdaOperation(
const ClosureOmp& op_omp,
const ClosureCuda& op_cuda,
1165 const ClosureHip& op_hip,
const ClosureDpcpp& op_dpcpp)
1166 : LambdaOperation(
"unnamed", op_omp, op_omp, op_cuda, op_hip,
1170 void run(std::shared_ptr<const OmpExecutor>)
const override
1175 void run(std::shared_ptr<const ReferenceExecutor>)
const override
1180 void run(std::shared_ptr<const CudaExecutor>)
const override
1185 void run(std::shared_ptr<const HipExecutor>)
const override
1190 void run(std::shared_ptr<const DpcppExecutor>)
const override
// Returns the operation name as a C string obtained from `name_.c_str()`.
// The returned pointer refers to storage owned by `name_`, so it must not be
// used after this object has been destroyed.
1195 const char* get_name() const noexcept
override {
return name_.c_str(); }
1199 ClosureReference op_ref_;
1201 ClosureCuda op_cuda_;
1203 ClosureDpcpp op_dpcpp_;
1216template <
typename T>
1243 std::shared_ptr<const Executor> exec_;
1247template <
typename T>
1248class executor_deleter<T[]> {
1250 using pointer = T[];
1252 explicit executor_deleter(std::shared_ptr<const Executor> exec)
1256 void operator()(pointer ptr)
const
1264 std::shared_ptr<const Executor> exec_;
1271template <
typename ConcreteExecutor>
1272class ExecutorBase :
public Executor {
1275 friend class ::gko::OmpExecutor;
1276 friend class ::gko::HipExecutor;
1277 friend class ::gko::DpcppExecutor;
1278 friend class ::gko::CudaExecutor;
1279 friend class ::gko::ReferenceExecutor;
1282 void run(
const Operation& op)
const override
1285 auto scope_guard = get_scoped_device_id_guard();
1286 op.run(self()->shared_from_this());
1291 void raw_copy_from(
const Executor* src_exec, size_type n_bytes,
1292 const void* src_ptr,
void* dest_ptr)
const override
1294 src_exec->raw_copy_to(self(), n_bytes, src_ptr, dest_ptr);
1297 virtual bool verify_memory_from(
const Executor* src_exec)
const override
1299 return src_exec->verify_memory_to(self());
1303 ConcreteExecutor* self() noexcept
1305 return static_cast<ConcreteExecutor*
>(
this);
1308 const ConcreteExecutor* self() const noexcept
1310 return static_cast<const ConcreteExecutor*
>(
this);
1314#undef GKO_DECLARE_EXECUTOR_FRIEND
1324class EnableDeviceReset {
1332 "device_reset is no longer supported, call "
1333 "cudaDeviceReset/hipDeviceReset manually")
1334 void set_device_reset(
bool device_reset) {}
1342 "device_reset is no longer supported, call "
1343 "cudaDeviceReset/hipDeviceReset manually")
1344 bool get_device_reset() {
return false; }
1352 EnableDeviceReset() {}
1355 "device_reset is no longer supported, call "
1356 "cudaDeviceReset/hipDeviceReset manually")
1357 EnableDeviceReset(
bool device_reset) {}
// Helper macro used inside the concrete executor classes: expands to the
// declaration of a const `raw_copy_to` override whose destination executor is
// a `const _executor_type*`; extra macro arguments are ignored. The classes
// below apply it through GKO_ENABLE_FOR_ALL_EXECUTORS, and it is undefined
// near the end of the file.
1364#define GKO_OVERRIDE_RAW_COPY_TO(_executor_type, ...) \
1365 void raw_copy_to(const _executor_type* dest_exec, size_type n_bytes, \
1366 const void* src_ptr, void* dest_ptr) const override
1369#define GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(dest_, bool_) \
1370 virtual bool verify_memory_to(const dest_* other) const override \
1374 static_assert(true, \
1375 "This assert is used to counter the false positive extra " \
1376 "semi-colon warnings")
1386class OmpExecutor :
public detail::ExecutorBase<OmpExecutor>,
1387 public std::enable_shared_from_this<OmpExecutor> {
1388 friend class detail::ExecutorBase<OmpExecutor>;
1397 std::shared_ptr<CpuAllocatorBase>
alloc =
1398 std::make_shared<CpuAllocator>())
1400 return std::shared_ptr<OmpExecutor>(
new OmpExecutor(std::move(
alloc)));
1409 int get_num_cores()
const
1411 return this->get_exec_info().num_computing_units;
1414 int get_num_threads_per_core()
const
1416 return this->get_exec_info().num_pu_per_cu;
1419 static int get_num_omp_threads();
1421 scoped_device_id_guard get_scoped_device_id_guard()
const override;
1426 OmpExecutor(std::shared_ptr<CpuAllocatorBase>
alloc)
1434 void* raw_alloc(
size_type size)
const override;
1436 void raw_free(
void* ptr)
const noexcept override;
1438 GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_OVERRIDE_RAW_COPY_TO);
1440 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(
OmpExecutor,
true);
1444 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(
HipExecutor,
false);
1446 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(
CudaExecutor,
false);
1448 bool verify_memory_to(
const DpcppExecutor* dest_exec)
const override;
1450 std::shared_ptr<CpuAllocatorBase> alloc_;
1456using DefaultExecutor = OmpExecutor;
1468class ReferenceExecutor :
public OmpExecutor {
1472 static std::shared_ptr<ReferenceExecutor> create(
1473 std::shared_ptr<CpuAllocatorBase>
alloc =
1474 std::make_shared<CpuAllocator>())
1476 return std::shared_ptr<ReferenceExecutor>(
1477 new ReferenceExecutor(std::move(
alloc)));
1490 op.run(std::static_pointer_cast<const ReferenceExecutor>(
1491 this->shared_from_this()));
1499 this->ReferenceExecutor::populate_exec_info(
1503 void populate_exec_info(
const machine_topology*)
override
1505 this->get_exec_info().device_id = -1;
1506 this->get_exec_info().num_computing_units = 1;
1507 this->get_exec_info().num_pu_per_cu = 1;
1510 bool verify_memory_from(
const Executor* src_exec)
const override
1512 return src_exec->verify_memory_to(
this);
1515 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(ReferenceExecutor,
true);
1517 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(OmpExecutor,
false);
1519 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(DpcppExecutor,
false);
1521 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(CudaExecutor,
false);
1523 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(HipExecutor,
false);
1528namespace reference {
1529using DefaultExecutor = ReferenceExecutor;
1540class CudaExecutor :
public detail::ExecutorBase<CudaExecutor>,
1541 public std::enable_shared_from_this<CudaExecutor>,
1542 public detail::EnableDeviceReset {
1543 friend class detail::ExecutorBase<CudaExecutor>;
1560 "calling this CudaExecutor::create method is deprecated, because"
1561 "device_reset no longer has an effect"
1562 "call CudaExecutor::create("
1563 " int device_id, std::shared_ptr<Executor> master,"
1564 " std::shared_ptr<CudaAllocatorBase> alloc,"
1565 " CUstream_st* stream);"
1568 int device_id,
std::shared_ptr<
Executor> master,
bool device_reset,
1570 CUstream_st* stream =
nullptr);
1585 CUstream_st* stream =
nullptr);
1602 return this->get_exec_info().device_id;
1615 return this->get_exec_info().num_pu_per_cu;
1623 return this->get_exec_info().num_computing_units;
1631 return this->get_exec_info().num_computing_units *
1632 this->get_exec_info().num_pu_per_cu;
1640 return this->get_exec_info().max_subgroup_size;
1648 return this->get_exec_info().major;
1656 return this->get_exec_info().minor;
1664 GKO_DEPRECATED(
"use get_blas_handle() instead")
1677 GKO_DEPRECATED(
"use get_sparselib_handle() instead")
1688 return cusparse_handle_.get();
1698 return this->get_exec_info().closest_pu_ids;
1717 void set_gpu_property();
1719 void init_handles();
1721 CudaExecutor(
int device_id, std::shared_ptr<Executor> master,
1722 std::shared_ptr<CudaAllocatorBase> alloc, CUstream_st* stream)
1723 : master_(master), alloc_{
std::move(alloc)}, stream_{stream}
1725 this->get_exec_info().device_id = device_id;
1726 this->get_exec_info().num_computing_units = 0;
1727 this->get_exec_info().num_pu_per_cu = 0;
1728 this->CudaExecutor::populate_exec_info(
1730 this->set_gpu_property();
1731 this->init_handles();
1734 void* raw_alloc(
size_type size)
const override;
1736 void raw_free(
void* ptr)
const noexcept override;
1738 GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_OVERRIDE_RAW_COPY_TO);
1740 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(OmpExecutor,
false);
1742 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(ReferenceExecutor,
false);
1744 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(DpcppExecutor,
false);
1746 bool verify_memory_to(
const HipExecutor* dest_exec)
const override;
1748 bool verify_memory_to(
const CudaExecutor* dest_exec)
const override;
1750 void populate_exec_info(
const machine_topology* mach_topo)
override;
1753 std::shared_ptr<Executor> master_;
1755 template <
typename T>
1756 using handle_manager = std::unique_ptr<T, std::function<void(T*)>>;
1757 handle_manager<cublasContext> cublas_handle_;
1758 handle_manager<cusparseContext> cusparse_handle_;
1759 std::shared_ptr<CudaAllocatorBase> alloc_;
1760 CUstream_st* stream_;
1766using DefaultExecutor = CudaExecutor;
1777class HipExecutor :
public detail::ExecutorBase<HipExecutor>,
1778 public std::enable_shared_from_this<HipExecutor>,
1779 public detail::EnableDeviceReset {
1780 friend class detail::ExecutorBase<HipExecutor>;
1797 "device_reset is deprecated entirely, call hipDeviceReset directly. "
1798 "alloc_mode was replaced by the Allocator type "
1801 int device_id,
std::shared_ptr<
Executor> master,
bool device_reset,
1803 GKO_HIP_STREAM_STRUCT* stream =
nullptr);
1805 static
std::shared_ptr<HipExecutor>
create(
1809 GKO_HIP_STREAM_STRUCT* stream =
nullptr);
1826 return this->get_exec_info().device_id;
1839 return this->get_exec_info().num_pu_per_cu;
1847 return this->get_exec_info().num_computing_units;
1855 return this->get_exec_info().major;
1863 return this->get_exec_info().minor;
1871 return this->get_exec_info().num_computing_units *
1872 this->get_exec_info().num_pu_per_cu;
1880 return this->get_exec_info().max_subgroup_size;
1888 GKO_DEPRECATED(
"use get_blas_handle() instead")
1901 GKO_DEPRECATED(
"use get_sparselib_handle() instead")
1912 return hipsparse_handle_.get();
1929 return this->get_exec_info().closest_pu_ids;
// Returns the raw stream pointer stored in `stream_`, which the constructor
// initializes from its `stream` argument. The pointer is handed out as-is.
1932 GKO_HIP_STREAM_STRUCT* get_stream()
const {
return stream_; }
1935 void set_gpu_property();
1937 void init_handles();
1939 HipExecutor(
int device_id, std::shared_ptr<Executor> master,
1940 std::shared_ptr<HipAllocatorBase> alloc,
1941 GKO_HIP_STREAM_STRUCT* stream)
1942 : master_{
std::move(master)}, alloc_{
std::move(alloc)}, stream_{stream}
1944 this->get_exec_info().device_id = device_id;
1945 this->get_exec_info().num_computing_units = 0;
1946 this->get_exec_info().num_pu_per_cu = 0;
1947 this->HipExecutor::populate_exec_info(machine_topology::get_instance());
1948 this->set_gpu_property();
1949 this->init_handles();
1952 void* raw_alloc(size_type size)
const override;
1954 void raw_free(
void* ptr)
const noexcept override;
1956 GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_OVERRIDE_RAW_COPY_TO);
1958 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(OmpExecutor,
false);
1960 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(ReferenceExecutor,
false);
1962 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(DpcppExecutor,
false);
1964 bool verify_memory_to(
const CudaExecutor* dest_exec)
const override;
1966 bool verify_memory_to(
const HipExecutor* dest_exec)
const override;
1968 void populate_exec_info(
const machine_topology* mach_topo)
override;
1971 std::shared_ptr<Executor> master_;
1973 template <
typename T>
1974 using handle_manager = std::unique_ptr<T, std::function<void(T*)>>;
1975 handle_manager<hipblasContext> hipblas_handle_;
1976 handle_manager<hipsparseContext> hipsparse_handle_;
1977 std::shared_ptr<HipAllocatorBase> alloc_;
1978 GKO_HIP_STREAM_STRUCT* stream_;
1984using DefaultExecutor = HipExecutor;
1995class DpcppExecutor :
public detail::ExecutorBase<DpcppExecutor>,
1996 public std::enable_shared_from_this<DpcppExecutor> {
1997 friend class detail::ExecutorBase<DpcppExecutor>;
2011 static std::shared_ptr<DpcppExecutor>
create(
2012 int device_id, std::shared_ptr<Executor> master,
2013 std::string device_type =
"all",
2014 dpcpp_queue_property property = dpcpp_queue_property::in_order);
2033 return this->get_exec_info().device_id;
// Returns a non-owning raw pointer to the SYCL queue held by `queue_`
// (a std::unique_ptr with a std::function deleter); ownership stays with the
// executor, so the caller must not delete the returned pointer.
2036 sycl::queue* get_queue()
const {
return queue_.get(); }
2054 return this->get_exec_info().subgroup_sizes;
2064 return this->get_exec_info().num_computing_units;
2072 return this->get_exec_info().num_computing_units *
2073 this->get_exec_info().num_pu_per_cu;
2083 return this->get_exec_info().max_workitem_sizes;
2093 return this->get_exec_info().max_workgroup_size;
2103 return this->get_exec_info().max_subgroup_size;
2113 return this->get_exec_info().device_type;
2117 void set_device_property(
2118 dpcpp_queue_property property = dpcpp_queue_property::in_order);
2121 int device_id, std::shared_ptr<Executor> master,
2122 std::string device_type =
"all",
2123 dpcpp_queue_property property = dpcpp_queue_property::in_order)
2126 std::for_each(device_type.begin(), device_type.end(),
2127 [](
char& c) { c = std::tolower(c); });
2128 this->get_exec_info().device_type = std::string(device_type);
2129 this->get_exec_info().device_id = device_id;
2130 this->set_device_property(property);
2133 void populate_exec_info(
const machine_topology* mach_topo)
override;
2135 void* raw_alloc(size_type size)
const override;
2137 void raw_free(
void* ptr)
const noexcept override;
2139 GKO_ENABLE_FOR_ALL_EXECUTORS(GKO_OVERRIDE_RAW_COPY_TO);
2141 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(CudaExecutor,
false);
2143 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(HipExecutor,
false);
2145 GKO_DEFAULT_OVERRIDE_VERIFY_MEMORY(ReferenceExecutor,
false);
2147 bool verify_memory_to(
const OmpExecutor* dest_exec)
const override;
2149 bool verify_memory_to(
const DpcppExecutor* dest_exec)
const override;
2152 std::shared_ptr<Executor> master_;
2154 template <
typename T>
2155 using queue_manager = std::unique_ptr<T, std::function<void(T*)>>;
2156 queue_manager<sycl::queue> queue_;
2162using DefaultExecutor = DpcppExecutor;
2167#undef GKO_OVERRIDE_RAW_COPY_TO
Implement this interface to provide an allocator for CudaExecutor.
Definition memory.hpp:40
Allocator using cudaMalloc.
Definition memory.hpp:102
This is the Executor subclass which represents the CUDA device.
Definition executor.hpp:1542
static std::shared_ptr< CudaExecutor > create(int device_id, std::shared_ptr< Executor > master, bool device_reset, allocation_mode alloc_mode=default_cuda_alloc_mode, CUstream_st *stream=nullptr)
Creates a new CudaExecutor.
cusparseContext * get_sparselib_handle() const
Get the cusparse handle for this executor.
Definition executor.hpp:1686
cublasContext * get_blas_handle() const
Get the cublas handle for this executor.
Definition executor.hpp:1670
std::vector< int > get_closest_pus() const
Get the closest PUs.
Definition executor.hpp:1696
int get_closest_numa() const
Get the closest NUMA node.
Definition executor.hpp:1706
int get_num_warps_per_sm() const noexcept
Get the number of warps per SM of this executor.
Definition executor.hpp:1613
int get_major_version() const noexcept
Get the major version of compute capability.
Definition executor.hpp:1646
int get_num_multiprocessor() const noexcept
Get the number of multiprocessors of this executor.
Definition executor.hpp:1621
std::shared_ptr< Executor > get_master() noexcept override
Returns the master OmpExecutor of this Executor.
int get_warp_size() const noexcept
Get the warp size of this executor.
Definition executor.hpp:1638
cusparseContext * get_cusparse_handle() const
Get the cusparse handle for this executor.
Definition executor.hpp:1678
int get_device_id() const noexcept
Get the CUDA device id of the device associated to this executor.
Definition executor.hpp:1600
void synchronize() const override
Synchronize the operations launched on the executor with its master.
CUstream_st * get_stream() const
Returns the CUDA stream used by this executor.
Definition executor.hpp:1714
int get_minor_version() const noexcept
Get the minor version of compute capability.
Definition executor.hpp:1654
int get_num_warps() const noexcept
Get the number of warps of this executor.
Definition executor.hpp:1629
cublasContext * get_cublas_handle() const
Get the cublas handle for this executor.
Definition executor.hpp:1665
std::string get_description() const override
static int get_num_devices()
Get the number of devices present on the system.
This is the Executor subclass which represents a DPC++ enhanced device.
Definition executor.hpp:1996
const std::vector< int > & get_subgroup_sizes() const noexcept
Get the available subgroup sizes for this device.
Definition executor.hpp:2052
int get_num_computing_units() const noexcept
Get the number of Computing Units of this executor.
Definition executor.hpp:2062
int get_max_workgroup_size() const noexcept
Get the maximum workgroup size.
Definition executor.hpp:2091
std::string get_description() const override
int get_num_subgroups() const noexcept
Get the number of subgroups of this executor.
Definition executor.hpp:2070
void synchronize() const override
Synchronize the operations launched on the executor with its master.
const std::vector< int > & get_max_workitem_sizes() const noexcept
Get the maximum work item sizes.
Definition executor.hpp:2081
int get_max_subgroup_size() const noexcept
Get the maximum subgroup size.
Definition executor.hpp:2101
std::shared_ptr< Executor > get_master() noexcept override
Returns the master OmpExecutor of this Executor.
std::string get_device_type() const noexcept
Get a string representing the device type.
Definition executor.hpp:2111
static int get_num_devices(std::string device_type)
Get the number of devices present on the system.
int get_device_id() const noexcept
Get the DPCPP device id of the device associated to this executor.
Definition executor.hpp:2031
static std::shared_ptr< DpcppExecutor > create(int device_id, std::shared_ptr< Executor > master, std::string device_type="all", dpcpp_queue_property property=dpcpp_queue_property::in_order)
Creates a new DpcppExecutor.
The first step in using the Ginkgo library consists of creating an executor.
Definition executor.hpp:615
void free(void *ptr) const noexcept
Frees memory previously allocated with Executor::alloc().
Definition executor.hpp:722
virtual void run(const Operation &op) const =0
Runs the specified Operation using this Executor.
virtual void synchronize() const =0
Synchronize the operations launched on the executor with its master.
bool should_propagate_log() const
Returns true iff events occurring at an object created on this executor should be logged at propagati...
Definition executor.hpp:879
bool memory_accessible(const std::shared_ptr< const Executor > &other) const
Verifies whether the executors share the same memory.
Definition executor.hpp:892
void copy(size_type num_elems, const T *src_ptr, T *dest_ptr) const
Copies data within this Executor.
Definition executor.hpp:794
void add_logger(std::shared_ptr< const log::Logger > logger) override
Definition executor.hpp:838
void copy_from(ptr_param< const Executor > src_exec, size_type num_elems, const T *src_ptr, T *dest_ptr) const
Copies data from another Executor.
Definition executor.hpp:744
void set_log_propagation_mode(log_propagation_mode mode)
Sets the logger event propagation mode for the executor.
Definition executor.hpp:867
T * alloc(size_type num_elems) const
Allocates memory in this Executor.
Definition executor.hpp:705
virtual std::string get_description() const =0
virtual std::shared_ptr< Executor > get_master() noexcept=0
Returns the master OmpExecutor of this Executor.
T copy_val_to_host(const T *ptr) const
Retrieves a single element at the given location from executor memory.
Definition executor.hpp:809
void run(std::string name, const ClosureReference &op_ref, const ClosureOmp &op_omp, const ClosureCuda &op_cuda, const ClosureHip &op_hip, const ClosureDpcpp &op_dpcpp) const
Runs one of the passed in functors, depending on the Executor type.
Definition executor.hpp:683
void remove_logger(const log::Logger *logger) override
Definition executor.hpp:851
Implement this interface to provide an allocator for HipExecutor.
Definition memory.hpp:65
Definition memory.hpp:172
This is the Executor subclass which represents the HIP enhanced device.
Definition executor.hpp:1779
void synchronize() const override
Synchronize the operations launched on the executor with its master.
std::string get_description() const override
int get_device_id() const noexcept
Get the HIP device id of the device associated to this executor.
Definition executor.hpp:1824
int get_num_warps_per_sm() const noexcept
Get the number of warps per SM of this executor.
Definition executor.hpp:1837
int get_major_version() const noexcept
Get the major version of compute capability.
Definition executor.hpp:1853
std::vector< int > get_closest_pus() const
Get the closest PUs.
Definition executor.hpp:1927
int get_minor_version() const noexcept
Get the minor version of compute capability.
Definition executor.hpp:1861
static int get_num_devices()
Get the number of devices present on the system.
static std::shared_ptr< HipExecutor > create(int device_id, std::shared_ptr< Executor > master, bool device_reset, allocation_mode alloc_mode=default_hip_alloc_mode, CUstream_st *stream=nullptr)
Creates a new HipExecutor.
hipsparseContext * get_sparselib_handle() const
Get the hipsparse handle for this executor.
Definition executor.hpp:1910
std::shared_ptr< Executor > get_master() noexcept override
Returns the master OmpExecutor of this Executor.
int get_num_multiprocessor() const noexcept
Get the number of multiprocessors of this executor.
Definition executor.hpp:1845
hipblasContext * get_blas_handle() const
Get the hipblas handle for this executor.
Definition executor.hpp:1894
int get_num_warps() const noexcept
Get the number of warps of this executor.
Definition executor.hpp:1869
hipsparseContext * get_hipsparse_handle() const
Get the hipsparse handle for this executor.
Definition executor.hpp:1902
int get_closest_numa() const
Get the closest NUMA node.
Definition executor.hpp:1920
int get_warp_size() const noexcept
Get the warp size of this executor.
Definition executor.hpp:1878
hipblasContext * get_hipblas_handle() const
Get the hipblas handle for this executor.
Definition executor.hpp:1889
NotSupported is thrown in case it is not possible to perform the requested operation on the given obj...
Definition exception.hpp:127
This is the Executor subclass which represents the OpenMP device (typically CPU).
Definition executor.hpp:1387
std::string get_description() const override
std::shared_ptr< Executor > get_master() noexcept override
Returns the master OmpExecutor of this Executor.
void synchronize() const override
Synchronize the operations launched on the executor with its master.
static std::shared_ptr< OmpExecutor > create(std::shared_ptr< CpuAllocatorBase > alloc=std::make_shared< CpuAllocator >())
Creates a new OmpExecutor.
Definition executor.hpp:1396
Operations can be used to define functionalities whose implementations differ among devices.
Definition executor.hpp:258
virtual const char * get_name() const noexcept
Returns the operation's name.
This is a specialization of the OmpExecutor, which runs the reference implementations of the kernels ...
Definition executor.hpp:1468
void run(const Operation &op) const override
Runs the specified Operation using this Executor.
Definition executor.hpp:1487
std::string get_description() const override
Definition executor.hpp:1485
executor_deleter(std::shared_ptr< const Executor > exec)
Creates a new deleter.
Definition executor.hpp:1226
void operator()(pointer ptr) const
Deletes the object.
Definition executor.hpp:1235
EnableLogging is a mixin which should be inherited by any class which wants to enable logging.
Definition logger.hpp:760
virtual bool needs_propagation() const
Returns true if this logger, when attached to an Executor, needs to be forwarded all events from obje...
Definition logger.hpp:654
The machine topology class represents the hierarchical topology of a machine, including NUMA nodes,...
Definition machine_topology.hpp:61
static machine_topology * get_instance()
Returns an instance of the machine_topology object.
Definition machine_topology.hpp:182
This class is used for function parameters in the place of raw pointers.
Definition utils_helper.hpp:41
T * get() const
Definition utils_helper.hpp:75
This move-only class uses RAII to set the device id within a scoped block, if necessary.
Definition scoped_device_id_guard.hpp:76
The logger namespace.
Definition convergence.hpp:22
The Ginkgo namespace.
Definition abstract_factory.hpp:20
std::uintptr_t uintptr
Unsigned integer type capable of holding a pointer to void.
Definition types.hpp:141
std::size_t size_type
Integral type used for allocation quantities.
Definition types.hpp:89
log_propagation_mode
How Logger events are propagated to their Executor.
Definition executor.hpp:34
@ automatic
Events get reported to loggers attached to the triggering object and propagating loggers (Logger::nee...
Definition executor.hpp:46
@ never
Events only get reported at loggers attached to the triggering object.
Definition executor.hpp:40
allocation_mode
Specify the mode of allocation for CUDA/HIP GPUs.
Definition executor.hpp:62