The CB-GMRES solver example.
Introduction
About the example
This example showcases the usage of the Ginkgo solver CB-GMRES (Compressed Basis GMRES). A small system is solved with two un-preconditioned CB-GMRES solvers:
- without compressing the krylov basis; it uses double precision for both the matrix and the krylov basis, and
- with a compression of the krylov basis; it uses double precision for the matrix and all arithmetic operations, while using single precision for the storage of the krylov basis
Both solves are timed and the residual norm of each solution is computed to show that both solutions are correct.
The commented program
Make sure all previous executor operations have finished before starting the time
exec->synchronize();
auto tic = std::chrono::steady_clock::now();
solver->apply(b, x_copy);
Make sure all computations are done before stopping the time
exec->synchronize();
auto tac = std::chrono::steady_clock::now();
duration += std::chrono::duration<double>(tac - tic).count();
}
Copy the solution back to x, so the caller has the result
x->copy_from(x_copy);
return duration / static_cast<double>(repeats);
}
int main(int argc, char* argv[])
{
Use some shortcuts. In Ginkgo, vectors are seen as a gko::matrix::Dense with one column/one row. The advantage of this concept is that using multiple vectors is a now a natural extension of adding columns/rows are necessary.
using ValueType = double;
using IndexType = int;
Dense is a matrix format which explicitly stores all values of the matrix.
Definition dense.hpp:117
typename detail::remove_complex_s< T >::type remove_complex
Obtain the type which removed the complex of complex/scalar type or the template parameter of class b...
Definition math.hpp:260
The gko::matrix::Csr class is used here, but any other matrix class such as gko::matrix::Coo, gko::matrix::Hybrid, gko::matrix::Ell or gko::matrix::Sellp could also be used.
CSR is a matrix format which stores only the nonzero coefficients by compressing each row of the matr...
Definition csr.hpp:121
The gko::solver::CbGmres is used here, but any other solver class can also be used.
CB-GMRES or the compressed basis generalized minimal residual method is an iterative type Krylov subs...
Definition cb_gmres.hpp:98
Print the ginkgo version information.
if (argc == 2 && (std::string(argv[1]) == "--help")) {
std::cerr << "Usage: " << argv[0] << " [executor] " << std::endl;
std::exit(-1);
}
static const version_info & get()
Returns an instance of version_info.
Definition version.hpp:139
Map which generates the appropriate executor
const auto executor_string = argc >= 2 ? argv[1] : "reference";
std::map<std::string, std::function<std::shared_ptr<gko::Executor>()>>
exec_map{
{"cuda",
[] {
}},
{"hip",
[] {
}},
{"dpcpp",
[] {
}},
{"reference", [] { return gko::ReferenceExecutor::create(); }}};
static std::shared_ptr< CudaExecutor > create(int device_id, std::shared_ptr< Executor > master, bool device_reset, allocation_mode alloc_mode=default_cuda_alloc_mode, CUstream_st *stream=nullptr)
Creates a new CudaExecutor.
static std::shared_ptr< DpcppExecutor > create(int device_id, std::shared_ptr< Executor > master, std::string device_type="all", dpcpp_queue_property property=dpcpp_queue_property::in_order)
Creates a new DpcppExecutor.
static std::shared_ptr< HipExecutor > create(int device_id, std::shared_ptr< Executor > master, bool device_reset, allocation_mode alloc_mode=default_hip_alloc_mode, CUstream_st *stream=nullptr)
Creates a new HipExecutor.
static std::shared_ptr< OmpExecutor > create(std::shared_ptr< CpuAllocatorBase > alloc=std::make_shared< CpuAllocator >())
Creates a new OmpExecutor.
Definition executor.hpp:1396
executor where Ginkgo will perform the computation
const auto exec = exec_map.at(executor_string)();
Note: this matrix is copied from "SOURCE_DIR/matrices" instead of from the local directory. For details, see "examples/cb-gmres/CMakeLists.txt"
std::unique_ptr< MatrixType > read(StreamType &&is, MatrixArgs &&... args)
Reads a matrix stored in matrix market format from an input stream.
Definition mtx_io.hpp:159
Create a uniform right-hand side with a norm2 of 1 on the host (norm2(b) == 1), followed by copying it to the actual executor (to make sure it also works for GPUs)
const auto A_size = A->get_size();
auto b_host = vec::create(exec->get_master(),
gko::dim<2>{A_size[0], 1});
b_host->at(i, 0) =
ValueType{1} / std::sqrt(static_cast<ValueType>(A_size[0]));
}
b_host->compute_norm2(b_norm);
auto b =
clone(exec, b_host);
std::unique_ptr< Matrix > initialize(size_type stride, std::initializer_list< typename Matrix::value_type > vals, std::shared_ptr< const Executor > exec, TArgs &&... create_args)
Creates and initializes a column-vector.
Definition dense.hpp:1565
std::size_t size_type
Integral type used for allocation quantities.
Definition types.hpp:89
detail::cloned_type< Pointer > clone(const Pointer &p)
Creates a unique clone of the object pointed to by p.
Definition utils_helper.hpp:173
A type representing the dimensions of a multidimensional object.
Definition dim.hpp:26
As an initial guess, use the right-hand side
auto x_keep = clone(b);
auto x_reduce = clone(x_keep);
const RealValueType reduction_factor{1e-6};
Generate two solver factories: _keep
uses the same precision for the krylov basis as the matrix, and _reduce
uses one precision below it. If ValueType
is double, then _reduce
uses float as the krylov basis storage type
auto solver_gen_keep =
cb_gmres::build()
.with_criteria(gko::stop::Iteration::build().with_max_iters(1000u),
gko::stop::ResidualNorm<ValueType>::build()
.with_baseline(gko::stop::mode::rhs_norm)
.with_reduction_factor(reduction_factor))
.with_krylov_dim(100u)
.with_storage_precision(
gko::solver::cb_gmres::storage_precision::keep)
.on(exec);
auto solver_gen_reduce =
cb_gmres::build()
.with_criteria(gko::stop::Iteration::build().with_max_iters(1000u),
gko::stop::ResidualNorm<ValueType>::build()
.with_baseline(gko::stop::mode::rhs_norm)
.with_reduction_factor(reduction_factor))
.with_krylov_dim(100u)
.with_storage_precision(
gko::solver::cb_gmres::storage_precision::reduce1)
.on(exec);
Generate the actual solver from the factory and the matrix.
auto solver_keep = solver_gen_keep->generate(A);
auto solver_reduce = solver_gen_reduce->generate(A);
Solve both system and measure the time for each.
auto time_keep =
measure_solve_time_in_s(exec, solver_keep.get(), b.get(), x_keep.get());
auto time_reduce = measure_solve_time_in_s(exec, solver_reduce.get(),
b.get(), x_reduce.get());
Make sure the output is in scientific notation for easier comparison
std::cout << std::scientific;
Note: The time might not be significantly different since the matrix is quite small
std::cout << "Solve time without compression: " << time_keep << " s\n"
<< "Solve time with compression: " << time_reduce << " s\n";
To measure if your solution has actually converged, the error of the solution is measured. one, neg_one are objects that represent the numbers which allow for a uniform interface when computing on any device. To compute the residual, the (advanced) apply method is used.
tmp = Ax - tmp
A->apply(one, x_keep, neg_one, tmp);
tmp->compute_norm2(res_norm_keep);
std::cout << "\nResidual norm without compression:\n";
write(std::cout, res_norm_keep);
tmp->copy_from(b);
A->apply(one, x_reduce, neg_one, tmp);
tmp->compute_norm2(res_norm_reduce);
std::cout << "\nResidual norm with compression:\n";
write(std::cout, res_norm_reduce);
}
Results
The following is the expected result:
Solve time without compression: 1.842690e-04 s
Solve time with compression: 1.589936e-04 s
Residual norm without compression:
%%MatrixMarket matrix array real general
1 1
2.430544e-07
Residual norm with compression:
%%MatrixMarket matrix array real general
1 1
3.437257e-07
Comments about programming and debugging
The plain program
#include <chrono>
#include <cmath>
#include <fstream>
#include <iostream>
#include <map>
#include <string>
#include <ginkgo/ginkgo.hpp>
double measure_solve_time_in_s(std::shared_ptr<const gko::Executor> exec,
{
constexpr int repeats{5};
double duration{0};
for (int i = 0; i < repeats; ++i) {
if (i != 0) {
x_copy->copy_from(x);
}
exec->synchronize();
auto tic = std::chrono::steady_clock::now();
exec->synchronize();
auto tac = std::chrono::steady_clock::now();
duration += std::chrono::duration<double>(tac - tic).count();
}
x->copy_from(x_copy);
return duration / static_cast<double>(repeats);
}
int main(int argc, char* argv[])
{
using ValueType = double;
using IndexType = int;
if (argc == 2 && (std::string(argv[1]) == "--help")) {
std::cerr << "Usage: " << argv[0] << " [executor] " << std::endl;
std::exit(-1);
}
const auto executor_string = argc >= 2 ? argv[1] : "reference";
std::map<std::string, std::function<std::shared_ptr<gko::Executor>()>>
exec_map{
{"cuda",
[] {
}},
{"hip",
[] {
}},
{"dpcpp",
[] {
}},
{"reference", [] { return gko::ReferenceExecutor::create(); }}};
const auto exec = exec_map.at(executor_string)();
const auto A_size = A->get_size();
auto b_host = vec::create(exec->get_master(),
gko::dim<2>{A_size[0], 1});
b_host->at(i, 0) =
ValueType{1} / std::sqrt(static_cast<ValueType>(A_size[0]));
}
b_host->compute_norm2(b_norm);
auto b =
clone(exec, b_host);
auto x_reduce =
clone(x_keep);
const RealValueType reduction_factor{1e-6};
auto solver_gen_keep =
cb_gmres::build()
.with_criteria(gko::stop::Iteration::build().with_max_iters(1000u),
gko::stop::ResidualNorm<ValueType>::build()
.with_baseline(gko::stop::mode::rhs_norm)
.with_reduction_factor(reduction_factor))
.with_krylov_dim(100u)
.with_storage_precision(
gko::solver::cb_gmres::storage_precision::keep)
.on(exec);
auto solver_gen_reduce =
cb_gmres::build()
.with_criteria(gko::stop::Iteration::build().with_max_iters(1000u),
gko::stop::ResidualNorm<ValueType>::build()
.with_baseline(gko::stop::mode::rhs_norm)
.with_reduction_factor(reduction_factor))
.with_krylov_dim(100u)
.with_storage_precision(
gko::solver::cb_gmres::storage_precision::reduce1)
.on(exec);
auto solver_keep = solver_gen_keep->generate(A);
auto solver_reduce = solver_gen_reduce->generate(A);
auto time_keep =
measure_solve_time_in_s(exec, solver_keep.get(), b.get(), x_keep.get());
auto time_reduce = measure_solve_time_in_s(exec, solver_reduce.get(),
b.get(), x_reduce.get());
std::cout << std::scientific;
std::cout << "Solve time without compression: " << time_keep << " s\n"
<< "Solve time with compression: " << time_reduce << " s\n";
A->apply(one, x_keep, neg_one, tmp);
tmp->compute_norm2(res_norm_keep);
std::cout << "\nResidual norm without compression:\n";
write(std::cout, res_norm_keep);
tmp->copy_from(b);
A->apply(one, x_reduce, neg_one, tmp);
tmp->compute_norm2(res_norm_reduce);
std::cout << "\nResidual norm with compression:\n";
write(std::cout, res_norm_reduce);
}
Definition lin_op.hpp:117
@ solver
Solver events.
Definition profiler_hook.hpp:34
constexpr T one()
Returns the multiplicative identity for T.
Definition math.hpp:630
void write(StreamType &&os, MatrixPtrType &&matrix, layout_type layout=detail::mtx_io_traits< std::remove_cv_t< detail::pointee< MatrixPtrType > > >::default_layout)
Writes a matrix into an output stream in matrix market format.
Definition mtx_io.hpp:295
detail::shared_type< OwningPointer > share(OwningPointer &&p)
Marks the object pointed to by p as shared.
Definition utils_helper.hpp:224