284 for (
unsigned int face = range.first; face < range.second; ++face)
287 phi_m.gather_evaluate(src, face_evaluation_flags);
289 phi_p.gather_evaluate(src, face_evaluation_flags);
293 phi_m.integrate_scatter(face_evaluation_flags, dst);
294 phi_p.integrate_scatter(face_evaluation_flags, dst);
297 [&](
const auto &data,
auto &dst,
const auto &src,
const auto range) {
302 for (
unsigned int face = range.first; face < range.second; ++face)
305 phi_m.gather_evaluate(src, face_evaluation_flags);
309 phi_m.integrate_scatter(face_evaluation_flags, dst);
319matrix_free.template loop_cell_centric<VectorType, VectorType>(
320 [&](
const auto &data,
auto &dst,
const auto &src,
const auto range) {
325 for (
unsigned int cell = range.first; cell < range.second; ++cell)
328 phi.gather_evaluate(src, cell_evaluation_flags);
332 phi.integrate_scatter(cell_evaluation_flags, dst);
335 for (
unsigned int face = 0; face < GeometryInfo<dim>::faces_per_cell; ++face)
337 if (data.get_faces_by_cells_boundary_id(cell, face)[0] ==
341 phi_m.reinit(cell, face);
342 phi_m.gather_evaluate(src, face_evaluation_flags);
343 phi_p.reinit(cell, face);
344 phi_p.gather_evaluate(src, face_evaluation_flags);
348 phi_m.integrate_scatter(face_evaluation_flags, dst);
353 phi_m.reinit(cell, face);
354 phi_m.gather_evaluate(src, face_evaluation_flags);
358 phi_m.integrate_scatter(face_evaluation_flags, dst);
368the cell number and the local face number. The given example only
369highlights how to
transform face-centric loops into cell-centric loops and
370is by no means efficient, since data is read and written multiple times
371from and to the global vector as well as computations are performed
372redundantly. Below, we will discuss advanced techniques that target these issues.
383additional_data.mapping_update_flags_faces_by_cells =
384 additional_data.mapping_update_flags_inner_faces |
385 additional_data.mapping_update_flags_boundary_faces;
387data.reinit(
mapping, dof_handler, constraint, quadrature, additional_data);
390In particular, these flags enable that the
internal data structures are
set up
391for all faces of the cells.
393Currently, cell-centric loops in deal.II only work
for uniformly refined meshes
394and
if no constraints are applied (which is the standard
case DG is normally
398<a name=
"step_76-ProvidinglambdastoMatrixFreeloops"></a><h3>Providing lambdas to
MatrixFree loops</h3>
401The examples given above have already used lambdas, which have been provided to
403a version where a
class and a pointer to one of its methods are used and a
404variant where lambdas are utilized.
406In the following code, a class and a pointer to one of its methods, which should
411local_apply_cell(
const MatrixFree<dim, Number> & data,
413 const VectorType & src,
414 const std::pair<unsigned int, unsigned int> &range)
const
416 FEEvaluation<dim, degree, degree + 1, 1, Number> phi(data);
417 for (
unsigned int cell = range.first; cell < range.second; ++cell)
420 phi.gather_evaluate(src, cell_evaluation_flags);
424 phi.integrate_scatter(cell_evaluation_flags, dst);
430matrix_free.cell_loop(&Operator::local_apply_cell,
this, dst, src);
433However, it is also possible to pass an anonymous function via a
lambda function
437matrix_free.template cell_loop<VectorType, VectorType>(
438 [&](
const auto &data,
auto &dst,
const auto &src,
const auto range) {
439 FEEvaluation<dim, degree, degree + 1, 1, Number> phi(data);
440 for (
unsigned int cell = range.first; cell < range.second; ++cell)
443 phi.gather_evaluate(src, cell_evaluation_flags);
447 phi.integrate_scatter(cell_evaluation_flags, dst);
454<a name=
"step_76-VectorizedArrayType"></a><h3>VectorizedArrayType</h3>
457The
class VectorizedArray<Number> is a key component to achieve the high
458node-level performance of the
matrix-
free algorithms in deal.II.
459It is a wrapper class around a short vector of @f$n@f$ entries of type Number and
460maps arithmetic operations to appropriate single-instruction/multiple-data
461(SIMD) concepts by intrinsic
functions. The length of the vector can be
462queried by VectorizedArray::size() and its underlying number type by
465In the default case (<code>VectorizedArray<Number></code>), the vector length is
466set at compile time of the library to
467match the highest
value supported by the given processor architecture.
468However, also a second optional template argument can be
469specified as <code>VectorizedArray<Number, size></code>, where <code>size</code> explicitly
470controls the vector length within the capabilities of a particular instruction
471set. A full list of supported vector lengths is presented in the following table:
473<table align="center" class="doxtable">
480 <td><code>VectorizedArray<double, 1></code></td>
481 <td><code>VectorizedArray<float, 1></code></td>
482 <td>(auto-vectorization)</td>
485 <td><code>VectorizedArray<double, 2></code></td>
486 <td><code>VectorizedArray<float, 4></code></td>
487 <td>SSE2/AltiVec</td>
490 <td><code>VectorizedArray<double, 4></code></td>
491 <td><code>VectorizedArray<float, 8></code></td>
495 <td><code>VectorizedArray<double, 8></code></td>
496 <td><code>VectorizedArray<float, 16></code></td>
501This allows users to select the vector length/ISA and, as a consequence, the
502number of cells to be processed at once in
matrix-
free operator evaluations,
503possibly reducing the pressure on the caches, an severe issue for very high
504degrees (and dimensions).
506A possible further reason to
reduce the number of filled lanes
507is to simplify debugging: instead of having to look at,
e.g., 8
508cells, one can concentrate on a single cell.
510The interface of VectorizedArray also enables the replacement by any type with
511a
matching interface. Specifically, this prepares deal.II for the <code>std::simd</code>
512class that is planned to become part of the
C++23 standard. The following table
513compares the deal.II-specific SIMD classes and the equivalent
C++23 classes:
516<table align="center" class="doxtable">
518 <th>VectorizedArray (deal.II)</th>
519 <th>std::simd (C++23)</th>
522 <td><code>VectorizedArray<Number></code></td>
523 <td><code>std::experimental::native_simd<Number></code></td>
526 <td><code>VectorizedArray<Number, size></code></td>
527 <td><code>std::experimental::fixed_size_simd<Number, size></code></td>
532 * <a name="step_76-CommProg"></a>
533 * <h1> The commented program</h1>
536 * <a name="step_76-Parametersandutilityfunctions"></a>
537 * <h3>Parameters and utility
functions</h3>
541 * The same includes as in @ref step_67 "step-67":
544 * #include <deal.II/base/conditional_ostream.h>
545 * #include <deal.II/base/function.h>
546 * #include <deal.II/base/time_stepping.h>
547 * #include <deal.II/base/timer.h>
548 * #include <deal.II/base/utilities.h>
549 * #include <deal.II/base/vectorization.h>
551 * #include <deal.II/distributed/tria.h>
553 * #include <deal.II/dofs/dof_handler.h>
555 * #include <deal.II/fe/fe_dgq.h>
556 * #include <deal.II/fe/fe_system.h>
558 * #include <deal.II/grid/grid_generator.h>
559 * #include <deal.II/grid/tria.h>
560 * #include <deal.II/grid/tria_accessor.h>
561 * #include <deal.II/grid/tria_iterator.h>
563 * #include <deal.II/lac/affine_constraints.h>
564 * #include <deal.II/lac/la_parallel_vector.h>
566 * #include <deal.II/matrix_free/fe_evaluation.h>
567 * #include <deal.II/matrix_free/matrix_free.h>
568 * #include <deal.II/matrix_free/operators.h>
570 * #include <deal.II/numerics/data_out.h>
574 * #include <iostream>
578 * A new include for categorizing of cells according to their boundary IDs:
581 * #include <deal.II/matrix_free/tools.h>
587 *
using namespace dealii;
591 * The same input parameters as in @ref step_67
"step-67":
594 *
constexpr unsigned int testcase = 1;
596 *
constexpr unsigned int n_global_refinements = 2;
597 *
constexpr unsigned int fe_degree = 5;
598 *
constexpr unsigned int n_q_points_1d = fe_degree + 2;
602 * This parameter specifies the size of the shared-memory
group. Currently,
604 * to the options that the memory features can be turned off or all processes
605 * having access to the same shared-memory domain are grouped together.
610 *
using Number = double;
614 * Here, the type of the data structure is chosen
for vectorization. In the
615 *
default case, VectorizedArray<Number> is used, i.e., the highest
616 * instruction-
set-architecture extension available on the given hardware with
617 * the maximum number of vector lanes is used. However, one might
reduce
618 * the number of filled lanes,
e.g., by writing
619 * <code>
using VectorizedArrayType = VectorizedArray<Number, 4></code> to only
623 *
using VectorizedArrayType = VectorizedArray<Number>;
627 * The following parameters have not changed:
630 *
constexpr double gamma = 1.4;
631 *
constexpr double final_time = testcase == 0 ? 10 : 2.0;
632 *
constexpr double output_tick = testcase == 0 ? 1 : 0.05;
634 *
const double courant_number = 0.15 /
std::pow(fe_degree, 1.5);
638 * Specify
max number of time steps useful
for performance studies.
645 * Runge-Kutta-related
functions copied from @ref step_67
"step-67" and slightly modified
646 * with the purpose to minimize global vector access:
649 *
enum LowStorageRungeKuttaScheme
656 *
constexpr LowStorageRungeKuttaScheme lsrk_scheme = stage_5_order_4;
660 *
class LowStorageRungeKuttaIntegrator
663 * LowStorageRungeKuttaIntegrator(
const LowStorageRungeKuttaScheme scheme)
668 *
case stage_3_order_3:
671 *
case stage_5_order_4:
674 *
case stage_7_order_4:
677 *
case stage_9_order_5:
684 * TimeStepping::LowStorageRungeKutta<
685 * LinearAlgebra::distributed::Vector<Number>>
686 * rk_integrator(lsrk);
687 * std::vector<double> ci;
688 * rk_integrator.get_coefficients(ai, bi, ci);
691 *
unsigned int n_stages() const
696 *
template <
typename VectorType,
typename Operator>
697 *
void perform_time_step(
const Operator &pde_operator,
698 *
const double current_time,
699 *
const double time_step,
700 * VectorType &solution,
701 * VectorType &vec_ri,
702 * VectorType &vec_ki)
const
706 * vec_ki.swap(solution);
708 *
double sum_previous_bi = 0;
709 *
for (
unsigned int stage = 0; stage < bi.size(); ++stage)
711 *
const double c_i = stage == 0 ? 0 : sum_previous_bi + ai[stage - 1];
713 * pde_operator.perform_stage(stage,
714 * current_time + c_i * time_step,
715 * bi[stage] * time_step,
716 * (stage == bi.size() - 1 ?
718 * ai[stage] * time_step),
719 * (stage % 2 == 0 ? vec_ki : vec_ri),
720 * (stage % 2 == 0 ? vec_ri : vec_ki),
724 * sum_previous_bi += bi[stage - 1];
729 * std::vector<double> bi;
730 * std::vector<double> ai;
736 * Euler-specific utility
functions from @ref step_67
"step-67":
739 *
enum EulerNumericalFlux
741 * lax_friedrichs_modified,
742 * harten_lax_vanleer,
744 *
constexpr EulerNumericalFlux numerical_flux_type = lax_friedrichs_modified;
749 *
class ExactSolution :
public Function<dim>
752 * ExactSolution(
const double time)
753 * : Function<dim>(dim + 2, time)
756 *
virtual double value(
const Point<dim> &p,
757 *
const unsigned int component = 0)
const override;
763 *
double ExactSolution<dim>::value(
const Point<dim> &x,
764 *
const unsigned int component)
const
766 *
const double t = this->
get_time();
772 *
Assert(dim == 2, ExcNotImplemented());
773 *
const double beta = 5;
777 *
const double radius_sqr =
778 * (x - x0).norm_square() - 2. * (x[0] - x0[0]) * t + t * t;
779 *
const double factor =
781 *
const double density_log = std::log2(
782 *
std::abs(1. - (gamma - 1.) / gamma * 0.25 * factor * factor));
783 *
const double density = std::exp2(density_log * (1. / (gamma - 1.)));
784 *
const double u = 1. - factor * (x[1] - x0[1]);
785 *
const double v = factor * (x[0] - t - x0[0]);
787 *
if (component == 0)
789 *
else if (component == 1)
790 *
return density * u;
791 *
else if (component == 2)
792 *
return density * v;
795 *
const double pressure =
796 * std::exp2(density_log * (gamma / (gamma - 1.)));
797 *
return pressure / (
gamma - 1.) +
798 * 0.5 * (density * u * u + density * v * v);
804 *
if (component == 0)
806 *
else if (component == 1)
808 *
else if (component == dim + 1)
809 *
return 3.097857142857143;
822 *
template <
int dim,
typename Number>
824 * Tensor<1, dim, Number>
825 * euler_velocity(
const Tensor<1, dim + 2, Number> &conserved_variables)
827 *
const Number inverse_density = Number(1.) / conserved_variables[0];
829 * Tensor<1, dim, Number> velocity;
830 *
for (
unsigned int d = 0;
d < dim; ++
d)
831 * velocity[d] = conserved_variables[1 + d] * inverse_density;
836 *
template <
int dim,
typename Number>
839 * euler_pressure(
const Tensor<1, dim + 2, Number> &conserved_variables)
841 *
const Tensor<1, dim, Number> velocity =
842 * euler_velocity<dim>(conserved_variables);
844 * Number rho_u_dot_u = conserved_variables[1] * velocity[0];
845 *
for (
unsigned int d = 1;
d < dim; ++
d)
846 * rho_u_dot_u += conserved_variables[1 + d] * velocity[d];
848 *
return (gamma - 1.) * (conserved_variables[dim + 1] - 0.5 * rho_u_dot_u);
851 *
template <
int dim,
typename Number>
853 * Tensor<1, dim + 2, Tensor<1, dim, Number>>
854 * euler_flux(
const Tensor<1, dim + 2, Number> &conserved_variables)
856 *
const Tensor<1, dim, Number> velocity =
857 * euler_velocity<dim>(conserved_variables);
858 *
const Number pressure = euler_pressure<dim>(conserved_variables);
860 * Tensor<1, dim + 2, Tensor<1, dim, Number>> flux;
861 *
for (
unsigned int d = 0;
d < dim; ++
d)
863 * flux[0][
d] = conserved_variables[1 +
d];
864 *
for (
unsigned int e = 0;
e < dim; ++
e)
865 * flux[e + 1][d] = conserved_variables[e + 1] * velocity[d];
866 * flux[
d + 1][
d] += pressure;
868 * velocity[
d] * (conserved_variables[dim + 1] + pressure);
874 *
template <
int n_components,
int dim,
typename Number>
876 * Tensor<1, n_components, Number>
877 *
operator*(
const Tensor<1, n_components, Tensor<1, dim, Number>> &matrix,
878 *
const Tensor<1, dim, Number> &vector)
880 * Tensor<1, n_components, Number> result;
881 *
for (
unsigned int d = 0;
d < n_components; ++
d)
882 * result[d] = matrix[d] * vector;
886 *
template <
int dim,
typename Number>
888 * Tensor<1, dim + 2, Number>
889 * euler_numerical_flux(
const Tensor<1, dim + 2, Number> &u_m,
890 *
const Tensor<1, dim + 2, Number> &u_p,
891 *
const Tensor<1, dim, Number> &normal)
893 *
const auto velocity_m = euler_velocity<dim>(u_m);
894 *
const auto velocity_p = euler_velocity<dim>(u_p);
896 *
const auto pressure_m = euler_pressure<dim>(u_m);
897 *
const auto pressure_p = euler_pressure<dim>(u_p);
899 *
const auto flux_m = euler_flux<dim>(u_m);
900 *
const auto flux_p = euler_flux<dim>(u_p);
902 *
switch (numerical_flux_type)
904 *
case lax_friedrichs_modified:
908 * gamma * pressure_p * (1. / u_p[0]),
909 * velocity_m.norm_square() +
910 * gamma * pressure_m * (1. / u_m[0])));
912 *
return 0.5 * (flux_m * normal + flux_p * normal) +
913 * 0.5 * lambda * (u_m - u_p);
916 *
case harten_lax_vanleer:
918 *
const auto avg_velocity_normal =
919 * 0.5 * ((velocity_m + velocity_p) * normal);
922 * (pressure_p * (1. / u_p[0]) + pressure_m * (1. / u_m[0]))));
923 *
const Number s_pos =
924 *
std::max(Number(), avg_velocity_normal + avg_c);
925 *
const Number s_neg =
926 *
std::min(Number(), avg_velocity_normal - avg_c);
927 *
const Number inverse_s = Number(1.) / (s_pos - s_neg);
930 * ((s_pos * (flux_m * normal) - s_neg * (flux_p * normal)) -
931 * s_pos * s_neg * (u_m - u_p));
946 * General-purpose utility
functions from @ref step_67
"step-67":
949 *
template <
int dim,
typename VectorizedArrayType>
950 * VectorizedArrayType
951 * evaluate_function(
const Function<dim> &function,
952 *
const Point<dim, VectorizedArrayType> &p_vectorized,
953 *
const unsigned int component)
955 * VectorizedArrayType result;
956 *
for (
unsigned int v = 0; v < VectorizedArrayType::size(); ++v)
959 *
for (
unsigned int d = 0;
d < dim; ++
d)
960 * p[d] = p_vectorized[d][v];
961 * result[v] = function.value(p, component);
967 *
template <
int dim,
typename VectorizedArrayType,
int n_components = dim + 2>
968 * Tensor<1, n_components, VectorizedArrayType>
969 * evaluate_function(
const Function<dim> &function,
970 *
const Point<dim, VectorizedArrayType> &p_vectorized)
973 * Tensor<1, n_components, VectorizedArrayType> result;
974 *
for (
unsigned int v = 0; v < VectorizedArrayType::size(); ++v)
977 *
for (
unsigned int d = 0;
d < dim; ++
d)
978 * p[d] = p_vectorized[d][v];
979 *
for (
unsigned int d = 0;
d < n_components; ++
d)
980 * result[d][v] = function.value(p, d);
989 * <a name=
"step_76-EuleroperatorusingacellcentricloopandMPI30sharedmemory"></a>
990 * <h3>Euler
operator using a cell-centric
loop and
MPI-3.0 shared memory</h3>
994 * Euler
operator from @ref step_67
"step-67" with some changes as detailed below:
997 *
template <
int dim,
int degree,
int n_po
ints_1d>
998 *
class EulerOperator
1001 *
static constexpr unsigned int n_quadrature_points_1d = n_points_1d;
1003 * EulerOperator(TimerOutput &timer_output);
1008 *
const DoFHandler<dim> &dof_handler);
1011 * std::unique_ptr<Function<dim>> inflow_function);
1013 *
void set_subsonic_outflow_boundary(
1015 * std::unique_ptr<Function<dim>> outflow_energy);
1019 *
void set_body_force(std::unique_ptr<Function<dim>> body_force);
1022 * perform_stage(
const unsigned int stage,
1023 *
const Number cur_time,
1026 *
const LinearAlgebra::distributed::Vector<Number> ¤t_ri,
1027 * LinearAlgebra::distributed::Vector<Number> &vec_ki,
1028 * LinearAlgebra::distributed::Vector<Number> &solution)
const;
1030 *
void project(
const Function<dim> &function,
1031 * LinearAlgebra::distributed::Vector<Number> &solution)
const;
1033 * std::array<double, 3> compute_errors(
1034 *
const Function<dim> &function,
1035 *
const LinearAlgebra::distributed::Vector<Number> &solution)
const;
1037 *
double compute_cell_transport_speed(
1038 *
const LinearAlgebra::distributed::Vector<Number> &solution)
const;
1041 * initialize_vector(LinearAlgebra::distributed::Vector<Number> &vector)
const;
1046 * Instance of SubCommunicatorWrapper containing the sub-communicator, which
1048 * shared-memory capabilities:
1051 * MPI_Comm subcommunicator;
1053 * MatrixFree<dim, Number, VectorizedArrayType> data;
1055 * TimerOutput &timer;
1057 * std::map<types::boundary_id, std::unique_ptr<Function<dim>>>
1058 * inflow_boundaries;
1059 * std::map<types::boundary_id, std::unique_ptr<Function<dim>>>
1060 * subsonic_outflow_boundaries;
1061 * std::set<types::boundary_id> wall_boundaries;
1062 * std::unique_ptr<Function<dim>> body_force;
1069 * New constructor, which creates a sub-communicator. The user can specify
1070 * the size of the sub-communicator via the global parameter group_size. If
1071 * the size is set to -1, all
MPI processes of a
1072 * shared-memory domain are combined to a group. The specified size is
1073 * decisive for the benefit of the shared-memory capabilities of MatrixFree
1074 * and, therefore, setting the <code>size</code> to <code>-1</code> is a
1075 * reasonable choice. By setting, the size to <code>1</code> users explicitly
1076 * disable the
MPI-3.0 shared-memory features of MatrixFree and rely
1077 * completely on
MPI-2.0 features, like <code>MPI_Isend</code> and
1078 * <code>MPI_Irecv</code>.
1081 * template <
int dim,
int degree,
int n_points_1d>
1082 * EulerOperator<dim, degree, n_points_1d>::EulerOperator(TimerOutput &timer)
1086 *
if (group_size == 1)
1088 * this->subcommunicator = MPI_COMM_SELF;
1094 * MPI_Comm_split_type(MPI_COMM_WORLD,
1095 * MPI_COMM_TYPE_SHARED,
1098 * &subcommunicator);
1105 * (
void)subcommunicator;
1107 * this->subcommunicator = MPI_COMM_SELF;
1114 * New destructor responsible
for freeing of the sub-communicator.
1117 *
template <
int dim,
int degree,
int n_po
ints_1d>
1118 * EulerOperator<dim, degree, n_points_1d>::~EulerOperator()
1121 *
if (this->subcommunicator != MPI_COMM_SELF)
1122 * MPI_Comm_free(&subcommunicator);
1129 * Modified
reinit() function to set up the internal data structures in
1130 * MatrixFree in a way that it is usable by the cell-centric loops and
1131 * the
MPI-3.0 shared-memory capabilities are used:
1134 * template <
int dim,
int degree,
int n_points_1d>
1135 *
void EulerOperator<dim, degree, n_points_1d>::reinit(
1136 * const Mapping<dim> &
mapping,
1137 * const DoFHandler<dim> &dof_handler)
1139 *
const std::vector<const DoFHandler<dim> *> dof_handlers = {&dof_handler};
1140 *
const AffineConstraints<double> dummy;
1141 *
const std::vector<const AffineConstraints<double> *> constraints = {&dummy};
1142 *
const std::vector<Quadrature<1>> quadratures = {QGauss<1>(n_q_points_1d),
1143 * QGauss<1>(fe_degree + 1)};
1145 *
typename MatrixFree<dim, Number, VectorizedArrayType>::AdditionalData
1147 * additional_data.mapping_update_flags =
1150 * additional_data.mapping_update_flags_inner_faces =
1153 * additional_data.mapping_update_flags_boundary_faces =
1156 * additional_data.tasks_parallel_scheme =
1157 * MatrixFree<dim, Number, VectorizedArrayType>::AdditionalData::none;
1161 * Categorize cells so that all lanes have the same boundary IDs
for each
1162 * face. This is strictly not necessary, however, allows to write simpler
1163 * code in EulerOperator::perform_stage() without masking, since it is
1164 * guaranteed that all cells grouped together (in a VectorizedArray)
1165 * have to perform exactly the same operation also on the faces.
1168 * MatrixFreeTools::categorize_by_boundary_ids(dof_handler.get_triangulation(),
1173 * Enable
MPI-3.0 shared-memory capabilities within MatrixFree by providing
1174 * the sub-communicator:
1177 * additional_data.communicator_sm = subcommunicator;
1180 *
mapping, dof_handlers, constraints, quadratures, additional_data);
1186 * The following function does an entire stage of a Runge--Kutta update
1187 * and is, alongside the slightly modified setup, the heart of this tutorial
1188 * compared to @ref step_67
"step-67".
1192 * In contrast to @ref step_67
"step-67", we are not executing the advection step
1193 * (using MatrixFree::loop()) and the inverse mass-matrix step
1194 * (using MatrixFree::cell_loop()) in sequence, but evaluate everything in
1195 * one go inside of MatrixFree::loop_cell_centric(). This function expects
1196 * a single function that is executed on each locally-owned (macro) cell as
1197 * parameter so that we need to loop over all faces of that cell and perform
1198 * needed integration steps on our own.
1202 * The following function contains to a large extent copies of the following
1203 * functions from @ref step_67
"step-67" so that comments related the evaluation of the weak
1204 * form are skipped here:
1205 * - <code>EulerDG::EulerOperator::local_apply_cell</code>
1206 * - <code>EulerDG::EulerOperator::local_apply_face</code>
1207 * - <code>EulerDG::EulerOperator::local_apply_boundary_face</code>
1208 * - <code>EulerDG::EulerOperator::local_apply_inverse_mass_matrix</code>
1211 * template <
int dim,
int degree,
int n_points_1d>
1212 *
void EulerOperator<dim, degree, n_points_1d>::perform_stage(
1213 * const
unsigned int stage,
1214 * const Number current_time,
1217 * const LinearAlgebra::distributed::Vector<Number> ¤t_ri,
1218 * LinearAlgebra::distributed::Vector<Number> &vec_ki,
1219 * LinearAlgebra::distributed::Vector<Number> &solution) const
1221 *
for (
auto &i : inflow_boundaries)
1222 * i.second->set_time(current_time);
1223 * for (
auto &i : subsonic_outflow_boundaries)
1224 * i.second->set_time(current_time);
1229 * providing a lambda containing the effects of the cell, face and
1230 * boundary-face integrals:
1233 * data.template loop_cell_centric<LinearAlgebra::distributed::Vector<Number>,
1234 * LinearAlgebra::distributed::Vector<Number>>(
1235 * [&](
const auto &data,
auto &dst,
const auto &src,
const auto cell_range) {
1236 * using FECellIntegral = FEEvaluation<dim,
1241 * VectorizedArrayType>;
1242 * using FEFaceIntegral = FEFaceEvaluation<dim,
1247 * VectorizedArrayType>;
1249 * FECellIntegral phi(data);
1250 * FECellIntegral phi_temp(data);
1251 * FEFaceIntegral phi_m(data, true);
1252 * FEFaceIntegral phi_p(data, false);
1254 * Tensor<1, dim, VectorizedArrayType> constant_body_force;
1255 * const Functions::ConstantFunction<dim> *constant_function =
1256 * dynamic_cast<Functions::ConstantFunction<dim> *>(body_force.get());
1258 * if (constant_function)
1259 * constant_body_force =
1260 * evaluate_function<dim, VectorizedArrayType, dim>(
1261 * *constant_function, Point<dim, VectorizedArrayType>());
1263 * const internal::EvaluatorTensorProduct<
1264 * internal::EvaluatorVariant::evaluate_evenodd,
1268 * VectorizedArrayType,
1271 * data.get_shape_info().data[0].shape_gradients_collocation_eo,
1274 * AlignedVector<VectorizedArrayType> buffer(phi.static_n_q_points *
1275 * phi.n_components);
1279 * Loop over all cell batches:
1282 * for (
unsigned int cell = cell_range.first; cell < cell_range.second;
1287 * if (ai != Number())
1288 * phi_temp.reinit(cell);
1292 * Read values from global vector and compute the values at the
1293 * quadrature points:
1296 * if (ai != Number() && stage == 0)
1298 * phi.read_dof_values(src);
1300 * for (unsigned int i = 0;
1301 * i < phi.static_dofs_per_component * (dim + 2);
1303 * phi_temp.begin_dof_values()[i] = phi.begin_dof_values()[i];
1305 * phi.evaluate(EvaluationFlags::values);
1309 * phi.gather_evaluate(src, EvaluationFlags::values);
1314 * Buffer the computed values at the quadrature points, since
1316 * step, however, are needed later on
for the face integrals:
1319 * for (
unsigned int i = 0; i < phi.static_n_q_points * (dim + 2); ++i)
1320 * buffer[i] = phi.begin_values()[i];
1324 * Apply the cell integral at the cell quadrature points. See also
1325 * the function <code>EulerOperator::local_apply_cell()</code> from
1326 * @ref step_67
"step-67":
1329 *
for (
const unsigned int q : phi.quadrature_point_indices())
1331 *
const auto w_q = phi.get_value(q);
1332 * phi.submit_gradient(euler_flux<dim>(w_q), q);
1333 *
if (body_force.get() !=
nullptr)
1335 *
const Tensor<1, dim, VectorizedArrayType> force =
1336 * constant_function ?
1337 * constant_body_force :
1338 * evaluate_function<dim, VectorizedArrayType, dim>(
1339 * *body_force, phi.quadrature_point(q));
1341 * Tensor<1, dim + 2, VectorizedArrayType> forcing;
1342 *
for (
unsigned int d = 0;
d < dim; ++
d)
1343 * forcing[d + 1] = w_q[0] * force[d];
1344 *
for (
unsigned int d = 0;
d < dim; ++
d)
1345 * forcing[dim + 1] += force[d] * w_q[d + 1];
1347 * phi.submit_value(forcing, q);
1354 * points. We skip the interpolation back to the support points
1355 * of the element, since we first collect all contributions in the
1356 * cell quadrature points and only perform the interpolation back
1357 * as the
final step.
1361 *
auto *values_ptr = phi.begin_values();
1362 *
auto *gradient_ptr = phi.begin_gradients();
1364 *
for (
unsigned int c = 0; c < dim + 2; ++c)
1366 *
if (dim >= 1 && body_force.get() ==
nullptr)
1367 * eval.template gradients<0, false, false, dim>(gradient_ptr,
1369 *
else if (dim >= 1)
1370 * eval.template gradients<0, false, true, dim>(gradient_ptr,
1373 * eval.template gradients<1, false, true, dim>(gradient_ptr +
1377 * eval.template gradients<2, false, true, dim>(gradient_ptr +
1381 * values_ptr += phi.static_n_q_points;
1382 * gradient_ptr += phi.static_n_q_points * dim;
1388 * Loop over all faces of the current cell:
1391 *
for (
unsigned int face = 0;
1392 * face < GeometryInfo<dim>::faces_per_cell;
1397 * Determine the boundary ID of the current face. Since we have
1398 *
set up MatrixFree in a way that all filled lanes have
1399 * guaranteed the same boundary ID, we can select the
1400 * boundary ID of the first lane.
1403 *
const auto boundary_ids =
1404 * data.get_faces_by_cells_boundary_id(cell, face);
1406 *
Assert(std::equal(boundary_ids.begin(),
1407 * boundary_ids.begin() +
1408 * data.n_active_entries_per_cell_batch(cell),
1409 * boundary_ids.begin()),
1410 * ExcMessage(
"Boundary IDs of lanes differ."));
1414 * phi_m.reinit(cell, face);
1418 * Interpolate the
values from the cell quadrature points to the
1419 * quadrature points of the current face via a simple 1
d
1423 * internal::FEFaceNormalEvaluationImpl<dim,
1425 * VectorizedArrayType>::
1426 *
template interpolate_quadrature<true, false>(
1429 * data.get_shape_info(),
1431 * phi_m.begin_values(),
1436 * Check
if the face is an internal or a boundary face and
1437 * select a different code path based on
this information:
1444 * Process and internal face. The following lines of code
1445 * are a
copy of the function
1446 * <code>EulerDG::EulerOperator::local_apply_face</code>
1447 * from @ref step_67
"step-67":
1450 * phi_p.reinit(cell, face);
1453 *
for (
const unsigned int q :
1454 * phi_m.quadrature_point_indices())
1456 *
const auto numerical_flux =
1457 * euler_numerical_flux<dim>(phi_m.get_value(q),
1458 * phi_p.get_value(q),
1459 * phi_m.normal_vector(q));
1460 * phi_m.submit_value(-numerical_flux, q);
1467 * Process a boundary face. These following lines of code
1468 * are a
copy of the function
1469 * <code>EulerDG::EulerOperator::local_apply_boundary_face</code>
1470 * from @ref step_67
"step-67":
1473 *
for (
const unsigned int q :
1474 * phi_m.quadrature_point_indices())
1476 *
const auto w_m = phi_m.get_value(q);
1477 *
const auto normal = phi_m.normal_vector(q);
1479 *
auto rho_u_dot_n = w_m[1] * normal[0];
1480 *
for (
unsigned int d = 1;
d < dim; ++
d)
1481 * rho_u_dot_n += w_m[1 + d] * normal[d];
1483 *
bool at_outflow =
false;
1485 * Tensor<1, dim + 2, VectorizedArrayType> w_p;
1487 *
if (wall_boundaries.find(boundary_id) !=
1488 * wall_boundaries.end())
1491 *
for (
unsigned int d = 0;
d < dim; ++
d)
1493 * w_m[d + 1] - 2. * rho_u_dot_n * normal[d];
1494 * w_p[dim + 1] = w_m[dim + 1];
1496 *
else if (inflow_boundaries.find(boundary_id) !=
1497 * inflow_boundaries.end())
1498 * w_p = evaluate_function(
1499 * *inflow_boundaries.find(boundary_id)->second,
1500 * phi_m.quadrature_point(q));
1501 *
else if (subsonic_outflow_boundaries.find(
1503 * subsonic_outflow_boundaries.end())
1507 * evaluate_function(*subsonic_outflow_boundaries
1508 * .find(boundary_id)
1510 * phi_m.quadrature_point(q),
1512 * at_outflow =
true;
1517 *
"Unknown boundary id, did "
1518 *
"you set a boundary condition for "
1519 *
"this part of the domain boundary?"));
1521 *
auto flux = euler_numerical_flux<dim>(w_m, w_p, normal);
1524 *
for (
unsigned int v = 0;
1525 * v < VectorizedArrayType::size();
1528 *
if (rho_u_dot_n[v] < -1e-12)
1529 *
for (
unsigned int d = 0;
d < dim; ++
d)
1530 * flux[d + 1][v] = 0.;
1533 * phi_m.submit_value(-flux, q);
1539 * Evaluate local integrals related to cell by quadrature and
1540 * add into cell contribution via a simple 1
d interpolation:
1543 * internal::FEFaceNormalEvaluationImpl<dim,
1545 * VectorizedArrayType>::
1546 *
template interpolate_quadrature<false, true>(
1549 * data.get_shape_info(),
1550 * phi_m.begin_values(),
1551 * phi.begin_values(),
1557 * Apply inverse mass
matrix in the cell quadrature points. See
1559 * <code>EulerDG::EulerOperator::local_apply_inverse_mass_matrix()</code>
1560 * from @ref step_67
"step-67":
1563 *
for (
unsigned int q = 0; q < phi.static_n_q_points; ++q)
1565 *
const auto factor = VectorizedArrayType(1.0) / phi.JxW(q);
1566 *
for (
unsigned int c = 0; c < dim + 2; ++c)
1567 * phi.begin_values()[c * phi.static_n_q_points + q] =
1568 * phi.begin_values()[c * phi.static_n_q_points + q] * factor;
1573 * Transform
values from collocation space to the original
1574 * Gauss-Lobatto space:
1577 * internal::FEEvaluationImplBasisChange<
1582 * n_points_1d>::do_backward(dim + 2,
1583 * data.get_shape_info()
1585 * .inverse_shape_values_eo,
1587 * phi.begin_values(),
1588 * phi.begin_dof_values());
1592 * Perform Runge-Kutta update and write results back to global
1596 *
if (ai == Number())
1598 *
for (
unsigned int q = 0; q < phi.static_dofs_per_cell; ++q)
1599 * phi.begin_dof_values()[q] = bi * phi.begin_dof_values()[q];
1600 * phi.distribute_local_to_global(solution);
1605 * phi_temp.read_dof_values(solution);
1607 *
for (
unsigned int q = 0; q < phi.static_dofs_per_cell; ++q)
1609 *
const auto K_i = phi.begin_dof_values()[q];
1611 * phi.begin_dof_values()[q] =
1612 * phi_temp.begin_dof_values()[q] + (ai * K_i);
1614 * phi_temp.begin_dof_values()[q] += bi * K_i;
1616 * phi.set_dof_values(dst);
1617 * phi_temp.set_dof_values(solution);
1624 * MatrixFree<dim, Number, VectorizedArrayType>::DataAccessOnFaces::values);
1631 * From here, the code of @ref step_67
"step-67" has not changed.
1634 *
template <
int dim,
int degree,
int n_po
ints_1d>
1635 *
void EulerOperator<dim, degree, n_points_1d>::initialize_vector(
1636 * LinearAlgebra::distributed::Vector<Number> &vector)
const
1638 * data.initialize_dof_vector(vector);
1643 *
template <
int dim,
int degree,
int n_po
ints_1d>
1644 *
void EulerOperator<dim, degree, n_points_1d>::set_inflow_boundary(
1646 * std::unique_ptr<Function<dim>> inflow_function)
1648 *
AssertThrow(subsonic_outflow_boundaries.find(boundary_id) ==
1649 * subsonic_outflow_boundaries.end() &&
1650 * wall_boundaries.find(boundary_id) == wall_boundaries.end(),
1651 * ExcMessage(
"You already set the boundary with id " +
1652 * std::to_string(
static_cast<int>(boundary_id)) +
1653 *
" to another type of boundary before now setting " +
1655 *
AssertThrow(inflow_function->n_components == dim + 2,
1656 * ExcMessage(
"Expected function with dim+2 components"));
1658 * inflow_boundaries[
boundary_id] = std::move(inflow_function);
1663 *
template <
int dim,
int degree,
int n_po
ints_1d>
1664 *
void EulerOperator<dim, degree, n_points_1d>::set_subsonic_outflow_boundary(
1666 * std::unique_ptr<Function<dim>> outflow_function)
1668 *
AssertThrow(inflow_boundaries.find(boundary_id) ==
1669 * inflow_boundaries.end() &&
1670 * wall_boundaries.find(boundary_id) == wall_boundaries.end(),
1671 * ExcMessage(
"You already set the boundary with id " +
1672 * std::to_string(
static_cast<int>(boundary_id)) +
1673 *
" to another type of boundary before now setting " +
1674 *
"it as subsonic outflow"));
1675 *
AssertThrow(outflow_function->n_components == dim + 2,
1676 * ExcMessage(
"Expected function with dim+2 components"));
1678 * subsonic_outflow_boundaries[
boundary_id] = std::move(outflow_function);
1683 *
template <
int dim,
int degree,
int n_po
ints_1d>
1684 *
void EulerOperator<dim, degree, n_points_1d>::set_wall_boundary(
1687 *
AssertThrow(inflow_boundaries.find(boundary_id) ==
1688 * inflow_boundaries.end() &&
1689 * subsonic_outflow_boundaries.find(boundary_id) ==
1690 * subsonic_outflow_boundaries.end(),
1691 * ExcMessage(
"You already set the boundary with id " +
1692 * std::to_string(
static_cast<int>(boundary_id)) +
1693 *
" to another type of boundary before now setting " +
1694 *
"it as wall boundary"));
1696 * wall_boundaries.insert(boundary_id);
1701 *
template <
int dim,
int degree,
int n_po
ints_1d>
1702 *
void EulerOperator<dim, degree, n_points_1d>::set_body_force(
1703 * std::unique_ptr<Function<dim>> body_force)
1707 * this->body_force = std::move(body_force);
1712 *
template <
int dim,
int degree,
int n_po
ints_1d>
1713 *
void EulerOperator<dim, degree, n_points_1d>::project(
1714 *
const Function<dim> &function,
1715 * LinearAlgebra::distributed::Vector<Number> &solution)
const
1717 * FEEvaluation<dim, degree, degree + 1, dim + 2, Number, VectorizedArrayType>
1719 * MatrixFreeOperators::CellwiseInverseMassMatrix<dim,
1723 * VectorizedArrayType>
1725 * solution.zero_out_ghost_values();
1726 *
for (
unsigned int cell = 0; cell < data.n_cell_batches(); ++cell)
1729 *
for (
const unsigned int q : phi.quadrature_point_indices())
1730 * phi.submit_dof_value(evaluate_function(function,
1731 * phi.quadrature_point(q)),
1733 * inverse.transform_from_q_points_to_basis(dim + 2,
1734 * phi.begin_dof_values(),
1735 * phi.begin_dof_values());
1736 * phi.set_dof_values(solution);
1742 *
template <
int dim,
int degree,
int n_po
ints_1d>
1743 * std::array<double, 3> EulerOperator<dim, degree, n_points_1d>::compute_errors(
1744 *
const Function<dim> &function,
1745 *
const LinearAlgebra::distributed::Vector<Number> &solution)
const
1747 * TimerOutput::Scope t(timer,
"compute errors");
1748 *
double errors_squared[3] = {};
1749 * FEEvaluation<dim, degree, n_points_1d, dim + 2, Number, VectorizedArrayType>
1752 *
for (
unsigned int cell = 0; cell < data.n_cell_batches(); ++cell)
1756 * VectorizedArrayType local_errors_squared[3] = {};
1757 *
for (
const unsigned int q : phi.quadrature_point_indices())
1759 *
const auto error =
1760 * evaluate_function(function, phi.quadrature_point(q)) -
1762 *
const auto JxW = phi.JxW(q);
1764 * local_errors_squared[0] += error[0] * error[0] * JxW;
1765 *
for (
unsigned int d = 0;
d < dim; ++
d)
1766 * local_errors_squared[1] += (error[d + 1] * error[d + 1]) * JxW;
1767 * local_errors_squared[2] += (error[dim + 1] * error[dim + 1]) * JxW;
1769 *
for (
unsigned int v = 0; v < data.n_active_entries_per_cell_batch(cell);
1771 *
for (
unsigned int d = 0;
d < 3; ++
d)
1772 * errors_squared[d] += local_errors_squared[d][v];
1777 * std::array<double, 3> errors;
1778 *
for (
unsigned int d = 0;
d < 3; ++
d)
1779 * errors[d] =
std::sqrt(errors_squared[d]);
1786 *
template <
int dim,
int degree,
int n_po
ints_1d>
1787 *
double EulerOperator<dim, degree, n_points_1d>::compute_cell_transport_speed(
1788 *
const LinearAlgebra::distributed::Vector<Number> &solution)
const
1790 * TimerOutput::Scope t(timer,
"compute transport speed");
1791 * Number max_transport = 0;
1792 * FEEvaluation<dim, degree, degree + 1, dim + 2, Number, VectorizedArrayType>
1795 *
for (
unsigned int cell = 0; cell < data.n_cell_batches(); ++cell)
1799 * VectorizedArrayType local_max = 0.;
1800 *
for (
const unsigned int q : phi.quadrature_point_indices())
1802 *
const auto solution = phi.get_value(q);
1803 *
const auto velocity = euler_velocity<dim>(solution);
1804 *
const auto pressure = euler_pressure<dim>(solution);
1806 *
const auto inverse_jacobian = phi.inverse_jacobian(q);
1807 *
const auto convective_speed = inverse_jacobian * velocity;
1808 * VectorizedArrayType convective_limit = 0.;
1809 *
for (
unsigned int d = 0;
d < dim; ++
d)
1810 * convective_limit =
1813 *
const auto speed_of_sound =
1814 *
std::sqrt(gamma * pressure * (1. / solution[0]));
1816 * Tensor<1, dim, VectorizedArrayType> eigenvector;
1817 *
for (
unsigned int d = 0;
d < dim; ++
d)
1818 * eigenvector[d] = 1.;
1819 *
for (
unsigned int i = 0; i < 5; ++i)
1821 * eigenvector =
transpose(inverse_jacobian) *
1822 * (inverse_jacobian * eigenvector);
1823 * VectorizedArrayType eigenvector_norm = 0.;
1824 *
for (
unsigned int d = 0;
d < dim; ++
d)
1825 * eigenvector_norm =
1827 * eigenvector /= eigenvector_norm;
1829 *
const auto jac_times_ev = inverse_jacobian * eigenvector;
1830 *
const auto max_eigenvalue =
std::sqrt(
1831 * (jac_times_ev * jac_times_ev) / (eigenvector * eigenvector));
1834 * max_eigenvalue * speed_of_sound + convective_limit);
1837 *
for (
unsigned int v = 0; v < data.n_active_entries_per_cell_batch(cell);
1839 * max_transport =
std::max(max_transport, local_max[v]);
1844 *
return max_transport;
1849 *
template <
int dim>
1850 *
class EulerProblem
1858 *
void make_grid_and_dofs();
1860 *
void output_results(
const unsigned int result_number);
1862 * LinearAlgebra::distributed::Vector<Number> solution;
1864 * ConditionalOStream pcout;
1866 * #ifdef DEAL_II_WITH_P4EST
1867 * parallel::distributed::Triangulation<dim> triangulation;
1869 * Triangulation<dim> triangulation;
1872 *
const FESystem<dim> fe;
1874 * DoFHandler<dim> dof_handler;
1876 * TimerOutput timer;
1878 * EulerOperator<dim, fe_degree, n_q_points_1d> euler_operator;
1880 *
double time, time_step;
1882 *
class Postprocessor :
public DataPostprocessor<dim>
1887 *
virtual void evaluate_vector_field(
1888 *
const DataPostprocessorInputs::Vector<dim> &inputs,
1889 * std::vector<Vector<double>> &computed_quantities)
const override;
1891 *
virtual std::vector<std::string> get_names()
const override;
1893 *
virtual std::vector<
1895 * get_data_component_interpretation()
const override;
1897 *
virtual UpdateFlags get_needed_update_flags()
const override;
1900 *
const bool do_schlieren_plot;
1906 *
template <
int dim>
1907 * EulerProblem<dim>::Postprocessor::Postprocessor()
1908 * : do_schlieren_plot(dim == 2)
1913 *
template <
int dim>
1914 *
void EulerProblem<dim>::Postprocessor::evaluate_vector_field(
1915 *
const DataPostprocessorInputs::Vector<dim> &inputs,
1916 * std::vector<Vector<double>> &computed_quantities)
const
1918 *
const unsigned int n_evaluation_points = inputs.solution_values.size();
1920 *
if (do_schlieren_plot ==
true)
1921 *
Assert(inputs.solution_gradients.size() == n_evaluation_points,
1922 * ExcInternalError());
1924 *
Assert(computed_quantities.size() == n_evaluation_points,
1925 * ExcInternalError());
1926 *
Assert(inputs.solution_values[0].size() == dim + 2, ExcInternalError());
1927 *
Assert(computed_quantities[0].size() ==
1928 * dim + 2 + (do_schlieren_plot ==
true ? 1 : 0),
1929 * ExcInternalError());
1931 *
for (
unsigned int p = 0; p < n_evaluation_points; ++p)
1933 * Tensor<1, dim + 2> solution;
1934 *
for (
unsigned int d = 0;
d < dim + 2; ++
d)
1935 * solution[d] = inputs.solution_values[p](d);
1937 *
const double density = solution[0];
1938 *
const Tensor<1, dim> velocity = euler_velocity<dim>(solution);
1939 *
const double pressure = euler_pressure<dim>(solution);
1941 *
for (
unsigned int d = 0;
d < dim; ++
d)
1942 * computed_quantities[p](d) = velocity[
d];
1943 * computed_quantities[p](dim) = pressure;
1944 * computed_quantities[p](dim + 1) =
std::sqrt(gamma * pressure / density);
1946 *
if (do_schlieren_plot ==
true)
1947 * computed_quantities[p](dim + 2) =
1948 * inputs.solution_gradients[p][0] * inputs.solution_gradients[p][0];
1954 *
template <
int dim>
1955 * std::vector<std::string> EulerProblem<dim>::Postprocessor::get_names() const
1957 * std::vector<std::string> names;
1958 *
for (
unsigned int d = 0;
d < dim; ++
d)
1959 * names.emplace_back(
"velocity");
1960 * names.emplace_back(
"pressure");
1961 * names.emplace_back(
"speed_of_sound");
1963 *
if (do_schlieren_plot ==
true)
1964 * names.emplace_back(
"schlieren_plot");
1971 *
template <
int dim>
1972 * std::vector<DataComponentInterpretation::DataComponentInterpretation>
1973 * EulerProblem<dim>::Postprocessor::get_data_component_interpretation() const
1975 * std::vector<DataComponentInterpretation::DataComponentInterpretation>
1977 *
for (
unsigned int d = 0;
d < dim; ++
d)
1978 * interpretation.push_back(
1983 *
if (do_schlieren_plot ==
true)
1984 * interpretation.push_back(
1987 *
return interpretation;
1992 *
template <
int dim>
1993 *
UpdateFlags EulerProblem<dim>::Postprocessor::get_needed_update_flags() const
1995 *
if (do_schlieren_plot ==
true)
2003 *
template <
int dim>
2004 * EulerProblem<dim>::EulerProblem()
2006 * #ifdef DEAL_II_WITH_P4EST
2007 * , triangulation(MPI_COMM_WORLD)
2009 * , fe(FE_DGQ<dim>(fe_degree), dim + 2)
2011 * , dof_handler(triangulation)
2013 * , euler_operator(timer)
2020 *
template <
int dim>
2021 *
void EulerProblem<dim>::make_grid_and_dofs()
2027 * Point<dim> lower_left;
2028 *
for (
unsigned int d = 1;
d < dim; ++
d)
2029 * lower_left[d] = -5;
2031 * Point<dim> upper_right;
2032 * upper_right[0] = 10;
2033 *
for (
unsigned int d = 1;
d < dim; ++
d)
2034 * upper_right[d] = 5;
2039 * triangulation.refine_global(2);
2041 * euler_operator.set_inflow_boundary(
2042 * 0, std::make_unique<ExactSolution<dim>>(0));
2050 * triangulation, 0.03, 1, 0,
true);
2052 * euler_operator.set_inflow_boundary(
2053 * 0, std::make_unique<ExactSolution<dim>>(0));
2054 * euler_operator.set_subsonic_outflow_boundary(
2055 * 1, std::make_unique<ExactSolution<dim>>(0));
2057 * euler_operator.set_wall_boundary(2);
2058 * euler_operator.set_wall_boundary(3);
2061 * euler_operator.set_body_force(
2062 * std::make_unique<Functions::ConstantFunction<dim>>(
2063 * std::vector<double>({0., 0., -0.2})));
2072 * triangulation.refine_global(n_global_refinements);
2074 * dof_handler.distribute_dofs(fe);
2076 * euler_operator.reinit(
mapping, dof_handler);
2077 * euler_operator.initialize_vector(solution);
2079 * std::locale s = pcout.get_stream().getloc();
2080 * pcout.get_stream().imbue(std::locale(
""));
2081 * pcout <<
"Number of degrees of freedom: " << dof_handler.n_dofs()
2082 * <<
" ( = " << (dim + 2) <<
" [vars] x "
2083 * << triangulation.n_global_active_cells() <<
" [cells] x "
2086 * pcout.get_stream().imbue(s);
2091 *
template <
int dim>
2092 *
void EulerProblem<dim>::output_results(
const unsigned int result_number)
2094 *
const std::array<double, 3> errors =
2095 * euler_operator.compute_errors(ExactSolution<dim>(time), solution);
2096 *
const std::string quantity_name = testcase == 0 ?
"error" :
"norm";
2098 * pcout <<
"Time:" << std::setw(8) << std::setprecision(3) << time
2099 * <<
", dt: " << std::setw(8) << std::setprecision(2) << time_step
2100 * <<
", " << quantity_name <<
" rho: " << std::setprecision(4)
2101 * << std::setw(10) << errors[0] <<
", rho * u: " << std::setprecision(4)
2102 * << std::setw(10) << errors[1] <<
", energy:" << std::setprecision(4)
2103 * << std::setw(10) << errors[2] << std::endl;
2106 * TimerOutput::Scope t(timer,
"output");
2108 * Postprocessor postprocessor;
2109 * DataOut<dim> data_out;
2111 * DataOutBase::VtkFlags flags;
2112 * flags.write_higher_order_cells =
true;
2113 * data_out.set_flags(flags);
2115 * data_out.attach_dof_handler(dof_handler);
2117 * std::vector<std::string> names;
2118 * names.emplace_back(
"density");
2119 *
for (
unsigned int d = 0;
d < dim; ++
d)
2120 * names.emplace_back(
"momentum");
2121 * names.emplace_back(
"energy");
2123 * std::vector<DataComponentInterpretation::DataComponentInterpretation>
2125 * interpretation.push_back(
2127 *
for (
unsigned int d = 0;
d < dim; ++
d)
2128 * interpretation.push_back(
2130 * interpretation.push_back(
2133 * data_out.add_data_vector(dof_handler, solution, names, interpretation);
2135 * data_out.add_data_vector(solution, postprocessor);
2137 * LinearAlgebra::distributed::Vector<Number> reference;
2138 *
if (testcase == 0 && dim == 2)
2140 * reference.reinit(solution);
2141 * euler_operator.project(ExactSolution<dim>(time), reference);
2142 * reference.sadd(-1., 1, solution);
2143 * std::vector<std::string> names;
2144 * names.emplace_back(
"error_density");
2145 *
for (
unsigned int d = 0;
d < dim; ++
d)
2146 * names.emplace_back(
"error_momentum");
2147 * names.emplace_back(
"error_energy");
2149 * std::vector<DataComponentInterpretation::DataComponentInterpretation>
2151 * interpretation.push_back(
2153 *
for (
unsigned int d = 0;
d < dim; ++
d)
2154 * interpretation.push_back(
2156 * interpretation.push_back(
2159 * data_out.add_data_vector(dof_handler,
2165 * Vector<double> mpi_owner(triangulation.n_active_cells());
2167 * data_out.add_data_vector(mpi_owner,
"owner");
2169 * data_out.build_patches(
mapping,
2173 *
const std::string filename =
2175 * data_out.write_vtu_in_parallel(filename, MPI_COMM_WORLD);
2181 *
template <
int dim>
2182 *
void EulerProblem<dim>::run()
2185 *
const unsigned int n_vect_number = VectorizedArrayType::size();
2186 *
const unsigned int n_vect_bits = 8 *
sizeof(Number) * n_vect_number;
2188 * pcout <<
"Running with "
2190 * <<
" MPI processes" << std::endl;
2191 * pcout <<
"Vectorization over " << n_vect_number <<
' '
2192 * << (std::is_same_v<Number, double> ?
"doubles" :
"floats") <<
" = "
2193 * << n_vect_bits <<
" bits ("
2198 * make_grid_and_dofs();
2200 *
const LowStorageRungeKuttaIntegrator integrator(lsrk_scheme);
2202 * LinearAlgebra::distributed::Vector<Number> rk_register_1;
2203 * LinearAlgebra::distributed::Vector<Number> rk_register_2;
2204 * rk_register_1.reinit(solution);
2205 * rk_register_2.reinit(solution);
2207 * euler_operator.project(ExactSolution<dim>(time), solution);
2210 *
double min_vertex_distance = std::numeric_limits<double>::max();
2211 *
for (
const auto &cell : triangulation.active_cell_iterators())
2212 *
if (cell->is_locally_owned())
2213 * min_vertex_distance =
2214 *
std::min(min_vertex_distance, cell->minimum_vertex_distance());
2215 * min_vertex_distance =
2218 * time_step = courant_number * integrator.n_stages() /
2219 * euler_operator.compute_cell_transport_speed(solution);
2220 * pcout <<
"Time step size: " << time_step
2221 * <<
", minimal h: " << min_vertex_distance
2222 * <<
", initial transport scaling: "
2223 * << 1. / euler_operator.compute_cell_transport_speed(solution)
2227 * output_results(0);
2229 *
unsigned int timestep_number = 0;
2231 *
while (time < final_time - 1e-12 && timestep_number < max_time_steps)
2233 * ++timestep_number;
2234 *
if (timestep_number % 5 == 0)
2236 * courant_number * integrator.n_stages() /
2238 * euler_operator.compute_cell_transport_speed(solution), 3);
2241 * TimerOutput::Scope t(timer,
"rk time stepping total");
2242 * integrator.perform_time_step(euler_operator,
2250 * time += time_step;
2252 *
if (
static_cast<int>(time / output_tick) !=
2253 *
static_cast<int>((time - time_step) / output_tick) ||
2254 * time >= final_time - 1e-12)
2256 *
static_cast<unsigned int>(std::round(time / output_tick)));
2259 * timer.print_wall_time_statistics(MPI_COMM_WORLD);
2260 * pcout << std::endl;
2266 *
int main(
int argc,
char **argv)
2268 *
using namespace Euler_DG;
2269 *
using namespace dealii;
2271 * Utilities::MPI::MPI_InitFinalize mpi_initialization(argc, argv, 1);
2275 * EulerProblem<dimension> euler_problem;
2276 * euler_problem.run();
2278 *
catch (std::exception &exc)
2280 * std::cerr << std::endl
2282 * <<
"----------------------------------------------------"
2284 * std::cerr <<
"Exception on processing: " << std::endl
2285 * << exc.what() << std::endl
2286 * <<
"Aborting!" << std::endl
2287 * <<
"----------------------------------------------------"
2294 * std::cerr << std::endl
2296 * <<
"----------------------------------------------------"
2298 * std::cerr <<
"Unknown exception!" << std::endl
2299 * <<
"Aborting!" << std::endl
2300 * <<
"----------------------------------------------------"
2308<a name=
"step_76-Results"></a><h1>Results</h1>
2311Running the program with the
default settings on a machine with 40 processes
2312produces the following output:
2315Running with 40
MPI processes
2316Vectorization over 8 doubles = 512 bits (AVX512)
2317Number of degrees of freedom: 27.648.000 ( = 5 [vars] x 25.600 [cells] x 216 [dofs/cell/var] )
2318Time step size: 0.000295952, minimal h: 0.0075,
initial transport scaling: 0.00441179
2319Time: 0, dt: 0.0003,
norm rho: 5.385e-16, rho * u: 1.916e-16, energy: 1.547e-15
2320+--------------------------------------+------------------+------------+------------------+
2321| Total wallclock time elapsed | 17.52s 10 | 17.52s | 17.52s 11 |
2323| Section | no. calls |
min time rank |
avg time |
max time rank |
2324+--------------------------------------+------------------+------------+------------------+
2325| compute errors | 1 | 0.009594s 16 | 0.009705s | 0.009819s 8 |
2326| compute transport speed | 22 | 0.1366s 0 | 0.1367s | 0.1368s 18 |
2327| output | 1 | 1.233s 0 | 1.233s | 1.233s 32 |
2328| rk time stepping total | 100 | 8.746s 35 | 8.746s | 8.746s 0 |
2329| rk_stage - integrals L_h | 500 | 8.742s 36 | 8.742s | 8.743s 2 |
2330+--------------------------------------+------------------+------------+------------------+
2333and the following visual output:
2335<table align=
"center" class=
"doxtable" style=
"width:85%">
2338 <img src=
"https://www.dealii.org/images/steps/developer/step-67.pressure_010.png" alt=
"" width=
"100%">
2341 <img src=
"https://www.dealii.org/images/steps/developer/step-67.pressure_025.png" alt=
"" width=
"100%">
2346 <img src=
"https://www.dealii.org/images/steps/developer/step-67.pressure_050.png" alt=
"" width=
"100%">
2349 <img src=
"https://www.dealii.org/images/steps/developer/step-67.pressure_100.png" alt=
"" width=
"100%">
2354As a reference, the results of @ref step_67
"step-67" using FCL are:
2357Running with 40
MPI processes
2358Vectorization over 8 doubles = 512 bits (AVX512)
2359Number of degrees of freedom: 27.648.000 ( = 5 [vars] x 25.600 [cells] x 216 [dofs/cell/var] )
2360Time step size: 0.000295952, minimal h: 0.0075,
initial transport scaling: 0.00441179
2361Time: 0, dt: 0.0003,
norm rho: 5.385e-16, rho * u: 1.916e-16, energy: 1.547e-15
2362+-------------------------------------------+------------------+------------+------------------+
2363| Total wallclock time elapsed | 13.33s 0 | 13.34s | 13.35s 34 |
2365| Section | no. calls |
min time rank |
avg time |
max time rank |
2366+-------------------------------------------+------------------+------------+------------------+
2367| compute errors | 1 | 0.007977s 10 | 0.008053s | 0.008161s 30 |
2368| compute transport speed | 22 | 0.1228s 34 | 0.2227s | 0.3845s 0 |
2369| output | 1 | 1.255s 3 | 1.257s | 1.259s 27 |
2370| rk time stepping total | 100 | 11.15s 0 | 11.32s | 11.42s 34 |
2371| rk_stage - integrals L_h | 500 | 8.719s 10 | 8.932s | 9.196s 0 |
2372| rk_stage - inv mass + vec upd | 500 | 1.944s 0 | 2.377s | 2.55s 10 |
2373+-------------------------------------------+------------------+------------+------------------+
2376By the modifications shown in this tutorial, we were able to achieve a speedup of
237727% for the Runge-Kutta stages.
2379<a name=
"step_76-Possibilitiesforextensions"></a><h3>Possibilities for extensions</h3>
2382The algorithms are easily extendable to higher dimensions: a high-dimensional
2383<a href=
"https://github.com/hyperdeal/hyperdeal/blob/a9e67b4e625ff1dde2fed93ad91cdfacfaa3acdf/include/hyper.deal/operators/advection/advection_operation.h#L219-L569">advection operator based on cell-centric loops</a>
2384is part of the hyper.deal library. An extension of cell-centric loops
2385to locally-refined meshes is more involved.
2387<a name=
"step_76-ExtensiontothecompressibleNavierStokesequations"></a><h4>Extension to the compressible Navier-Stokes equations</h4>
2390The solver presented in this tutorial program can also be extended to the
2391compressible Navier–Stokes equations by adding viscous terms, as also
2392suggested in @ref step_67
"step-67". To keep as much of the performance obtained here despite
2393the additional cost of elliptic terms,
e.g. via an interior penalty method, that
2394tutorial has proposed to switch the basis from FE_DGQ to FE_DGQHermite like in
2395the @ref step_59
"step-59" tutorial program. The reasoning behind this switch is that in the
2396case of FE_DGQ all
values of neighboring cells (i.
e., @f$k+1@f$ layers) are needed,
2397whilst in the case of FE_DGQHermite only 2 layers, making the latter
2398significantly more suitable for higher degrees. The additional layers have to be,
2399on the one hand, loaded from main memory during flux computation and, one the
2400other hand, have to be communicated. Using the shared-memory capabilities
2401introduced in this tutorial, the second
point can be eliminated on a single
2402compute node or its influence can be reduced in a
hybrid context.
2404<a name=
"step_76-BlockGaussSeidellikepreconditioners"></a><h4>Block Gauss-Seidel-like preconditioners</h4>
2407Cell-centric loops could be used to create block Gauss-Seidel preconditioners
2408that are multiplicative within one process and additive over processes. These
2409type of preconditioners use during flux computation, in contrast to Jacobi-type
2410preconditioners, already updated
values from neighboring cells. The following
2411pseudo-code visualizes how this could in principal be achieved:
2415Vector<Number> visit_flags(data.n_cell_batches () + data.n_ghost_cell_batches ());
2418data.template loop_cell_centric<VectorType, VectorType>(
2419 [&](
const auto &data,
auto &dst,
const auto &src,
const auto cell_range) {
2421 for (
unsigned int cell = cell_range.first; cell < cell_range.second; ++cell)
2425 for (
unsigned int face = 0; face < GeometryInfo<dim>::faces_per_cell; ++face)
2427 const auto boundary_id = data.get_faces_by_cells_boundary_id(cell, face)[0];
2431 phi_p.reinit(cell, face);
2433 const auto flags = phi_p.read_cell_data(visit_flags);
2434 const auto all_neighbors_have_been_updated =
2436 flags().
begin() + data.n_active_entries_per_cell_batch(cell) == 1;
2438 if(all_neighbors_have_been_updated)
2455 phi.set_cell_data(visit_flags, VectorizedArrayType(1.0));
2461 MatrixFree<dim, Number, VectorizedArrayType>::DataAccessOnFaces::values);
2464For
this purpose, one can exploit the cell-data vector capabilities of
2465MatrixFree and the range-based iteration capabilities of VectorizedArray.
2467Please note that in the given example we process <code>VectorizedArrayType::size()</code>
2468number of blocks, since each lane corresponds to one block. We consider blocks
2469as updated
if all blocks processed by a vector
register have been updated. In
2470the
case of Cartesian meshes
this is a reasonable approach, however,
for
2471general unstructured meshes
this conservative approach might lead to a decrease in the
2472efficiency of the preconditioner. A
reduction of cells processed in parallel
2473by explicitly reducing the number of lanes used by <code>VectorizedArrayType</code>
2474might increase the quality of the preconditioner, but with the cost that each
2475iteration might be more expensive. This dilemma leads us to a further
2476"possibility for extension": vectorization within an element.
2479<a name=
"step_76-PlainProg"></a>
2480<h1> The plain program</h1>
2481@include
"step-76.cc"
void submit_value(const value_type val_in, const unsigned int q_point)
void loop_cell_centric(void(CLASS::*cell_operation)(const MatrixFree &, OutVector &, const InVector &, const std::pair< unsigned int, unsigned int > &) const, const CLASS *owning_class, OutVector &dst, const InVector &src, const bool zero_dst_vector=false, const DataAccessOnFaces src_vector_face_access=DataAccessOnFaces::unspecified) const
void reinit(const MappingType &mapping, const DoFHandler< dim > &dof_handler, const AffineConstraints< number2 > &constraint, const QuadratureType &quad, const AdditionalData &additional_data=AdditionalData())
std::enable_if_t< std::is_floating_point_v< T > &&std::is_floating_point_v< U >, typename ProductType< std::complex< T >, std::complex< U > >::type > operator*(const std::complex< T > &left, const std::complex< U > &right)
#define DEAL_II_ALWAYS_INLINE
const unsigned int DoFAccessor< structdim, dim, spacedim, level_dof_access >::dimension
__global__ void reduction(Number *result, const Number *v, const size_type N)
__global__ void set(Number *val, const Number s, const size_type N)
#define Assert(cond, exc)
#define AssertDimension(dim1, dim2)
#define AssertThrow(cond, exc)
void loop(IteratorType begin, std_cxx20::type_identity_t< IteratorType > end, DOFINFO &dinfo, INFOBOX &info, const std::function< void(std_cxx20::type_identity_t< DOFINFO > &, typename INFOBOX::CellInfo &)> &cell_worker, const std::function< void(std_cxx20::type_identity_t< DOFINFO > &, typename INFOBOX::CellInfo &)> &boundary_worker, const std::function< void(std_cxx20::type_identity_t< DOFINFO > &, std_cxx20::type_identity_t< DOFINFO > &, typename INFOBOX::CellInfo &, typename INFOBOX::CellInfo &)> &face_worker, AssemblerType &assembler, const LoopControl &lctrl=LoopControl())
const bool IsBlockVector< VectorType >::value
@ update_values
Shape function values.
@ update_normal_vectors
Normal vectors.
@ update_JxW_values
Transformed quadrature weights.
@ update_gradients
Shape function gradients.
@ update_quadrature_points
Transformed quadrature points.
#define DEAL_II_NOT_IMPLEMENTED()
MappingQ< dim, spacedim > StaticMappingQ1< dim, spacedim >::mapping
DataComponentInterpretation
@ component_is_part_of_vector
void hyper_rectangle(Triangulation< dim, spacedim > &tria, const Point< dim > &p1, const Point< dim > &p2, const bool colorize=false)
void channel_with_cylinder(Triangulation< dim > &tria, const double shell_region_width=0.03, const unsigned int n_shells=2, const double skewness=2.0, const bool colorize=false)
@ matrix
Contents is actually a matrix.
@ general
No special properties.
double norm(const FEValuesBase< dim > &fe, const ArrayView< const std::vector< Tensor< 1, dim > > > &Du)
Point< spacedim > point(const gp_Pnt &p, const double tolerance=1e-10)
SymmetricTensor< 2, dim, Number > C(const Tensor< 2, dim, Number > &F)
SymmetricTensor< 2, dim, Number > e(const Tensor< 2, dim, Number > &F)
SymmetricTensor< 2, dim, Number > d(const Tensor< 2, dim, Number > &F, const Tensor< 2, dim, Number > &dF_dt)
@ LOW_STORAGE_RK_STAGE9_ORDER5
@ LOW_STORAGE_RK_STAGE3_ORDER3
@ LOW_STORAGE_RK_STAGE7_ORDER4
@ LOW_STORAGE_RK_STAGE5_ORDER4
VectorType::value_type * begin(VectorType &V)
T sum(const T &t, const MPI_Comm mpi_communicator)
unsigned int n_mpi_processes(const MPI_Comm mpi_communicator)
T max(const T &t, const MPI_Comm mpi_communicator)
T min(const T &t, const MPI_Comm mpi_communicator)
unsigned int this_mpi_process(const MPI_Comm mpi_communicator)
T reduce(const T &local_value, const MPI_Comm comm, const std::function< T(const T &, const T &)> &combiner, const unsigned int root_process=0)
std::string get_current_vectorization_level()
Number truncate_to_n_digits(const Number number, const unsigned int n_digits)
std::string int_to_string(const unsigned int value, const unsigned int digits=numbers::invalid_unsigned_int)
constexpr T pow(const T base, const int iexp)
void run(const Iterator &begin, const std_cxx20::type_identity_t< Iterator > &end, Worker worker, Copier copier, const ScratchData &sample_scratch_data, const CopyData &sample_copy_data, const unsigned int queue_length, const unsigned int chunk_size)
long double gamma(const unsigned int n)
void copy(const T *begin, const T *end, U *dest)
int(& functions)(const void *v1, const void *v2)
void reinit(MatrixBlock< MatrixType > &v, const BlockSparsityPattern &p)
static constexpr double PI
const types::boundary_id internal_face_boundary_id
static const unsigned int invalid_unsigned_int
void transform(const InputIterator &begin_in, const InputIterator &end_in, OutputIterator out, const Function &function, const unsigned int grainsize)
::VectorizedArray< Number, width > exp(const ::VectorizedArray< Number, width > &)
::VectorizedArray< Number, width > min(const ::VectorizedArray< Number, width > &, const ::VectorizedArray< Number, width > &)
::VectorizedArray< Number, width > max(const ::VectorizedArray< Number, width > &, const ::VectorizedArray< Number, width > &)
::VectorizedArray< Number, width > sqrt(const ::VectorizedArray< Number, width > &)
::VectorizedArray< Number, width > pow(const ::VectorizedArray< Number, width > &, const Number p)
::VectorizedArray< Number, width > abs(const ::VectorizedArray< Number, width > &)
bool hold_all_faces_to_owned_cells