19 template <
int p,
int c>
20 struct finite_element<mfem::Geometry::CUBE, L2<p, c> > {
21 static constexpr
auto geometry = mfem::Geometry::CUBE;
22 static constexpr
auto family = Family::L2;
23 static constexpr
int components = c;
24 static constexpr
int dim = 3;
25 static constexpr
int n = (p + 1);
26 static constexpr
int ndof = (p + 1) * (p + 1) * (p + 1);
27 static constexpr
int order = p;
29 static constexpr
int VALUE = 0, GRADIENT = 1;
30 static constexpr
int SOURCE = 0, FLUX = 1;
34 typename std::conditional<components == 1, tensor<double, ndof>, tensor<double, ndof, components> >
::type;
36 using dof_type = tensor<double, c, p + 1, p + 1, p + 1>;
38 using value_type =
typename std::conditional<components == 1, double, tensor<double, components> >
::type;
39 using derivative_type =
40 typename std::conditional<components == 1, tensor<double, dim>, tensor<double, components, dim> >
::type;
41 using qf_input_type = tuple<value_type, derivative_type>;
43 SERAC_HOST_DEVICE static constexpr tensor<double, ndof> shape_functions(tensor<double, dim> xi)
45 auto N_xi = GaussLobattoInterpolation<p + 1>(xi[0]);
46 auto N_eta = GaussLobattoInterpolation<p + 1>(xi[1]);
47 auto N_zeta = GaussLobattoInterpolation<p + 1>(xi[2]);
51 tensor<double, ndof> N{};
52 for (
int k = 0; k < p + 1; k++) {
53 for (
int j = 0; j < p + 1; j++) {
54 for (
int i = 0; i < p + 1; i++) {
55 N[count++] = N_xi[i] * N_eta[j] * N_zeta[k];
62 SERAC_HOST_DEVICE static constexpr tensor<double, ndof, dim> shape_function_gradients(tensor<double, dim> xi)
64 auto N_xi = GaussLobattoInterpolation<p + 1>(xi[0]);
65 auto N_eta = GaussLobattoInterpolation<p + 1>(xi[1]);
66 auto N_zeta = GaussLobattoInterpolation<p + 1>(xi[2]);
67 auto dN_xi = GaussLobattoInterpolationDerivative<p + 1>(xi[0]);
68 auto dN_eta = GaussLobattoInterpolationDerivative<p + 1>(xi[1]);
69 auto dN_zeta = GaussLobattoInterpolationDerivative<p + 1>(xi[2]);
74 tensor<double, ndof, dim> dN{};
75 for (
int k = 0; k < p + 1; k++) {
76 for (
int j = 0; j < p + 1; j++) {
77 for (
int i = 0; i < p + 1; i++) {
79 dN_xi[i] * N_eta[j] * N_zeta[k],
80 N_xi[i] * dN_eta[j] * N_zeta[k],
81 N_xi[i] * N_eta[j] * dN_zeta[k]
100 template <
bool apply_weights,
int q>
101 static constexpr
auto calculate_B()
103 constexpr
auto points1D = GaussLegendreNodes<q, mfem::Geometry::SEGMENT>();
104 [[maybe_unused]] constexpr
auto weights1D = GaussLegendreWeights<q, mfem::Geometry::SEGMENT>();
105 tensor<double, q, n> B{};
106 for (
int i = 0; i < q; i++) {
107 B[i] = GaussLobattoInterpolation<n>(points1D[i]);
108 if constexpr (apply_weights) B[i] = B[i] * weights1D[i];
123 template <
bool apply_weights,
int q>
124 static constexpr
auto calculate_G()
126 constexpr
auto points1D = GaussLegendreNodes<q, mfem::Geometry::SEGMENT>();
127 [[maybe_unused]] constexpr
auto weights1D = GaussLegendreWeights<q, mfem::Geometry::SEGMENT>();
128 tensor<double, q, n> G{};
129 for (
int i = 0; i < q; i++) {
130 G[i] = GaussLobattoInterpolationDerivative<n>(points1D[i]);
131 if constexpr (apply_weights) G[i] = G[i] * weights1D[i];
136 template <
typename in_t,
int q>
137 static auto batch_apply_shape_fn(
int j, tensor<in_t, q * q * q> input,
const TensorProductQuadratureRule<q>&)
139 static constexpr
bool apply_weights =
false;
140 static constexpr
auto B = calculate_B<apply_weights, q>();
141 static constexpr
auto G = calculate_G<apply_weights, q>();
144 int jy = (j % (n * n)) / n;
145 int jz = j / (n * n);
147 using source_t = decltype(get<0>(get<0>(in_t{})) +
dot(get<1>(get<0>(in_t{})), tensor<double, dim>{}));
148 using flux_t = decltype(get<0>(get<1>(in_t{})) +
dot(get<1>(get<1>(in_t{})), tensor<double, dim>{}));
150 tensor<tuple<source_t, flux_t>, q * q * q> output;
152 for (
int qz = 0; qz < q; qz++) {
153 for (
int qy = 0; qy < q; qy++) {
154 for (
int qx = 0; qx < q; qx++) {
155 double phi_j = B(qx, jx) * B(qy, jy) * B(qz, jz);
156 tensor<double, dim> dphi_j_dxi = {G(qx, jx) * B(qy, jy) * B(qz, jz), B(qx, jx) * G(qy, jy) * B(qz, jz),
157 B(qx, jx) * B(qy, jy) * G(qz, jz)};
159 int Q = (qz * q + qy) * q + qx;
160 const auto& d00 = get<0>(get<0>(input(Q)));
161 const auto& d01 = get<1>(get<0>(input(Q)));
162 const auto& d10 = get<0>(get<1>(input(Q)));
163 const auto& d11 = get<1>(get<1>(input(Q)));
165 output[Q] = {d00 * phi_j +
dot(d01, dphi_j_dxi), d10 * phi_j +
dot(d11, dphi_j_dxi)};
174 SERAC_HOST_DEVICE static auto interpolate(
const dof_type& X,
const TensorProductQuadratureRule<q>&)
191 static constexpr
bool apply_weights =
false;
192 static constexpr
auto B = calculate_B<apply_weights, q>();
193 static constexpr
auto G = calculate_G<apply_weights, q>();
195 tensor<double, c, q, q, q> value{};
196 tensor<double, c, dim, q, q, q> gradient{};
198 for (
int i = 0; i < c; i++) {
199 auto A10 = contract<2, 1>(X[i], B);
200 auto A11 = contract<2, 1>(X[i], G);
202 auto A20 = contract<1, 1>(A10, B);
203 auto A21 = contract<1, 1>(A11, B);
204 auto A22 = contract<1, 1>(A10, G);
206 value(i) = contract<0, 1>(A20, B);
207 gradient(i, 0) = contract<0, 1>(A21, B);
208 gradient(i, 1) = contract<0, 1>(A22, B);
209 gradient(i, 2) = contract<0, 1>(A20, G);
214 tensor<qf_input_type, q * q * q> one_dimensional;
215 tensor<tuple<tensor<double, c>, tensor<double, c, dim> >, q, q, q> three_dimensional;
218 for (
int qz = 0; qz < q; qz++) {
219 for (
int qy = 0; qy < q; qy++) {
220 for (
int qx = 0; qx < q; qx++) {
221 for (
int i = 0; i < c; i++) {
222 get<VALUE>(output.three_dimensional(qz, qy, qx))[i] = value(i, qz, qy, qx);
223 for (
int j = 0; j < dim; j++) {
224 get<GRADIENT>(output.three_dimensional(qz, qy, qx))[i][j] = gradient(i, j, qz, qy, qx);
231 return output.one_dimensional;
234 template <
typename source_type,
typename flux_type,
int q>
236 const TensorProductQuadratureRule<q>&, dof_type* element_residual,
239 if constexpr (is_zero<source_type>{} && is_zero<flux_type>{}) {
243 constexpr
int ntrial =
std::max(
size(source_type{}),
size(flux_type{}) / dim) / c;
245 using s_buffer_type = std::conditional_t<is_zero<source_type>{}, zero, tensor<double, q, q, q> >;
246 using f_buffer_type = std::conditional_t<is_zero<flux_type>{}, zero, tensor<double, dim, q, q, q> >;
248 static constexpr
bool apply_weights =
true;
249 static constexpr
auto B = calculate_B<apply_weights, q>();
250 static constexpr
auto G = calculate_G<apply_weights, q>();
252 for (
int j = 0; j < ntrial; j++) {
253 for (
int i = 0; i < c; i++) {
254 s_buffer_type source;
257 for (
int qz = 0; qz < q; qz++) {
258 for (
int qy = 0; qy < q; qy++) {
259 for (
int qx = 0; qx < q; qx++) {
260 int Q = (qz * q + qy) * q + qx;
261 if constexpr (!is_zero<source_type>{}) {
262 source(qz, qy, qx) =
reinterpret_cast<const double*
>(&get<SOURCE>(qf_output[Q]))[i * ntrial + j];
264 if constexpr (!is_zero<flux_type>{}) {
265 for (
int k = 0; k < dim; k++) {
266 flux(k, qz, qy, qx) =
267 reinterpret_cast<const double*
>(&get<FLUX>(qf_output[Q]))[(i * dim + k) * ntrial + j];
274 auto A20 = contract<2, 0>(source, B) + contract<2, 0>(flux(0), G);
275 auto A21 = contract<2, 0>(flux(1), B);
276 auto A22 = contract<2, 0>(flux(2), B);
278 auto A10 = contract<1, 0>(A20, B) + contract<1, 0>(A21, G);
279 auto A11 = contract<1, 0>(A22, B);
281 element_residual[j * step](i) += contract<0, 0>(A10, B) + contract<0, 0>(A11, G);
289 static SERAC_DEVICE auto interpolate(
const dof_type& X,
const tensor<double, dim, dim>& J,
290 const TensorProductQuadratureRule<q>& rule, cache_type<q>& cache)
308 int tidx = threadIdx.x % q;
309 int tidy = (threadIdx.x % (q * q)) / q;
310 int tidz = threadIdx.x / (q * q);
312 static constexpr
auto points1D = GaussLegendreNodes<q>();
313 static constexpr
auto B_ = [=]() {
314 tensor<double, q, n> B{};
315 for (
int i = 0; i < q; i++) {
316 B[i] = GaussLobattoInterpolation<n>(points1D[i]);
321 static constexpr
auto G_ = [=]() {
322 tensor<double, q, n> G{};
323 for (
int i = 0; i < q; i++) {
324 G[i] = GaussLobattoInterpolationDerivative<n>(points1D[i]);
329 __shared__ tensor<double, q, n> B;
330 __shared__ tensor<double, q, n> G;
331 for (
int entry = threadIdx.x; entry < n * q; entry += q * q * q) {
339 tuple<tensor<double, c>, tensor<double, c, 3> > qf_input{};
341 for (
int i = 0; i < c; i++) {
342 for (
int dz = tidz; dz < n; dz += q) {
343 for (
int dy = tidy; dy < n; dy += q) {
344 for (
int qx = tidx; qx < q; qx += q) {
346 for (
int dx = 0; dx < n; dx++) {
347 sum[0] += B(qx, dx) * X(i, dz, dy, dx);
348 sum[1] += G(qx, dx) * X(i, dz, dy, dx);
350 cache.A1(0, dz, dy, qx) = sum[0];
351 cache.A1(1, dz, dy, qx) = sum[1];
357 for (
int dz = tidz; dz < n; dz += q) {
358 for (
int qy = tidy; qy < q; qy += q) {
359 for (
int qx = tidx; qx < q; qx += q) {
361 for (
int dy = 0; dy < n; dy++) {
362 sum[0] += B(qy, dy) * cache.A1(0, dz, dy, qx);
363 sum[1] += B(qy, dy) * cache.A1(1, dz, dy, qx);
364 sum[2] += G(qy, dy) * cache.A1(0, dz, dy, qx);
366 cache.A2(0, dz, qy, qx) = sum[0];
367 cache.A2(1, dz, qy, qx) = sum[1];
368 cache.A2(2, dz, qy, qx) = sum[2];
374 for (
int qz = tidz; qz < q; qz += q) {
375 for (
int qy = tidy; qy < q; qy += q) {
376 for (
int qx = tidx; qx < q; qx += q) {
377 for (
int dz = 0; dz < n; dz++) {
378 get<0>(qf_input)[i] += B(qz, dz) * cache.A2(0, dz, qy, qx);
379 get<1>(qf_input)[i][0] += B(qz, dz) * cache.A2(1, dz, qy, qx);
380 get<1>(qf_input)[i][1] += B(qz, dz) * cache.A2(2, dz, qy, qx);
381 get<1>(qf_input)[i][2] += G(qz, dz) * cache.A2(0, dz, qy, qx);
388 get<1>(qf_input) =
dot(get<1>(qf_input),
inv(J));
393 template <
typename T1,
typename T2,
int q>
394 static SERAC_DEVICE void integrate(tuple<T1, T2>& response,
const tensor<double, dim, dim>& J,
395 const TensorProductQuadratureRule<q>& rule, cache_type<q>& cache,
398 int tidx = threadIdx.x % q;
399 int tidy = (threadIdx.x % (q * q)) / q;
400 int tidz = threadIdx.x / (q * q);
402 static constexpr
auto points1D = GaussLegendreNodes<q>();
403 static constexpr
auto weights1D = GaussLegendreWeights<q>();
404 static constexpr
auto B_ = [=]() {
405 tensor<double, q, n> B{};
406 for (
int i = 0; i < q; i++) {
407 B[i] = GaussLobattoInterpolation<n>(points1D[i]);
412 static constexpr
auto G_ = [=]() {
413 tensor<double, q, n> G{};
414 for (
int i = 0; i < q; i++) {
415 G[i] = GaussLobattoInterpolationDerivative<n>(points1D[i]);
420 __shared__ tensor<double, q, n> B;
421 __shared__ tensor<double, q, n> G;
422 for (
int entry = threadIdx.x; entry < n * q; entry += q * q * q) {
430 auto dv =
det(J) * weights1D[tidx] * weights1D[tidy] * weights1D[tidz];
432 get<0>(response) = get<0>(response) * dv;
435 for (
int i = 0; i < c; i++) {
438 for (
int qz = tidz; qz < q; qz += q) {
439 for (
int qy = tidy; qy < q; qy += q) {
440 for (
int dx = tidx; dx < n; dx += q) {
441 cache.A2(0, dx, qy, qz) = 0.0;
442 cache.A2(1, dx, qy, qz) = 0.0;
443 cache.A2(2, dx, qy, qz) = 0.0;
449 for (
int offset = 0; offset < n; offset++) {
450 int dx = (tidx + offset) % n;
451 auto sum = B(tidx, dx) * get<0>(response)(i) + G(tidx, dx) * get<1>(response)(i, 0);
452 atomicAdd(&cache.A2(0, dx, tidz, tidy), sum);
453 atomicAdd(&cache.A2(1, dx, tidz, tidy), B(tidx, dx) * get<1>(response)(i, 1));
454 atomicAdd(&cache.A2(2, dx, tidz, tidy), B(tidx, dx) * get<1>(response)(i, 2));
458 for (
int qz = tidz; qz < q; qz += q) {
459 for (
int dy = tidy; dy < n; dy += q) {
460 for (
int dx = tidx; dx < n; dx += q) {
462 for (
int qy = 0; qy < q; qy++) {
463 sum[0] += B(qy, dy) * cache.A2(0, dx, qz, qy);
464 sum[0] += G(qy, dy) * cache.A2(1, dx, qz, qy);
465 sum[1] += B(qy, dy) * cache.A2(2, dx, qz, qy);
467 cache.A1(0, qz, dy, dx) = sum[0];
468 cache.A1(1, qz, dy, dx) = sum[1];
474 for (
int dz = tidz; dz < n; dz += q) {
475 for (
int dy = tidy; dy < n; dy += q) {
476 for (
int dx = tidx; dx < n; dx += q) {
478 for (
int qz = 0; qz < q; qz++) {
479 sum += B(qz, dz) * cache.A1(0, qz, dy, dx);
480 sum += G(qz, dz) * cache.A1(1, qz, dy, dx);
482 residual(i, dz, dy, dx) += sum;
#define SERAC_DEVICE
Macro that evaluates to __device__ when compiling with nvcc and does nothing on a host compiler.
#define SERAC_HOST_DEVICE
Macro that evaluates to __host__ __device__ when compiling with nvcc and does nothing on a host compi...
constexpr SERAC_HOST_DEVICE auto dot(const isotropic_tensor< S, m, m > &I, const tensor< T, m, n... > &A)
dot product between an isotropic and (nonisotropic) tensor
tensor(const T(&data)[n1]) -> tensor< T, n1 >
class template argument deduction guide for type tensor.
SERAC_HOST_DEVICE auto max(dual< gradient_type > a, double b)
Implementation of max for dual numbers.
constexpr SERAC_HOST_DEVICE int size(const tensor< T, n... > &)
returns the total number of stored values in a tensor
constexpr SERAC_HOST_DEVICE auto det(const isotropic_tensor< T, m, m > &I)
compute the determinant of an isotropic tensor
constexpr SERAC_HOST_DEVICE auto type(const tuple< T... > &values)
a function intended to be used for extracting the ith type from a tuple.
constexpr SERAC_HOST_DEVICE auto inv(const isotropic_tensor< T, m, m > &I)
return the inverse of an isotropic tensor
constexpr SERAC_HOST_DEVICE auto transpose(const isotropic_tensor< T, m, m > &I)
return the transpose of an isotropic tensor