23 struct finite_element<mfem::Geometry::CUBE, Hcurl<p>> {
24 static constexpr
auto geometry = mfem::Geometry::CUBE;
25 static constexpr
auto family = Family::HCURL;
26 static constexpr
int dim = 3;
27 static constexpr
int n = p + 1;
28 static constexpr
int ndof = 3 * p * (p + 1) * (p + 1);
29 static constexpr
int components = 1;
31 static constexpr
int VALUE = 0, CURL = 1;
32 static constexpr
int SOURCE = 0, FLUX = 1;
36 typename std::conditional<components == 1, tensor<double, ndof>, tensor<double, ndof, components>>
::type;
40 tensor<double, p + 1, p + 1, p> x;
41 tensor<double, p + 1, p, p + 1> y;
42 tensor<double, p, p + 1, p + 1> z;
46 using cpu_batched_values_type = tensor<tensor<double, 3>, q, q, q>;
49 using cpu_batched_derivatives_type = tensor<tensor<double, 3>, q, q, q>;
51 static constexpr
auto directions = [] {
52 int dof_per_direction = p * (p + 1) * (p + 1);
54 tensor<double, ndof, dim> directions{};
55 for (
int i = 0; i < dof_per_direction; i++) {
56 directions[i + 0 * dof_per_direction] = {1.0, 0.0, 0.0};
57 directions[i + 1 * dof_per_direction] = {0.0, 1.0, 0.0};
58 directions[i + 2 * dof_per_direction] = {0.0, 0.0, 1.0};
63 static constexpr
auto nodes = []() {
64 auto legendre_nodes = GaussLegendreNodes<p, mfem::Geometry::SEGMENT>();
65 auto lobatto_nodes = GaussLobattoNodes<p + 1>();
67 tensor<double, ndof, dim> nodes{};
70 for (
int k = 0; k < p + 1; k++) {
71 for (
int j = 0; j < p + 1; j++) {
72 for (
int i = 0; i < p; i++) {
73 nodes[count++] = {legendre_nodes[i], lobatto_nodes[j], lobatto_nodes[k]};
78 for (
int k = 0; k < p + 1; k++) {
79 for (
int j = 0; j < p; j++) {
80 for (
int i = 0; i < p + 1; i++) {
81 nodes[count++] = {lobatto_nodes[i], legendre_nodes[j], lobatto_nodes[k]};
86 for (
int k = 0; k < p; k++) {
87 for (
int j = 0; j < p + 1; j++) {
88 for (
int i = 0; i < p + 1; i++) {
89 nodes[count++] = {lobatto_nodes[i], lobatto_nodes[j], legendre_nodes[k]};
107 template <
bool apply_weights,
int q>
108 static constexpr
auto calculate_B1()
110 constexpr
auto points1D = GaussLegendreNodes<q>();
111 [[maybe_unused]] constexpr
auto weights1D = GaussLegendreWeights<q>();
112 tensor<double, q, p> B1{};
113 for (
int i = 0; i < q; i++) {
114 B1[i] = GaussLegendreInterpolation<p>(points1D[i]);
115 if constexpr (apply_weights) B1[i] = B1[i] * weights1D[i];
130 template <
bool apply_weights,
int q>
131 static constexpr
auto calculate_B2()
133 constexpr
auto points1D = GaussLegendreNodes<q>();
134 [[maybe_unused]] constexpr
auto weights1D = GaussLegendreWeights<q>();
135 tensor<double, q, p + 1> B2{};
136 for (
int i = 0; i < q; i++) {
137 B2[i] = GaussLobattoInterpolation<p + 1>(points1D[i]);
138 if constexpr (apply_weights) B2[i] = B2[i] * weights1D[i];
153 template <
bool apply_weights,
int q>
154 static constexpr
auto calculate_G2()
156 constexpr
auto points1D = GaussLegendreNodes<q>();
157 [[maybe_unused]] constexpr
auto weights1D = GaussLegendreWeights<q>();
158 tensor<double, q, p + 1> G2{};
159 for (
int i = 0; i < q; i++) {
160 G2[i] = GaussLobattoInterpolationDerivative<p + 1>(points1D[i]);
161 if constexpr (apply_weights) G2[i] = G2[i] * weights1D[i];
166 SERAC_HOST_DEVICE static constexpr tensor<double, ndof, dim> shape_functions(tensor<double, dim> xi)
168 tensor<double, ndof, dim> N{};
170 tensor<double, p> f[3] = {GaussLegendreInterpolation<p>(xi[0]), GaussLegendreInterpolation<p>(xi[1]),
171 GaussLegendreInterpolation<p>(xi[2])};
173 tensor<double, p + 1> g[3] = {GaussLobattoInterpolation<p + 1>(xi[0]), GaussLobattoInterpolation<p + 1>(xi[1]),
174 GaussLobattoInterpolation<p + 1>(xi[2])};
179 for (
int k = 0; k < p + 1; k++) {
180 for (
int j = 0; j < p + 1; j++) {
181 for (
int i = 0; i < p; i++) {
182 N[count++] = {f[0][i] * g[1][j] * g[2][k], 0.0, 0.0};
188 for (
int k = 0; k < p + 1; k++) {
189 for (
int j = 0; j < p; j++) {
190 for (
int i = 0; i < p + 1; i++) {
191 N[count++] = {0.0, g[0][i] * f[1][j] * g[2][k], 0.0};
197 for (
int k = 0; k < p; k++) {
198 for (
int j = 0; j < p + 1; j++) {
199 for (
int i = 0; i < p + 1; i++) {
200 N[count++] = {0.0, 0.0, g[0][i] * g[1][j] * f[2][k]};
208 SERAC_HOST_DEVICE static constexpr tensor<double, ndof, dim> shape_function_curl(tensor<double, dim> xi)
210 tensor<double, ndof, dim> curl{};
212 tensor<double, p> f[3] = {GaussLegendreInterpolation<p>(xi[0]), GaussLegendreInterpolation<p>(xi[1]),
213 GaussLegendreInterpolation<p>(xi[2])};
215 tensor<double, p + 1> g[3] = {GaussLobattoInterpolation<p + 1>(xi[0]), GaussLobattoInterpolation<p + 1>(xi[1]),
216 GaussLobattoInterpolation<p + 1>(xi[2])};
218 tensor<double, p + 1> dg[3] = {GaussLobattoInterpolationDerivative<p + 1>(xi[0]),
219 GaussLobattoInterpolationDerivative<p + 1>(xi[1]),
220 GaussLobattoInterpolationDerivative<p + 1>(xi[2])};
225 for (
int k = 0; k < p + 1; k++) {
226 for (
int j = 0; j < p + 1; j++) {
227 for (
int i = 0; i < p; i++) {
229 curl[count++] = {0.0, f[0][i] * g[1][j] * dg[2][k], -f[0][i] * dg[1][j] * g[2][k]};
235 for (
int k = 0; k < p + 1; k++) {
236 for (
int j = 0; j < p; j++) {
237 for (
int i = 0; i < p + 1; i++) {
239 curl[count++] = {-g[0][i] * f[1][j] * dg[2][k], 0.0, dg[0][i] * f[1][j] * g[2][k]};
245 for (
int k = 0; k < p; k++) {
246 for (
int j = 0; j < p + 1; j++) {
247 for (
int i = 0; i < p + 1; i++) {
249 curl[count++] = {g[0][i] * dg[1][j] * f[2][k], -dg[0][i] * g[1][j] * f[2][k], 0.0};
257 template <
typename in_t,
int q>
258 static auto batch_apply_shape_fn(
int j, tensor<in_t, q * q * q> input,
const TensorProductQuadratureRule<q>&)
260 constexpr
bool apply_weights =
false;
261 constexpr tensor<double, q, p> B1 = calculate_B1<apply_weights, q>();
262 constexpr tensor<double, q, p + 1> B2 = calculate_B2<apply_weights, q>();
263 constexpr tensor<double, q, p + 1> G2 = calculate_G2<apply_weights, q>();
268 int dir = j / (p * (p + 1) * (p + 1));
269 int remainder = j % (p * (p + 1) * (p + 1));
273 jy = (remainder % (p * (p + 1))) / p;
274 jz = remainder / (p * (p + 1));
278 jx = remainder % (p + 1);
279 jy = (remainder % (p * (p + 1))) / (p + 1);
280 jz = remainder / (p * (p + 1));
284 jx = remainder % (p + 1);
285 jy = (remainder % ((p + 1) * (p + 1))) / (p + 1);
286 jz = remainder / ((p + 1) * (p + 1));
290 using source_t = decltype(
dot(get<0>(get<0>(in_t{})),
vec3{}) +
dot(get<1>(get<0>(in_t{})),
vec3{}));
291 using flux_t = decltype(
dot(get<0>(get<1>(in_t{})),
vec3{}) +
dot(get<1>(get<1>(in_t{})),
vec3{}));
293 tensor<tuple<source_t, flux_t>, q * q * q> output;
295 for (
int qz = 0; qz < q; qz++) {
296 for (
int qy = 0; qy < q; qy++) {
297 for (
int qx = 0; qx < q; qx++) {
298 tensor<double, 3> phi_j{};
299 tensor<double, 3> curl_phi_j{};
303 phi_j[0] = B1(qx, jx) * B2(qy, jy) * B2(qz, jz);
304 curl_phi_j[1] = B1(qx, jx) * B2(qy, jy) * G2(qz, jz);
305 curl_phi_j[2] = -B1(qx, jx) * G2(qy, jy) * B2(qz, jz);
309 curl_phi_j[0] = -B2(qx, jx) * B1(qy, jy) * G2(qz, jz);
310 phi_j[1] = B2(qx, jx) * B1(qy, jy) * B2(qz, jz);
311 curl_phi_j[2] = G2(qx, jx) * B1(qy, jy) * B2(qz, jz);
315 curl_phi_j[0] = B2(qx, jx) * G2(qy, jy) * B1(qz, jz);
316 curl_phi_j[1] = -G2(qx, jx) * B2(qy, jy) * B1(qz, jz);
317 phi_j[2] = B2(qx, jx) * B2(qy, jy) * B1(qz, jz);
321 int Q = (qz * q + qy) * q + qx;
322 const auto& d00 = get<0>(get<0>(input(Q)));
323 const auto& d01 = get<1>(get<0>(input(Q)));
324 const auto& d10 = get<0>(get<1>(input(Q)));
325 const auto& d11 = get<1>(get<1>(input(Q)));
327 output[Q] = {
dot(d00, phi_j) +
dot(d01, curl_phi_j),
dot(d10, phi_j) +
dot(d11, curl_phi_j)};
336 SERAC_HOST_DEVICE static auto interpolate(
const dof_type& element_values,
const TensorProductQuadratureRule<q>&)
338 constexpr
bool apply_weights =
false;
339 constexpr tensor<double, q, p> B1 = calculate_B1<apply_weights, q>();
340 constexpr tensor<double, q, p + 1> B2 = calculate_B2<apply_weights, q>();
341 constexpr tensor<double, q, p + 1> G2 = calculate_G2<apply_weights, q>();
343 tensor<tensor<double, q, q, q>, 3> value{};
344 tensor<tensor<double, q, q, q>, 3> curl{};
347 constexpr
int x = 2, y = 1, z = 0;
351 auto A1 = contract< x, 1 >(element_values.x, B1);
352 auto A20 = contract< y, 1 >(A1, B2);
353 auto A21 = contract< y, 1 >(A1, G2);
354 value[0] = contract< z, 1 >(A20, B2);
355 curl[1] += contract< z, 1 >(A20, G2);
356 curl[2] -= contract< z, 1 >(A21, B2);
360 auto A1 = contract< y, 1 >(element_values.y, B1);
361 auto A20 = contract< z, 1 >(A1, B2);
362 auto A21 = contract< z, 1 >(A1, G2);
363 value[1] = contract< x, 1 >(A20, B2);
364 curl[2] += contract< x, 1 >(A20, G2);
365 curl[0] -= contract< x, 1 >(A21, B2);
369 auto A1 = contract< z, 1 >(element_values.z, B1);
370 auto A20 = contract< x, 1 >(A1, B2);
371 auto A21 = contract< x, 1 >(A1, G2);
372 value[2] = contract< y, 1 >(A20, B2);
373 curl[0] += contract< y, 1 >(A20, G2);
374 curl[1] -= contract< y, 1 >(A21, B2);
378 tensor<tuple<tensor<double, 3>, tensor<double, 3>>, q * q * q> qf_inputs;
381 for (
int qz = 0; qz < q; qz++) {
382 for (
int qy = 0; qy < q; qy++) {
383 for (
int qx = 0; qx < q; qx++) {
384 for (
int i = 0; i < 3; i++) {
385 get<VALUE>(qf_inputs(count))[i] = value[i](qz, qy, qx);
386 get<CURL>(qf_inputs(count))[i] = curl[i](qz, qy, qx);
396 template <
typename source_type,
typename flux_type,
int q>
398 const TensorProductQuadratureRule<q>&, dof_type* element_residual,
399 [[maybe_unused]]
int step = 1)
401 constexpr
bool apply_weights =
true;
402 constexpr tensor<double, q, p> B1 = calculate_B1<apply_weights, q>();
403 constexpr tensor<double, q, p + 1> B2 = calculate_B2<apply_weights, q>();
404 constexpr tensor<double, q, p + 1> G2 = calculate_G2<apply_weights, q>();
406 tensor<double, 3, q, q, q> source{};
407 tensor<double, 3, q, q, q> flux{};
409 for (
int qz = 0; qz < q; qz++) {
410 for (
int qy = 0; qy < q; qy++) {
411 for (
int qx = 0; qx < q; qx++) {
412 int k = (qz * q + qy) * q + qx;
413 tensor<double, 3> s{get<SOURCE>(qf_output[k])};
414 tensor<double, 3> f{get<FLUX>(qf_output[k])};
415 for (
int i = 0; i < 3; i++) {
416 source(i, qz, qy, qx) = s[i];
417 flux(i, qz, qy, qx) = f[i];
424 constexpr
int x = 2, y = 1, z = 0;
431 auto A20 = contract< z, 0 >(source[0], B2) + contract< z, 0 >(flux[1], G2);
432 auto A21 = contract< z, 0 >(flux[2], B2);
433 auto A1 = contract< y, 0 >(A20, B2) - contract< y, 0 >(A21, G2);
434 element_residual[0].x += contract< x, 0 >(A1, B1);
441 auto A20 = contract< x, 0 >(source[1], B2) + contract< x, 0 >(flux[2], G2);
442 auto A21 = contract< x, 0 >(flux[0], B2);
443 auto A1 = contract< z, 0 >(A20, B2) - contract< z, 0 >(A21, G2);
444 element_residual[0].y += contract< y, 0 >(A1, B1);
451 auto A20 = contract< y, 0 >(source[2], B2) + contract< y, 0 >(flux[0], G2);
452 auto A21 = contract< y, 0 >(flux[1], B2);
453 auto A1 = contract< x, 0 >(A20, B2) - contract< x, 0 >(A21, G2);
454 element_residual[0].z += contract< z, 0 >(A1, B1);
464 static SERAC_DEVICE auto interpolate(
const dof_type& element_values,
const tensor<double, dim, dim>& J,
465 const TensorProductQuadratureRule<q>& rule, cache_type<q>& cache)
467 int tidx = threadIdx.x % q;
468 int tidy = (threadIdx.x % (q * q)) / q;
469 int tidz = threadIdx.x / (q * q);
471 static constexpr
auto points1D = GaussLegendreNodes<q>();
473 static constexpr
auto B1_ = [=]() {
474 tensor<double, q, p> B1{};
475 for (
int i = 0; i < q; i++) {
476 B1[i] = GaussLegendreInterpolation<p>(points1D[i]);
481 static constexpr
auto B2_ = [=]() {
482 tensor<double, q, n> B2{};
483 for (
int i = 0; i < q; i++) {
484 B2[i] = GaussLobattoInterpolation<n>(points1D[i]);
489 static constexpr
auto G2_ = [=]() {
490 tensor<double, q, n> G2{};
491 for (
int i = 0; i < q; i++) {
492 G2[i] = GaussLobattoInterpolationDerivative<n>(points1D[i]);
497 __shared__ tensor<double, q, p> B1;
498 __shared__ tensor<double, q, n> B2;
499 __shared__ tensor<double, q, n> G2;
500 for (
int entry = threadIdx.x; entry < n * q; entry += q * q) {
504 B1(j, i) = B1_(j, i);
506 B2(j, i) = B2_(j, i);
507 G2(j, i) = G2_(j, i);
511 tensor<double, dim> value{};
512 tensor<double, dim> curl{};
517 for (
int dz = tidz; dz < p + 1; dz += q) {
518 for (
int dy = tidy; dy < p + 1; dy += q) {
519 for (
int qx = tidx; qx < q; qx += q) {
521 for (
int dx = 0; dx < p; dx++) {
522 sum += B1(qx, dx) * element_values.x(dz, dy, dx);
524 cache.A1(dz, dy, qx) = sum;
530 for (
int dz = tidz; dz < p + 1; dz += q) {
531 for (
int qy = tidy; qy < q; qy += q) {
532 for (
int qx = tidx; qx < q; qx += q) {
534 for (
int dy = 0; dy < (p + 1); dy++) {
535 sum[0] += B2(qy, dy) * cache.A1(dz, dy, qx);
536 sum[1] += G2(qy, dy) * cache.A1(dz, dy, qx);
538 cache.A2(0, dz, qy, qx) = sum[0];
539 cache.A2(1, dz, qy, qx) = sum[1];
545 for (
int qz = tidz; qz < q; qz += q) {
546 for (
int qy = tidy; qy < q; qy += q) {
547 for (
int qx = tidx; qx < q; qx += q) {
549 for (
int dz = 0; dz < (p + 1); dz++) {
550 sum[0] += B2(qz, dz) * cache.A2(0, dz, qy, qx);
551 sum[1] += G2(qz, dz) * cache.A2(0, dz, qy, qx);
552 sum[2] += B2(qz, dz) * cache.A2(1, dz, qy, qx);
565 for (
int dz = tidz; dz < p + 1; dz += q) {
566 for (
int dx = tidx; dx < p + 1; dx += q) {
567 for (
int qy = tidy; qy < q; qy += q) {
569 for (
int dy = 0; dy < p; dy++) {
570 sum += B1(qy, dy) * element_values.y(dz, dy, dx);
572 cache.A1(dz, dx, qy) = sum;
578 for (
int dz = tidz; dz < p + 1; dz += q) {
579 for (
int qy = tidy; qy < q; qy += q) {
580 for (
int qx = tidx; qx < q; qx += q) {
582 for (
int dx = 0; dx < (p + 1); dx++) {
583 sum[0] += B2(qx, dx) * cache.A1(dz, dx, qy);
584 sum[1] += G2(qx, dx) * cache.A1(dz, dx, qy);
586 cache.A2(0, dz, qy, qx) = sum[0];
587 cache.A2(1, dz, qy, qx) = sum[1];
593 for (
int qz = tidz; qz < q; qz += q) {
594 for (
int qy = tidy; qy < q; qy += q) {
595 for (
int qx = tidx; qx < q; qx += q) {
597 for (
int dz = 0; dz < (p + 1); dz++) {
598 sum[0] += B2(qz, dz) * cache.A2(0, dz, qy, qx);
599 sum[1] += G2(qz, dz) * cache.A2(0, dz, qy, qx);
600 sum[2] += B2(qz, dz) * cache.A2(1, dz, qy, qx);
613 for (
int dy = tidy; dy < p + 1; dy += q) {
614 for (
int dx = tidx; dx < p + 1; dx += q) {
615 for (
int qz = tidz; qz < q; qz += q) {
617 for (
int k = 0; k < p; k++) {
618 sum += B1(qz, k) * element_values.z(k, dy, dx);
620 cache.A1(dy, dx, qz) = sum;
626 for (
int dy = tidy; dy < p + 1; dy += q) {
627 for (
int qz = tidz; qz < q; qz += q) {
628 for (
int qx = tidx; qx < q; qx += q) {
630 for (
int dx = 0; dx < (p + 1); dx++) {
631 sum[0] += B2(qx, dx) * cache.A1(dy, dx, qz);
632 sum[1] += G2(qx, dx) * cache.A1(dy, dx, qz);
634 cache.A2(0, dy, qz, qx) = sum[0];
635 cache.A2(1, dy, qz, qx) = sum[1];
641 for (
int qz = tidz; qz < q; qz += q) {
642 for (
int qy = tidy; qy < q; qy += q) {
643 for (
int qx = tidx; qx < q; qx += q) {
645 for (
int dy = 0; dy < (p + 1); dy++) {
646 sum[0] += B2(qy, dy) * cache.A2(0, dy, qz, qx);
647 sum[1] += G2(qy, dy) * cache.A2(0, dy, qz, qx);
648 sum[2] += B2(qy, dy) * cache.A2(1, dy, qz, qx);
660 curl =
dot(J, curl) /
det(J);
662 return tuple{value, curl};
665 template <
typename T1,
typename T2,
int q>
666 static SERAC_DEVICE void integrate(tuple<T1, T2>& response,
const tensor<double, dim, dim>& J,
667 const TensorProductQuadratureRule<q>& rule, cache_type<q>& cache,
670 int tidx = threadIdx.x % q;
671 int tidy = (threadIdx.x % (q * q)) / q;
672 int tidz = threadIdx.x / (q * q);
674 static constexpr
auto points1D = GaussLegendreNodes<q>();
675 static constexpr
auto weights1D = GaussLegendreWeights<q>();
677 static constexpr
auto B1_ = [=]() {
678 tensor<double, q, p> B1{};
679 for (
int i = 0; i < q; i++) {
680 B1[i] = GaussLegendreInterpolation<p>(points1D[i]);
685 static constexpr
auto B2_ = [=]() {
686 tensor<double, q, n> B2{};
687 for (
int i = 0; i < q; i++) {
688 B2[i] = GaussLobattoInterpolation<n>(points1D[i]);
693 static constexpr
auto G2_ = [=]() {
694 tensor<double, q, n> G2{};
695 for (
int i = 0; i < q; i++) {
696 G2[i] = GaussLobattoInterpolationDerivative<n>(points1D[i]);
701 __shared__ tensor<double, q, p> B1;
702 __shared__ tensor<double, q, n> B2;
703 __shared__ tensor<double, q, n> G2;
704 for (
int entry = threadIdx.x; entry < n * q; entry += q * q) {
708 B1(j, i) = B1_(j, i);
710 B2(j, i) = B2_(j, i);
711 G2(j, i) = G2_(j, i);
720 auto dv = detJ * weights1D[tidx] * weights1D[tidy] * weights1D[tidz];
723 auto flux =
dot(get<1>(response), J) * (dv / detJ);
728 for (
int qz = tidz; qz < q; qz += q) {
729 for (
int qy = tidy; qy < q; qy += q) {
730 for (
int dx = tidx; dx < n; dx += q) {
731 cache.A2(0, dx, qy, qz) = 0.0;
732 cache.A2(1, dx, qy, qz) = 0.0;
738 for (
int offset = 0; offset < n; offset++) {
739 int dz = (tidz + offset) % n;
740 auto sum = B2(tidz, dz) * source[0] + G2(tidz, dz) * flux[1];
741 atomicAdd(&cache.A2(0, dz, tidy, tidx), sum);
742 atomicAdd(&cache.A2(1, dz, tidy, tidx), -B2(tidz, dz) * flux[2]);
746 for (
int dz = tidz; dz < p + 1; dz += q) {
747 for (
int dy = tidy; dy < p + 1; dy += q) {
748 for (
int qx = tidx; qx < q; qx += q) {
750 for (
int qy = 0; qy < q; qy++) {
751 sum += B2(qy, dy) * cache.A2(0, dz, qy, qx);
752 sum += G2(qy, dy) * cache.A2(1, dz, qy, qx);
754 cache.A1(dz, dy, qx) = sum;
760 for (
int dz = tidz; dz < p + 1; dz += q) {
761 for (
int dy = tidy; dy < p + 1; dy += q) {
762 for (
int dx = tidx; dx < p; dx += q) {
764 for (
int qx = 0; qx < q; qx++) {
765 sum += B1(qx, dx) * cache.A1(dz, dy, qx);
767 residual.x(dz, dy, dx) += sum;
776 for (
int qz = tidz; qz < q; qz += q) {
777 for (
int qy = tidy; qy < q; qy += q) {
778 for (
int dx = tidx; dx < n; dx += q) {
779 cache.A2(0, dx, qy, qz) = 0.0;
780 cache.A2(1, dx, qy, qz) = 0.0;
786 for (
int offset = 0; offset < n; offset++) {
787 int dz = (tidz + offset) % n;
788 auto sum = B2(tidz, dz) * source[1] - G2(tidz, dz) * flux[0];
789 atomicAdd(&cache.A2(0, dz, tidy, tidx), sum);
790 atomicAdd(&cache.A2(1, dz, tidy, tidx), B2(tidz, dz) * flux[2]);
794 for (
int dz = tidz; dz < p + 1; dz += q) {
795 for (
int dx = tidx; dx < p + 1; dx += q) {
796 for (
int qy = tidy; qy < q; qy += q) {
798 for (
int qx = 0; qx < q; qx++) {
799 sum += B2(qx, dx) * cache.A2(0, dz, qy, qx);
800 sum += G2(qx, dx) * cache.A2(1, dz, qy, qx);
802 cache.A1(dz, dx, qy) = sum;
808 for (
int dz = tidz; dz < p + 1; dz += q) {
809 for (
int dy = tidy; dy < p; dy += q) {
810 for (
int dx = tidx; dx < p + 1; dx += q) {
812 for (
int qy = 0; qy < q; qy++) {
813 sum += B1(qy, dy) * cache.A1(dz, dx, qy);
815 residual.y(dz, dy, dx) += sum;
824 for (
int qz = tidz; qz < q; qz += q) {
825 for (
int qy = tidy; qy < q; qy += q) {
826 for (
int dx = tidx; dx < n; dx += q) {
827 cache.A2(0, dx, qy, qz) = 0.0;
828 cache.A2(1, dx, qy, qz) = 0.0;
834 for (
int offset = 0; offset < n; offset++) {
835 int dx = (tidx + offset) % n;
836 auto sum = B2(tidx, dx) * source[2] - G2(tidx, dx) * flux[1];
837 atomicAdd(&cache.A2(0, dx, tidz, tidy), sum);
838 atomicAdd(&cache.A2(1, dx, tidz, tidy), B2(tidx, dx) * flux[0]);
842 for (
int dy = tidy; dy < p + 1; dy += q) {
843 for (
int dx = tidx; dx < p + 1; dx += q) {
844 for (
int qz = tidz; qz < q; qz += q) {
846 for (
int qy = 0; qy < q; qy++) {
847 sum += B2(qy, dy) * cache.A2(0, dx, qz, qy);
848 sum += G2(qy, dy) * cache.A2(1, dx, qz, qy);
850 cache.A1(dy, dx, qz) = sum;
856 for (
int dz = tidz; dz < p; dz += q) {
857 for (
int dy = tidy; dy < p + 1; dy += q) {
858 for (
int dx = tidx; dx < p + 1; dx += q) {
860 for (
int qz = 0; qz < q; qz++) {
861 sum += B1(qz, dz) * cache.A1(dy, dx, qz);
863 residual.z(dz, dy, dx) += sum;
#define SERAC_DEVICE
Macro that evaluates to __device__ when compiling with nvcc and does nothing on a host compiler.
#define SERAC_HOST_DEVICE
Macro that evaluates to __host__ __device__ when compiling with nvcc and does nothing on a host compi...
constexpr SERAC_HOST_DEVICE auto dot(const isotropic_tensor< S, m, m > &I, const tensor< T, m, n... > &A)
dot product between an isotropic and (nonisotropic) tensor
tensor(const T(&data)[n1]) -> tensor< T, n1 >
class template argument deduction guide for type tensor.
constexpr SERAC_HOST_DEVICE auto linear_solve(const LuFactorization< S, n > &lu_factors, const tensor< T, n, m... > &b)
tensor< double, 3 > vec3
statically sized vector of 3 doubles
constexpr SERAC_HOST_DEVICE auto det(const isotropic_tensor< T, m, m > &I)
compute the determinant of an isotropic tensor
tuple(T...) -> tuple< T... >
Class template argument deduction rule for tuples.
constexpr SERAC_HOST_DEVICE auto type(const tuple< T... > &values)
a function intended to be used for extracting the ith type from a tuple.
constexpr SERAC_HOST_DEVICE auto transpose(const isotropic_tensor< T, m, m > &I)
return the transpose of an isotropic tensor