Smith  0.1
Smith is an implicit thermal structural mechanics simulation code.
hexahedron_Hcurl.inl
Go to the documentation of this file.
1 // Copyright (c) Lawrence Livermore National Security, LLC and
2 // other Smith Project Developers. See the top-level LICENSE file for
3 // details.
4 //
5 // SPDX-License-Identifier: (BSD-3-Clause)
6 
13 // this specialization defines shape functions (and their curls) that
14 // interpolate at Gauss-Lobatto nodes for closed intervals, and Gauss-Legendre
15 // nodes for open intervals.
16 //
17 // note 1: mfem assumes the parent element domain is [0,1]x[0,1]x[0,1]
18 // note 2: dofs are numbered by direction and then lexicographically in space.
19 // see quadrilateral_hcurl.inl for more information
20 // for additional information on the finite_element concept requirements, see finite_element.hpp
22 template <int p>
23 struct finite_element<mfem::Geometry::CUBE, Hcurl<p>> {
24  static constexpr auto geometry = mfem::Geometry::CUBE;
25  static constexpr auto family = Family::HCURL;
26  static constexpr int dim = 3;
27  static constexpr int n = p + 1;
28  static constexpr int ndof = 3 * p * (p + 1) * (p + 1);
29  static constexpr int components = 1;
30 
31  static constexpr int VALUE = 0, CURL = 1;
32  static constexpr int SOURCE = 0, FLUX = 1;
33 
34  // TODO: delete this in favor of dof_type
35  using residual_type =
36  typename std::conditional<components == 1, tensor<double, ndof>, tensor<double, ndof, components>>::type;
37 
38  // this is how mfem provides the data to us for these elements
39  struct dof_type {
40  tensor<double, p + 1, p + 1, p> x;
41  tensor<double, p + 1, p, p + 1> y;
42  tensor<double, p, p + 1, p + 1> z;
43  };
44 
45  template <int q>
46  using cpu_batched_values_type = tensor<tensor<double, 3>, q, q, q>;
47 
48  template <int q>
49  using cpu_batched_derivatives_type = tensor<tensor<double, 3>, q, q, q>;
50 
51  static constexpr auto directions = [] {
52  int dof_per_direction = p * (p + 1) * (p + 1);
53 
54  tensor<double, ndof, dim> directions{};
55  for (int i = 0; i < dof_per_direction; i++) {
56  directions[i + 0 * dof_per_direction] = {1.0, 0.0, 0.0};
57  directions[i + 1 * dof_per_direction] = {0.0, 1.0, 0.0};
58  directions[i + 2 * dof_per_direction] = {0.0, 0.0, 1.0};
59  }
60  return directions;
61  }();
62 
63  static constexpr auto nodes = []() {
64  auto legendre_nodes = GaussLegendreNodes<p, mfem::Geometry::SEGMENT>();
65  auto lobatto_nodes = GaussLobattoNodes<p + 1>();
66 
67  tensor<double, ndof, dim> nodes{};
68 
69  int count = 0;
70  for (int k = 0; k < p + 1; k++) {
71  for (int j = 0; j < p + 1; j++) {
72  for (int i = 0; i < p; i++) {
73  nodes[count++] = {legendre_nodes[i], lobatto_nodes[j], lobatto_nodes[k]};
74  }
75  }
76  }
77 
78  for (int k = 0; k < p + 1; k++) {
79  for (int j = 0; j < p; j++) {
80  for (int i = 0; i < p + 1; i++) {
81  nodes[count++] = {lobatto_nodes[i], legendre_nodes[j], lobatto_nodes[k]};
82  }
83  }
84  }
85 
86  for (int k = 0; k < p; k++) {
87  for (int j = 0; j < p + 1; j++) {
88  for (int i = 0; i < p + 1; i++) {
89  nodes[count++] = {lobatto_nodes[i], lobatto_nodes[j], legendre_nodes[k]};
90  }
91  }
92  }
93 
94  return nodes;
95  }();
96 
107  template <bool apply_weights, int q>
108  static constexpr auto calculate_B1()
109  {
110  constexpr auto points1D = GaussLegendreNodes<q>();
111  [[maybe_unused]] constexpr auto weights1D = GaussLegendreWeights<q>();
112  tensor<double, q, p> B1{};
113  for (int i = 0; i < q; i++) {
114  B1[i] = GaussLegendreInterpolation<p>(points1D[i]);
115  if constexpr (apply_weights) B1[i] = B1[i] * weights1D[i];
116  }
117  return B1;
118  }
119 
130  template <bool apply_weights, int q>
131  static constexpr auto calculate_B2()
132  {
133  constexpr auto points1D = GaussLegendreNodes<q>();
134  [[maybe_unused]] constexpr auto weights1D = GaussLegendreWeights<q>();
135  tensor<double, q, p + 1> B2{};
136  for (int i = 0; i < q; i++) {
137  B2[i] = GaussLobattoInterpolation<p + 1>(points1D[i]);
138  if constexpr (apply_weights) B2[i] = B2[i] * weights1D[i];
139  }
140  return B2;
141  }
142 
153  template <bool apply_weights, int q>
154  static constexpr auto calculate_G2()
155  {
156  constexpr auto points1D = GaussLegendreNodes<q>();
157  [[maybe_unused]] constexpr auto weights1D = GaussLegendreWeights<q>();
158  tensor<double, q, p + 1> G2{};
159  for (int i = 0; i < q; i++) {
160  G2[i] = GaussLobattoInterpolationDerivative<p + 1>(points1D[i]);
161  if constexpr (apply_weights) G2[i] = G2[i] * weights1D[i];
162  }
163  return G2;
164  }
165 
166  SMITH_HOST_DEVICE static constexpr tensor<double, ndof, dim> shape_functions(tensor<double, dim> xi)
167  {
168  tensor<double, ndof, dim> N{};
169 
170  tensor<double, p> f[3] = {GaussLegendreInterpolation<p>(xi[0]), GaussLegendreInterpolation<p>(xi[1]),
171  GaussLegendreInterpolation<p>(xi[2])};
172 
173  tensor<double, p + 1> g[3] = {GaussLobattoInterpolation<p + 1>(xi[0]), GaussLobattoInterpolation<p + 1>(xi[1]),
174  GaussLobattoInterpolation<p + 1>(xi[2])};
175 
176  int count = 0;
177 
178  // do all the x-facing nodes first
179  for (int k = 0; k < p + 1; k++) {
180  for (int j = 0; j < p + 1; j++) {
181  for (int i = 0; i < p; i++) {
182  N[count++] = {f[0][i] * g[1][j] * g[2][k], 0.0, 0.0};
183  }
184  }
185  }
186 
187  // then all the y-facing nodes
188  for (int k = 0; k < p + 1; k++) {
189  for (int j = 0; j < p; j++) {
190  for (int i = 0; i < p + 1; i++) {
191  N[count++] = {0.0, g[0][i] * f[1][j] * g[2][k], 0.0};
192  }
193  }
194  }
195 
196  // then, finally, all the z-facing nodes
197  for (int k = 0; k < p; k++) {
198  for (int j = 0; j < p + 1; j++) {
199  for (int i = 0; i < p + 1; i++) {
200  N[count++] = {0.0, 0.0, g[0][i] * g[1][j] * f[2][k]};
201  }
202  }
203  }
204 
205  return N;
206  }
207 
208  SMITH_HOST_DEVICE static constexpr tensor<double, ndof, dim> shape_function_curl(tensor<double, dim> xi)
209  {
210  tensor<double, ndof, dim> curl{};
211 
212  tensor<double, p> f[3] = {GaussLegendreInterpolation<p>(xi[0]), GaussLegendreInterpolation<p>(xi[1]),
213  GaussLegendreInterpolation<p>(xi[2])};
214 
215  tensor<double, p + 1> g[3] = {GaussLobattoInterpolation<p + 1>(xi[0]), GaussLobattoInterpolation<p + 1>(xi[1]),
216  GaussLobattoInterpolation<p + 1>(xi[2])};
217 
218  tensor<double, p + 1> dg[3] = {GaussLobattoInterpolationDerivative<p + 1>(xi[0]),
219  GaussLobattoInterpolationDerivative<p + 1>(xi[1]),
220  GaussLobattoInterpolationDerivative<p + 1>(xi[2])};
221 
222  int count = 0;
223 
224  // do all the x-facing nodes first
225  for (int k = 0; k < p + 1; k++) {
226  for (int j = 0; j < p + 1; j++) {
227  for (int i = 0; i < p; i++) {
228  // curl({f(x) g(y) g(z), 0.0, 0.0}) == {0.0, f(x) g(y) g'(z), -f(x) g'(y) g(z)};
229  curl[count++] = {0.0, f[0][i] * g[1][j] * dg[2][k], -f[0][i] * dg[1][j] * g[2][k]};
230  }
231  }
232  }
233 
234  // then all the y-facing nodes
235  for (int k = 0; k < p + 1; k++) {
236  for (int j = 0; j < p; j++) {
237  for (int i = 0; i < p + 1; i++) {
238  // curl({0.0, g(x) f(y) g(z), 0.0}) == {-g(x) f(y) g'(z), 0.0, g'(x) f(y) g(z)};
239  curl[count++] = {-g[0][i] * f[1][j] * dg[2][k], 0.0, dg[0][i] * f[1][j] * g[2][k]};
240  }
241  }
242  }
243 
244  // then, finally, all the z-facing nodes
245  for (int k = 0; k < p; k++) {
246  for (int j = 0; j < p + 1; j++) {
247  for (int i = 0; i < p + 1; i++) {
248  // curl({0.0, 0.0, g(x) g(y) f(z)}) == {g(x) g'(y) f(z), -g'(x) g(y) f(z), 0.0};
249  curl[count++] = {g[0][i] * dg[1][j] * f[2][k], -dg[0][i] * g[1][j] * f[2][k], 0.0};
250  }
251  }
252  }
253 
254  return curl;
255  }
256 
257  template <typename in_t, int q>
258  static auto batch_apply_shape_fn(int j, tensor<in_t, q * q * q> input, const TensorProductQuadratureRule<q>&)
259  {
260  constexpr bool apply_weights = false;
261  constexpr tensor<double, q, p> B1 = calculate_B1<apply_weights, q>();
262  constexpr tensor<double, q, p + 1> B2 = calculate_B2<apply_weights, q>();
263  constexpr tensor<double, q, p + 1> G2 = calculate_G2<apply_weights, q>();
264 
265  // figure out which node and which direction
266  // correspond to the dof index "j"
267  int jx, jy, jz;
268  int dir = j / (p * (p + 1) * (p + 1));
269  int remainder = j % (p * (p + 1) * (p + 1));
270  switch (dir) {
271  case 0: // x-direction
272  jx = remainder % p;
273  jy = (remainder % (p * (p + 1))) / p;
274  jz = remainder / (p * (p + 1));
275  break;
276 
277  case 1: // y-direction
278  jx = remainder % (p + 1);
279  jy = (remainder % (p * (p + 1))) / (p + 1);
280  jz = remainder / (p * (p + 1));
281  break;
282 
283  case 2: // z-direction
284  jx = remainder % (p + 1);
285  jy = (remainder % ((p + 1) * (p + 1))) / (p + 1);
286  jz = remainder / ((p + 1) * (p + 1));
287  break;
288  }
289 
290  using source_t = decltype(dot(get<0>(get<0>(in_t{})), vec3{}) + dot(get<1>(get<0>(in_t{})), vec3{}));
291  using flux_t = decltype(dot(get<0>(get<1>(in_t{})), vec3{}) + dot(get<1>(get<1>(in_t{})), vec3{}));
292 
293  tensor<tuple<source_t, flux_t>, q * q * q> output;
294 
295  for (int qz = 0; qz < q; qz++) {
296  for (int qy = 0; qy < q; qy++) {
297  for (int qx = 0; qx < q; qx++) {
298  tensor<double, 3> phi_j{};
299  tensor<double, 3> curl_phi_j{};
300 
301  switch (dir) {
302  case 0:
303  phi_j[0] = B1(qx, jx) * B2(qy, jy) * B2(qz, jz);
304  curl_phi_j[1] = B1(qx, jx) * B2(qy, jy) * G2(qz, jz);
305  curl_phi_j[2] = -B1(qx, jx) * G2(qy, jy) * B2(qz, jz);
306  break;
307 
308  case 1:
309  curl_phi_j[0] = -B2(qx, jx) * B1(qy, jy) * G2(qz, jz);
310  phi_j[1] = B2(qx, jx) * B1(qy, jy) * B2(qz, jz);
311  curl_phi_j[2] = G2(qx, jx) * B1(qy, jy) * B2(qz, jz);
312  break;
313 
314  case 2:
315  curl_phi_j[0] = B2(qx, jx) * G2(qy, jy) * B1(qz, jz);
316  curl_phi_j[1] = -G2(qx, jx) * B2(qy, jy) * B1(qz, jz);
317  phi_j[2] = B2(qx, jx) * B2(qy, jy) * B1(qz, jz);
318  break;
319  }
320 
321  int Q = (qz * q + qy) * q + qx;
322  const auto& d00 = get<0>(get<0>(input(Q)));
323  const auto& d01 = get<1>(get<0>(input(Q)));
324  const auto& d10 = get<0>(get<1>(input(Q)));
325  const auto& d11 = get<1>(get<1>(input(Q)));
326 
327  output[Q] = {dot(d00, phi_j) + dot(d01, curl_phi_j), dot(d10, phi_j) + dot(d11, curl_phi_j)};
328  }
329  }
330  }
331 
332  return output;
333  }
334 
335  template <int q>
336  SMITH_HOST_DEVICE static auto interpolate(const dof_type& element_values, const TensorProductQuadratureRule<q>&)
337  {
338  constexpr bool apply_weights = false;
339  constexpr tensor<double, q, p> B1 = calculate_B1<apply_weights, q>();
340  constexpr tensor<double, q, p + 1> B2 = calculate_B2<apply_weights, q>();
341  constexpr tensor<double, q, p + 1> G2 = calculate_G2<apply_weights, q>();
342 
343  tensor<tensor<double, q, q, q>, 3> value{};
344  tensor<tensor<double, q, q, q>, 3> curl{};
345 
346  // to clarify which contractions correspond to which spatial dimensions
347  constexpr int x = 2, y = 1, z = 0;
348 
349  // clang-format off
350  {
351  auto A1 = contract< x, 1 >(element_values.x, B1);
352  auto A20 = contract< y, 1 >(A1, B2);
353  auto A21 = contract< y, 1 >(A1, G2);
354  value[0] = contract< z, 1 >(A20, B2);
355  curl[1] += contract< z, 1 >(A20, G2);
356  curl[2] -= contract< z, 1 >(A21, B2);
357  }
358 
359  {
360  auto A1 = contract< y, 1 >(element_values.y, B1);
361  auto A20 = contract< z, 1 >(A1, B2);
362  auto A21 = contract< z, 1 >(A1, G2);
363  value[1] = contract< x, 1 >(A20, B2);
364  curl[2] += contract< x, 1 >(A20, G2);
365  curl[0] -= contract< x, 1 >(A21, B2);
366  }
367 
368  {
369  auto A1 = contract< z, 1 >(element_values.z, B1);
370  auto A20 = contract< x, 1 >(A1, B2);
371  auto A21 = contract< x, 1 >(A1, G2);
372  value[2] = contract< y, 1 >(A20, B2);
373  curl[0] += contract< y, 1 >(A20, G2);
374  curl[1] -= contract< y, 1 >(A21, B2);
375  }
376  // clang-format on
377 
378  tensor<tuple<tensor<double, 3>, tensor<double, 3>>, q * q * q> qf_inputs;
379 
380  int count = 0;
381  for (int qz = 0; qz < q; qz++) {
382  for (int qy = 0; qy < q; qy++) {
383  for (int qx = 0; qx < q; qx++) {
384  for (int i = 0; i < 3; i++) {
385  get<VALUE>(qf_inputs(count))[i] = value[i](qz, qy, qx);
386  get<CURL>(qf_inputs(count))[i] = curl[i](qz, qy, qx);
387  }
388  count++;
389  }
390  }
391  }
392 
393  return qf_inputs;
394  }
395 
396  template <typename source_type, typename flux_type, int q>
397  SMITH_HOST_DEVICE static void integrate(const tensor<tuple<source_type, flux_type>, q * q * q>& qf_output,
398  const TensorProductQuadratureRule<q>&, dof_type* element_residual,
399  [[maybe_unused]] int step = 1)
400  {
401  constexpr bool apply_weights = true;
402  constexpr tensor<double, q, p> B1 = calculate_B1<apply_weights, q>();
403  constexpr tensor<double, q, p + 1> B2 = calculate_B2<apply_weights, q>();
404  constexpr tensor<double, q, p + 1> G2 = calculate_G2<apply_weights, q>();
405 
406  tensor<double, 3, q, q, q> source{};
407  tensor<double, 3, q, q, q> flux{};
408 
409  for (int qz = 0; qz < q; qz++) {
410  for (int qy = 0; qy < q; qy++) {
411  for (int qx = 0; qx < q; qx++) {
412  int k = (qz * q + qy) * q + qx;
413  tensor<double, 3> s{get<SOURCE>(qf_output[k])};
414  tensor<double, 3> f{get<FLUX>(qf_output[k])};
415  for (int i = 0; i < 3; i++) {
416  source(i, qz, qy, qx) = s[i];
417  flux(i, qz, qy, qx) = f[i];
418  }
419  }
420  }
421  }
422 
423  // to clarify which contractions correspond to which spatial dimensions
424  constexpr int x = 2, y = 1, z = 0;
425 
426  // clang-format off
427  // r(0, dz, dy, dx) = s(0, qz, qy, qx) * B2(qz, dz) * B2(qy, dy) * B1(qx, dx)
428  // + f(1, qz, qy, qx) * G2(qz, dz) * B2(qy, dy) * B1(qx, dx)
429  // - f(2, qz, qy, qx) * B2(qz, dz) * G2(qy, dy) * B1(qx, dx);
430  {
431  auto A20 = contract< z, 0 >(source[0], B2) + contract< z, 0 >(flux[1], G2);
432  auto A21 = contract< z, 0 >(flux[2], B2);
433  auto A1 = contract< y, 0 >(A20, B2) - contract< y, 0 >(A21, G2);
434  element_residual[0].x += contract< x, 0 >(A1, B1);
435  }
436 
437  // r(1, dz, dy, dx) = s(1, qz, qy, qx) * B2(qz, dz) * B1(qy, dy) * B2(qx, dx)
438  // - f(0, qz, qy, qx) * G2(qz, dz) * B1(qy, dy) * B2(qx, dx)
439  // + f(2, qz, qy, qx) * B2(qz, dz) * B1(qy, dy) * G2(qx, dx);
440  {
441  auto A20 = contract< x, 0 >(source[1], B2) + contract< x, 0 >(flux[2], G2);
442  auto A21 = contract< x, 0 >(flux[0], B2);
443  auto A1 = contract< z, 0 >(A20, B2) - contract< z, 0 >(A21, G2);
444  element_residual[0].y += contract< y, 0 >(A1, B1);
445  }
446 
447  // r(2, dz, dy, dx) = s(2, qz, qy, qx) * B1(qz, dz) * B2(qy, dy) * B2(qx, dx)
448  // + f(0, qz, qy, qx) * B1(qz, dz) * G2(qy, dy) * B2(qx, dx)
449  // - f(1, qz, qy, qx) * B1(qz, dz) * B2(qy, dy) * G2(qx, dx);
450  {
451  auto A20 = contract< y, 0 >(source[2], B2) + contract< y, 0 >(flux[0], G2);
452  auto A21 = contract< y, 0 >(flux[1], B2);
453  auto A1 = contract< x, 0 >(A20, B2) - contract< x, 0 >(A21, G2);
454  element_residual[0].z += contract< z, 0 >(A1, B1);
455  }
456  // clang-format on
457  }
458 
460 
461 #if 0
462 
463  template <int q>
464  static SMITH_DEVICE auto interpolate(const dof_type& element_values, const tensor<double, dim, dim>& J,
465  const TensorProductQuadratureRule<q>& rule, cache_type<q>& cache)
466  {
467  int tidx = threadIdx.x % q;
468  int tidy = (threadIdx.x % (q * q)) / q;
469  int tidz = threadIdx.x / (q * q);
470 
471  static constexpr auto points1D = GaussLegendreNodes<q>();
472 
473  static constexpr auto B1_ = [=]() {
474  tensor<double, q, p> B1{};
475  for (int i = 0; i < q; i++) {
476  B1[i] = GaussLegendreInterpolation<p>(points1D[i]);
477  }
478  return B1;
479  }();
480 
481  static constexpr auto B2_ = [=]() {
482  tensor<double, q, n> B2{};
483  for (int i = 0; i < q; i++) {
484  B2[i] = GaussLobattoInterpolation<n>(points1D[i]);
485  }
486  return B2;
487  }();
488 
489  static constexpr auto G2_ = [=]() {
490  tensor<double, q, n> G2{};
491  for (int i = 0; i < q; i++) {
492  G2[i] = GaussLobattoInterpolationDerivative<n>(points1D[i]);
493  }
494  return G2;
495  }();
496 
497  __shared__ tensor<double, q, p> B1;
498  __shared__ tensor<double, q, n> B2;
499  __shared__ tensor<double, q, n> G2;
500  for (int entry = threadIdx.x; entry < n * q; entry += q * q) {
501  int i = entry % n;
502  int j = entry / n;
503  if (i < p) {
504  B1(j, i) = B1_(j, i);
505  }
506  B2(j, i) = B2_(j, i);
507  G2(j, i) = G2_(j, i);
508  }
509  __syncthreads();
510 
511  tensor<double, dim> value{};
512  tensor<double, dim> curl{};
513 
517  for (int dz = tidz; dz < p + 1; dz += q) {
518  for (int dy = tidy; dy < p + 1; dy += q) {
519  for (int qx = tidx; qx < q; qx += q) {
520  double sum = 0.0;
521  for (int dx = 0; dx < p; dx++) {
522  sum += B1(qx, dx) * element_values.x(dz, dy, dx);
523  }
524  cache.A1(dz, dy, qx) = sum;
525  }
526  }
527  }
528  __syncthreads();
529 
530  for (int dz = tidz; dz < p + 1; dz += q) {
531  for (int qy = tidy; qy < q; qy += q) {
532  for (int qx = tidx; qx < q; qx += q) {
533  double sum[2]{};
534  for (int dy = 0; dy < (p + 1); dy++) {
535  sum[0] += B2(qy, dy) * cache.A1(dz, dy, qx);
536  sum[1] += G2(qy, dy) * cache.A1(dz, dy, qx);
537  }
538  cache.A2(0, dz, qy, qx) = sum[0];
539  cache.A2(1, dz, qy, qx) = sum[1];
540  }
541  }
542  }
543  __syncthreads();
544 
545  for (int qz = tidz; qz < q; qz += q) {
546  for (int qy = tidy; qy < q; qy += q) {
547  for (int qx = tidx; qx < q; qx += q) {
548  double sum[3]{};
549  for (int dz = 0; dz < (p + 1); dz++) {
550  sum[0] += B2(qz, dz) * cache.A2(0, dz, qy, qx);
551  sum[1] += G2(qz, dz) * cache.A2(0, dz, qy, qx);
552  sum[2] += B2(qz, dz) * cache.A2(1, dz, qy, qx);
553  }
554  value[0] += sum[0];
555  curl[1] += sum[1];
556  curl[2] -= sum[2];
557  }
558  }
559  }
560  __syncthreads();
561 
565  for (int dz = tidz; dz < p + 1; dz += q) {
566  for (int dx = tidx; dx < p + 1; dx += q) {
567  for (int qy = tidy; qy < q; qy += q) {
568  double sum = 0.0;
569  for (int dy = 0; dy < p; dy++) {
570  sum += B1(qy, dy) * element_values.y(dz, dy, dx);
571  }
572  cache.A1(dz, dx, qy) = sum;
573  }
574  }
575  }
576  __syncthreads();
577 
578  for (int dz = tidz; dz < p + 1; dz += q) {
579  for (int qy = tidy; qy < q; qy += q) {
580  for (int qx = tidx; qx < q; qx += q) {
581  double sum[2]{};
582  for (int dx = 0; dx < (p + 1); dx++) {
583  sum[0] += B2(qx, dx) * cache.A1(dz, dx, qy);
584  sum[1] += G2(qx, dx) * cache.A1(dz, dx, qy);
585  }
586  cache.A2(0, dz, qy, qx) = sum[0];
587  cache.A2(1, dz, qy, qx) = sum[1];
588  }
589  }
590  }
591  __syncthreads();
592 
593  for (int qz = tidz; qz < q; qz += q) {
594  for (int qy = tidy; qy < q; qy += q) {
595  for (int qx = tidx; qx < q; qx += q) {
596  double sum[3]{};
597  for (int dz = 0; dz < (p + 1); dz++) {
598  sum[0] += B2(qz, dz) * cache.A2(0, dz, qy, qx);
599  sum[1] += G2(qz, dz) * cache.A2(0, dz, qy, qx);
600  sum[2] += B2(qz, dz) * cache.A2(1, dz, qy, qx);
601  }
602  value[1] += sum[0];
603  curl[2] += sum[2];
604  curl[0] -= sum[1];
605  }
606  }
607  }
608  __syncthreads();
609 
613  for (int dy = tidy; dy < p + 1; dy += q) {
614  for (int dx = tidx; dx < p + 1; dx += q) {
615  for (int qz = tidz; qz < q; qz += q) {
616  double sum = 0.0;
617  for (int k = 0; k < p; k++) {
618  sum += B1(qz, k) * element_values.z(k, dy, dx);
619  }
620  cache.A1(dy, dx, qz) = sum;
621  }
622  }
623  }
624  __syncthreads();
625 
626  for (int dy = tidy; dy < p + 1; dy += q) {
627  for (int qz = tidz; qz < q; qz += q) {
628  for (int qx = tidx; qx < q; qx += q) {
629  double sum[2]{};
630  for (int dx = 0; dx < (p + 1); dx++) {
631  sum[0] += B2(qx, dx) * cache.A1(dy, dx, qz);
632  sum[1] += G2(qx, dx) * cache.A1(dy, dx, qz);
633  }
634  cache.A2(0, dy, qz, qx) = sum[0];
635  cache.A2(1, dy, qz, qx) = sum[1];
636  }
637  }
638  }
639  __syncthreads();
640 
641  for (int qz = tidz; qz < q; qz += q) {
642  for (int qy = tidy; qy < q; qy += q) {
643  for (int qx = tidx; qx < q; qx += q) {
644  double sum[3]{};
645  for (int dy = 0; dy < (p + 1); dy++) {
646  sum[0] += B2(qy, dy) * cache.A2(0, dy, qz, qx);
647  sum[1] += G2(qy, dy) * cache.A2(0, dy, qz, qx);
648  sum[2] += B2(qy, dy) * cache.A2(1, dy, qz, qx);
649  }
650  value[2] += sum[0];
651  curl[0] += sum[1];
652  curl[1] -= sum[2];
653  }
654  }
655  }
656 
657  // apply covariant Piola transformation to go
658  // from parent element -> physical element
659  value = linear_solve(transpose(J), value);
660  curl = dot(J, curl) / det(J);
661 
662  return tuple{value, curl};
663  }
664 
665  template <typename T1, typename T2, int q>
666  static SMITH_DEVICE void integrate(tuple<T1, T2>& response, const tensor<double, dim, dim>& J,
667  const TensorProductQuadratureRule<q>& rule, cache_type<q>& cache,
668  dof_type& residual)
669  {
670  int tidx = threadIdx.x % q;
671  int tidy = (threadIdx.x % (q * q)) / q;
672  int tidz = threadIdx.x / (q * q);
673 
674  static constexpr auto points1D = GaussLegendreNodes<q>();
675  static constexpr auto weights1D = GaussLegendreWeights<q>();
676 
677  static constexpr auto B1_ = [=]() {
678  tensor<double, q, p> B1{};
679  for (int i = 0; i < q; i++) {
680  B1[i] = GaussLegendreInterpolation<p>(points1D[i]);
681  }
682  return B1;
683  }();
684 
685  static constexpr auto B2_ = [=]() {
686  tensor<double, q, n> B2{};
687  for (int i = 0; i < q; i++) {
688  B2[i] = GaussLobattoInterpolation<n>(points1D[i]);
689  }
690  return B2;
691  }();
692 
693  static constexpr auto G2_ = [=]() {
694  tensor<double, q, n> G2{};
695  for (int i = 0; i < q; i++) {
696  G2[i] = GaussLobattoInterpolationDerivative<n>(points1D[i]);
697  }
698  return G2;
699  }();
700 
701  __shared__ tensor<double, q, p> B1;
702  __shared__ tensor<double, q, n> B2;
703  __shared__ tensor<double, q, n> G2;
704  for (int entry = threadIdx.x; entry < n * q; entry += q * q) {
705  int i = entry % n;
706  int j = entry / n;
707  if (i < p) {
708  B1(j, i) = B1_(j, i);
709  }
710  B2(j, i) = B2_(j, i);
711  G2(j, i) = G2_(j, i);
712  }
713  __syncthreads();
714 
715  // transform the source and flux terms from values on the physical element,
716  // to values on the parent element. Also, the source/flux values are scaled
717  // according to the weight of their quadrature point, so that when we add them
718  // together, it approximates the integral over the element
719  auto detJ = det(J);
720  auto dv = detJ * weights1D[tidx] * weights1D[tidy] * weights1D[tidz];
721 
722  auto source = linear_solve(J, get<0>(response)) * dv;
723  auto flux = dot(get<1>(response), J) * (dv / detJ);
724 
728  for (int qz = tidz; qz < q; qz += q) {
729  for (int qy = tidy; qy < q; qy += q) {
730  for (int dx = tidx; dx < n; dx += q) {
731  cache.A2(0, dx, qy, qz) = 0.0;
732  cache.A2(1, dx, qy, qz) = 0.0;
733  }
734  }
735  }
736  __syncthreads();
737 
738  for (int offset = 0; offset < n; offset++) {
739  int dz = (tidz + offset) % n;
740  auto sum = B2(tidz, dz) * source[0] + G2(tidz, dz) * flux[1];
741  atomicAdd(&cache.A2(0, dz, tidy, tidx), sum);
742  atomicAdd(&cache.A2(1, dz, tidy, tidx), -B2(tidz, dz) * flux[2]);
743  }
744  __syncthreads();
745 
746  for (int dz = tidz; dz < p + 1; dz += q) {
747  for (int dy = tidy; dy < p + 1; dy += q) {
748  for (int qx = tidx; qx < q; qx += q) {
749  double sum = 0.0;
750  for (int qy = 0; qy < q; qy++) {
751  sum += B2(qy, dy) * cache.A2(0, dz, qy, qx);
752  sum += G2(qy, dy) * cache.A2(1, dz, qy, qx);
753  }
754  cache.A1(dz, dy, qx) = sum;
755  }
756  }
757  }
758  __syncthreads();
759 
760  for (int dz = tidz; dz < p + 1; dz += q) {
761  for (int dy = tidy; dy < p + 1; dy += q) {
762  for (int dx = tidx; dx < p; dx += q) {
763  double sum = 0.0;
764  for (int qx = 0; qx < q; qx++) {
765  sum += B1(qx, dx) * cache.A1(dz, dy, qx);
766  }
767  residual.x(dz, dy, dx) += sum;
768  }
769  }
770  }
771  __syncthreads();
772 
776  for (int qz = tidz; qz < q; qz += q) {
777  for (int qy = tidy; qy < q; qy += q) {
778  for (int dx = tidx; dx < n; dx += q) {
779  cache.A2(0, dx, qy, qz) = 0.0;
780  cache.A2(1, dx, qy, qz) = 0.0;
781  }
782  }
783  }
784  __syncthreads();
785 
786  for (int offset = 0; offset < n; offset++) {
787  int dz = (tidz + offset) % n;
788  auto sum = B2(tidz, dz) * source[1] - G2(tidz, dz) * flux[0];
789  atomicAdd(&cache.A2(0, dz, tidy, tidx), sum);
790  atomicAdd(&cache.A2(1, dz, tidy, tidx), B2(tidz, dz) * flux[2]);
791  }
792  __syncthreads();
793 
794  for (int dz = tidz; dz < p + 1; dz += q) {
795  for (int dx = tidx; dx < p + 1; dx += q) {
796  for (int qy = tidy; qy < q; qy += q) {
797  double sum = 0.0;
798  for (int qx = 0; qx < q; qx++) {
799  sum += B2(qx, dx) * cache.A2(0, dz, qy, qx);
800  sum += G2(qx, dx) * cache.A2(1, dz, qy, qx);
801  }
802  cache.A1(dz, dx, qy) = sum;
803  }
804  }
805  }
806  __syncthreads();
807 
808  for (int dz = tidz; dz < p + 1; dz += q) {
809  for (int dy = tidy; dy < p; dy += q) {
810  for (int dx = tidx; dx < p + 1; dx += q) {
811  double sum = 0.0;
812  for (int qy = 0; qy < q; qy++) {
813  sum += B1(qy, dy) * cache.A1(dz, dx, qy);
814  }
815  residual.y(dz, dy, dx) += sum;
816  }
817  }
818  }
819  __syncthreads();
820 
824  for (int qz = tidz; qz < q; qz += q) {
825  for (int qy = tidy; qy < q; qy += q) {
826  for (int dx = tidx; dx < n; dx += q) {
827  cache.A2(0, dx, qy, qz) = 0.0;
828  cache.A2(1, dx, qy, qz) = 0.0;
829  }
830  }
831  }
832  __syncthreads();
833 
834  for (int offset = 0; offset < n; offset++) {
835  int dx = (tidx + offset) % n;
836  auto sum = B2(tidx, dx) * source[2] - G2(tidx, dx) * flux[1];
837  atomicAdd(&cache.A2(0, dx, tidz, tidy), sum);
838  atomicAdd(&cache.A2(1, dx, tidz, tidy), B2(tidx, dx) * flux[0]);
839  }
840  __syncthreads();
841 
842  for (int dy = tidy; dy < p + 1; dy += q) {
843  for (int dx = tidx; dx < p + 1; dx += q) {
844  for (int qz = tidz; qz < q; qz += q) {
845  double sum = 0.0;
846  for (int qy = 0; qy < q; qy++) {
847  sum += B2(qy, dy) * cache.A2(0, dx, qz, qy);
848  sum += G2(qy, dy) * cache.A2(1, dx, qz, qy);
849  }
850  cache.A1(dy, dx, qz) = sum;
851  }
852  }
853  }
854  __syncthreads();
855 
856  for (int dz = tidz; dz < p; dz += q) {
857  for (int dy = tidy; dy < p + 1; dy += q) {
858  for (int dx = tidx; dx < p + 1; dx += q) {
859  double sum = 0.0;
860  for (int qz = 0; qz < q; qz++) {
861  sum += B1(qz, dz) * cache.A1(dy, dx, qz);
862  }
863  residual.z(dz, dy, dx) += sum;
864  }
865  }
866  }
867  }
868 
869 #endif
870 };
#define SMITH_HOST_DEVICE
Macro that evaluates to __host__ __device__ when compiling with nvcc or amdclang and does nothing on ...
Definition: accelerator.hpp:37
#define SMITH_DEVICE
Macro that evaluates to __device__ when compiling with nvcc or amdclang and does nothing on a host co...
Definition: accelerator.hpp:45
tuple(T...) -> tuple< T... >
Class template argument deduction rule for tuples.
constexpr SMITH_HOST_DEVICE auto type(const tuple< T... > &values)
a function intended to be used for extracting the ith type from a tuple.
Definition: tuple.hpp:376
tensor< double, 3 > vec3
statically sized vector of 3 doubles
Definition: tensor.hpp:114
constexpr SMITH_HOST_DEVICE auto transpose(const isotropic_tensor< T, m, m > &I)
return the transpose of an isotropic tensor
tensor(const T(&data)[n1]) -> tensor< T, n1 >
class template argument deduction guide for type tensor.
constexpr SMITH_HOST_DEVICE auto det(const isotropic_tensor< T, m, m > &I)
compute the determinant of an isotropic tensor
constexpr SMITH_HOST_DEVICE auto dot(const isotropic_tensor< S, m, m > &I, const tensor< T, m, n... > &A)
dot product between an isotropic and (nonisotropic) tensor
constexpr SMITH_HOST_DEVICE auto linear_solve(const LuFactorization< S, n > &lu_factors, const tensor< T, n, m... > &b)
Definition: tensor.hpp:1626