Smith  0.1
Smith is an implicit thermal structural mechanics simulation code.
hexahedron_L2.inl
Go to the documentation of this file.
1 // Copyright (c) Lawrence Livermore National Security, LLC and
2 // other Smith Project Developers. See the top-level LICENSE file for
3 // details.
4 //
5 // SPDX-License-Identifier: (BSD-3-Clause)
6 
13 // this specialization defines shape functions (and their gradients) that
14 // interpolate at Gauss-Lobatto nodes for the appropriate polynomial order
15 //
16 // note: mfem assumes the parent element domain is [0,1]x[0,1]x[0,1]
17 // for additional information on the finite_element concept requirements, see finite_element.hpp
19 template <int p, int c>
20 struct finite_element<mfem::Geometry::CUBE, L2<p, c> > {
21  static constexpr auto geometry = mfem::Geometry::CUBE;
22  static constexpr auto family = Family::L2;
23  static constexpr int components = c;
24  static constexpr int dim = 3;
25  static constexpr int n = (p + 1);
26  static constexpr int ndof = (p + 1) * (p + 1) * (p + 1);
27  static constexpr int order = p;
28 
29  static constexpr int VALUE = 0, GRADIENT = 1;
30  static constexpr int SOURCE = 0, FLUX = 1;
31 
32  // TODO: remove this
33  using residual_type =
34  typename std::conditional<components == 1, tensor<double, ndof>, tensor<double, ndof, components> >::type;
35 
36  using dof_type = tensor<double, c, p + 1, p + 1, p + 1>;
37 
38  using value_type = typename std::conditional<components == 1, double, tensor<double, components> >::type;
39  using derivative_type =
40  typename std::conditional<components == 1, tensor<double, dim>, tensor<double, components, dim> >::type;
41  using qf_input_type = tuple<value_type, derivative_type>;
42 
43  SMITH_HOST_DEVICE static constexpr tensor<double, ndof> shape_functions(tensor<double, dim> xi)
44  {
45  auto N_xi = GaussLobattoInterpolation<p + 1>(xi[0]);
46  auto N_eta = GaussLobattoInterpolation<p + 1>(xi[1]);
47  auto N_zeta = GaussLobattoInterpolation<p + 1>(xi[2]);
48 
49  int count = 0;
50 
51  tensor<double, ndof> N{};
52  for (int k = 0; k < p + 1; k++) {
53  for (int j = 0; j < p + 1; j++) {
54  for (int i = 0; i < p + 1; i++) {
55  N[count++] = N_xi[i] * N_eta[j] * N_zeta[k];
56  }
57  }
58  }
59  return N;
60  }
61 
62  SMITH_HOST_DEVICE static constexpr tensor<double, ndof, dim> shape_function_gradients(tensor<double, dim> xi)
63  {
64  auto N_xi = GaussLobattoInterpolation<p + 1>(xi[0]);
65  auto N_eta = GaussLobattoInterpolation<p + 1>(xi[1]);
66  auto N_zeta = GaussLobattoInterpolation<p + 1>(xi[2]);
67  auto dN_xi = GaussLobattoInterpolationDerivative<p + 1>(xi[0]);
68  auto dN_eta = GaussLobattoInterpolationDerivative<p + 1>(xi[1]);
69  auto dN_zeta = GaussLobattoInterpolationDerivative<p + 1>(xi[2]);
70 
71  int count = 0;
72 
73  // clang-format off
74  tensor<double, ndof, dim> dN{};
75  for (int k = 0; k < p + 1; k++) {
76  for (int j = 0; j < p + 1; j++) {
77  for (int i = 0; i < p + 1; i++) {
78  dN[count++] = {
79  dN_xi[i] * N_eta[j] * N_zeta[k],
80  N_xi[i] * dN_eta[j] * N_zeta[k],
81  N_xi[i] * N_eta[j] * dN_zeta[k]
82  };
83  }
84  }
85  }
86  return dN;
87  // clang-format on
88  }
89 
100  template <bool apply_weights, int q>
101  static constexpr auto calculate_B()
102  {
103  constexpr auto points1D = GaussLegendreNodes<q, mfem::Geometry::SEGMENT>();
104  [[maybe_unused]] constexpr auto weights1D = GaussLegendreWeights<q, mfem::Geometry::SEGMENT>();
105  tensor<double, q, n> B{};
106  for (int i = 0; i < q; i++) {
107  B[i] = GaussLobattoInterpolation<n>(points1D[i]);
108  if constexpr (apply_weights) B[i] = B[i] * weights1D[i];
109  }
110  return B;
111  }
112 
123  template <bool apply_weights, int q>
124  static constexpr auto calculate_G()
125  {
126  constexpr auto points1D = GaussLegendreNodes<q, mfem::Geometry::SEGMENT>();
127  [[maybe_unused]] constexpr auto weights1D = GaussLegendreWeights<q, mfem::Geometry::SEGMENT>();
128  tensor<double, q, n> G{};
129  for (int i = 0; i < q; i++) {
130  G[i] = GaussLobattoInterpolationDerivative<n>(points1D[i]);
131  if constexpr (apply_weights) G[i] = G[i] * weights1D[i];
132  }
133  return G;
134  }
135 
136  template <typename in_t, int q>
137  static auto batch_apply_shape_fn(int j, tensor<in_t, q * q * q> input, const TensorProductQuadratureRule<q>&)
138  {
139  static constexpr bool apply_weights = false;
140  static constexpr auto B = calculate_B<apply_weights, q>();
141  static constexpr auto G = calculate_G<apply_weights, q>();
142 
143  int jx = j % n;
144  int jy = (j % (n * n)) / n;
145  int jz = j / (n * n);
146 
147  using source_t = decltype(get<0>(get<0>(in_t{})) + dot(get<1>(get<0>(in_t{})), tensor<double, dim>{}));
148  using flux_t = decltype(get<0>(get<1>(in_t{})) + dot(get<1>(get<1>(in_t{})), tensor<double, dim>{}));
149 
150  tensor<tuple<source_t, flux_t>, q * q * q> output;
151 
152  for (int qz = 0; qz < q; qz++) {
153  for (int qy = 0; qy < q; qy++) {
154  for (int qx = 0; qx < q; qx++) {
155  double phi_j = B(qx, jx) * B(qy, jy) * B(qz, jz);
156  tensor<double, dim> dphi_j_dxi = {G(qx, jx) * B(qy, jy) * B(qz, jz), B(qx, jx) * G(qy, jy) * B(qz, jz),
157  B(qx, jx) * B(qy, jy) * G(qz, jz)};
158 
159  int Q = (qz * q + qy) * q + qx;
160  const auto& d00 = get<0>(get<0>(input(Q)));
161  const auto& d01 = get<1>(get<0>(input(Q)));
162  const auto& d10 = get<0>(get<1>(input(Q)));
163  const auto& d11 = get<1>(get<1>(input(Q)));
164 
165  output[Q] = {d00 * phi_j + dot(d01, dphi_j_dxi), d10 * phi_j + dot(d11, dphi_j_dxi)};
166  }
167  }
168  }
169 
170  return output;
171  }
172 
173  template <int q>
174  SMITH_HOST_DEVICE static auto interpolate(const dof_type& X, const TensorProductQuadratureRule<q>&)
175  {
176  // we want to compute the following:
177  //
178  // X_q(u, v, w) := (B(u, i) * B(v, j) * B(w, k)) * X_e(i, j, k)
179  //
180  // where
181  // X_q(u, v, w) are the quadrature-point values at position {u, v, w},
182  // B(u, i) is the i^{th} 1D interpolation/differentiation (shape) function,
183  // evaluated at the u^{th} 1D quadrature point, and
184  // X_e(i, j, k) are the values at node {i, j, k} to be interpolated
185  //
186  // this algorithm carries out the above calculation in 3 steps:
187  //
188  // A1(dz, dy, qx) := B(qx, dx) * X_e(dz, dy, dx)
189  // A2(dz, qy, qx) := B(qy, dy) * A1(dz, dy, qx)
190  // X_q(qz, qy, qx) := B(qz, dz) * A2(dz, qy, qx)
191  static constexpr bool apply_weights = false;
192  static constexpr auto B = calculate_B<apply_weights, q>();
193  static constexpr auto G = calculate_G<apply_weights, q>();
194 
195  tensor<double, c, q, q, q> value{};
196  tensor<double, c, dim, q, q, q> gradient{};
197 
198  for (int i = 0; i < c; i++) {
199  auto A10 = contract<2, 1>(X[i], B);
200  auto A11 = contract<2, 1>(X[i], G);
201 
202  auto A20 = contract<1, 1>(A10, B);
203  auto A21 = contract<1, 1>(A11, B);
204  auto A22 = contract<1, 1>(A10, G);
205 
206  value(i) = contract<0, 1>(A20, B);
207  gradient(i, 0) = contract<0, 1>(A21, B);
208  gradient(i, 1) = contract<0, 1>(A22, B);
209  gradient(i, 2) = contract<0, 1>(A20, G);
210  }
211 
212  // transpose the quadrature data into a flat tensor of tuples
213  union {
214  tensor<qf_input_type, q * q * q> one_dimensional;
215  tensor<tuple<tensor<double, c>, tensor<double, c, dim> >, q, q, q> three_dimensional;
216  } output;
217 
218  for (int qz = 0; qz < q; qz++) {
219  for (int qy = 0; qy < q; qy++) {
220  for (int qx = 0; qx < q; qx++) {
221  for (int i = 0; i < c; i++) {
222  get<VALUE>(output.three_dimensional(qz, qy, qx))[i] = value(i, qz, qy, qx);
223  for (int j = 0; j < dim; j++) {
224  get<GRADIENT>(output.three_dimensional(qz, qy, qx))[i][j] = gradient(i, j, qz, qy, qx);
225  }
226  }
227  }
228  }
229  }
230 
231  return output.one_dimensional;
232  }
233 
234  template <typename source_type, typename flux_type, int q>
235  SMITH_HOST_DEVICE static void integrate(const tensor<tuple<source_type, flux_type>, q * q * q>& qf_output,
236  const TensorProductQuadratureRule<q>&, dof_type* element_residual,
237  int step = 1)
238  {
239  if constexpr (is_zero<source_type>{} && is_zero<flux_type>{}) {
240  return;
241  }
242 
243  constexpr int ntrial = std::max(size(source_type{}), size(flux_type{}) / dim) / c;
244 
245  using s_buffer_type = std::conditional_t<is_zero<source_type>{}, zero, tensor<double, q, q, q> >;
246  using f_buffer_type = std::conditional_t<is_zero<flux_type>{}, zero, tensor<double, dim, q, q, q> >;
247 
248  static constexpr bool apply_weights = true;
249  static constexpr auto B = calculate_B<apply_weights, q>();
250  static constexpr auto G = calculate_G<apply_weights, q>();
251 
252  for (int j = 0; j < ntrial; j++) {
253  for (int i = 0; i < c; i++) {
254  s_buffer_type source;
255  f_buffer_type flux;
256 
257  for (int qz = 0; qz < q; qz++) {
258  for (int qy = 0; qy < q; qy++) {
259  for (int qx = 0; qx < q; qx++) {
260  int Q = (qz * q + qy) * q + qx;
261  if constexpr (!is_zero<source_type>{}) {
262  source(qz, qy, qx) = reinterpret_cast<const double*>(&get<SOURCE>(qf_output[Q]))[i * ntrial + j];
263  }
264  if constexpr (!is_zero<flux_type>{}) {
265  for (int k = 0; k < dim; k++) {
266  flux(k, qz, qy, qx) =
267  reinterpret_cast<const double*>(&get<FLUX>(qf_output[Q]))[(i * dim + k) * ntrial + j];
268  }
269  }
270  }
271  }
272  }
273 
274  auto A20 = contract<2, 0>(source, B) + contract<2, 0>(flux(0), G);
275  auto A21 = contract<2, 0>(flux(1), B);
276  auto A22 = contract<2, 0>(flux(2), B);
277 
278  auto A10 = contract<1, 0>(A20, B) + contract<1, 0>(A21, G);
279  auto A11 = contract<1, 0>(A22, B);
280 
281  element_residual[j * step](i) += contract<0, 0>(A10, B) + contract<0, 0>(A11, G);
282  }
283  }
284  }
285 
286 #if 0
287 
288  template <int q>
289  static SMITH_DEVICE auto interpolate(const dof_type& X, const tensor<double, dim, dim>& J,
290  const TensorProductQuadratureRule<q>& rule, cache_type<q>& cache)
291  {
292  // we want to compute the following:
293  //
294  // X_q(u, v, w) := (B(u, i) * B(v, j) * B(w, k)) * X_e(i, j, k)
295  //
296  // where
297  // X_q(u, v, w) are the quadrature-point values at position {u, v, w},
298  // B(u, i) is the i^{th} 1D interpolation/differentiation (shape) function,
299  // evaluated at the u^{th} 1D quadrature point, and
300  // X_e(i, j, k) are the values at node {i, j, k} to be interpolated
301  //
302  // this algorithm carries out the above calculation in 3 steps:
303  //
304  // A1(dz, dy, qx) := B(qx, dx) * X_e(dz, dy, dx)
305  // A2(dz, qy, qx) := B(qy, dy) * A1(dz, dy, qx)
306  // X_q(qz, qy, qx) := B(qz, dz) * A2(dz, qy, qx)
307 
308  int tidx = threadIdx.x % q;
309  int tidy = (threadIdx.x % (q * q)) / q;
310  int tidz = threadIdx.x / (q * q);
311 
312  static constexpr auto points1D = GaussLegendreNodes<q>();
313  static constexpr auto B_ = [=]() {
314  tensor<double, q, n> B{};
315  for (int i = 0; i < q; i++) {
316  B[i] = GaussLobattoInterpolation<n>(points1D[i]);
317  }
318  return B;
319  }();
320 
321  static constexpr auto G_ = [=]() {
322  tensor<double, q, n> G{};
323  for (int i = 0; i < q; i++) {
324  G[i] = GaussLobattoInterpolationDerivative<n>(points1D[i]);
325  }
326  return G;
327  }();
328 
329  __shared__ tensor<double, q, n> B;
330  __shared__ tensor<double, q, n> G;
331  for (int entry = threadIdx.x; entry < n * q; entry += q * q * q) {
332  int i = entry % n;
333  int j = entry / n;
334  B(j, i) = B_(j, i);
335  G(j, i) = G_(j, i);
336  }
337  __syncthreads();
338 
339  tuple<tensor<double, c>, tensor<double, c, 3> > qf_input{};
340 
341  for (int i = 0; i < c; i++) {
342  for (int dz = tidz; dz < n; dz += q) {
343  for (int dy = tidy; dy < n; dy += q) {
344  for (int qx = tidx; qx < q; qx += q) {
345  double sum[2]{};
346  for (int dx = 0; dx < n; dx++) {
347  sum[0] += B(qx, dx) * X(i, dz, dy, dx);
348  sum[1] += G(qx, dx) * X(i, dz, dy, dx);
349  }
350  cache.A1(0, dz, dy, qx) = sum[0];
351  cache.A1(1, dz, dy, qx) = sum[1];
352  }
353  }
354  }
355  __syncthreads();
356 
357  for (int dz = tidz; dz < n; dz += q) {
358  for (int qy = tidy; qy < q; qy += q) {
359  for (int qx = tidx; qx < q; qx += q) {
360  double sum[3]{};
361  for (int dy = 0; dy < n; dy++) {
362  sum[0] += B(qy, dy) * cache.A1(0, dz, dy, qx);
363  sum[1] += B(qy, dy) * cache.A1(1, dz, dy, qx);
364  sum[2] += G(qy, dy) * cache.A1(0, dz, dy, qx);
365  }
366  cache.A2(0, dz, qy, qx) = sum[0];
367  cache.A2(1, dz, qy, qx) = sum[1];
368  cache.A2(2, dz, qy, qx) = sum[2];
369  }
370  }
371  }
372  __syncthreads();
373 
374  for (int qz = tidz; qz < q; qz += q) {
375  for (int qy = tidy; qy < q; qy += q) {
376  for (int qx = tidx; qx < q; qx += q) {
377  for (int dz = 0; dz < n; dz++) {
378  get<0>(qf_input)[i] += B(qz, dz) * cache.A2(0, dz, qy, qx);
379  get<1>(qf_input)[i][0] += B(qz, dz) * cache.A2(1, dz, qy, qx);
380  get<1>(qf_input)[i][1] += B(qz, dz) * cache.A2(2, dz, qy, qx);
381  get<1>(qf_input)[i][2] += G(qz, dz) * cache.A2(0, dz, qy, qx);
382  }
383  }
384  }
385  }
386  }
387 
388  get<1>(qf_input) = dot(get<1>(qf_input), inv(J));
389 
390  return qf_input;
391  }
392 
393  template <typename T1, typename T2, int q>
394  static SMITH_DEVICE void integrate(tuple<T1, T2>& response, const tensor<double, dim, dim>& J,
395  const TensorProductQuadratureRule<q>& rule, cache_type<q>& cache,
396  dof_type& residual)
397  {
398  int tidx = threadIdx.x % q;
399  int tidy = (threadIdx.x % (q * q)) / q;
400  int tidz = threadIdx.x / (q * q);
401 
402  static constexpr auto points1D = GaussLegendreNodes<q>();
403  static constexpr auto weights1D = GaussLegendreWeights<q>();
404  static constexpr auto B_ = [=]() {
405  tensor<double, q, n> B{};
406  for (int i = 0; i < q; i++) {
407  B[i] = GaussLobattoInterpolation<n>(points1D[i]);
408  }
409  return B;
410  }();
411 
412  static constexpr auto G_ = [=]() {
413  tensor<double, q, n> G{};
414  for (int i = 0; i < q; i++) {
415  G[i] = GaussLobattoInterpolationDerivative<n>(points1D[i]);
416  }
417  return G;
418  }();
419 
420  __shared__ tensor<double, q, n> B;
421  __shared__ tensor<double, q, n> G;
422  for (int entry = threadIdx.x; entry < n * q; entry += q * q * q) {
423  int i = entry % n;
424  int j = entry / n;
425  B(j, i) = B_(j, i);
426  G(j, i) = G_(j, i);
427  }
428  __syncthreads();
429 
430  auto dv = det(J) * weights1D[tidx] * weights1D[tidy] * weights1D[tidz];
431 
432  get<0>(response) = get<0>(response) * dv;
433  get<1>(response) = dot(get<1>(response), inv(transpose(J))) * dv;
434 
435  for (int i = 0; i < c; i++) {
436  // this first contraction is performed a little differently, since `response` is not
437  // in shared memory, so each thread can only access its own values
438  for (int qz = tidz; qz < q; qz += q) {
439  for (int qy = tidy; qy < q; qy += q) {
440  for (int dx = tidx; dx < n; dx += q) {
441  cache.A2(0, dx, qy, qz) = 0.0;
442  cache.A2(1, dx, qy, qz) = 0.0;
443  cache.A2(2, dx, qy, qz) = 0.0;
444  }
445  }
446  }
447  __syncthreads();
448 
449  for (int offset = 0; offset < n; offset++) {
450  int dx = (tidx + offset) % n;
451  auto sum = B(tidx, dx) * get<0>(response)(i) + G(tidx, dx) * get<1>(response)(i, 0);
452  atomicAdd(&cache.A2(0, dx, tidz, tidy), sum);
453  atomicAdd(&cache.A2(1, dx, tidz, tidy), B(tidx, dx) * get<1>(response)(i, 1));
454  atomicAdd(&cache.A2(2, dx, tidz, tidy), B(tidx, dx) * get<1>(response)(i, 2));
455  }
456  __syncthreads();
457 
458  for (int qz = tidz; qz < q; qz += q) {
459  for (int dy = tidy; dy < n; dy += q) {
460  for (int dx = tidx; dx < n; dx += q) {
461  double sum[2]{};
462  for (int qy = 0; qy < q; qy++) {
463  sum[0] += B(qy, dy) * cache.A2(0, dx, qz, qy);
464  sum[0] += G(qy, dy) * cache.A2(1, dx, qz, qy);
465  sum[1] += B(qy, dy) * cache.A2(2, dx, qz, qy);
466  }
467  cache.A1(0, qz, dy, dx) = sum[0];
468  cache.A1(1, qz, dy, dx) = sum[1];
469  }
470  }
471  }
472  __syncthreads();
473 
474  for (int dz = tidz; dz < n; dz += q) {
475  for (int dy = tidy; dy < n; dy += q) {
476  for (int dx = tidx; dx < n; dx += q) {
477  double sum = 0.0;
478  for (int qz = 0; qz < q; qz++) {
479  sum += B(qz, dz) * cache.A1(0, qz, dy, dx);
480  sum += G(qz, dz) * cache.A1(1, qz, dy, dx);
481  }
482  residual(i, dz, dy, dx) += sum;
483  }
484  }
485  }
486  }
487  }
488 
489 #endif
490 };
#define SMITH_HOST_DEVICE
Macro that evaluates to __host__ __device__ when compiling with nvcc or amdclang and does nothing on ...
Definition: accelerator.hpp:37
#define SMITH_DEVICE
Macro that evaluates to __device__ when compiling with nvcc or amdclang and does nothing on a host co...
Definition: accelerator.hpp:45
constexpr SMITH_HOST_DEVICE auto inv(const isotropic_tensor< T, m, m > &I)
return the inverse of an isotropic tensor
constexpr SMITH_HOST_DEVICE auto type(const tuple< T... > &values)
a function intended to be used for extracting the ith type from a tuple.
Definition: tuple.hpp:376
SMITH_HOST_DEVICE auto max(dual< gradient_type > a, double b)
Implementation of max for dual numbers.
Definition: dual.hpp:229
constexpr SMITH_HOST_DEVICE int size(const tensor< T, n... > &)
returns the total number of stored values in a tensor
Definition: tensor.hpp:1939
constexpr SMITH_HOST_DEVICE auto transpose(const isotropic_tensor< T, m, m > &I)
return the transpose of an isotropic tensor
tensor(const T(&data)[n1]) -> tensor< T, n1 >
class template argument deduction guide for type tensor.
constexpr SMITH_HOST_DEVICE auto det(const isotropic_tensor< T, m, m > &I)
compute the determinant of an isotropic tensor
constexpr SMITH_HOST_DEVICE auto dot(const isotropic_tensor< S, m, m > &I, const tensor< T, m, n... > &A)
dot product between an isotropic and (nonisotropic) tensor