
Commit fc1e5fb

Block Jacobi (#69)
1 parent c573e9a commit fc1e5fb
6 files changed: +142 -9 lines changed

examples/degeus_mechanics/mech.i

Lines changed: 6 additions & 2 deletions
@@ -7,7 +7,7 @@
   ymax = ${fparse 2*pi}
   zmax = ${fparse 2*pi}
   mesh_mode = DUMMY
-  device_names = cuda
+  device_names = cpu
 []

 [TensorComputes]
@@ -72,7 +72,11 @@
     constitutive_model = hyper_elasticity
     stress = stress
    applied_macroscopic_strain = applied_strain
-    hutchinson_steps = 6
+    # hutchinson_steps = 64
+    # jacobi_min_rel = 1e-2
+    # jacobi_inv_cap = 1e4
+    block_jacobi = true
+    block_jacobi_damp=1e-1
     verbose = true
   []
 []
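
The example above switches from the stochastic diagonal preconditioner to the new block-Jacobi option. As a hedged summary in my own notation (not taken verbatim from the commit), the block-Jacobi preconditioner regularizes and inverts the local fourth-order tangent at every grid point, applies the resulting compliance to the residual, and subtracts the cell average to remove the k = 0 mode:

\[
\mathbb{K}_\varepsilon(\mathbf{x}) = \mathbb{K}(\mathbf{x}) + \varepsilon\,\mathbb{I},
\qquad
\varepsilon = \texttt{block\_jacobi\_damp}\cdot\overline{\lvert\operatorname{diag}\mathbb{K}\rvert},
\qquad
M^{-1} r(\mathbf{x}) = \mathbb{K}_\varepsilon^{-1}(\mathbf{x}) : r(\mathbf{x}) - \bigl\langle \mathbb{K}_\varepsilon^{-1} : r \bigr\rangle,
\]

where the angle brackets denote the grid average. Note that block_jacobi_damp = 1e-1 in this example is a fairly strong regularization; the parameter defaults to 1e-8.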

include/tensor_buffers/NEML2TensorBuffer.h

Lines changed: 1 addition & 1 deletion
@@ -21,7 +21,7 @@ class NEML2TensorBuffer : public TensorBuffer<T>

   NEML2TensorBuffer(const InputParameters & parameters);

-  virtual void init();
+  virtual void init() override;
   virtual void makeCPUCopy() override;

   using TensorBuffer<T>::_u;

include/tensor_computes/FFTMechanics.h

Lines changed: 10 additions & 0 deletions
@@ -67,6 +67,16 @@ class FFTMechanics : public TensorOperator<>
   /// steps for diagonal estimation
   const unsigned int _hutchinson_steps;

+  /// use block-Jacobi (local compliance) preconditioner
+  const bool _block_jacobi;
+  /// relative damping for block-Jacobi inversion
+  const Real _block_jacobi_damp;
+
+  /// minimum relative floor for Jacobi diagonal (relative to median)
+  const Real _jacobi_min_rel;
+  /// cap on inverse diagonal scaling (0 disables)
+  const Real _jacobi_inv_cap;
+
   /// add diagnostic output for iterations
   const bool _verbose;
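
For the two diagonal-scaling parameters, a hedged reading of the floor and cap applied in FFTMechanics.C below (notation mine): the Hutchinson estimate d_i of the operator diagonal is floored relative to the mean \(\bar d\) of the estimates above \(10^{-9}\), and the resulting inverse scaling is optionally capped,

\[
\tilde d_i = \max\bigl(d_i,\ \texttt{jacobi\_min\_rel}\cdot\bar d\bigr),
\qquad
(M^{-1})_{ii} = \min\bigl(1/\tilde d_i,\ \texttt{jacobi\_inv\_cap}\bigr),
\]

with the cap applied only when jacobi_inv_cap > 0.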

include/utils/SwiftUtils.h

Lines changed: 10 additions & 0 deletions
@@ -51,6 +51,16 @@ torch::Tensor dot24(const torch::Tensor & A2, const torch::Tensor & B4);
 torch::Tensor dot42(const torch::Tensor & A4, const torch::Tensor & B2);
 torch::Tensor dyad22(const torch::Tensor & A2, const torch::Tensor & B2);

+// Invert local 4th-order blocks (...., i, j, k, l) -> (...., i, j, k, l)
+// Treats each grid point's (i,j)-(k,l) matrix as a (d*d x d*d) block and inverts it in batch.
+// Returns a tensor with the same shape as the input, containing per-point block inverses.
+torch::Tensor invertLocalBlocks(const torch::Tensor & K4);
+
+// Damped inversion of local 4th-order blocks with optional pinv fallback.
+// damp_rel scales an identity added to each (d*d x d*d) block by damp_rel * mean(|diag|).
+// If inversion fails, falls back to pseudo-inverse.
+torch::Tensor invertLocalBlocksDamped(const torch::Tensor & K4, double damp_rel = 1e-8);
+
 torch::Tensor
 estimateJacobiPreconditioner(const std::function<torch::Tensor(const torch::Tensor &)> & A,
                              const torch::Tensor & template_vec,
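
A minimal standalone usage sketch of the new helper (not part of the commit; it assumes SwiftUtils.h and libtorch are available on the include path): random, diagonally dominant local blocks are inverted in batch, and the per-point product with the original blocks is compared against the identity.

// Hedged usage sketch, not part of the commit: verify the batched block
// inverse by checking K * K^{-1} ~ I at every grid point.
#include "SwiftUtils.h" // assumed include path for the MooseTensor utilities
#include <torch/torch.h>
#include <iostream>

int main()
{
  const int64_t nx = 4, ny = 4, d = 3;

  // Random 4th-order blocks, shifted by the rank-four identity so every
  // flattened (d*d x d*d) block is diagonally dominant and invertible.
  auto I4 = torch::eye(d * d).reshape({1, 1, d, d, d, d});
  auto K4 = torch::rand({nx, ny, d, d, d, d}) + 10.0 * I4;

  auto invK4 = MooseTensor::invertLocalBlocksDamped(K4, /*damp_rel=*/1e-8);

  // Flatten to (batch, d*d, d*d) and measure the worst-case deviation of
  // K * K^{-1} from the identity across all grid points.
  auto A = K4.reshape({-1, d * d, d * d});
  auto B = invK4.reshape({-1, d * d, d * d});
  auto err = (torch::bmm(A, B) - torch::eye(d * d)).abs().max();
  std::cout << "max |K K^-1 - I| = " << err.item<double>() << std::endl;
  return 0;
}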

src/tensor_computes/FFTMechanics.C

Lines changed: 67 additions & 6 deletions
@@ -14,6 +14,7 @@
 #include <ATen/core/TensorBody.h>
 #include <ATen/ops/unsqueeze_ops.h>
 #include <util/Optional.h>
+#include <numeric>

 registerMooseObject("SwiftApp", FFTMechanics);

@@ -45,6 +46,18 @@ FFTMechanics::validParams()
       0,
       "Steps for diagonal estimation with Hutchinson's method used in "
       "Jacobi preconditioning. 0 skips preconditioning.");
+  params.addParam<bool>("block_jacobi",
+                        false,
+                        "Use block-Jacobi (local compliance) preconditioner instead of diagonal.");
+  params.addParam<Real>("block_jacobi_damp",
+                        1e-8,
+                        "Relative damping added to local tangent blocks before inversion.");
+  params.addParam<Real>(
+      "jacobi_min_rel",
+      1e-3,
+      "Minimum relative floor for stochastic Jacobi diagonal (relative to median).");
+  params.addParam<Real>(
+      "jacobi_inv_cap", 0.0, "Cap on inverse diagonal scaling; 0 disables clamping.");
   params.addParam<bool>("verbose", false, "Print non-linear residuals.");
   return params;
 }
@@ -74,6 +87,10 @@ FFTMechanics::FFTMechanics(const InputParameters & parameters)
         ? &getInputBuffer("applied_macroscopic_strain")
         : nullptr),
     _hutchinson_steps(getParam<unsigned int>("hutchinson_steps")),
+    _block_jacobi(getParam<bool>("block_jacobi")),
+    _block_jacobi_damp(getParam<Real>("block_jacobi_damp")),
+    _jacobi_min_rel(getParam<Real>("jacobi_min_rel")),
+    _jacobi_inv_cap(getParam<Real>("jacobi_inv_cap")),
     _verbose(getParam<bool>("verbose"))
 {
   // Build projection tensor once
@@ -133,14 +150,58 @@ FFTMechanics::computeBuffer()
   // iterate as long as the iterative update does not vanish
   for (const auto iiter : make_range(_nl_max_its))
   {
-    const auto diag_precond =
-        _hutchinson_steps ? torch::abs(estimateJacobiPreconditioner(G_K_dF, b, _hutchinson_steps))
-                          : torch::ones_like(b);
-    const auto M_inv = [&diag_precond](const torch::Tensor & x) { return x / diag_precond; };
+    c10::optional<torch::Tensor> invK4;
+    const auto diag_estimate =
+        (!_block_jacobi && _hutchinson_steps)
+            ? torch::abs(estimateJacobiPreconditioner(G_K_dF, b, _hutchinson_steps))
+            : torch::ones_like(b);
+    auto inv_diag = torch::ones_like(b);
+    if (!_block_jacobi && _hutchinson_steps)
+    {
+      // Robust floor relative to a nonzero scale to avoid huge inverse scaling
+      auto mask = diag_estimate > 1e-9; // ignore near-zero estimates
+      auto selected = at::masked_select(diag_estimate, mask);
+      auto scale_t = selected.numel() > 0 ? selected.mean() : diag_estimate.mean();
+      auto floor_t = scale_t * _jacobi_min_rel;
+      auto diag_precond = torch::clamp(diag_estimate, floor_t, c10::nullopt);
+      inv_diag = 1.0 / diag_precond;
+      if (_jacobi_inv_cap > 0.0)
+      {
+        inv_diag = torch::clamp(inv_diag, 0.0, _jacobi_inv_cap);
+      }
+    }
+    const auto M_inv = [&](const torch::Tensor & x)
+    {
+      if (_block_jacobi)
+      {
+        if (!invK4.has_value())
+          invK4 = MooseTensor::invertLocalBlocksDamped(_tK4, _block_jacobi_damp);
+        auto x2 = x.reshape(_r2_shape);
+        auto z2raw = MooseTensor::trans2(MooseTensor::ddot42(*invK4, MooseTensor::trans2(x2)));
+        // Enforce zero-mean (remove k=0 mode) without FFT cost
+        std::vector<int64_t> reduce_dims(z2raw.dim() - 2);
+        std::iota(reduce_dims.begin(), reduce_dims.end(), 0);
+        auto mean2 = z2raw.mean(reduce_dims, /*keepdim=*/true);
+        auto z2 = z2raw - mean2;
+        return z2.reshape(-1);
+      }
+      else
+      {
+        auto x2 = x.reshape(_r2_shape);
+        auto z2raw = x2 * inv_diag.reshape(_r2_shape);
+        // Enforce zero-mean (remove k=0 mode) without FFT cost
+        std::vector<int64_t> reduce_dims(z2raw.dim() - 2);
+        std::iota(reduce_dims.begin(), reduce_dims.end(), 0);
+        auto mean2 = z2raw.mean(reduce_dims, /*keepdim=*/true);
+        auto z2 = z2raw - mean2;
+        return z2.reshape(-1);
+      }
+    };

     const auto [dFm_new, iterations, lnorm] =
-        _hutchinson_steps ? conjugateGradientSolve(G_K_dF, b, dFm, _l_tol, _l_max_its, M_inv)
-                          : conjugateGradientSolve(G_K_dF, b, dFm, _l_tol, _l_max_its);
+        (_block_jacobi || _hutchinson_steps)
+            ? conjugateGradientSolve(G_K_dF, b, dFm, _l_tol, _l_max_its, M_inv)
+            : conjugateGradientSolve(G_K_dF, b, dFm, _l_tol, _l_max_its);
     dFm = dFm_new;

     // update DOFs (array -> tens.grid)
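
For readers who want to see where M_inv enters the Krylov solve, here is a generic preconditioned conjugate gradient sketch in libtorch. It is an illustration only; the repository's conjugateGradientSolve may differ in its stopping criterion and in returning the iteration count and final norm.

// Generic preconditioned CG sketch (illustration only, not the repository's
// conjugateGradientSolve). A is the linear operator, M_inv the preconditioner.
#include <torch/torch.h>
#include <functional>

torch::Tensor
pcgSketch(const std::function<torch::Tensor(const torch::Tensor &)> & A,
          const torch::Tensor & b,
          torch::Tensor x,
          const std::function<torch::Tensor(const torch::Tensor &)> & M_inv,
          double tol,
          unsigned int max_its)
{
  auto r = b - A(x); // initial residual
  auto z = M_inv(r); // preconditioned residual
  auto p = z;        // initial search direction
  auto rz = torch::dot(r, z);
  for (unsigned int it = 0; it < max_its && r.norm().item<double>() > tol; ++it)
  {
    const auto Ap = A(p);
    const auto alpha = rz / torch::dot(p, Ap);
    x = x + alpha * p;  // solution update
    r = r - alpha * Ap; // residual update
    z = M_inv(r);       // apply diagonal or block-Jacobi preconditioner
    const auto rz_new = torch::dot(r, z);
    p = z + (rz_new / rz) * p;
    rz = rz_new;
  }
  return x;
}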

src/utils/SwiftUtils.C

Lines changed: 48 additions & 0 deletions
@@ -10,6 +10,9 @@
 #include "SwiftApp.h"
 #include "MooseUtils.h"
 #include "Moose.h"
+// for batched linear algebra
+#include <ATen/ops/linalg_inv.h>
+#include <ATen/ops/linalg_pinv.h>

 namespace MooseTensor
 {
@@ -218,6 +221,51 @@ printBuffer(const torch::Tensor & t, const unsigned int & precision, const unsig
     std::cout << std::endl;
   }
 }
+// Invert local 4th-order blocks (batch of d*d x d*d matrices)
+torch::Tensor
+invertLocalBlocks(const torch::Tensor & K4)
+{
+  // Expect shape: [..., d, d, d, d]
+  const auto d = K4.size(-1);
+  // Flatten last 4 dims to (d*d, d*d) with batch = prod(leading dims)
+  auto K2 = K4.reshape({-1, d * d, d * d});
+  // Batched inverse
+  auto K2_inv = at::linalg_inv(K2);
+  // Restore original shape
+  return K2_inv.reshape(K4.sizes());
+}
+
+torch::Tensor
+invertLocalBlocksDamped(const torch::Tensor & K4, double damp_rel)
+{
+  // Flatten to (batch, n, n)
+  const auto d = K4.size(-1);
+  const auto n = d * d;
+  auto K2 = K4.reshape({-1, n, n});
+
+  // Build batched identity
+  auto I = torch::eye(n, K4.options()).unsqueeze(0).expand({K2.size(0), n, n});
+
+  // Scale damping by mean absolute diagonal across batch
+  auto diag = K2.diagonal(0, -2, -1);
+  double scale = diag.abs().mean().template item<double>();
+  if (!(scale > 0.0))
+    scale = 1.0;
+  const double eps = damp_rel > 0.0 ? damp_rel * scale : 0.0;
+
+  auto K2_reg = eps > 0.0 ? (K2 + eps * I) : K2;
+
+  try
+  {
+    auto K2_inv = at::linalg_inv(K2_reg);
+    return K2_inv.reshape(K4.sizes());
+  }
+  catch (const c10::Error &)
+  {
+    auto K2_pinv = at::linalg_pinv(K2_reg);
+    return K2_pinv.reshape(K4.sizes());
+  }
+}

 // Diagonal estimation with Hutchinson's method
 torch::Tensor
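
For reference, the Hutchinson diagonal estimator that gives estimateJacobiPreconditioner its name has the standard form (stated here from the literature, not read off the implementation, which may use a different probe distribution):

\[
\operatorname{diag}(A)_i \;\approx\; \frac{1}{N}\sum_{k=1}^{N} v^{(k)}_i\,\bigl(A\,v^{(k)}\bigr)_i,
\qquad v^{(k)}_i \sim \text{i.i.d. Rademacher } (\pm 1),
\]

which is unbiased because \(\mathbb{E}[v_i v_j] = \delta_{ij}\); the number of probes N plays the role of the hutchinson_steps parameter.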
