diff --git a/examples/degeus_mechanics/mech.i b/examples/degeus_mechanics/mech.i
index 94bef6fb..8d86e401 100644
--- a/examples/degeus_mechanics/mech.i
+++ b/examples/degeus_mechanics/mech.i
@@ -7,13 +7,20 @@
   ymax = ${fparse 2*pi}
   zmax = ${fparse 2*pi}
   mesh_mode = DUMMY
+  device_names = cpu
 []
 
 [TensorComputes]
   [Initialize]
     [Finit]
-      type = RankTwoIdentity
+      type = RankTwoDiagonalTensor
       buffer = F
+      value = 1
+    []
+    [Stressinit]
+      type = RankTwoDiagonalTensor
+      buffer = stress
+      value = 0
     []
 
     [phase]
@@ -58,12 +65,19 @@
         F = F
         K = K
         mu = mu
-        l_tol = 1e-2
-        nl_rel_tol = 2e-2
+        l_tol = 1e-3
+        l_max_its = 200
+        nl_rel_tol = 2e-3
         nl_abs_tol = 2e-2
         constitutive_model = hyper_elasticity
         stress = stress
         applied_macroscopic_strain = applied_strain
+        # hutchinson_steps = 64
+        # jacobi_min_rel = 1e-2
+        # jacobi_inv_cap = 1e4
+        block_jacobi = true
+        block_jacobi_damp=1e-1
+        verbose = true
       []
     []
   []
@@ -88,7 +102,7 @@
   # deformation tensor is just forwarded Fnew -> F
   forward_buffer = F
   forward_buffer_new = Fnew
-  substeps = 10
+  substeps = 1
 []
 
 [TensorOutputs]
@@ -105,3 +119,8 @@
   num_steps = 100
   dt = 0.01
 []
+
+[Outputs]
+  perf_graph = true
+  console = true
+[]
diff --git a/include/tensor_buffers/NEML2TensorBuffer.h b/include/tensor_buffers/NEML2TensorBuffer.h
index e50ecae2..2a4ea075 100644
--- a/include/tensor_buffers/NEML2TensorBuffer.h
+++ b/include/tensor_buffers/NEML2TensorBuffer.h
@@ -21,7 +21,7 @@ class NEML2TensorBuffer : public TensorBuffer<T>
 
   NEML2TensorBuffer(const InputParameters & parameters);
 
-  virtual void init();
+  virtual void init() override;
   virtual void makeCPUCopy() override;
 
   using TensorBuffer<T>::_u;
diff --git a/include/tensor_computes/FFTMechanics.h b/include/tensor_computes/FFTMechanics.h
index 7c93b4c9..efde660c 100644
--- a/include/tensor_computes/FFTMechanics.h
+++ b/include/tensor_computes/FFTMechanics.h
@@ -64,6 +64,20 @@ class FFTMechanics : public TensorOperator<>
   /// applied macroscopic (affine) strain
   const torch::Tensor * const _applied_macroscopic_strain;
 
+  /// steps for diagonal estimation
+  const unsigned int _hutchinson_steps;
+
+  /// use block-Jacobi (local compliance) preconditioner
+  const bool _block_jacobi;
+  /// relative damping for block-Jacobi inversion
+  const Real _block_jacobi_damp;
+
+  /// minimum relative floor for Jacobi diagonal (relative to median)
+  const Real _jacobi_min_rel;
+  /// cap on inverse diagonal scaling (0 disables)
+  const Real _jacobi_inv_cap;
+
+  /// add diagnostic output for iterations
   const bool _verbose;
 
   using TensorOperatorBase::_dim;
diff --git a/include/tensor_computes/RankTwoIdentity.h b/include/tensor_computes/RankTwoDiagonalTensor.h
similarity index 81%
rename from include/tensor_computes/RankTwoIdentity.h
rename to include/tensor_computes/RankTwoDiagonalTensor.h
index 6503013c..15e547cb 100644
--- a/include/tensor_computes/RankTwoIdentity.h
+++ b/include/tensor_computes/RankTwoDiagonalTensor.h
@@ -13,15 +13,18 @@
 /**
  * Identity rank two tensor Tensor
  */
-class RankTwoIdentity : public TensorOperator<>
+class RankTwoDiagonalTensor : public TensorOperator<>
 {
 public:
   static InputParameters validParams();
 
-  RankTwoIdentity(const InputParameters & parameters);
+  RankTwoDiagonalTensor(const InputParameters & parameters);
 
   virtual void computeBuffer() override;
 
   /// mesh dimension
   const unsigned int & _dim;
+
+  /// value on the diagonal
+  const Real & _value;
 };
diff --git a/include/utils/SwiftUtils.h b/include/utils/SwiftUtils.h
index ef85819e..71f19930 100644
--- a/include/utils/SwiftUtils.h
+++ b/include/utils/SwiftUtils.h
@@ -51,75 +51,29 @@ torch::Tensor dot24(const torch::Tensor & A2, const torch::Tensor & B4);
 torch::Tensor dot42(const torch::Tensor & A4, const torch::Tensor & B2);
 torch::Tensor dyad22(const torch::Tensor & A2, const torch::Tensor & B2);
 
-template <typename T1, typename T2>
-std::tuple<torch::Tensor, unsigned int, double>
-conjugateGradientSolve(T1 A, torch::Tensor b, torch::Tensor x0, double tol, int64_t maxiter, T2 M)
-{
-  // initialize solution guess
-  torch::Tensor x = x0.defined() ? x0.clone() : torch::zeros_like(b);
-
-  // norm of b (for relative tolerance)
-  const double b_norm = torch::norm(b).cpu().template item<double>();
-  if (b_norm == 0.0)
-    // solution is zero if b is zero
-    return {x, 0u, 0.0};
-
-  // default max iterations
-  if (!maxiter)
-    maxiter = b.numel();
-
-  // initial residual
-  torch::Tensor r = b - A(x);
-
-  // Apply preconditioner (or identity)
-  torch::Tensor z = M(r); // z = M^{-1} r
-
-  // initial search direction p
-  torch::Tensor p = z.clone();
-
-  // dot product (r, z)
-  double rz_old = torch::sum(r * z).cpu().template item<double>();
-
-  // CG iteration
-  double res_norm;
-  for (const auto k : libMesh::make_range(maxiter))
-  {
-    // compute matrix-vector product
-    const auto Ap = A(p);
+// Invert local 4th-order blocks (...., i, j, k, l) -> (...., i, j, k, l)
+// Treats each grid point's (i,j)-(k,l) matrix as a (d*d x d*d) block and inverts it in batch.
+// Returns a tensor with the same shape as the input, containing per-point block inverses.
+torch::Tensor invertLocalBlocks(const torch::Tensor & K4);
 
-    // step size alpha
-    double alpha = rz_old / torch::sum(p * Ap).cpu().template item<double>();
+// Damped inversion of local 4th-order blocks with optional pinv fallback.
+// damp_rel scales an identity added to each (d*d x d*d) block by damp_rel * mean(|diag|).
+// If inversion fails, falls back to pseudo-inverse.
+torch::Tensor invertLocalBlocksDamped(const torch::Tensor & K4, double damp_rel = 1e-8);
 
-    // update solution
-    x = x + alpha * p;
+torch::Tensor
+estimateJacobiPreconditioner(const std::function<torch::Tensor(const torch::Tensor &)> & A,
+                             const torch::Tensor & template_vec,
+                             int num_samples = 6);
 
-    // update residual
-    r = r - alpha * Ap;
-    res_norm = torch::norm(r).cpu().template item<double>(); // ||r||
-
-    // std::cout << res_norm << '\n';
-
-    // Converged to desired tolerance
-    if (res_norm <= tol * b_norm)
-      return {x, k + 1, res_norm};
-
-    // apply preconditioner to new residual
-    z = M(r);
-    const auto rz_new = torch::sum(r * z).cpu().template item<double>();
-
-    // update scalar beta
-    double beta = rz_new / rz_old;
-
-    // update search direction
-    p = z + beta * p;
-
-    // prepare for next iteration
-    rz_old = rz_new;
-  }
+std::tuple<torch::Tensor, unsigned int, double>
 
-  // Reached max iterations without full convergence
-  return {x, maxiter, res_norm};
-}
+conjugateGradientSolve(const std::function<torch::Tensor(const torch::Tensor &)> & A,
+                       torch::Tensor b,
+                       torch::Tensor x0,
+                       double tol,
+                       int64_t maxiter,
+                       const std::function<torch::Tensor(const torch::Tensor &)> & M);
 
 template <typename T>
 std::tuple<torch::Tensor, unsigned int, double>
diff --git a/src/tensor_computes/FFTMechanics.C b/src/tensor_computes/FFTMechanics.C
index ced49666..84ccae52 100644
--- a/src/tensor_computes/FFTMechanics.C
+++ b/src/tensor_computes/FFTMechanics.C
@@ -14,6 +14,7 @@
 #include <ATen/core/TensorBody.h>
 #include <ATen/ops/unsqueeze_ops.h>
 #include <util/Optional.h>
+#include <numeric>
 
 registerMooseObject("SwiftApp", FFTMechanics);
 
@@ -41,6 +42,22 @@ FFTMechanics::validParams()
   params.addParam<TensorInputBufferName>("applied_macroscopic_strain",
                                          "Applied macroscopic strain");
   params.addParam<TensorInputBufferName>("F", "F", "Deformation gradient tensor.");
+  params.addParam<unsigned int>("hutchinson_steps",
+                                0,
+                                "Steps for diagonal estimation with Hutchinson's method used in "
+                                "Jacobi preconditioning. 0 skips preconditioning.");
+  params.addParam<bool>("block_jacobi",
+                        false,
+                        "Use block-Jacobi (local compliance) preconditioner instead of diagonal.");
+  params.addParam<Real>("block_jacobi_damp",
+                        1e-8,
+                        "Relative damping added to local tangent blocks before inversion.");
+  params.addParam<Real>(
+      "jacobi_min_rel",
+      1e-3,
+      "Minimum relative floor for stochastic Jacobi diagonal (relative to median).");
+  params.addParam<Real>(
+      "jacobi_inv_cap", 0.0, "Cap on inverse diagonal scaling; 0 disables clamping.");
   params.addParam<bool>("verbose", false, "Print non-linear residuals.");
   return params;
 }
@@ -69,6 +86,11 @@ FFTMechanics::FFTMechanics(const InputParameters & parameters)
     _applied_macroscopic_strain(isParamValid("applied_macroscopic_strain")
                                     ? &getInputBuffer("applied_macroscopic_strain")
                                     : nullptr),
+    _hutchinson_steps(getParam<unsigned int>("hutchinson_steps")),
+    _block_jacobi(getParam<bool>("block_jacobi")),
+    _block_jacobi_damp(getParam<Real>("block_jacobi_damp")),
+    _jacobi_min_rel(getParam<Real>("jacobi_min_rel")),
+    _jacobi_inv_cap(getParam<Real>("jacobi_inv_cap")),
     _verbose(getParam<bool>("verbose"))
 {
   // Build projection tensor once
@@ -123,18 +145,67 @@ FFTMechanics::computeBuffer()
   const auto Fn =
       at::linalg_norm(_u, c10::nullopt, c10::nullopt, false, c10::nullopt).cpu().item<double>();
 
-  unsigned int iiter = 0;
   auto dFm = torch::zeros_like(b);
 
   // iterate as long as the iterative update does not vanish
-  while (true)
+  for (const auto iiter : make_range(_nl_max_its))
   {
+    c10::optional<torch::Tensor> invK4;
+    const auto diag_estimate =
+        (!_block_jacobi && _hutchinson_steps)
+            ? torch::abs(estimateJacobiPreconditioner(G_K_dF, b, _hutchinson_steps))
+            : torch::ones_like(b);
+    auto inv_diag = torch::ones_like(b);
+    if (!_block_jacobi && _hutchinson_steps)
+    {
+      // Robust floor relative to a nonzero scale to avoid huge inverse scaling
+      auto mask = diag_estimate > 1e-9; // ignore near-zero estimates
+      auto selected = at::masked_select(diag_estimate, mask);
+      auto scale_t = selected.numel() > 0 ? selected.mean() : diag_estimate.mean();
+      auto floor_t = scale_t * _jacobi_min_rel;
+      auto diag_precond = torch::clamp(diag_estimate, floor_t, c10::nullopt);
+      inv_diag = 1.0 / diag_precond;
+      if (_jacobi_inv_cap > 0.0)
+      {
+        inv_diag = torch::clamp(inv_diag, 0.0, _jacobi_inv_cap);
+      }
+    }
+    const auto M_inv = [&](const torch::Tensor & x)
+    {
+      if (_block_jacobi)
+      {
+        if (!invK4.has_value())
+          invK4 = MooseTensor::invertLocalBlocksDamped(_tK4, _block_jacobi_damp);
+        auto x2 = x.reshape(_r2_shape);
+        auto z2raw = MooseTensor::trans2(MooseTensor::ddot42(*invK4, MooseTensor::trans2(x2)));
+        // Enforce zero-mean (remove k=0 mode) without FFT cost
+        std::vector<int64_t> reduce_dims(z2raw.dim() - 2);
+        std::iota(reduce_dims.begin(), reduce_dims.end(), 0);
+        auto mean2 = z2raw.mean(reduce_dims, /*keepdim=*/true);
+        auto z2 = z2raw - mean2;
+        return z2.reshape(-1);
+      }
+      else
+      {
+        auto x2 = x.reshape(_r2_shape);
+        auto z2raw = x2 * inv_diag.reshape(_r2_shape);
+        // Enforce zero-mean (remove k=0 mode) without FFT cost
+        std::vector<int64_t> reduce_dims(z2raw.dim() - 2);
+        std::iota(reduce_dims.begin(), reduce_dims.end(), 0);
+        auto mean2 = z2raw.mean(reduce_dims, /*keepdim=*/true);
+        auto z2 = z2raw - mean2;
+        return z2.reshape(-1);
+      }
+    };
+
     const auto [dFm_new, iterations, lnorm] =
-        conjugateGradientSolve(G_K_dF, b, dFm, _l_tol, _l_max_its);
+        (_block_jacobi || _hutchinson_steps)
+            ? conjugateGradientSolve(G_K_dF, b, dFm, _l_tol, _l_max_its, M_inv)
+            : conjugateGradientSolve(G_K_dF, b, dFm, _l_tol, _l_max_its);
     dFm = dFm_new;
 
     // update DOFs (array -> tens.grid)
-    _u = _u + dFm.reshape(_r2_shape);
+    _u += dFm.reshape(_r2_shape);
 
     // new residual stress and tangent
     _constitutive_model.computeBuffer();
@@ -148,16 +219,13 @@ FFTMechanics::computeBuffer()
 
     // print nonlinear residual to the screen
     if (_verbose)
-      _console << "|R|=" << anorm << "\t|R/R0|=" << rnorm << '\n';
+      Moose::out << iiter << " |R|=" << anorm << "\t|R/R0|=" << rnorm << std::endl;
 
     // check convergence
-    if ((rnorm < _nl_rel_tol || anorm < _nl_abs_tol) && iiter > 0)
-      break;
-
-    iiter++;
-
-    if (iiter > _nl_max_its)
-      paramError("nl_max_its",
-                 "Exceeded the maximum number of nonlinear iterations without converging.");
+    if (rnorm < _nl_rel_tol || anorm < _nl_abs_tol)
+      return;
   }
+
+  paramError("nl_max_its",
+             "Exceeded the maximum number of nonlinear iterations without converging.");
 }
diff --git a/src/tensor_computes/RandomTensor.C b/src/tensor_computes/RandomTensor.C
index 46535bb2..952c3210 100644
--- a/src/tensor_computes/RandomTensor.C
+++ b/src/tensor_computes/RandomTensor.C
@@ -38,8 +38,9 @@ RandomTensor::computeBuffer()
 {
   const auto min = getParam<Real>("min");
   const auto max = getParam<Real>("max");
-  if (isParamValid("seed"))
-    torch::manual_seed(getParam<int>("seed"));
+
+  if (const auto seed = queryParam<int>("seed"))
+    torch::manual_seed(*seed);
 
   if (_generate_on_cpu)
   {
diff --git a/src/tensor_computes/RankTwoIdentity.C b/src/tensor_computes/RankTwoDiagonalTensor.C
similarity index 62%
rename from src/tensor_computes/RankTwoIdentity.C
rename to src/tensor_computes/RankTwoDiagonalTensor.C
index fe5f8bed..fbaa7484 100644
--- a/src/tensor_computes/RankTwoIdentity.C
+++ b/src/tensor_computes/RankTwoDiagonalTensor.C
@@ -6,28 +6,29 @@
 /*                        ALL RIGHTS RESERVED                         */
 /**********************************************************************/
 
-#include "RankTwoIdentity.h"
+#include "RankTwoDiagonalTensor.h"
 #include "SwiftUtils.h"
 #include "DomainAction.h"
 
-registerMooseObject("SwiftApp", RankTwoIdentity);
+registerMooseObject("SwiftApp", RankTwoDiagonalTensor);
 
 InputParameters
-RankTwoIdentity::validParams()
+RankTwoDiagonalTensor::validParams()
 {
   InputParameters params = TensorOperator<>::validParams();
   params.addClassDescription("Rank two identity tensor in real space.");
+  params.addParam<Real>("value", 1.0, "Value on the diagonal");
   return params;
 }
 
-RankTwoIdentity::RankTwoIdentity(const InputParameters & parameters)
-  : TensorOperator(parameters), _dim(_domain.getDim())
+RankTwoDiagonalTensor::RankTwoDiagonalTensor(const InputParameters & parameters)
+  : TensorOperator(parameters), _dim(_domain.getDim()), _value(getParam<Real>("value"))
 {
 }
 
 void
-RankTwoIdentity::computeBuffer()
+RankTwoDiagonalTensor::computeBuffer()
 {
   const auto full = _domain.getValueShape({_dim, _dim});
-  _u = torch::eye(_dim, MooseTensor::floatTensorOptions()).expand(full);
+  _u = (torch::eye(_dim, MooseTensor::floatTensorOptions()) * _value).expand(full);
 }
diff --git a/src/utils/SwiftUtils.C b/src/utils/SwiftUtils.C
index 9d4d9537..cf1196e7 100644
--- a/src/utils/SwiftUtils.C
+++ b/src/utils/SwiftUtils.C
@@ -10,6 +10,9 @@
 #include "SwiftApp.h"
 #include "MooseUtils.h"
 #include "Moose.h"
+// for batched linear algebra
+#include <ATen/ops/linalg_inv.h>
+#include <ATen/ops/linalg_pinv.h>
 
 namespace MooseTensor
 {
@@ -218,5 +221,146 @@ printBuffer(const torch::Tensor & t, const unsigned int & precision, const unsig
     std::cout << std::endl;
   }
 }
+// Invert local 4th-order blocks (batch of d*d x d*d matrices)
+torch::Tensor
+invertLocalBlocks(const torch::Tensor & K4)
+{
+  // Expect shape: [..., d, d, d, d]
+  const auto d = K4.size(-1);
+  // Flatten last 4 dims to (d*d, d*d) with batch = prod(leading dims)
+  auto K2 = K4.reshape({-1, d * d, d * d});
+  // Batched inverse
+  auto K2_inv = at::linalg_inv(K2);
+  // Restore original shape
+  return K2_inv.reshape(K4.sizes());
+}
+
+torch::Tensor
+invertLocalBlocksDamped(const torch::Tensor & K4, double damp_rel)
+{
+  // Flatten to (batch, n, n)
+  const auto d = K4.size(-1);
+  const auto n = d * d;
+  auto K2 = K4.reshape({-1, n, n});
+
+  // Build batched identity
+  auto I = torch::eye(n, K4.options()).unsqueeze(0).expand({K2.size(0), n, n});
+
+  // Scale damping by mean absolute diagonal across batch
+  auto diag = K2.diagonal(0, -2, -1);
+  double scale = diag.abs().mean().template item<double>();
+  if (!(scale > 0.0))
+    scale = 1.0;
+  const double eps = damp_rel > 0.0 ? damp_rel * scale : 0.0;
+
+  auto K2_reg = eps > 0.0 ? (K2 + eps * I) : K2;
+
+  try
+  {
+    auto K2_inv = at::linalg_inv(K2_reg);
+    return K2_inv.reshape(K4.sizes());
+  }
+  catch (const c10::Error &)
+  {
+    auto K2_pinv = at::linalg_pinv(K2_reg);
+    return K2_pinv.reshape(K4.sizes());
+  }
+}
+
+// Diagonal estimation with Hutchinson's method
+torch::Tensor
+estimateJacobiPreconditioner(const std::function<torch::Tensor(const torch::Tensor &)> & A,
+                             const torch::Tensor & template_vec,
+                             int num_samples)
+{
+  torch::Tensor diag_est = torch::zeros_like(template_vec);
+
+  for (int i = 0; i < num_samples; ++i)
+  {
+    torch::Tensor v = torch::randint(0, 2, template_vec.sizes(), template_vec.options()) * 2 - 1;
+    torch::Tensor Av = A(v);
+    diag_est += v * Av;
+  }
+
+  diag_est /= num_samples;
+
+  // Avoid division by zero
+  const torch::Tensor eps = torch::tensor(1e-10, diag_est.options());
+  return torch::where(torch::abs(diag_est) < eps, eps, diag_est);
+}
+
+std::tuple<torch::Tensor, unsigned int, double>
+conjugateGradientSolve(const std::function<torch::Tensor(const torch::Tensor &)> & A,
+                       torch::Tensor b,
+                       torch::Tensor x0,
+                       double tol,
+                       int64_t maxiter,
+                       const std::function<torch::Tensor(const torch::Tensor &)> & M)
+{
+  // initialize solution guess
+  torch::Tensor x = x0.defined() ? x0.clone() : torch::zeros_like(b);
+
+  // norm of b (for relative tolerance)
+  const double b_norm = torch::norm(b).cpu().template item<double>();
+  if (b_norm == 0.0)
+    // solution is zero if b is zero
+    return {x, 0u, 0.0};
+
+  // default max iterations
+  if (!maxiter)
+    maxiter = b.numel();
+
+  // initial residual
+  torch::Tensor r = b - A(x);
+
+  // Apply preconditioner (or identity)
+  torch::Tensor z = M(r); // z = M^{-1} r
+
+  // initial search direction p
+  torch::Tensor p = z.clone();
+
+  // dot product (r, z)
+  double rz_old = torch::sum(r * z).cpu().template item<double>();
+
+  // CG iteration
+  double res_norm;
+  for (const auto k : libMesh::make_range(maxiter))
+  {
+    // compute matrix-vector product
+    const auto Ap = A(p);
+
+    // step size alpha
+    double alpha = rz_old / torch::sum(p * Ap).cpu().template item<double>();
+
+    // update solution
+    x = x + alpha * p;
+
+    // update residual
+    r = r - alpha * Ap;
+    res_norm = torch::norm(r).cpu().template item<double>(); // ||r||
+
+    // std::cout << k << ' '<< res_norm << '\n';
+
+    // Converged to desired tolerance
+    if (res_norm <= tol * b_norm)
+      return {x, k + 1, res_norm};
+
+    // apply preconditioner to new residual
+    z = M(r);
+    const auto rz_new = torch::sum(r * z).cpu().template item<double>();
+
+    // update scalar beta
+    double beta = rz_new / rz_old;
+
+    // update search direction
+    p = z + beta * p;
+
+    // prepare for next iteration
+    rz_old = rz_new;
+  }
+
+  // Reached max iterations without full convergence
+  return {x, maxiter, res_norm};
+}
 
 } // namespace MooseTensor
diff --git a/test/tests/mechanics/gold/mech3d.h5 b/test/tests/mechanics/gold/mech3d.h5
index fcd62a51..07dd5134 100644
Binary files a/test/tests/mechanics/gold/mech3d.h5 and b/test/tests/mechanics/gold/mech3d.h5 differ
diff --git a/test/tests/mechanics/mech.i b/test/tests/mechanics/mech.i
index 9a37d7c8..eb95d72d 100644
--- a/test/tests/mechanics/mech.i
+++ b/test/tests/mechanics/mech.i
@@ -33,7 +33,7 @@
       constant_expressions = '0.5 5'
     []
     [Finit]
-      type = RankTwoIdentity
+      type = RankTwoDiagonalTensor
       buffer = F
     []
   []
diff --git a/test/tests/mechanics/mech3d.i b/test/tests/mechanics/mech3d.i
index 30e24a44..bc2c6d33 100644
--- a/test/tests/mechanics/mech3d.i
+++ b/test/tests/mechanics/mech3d.i
@@ -34,7 +34,7 @@
       constant_expressions = '0.5 5'
     []
     [Finit]
-      type = RankTwoIdentity
+      type = RankTwoDiagonalTensor
       buffer = F
     []
   []