From f7535b75c0bcb8f7b07a3a5143a57ee0f45f4a0d Mon Sep 17 00:00:00 2001
From: Chang Sun
Date: Mon, 8 Sep 2025 09:11:00 -0700
Subject: [PATCH 1/8] pointwise conv 1d partial fix

---
 .../vivado/passes/convolution_templates.py    | 11 ++-
 .../vivado/passes/pointwise_codegen.py        | 84 -------------------
 hls4ml/backends/vivado/vivado_backend.py      |  3 +-
 .../templates/vitis/nnet_utils/nnet_conv1d.h  | 42 +++++++++-
 .../vitis/nnet_utils/nnet_conv1d_latency.h    | 16 ++--
 .../templates/vivado/nnet_utils/nnet_conv1d.h | 38 +++++++++
 .../vivado/nnet_utils/nnet_conv1d_latency.h   | 16 ++--
 7 files changed, 102 insertions(+), 108 deletions(-)
 delete mode 100644 hls4ml/backends/vivado/passes/pointwise_codegen.py

diff --git a/hls4ml/backends/vivado/passes/convolution_templates.py b/hls4ml/backends/vivado/passes/convolution_templates.py
index 4d86f54049..1d255eeab2 100644
--- a/hls4ml/backends/vivado/passes/convolution_templates.py
+++ b/hls4ml/backends/vivado/passes/convolution_templates.py
@@ -102,7 +102,7 @@ def format(self, node):
             and node.model.config.get_config_value('IOType') == 'io_parallel'
         )
         if is_pointwise_parallel_latency:
-            params['conv_fn'] = f'{namespace}::pointwise_conv_{node.index}'
+            params['conv_fn'] = 'nnet::BatchedDenseForConv1D'
         else:
             if node.get_attr('strategy').lower() == 'latency':
                 params['conv_fn'] = 'nnet::Conv1DLatency'
@@ -116,10 +116,13 @@ def format(self, node):

         mult_params = self._default_config_params(node)
         if is_pointwise_parallel_latency:
-            mult_params['n_in'] = int(
-                node.get_attr('in_width') * node.get_attr('n_chan') * node.get_attr('filt_width') / mult_params['reuse']
+            mult_params['n_in'] = (
+                node.get_attr('in_width')
+                * node.get_attr('n_chan')
+                * node.get_attr('filt_width')
+                // mult_params['n_partitions']
             )
-            mult_params['n_out'] = int(node.get_attr('in_width') * node.get_attr('n_filt') / mult_params['reuse'])
+            mult_params['n_out'] = node.get_attr('in_width') * node.get_attr('n_filt') // mult_params['n_partitions']
         else:
             mult_params['n_in'] = node.get_attr('n_chan') * node.get_attr('filt_width')
             mult_params['n_out'] = node.get_attr('n_filt')
diff --git a/hls4ml/backends/vivado/passes/pointwise_codegen.py b/hls4ml/backends/vivado/passes/pointwise_codegen.py
deleted file mode 100644
index fa7f795fbc..0000000000
--- a/hls4ml/backends/vivado/passes/pointwise_codegen.py
+++ /dev/null
@@ -1,84 +0,0 @@
-from hls4ml.model.layers import Conv1D
-from hls4ml.model.optimizer import OptimizerPass
-from hls4ml.model.types import Source
-
-
-def generate_pointwise_conv1d_fn(layer_idx, reuse_factor=1):
-    """Generate a C++ function for a pointwise convolution layer.
-
-    Args:
-        layer_idx (int): Index of layer ('index' attribute).
-        reuse_factor (int): Number of partitions to divide the input into.
-
-    Returns:
-        str: Generated C++ function
-    """
-
-    generated_code = (
-        'template<class data_T, class res_T, typename CONFIG_T>\n'
-        'class pointwise_conv_{index} : public nnet::Conv1DKernel<data_T, res_T, CONFIG_T> {{\n'
-        '  public:\n'
-        '    static void conv(\n'
-        '        data_T data[CONFIG_T::in_width * CONFIG_T::n_chan],\n'
-        '        res_T res[CONFIG_T::out_width * CONFIG_T::n_filt],\n'
-        '        typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt],\n'
-        '        typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {{\n'
-        '        data_T data_tmp[CONFIG_T::reuse_factor][CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor];\n'  # noqa: E501
-        '        #pragma HLS ARRAY_PARTITION variable=data_tmp complete dim=0\n'
-        '        res_T res_tmp[CONFIG_T::reuse_factor][CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor];\n'  # noqa: E501
-        '        #pragma HLS ARRAY_PARTITION variable=res_tmp complete dim=0\n\n'
-        '    RFInputLoop:\n'
-        '        for (int jj = 0; jj < CONFIG_T::reuse_factor; jj++) {{\n'
-        '            #pragma HLS UNROLL\n'
-        '        InnerInputLoop:\n'
-        '            for (int ii = 0; ii < CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor; ii++) {{\n'
-        '                #pragma HLS UNROLL\n'
-        '                data_tmp[jj][ii] = data[jj * CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor + ii];\n'  # noqa: E501
-        '            }}\n'
-        '        }}\n\n'
-    ).format(index=layer_idx)
-    indent = '    '
-    for i in range(reuse_factor):
-        generated_code += indent
-        generated_code += (
-            f'nnet::pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[{i}], res_tmp[{i}], weights, biases);\n'
-        )
-
-    generated_code += (
-        '\n'
-        '    RFOutputLoop:\n'
-        '        for (int jj = 0; jj < CONFIG_T::reuse_factor; jj++) {\n'
-        '            #pragma HLS UNROLL\n'
-        '        InnerOutputLoop:\n'
-        '            for (int ii = 0; ii < CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor; ii++) {\n'
-        '                #pragma HLS UNROLL\n'
-        '                res[jj * CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor + ii] = res_tmp[jj][ii];\n'  # noqa: E501
-        '            }\n'
-        '        }\n'
-        '    }\n'
-        '};\n'
-    )
-
-    return generated_code
-
-
-class GeneratePointwiseConv1D(OptimizerPass):
-    '''Generates code for pointwise 1D convolution'''
-
-    def match(self, node):
-        return (
-            isinstance(node, Conv1D)
-            and node.model.config.get_config_value('IOType') == 'io_parallel'
-            and node.get_attr('filt_width') == 1
-        )
-
-    def transform(self, model, node):
-        self._generate_pointwise_conv1d(node)
-
-    def _generate_pointwise_conv1d(self, node):
-        code_str = generate_pointwise_conv1d_fn(
-            node.get_attr('index'),
-            node.get_attr('reuse_factor'),
-        )
-
-        node.set_attr('pointwise_conv1d_codegen', Source(code_str))
diff --git a/hls4ml/backends/vivado/vivado_backend.py b/hls4ml/backends/vivado/vivado_backend.py
index d5309c377f..fcde896b38 100644
--- a/hls4ml/backends/vivado/vivado_backend.py
+++ b/hls4ml/backends/vivado/vivado_backend.py
@@ -183,7 +183,6 @@ def _register_flows(self):
             'vivado:generate_conv_streaming_instructions',
             'vivado:apply_resource_strategy',
             'vivado:generate_conv_im2col',
-            'vivado:generate_pointwise_conv1_d',
             'vivado:generate_unrolled_dense_resource',
             'vivado:set_pipeline_style',
             'vivado:d_a_latency_dense_template',
@@ -418,7 +417,7 @@ def init_conv1d(self, layer):
         if user_pf is not None and layer_pf is not None:
             if user_pf != layer_pf:
                 warn(
-                    f'For layer {layer.name}, parallelization factor of {layer_pf} is defined in the proxy-model, but is overridden by the user to {user_pf}.'  # noqa: E501
+                    f'Parallelization factor of {layer_pf} embedded in layer {layer.name} is overridden by user config to {user_pf}.'  # noqa: E501
                 )

         valid_pf = self.get_valid_conv_partition_splits(1, out_width)
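For orientation, the per-partition dense geometry the template pass now computes can be checked in plain Python. A minimal sketch with hypothetical layer attributes — only the attribute names and the floor-division scheme come from the diff above, the concrete values are made up:

    # Hypothetical pointwise Conv1D layer; filt_width == 1 by definition.
    in_width, n_chan, n_filt, filt_width = 32, 16, 8, 1
    n_partitions = 4

    # Each of the n_partitions dense calls sees 1/n_partitions of the
    # flattened input and produces 1/n_partitions of the flattened output.
    n_in = in_width * n_chan * filt_width // n_partitions
    n_out = in_width * n_filt // n_partitions

    assert (n_in, n_out) == (128, 64)
    # Floor division stays in int; the old int(... / reuse) round-tripped
    # through float, and the divisor is now n_partitions rather than the
    # reuse factor, matching the batched kernel introduced below.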
diff --git a/hls4ml/templates/vitis/nnet_utils/nnet_conv1d.h b/hls4ml/templates/vitis/nnet_utils/nnet_conv1d.h
index 0c1ab6ade9..5dc580dc14 100644
--- a/hls4ml/templates/vitis/nnet_utils/nnet_conv1d.h
+++ b/hls4ml/templates/vitis/nnet_utils/nnet_conv1d.h
@@ -61,7 +61,7 @@ class Conv1DLatency : public nnet::Conv1DKernel<data_T, res_T, CONFIG_T> {
    static void conv(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], res_T res[CONFIG_T::out_width * CONFIG_T::n_filt],
                      typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt],
                      typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {
-        //#pragma HLS INLINE region
+        #pragma HLS INLINE recursive
         conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data, res, weights, biases);
     }
 };
@@ -72,11 +72,49 @@ class Conv1DResource : public nnet::Conv1DKernel<data_T, res_T, CONFIG_T> {
     static void conv(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], res_T res[CONFIG_T::out_width * CONFIG_T::n_filt],
                      typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt],
                      typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {
-        //#pragma HLS INLINE region
+        #pragma HLS INLINE recursive
         conv_1d_resource_cl<data_T, res_T, CONFIG_T>(data, res, weights, biases);
     }
 };

+template <class data_T, class res_T, typename CONFIG_T>
+class BatchedDenseForConv1D : public nnet::Conv1DKernel<data_T, res_T, CONFIG_T> {
+  public:
+    static void conv(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], res_T res[CONFIG_T::out_width * CONFIG_T::n_filt],
+                     typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt],
+                     typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {
+
+        #pragma HLS PIPELINE II = CONFIG_T::reuse_factor * CONFIG_T::n_partitions
+        #pragma HLS INLINE RECURSIVE
+        data_T data_tmp[CONFIG_T::n_partitions][CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::n_partitions];
+        #pragma HLS ARRAY_PARTITION variable=data_tmp complete dim=0
+        res_T res_tmp[CONFIG_T::n_partitions][CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::n_partitions];
+        #pragma HLS ARRAY_PARTITION variable=res_tmp complete dim=0
+
+        for (int jj = 0; jj < CONFIG_T::n_partitions; jj++) {
+            #pragma HLS UNROLL
+            for (int ii = 0; ii < CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::n_partitions; ii++) {
+                #pragma HLS UNROLL
+                data_tmp[jj][ii] = data[jj * CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::n_partitions + ii];
+            }
+        }
+
+        #pragma HLS ALLOCATION operation instances=nnet::pointwise_conv_1d_latency_cl limit=1
+
+        for (int jj = 0; jj < CONFIG_T::n_partitions; jj++) {
+            nnet::pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[jj], res_tmp[jj], weights, biases);
+        }
+
+        for (int jj = 0; jj < CONFIG_T::n_partitions; jj++) {
+            #pragma HLS UNROLL
+            for (int ii = 0; ii < CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::n_partitions; ii++) {
+                #pragma HLS UNROLL
+                res[jj * CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::n_partitions + ii] = res_tmp[jj][ii];
+            }
+        }
+    }
+};
+
 } // namespace nnet

 #endif
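As a behavioral reference for BatchedDenseForConv1D, the same partition → dense → reassemble scheme in NumPy; a sketch assuming shapes divide evenly (the function name, shapes, and values here are illustrative, not part of the patch):

    import numpy as np

    def batched_dense_for_conv1d(data, weights, biases, n_partitions):
        # data: (in_width, n_chan), weights: (n_chan, n_filt), biases: (n_filt,)
        in_width, _ = data.shape
        assert in_width % n_partitions == 0  # the HLS kernel assumes divisibility
        chunks = data.reshape(n_partitions, in_width // n_partitions, -1)
        # One "dense" call per partition, like the jj loop over pointwise_conv_1d_latency_cl
        outs = [chunk @ weights + biases for chunk in chunks]
        return np.concatenate(outs, axis=0)  # out_width == in_width for a pointwise conv

    # A pointwise conv acts independently per position, so any partitioning
    # reproduces the unpartitioned result:
    rng = np.random.default_rng(0)
    x, w, b = rng.normal(size=(32, 16)), rng.normal(size=(16, 8)), rng.normal(size=8)
    assert np.allclose(batched_dense_for_conv1d(x, w, b, 4), x @ w + b)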
diff --git a/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_latency.h b/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_latency.h
index e166cdd470..c027abc49e 100644
--- a/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_latency.h
+++ b/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_latency.h
@@ -86,14 +86,14 @@ void conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan],
 }

 template <class data_T, class res_T, typename CONFIG_T>
-void pointwise_conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor],
-                                  res_T res[CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor],
+void pointwise_conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::n_partitions],
+                                  res_T res[CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::n_partitions],
                                   typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt],
                                   typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {
     assert(CONFIG_T::filt_width == 1);

-    typename CONFIG_T::accum_t mult[CONFIG_T::out_width * CONFIG_T::n_filt * CONFIG_T::n_chan / CONFIG_T::reuse_factor];
-    typename CONFIG_T::accum_t acc[CONFIG_T::out_width / CONFIG_T::reuse_factor][CONFIG_T::n_filt];
+    typename CONFIG_T::accum_t mult[CONFIG_T::out_width * CONFIG_T::n_filt * CONFIG_T::n_chan / CONFIG_T::n_partitions];
+    typename CONFIG_T::accum_t acc[CONFIG_T::out_width / CONFIG_T::n_partitions][CONFIG_T::n_filt];

     #pragma HLS ARRAY_PARTITION variable=mult complete dim=0
     #pragma HLS ARRAY_PARTITION variable=acc complete dim=0
@@ -111,7 +111,7 @@ void pointwise_conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_c

 // Convolve, saving all multiplication results to accumulate later
 ConvOut:
-    for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) {
+    for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::n_partitions; ii++) {
     ConvFilt:
         for (int ff = 0; ff < CONFIG_T::n_filt; ff++) {
         ConvChan:
@@ -133,7 +133,7 @@ void pointwise_conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_c
     } // end output loop

     // Initialize accumulator with input biases
-    for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) {
+    for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::n_partitions; ii++) {
         for (int ff = 0; ff < CONFIG_T::n_filt; ff++) {
             #pragma HLS UNROLL
             acc[ii][ff] = biases[ff];
@@ -142,7 +142,7 @@ void pointwise_conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_c

 // Accumulate multiplication result
 AccumOut:
-    for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) {
+    for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::n_partitions; ii++) {
     AccumFilt:
         for (int ff = 0; ff < CONFIG_T::n_filt; ff++) {
             // Do "dot product" sum within filter and sum over channels
@@ -155,7 +155,7 @@ void pointwise_conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_c
     } // end output loop

     // Cast to "res_t" type
-    for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) {
+    for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::n_partitions; ii++) {
         for (int ff = 0; ff < CONFIG_T::n_filt; ff++) {
             #pragma HLS UNROLL
             res[ii * CONFIG_T::n_filt + ff] = cast<data_T, res_T, CONFIG_T>(acc[ii][ff]);
diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h
index 72bce78067..8df6a993da 100644
--- a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h
+++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h
@@ -73,6 +73,44 @@ template <class data_T, class res_T, typename CONFIG_T> class Conv1DResource : p
     }
 };

+template <class data_T, class res_T, typename CONFIG_T>
+class BatchedDenseForConv1D : public nnet::Conv1DKernel<data_T, res_T, CONFIG_T> {
+  public:
+    static void conv(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], res_T res[CONFIG_T::out_width * CONFIG_T::n_filt],
+                     typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt],
+                     typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {
+
+        #pragma HLS PIPELINE II = CONFIG_T::reuse_factor * CONFIG_T::n_partitions
+        #pragma HLS INLINE RECURSIVE
+        data_T data_tmp[CONFIG_T::n_partitions][CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::n_partitions];
+        #pragma HLS ARRAY_PARTITION variable=data_tmp complete dim=0
+        res_T res_tmp[CONFIG_T::n_partitions][CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::n_partitions];
+        #pragma HLS ARRAY_PARTITION variable=res_tmp complete dim=0
+
+        for (int jj = 0; jj < CONFIG_T::n_partitions; jj++) {
+            #pragma HLS UNROLL
+            for (int ii = 0; ii < CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::n_partitions; ii++) {
+                #pragma HLS UNROLL
+                data_tmp[jj][ii] = data[jj * CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::n_partitions + ii];
+            }
+        }
+
+        #pragma HLS ALLOCATION operation instances=nnet::pointwise_conv_1d_latency_cl limit=1
+
+        for (int jj = 0; jj < CONFIG_T::n_partitions; jj++) {
+            nnet::pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[jj], res_tmp[jj], weights, biases);
+        }
+
+        for (int jj = 0; jj < CONFIG_T::n_partitions; jj++) {
+            #pragma HLS UNROLL
+            for (int ii = 0; ii < CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::n_partitions; ii++) {
+                #pragma HLS UNROLL
+                res[jj * CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::n_partitions + ii] = res_tmp[jj][ii];
+            }
+        }
+    }
+};
+
 } // namespace nnet

 #endif
diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h
index ef2f94dcaf..11a2230e43 100644
--- a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h
+++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h
@@ -85,14 +85,14 @@ void conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan],
 }

 template <class data_T, class res_T, typename CONFIG_T>
-void pointwise_conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor],
-                                  res_T res[CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor],
+void pointwise_conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::n_partitions],
+                                  res_T res[CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::n_partitions],
                                   typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt],
                                   typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {
     assert(CONFIG_T::filt_width == 1);

-    typename CONFIG_T::accum_t mult[CONFIG_T::out_width * CONFIG_T::n_filt * CONFIG_T::n_chan / CONFIG_T::reuse_factor];
-    typename CONFIG_T::accum_t acc[CONFIG_T::out_width / CONFIG_T::reuse_factor][CONFIG_T::n_filt];
+    typename CONFIG_T::accum_t mult[CONFIG_T::out_width * CONFIG_T::n_filt * CONFIG_T::n_chan / CONFIG_T::n_partitions];
+    typename CONFIG_T::accum_t acc[CONFIG_T::out_width / CONFIG_T::n_partitions][CONFIG_T::n_filt];

     #pragma HLS ARRAY_PARTITION variable=mult complete dim=0
     #pragma HLS ARRAY_PARTITION variable=acc complete dim=0
@@ -110,7 +110,7 @@ void pointwise_conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_c

 // Convolve, saving all multiplication results to accumulate later
 ConvOut:
-    for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) {
+    for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::n_partitions; ii++) {
     ConvFilt:
         for (int ff = 0; ff < CONFIG_T::n_filt; ff++) {
         ConvChan:
@@ -132,7 +132,7 @@ void pointwise_conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_c
     } // end output loop

     // Initialize accumulator with input biases
-    for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) {
+    for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::n_partitions; ii++) {
         for (int ff = 0; ff < CONFIG_T::n_filt; ff++) {
             #pragma HLS UNROLL
             acc[ii][ff] = biases[ff];
@@ -141,7 +141,7 @@ void pointwise_conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_c

 // Accumulate multiplication result
 AccumOut:
-    for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) {
+    for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::n_partitions; ii++) {
     AccumFilt:
         for (int ff = 0; ff < CONFIG_T::n_filt; ff++) {
             // Do "dot product" sum within filter and sum over channels
@@ -154,7 +154,7 @@ void pointwise_conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_c
     } // end output loop

     // Cast to "res_t" type
-    for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) {
+    for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::n_partitions; ii++) {
         for (int ff = 0; ff < CONFIG_T::n_filt; ff++) {
             #pragma HLS UNROLL
             res[ii * CONFIG_T::n_filt + ff] = cast<data_T, res_T, CONFIG_T>(acc[ii][ff]);

From e88b856f50acf72763499f5512f94b9eb5ea5cab Mon Sep 17 00:00:00 2001
From: Chang Sun
Date: Mon, 8 Sep 2025 10:21:10 -0700
Subject: [PATCH 2/8] pointwise inherit all property

---
 hls4ml/backends/catapult/passes/pointwise.py | 2 +-
 hls4ml/backends/oneapi/passes/pointwise.py   | 2 +-
 hls4ml/backends/quartus/passes/pointwise.py  | 2 +-
 hls4ml/backends/vivado/passes/pointwise.py   | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/hls4ml/backends/catapult/passes/pointwise.py b/hls4ml/backends/catapult/passes/pointwise.py
index fd464ef172..bc6896d7f1 100755
--- a/hls4ml/backends/catapult/passes/pointwise.py
+++ b/hls4ml/backends/catapult/passes/pointwise.py
@@ -73,7 +73,7 @@ def match(self, node):

     def transform(self, model, node):
         dim = node.__class__.__name__[-2:]  # '1D' or '2D'
-        new_attrs = {k: v for k, v in node.attributes.items() if k not in ('trace', 'precision', 'reuse_factor')}
+        new_attrs = node.attributes.attributes
         pw_node = model.make_node(
             'PointwiseConv' + dim, node.name, new_attrs, node.inputs.copy(), outputs=node.outputs.copy()
         )
diff --git a/hls4ml/backends/oneapi/passes/pointwise.py b/hls4ml/backends/oneapi/passes/pointwise.py
index ccf410d1f6..0f84a7e444 100644
--- a/hls4ml/backends/oneapi/passes/pointwise.py
+++ b/hls4ml/backends/oneapi/passes/pointwise.py
@@ -147,7 +147,7 @@ def match(self, node):

     def transform(self, model, node):
         dim = node.__class__.__name__[-2:]  # '1D' or '2D'
-        new_attrs = {k: v for k, v in node.attributes.items() if k not in ('trace', 'precision', 'reuse_factor')}
+        new_attrs = node.attributes.attributes
         pw_node = model.make_node(
             'PointwiseConv' + dim, node.name, new_attrs, node.inputs.copy(), outputs=node.outputs.copy()
         )
diff --git a/hls4ml/backends/quartus/passes/pointwise.py b/hls4ml/backends/quartus/passes/pointwise.py
index d65ab22569..ed0dd7f723 100644
--- a/hls4ml/backends/quartus/passes/pointwise.py
+++ b/hls4ml/backends/quartus/passes/pointwise.py
@@ -79,7 +79,7 @@ def match(self, node):

     def transform(self, model, node):
         dim = node.__class__.__name__[-2:]  # '1D' or '2D'
-        new_attrs = {k: v for k, v in node.attributes.items() if k not in ('trace', 'precision', 'reuse_factor')}
+        new_attrs = node.attributes.attributes
         pw_node = model.make_node(
             'PointwiseConv' + dim, node.name, new_attrs, node.inputs.copy(), outputs=node.outputs.copy()
         )
diff --git a/hls4ml/backends/vivado/passes/pointwise.py b/hls4ml/backends/vivado/passes/pointwise.py
index 3815df00f1..969f4f1d27 100644
--- a/hls4ml/backends/vivado/passes/pointwise.py
+++ b/hls4ml/backends/vivado/passes/pointwise.py
@@ -77,7 +77,7 @@ def match(self, node):
     def transform(self, model, node):
         dim = node.__class__.__name__[-2:]  # '1D' or '2D'
         # to remove warning, since these get set again
-        new_attrs = {k: v for k, v in node.attributes.items() if k not in ('trace', 'precision', 'reuse_factor')}
+        new_attrs = node.attributes.attributes
         pw_node = model.make_node(
             'PointwiseConv' + dim, node.name, new_attrs, node.inputs.copy(), outputs=node.outputs.copy()
         )
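The behavioral change in this patch is easiest to see on a plain dict; a sketch with hypothetical attribute values (the filter tuple is the one removed above):

    attrs = {'n_chan': 16, 'n_filt': 8, 'trace': False, 'reuse_factor': 2}

    # Before: a filtered copy -- the new PointwiseConv node started without
    # these keys and relied on them being set again later.
    filtered = {k: v for k, v in attrs.items() if k not in ('trace', 'precision', 'reuse_factor')}
    assert 'reuse_factor' not in filtered

    # After: node.attributes.attributes hands over every attribute unchanged.
    # Note this is the same dict object, not a copy -- see the
    # "avoid unexpected shared state" patch below.
    inherited = attrs
    assert inherited['reuse_factor'] == 2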
From 7a0ee88a3cdead89ae86f0cce697a87b47e65d7b Mon Sep 17 00:00:00 2001
From: Chang Sun
Date: Mon, 8 Sep 2025 10:26:01 -0700
Subject: [PATCH 3/8] inline for parallel conv, pointwise only for npart=1
 (otherwise II will be too high)

---
 .../vivado/passes/convolution_templates.py    |  7 +++++--
 .../templates/vitis/nnet_utils/nnet_conv1d.h  | 20 +++++++++++++++----
 .../templates/vitis/nnet_utils/nnet_conv2d.h  | 17 +++++++++++++---
 3 files changed, 35 insertions(+), 9 deletions(-)

diff --git a/hls4ml/backends/vivado/passes/convolution_templates.py b/hls4ml/backends/vivado/passes/convolution_templates.py
index 1d255eeab2..058dd90b2f 100644
--- a/hls4ml/backends/vivado/passes/convolution_templates.py
+++ b/hls4ml/backends/vivado/passes/convolution_templates.py
@@ -101,7 +101,10 @@ def format(self, node):
             and node.get_attr('strategy').lower() == 'latency'
             and node.model.config.get_config_value('IOType') == 'io_parallel'
         )
-        if is_pointwise_parallel_latency:
+
+        n_partitions = node.attributes['n_partitions']
+
+        if is_pointwise_parallel_latency and n_partitions == 1:
             params['conv_fn'] = 'nnet::BatchedDenseForConv1D'
         else:
             if node.get_attr('strategy').lower() == 'latency':
@@ -115,7 +118,7 @@ def format(self, node):
         conv_config = self.template.format(**params)

         mult_params = self._default_config_params(node)
-        if is_pointwise_parallel_latency:
+        if is_pointwise_parallel_latency and n_partitions == 1:
             mult_params['n_in'] = (
                 node.get_attr('in_width')
                 * node.get_attr('n_chan')
diff --git a/hls4ml/templates/vitis/nnet_utils/nnet_conv1d.h b/hls4ml/templates/vitis/nnet_utils/nnet_conv1d.h
index 5dc580dc14..d575b998cd 100644
--- a/hls4ml/templates/vitis/nnet_utils/nnet_conv1d.h
+++ b/hls4ml/templates/vitis/nnet_utils/nnet_conv1d.h
@@ -37,7 +37,13 @@ void conv_1d_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], res_T res[CO
                 typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt],
                 typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {
     // Inlining helps reduce latency, but may also cause timing issues in some cases, use carefully.
-    //#pragma HLS INLINE recursive
+    // But without inlining Vitis HLS doesn't respect the parallelization factor config ¯\_(ツ)_/¯
+    // Vitis 2025.1 hangs in RTL simulation with this, though
+
+    #pragma HLS INLINE recursive
+
+    // #pragma HLS PIPELINE II = CONFIG_T::reuse_factor * CONFIG_T::n_partitions
+    // ↑ This makes II=2 for all n_partitions > 1, no matter what the actual II should be

     CONFIG_T::template conv_kernel<data_T, res_T, CONFIG_T>::conv(data, res, weights, biases);
 }
@@ -50,7 +56,13 @@ void pointwise_conv_1d_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan],
                           typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt],
                           typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {
    assert(CONFIG_T::filt_width == 1);

     // Inlining helps reduce latency, but may also cause timing issues in some cases, use carefully.
-    //#pragma HLS INLINE recursive
+    // But without inlining Vitis HLS doesn't respect the parallelization factor config ¯\_(ツ)_/¯
+    // #pragma HLS PIPELINE II = CONFIG_T::reuse_factor * CONFIG_T::n_partitions
+
+    #pragma HLS INLINE recursive
+
+    // #pragma HLS PIPELINE II = CONFIG_T::reuse_factor * CONFIG_T::n_partitions
+    // ↑ This makes II=2 for all n_partitions > 1, no matter what the actual II should be

     CONFIG_T::template conv_kernel<data_T, res_T, CONFIG_T>::conv(data, res, weights, biases);
 }
@@ -61,7 +73,7 @@ class Conv1DLatency : public nnet::Conv1DKernel<data_T, res_T, CONFIG_T> {
     static void conv(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], res_T res[CONFIG_T::out_width * CONFIG_T::n_filt],
                      typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt],
                      typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {
-        #pragma HLS INLINE recursive
+        // #pragma HLS INLINE recursive
         conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data, res, weights, biases);
     }
 };
@@ -72,7 +84,7 @@ class Conv1DResource : public nnet::Conv1DKernel<data_T, res_T, CONFIG_T> {
     static void conv(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], res_T res[CONFIG_T::out_width * CONFIG_T::n_filt],
                      typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt],
                      typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {
-        #pragma HLS INLINE recursive
+        // #pragma HLS INLINE recursive
         conv_1d_resource_cl<data_T, res_T, CONFIG_T>(data, res, weights, biases);
     }
 };
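The "II will be too high" in the subject line is just the product in the commented-out pragma; a back-of-envelope sketch with illustrative numbers:

    # The batched-dense kernel pipelines the whole conv as one region with
    # II = reuse_factor * n_partitions, so any n_partitions > 1 multiplies
    # the initiation interval -- hence the n_partitions == 1 guard above.
    reuse_factor = 2
    for n_partitions in (1, 2, 4):
        print(f'n_partitions={n_partitions}: II = {reuse_factor * n_partitions}')
    # n_partitions=1: II = 2
    # n_partitions=2: II = 4
    # n_partitions=4: II = 8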
diff --git a/hls4ml/templates/vitis/nnet_utils/nnet_conv2d.h b/hls4ml/templates/vitis/nnet_utils/nnet_conv2d.h
index 261e5cc379..dfba8028d1 100644
--- a/hls4ml/templates/vitis/nnet_utils/nnet_conv2d.h
+++ b/hls4ml/templates/vitis/nnet_utils/nnet_conv2d.h
@@ -45,7 +45,13 @@ void conv_2d_cl(
     typename CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt],
     typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {
     // Inlining helps reduce latency, but may also cause timing issues in some cases, use carefully.
-    //#pragma HLS INLINE recursive
+    // But without inlining Vitis HLS doesn't respect the parallelization factor config ¯\_(ツ)_/¯
+    // Vitis 2025.1 hangs in RTL simulation with this, though
+
+    #pragma HLS INLINE recursive
+
+    // #pragma HLS PIPELINE II = CONFIG_T::reuse_factor * CONFIG_T::n_partitions
+    // ↑ This makes II=2 for all n_partitions > 1, no matter what the actual II should be

     if (CONFIG_T::strategy == nnet::latency || CONFIG_T::strategy == nnet::distributed_arithmetic) {
         conv_2d_latency_cl<data_T, res_T, CONFIG_T>(data, res, weights, biases);
@@ -60,9 +66,14 @@ void pointwise_conv_2d_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width *
                           typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt],
                           typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {
     assert(CONFIG_T::filt_width == 1);
-    // Inlining helps reduce latency, but may also cause timing issues in some cases, use carefully.
-    //#pragma HLS INLINE recursive
+    // But without inlining Vitis HLS doesn't respect the parallelization factor config ¯\_(ツ)_/¯
+    // Vitis 2025.1 hangs in RTL simulation with this, though
+
+    #pragma HLS INLINE recursive
+
+    // #pragma HLS PIPELINE II = CONFIG_T::reuse_factor * CONFIG_T::n_partitions
+    // ↑ This makes II=2 for all n_partitions > 1, no matter what the actual II should be

     // Nothing special to be done for io_parallel implementation
     if (CONFIG_T::strategy == nnet::latency || CONFIG_T::strategy == nnet::distributed_arithmetic) {

From d077b7224df41b64a36e2b18774078c3cd057e99 Mon Sep 17 00:00:00 2001
From: Chang Sun
Date: Mon, 8 Sep 2025 11:43:02 -0700
Subject: [PATCH 4/8] avoid unexpected shared state

---
 hls4ml/backends/catapult/passes/pointwise.py | 2 +-
 hls4ml/backends/oneapi/passes/pointwise.py   | 2 +-
 hls4ml/backends/quartus/passes/pointwise.py  | 2 +-
 hls4ml/backends/vivado/passes/pointwise.py   | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/hls4ml/backends/catapult/passes/pointwise.py b/hls4ml/backends/catapult/passes/pointwise.py
index bc6896d7f1..7a8044f9e7 100755
--- a/hls4ml/backends/catapult/passes/pointwise.py
+++ b/hls4ml/backends/catapult/passes/pointwise.py
@@ -73,7 +73,7 @@ def match(self, node):

     def transform(self, model, node):
         dim = node.__class__.__name__[-2:]  # '1D' or '2D'
-        new_attrs = node.attributes.attributes
+        new_attrs = node.attributes.attributes.copy()
         pw_node = model.make_node(
             'PointwiseConv' + dim, node.name, new_attrs, node.inputs.copy(), outputs=node.outputs.copy()
         )
diff --git a/hls4ml/backends/oneapi/passes/pointwise.py b/hls4ml/backends/oneapi/passes/pointwise.py
index 0f84a7e444..84fc242070 100644
--- a/hls4ml/backends/oneapi/passes/pointwise.py
+++ b/hls4ml/backends/oneapi/passes/pointwise.py
@@ -147,7 +147,7 @@ def match(self, node):

     def transform(self, model, node):
         dim = node.__class__.__name__[-2:]  # '1D' or '2D'
-        new_attrs = node.attributes.attributes
+        new_attrs = node.attributes.attributes.copy()
         pw_node = model.make_node(
             'PointwiseConv' + dim, node.name, new_attrs, node.inputs.copy(), outputs=node.outputs.copy()
         )
diff --git a/hls4ml/backends/quartus/passes/pointwise.py b/hls4ml/backends/quartus/passes/pointwise.py
index ed0dd7f723..8f1ef49a52 100644
--- a/hls4ml/backends/quartus/passes/pointwise.py
+++ b/hls4ml/backends/quartus/passes/pointwise.py
@@ -79,7 +79,7 @@ def match(self, node):

     def transform(self, model, node):
         dim = node.__class__.__name__[-2:]  # '1D' or '2D'
-        new_attrs = node.attributes.attributes
+        new_attrs = node.attributes.attributes.copy()
         pw_node = model.make_node(
             'PointwiseConv' + dim, node.name, new_attrs, node.inputs.copy(), outputs=node.outputs.copy()
         )
diff --git a/hls4ml/backends/vivado/passes/pointwise.py b/hls4ml/backends/vivado/passes/pointwise.py
index 969f4f1d27..aca323a980 100644
--- a/hls4ml/backends/vivado/passes/pointwise.py
+++ b/hls4ml/backends/vivado/passes/pointwise.py
@@ -77,7 +77,7 @@ def match(self, node):
     def transform(self, model, node):
         dim = node.__class__.__name__[-2:]  # '1D' or '2D'
         # to remove warning, since these get set again
-        new_attrs = node.attributes.attributes
+        new_attrs = node.attributes.attributes.copy()
         pw_node = model.make_node(
             'PointwiseConv' + dim, node.name, new_attrs, node.inputs.copy(), outputs=node.outputs.copy()
         )
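Why the .copy() matters; a standalone sketch of the aliasing the subject line refers to:

    # Passing node.attributes.attributes through make_node without a copy
    # makes the old and new node share one dict:
    old_node_attrs = {'n_filt': 8, 'reuse_factor': 2}

    shared = old_node_attrs          # previous behavior: same object
    shared['reuse_factor'] = 4       # mutate the new node's attributes...
    assert old_node_attrs['reuse_factor'] == 4  # ...and the old node changes too

    independent = old_node_attrs.copy()  # this patch: shallow copy
    independent['reuse_factor'] = 2
    assert old_node_attrs['reuse_factor'] == 4  # old node now untouched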
From f349e41c62cdb0183db16b2e3a4c51711de9a0fd Mon Sep 17 00:00:00 2001
From: Chang Sun
Date: Tue, 9 Sep 2025 05:24:56 -0700
Subject: [PATCH 5/8] rm useless warning

---
 hls4ml/backends/vivado/vivado_backend.py |  6 ------
 hls4ml/model/layers.py                   | 14 ++++++++------
 2 files changed, 8 insertions(+), 12 deletions(-)

diff --git a/hls4ml/backends/vivado/vivado_backend.py b/hls4ml/backends/vivado/vivado_backend.py
index fcde896b38..a937aca672 100644
--- a/hls4ml/backends/vivado/vivado_backend.py
+++ b/hls4ml/backends/vivado/vivado_backend.py
@@ -414,12 +414,6 @@ def init_conv1d(self, layer):
         user_pf = layer.model.config.get_layer_config_value(layer, 'ParallelizationFactor', None)
         layer_pf = layer.get_attr('parallelization_factor', None)
         chosen_pf = user_pf or layer_pf or 1
-        if user_pf is not None and layer_pf is not None:
-            if user_pf != layer_pf:
-                warn(
-                    f'Parallelization factor of {layer_pf} embedded in layer {layer.name} is overridden by user config to {user_pf}.'  # noqa: E501
-                )
-
         valid_pf = self.get_valid_conv_partition_splits(1, out_width)
diff --git a/hls4ml/model/layers.py b/hls4ml/model/layers.py
index a47dca285a..2931b61ca8 100644
--- a/hls4ml/model/layers.py
+++ b/hls4ml/model/layers.py
@@ -1,5 +1,6 @@
 import typing
 from copy import copy
+from warnings import warn

 import numpy as np

@@ -111,17 +112,18 @@ def __init__(self, model, name, attributes, inputs, outputs=None, initialize=Tru
             layer_config = self.model.config.get_layer_config(self)
             for config_key, config_value in layer_config.items():
                 config_key = convert_to_snake_case(config_key)
-                if config_key in self.attributes:
-                    print(
-                        'WARNING: Config parameter "{}" overwrites an existing attribute in layer "{}" ({})'.format(
-                            config_key, self.name, self.class_name
-                        )
-                    )
                 if config_key.endswith('_t') and isinstance(
                     config_value, str
                 ):  # TODO maybe move this to __setitem__ of AttributeDict?
                     precision = self.model.config.backend.convert_precision_string(config_value)
                     config_value = NamedType(self.name + '_' + config_key, precision)
+                if (old_value := self.attributes.get(config_key, config_value)) != config_value:
+                    warn(
+                        f"Overriding attribute '{config_key}' of layer '{self.name}' ({self.class_name}): "
+                        f"{old_value} -> {config_value}",
+                        UserWarning,
+                        stacklevel=3,
+                    )
                 self.attributes[config_key] = config_value

         self.initialize()
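The replacement warning fires only on a real value change; a standalone sketch of the same pattern, with a plain dict standing in for the layer's attributes (function and variable names are hypothetical):

    from warnings import warn

    def apply_config_value(attributes, key, value, name='conv1', class_name='Conv1D'):
        # Same shape as the Layer.__init__ change: warn only when an existing
        # attribute is actually overridden, not when it is first set or re-set
        # to an equal value.
        if (old_value := attributes.get(key, value)) != value:
            warn(
                f"Overriding attribute '{key}' of layer '{name}' ({class_name}): "
                f"{old_value} -> {value}",
                UserWarning,
                stacklevel=2,
            )
        attributes[key] = value

    attrs = {'reuse_factor': 2}
    apply_config_value(attrs, 'reuse_factor', 2)   # silent: equal value
    apply_config_value(attrs, 'n_partitions', 1)   # silent: new key
    apply_config_value(attrs, 'reuse_factor', 4)   # warns: 2 -> 4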
From 39d416bc84cbfd5faa1a1fb16d5769b4eeafe6c3 Mon Sep 17 00:00:00 2001
From: Chang Sun
Date: Tue, 9 Sep 2025 06:05:57 -0700
Subject: [PATCH 6/8] rm line

---
 hls4ml/templates/vitis/nnet_utils/nnet_conv1d.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/hls4ml/templates/vitis/nnet_utils/nnet_conv1d.h b/hls4ml/templates/vitis/nnet_utils/nnet_conv1d.h
index d575b998cd..f88985aaef 100644
--- a/hls4ml/templates/vitis/nnet_utils/nnet_conv1d.h
+++ b/hls4ml/templates/vitis/nnet_utils/nnet_conv1d.h
@@ -57,7 +57,6 @@ void pointwise_conv_1d_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan],

     // Inlining helps reduce latency, but may also cause timing issues in some cases, use carefully.
     // But without inlining Vitis HLS doesn't respect the parallelization factor config ¯\_(ツ)_/¯
-    // #pragma HLS PIPELINE II = CONFIG_T::reuse_factor * CONFIG_T::n_partitions

     #pragma HLS INLINE recursive


From 7a6f1db6039eb0cddb28f0e658899b25f1ad957a Mon Sep 17 00:00:00 2001
From: Chang Sun
Date: Tue, 9 Sep 2025 06:08:54 -0700
Subject: [PATCH 7/8] .

---
 hls4ml/backends/vivado/passes/convolution_templates.py | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/hls4ml/backends/vivado/passes/convolution_templates.py b/hls4ml/backends/vivado/passes/convolution_templates.py
index 058dd90b2f..08ab1e1228 100644
--- a/hls4ml/backends/vivado/passes/convolution_templates.py
+++ b/hls4ml/backends/vivado/passes/convolution_templates.py
@@ -120,12 +120,9 @@ def format(self, node):
         mult_params = self._default_config_params(node)
         if is_pointwise_parallel_latency and n_partitions == 1:
             mult_params['n_in'] = (
-                node.get_attr('in_width')
-                * node.get_attr('n_chan')
-                * node.get_attr('filt_width')
-                // mult_params['n_partitions']
+                node.get_attr('in_width') * node.get_attr('n_chan') * node.get_attr('filt_width') // n_partitions
             )
-            mult_params['n_out'] = node.get_attr('in_width') * node.get_attr('n_filt') // mult_params['n_partitions']
+            mult_params['n_out'] = node.get_attr('in_width') * node.get_attr('n_filt') // n_partitions
         else:
             mult_params['n_in'] = node.get_attr('n_chan') * node.get_attr('filt_width')
             mult_params['n_out'] = node.get_attr('n_filt')

From e530e1877a076bac92becbcc3f45f25de031f702 Mon Sep 17 00:00:00 2001
From: Chang Sun
Date: Tue, 23 Sep 2025 05:59:41 -0700
Subject: [PATCH 8/8] fix pragma

---
 hls4ml/templates/vitis/nnet_utils/nnet_conv1d.h  | 2 +-
 hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/hls4ml/templates/vitis/nnet_utils/nnet_conv1d.h b/hls4ml/templates/vitis/nnet_utils/nnet_conv1d.h
index f88985aaef..4f76c030d3 100644
--- a/hls4ml/templates/vitis/nnet_utils/nnet_conv1d.h
+++ b/hls4ml/templates/vitis/nnet_utils/nnet_conv1d.h
@@ -110,7 +110,7 @@ class BatchedDenseForConv1D : public nnet::Conv1DKernel<data_T, res_T, CONFIG_T>
             }
         }

-        #pragma HLS ALLOCATION operation instances=nnet::pointwise_conv_1d_latency_cl limit=1
+        #pragma HLS ALLOCATION function instances=nnet::pointwise_conv_1d_latency_cl limit=1

         for (int jj = 0; jj < CONFIG_T::n_partitions; jj++) {
             nnet::pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[jj], res_tmp[jj], weights, biases);
diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h
index 8df6a993da..63151d6745 100644
--- a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h
+++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h
@@ -95,7 +95,7 @@ class BatchedDenseForConv1D : public nnet::Conv1DKernel<data_T, res_T, CONFIG_T>
             }
         }

-        #pragma HLS ALLOCATION operation instances=nnet::pointwise_conv_1d_latency_cl limit=1
+        #pragma HLS ALLOCATION function instances=nnet::pointwise_conv_1d_latency_cl limit=1

         for (int jj = 0; jj < CONFIG_T::n_partitions; jj++) {
             nnet::pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[jj], res_tmp[jj], weights, biases);