From f7535b75c0bcb8f7b07a3a5143a57ee0f45f4a0d Mon Sep 17 00:00:00 2001
From: Chang Sun
Date: Mon, 8 Sep 2025 09:11:00 -0700
Subject: [PATCH 1/8] pointwise conv 1d partial fix

---
 .../vivado/passes/convolution_templates.py    | 11 ++-
 .../vivado/passes/pointwise_codegen.py        | 84 -------------------
 hls4ml/backends/vivado/vivado_backend.py      |  3 +-
 .../templates/vitis/nnet_utils/nnet_conv1d.h  | 42 +++++++++-
 .../vitis/nnet_utils/nnet_conv1d_latency.h    | 16 ++--
 .../templates/vivado/nnet_utils/nnet_conv1d.h | 38 +++++++++
 .../vivado/nnet_utils/nnet_conv1d_latency.h   | 16 ++--
 7 files changed, 102 insertions(+), 108 deletions(-)
 delete mode 100644 hls4ml/backends/vivado/passes/pointwise_codegen.py

diff --git a/hls4ml/backends/vivado/passes/convolution_templates.py b/hls4ml/backends/vivado/passes/convolution_templates.py
index 4d86f54049..1d255eeab2 100644
--- a/hls4ml/backends/vivado/passes/convolution_templates.py
+++ b/hls4ml/backends/vivado/passes/convolution_templates.py
@@ -102,7 +102,7 @@ def format(self, node):
             and node.model.config.get_config_value('IOType') == 'io_parallel'
         )
         if is_pointwise_parallel_latency:
-            params['conv_fn'] = f'{namespace}::pointwise_conv_{node.index}'
+            params['conv_fn'] = 'nnet::BatchedDenseForConv1D'
         else:
             if node.get_attr('strategy').lower() == 'latency':
                 params['conv_fn'] = 'nnet::Conv1DLatency'
@@ -116,10 +116,13 @@ def format(self, node):

         mult_params = self._default_config_params(node)
         if is_pointwise_parallel_latency:
-            mult_params['n_in'] = int(
-                node.get_attr('in_width') * node.get_attr('n_chan') * node.get_attr('filt_width') / mult_params['reuse']
+            mult_params['n_in'] = (
+                node.get_attr('in_width')
+                * node.get_attr('n_chan')
+                * node.get_attr('filt_width')
+                // mult_params['n_partitions']
             )
-            mult_params['n_out'] = int(node.get_attr('in_width') * node.get_attr('n_filt') / mult_params['reuse'])
+            mult_params['n_out'] = node.get_attr('in_width') * node.get_attr('n_filt') // mult_params['n_partitions']
         else:
             mult_params['n_in'] = node.get_attr('n_chan') * node.get_attr('filt_width')
             mult_params['n_out'] = node.get_attr('n_filt')
diff --git a/hls4ml/backends/vivado/passes/pointwise_codegen.py b/hls4ml/backends/vivado/passes/pointwise_codegen.py
deleted file mode 100644
index fa7f795fbc..0000000000
--- a/hls4ml/backends/vivado/passes/pointwise_codegen.py
+++ /dev/null
@@ -1,84 +0,0 @@
-from hls4ml.model.layers import Conv1D
-from hls4ml.model.optimizer import OptimizerPass
-from hls4ml.model.types import Source
-
-
-def generate_pointwise_conv1d_fn(layer_idx, reuse_factor=1):
-    """Generate a C++ function for a pointwise convolution layer.
-
-    Args:
-        layer_idx (int): Index of layer ('index' attribute).
-        reuse_factor (int): Number of partitions to divide the input into.
-
-    Returns:
-        str: Generated C++ function
-    """
-
-    generated_code = (
-        'template<class data_T, class res_T, typename CONFIG_T>\n'
-        'class pointwise_conv_{index} : public nnet::Conv1DKernel<data_T, res_T, CONFIG_T> {{\n'
-        '  public:\n'
-        '    static void conv(\n'
-        '        data_T data[CONFIG_T::in_width * CONFIG_T::n_chan],\n'
-        '        res_T res[CONFIG_T::out_width * CONFIG_T::n_filt],\n'
-        '        typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt],\n'
-        '        typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {{\n'
-        '        data_T data_tmp[CONFIG_T::reuse_factor][CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor];\n'  # noqa: E501
-        '        #pragma HLS ARRAY_PARTITION variable=data_tmp complete dim=0\n'
-        '        res_T res_tmp[CONFIG_T::reuse_factor][CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor];\n'  # noqa: E501
-        '        #pragma HLS ARRAY_PARTITION variable=res_tmp complete dim=0\n\n'
-        '    RFInputLoop:\n'
-        '        for (int jj = 0; jj < CONFIG_T::reuse_factor; jj++) {{\n'
-        '            #pragma HLS UNROLL\n'
-        '        InnerInputLoop:\n'
-        '            for (int ii = 0; ii < CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor; ii++) {{\n'
-        '                #pragma HLS UNROLL\n'
-        '                data_tmp[jj][ii] = data[jj * CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor + ii];\n'  # noqa: E501
-        '            }}\n'
-        '        }}\n\n'
-    ).format(index=layer_idx)
-    indent = '    '
-    for i in range(reuse_factor):
-        generated_code += indent
-        generated_code += (
-            f'nnet::pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[{i}], res_tmp[{i}], weights, biases);\n'
-        )
-
-    generated_code += (
-        '\n'
-        '    RFOutputLoop:\n'
-        '        for (int jj = 0; jj < CONFIG_T::reuse_factor; jj++) {\n'
-        '            #pragma HLS UNROLL\n'
-        '        InnerOutputLoop:\n'
-        '            for (int ii = 0; ii < CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor; ii++) {\n'
-        '                #pragma HLS UNROLL\n'
-        '                res[jj * CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor + ii] = res_tmp[jj][ii];\n'  # noqa: E501
-        '            }\n'
-        '        }\n'
-        '    }\n'
-        '};\n'
-    )
-
-    return generated_code
-
-
-class GeneratePointwiseConv1D(OptimizerPass):
-    '''Generates code for pointwise 1D convolution'''
-
-    def match(self, node):
-        return (
-            isinstance(node, Conv1D)
-            and node.model.config.get_config_value('IOType') == 'io_parallel'
-            and node.get_attr('filt_width') == 1
-        )
-
-    def transform(self, model, node):
-        self._generate_pointwise_conv1d(node)
-
-    def _generate_pointwise_conv1d(self, node):
-        code_str = generate_pointwise_conv1d_fn(
-            node.get_attr('index'),
-            node.get_attr('reuse_factor'),
-        )
-
-        node.set_attr('pointwise_conv1d_codegen', Source(code_str))
diff --git a/hls4ml/backends/vivado/vivado_backend.py b/hls4ml/backends/vivado/vivado_backend.py
index d5309c377f..fcde896b38 100644
--- a/hls4ml/backends/vivado/vivado_backend.py
+++ b/hls4ml/backends/vivado/vivado_backend.py
@@ -183,7 +183,6 @@ def _register_flows(self):
             'vivado:generate_conv_streaming_instructions',
             'vivado:apply_resource_strategy',
             'vivado:generate_conv_im2col',
-            'vivado:generate_pointwise_conv1_d',
             'vivado:generate_unrolled_dense_resource',
             'vivado:set_pipeline_style',
             'vivado:d_a_latency_dense_template',
@@ -418,7 +417,7 @@ def init_conv1d(self, layer):
         if user_pf is not None and layer_pf is not None:
             if user_pf != layer_pf:
                 warn(
-                    f'For layer {layer.name}, parallelization factor of {layer_pf} is defined in the proxy-model, but is overridden by the user to {user_pf}.'  # noqa: E501
+                    f'Parallelization factor of {layer_pf} embedded in layer {layer.name} is overridden by user config to {user_pf}.'  # noqa: E501
                 )

         valid_pf = self.get_valid_conv_partition_splits(1, out_width)
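For orientation, the per-partition dense geometry the template pass now computes can be checked in plain Python. A minimal sketch with hypothetical layer attributes — only the attribute names and the floor-division scheme come from the diff above, the concrete values are made up:

    # Hypothetical pointwise Conv1D layer; filt_width == 1 by definition.
    in_width, n_chan, n_filt, filt_width = 32, 16, 8, 1
    n_partitions = 4

    # Each of the n_partitions dense calls sees 1/n_partitions of the
    # flattened input and produces 1/n_partitions of the flattened output.
    n_in = in_width * n_chan * filt_width // n_partitions
    n_out = in_width * n_filt // n_partitions

    assert (n_in, n_out) == (128, 64)
    # Floor division stays in int; the old int(... / reuse) round-tripped
    # through float, and the divisor is now n_partitions rather than the
    # reuse factor, matching the batched kernel introduced below.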
diff --git a/hls4ml/templates/vitis/nnet_utils/nnet_conv1d.h b/hls4ml/templates/vitis/nnet_utils/nnet_conv1d.h
index 0c1ab6ade9..5dc580dc14 100644
--- a/hls4ml/templates/vitis/nnet_utils/nnet_conv1d.h
+++ b/hls4ml/templates/vitis/nnet_utils/nnet_conv1d.h
@@ -61,7 +61,7 @@ class Conv1DLatency : public nnet::Conv1DKernel<data_T, res_T, CONFIG_T> {
    static void conv(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], res_T res[CONFIG_T::out_width * CONFIG_T::n_filt],
                      typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt],
                      typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {
-        //#pragma HLS INLINE region
+        #pragma HLS INLINE recursive
         conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data, res, weights, biases);
     }
 };
@@ -72,11 +72,49 @@ class Conv1DResource : public nnet::Conv1DKernel<data_T, res_T, CONFIG_T> {
     static void conv(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], res_T res[CONFIG_T::out_width * CONFIG_T::n_filt],
                      typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt],
                      typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {
-        //#pragma HLS INLINE region
+        #pragma HLS INLINE recursive
         conv_1d_resource_cl<data_T, res_T, CONFIG_T>(data, res, weights, biases);
     }
 };

+template <class data_T, class res_T, typename CONFIG_T>
+class BatchedDenseForConv1D : public nnet::Conv1DKernel<data_T, res_T, CONFIG_T> {
+  public:
+    static void conv(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], res_T res[CONFIG_T::out_width * CONFIG_T::n_filt],
+                     typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt],
+                     typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {
+
+        #pragma HLS PIPELINE II = CONFIG_T::reuse_factor * CONFIG_T::n_partitions
+        #pragma HLS INLINE RECURSIVE
+        data_T data_tmp[CONFIG_T::n_partitions][CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::n_partitions];
+        #pragma HLS ARRAY_PARTITION variable=data_tmp complete dim=0
+        res_T res_tmp[CONFIG_T::n_partitions][CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::n_partitions];
+        #pragma HLS ARRAY_PARTITION variable=res_tmp complete dim=0
+
+        for (int jj = 0; jj < CONFIG_T::n_partitions; jj++) {
+            #pragma HLS UNROLL
+            for (int ii = 0; ii < CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::n_partitions; ii++) {
+                #pragma HLS UNROLL
+                data_tmp[jj][ii] = data[jj * CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::n_partitions + ii];
+            }
+        }
+
+        #pragma HLS ALLOCATION operation instances=nnet::pointwise_conv_1d_latency_cl limit=1
+
+        for (int jj = 0; jj < CONFIG_T::n_partitions; jj++) {
+            nnet::pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[jj], res_tmp[jj], weights, biases);
+        }
+
+        for (int jj = 0; jj < CONFIG_T::n_partitions; jj++) {
+            #pragma HLS UNROLL
+            for (int ii = 0; ii < CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::n_partitions; ii++) {
+                #pragma HLS UNROLL
+                res[jj * CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::n_partitions + ii] = res_tmp[jj][ii];
+            }
+        }
+    }
+};
+
 } // namespace nnet

 #endif
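As a behavioral reference for BatchedDenseForConv1D, the same partition → dense → reassemble scheme in NumPy; a sketch assuming shapes divide evenly (the function name, shapes, and values here are illustrative, not part of the patch):

    import numpy as np

    def batched_dense_for_conv1d(data, weights, biases, n_partitions):
        # data: (in_width, n_chan), weights: (n_chan, n_filt), biases: (n_filt,)
        in_width, _ = data.shape
        assert in_width % n_partitions == 0  # the HLS kernel assumes divisibility
        chunks = data.reshape(n_partitions, in_width // n_partitions, -1)
        # One "dense" call per partition, like the jj loop over pointwise_conv_1d_latency_cl
        outs = [chunk @ weights + biases for chunk in chunks]
        return np.concatenate(outs, axis=0)  # out_width == in_width for a pointwise conv

    # A pointwise conv acts independently per position, so any partitioning
    # reproduces the unpartitioned result:
    rng = np.random.default_rng(0)
    x, w, b = rng.normal(size=(32, 16)), rng.normal(size=(16, 8)), rng.normal(size=8)
    assert np.allclose(batched_dense_for_conv1d(x, w, b, 4), x @ w + b)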
diff --git a/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_latency.h b/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_latency.h
index e166cdd470..c027abc49e 100644
--- a/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_latency.h
+++ b/hls4ml/templates/vitis/nnet_utils/nnet_conv1d_latency.h
@@ -86,14 +86,14 @@ void conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan],
 }

 template <class data_T, class res_T, typename CONFIG_T>
-void pointwise_conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor],
-                                  res_T res[CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor],
+void pointwise_conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::n_partitions],
+                                  res_T res[CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::n_partitions],
                                   typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt],
                                   typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {
     assert(CONFIG_T::filt_width == 1);

-    typename CONFIG_T::accum_t mult[CONFIG_T::out_width * CONFIG_T::n_filt * CONFIG_T::n_chan / CONFIG_T::reuse_factor];
-    typename CONFIG_T::accum_t acc[CONFIG_T::out_width / CONFIG_T::reuse_factor][CONFIG_T::n_filt];
+    typename CONFIG_T::accum_t mult[CONFIG_T::out_width * CONFIG_T::n_filt * CONFIG_T::n_chan / CONFIG_T::n_partitions];
+    typename CONFIG_T::accum_t acc[CONFIG_T::out_width / CONFIG_T::n_partitions][CONFIG_T::n_filt];

     #pragma HLS ARRAY_PARTITION variable=mult complete dim=0
     #pragma HLS ARRAY_PARTITION variable=acc complete dim=0
@@ -111,7 +111,7 @@ void pointwise_conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_c

 // Convolve, saving all multiplication results to accumulate later
 ConvOut:
-    for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) {
+    for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::n_partitions; ii++) {
     ConvFilt:
         for (int ff = 0; ff < CONFIG_T::n_filt; ff++) {
         ConvChan:
@@ -133,7 +133,7 @@ void pointwise_conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_c
     } // end output loop

     // Initialize accumulator with input biases
-    for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) {
+    for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::n_partitions; ii++) {
         for (int ff = 0; ff < CONFIG_T::n_filt; ff++) {
             #pragma HLS UNROLL
             acc[ii][ff] = biases[ff];
@@ -142,7 +142,7 @@ void pointwise_conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_c

 // Accumulate multiplication result
 AccumOut:
-    for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) {
+    for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::n_partitions; ii++) {
     AccumFilt:
         for (int ff = 0; ff < CONFIG_T::n_filt; ff++) {
             // Do "dot product" sum within filter and sum over channels
@@ -155,7 +155,7 @@ void pointwise_conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_c
     } // end output loop

     // Cast to "res_t" type
-    for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) {
+    for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::n_partitions; ii++) {
         for (int ff = 0; ff < CONFIG_T::n_filt; ff++) {
             #pragma HLS UNROLL
             res[ii * CONFIG_T::n_filt + ff] = cast<data_T, res_T, CONFIG_T>(acc[ii][ff]);
diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h
index 72bce78067..8df6a993da 100644
--- a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h
+++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h
@@ -73,6 +73,44 @@ template <class data_T, class res_T, typename CONFIG_T> class Conv1DResource : p
     }
 };

+template <class data_T, class res_T, typename CONFIG_T>
+class BatchedDenseForConv1D : public nnet::Conv1DKernel<data_T, res_T, CONFIG_T> {
+  public:
+    static void conv(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], res_T res[CONFIG_T::out_width * CONFIG_T::n_filt],
+                     typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt],
+                     typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {
+
+        #pragma HLS PIPELINE II = CONFIG_T::reuse_factor * CONFIG_T::n_partitions
+        #pragma HLS INLINE RECURSIVE
+        data_T data_tmp[CONFIG_T::n_partitions][CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::n_partitions];
+        #pragma HLS ARRAY_PARTITION variable=data_tmp complete dim=0
+        res_T res_tmp[CONFIG_T::n_partitions][CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::n_partitions];
+        #pragma HLS ARRAY_PARTITION variable=res_tmp complete dim=0
+
+        for (int jj = 0; jj < CONFIG_T::n_partitions; jj++) {
+            #pragma HLS UNROLL
+            for (int ii = 0; ii < CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::n_partitions; ii++) {
+                #pragma HLS UNROLL
+                data_tmp[jj][ii] = data[jj * CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::n_partitions + ii];
+            }
+        }
+
+        #pragma HLS ALLOCATION operation instances=nnet::pointwise_conv_1d_latency_cl limit=1
+
+        for (int jj = 0; jj < CONFIG_T::n_partitions; jj++) {
+            nnet::pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[jj], res_tmp[jj], weights, biases);
+        }
+
+        for (int jj = 0; jj < CONFIG_T::n_partitions; jj++) {
+            #pragma HLS UNROLL
+            for (int ii = 0; ii < CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::n_partitions; ii++) {
+                #pragma HLS UNROLL
+                res[jj * CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::n_partitions + ii] = res_tmp[jj][ii];
+            }
+        }
+    }
+};
+
 } // namespace nnet

 #endif
diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h
index ef2f94dcaf..11a2230e43 100644
--- a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h
+++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d_latency.h
@@ -85,14 +85,14 @@ void conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan],
 }

 template <class data_T, class res_T, typename CONFIG_T>
-void pointwise_conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor],
-                                  res_T res[CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor],
+void pointwise_conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::n_partitions],
+                                  res_T res[CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::n_partitions],
                                   typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt],
                                   typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {
     assert(CONFIG_T::filt_width == 1);

-    typename CONFIG_T::accum_t mult[CONFIG_T::out_width * CONFIG_T::n_filt * CONFIG_T::n_chan / CONFIG_T::reuse_factor];
-    typename CONFIG_T::accum_t acc[CONFIG_T::out_width / CONFIG_T::reuse_factor][CONFIG_T::n_filt];
+    typename CONFIG_T::accum_t mult[CONFIG_T::out_width * CONFIG_T::n_filt * CONFIG_T::n_chan / CONFIG_T::n_partitions];
+    typename CONFIG_T::accum_t acc[CONFIG_T::out_width / CONFIG_T::n_partitions][CONFIG_T::n_filt];

     #pragma HLS ARRAY_PARTITION variable=mult complete dim=0
     #pragma HLS ARRAY_PARTITION variable=acc complete dim=0
@@ -110,7 +110,7 @@ void pointwise_conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_c

 // Convolve, saving all multiplication results to accumulate later
 ConvOut:
-    for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) {
+    for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::n_partitions; ii++) {
     ConvFilt:
         for (int ff = 0; ff < CONFIG_T::n_filt; ff++) {
         ConvChan:
@@ -132,7 +132,7 @@ void pointwise_conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_c
     } // end output loop

     // Initialize accumulator with input biases
-    for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) {
+    for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::n_partitions; ii++) {
         for (int ff = 0; ff < CONFIG_T::n_filt; ff++) {
             #pragma HLS UNROLL
             acc[ii][ff] = biases[ff];
@@ -141,7 +141,7 @@ void pointwise_conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_c

 // Accumulate multiplication result
 AccumOut:
-    for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) {
+    for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::n_partitions; ii++) {
     AccumFilt:
         for (int ff = 0; ff < CONFIG_T::n_filt; ff++) {
             // Do "dot product" sum within filter and sum over channels
@@ -154,7 +154,7 @@ void pointwise_conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_c
     } // end output loop

     // Cast to "res_t" type
-    for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) {
+    for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::n_partitions; ii++) {
         for (int ff = 0; ff < CONFIG_T::n_filt; ff++) {
             #pragma HLS UNROLL
             res[ii * CONFIG_T::n_filt + ff] = cast<data_T, res_T, CONFIG_T>(acc[ii][ff]);

From e88b856f50acf72763499f5512f94b9eb5ea5cab Mon Sep 17 00:00:00 2001
From: Chang Sun
Date: Mon, 8 Sep 2025 10:21:10 -0700
Subject: [PATCH 2/8] pointwise inherit all property

---
 hls4ml/backends/catapult/passes/pointwise.py | 2 +-
 hls4ml/backends/oneapi/passes/pointwise.py   | 2 +-
 hls4ml/backends/quartus/passes/pointwise.py  | 2 +-
 hls4ml/backends/vivado/passes/pointwise.py   | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/hls4ml/backends/catapult/passes/pointwise.py b/hls4ml/backends/catapult/passes/pointwise.py
index fd464ef172..bc6896d7f1 100755
--- a/hls4ml/backends/catapult/passes/pointwise.py
+++ b/hls4ml/backends/catapult/passes/pointwise.py
@@ -73,7 +73,7 @@ def match(self, node):

     def transform(self, model, node):
         dim = node.__class__.__name__[-2:]  # '1D' or '2D'
-        new_attrs = {k: v for k, v in node.attributes.items() if k not in ('trace', 'precision', 'reuse_factor')}
+        new_attrs = node.attributes.attributes
         pw_node = model.make_node(
             'PointwiseConv' + dim, node.name, new_attrs, node.inputs.copy(), outputs=node.outputs.copy()
         )
diff --git a/hls4ml/backends/oneapi/passes/pointwise.py b/hls4ml/backends/oneapi/passes/pointwise.py
index ccf410d1f6..0f84a7e444 100644
--- a/hls4ml/backends/oneapi/passes/pointwise.py
+++ b/hls4ml/backends/oneapi/passes/pointwise.py
@@ -147,7 +147,7 @@ def match(self, node):

     def transform(self, model, node):
         dim = node.__class__.__name__[-2:]  # '1D' or '2D'
-        new_attrs = {k: v for k, v in node.attributes.items() if k not in ('trace', 'precision', 'reuse_factor')}
+        new_attrs = node.attributes.attributes
         pw_node = model.make_node(
             'PointwiseConv' + dim, node.name, new_attrs, node.inputs.copy(), outputs=node.outputs.copy()
         )
diff --git a/hls4ml/backends/quartus/passes/pointwise.py b/hls4ml/backends/quartus/passes/pointwise.py
index d65ab22569..ed0dd7f723 100644
--- a/hls4ml/backends/quartus/passes/pointwise.py
+++ b/hls4ml/backends/quartus/passes/pointwise.py
@@ -79,7 +79,7 @@ def match(self, node):

     def transform(self, model, node):
         dim = node.__class__.__name__[-2:]  # '1D' or '2D'
-        new_attrs = {k: v for k, v in node.attributes.items() if k not in ('trace', 'precision', 'reuse_factor')}
+        new_attrs = node.attributes.attributes
         pw_node = model.make_node(
             'PointwiseConv' + dim, node.name, new_attrs, node.inputs.copy(), outputs=node.outputs.copy()
         )
diff --git a/hls4ml/backends/vivado/passes/pointwise.py b/hls4ml/backends/vivado/passes/pointwise.py
index 3815df00f1..969f4f1d27 100644
--- a/hls4ml/backends/vivado/passes/pointwise.py
+++ b/hls4ml/backends/vivado/passes/pointwise.py
@@ -77,7 +77,7 @@ def match(self, node):
     def transform(self, model, node):
         dim = node.__class__.__name__[-2:]  # '1D' or '2D'
         # to remove warning, since these get set again
-        new_attrs = {k: v for k, v in node.attributes.items() if k not in ('trace', 'precision', 'reuse_factor')}
+        new_attrs = node.attributes.attributes
         pw_node = model.make_node(
             'PointwiseConv' + dim, node.name, new_attrs, node.inputs.copy(), outputs=node.outputs.copy()
         )
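The behavioral change in this patch is easiest to see on a plain dict; a sketch with hypothetical attribute values (the filter tuple is the one removed above):

    attrs = {'n_chan': 16, 'n_filt': 8, 'trace': False, 'reuse_factor': 2}

    # Before: a filtered copy -- the new PointwiseConv node started without
    # these keys and relied on them being set again later.
    filtered = {k: v for k, v in attrs.items() if k not in ('trace', 'precision', 'reuse_factor')}
    assert 'reuse_factor' not in filtered

    # After: node.attributes.attributes hands over every attribute unchanged.
    # Note this is the same dict object, not a copy -- see the
    # "avoid unexpected shared state" patch below.
    inherited = attrs
    assert inherited['reuse_factor'] == 2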
From 7a0ee88a3cdead89ae86f0cce697a87b47e65d7b Mon Sep 17 00:00:00 2001
From: Chang Sun
Date: Mon, 8 Sep 2025 10:26:01 -0700
Subject: [PATCH 3/8] inline for parallel conv, pointwise only for npart=1
 (otherwise II will be too high)

---
 .../vivado/passes/convolution_templates.py    |  7 +++++--
 .../templates/vitis/nnet_utils/nnet_conv1d.h  | 20 +++++++++++++++----
 .../templates/vitis/nnet_utils/nnet_conv2d.h  | 17 +++++++++++++---
 3 files changed, 35 insertions(+), 9 deletions(-)

diff --git a/hls4ml/backends/vivado/passes/convolution_templates.py b/hls4ml/backends/vivado/passes/convolution_templates.py
index 1d255eeab2..058dd90b2f 100644
--- a/hls4ml/backends/vivado/passes/convolution_templates.py
+++ b/hls4ml/backends/vivado/passes/convolution_templates.py
@@ -101,7 +101,10 @@ def format(self, node):
             and node.get_attr('strategy').lower() == 'latency'
             and node.model.config.get_config_value('IOType') == 'io_parallel'
         )
-        if is_pointwise_parallel_latency:
+
+        n_partitions = node.attributes['n_partitions']
+
+        if is_pointwise_parallel_latency and n_partitions == 1:
             params['conv_fn'] = 'nnet::BatchedDenseForConv1D'
         else:
             if node.get_attr('strategy').lower() == 'latency':
@@ -115,7 +118,7 @@ def format(self, node):
         conv_config = self.template.format(**params)

         mult_params = self._default_config_params(node)
-        if is_pointwise_parallel_latency:
+        if is_pointwise_parallel_latency and n_partitions == 1:
             mult_params['n_in'] = (
                 node.get_attr('in_width')
                 * node.get_attr('n_chan')
diff --git a/hls4ml/templates/vitis/nnet_utils/nnet_conv1d.h b/hls4ml/templates/vitis/nnet_utils/nnet_conv1d.h
index 5dc580dc14..d575b998cd 100644
--- a/hls4ml/templates/vitis/nnet_utils/nnet_conv1d.h
+++ b/hls4ml/templates/vitis/nnet_utils/nnet_conv1d.h
@@ -37,7 +37,13 @@ void conv_1d_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], res_T res[CO
                 typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt],
                 typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {
     // Inlining helps reduce latency, but may also cause timing issues in some cases, use carefully.
-    //#pragma HLS INLINE recursive
+    // But without inlining Vitis HLS doesn't respect the parallelization factor config ¯\_(ツ)_/¯
+    // Vitis 2025.1 hangs in RTL simulation with this, though
+
+    #pragma HLS INLINE recursive
+
+    // #pragma HLS PIPELINE II = CONFIG_T::reuse_factor * CONFIG_T::n_partitions
+    // ↑ This makes II=2 for all n_partitions > 1, no matter what the actual II should be

     CONFIG_T::template conv_kernel<data_T, res_T, CONFIG_T>::conv(data, res, weights, biases);
 }
@@ -50,7 +56,13 @@ void pointwise_conv_1d_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan],
                           typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt],
                           typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {
    assert(CONFIG_T::filt_width == 1);

     // Inlining helps reduce latency, but may also cause timing issues in some cases, use carefully.
-    //#pragma HLS INLINE recursive
+    // But without inlining Vitis HLS doesn't respect the parallelization factor config ¯\_(ツ)_/¯
+    // #pragma HLS PIPELINE II = CONFIG_T::reuse_factor * CONFIG_T::n_partitions
+
+    #pragma HLS INLINE recursive
+
+    // #pragma HLS PIPELINE II = CONFIG_T::reuse_factor * CONFIG_T::n_partitions
+    // ↑ This makes II=2 for all n_partitions > 1, no matter what the actual II should be

     CONFIG_T::template conv_kernel<data_T, res_T, CONFIG_T>::conv(data, res, weights, biases);
 }
@@ -61,7 +73,7 @@ class Conv1DLatency : public nnet::Conv1DKernel<data_T, res_T, CONFIG_T> {
     static void conv(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], res_T res[CONFIG_T::out_width * CONFIG_T::n_filt],
                      typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt],
                      typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {
-        #pragma HLS INLINE recursive
+        // #pragma HLS INLINE recursive
         conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data, res, weights, biases);
     }
 };
@@ -72,7 +84,7 @@ class Conv1DResource : public nnet::Conv1DKernel<data_T, res_T, CONFIG_T> {
     static void conv(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], res_T res[CONFIG_T::out_width * CONFIG_T::n_filt],
                      typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt],
                      typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {
-        #pragma HLS INLINE recursive
+        // #pragma HLS INLINE recursive
         conv_1d_resource_cl<data_T, res_T, CONFIG_T>(data, res, weights, biases);
     }
 };
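The "II will be too high" in the subject line is just the product in the commented-out pragma; a back-of-envelope sketch with illustrative numbers:

    # The batched-dense kernel pipelines the whole conv as one region with
    # II = reuse_factor * n_partitions, so any n_partitions > 1 multiplies
    # the initiation interval -- hence the n_partitions == 1 guard above.
    reuse_factor = 2
    for n_partitions in (1, 2, 4):
        print(f'n_partitions={n_partitions}: II = {reuse_factor * n_partitions}')
    # n_partitions=1: II = 2
    # n_partitions=2: II = 4
    # n_partitions=4: II = 8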
diff --git a/hls4ml/templates/vitis/nnet_utils/nnet_conv2d.h b/hls4ml/templates/vitis/nnet_utils/nnet_conv2d.h
index 261e5cc379..dfba8028d1 100644
--- a/hls4ml/templates/vitis/nnet_utils/nnet_conv2d.h
+++ b/hls4ml/templates/vitis/nnet_utils/nnet_conv2d.h
@@ -45,7 +45,13 @@ void conv_2d_cl(
     typename CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt],
     typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {
     // Inlining helps reduce latency, but may also cause timing issues in some cases, use carefully.
-    //#pragma HLS INLINE recursive
+    // But without inlining Vitis HLS doesn't respect the parallelization factor config ¯\_(ツ)_/¯
+    // Vitis 2025.1 hangs in RTL simulation with this, though
+
+    #pragma HLS INLINE recursive
+
+    // #pragma HLS PIPELINE II = CONFIG_T::reuse_factor * CONFIG_T::n_partitions
+    // ↑ This makes II=2 for all n_partitions > 1, no matter what the actual II should be

     if (CONFIG_T::strategy == nnet::latency || CONFIG_T::strategy == nnet::distributed_arithmetic) {
         conv_2d_latency_cl<data_T, res_T, CONFIG_T>(data, res, weights, biases);
@@ -60,9 +66,14 @@ void pointwise_conv_2d_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width *
                           typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt],
                           typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {
     assert(CONFIG_T::filt_width == 1);
-    // Inlining helps reduce latency, but may also cause timing issues in some cases, use carefully.
-    //#pragma HLS INLINE recursive
+    // But without inlining Vitis HLS doesn't respect the parallelization factor config ¯\_(ツ)_/¯
+    // Vitis 2025.1 hangs in RTL simulation with this, though
+
+    #pragma HLS INLINE recursive
+
+    // #pragma HLS PIPELINE II = CONFIG_T::reuse_factor * CONFIG_T::n_partitions
+    // ↑ This makes II=2 for all n_partitions > 1, no matter what the actual II should be

     // Nothing special to be done for io_parallel implementation
     if (CONFIG_T::strategy == nnet::latency || CONFIG_T::strategy == nnet::distributed_arithmetic) {

From d077b7224df41b64a36e2b18774078c3cd057e99 Mon Sep 17 00:00:00 2001
From: Chang Sun
Date: Mon, 8 Sep 2025 11:43:02 -0700
Subject: [PATCH 4/8] avoid unexpected shared state

---
 hls4ml/backends/catapult/passes/pointwise.py | 2 +-
 hls4ml/backends/oneapi/passes/pointwise.py   | 2 +-
 hls4ml/backends/quartus/passes/pointwise.py  | 2 +-
 hls4ml/backends/vivado/passes/pointwise.py   | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/hls4ml/backends/catapult/passes/pointwise.py b/hls4ml/backends/catapult/passes/pointwise.py
index bc6896d7f1..7a8044f9e7 100755
--- a/hls4ml/backends/catapult/passes/pointwise.py
+++ b/hls4ml/backends/catapult/passes/pointwise.py
@@ -73,7 +73,7 @@ def match(self, node):

     def transform(self, model, node):
         dim = node.__class__.__name__[-2:]  # '1D' or '2D'
-        new_attrs = node.attributes.attributes
+        new_attrs = node.attributes.attributes.copy()
         pw_node = model.make_node(
             'PointwiseConv' + dim, node.name, new_attrs, node.inputs.copy(), outputs=node.outputs.copy()
         )
diff --git a/hls4ml/backends/oneapi/passes/pointwise.py b/hls4ml/backends/oneapi/passes/pointwise.py
index 0f84a7e444..84fc242070 100644
--- a/hls4ml/backends/oneapi/passes/pointwise.py
+++ b/hls4ml/backends/oneapi/passes/pointwise.py
@@ -147,7 +147,7 @@ def match(self, node):

     def transform(self, model, node):
         dim = node.__class__.__name__[-2:]  # '1D' or '2D'
-        new_attrs = node.attributes.attributes
+        new_attrs = node.attributes.attributes.copy()
         pw_node = model.make_node(
             'PointwiseConv' + dim, node.name, new_attrs, node.inputs.copy(), outputs=node.outputs.copy()
         )
diff --git a/hls4ml/backends/quartus/passes/pointwise.py b/hls4ml/backends/quartus/passes/pointwise.py
index ed0dd7f723..8f1ef49a52 100644
--- a/hls4ml/backends/quartus/passes/pointwise.py
+++ b/hls4ml/backends/quartus/passes/pointwise.py
@@ -79,7 +79,7 @@ def match(self, node):

     def transform(self, model, node):
         dim = node.__class__.__name__[-2:]  # '1D' or '2D'
-        new_attrs = node.attributes.attributes
+        new_attrs = node.attributes.attributes.copy()
         pw_node = model.make_node(
             'PointwiseConv' + dim, node.name, new_attrs, node.inputs.copy(), outputs=node.outputs.copy()
         )
diff --git a/hls4ml/backends/vivado/passes/pointwise.py b/hls4ml/backends/vivado/passes/pointwise.py
index 969f4f1d27..aca323a980 100644
--- a/hls4ml/backends/vivado/passes/pointwise.py
+++ b/hls4ml/backends/vivado/passes/pointwise.py
@@ -77,7 +77,7 @@ def match(self, node):
     def transform(self, model, node):
         dim = node.__class__.__name__[-2:]  # '1D' or '2D'
         # to remove warning, since these get set again
-        new_attrs = node.attributes.attributes
+        new_attrs = node.attributes.attributes.copy()
         pw_node = model.make_node(
             'PointwiseConv' + dim, node.name, new_attrs, node.inputs.copy(), outputs=node.outputs.copy()
         )
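Why the .copy() matters; a standalone sketch of the aliasing the subject line refers to:

    # Passing node.attributes.attributes through make_node without a copy
    # makes the old and new node share one dict:
    old_node_attrs = {'n_filt': 8, 'reuse_factor': 2}

    shared = old_node_attrs          # previous behavior: same object
    shared['reuse_factor'] = 4       # mutate the new node's attributes...
    assert old_node_attrs['reuse_factor'] == 4  # ...and the old node changes too

    independent = old_node_attrs.copy()  # this patch: shallow copy
    independent['reuse_factor'] = 2
    assert old_node_attrs['reuse_factor'] == 4  # old node now untouched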
From f349e41c62cdb0183db16b2e3a4c51711de9a0fd Mon Sep 17 00:00:00 2001
From: Chang Sun
Date: Tue, 9 Sep 2025 05:24:56 -0700
Subject: [PATCH 5/8] rm useless warning

---
 hls4ml/backends/vivado/vivado_backend.py |  6 ------
 hls4ml/model/layers.py                   | 14 ++++++++------
 2 files changed, 8 insertions(+), 12 deletions(-)

diff --git a/hls4ml/backends/vivado/vivado_backend.py b/hls4ml/backends/vivado/vivado_backend.py
index fcde896b38..a937aca672 100644
--- a/hls4ml/backends/vivado/vivado_backend.py
+++ b/hls4ml/backends/vivado/vivado_backend.py
@@ -414,12 +414,6 @@ def init_conv1d(self, layer):
         user_pf = layer.model.config.get_layer_config_value(layer, 'ParallelizationFactor', None)
         layer_pf = layer.get_attr('parallelization_factor', None)
         chosen_pf = user_pf or layer_pf or 1
-        if user_pf is not None and layer_pf is not None:
-            if user_pf != layer_pf:
-                warn(
-                    f'Parallelization factor of {layer_pf} embedded in layer {layer.name} is overridden by user config to {user_pf}.'  # noqa: E501
-                )
-
         valid_pf = self.get_valid_conv_partition_splits(1, out_width)
diff --git a/hls4ml/model/layers.py b/hls4ml/model/layers.py
index a47dca285a..2931b61ca8 100644
--- a/hls4ml/model/layers.py
+++ b/hls4ml/model/layers.py
@@ -1,5 +1,6 @@
 import typing
 from copy import copy
+from warnings import warn

 import numpy as np

@@ -111,17 +112,18 @@ def __init__(self, model, name, attributes, inputs, outputs=None, initialize=Tru
             layer_config = self.model.config.get_layer_config(self)
             for config_key, config_value in layer_config.items():
                 config_key = convert_to_snake_case(config_key)
-                if config_key in self.attributes:
-                    print(
-                        'WARNING: Config parameter "{}" overwrites an existing attribute in layer "{}" ({})'.format(
-                            config_key, self.name, self.class_name
-                        )
-                    )
                 if config_key.endswith('_t') and isinstance(
                     config_value, str
                 ):  # TODO maybe move this to __setitem__ of AttributeDict?
                     precision = self.model.config.backend.convert_precision_string(config_value)
                     config_value = NamedType(self.name + '_' + config_key, precision)
+                if (old_value := self.attributes.get(config_key, config_value)) != config_value:
+                    warn(
+                        f"Overriding attribute '{config_key}' of layer '{self.name}' ({self.class_name}): "
+                        f"{old_value} -> {config_value}",
+                        UserWarning,
+                        stacklevel=3,
+                    )
                 self.attributes[config_key] = config_value

         self.initialize()
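The replacement warning fires only on a real value change; a standalone sketch of the same pattern, with a plain dict standing in for the layer's attributes (function and variable names are hypothetical):

    from warnings import warn

    def apply_config_value(attributes, key, value, name='conv1', class_name='Conv1D'):
        # Same shape as the Layer.__init__ change: warn only when an existing
        # attribute is actually overridden, not when it is first set or re-set
        # to an equal value.
        if (old_value := attributes.get(key, value)) != value:
            warn(
                f"Overriding attribute '{key}' of layer '{name}' ({class_name}): "
                f"{old_value} -> {value}",
                UserWarning,
                stacklevel=2,
            )
        attributes[key] = value

    attrs = {'reuse_factor': 2}
    apply_config_value(attrs, 'reuse_factor', 2)   # silent: equal value
    apply_config_value(attrs, 'n_partitions', 1)   # silent: new key
    apply_config_value(attrs, 'reuse_factor', 4)   # warns: 2 -> 4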
From 39d416bc84cbfd5faa1a1fb16d5769b4eeafe6c3 Mon Sep 17 00:00:00 2001
From: Chang Sun
Date: Tue, 9 Sep 2025 06:05:57 -0700
Subject: [PATCH 6/8] rm line

---
 hls4ml/templates/vitis/nnet_utils/nnet_conv1d.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/hls4ml/templates/vitis/nnet_utils/nnet_conv1d.h b/hls4ml/templates/vitis/nnet_utils/nnet_conv1d.h
index d575b998cd..f88985aaef 100644
--- a/hls4ml/templates/vitis/nnet_utils/nnet_conv1d.h
+++ b/hls4ml/templates/vitis/nnet_utils/nnet_conv1d.h
@@ -57,7 +57,6 @@ void pointwise_conv_1d_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan],

     // Inlining helps reduce latency, but may also cause timing issues in some cases, use carefully.
     // But without inlining Vitis HLS doesn't respect the parallelization factor config ¯\_(ツ)_/¯
-    // #pragma HLS PIPELINE II = CONFIG_T::reuse_factor * CONFIG_T::n_partitions

     #pragma HLS INLINE recursive


From 7a6f1db6039eb0cddb28f0e658899b25f1ad957a Mon Sep 17 00:00:00 2001
From: Chang Sun
Date: Tue, 9 Sep 2025 06:08:54 -0700
Subject: [PATCH 7/8] .

---
 hls4ml/backends/vivado/passes/convolution_templates.py | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/hls4ml/backends/vivado/passes/convolution_templates.py b/hls4ml/backends/vivado/passes/convolution_templates.py
index 058dd90b2f..08ab1e1228 100644
--- a/hls4ml/backends/vivado/passes/convolution_templates.py
+++ b/hls4ml/backends/vivado/passes/convolution_templates.py
@@ -120,12 +120,9 @@ def format(self, node):
         mult_params = self._default_config_params(node)
         if is_pointwise_parallel_latency and n_partitions == 1:
             mult_params['n_in'] = (
-                node.get_attr('in_width')
-                * node.get_attr('n_chan')
-                * node.get_attr('filt_width')
-                // mult_params['n_partitions']
+                node.get_attr('in_width') * node.get_attr('n_chan') * node.get_attr('filt_width') // n_partitions
             )
-            mult_params['n_out'] = node.get_attr('in_width') * node.get_attr('n_filt') // mult_params['n_partitions']
+            mult_params['n_out'] = node.get_attr('in_width') * node.get_attr('n_filt') // n_partitions
         else:
             mult_params['n_in'] = node.get_attr('n_chan') * node.get_attr('filt_width')
             mult_params['n_out'] = node.get_attr('n_filt')

From e530e1877a076bac92becbcc3f45f25de031f702 Mon Sep 17 00:00:00 2001
From: Chang Sun
Date: Tue, 23 Sep 2025 05:59:41 -0700
Subject: [PATCH 8/8] fix pragma

---
 hls4ml/templates/vitis/nnet_utils/nnet_conv1d.h  | 2 +-
 hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/hls4ml/templates/vitis/nnet_utils/nnet_conv1d.h b/hls4ml/templates/vitis/nnet_utils/nnet_conv1d.h
index f88985aaef..4f76c030d3 100644
--- a/hls4ml/templates/vitis/nnet_utils/nnet_conv1d.h
+++ b/hls4ml/templates/vitis/nnet_utils/nnet_conv1d.h
@@ -110,7 +110,7 @@ class BatchedDenseForConv1D : public nnet::Conv1DKernel<data_T, res_T, CONFIG_T>
             }
         }

-        #pragma HLS ALLOCATION operation instances=nnet::pointwise_conv_1d_latency_cl limit=1
+        #pragma HLS ALLOCATION function instances=nnet::pointwise_conv_1d_latency_cl limit=1

         for (int jj = 0; jj < CONFIG_T::n_partitions; jj++) {
             nnet::pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[jj], res_tmp[jj], weights, biases);
diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h
index 8df6a993da..63151d6745 100644
--- a/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h
+++ b/hls4ml/templates/vivado/nnet_utils/nnet_conv1d.h
@@ -95,7 +95,7 @@ class BatchedDenseForConv1D : public nnet::Conv1DKernel<data_T, res_T, CONFIG_T>
             }
         }

-        #pragma HLS ALLOCATION operation instances=nnet::pointwise_conv_1d_latency_cl limit=1
+        #pragma HLS ALLOCATION function instances=nnet::pointwise_conv_1d_latency_cl limit=1

         for (int jj = 0; jj < CONFIG_T::n_partitions; jj++) {
             nnet::pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[jj], res_tmp[jj], weights, biases);