2 changes: 1 addition & 1 deletion hls4ml/backends/catapult/passes/pointwise.py
@@ -73,7 +73,7 @@ def match(self, node):

def transform(self, model, node):
dim = node.__class__.__name__[-2:] # '1D' or '2D'
new_attrs = {k: v for k, v in node.attributes.items() if k not in ('trace', 'precision', 'reuse_factor')}
new_attrs = node.attributes.attributes.copy()
Contributor:
I mentioned this in a comment in the conversation. I think trace, precision, and reuse_factor are regenerated no matter what, so the values you copy here get overridden (unless something has changed from before). It may be an hls4ml behavior worth revisiting and potentially revising, but I don't think this change fixes anything.

Contributor Author:
A reuse_factor defined under the Model key is not propagated as expected otherwise. I also removed the warning when the update operation is trivial.

Contributor:
Did you see why it's not propagated properly? Shouldn't it come from a configuration in the first place?
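
For reference, a minimal standalone sketch (plain Python, not hls4ml code; attribute names are illustrative) contrasting the old filtered copy with the new full copy discussed in this thread:

class FakeAttributeDict(dict):
    """Stand-in for hls4ml's attribute container; real nodes expose node.attributes.attributes."""

attrs = FakeAttributeDict(trace=False, reuse_factor=4, n_filt=8)

# Old behaviour: keys that later passes regenerate were dropped to silence override warnings,
# which also dropped a reuse_factor configured under the Model key.
filtered = {k: v for k, v in attrs.items() if k not in ('trace', 'precision', 'reuse_factor')}

# New behaviour: copy every attribute, so the configured reuse_factor reaches the
# replacement PointwiseConv node; the override warning now fires only on real changes.
copied = attrs.copy()

print(sorted(filtered))  # ['n_filt']
print(sorted(copied))    # ['n_filt', 'reuse_factor', 'trace']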

pw_node = model.make_node(
'PointwiseConv' + dim, node.name, new_attrs, node.inputs.copy(), outputs=node.outputs.copy()
)
2 changes: 1 addition & 1 deletion hls4ml/backends/oneapi/passes/pointwise.py
@@ -147,7 +147,7 @@ def match(self, node):

def transform(self, model, node):
dim = node.__class__.__name__[-2:] # '1D' or '2D'
new_attrs = {k: v for k, v in node.attributes.items() if k not in ('trace', 'precision', 'reuse_factor')}
new_attrs = node.attributes.attributes.copy()
pw_node = model.make_node(
'PointwiseConv' + dim, node.name, new_attrs, node.inputs.copy(), outputs=node.outputs.copy()
)
2 changes: 1 addition & 1 deletion hls4ml/backends/quartus/passes/pointwise.py
@@ -79,7 +79,7 @@ def match(self, node):

def transform(self, model, node):
dim = node.__class__.__name__[-2:] # '1D' or '2D'
new_attrs = {k: v for k, v in node.attributes.items() if k not in ('trace', 'precision', 'reuse_factor')}
new_attrs = node.attributes.attributes.copy()
pw_node = model.make_node(
'PointwiseConv' + dim, node.name, new_attrs, node.inputs.copy(), outputs=node.outputs.copy()
)
15 changes: 9 additions & 6 deletions hls4ml/backends/vivado/passes/convolution_templates.py
@@ -101,8 +101,11 @@ def format(self, node):
and node.get_attr('strategy').lower() == 'latency'
and node.model.config.get_config_value('IOType') == 'io_parallel'
)
if is_pointwise_parallel_latency:
params['conv_fn'] = f'{namespace}::pointwise_conv_{node.index}'

n_partitions = node.attributes['n_partitions']

if is_pointwise_parallel_latency and n_partitions == 1:
params['conv_fn'] = 'nnet::BatchedDenseForConv1D'
else:
if node.get_attr('strategy').lower() == 'latency':
params['conv_fn'] = 'nnet::Conv1DLatency'
Expand All @@ -115,11 +118,11 @@ def format(self, node):
conv_config = self.template.format(**params)

mult_params = self._default_config_params(node)
if is_pointwise_parallel_latency:
mult_params['n_in'] = int(
node.get_attr('in_width') * node.get_attr('n_chan') * node.get_attr('filt_width') / mult_params['reuse']
if is_pointwise_parallel_latency and n_partitions == 1:
mult_params['n_in'] = (
node.get_attr('in_width') * node.get_attr('n_chan') * node.get_attr('filt_width') // n_partitions
)
mult_params['n_out'] = int(node.get_attr('in_width') * node.get_attr('n_filt') / mult_params['reuse'])
mult_params['n_out'] = node.get_attr('in_width') * node.get_attr('n_filt') // n_partitions
else:
mult_params['n_in'] = node.get_attr('n_chan') * node.get_attr('filt_width')
mult_params['n_out'] = node.get_attr('n_filt')
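
For concreteness, a small standalone sketch (hypothetical layer sizes) contrasting the old reuse-based sizing with the new partition-based sizing above; note that the new branch is only taken when n_partitions == 1, so its divisions are trivial there:

in_width, n_chan, filt_width, n_filt = 32, 16, 1, 64
reuse, n_partitions = 4, 1

old_n_in = int(in_width * n_chan * filt_width / reuse)     # 128
old_n_out = int(in_width * n_filt / reuse)                 # 512

new_n_in = in_width * n_chan * filt_width // n_partitions  # 512
new_n_out = in_width * n_filt // n_partitions              # 2048

print(old_n_in, old_n_out, new_n_in, new_n_out)
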
2 changes: 1 addition & 1 deletion hls4ml/backends/vivado/passes/pointwise.py
@@ -77,7 +77,7 @@ def match(self, node):
def transform(self, model, node):
dim = node.__class__.__name__[-2:] # '1D' or '2D'
# to remove warning, since these get set again
new_attrs = {k: v for k, v in node.attributes.items() if k not in ('trace', 'precision', 'reuse_factor')}
new_attrs = node.attributes.attributes.copy()
pw_node = model.make_node(
'PointwiseConv' + dim, node.name, new_attrs, node.inputs.copy(), outputs=node.outputs.copy()
)
84 changes: 0 additions & 84 deletions hls4ml/backends/vivado/passes/pointwise_codegen.py

This file was deleted.

7 changes: 0 additions & 7 deletions hls4ml/backends/vivado/vivado_backend.py
@@ -183,7 +183,6 @@ def _register_flows(self):
'vivado:generate_conv_streaming_instructions',
'vivado:apply_resource_strategy',
'vivado:generate_conv_im2col',
'vivado:generate_pointwise_conv1_d',
'vivado:generate_unrolled_dense_resource',
'vivado:set_pipeline_style',
'vivado:d_a_latency_dense_template',
@@ -415,12 +414,6 @@ def init_conv1d(self, layer):
user_pf = layer.model.config.get_layer_config_value(layer, 'ParallelizationFactor', None)
layer_pf = layer.get_attr('parallelization_factor', None)
chosen_pf = user_pf or layer_pf or 1
if user_pf is not None and layer_pf is not None:
if user_pf != layer_pf:
warn(
f'For layer {layer.name}, parallelization factor of {layer_pf} is defined in the proxy-model, but is overridden by the user to {user_pf}.' # noqa: E501
)

valid_pf = self.get_valid_conv_partition_splits(1, out_width)
if chosen_pf not in valid_pf:
closest_pf = self.get_closest_reuse_factor(valid_pf, chosen_pf)
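
For reference, a tiny standalone sketch (hypothetical values) of the precedence implemented by chosen_pf = user_pf or layer_pf or 1; the conflict warning deleted above used to fire in the last case:

def choose_pf(user_pf, layer_pf):
    # Same precedence as init_conv1d: user config first, then the layer attribute, then 1.
    return user_pf or layer_pf or 1

print(choose_pf(None, None))  # 1 -> default
print(choose_pf(None, 8))     # 8 -> parallelization_factor attribute (e.g. from a proxy model)
print(choose_pf(4, 8))        # 4 -> user config wins silently now; previously a warning was printed
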
14 changes: 8 additions & 6 deletions hls4ml/model/layers.py
@@ -1,5 +1,6 @@
import typing
from copy import copy
from warnings import warn

import numpy as np

@@ -111,17 +112,18 @@ def __init__(self, model, name, attributes, inputs, outputs=None, initialize=Tru
layer_config = self.model.config.get_layer_config(self)
for config_key, config_value in layer_config.items():
config_key = convert_to_snake_case(config_key)
if config_key in self.attributes:
print(
'WARNING: Config parameter "{}" overwrites an existing attribute in layer "{}" ({})'.format(
config_key, self.name, self.class_name
)
)
if config_key.endswith('_t') and isinstance(
config_value, str
): # TODO maybe move this to __setitem__ of AttributeDict?
precision = self.model.config.backend.convert_precision_string(config_value)
config_value = NamedType(self.name + '_' + config_key, precision)
if (old_value := self.attributes.get(config_key, config_value)) != config_value:
warn(
f"Overriding attribute '{config_key}' of layer '{self.name}' ({self.class_name}):"
f"{old_value} -> {config_value}",
UserWarning,
stacklevel=3,
)
self.attributes[config_key] = config_value

self.initialize()
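
A minimal standalone sketch (plain Python, illustrative keys) of the new behaviour: the warning now fires only when a config value actually changes an existing attribute, instead of whenever the key is already present:

from warnings import warn

attributes = {'reuse_factor': 1}
layer_config = {'reuse_factor': 1, 'strategy': 'Latency'}

for config_key, config_value in layer_config.items():
    if (old_value := attributes.get(config_key, config_value)) != config_value:
        warn(f"Overriding attribute '{config_key}': {old_value} -> {config_value}", UserWarning)
    attributes[config_key] = config_value

# Nothing is emitted here: reuse_factor is unchanged and strategy is a new key.
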
57 changes: 53 additions & 4 deletions hls4ml/templates/vitis/nnet_utils/nnet_conv1d.h
@@ -37,7 +37,13 @@ void conv_1d_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], res_T res[CO
typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt],
typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {
// Inlining helps reduce latency, but may also cause timing issues in some cases, use carefully.
//#pragma HLS INLINE recursive
// But without inlining Vitis HLS doesn't respect the parallelization factor config ¯\_(ツ)_/¯
// Vitis 2025.1 hangs in RTL simulation with this, though

#pragma HLS INLINE recursive

// #pragma HLS PIPELINE II = CONFIG_T::reuse_factor * CONFIG_T::n_partitions
// ↑ This makes II=2 for all n_partitions > 1, no matter what the actual II should be

CONFIG_T::template conv_kernel<data_T, res_T, CONFIG_T>::conv(data, res, weights, biases);
}
@@ -50,7 +56,12 @@ void pointwise_conv_1d_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan],
assert(CONFIG_T::filt_width == 1);

// Inlining helps reduce latency, but may also cause timing issues in some cases, use carefully.
//#pragma HLS INLINE recursive
// But without inlining Vitis HLS doesn't respect the parallelization factor config ¯\_(ツ)_/¯

#pragma HLS INLINE recursive

// #pragma HLS PIPELINE II = CONFIG_T::reuse_factor * CONFIG_T::n_partitions
// ↑ This makes II=2 for all n_partitions > 1, no matter what the actual II should be

CONFIG_T::template conv_kernel<data_T, res_T, CONFIG_T>::conv(data, res, weights, biases);
}
@@ -61,7 +72,7 @@ class Conv1DLatency : public nnet::Conv1DKernel<data_T, res_T, CONFIG_T> {
static void conv(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], res_T res[CONFIG_T::out_width * CONFIG_T::n_filt],
typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt],
typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {
//#pragma HLS INLINE region
// #pragma HLS INLINE recursive
conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data, res, weights, biases);
}
};
@@ -72,11 +83,49 @@ class Conv1DResource : public nnet::Conv1DKernel<data_T, res_T, CONFIG_T> {
static void conv(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], res_T res[CONFIG_T::out_width * CONFIG_T::n_filt],
typename CONFIG_T::weight_t weights[CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt],
typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {
//#pragma HLS INLINE region
// #pragma HLS INLINE recursive
conv_1d_resource_cl<data_T, res_T, CONFIG_T>(data, res, weights, biases);
}
};

template <class data_T, class res_T, typename CONFIG_T>
class BatchedDenseForConv1D : public nnet::Conv1DKernel<data_T, res_T, CONFIG_T> {
Contributor @jmitrevs (Sep 8, 2025):

Can we add a comment to say the purpose of this code (and also for the 1D version)?

Contributor Author:
The PR allows modifications from contributors. Feel free to add one if you find it necessary.

public:
static void conv(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan], res_T res[CONFIG_T::out_width * CONFIG_T::n_filt],
typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt],
typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {

#pragma HLS PIPELINE II = CONFIG_T::reuse_factor * CONFIG_T::n_partitions
#pragma HLS INLINE RECURSIVE
data_T data_tmp[CONFIG_T::n_partitions][CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::n_partitions];
#pragma HLS ARRAY_PARTITION variable=data_tmp complete dim=0
res_T res_tmp[CONFIG_T::n_partitions][CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::n_partitions];
#pragma HLS ARRAY_PARTITION variable=res_tmp complete dim=0

for (int jj = 0; jj < CONFIG_T::n_partitions; jj++) {
#pragma HLS UNROLL
for (int ii = 0; ii < CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::n_partitions; ii++) {
#pragma HLS UNROLL
data_tmp[jj][ii] = data[jj * CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::n_partitions + ii];
}
}

#pragma HLS ALLOCATION operation instances=nnet::pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T> limit=1

for (int jj = 0; jj < CONFIG_T::n_partitions; jj++) {
nnet::pointwise_conv_1d_latency_cl<data_T, res_T, CONFIG_T>(data_tmp[jj], res_tmp[jj], weights, biases);
}

for (int jj = 0; jj < CONFIG_T::n_partitions; jj++) {
#pragma HLS UNROLL
for (int ii = 0; ii < CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::n_partitions; ii++) {
#pragma HLS UNROLL
res[jj * CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::n_partitions + ii] = res_tmp[jj][ii];
}
}
}
};
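
A conceptual NumPy sketch (not HLS code; sizes are arbitrary) of what BatchedDenseForConv1D computes: the flattened width-by-channel input is split into n_partitions contiguous chunks, the same pointwise (1x1) kernel is applied to each chunk, and the per-partition results are concatenated:

import numpy as np

in_width, n_chan, n_filt, n_partitions = 8, 4, 3, 2

data = np.random.rand(in_width * n_chan)
weights = np.random.rand(n_chan, n_filt)
biases = np.random.rand(n_filt)

# Scatter: split the flattened input into n_partitions chunks of in_width / n_partitions positions.
chunks = data.reshape(n_partitions, in_width // n_partitions, n_chan)

# Compute: run the same pointwise kernel on every chunk, mirroring the single shared
# pointwise_conv_1d_latency_cl instance enforced by the ALLOCATION pragma above.
partial = [chunk @ weights + biases for chunk in chunks]

# Gather: concatenate the per-partition results back into the full output.
res = np.concatenate([p.ravel() for p in partial])

# Reference: an unpartitioned pointwise convolution gives the same answer.
ref = (data.reshape(in_width, n_chan) @ weights + biases).ravel()
assert np.allclose(res, ref)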

} // namespace nnet

#endif
16 changes: 8 additions & 8 deletions hls4ml/templates/vitis/nnet_utils/nnet_conv1d_latency.h
@@ -86,14 +86,14 @@ void conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan],
}

template <class data_T, class res_T, typename CONFIG_T>
void pointwise_conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::reuse_factor],
res_T res[CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::reuse_factor],
void pointwise_conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_chan / CONFIG_T::n_partitions],
res_T res[CONFIG_T::out_width * CONFIG_T::n_filt / CONFIG_T::n_partitions],
typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt],
typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {
assert(CONFIG_T::filt_width == 1);

typename CONFIG_T::accum_t mult[CONFIG_T::out_width * CONFIG_T::n_filt * CONFIG_T::n_chan / CONFIG_T::reuse_factor];
typename CONFIG_T::accum_t acc[CONFIG_T::out_width / CONFIG_T::reuse_factor][CONFIG_T::n_filt];
typename CONFIG_T::accum_t mult[CONFIG_T::out_width * CONFIG_T::n_filt * CONFIG_T::n_chan / CONFIG_T::n_partitions];
typename CONFIG_T::accum_t acc[CONFIG_T::out_width / CONFIG_T::n_partitions][CONFIG_T::n_filt];

#pragma HLS ARRAY_PARTITION variable=mult complete dim=0
#pragma HLS ARRAY_PARTITION variable=acc complete dim=0
@@ -111,7 +111,7 @@ void pointwise_conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_c

// Convolve, saving all multiplication results to accumulate later
ConvOut:
for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) {
for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::n_partitions; ii++) {
ConvFilt:
for (int ff = 0; ff < CONFIG_T::n_filt; ff++) {
ConvChan:
@@ -133,7 +133,7 @@ void pointwise_conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_c
} // end output loop

// Initialize accumulator with input biases
for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) {
for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::n_partitions; ii++) {
for (int ff = 0; ff < CONFIG_T::n_filt; ff++) {
#pragma HLS UNROLL
acc[ii][ff] = biases[ff];
@@ -142,7 +142,7 @@ void pointwise_conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_c

// Accumulate multiplication result
AccumOut:
for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) {
for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::n_partitions; ii++) {
AccumFilt:
for (int ff = 0; ff < CONFIG_T::n_filt; ff++) {
// Do "dot product" sum within filter and sum over channels
@@ -155,7 +155,7 @@ void pointwise_conv_1d_latency_cl(data_T data[CONFIG_T::in_width * CONFIG_T::n_c
} // end output loop

// Cast to "res_t" type
for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::reuse_factor; ii++) {
for (int ii = 0; ii < CONFIG_T::out_width / CONFIG_T::n_partitions; ii++) {
for (int ff = 0; ff < CONFIG_T::n_filt; ff++) {
#pragma HLS UNROLL
res[ii * CONFIG_T::n_filt + ff] = cast<data_T, res_T, typename CONFIG_T::mult_config>(acc[ii][ff]);
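
A quick standalone check (hypothetical sizes) of the per-partition buffer sizes implied by the reuse_factor to n_partitions change above:

in_width = out_width = 32
n_chan, n_filt, n_partitions = 16, 64, 4

data_len = in_width * n_chan // n_partitions            # 128 inputs handled per call
mult_len = out_width * n_filt * n_chan // n_partitions  # 8192 partial products
acc_shape = (out_width // n_partitions, n_filt)         # (8, 64) accumulators
res_len = out_width * n_filt // n_partitions            # 512 outputs per call

print(data_len, mult_len, acc_shape, res_len)
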
17 changes: 14 additions & 3 deletions hls4ml/templates/vitis/nnet_utils/nnet_conv2d.h
@@ -45,7 +45,13 @@ void conv_2d_cl(
typename CONFIG_T::weight_t weights[CONFIG_T::filt_height * CONFIG_T::filt_width * CONFIG_T::n_chan * CONFIG_T::n_filt],
typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {
// Inlining helps reduce latency, but may also cause timing issues in some cases, use carefully.
//#pragma HLS INLINE recursive
// But without inlining Vitis HLS doesn't respect the parallelization factor config ¯\_(ツ)_/¯
// Vitis 2025.1 hangs in RTL simulation with this, though

#pragma HLS INLINE recursive

// #pragma HLS PIPELINE II = CONFIG_T::reuse_factor * CONFIG_T::n_partitions
// ↑ This makes II=2 for all n_partitions > 1, no matter what the actual II should be

if (CONFIG_T::strategy == nnet::latency || CONFIG_T::strategy == nnet::distributed_arithmetic) {
conv_2d_latency_cl<data_T, res_T, CONFIG_T>(data, res, weights, biases);
@@ -60,9 +66,14 @@ void pointwise_conv_2d_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width *
typename CONFIG_T::weight_t weights[CONFIG_T::n_chan * CONFIG_T::n_filt],
typename CONFIG_T::bias_t biases[CONFIG_T::n_filt]) {
assert(CONFIG_T::filt_width == 1);

// Inlining helps reduce latency, but may also cause timing issues in some cases, use carefully.
//#pragma HLS INLINE recursive
// But without inlining Vitis HLS doesn't respect the parallelization factor config ¯\_(ツ)_/¯
// Vitis 2025.1 hangs in RTL simulation with this, though

#pragma HLS INLINE recursive

// #pragma HLS PIPELINE II = CONFIG_T::reuse_factor * CONFIG_T::n_partitions
// ↑ This makes II=2 for all n_partitions > 1, no matter what the actual II should be

// Nothing special to be done for io_parallel implementation
if (CONFIG_T::strategy == nnet::latency || CONFIG_T::strategy == nnet::distributed_arithmetic) {