+import numpy as np
+
+from hls4ml.backends import Backend
+from hls4ml.backends.template import FunctionCallTemplate
+from hls4ml.model.layers import Layer
+from hls4ml.model.optimizer import OptimizerPass
+from hls4ml.model.optimizer.passes.hgq_proxy_model import FixedPointQuantizer, UnaryLUT
+from hls4ml.model.types import Source
+
+
+def to_apfixed(k, b, i, RND, SAT):
+    """Build a Xilinx HLS ap_[u]fixed<W,I,RND,SAT> type string (k is the sign bit)."""
+    u = 'u' if k == 0 else ''
+    return f'ap_{u}fixed<{b},{i},AP_{RND},AP_{SAT}>'
+
+
+def to_acfixed(k, b, i, RND, SAT):
+    """Build an Intel/Quartus ac_fixed<W,I,signed,RND,SAT> type string (k is the sign bit)."""
+    k = 'false' if k == 0 else 'true'
+    return f'ac_fixed<{b},{i},{k},AC_{RND},AC_{SAT}>'
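+
+# For example:
+#   to_apfixed(1, 8, 3, 'TRN', 'WRAP') -> 'ap_fixed<8,3,AP_TRN,AP_WRAP>'
+#   to_acfixed(0, 8, 3, 'RND', 'SAT')  -> 'ac_fixed<8,3,false,AC_RND,AC_SAT>'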
+
+
+def generate_mask_fn(
+    name: str, shape: tuple[int, ...], k: np.ndarray, b: np.ndarray, i: np.ndarray, RND: str, SAT: str, backend: str
+) -> str:
+    """Generate a heterogeneous quantization mask function. ONLY works for IOType=io_parallel."""
+    assert k.shape[0] == b.shape[0] == i.shape[0] == 1
+    assert backend.lower() in ('quartus', 'vivado', 'vitis'), f'Backend {backend} not tested'
+    Ks, Bs, Is = k[0], b[0], i[0]
+    Ks, Bs, Is = np.broadcast_to(Ks, shape), np.broadcast_to(Bs, shape), np.broadcast_to(Is, shape)
+    Ks, Bs, Is = Ks.ravel(), Bs.ravel(), Is.ravel()
+    masks = []
+    to_fixed = to_acfixed if backend.lower() == 'quartus' else to_apfixed
+    for idx, (k, b, i) in enumerate(zip(Ks, Bs, Is)):
+        if b == 0:
+            # b == 0 means zero bits are kept for this element, so it is hard-wired to 0.
+            fn = f'out[{idx}] = 0;'
+        else:
+            fn = f'out[{idx}] = {to_fixed(k, b, i, RND, SAT)}(inp[{idx}]);'
+        masks.append(f'    {fn}')
+    body = "\n".join(masks)
+    mask_fn = f'''
+template<typename input_t, typename output_t>
+void {name}(input_t *inp, output_t *out) {{
+    #pragma HLS INLINE
+
+{body}
+}}
+'''
+    return mask_fn
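+
+# Illustrative only. With name='quantizer_mask', shape=(2,), RND='TRN', SAT='WRAP',
+# backend='vivado' and kbi masks [[1, 1]], [[8, 0]], [[3, 3]], this returns:
+#
+#   template<typename input_t, typename output_t>
+#   void quantizer_mask(input_t *inp, output_t *out) {
+#       #pragma HLS INLINE
+#
+#       out[0] = ap_fixed<8,3,AP_TRN,AP_WRAP>(inp[0]);
+#       out[1] = 0;
+#   }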
+
+
+class ProcessFixedPointQuantizerLayer(OptimizerPass):
+    def match(self, node: Layer):
+        return isinstance(node, FixedPointQuantizer)
+
+    def transform(self, model, node: FixedPointQuantizer):
+        if node.fusible:
+            # A fusible quantizer's effect is assumed to be captured by the
+            # surrounding layers' types, so the node is removed outright.
+            model.remove_node(node, rewire=True)
+            return True
+
+        if model.config.config['IOType'] != 'io_parallel':
+            raise NotImplementedError('Heterogeneous quantization for activations is only supported with IOType=io_parallel')
+
+        backend = model.config.config['Backend']
+
+        name = node.name
+
+        assert node.mask_kbi is not None
+        k, b, i = node.mask_kbi
+        RND = node.RND
+        SAT = node.SAT
+        mask_fn: str = generate_mask_fn(name, node.get_input_variable().shape, k, b, i, RND, SAT, backend)
+
+        node.set_attr('mask_fn_codegen', Source(mask_fn))
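+
+# The generated C++ is attached as a Source attribute; presumably the project
+# writer emits it alongside the model, while the call template below renders
+# the invocation at the layer's call site.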
+
+
+class ProcessFixedPointQuantizerCall(FunctionCallTemplate):
+    def __init__(self):
+        super().__init__(FixedPointQuantizer, include_header=[])
+        self.template = 'nnet::{name}<{input_t}, {output_t}>({input}, {output});'
+
+    def format(self, node):
+        params = self._default_function_params(node)
+
+        return self.template.format(**params)
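+
+# A rendered call might look like (hypothetical layer and variable names):
+#   nnet::fixed_point_quantizer_4<layer3_t, layer4_t>(layer3_out, layer4_out);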
+
+
+class ProcessUnaryLUTCall(FunctionCallTemplate):
+    def __init__(self):
+        super().__init__(UnaryLUT, include_header=[])
+        self.template = 'nnet::unary_lut<{input_t}, {output_t}, {config}>({input}, {output}, {table});'
+        self.include_header = [
+            'nnet_utils/nnet_activation.h',
+            'nnet_utils/nnet_activation_stream.h',
+        ]
+
+    def format(self, node):
+        params = self._default_function_params(node)
+        # The output is read straight from the table, so result_t must take table_t's precision.
+        node.attributes['result_t'].precision = node.attributes['table_t'].precision
+        params['config'] = f'unary_lut_config{node.index}'
+        params['table'] = node.get_weights('table').name
+
+        return self.template.format(**params)
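+
+# A rendered call might look like (hypothetical names):
+#   nnet::unary_lut<layer5_t, layer6_t, unary_lut_config6>(layer5_out, layer6_out, table6);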
+
+
+def register_hgq_proxy_model(backend: Backend):
+    backend.register_pass('process_fixed_point_quantizer_layer', ProcessFixedPointQuantizerLayer)
+    backend.register_template(ProcessFixedPointQuantizerCall)
+    backend.register_template(ProcessUnaryLUTCall)
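+
+# Assumed usage: each FPGA backend calls this once while setting up its
+# optimizer flows, e.g. register_hgq_proxy_model(backend), making the pass and
+# both call templates available to that backend.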