diff --git a/README.md b/README.md
index fd04bfca74..a6f4a080cb 100644
--- a/README.md
+++ b/README.md
@@ -125,6 +125,18 @@ Additionally, if you use specific features developed in later papers, please cit
year = "2022"
}
```
+Distributed arithmetic:
+```bibtex
+@misc{Sun:2025,
+ title={da4ml: Distributed Arithmetic for Real-time Neural Networks on FPGAs},
+ author={Chang Sun and others},
+ year={2025},
+ eprint={2507.04535},
+ archivePrefix={arXiv},
+ primaryClass={cs.AR},
+ url={https://arxiv.org/abs/2507.04535},
+}
+```
binary/ternary networks:
```bibtex
@article{Loncar:2020hqp,
diff --git a/docs/advanced/_static/da4ml-workflow.svg b/docs/advanced/_static/da4ml-workflow.svg
new file mode 100644
index 0000000000..ae1226979c
--- /dev/null
+++ b/docs/advanced/_static/da4ml-workflow.svg
@@ -0,0 +1 @@
+
diff --git a/docs/advanced/_static/hgq-overview.svg b/docs/advanced/_static/hgq-overview.svg
new file mode 100644
index 0000000000..166a6af99d
--- /dev/null
+++ b/docs/advanced/_static/hgq-overview.svg
@@ -0,0 +1 @@
+
diff --git a/docs/advanced/auto.rst b/docs/advanced/auto.rst
index f944a11e54..70ada44e35 100644
--- a/docs/advanced/auto.rst
+++ b/docs/advanced/auto.rst
@@ -20,3 +20,6 @@ inference will never set a bitwdith larger than the bitwidth of the ``max_precis
When manually setting bitdwidths, the accumulator can overflow, and the precision may need to be reduced. For the accumulator, it is usually a bad idea to explicitly
enable rounding or saturation modes since it dramatically increases the execution time. For other types (e.g. output types or weight types), however, rounding and saturation handling
can be enabled as needed.
+
+.. note::
+ For supported models (most ``HGQ/HGQ2`` models and some ``QKeras`` models), model-wise precision inference (documented in `model-wise precision inference <precision.html>`_) can be used to achieve bit-exact conversion. Please refer to that section for more details.
diff --git a/docs/advanced/da.rst b/docs/advanced/da.rst
new file mode 100644
index 0000000000..a52f8a3996
--- /dev/null
+++ b/docs/advanced/da.rst
@@ -0,0 +1,32 @@
+======================
+Distributed Arithmetic
+======================
+
+.. image:: https://img.shields.io/badge/License-LGPLv3-blue.svg
+ :target: https://www.gnu.org/licenses/lgpl-3.0.en.html
+.. image:: https://badge.fury.io/py/da4ml.svg
+ :target: https://badge.fury.io/py/da4ml
+.. image:: https://img.shields.io/badge/arXiv-2507.04535-b31b1b.svg
+ :target: https://arxiv.org/abs/2507.04535
+
+
+Distributed Arithmetic (DA) is a strategy for implementing the constant-matrix-vector multiplication (CMVM) operations used in hls4ml. The implementation is provided by an external library, `da4ml `__, which can be installed with ``pip install hls4ml[da]``. The library transforms the CMVM operations into an adder graph with common subexpression elimination to reduce the overall complexity. As the CMVM operation is fully unrolled, `reuse_factor` **must** be 1 (the default) for the corresponding CMVM operations [*]_. Compared to the traditional `Latency` strategy CMVM kernels, DA can usually reduce LUT usage by up to 30% and eliminate all DSP usage.
+
+.. rst-class:: light
+.. image:: _static/da4ml-workflow.svg
+ :alt: Workflow of DA in hls4ml
+ :width: 600
+
+When the DA strategy is used, the CMVM operations will be implemented bit-exactly, and the accumulator precision setting will not be used.
+
+.. [*] Not to be confused with `II=1`. `reuse_factor` is the `II` for one CMVM operation, not one layer. One layer may invoke the same CMVM kernel multiple times and thus have `II>1` even though each CMVM operation is unrolled, e.g., convolution layers with more than one partition.
+
+Currently, the DA strategy is only available for the Vivado/Vitis HLS backends. The following layers are supported:
+
+* Dense
+* Convolutional (1D, 2D)
+* EinsumDense
+* Multi-head attention (implemented as multiple EinsumDense layers)
+
+While possible in principle, RNN layers are not yet supported by the DA strategy.
+
+For more details, please refer to the `da4ml repository `__ or the `paper <https://arxiv.org/abs/2507.04535>`__.
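+
+As a quick reference, below is a minimal sketch of selecting the DA strategy through the standard configuration dictionary (the model, backend, and output directory are placeholders):
+
+.. code-block:: python
+
+ import hls4ml
+
+ # Generate a baseline configuration from an existing Keras model
+ config = hls4ml.utils.config_from_keras_model(model, granularity='name')
+ # Select distributed arithmetic for the whole model; reuse_factor must stay 1
+ config['Model']['Strategy'] = 'distributed_arithmetic'  # 'da' is accepted as an alias
+
+ hls_model = hls4ml.converters.convert_from_keras_model(
+     model, hls_config=config, backend='Vitis', output_dir='my-da-project'
+ )
+ hls_model.compile()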
diff --git a/docs/advanced/extension.rst b/docs/advanced/extension.rst
index d9f71d39b7..c8c23ab29d 100644
--- a/docs/advanced/extension.rst
+++ b/docs/advanced/extension.rst
@@ -26,14 +26,14 @@ For concreteness, let's say our custom layer ``KReverse`` is implemented in Kera
.. code-block:: Python
# Keras implementation of a custom layer
- class KReverse(tf.keras.layers.Layer):
+ class KReverse(keras.layers.Layer):
'''Keras implementation of a hypothetical custom layer'''
def __init__(self):
super().__init__()
def call(self, inputs):
- return tf.reverse(inputs, axis=[-1])
+ return inputs[..., ::-1]
def get_config(self):
return super().get_config()
@@ -58,19 +58,44 @@ This parser reads the attributes of the Keras layer instance and populates a dic
It also returns a list of output shapes (one sjape for each output).
In this case, there a single output with the same shape as the input.
-.. code-block:: Python
+.. tabs::
+ .. tab:: Keras v2
+
+ .. code-block:: Python
+
+ # Parser for converter
+ def parse_reverse_layer(keras_layer, input_names, input_shapes, data_reader):
+ layer = {}
+ layer['class_name'] = 'KReverse'
+ layer['name'] = keras_layer['config']['name']
+ layer['n_in'] = input_shapes[0][1]
+
+ if input_names is not None:
+ layer['inputs'] = input_names
+
+ return layer, [shape for shape in input_shapes[0]]
+
+ .. tab:: Keras v3
+
+ .. code-block:: Python
- # Parser for converter
- def parse_reverse_layer(keras_layer, input_names, input_shapes, data_reader):
- layer = {}
- layer['class_name'] = 'HReverse'
- layer['name'] = keras_layer['config']['name']
- layer['n_in'] = input_shapes[0][1]
+ from collections.abc import Sequence
+ from typing import Any
+
+ from hls4ml.converters.keras_v3._base import register, KerasV3LayerHandler
- if input_names is not None:
- layer['inputs'] = input_names
+ @register
+ class KReverseHandler(KerasV3LayerHandler):
+ '''Keras v3 layer handler for KReverse'''
- return layer, [shape for shape in input_shapes[0]]
+ handles = ('KReverse',)
+ def handle(
+ self,
+ layer: 'keras.Layer',
+ in_tensors: Sequence['KerasTensor'],
+ out_tensors: Sequence['KerasTensor'],
+ ) -> dict[str, Any] | tuple[dict[str, Any], ...]:
+ # Only layer-specific parameters are needed.
+ # Common parameters are automatically added in the base class.
+ assert len(in_tensors[0].shape) == 2, 'KReverse is only supported for 2D tensors'
+ return {'n_in': in_tensors[0].shape[-1]}
Next, we need the actual HLS implementaton of the function, which can be written in a header file ``nnet_reverse.h``.
@@ -140,33 +165,33 @@ In this case, the HLS code is valid for both the Vivado and Quartus backends.
.. code-block:: Python
# Register the converter for custom Keras layer
- hls4ml.converters.register_keras_layer_handler('KReverse', parse_reverse_layer)
+ hls4ml.converters.register_keras_v2_layer_handler('KReverse', parse_reverse_layer)
+ # For Keras v3, use the @register decorator on a KerasV3LayerHandler subclass (from hls4ml.converters.keras_v3._base) instead
# Register the hls4ml's IR layer
- hls4ml.model.layers.register_layer('HReverse', HReverse)
+ hls4ml.model.layers.register_layer('KReverse', HReverse)
for backend_id in ['Vivado', 'Quartus']:
# Register the optimization passes (if any)
backend = hls4ml.backends.get_backend(backend_id)
- backend.register_pass('remove_duplicate_reverse', RemoveDuplicateReverse, flow=f'{backend_id.lower()}:optimize')
# Register template passes for the given backend
backend.register_template(HReverseConfigTemplate)
backend.register_template(HReverseFunctionTemplate)
# Register HLS implementation
- backend.register_source('nnet_reverse.h')
+ backend.register_source('/path/to/your/nnet_reverse.h')
Finally, we can actually test the ``hls4ml`` custom layer compared to the Keras one.
.. code-block:: Python
# Test if it works
- kmodel = tf.keras.models.Sequential(
+ kmodel = keras.models.Sequential(
[
- tf.keras.layers.Input(shape=(8,)),
+ keras.layers.Input(shape=(8,)),
KReverse(),
- tf.keras.layers.ReLU(),
+ keras.layers.ReLU(),
]
)
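+
+ # A sketch of converting and checking the custom layer against the Keras model
+ # (configuration values and the output directory are placeholders)
+ import numpy as np
+
+ config = hls4ml.utils.config_from_keras_model(kmodel, granularity='name')
+ hmodel = hls4ml.converters.convert_from_keras_model(
+     kmodel, hls_config=config, output_dir='hls4mlprj_kreverse', backend='Vivado'
+ )
+ hmodel.compile()
+
+ x = np.random.rand(100, 8).astype('float32')
+ np.testing.assert_allclose(hmodel.predict(x), kmodel.predict(x), rtol=0, atol=2e-2)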
diff --git a/docs/advanced/hgq.rst b/docs/advanced/hgq.rst
index dd0faad7dc..d93774a6a7 100644
--- a/docs/advanced/hgq.rst
+++ b/docs/advanced/hgq.rst
@@ -1,49 +1,71 @@
-===================================
-High Granularity Quantization (HGQ)
-===================================
-
-.. image:: https://github.com/calad0i/HGQ/actions/workflows/sphinx-build.yml/badge.svg
- :target: https://calad0i.github.io/HGQ/
-.. image:: https://badge.fury.io/py/hgq.svg
- :target: https://badge.fury.io/py/hgq
+======================================
+High Granularity Quantization (HGQ2)
+======================================
+
+.. note::
+ New projects are encouraged to use ``HGQ2`` instead of the original ``HGQ`` (whose documentation page has moved `here <hgq1.html>`_).
+ HGQ2 is a major improvement over HGQ, with more supported layers, more quantizer options, and better performance. As HGQ2 moves to Keras v3, it can be used natively with the ``JAX``, ``PyTorch``, and ``TensorFlow`` backends.
+
+.. image:: https://img.shields.io/badge/License-LGPLv3-blue.svg
+ :target: https://www.gnu.org/licenses/lgpl-3.0.en.html
+.. image:: https://github.com/calad0i/HGQ2/actions/workflows/sphinx-build.yml/badge.svg
+ :target: https://calad0i.github.io/HGQ2/
+.. image:: https://badge.fury.io/py/hgq2.svg
+ :target: https://badge.fury.io/py/hgq2
.. image:: https://img.shields.io/badge/arXiv-2405.00645-b31b1b.svg
:target: https://arxiv.org/abs/2405.00645
-`High Granularity Quantization (HGQ) `_ is a library that performs gradient-based automatic bitwidth optimization and quantization-aware training algorithm for neural networks to be deployed on FPGAs. By leveraging gradients, it allows for bitwidth optimization at arbitrary granularity, up to per-weight and per-activation level.
+HGQ2 (High Granularity Quantization 2) is a quantization-aware training framework built on Keras v3, targeting real-time deep learning applications on edge devices like FPGAs. It provides a comprehensive set of tools for creating and training quantized neural networks with minimal effort.
-.. image:: https://calad0i.github.io/HGQ/_images/overview.svg
- :alt: Overview of HGQ
- :align: center
+HGQ2 implements a gradient-based automatic bitwidth optimization and quantization-aware training algorithm. By leveraging gradients, it allows for bitwidth optimization at arbitrary granularity, up to the per-weight and per-activation level.
-Conversion of models made with HGQ library is fully supported. The HGQ models are first converted to proxy model format, which can then be parsed by hls4ml bit-accurately. Below is an example of how to create a model with HGQ and convert it to hls4ml model.
+.. rst-class:: light
+.. image:: _static/hgq-overview.svg
+ :alt: HGQ-overview
+ :width: 600
-.. code-block:: Python
+Key Features
+------------
- import keras
- from HGQ.layers import HDense, HDenseBatchNorm, HQuantize
- from HGQ import ResetMinMax, FreeBOPs
+- **Multi-backend support**: Works with TensorFlow, JAX, and PyTorch through Keras v3
+- **Flexible quantization**: Supports different quantization schemes including fixed-point and minifloat
+- **Hardware synthesis**: Direct integration with hls4ml for FPGA deployment
+- **Trainable quantization parameters**: Optimize bitwidths through gradient-based methods
+- **Effective Bit-Operations (EBOP)**: Accurate resource estimation during training for the deployed firmware
+- **Advanced layer support**: einsum, einsum dense, and multi-head attention layers come with quantization and hardware synthesis support
- model = keras.models.Sequential([
- HQuantize(beta=1.e-5),
- HDenseBatchNorm(32, beta=1.e-5, activation='relu'),
- HDenseBatchNorm(32, beta=1.e-5, activation='relu'),
- HDense(10, beta=1.e-5),
- ])
- opt = keras.optimizers.Adam(learning_rate=0.001)
- loss = keras.losses.SparseCategoricalCrossentropy(from_logits=True)
- model.compile(optimizer=opt, loss=loss, metrics=['accuracy'])
- callbacks = [ResetMinMax(), FreeBOPs()]
+.. code-block:: python
+ :caption: Simple example
- model.fit(..., callbacks=callbacks)
+ import hls4ml
+ import keras
+ from hgq.layers import QDense, QConv2D
+ from hgq.config import LayerConfigScope, QuantizerConfigScope
- from HGQ import trace_minmax, to_proxy_model
- from hls4ml.converters import convert_from_keras_model
+ # Setup quantization configuration
+ # These values are the defaults, just for demonstration purposes here
+ with (
+ # Configuration scope for setting the default quantization type and overflow mode
+ # The second configuration scope overrides the first one for the 'datalane' place
+ QuantizerConfigScope(place='all', default_q_type='kbi', overflow_mode='SAT_SYM'),
+ QuantizerConfigScope(place='datalane', default_q_type='kif', overflow_mode='WRAP'),
+ # Configuration scope for enabling EBOPs and setting the beta0 value
+ LayerConfigScope(enable_ebops=True, beta0=1e-5),
+ ):
+ model = keras.Sequential([
+ QConv2D(32, (3, 3), activation='relu'),
+ keras.layers.MaxPooling2D((2, 2)),
+ keras.layers.Flatten(),
+ QDense(10)
+ ])
- trace_minmax(model, x_train, cover_factor=1.0)
- proxy = to_proxy_model(model, aggressive=True)
+ ... # Training, evaluation, and anything else you want to do with the model
- model_hls = convert_from_keras_model(proxy, backend='vivado',output_dir=... ,part=...)
+ model_hls = hls4ml.converters.convert_from_keras_model(model, ...)
+ # Model-wise precision propagation is done automatically for HGQ models for bit-exactness
+ # Do NOT pass precision config if you don't know what you are doing
+ model_hls.compile()
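+
+ # Optional sanity check (a sketch; x_test is a placeholder for your own data):
+ # for a properly quantized HGQ2 model the compiled HLS model should match the
+ # Keras model outputs, up to float rounding inside the Keras model itself.
+ import numpy as np
+
+ y_keras = model.predict(x_test)
+ y_hls = model_hls.predict(x_test)
+ print('max abs difference:', np.max(np.abs(y_keras - y_hls)))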
-An interactive example of HGQ can be found in the `kaggle notebook `_. Full documentation can be found at `calad0i.github.io/HGQ `_.
+.. note::
+ In general, do not pass any precision configuration to ``hls4ml.converters.convert_from_keras_model``. HGQ-defined models will invoke model-wise precision propagation automatically to ensure bit-exactness between the Keras model and the generated HLS code (see `here <./precision.html>`__ for more details).
diff --git a/docs/advanced/hgq1.rst b/docs/advanced/hgq1.rst
new file mode 100644
index 0000000000..6a02e76bb7
--- /dev/null
+++ b/docs/advanced/hgq1.rst
@@ -0,0 +1,52 @@
+===================================
+High Granularity Quantization (HGQ)
+===================================
+
+.. warning::
+ While still supported and maintained, HGQ is deprecated in favor of `HGQ2 <hgq.html>`_. New projects are strongly encouraged to use HGQ2 instead.
+
+.. image:: https://github.com/calad0i/HGQ/actions/workflows/sphinx-build.yml/badge.svg
+ :target: https://calad0i.github.io/HGQ/
+.. image:: https://badge.fury.io/py/hgq.svg
+ :target: https://badge.fury.io/py/hgq
+.. image:: https://img.shields.io/badge/arXiv-2405.00645-b31b1b.svg
+ :target: https://arxiv.org/abs/2405.00645
+
+`High Granularity Quantization (HGQ) `_ is a library that performs gradient-based automatic bitwidth optimization and quantization-aware training for neural networks to be deployed on FPGAs. By leveraging gradients, it allows for bitwidth optimization at arbitrary granularity, up to the per-weight and per-activation level.
+
+.. image:: https://calad0i.github.io/HGQ/_images/overview.svg
+ :alt: Overview of HGQ
+ :align: center
+
+Conversion of models made with HGQ library is fully supported. The HGQ models are first converted to proxy model format, which can then be parsed by hls4ml bit-accurately. Below is an example of how to create a model with HGQ and convert it to hls4ml model.
+
+.. code-block:: Python
+
+ import keras
+ from HGQ.layers import HDense, HDenseBatchNorm, HQuantize
+ from HGQ import ResetMinMax, FreeBOPs
+
+ model = keras.models.Sequential([
+ HQuantize(beta=1.e-5),
+ HDenseBatchNorm(32, beta=1.e-5, activation='relu'),
+ HDenseBatchNorm(32, beta=1.e-5, activation='relu'),
+ HDense(10, beta=1.e-5),
+ ])
+
+ opt = keras.optimizers.Adam(learning_rate=0.001)
+ loss = keras.losses.SparseCategoricalCrossentropy(from_logits=True)
+ model.compile(optimizer=opt, loss=loss, metrics=['accuracy'])
+ callbacks = [ResetMinMax(), FreeBOPs()]
+
+ model.fit(..., callbacks=callbacks)
+
+ from HGQ import trace_minmax, to_proxy_model
+ from hls4ml.converters import convert_from_keras_model
+
+ trace_minmax(model, x_train, cover_factor=1.0)
+ proxy = to_proxy_model(model, aggressive=True)
+
+ model_hls = convert_from_keras_model(proxy, backend='vivado', output_dir=..., part=...)
+
+
+An interactive example of HGQ can be found in the `kaggle notebook `_. Full documentation can be found at `calad0i.github.io/HGQ <https://calad0i.github.io/HGQ/>`_.
diff --git a/docs/advanced/precision.rst b/docs/advanced/precision.rst
new file mode 100644
index 0000000000..6e9eac33c0
--- /dev/null
+++ b/docs/advanced/precision.rst
@@ -0,0 +1,23 @@
+==============================
+Model-wise Precision Inference
+==============================
+
+The model-wise precision inference (implemented in :py:class:`~hls4ml.model.optimizer.passes.bit_exact.BitExact`) attempts to infer the appropriate configuration for **all** precisions in the model. Unlike the automatic precision inference, this pass disregards all user-defined precision and "trusts" only the data embedded in the model, i.e., the actual values of the weights and the explicit quantizers defined between layers.
+
+This pass uses modified symbolic interval arithmetic to compute the ranges and the needed quantization steps for all precisions in the model graph, with the goal of eliminating any discrepancy between the quantized model and the original model. In the inference process, only the raw weight values and the explicit quantizers (either ``FixedPointQuantizer``, or ``linear/relu`` layers with ``trusted=True``) are considered as sources of precision information. All other precision information (e.g., user-defined precision in ``config_from_*`` functions) will not be used in the inference process.
+
+Invocation of this pass is controlled by the ``bit_exact`` key in the backend configuration (default: ``None``). There are two ways to enable this pass:
+
+- When converting from ``HGQ/HGQ2`` models, this pass is automatically enabled unless ``bit_exact`` is explicitly set to ``False``.
+- For other models, this pass can be enabled by setting ``bit_exact`` to ``True``. Currently, only ``QKeras`` sets this key automatically when converting from ``QKeras`` models. Support for ``QONNX`` is planned but not yet implemented.
+
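+For frontends other than ``HGQ/HGQ2``, a minimal sketch of enabling the pass is shown below (this assumes ``bit_exact`` is forwarded as a backend configuration keyword by ``convert_from_keras_model``; the model and output directory are placeholders):
+
+.. code-block:: python
+
+ import hls4ml
+
+ # Request model-wise precision inference for a properly quantized model.
+ # Do not pass a precision configuration from config_from_* functions together with this.
+ hls_model = hls4ml.converters.convert_from_keras_model(
+     model,
+     backend='Vitis',
+     output_dir='my-bit-exact-project',
+     bit_exact=True,
+ )
+ hls_model.compile()
+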
+If the original model is not properly quantized, this pass will lead to huge bitwidths in the model. In this context, properly quantized models are those that have quantizers defined between **all layers with non-trivial arithmetic** (i.e., essentially all layers other than reshape/flatten/transpose/linear-like layers that only rearrange elements). A successful application of this pass should result in a bit-exact model, i.e., the quantized model should produce the same outputs as the original model for all inputs [*]_.
+
+Not all operator types are supported in this pass. If any unsupported operator is encountered during the inference, this pass will **crash** the conversion process to prevent silent failures. Please consider using `automatic precision inference <auto.html>`_ if your model contains unsupported operators or unquantized components.
+
+.. warning::
+ Importantly, quantizers **should be used immediately after the inputs**, or the input precision may not be properly inferred. If you are using ``HGQ/HGQ2``, this is automatically taken care of in most cases. If you are using ``QKeras``, make sure to put a ``QActivation`` with ``quantized_bits`` right after the input layer such that the input precision can be derived.
+
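+The sketch below illustrates the ``QKeras`` case (bit widths and layer sizes are placeholders):
+
+.. code-block:: python
+
+ from qkeras import QActivation, QDense, quantized_bits
+ from tensorflow.keras.layers import Input
+
+ inp = Input((16,))
+ # Quantize the input immediately so that the input precision can be derived
+ x = QActivation(quantized_bits(8, 0))(inp)
+ x = QDense(32, kernel_quantizer=quantized_bits(6, 0))(x)
+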
+.. [*] While quantized, the original model will still operate on floating-point values, so there is a chance that the outputs will not be exactly the same due to float rounding errors in the original model.
+
+.. note::
+ When this functionality is used, one **should not** use the ``config_from_*`` functions to set the precision in the model. Automatic precision inference and this pass cannot be used simultaneously.
diff --git a/docs/advanced/profiling.rst b/docs/advanced/profiling.rst
index 266fe16443..81b2d71a73 100644
--- a/docs/advanced/profiling.rst
+++ b/docs/advanced/profiling.rst
@@ -2,7 +2,7 @@
Profiling
=========
-In the ``hls4ml`` configuration file, it is possible to specify the model ``Precision`` and ``ReuseFactor`` with fine granularity.
+In the ``hls4ml`` configuration file, it is possible to specify the model ``Precision`` and ``ReuseFactor`` with up to layer-wise granularity.
Using a low precision can help reduce the FPGA resource usage of a model, but may result in loss of model performance if chosen inappropriately. The profiling tools in ``hls4ml`` help you to decide the appropriate model precision.
diff --git a/docs/api/configuration.rst b/docs/api/configuration.rst
index 1bc8f0676c..ccd6cf53cf 100644
--- a/docs/api/configuration.rst
+++ b/docs/api/configuration.rst
@@ -75,6 +75,9 @@ and QONNX model parsing. Passing the backend to these functions is recommended b
and similar for more information on the various options. Note specifically the documentation of :py:class:`~hls4ml.utils.config.config_from_pytorch_model` on how to handle differences in input data
formats between pytorch and keras (hls4ml follows keras conventions internally).
+.. warning::
+ Note that when full-model precision propagation is invoked (the default for HGQ/HGQ2 models, or when `bit_exact=True` is set for other frontends), passing precision configurations is **not needed** and **should not be done** without understanding the implications.
+
One can override specific values before using the configuration:
.. code-block:: python
@@ -158,12 +161,12 @@ For Vivado backend the options are:
* **Part**\ : the particular FPGA part number that you are considering, here it's a Xilinx Virtex UltraScale+ VU13P FPGA
* **ClockPeriod**\ : the clock period, in ns, at which your algorithm runs
Then you have some optimization parameters for how your algorithm runs:
-* **IOType**\ : your options are ``io_parallel`` or ``io_stream`` which defines the type of data structure used for inputs, intermediate activations between layers, and outputs. For ``io_parallel``, arrays are used that, in principle, can be fully unrolled and are typically implemented in RAMs. For ``io_stream``, HLS streams are used, which are a more efficient/scalable mechanism to represent data that are produced and consumed in a sequential manner. Typically, HLS streams are implemented with FIFOs instead of RAMs. For more information see `here `__.
-* **HLSConfig**\: the detailed configuration of precision and parallelism, including:
+* **IOType**\ : your options are ``io_parallel`` or ``io_stream``, which defines how data is transferred into and out of the HLS model IP and how data is transferred between layers. For ``io_parallel``, data are directly wired between layers, fully in parallel. For ``io_stream``, HLS streams are used, which are instantiated as stateful FIFO buffers; this effectively decouples the producer and consumer (upstream and downstream layers in a neural network) and removes the need for a global state machine coordinating the exact timing of IO operations. This is particularly useful with the DATAFLOW pipeline style. For more information, see `here `__.
+* **HLSConfig**\: the detailed configuration of precision and parallelism, including:
* **ReuseFactor**\ : in the case that you are pipelining, this defines the pipeline interval or initiation interval
* **ParallelizationFactor**\ : The number of output "pixels" to compute in parallel in convolutional layers. Increasing this parameter results in significant increase in resources required on the FPGA.
- * **Strategy**\ : Optimization strategy on FPGA, either "Latency", "Resource" or "Unrolled". If none is supplied then hl4ml uses "Latency" as default. Note that a reuse factor larger than 1 should be specified when using "resource" or "unrolled" strategy. An example of using larger reuse factor can be found `here. `__
+ * **Strategy**\ : Optimization strategy on FPGA, either "Latency", "Resource", "distributed_arithmetic" (or "da"), or "Unrolled". If none is supplied then hls4ml uses "Latency" as the default. Note that the reuse factor must be 1 when using "distributed_arithmetic", and should be larger than 1 when using the "resource" or "unrolled" strategy.
* **PipelineStyle**\ : Set the top level pipeline style. Valid options are "auto", "pipeline" and "dataflow". If unspecified, it defaults to "auto".
* **PipelineInterval**\ : Optionally override the desired initiation interval of the design. Only valid in combination with "pipeline" style. If unspecified, it is left to the compiler to decide, ideally matching the largest reuse factor of the network.
* **Precision**\ : this defines the precision of your inputs, outputs, weights and biases. It is denoted by ``fixed``\ , where ``Y`` is the number of bits representing the signed number above the binary point (i.e. the integer part), and ``X`` is the total number of bits. Additionally, integers in the type (\ ``int``\ , where ``N`` is a bit-size from 1 to 1024) can also be used. The format follows ``ap_fixed`` and ``ap_int`` conventions. You have a chance to further configure this more finely with per-layer configuration described below. In the per-layer configuration (but not globally) one can also use ``'auto'`` precision.
diff --git a/docs/conf.py b/docs/conf.py
index e4d7f399c1..b91f3d8119 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -70,6 +70,7 @@ def get_pypi_version(package, url_pattern=URL_PATTERN):
'sphinx.ext.napoleon',
'sphinx_contributors',
'sphinx_github_changelog',
+ 'sphinx_tabs.tabs',
]
# Note: to build locally, you will need to set the SPHINX_GITHUB_CHANGELOG_TOKEN
@@ -103,7 +104,7 @@ def get_pypi_version(package, url_pattern=URL_PATTERN):
html_theme_options = {
'canonical_url': '',
- 'analytics_id': '', # Provided by Google in your dashboard
+ 'analytics_id': '', # Provided by Google in your dashboard
'logo_only': True,
'display_version': True,
'prev_next_buttons_location': 'bottom',
diff --git a/docs/frontend/keras.rst b/docs/frontend/keras.rst
index 093db120b4..5445602973 100644
--- a/docs/frontend/keras.rst
+++ b/docs/frontend/keras.rst
@@ -2,18 +2,22 @@
Keras and its quantized variants
================================
-Keras and the quantization library QKeras are well supported in ``hls4ml``. Both Keras v2 (``tf.keras``) and the new Keras v3 are supported. While the Keras v2 support is based on parsing the serialized json representation of the model, the Keras v3 support uses direct model inspection.
+Keras and its quantized variants are supported in ``hls4ml``. Both Keras v2 (``tf.keras``) and the new Keras v3 are supported. While the Keras v2 support is based on parsing the serialized json representation of the model, the Keras v3 support uses direct model inspection.
-Currently, ``hls4ml`` can parse most Keras layers, including core layers, convolutional layers, pooling layers, recurrent layers, merging/reshaping layers and activation layers, implemented either via sequential or functional API. Notably missing are the attention and normalization layers. The ``Lambda`` layers don't save their state in the serialized format and are thus impossible to parse. In this case, the ``Lambda`` layers can be implemented as custom layers and parsed via the :ref:`Extension API`.
+For Keras v2, QKeras, and HGQ, ``hls4ml`` supports most of their layers, including core layers, convolutional layers, pooling layers, recurrent layers (not implemented in HGQ), merging/reshaping layers, and activation layers. The ``(Q)BatchNormalization`` layer is also supported. Experimental support for ``LayerNormalization`` is added for vanilla Keras v2.
-The ``data_format='channels_first'`` parameter of Keras layers is supported, but not extensively tested. All HLS implementations in ``hls4ml`` are based on ``channels_last`` data format and need to be converted to that format before the HLS code can be emitted. We encourage users of ``channels_first`` to report their experiences to developers on GitHub.
+For Keras v3, support for the ``EinsumDense`` layer is added, but recurrent layers are not supported in general. For HGQ2, some extra layers are supported in addition, such as ``QEinsum``, ``QMultiHeadAttention``, ``QUnaryFunctionLUT`` (an arbitrary unary function implemented as a 1-d lookup table), and some binary operators.
+
+Keras operators that are not layers, such as ``add``, ``subtract``, ``multiply``, and ``divide``, are generally not supported in ``hls4ml``. Please use the corresponding Keras layers instead.
+
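+For example, a sketch in the functional API, preferring the layer form over the raw operator:
+
+.. code-block:: python
+
+ import keras
+
+ a = keras.layers.Input((16,))
+ b = keras.layers.Input((16,))
+
+ # out = a + b                      # raw operator: not parsable by hls4ml
+ out = keras.layers.Add()([a, b])   # corresponding layer: supported
+
+ model = keras.Model([a, b], out)
+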
+Arbitrary ``Lambda`` layers are not supported, and are not planned to be supported, in ``hls4ml`` due to the difficulty of parsing generic lambda expressions. If custom operations are required, please refer to the :ref:`Extension API` documentation to add custom layers to the conversion process.
+
+The ``data_format='channels_first'`` parameter of Keras layers is supported for a limited subset of layers and it is not extensively tested. All HLS implementations in ``hls4ml`` are based on ``channels_last`` data format convention and need to be converted to that format before the HLS code can be emitted. We encourage users of ``channels_first`` to report their experiences to developers on GitHub.
* `QKeras `_
- The equivalent QKeras API and its quantizers are also supported by ``hls4ml``. QKeras is not compatible with Keras v3. Currently, only HGQ2 is compatible with Keras v3 (see below).
+ The equivalent QKeras API and its quantizers are also supported by ``hls4ml``. QKeras is not compatible with Keras v3.
* `HGQ `_
- The equivalent HGQ API is also supported. HGQ is not compatible with Keras v3. See `advanced/HGQ <../advanced/hgq.html>`__ for more information.
+ The equivalent HGQ API is also supported. HGQ is still maintained but deprecated in favor of `HGQ2 <../advanced/hgq.html>`_.
* `HGQ2 `_
- HGQ2 is based on Keras v3. Its support in hls4ml is currently under development.
-
-The development team of ``hls4ml`` is currently exploring options for QKeras alternative and will provide a drop-in replacement API compatible with Keras v3.
+ The equivalent HGQ2 API is also supported, plus some additional advanced operators.
diff --git a/docs/index.rst b/docs/index.rst
index a87e7c95e6..2bdb7964ed 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -47,6 +47,8 @@
advanced/profiling
advanced/auto
advanced/hgq
+ advanced/da
+ advanced/precision
advanced/fifo_depth
advanced/extension
advanced/model_optimization
diff --git a/docs/intro/introduction.rst b/docs/intro/introduction.rst
index 8d603bd78f..8c276e02a7 100644
--- a/docs/intro/introduction.rst
+++ b/docs/intro/introduction.rst
@@ -20,11 +20,11 @@ The Solution: ``hls4ml``
========================
.. image:: ../img/overview.jpg
-
+ :alt: Overview of hls4ml, taken from JINST 13 P07027 (2018).
With this in mind, let's take a look at how ``hls4ml`` helps to achieve such a goal. First, it's important to realize the architecture differences between an FPGA and a CPU or GPU.
An FPGA can be specifically programmed to do a certain task, in this case evaluate neural networks given a set of inputs, and as such can be highly optimized for the task, with tricks like pipelining and parallel evaluation. However, this means dynamic remapping while running isn't really a possibility.
FPGAs also often come at a comparatively low power cost with respect to CPUs and GPUs. This allows ``hls4ml`` to build HLS code from compressed neural networks that results in predictions on the microsecond scale for latency.
The ``hls4ml`` tool saves the time investment needed to convert a neural network to a hardware design language or even HLS code, thus allowing for rapid prototyping.
-For more detailed information on technical details of ``hls4ml``, read the "Internals" section of our documentation or our :doc:`References ` page. All figures on this page are taken from the following paper: `JINST 13 P07027 (2018) `_.
+For more detailed information on technical details of ``hls4ml``, read the "Internals" section of our documentation or our :doc:`References ` page.
diff --git a/docs/intro/reference.rst b/docs/intro/reference.rst
index 0bd5912bb1..653c8c76d4 100644
--- a/docs/intro/reference.rst
+++ b/docs/intro/reference.rst
@@ -68,6 +68,20 @@ Additionally, if you use specific features developed in later papers, please cit
year = "2022"
}
+distributed arithmetic:
+
+.. code-block:: bibtex
+
+ @misc{Sun:2025,
+ title={da4ml: Distributed Arithmetic for Real-time Neural Networks on FPGAs},
+ author={Chang Sun and others},
+ year={2025},
+ eprint={2507.04535},
+ archivePrefix={arXiv},
+ primaryClass={cs.AR},
+ url={https://arxiv.org/abs/2507.04535},
+ }
+
binary/ternary networks:
.. code-block:: bibtex
diff --git a/docs/intro/setup.rst b/docs/intro/setup.rst
index 1462d0ba62..8184df6173 100644
--- a/docs/intro/setup.rst
+++ b/docs/intro/setup.rst
@@ -14,15 +14,11 @@ The latest release of ``hls4ml`` can be installed with ``pip``:
pip install hls4ml
-If you want to use our :doc:`profiling <../advanced/profiling>` toolbox, you might need to install extra dependencies:
-
-.. code-block::
-
- pip install hls4ml[profiling]
-
.. warning::
Previously, versions of hls4ml were made available on ``conda-forge``. These are outdated and should NOT be used. Installing with ``pip`` is currently the only supported method.
+Some features of ``hls4ml`` need extra dependencies. See the `Optional Dependencies <#optional-dependencies>`_ section for more details.
+
Development version
-------------------
@@ -40,6 +36,10 @@ Dependencies
.. note::
As of version 1.1.0+, all conversion frontend specific packages are optional. Only install the packages you need.
+
+Frontend
+--------
+
The ``hls4ml`` library requires python 3.10 or later, and depends on a number of Python packages and external tools for synthesis and simulation. Python dependencies are automatically managed by ``pip`` or ``conda``.
The following Python packages are all optional and are only required if you intend to use the corresponding converter.
@@ -55,22 +55,23 @@ The following Python packages are all optional and are only required if you inte
* Quantization support
* `QKeras `_: based on Keras v2. See `frontend/keras <../frontend/keras.html>`_ for more details
* `HGQ `_: Based on Keras v2. See `advanced/HGQ <../advanced/hgq.html>`_ for more details.
+ * `HGQ2 `_: Based on Keras v3. See `advanced/HGQ2 <../advanced/hgq.html>`_ for more details.
* `Brevitas `_: Based on PyTorch. See `frontend/pytorch <../frontend/pytorch.html>`_ for more details.
* `QONNX `_: Based on ONNX. See `frontend/onnx <../frontend/onnx.html>`_ for more details.
-Running C simulation from Python requires a C++11-compatible compiler. On Linux, a GCC C++ compiler ``g++`` is required. Any version from a recent
-Linux should work. On MacOS, the *clang*-based ``g++`` is enough. For the oneAPI backend, one must have oneAPI installed, along with the FPGA compiler,
-to run C/SYCL simulations.
+Running C simulation from Python requires a C++11-compatible compiler. On Linux, a GCC C++ compiler ``g++`` is required. Any version from a recent Linux distribution should work. On MacOS, the *clang*-based ``g++`` is enough. For the oneAPI backend, one must have ``oneAPI == 2025.0`` (2025.1 is known **not to work**) installed, along with the FPGA compiler, to run C/SYCL simulations.
+
+Specific functionalities may need additional Python packages. If a required package is missing, ``hls4ml`` will raise an error and prompt you to install it.
To run FPGA synthesis, installation of following tools is required:
-* Xilinx Vivado HLS 2018.2 to 2020.1 for synthesis for Xilinx FPGAs using the ``Vivado`` backend.
+* Xilinx Vivado HLS 2020.1 for synthesis for Xilinx FPGAs using the ``Vivado`` backend. Older versions may work, but use at your own risk.
* Vitis HLS 2022.2 or newer is required for synthesis for Xilinx FPGAs using the ``Vitis`` backend.
* Intel Quartus 20.1 to 21.4 for the synthesis for Intel/Altera FPGAs using the ``Quartus`` backend.
-* oneAPI 2024.1 to 2025.0 with the FPGA compiler and recent Intel/Altera Quartus for Intel/Altera FPGAs using the ``oneAPI`` backend.
+* oneAPI 2024.1 to 2025.0 with the FPGA compiler and a recent Intel/Altera Quartus for Intel/Altera FPGAs using the ``oneAPI`` backend. Newer versions of ``oneAPI`` removed FPGA support and **will not work** with ``hls4ml``.
Catapult HLS 2024.1_1 or 2024.2 can be used to synthesize both for ASICs and FPGAs.
@@ -84,15 +85,15 @@ Here we give line-by-line instructions to demonstrate the general workflow.
.. code-block:: python
import hls4ml
- import tensorflow as tf
- from tensorflow.keras.layers import Dense, Activation
+
+ from keras.models import Sequential
+ from keras.layers import Dense, Activation
+
# Construct a basic keras model
- model = tf.keras.models.Sequential()
- model.add(Dense(64, input_shape=(16,), name='Dense', kernel_initializer='lecun_uniform', kernel_regularizer=None))
- model.add(Activation(activation='elu', name='Activation'))
- model.add(Dense(32, name='Dense2', kernel_initializer='lecun_uniform', kernel_regularizer=None))
- model.add(Activation(activation='elu', name='Activation2'))
+ model = Sequential()
+ model.add(Dense(64, input_shape=(16,), activation='relu'))
+ model.add(Dense(32, activation='relu'))
# This is where you would train the model in a real-world scenario
@@ -139,89 +140,71 @@ Done! You've built your first project using ``hls4ml``! To learn more about our
If you want to configure your model further, check out our :doc:`Configuration <../api/configuration>` page.
-..
- Apart from our main API, we also support model conversion using a command line interface, check out our next section to find out more:
-
- Getting started with hls4ml CLI (deprecated)
- --------------------------------------------
-
- As an alternative to the recommended Python PI, the command-line interface is provided via the ``hls4ml`` command.
- To follow this tutorial, you must first download our ``example-models`` repository:
-
- .. code-block:: bash
-
- git clone https://github.com/fastmachinelearning/example-models
-
- Alternatively, you can clone the ``hls4ml`` repository with submodules
-
- .. code-block:: bash
-
- git clone --recurse-submodules https://github.com/fastmachinelearning/hls4ml
-
- The model files, along with other configuration parameters, are defined in the ``.yml`` files.
- Further information about ``.yml`` files can be found in :doc:`Configuration ` page.
-
- In order to create an example HLS project, first go to ``example-models/`` from the main directory:
-
- .. code-block:: bash
-
- cd example-models/
+Existing examples
+-----------------
- And use this command to translate a Keras model:
+* Training code and examples of resources needed to train the models can be found in the `tutorial `__.
+* Examples of model files and weights can be found in `example_models `_ directory.
- .. code-block:: bash
+Uninstalling
+------------
- hls4ml convert -c keras-config.yml
+To uninstall ``hls4ml``:
- This will create a new HLS project directory with an implementation of a model from the ``example-models/keras/`` directory.
- To build the HLS project, do:
+.. code-block:: bash
- .. code-block:: bash
+ pip uninstall hls4ml
- hls4ml build -p my-hls-test -a
+If installed with ``conda``, remove the package with:
- This will create a Vivado HLS project with your model implementation!
+.. code-block:: bash
- **NOTE:** For the last step, you can alternatively do the following to build the HLS project:
+ conda remove hls4ml
- .. code-block:: Bash
- cd my-hls-test
- vivado_hls -f build_prj.tcl
+Optional Dependencies
+=====================
- ``vivado_hls`` can be controlled with:
+``hls4ml`` provides several optional dependency groups that can be installed based on your specific needs:
- .. code-block:: bash
+.. warning::
+ Some optional dependencies may conflict with each other. For example, Keras v2 and Keras v3 cannot coexist in the same Python environment; ``qkeras`` requires certain versions of TensorFlow that may conflict with other packages.
- vivado_hls -f build_prj.tcl "csim=1 synth=1 cosim=1 export=1 vsynth=1"
+.. code-block:: bash
- Setting the additional parameters from ``1`` to ``0`` disables that step, but disabling ``synth`` also disables ``cosim`` and ``export``.
+ # For distributed arithmetic
+ pip install hls4ml[da]
- Further help
- ^^^^^^^^^^^^
+ # For HGQ frontend
+ pip install hls4ml[hgq]
- * For further information about how to use ``hls4ml``\ , do: ``hls4ml --help`` or ``hls4ml -h``
- * If you need help for a particular ``command``\ , ``hls4ml command -h`` will show help for the requested ``command``
- * We provide a detailed documentation for each of the command in the :doc:`Command Help ` section
+ # For HGQ2 frontend
+ pip install hls4ml[hgq2]
-Existing examples
------------------
+ # For Keras v3 frontend
+ pip install hls4ml[keras-v3]
-* Training codes and examples of resources needed to train the models can be found in the `tutorial `__.
-* Examples of model files and weights can be found in `example_models `_ directory.
+ # For ONNX frontend
+ pip install hls4ml[onnx]
-Uninstalling
-------------
+ # For DSP-aware pruning
+ pip install hls4ml[optimization]
-To uninstall ``hls4ml``:
+ # For weights and activation range visualization
+ pip install hls4ml[profiling]
-.. code-block:: bash
+ # For QKeras frontend
+ pip install hls4ml[qkeras]
- pip uninstall hls4ml
+ # For Quartus report parsing
+ pip install hls4ml[quartus-report]
-If installed with ``conda``, remove the package with:
+ # For symbolic regression
+ pip install hls4ml[sr]
-.. code-block:: bash
+ # For documentation building (developers)
+ pip install hls4ml[doc]
- conda remove hls4ml
+ # For testing (developers)
+ pip install hls4ml[testing]
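+
+ # Extras can be combined in a single install if needed, e.g.
+ pip install "hls4ml[da,hgq2,profiling]"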
diff --git a/docs/intro/status.rst b/docs/intro/status.rst
index f025d3f79c..7966903a4b 100644
--- a/docs/intro/status.rst
+++ b/docs/intro/status.rst
@@ -13,20 +13,32 @@ See the :ref:`Release Notes` section for a changelog.
Features
========
-A list of supported ML frameworks, HLS backends, and neural network architectures, including a summary table is below. Dependencies are given in the :doc:`Setup ` page.
+A list of supported ML frameworks (frontends), HLS backends, and neural network architectures, including a summary table, is given below. Dependencies are given in the :doc:`Setup ` page.
-ML framework support:
+Frontend support:
-* (Q)Keras
+* Keras
+
+ * Keras v2
+
+ * QKeras
+ * HGQ
+ * Keras v3
+
+ * HGQ2
* PyTorch
-* (Q)ONNX
+* ONNX
+
+ * QONNX
Neural network architectures:
* Fully connected NN (multilayer perceptron, MLP)
-* Convolutional NN
-* Recurrent NN (LSTM)
-* Graph NN (GarNet)
+* Convolutional NN (1D and 2D)
+* Recurrent NN (RNN, LSTM, GRU)
+* GarNet
+* Einsum and EinsumDense (Einsum)
+* Multi-head attention (MHA) (experimental)
HLS backends:
@@ -38,68 +50,52 @@ HLS backends:
A summary of the on-going status of the ``hls4ml`` tool is in the table below.
-.. list-table::
- :header-rows: 1
-
- * - ML framework/HLS backend
- - (Q)Keras
- - PyTorch
- - (Q)ONNX
- - Vivado HLS
- - Intel HLS
- - Vitis HLS
- - Catapult HLS
- - oneAPI
- * - MLP
- - ``supported``
- - ``supported``
- - ``supported``
- - ``supported``
- - ``supported``
- - ``supported``
- - ``supported``
- - ``experimental``
- * - CNN
- - ``supported``
- - ``supported``
- - ``supported``
- - ``supported``
- - ``supported``
- - ``supported``
- - ``supported``
- - ``experimental``
- * - RNN (LSTM)
- - ``supported``
- - ``supported``
- - ``N/A``
- - ``supported``
- - ``supported``
- - ``supported``
- - ``supported``
- - ``experimental``
- * - GNN (GarNet)
- - ``supported``
- - ``in development``
- - ``N/A``
- - ``N/A``
- - ``N/A``
- - ``N/A``
- - ``N/A``
- - ``N/A``
+.. table:: hls4ml Supported Features
+
++-----------------------+-----+-----+--------------+--------+--------+-----+
+| Frontend/Backend | MLP | CNN | RNN/LSTM/GRU | GarNet | Einsum | MHA |
++=======================+=====+=====+==============+========+========+=====+
+| Keras v2 | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
++-----------------------+-----+-----+--------------+--------+--------+-----+
+| QKeras | ✅ | ✅ | ✅ | ✅ | N/A | N/A |
++-----------------------+-----+-----+--------------+--------+--------+-----+
+| HGQ | ✅ | ✅ | N/A | N/A | N/A | N/A |
++-----------------------+-----+-----+--------------+--------+--------+-----+
+| Keras v3 | ✅ | ✅ | ✅ | N/A | ✅ | ❌ |
++-----------------------+-----+-----+--------------+--------+--------+-----+
+| HGQ2 | ✅ | ✅ | N/A | N/A | ✅ | ✅ |
++-----------------------+-----+-----+--------------+--------+--------+-----+
+| Torch | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ |
++-----------------------+-----+-----+--------------+--------+--------+-----+
+| ONNX | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ |
++-----------------------+-----+-----+--------------+--------+--------+-----+
+| QONNX | ✅ | ✅ | ❌ | N/A | N/A | N/A |
++-----------------------+-----+-----+--------------+--------+--------+-----+
+| Vivado/Vitis HLS | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ |
++-----------------------+-----+-----+--------------+--------+--------+-----+
+| Intel HLS | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
++-----------------------+-----+-----+--------------+--------+--------+-----+
+| Catapult HLS | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
++-----------------------+-----+-----+--------------+--------+--------+-----+
+| oneAPI (experimental) | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ |
++-----------------------+-----+-----+--------------+--------+--------+-----+
Other feature notes:
* ``hls4ml`` is tested on the following platforms. Newer versions might work just fine, but try at your own risk.
- * Vivado HLS versions 2018.2 to 2020.1
- * Intel HLS versions 20.1 to 21.4, versions \> 21.4 have not been tested.
- * Vitis HLS versions 2022.2 to 2024.1. Versions \<= 2022.1 are known not to work.
- * Catapult HLS versions 2024.1_1 to 2024.2
- * oneAPI versions 2024.1 to 2025.0
-
-* ``hls4ml`` supports Linux and requires python \>=3.10. hlsml does not require a specific Linux distribution version and we recommended to follow the requirements of the HLS tool you are using.
-* Windows and macOS are not supported. Setting up ``hls4ml`` on these platforms, for example using the Windows Subsystem for Linux (WSL) should be possible, but we do not provide support for such use cases.
+
+ - Vivado HLS 2020.1. Older versions may work, but use at your own risk.
+ - Intel HLS versions 20.1 to 21.4, versions > 21.4 have not been tested.
+ - Vitis HLS versions 2022.2 to 2024.1. Versions > 2024.1 are less tested.
+ - Catapult HLS versions 2024.1_1 to 2024.2
+ - oneAPI versions 2024.1 to 2025.0. Newer versions are known not to work.
+
+* ``hls4ml`` supports Linux [*]_ and requires python >=3.10. hls4ml does not require a specific Linux distribution version and we recommend following the requirements of the HLS tool you are using.
+* Windows and macOS are not supported. Setting up ``hls4ml`` on these platforms, for example using the Windows Subsystem for Linux (WSL), should be possible, but we do not provide support for such use cases.
* BDT support has moved to the `Conifer `__ package
+.. [*] For compiling the projects for simulation or actual HLS synthesis. Otherwise, the code **may** be used on other platforms and will likely work. However, please note that Windows and other platforms are **not supported** in general and are not tested.
+
Example Models
==============
diff --git a/docs/ir/multimodelgraph.rst b/docs/ir/multimodelgraph.rst
index 347b3a5d15..d743853764 100644
--- a/docs/ir/multimodelgraph.rst
+++ b/docs/ir/multimodelgraph.rst
@@ -42,7 +42,6 @@ Key Methods for MultiModelGraph
* :ref:`compile `
* :ref:`predict `
* :ref:`build `
-* :ref:`trace `
----
diff --git a/docs/requirements.txt b/docs/requirements.txt
index fe3c4f2544..77a36ef399 100644
--- a/docs/requirements.txt
+++ b/docs/requirements.txt
@@ -1,6 +1,7 @@
.
setuptools_scm[toml]>=5
sphinx>=3.2.1
+sphinx-tabs
sphinx_contributors
sphinx_github_changelog
sphinx_rtd_theme
diff --git a/hls4ml/backends/vivado/vivado_backend.py b/hls4ml/backends/vivado/vivado_backend.py
index d5309c377f..84daf5f5f4 100644
--- a/hls4ml/backends/vivado/vivado_backend.py
+++ b/hls4ml/backends/vivado/vivado_backend.py
@@ -359,7 +359,7 @@ def init_dense(self, layer):
else:
self.set_closest_reuse_factor(layer, n_in, n_out, include_max_rf=False)
layer.set_attr('strategy', 'resource_unrolled')
- elif layer.model.config.get_strategy(layer).lower() == 'distributed_arithmetic':
+ elif layer.model.config.get_strategy(layer).lower() in ('distributed_arithmetic', 'da'):
rf = layer.get_attr('reuse_factor')
if rf != 1:
raise Exception(f'Layer {layer.name} has rf = {rf} != 1, but has strategy = "distributed_arithmetic".')
@@ -401,7 +401,7 @@ def init_conv1d(self, layer):
else:
self.set_closest_reuse_factor(layer, n_in, n_out, include_max_rf=False)
layer.set_attr('strategy', 'resource_unrolled')
- elif layer.model.config.get_strategy(layer).lower() == 'distributed_arithmetic':
+ elif layer.model.config.get_strategy(layer).lower() in ('distributed_arithmetic', 'da'):
rf = layer.get_attr('reuse_factor')
if rf != 1:
raise Exception(f'Layer {layer.name} has rf = {rf} != 1, but has strategy = "distributed_arithmetic".')
@@ -527,7 +527,7 @@ def init_conv2d(self, layer):
else:
self.set_closest_reuse_factor(layer, n_in, n_out, include_max_rf=False)
layer.set_attr('strategy', 'resource_unrolled')
- elif layer.model.config.get_strategy(layer).lower() == 'distributed_arithmetic':
+ elif layer.model.config.get_strategy(layer).lower() in ('distributed_arithmetic', 'da'):
rf = layer.get_attr('reuse_factor')
if rf != 1:
raise Exception(f'Layer {layer.name} has rf = {rf} != 1, but has strategy = "distributed_arithmetic".')
diff --git a/hls4ml/contrib/kl_layer/kl_layer.py b/hls4ml/contrib/kl_layer/kl_layer.py
index 02b396052b..40873793cc 100644
--- a/hls4ml/contrib/kl_layer/kl_layer.py
+++ b/hls4ml/contrib/kl_layer/kl_layer.py
@@ -123,7 +123,7 @@ def parse_klloss_layer(keras_layer, input_names, input_shapes, data_reader):
def main():
# Register the converter for custom Keras layer
- hls4ml.converters.register_keras_layer_handler('KLLoss', parse_klloss_layer)
+ hls4ml.converters.register_keras_v2_layer_handler('KLLoss', parse_klloss_layer)
# Register the hls4ml's IR layer
hls4ml.model.layers.register_layer('KLLoss', HKLLoss)
diff --git a/hls4ml/converters/__init__.py b/hls4ml/converters/__init__.py
index 0d7b6ef802..80f755d41f 100644
--- a/hls4ml/converters/__init__.py
+++ b/hls4ml/converters/__init__.py
@@ -8,7 +8,7 @@
from hls4ml.converters.keras_v2_to_hls import KerasReader # noqa: F401
from hls4ml.converters.keras_v2_to_hls import get_supported_keras_layers # noqa: F401
from hls4ml.converters.keras_v2_to_hls import parse_keras_model # noqa: F401
-from hls4ml.converters.keras_v2_to_hls import keras_v2_to_hls, register_keras_layer_handler
+from hls4ml.converters.keras_v2_to_hls import keras_v2_to_hls, register_keras_v2_layer_handler
from hls4ml.converters.keras_v3_to_hls import keras_v3_to_hls, parse_keras_v3_model # noqa: F401
from hls4ml.converters.onnx_to_hls import get_supported_onnx_layers # noqa: F401
from hls4ml.converters.onnx_to_hls import parse_onnx_model # noqa: F401
@@ -41,7 +41,7 @@
if callable(func) and hasattr(func, 'handles') and func.__module__ == lib.__name__:
for layer in func.handles: # type: ignore
if model_type == 'keras':
- register_keras_layer_handler(layer, func)
+ register_keras_v2_layer_handler(layer, func)
elif model_type == 'pytorch':
register_pytorch_layer_handler(layer, func)
elif model_type == 'onnx':
diff --git a/hls4ml/converters/keras_v2_to_hls.py b/hls4ml/converters/keras_v2_to_hls.py
index daa9fc5575..cebaeab25b 100644
--- a/hls4ml/converters/keras_v2_to_hls.py
+++ b/hls4ml/converters/keras_v2_to_hls.py
@@ -127,7 +127,7 @@ def get_layer_handlers():
return layer_handlers
-def register_keras_layer_handler(layer_cname, handler_func):
+def register_keras_v2_layer_handler(layer_cname, handler_func):
"""Register a handler function for the given layer class name.
The handler function should have the following signature:
diff --git a/pyproject.toml b/pyproject.toml
index d8b35e743a..c7cce0fae9 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -33,6 +33,7 @@ optional-dependencies.doc = [
"sphinx-contributors",
"sphinx-github-changelog",
"sphinx-rtd-theme",
+ "sphinx-tabs",
]
optional-dependencies.hgq = [ "hgq>=0.2.3" ]
optional-dependencies.hgq2 = [ "hgq2>=0.0.1" ]
diff --git a/test/pytest/test_extensions.py b/test/pytest/test_extensions.py
index 5d06726aca..4c7400ddec 100644
--- a/test/pytest/test_extensions.py
+++ b/test/pytest/test_extensions.py
@@ -123,7 +123,7 @@ def format(self, node):
@pytest.fixture(scope='session', autouse=True)
def register_custom_layer():
# Register the converter for custom Keras layer
- hls4ml.converters.register_keras_layer_handler('KReverse', parse_reverse_layer)
+ hls4ml.converters.register_keras_v2_layer_handler('KReverse', parse_reverse_layer)
# Register the hls4ml's IR layer
hls4ml.model.layers.register_layer('HReverse', HReverse)