diff --git a/.gitignore b/.gitignore index 22c8ff685b..8151cd3af9 100644 --- a/.gitignore +++ b/.gitignore @@ -7,6 +7,7 @@ sdist/ *.egg-info/ vivado_prj .vscode +.idea my-hls-test *.tar.gz docs/_build @@ -14,3 +15,6 @@ docs/autodoc/* hls4mlprj_* *~ *.ipynb_checkpoints/ + +test/pytest/test_backend/input_file/* +test/pytest/test_backend/output_file/* diff --git a/hls4ml/backends/__init__.py b/hls4ml/backends/__init__.py index 4a48f072cd..3bb8aa7c6d 100644 --- a/hls4ml/backends/__init__.py +++ b/hls4ml/backends/__init__.py @@ -11,9 +11,13 @@ from hls4ml.backends.vitis.vitis_backend import VitisBackend # isort: skip +from hls4ml.backends.vitis_unified.vitis_unified_backend import VitisUnifiedBackend # isort: skip + + register_backend('Vivado', VivadoBackend) register_backend('VivadoAccelerator', VivadoAcceleratorBackend) register_backend('Vitis', VitisBackend) +register_backend('VitisUnified', VitisUnifiedBackend) register_backend('Quartus', QuartusBackend) register_backend('Catapult', CatapultBackend) register_backend('SymbolicExpression', SymbolicExpressionBackend) diff --git a/hls4ml/backends/vitis_unified/passes/fifo_depth_optimization.py b/hls4ml/backends/vitis_unified/passes/fifo_depth_optimization.py new file mode 100644 index 0000000000..0451270ca1 --- /dev/null +++ b/hls4ml/backends/vitis_unified/passes/fifo_depth_optimization.py @@ -0,0 +1,113 @@ +# we inherit it from vitis +import zipfile + +from hls4ml.backends.vitis.passes.fifo_depth_optimization import ( + generate_depths_file, + initialize_large_fifos, + set_optimized_fifo_depths, +) +from hls4ml.model.optimizer.optimizer import ConfigurableOptimizerPass, ModelOptimizerPass + + +def get_vitis_optimized_fifo_depths(model, cus_hls_prj_path=None): + """Parse the files generated by the co-simulation to retrieve the optimized depths for the FIFOs. + Attention, only the FIFOs between the layers are profiled! + + Args: + model (ModelGraph): The model to which FIFO depth optimization is applied. + + Returns: + Dict[str, int]: A dictionary that contains the FIFO names as keys and the optimized depths as values. 
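+
+    Note:
+        ``cus_hls_prj_path`` can point at a custom Vitis HLS solution directory; when it is
+        omitted, the project's default output location is used.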
+ """ + # channel.zip is generated after the co-simulation and contains the chan_status*.csv files + # in the chan_status*.csv files the max depth achieved during co-simulation can be found at the last (4th) line + + if cus_hls_prj_path is None: + cus_hls_prj_path = model.config.get_output_dir() + '/' + model.config.get_project_name() + '/_prj/solution1' + + path_to_zip_file = cus_hls_prj_path + '/.autopilot/db/channel_depth_info/' + + with zipfile.ZipFile(f'{path_to_zip_file}channel.zip', 'r') as zip_ref: + zip_ref.extractall(path_to_zip_file) + + # the channel_info.csv file contains the mapping of each fifo name (i.e layer4_out_U) to the respective + # chan_status*.csv file + names_file_path = cus_hls_prj_path + '/.autopilot/db/channel_info.csv' + + csv_fifo_depth_files = {} + with open(names_file_path) as names_file: + for _ in range(4): + next(names_file) + for line in names_file: + layer_name = line.split(',')[1] + csv_file_name = line.split(',')[3][:-1] + csv_fifo_depth_files[layer_name] = csv_file_name + + optmized_fifo_depths = {} + for layer_name, file_name in csv_fifo_depth_files.items(): + with open(path_to_zip_file + file_name) as chan_status_file: + lines = chan_status_file.readlines() + optmized_fifo_depths[layer_name[:-4]] = int( + lines[-1] + ) # remove "_i_U" from the layer name string and keep the last line of the file that contains the max depth + + return optmized_fifo_depths + + +def execute_cosim_to_profile_fifos(model): + model.write() + model.build( + reset=False, + csim=False, + synth=True, + cosim=False, + validation=False, + export=False, + vsynth=False, + fifo_opt=True, + bitfile=False, + log_to_stdout=False, + ) + + +class FifoDepthOptimization(ConfigurableOptimizerPass, ModelOptimizerPass): + + def __init__(self): + self.profiling_fifo_depth = 100_000 + + def transform(self, model): + """Perform FIFO depth optimization between the FIFOs of all layers to reduce resource utilization as the + initial FIFOs set by hls4ml might be larger than required. At the end of the optimization the FIFOs will + have the largest depths achieved during co-simulation without causing any deadlocks between the layers + (producer-consumer), thus no additional delays between the layers. In some cases, this optimization + might lead to bigger FIFOs than initially set by the hls4ml tool in order to prevent deadlocks. + + Args: + model (ModelGraph): The model to which FIFO depth optimization is applied. + + Raises: + ValueError: If the FIFO depth for profiling provided by the user is not a non-negative integer. + RuntimeError: If the IO type is not set to "io_stream". 
+
+        Returns:
+            bool: The execution state of the Optimizer Pass
+        """
+
+        if not isinstance(self.profiling_fifo_depth, int) or self.profiling_fifo_depth <= 0:
+            raise ValueError('The FIFO depth for profiling (profiling_fifo_depth variable) must be a positive integer.')
+
+        # FIFO profiling is only supported for io_stream models
+        if not (model.config.get_config_value('IOType') == 'io_stream'):
+            raise RuntimeError('To use this optimization you have to set `IOType` field to `io_stream` in the HLS config.')
+
+        hls_prj_path = model.config.backend.writer.mg.get_vitis_hls_exec_dir(model)
+
+        initial_fifo_depths = initialize_large_fifos(model, self.profiling_fifo_depth)
+        execute_cosim_to_profile_fifos(model)
+        optimized_fifo_depths = get_vitis_optimized_fifo_depths(model, cus_hls_prj_path=hls_prj_path + "/hls")
+        generate_depths_file(model, initial_fifo_depths, optimized_fifo_depths)
+        set_optimized_fifo_depths(model, optimized_fifo_depths)
+
+        print('FIFO optimization completed')
+
+        return False
diff --git a/hls4ml/backends/vitis_unified/vitis_unified_backend.py b/hls4ml/backends/vitis_unified/vitis_unified_backend.py
new file mode 100644
index 0000000000..8bfed0f88a
--- /dev/null
+++ b/hls4ml/backends/vitis_unified/vitis_unified_backend.py
@@ -0,0 +1,181 @@
+import os
+import subprocess
+import sys
+from shutil import copy2
+
+from hls4ml.backends import VitisBackend, VivadoBackend
+from hls4ml.model.flow import register_flow
+from hls4ml.writer.vitis_unified_writer.meta_gen import VitisUnified_MetaGen as mg
+
+
+class VitisUnifiedBackend(VitisBackend):
+    def __init__(self):
+        # deliberately skip VitisBackend.__init__ and initialize the base backend under the new name
+        super(VivadoBackend, self).__init__(name='VitisUnified')
+        self._register_layer_attributes()
+        self._register_flows()
+
+    def run_term_command(self, model, taskName: str, command: str, logStdOut: bool, cwd):
+
+        print("-------------------------------------------------------")
+        print(f"start running task : {taskName}")
+        print(f" with command: {command}")
+        print("-------------------------------------------------------")
+
+        output_dir = model.config.get_output_dir()
+
+        out_log_path = os.path.join(output_dir, f'{taskName}_out.log')
+        err_log_path = os.path.join(output_dir, f'{taskName}_err.log')
+        out_target = None if logStdOut else open(out_log_path, 'w')
+        err_target = None if logStdOut else open(err_log_path, 'w')
+
+        try:
+            runningProcess = subprocess.Popen(command, shell=True, cwd=cwd, stdout=out_target, stderr=err_target, text=True)
+            # communicate() may only be called once; it waits for the process to finish
+            runningProcess.communicate()
+            if runningProcess.returncode != 0:
+                raise Exception(
+                    f'Task {taskName} failed for project {model.config.get_project_name()}. See logs for details.'
+                )
+
+            print(f"task {taskName} finished")
+
+        except Exception as e:
+            print(f"task {taskName} failed")
+            print(e)
+            raise e
+        finally:
+            if out_target:
+                out_target.close()
+            if err_target:
+                err_target.close()
+
+    def build(
+        self,
+        model,
+        reset=False,
+        csim=False,
+        synth=False,
+        cosim=False,
+        validation=False,
+        export=False,
+        vsynth=False,
+        fifo_opt=False,
+        bitfile=False,
+        log_to_stdout=True,
+    ):
+        # builds the project with the Vitis unified CLI tools and collects the reports
+        if 'linux' in sys.platform:
+            found = os.system('command -v vitis > /dev/null')
+            if found != 0:
+                raise Exception('Vitis installation not found. Make sure "vitis" is on PATH.')
+
+        if csim:
+            raise Exception("The current Vitis Unified flow does not support csim. 
Please set csim=False to run Vitis Unified.")
+        if validation:
+            raise Exception(
+                "The current Vitis Unified flow does not support validation. Please set validation=False to run Vitis Unified."
+            )
+        if export:
+            raise Exception("The current Vitis Unified flow does not support export. Please set export=False to run Vitis Unified.")
+
+        output_dir = model.config.get_output_dir()
+
+        hls_config_file = os.path.join(output_dir, "hls_kernel_config.cfg")
+        # synthesis command
+        csynth_cmd = ("v++ -c --mode hls --config {configPath} --work_dir unifiedPrj").format(configPath=hls_config_file)
+        csynth_cwd = mg.get_vitis_hls_dir(model)
+
+        # command template shared by the vitis-run operations (csim/cosim/package)
+        util_command = "vitis-run --mode hls --{op} --config {configPath} --work_dir unifiedPrj"
+
+        # one command per operation
+        package_cmd = util_command.format(op="package", configPath=hls_config_file)
+        package_cwd = mg.get_vitis_hls_dir(model)
+        cosim_cmd = util_command.format(op="cosim", configPath=hls_config_file)
+        cosim_cwd = mg.get_vitis_hls_dir(model)
+        csim_cmd = util_command.format(op="csim", configPath=hls_config_file)
+        csim_cwd = mg.get_vitis_hls_dir(model)
+
+        kerlink_cmd = "./buildAcc.sh"
+        kerlink_cwd = mg.get_vitis_linker_dir(model)
+
+        if synth:
+            self.prepare_sim_config_file(model, True)
+            self.run_term_command(model, "csynth", csynth_cmd, log_to_stdout, csynth_cwd)
+            self.run_term_command(model, "package", package_cmd, log_to_stdout, package_cwd)
+
+        if csim:
+            self.prepare_sim_config_file(model, True)
+            self.run_term_command(model, "csim", csim_cmd, log_to_stdout, csim_cwd)
+
+        if cosim or fifo_opt:
+            self.prepare_sim_config_file(model, False)
+            self.run_term_command(model, "cosim", cosim_cmd, log_to_stdout, cosim_cwd)
+
+        # link the kernel into the platform and produce the bitstream
+        if bitfile:
+            self.run_term_command(model, "kerlink", kerlink_cmd, log_to_stdout, kerlink_cwd)
+
+    def prepare_sim_config_file(self, model, is_csim):
+        suffix = "csim" if is_csim else "cosim"
+        src = f"{model.config.get_output_dir()}/hls_kernel_config_{suffix}.cfg"
+        des = f"{model.config.get_output_dir()}/hls_kernel_config.cfg"
+        copy2(src, des)
+        return des
+
+    def create_initial_config(
+        self,
+        board='zcu102',
+        part=None,
+        clock_period=5,
+        clock_uncertainty='12.5%',
+        io_type='io_stream',
+        driver='python',
+        input_type='float',
+        output_type='float',
+        in_stream_buf_size=128,
+        out_stream_buf_size=128,
+        xpfmPath='/opt/Xilinx/Vitis/2023.2/base_platforms/xilinx_zcu102_base_202320_1/xilinx_zcu102_base_202320_1.xpfm',
+        **_,
+    ):
+
+        # validate the arguments before assembling the config
+        if io_type != "io_stream":
+            raise Exception("io_type must be io_stream")
+        if input_type not in ["double", "float"]:
+            raise Exception("input_type must be float or double")
+        if output_type not in ["double", "float"]:
+            raise Exception("output_type must be float or double")
+
+        config = super().create_initial_config(part, clock_period, clock_uncertainty, io_type)
+
+        config['UnifiedConfig'] = {}
+        config['UnifiedConfig']["in_stream_buf_Size"] = in_stream_buf_size
+        config['UnifiedConfig']["out_stream_buf_Size"] = out_stream_buf_size
+        config['UnifiedConfig']['XPFMPath'] = xpfmPath
+        config['UnifiedConfig']['Board'] = board
+        config['UnifiedConfig']['Driver'] = driver
+        config['UnifiedConfig']['InputDtype'] = input_type  # 'float' or 'double'
+        config['UnifiedConfig']['OutputDtype'] = output_type  # 'float' or 'double'
+
+        return config
+
+    def get_default_flow(self):
+        return self._default_flow
+
+    def get_writer_flow(self):
+        return self._writer_flow
+
+    def _register_flows(self):
+        vitis_ip = 'vitis:ip'
+        writer_passes = ['make_stamp',
'vitisunified:write_hls']
+        self._writer_flow = register_flow('write', writer_passes, requires=['vitis:ip'], backend=self.name)
+        self._default_flow = vitis_ip
+
+        # register the fifo depth optimization flow
+        fifo_depth_opt_passes = ['vitisunified:fifo_depth_optimization'] + writer_passes
+
+        register_flow('fifo_depth_optimization', fifo_depth_opt_passes, requires=['vitis:ip'], backend=self.name)
diff --git a/hls4ml/backends/vitis_unified/vitis_unified_config.py b/hls4ml/backends/vitis_unified/vitis_unified_config.py
new file mode 100644
index 0000000000..8f3289931e
--- /dev/null
+++ b/hls4ml/backends/vitis_unified/vitis_unified_config.py
@@ -0,0 +1,49 @@
+class VitisUnifiedConfig:
+
+    def __init__(self, config, model_inputs, model_outputs):
+        self.config = config.config
+        self.board = self.config.get('UnifiedConfig', {}).get('Board', 'pynq-z2')
+
+        # before the first and after the last layer we have the configurable stream buffers
+        # [platform]<-->[in_stream_bufferSz]<-->[hls]<-->[out_stream_bufferSz]<-->[platform]
+        self.in_stream_bufferSz = self.config["UnifiedConfig"]["in_stream_buf_Size"]
+        self.out_stream_bufferSz = self.config["UnifiedConfig"]["out_stream_buf_Size"]
+
+        # the path to the generated platform
+        self.XPFMPath = self.config["UnifiedConfig"]["XPFMPath"]
+
+        self.driver = self.config['UnifiedConfig']['Driver']
+
+        # C++ type for the input and output of the hls kernel; must be a str ('float' or 'double')
+        self.input_type = self.config['UnifiedConfig']['InputDtype']
+        self.output_type = self.config['UnifiedConfig']['OutputDtype']
+
+        assert self.input_type == self.output_type, "Input and Output data types must be the same type"
+        assert len(model_inputs) >= 1, "Only models with at least one input tensor are currently supported by VitisUnified"
+        assert len(model_outputs) >= 1, "Only models with at least one output tensor are currently supported by VitisUnified"
+        self.inps = model_inputs.copy()
+        self.outs = model_outputs.copy()
+
+    def get_corrected_types(self):
+        return self.input_type, self.output_type, self.inps, self.outs
+
+    def get_driver(self):
+        return self.driver
+
+    def get_board(self):
+        return self.board
+
+    def get_input_type(self):
+        return self.input_type
+
+    def get_output_type(self):
+        return self.output_type
+
+    def get_in_stream_bufferSz(self):
+        return self.in_stream_bufferSz
+
+    def get_out_stream_bufferSz(self):
+        return self.out_stream_bufferSz
+
+    def get_XPFMPath(self):
+        return self.XPFMPath
diff --git a/hls4ml/templates/vitis_unified/build_lib.sh b/hls4ml/templates/vitis_unified/build_lib.sh
new file mode 100644
index 0000000000..2645804f90
--- /dev/null
+++ b/hls4ml/templates/vitis_unified/build_lib.sh
@@ -0,0 +1,31 @@
+#!/bin/bash
+
+CC=g++
+if [[ "$OSTYPE" == "linux-gnu" ]]; then
+    CFLAGS="-O3 -fPIC -std=c++11 -fno-gnu-unique"
+elif [[ "$OSTYPE" == "darwin"* ]]; then
+    CFLAGS="-O3 -fPIC -std=c++11"
+fi
+VITIS_UNIFIED_FLAGS="VITIS_UNIFIED"
+CFLAGS="$CFLAGS -D$VITIS_UNIFIED_FLAGS"
+
+INCFLAGS="-Ifirmware/ap_types/"
+
+PROJECT=myprojectBaseName
+WRAPPER_NAME=myprojectWrapName
+LIB_STAMP=mystamp
+BASEDIR="$(cd "$(dirname "$0")" && pwd)"
+WEIGHTS_DIR="\"${BASEDIR}/firmware/weights\""
+
+echo "------------- This is build_lib.sh debug message ----------------"
+echo "Compiling for OSTYPE: $OSTYPE"
+echo "CFLAGS: $CFLAGS"
+echo "Include Flags: $INCFLAGS"
+echo "Weights directory: $WEIGHTS_DIR"
+echo "-----------------------------------------------------------------"
+
+${CC} ${CFLAGS} ${INCFLAGS} -D WEIGHTS_DIR="${WEIGHTS_DIR}" -c firmware/${PROJECT}.cpp -o ${PROJECT}.o
+${CC}
${CFLAGS} ${INCFLAGS} -D WEIGHTS_DIR="${WEIGHTS_DIR}" -c firmware/${WRAPPER_NAME}.cpp -o ${WRAPPER_NAME}.o +${CC} ${CFLAGS} ${INCFLAGS} -D WEIGHTS_DIR="${WEIGHTS_DIR}" -c ${PROJECT}_bridge.cpp -o ${PROJECT}_bridge.o +${CC} ${CFLAGS} ${INCFLAGS} -shared ${PROJECT}.o ${WRAPPER_NAME}.o ${PROJECT}_bridge.o -o firmware/${PROJECT}-${LIB_STAMP}.so +rm -f *.o diff --git a/hls4ml/templates/vitis_unified/build_lib_multigraph.sh b/hls4ml/templates/vitis_unified/build_lib_multigraph.sh new file mode 100644 index 0000000000..9dcd85f7d1 --- /dev/null +++ b/hls4ml/templates/vitis_unified/build_lib_multigraph.sh @@ -0,0 +1,69 @@ +#!/bin/bash +set -e + +CC=g++ +if [[ "$OSTYPE" == "linux-gnu" ]]; then + CFLAGS="-O3 -fPIC -std=c++11 -fno-gnu-unique" +elif [[ "$OSTYPE" == "darwin"* ]]; then + CFLAGS="-O3 -fPIC -std=c++11" +fi + +graph_project_names=(mygraph_name_list) + +LDFLAGS= +VITIS_UNIFIED_FLAGS="VITIS_UNIFIED" +CFLAGS="$CFLAGS -D$VITIS_UNIFIED_FLAGS" + +ORIGINAL_PROJECT=myproject +PROJECT=myproject_stitched +LIB_STAMP=mystamp +BASEDIR="$(cd "$(dirname "$0")" && cd .. && pwd)" +INCFLAGS="" +OUTPUT_DIR="${BASEDIR}/stitched/firmware" +WEIGHTS_DIR="\"${BASEDIR}/stitched/firmware/weights\"" + +mkdir -p "${OUTPUT_DIR}" + +# Compile all graphs in parallel +OBJECT_FILES=() +PIDS=() + +for g in "${graph_project_names[@]}"; do + SRC_FILE="${g}/firmware/${ORIGINAL_PROJECT}_${g}.cpp" + OBJ_FILE="${ORIGINAL_PROJECT}_${g}.o" + AP_TYPES_PATH="-I${BASEDIR}/${g}/firmware/ap_types/" + ( + ${CC} ${CFLAGS} ${AP_TYPES_PATH} -D WEIGHTS_DIR="${WEIGHTS_DIR}" -c "${BASEDIR}/${SRC_FILE}" -o "${OBJ_FILE}" + ) & + PIDS+=($!) + OBJECT_FILES+=("${OBJ_FILE}") + INCFLAGS+="-I${BASEDIR}/${g}/ " +done + +# compile axi_stream as well + +for g in "${graph_project_names[@]}"; do + SRC_FILE="${g}/firmware/${ORIGINAL_PROJECT}_${g}_axi.cpp" + OBJ_FILE="${ORIGINAL_PROJECT}_${g}_axi.o" + AP_TYPES_PATH="-I${BASEDIR}/${g}/firmware/ap_types/" + ( + ${CC} ${CFLAGS} ${AP_TYPES_PATH} -D WEIGHTS_DIR="${WEIGHTS_DIR}" -c "${BASEDIR}/${SRC_FILE}" -o "${OBJ_FILE}" + ) & + PIDS+=($!) 
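+    # each compile above runs in a background subshell; the PIDs recorded here are reaped by the
+    # wait loop below, so a failing compile aborts the script via set -e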
+ OBJECT_FILES+=("${OBJ_FILE}") + #INCFLAGS+="-I${BASEDIR}/${g}/ " +done + + + +for pid in "${PIDS[@]}"; do + wait $pid +done + +AP_TYPES_PATH="-I${BASEDIR}/${graph_project_names[@]: -1}/firmware/ap_types/" + +${CC} ${CFLAGS} ${INCFLAGS} ${AP_TYPES_PATH} -c "${PROJECT}_bridge.cpp" -o ${PROJECT}_bridge.o +${CC} ${CFLAGS} ${INCFLAGS} ${AP_TYPES_PATH} -shared "${OBJECT_FILES[@]}" ${PROJECT}_bridge.o -o "${OUTPUT_DIR}/${PROJECT}-${LIB_STAMP}.so" + +rm -f "${OBJECT_FILES[@]}" +rm -f ${PROJECT}_bridge.o diff --git a/hls4ml/templates/vitis_unified/driver/pynq/pynq_driver.py.hls4ml b/hls4ml/templates/vitis_unified/driver/pynq/pynq_driver.py.hls4ml new file mode 100644 index 0000000000..e8c1d0de5c --- /dev/null +++ b/hls4ml/templates/vitis_unified/driver/pynq/pynq_driver.py.hls4ml @@ -0,0 +1,98 @@ +# import the library +import os +import re +import subprocess +import time + +import numpy as np +from pynq import DefaultIP # import the ip connector library for extension +from pynq import Overlay # import the overlay +from pynq import allocate # import for CMA (contingeous memory allocation) + + +class MyDfxCtrl(DefaultIP): + def __init__(self, description): + super().__init__(description=description) + + self.REG_ADDR_AP_CTRL = 0x00 + self.REG_ADDR_AMT_QUERY = VAL + + self.REG_ADDR_GIE = 0x04 + self.REG_ADDR_IER = 0x08 + self.REG_ADDR_ISR = 0x0C + + self.INP_PORT_NAMEs = [ + # hls-driver-input-dbg-name + ] + + self.REG_ADDR_INP_PTRs = [ + # hls-driver-input-ptr + ] + + self.OUT_PORT_NAMEs = [ + # hls-driver-output-dbg-name + ] + + self.REG_ADDR_OUT_PTRs = [ + # hls-driver-output-ptr + ] + + bindto = ['xilinx.com:hls::1.0'] + + def enable_gie(self): + print("global interrupt enable register") + self.write(self.REG_ADDR_GIE, 0x01) + print("enable gie successful") + + def disable_gie(self): + print("global interrupt enable register") + self.write(self.REG_ADDR_GIE, 0x01) + print("disable gie successful") + + def enable_done_intr(self): + print("ap_done interrupt enable register") + self.write(self.REG_ADDR_IER, 0x01) + print("enable ap_done interrupt successful") + + def clear_done_status(self): + print("ap_done register clear") + self.write(self.REG_ADDR_ISR, 0x01) + print("clear ap_done interrupt successful") + + def prepare_intr(self): + print("prepare your interrupt") + self.enable_gie() + self.enable_done_intr() + self.clear_done_status() + print("----------------------") + + def set_single_bit(self, addr, idx): + self.write(addr, 1 << idx) + + def ctrl_start(self): + self.write(0x00, 0x01) # ap_start = 1 + + def wait_until_done(self): + while (self.read(0x00) & 0x2) == 0: # Wait for ap_done + time.sleep(0.001) + + def set_input(self, idx, buffer): + + print( + f"input {self.INP_PORT_NAMEs[idx]} will be set to addr: {hex(buffer.physical_address)} with elements: {buffer.size}" + ) + self.write(self.REG_ADDR_INP_PTRs[idx], buffer.physical_address) + self.write(self.REG_ADDR_INP_PTRs[idx] + 4, 0) + buffer.flush() + + def set_output(self, idx, buffer): + + print( + f"output {self.OUT_PORT_NAMEs[idx]} will be set to addr: {hex(buffer.physical_address)} with elements: {buffer.size}" + ) + self.write(self.REG_ADDR_OUT_PTRs[idx], buffer.physical_address) + self.write(self.REG_ADDR_OUT_PTRs[idx] + 4, 0) + + def set_amt_query(self, val): + print(f"amount of queries will be set to: {val} at address: {hex(self.REG_ADDR_AMT_QUERY)}") + self.write(self.REG_ADDR_AMT_QUERY, val) diff --git a/hls4ml/templates/vitis_unified/hls_kernel_config.cfg b/hls4ml/templates/vitis_unified/hls_kernel_config.cfg new file mode 
100644
index 0000000000..c1d12a0c18
--- /dev/null
+++ b/hls4ml/templates/vitis_unified/hls_kernel_config.cfg
@@ -0,0 +1,24 @@
+part={PART}
+
+[hls]
+clock={CLK}
+clock_uncertainty={CLK_UC}
+flow_target=vivado
+syn.file={OUTDIR}/firmware/{FILE_NAME_WRAP}.cpp
+syn.file={OUTDIR}/firmware/{FILE_NAME_BASE}.cpp
+syn.file_cflags={OUTDIR}/firmware/{FILE_NAME_WRAP}.cpp,-std=c++0x
+syn.file_cflags={OUTDIR}/firmware/{FILE_NAME_BASE}.cpp,-std=c++0x
+
+syn.top={TOP_NAME}
+
+tb.file={OUTDIR}/{SIM_FILE_NAME}.cpp
+tb.file={OUTDIR}/firmware/weights
+tb.file={OUTDIR}/tb_data
+tb.file_cflags={OUTDIR}/{SIM_FILE_NAME}.cpp,-std=c++0x
+tb.file_cflags={OUTDIR}/{SIM_FILE_NAME}.cpp,-DRTL_SIM
+package.ip.version=1.0.0
+package.output.format={OUTPUT_KERNEL_TYPE}
+syn.compile.name_max_length=80
+syn.schedule.enable_dsp_full_reg=0
+package.output.syn=1
+cosim.enable_fifo_sizing=true
diff --git a/hls4ml/templates/vitis_unified/myproject_bridge.cpp b/hls4ml/templates/vitis_unified/myproject_bridge.cpp
new file mode 100644
index 0000000000..9a56f10d99
--- /dev/null
+++ b/hls4ml/templates/vitis_unified/myproject_bridge.cpp
@@ -0,0 +1,71 @@
+#ifndef MYPROJECT_BRIDGE_H_
+#define MYPROJECT_BRIDGE_H_
+
+#include "firmware/PROJECT_FILE_NAME.h"
+#include "firmware/nnet_utils/nnet_helpers.h"
+#include <algorithm>
+#include <map>
+
+// hls-fpga-machine-learning insert bram
+
+namespace nnet {
+bool trace_enabled = false;
+std::map<std::string, void *> *trace_outputs = NULL;
+size_t trace_type_size = sizeof(double);
+} // namespace nnet
+
+extern "C" {
+
+struct trace_data {
+    const char *name;
+    void *data;
+};
+
+void allocate_trace_storage(size_t element_size) {
+    nnet::trace_enabled = true;
+    nnet::trace_outputs = new std::map<std::string, void *>;
+    nnet::trace_type_size = element_size;
+    // hls-fpga-machine-learning insert trace_outputs
+}
+
+void free_trace_storage() {
+    for (std::map<std::string, void *>::iterator i = nnet::trace_outputs->begin(); i != nnet::trace_outputs->end(); i++) {
+        void *ptr = i->second;
+        free(ptr);
+    }
+    nnet::trace_outputs->clear();
+    delete nnet::trace_outputs;
+    nnet::trace_outputs = NULL;
+    nnet::trace_enabled = false;
+}
+
+void collect_trace_output(struct trace_data *c_trace_outputs) {
+    int ii = 0;
+    for (std::map<std::string, void *>::iterator i = nnet::trace_outputs->begin(); i != nnet::trace_outputs->end(); i++) {
+        c_trace_outputs[ii].name = i->first.c_str();
+        c_trace_outputs[ii].data = i->second;
+        ii++;
+    }
+}
+
+// hls-fpga-machine-learning insert tb_input_writer
+
+// Wrapper of top level function for Python bridge
+void myproject_float(
+    // hls-fpga-machine-learning insert header #float
+) {
+    // hls-fpga-machine-learning insert namespace
+
+    // hls-fpga-machine-learning insert wrapper #float
+}
+
+void myproject_double(
+    // hls-fpga-machine-learning insert header #double
+) {
+    // hls-fpga-machine-learning insert namespace
+
+    // hls-fpga-machine-learning insert wrapper #double
+}
+}
+
+#endif
diff --git a/hls4ml/templates/vitis_unified/myproject_dm.cpp b/hls4ml/templates/vitis_unified/myproject_dm.cpp
new file mode 100644
index 0000000000..fa373d5a4c
--- /dev/null
+++ b/hls4ml/templates/vitis_unified/myproject_dm.cpp
@@ -0,0 +1,64 @@
+#include <hls_stream.h>
+//#include "ap_axi_sdata.h"
+#include "MY_PROJECT_DM_INC.h"
+
+#define STREAM_BUF_IN_SZ VAL
+#define STREAM_BUF_OUT_SZ VAL
+
+template <typename ATOMIC_TYPE, typename INPUT_LAYER_ARR>
+void load_input(ATOMIC_TYPE *in, hls::stream<INPUT_LAYER_ARR> &inStream, int amtQuery, const int TENSOR_SIZE) {
+mem_rd:
+    int baseQuery = 0;
+    for (int q = 0; q < amtQuery; q++) {
+        for (int i = 0; i < TENSOR_SIZE / INPUT_LAYER_ARR::size; i++) {
+            INPUT_LAYER_ARR tmp;
+            for (int j = 0; j <
INPUT_LAYER_ARR::size; j++) {
+                tmp[j] = in[baseQuery];
+                baseQuery++;
+            }
+            inStream.write(tmp);
+        }
+    }
+}
+
+template <typename ATOMIC_TYPE, typename OUT_LAYER_ARR>
+void store_result(ATOMIC_TYPE *out, hls::stream<OUT_LAYER_ARR> &out_stream, int amtQuery, const int TENSOR_SIZE) {
+mem_wr:
+    int baseQuery = 0;
+    for (int q = 0; q < amtQuery; q++) {
+        for (int i = 0; i < TENSOR_SIZE / OUT_LAYER_ARR::size; i++) {
+            OUT_LAYER_ARR tmp = out_stream.read();
+            for (int j = 0; j < OUT_LAYER_ARR::size; j++) {
+                out[baseQuery] = tmp[j];
+                baseQuery++;
+            }
+        }
+    }
+}
+
+void MY_PROJECT_TOP_FUNC(
+    // vitis-unified-wrapper-io
+    , int amtQuery
+
+) {
+
+    // vitis-unified-wrapper-interface
+    #pragma HLS INTERFACE s_axilite port=amtQuery bundle=control
+    #pragma HLS INTERFACE s_axilite port=return bundle=control
+
+    // vitis-unified-wrapper-stream-dec
+
+    // vitis-unified-wrapper-stream-config
+
+    #pragma HLS dataflow
+
+    // vitis-unified-wrapper-load
+
+    for (int q = 0; q < amtQuery; q++) {
+        // vitis-unified-wrapper-compute
+    }
+
+    // vitis-unified-wrapper-store
+}
diff --git a/hls4ml/templates/vitis_unified/myproject_dm.h b/hls4ml/templates/vitis_unified/myproject_dm.h
new file mode 100644
index 0000000000..d8ca8eb0f7
--- /dev/null
+++ b/hls4ml/templates/vitis_unified/myproject_dm.h
@@ -0,0 +1,13 @@
+#ifndef FILENAME_H
+#define FILENAME_H
+
+#include <hls_stream.h>
+
+#include "MY_PROJECT_INC.h"
+
+void MY_PROJECT_TOP_FUNC(
+
+    // vitis-unified-wrapper-io
+    , int amtQuery);
+
+#endif
diff --git a/hls4ml/templates/vitis_unified/myproject_test.cpp b/hls4ml/templates/vitis_unified/myproject_test.cpp
new file mode 100644
index 0000000000..a8d031e304
--- /dev/null
+++ b/hls4ml/templates/vitis_unified/myproject_test.cpp
@@ -0,0 +1,97 @@
+#include <fstream>
+#include <iostream>
+#include <map>
+#include <sstream>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <vector>
+
+// hls-fpga-machine-learning insert include
+
+#include "firmware/nnet_utils/nnet_helpers.h"
+
+// hls-fpga-machine-learning insert bram
+
+#define CHECKPOINT 5000
+
+namespace nnet {
+bool trace_enabled = true;
+std::map<std::string, void *> *trace_outputs = NULL;
+size_t trace_type_size = sizeof(double);
+} // namespace nnet
+
+int main(int argc, char **argv) {
+    // hls-fpga-machine-learning insert namespace
+
+    // load input data from text file
+    std::ifstream fin("tb_data/tb_input_features.dat");
+    // load predictions from text file
+    std::ifstream fpr("tb_data/tb_output_predictions.dat");
+
+#ifdef RTL_SIM
+    std::string RESULTS_LOG = "tb_data/rtl_cosim_results.log";
+#else
+    std::string RESULTS_LOG = "tb_data/csim_results.log";
+#endif
+    std::ofstream fout(RESULTS_LOG);
+
+    std::string iline;
+    std::string pline;
+    int e = 0;
+
+    if (fin.is_open() && fpr.is_open()) {
+        while (std::getline(fin, iline) && std::getline(fpr, pline)) {
+            if (e % CHECKPOINT == 0)
+                std::cout << "Processing input " << e << std::endl;
+            char *cstr = const_cast<char *>(iline.c_str());
+            char *current;
+            std::vector<float> in;
+            current = strtok(cstr, " ");
+            while (current != NULL) {
+                in.push_back(atof(current));
+                current = strtok(NULL, " ");
+            }
+            cstr = const_cast<char *>(pline.c_str());
+            std::vector<float> pr;
+            current = strtok(cstr, " ");
+            while (current != NULL) {
+                pr.push_back(atof(current));
+                current = strtok(NULL, " ");
+            }
+
+            // hls-fpga-machine-learning insert data
+
+            // hls-fpga-machine-learning insert top-level-function
+
+            if (e % CHECKPOINT == 0) {
+                std::cout << "Predictions" << std::endl;
+                // hls-fpga-machine-learning insert predictions
+                std::cout << "Quantized predictions" << std::endl;
+                // hls-fpga-machine-learning insert quantized
+            }
+            e++;
+
+            // hls-fpga-machine-learning insert tb-output
+        }
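+        // per-sample results were already appended to fout through the tb-output insertion
+        // point above, so only the input/prediction streams remain to be closed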
fin.close(); + fpr.close(); + } else { + std::cout << "INFO: Unable to open input/predictions file, using default input." << std::endl; + const unsigned NUM_TEST_SAMPLES = 5; + for (unsigned i = 0; i < NUM_TEST_SAMPLES; i++) { + // hls-fpga-machine-learning insert zero + + // hls-fpga-machine-learning insert top-level-function + + // hls-fpga-machine-learning insert output + + // hls-fpga-machine-learning insert tb-output + } + } + + fout.close(); + std::cout << "INFO: Saved inference results to file: " << RESULTS_LOG << std::endl; + + return 0; +} diff --git a/hls4ml/templates/vitis_unified/workspace/projectName/vitis-comp.json b/hls4ml/templates/vitis_unified/workspace/projectName/vitis-comp.json new file mode 100644 index 0000000000..9c7eb3fb62 --- /dev/null +++ b/hls4ml/templates/vitis_unified/workspace/projectName/vitis-comp.json @@ -0,0 +1,9 @@ +{ + "name": "{HLS_NAME}", + "type": "HLS", + "configuration": { + "componentType": "HLS", + "configFiles": ["{CONFIG_FILE}"], + "work_dir": "unifiedPrj" + } +} diff --git a/hls4ml/templates/vitis_unified/workspace/sysProj/buildAcc.sh b/hls4ml/templates/vitis_unified/workspace/sysProj/buildAcc.sh new file mode 100644 index 0000000000..38da5ac4e5 --- /dev/null +++ b/hls4ml/templates/vitis_unified/workspace/sysProj/buildAcc.sh @@ -0,0 +1,6 @@ +v++ -l -t hw --platform {PLATFORM_XPFM} {KERNEL_XO} --config buildConfig.cfg -o {PROJECT_NAME}.xclbin --save-temps +[ -f ../../export/system.bit ] && rm -f ../../export/system.bit +[ -f ../../export/system.hwh ] && rm -f ../../export/system.hwh + +xclbinutil --dump-section BITSTREAM:RAW:../../export/system.bit --input {PROJECT_NAME}.xclbin +cp _x/link/vivado/vpl/prj/prj.gen/sources_1/bd/vitis_design/hw_handoff/vitis_design.hwh ../../export/system.hwh diff --git a/hls4ml/templates/vitis_unified/workspace/sysProj/buildConfig.cfg b/hls4ml/templates/vitis_unified/workspace/sysProj/buildConfig.cfg new file mode 100644 index 0000000000..c1266844d0 --- /dev/null +++ b/hls4ml/templates/vitis_unified/workspace/sysProj/buildConfig.cfg @@ -0,0 +1,2 @@ +[vivado] +gui={GUI_STATUS} diff --git a/hls4ml/templates/vivado/ap_types/ap_axi_sdata.h b/hls4ml/templates/vivado/ap_types/ap_axi_sdata.h new file mode 100755 index 0000000000..2913ce80a1 --- /dev/null +++ b/hls4ml/templates/vivado/ap_types/ap_axi_sdata.h @@ -0,0 +1,402 @@ +// Copyright 1986-2022 Xilinx, Inc. All Rights Reserved. +// Copyright 2022-2023 Advanced Micro Devices, Inc. All Rights Reserved. + +// 67d7842dbbe25473c3c32b93c0da8047785f30d78e8a024de1b57352245f9689 + +/* + * This file contains the definition of the data types for AXI streaming. 
+ * ap_axi_s is a signed interpretation of the AXI stream + * ap_axi_u is an unsigned interpretation of the AXI stream + */ + +#ifndef __AP__AXI_SDATA__ +#define __AP__AXI_SDATA__ + +#include "ap_int.h" +#include "hls_stream.h" +#include +#include +#include +#include +//#include "ap_fixed.h" +template +struct ap_fixed; +template +struct ap_ufixed; + +namespace hls { + +template constexpr std::size_t bitwidth = sizeof(T) * CHAR_BIT; +template <> constexpr std::size_t bitwidth = 1 * CHAR_BIT; + +template constexpr std::size_t bitwidth> = W; +template constexpr std::size_t bitwidth> = W; +template +constexpr std::size_t bitwidth> = + _AP_W; +template +constexpr std::size_t bitwidth> = + _AP_W; + +template +constexpr std::size_t bytewidth = (bitwidth + CHAR_BIT - 1) / CHAR_BIT; +template <> constexpr std::size_t bytewidth = 1; + +struct axis_disabled_signal {}; + +// Enablement for axis signals +#define AXIS_ENABLE_DATA 0b00000001 +#define AXIS_ENABLE_DEST 0b00000010 +#define AXIS_ENABLE_ID 0b00000100 +#define AXIS_ENABLE_KEEP 0b00001000 +#define AXIS_ENABLE_LAST 0b00010000 +#define AXIS_ENABLE_STRB 0b00100000 +#define AXIS_ENABLE_USER 0b01000000 + +// clang-format off +// Disablement mask for DATA axis signals +#define AXIS_DISABLE_DATA (0b11111111 ^ AXIS_ENABLE_DATA) & \ + (0b11111111 ^ AXIS_ENABLE_KEEP) & \ + (0b11111111 ^ AXIS_ENABLE_STRB) + +// Enablement/disablement of all axis signals +#define AXIS_ENABLE_ALL 0b01111111 +#define AXIS_DISABLE_ALL 0b00000000 + +// Struct: axis - struct that has one or more member 'signals' +// Signals: DATA, DEST, ID, KEEP, LAST, STRB, USER +// All signals are optional: +// LAST is enabled by default +// DEST, ID, & USER are disabled by default +// DATA, KEEP, & STRB are enabled by default for non-void DATA type +// Template parameters: +// T : type of the DATA signal +// WUser : size of the USER signal, if zero signal will be disabled +// WId : size of the ID signal, if zero signal will be disabled +// WDest : size of the DEST signal, if zero signal will be disabled +// EnableSignals : bit field to enable signals, see AXIS_ENABLE_* +// StrictEnablement : when true check that EnableSignals matches other parameters +// clang-format on +template +struct axis { + static_assert((EnableSignals & 0b10000000) == 0, + "Template parameter 'EnableSignals' is invalid only " + "low 7 bits can be set!"); + friend class stream< + axis>; + + static constexpr bool has_data = !std::is_void::value; + static constexpr bool has_user = WUser > 0; + static constexpr bool has_id = WId > 0; + static constexpr bool has_dest = WDest > 0; + static constexpr bool has_keep = EnableSignals & AXIS_ENABLE_KEEP; + static constexpr bool has_strb = EnableSignals & AXIS_ENABLE_STRB; + static constexpr bool has_last = EnableSignals & AXIS_ENABLE_LAST; + + static constexpr std::size_t width_user = has_user ? WUser : 1; + static constexpr std::size_t width_id = has_id ? WId : 1; + static constexpr std::size_t width_dest = has_dest ? WDest : 1; + static constexpr std::size_t width_keep = bytewidth; + static constexpr std::size_t width_strb = bytewidth; + static constexpr std::size_t width_last = 1; + + static_assert(has_data || has_user || has_id || has_dest || has_keep || + has_strb || has_last, + "No axis signals are enabled"); + + static_assert(StrictEnablement + ? has_data == (bool)(EnableSignals & AXIS_ENABLE_DATA) + : true, + "Found mismatched enablement for DATA signal"); + static_assert(StrictEnablement + ? 
has_user == (bool)(EnableSignals & AXIS_ENABLE_USER) + : true, + "Found mismatched enablement for USER signal"); + static_assert(StrictEnablement + ? has_id == (bool)(EnableSignals & AXIS_ENABLE_ID) + : true, + "Found mismatched enablement for ID signal"); + static_assert(StrictEnablement + ? has_dest == (bool)(EnableSignals & AXIS_ENABLE_DEST) + : true, + "Found mismatched enablement for DEST signal"); + + typedef typename std::conditional::type + Type_data; + Type_data data; + +#ifdef AESL_SYN + + NODEBUG Type_data get_data() const { +#pragma HLS inline + assert(has_data); + return data; + } + NODEBUG void set_data(Type_data d) { +#pragma HLS inline + assert(has_data); + data = d; + } + +#define _AXIS_CHANNEL_API(CHAN_NAME) \ + typedef \ + typename std::conditional, \ + axis_disabled_signal>::type Type_##CHAN_NAME; \ + Type_##CHAN_NAME CHAN_NAME; \ + __attribute__((nodebug)) __attribute__((always_inline)) \ + Type_##CHAN_NAME get_##CHAN_NAME() const { \ + assert(has_##CHAN_NAME); \ + return CHAN_NAME; \ + } \ + __attribute__((nodebug)) __attribute__( \ + (always_inline)) void set_##CHAN_NAME(Type_##CHAN_NAME value) { \ + assert(has_##CHAN_NAME); \ + CHAN_NAME = value; \ + } + +#else + + Type_data get_data() const { + if (!has_data) + throw std::runtime_error("CHAN_NAME is not enabled"); + return data; + } + void set_data(Type_data d) { + if (!has_data) + throw std::runtime_error("CHAN_NAME is not enabled"); + data = d; + } + +#define _AXIS_CHANNEL_API(CHAN_NAME) \ + typedef \ + typename std::conditional, \ + axis_disabled_signal>::type Type_##CHAN_NAME; \ + Type_##CHAN_NAME CHAN_NAME; \ + Type_##CHAN_NAME get_##CHAN_NAME() const { \ + if (!has_##CHAN_NAME) \ + throw std::runtime_error("CHAN_NAME is not enabled"); \ + return CHAN_NAME; \ + } \ + void set_##CHAN_NAME(Type_##CHAN_NAME value) { \ + if (!has_##CHAN_NAME) \ + throw std::runtime_error("CHAN_NAME is not enabled"); \ + CHAN_NAME = value; \ + } + +#endif + + _AXIS_CHANNEL_API(keep) + _AXIS_CHANNEL_API(strb) + _AXIS_CHANNEL_API(user) + _AXIS_CHANNEL_API(last) + _AXIS_CHANNEL_API(id) + _AXIS_CHANNEL_API(dest) +#undef _AXIS_CHANNEL_API + +// For original `qdma_axis` +#ifdef AESL_SYN + NODEBUG +#endif + void keep_all() { +#pragma HLS inline +#ifdef AESL_SYN + assert(has_keep); +#else + if (!has_data) + throw std::runtime_error("CHAN_NAME is not enabled"); +#endif + ap_uint k = 0; + keep = ~k; + } + +private: +#ifdef AESL_SYN +#define _AXIS_CHANNEL_INTERNAL_API(CHAN_NAME) \ + __attribute__((nodebug)) __attribute__((always_inline)) \ + Type_##CHAN_NAME *get_##CHAN_NAME##_ptr() { \ + return (!has_##CHAN_NAME) ? 
nullptr : &CHAN_NAME; \ + } + + _AXIS_CHANNEL_INTERNAL_API(data) + _AXIS_CHANNEL_INTERNAL_API(keep) + _AXIS_CHANNEL_INTERNAL_API(strb) + _AXIS_CHANNEL_INTERNAL_API(user) + _AXIS_CHANNEL_INTERNAL_API(last) + _AXIS_CHANNEL_INTERNAL_API(id) + _AXIS_CHANNEL_INTERNAL_API(dest) +#undef _AXIS_CHANNEL_INTERNAL_API +#endif +}; + +// clang-format off +// Struct: axis_data (alternative to axis) +// DATA signal always enabled +// All other signals are optional, disabled by default +// Example usage: +// hls::axis_data A; // DATA and LAST signals only +// hls::axis_data B; // DATA, LAST, and USER signals only (USER width is 32) +// hls::axis_data C; // All signals enabled +// hls::axis_data D; // All signals enabled, this throw an exception due to zero size for WUser/WId/WDest +// clang-format on +template +using axis_data = axis; + +// Struct: axis_user (alternative to axis) +// USER signal always enabled +// DATA signal always disabled +// All other signals are optional, disabled by default +// Example usage: +// hls::axis_user<32> C; // USER signal only +// hls::axis_user<32, AXIS_ENABLE_LAST> D; // USER and LAST signals only +template +using axis_user = axis; + +} // namespace hls + +template +using ap_axis = hls::axis, WUser, WId, WDest, EnableSignals, + StrictEnablement>; + +template +using ap_axiu = hls::axis, WUser, WId, WDest, EnableSignals, + StrictEnablement>; + +// original usage: qdma_axis, and TSTRB is omitted. +template +using qdma_axis = hls::axis, WUser, WId, WDest, + AXIS_ENABLE_ALL ^ AXIS_ENABLE_STRB, false>; + +#ifdef AESL_SYN +#if ((__clang_major__ != 3) || (__clang_minor__ != 1)) +namespace hls { + +template +class stream> + final { + typedef axis + __STREAM_T__; + +public: + /// Constructors + INLINE NODEBUG stream() {} + + INLINE NODEBUG stream(const char *name) { (void)name; } + + /// Make copy constructor and assignment operator private +private: + INLINE NODEBUG stream(const stream<__STREAM_T__> &chn) : V(chn.V) {} + +public: + /// Overload >> and << operators to implement read() and write() + INLINE NODEBUG void operator>>(__STREAM_T__ &rdata) { read(rdata); } + + INLINE NODEBUG void operator<<(const __STREAM_T__ &wdata) { write(wdata); } + + /// empty & full + NODEBUG bool empty() { +#pragma HLS inline + bool tmp = __fpga_axis_valid( + V.get_data_ptr(), V.get_keep_ptr(), V.get_strb_ptr(), V.get_user_ptr(), + V.get_last_ptr(), V.get_id_ptr(), V.get_dest_ptr()); + return !tmp; + } + + NODEBUG bool full() { +#pragma HLS inline + bool tmp = __fpga_axis_ready( + V.get_data_ptr(), V.get_keep_ptr(), V.get_strb_ptr(), V.get_user_ptr(), + V.get_last_ptr(), V.get_id_ptr(), V.get_dest_ptr()); + return !tmp; + } + + /// Blocking read + NODEBUG void read(__STREAM_T__ &dout) { +#pragma HLS inline + __STREAM_T__ tmp; + __fpga_axis_pop(V.get_data_ptr(), V.get_keep_ptr(), V.get_strb_ptr(), + V.get_user_ptr(), V.get_last_ptr(), V.get_id_ptr(), + V.get_dest_ptr(), tmp.get_data_ptr(), tmp.get_keep_ptr(), + tmp.get_strb_ptr(), tmp.get_user_ptr(), tmp.get_last_ptr(), + tmp.get_id_ptr(), tmp.get_dest_ptr()); + dout = tmp; + } + + NODEBUG __STREAM_T__ read() { +#pragma HLS inline + __STREAM_T__ tmp; + __fpga_axis_pop(V.get_data_ptr(), V.get_keep_ptr(), V.get_strb_ptr(), + V.get_user_ptr(), V.get_last_ptr(), V.get_id_ptr(), + V.get_dest_ptr(), tmp.get_data_ptr(), tmp.get_keep_ptr(), + tmp.get_strb_ptr(), tmp.get_user_ptr(), tmp.get_last_ptr(), + tmp.get_id_ptr(), tmp.get_dest_ptr()); + return tmp; + } + + /// Blocking write + NODEBUG void write(const __STREAM_T__ &din) { +#pragma HLS inline + 
__STREAM_T__ tmp = din; + __fpga_axis_push(V.get_data_ptr(), V.get_keep_ptr(), V.get_strb_ptr(), + V.get_user_ptr(), V.get_last_ptr(), V.get_id_ptr(), + V.get_dest_ptr(), tmp.get_data_ptr(), tmp.get_keep_ptr(), + tmp.get_strb_ptr(), tmp.get_user_ptr(), tmp.get_last_ptr(), + tmp.get_id_ptr(), tmp.get_dest_ptr()); + } + + /// Non-Blocking read + NODEBUG bool read_nb(__STREAM_T__ &dout) { +#pragma HLS inline + __STREAM_T__ tmp; + if (__fpga_axis_nb_pop(V.get_data_ptr(), V.get_keep_ptr(), V.get_strb_ptr(), + V.get_user_ptr(), V.get_last_ptr(), V.get_id_ptr(), + V.get_dest_ptr(), tmp.get_data_ptr(), + tmp.get_keep_ptr(), tmp.get_strb_ptr(), + tmp.get_user_ptr(), tmp.get_last_ptr(), + tmp.get_id_ptr(), tmp.get_dest_ptr())) { + dout = tmp; + return true; + } else { + return false; + } + } + + /// Non-Blocking write + NODEBUG bool write_nb(const __STREAM_T__ &in) { +#pragma HLS inline + __STREAM_T__ tmp = in; + bool full_n = __fpga_axis_nb_push( + V.get_data_ptr(), V.get_keep_ptr(), V.get_strb_ptr(), V.get_user_ptr(), + V.get_last_ptr(), V.get_id_ptr(), V.get_dest_ptr(), tmp.get_data_ptr(), + tmp.get_keep_ptr(), tmp.get_strb_ptr(), tmp.get_user_ptr(), + tmp.get_last_ptr(), tmp.get_id_ptr(), tmp.get_dest_ptr()); + return full_n; + } + +private: + __STREAM_T__ V NO_CTOR; +}; + +} // namespace hls +#endif +#endif +#endif diff --git a/hls4ml/writer/__init__.py b/hls4ml/writer/__init__.py index 8de19fe1d2..52b00604b5 100644 --- a/hls4ml/writer/__init__.py +++ b/hls4ml/writer/__init__.py @@ -2,6 +2,7 @@ from hls4ml.writer.oneapi_writer import OneAPIWriter from hls4ml.writer.quartus_writer import QuartusWriter from hls4ml.writer.symbolic_writer import SymbolicExpressionWriter +from hls4ml.writer.vitis_unified_writer import VitisUnifiedWriter from hls4ml.writer.vitis_writer import VitisWriter from hls4ml.writer.vivado_accelerator_writer import VivadoAcceleratorWriter from hls4ml.writer.vivado_writer import VivadoWriter @@ -10,6 +11,7 @@ register_writer('Vivado', VivadoWriter) register_writer('VivadoAccelerator', VivadoAcceleratorWriter) register_writer('Vitis', VitisWriter) +register_writer('VitisUnified', VitisUnifiedWriter) register_writer('Quartus', QuartusWriter) register_writer('oneAPI', OneAPIWriter) register_writer('Catapult', CatapultWriter) diff --git a/hls4ml/writer/vitis_unified_writer/__init__.py b/hls4ml/writer/vitis_unified_writer/__init__.py new file mode 100644 index 0000000000..51a0c2fba7 --- /dev/null +++ b/hls4ml/writer/vitis_unified_writer/__init__.py @@ -0,0 +1,88 @@ +import os + +from hls4ml.backends.vitis_unified.vitis_unified_config import VitisUnifiedConfig +from hls4ml.writer.vitis_writer import VitisWriter + +from .meta import VitisUnifiedWriterMeta + + +class VitisUnifiedWriter(VitisWriter): + + def __init__(self): + super().__init__() + self.writer_meta = VitisUnifiedWriterMeta() + + from .build_gen import VitisUnified_BuildGen + from .driver_gen import VitisUnified_DriverGen + from .meta_gen import VitisUnified_MetaGen + from .test_bridge_gen import VitisUnified_BridgeGen + from .test_cosim_gen import VitisUnified_TestGen + from .wrap_gen import VitisUnified_WrapperGen + + self.bg = VitisUnified_BuildGen + self.dg = VitisUnified_DriverGen + self.mg = VitisUnified_MetaGen + self.tbg = VitisUnified_BridgeGen + self.tcg = VitisUnified_TestGen + self.wg = VitisUnified_WrapperGen + + def write_board_script_override(self, model): + pass + + def write_build_prj_override(self, model): + pass + + def write_build_opts(self, model): + pass + + def write_tar(self, model): + pass + + def 
write_bridge(self, model):  # test bench gen
+        self.tbg.write_bridge(self.writer_meta, model, self.mg)
+
+    def write_build_script(self, model):
+        # for bridge simulation
+        self.bg.write_bridge_build_script(self.writer_meta, model, self.mg)
+        # for hls kernel generation
+        self.bg.build_unified_project_ske(self.writer_meta, model, self.mg)
+        self.bg.write_hls_kernel_cfg(self.writer_meta, model, self.mg, True)
+        self.bg.write_hls_kernel_cfg(self.writer_meta, model, self.mg, False)
+        # for v++ to link the hls kernel into the system
+        self.bg.write_launch_vitis_linker_dir(self.writer_meta, model, self.mg)
+        self.bg.write_launch_vitis_linker_launcher(self.writer_meta, model, self.mg)
+        self.bg.write_launch_vitis_linker_cfg(self.writer_meta, model, self.mg)
+
+    def generate_config(self, model):
+
+        self.writer_meta.vitis_unified_config = VitisUnifiedConfig(
+            model.config, model.get_input_variables(), model.get_output_variables()
+        )
+
+    def make_export_path(self, model):
+        export_path = f'{model.config.get_output_dir()}/export'
+        if not os.path.exists(export_path):
+            os.makedirs(export_path)
+
+    def write_hls(self, model, is_multigraph=False):
+
+        if is_multigraph:
+            raise Exception(
+                "Vitis Unified does not support multi-graph models; please use the Vitis Unified partial backend instead"
+            )
+
+        # generate the kernel and its driver
+        self.generate_config(model)
+        super().write_hls(model, is_multigraph=False)
+        self.wg.write_wrapper(self.writer_meta, model, self.mg)
+
+        self.make_export_path(model)
+        self.dg.write_driver(self.writer_meta, model, self.mg)
+        self.tcg.write_wrapper_test(self.writer_meta, model, self.mg)
+
+        # self.write_new_tar(model)
+        # if not is_multigraph:
+
+        # else:
+        #     self.write_bridge_multigraph(model)
+        #     self.modify_write_build_script_multigraph(model)
diff --git a/hls4ml/writer/vitis_unified_writer/build_gen.py b/hls4ml/writer/vitis_unified_writer/build_gen.py
new file mode 100644
index 0000000000..9abbc99f73
--- /dev/null
+++ b/hls4ml/writer/vitis_unified_writer/build_gen.py
@@ -0,0 +1,143 @@
+import os
+import stat
+from pathlib import Path
+
+from .meta import VitisUnifiedWriterMeta
+
+
+class VitisUnified_BuildGen:
+
+    @classmethod
+    def write_bridge_build_script(self, meta: VitisUnifiedWriterMeta, model, mg):
+        filedir = os.path.dirname(os.path.abspath(__file__))
+        fin = open(os.path.join(filedir, '../../templates/vitis_unified/build_lib.sh'))
+        fout = open(f"{model.config.get_output_dir()}/build_lib.sh", 'w')
+
+        for line in fin.readlines():
+            if 'myprojectBaseName' in line:
+                line = line.replace('myprojectBaseName', format(model.config.get_project_name()))
+            if 'myprojectWrapName' in line:
+                line = line.replace('myprojectWrapName', mg.get_wrapper_file_name(model))
+            if 'mystamp' in line:
+                line = line.replace('mystamp', model.config.get_config_value('Stamp'))
+
+            fout.write(line)
+
+        fin.close()
+        fout.close()
+
+        # make the generated script executable
+        build_lib_dst = Path(f'{model.config.get_output_dir()}/build_lib.sh').resolve()
+        build_lib_dst.chmod(build_lib_dst.stat().st_mode | stat.S_IEXEC)

+    @classmethod
+    def write_hls_kernel_cfg(self, meta, model, mg, is_csim=False):  # True -> csim config; False -> cosim + fifo optimization config
+        # Generates the hls_kernel_config_<suffix>.cfg file, which the Vitis HLS unified flow uses
+        # to configure the synthesizer
+        filedir = os.path.dirname(os.path.abspath(__file__))
+        suffix = "csim" if is_csim else "cosim"
+        fin = open(os.path.join(filedir, '../../templates/vitis_unified/hls_kernel_config.cfg'))
+        fout =
open(f"{model.config.get_output_dir()}/hls_kernel_config_{sufix}.cfg", 'w') + + for line in fin.readlines(): + if "{PART}" in line: + line = line.replace("{PART}", model.config.get_config_value('Part')) + if "{CLK}" in line: + line = line.replace("{CLK}", model.config.get_config_value('ClockPeriod')) + if "{CLK_UC}" in line: + line = line.replace("{CLK_UC}", model.config.get_config_value('ClockUncertainty')) + if "{OUTDIR}" in line: + line = line.replace("{OUTDIR}", model.config.get_output_dir()) + if "{TOP_NAME}" in line: + line = line.replace("{TOP_NAME}", mg.get_top_wrap_func_name(model)) + if "{FILE_NAME_WRAP}" in line: + line = line.replace("{FILE_NAME_WRAP}", mg.get_wrapper_file_name(model)) + if "{SIM_FILE_NAME}" in line: + line = line.replace("{SIM_FILE_NAME}", mg.get_sim_file_name()) + if "{FILE_NAME_BASE}" in line: + line = line.replace("{FILE_NAME_BASE}", mg.get_main_file_name(model)) + if "{OUTPUT_KERNEL_TYPE}" in line: + line = line.replace("{OUTPUT_KERNEL_TYPE}", mg.get_output_kernel_type()) + if is_csim and (("enable_fifo_sizing" in line) or ("-DRTL_SIM" in line)): + line = "#" + line + + fout.write(line) + + fin.close() + fout.close() + + @classmethod + def build_unified_project_ske(self, meta, model, mg, workspaceDir=None): + # this will generate the vitis-comp.json file, the file will enable vitis ide gui to see it + # as a project + if workspaceDir is None: + workspaceDir = mg.get_vitis_unified_working_directory_dir(model) + hlsDir = mg.get_vitis_hls_dir(model) + execDir = mg.get_vitis_hls_dir(model) + vitisComp = os.path.join(str(hlsDir), "vitis-comp.json") + + # create my own project for this graph + os.makedirs(workspaceDir, exist_ok=True) + os.makedirs(hlsDir, exist_ok=True) + os.makedirs(execDir, exist_ok=True) + # create project vitis-comp.json to + filedir = os.path.dirname(os.path.abspath(__file__)) + fin = open(os.path.join(filedir, "../../templates/vitis_unified/workspace/projectName/vitis-comp.json")) + fout = open(vitisComp, 'w') + + for line in fin.readlines(): + if "{HLS_NAME}" in line: + line = line.replace("{HLS_NAME}", model.config.get_project_name()) + if "{CONFIG_FILE}" in line: + line = line.replace("{CONFIG_FILE}", f"{model.config.get_output_dir()}/hls_kernel_config.cfg") + fout.write(line) + + fin.close() + fout.close() + + @classmethod + def write_launch_vitis_linker_dir(self, meta, model, mg): + os.makedirs(mg.get_vitis_linker_dir(model), exist_ok=True) + + @classmethod + def write_launch_vitis_linker_launcher(self, meta, model, mg): + # This section generate buildAcc.sh file to combine the platform and the hls kernel together + filedir = os.path.dirname(os.path.abspath(__file__)) + fin = open(os.path.join(filedir, '../../templates/vitis_unified/workspace/sysProj/buildAcc.sh')) + fout = open(f"{mg.get_vitis_linker_dir(model)}/buildAcc.sh", 'w') + + for line in fin.readlines(): + if "{PLATFORM_XPFM}" in line: + line = line.replace("{PLATFORM_XPFM}", meta.vitis_unified_config.get_XPFMPath()) + if "{KERNEL_XO}" in line: + line = line.replace("{KERNEL_XO}", mg.get_xo_file_path(model)) + if "{PROJECT_NAME}" in line: + line = line.replace("{PROJECT_NAME}", model.config.get_project_name()) + + fout.write(line) + + fin.close() + fout.close() + + link_lib_dst = Path(f"{mg.get_vitis_linker_dir(model)}/buildAcc.sh").resolve() + link_lib_dst.chmod(link_lib_dst.stat().st_mode | stat.S_IEXEC) + + @classmethod + def write_launch_vitis_linker_cfg(self, meta, model, mg): + # this will generate the config file that linker (platform + vitis) + filedir = 
os.path.dirname(os.path.abspath(__file__))
+        fin = open(os.path.join(filedir, '../../templates/vitis_unified/workspace/sysProj/buildConfig.cfg'))
+        fout = open(f"{mg.get_vitis_linker_dir(model)}/buildConfig.cfg", 'w')
+
+        for line in fin.readlines():
+            if "{CLK}" in line:
+                line = line.replace("{CLK}", str(100_000_000))  # model.config.get_config_value('ClockPeriod'))
+            if "{KERNEL_NAME}" in line:
+                line = line.replace("{KERNEL_NAME}", mg.get_top_wrap_func_name(model))
+            if "{GUI_STATUS}" in line:
+                line = line.replace("{GUI_STATUS}", "true")
+            fout.write(line)
+
+        fin.close()
+        fout.close()
diff --git a/hls4ml/writer/vitis_unified_writer/driver_gen.py b/hls4ml/writer/vitis_unified_writer/driver_gen.py
new file mode 100644
index 0000000000..e8e6ef3088
--- /dev/null
+++ b/hls4ml/writer/vitis_unified_writer/driver_gen.py
@@ -0,0 +1,48 @@
+import os
+
+
+class VitisUnified_DriverGen:
+
+    @classmethod
+    def write_driver(self, meta, model, mg):
+        filedir = os.path.dirname(os.path.abspath(__file__))
+        fin = open(os.path.join(filedir, '../../templates/vitis_unified/driver/pynq/pynq_driver.py.hls4ml'))
+        fout = open(f'{model.config.get_output_dir()}/export/pynq_driver.py', 'w')
+
+        inp_gmem_t, out_gmem_t, inps, outs = meta.vitis_unified_config.get_corrected_types()
+
+        # each pointer argument is assumed to occupy three 4-byte registers in the AXI-lite map
+        strideInPtrAddr = 4 * 3
+        strideOutPtrAddr = 4 * 3
+
+        # the argument registers start at 0x10: input pointers first, then output pointers,
+        # then the query-count register
+        startInPtrAddr = 0x10
+        startOutPtrAddr = startInPtrAddr + strideInPtrAddr * len(inps)
+        startAmtQueryAddr = startOutPtrAddr + strideOutPtrAddr * len(outs)
+
+        def genHexAddrList(startAddr, stride, size, indent):
+            addrs = [f"{indent}{hex(startAddr + inp_idx * stride)}" for inp_idx in range(size)]
+            return addrs
+
+        indentAmt = 3
+        indentStr = indentAmt * " " if indentAmt > 0 else ""
+
+        for line in fin.readlines():
+
+            if "REG_ADDR_AMT_QUERY" in line:
+                line = line.replace("VAL", str(hex(startAmtQueryAddr)))
+            if "# hls-driver-input-dbg-name" in line:
+                input_names = [f'{indentStr}"{mg.get_io_port_name(inp, True, idx)}"' for idx, inp in enumerate(inps)]
+                line += ",\n".join(input_names) + "\n"
+            if "# hls-driver-input-ptr" in line:
+                line += ",\n".join(genHexAddrList(startInPtrAddr, strideInPtrAddr, len(inps), indentStr)) + "\n"
+            if "# hls-driver-output-dbg-name" in line:
+                output_names = [f'{indentStr}"{mg.get_io_port_name(out, False, idx)}"' for idx, out in enumerate(outs)]
+                line += ",\n".join(output_names) + "\n"
+            if "# hls-driver-output-ptr" in line:
+                line += ",\n".join(genHexAddrList(startOutPtrAddr, strideOutPtrAddr, len(outs), indentStr)) + "\n"
+            if "" in line:
+                line = line.replace("", mg.get_top_wrap_func_name(model))
+
+            fout.write(line)
+
+        fin.close()
+        fout.close()
diff --git a/hls4ml/writer/vitis_unified_writer/meta.py b/hls4ml/writer/vitis_unified_writer/meta.py
new file mode 100644
index 0000000000..19917da0d9
--- /dev/null
+++ b/hls4ml/writer/vitis_unified_writer/meta.py
@@ -0,0 +1,5 @@
+class VitisUnifiedWriterMeta:
+
+    def __init__(self):
+        super().__init__()
+        self.vitis_unified_config = None
diff --git a/hls4ml/writer/vitis_unified_writer/meta_gen.py b/hls4ml/writer/vitis_unified_writer/meta_gen.py
new file mode 100644
index 0000000000..a30a5769fd
--- /dev/null
+++ b/hls4ml/writer/vitis_unified_writer/meta_gen.py
@@ -0,0 +1,90 @@
+import os
+
+# file and directory helpers
+
+
+class VitisUnified_MetaGen:
+
+    @classmethod
+    def get_wrapper_file_name(self, model):
+        return f"{model.config.get_project_name()}_dm"
+
+    @classmethod
+    def get_sim_file_name(cls):
+        return "myproject_test"
+
+    @classmethod
+    def get_main_file_name(self, model):
+        return
model.config.get_project_name() + + @classmethod + def get_vitis_unified_working_directory_dir(self, model): + return os.path.join(model.config.get_output_dir(), "unifiedWorkspace") + + @classmethod + def get_vitis_hls_dir(self, model): + vitisWorkingDir = self.get_vitis_unified_working_directory_dir(model) + return os.path.join(vitisWorkingDir, model.config.get_project_name()) + + @classmethod + def get_vitis_hls_exec_dir(self, model): + hlsDir = self.get_vitis_hls_dir(model) + return os.path.join(hlsDir, "unifiedPrj") + + @classmethod + def get_vitis_linker_dir(self, model): + vitisWorkingDir = self.get_vitis_unified_working_directory_dir(model) + return os.path.join(vitisWorkingDir, "linker") + + @classmethod + def get_xo_file_name(self, model): + return f"{self.get_top_wrap_func_name(model)}.xo" + + @classmethod + def get_xo_file_path(self, model): + return os.path.join(self.get_vitis_hls_exec_dir(model), self.get_xo_file_name(model)) + + # naming of variable function helper + + # FOR GMEM WRAPPER + + @classmethod + def get_io_port_name(self, tensorVar, isInput: bool, idx: int): + ioDirect = "in" if isInput else "out" + return f"gmem_{ioDirect}{str(idx)}_ptr_{tensorVar.name}" + + @classmethod + def get_io_port_size_name(self, tensorVar, isInput: bool, idx: int): + ioDirect = "in" if isInput else "out" + return f"gmem_{ioDirect}{str(idx)}_size_{tensorVar.name}" + + @classmethod + def get_local_stream_name(self, tensorVar, isInput: bool, idx: int): + ioDirect = "in" if isInput else "out" + return f"stream_{ioDirect}{str(idx)}_{tensorVar.name}" + + @classmethod + def get_dma_type_name(self): + return "dma_data_packet" + + @classmethod + def get_wrapper_port_name(self, tensorVar, isInput: bool): + ioStr = "in" if isInput else "out" + return f"par_{ioStr}_{tensorVar.name}" + + @classmethod + def get_top_model_name(self, model): + return f"{model.config.get_project_name()}" + + @classmethod + def get_top_wrap_func_name(self, model): + return f"{model.config.get_project_name()}_gem" + + # it is renamed for stitch layer + @classmethod + def rename_type(self, tensorVar, layerIdx: int, isInput: bool): + return "result_" + tensorVar.type.name + f"_at_layer_{str(layerIdx)}" + + @classmethod + def get_output_kernel_type(cls): + return "xo" diff --git a/hls4ml/writer/vitis_unified_writer/test_bridge_gen.py b/hls4ml/writer/vitis_unified_writer/test_bridge_gen.py new file mode 100644 index 0000000000..999ebe9818 --- /dev/null +++ b/hls4ml/writer/vitis_unified_writer/test_bridge_gen.py @@ -0,0 +1,110 @@ +import os + +from .meta import VitisUnifiedWriterMeta + + +class VitisUnified_BridgeGen: + + @classmethod + def write_bridge(self, meta: VitisUnifiedWriterMeta, model, mg): + + filedir = os.path.dirname(os.path.abspath(__file__)) + fin = open(os.path.join(filedir, '../../templates/vitis_unified/myproject_bridge.cpp')) + fout = open(f"{model.config.get_output_dir()}/{model.config.get_project_name()}_bridge.cpp", 'w') + + model_inputs = model.get_input_variables() + model_outputs = model.get_output_variables() + model_brams = [var for var in model.get_weight_variables() if var.storage.lower() == 'bram'] + + indent = ' ' + + for line in fin.readlines(): + newline = "" + if 'MYPROJECT' in line: + newline = line.replace('MYPROJECT', format(model.config.get_project_name().upper())) + + elif 'myproject' in line: + newline = line.replace('myproject', format(model.config.get_project_name())) + + elif 'PROJECT_FILE_NAME' in line: + newline = line.replace('PROJECT_FILE_NAME', 
diff --git a/hls4ml/writer/vitis_unified_writer/test_bridge_gen.py b/hls4ml/writer/vitis_unified_writer/test_bridge_gen.py
new file mode 100644
index 0000000000..999ebe9818
--- /dev/null
+++ b/hls4ml/writer/vitis_unified_writer/test_bridge_gen.py
@@ -0,0 +1,110 @@
+import os
+
+from .meta import VitisUnifiedWriterMeta
+
+
+class VitisUnified_BridgeGen:
+
+    @classmethod
+    def write_bridge(cls, meta: VitisUnifiedWriterMeta, model, mg):
+
+        filedir = os.path.dirname(os.path.abspath(__file__))
+        fin = open(os.path.join(filedir, '../../templates/vitis_unified/myproject_bridge.cpp'))
+        fout = open(f"{model.config.get_output_dir()}/{model.config.get_project_name()}_bridge.cpp", 'w')
+
+        model_inputs = model.get_input_variables()
+        model_outputs = model.get_output_variables()
+        model_brams = [var for var in model.get_weight_variables() if var.storage.lower() == 'bram']
+
+        indent = '    '
+
+        for line in fin.readlines():
+            newline = ""
+            if 'MYPROJECT' in line:
+                newline = line.replace('MYPROJECT', model.config.get_project_name().upper())
+
+            elif 'myproject' in line:
+                newline = line.replace('myproject', model.config.get_project_name())
+
+            elif 'PROJECT_FILE_NAME' in line:
+                newline = line.replace('PROJECT_FILE_NAME', mg.get_wrapper_file_name(model))
+
+            elif '// hls-fpga-machine-learning insert bram' in line:
+                newline = line
+                for bram in model_brams:
+                    newline += f'#include "firmware/weights/{bram.name}.h"\n'
+
+            elif '// hls-fpga-machine-learning insert header' in line:
+                # Write the function arguments (kernel inputs and outputs) for both the
+                # myproject_float and myproject_double entry points.
+                dtype = line.split('#', 1)[1].strip()
+
+                input_ios = []
+                output_ios = []
+
+                for idx, inp in enumerate(model_inputs):
+                    input_ios.append(f"{dtype} {mg.get_io_port_name(inp, True, idx)}[{inp.size_cpp()}]")
+                for idx, out in enumerate(model_outputs):
+                    output_ios.append(f"{dtype} {mg.get_io_port_name(out, False, idx)}[{out.size_cpp()}]")
+
+                inputs_str = ', '.join(input_ios)
+                outputs_str = ', '.join(output_ios)
+
+                newline = ''
+                newline += indent + inputs_str + ',\n'
+                newline += indent + outputs_str + '\n'
+
+            elif '// hls-fpga-machine-learning insert wrapper' in line:
+
+                # Write the call into the top-level wrapper kernel.
+
+                dtype = line.split('#', 1)[1].strip()
+                if dtype == meta.vitis_unified_config.get_input_type():
+                    input_vars = []
+                    output_vars = []
+
+                    for idx, inp in enumerate(model_inputs):
+                        input_vars.append(mg.get_io_port_name(inp, True, idx))
+                    for idx, out in enumerate(model_outputs):
+                        output_vars.append(mg.get_io_port_name(out, False, idx))
+
+                    inputs_str = ', '.join(input_vars)
+                    outputs_str = ', '.join(output_vars)
+
+                    newline = ''
+                    newline += indent + mg.get_top_wrap_func_name(model) + "(\n"
+                    newline += indent + inputs_str + ',\n'
+                    newline += indent + outputs_str + ',\n'
+                    newline += indent + "1);\n"  # the bridge always issues a single query
+
+            elif '// hls-fpga-machine-learning insert trace_outputs' in line:
+                newline = ''
+                for layer in model.get_layers():
+                    func = layer.get_attr('function_cpp', None)
+                    if func and model.config.trace_output and layer.get_attr('trace', False):
+                        vars = layer.get_variables()
+                        for var in vars:
+                            newline += (
+                                indent
+                                + 'nnet::trace_outputs->insert(std::pair<std::string, void *>('
+                                + f'"{layer.name}", (void *) malloc({var.size_cpp()} * element_size)));\n'
+                            )
+
+            elif '// hls-fpga-machine-learning insert namespace' in line:
+                newline = ''
+
+                namespace = model.config.get_writer_config().get('Namespace', None)
+                if namespace is not None:
+                    newline += indent + f'using namespace {namespace};\n'
+
+            else:
+                newline = line
+            fout.write(newline)
+
+        fin.close()
+        fout.close()
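For orientation, a sketch of the C++ fragment the header branch above emits for a model with one input and one output. The myproject_float/myproject_double entry points come from the template; the tensor names are borrowed from the PYNQ notebook later in this PR, and the size macros are hypothetical stand-ins for size_cpp():

expected = (
    "void myproject_float(\n"
    "    float gmem_in0_ptr_input_1[N_INPUT_1_1],\n"
    "    float gmem_out0_ptr_layer12_out[N_LAYER_12]\n"
    ");"
)
print(expected)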
diff --git a/hls4ml/writer/vitis_unified_writer/test_cosim_gen.py b/hls4ml/writer/vitis_unified_writer/test_cosim_gen.py
new file mode 100644
index 0000000000..f9f6fe4be5
--- /dev/null
+++ b/hls4ml/writer/vitis_unified_writer/test_cosim_gen.py
@@ -0,0 +1,140 @@
+import os
+
+
+class VitisUnified_TestGen:
+
+    @classmethod
+    def write_wrapper_test(cls, meta, model, mg):
+
+        inp_gmem_t, out_gmem_t, inps, outs = meta.vitis_unified_config.get_corrected_types()
+
+        filedir = os.path.dirname(os.path.abspath(__file__))
+        f = open(os.path.join(filedir, '../../templates/vitis_unified/myproject_test.cpp'))
+        fout = open(f'{model.config.get_output_dir()}/{mg.get_sim_file_name()}.cpp', 'w')
+
+        model_inputs = model.get_input_variables()
+        model_outputs = model.get_output_variables()
+        model_brams = [var for var in model.get_weight_variables() if var.storage.lower() == 'bram']
+
+        fout.write("//// generated by Vitis Unified Backend\n")
+
+        for line in f.readlines():
+            indent = ' ' * (len(line) - len(line.lstrip(' ')))
+
+            # Insert numbers
+            if 'myproject' in line:
+                newline = line.replace('myproject', model.config.get_project_name())
+            elif '// hls-fpga-machine-learning insert include' in line:
+                newline = line + f'#include "firmware/{mg.get_wrapper_file_name(model)}.h"\n'
+
+            elif '// hls-fpga-machine-learning insert bram' in line:
+                newline = line
+                for bram in model_brams:
+                    newline += f'#include "firmware/weights/{bram.name}.h"\n'
+
+            elif '// hls-fpga-machine-learning insert data' in line:
+                # Turn the flat testbench vector into per-input float pointers, each
+                # anchored at that input's starting offset.
+                newline = line
+                offset = 0
+                for inputIdx, inp in enumerate(model_inputs):
+                    # the type must be float: it is fixed by the template
+                    newline += indent + 'float* {inputPortName} = &in[{startIdx}];\n'.format(
+                        inputPortName=mg.get_io_port_name(inp, True, inputIdx),
+                        startIdx=str(offset),
+                    )
+                    offset += inp.size()
+                # Declare float arrays to receive the output-layer results.
+                for outputIdx, out in enumerate(model_outputs):
+                    newline += indent + f"float {mg.get_io_port_name(out, False, outputIdx)}[{out.size()}];\n"
+
+            elif '// hls-fpga-machine-learning insert top-level-function' in line:
+
+                # Invoke the *_dm.cpp wrapper, i.e. the system-level entry point.
+
+                newline = line
+
+                input_ios = []
+                output_ios = []
+                bram_ios = [b.name for b in model_brams]
+
+                for inpIdx, inp in enumerate(model_inputs):
+                    input_ios.append(mg.get_io_port_name(inp, True, inpIdx))
+
+                for outIdx, out in enumerate(model_outputs):
+                    output_ios.append(mg.get_io_port_name(out, False, outIdx))
+
+                # Concatenate the input, output, and bram variables, filtering out empty values.
+                all_vars = ', '.join(filter(None, [*input_ios, *output_ios, *bram_ios, "1"]))
+                top_level = indent + f'{mg.get_top_wrap_func_name(model)}({all_vars});\n'
+                newline += top_level
+
+            elif '// hls-fpga-machine-learning insert predictions' in line:
+                newline = line
+                for out in model_outputs:
+                    # TODO: fix this size retrieval
+
+                    newline += indent + f'for(int i = 0; i < {out.size()}; i++) {{\n'
+                    newline += indent + '    std::cout << pr[i] << " ";\n'
+                    newline += indent + '}\n'
+                    newline += indent + 'std::cout << std::endl;\n'
+            elif '// hls-fpga-machine-learning insert zero' in line:
+                newline = line
+                for inpIdx, inp in enumerate(model_inputs):
+                    newline += indent + f'float {mg.get_io_port_name(inp, True, inpIdx)}[{inp.size()}] = {{}};\n'
+
+                for outIdx, out in enumerate(model_outputs):
+                    newline += indent + f"float {mg.get_io_port_name(out, False, outIdx)}[{out.size()}] = {{}};\n"
+
+            elif '// hls-fpga-machine-learning insert tb-output' in line:
+                newline = line
+                tb_stream = model.config.get_writer_config().get('TBOutputStream', 'both')
+                if tb_stream != "stdout":  # it can be 'both' or 'file'
+                    for outIdx, out in enumerate(model_outputs):
+                        newline += (
+                            indent
+                            + 'nnet::print_result<{actualType}, {cpysize}>({portName}, {des}, {keepOutput});\n'.format(
+                                actualType="float",
+                                cpysize=out.size(),
+                                portName=mg.get_io_port_name(out, False, outIdx),
+                                des="fout",
+                                keepOutput="false",
+                            )
+                        )
+            elif (
+                '// hls-fpga-machine-learning insert output' in line
+                or '// hls-fpga-machine-learning insert quantized' in line
+            ):
+
+                newline = line
+                tb_stream = model.config.get_writer_config().get('TBOutputStream', 'both')
+                keep_output = str(tb_stream != "stdout").lower()
+
+                if tb_stream != "file":
+                    for outIdx, out in enumerate(model_outputs):
+                        newline += (
+                            indent
+                            + 'nnet::print_result<{actualType}, {cpysize}>({portName}, {des}, {keepOutput});\n'.format(
+                                actualType="float",
+                                cpysize=out.size(),
+                                portName=mg.get_io_port_name(out, False, outIdx),
+                                des="std::cout",
+                                keepOutput=keep_output,
+                            )
+                        )
+
+            elif '// hls-fpga-machine-learning insert namespace' in line:
+                newline = ''
+
+                namespace = model.config.get_writer_config().get('Namespace', None)
+                if namespace is not None:
+                    newline += indent + f'using namespace {namespace};\n'
+
+            else:
+                newline = line
+
+            fout.write(newline)
+        f.close()
+        fout.close()
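The offset bookkeeping in the "insert data" branch is easiest to see with concrete numbers; a sketch with two hypothetical inputs (port names and sizes are invented):

sizes = [("gmem_in0_ptr_input_a", 16), ("gmem_in1_ptr_input_b", 8)]  # invented
offset = 0
for name, size in sizes:
    print(f"float* {name} = &in[{offset}];")
    offset += size
# float* gmem_in0_ptr_input_a = &in[0];
# float* gmem_in1_ptr_input_b = &in[16];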
diff --git a/hls4ml/writer/vitis_unified_writer/wrap_gen.py b/hls4ml/writer/vitis_unified_writer/wrap_gen.py
new file mode 100644
index 0000000000..59b21a49c3
--- /dev/null
+++ b/hls4ml/writer/vitis_unified_writer/wrap_gen.py
@@ -0,0 +1,144 @@
+import os
+
+from .meta import VitisUnifiedWriterMeta
+
+# main function
+
+
+class VitisUnified_WrapperGen:
+
+    @classmethod
+    def gen_io_str(cls, mg, indent, inp_gmem_t, out_gmem_t, inps, outs, meta=None):
+
+        inputPtrList = []
+        outputPtrList = []
+
+        for inp_idx, inp in enumerate(inps):
+            inputPtrList.append(f"{indent} {inp_gmem_t}* {mg.get_io_port_name(inp, True, inp_idx)}")
+
+        for out_idx, out in enumerate(outs):
+            outputPtrList.append(f"{indent} {out_gmem_t}* {mg.get_io_port_name(out, False, out_idx)}")
+
+        line = ", ".join(inputPtrList) + ",\n"
+        line += ", ".join(outputPtrList) + "\n"
+
+        return line
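What gen_io_str yields, shown for one float input and one float output; the tensor names are illustrative only:

indent = '    '
ins = [f"{indent} float* gmem_in0_ptr_input_1"]
outs = [f"{indent} float* gmem_out0_ptr_layer12_out"]
print(", ".join(ins) + ",\n" + ", ".join(outs))
#      float* gmem_in0_ptr_input_1,
#      float* gmem_out0_ptr_layer12_out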
+    @classmethod
+    def write_wrapper(cls, meta: VitisUnifiedWriterMeta, model, mg):
+
+        inp_gmem_t, out_gmem_t, inps, outs = meta.vitis_unified_config.get_corrected_types()
+        indent = '    '
+
+        # start writing <project>_dm.cpp
+
+        filedir = os.path.dirname(os.path.abspath(__file__))
+        fin = open(os.path.join(filedir, '../../templates/vitis_unified/myproject_dm.cpp'))
+        fout = open(f'{model.config.get_output_dir()}/firmware/{mg.get_wrapper_file_name(model)}.cpp', 'w')
+
+        for line in fin.readlines():
+
+            if "MY_PROJECT_DM_INC" in line:
+                line = line.replace("MY_PROJECT_DM_INC", mg.get_wrapper_file_name(model))
+            elif "MY_PROJECT_TOP_FUNC" in line:
+                line = line.replace("MY_PROJECT_TOP_FUNC", mg.get_top_wrap_func_name(model))
+            elif "STREAM_BUF_IN_SZ" in line:
+                line = line.replace("VAL", str(meta.vitis_unified_config.get_in_stream_bufferSz()))
+            elif "STREAM_BUF_OUT_SZ" in line:
+                line = line.replace("VAL", str(meta.vitis_unified_config.get_out_stream_bufferSz()))
+
+            elif "// vitis-unified-wrapper-io" in line:
+                line = cls.gen_io_str(mg, indent, inp_gmem_t, out_gmem_t, inps, outs) + "\n"
+            elif "// vitis-unified-wrapper-interface" in line:
+                # Generate the interface pragmas: both the inputs (memory reads) and the
+                # outputs (memory writes) are AXI masters.
+                # Note that the gmem_in/out depth must match the co-simulation array
+                # allocation: if the allocation is larger than the depth, the result will
+                # not be correct; if it is smaller, the result is correct but the system
+                # will throw a segmentation fault. The depth does not affect resource
+                # usage in HLS generation.
+                for inp_idx, inp in enumerate(inps):
+                    line += (
+                        f"#pragma HLS INTERFACE m_axi port={mg.get_io_port_name(inp, True, inp_idx)} "
+                        f"bundle = gmem_in{inp_idx} depth={inp.size()}\n"
+                    )
+                for out_idx, out in enumerate(outs):
+                    line += (
+                        f"#pragma HLS INTERFACE m_axi port={mg.get_io_port_name(out, False, out_idx)} "
+                        f"bundle = gmem_out{out_idx} depth={out.size()}\n"
+                    )
+            elif "// vitis-unified-wrapper-stream-dec" in line:
+                # Declare the stream buffers: the AXI master read deposits the input here
+                # and the AXI master write drains the output from here.
+                for inp_idx, inp in enumerate(inps):
+                    line += f"{indent} static hls::stream<{inp.type.name}> {mg.get_local_stream_name(inp, True, inp_idx)};\n"
+                for out_idx, out in enumerate(outs):
+                    line += (
+                        f"{indent} static hls::stream<{out.type.name}> {mg.get_local_stream_name(out, False, out_idx)};\n"
+                    )
+
+            elif "// vitis-unified-wrapper-stream-config" in line:
+                for inp_idx, inp in enumerate(inps):
+                    line += (
+                        f"#pragma HLS STREAM variable={mg.get_local_stream_name(inp, True, inp_idx)} "
+                        f"depth=STREAM_BUF_IN_SZ\n"
+                    )
+                for out_idx, out in enumerate(outs):
+                    line += (
+                        f"#pragma HLS STREAM variable={mg.get_local_stream_name(out, False, out_idx)} "
+                        f"depth=STREAM_BUF_OUT_SZ\n"
+                    )
+
+            elif "// vitis-unified-wrapper-load" in line:
+                # Call load_input to convert the AXI master read into an AXI stream (buffer).
+                for inp_idx, inp in enumerate(inps):
+                    line += (
+                        f"load_input({mg.get_io_port_name(inp, True, inp_idx)}, "
+                        f"{mg.get_local_stream_name(inp, True, inp_idx)}, amtQuery, {inp.size()});\n"
+                    )
+            elif "// vitis-unified-wrapper-compute" in line:
+                poolList = []
+                for inp_idx, inp in enumerate(inps):
+                    poolList.append(f"{mg.get_local_stream_name(inp, True, inp_idx)}")
+                for out_idx, out in enumerate(outs):
+                    poolList.append(f"{mg.get_local_stream_name(out, False, out_idx)}")
+                joinedIo = f",\n{indent}{indent}{indent}".join(poolList)
+                line += f"{indent} {mg.get_top_model_name(model)}({joinedIo});\n"
+
+            elif "// vitis-unified-wrapper-store" in line:
+                # Call store_result to convert the AXI stream (buffer) back into an AXI master write.
+                for out_idx, out in enumerate(outs):
+                    line += (
+                        f"store_result({mg.get_io_port_name(out, False, out_idx)}, "
+                        f"{mg.get_local_stream_name(out, False, out_idx)}, amtQuery, {out.size()});\n"
+                    )
+
+            fout.write(line)
+
+        fin.close()
+        fout.close()
+
+        # start writing <project>_dm.h
+
+        filedir = os.path.dirname(os.path.abspath(__file__))
+        fin = open(os.path.join(filedir, '../../templates/vitis_unified/myproject_dm.h'))
+        fout = open(f'{model.config.get_output_dir()}/firmware/{mg.get_wrapper_file_name(model)}.h', 'w')
+
+        for line in fin.readlines():
+
+            if "FILENAME" in line:
+                line = line.replace("FILENAME", mg.get_wrapper_file_name(model).upper())
+            elif "MY_PROJECT_INC.h" in line:
+                line = line.replace("MY_PROJECT_INC", mg.get_main_file_name(model))
+            elif "MY_PROJECT_TOP_FUNC" in line:
+                line = line.replace("MY_PROJECT_TOP_FUNC", mg.get_top_wrap_func_name(model))
+            elif "// vitis-unified-wrapper-io" in line:
+                line += cls.gen_io_str(mg, indent, inp_gmem_t, out_gmem_t, inps, outs) + "\n"
+            fout.write(line)
+
+        fin.close()
+        fout.close()
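A concrete reading of the depth rule documented in the interface branch above, for a hypothetical (4, 4, 1) input, i.e. 16 elements per query; the port name is invented:

inp_size = 4 * 4 * 1  # elements per query, what inp.size() would return here
print(
    "#pragma HLS INTERFACE m_axi port=gmem_in0_ptr_input_1 "
    f"bundle = gmem_in0 depth={inp_size}"
)
# co-simulation must allocate exactly `depth` elements for this port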
diff --git a/hls4ml/writer/vitis_writer.py b/hls4ml/writer/vitis_writer.py
index 46c0ba3044..35b13c2201 100644
--- a/hls4ml/writer/vitis_writer.py
+++ b/hls4ml/writer/vitis_writer.py
@@ -64,11 +64,14 @@ def write_build_prj_override(self, model):
         dstpath = f'{model.config.get_output_dir()}/build_prj.tcl'
         copyfile(srcpath, dstpath)
 
-    def write_hls(self, model):
+    def write_hls(self, model, is_multigraph=False):
         """
         Write the HLS project. Calls the steps from VivadoWriter, adapted for Vitis
         """
-        super().write_hls(model)
+        if is_multigraph:
+            super().write_hls(model, is_multigraph=True)
+            return
+        super().write_hls(model, is_multigraph=False)
         self.write_nnet_utils_overrides(model)
         self.write_board_script_override(model)
         self.write_build_prj_override(model)
diff --git a/test/pytest/test_backend/cmpResult.py b/test/pytest/test_backend/cmpResult.py
new file mode 100644
index 0000000000..b6753240b3
--- /dev/null
+++ b/test/pytest/test_backend/cmpResult.py
@@ -0,0 +1,26 @@
+import os
+from pathlib import Path
+
+import numpy as np
+
+test_root_path = Path(__file__).parent
+os.environ['XILINX_VITIS'] = "/tools/Xilinx/Vitis/2023.2"
+os.environ['PATH'] = os.environ['XILINX_VITIS'] + '/bin:' + os.environ['PATH']
+
+
+def checkEqual(a, b):
+    equal = np.array_equal(a, b)
+    if equal:
+        print("Test passed: both arrays are equal \U0001f642")
+    else:
+        print("Test failed: the arrays are not equal \U0001f62c")
+
+
+bridge_result = np.load(test_root_path / "output_file/outputGenbit.npy")
+zcu_result = np.load(test_root_path / "output_file/out_hw.npy")
+zcu_flat = zcu_result.reshape(zcu_result.shape[0], -1)
+
+print(bridge_result.shape)
+print(zcu_result.shape)
+
+checkEqual(bridge_result, zcu_flat)
diff --git a/test/pytest/test_backend/pynq_example.ipynb b/test/pytest/test_backend/pynq_example.ipynb
new file mode 100644
index 0000000000..f0e62618b7
--- /dev/null
+++ b/test/pytest/test_backend/pynq_example.ipynb
@@ -0,0 +1,755 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "a0e69eaf",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# import the library\n",
+    "from pynq import Overlay  # import the overlay\n",
+    "from pynq import allocate  # import for CMA (contiguous memory allocation)\n",
+    "from pynq import DefaultIP  # import the IP connector library for extension\n",
+    "from pynq import Interrupt\n",
+    "import pynq_driver\n",
+    "import asyncio\n",
+    "import numpy as np\n",
+    "import os\n",
+    "import subprocess\n",
+    "import re\n",
+    "import time"
+   ]
+  },
+  {
+   "cell_type": "code",
"execution_count": 2, + "id": "89082d04", + "metadata": {}, + "outputs": [ + { + "data": { + "application/javascript": [ + "\n", + "try {\n", + "require(['notebook/js/codecell'], function(codecell) {\n", + " codecell.CodeCell.options_default.highlight_modes[\n", + " 'magic_text/x-csrc'] = {'reg':[/^%%microblaze/]};\n", + " Jupyter.notebook.events.one('kernel_ready.Kernel', function(){\n", + " Jupyter.notebook.get_cells().map(function(cell){\n", + " if (cell.cell_type == 'code'){ cell.auto_highlight(); } }) ;\n", + " });\n", + "});\n", + "} catch (e) {};\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/javascript": [ + "\n", + "try {\n", + "require(['notebook/js/codecell'], function(codecell) {\n", + " codecell.CodeCell.options_default.highlight_modes[\n", + " 'magic_text/x-csrc'] = {'reg':[/^%%pybind11/]};\n", + " Jupyter.notebook.events.one('kernel_ready.Kernel', function(){\n", + " Jupyter.notebook.get_cells().map(function(cell){\n", + " if (cell.cell_type == 'code'){ cell.auto_highlight(); } }) ;\n", + " });\n", + "});\n", + "} catch (e) {};\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Help on Overlay in module pynq.overlay:\n", + "\n", + "\n", + " Default documentation for overlay system.bit. The following\n", + " attributes are available on this overlay:\n", + " \n", + " IP Blocks\n", + " ----------\n", + " myproject_gem_1 : pynq_driver.MyDfxCtrl\n", + " axi_intc_0 : pynq.overlay.DefaultIP\n", + " ps_e : pynq.overlay.DefaultIP\n", + " \n", + " Hierarchies\n", + " -----------\n", + " None\n", + " \n", + " Interrupts\n", + " ----------\n", + " None\n", + " \n", + " GPIO Outputs\n", + " ------------\n", + " None\n", + " \n", + " Memories\n", + " ------------\n", + " PSDDR : Memory\n", + "\n" + ] + } + ], + "source": [ + "overlay = Overlay(\"system.bit\")\n", + "help(overlay)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "b19361e3", + "metadata": {}, + "outputs": [ + { + "data": { + "application/json": { + "axi_intc_0_intr_1_interrupt_concat/In0": { + "controller": "axi_intc_0", + "fullpath": "axi_intc_0_intr_1_interrupt_concat/In0", + "index": 0 + }, + "axi_intc_0_intr_1_interrupt_concat/In1": { + "controller": "axi_intc_0", + "fullpath": "axi_intc_0_intr_1_interrupt_concat/In1", + "index": 1 + }, + "axi_intc_0_intr_1_interrupt_concat/In10": { + "controller": "axi_intc_0", + "fullpath": "axi_intc_0_intr_1_interrupt_concat/In10", + "index": 10 + }, + "axi_intc_0_intr_1_interrupt_concat/In11": { + "controller": "axi_intc_0", + "fullpath": "axi_intc_0_intr_1_interrupt_concat/In11", + "index": 11 + }, + "axi_intc_0_intr_1_interrupt_concat/In12": { + "controller": "axi_intc_0", + "fullpath": "axi_intc_0_intr_1_interrupt_concat/In12", + "index": 12 + }, + "axi_intc_0_intr_1_interrupt_concat/In13": { + "controller": "axi_intc_0", + "fullpath": "axi_intc_0_intr_1_interrupt_concat/In13", + "index": 13 + }, + "axi_intc_0_intr_1_interrupt_concat/In14": { + "controller": "axi_intc_0", + "fullpath": "axi_intc_0_intr_1_interrupt_concat/In14", + "index": 14 + }, + "axi_intc_0_intr_1_interrupt_concat/In15": { + "controller": "axi_intc_0", + "fullpath": "axi_intc_0_intr_1_interrupt_concat/In15", + "index": 15 + }, + "axi_intc_0_intr_1_interrupt_concat/In16": { + "controller": "axi_intc_0", + "fullpath": "axi_intc_0_intr_1_interrupt_concat/In16", + "index": 16 + }, + "axi_intc_0_intr_1_interrupt_concat/In17": { + "controller": "axi_intc_0", + 
"fullpath": "axi_intc_0_intr_1_interrupt_concat/In17", + "index": 17 + }, + "axi_intc_0_intr_1_interrupt_concat/In18": { + "controller": "axi_intc_0", + "fullpath": "axi_intc_0_intr_1_interrupt_concat/In18", + "index": 18 + }, + "axi_intc_0_intr_1_interrupt_concat/In19": { + "controller": "axi_intc_0", + "fullpath": "axi_intc_0_intr_1_interrupt_concat/In19", + "index": 19 + }, + "axi_intc_0_intr_1_interrupt_concat/In2": { + "controller": "axi_intc_0", + "fullpath": "axi_intc_0_intr_1_interrupt_concat/In2", + "index": 2 + }, + "axi_intc_0_intr_1_interrupt_concat/In20": { + "controller": "axi_intc_0", + "fullpath": "axi_intc_0_intr_1_interrupt_concat/In20", + "index": 20 + }, + "axi_intc_0_intr_1_interrupt_concat/In21": { + "controller": "axi_intc_0", + "fullpath": "axi_intc_0_intr_1_interrupt_concat/In21", + "index": 21 + }, + "axi_intc_0_intr_1_interrupt_concat/In22": { + "controller": "axi_intc_0", + "fullpath": "axi_intc_0_intr_1_interrupt_concat/In22", + "index": 22 + }, + "axi_intc_0_intr_1_interrupt_concat/In23": { + "controller": "axi_intc_0", + "fullpath": "axi_intc_0_intr_1_interrupt_concat/In23", + "index": 23 + }, + "axi_intc_0_intr_1_interrupt_concat/In24": { + "controller": "axi_intc_0", + "fullpath": "axi_intc_0_intr_1_interrupt_concat/In24", + "index": 24 + }, + "axi_intc_0_intr_1_interrupt_concat/In25": { + "controller": "axi_intc_0", + "fullpath": "axi_intc_0_intr_1_interrupt_concat/In25", + "index": 25 + }, + "axi_intc_0_intr_1_interrupt_concat/In26": { + "controller": "axi_intc_0", + "fullpath": "axi_intc_0_intr_1_interrupt_concat/In26", + "index": 26 + }, + "axi_intc_0_intr_1_interrupt_concat/In27": { + "controller": "axi_intc_0", + "fullpath": "axi_intc_0_intr_1_interrupt_concat/In27", + "index": 27 + }, + "axi_intc_0_intr_1_interrupt_concat/In28": { + "controller": "axi_intc_0", + "fullpath": "axi_intc_0_intr_1_interrupt_concat/In28", + "index": 28 + }, + "axi_intc_0_intr_1_interrupt_concat/In29": { + "controller": "axi_intc_0", + "fullpath": "axi_intc_0_intr_1_interrupt_concat/In29", + "index": 29 + }, + "axi_intc_0_intr_1_interrupt_concat/In3": { + "controller": "axi_intc_0", + "fullpath": "axi_intc_0_intr_1_interrupt_concat/In3", + "index": 3 + }, + "axi_intc_0_intr_1_interrupt_concat/In30": { + "controller": "axi_intc_0", + "fullpath": "axi_intc_0_intr_1_interrupt_concat/In30", + "index": 30 + }, + "axi_intc_0_intr_1_interrupt_concat/In31": { + "controller": "axi_intc_0", + "fullpath": "axi_intc_0_intr_1_interrupt_concat/In31", + "index": 31 + }, + "axi_intc_0_intr_1_interrupt_concat/In4": { + "controller": "axi_intc_0", + "fullpath": "axi_intc_0_intr_1_interrupt_concat/In4", + "index": 4 + }, + "axi_intc_0_intr_1_interrupt_concat/In5": { + "controller": "axi_intc_0", + "fullpath": "axi_intc_0_intr_1_interrupt_concat/In5", + "index": 5 + }, + "axi_intc_0_intr_1_interrupt_concat/In6": { + "controller": "axi_intc_0", + "fullpath": "axi_intc_0_intr_1_interrupt_concat/In6", + "index": 6 + }, + "axi_intc_0_intr_1_interrupt_concat/In7": { + "controller": "axi_intc_0", + "fullpath": "axi_intc_0_intr_1_interrupt_concat/In7", + "index": 7 + }, + "axi_intc_0_intr_1_interrupt_concat/In8": { + "controller": "axi_intc_0", + "fullpath": "axi_intc_0_intr_1_interrupt_concat/In8", + "index": 8 + }, + "axi_intc_0_intr_1_interrupt_concat/In9": { + "controller": "axi_intc_0", + "fullpath": "axi_intc_0_intr_1_interrupt_concat/In9", + "index": 9 + }, + "irq_const_tieoff/dout": { + "controller": "axi_intc_0", + "fullpath": "irq_const_tieoff/dout", + "index": 31 + }, + 
"myproject_gem_1/interrupt": { + "controller": "axi_intc_0", + "fullpath": "myproject_gem_1/interrupt", + "index": 1 + } + }, + "text/plain": [ + "{'irq_const_tieoff/dout': {'controller': 'axi_intc_0',\n", + " 'index': 31,\n", + " 'fullpath': 'irq_const_tieoff/dout'},\n", + " 'axi_intc_0_intr_1_interrupt_concat/In0': {'controller': 'axi_intc_0',\n", + " 'index': 0,\n", + " 'fullpath': 'axi_intc_0_intr_1_interrupt_concat/In0'},\n", + " 'myproject_gem_1/interrupt': {'controller': 'axi_intc_0',\n", + " 'index': 1,\n", + " 'fullpath': 'myproject_gem_1/interrupt'},\n", + " 'axi_intc_0_intr_1_interrupt_concat/In1': {'controller': 'axi_intc_0',\n", + " 'index': 1,\n", + " 'fullpath': 'axi_intc_0_intr_1_interrupt_concat/In1'},\n", + " 'axi_intc_0_intr_1_interrupt_concat/In2': {'controller': 'axi_intc_0',\n", + " 'index': 2,\n", + " 'fullpath': 'axi_intc_0_intr_1_interrupt_concat/In2'},\n", + " 'axi_intc_0_intr_1_interrupt_concat/In3': {'controller': 'axi_intc_0',\n", + " 'index': 3,\n", + " 'fullpath': 'axi_intc_0_intr_1_interrupt_concat/In3'},\n", + " 'axi_intc_0_intr_1_interrupt_concat/In4': {'controller': 'axi_intc_0',\n", + " 'index': 4,\n", + " 'fullpath': 'axi_intc_0_intr_1_interrupt_concat/In4'},\n", + " 'axi_intc_0_intr_1_interrupt_concat/In5': {'controller': 'axi_intc_0',\n", + " 'index': 5,\n", + " 'fullpath': 'axi_intc_0_intr_1_interrupt_concat/In5'},\n", + " 'axi_intc_0_intr_1_interrupt_concat/In6': {'controller': 'axi_intc_0',\n", + " 'index': 6,\n", + " 'fullpath': 'axi_intc_0_intr_1_interrupt_concat/In6'},\n", + " 'axi_intc_0_intr_1_interrupt_concat/In7': {'controller': 'axi_intc_0',\n", + " 'index': 7,\n", + " 'fullpath': 'axi_intc_0_intr_1_interrupt_concat/In7'},\n", + " 'axi_intc_0_intr_1_interrupt_concat/In8': {'controller': 'axi_intc_0',\n", + " 'index': 8,\n", + " 'fullpath': 'axi_intc_0_intr_1_interrupt_concat/In8'},\n", + " 'axi_intc_0_intr_1_interrupt_concat/In9': {'controller': 'axi_intc_0',\n", + " 'index': 9,\n", + " 'fullpath': 'axi_intc_0_intr_1_interrupt_concat/In9'},\n", + " 'axi_intc_0_intr_1_interrupt_concat/In10': {'controller': 'axi_intc_0',\n", + " 'index': 10,\n", + " 'fullpath': 'axi_intc_0_intr_1_interrupt_concat/In10'},\n", + " 'axi_intc_0_intr_1_interrupt_concat/In11': {'controller': 'axi_intc_0',\n", + " 'index': 11,\n", + " 'fullpath': 'axi_intc_0_intr_1_interrupt_concat/In11'},\n", + " 'axi_intc_0_intr_1_interrupt_concat/In12': {'controller': 'axi_intc_0',\n", + " 'index': 12,\n", + " 'fullpath': 'axi_intc_0_intr_1_interrupt_concat/In12'},\n", + " 'axi_intc_0_intr_1_interrupt_concat/In13': {'controller': 'axi_intc_0',\n", + " 'index': 13,\n", + " 'fullpath': 'axi_intc_0_intr_1_interrupt_concat/In13'},\n", + " 'axi_intc_0_intr_1_interrupt_concat/In14': {'controller': 'axi_intc_0',\n", + " 'index': 14,\n", + " 'fullpath': 'axi_intc_0_intr_1_interrupt_concat/In14'},\n", + " 'axi_intc_0_intr_1_interrupt_concat/In15': {'controller': 'axi_intc_0',\n", + " 'index': 15,\n", + " 'fullpath': 'axi_intc_0_intr_1_interrupt_concat/In15'},\n", + " 'axi_intc_0_intr_1_interrupt_concat/In16': {'controller': 'axi_intc_0',\n", + " 'index': 16,\n", + " 'fullpath': 'axi_intc_0_intr_1_interrupt_concat/In16'},\n", + " 'axi_intc_0_intr_1_interrupt_concat/In17': {'controller': 'axi_intc_0',\n", + " 'index': 17,\n", + " 'fullpath': 'axi_intc_0_intr_1_interrupt_concat/In17'},\n", + " 'axi_intc_0_intr_1_interrupt_concat/In18': {'controller': 'axi_intc_0',\n", + " 'index': 18,\n", + " 'fullpath': 'axi_intc_0_intr_1_interrupt_concat/In18'},\n", + " 
'axi_intc_0_intr_1_interrupt_concat/In19': {'controller': 'axi_intc_0',\n", + " 'index': 19,\n", + " 'fullpath': 'axi_intc_0_intr_1_interrupt_concat/In19'},\n", + " 'axi_intc_0_intr_1_interrupt_concat/In20': {'controller': 'axi_intc_0',\n", + " 'index': 20,\n", + " 'fullpath': 'axi_intc_0_intr_1_interrupt_concat/In20'},\n", + " 'axi_intc_0_intr_1_interrupt_concat/In21': {'controller': 'axi_intc_0',\n", + " 'index': 21,\n", + " 'fullpath': 'axi_intc_0_intr_1_interrupt_concat/In21'},\n", + " 'axi_intc_0_intr_1_interrupt_concat/In22': {'controller': 'axi_intc_0',\n", + " 'index': 22,\n", + " 'fullpath': 'axi_intc_0_intr_1_interrupt_concat/In22'},\n", + " 'axi_intc_0_intr_1_interrupt_concat/In23': {'controller': 'axi_intc_0',\n", + " 'index': 23,\n", + " 'fullpath': 'axi_intc_0_intr_1_interrupt_concat/In23'},\n", + " 'axi_intc_0_intr_1_interrupt_concat/In24': {'controller': 'axi_intc_0',\n", + " 'index': 24,\n", + " 'fullpath': 'axi_intc_0_intr_1_interrupt_concat/In24'},\n", + " 'axi_intc_0_intr_1_interrupt_concat/In25': {'controller': 'axi_intc_0',\n", + " 'index': 25,\n", + " 'fullpath': 'axi_intc_0_intr_1_interrupt_concat/In25'},\n", + " 'axi_intc_0_intr_1_interrupt_concat/In26': {'controller': 'axi_intc_0',\n", + " 'index': 26,\n", + " 'fullpath': 'axi_intc_0_intr_1_interrupt_concat/In26'},\n", + " 'axi_intc_0_intr_1_interrupt_concat/In27': {'controller': 'axi_intc_0',\n", + " 'index': 27,\n", + " 'fullpath': 'axi_intc_0_intr_1_interrupt_concat/In27'},\n", + " 'axi_intc_0_intr_1_interrupt_concat/In28': {'controller': 'axi_intc_0',\n", + " 'index': 28,\n", + " 'fullpath': 'axi_intc_0_intr_1_interrupt_concat/In28'},\n", + " 'axi_intc_0_intr_1_interrupt_concat/In29': {'controller': 'axi_intc_0',\n", + " 'index': 29,\n", + " 'fullpath': 'axi_intc_0_intr_1_interrupt_concat/In29'},\n", + " 'axi_intc_0_intr_1_interrupt_concat/In30': {'controller': 'axi_intc_0',\n", + " 'index': 30,\n", + " 'fullpath': 'axi_intc_0_intr_1_interrupt_concat/In30'},\n", + " 'axi_intc_0_intr_1_interrupt_concat/In31': {'controller': 'axi_intc_0',\n", + " 'index': 31,\n", + " 'fullpath': 'axi_intc_0_intr_1_interrupt_concat/In31'}}" + ] + }, + "execution_count": 3, + "metadata": { + "application/json": { + "expanded": false, + "root": "interrupt_pins" + } + }, + "output_type": "execute_result" + } + ], + "source": [ + "\n", + "overlay.interrupt_pins" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "0e5b0e03", + "metadata": {}, + "outputs": [], + "source": [ + "# create an instance of the interrupt\n", + "my_interrupt = Interrupt('myproject_gem_1/interrupt')" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "9459e8ae", + "metadata": {}, + "outputs": [], + "source": [ + "# Load input from .npy file\n", + "input_array = np.load(\"inputGenbit.npy\").astype(np.float32) # shape (20,4,4,1)\n", + "output_array = np.zeros(input_array.shape, dtype=np.float32)\n", + "\n", + "# Allocate physically contiguous memory for input and output\n", + "input_buffer = allocate(shape=input_array.shape, dtype=np.float32)\n", + "output_buffer = allocate(shape=output_array.shape, dtype=np.float32)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "625c2b1f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "input array shape (10000, 4, 4, 1)\n", + "output array shape (10000, 4, 4, 1)\n" + ] + } + ], + "source": [ + "# check input shape\n", + "print(f\"input array shape {input_array.shape}\")\n", + "print(f\"output array shape 
{output_array.shape}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "7de66a89", + "metadata": {}, + "outputs": [], + "source": [ + "# copy data to input buffer\n", + "np.copyto(input_buffer, input_array)\n", + "input_buffer.flush()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "d18bac75", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "input gmem_in0_ptr_input_1 will be set to addr: 0x78400000 with elements: 160000\n", + "output gmem_out0_ptr_layer12_out will be set to addr: 0x78500000 with elements: 160000\n", + "amount of queries will be set to: 10000 at address: 0x28\n", + "prepare your interrupt\n", + "global interrupt enable register\n", + "enable gie successful\n", + "ap_done interrupt enable register\n", + "enable ap_done interrupt successful\n", + "ap_done register clear\n", + "clear ap_done interrupt successful\n", + "----------------------\n" + ] + } + ], + "source": [ + "# get the ip and initialize the system\n", + "ip = overlay.myproject_gem_1 # Replace with your IP instance name\n", + "ip.set_input (0, input_buffer)\n", + "ip.set_output(0, output_buffer)\n", + "ip.set_amt_query(input_array.shape[0])\n", + "ip.prepare_intr()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "d12a031a", + "metadata": {}, + "outputs": [], + "source": [ + "async def wait_for_acc():\n", + " print(\"starting the accelerator\")\n", + " ip.ctrl_start()\n", + " print(\"waiting for the accelerator to finish\")\n", + " await my_interrupt.wait()\n", + " print(\"accelerator has finished\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "81750e60", + "metadata": {}, + "outputs": [], + "source": [ + "#### get event loop from asyncio\n", + "loop = asyncio.get_event_loop()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "fe5f7eb2", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "starting the accelerator\n", + "waiting for the accelerator to finish\n", + "accelerator has finished\n" + ] + } + ], + "source": [ + "task = loop.create_task(wait_for_acc())\n", + "loop.run_until_complete(task)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "372b1982", + "metadata": {}, + "outputs": [], + "source": [ + "output_buffer.invalidate()" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "6a7834c2", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[[[0.5 ]\n", + " [0.5 ]\n", + " [0.5 ]\n", + " [0.5 ]]\n", + "\n", + " [[0.5 ]\n", + " [0.5 ]\n", + " [0.5 ]\n", + " [0.5 ]]\n", + "\n", + " [[0.5 ]\n", + " [0.5 ]\n", + " [0.5 ]\n", + " [0.49609375]]\n", + "\n", + " [[0.5 ]\n", + " [0.5 ]\n", + " [0.4921875 ]\n", + " [0.4765625 ]]]\n", + "\n", + "\n", + " [[[0.5 ]\n", + " [0.5 ]\n", + " [0.5 ]\n", + " [0.5 ]]\n", + "\n", + " [[0.5 ]\n", + " [0.5 ]\n", + " [0.5 ]\n", + " [0.5 ]]\n", + "\n", + " [[0.5 ]\n", + " [0.5 ]\n", + " [0.5 ]\n", + " [0.5 ]]\n", + "\n", + " [[0.5 ]\n", + " [0.5 ]\n", + " [0.48046875]\n", + " [0.48046875]]]\n", + "\n", + "\n", + " [[[0.5 ]\n", + " [0.5 ]\n", + " [0.5 ]\n", + " [0.5 ]]\n", + "\n", + " [[0.5 ]\n", + " [0.5 ]\n", + " [0.5 ]\n", + " [0.49609375]]\n", + "\n", + " [[0.5 ]\n", + " [0.5 ]\n", + " [0.5 ]\n", + " [0.48828125]]\n", + "\n", + " [[0.5 ]\n", + " [0.5 ]\n", + " [0.5 ]\n", + " [0.4921875 ]]]\n", + "\n", + "\n", + " ...\n", + "\n", + "\n", + " [[[0.5 ]\n", + " [0.5 ]\n", + " [0.5 ]\n", + " 
[0.5 ]]\n", + "\n", + " [[0.5 ]\n", + " [0.5 ]\n", + " [0.5 ]\n", + " [0.5 ]]\n", + "\n", + " [[0.5 ]\n", + " [0.5 ]\n", + " [0.5 ]\n", + " [0.48046875]]\n", + "\n", + " [[0.5 ]\n", + " [0.5 ]\n", + " [0.4921875 ]\n", + " [0.46875 ]]]\n", + "\n", + "\n", + " [[[0.5 ]\n", + " [0.5 ]\n", + " [0.5 ]\n", + " [0.5 ]]\n", + "\n", + " [[0.5 ]\n", + " [0.5 ]\n", + " [0.5 ]\n", + " [0.5 ]]\n", + "\n", + " [[0.5 ]\n", + " [0.5 ]\n", + " [0.5 ]\n", + " [0.48046875]]\n", + "\n", + " [[0.5 ]\n", + " [0.5 ]\n", + " [0.49609375]\n", + " [0.484375 ]]]\n", + "\n", + "\n", + " [[[0.5 ]\n", + " [0.5 ]\n", + " [0.5 ]\n", + " [0.5 ]]\n", + "\n", + " [[0.5 ]\n", + " [0.5 ]\n", + " [0.5 ]\n", + " [0.5 ]]\n", + "\n", + " [[0.5 ]\n", + " [0.5 ]\n", + " [0.5 ]\n", + " [0.49609375]]\n", + "\n", + " [[0.5 ]\n", + " [0.5 ]\n", + " [0.48828125]\n", + " [0.48828125]]]]\n" + ] + } + ], + "source": [ + "print(output_buffer)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "31e1098e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "we got output shape: (10000, 4, 4, 1)\n" + ] + } + ], + "source": [ + "# convert it to numpy array\n", + "print(\"we got output shape:\", output_buffer.shape)\n", + "outNp = np.array(output_buffer)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "9bf5d406", + "metadata": {}, + "outputs": [], + "source": [ + "# save it to .npy file\n", + "np.save(\"out_hw.npy\", outNp)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.4" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/test/pytest/test_backend/vitis_unified.py b/test/pytest/test_backend/vitis_unified.py new file mode 100644 index 0000000000..04f445021c --- /dev/null +++ b/test/pytest/test_backend/vitis_unified.py @@ -0,0 +1,259 @@ +import os +from pathlib import Path + +import numpy as np +import pytest +from tensorflow.keras.layers import ( + Concatenate, + Conv2D, + Input, + MaxPooling2D, + UpSampling2D, +) +from tensorflow.keras.models import Model, load_model + +import hls4ml +import hls4ml.model + +test_root_path = Path(__file__).parent + +os.environ['XILINX_VITIS'] = "/tools/Xilinx/Vitis/2023.2" +os.environ['PATH'] = os.environ['XILINX_VITIS'] + '/bin:' + os.environ['PATH'] + +XPFM_PATH = "/tools/Xilinx/Vitis/2023.2/base_platforms/" "xilinx_zcu102_base_202320_1/xilinx_zcu102_base_202320_1.xpfm" +LOG_STD = True + + +def create_io_file_dir(): + os.makedirs(test_root_path / "input_file", exist_ok=True) + os.makedirs(test_root_path / "output_file", exist_ok=True) + + +def checkEqual(a, b): + + equal = np.array_equal(a, b) + if equal: + print("Test pass both are equal \U0001f642") + else: + print("Test Fail both are not equal \U0001f62c") + return equal + + +def create_simple_testcase(inputShape=(4, 4, 1), fileName="inputX.npy"): + n_in = np.random.rand(*inputShape).astype(np.float32) + os.makedirs(test_root_path / "input_file", exist_ok=True) + np.save(test_root_path / "input_file" / fileName, n_in) + + +def create_simple_unet(input_shape=(4, 4, 1), modelName="simpleSkip.keras"): + inputs = Input(input_shape) + # Encoder + c1 = Conv2D(2, (3, 3), activation='relu', padding='same')(inputs) + p1 = 
MaxPooling2D((2, 2))(c1) + # Bottleneck + bn = Conv2D(4, (3, 3), activation='relu', padding='same')(p1) + # Decoder + u1 = UpSampling2D((2, 2))(bn) + concat1 = Concatenate()([u1, c1]) + c2 = Conv2D(2, (3, 3), activation='relu', padding='same')(concat1) + # Output layer (1 channel) + outputs = Conv2D(1, (1, 1), activation='sigmoid')(c2) + model = Model(inputs, outputs) + model.compile(optimizer='adam', loss='binary_crossentropy') + model.save(test_root_path / "input_file" / modelName) + + +def gen_prj_dir(backend, io_type, strategy, granularity, prefix): + return str(test_root_path / f"hls4mlprj_{prefix}_{backend}_{strategy}_{io_type}_{granularity}") + + +def create_hls_model(model, config, backend, io_type, strategy, granularity, prefix): + output_dir = gen_prj_dir(backend, io_type, strategy, granularity, prefix) + # mono model build + hls_model = hls4ml.converters.convert_from_keras_model( + model, + hls_config=config, + output_dir=output_dir, + backend=backend, + io_type=io_type, + board='zcu102', + part='xczu9eg-ffvb1156-2-e', + clock_period='10ns', + input_type="float", + output_type="float", + xpfmPath=XPFM_PATH, + ) + hls_model.compile() + return hls_model + + +def create_hls_model4_cosim(model, config, backend, io_type, strategy, granularity, input_data_tb, output_data_tb, prefix): + output_dir = gen_prj_dir(backend, io_type, strategy, granularity, prefix) + # mono model build + hls_model = hls4ml.converters.convert_from_keras_model( + model, + hls_config=config, + output_dir=output_dir, + backend=backend, + io_type=io_type, + board='zcu102', + part='xczu9eg-ffvb1156-2-e', + clock_period='10ns', + input_type="float", + output_type="float", + input_data_tb=input_data_tb, + output_data_tb=output_data_tb, + ) + hls_model.compile() + return hls_model + + +def predict_hls_model(hls_model, input_data): + y_hls4ml = hls_model.predict(input_data) + return y_hls4ml + + +@pytest.mark.parametrize('io_type', ['io_stream']) +@pytest.mark.parametrize('strategy', ['latency']) +@pytest.mark.parametrize('granularity', ['name']) +@pytest.mark.parametrize('amt_query', [10]) +def test_backend_predict(io_type, strategy, granularity, amt_query): + create_io_file_dir() + # create and load data set + create_simple_testcase(inputShape=(amt_query, 4, 4, 1), fileName="inputX.npy") + input_data = np.load(test_root_path / "input_file" / "inputX.npy") + # create and load model + model_name = "simpleSkip.keras" + create_simple_unet(modelName=model_name) + model = load_model(test_root_path / "input_file" / model_name) + # config the keras model + config = hls4ml.utils.config_from_keras_model(model, granularity=granularity) + + # create hls4ml model + vitis_unified_model = create_hls_model(model, config, "VitisUnified", io_type, strategy, granularity, "bridge") + vitis_model = create_hls_model(model, config, "Vitis", io_type, strategy, granularity, "bridge") + + # predict test + + y_hls4ml_unified = predict_hls_model(vitis_unified_model, input_data) + y_hls4ml = predict_hls_model(vitis_model, input_data) + + assert checkEqual(y_hls4ml_unified, y_hls4ml), "the result from vitis unified and vitis are not equal!" 
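Note: exact equality in the assertion above is a deliberate choice, since both backends compile the same fixed-point kernels and their emulated outputs should match bit for bit. If the backends ever diverge in float-to-fixed handling, a tolerance-based check is the usual fallback; a minimal sketch (the atol value is illustrative):

import numpy as np

def check_close(a, b, atol=1e-4):
    # tolerance-based variant of checkEqual()
    return np.allclose(a, b, rtol=0.0, atol=atol)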
+ + +# test_backend_predict("io_stream", 'latency', 'name', 10) + + +@pytest.mark.parametrize('io_type', ['io_stream']) +@pytest.mark.parametrize('strategy', ['latency']) +@pytest.mark.parametrize('granularity', ['name']) +@pytest.mark.parametrize('amt_query', [10]) +def test_co_simulation(io_type, strategy, granularity, amt_query): + create_io_file_dir() + # create and load data set + create_simple_testcase(inputShape=(amt_query, 4, 4, 1), fileName="inputCosim.npy") + input_data = np.load(test_root_path / "input_file" / "inputCosim.npy") + # create and load model + model_name = "simpleSkipCosim.keras" + create_simple_unet(modelName=model_name) + model = load_model(test_root_path / "input_file" / model_name) + # config the keras model + config = hls4ml.utils.config_from_keras_model(model, granularity=granularity) + + # predict it first + vitis_unified_model = create_hls_model(model, config, "VitisUnified", io_type, strategy, granularity, "precosim") + y_hls4ml_unified = predict_hls_model(vitis_unified_model, input_data) + np.save(test_root_path / "output_file" / "outputCosim.npy", y_hls4ml_unified) + + input_data_tb = str(test_root_path / "input_file" / "inputCosim.npy") + output_data_tb = str(test_root_path / "output_file" / "outputCosim.npy") + + # create hls4ml model + vitis_unified_model_cosim = create_hls_model4_cosim( + model, config, "VitisUnified", io_type, strategy, granularity, input_data_tb, output_data_tb, "cosim" + ) + # do cosim + vitis_unified_model_cosim.compile() + vitis_unified_model_cosim.build(synth=True, cosim=True, log_to_stdout=LOG_STD) + + bridge_result_path = ( + gen_prj_dir("VitisUnified", io_type, strategy, granularity, "cosim") + "/tb_data/tb_output_predictions.dat" + ) + cosim_result_path = ( + gen_prj_dir("VitisUnified", io_type, strategy, granularity, "cosim") + "/tb_data/rtl_cosim_results.log" + ) + + bridge_result = np.loadtxt(bridge_result_path) + cosim_result = np.loadtxt(cosim_result_path) + + assert np.allclose(bridge_result, cosim_result, rtol=0.0, atol=1e-4), "the result from bridge and cosim are not equal!" 
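When the allclose assertion above fails, it helps to inspect the two testbench dumps side by side. A small hedged helper, assuming the file layout used above (tb_output_predictions.dat and rtl_cosim_results.log in the project's tb_data directory, loadable with np.loadtxt into equal shapes):

import numpy as np

def dump_mismatches(pred_path, cosim_path, atol=1e-4, limit=10):
    pred = np.loadtxt(pred_path)
    cosim = np.loadtxt(cosim_path)
    # report up to `limit` positions where the two results disagree
    bad = np.argwhere(~np.isclose(pred, cosim, rtol=0.0, atol=atol))
    for idx in bad[:limit]:
        i = tuple(idx)
        print(i, pred[i], cosim[i])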
+
+
+# test_co_simulation("io_stream", 'latency', 'name', 10)
+
+
+@pytest.mark.parametrize('io_type', ['io_stream'])
+@pytest.mark.parametrize('strategy', ['latency'])
+@pytest.mark.parametrize('granularity', ['name'])
+@pytest.mark.parametrize('amt_query', [10])
+def test_fifo_depth(io_type, strategy, granularity, amt_query):
+    create_io_file_dir()
+    # create and load data set
+    create_simple_testcase(inputShape=(amt_query, 4, 4, 1), fileName="inputFifoDepth.npy")
+    input_data = np.load(test_root_path / "input_file" / "inputFifoDepth.npy")
+    # create and load model
+    model_name = "simpleSkipFifoDepth.keras"
+    create_simple_unet(modelName=model_name)
+    model = load_model(test_root_path / "input_file" / model_name)
+    # config the keras model
+    config = hls4ml.utils.config_from_keras_model(model, granularity=granularity)
+
+    # predict it first
+    vitis_unified_model = create_hls_model(model, config, "VitisUnified", io_type, strategy, granularity, "fifodepth")
+    y_hls4ml_unified = predict_hls_model(vitis_unified_model, input_data)
+    np.save(test_root_path / "output_file" / "outputFifoDepth.npy", y_hls4ml_unified)
+
+    input_data_tb = str(test_root_path / "input_file" / "inputFifoDepth.npy")
+    output_data_tb = str(test_root_path / "output_file" / "outputFifoDepth.npy")
+
+    # create the hls4ml model with the FIFO depth optimization flow enabled
+    config['Flows'] = ['vitisunified:fifo_depth_optimization']
+    vitis_unified_model_fifo = create_hls_model4_cosim(
+        model, config, "VitisUnified", io_type, strategy, granularity, input_data_tb, output_data_tb, "fifodepth"
+    )
+    # compile the model
+    vitis_unified_model_fifo.compile()
+
+    fifodepth_result_path = gen_prj_dir("VitisUnified", io_type, strategy, granularity, "fifodepth") + "/fifo_depths.json"
+    assert os.path.exists(fifodepth_result_path), "the fifo_depths.json file does not exist"
+
+
+# test_fifo_depth("io_stream", 'latency', 'name', 10)
+
+
+@pytest.mark.parametrize('io_type', ['io_stream'])
+@pytest.mark.parametrize('strategy', ['latency'])
+@pytest.mark.parametrize('granularity', ['name'])
+@pytest.mark.parametrize('amt_query', [10000])
+def test_gen_unified(io_type, strategy, granularity, amt_query):
+    create_io_file_dir()
+    # create and load data set
+    create_simple_testcase(inputShape=(amt_query, 4, 4, 1), fileName="inputGenbit.npy")
+    input_data = np.load(test_root_path / "input_file" / "inputGenbit.npy")
+    # create and load model
+    model_name = "simpleSkipGenBit.keras"
+    create_simple_unet(modelName=model_name)
+    model = load_model(test_root_path / "input_file" / model_name)
+    # config the keras model
+    config = hls4ml.utils.config_from_keras_model(model, granularity=granularity)
+
+    # predict it first
+    vitis_unified_model = create_hls_model(model, config, "VitisUnified", io_type, strategy, granularity, "gen_unified")
+    y_hls4ml_unified = predict_hls_model(vitis_unified_model, input_data)
+    np.save(test_root_path / "output_file" / "outputGenbit.npy", y_hls4ml_unified)
+
+    vitis_unified_model.compile()
+    vitis_unified_model.build(synth=True, bitfile=True, log_to_stdout=LOG_STD)
+
+
+# test_gen_unified("io_stream", 'latency', 'name', 10000)
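A quick sanity check one can run on the fifo_depths.json artifact asserted in test_fifo_depth above. The project path follows gen_prj_dir(), and the JSON schema (FIFO name mapped to an integer depth) is an assumption about what the optimizer writes, so treat this as a sketch:

import json
from pathlib import Path

prj = Path("hls4mlprj_fifodepth_VitisUnified_latency_io_stream_name")
depths = json.loads((prj / "fifo_depths.json").read_text())

# assumed schema: {fifo_name: depth}; verify depths are positive integers
assert all(isinstance(d, int) and d > 0 for d in depths.values())
print(sorted(depths.items(), key=lambda kv: -kv[1])[:5])  # five deepest FIFOs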