Skip to content

Commit 8409d0c

Browse files
Update host code for clarity & better data handling
1 parent 361f698 commit 8409d0c

File tree

11 files changed

+630
-441
lines changed

11 files changed

+630
-441
lines changed

hls4ml/backends/vitis_accelerator/vitis_accelerator_backend.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ def create_initial_config(
2222
clock_uncertainty='27%',
2323
io_type="io_parallel",
2424
num_kernel=1,
25-
num_thread=1,
25+
num_worker=1,
2626
batchsize=8192,
2727
hw_quant=False,
2828
vivado_directives=None,
@@ -36,7 +36,7 @@ def create_initial_config(
3636
clock_period: clock period passed to hls project
3737
io_type: io_parallel or io_stream
3838
num_kernel: how many compute units to create on the fpga
39-
num_thread: how many threads the host cpu uses to drive the fpga
39+
num_worker: how many threads the host cpu uses to drive each CU on the fpga
4040
batchsize: how many samples to process within a single buffer on the fpga
4141
vivado_directives: Directives passed down to Vivado that controls the hardware synthesis and implementation steps
4242
Returns:
@@ -47,7 +47,7 @@ def create_initial_config(
4747
config["AcceleratorConfig"] = {}
4848
config["AcceleratorConfig"]["Board"] = board
4949
config["AcceleratorConfig"]["Num_Kernel"] = num_kernel
50-
config["AcceleratorConfig"]["Num_Thread"] = num_thread
50+
config["AcceleratorConfig"]["Num_Worker"] = num_worker
5151
config["AcceleratorConfig"]["Batchsize"] = batchsize
5252
config["AcceleratorConfig"]["HW_Quant"] = hw_quant
5353
config["AcceleratorConfig"]["Vivado_Directives"] = vivado_directives
@@ -112,11 +112,15 @@ def dat_to_numpy(self, model):
112112
y = np.loadtxt(output_file, dtype=float).reshape(-1, expected_shape)
113113
return y
114114

115-
def hardware_predict(self, model, x, target="hw", debug=False):
115+
def hardware_predict(self, model, x, target="hw", debug=False, profilingRepeat=-1):
116116
if debug:
117117
command = "DEBUG=1 "
118+
if isinstance(profilingRepeat, int) and profilingRepeat > 0:
119+
command += "PROFILING_DATA_REPEAT_COUNT=" + str(profilingRepeat) + " "
118120
self._validate_target(target)
121+
119122
self.numpy_to_dat(model, x)
123+
120124
currdir = os.getcwd()
121125
os.chdir(model.config.get_output_dir())
122126
command += "TARGET=" + target + " make run"

hls4ml/backends/vitis_accelerator/vitis_accelerator_config.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ def __init__(self, config):
3030
self.config["Part"] = self.part
3131

3232
self.num_kernel = accel_config.get("Num_Kernel", 1)
33-
self.num_thread = accel_config.get("Num_Thread", 1)
33+
self.num_worker = accel_config.get("Num_Worker", 1)
3434
self.batchsize = accel_config.get("Batchsize", 8192)
3535
self.hw_quant = accel_config.get("HW_Quant", False)
3636

@@ -42,8 +42,8 @@ def get_board_type(self):
4242
def get_platform(self):
4343
return self.platform
4444

45-
def get_num_thread(self):
46-
return self.num_thread
45+
def get_num_worker(self):
46+
return self.num_worker
4747

4848
def get_num_kernel(self):
4949
return self.num_kernel

hls4ml/templates/vitis_accelerator/Makefile

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ CARD_CFG ?= accelerator_card.cfg
3232
PLATFORM ?= $(shell awk -F '=' '/platform=/ {print $$2}' $(CARD_CFG))
3333

3434
# Board Type (determines whether design will go through packaging step)
35-
BOARD_TYPE :=
35+
BOARD_TYPE := #BOARDTYPE
3636

3737
# Kernel name
3838
KERNEL_NAME := #PRJNAME
@@ -115,8 +115,8 @@ xclbin: $(BUILD_DIR)/$(WRAPPER_NAME).xclbin
115115
INCLUDES := -I$(XILINX_XRT)/include/ -I$(XILINX_VIVADO)/include/ -I$(XILINX_HLS)/include/
116116
INCLUDES += -I$(PWD)/libs/ -I$(PWD)/firmware/ -I$(PWD)/firmware/nnet_utils/
117117

118-
host: $(KERNEL_NAME)_host_cl.cpp libs/xcl2.cpp
119-
$(CXX) $(CXXFLAGS) $^ -o $@ $(INCLUDES) $(LDFLAGS)
118+
host: $(KERNEL_NAME)_host_cl.cpp libs/xcl2.cpp $(wildcard libs/*.hpp)
119+
$(CXX) $(CXXFLAGS) $(KERNEL_NAME)_host_cl.cpp libs/xcl2.cpp -o $@ $(INCLUDES) $(LDFLAGS)
120120

121121
# Execute program #############################################################
122122

@@ -128,7 +128,7 @@ else
128128
@echo "Setting XCL_EMULATION_MODE to $(TARGET)"
129129
$(eval EMULATION_MODE := XCL_EMULATION_MODE=$(TARGET))
130130
endif
131-
@cd firmware && $(EMULATION_MODE) ../host ../$(BUILD_DIR)/$(WRAPPER_NAME).xclbin
131+
@cd firmware && $(EMULATION_MODE) ../host ../$(BUILD_DIR)/$(WRAPPER_NAME).xclbin $(PROFILING_DATA_REPEAT_COUNT)
132132

133133
# Cleanup #####################################################################
134134

Lines changed: 202 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,202 @@
#pragma once

#include <cstdint>
#include <fstream>
#include <iostream>
#include <list>
#include <stdexcept>
#include <sstream>
#include <string>
#include <vector>

#include "Types.hpp"

/**
 * \brief Reads whitespace-delimited sample data from a text file, zero-pads it
 * to a whole number of batches, and distributes the batches round-robin across
 * Workers. Also owns the result buffers and writes evaluation results back out.
 * \tparam T Element type of the input samples
 * \tparam U Element type of the evaluation results
 */
template <class T, class U> class DataBatcher {
  public:
    /**
     * \brief Constructor
     * \param batchsize Number of samples
     * \param sampleInputSize Flattened length of a single input to the model
     * \param sampleOutputSize Flattened length of a single output from the model
     * \param numWorkers Total number of workers
     * \param profiling If true, the given data will be iterated over multiple times,
     * for more accurate throughput testing.
     * \param profilingDataRepeat Only used if profiling is set to True. Additional number of
     * times the given data is iterated over.
     */
    DataBatcher(int batchsize, int sampleInputSize, int sampleOutputSize, int numWorkers,
                bool profiling, int profilingDataRepeat)
        : _batchsize(batchsize), _sampleInputSize(sampleInputSize), _sampleOutputSize(sampleOutputSize),
          _numWorkers(numWorkers), _profiling(profiling), _profilingDataRepeat(profilingDataRepeat) {}

    /**
     * \brief Read in data to a buffer and zero-pad it to a whole number of batches.
     * \param filename Name of a text file with one whitespace-separated sample per
     * line (the format used by the VitisAccelerator backend).
     * \throws std::runtime_error If the file cannot be opened, a value fails to
     * parse, or any line does not contain exactly sampleInputSize values.
     */
    void read(const std::string& filename) {
        std::cout << "\nReading data from text file " << filename << std::endl;

        // Read in text file
        std::ifstream fin(filename);
        if (!fin.is_open()) {
            throw std::runtime_error("Error opening file " + filename);
        }

        std::string line;
        while (std::getline(fin, line)) {
            originalSampleCount++;
            std::istringstream parser(line);
            T val;
            while (parser >> val) {
                inputData.push_back(val);
            }
            // parser stops on either EOF (line fully consumed) or a parse failure;
            // anything short of EOF means a token could not be converted to T
            if (!parser.eof()) {
                throw std::runtime_error("Failed to parse value on line " + std::to_string(originalSampleCount));
            }
            // Guard against ragged lines: each sample must contribute exactly
            // _sampleInputSize values, otherwise every later batch is misaligned
            if (inputData.size() != originalSampleCount * static_cast<uint64_t>(_sampleInputSize)) {
                throw std::runtime_error("Incorrect number of values on line " + std::to_string(originalSampleCount));
            }
        }
        std::cout << "Read in " << originalSampleCount << " lines" << std::endl;
        fin.close();

        // Zero-pad the final partial batch. Integer ceiling division is exact,
        // unlike std::ceil on a double, for arbitrarily large sample counts.
        numBatches = (originalSampleCount + _batchsize - 1) / _batchsize;
        if (numBatches * _batchsize > originalSampleCount) {
            inputData.resize(numBatches * _batchsize * _sampleInputSize, (T)0);
        }
    }

    /**
     * \brief Allocate space for writing results to. Must be called after read()
     * and before batch().
     */
    void createResultBuffers() {
        storedEvalResults.resize(numBatches * _batchsize * _sampleOutputSize, (U)0);

        // Allocate space to dump the extra arbitrary data used during profiling;
        // one scratch batch per worker is enough because profiling results are discarded
        if (_profiling) {
            profilingResultsDump.resize(_numWorkers * _batchsize * _sampleOutputSize, (U)0);
        }
    }

    /**
     * \brief Splits data into batches and distributes batches evenly amongst Workers.
     * \param batchedData A vector of containers for each Worker's batches/workload.
     * Size must be equal to _numWorkers.
     * \throws std::runtime_error If read() or createResultBuffers() has not been called.
     */
    void batch(std::vector<std::list<Batch<T, U>>>& batchedData) {
        if (inputData.size() == 0 || originalSampleCount == 0) {
            throw std::runtime_error("No data to batch");
        }
        if (storedEvalResults.size() == 0) {
            throw std::runtime_error("Create result buffers first");
        }

        batchedData.reserve(_numWorkers);
        for (int i = 0; i < _numWorkers; i++) {
            batchedData.emplace_back();
        }

        // Round-robin assignment: batch i goes to worker i % _numWorkers.
        // Each batch carries raw pointers into inputData/storedEvalResults,
        // so those vectors must not be resized while workers run.
        uint64_t batchIndex = 0;
        while (batchIndex < numBatches) {
            int worker = batchIndex % _numWorkers;
            uint64_t inputLocation = batchIndex * _batchsize * _sampleInputSize;
            uint64_t outputLocation = batchIndex * _batchsize * _sampleOutputSize;

            const T* in = &inputData[inputLocation];
            U* out = &storedEvalResults[outputLocation];
            Batch<T, U> newBatch = {in, out};

            batchedData[worker].push_back(newBatch);
            batchIndex++;
        }

        if (_profiling) {
            std::cout << "Creating profiling batches" << std::endl;
            profilingBatchCount = numBatches * (_profilingDataRepeat + 1);
            while (batchIndex < profilingBatchCount) {
                int worker = batchIndex % _numWorkers;
                // Inputs re-read the real data cyclically; outputs all land in the
                // worker's scratch region of profilingResultsDump and are discarded
                uint64_t inputLocation = (batchIndex % numBatches) * _batchsize * _sampleInputSize;
                uint64_t outputLocation = worker * _batchsize * _sampleOutputSize;

                const T* in = &inputData[inputLocation];
                U* out = &profilingResultsDump[outputLocation];
                Batch<T, U> newBatch = {in, out};

                batchedData[worker].push_back(newBatch);
                batchIndex++;
            }
        }
    }

    /**
     * \brief Releases resources used when reading from input files. Note: Data from those files
     * will be cleared and will no longer be accessible. Call write() before this,
     * since the sample/batch counters are reset here.
     */
    void closeFile() {
        inputData.clear();

        originalSampleCount = 0;
        numBatches = 0;
        profilingBatchCount = 0;
    }

    /**
     * \brief Write the stored evaluation results (excluding padding samples) to a
     * text file, one sample per line. Clears the result buffers afterwards.
     * \param filename Output filename.
     * \throws std::runtime_error If the file cannot be opened for writing.
     */
    void write(const std::string& filename) {
        std::cout << "\nWriting HW results to file " << filename << std::endl;
        std::ofstream fout;
        fout.open(filename, std::ios::trunc);

        if (fout.is_open()) {
            // Only the original (unpadded) samples are written out
            for (uint64_t i = 0; i < originalSampleCount; i++) {
                std::stringstream line;
                for (int n = 0; n < _sampleOutputSize; n++) {
                    line << (float)storedEvalResults[(i * _sampleOutputSize) + n] << " ";
                }
                fout << line.str() << "\n";
            }
            fout.close();
        } else {
            throw std::runtime_error("Error writing to file " + filename);
        }

        storedEvalResults.clear();
        profilingResultsDump.clear();
    }

    /// \brief Number of samples read in (not including padding).
    uint64_t getSampleCount() {
        return originalSampleCount;
    }

    /// \brief Number of samples after zero-padding to a whole number of batches.
    uint64_t getPaddedSampleCount() {
        return numBatches * _batchsize;
    }

    /// \brief Effective number of samples evaluated when profiling (data repeats included).
    uint64_t getProfilingSampleCount() {
        return profilingBatchCount * _batchsize;
    }

    /// \brief Whether profiling mode was requested at construction.
    bool isProfilingMode() {
        return _profiling;
    }

  private:
    int _batchsize;
    int _sampleInputSize;
    int _sampleOutputSize;
    int _numWorkers;
    bool _profiling;
    int _profilingDataRepeat;

    /// @brief Number of samples read in. (Not including padding).
    uint64_t originalSampleCount = 0;
    /// @brief Number of batches of data. (After padding).
    uint64_t numBatches = 0;
    /// @brief Effective number of batches of data being evaluated.
    uint64_t profilingBatchCount = 0;
    /// @brief Vector with values.
    std::vector<T> inputData;
    /// @brief Vector to store evaluation results.
    std::vector<U> storedEvalResults;
    /// @brief Vector for dumping results from extra arbitrary data used during profiling.
    std::vector<U> profilingResultsDump;
};

hls4ml/templates/vitis_accelerator/libs/DdrFpga.hpp

Lines changed: 0 additions & 33 deletions
This file was deleted.

0 commit comments

Comments
 (0)