Skip to content

Commit e34e0c0

Browse files
bo3zvloncar
authored and committed
Quartus streaming batch normalisation
1 parent d9e2ce7 commit e34e0c0

File tree

4 files changed

+99
-30
lines changed

4 files changed

+99
-30
lines changed

hls4ml/backends/quartus/passes/core_templates.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,7 @@ def format(self, node):
8080

8181
batchnorm_function_template = 'nnet::normalize<{input_t}, {output_t}, {config}>({input}, {output}, {scale}, {bias});'
8282

83-
batchnorm_include_list = ['nnet_utils/nnet_batchnorm.h']
83+
batchnorm_include_list = ['nnet_utils/nnet_batchnorm.h', 'nnet_utils/nnet_batchnorm_stream.h']
8484

8585
class BatchNormalizationConfigTemplate(LayerConfigTemplate):
8686
def __init__(self):

hls4ml/templates/quartus/firmware/nnet_utils/nnet_batchnorm.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,6 @@ void normalize(
6060
Result:
6161
#pragma unroll
6262
for (int ires = 0; ires < CONFIG_T::n_in; ires++) {
63-
// TODO - Explore MULADD instruction in HLS - less clock cycles
6463
if (CONFIG_T::n_filt==-1) {
6564
res[ires] = CONFIG_T::template product<data_T, typename CONFIG_T::scale_t>::product(data[ires], scale[ires]) + bias[ires];
6665
} else {
Lines changed: 91 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -1,33 +1,100 @@
1-
//
2-
// rfnoc-hls-neuralnet: Vivado HLS code for neural-net building blocks
3-
//
4-
// Copyright (C) 2017 EJ Kreinar
5-
//
6-
// This program is free software: you can redistribute it and/or modify
7-
// it under the terms of the GNU General Public License as published by
8-
// the Free Software Foundation, either version 3 of the License, or
9-
// (at your option) any later version.
10-
//
11-
// This program is distributed in the hope that it will be useful,
12-
// but WITHOUT ANY WARRANTY; without even the implied warranty of
13-
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14-
// GNU General Public License for more details.
15-
//
16-
// You should have received a copy of the GNU General Public License
17-
// along with this program. If not, see <http://www.gnu.org/licenses/>.
18-
//
19-
20-
/*
21-
* PLACEHOLDER - The common pass bn_quant.py includes both parallel and streaming BN; streaming is currently not supported in Quartus
22-
*/
23-
241
#ifndef NNET_BATCHNORM_STREAM_H_
252
#define NNET_BATCHNORM_STREAM_H_
263

274
#include "nnet_common.h"
285
#include "nnet_helpers.h"
296
#include "nnet_mult.h"
7+
#include "nnet_types.h"
8+
9+
namespace nnet {
10+
11+
// ****************************************************
12+
// Streaming Batch Normalization
13+
// ****************************************************
14+
template<class data_T, class res_T, typename CONFIG_T>
15+
void normalize(
16+
stream<data_T> &data,
17+
stream<res_T> &res,
18+
const typename CONFIG_T::scale_t scale[CONFIG_T::n_in],
19+
const typename CONFIG_T::bias_t bias[CONFIG_T::n_in]
20+
) {
21+
22+
constexpr unsigned multiplier_limit = DIV_ROUNDUP(CONFIG_T::n_in, CONFIG_T::reuse_factor);
23+
constexpr unsigned pipeline = CONFIG_T::n_in / multiplier_limit;
24+
CONFIG_T::template product<typename data_T::value_type, typename CONFIG_T::scale_t>::limit(multiplier_limit);
25+
26+
BatchNormLoop:
27+
#pragma ii pipeline
28+
for (int i = 0; i < CONFIG_T::n_in / data_T::size; i++) {
29+
data_T in_data = data.read();
30+
res_T out_data;
31+
32+
BatchNormpack:
33+
#pragma unroll
34+
for (int j = 0; j < data_T::size; j++) {
35+
int norm_index;
36+
if (CONFIG_T::n_filt==-1) norm_index = i * data_T::size + j;
37+
else norm_index = j % CONFIG_T::n_filt;
38+
out_data[j] = CONFIG_T::template product<typename data_T::value_type, typename CONFIG_T::scale_t>::product(in_data[j], scale[norm_index]) + bias[norm_index];
39+
}
40+
41+
res.write(out_data);
42+
}
43+
}
44+
45+
// ****************************************************
46+
// Merged Batch Normalization and Quantized Tanh
47+
// ****************************************************
48+
template<class data_T, typename CONFIG_T>
49+
void normalize_binary_tanh(
50+
stream<data_T> &data,
51+
stream<nnet::array<ac_int<1, false>, CONFIG_T::n_in>> &res,
52+
const typename data_T::value_type threshold[CONFIG_T::n_in]
53+
) {
54+
55+
BinaryNormLoop:
56+
#pragma ii 1
57+
for (int i = 0; i < CONFIG_T::n_in / data_T::size; i++) {
58+
data_T in_data = data.read();
59+
nnet::array<ac_int<1, false>, CONFIG_T::n_in> out_data;
60+
61+
BatchNormPack:
62+
#pragma unroll
63+
for (int j = 0; j < data_T::size; j++) {
64+
out_data[j] = (in_data[j] > threshold[i * data_T::size + j]) ? 1 : 0;
65+
}
66+
67+
res.write(out_data);
68+
}
69+
}
70+
71+
template<class data_T, typename CONFIG_T>
72+
void normalize_ternary_tanh(
73+
stream<data_T> &data,
74+
stream<nnet::array<ac_int<2, true>, CONFIG_T::n_in>> &res,
75+
const typename data_T::value_type threshold_hi[CONFIG_T::n_in],
76+
const typename data_T::value_type threshold_lo[CONFIG_T::n_in]
77+
) {
78+
79+
TernaryNormLoop:
80+
#pragma ii 1
81+
for (int i = 0; i < CONFIG_T::n_in / data_T::size; i++) {
82+
data_T in_data = data.read();
83+
nnet::array<ac_int<2, true>, CONFIG_T::n_in> out_data;
84+
85+
BatchNormPack:
86+
#pragma unroll
87+
for (int j = 0; j < data_T::size; j++) {
88+
int norm_index = i * data_T::size + j;
89+
if (in_data[j] > threshold_hi[norm_index]) out_data[j] = 1;
90+
else if (in_data[j] <= threshold_lo[norm_index]) out_data[j] = -1;
91+
else out_data[j] = 0;
92+
}
93+
94+
res.write(out_data);
95+
}
96+
}
3097

31-
namespace nnet {}
98+
}
3299

33100
#endif

test/pytest/test_batchnorm.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -24,17 +24,20 @@ def model():
2424

2525

2626
@pytest.mark.parametrize('io_type', ['io_parallel', 'io_stream'])
27-
def test_global_pool1d(model, data, io_type):
27+
@pytest.mark.parametrize('backend', ['Vivado', 'Quartus'])
28+
def test_global_pool1d(model, data, backend, io_type):
29+
30+
default_precision = 'ac_fixed<32, 1, true>' if backend == 'Quartus' else 'ac_fixed<32, 1>'
2831

2932
config = hls4ml.utils.config_from_keras_model(model,
30-
default_precision='ap_fixed<32,1>',
33+
default_precision=default_precision,
3134
granularity='name')
3235

3336
hls_model = hls4ml.converters.convert_from_keras_model(model,
37+
backend=backend,
3438
hls_config=config,
3539
io_type=io_type,
36-
output_dir=f'hls4mlprj_batchnorm_{io_type}',
37-
part='xcvu9p-flgb2104-2-i')
40+
output_dir=f'hls4mlprj_batchnorm_{backend}_{io_type}')
3841
hls_model.compile()
3942

4043

0 commit comments

Comments
 (0)