diff --git a/example-models b/example-models
index 0d4cc7277e..ff74f73dbc 160000
--- a/example-models
+++ b/example-models
@@ -1 +1 @@
-Subproject commit 0d4cc7277eac9bb9020e3d73a992dc15dbdcce4e
+Subproject commit ff74f73dbc253d1aa7de1603ee10ede551919548
diff --git a/hls4ml/__init__.py b/hls4ml/__init__.py
index 7c450cd347..0e55ee713e 100644
--- a/hls4ml/__init__.py
+++ b/hls4ml/__init__.py
@@ -1,6 +1,6 @@
 from __future__ import absolute_import
 
-__version__ = '0.5.1'
+__version__ = '0.6.0'
 
 from hls4ml import converters
 from hls4ml import report
diff --git a/hls4ml/converters/keras/core.py b/hls4ml/converters/keras/core.py
index bb5ac5ec97..c284ddd3f9 100644
--- a/hls4ml/converters/keras/core.py
+++ b/hls4ml/converters/keras/core.py
@@ -104,7 +104,7 @@ def parse_activation_layer(keras_layer, input_names, input_shapes, data_reader,
 
 @keras_handler('BatchNormalization')
 def parse_batchnorm_layer(keras_layer, input_names, input_shapes, data_reader, config):
-    assert('BatchNormalization' in keras_layer['class_name'] or 'QConv2DBatchnorm' in keras_layer['class_name'])
+    assert('BatchNormalization' in keras_layer['class_name'] or 'QConv2DBatchnorm' in keras_layer['class_name'] or 'QDenseBatchnorm' in keras_layer['class_name'])
 
     layer = parse_default_keras_layer(keras_layer, input_names)
 
diff --git a/hls4ml/converters/keras/qkeras_layers.py b/hls4ml/converters/keras/qkeras_layers.py
index eecacd84bb..fed2daec0b 100644
--- a/hls4ml/converters/keras/qkeras_layers.py
+++ b/hls4ml/converters/keras/qkeras_layers.py
@@ -110,3 +110,12 @@ def parse_qconv2dbatchnorm_layer(keras_layer, input_names, input_shapes, data_re
     temp_shape = intermediate_shape
     batch_layer, out_shape = parse_batchnorm_layer(keras_layer, input_names, temp_shape, data_reader, config)
     return {**conv_layer, **batch_layer}, out_shape
+
+@keras_handler('QDenseBatchnorm')
+def parse_qdensebatchnorm_layer(keras_layer, input_names, input_shapes, data_reader, config):
+    """Parse a QDenseBatchnorm layer by combining the QDense and BatchNormalization parsers."""
+    # The dense parser's output shape becomes the single input shape seen by the batchnorm parser.
+    dense_attrs, dense_out_shape = parse_qdense_layer(keras_layer, input_names, input_shapes, data_reader, config)
+    bn_attrs, out_shape = parse_batchnorm_layer(keras_layer, input_names, [dense_out_shape], data_reader, config)
+    # On duplicate keys the batchnorm attributes override the dense ones.
+    return {**dense_attrs, **bn_attrs}, out_shape
diff --git a/hls4ml/model/hls_layers.py b/hls4ml/model/hls_layers.py
index c730d60ffd..1ec8cc7050 100644
--- a/hls4ml/model/hls_layers.py
+++ b/hls4ml/model/hls_layers.py
@@ -199,6 +199,7 @@ def __init__(self, shape, dim_names, proxy, **kwargs):
         self.shape = shape
         self.dim_names = dim_names
         self.type = proxy.type
+        self.cppname = proxy.name
         self.name = proxy.name
         self.size = proxy.size
 
@@ -365,6 +366,7 @@ def __init__(self, model, name, attributes, inputs, outputs=None):
         self.set_attr('accum_t', accum_t.precision)
         self.reuse_factor = self.model.config.get_reuse_factor(self)
         self.target_cycles = self.model.config.get_target_cycles(self)
+        self.merged_relu = False
 
         layer_config = self.model.config.get_layer_config(self)
         for config_key, config_value in layer_config.items():
@@ -410,6 +412,10 @@ def get_output_variable(self, output_name=None):
         else:
             return next(iter(self.variables.values()))
 
+    def set_output_variable(self, output_name, output_value):
+        """Store ``output_value`` as this layer's variable named ``output_name``, replacing any existing entry."""
+        self.variables[output_name] = output_value
+
     def get_weights(self, var_name=None):
         if var_name:
             return self.weights[var_name]
@@ -450,6 +456,8 @@ def make_array_variable(self, shape, dim_names, var_name='layer{index}_out', typ
 
     def make_stream_variable(self, shape, dim_names, var_name='layer{index}_out', type_name='layer{index}_t', precision=None, depth=0):
         pack_factor = self.model.config.get_layer_config_value(self, 'PackFactor', default=1)
+        if depth == 0:  # no explicit depth requested: fall back to the layer's 'StreamDepth' config value
+            depth = self.model.config.get_layer_config_value(self, 'StreamDepth', default=0)
         
         return StreamVariable(shape, dim_names, var_name=var_name, type_name=type_name, precision=precision, n_pack=pack_factor, depth=depth, index=self.index)
 
@@ -541,6 +549,12 @@ def _default_config_params(self):
     def get_layer_precision(self):
         return self.precision
 
+    def get_merged_relu(self):
+        return self.merged_relu  # False by default; True once MergeRelu has merged a following activation into this layer
+    
+    def set_merged_relu(self, merged_relu):
+        self.merged_relu = merged_relu  # Bool flag; config_cpp emits it as the "true"/"false" merged_relu parameter
+
     # myproject.cpp/h
     def function_cpp(self):
         raise NotImplementedError
@@ -589,7 +603,6 @@ def initialize(self):
         out_name = self.outputs[0]
         proxy = self.get_input_variable()
         out = InplaceVariable(shape, dims, proxy, index=self.get_input_node().index)
-
         self.variables[out_name] = out
         self.model.register_output_variable(out_name, out)
 
@@ -646,9 +659,61 @@ def config_cpp(self):
         params['nonzeros'] = self.get_weights('weight').nonzeros
         params['product_type'] = self.model.config.backend.product_type(self.get_input_variable().type.precision, self.get_weights('weight').type.precision)
         params['strategy'] = self.get_attr('strategy')
-
+        params['merged_relu'] = "true" if self.get_merged_relu() else "false"
+        params['out_t'] = self.get_output_variable().type.name
         return self._config_template.format(**params)
 
+class DenseBatchnorm(Dense):
+    """Dense layer with a batch normalization folded into its weights; C++ generation is inherited from Dense."""
+
+    def _get_folded_weights(self):
+        """Fold the batchnorm parameters into the dense kernel and bias.
+
+        The high-level equations are:
+        W_fold = gamma * W / sqrt(variance + epsilon)
+        bias_fold = gamma * (bias - moving_mean) / sqrt(variance + epsilon) + beta
+
+        A missing ``bias`` or ``beta`` is treated as 0 and a missing ``gamma`` as 1.
+        Returns the list ``[folded_kernel, folded_bias]``.
+        """
+        kernel = self.model.get_weights_data(self.name, 'kernel')
+        bias = self.model.get_weights_data(self.name, 'bias')
+        if bias is None:
+            bias = 0
+
+        # get batchnorm weights and moving stats
+        gamma = self.model.get_weights_data(self.name, 'gamma')
+        beta = self.model.get_weights_data(self.name, 'beta')
+        if beta is None:  # no offset stored for this layer: adding None below would raise a TypeError
+            beta = 0
+        moving_mean = self.model.get_weights_data(self.name, 'moving_mean')
+        moving_variance = self.model.get_weights_data(self.name, 'moving_variance')
+        # get the inversion factor so that we replace division by multiplication
+        inv = np.reciprocal(np.sqrt(moving_variance + self.get_attr('epsilon')))
+        if gamma is not None:
+            inv *= gamma
+
+        # scale the dense kernel and shift the bias with the bn parameters
+        folded_kernel = inv * kernel
+        folded_bias = inv * (bias - moving_mean) + beta
+
+        return [folded_kernel, folded_bias]
+
+    def initialize(self):
+        """Initialize as a Dense layer, then replace the weight and bias data with the folded values."""
+        super(DenseBatchnorm, self).initialize()
+        folded_weights, folded_bias = self._get_folded_weights()
+        if self.model.config.is_resource_strategy(self) and self.model.config.backend.name in ['Vivado', 'VivadoAccelerator']:
+            # the resource strategy on the Vivado backends stores the kernel transposed
+            folded_weights = np.transpose(folded_weights)
+        self.weights['weight'].data_unquantized = folded_weights
+        self.weights['weight'].data = self.get_attr('weight_quantizer')(folded_weights)
+
+        self.weights['bias'].data_unquantized = folded_bias
+        bias_q = self.get_attr('bias_quantizer')
+        # without a bias quantizer store the folded bias as is, so 'data' never keeps the un-folded bias
+        self.weights['bias'].data = folded_bias if bias_q is None else bias_q(folded_bias)
+
 class Conv1D(Layer):
     def initialize(self):
         if self.get_attr('data_format') == 'channels_last':
@@ -854,7 +919,9 @@ def initialize(self):
         else:
             shape = [self.attributes['n_filt'], self.attributes['out_height'], self.attributes['out_width']]
             dims = ['N_FILT_{}'.format(self.index), 'OUT_HEIGHT_{}'.format(self.index), 'OUT_WIDTH_{}'.format(self.index)]
+        self.attributes['intermediate_index'] = self.index
         self.add_output_variable(shape, dims)
+        self.intermediate_op = self.get_output_variable()
         self.add_weights(quantizer=self.get_attr('weight_quantizer'))
         self.add_bias(quantizer=self.get_attr('bias_quantizer'))
         if len(self.weights['weight'].data.shape) == 2: # This can happen if we assign weights of Dense layer to 1x1 Conv2D
@@ -921,6 +988,8 @@ def config_cpp(self):
         mult_params['n_in'] = self.get_attr('n_chan') * self.get_attr('filt_height') * self.get_attr('filt_width')
         mult_params['n_out'] = self.get_attr('n_filt')
         mult_params['product_type'] = self.model.config.backend.product_type(self.get_input_variable().type.precision, self.get_weights('weight').type.precision)
+        mult_params['merged_relu'] = "true" if self.get_merged_relu() else "false"
+        mult_params['out_t'] = self.intermediate_op.type.name
         mult_config = self._config_template[1].format(**mult_params)
 
         return mult_config + '\n' + conv_config
@@ -1865,6 +1934,7 @@ def _get_transforms_config(self, params):
     'BinaryDense'            : Dense,
     'TernaryDense'           : Dense,
     'QDense'                 : Dense,
+    'QDenseBatchnorm'        : DenseBatchnorm,
     'Conv1D'                 : Conv1D,
     'QConv1D'                : Conv1D,
     'Conv2D'                 : Conv2D,
diff --git a/hls4ml/model/hls_model.py b/hls4ml/model/hls_model.py
index 0f9c11ae3a..aeec6158ea 100644
--- a/hls4ml/model/hls_model.py
+++ b/hls4ml/model/hls_model.py
@@ -59,6 +59,18 @@ def get_project_name(self):
     def get_output_dir(self):
         return self.get_config_value('OutputDir')
 
+    def get_merged_relu(self, default=None):
+        """Return the 'MergedRelu' value from the 'Model' section of the HLS config.
+
+        ``default`` is returned when the HLS config has no 'Model' section
+        or when that section has no 'MergedRelu' key.
+        """
+        model_section = self.config['HLSConfig'].get('Model', None)
+        if model_section is None:
+            return default
+
+        return model_section.get('MergedRelu', default)
+
     def get_layer_config_value(self, layer, key, default=None):
         hls_config = self.config['HLSConfig']
 
diff --git a/hls4ml/model/optimizer/__init__.py b/hls4ml/model/optimizer/__init__.py
index 19915b553e..dc312e4121 100644
--- a/hls4ml/model/optimizer/__init__.py
+++ b/hls4ml/model/optimizer/__init__.py
@@ -12,6 +12,7 @@
 from hls4ml.model.optimizer.passes.conv_same_pad import InsertZeroPaddingBeforeConv2D
 from hls4ml.model.optimizer.passes.pointwise import OptimizePointwiseConv
 from hls4ml.model.optimizer.passes.clone import CloneOutput
+from hls4ml.model.optimizer.passes.relu_merge import MergeRelu
 from hls4ml.model.optimizer.passes.repack_stream import ReshapeStream, BroadcastStream, RemoveFinalReshape
 from hls4ml.model.optimizer.passes.transpose_opt import RemoveUselessTranspose
 from hls4ml.model.optimizer.passes.multi_dense import ReplaceMultidimensionalDenseWithConv
@@ -40,6 +41,7 @@
 register_pass('conv2d_same_pad', InsertZeroPaddingBeforeConv2D)
 register_pass('optimize_pointwise_conv', OptimizePointwiseConv)
 register_pass('clone_output', CloneOutput)
+register_pass('relu_merge', MergeRelu)
 register_pass('remove_final_reshape', RemoveFinalReshape)
 register_pass('reshape_stream', ReshapeStream)
 register_pass('remove_useless_transpose', RemoveUselessTranspose)
diff --git a/hls4ml/model/optimizer/passes/relu_merge.py b/hls4ml/model/optimizer/passes/relu_merge.py
new file mode 100644
index 0000000000..9c98eaa714
--- /dev/null
+++ b/hls4ml/model/optimizer/passes/relu_merge.py
@@ -0,0 +1,48 @@
+from hls4ml.model.optimizer import OptimizerPass
+
+class MergeRelu(OptimizerPass):
+    """Merge a ReLU ``Activation`` node into the Conv2D/Dense node that produces its input."""
+
+    SUPPORTED_LAYERS = ('Conv2D', 'Conv2DBatchnorm', 'Dense')
+
+    def match(self, node):
+        # 'Activation' nodes are not only ReLUs (sigmoid, tanh, ...); merging any other function would change the result
+        if node.__class__.__name__ != 'Activation' or str(node.get_attr('activation')).lower() != 'relu':
+            return False
+        previous_node = node.get_input_node()
+        if previous_node.__class__.__name__ not in self.SUPPORTED_LAYERS:
+            return False
+        # the pre-activation output disappears after the merge, so the activation must be its only consumer
+        return len(previous_node.get_output_nodes()) == 1
+
+    @staticmethod
+    def _merged_output_shape(layer):
+        """Return ``(shape, dims)`` for the output variable of ``layer``, named after its current index."""
+        idx = layer.index
+        if 'Conv2D' in layer.__class__.__name__:
+            axes = [('OUT_HEIGHT', 'out_height'), ('OUT_WIDTH', 'out_width'), ('N_FILT', 'n_filt')]
+            if layer.get_attr('data_format') != 'channels_last':
+                axes = axes[-1:] + axes[:-1]  # otherwise the filter axis comes first
+            shape = [layer.attributes[attr] for _, attr in axes]
+            dims = ['{}_{}'.format(prefix, idx) for prefix, _ in axes]
+            return shape, dims
+        # Dense: the input shape with its last dimension replaced by n_out
+        shape = layer.get_input_variable().shape[:]
+        shape[-1] = layer.attributes['n_out']
+        if len(shape) > 1:
+            return shape, ['N_LAYER_{}_{}'.format(i, idx) for i in range(1, len(shape) + 1)]
+        return shape, ['N_LAYER_{}'.format(idx)]
+
+    def transform(self, model, node):
+        """Flag the previous layer as ReLU-merged, give it an output with the activation's precision, remove ``node``."""
+        previous_node = node.get_input_node()
+        previous_node.index = node.index  # the merged layer takes over the activation's index (used in the dim names)
+        previous_node.set_merged_relu(True)
+        shape, dims = self._merged_output_shape(previous_node)
+        activation_precision, _ = model.config.get_precision(node, var='result')
+        previous_node.add_output_variable(shape, dims, precision=activation_precision)
+        is_output_layer = not node.get_output_nodes()
+        if is_output_layer:
+            print("WARNING: {} is the output layer! No rewiring performed.".format(node.name))
+        model.remove_node(node, rewire=not is_output_layer)
+        return True
\ No newline at end of file
diff --git a/hls4ml/model/profiling.py b/hls4ml/model/profiling.py
index 9aeb38be98..961153ab3e 100644
--- a/hls4ml/model/profiling.py
+++ b/hls4ml/model/profiling.py
@@ -1,3 +1,6 @@
+from pyDigitalWaveTools.vcd.parser import VcdParser
+
+import hls4ml
 from hls4ml.model.hls_model import HLSModel
 from hls4ml.model.hls_layers import IntegerPrecisionType, FixedPrecisionType
 import matplotlib.pyplot as plt
@@ -26,6 +29,101 @@
     __torch_profiling_enabled__ = False
 
 
+def optimize_fifos_depth(hls_model, init_large_fifo=True, reset=True, csim=True, synth=True,
+                         cosim=True, validation=True, export=True, vsynth=True, **kwargs,):
+    """Profile the FIFOs in co-simulation and rebuild the model with the stream depths that were needed.
+
+    The model is built once so that the simulation writes ``fifo_opt.vcd``. For every FIFO scope in
+    that file the maximum recorded ``usedw`` value is stored in ``max_depth.json`` and, increased by
+    one, set as ``StreamDepth`` of the matching layer. The model is then converted again into
+    ``<OutputDir>_FIFO_OPT`` with ``FIFO_opt`` switched off, written and built.
+    The HLS config of the model passed in is copied, not modified.
+
+    Args:
+        hls_model: Converted model whose HLS config has ``FIFO_opt`` enabled in its ``Model`` section.
+        init_large_fifo: Profile a re-converted model (``<OutputDir>_LARGE_FIFO``) with depth-10000 streams.
+        reset, csim, synth, cosim, validation, export, vsynth: Forwarded to the final ``build()`` only.
+        **kwargs: Accepted but not used.
+
+    Returns:
+        The re-converted model with the profiled stream depths, after ``write()`` and ``build()``.
+
+    Raises:
+        Exception: If ``FIFO_opt`` is missing or false in the ``Model`` section of the HLS config.
+    """
+    import copy
+    import json
+
+    # Deep-copy only the HLS config: it is edited below and must not leak into the caller's model.
+    cfg = hls_model.config.config.copy()
+    hls_config = copy.deepcopy(cfg['HLSConfig'])
+    out_dir = hls_model.config.get_output_dir()
+
+    if not hls_config.get('Model', {}).get('FIFO_opt'):
+        raise Exception('To use this optimization you have to set `FIFO_opt` field to True in the HLS config')
+    layer_config = hls_config.setdefault('LayerName', {})
+
+    def fifo_stats(scope):
+        # children[0] carries the sampled values, children[1] the depth; both are parsed as base 2 after dropping the first character
+        usedw, depth = scope['children'][0], scope['children'][1]
+        return {'name': scope['name'],
+                'max': max(int(sample[1][1:], 2) for sample in usedw['data']),
+                'depth': int(depth['data'][0][1][1:], 2)}
+
+    # initialize all the fifos to 10000 so that they will be implemented in BRAMs and so they will be profiled
+    if init_large_fifo:
+        for k in hls_model.output_vars:
+            layer_config.setdefault(k, {})['StreamDepth'] = 10000
+        if hls_model.config.get_config_value('Backend') == 'VivadoAccelerator':
+            layer_config['in_local'] = {'StreamDepth': 10000}
+            layer_config['out_local'] = {'StreamDepth': 10000}
+        cfg['OutputDir'] = out_dir + "_LARGE_FIFO"
+        cfg['HLSConfig'] = hls_config
+        hls_model = hls4ml.converters.keras_to_hls(cfg)
+
+    # run the build with FIFO_opt enabled in order to generate the vcd file
+    hls_model.write()
+    hls_model.build(csim=True, cosim=True, synth=True, vsynth=False, export=False, validation=True)
+
+    with open(hls_model.config.get_output_dir() + '/' + hls_model.config.get_project_name() + '_prj' + '/solution1/sim/verilog/fifo_opt.vcd') as vcd_file:
+        vcd = VcdParser()
+        vcd.parse(vcd_file)
+        data = vcd.scope.toJson()
+    scopes = data['children'][0]['children'][0]['children']
+
+    maxs = []
+    # wrapper fifos - useful only with VivadoAccelerator backend
+    if hls_model.config.get_config_value('Backend') == 'VivadoAccelerator':
+        maxs.extend(fifo_stats(scope) for scope in scopes[1:])
+    # model layers fifos
+    maxs.extend(fifo_stats(scope) for scope in scopes[0]['children'])
+
+    with open(hls_model.config.get_output_dir() + '/max_depth.json', 'w') as f:
+        json.dump(maxs, f, indent=4)
+
+    hls_config['Model']['FIFO_opt'] = 0
+    for k, v in hls_model.output_vars.items():
+        filtered_max = [x['max'] for x in maxs if v.cppname in x['name']]
+        if len(filtered_max) == 0:
+            continue
+        if len(filtered_max) > 1:
+            print('WARNING! Check names of FIFOs')
+        layer_config.setdefault(k, {})['StreamDepth'] = filtered_max[0] + 1
+    for x in maxs:
+        if 'in_local' in x['name']:
+            layer_config['in_local'] = {'StreamDepth': x['max'] + 1}
+        elif 'out_local' in x['name']:
+            layer_config['out_local'] = {'StreamDepth': x['max'] + 1}
+
+    cfg['OutputDir'] = out_dir + '_FIFO_OPT'
+    cfg['HLSConfig'] = hls_config
+    hls_model = hls4ml.converters.keras_to_hls(cfg)
+    hls_model.write()
+    hls_model.build(reset=reset, csim=csim, synth=synth, cosim=cosim, validation=validation, export=export, vsynth=vsynth)
+    print('[hls4ml] - FIFO optimization completed')
+    return hls_model
+
+
 def get_unoptimized_hlsmodel(model):
     from hls4ml.converters import convert_from_config
 
diff --git a/hls4ml/templates/supported_boards.json b/hls4ml/templates/supported_boards.json
index 34d676d9cf..8f45dbad27 100644
--- a/hls4ml/templates/supported_boards.json
+++ b/hls4ml/templates/supported_boards.json
@@ -1,14 +1,32 @@
 {
+  "pynq-z1": {
+    "part": "xc7z020clg400-1",
+    "tcl_scripts": {"axi_lite": "axi_lite_design.tcl", "axi_stream":  "axi_stream_design.tcl", "axi_master": "axi_master_design.tcl"},
+    "python_drivers": {"axi_stream": "axi_stream_driver.py"},
+    "c_drivers": { "axi_master": "axi_master_design.c"}
+  },
   "pynq-z2": {
     "part": "xc7z020clg400-1",
-    "tcl_scripts": {"axi_lite": "axi_lite_design.tcl", "axi_stream":  "axi_stream_design.tcl"},
-    "python_drivers": {"axi_stream":  "axi_stream_driver.py"},
-    "c_drivers": {}
+    "tcl_scripts": {"axi_lite": "axi_lite_design.tcl", "axi_stream":  "axi_stream_design.tcl", "axi_master": "axi_master_design.tcl"},
+    "python_drivers": {"axi_stream": "axi_stream_driver.py"},
+    "c_drivers": { "axi_master": "axi_master_design.c"}
   },
   "zcu102": {
     "part": "xczu9eg-ffvb1156-2-e",
-    "tcl_scripts": { "axi_stream": "axi_stream_design.tcl"},
-    "python_drivers": {"axi_stream":  "axi_stream_driver.py"},
+    "tcl_scripts": {"axi_stream": "axi_stream_design.tcl"},
+    "python_drivers": {"axi_stream": "axi_stream_driver.py"},
     "c_drivers": {}
+  },
+  "ultra96v2": {
+    "part": "xczu3eg-sbva484-1-e",
+    "tcl_scripts": {"axi_lite": "axi_lite_design.tcl", "axi_stream":  "axi_stream_design.tcl", "axi_master": "axi_master_design.tcl"},
+    "python_drivers": {"axi_stream": "axi_stream_driver.py"},
+    "c_drivers": { "axi_master": "axi_master_design.c"}
+  },
+  "arty-a7-100t": {
+    "part": "xc7a100tcsg324-1",
+    "tcl_scripts": {"axi_master": "axi_master_design.tcl"},
+    "python_drivers": {},
+    "c_drivers": { "axi_master": "axi_master_design.c"}
   }
-}
\ No newline at end of file
+}
diff --git a/hls4ml/templates/vivado/build_prj.tcl b/hls4ml/templates/vivado/build_prj.tcl
index 0ec992cd4c..e0a47ab25b 100644
--- a/hls4ml/templates/vivado/build_prj.tcl
+++ b/hls4ml/templates/vivado/build_prj.tcl
@@ -11,6 +11,94 @@ array set opt {
   vsynth     0
 }
 
+set tcldir [file dirname [info script]]
+source [file join $tcldir project.tcl]
+
+proc remove_recursive_log_wave {} {
+    # Blank out the recursive "log_wave -r /" command in the generated
+    # simulation script <myproject>_axi.tcl, rewriting the file in place.
+    global myproject
+    set stamp [clock format [clock seconds] -format {%Y%m%d%H%M%S}]
+    set script  ${myproject}_prj/solution1/sim/verilog/${myproject}_axi.tcl
+    set scratch $script.new.$stamp
+
+    set src [open $script  r]
+    set dst [open $scratch w]
+
+    # copy line by line, replacing the matching command with a single blank
+    while {[gets $src text] != -1} {
+        if {$text eq "log_wave -r /"} {
+            set text { }
+        }
+        puts $dst $text
+    }
+
+    close $src
+    close $dst
+
+    # move the rewritten copy over the original file
+    file delete -force $script
+    file rename -force $scratch $script
+}
+
+proc add_vcd_instructions_tcl {} {
+    # Rewrite the simulation script <myproject>_axi.tcl: the "log_wave -r /" line is replaced by commands that
+    # open fifo_opt.vcd and log the usedw and DEPTH objects of the selected scopes, and "quit" first closes the VCD.
+    global myproject
+    set timestamp [clock format [clock seconds] -format {%Y%m%d%H%M%S}]
+    set filename ${myproject}_prj/solution1/sim/verilog/${myproject}_axi.tcl
+    set temp     $filename.new.$timestamp
+    # set backup   $filename.bak.$timestamp
+    set in  [open $filename r]
+    set out [open $temp     w]
+
+    # line-by-line, read the original file
+    while {[gets $in line] != -1} {
+        if {[string equal "$line" "log_wave -r /"]} {
+            set line {current_scope [get_scopes -regex /apatb_myproject_axi_top/AESL_inst_myproject_axi/grp_myproject_fu_.*]
+set scopes [get_scopes -regexp {layer(\d*)_.*data_0_V_U.*}]
+current_scope /apatb_myproject_axi_top/AESL_inst_myproject_axi
+append scopes { }
+append scopes [get_scopes -regexp {.*local_V_data_0.*}]
+open_vcd fifo_opt.vcd
+foreach scope $scopes {
+    current_scope $scope
+    if {[catch [get_objects usedw]] == 0} {
+      puts "$scope skipped"
+      continue
+    }
+    set usedw [get_objects usedw]
+    set depth [get_objects DEPTH]
+    add_wave $usedw
+    log_vcd $usedw
+    log_wave $usedw
+    add_wave $depth
+    log_vcd $depth
+    log_wave $depth
+    }
+    }
+    # the template above hard-codes "myproject"; substitute the actual project name
+    set line [string map [list "myproject" $myproject] $line]
+        }
+        # make the script flush and close the VCD file before it quits
+        if {[string equal "$line" "quit"]} {
+            set line {flush_vcd
+close_vcd
+quit
+}
+        }
+        # then write the transformed line
+        puts $out $line
+    }
+
+    close $in
+    close $out
+
+    # move the new data to the proper filename
+    file delete -force $filename
+    file rename -force $temp $filename
+}
+
 foreach arg $::argv {
   foreach o [lsort [array names opt]] {
     regexp "$o=+(\\w+)" $arg unused opt($o)
@@ -91,7 +179,20 @@ if {$opt(cosim)} {
   # TODO: This is a workaround (Xilinx defines __RTL_SIMULATION__ only for SystemC testbenches).
   add_files -tb myproject_test.cpp -cflags "-std=c++0x -DRTL_SIM"
   set time_start [clock clicks -milliseconds]
-  cosim_design -trace_level all
+
+  cosim_design -trace_level all -setup
+
+  if {$fifo_opt} {
+    puts "\[hls4ml\] - FIFO optimization started"
+    add_vcd_instructions_tcl
+  }
+
+  remove_recursive_log_wave
+  set old_pwd [pwd]
+  cd ${myproject}_prj/solution1/sim/verilog/
+  source run_sim.tcl
+  cd $old_pwd
+
   set time_end [clock clicks -milliseconds]
   puts "INFO:"
   puts [read [open myproject_prj/solution1/sim/report/myproject_cosim.rpt r]]
diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_dense_resource.h b/hls4ml/templates/vivado/nnet_utils/nnet_dense_resource.h
index 756a627434..c6ee9479aa 100644
--- a/hls4ml/templates/vivado/nnet_utils/nnet_dense_resource.h
+++ b/hls4ml/templates/vivado/nnet_utils/nnet_dense_resource.h
@@ -261,6 +261,247 @@ void dense_resource_rf_gt_nin(
     }
 }
 
+// Dense (with ReLU)
+template<class data_T, class res_T, typename CONFIG_T>
+void dense_relu_resource_rf_leq_nin(
+    data_T data[CONFIG_T::n_in],
+    res_T  res[CONFIG_T::n_out],
+    typename CONFIG_T::weight_t weights[CONFIG_T::n_in*CONFIG_T::n_out],
+    typename CONFIG_T::bias_t   biases[CONFIG_T::n_out]) {
+    // Dense layer followed by a ReLU; dense_resource() selects it when merged_relu is set and reuse_factor <= n_in.
+    const int rufactor = CONFIG_T::reuse_factor;
+    const int multfactor = MIN(CONFIG_T::n_in,CONFIG_T::reuse_factor);
+    const int multiplier_limit = DIV_ROUNDUP(CONFIG_T::n_in*CONFIG_T::n_out, multfactor);
+    const int block_factor = DIV_ROUNDUP(CONFIG_T::n_in*CONFIG_T::n_out, CONFIG_T::reuse_factor);
+    const int multscale = multiplier_limit/CONFIG_T::n_out;
+    const int nin = CONFIG_T::n_in;
+    const int nout = CONFIG_T::n_out;
+
+    assert((multiplier_limit % nout == 0 || rufactor >= nin) && "The current Reuse Factor is not allowed");
+    assert((multiplier_limit == block_factor) && "This function is correct only for RF <= N_IN");
+
+    #pragma HLS function_instantiate variable=weights,biases
+    //#pragma HLS RESOURCE variable=weights core=RAM_2P_BRAM Commenting out the designation HLS seems to choose correctly
+    #pragma HLS ARRAY_RESHAPE   variable=weights block factor=block_factor
+    #pragma HLS ARRAY_PARTITION variable=biases complete
+
+    typename CONFIG_T::accum_t acc[CONFIG_T::n_out];
+    #pragma HLS ARRAY_PARTITION variable=acc complete
+    // Every accumulator starts from the bias of its output.
+    InitAccum:
+    for (int iacc = 0; iacc < nout; iacc++) {
+        #pragma HLS UNROLL
+        acc[iacc] = (typename CONFIG_T::accum_t) biases[iacc];
+    }
+
+    ReuseLoop:
+    for (int ir = 0; ir < rufactor; ir++) {
+        #pragma HLS PIPELINE II=1 rewind
+
+        int w_index = ir;
+        int in_index = ir;
+        int out_index = 0;
+        int acc_step = 0;
+
+        MultLoop:
+        for (int im = 0; im < block_factor; im++) {
+            #pragma HLS UNROLL
+
+            acc[out_index] += CONFIG_T::template product<data_T, typename CONFIG_T::weight_t, typename CONFIG_T::accum_t>::product(data[in_index], weights[w_index]);
+
+            // Increment w_index
+            w_index += rufactor;
+            // Increment in_index
+            in_index += rufactor;
+            if (in_index >= nin) {
+                in_index = ir;
+            }
+            // Increment out_index
+            if (acc_step + 1 >= multscale) {
+                acc_step = 0;
+                out_index++;
+            } else {
+                acc_step++;
+            }
+        }
+    }
+
+    // Cast each accumulator to "out_t", then apply the ReLU: values that are not positive are written as 0
+    Result:
+    for (int ires = 0; ires < CONFIG_T::n_out; ires++) {
+        #pragma HLS UNROLL
+        typename CONFIG_T::out_t act = cast<data_T, typename CONFIG_T::out_t, CONFIG_T>(acc[ires]);
+        if (act > 0) res[ires] = act;
+        else res[ires] = 0;
+    }
+}
+
+template<class data_T, class res_T, typename CONFIG_T>
+void dense_relu_resource_rf_gt_nin_rem0(
+    data_T data[CONFIG_T::n_in],
+    res_T  res[CONFIG_T::n_out],
+    typename CONFIG_T::weight_t weights[CONFIG_T::n_in*CONFIG_T::n_out],
+    typename CONFIG_T::bias_t   biases[CONFIG_T::n_out]) {
+    // Dense layer followed by a ReLU; dense_resource() selects it when merged_relu is set, reuse_factor > n_in and reuse_factor % n_in == 0.
+    const int rufactor = MIN(CONFIG_T::reuse_factor, CONFIG_T::n_in * CONFIG_T::n_out);
+    const int multfactor = MIN(CONFIG_T::n_in,CONFIG_T::reuse_factor);
+    const int multiplier_limit = DIV_ROUNDUP(CONFIG_T::n_in*CONFIG_T::n_out, multfactor);
+    const int block_factor = DIV_ROUNDUP(CONFIG_T::n_in*CONFIG_T::n_out, CONFIG_T::reuse_factor);
+    const int multscale = multiplier_limit/CONFIG_T::n_out;
+    const int nin = CONFIG_T::n_in;
+    const int nout = CONFIG_T::n_out;
+
+    assert((multiplier_limit % nout == 0 || rufactor >= nin) && "The current Reuse Factor is not allowed");
+    assert((rufactor > nin && rufactor % nin == 0) && "This function is correct only for RF > N_IN && RF % N_IN == 0");
+
+    #pragma HLS function_instantiate variable=weights,biases
+    //#pragma HLS RESOURCE variable=weights core=RAM_2P_BRAM Commenting out the designation HLS seems to choose correctly
+    #pragma HLS ARRAY_RESHAPE   variable=weights block factor=block_factor
+    #pragma HLS ARRAY_PARTITION variable=biases complete
+
+    typename CONFIG_T::accum_t acc[CONFIG_T::n_out];
+    #pragma HLS ARRAY_PARTITION variable=acc complete
+
+    InitAccum:
+    for (int iacc = 0; iacc < nout; iacc++) {
+        #pragma HLS UNROLL
+        acc[iacc] = (typename CONFIG_T::accum_t) biases[iacc];
+    }
+
+    int w_index;
+    int in_index = 0;
+    int out_index;
+    int outstep = 0;
+    const int outscale = rufactor / nin;
+    // outidx[ir] is the output index at which reuse step ir starts; it advances by one every nin steps.
+    int outidx[rufactor];
+    IndexLoop:
+    for (int ir = 0; ir < rufactor; ir++) {
+        outidx[ir] = outstep;
+        if ((ir + 1) % nin == 0) {
+            outstep++;
+        }
+    }
+
+    ReuseLoop:
+    for (int ir = 0; ir < rufactor; ir++) {
+        #pragma HLS PIPELINE II=1 rewind
+
+        w_index = ir;
+        out_index = outidx[ir]/*outstep*/;
+
+        MultLoop:
+        for (int im = 0; im < block_factor; im++) {
+            #pragma HLS UNROLL
+            acc[out_index] += CONFIG_T::template product<data_T, typename CONFIG_T::weight_t, typename CONFIG_T::accum_t>::product(data[in_index], weights[w_index]);
+
+            w_index += rufactor;
+            if (w_index >= CONFIG_T::n_in * CONFIG_T::n_out) break; // check out of bounds
+            out_index += outscale;
+        }
+
+        in_index++;
+        if (in_index >= nin) {
+            in_index = 0;
+            //outstep++; // This causes a huge increase in scheduling and RTL generation times, hence the above workaround.
+        }
+    }
+
+    // Cast each accumulator to "out_t", then apply the ReLU: values that are not positive are written as 0
+    Result:
+    for (int ires = 0; ires < CONFIG_T::n_out; ires++) {
+        #pragma HLS UNROLL
+        typename CONFIG_T::out_t act = cast<data_T, typename CONFIG_T::out_t, CONFIG_T>(acc[ires]);
+        if (act > 0) res[ires] = act;
+        else res[ires] = 0;
+    }
+}
+
+template<class data_T, class res_T, typename CONFIG_T>
+void dense_relu_resource_rf_gt_nin(
+    data_T data[CONFIG_T::n_in],
+    res_T  res[CONFIG_T::n_out],
+    typename CONFIG_T::weight_t weights[CONFIG_T::n_in*CONFIG_T::n_out],
+    typename CONFIG_T::bias_t   biases[CONFIG_T::n_out]) {
+    // Dense layer followed by a ReLU; dense_resource() selects it when merged_relu is set and reuse_factor is larger than, but not a multiple of, n_in.
+    const int rufactor = CONFIG_T::reuse_factor;
+    const int multfactor = MIN(CONFIG_T::n_in,CONFIG_T::reuse_factor);
+    const int multiplier_limit = DIV_ROUNDUP(CONFIG_T::n_in*CONFIG_T::n_out, multfactor);
+    const int block_factor = DIV_ROUNDUP(CONFIG_T::n_in*CONFIG_T::n_out, CONFIG_T::reuse_factor);
+    const int multscale = multiplier_limit/CONFIG_T::n_out;
+    const int nin = CONFIG_T::n_in;
+    const int nout = CONFIG_T::n_out;
+
+    assert((multiplier_limit % nout == 0 || rufactor >= nin) && "The current Reuse Factor is not allowed");
+    assert((rufactor > nin) && "This function is correct only for RF > N_IN");
+
+    #pragma HLS function_instantiate variable=weights,biases
+    //#pragma HLS RESOURCE variable=weights core=RAM_2P_BRAM Commenting out the designation HLS seems to choose correctly
+    #pragma HLS ARRAY_RESHAPE   variable=weights block factor=block_factor
+    #pragma HLS ARRAY_PARTITION variable=biases complete
+
+    typename CONFIG_T::accum_t acc[CONFIG_T::n_out];
+    #pragma HLS ARRAY_PARTITION variable=acc complete
+
+    InitAccum:
+    for (int iacc = 0; iacc < nout; iacc++) {
+        #pragma HLS UNROLL
+        acc[iacc] = (typename CONFIG_T::accum_t) biases[iacc];
+    }
+
+    ReuseLoop:
+    for (int ir = 0; ir < rufactor; ir++) {
+        #pragma HLS PIPELINE II=1 rewind
+        typename CONFIG_T::accum_t tmpmult[block_factor];
+        #pragma HLS ARRAY_PARTITION variable=tmpmult complete
+
+        MultLoop:
+        for (int im = 0; im < block_factor; im++) {
+            #pragma HLS UNROLL
+            int w_index = ir + rufactor * im;
+            int in_index = w_index % nin;
+            if (w_index >= CONFIG_T::n_in*CONFIG_T::n_out) continue; // check out of bounds
+            tmpmult[im] = CONFIG_T::template product<data_T, typename CONFIG_T::weight_t, typename CONFIG_T::accum_t>::product(data[in_index], weights[w_index]);
+        }
+        // The products of this reuse step are first summed into mult[w_index / multfactor], then added to acc.
+        typename CONFIG_T::accum_t mult[multiplier_limit];
+        #pragma HLS ARRAY_PARTITION variable=mult complete
+
+        ResetMult:
+        for (int imult = 0; imult < multiplier_limit; imult++) {
+            #pragma HLS UNROLL
+            mult[imult] = 0;
+        }
+
+        AccumLoop1:
+        for (int im = 0; im < block_factor; im++) {
+            #pragma HLS UNROLL
+            int w_index = ir + rufactor * im;
+            int out_index = w_index / multfactor;
+            if (out_index >= multiplier_limit) continue; // check out of bounds
+            mult[out_index] += tmpmult[im];
+        }
+
+        AccumLoop2:
+        for (int im = 0; im < multiplier_limit; im++) {
+            #pragma HLS UNROLL
+            //int out_index = im/multscale; // This is the general case
+            //acc[out_index] += mult[im];
+            acc[im] += mult[im]; // If RF > N_IN then multiplier_limit == n_out
+        }
+    }
+
+    // Cast each accumulator to "out_t", then apply the ReLU: values that are not positive are written as 0
+    Result:
+    for (int ires = 0; ires < CONFIG_T::n_out; ires++) {
+        #pragma HLS UNROLL
+        typename CONFIG_T::out_t act = cast<data_T, typename CONFIG_T::out_t, CONFIG_T>(acc[ires]);
+        if (act > 0) res[ires] = act;
+        else res[ires] = 0;
+    }
+}
+
+
 template<class data_T, class res_T, typename CONFIG_T>
 void dense_resource(
     data_T data[CONFIG_T::n_in],
@@ -270,15 +511,25 @@ void dense_resource(
 
     #pragma HLS INLINE region
 
-    if (CONFIG_T::reuse_factor <= CONFIG_T::n_in) {
-        dense_resource_rf_leq_nin<data_T, res_T, CONFIG_T>(data, res, weights, biases);
-    } else if (CONFIG_T::reuse_factor % CONFIG_T::n_in == 0) {
-        dense_resource_rf_gt_nin_rem0<data_T, res_T, CONFIG_T>(data, res, weights, biases);
-    } else {
-        dense_resource_rf_gt_nin<data_T, res_T, CONFIG_T>(data, res, weights, biases);
-    }
+    if (CONFIG_T::merged_relu) {
+	    if (CONFIG_T::reuse_factor <= CONFIG_T::n_in) {
+	        dense_relu_resource_rf_leq_nin<data_T, res_T, CONFIG_T>(data, res, weights, biases);
+	    } else if (CONFIG_T::reuse_factor % CONFIG_T::n_in == 0) {
+	        dense_relu_resource_rf_gt_nin_rem0<data_T, res_T, CONFIG_T>(data, res, weights, biases);
+	    } else {
+	        dense_relu_resource_rf_gt_nin<data_T, res_T, CONFIG_T>(data, res, weights, biases);
+	    }
+	} else {
+	    if (CONFIG_T::reuse_factor <= CONFIG_T::n_in) {
+	        dense_resource_rf_leq_nin<data_T, res_T, CONFIG_T>(data, res, weights, biases);
+	    } else if (CONFIG_T::reuse_factor % CONFIG_T::n_in == 0) {
+	        dense_resource_rf_gt_nin_rem0<data_T, res_T, CONFIG_T>(data, res, weights, biases);
+	    } else {
+	        dense_resource_rf_gt_nin<data_T, res_T, CONFIG_T>(data, res, weights, biases);
+	    }
+	}
 }
 
 }
 
-#endif
+#endif
\ No newline at end of file
diff --git a/hls4ml/templates/vivado_accelerator/arty-a7-100t/c_drivers/sdk/Makefile b/hls4ml/templates/vivado_accelerator/arty-a7-100t/c_drivers/sdk/Makefile
new file mode 100644
index 0000000000..03ab9b8de7
--- /dev/null
+++ b/hls4ml/templates/vivado_accelerator/arty-a7-100t/c_drivers/sdk/Makefile
@@ -0,0 +1,33 @@
+DESIGN := design_1
+# Default target: print a usage hint.
+help:
+	@echo "INFO: make <TAB> to show targets"
+.PHONY: help
+# Internal target: run setup.tcl through xsct for $(DESIGN).
+--setup:
+	xsct ./setup.tcl $(DESIGN)
+.PHONY: --setup
+# Swap the generated helloworld.c for links to the shared sources; -f keeps ln from failing on existing links.
+sdk: --setup
+	rm -f $(DESIGN)_standalone/src/helloworld.c
+	cd  $(DESIGN)_standalone/src && ln -sf ../../common/main.c main.c
+	cd  $(DESIGN)_standalone/src && ln -sf ../../common/data.h data.h
+.PHONY: sdk
+# Launch xsdk with this directory as its workspace, in the background.
+gui:
+	xsdk --workspace . &
+.PHONY: gui
+# Remove the generated project directories, tool metadata and log files.
+clean:
+	rm -rf $(DESIGN)_platform
+	rm -rf $(DESIGN)_standalone
+	rm -rf $(DESIGN)_standalone_bsp
+	rm -rf RemoteSystemsTempFiles
+	rm -rf .Xil
+	rm -rf .metadata
+	rm -f *.log
+.PHONY: clean
+# clean, plus exported .hdf files. NOTE(review): setup.tcl reads ../hdf/, not hdf/ - confirm this path.
+ultraclean: clean
+	rm -rf hdf/*.hdf
+.PHONY: ultraclean
diff --git a/hls4ml/templates/vivado_accelerator/arty-a7-100t/c_drivers/sdk/common/main.c b/hls4ml/templates/vivado_accelerator/arty-a7-100t/c_drivers/sdk/common/main.c
new file mode 100644
index 0000000000..41f5dca282
--- /dev/null
+++ b/hls4ml/templates/vivado_accelerator/arty-a7-100t/c_drivers/sdk/common/main.c
@@ -0,0 +1,351 @@
+/**
+ *
+ * Set Heap Size in ldscript.ld to 0x1000000 (16MB)
+ *
+ */
+
+#include "xmyproject_axi.h"  /* TODO: design-dependent name */
+#include "stdio.h"       /* PRINTF */
+#include "unistd.h"      /* sleep */
+#include "stdlib.h"
+#include "malloc.h"
+#include "assert.h"
+#include "xil_io.h"      /* peripheral read/write wrappers */
+#include "platform.h"    /* platform init/cleanup functions */
+#include "xil_cache.h"   /* enable/disable caches etc */
+#include "xil_printf.h"  /* UART debug print functions */
+#include "xparameters.h" /* peripherals base addresses */
+#include "xtmrctr.h"     /* timer, Xilinx IP Timer Counter */
+
+#include "data.h"
+
+#define EEMBC_POWER 1
+
+#ifdef EEMBC_POWER
+#include "xgpio.h"       /* AXI GPIO drivers */
+
+#define PIN 0x01
+#define GPIO_PMOD_PIN_DEVICE_ID  XPAR_GPIO_0_DEVICE_ID
+
+#define set_pin_high(InstancePtr, Mask) \
+        XGpio_DiscreteWrite(InstancePtr, 1, Mask)
+
+#define set_pin_low(InstancePtr, Mask) \
+        XGpio_DiscreteClear(InstancePtr, 1, Mask)
+
+XGpio Gpio;
+#endif
+
+
+//#define __DEBUG__
+
+#define MAX_PRINT_ELEMENTS (16)
+
+#define PRINTF printf
+
+const unsigned INPUT_N_ELEMENTS = N_SAMPLES * N_X_INPUTS;
+const unsigned OUTPUT_N_ELEMENTS = N_SAMPLES * N_Y_OUTPUTS;
+
+#if 1
+/* Accelerator verification */
+#define REFERENCE_OUTPUTS data_y_hls_outputs
+#else
+/* Accelerator validation */
+#define REFERENCE_OUTPUTS data_y_outputs
+//#define REFERENCE_OUTPUTS data_y_keras_outputs
+#endif
+
+/* Return the index of the largest of the n_elements values (last index wins ties; 0 if empty). */
+unsigned get_max(float *data, unsigned n_elements) {
+    unsigned max_index = 0;
+    /* Compare against data[max_index], not a 0.0 seed, so all-negative inputs are ranked correctly. */
+    for (unsigned i = 1; i < n_elements; i++) {
+        if (data[i] >= data[max_index])
+            max_index = i;
+    }
+    return max_index;
+}
+
+float *inputs_mem = NULL;
+float *outputs_mem = NULL;
+float *reference_mem = NULL;
+
+/* Accelerator configuration */
+XMyproject_axi accelerator; /* TODO: design-dependent name */
+XMyproject_axi_Config *accelerator_cfg; /* TODO: design-dependent name */
+
+/* Look up the accelerator configuration and initialise the driver; a failed lookup is now reported too. */
+void init_accelerators() {
+    PRINTF("INFO: Initializing accelerator\r\n");
+    accelerator_cfg = XMyproject_axi_LookupConfig(XPAR_MYPROJECT_AXI_DEVICE_ID); /* TODO: design-dependent name */
+    if (!accelerator_cfg) {
+        PRINTF("ERROR: Accelerator configuration not found\r\n");
+        return;
+    }
+    if (XMyproject_axi_CfgInitialize(&accelerator, accelerator_cfg) != XST_SUCCESS) /* TODO: design-dependent name */
+        PRINTF("ERROR: Initializing accelerator\r\n");
+}
+
+/* Software "reference": copies the precomputed REFERENCE_OUTPUTS values into sw_outputs_mem and returns 0. */
+int sw_reference_implementation(float *sw_inputs_mem, float *sw_outputs_mem, unsigned n_samples, unsigned n_X_inputs, unsigned n_y_ouputs) {
+#ifdef __DEBUG__
+    PRINTF("INFO: Reference outputs are pre-compiled. It would be nice to run a software model here.\r\n");
+#endif
+    /* The values come from data.h; sw_inputs_mem and n_X_inputs are not read */
+    const unsigned n_values = n_samples * n_y_ouputs;
+    for (unsigned k = n_values; k-- > 0; )
+        sw_outputs_mem[k] = REFERENCE_OUTPUTS[k];
+    return 0;
+}
+
+/* Profiling utilities */
+static XTmrCtr TimerCounterInst;
+#define TMRCTR_DEVICE_ID    XPAR_TMRCTR_0_DEVICE_ID
+#define TIMER_CNTR_0        0
+#define TIMER_CNTR_1        1
+/* Start timer counters 0 and 1 of TimerCounterInst. */
+void start_64b_counter() {
+    XTmrCtr_Start(&TimerCounterInst, TIMER_CNTR_0);
+    XTmrCtr_Start(&TimerCounterInst, TIMER_CNTR_1);
+}
+/* Stop timer counters 0 and 1 of TimerCounterInst. */
+void stop_64b_counter() {
+    XTmrCtr_Stop(&TimerCounterInst, TIMER_CNTR_0);
+    XTmrCtr_Stop(&TimerCounterInst, TIMER_CNTR_1);
+}
+/* Read both counters and combine them into one 64-bit value: counter 1 is the upper 32 bits, counter 0 the lower. */
+u64 get_64b_counter_value() {
+    //printf("bytes %u\n\r", sizeof(u64));
+    u64 lo_counter = XTmrCtr_GetValue(&TimerCounterInst, TIMER_CNTR_0);
+    u64 hi_counter = XTmrCtr_GetValue(&TimerCounterInst, TIMER_CNTR_1);
+    u64 counter = (hi_counter << 32) | lo_counter; /* hi_counter is already u64, so the shift keeps its bits */
+    //printf("INFO: hi = %lu, lo = %lu, total = %lu\n\r", hi_counter, lo_counter, counter);
+    return counter;
+}
+
+#if 0
+double get_elapsed_time(u64 clk_start, u64 clk_stop) {
+    return ((clk_stop-clk_start) * (1.0/XPAR_AXI_TIMER_MCU_CLOCK_FREQ_HZ));
+}
+#endif
+/* Convert a tick count to nanoseconds using XPAR_AXI_TIMER_MCU_CLOCK_FREQ_HZ; the result is returned as float. */
+float get_elapsed_time_ns(u64 clks) {
+    return clks * 1000000000.0/XPAR_AXI_TIMER_MCU_CLOCK_FREQ_HZ;
+}
+
+
+/* Print a labelled table of floats, one row per sample, limited to the first MAX_PRINT_ELEMENTS samples. */
+void dump_data(const char* label, float* data, unsigned n_samples, unsigned feature_count) {
+    /* Row limit: the smaller of the sample count and MAX_PRINT_ELEMENTS */
+    const unsigned n_rows = (n_samples < MAX_PRINT_ELEMENTS) ? n_samples : MAX_PRINT_ELEMENTS;
+    const float *row = data;
+    PRINTF("INFO:   %s[%u][%u]:\r\n", label, n_samples, feature_count);
+    for (unsigned r = 0; r < n_rows; r++, row += feature_count) {
+        PRINTF("INFO:     [%u] ", r);
+        for (unsigned c = 0; c < feature_count; c++)
+            PRINTF("%f ", row[c]);
+        PRINTF("\r\n");
+    }
+}
+
+/*
+ * Program entry point: runs every sample from data.h through the accelerator (polling for
+ * completion), prints timing figures and compares the outputs with the reference values.
+ * Returns 0 when the run completes (even if verification reports errors), or a failure
+ * status when timer, memory or GPIO setup fails.
+ */
+int main(int argc, char** argv) {
+
+    int status;
+    u64 calibration_time;
+    double __attribute__ ((unused)) sw_elapsed = 0;
+    u64 hw_elapsed = 0;
+    u64 cache_elapsed = 0;
+    unsigned hw_errors;
+
+    char __attribute__ ((unused)) dummy; /* dummy input */
+
+    /* Initialize platform (uart and caches) */
+    init_platform();
+
+    PRINTF("\r\n");
+    PRINTF("INFO: ==================================================\r\n");
+    PRINTF("INFO: XMyproject_axi (w/ polling)\r\n"); /* TODO: design-dependent name */
+    PRINTF("INFO: ==================================================\r\n");
+
+    init_accelerators();
+
+    /* Timer Counter */
+    status = XTmrCtr_Initialize(&TimerCounterInst, TMRCTR_DEVICE_ID);
+    if (status != XST_SUCCESS){
+        print("ERROR: Timer counter initialization failed \r\n");
+        return status;
+    }
+
+    XTmrCtr_SetOptions(&TimerCounterInst, TIMER_CNTR_0,
+                XTC_AUTO_RELOAD_OPTION |
+                XTC_CASCADE_MODE_OPTION);
+
+    print("INFO: Timer counter initialized\r\n");
+
+    inputs_mem = malloc(INPUT_N_ELEMENTS * sizeof(float));
+    outputs_mem = malloc(OUTPUT_N_ELEMENTS * sizeof(float));
+    reference_mem = malloc(OUTPUT_N_ELEMENTS * sizeof(float));
+    /* Needs a large enough heap: set Heap Size in ldscript.ld to 0x1000000 (16MB) */
+    if (!inputs_mem || !outputs_mem || !reference_mem) {
+        PRINTF("ERROR: Memory allocation failed\r\n");
+        return XST_FAILURE;
+    }
+
+    /* Calibration; get_elapsed_time_ns() yields nanoseconds, so every "sec" figure divides by 1e9 */
+    start_64b_counter();
+    sleep(1);
+    stop_64b_counter();
+    calibration_time = get_64b_counter_value();
+    PRINTF("INFO: Time calibration for one second (%lf sec, %llu clk)\r\n", get_elapsed_time_ns(calibration_time) / 1e9, calibration_time);
+
+    /* Initialize memory */
+    PRINTF("INFO: Initialize memory\r\n");
+    PRINTF("INFO:   - Samples count: %u\r\n", N_SAMPLES); /* Same as dst_SAMPLE_COUNT */
+    PRINTF("INFO:   - Inputs count: %u\r\n", N_X_INPUTS);
+    PRINTF("INFO:   - Outputs count: %u\r\n", N_Y_OUTPUTS);
+    PRINTF("INFO:   - Data size: %u B\r\n", sizeof(float));
+    PRINTF("INFO:   - Total input size: %u B, %.2f KB, %.2f MB\r\n", N_X_INPUTS * N_SAMPLES * sizeof(float), (N_X_INPUTS * N_SAMPLES * sizeof(float)) / (float)1024, (N_X_INPUTS * N_SAMPLES * sizeof(float)) / (float)(1024*1024));
+    PRINTF("INFO:   - Total output size: %u B, %.2f KB, %.2f MB\r\n", N_Y_OUTPUTS * N_SAMPLES * sizeof(float), (N_Y_OUTPUTS * N_SAMPLES * sizeof(float)) / (float)1024, (N_Y_OUTPUTS * N_SAMPLES * sizeof(float)) / (float)(1024*1024));
+
+    for (unsigned i = 0; i < INPUT_N_ELEMENTS; i++) {
+        inputs_mem[i] = data_X_inputs[i];
+    }
+    for (unsigned i = 0; i < OUTPUT_N_ELEMENTS; i++) {
+        outputs_mem[i] = 0x0;
+    }
+
+    /* ****** SW REFERENCE ****** */
+    PRINTF("INFO: ==================================================\r\n");
+    PRINTF("INFO: Start SW reference implementation\r\n");
+    start_64b_counter();
+    sw_reference_implementation(inputs_mem, reference_mem, N_SAMPLES, N_X_INPUTS, N_Y_OUTPUTS);
+    stop_64b_counter();
+    sw_elapsed = get_64b_counter_value();
+    PRINTF("INFO: ==================================================\r\n");
+    PRINTF("INFO: Press any key to start:\r\n");
+    dummy = inbyte();
+
+    /* ****** HW ACCELERATOR ****** */
+    PRINTF("INFO: Start HW accelerator\r\n");
+    start_64b_counter();
+    Xil_DCacheFlushRange((UINTPTR)inputs_mem, INPUT_N_ELEMENTS * sizeof(float));
+    Xil_DCacheFlushRange((UINTPTR)outputs_mem, OUTPUT_N_ELEMENTS * sizeof(float));
+    Xil_DCacheFlushRange((UINTPTR)reference_mem, OUTPUT_N_ELEMENTS * sizeof(float));
+    stop_64b_counter();
+    cache_elapsed = get_64b_counter_value();
+
+    for (unsigned j = 0; j < N_SAMPLES; j++) {
+        float *inputs_mem_i = inputs_mem + j * N_X_INPUTS;
+        float *outputs_mem_i = outputs_mem + j * N_Y_OUTPUTS;
+
+        /* Configure the accelerator */
+        start_64b_counter();
+        XMyproject_axi_Set_in_r(&accelerator, (unsigned)inputs_mem_i); /* TODO: design-dependent name */
+        XMyproject_axi_Set_out_r(&accelerator, (unsigned)outputs_mem_i); /* TODO: design-dependent name */
+
+        XMyproject_axi_Start(&accelerator); /* TODO: design-dependent name */
+
+        /* Polling */
+        while (!XMyproject_axi_IsDone(&accelerator)); /* TODO: design-dependent name */
+
+        stop_64b_counter();
+        hw_elapsed += get_64b_counter_value();
+    }
+
+    start_64b_counter();
+    Xil_DCacheFlushRange((UINTPTR)outputs_mem, OUTPUT_N_ELEMENTS * sizeof(float));
+    stop_64b_counter();
+    cache_elapsed += get_64b_counter_value();
+
+    PRINTF("INFO: HW accelerator done!\r\n");
+
+    /* ****** VALIDATION ****** */
+    PRINTF("INFO: ================== Verification ==================\r\n");
+#ifdef __DEBUG__
+    PRINTF("INFO: Dump data\r\n");
+    dump_data("inputs_mem", inputs_mem, N_SAMPLES, N_X_INPUTS);
+    dump_data("outputs_mem", outputs_mem, N_SAMPLES, N_Y_OUTPUTS);
+    dump_data("reference_mem", reference_mem, N_SAMPLES, N_Y_OUTPUTS);
+    PRINTF("INFO: SW execution time: %f sec\r\n", get_elapsed_time_ns((u64)sw_elapsed) / 1e9);
+#endif
+    PRINTF("INFO: HW-acceleration exec. time (%d inferences):\r\n", N_SAMPLES);
+    PRINTF("INFO:   - total %f sec\r\n", get_elapsed_time_ns(hw_elapsed) / 1e9);
+    PRINTF("INFO:   - per-inference %.12f sec (%f ns)\r\n", get_elapsed_time_ns(hw_elapsed) / 1e9 / (N_SAMPLES), get_elapsed_time_ns(hw_elapsed) / (N_SAMPLES));
+    PRINTF("INFO: Cache flush time: %f sec\r\n", get_elapsed_time_ns(cache_elapsed) / 1e9);
+#ifdef __DEBUG__
+    PRINTF("INFO: HW/SW speedup (the software is fake so this does not count...): %.2f X\r\n", (sw_elapsed >= (hw_elapsed+cache_elapsed))?(sw_elapsed/(hw_elapsed+cache_elapsed)):-((hw_elapsed+cache_elapsed)/sw_elapsed));
+#endif
+
+    hw_errors = 0;
+#if 1
+    /* Accelerator verification */
+    for (unsigned i = 0; i < OUTPUT_N_ELEMENTS; i++) {
+        if (outputs_mem[i] != reference_mem[i]) {
+            PRINTF("ERROR: [%u]: Accelerator HW %f != SW %f\r\n", i, outputs_mem[i], reference_mem[i]);
+            hw_errors++;
+        }
+    }
+    PRINTF("INFO: Total errors = %u (out of %u elements)\r\n", hw_errors, OUTPUT_N_ELEMENTS);
+    if (hw_errors > 0)
+        PRINTF("INFO: Verification: FAIL\r\n");
+    else
+        PRINTF("INFO: Verification: PASS!\r\n");
+#else
+    /* Accelerator validation */
+    for (unsigned s = 0; s < N_SAMPLES; s++) {
+        unsigned ref_digit = get_max(reference_mem + s * N_Y_OUTPUTS, N_Y_OUTPUTS);
+        unsigned hw_digit = get_max(outputs_mem + s * N_Y_OUTPUTS, N_Y_OUTPUTS);
+        if (hw_digit != ref_digit) {
+#ifdef __DEBUG__
+            PRINTF("ERROR: [%u]: Accelerator HW %u != SW %u\r\n", s, hw_digit, ref_digit);
+#endif
+            hw_errors++;
+        }
+    }
+    float error_rate = (hw_errors / (float)(N_SAMPLES)) * 100.0;
+    float accuracy = 100 - ((hw_errors / (float)(N_SAMPLES)) * 100.0);
+    PRINTF("INFO: Total errors = %u (out of %d digits)\r\n", hw_errors, N_SAMPLES);
+    PRINTF("INFO: Error rate = %.2f %%\r\n", error_rate);
+    PRINTF("INFO: Accuracy = %.2f %%\r\n", accuracy);
+#endif
+
+    PRINTF("INFO: ==================================================\r\n");
+
+#ifdef EEMBC_POWER
+    /* Initialize the GPIO driver */
+    status = XGpio_Initialize(&Gpio, GPIO_PMOD_PIN_DEVICE_ID);
+    if (status != XST_SUCCESS) {
+        xil_printf("GPIO Initialization Failed\r\n");
+        return XST_FAILURE;
+    }
+
+    set_pin_low(&Gpio, PIN);
+
+    PRINTF("INFO: Connect logic analyzer to the pin 3 of Pmod D\r\n");
+    PRINTF("INFO: Press any key to start:\r\n");
+    dummy = inbyte();
+
+    /* Toggle the pin 100 times, 1 s high then 1 s low; the counter must start at 0 (it was uninitialised) */
+    for (unsigned i = 0; i < 100; i++) {
+        set_pin_high(&Gpio, PIN);
+
+        sleep(1);
+
+        set_pin_low(&Gpio, PIN);
+
+        sleep(1);
+    }
+#endif
+
+    cleanup_platform();
+
+    return 0;
+}
+
+
diff --git a/hls4ml/templates/vivado_accelerator/arty-a7-100t/c_drivers/sdk/setup.tcl b/hls4ml/templates/vivado_accelerator/arty-a7-100t/c_drivers/sdk/setup.tcl
new file mode 100644
index 0000000000..383bf39cf7
--- /dev/null
+++ b/hls4ml/templates/vivado_accelerator/arty-a7-100t/c_drivers/sdk/setup.tcl
@@ -0,0 +1,14 @@
+# XSCT script: creates a hardware project and a standalone application for the design named by its one argument.
+# See https://www.xilinx.com/html_docs/xilinx2019_1/SDK_Doc/xsct/intro/xsct_introduction.html
+# Nothing is created unless exactly one argument (the design name) is passed.
+setws .
+if { $::argc == 1 } {
+    set myproject [lindex $::argv 0]
+    createhw -name ${myproject}\_platform -hwspec ../hdf/${myproject}\_wrapper.hdf
+    createapp -name ${myproject}\_standalone -app {Hello World} -proc microblaze_mcu -hwproject ${myproject}\_platform -os standalone
+    configapp -app ${myproject}\_standalone build-config release
+    #configapp -app ${myproject}\_standalone -add linker-misc {-Wl,--defsym=_HEAP_SIZE=0x1000000}
+    #configapp -app ${myproject}\_standalone -add linker-misc {-Wl,--defsym=_STACK_SIZE=0x40000}
+    projects -build
+    #configapp -app ${myproject}\_standalone -add define-compiler-symbols {FLAG=VALUE}
+}
diff --git a/hls4ml/templates/vivado_accelerator/arty-a7-100t/tcl_scripts/axi_master_design.tcl b/hls4ml/templates/vivado_accelerator/arty-a7-100t/tcl_scripts/axi_master_design.tcl
new file mode 100644
index 0000000000..67d667b063
--- /dev/null
+++ b/hls4ml/templates/vivado_accelerator/arty-a7-100t/tcl_scripts/axi_master_design.tcl
@@ -0,0 +1,193 @@
+set tcldir [file dirname [info script]]
+source [file join $tcldir project.tcl]
+# NOTE(review): ${myproject} and ${eembc_power} are used below but never set here - presumably project.tcl defines them.
+# Project names
+set project_name "project_1"
+set design_name "design_1"
+set hls_solution_name "solution1"
+set acc_name "${myproject}_axi"
+set part_name "xc7a100tcsg324-1"
+set board_name "digilentinc.com:arty-a7-100:part0:1.0"
+
+# Set board and chip part names
+create_project ${project_name} ${myproject}_vivado_accelerator -part ${part_name} -force
+set_property board_part ${board_name} [current_project]
+
+# Create block design
+create_bd_design ${design_name}
+
+# Setup IP repo
+#set_property  ip_repo_paths ${myproject}_prj [current_project]
+set_property  ip_repo_paths ${myproject}_prj/${hls_solution_name}/impl/ip [current_project]
+update_ip_catalog
+
+# Create clock wizard
+create_bd_cell -type ip -vlnv xilinx.com:ip:clk_wiz:6.0 clk_wiz_0
+apply_board_connection -board_interface "sys_clock" -ip_intf "clk_wiz_0/clock_CLK_IN1" -diagram ${design_name}
+set_property name clk_wizard [get_bd_cells clk_wiz_0]
+set_property -dict [list CONFIG.CLKOUT2_USED {true} CONFIG.CLKOUT1_REQUESTED_OUT_FREQ {166.667} CONFIG.CLKOUT2_REQUESTED_OUT_FREQ {200.00} CONFIG.MMCM_CLKOUT0_DIVIDE_F {6.000} CONFIG.MMCM_CLKOUT1_DIVIDE {5} CONFIG.NUM_OUT_CLKS {2} CONFIG.CLKOUT1_JITTER {118.758} CONFIG.CLKOUT2_JITTER {114.829} CONFIG.CLKOUT2_PHASE_ERROR {98.575}] [get_bd_cells clk_wizard]
+#set_property -dict [list CONFIG.RESET_TYPE {ACTIVE_LOW} CONFIG.RESET_PORT {resetn}] [get_bd_cells clk_wizard]
+
+# Create MIG
+create_bd_cell -type ip -vlnv xilinx.com:ip:mig_7series:4.2 mig_7series_0
+apply_board_connection -board_interface "ddr3_sdram" -ip_intf "mig_7series_0/mig_ddr_interface" -diagram ${design_name}
+
+# Wire MIG and clock wizard
+delete_bd_objs [get_bd_nets clk_ref_i_1] [get_bd_ports clk_ref_i]
+delete_bd_objs [get_bd_nets sys_clk_i_1] [get_bd_ports sys_clk_i]
+connect_bd_net [get_bd_pins clk_wizard/clk_out2] [get_bd_pins mig_7series_0/clk_ref_i]
+connect_bd_net [get_bd_pins clk_wizard/clk_out1] [get_bd_pins mig_7series_0/sys_clk_i]
+
+# Setup reset
+#set_property -dict [list CONFIG.RESET_BOARD_INTERFACE {reset}] [get_bd_cells clk_wizard]
+apply_bd_automation -rule xilinx.com:bd_rule:board -config { Board_Interface {reset ( System Reset ) } Manual_Source {New External Port (ACTIVE_LOW)}}  [get_bd_pins mig_7series_0/sys_rst]
+
+# Create instance of MicroBlaze
+create_bd_cell -type ip -vlnv xilinx.com:ip:microblaze:11.0 microblaze_mcu
+apply_bd_automation -rule xilinx.com:bd_rule:microblaze -config { \
+    axi_intc {0} \
+    axi_periph {Enabled} \
+    cache {16KB} \
+    clk {/mig_7series_0/ui_clk (83 MHz)} \
+    debug_module {Debug Only} \
+    ecc {None} \
+    local_mem {32KB} \
+    preset {None} } [get_bd_cells microblaze_mcu]
+
+# Resize data and instruction caches
+set_property -dict [list CONFIG.C_ADDR_TAG_BITS {18} CONFIG.C_CACHE_BYTE_SIZE {1024} CONFIG.C_DCACHE_ADDR_TAG {18} CONFIG.C_DCACHE_BYTE_SIZE {1024}] [get_bd_cells microblaze_mcu]
+
+# Enable full FPU
+set_property -dict [list CONFIG.C_USE_FPU {2}] [get_bd_cells microblaze_mcu]
+
+# Create UART interface
+#create_bd_cell -type ip -vlnv xilinx.com:ip:axi_uart16550:2.0 axi_uart
+#apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {/mig_7series_0/ui_clk (83 MHz)} Clk_slave {Auto} Clk_xbar {Auto} Master {/microblaze_mcu (Periph)} Slave {/axi_uart/S_AXI} intc_ip {New AXI Interconnect} master_apm {0}}  [get_bd_intf_pins axi_uart/S_AXI]
+#apply_bd_automation -rule xilinx.com:bd_rule:board -config { Board_Interface {usb_uart ( USB UART ) } Manual_Source {Auto}}  [get_bd_intf_pins axi_uart/UART]
+
+# Create UART-lite interface
+create_bd_cell -type ip -vlnv xilinx.com:ip:axi_uartlite:2.0 axi_uart
+if { ${eembc_power} } {
+    set_property -dict [list CONFIG.C_BAUDRATE {9600}] [get_bd_cells axi_uart]
+} else {
+    apply_board_connection -board_interface "usb_uart" -ip_intf "axi_uart/UART" -diagram ${design_name}
+    set_property -dict [list CONFIG.C_BAUDRATE {115200}] [get_bd_cells axi_uart]
+}
+apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { \
+    Clk_master {/mig_7series_0/ui_clk (83 MHz)} \
+    Clk_slave {Auto} \
+    Clk_xbar {Auto} \
+    Master {/microblaze_mcu (Periph)} \
+    Slave {/axi_uart/S_AXI} \
+    intc_ip {New AXI Interconnect} \
+    master_apm {0}} [get_bd_intf_pins axi_uart/S_AXI]
+
+# Forward UART interface to PMOD pins
+if { ${eembc_power} } {
+    create_bd_port -dir O pmod_uart_txd
+    create_bd_port -dir I pmod_uart_rxd
+    connect_bd_net [get_bd_pins /axi_uart/tx] [get_bd_ports pmod_uart_txd]
+    connect_bd_net [get_bd_pins /axi_uart/rx] [get_bd_ports pmod_uart_rxd]
+    add_files -fileset constrs_1 -norecurse uart_pmod.xdc
+}
+
+apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { \
+    Clk_master {/mig_7series_0/ui_clk (83 MHz)} \
+    Clk_slave {/mig_7series_0/ui_clk (83 MHz)} \
+    Clk_xbar {/mig_7series_0/ui_clk (83 MHz)} \
+    Master {/microblaze_mcu (Cached)} \
+    Slave {/mig_7series_0/S_AXI} \
+    intc_ip {Auto} master_apm {0} } [get_bd_intf_pins mig_7series_0/S_AXI]
+# NOTE(review): repeats the axi_uart/S_AXI automation call made earlier in this script - looks redundant; confirm before removing.
+apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { \
+    Clk_master {/mig_7series_0/ui_clk (83 MHz)} \
+    Clk_slave {Auto} \
+    Clk_xbar {Auto} \
+    Master {/microblaze_mcu (Periph)} \
+    Slave {/axi_uart/S_AXI} \
+    intc_ip {New AXI Interconnect} \
+    master_apm {0} } [get_bd_intf_pins axi_uart/S_AXI]
+
+# Add accelerator and connect s-axi interface
+create_bd_cell -type ip -vlnv xilinx.com:hls:${acc_name}:1.0 ${acc_name}
+apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {/mig_7series_0/ui_clk (83 MHz)} Clk_slave {Auto} Clk_xbar {/mig_7series_0/ui_clk (83 MHz)} Master {/microblaze_mcu (Periph)} Slave {/${acc_name}/s_axi_CTRL_BUS} intc_ip {/microblaze_mcu_axi_periph} master_apm {0}}  [get_bd_intf_pins ${acc_name}/s_axi_CTRL_BUS]
+
+# Connect m-axi interfaces
+apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {/mig_7series_0/ui_clk (83 MHz)} Clk_slave {/mig_7series_0/ui_clk (83 MHz)} Clk_xbar {/mig_7series_0/ui_clk (83 MHz)} Master {/${acc_name}/m_axi_IN_BUS} Slave {/mig_7series_0/S_AXI} intc_ip {/axi_smc} master_apm {0}}  [get_bd_intf_pins ${acc_name}/m_axi_IN_BUS]
+apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {/mig_7series_0/ui_clk (83 MHz)} Clk_slave {/mig_7series_0/ui_clk (83 MHz)} Clk_xbar {/mig_7series_0/ui_clk (83 MHz)} Master {/${acc_name}/m_axi_OUT_BUS} Slave {/mig_7series_0/S_AXI} intc_ip {/axi_smc} master_apm {0}}  [get_bd_intf_pins ${acc_name}/m_axi_OUT_BUS]
+
+# Reset
+apply_bd_automation -rule xilinx.com:bd_rule:board -config { Board_Interface {reset ( System Reset ) } Manual_Source {Auto}}  [get_bd_pins clk_wizard/reset]
+
+# Add timer
+create_bd_cell -type ip -vlnv xilinx.com:ip:axi_timer:2.0 axi_timer_mcu
+set_property -dict [list CONFIG.enable_timer2 {1}] [get_bd_cells axi_timer_mcu]
+
+# Wire timer
+apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {/mig_7series_0/ui_clk (83 MHz)} Clk_slave {Auto} Clk_xbar {/mig_7series_0/ui_clk (83 MHz)} Master {/microblaze_mcu (Periph)} Slave {/axi_timer_mcu/S_AXI} intc_ip {/microblaze_mcu_axi_periph} master_apm {0}}  [get_bd_intf_pins axi_timer_mcu/S_AXI]
+
+# Add AXI GPIO controlled pin
+if { ${eembc_power} } {
+    # Add AXI GPIO IP
+    create_bd_cell -type ip -vlnv xilinx.com:ip:axi_gpio:2.0 axi_gpio_0
+    # Wire it up to a single output pin (to a PMOD)
+    set_property -dict [list CONFIG.C_GPIO_WIDTH {1} CONFIG.C_ALL_OUTPUTS {1}] [get_bd_cells axi_gpio_0]
+    apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { \
+        Clk_master {/mig_7series_0/ui_clk (83 MHz)} \
+        Clk_slave {Auto} \
+        Clk_xbar {/mig_7series_0/ui_clk (83 MHz)} \
+        Master {/microblaze_mcu (Periph)} \
+        Slave {/axi_gpio_0/S_AXI} \
+        intc_ip {/microblaze_mcu_axi_periph} \
+        master_apm {0}} [get_bd_intf_pins axi_gpio_0/S_AXI]
+    create_bd_port -dir O pmod_pin
+    connect_bd_net [get_bd_ports pmod_pin] [get_bd_pins axi_gpio_0/gpio_io_o]
+
+    add_files -fileset constrs_1 -norecurse pin_pmod.xdc
+}
+
+# Add Quad SPI for cold boot
+if { ${eembc_power} } {
+    create_bd_cell -type ip -vlnv xilinx.com:ip:axi_quad_spi:3.2 axi_quad_spi_0
+    set_property -dict [list CONFIG.C_SPI_MEMORY {3}] [get_bd_cells axi_quad_spi_0]
+    apply_bd_automation -rule xilinx.com:bd_rule:board -config { Board_Interface {qspi_flash ( Quad SPI Flash ) } Manual_Source {Auto}}  [get_bd_intf_pins axi_quad_spi_0/SPI_0]
+    apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {/mig_7series_0/ui_clk (83 MHz)} Clk_slave {Auto} Clk_xbar {/mig_7series_0/ui_clk (83 MHz)} Master {/microblaze_mcu (Periph)} Slave {/axi_quad_spi_0/AXI_LITE} intc_ip {/microblaze_mcu_axi_periph} master_apm {0}}  [get_bd_intf_pins axi_quad_spi_0/AXI_LITE]
+    set_property -dict [list CONFIG.CLKOUT3_USED {true} CONFIG.CLKOUT3_REQUESTED_OUT_FREQ {50} CONFIG.MMCM_CLKOUT2_DIVIDE {20} CONFIG.NUM_OUT_CLKS {3} CONFIG.CLKOUT3_JITTER {151.636} CONFIG.CLKOUT3_PHASE_ERROR {98.575}] [get_bd_cells clk_wizard]
+    connect_bd_net [get_bd_pins clk_wizard/clk_out3] [get_bd_pins axi_quad_spi_0/ext_spi_clk]
+
+    # Workaround: drop the clk_out3 net and clock ext_spi_clk from the MIG ui_clk instead
+    delete_bd_objs [get_bd_nets clk_wizard_clk_out3]
+    connect_bd_net [get_bd_pins axi_quad_spi_0/ext_spi_clk] [get_bd_pins mig_7series_0/ui_clk]
+
+    add_files -fileset constrs_1 -norecurse qspi.xdc
+}
+
+# Validate the design block we created
+validate_bd_design
+
+# Save design
+save_bd_design
+
+# Top level wrapper
+#make_wrapper -files [get_files ./${myproject}_vivado_accelerator/${project_name}.srcs/sources_1/bd/${design_name}/${design_name}.bd] -top
+#add_files -norecurse ./${myproject}_vivado_accelerator/${project_name}.srcs/sources_1/bd/${design_name}/hdl/${design_name}_wrapper.v
+add_files -norecurse $design_name\_wrapper.v
+
+# In the Verilog wrapper, enable configuration for the EEMBC power setup
+if { ${eembc_power} } {
+    set_property verilog_define EEMBC_POWER=1 [current_fileset]
+}
+
+# Run synthesis and implementation
+reset_run impl_1
+reset_run synth_1
+launch_runs impl_1 -to_step write_bitstream -jobs 6
+wait_on_run -timeout 360 impl_1
+
+# Reporting
+open_run impl_1
+report_utilization -file util.rpt -hierarchical -hierarchical_percentages
+
+# Export HDF file for SDK flow
+file mkdir ./hdf
+file copy -force ${myproject}_vivado_accelerator/${project_name}.runs/impl_1/${design_name}_wrapper.sysdef ./hdf/${design_name}_wrapper.hdf
diff --git a/hls4ml/templates/vivado_accelerator/arty-a7-100t/verilog_wrappers/design_1_wrapper.v b/hls4ml/templates/vivado_accelerator/arty-a7-100t/verilog_wrappers/design_1_wrapper.v
new file mode 100644
index 0000000000..3bbaf5f9be
--- /dev/null
+++ b/hls4ml/templates/vivado_accelerator/arty-a7-100t/verilog_wrappers/design_1_wrapper.v
@@ -0,0 +1,209 @@
+`timescale 1 ps / 1 ps
+// Top-level wrapper around design_1. With EEMBC_POWER: QSPI flash pads, PMOD UART and pmod_pin; otherwise the USB UART.
+module design_1_wrapper
+   (ddr3_sdram_addr,
+    ddr3_sdram_ba,
+    ddr3_sdram_cas_n,
+    ddr3_sdram_ck_n,
+    ddr3_sdram_ck_p,
+    ddr3_sdram_cke,
+    ddr3_sdram_cs_n,
+    ddr3_sdram_dm,
+    ddr3_sdram_dq,
+    ddr3_sdram_dqs_n,
+    ddr3_sdram_dqs_p,
+    ddr3_sdram_odt,
+    ddr3_sdram_ras_n,
+    ddr3_sdram_reset_n,
+    ddr3_sdram_we_n
+`ifdef EEMBC_POWER
+    ,
+    qspi_flash_io0_io,
+    qspi_flash_io1_io,
+    qspi_flash_io2_io,
+    qspi_flash_io3_io,
+    qspi_flash_sck_io,
+    qspi_flash_ss_io
+ `endif
+    ,
+    reset,
+    sys_clock
+`ifdef EEMBC_POWER
+    ,
+    pmod_uart_rxd,
+    pmod_uart_txd,
+    pmod_pin
+`else
+    ,
+    usb_uart_rxd,
+    usb_uart_txd
+`endif
+    );
+  output [13:0]ddr3_sdram_addr;
+  output [2:0]ddr3_sdram_ba;
+  output ddr3_sdram_cas_n;
+  output [0:0]ddr3_sdram_ck_n;
+  output [0:0]ddr3_sdram_ck_p;
+  output [0:0]ddr3_sdram_cke;
+  output [0:0]ddr3_sdram_cs_n;
+  output [1:0]ddr3_sdram_dm;
+  inout [15:0]ddr3_sdram_dq;
+  inout [1:0]ddr3_sdram_dqs_n;
+  inout [1:0]ddr3_sdram_dqs_p;
+  output [0:0]ddr3_sdram_odt;
+  output ddr3_sdram_ras_n;
+  output ddr3_sdram_reset_n;
+  output ddr3_sdram_we_n;
+`ifdef EEMBC_POWER
+  inout qspi_flash_io0_io;
+  inout qspi_flash_io1_io;
+  inout qspi_flash_io2_io;
+  inout qspi_flash_io3_io;
+  inout qspi_flash_sck_io;
+  inout qspi_flash_ss_io;
+ `endif
+  input reset;
+  input sys_clock;
+`ifdef EEMBC_POWER
+  input pmod_uart_rxd;
+  output pmod_uart_txd;
+  output pmod_pin;
+`else
+  input usb_uart_rxd;
+  output usb_uart_txd;
+`endif
+  // Net declarations for the ports and for the _i/_o/_t signals between the QSPI buffers and design_1
+
+  wire [13:0]ddr3_sdram_addr;
+  wire [2:0]ddr3_sdram_ba;
+  wire ddr3_sdram_cas_n;
+  wire [0:0]ddr3_sdram_ck_n;
+  wire [0:0]ddr3_sdram_ck_p;
+  wire [0:0]ddr3_sdram_cke;
+  wire [0:0]ddr3_sdram_cs_n;
+  wire [1:0]ddr3_sdram_dm;
+  wire [15:0]ddr3_sdram_dq;
+  wire [1:0]ddr3_sdram_dqs_n;
+  wire [1:0]ddr3_sdram_dqs_p;
+  wire [0:0]ddr3_sdram_odt;
+  wire ddr3_sdram_ras_n;
+  wire ddr3_sdram_reset_n;
+  wire ddr3_sdram_we_n;
+`ifdef EEMBC_POWER
+  wire qspi_flash_io0_i;
+  wire qspi_flash_io0_io;
+  wire qspi_flash_io0_o;
+  wire qspi_flash_io0_t;
+  wire qspi_flash_io1_i;
+  wire qspi_flash_io1_io;
+  wire qspi_flash_io1_o;
+  wire qspi_flash_io1_t;
+  wire qspi_flash_io2_i;
+  wire qspi_flash_io2_io;
+  wire qspi_flash_io2_o;
+  wire qspi_flash_io2_t;
+  wire qspi_flash_io3_i;
+  wire qspi_flash_io3_io;
+  wire qspi_flash_io3_o;
+  wire qspi_flash_io3_t;
+  wire qspi_flash_sck_i;
+  wire qspi_flash_sck_io;
+  wire qspi_flash_sck_o;
+  wire qspi_flash_sck_t;
+  wire qspi_flash_ss_i;
+  wire qspi_flash_ss_io;
+  wire qspi_flash_ss_o;
+  wire qspi_flash_ss_t;
+`else
+  wire usb_uart_rxd;
+  wire usb_uart_txd;
+`endif
+  wire reset;
+  wire sys_clock;
+  // One IOBUF per QSPI pad: connects the *_io port to the matching *_o (I), *_i (O) and *_t (T) nets
+`ifdef EEMBC_POWER
+  IOBUF qspi_flash_io0_iobuf
+       (.I(qspi_flash_io0_o),
+        .IO(qspi_flash_io0_io),
+        .O(qspi_flash_io0_i),
+        .T(qspi_flash_io0_t));
+  IOBUF qspi_flash_io1_iobuf
+       (.I(qspi_flash_io1_o),
+        .IO(qspi_flash_io1_io),
+        .O(qspi_flash_io1_i),
+        .T(qspi_flash_io1_t));
+  IOBUF qspi_flash_io2_iobuf
+       (.I(qspi_flash_io2_o),
+        .IO(qspi_flash_io2_io),
+        .O(qspi_flash_io2_i),
+        .T(qspi_flash_io2_t));
+  IOBUF qspi_flash_io3_iobuf
+       (.I(qspi_flash_io3_o),
+        .IO(qspi_flash_io3_io),
+        .O(qspi_flash_io3_i),
+        .T(qspi_flash_io3_t));
+  IOBUF qspi_flash_sck_iobuf
+       (.I(qspi_flash_sck_o),
+        .IO(qspi_flash_sck_io),
+        .O(qspi_flash_sck_i),
+        .T(qspi_flash_sck_t));
+  IOBUF qspi_flash_ss_iobuf
+       (.I(qspi_flash_ss_o),
+        .IO(qspi_flash_ss_io),
+        .O(qspi_flash_ss_i),
+        .T(qspi_flash_ss_t));
+`endif
+  // Instance of the design_1 block design; its port list follows the same EEMBC_POWER split as the module ports
+  design_1 design_1_i
+       (.ddr3_sdram_addr(ddr3_sdram_addr),
+        .ddr3_sdram_ba(ddr3_sdram_ba),
+        .ddr3_sdram_cas_n(ddr3_sdram_cas_n),
+        .ddr3_sdram_ck_n(ddr3_sdram_ck_n),
+        .ddr3_sdram_ck_p(ddr3_sdram_ck_p),
+        .ddr3_sdram_cke(ddr3_sdram_cke),
+        .ddr3_sdram_cs_n(ddr3_sdram_cs_n),
+        .ddr3_sdram_dm(ddr3_sdram_dm),
+        .ddr3_sdram_dq(ddr3_sdram_dq),
+        .ddr3_sdram_dqs_n(ddr3_sdram_dqs_n),
+        .ddr3_sdram_dqs_p(ddr3_sdram_dqs_p),
+        .ddr3_sdram_odt(ddr3_sdram_odt),
+        .ddr3_sdram_ras_n(ddr3_sdram_ras_n),
+        .ddr3_sdram_reset_n(ddr3_sdram_reset_n),
+        .ddr3_sdram_we_n(ddr3_sdram_we_n)
+`ifdef EEMBC_POWER
+        ,
+        .qspi_flash_io0_i(qspi_flash_io0_i),
+        .qspi_flash_io0_o(qspi_flash_io0_o),
+        .qspi_flash_io0_t(qspi_flash_io0_t),
+        .qspi_flash_io1_i(qspi_flash_io1_i),
+        .qspi_flash_io1_o(qspi_flash_io1_o),
+        .qspi_flash_io1_t(qspi_flash_io1_t),
+        .qspi_flash_io2_i(qspi_flash_io2_i),
+        .qspi_flash_io2_o(qspi_flash_io2_o),
+        .qspi_flash_io2_t(qspi_flash_io2_t),
+        .qspi_flash_io3_i(qspi_flash_io3_i),
+        .qspi_flash_io3_o(qspi_flash_io3_o),
+        .qspi_flash_io3_t(qspi_flash_io3_t),
+        .qspi_flash_sck_i(qspi_flash_sck_i),
+        .qspi_flash_sck_o(qspi_flash_sck_o),
+        .qspi_flash_sck_t(qspi_flash_sck_t),
+        .qspi_flash_ss_i(qspi_flash_ss_i),
+        .qspi_flash_ss_o(qspi_flash_ss_o),
+        .qspi_flash_ss_t(qspi_flash_ss_t)
+ `endif
+        ,
+        .reset(reset),
+        .sys_clock(sys_clock)
+`ifdef EEMBC_POWER
+        ,
+        .pmod_uart_rxd(pmod_uart_rxd),
+        .pmod_uart_txd(pmod_uart_txd),
+        .pmod_pin(pmod_pin)
+`else
+        ,
+        .usb_uart_rxd(usb_uart_rxd),
+        .usb_uart_txd(usb_uart_txd)
+`endif
+
+        );
+endmodule
diff --git a/hls4ml/templates/vivado_accelerator/arty-a7-100t/xdc_constraints/pin_pmod.xdc b/hls4ml/templates/vivado_accelerator/arty-a7-100t/xdc_constraints/pin_pmod.xdc
new file mode 100644
index 0000000000..321279b709
--- /dev/null
+++ b/hls4ml/templates/vivado_accelerator/arty-a7-100t/xdc_constraints/pin_pmod.xdc
@@ -0,0 +1,4 @@
+# AXI GPIO controlled pin on Pmod Header JD
+
+# Output pin, PMOD D pin 3 (JD4), IO_L13N_T2_MRCC_35, F4, Blue cable
+set_property -dict { PACKAGE_PIN F4 IOSTANDARD LVCMOS33 } [get_ports { pmod_pin }];
diff --git a/hls4ml/templates/vivado_accelerator/arty-a7-100t/xdc_constraints/qspi.xdc b/hls4ml/templates/vivado_accelerator/arty-a7-100t/xdc_constraints/qspi.xdc
new file mode 100644
index 0000000000..6019da47bd
--- /dev/null
+++ b/hls4ml/templates/vivado_accelerator/arty-a7-100t/xdc_constraints/qspi.xdc
@@ -0,0 +1,13 @@
+#
+# See also
+# https://github.com/Digilent/digilent-xdc/blob/master/Arty-A7-100-Master.xdc
+#
+
+set_property BITSTREAM.GENERAL.COMPRESS TRUE [current_design]
+
+# Quad SPI Flash
+set_property -dict { PACKAGE_PIN L13   IOSTANDARD LVCMOS33 } [get_ports { qspi_flash_ss_io }]; #IO_L6P_T0_FCS_B_14 Sch=qspi_cs
+set_property -dict { PACKAGE_PIN K17   IOSTANDARD LVCMOS33 } [get_ports { qspi_flash_io0_io }]; #IO_L1P_T0_D00_MOSI_14 Sch=qspi_dq[0]
+set_property -dict { PACKAGE_PIN K18   IOSTANDARD LVCMOS33 } [get_ports { qspi_flash_io1_io }]; #IO_L1N_T0_D01_DIN_14 Sch=qspi_dq[1]
+set_property -dict { PACKAGE_PIN L14   IOSTANDARD LVCMOS33 } [get_ports { qspi_flash_io2_io }]; #IO_L2P_T0_D02_14 Sch=qspi_dq[2]
+set_property -dict { PACKAGE_PIN M14   IOSTANDARD LVCMOS33 } [get_ports { qspi_flash_io3_io }]; #IO_L2N_T0_D03_14 Sch=qspi_dq[3]
diff --git a/hls4ml/templates/vivado_accelerator/arty-a7-100t/xdc_constraints/uart_pmod.xdc b/hls4ml/templates/vivado_accelerator/arty-a7-100t/xdc_constraints/uart_pmod.xdc
new file mode 100644
index 0000000000..2cf181a20a
--- /dev/null
+++ b/hls4ml/templates/vivado_accelerator/arty-a7-100t/xdc_constraints/uart_pmod.xdc
@@ -0,0 +1,8 @@
+# Expose UART Interface on Pmod Header JA
+# You may need https://www.sparkfun.com/products/9873
+
+# RX uart, PMOD A pin 2 (JA2), IO_L4P_T0_15, B11, BROWN cable
+set_property -dict { PACKAGE_PIN B11 IOSTANDARD LVCMOS33 } [get_ports { pmod_uart_rxd }];
+
+# TX uart, PMOD A pin 3 (JA3), IO_L4N_T0_15, A11, RED cable
+set_property -dict { PACKAGE_PIN A11 IOSTANDARD LVCMOS33 } [get_ports { pmod_uart_txd }];
diff --git a/hls4ml/templates/vivado_accelerator/pynq-z1/c_drivers/sdk/Makefile b/hls4ml/templates/vivado_accelerator/pynq-z1/c_drivers/sdk/Makefile
new file mode 100644
index 0000000000..03ab9b8de7
--- /dev/null
+++ b/hls4ml/templates/vivado_accelerator/pynq-z1/c_drivers/sdk/Makefile
@@ -0,0 +1,33 @@
+DESIGN := design_1
+
+help:
+	@echo "INFO: make <TAB> to show targets"
+.PHONY: help
+
+--setup:
+	xsct ./setup.tcl $(DESIGN)
+.PHONY: --setup
+
+sdk: --setup
+	rm -f $(DESIGN)_standalone/src/helloworld.c
+	cd  $(DESIGN)_standalone/src && ln -sf ../../common/main.c main.c
+	cd  $(DESIGN)_standalone/src && ln -sf ../../common/data.h data.h
+.PHONY: sdk
+
+gui:
+	xsdk --workspace . &
+.PHONY: gui
+
+clean:
+	rm -rf $(DESIGN)_platform
+	rm -rf $(DESIGN)_standalone
+	rm -rf $(DESIGN)_standalone_bsp
+	rm -rf RemoteSystemsTempFiles
+	rm -rf .Xil
+	rm -rf .metadata
+	rm -f *.log
+.PHONY: clean
+
+ultraclean: clean
+	rm -rf hdf/*.hdf
+.PHONY: ultraclean
diff --git a/hls4ml/templates/vivado_accelerator/pynq-z1/c_drivers/sdk/common/main.c b/hls4ml/templates/vivado_accelerator/pynq-z1/c_drivers/sdk/common/main.c
new file mode 100644
index 0000000000..7dd2be22a8
--- /dev/null
+++ b/hls4ml/templates/vivado_accelerator/pynq-z1/c_drivers/sdk/common/main.c
@@ -0,0 +1,262 @@
+/**
+ *
+ * Set Heap Size in ldscript.ld to 0x1000000 (16MB)
+ *
+ */
+
+#include "xmyproject_axi.h"  /* TODO: design-dependent name */
+#include "stdio.h"       /* PRINTF */
+#include "unistd.h"      /* sleep */
+#include "stdlib.h"
+#include "malloc.h"
+#include "assert.h"
+#include "xil_io.h"      /* peripheral read/write wrappers */
+#include "xtime_l.h"     /* to measure performance of the system */
+#include "platform.h"    /* platform init/cleanup functions */
+#include "xil_cache.h"   /* enable/disable caches etc */
+#include "xil_printf.h"  /* UART debug print functions */
+#include "xparameters.h" /* peripherals base addresses */
+
+#include "data.h"
+
+//#define __DEBUG__
+
+#define MAX_PRINT_ELEMENTS (16)
+
+#define PRINTF printf
+
+const unsigned INPUT_N_ELEMENTS = N_SAMPLES * N_X_INPUTS;
+const unsigned OUTPUT_N_ELEMENTS = N_SAMPLES * N_Y_OUTPUTS;
+
+#if 1
+/* Accelerator verification */
+#define REFERENCE_OUTPUTS data_y_hls_outputs
+#else
+/* Accelerator validation */
+#define REFERENCE_OUTPUTS data_y_outputs
+//#define REFERENCE_OUTPUTS data_y_keras_outputs
+#endif
+
+unsigned get_max(float *data, unsigned n_elements) { /* argmax; the last index wins ties; 0 if empty */
+	float max_value = (n_elements > 0) ? data[0] : 0.0f; /* seed from the data so all-negative inputs work */
+	unsigned max_index = 0;
+	for (unsigned i = 1; i < n_elements; i++)
+		if (data[i] >= max_value) {
+			max_index = i;
+			max_value = data[i];
+		}
+	return max_index;
+}
+
+float *inputs_mem = NULL;
+float *outputs_mem = NULL;
+float *reference_mem = NULL;
+
+/* Accelerator configuration */
+XMyproject_axi accelerator; /* TODO: design-dependent name */
+XMyproject_axi_Config *accelerator_cfg; /* TODO: design-dependent name */
+
+/* Accelerator initialization routine */
+void init_accelerators() {
+    /* Look up the accelerator configuration and initialize the global driver handle. */
+    PRINTF("INFO: Initializing accelerator\r\n");
+    accelerator_cfg = XMyproject_axi_LookupConfig(XPAR_MYPROJECT_AXI_0_DEVICE_ID); /* TODO: design-dependent name */
+    /* A NULL configuration is reported as a failure instead of being silently ignored. */
+    int status = accelerator_cfg ? XMyproject_axi_CfgInitialize(&accelerator, accelerator_cfg) : XST_FAILURE; /* TODO: design-dependent name */
+    if (status != XST_SUCCESS) {
+        PRINTF("ERROR: Initializing accelerator\r\n");
+    }
+}
+
+/* Reference implementation of the accelerator in software */
+int sw_reference_implementation(float *sw_inputs_mem, float *sw_outputs_mem, unsigned n_samples, unsigned n_X_inputs, unsigned n_y_ouputs) {
+    /* Fills sw_outputs_mem from the pre-compiled REFERENCE_OUTPUTS table (see data.h); the inputs are not read */
+    const unsigned n_values = n_samples * n_y_ouputs;
+#ifdef __DEBUG__
+	PRINTF("INFO: Reference outputs are pre-compiled. It would be nice to run a software model here.\r\n");
+#endif
+    for (unsigned k = 0; k != n_values; ++k)
+        sw_outputs_mem[k] = REFERENCE_OUTPUTS[k];
+    return 0;
+}
+
+/* Profiling function */
+double get_elapsed_time(XTime start, XTime stop) {
+    return (double)(stop - start) / (COUNTS_PER_SECOND); /* tick delta converted to seconds */
+}
+
+/* Dump data to the console */
+void dump_data(const char* label, float* data, unsigned n_samples, unsigned feature_count) {
+    /* Print at most MAX_PRINT_ELEMENTS rows, one sample (feature_count values) per row */
+    const unsigned rows = (n_samples < MAX_PRINT_ELEMENTS) ? n_samples : MAX_PRINT_ELEMENTS;
+    PRINTF("INFO:   %s[%u][%u]:\r\n", label, n_samples, feature_count);
+    for (unsigned row = 0; row < rows; row++) {
+        const float *sample = data + row * feature_count;
+        PRINTF("INFO:     [%u] ", row);
+        for (unsigned col = 0; col < feature_count; col++)
+            PRINTF("%f ", sample[col]);
+        PRINTF("\r\n");
+    }
+}
+
+/* The top of the hill :-) */
+int main(int argc, char** argv) {
+    XTime start, stop;
+    double calibration_time;
+    double sw_elapsed = 0;
+    double hw_elapsed = 0;
+    double cache_elapsed = 0;
+    unsigned hw_errors;
+
+    char __attribute__ ((unused)) dummy; /* dummy input */
+
+    /* Initialize platform (uart and caches) */
+    init_platform();
+
+    PRINTF("\r\n");
+    PRINTF("INFO: ==================================================\r\n");
+    PRINTF("INFO: XMyproject_axi (w/ polling)\r\n"); /* TODO: design-dependent name */
+    PRINTF("INFO: ==================================================\r\n");
+
+    init_accelerators();
+
+    inputs_mem = malloc(INPUT_N_ELEMENTS * sizeof(float));
+    outputs_mem = malloc(OUTPUT_N_ELEMENTS * sizeof(float));
+    reference_mem = malloc(OUTPUT_N_ELEMENTS * sizeof(float));
+    if (!inputs_mem || !outputs_mem || !reference_mem) { /* heap likely too small: set Heap Size in ldscript.ld to 0x1000000 (16MB) */
+        PRINTF("ERROR: Memory allocation failed\r\n");
+        cleanup_platform();
+        return 1;
+    }
+
+    /* Calibration */
+    XTime_GetTime(&start);
+    sleep(1);
+    XTime_GetTime(&stop);
+    calibration_time = get_elapsed_time(start, stop);
+    PRINTF("INFO: Time calibration for one second (%lf sec)\r\n", calibration_time);
+
+    /* Initialize memory */
+    PRINTF("INFO: Initialize memory\r\n");
+    PRINTF("INFO:   - Samples count: %u\r\n", N_SAMPLES); /* Same as dst_SAMPLE_COUNT */
+    PRINTF("INFO:   - Inputs count: %u\r\n", N_X_INPUTS);
+    PRINTF("INFO:   - Outputs count: %u\r\n", N_Y_OUTPUTS);
+    PRINTF("INFO:   - Data size: %u B\r\n", (unsigned)sizeof(float)); /* casts keep the argument type in line with %u */
+    PRINTF("INFO:   - Total input size: %u B, %.2f KB, %.2f MB\r\n", (unsigned)(N_X_INPUTS * N_SAMPLES * sizeof(float)), (N_X_INPUTS * N_SAMPLES * sizeof(float)) / (float)1024, (N_X_INPUTS * N_SAMPLES * sizeof(float)) / (float)(1024*1024));
+    PRINTF("INFO:   - Total output size: %u B, %.2f KB, %.2f MB\r\n", (unsigned)(N_Y_OUTPUTS * N_SAMPLES * sizeof(float)), (N_Y_OUTPUTS * N_SAMPLES * sizeof(float)) / (float)1024, (N_Y_OUTPUTS * N_SAMPLES * sizeof(float)) / (float)(1024*1024));
+
+    for (unsigned i = 0; i < INPUT_N_ELEMENTS; i++) {
+        inputs_mem[i] = data_X_inputs[i];
+    }
+    for (unsigned i = 0; i < OUTPUT_N_ELEMENTS; i++) {
+        outputs_mem[i] = 0.0f;
+    }
+
+    /* ****** SW REFERENCE ****** */
+    PRINTF("INFO: ==================================================\r\n");
+    PRINTF("INFO: Start SW reference implementation\r\n");
+    XTime_GetTime(&start);
+    sw_reference_implementation(inputs_mem, reference_mem, N_SAMPLES, N_X_INPUTS, N_Y_OUTPUTS);
+    XTime_GetTime(&stop);
+    sw_elapsed = get_elapsed_time(start, stop);
+    PRINTF("INFO: ==================================================\r\n");
+    PRINTF("INFO: Press any key to start:\r\n");
+    dummy = inbyte();
+
+    /* ****** HW ACCELERATOR ****** */
+    PRINTF("INFO: Start HW accelerator\r\n");
+
+    XTime_GetTime(&start);
+    Xil_DCacheFlushRange((UINTPTR)inputs_mem, INPUT_N_ELEMENTS * sizeof(float));
+    Xil_DCacheFlushRange((UINTPTR)outputs_mem, OUTPUT_N_ELEMENTS * sizeof(float));
+    Xil_DCacheFlushRange((UINTPTR)reference_mem, OUTPUT_N_ELEMENTS * sizeof(float));
+    XTime_GetTime(&stop);
+    cache_elapsed = get_elapsed_time(start, stop);
+
+    for (unsigned j = 0; j < N_SAMPLES; j++) {
+    	float *inputs_mem_i = inputs_mem + j * N_X_INPUTS;
+    	float *outputs_mem_i = outputs_mem + j * N_Y_OUTPUTS;
+
+    	/* Configure the accelerator */
+    	XTime_GetTime(&start);
+        XMyproject_axi_Set_in_r(&accelerator, (unsigned)inputs_mem_i); /* TODO: design-dependent name */
+    	XMyproject_axi_Set_out_r(&accelerator, (unsigned)outputs_mem_i); /* TODO: design-dependent name */
+
+    	XMyproject_axi_Start(&accelerator); /* TODO: design-dependent name */
+
+    	/* Polling */
+    	while (!XMyproject_axi_IsDone(&accelerator)); /* TODO: design-dependent name */
+
+    	/* Get error status */
+    	//hw_flags = XMyproject_axi_Get_return(&accelerator); /* TODO: design-dependent name */
+    	XTime_GetTime(&stop);
+    	hw_elapsed += get_elapsed_time(start, stop);
+    }
+
+    XTime_GetTime(&start);
+    Xil_DCacheFlushRange((UINTPTR)outputs_mem, OUTPUT_N_ELEMENTS * sizeof(float));
+    XTime_GetTime(&stop);
+    cache_elapsed += get_elapsed_time(start, stop);
+
+    PRINTF("INFO: HW accelerator done!\r\n");
+
+    /* ****** VALIDATION ****** */
+    PRINTF("INFO: ================== Verification ==================\r\n");
+#ifdef __DEBUG__
+    PRINTF("INFO: Dump data\r\n");
+    dump_data("inputs_mem", inputs_mem, N_SAMPLES, N_X_INPUTS);
+    dump_data("outputs_mem", outputs_mem, N_SAMPLES, N_Y_OUTPUTS);
+    dump_data("reference_mem", reference_mem, N_SAMPLES, N_Y_OUTPUTS);
+#endif
+
+#ifdef __DEBUG__
+    PRINTF("INFO: SW execution time: %f sec\r\n", sw_elapsed);
+#endif
+    PRINTF("INFO: HW-acceleration exec. time (%d inferences):\r\n", N_SAMPLES);
+    PRINTF("INFO:   - total %f sec\r\n", hw_elapsed);
+    PRINTF("INFO:   - per-inference %.12f sec (%f ns)\r\n", hw_elapsed / (N_SAMPLES), (hw_elapsed * 1.0e9) / (N_SAMPLES)); /* sec -> ns is a factor of 1e9 */
+    PRINTF("INFO: Cache flush time: %f sec\r\n", cache_elapsed);
+#ifdef __DEBUG__
+    PRINTF("INFO: HW/SW speedup (the software is fake so this does not count...): %.2f X\r\n", (sw_elapsed >= (hw_elapsed+cache_elapsed))?(sw_elapsed/(hw_elapsed+cache_elapsed)):-((hw_elapsed+cache_elapsed)/sw_elapsed));
+#endif
+
+    hw_errors = 0;
+#if 1
+    /* Accelerator verification */
+    for (unsigned i = 0; i < OUTPUT_N_ELEMENTS; i++) {
+        if (outputs_mem[i] != reference_mem[i]) {
+            PRINTF("ERROR: [%u]: Accelerator HW %f != SW %f\r\n", i, outputs_mem[i], reference_mem[i]);
+            hw_errors++;
+        }
+    }
+    PRINTF("INFO: Total errors = %u (out of %u elements)\r\n", hw_errors, OUTPUT_N_ELEMENTS);
+    if (hw_errors > 0)
+        PRINTF("INFO: Verification: FAIL\r\n");
+    else
+        PRINTF("INFO: Verification: PASS!\r\n");
+#else
+    /* Accelerator validation */
+    for (unsigned s = 0; s < N_SAMPLES; s++) {
+    	unsigned ref_digit = get_max(reference_mem + s * N_Y_OUTPUTS, N_Y_OUTPUTS);
+    	unsigned hw_digit = get_max(outputs_mem + s * N_Y_OUTPUTS, N_Y_OUTPUTS);
+    	if (hw_digit != ref_digit) {
+#ifdef __DEBUG__
+    		PRINTF("ERROR: [%u]: Accelerator HW %u != SW %u\r\n", s, hw_digit, ref_digit);
+#endif
+    	    hw_errors++;
+    	}
+    }
+    float error_rate = (hw_errors / (float)(N_SAMPLES)) * 100.0;
+    float accuracy = 100 - ((hw_errors / (float)(N_SAMPLES)) * 100.0);
+    PRINTF("INFO: Total errors = %u (out of %d digits)\r\n", hw_errors, N_SAMPLES);
+    PRINTF("INFO: Error rate = %.2f %%\r\n", error_rate);
+    PRINTF("INFO: Accuracy = %.2f %%\r\n", accuracy);
+#endif
+    PRINTF("INFO: ==================================================\r\n");
+
+    cleanup_platform();
+
+    return 0;
+}
+
+
diff --git a/hls4ml/templates/vivado_accelerator/pynq-z1/c_drivers/sdk/setup.tcl b/hls4ml/templates/vivado_accelerator/pynq-z1/c_drivers/sdk/setup.tcl
new file mode 100644
index 0000000000..5e9e92d501
--- /dev/null
+++ b/hls4ml/templates/vivado_accelerator/pynq-z1/c_drivers/sdk/setup.tcl
@@ -0,0 +1,14 @@
+# See 
+# https://www.xilinx.com/html_docs/xilinx2019_1/SDK_Doc/xsct/intro/xsct_introduction.html
+
+setws .
+if { $::argc == 1 } {
+    set myproject [lindex $::argv 0]
+    createhw -name ${myproject}\_platform -hwspec ../hdf/${myproject}\_wrapper.hdf
+    createapp -name ${myproject}\_standalone -app {Hello World} -proc ps7_cortexa9_0 -hwproject ${myproject}\_platform -os standalone
+    configapp -app ${myproject}\_standalone build-config release
+    configapp -app ${myproject}\_standalone -add linker-misc {-Wl,--defsym=_HEAP_SIZE=0x1000000}
+    configapp -app ${myproject}\_standalone -add linker-misc {-Wl,--defsym=_STACK_SIZE=0x40000}
+    projects -build
+    #configapp -app ${myproject}\_standalone -add define-compiler-symbols {FLAG=VALUE}
+}
diff --git a/hls4ml/templates/vivado_accelerator/pynq-z1/python_drivers/axi_stream_driver.py b/hls4ml/templates/vivado_accelerator/pynq-z1/python_drivers/axi_stream_driver.py
new file mode 100644
index 0000000000..4adb187ab4
--- /dev/null
+++ b/hls4ml/templates/vivado_accelerator/pynq-z1/python_drivers/axi_stream_driver.py
@@ -0,0 +1,75 @@
+from pynq import DefaultHierarchy, DefaultIP, allocate
+from pynq import Overlay
+from datetime import datetime
+import pynq.lib.dma
+import numpy as np
+
+
+class NeuralNetworkOverlay(Overlay):
+    """Overlay that moves data to and from the accelerator through the `hier_0.axi_dma_0` DMA channels."""
+
+    def __init__(self, bitfile_name, x_shape, y_shape, dtype=np.float32, dtbo=None, download=True, ignore_version=False,
+                 device=None):
+        # Forward the caller's overlay options instead of silently replacing them with the defaults.
+        super().__init__(bitfile_name, dtbo=dtbo, download=download, ignore_version=ignore_version, device=device)
+        self.sendchannel = self.hier_0.axi_dma_0.sendchannel
+        self.recvchannel = self.hier_0.axi_dma_0.recvchannel
+        self.input_buffer = allocate(shape=x_shape, dtype=dtype)
+        self.output_buffer = allocate(shape=y_shape, dtype=dtype)
+
+    def _print_dt(self, timea, timeb, N):
+        """Print and return the elapsed seconds between two datetimes and the resulting rate for N samples."""
+        dt = (timeb - timea)
+        dts = dt.total_seconds()  # unlike seconds + microseconds, this also accounts for whole days
+        rate = N / dts if dts > 0 else float("inf")
+        print("Classified {} samples in {} seconds ({} inferences / s)".format(N, dts, rate))
+        return dts, rate
+
+    def predict(self, X, debug=False, profile=False, encode=None, decode=None):
+        """
+        Obtain the predictions of the NN implemented in the FPGA.
+        Parameters:
+        - X : the input vector. Should be numpy ndarray.
+        - debug : boolean. Set it to `True` to print a message after each DMA transfer step.
+        - profile : boolean. Set it to `True` to print the performance of the algorithm in terms of `inference/s`.
+        - encode/decode: optional callables applied to the input before sending and to the output after
+                  receiving. They relate to the buffer `dtype` given to the constructor: if the accelerator
+                  uses 'float' for the 'data' AXI-Stream field, 'np.float32' is the correct dtype and no
+                  encoding is needed. If it uses 'ap_fixed<A,B>', 'np.intA' is the correct dtype (note that A
+                  cannot be any integer value; it has to be one of {..., 8, 16, 32, ...}, check the `numpy`
+                  doc for more info) and the encoding/decoding has to be computed by the PS. For example, for
+                  'ap_fixed<16,6>' the following functions convert 'float' <-> 'ap_fixed<16,6>':
+                  ```
+                    def encode(xi):
+                        return np.int16(round(xi * 2**10)) # note 2**10 = 2**(A-B)
+                    def decode(yi):
+                        return yi * 2**-10
+                    encode_v = np.vectorize(encode) # to apply them element-wise
+                    decode_v = np.vectorize(decode)
+                  ```
+        - return: the output buffer, or the result of `decode` on it when `decode` is given. With
+                  `profile=True` a tuple `(output, elapsed seconds, inferences per second)` is returned.
+        """
+        if profile:
+            timea = datetime.now()
+        if encode is not None:
+            X = encode(X)
+        self.input_buffer[:] = X
+        self.sendchannel.transfer(self.input_buffer)
+        self.recvchannel.transfer(self.output_buffer)
+        if debug:
+            print("Transfer OK")
+        self.sendchannel.wait()
+        if debug:
+            print("Send OK")
+        self.recvchannel.wait()
+        if debug:
+            print("Receive OK")
+        # Decode into a local name: rebinding self.output_buffer to the decoded array would drop the
+        # buffer created by allocate(), which the next predict() call hands to the receive channel.
+        result = self.output_buffer if decode is None else decode(self.output_buffer)
+
+        if profile:
+            dts, rate = self._print_dt(timea, datetime.now(), len(X))
+            return result, dts, rate
+        return result
\ No newline at end of file
diff --git a/hls4ml/templates/vivado_accelerator/pynq-z1/tcl_scripts/axi_lite_design.tcl b/hls4ml/templates/vivado_accelerator/pynq-z1/tcl_scripts/axi_lite_design.tcl
new file mode 100644
index 0000000000..4f6847ae70
--- /dev/null
+++ b/hls4ml/templates/vivado_accelerator/pynq-z1/tcl_scripts/axi_lite_design.tcl
@@ -0,0 +1,26 @@
+set tcldir [file dirname [info script]]
+source [file join $tcldir project.tcl]
+
+create_project project_1 ${myproject}_vivado_accelerator -part xc7z020clg400-1 -force
+# Use the PYNQ-Z1 board definition: this template targets pynq-z1, not pynq-z2
+set_property board_part www.digilentinc.com:pynq-z1:part0:1.0 [current_project]
+set_property  ip_repo_paths  ${myproject}_prj [current_project]
+update_ip_catalog
+
+# Create Block Designer design
+create_bd_design "design_1"
+create_bd_cell -type ip -vlnv xilinx.com:ip:processing_system7:5.5 processing_system7_0
+apply_bd_automation -rule xilinx.com:bd_rule:processing_system7 -config {make_external "FIXED_IO, DDR" apply_board_preset "1" Master "Disable" Slave "Disable" }  [get_bd_cells processing_system7_0]
+create_bd_cell -type ip -vlnv xilinx.com:hls:${myproject}_axi:1.0 ${myproject}_axi_0
+apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {Auto} Clk_xbar {Auto} Master {/processing_system7_0/M_AXI_GP0} Slave {/${myproject}_axi_0/s_axi_AXILiteS} ddr_seg {Auto} intc_ip {New AXI Interconnect} master_apm {0}}  [get_bd_intf_pins ${myproject}_axi_0/s_axi_AXILiteS]
+
+make_wrapper -files [get_files ./${myproject}_vivado_accelerator/project_1.srcs/sources_1/bd/design_1/design_1.bd] -top
+add_files -norecurse ./${myproject}_vivado_accelerator/project_1.srcs/sources_1/bd/design_1/hdl/design_1_wrapper.v
+
+reset_run impl_1
+reset_run synth_1
+launch_runs impl_1 -to_step write_bitstream -jobs 6
+wait_on_run -timeout 360 impl_1
+
+open_run impl_1
+report_utilization -file util.rpt -hierarchical -hierarchical_percentages
diff --git a/hls4ml/templates/vivado_accelerator/pynq-z1/tcl_scripts/axi_master_design.tcl b/hls4ml/templates/vivado_accelerator/pynq-z1/tcl_scripts/axi_master_design.tcl
new file mode 100644
index 0000000000..6de05e15a7
--- /dev/null
+++ b/hls4ml/templates/vivado_accelerator/pynq-z1/tcl_scripts/axi_master_design.tcl
@@ -0,0 +1,88 @@
+set tcldir [file dirname [info script]]
+source [file join $tcldir project.tcl]
+
+# Project names
+set project_name "project_1"
+set design_name "design_1"
+set hls_solution_name "solution1"
+set ps_name "processing_system7_0"
+set acc_name "${myproject}_axi_0"
+set part_name "xc7z020clg400-1"
+set board_name "www.digilentinc.com:pynq-z1:part0:1.0"
+
+# Set board and chip part names
+create_project ${project_name} ${myproject}_vivado_accelerator -part ${part_name} -force
+set_property board_part ${board_name} [current_project]
+
+# Create block design
+create_bd_design ${design_name}
+
+# Setup IP repo
+#set_property  ip_repo_paths ${myproject}_prj [current_project]
+set_property  ip_repo_paths ${myproject}_prj/${hls_solution_name}/impl/ip [current_project]
+update_ip_catalog
+
+# Create and setup PS
+create_bd_cell -type ip -vlnv xilinx.com:ip:processing_system7:5.5 ${ps_name}
+apply_bd_automation -rule xilinx.com:bd_rule:processing_system7 -config {make_external "FIXED_IO, DDR" apply_board_preset "1" Master "Disable" Slave "Disable" }  [get_bd_cells ${ps_name}]
+set_property -dict [list CONFIG.PCW_USE_S_AXI_GP0 {1} CONFIG.PCW_USE_FABRIC_INTERRUPT {1} CONFIG.PCW_IRQ_F2P_INTR {1}] [get_bd_cells ${ps_name}]
+
+# Create accelerator. NOTE(review): the IP name is hard-coded to myproject_axi while acc_name is derived from ${myproject} -- confirm these always match
+create_bd_cell -type ip -vlnv xilinx.com:hls:myproject_axi:1.0 ${acc_name}
+
+# Wiring
+apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { \
+    Clk_master {Auto} \
+    Clk_slave {Auto} \
+    Clk_xbar {Auto} \
+    Master {/myproject_axi_0/m_axi_IN_BUS} \
+    Slave {/processing_system7_0/S_AXI_GP0} \
+    intc_ip {Auto} \
+    master_apm {0}} [get_bd_intf_pins processing_system7_0/S_AXI_GP0]
+
+apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { \
+    Clk_master {Auto} \
+    Clk_slave {Auto} \
+    Clk_xbar {Auto} \
+    Master {/processing_system7_0/M_AXI_GP0} \
+    Slave {/myproject_axi_0/s_axi_CTRL_BUS} \
+    intc_ip {New AXI Interconnect} \
+    master_apm {0}} [get_bd_intf_pins myproject_axi_0/s_axi_CTRL_BUS]
+
+apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { \
+    Clk_master {/processing_system7_0/FCLK_CLK0 (100 MHz)} \
+    Clk_slave {/processing_system7_0/FCLK_CLK0 (100 MHz)} \
+    Clk_xbar {/processing_system7_0/FCLK_CLK0 (100 MHz)} \
+    Master {/myproject_axi_0/m_axi_OUT_BUS} \
+    Slave {/processing_system7_0/S_AXI_GP0} \
+    intc_ip {/axi_smc} \
+    master_apm {0}} [get_bd_intf_pins myproject_axi_0/m_axi_OUT_BUS]
+
+# Wiring interrupt signal
+connect_bd_net [get_bd_pins myproject_axi_0/interrupt] [get_bd_pins processing_system7_0/IRQ_F2P]
+
+# Top level wrapper
+make_wrapper -files [get_files ./${myproject}_vivado_accelerator/${project_name}.srcs/sources_1/bd/${design_name}/${design_name}.bd] -top
+add_files -norecurse ./${myproject}_vivado_accelerator/${project_name}.srcs/sources_1/bd/${design_name}/hdl/${design_name}_wrapper.v
+
+# Memory mapping
+delete_bd_objs [get_bd_addr_segs myproject_axi_0/Data_m_axi_IN_BUS/SEG_processing_system7_0_GP0_QSPI_LINEAR]
+delete_bd_objs [get_bd_addr_segs -excluded myproject_axi_0/Data_m_axi_IN_BUS/SEG_processing_system7_0_GP0_IOP]
+delete_bd_objs [get_bd_addr_segs -excluded myproject_axi_0/Data_m_axi_IN_BUS/SEG_processing_system7_0_GP0_M_AXI_GP0]
+delete_bd_objs [get_bd_addr_segs myproject_axi_0/Data_m_axi_OUT_BUS/SEG_processing_system7_0_GP0_QSPI_LINEAR]
+delete_bd_objs [get_bd_addr_segs -excluded myproject_axi_0/Data_m_axi_OUT_BUS/SEG_processing_system7_0_GP0_IOP]
+delete_bd_objs [get_bd_addr_segs -excluded myproject_axi_0/Data_m_axi_OUT_BUS/SEG_processing_system7_0_GP0_M_AXI_GP0]
+
+# Run synthesis and implementation
+reset_run impl_1
+reset_run synth_1
+launch_runs impl_1 -to_step write_bitstream -jobs 6
+wait_on_run -timeout 360 impl_1
+
+# Reporting
+open_run impl_1
+report_utilization -file util.rpt -hierarchical -hierarchical_percentages
+
+# Export HDF file for SDK flow
+file mkdir ./hdf
+file copy -force ${myproject}_vivado_accelerator/${project_name}.runs/impl_1/${design_name}_wrapper.sysdef ./hdf/${design_name}_wrapper.hdf
diff --git a/hls4ml/templates/vivado_accelerator/pynq-z1/tcl_scripts/axi_stream_design.tcl b/hls4ml/templates/vivado_accelerator/pynq-z1/tcl_scripts/axi_stream_design.tcl
new file mode 100644
index 0000000000..f5901c7f37
--- /dev/null
+++ b/hls4ml/templates/vivado_accelerator/pynq-z1/tcl_scripts/axi_stream_design.tcl
@@ -0,0 +1,59 @@
+#@todo: try to remove startgroup and endgroup and see if it still works
+set tcldir [file dirname [info script]]
+source [file join $tcldir project.tcl]
+
+create_project project_1 ${myproject}_vivado_accelerator -part xc7z020clg400-1 -force
+# Use the PYNQ-Z1 board definition: this template targets pynq-z1, not pynq-z2
+set_property board_part www.digilentinc.com:pynq-z1:part0:1.0 [current_project]
+set_property  ip_repo_paths  ${myproject}_prj [current_project]
+update_ip_catalog
+
+create_bd_design "design_1"
+
+startgroup
+create_bd_cell -type ip -vlnv xilinx.com:ip:processing_system7:5.5 processing_system7_0
+endgroup
+
+apply_bd_automation -rule xilinx.com:bd_rule:processing_system7 -config {make_external "FIXED_IO, DDR" apply_board_preset "1" Master "Disable" Slave "Disable" }  [get_bd_cells processing_system7_0]
+
+startgroup
+set_property -dict [list CONFIG.PCW_USE_S_AXI_HP0 {1}] [get_bd_cells processing_system7_0]
+endgroup
+
+startgroup
+create_bd_cell -type ip -vlnv xilinx.com:ip:axi_dma:7.1 axi_dma_0
+endgroup
+
+set_property -dict [list CONFIG.c_s_axis_s2mm_tdata_width.VALUE_SRC USER] [get_bd_cells axi_dma_0]
+set_property -dict [list CONFIG.c_include_sg {0} CONFIG.c_sg_length_width {26} CONFIG.c_sg_include_stscntrl_strm {0} CONFIG.c_m_axi_mm2s_data_width ${bit_width_hls_input} CONFIG.c_m_axis_mm2s_tdata_width ${bit_width_hls_input} CONFIG.c_mm2s_burst_size {256} CONFIG.c_s_axis_s2mm_tdata_width ${bit_width_hls_output} CONFIG.c_s_axis_s2mm_data_width ${bit_width_hls_output} CONFIG.c_s2mm_burst_size {256}] [get_bd_cells axi_dma_0]
+
+startgroup
+apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {Auto} Clk_xbar {Auto} Master {/processing_system7_0/M_AXI_GP0} Slave {/axi_dma_0/S_AXI_LITE} ddr_seg {Auto} intc_ip {New AXI Interconnect} master_apm {0}}  [get_bd_intf_pins axi_dma_0/S_AXI_LITE]
+
+apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {Auto} Clk_xbar {Auto} Master {/axi_dma_0/M_AXI_MM2S} Slave {/processing_system7_0/S_AXI_HP0} ddr_seg {Auto} intc_ip {New AXI Interconnect} master_apm {0}}  [get_bd_intf_pins processing_system7_0/S_AXI_HP0]
+endgroup
+
+apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {/processing_system7_0/FCLK_CLK0 (100 MHz)} Clk_xbar {/processing_system7_0/FCLK_CLK0 (100 MHz)} Master {/axi_dma_0/M_AXI_S2MM} Slave {/processing_system7_0/S_AXI_HP0} ddr_seg {Auto} intc_ip {/axi_mem_intercon} master_apm {0}}  [get_bd_intf_pins axi_dma_0/M_AXI_S2MM]
+
+startgroup
+create_bd_cell -type ip -vlnv xilinx.com:hls:${myproject}_axi:1.0 ${myproject}_axi_0
+endgroup
+
+connect_bd_intf_net [get_bd_intf_pins axi_dma_0/M_AXIS_MM2S] [get_bd_intf_pins ${myproject}_axi_0/in_r]
+connect_bd_intf_net [get_bd_intf_pins ${myproject}_axi_0/out_r] [get_bd_intf_pins axi_dma_0/S_AXIS_S2MM]
+
+apply_bd_automation -rule xilinx.com:bd_rule:clkrst -config { Clk {/processing_system7_0/FCLK_CLK0 (100 MHz)} Freq {100} Ref_Clk0 {} Ref_Clk1 {} Ref_Clk2 {}}  [get_bd_pins ${myproject}_axi_0/ap_clk]
+
+group_bd_cells hier_0 [get_bd_cells axi_dma_0] [get_bd_cells ${myproject}_axi_0]
+
+make_wrapper -files [get_files ./${myproject}_vivado_accelerator/project_1.srcs/sources_1/bd/design_1/design_1.bd] -top
+
+add_files -norecurse ./${myproject}_vivado_accelerator/project_1.srcs/sources_1/bd/design_1/hdl/design_1_wrapper.v
+
+reset_run impl_1
+reset_run synth_1
+launch_runs impl_1 -to_step write_bitstream -jobs 6
+wait_on_run -timeout 360 impl_1
+
+open_run impl_1
+report_utilization -file util.rpt -hierarchical -hierarchical_percentages
diff --git a/hls4ml/templates/vivado_accelerator/pynq-z2/c_drivers/sdk/Makefile b/hls4ml/templates/vivado_accelerator/pynq-z2/c_drivers/sdk/Makefile
new file mode 100644
index 0000000000..03ab9b8de7
--- /dev/null
+++ b/hls4ml/templates/vivado_accelerator/pynq-z2/c_drivers/sdk/Makefile
@@ -0,0 +1,33 @@
+DESIGN := design_1
+
+help:
+	@echo "INFO: make <TAB> to show targets"
+.PHONY: help
+
+--setup:
+	xsct ./setup.tcl $(DESIGN)
+.PHONY: --setup
+
+sdk: --setup
+	rm -f $(DESIGN)_standalone/src/helloworld.c
+	cd  $(DESIGN)_standalone/src && ln -sf ../../common/main.c main.c
+	cd  $(DESIGN)_standalone/src && ln -sf ../../common/data.h data.h
+.PHONY: sdk
+
+gui:
+	xsdk --workspace . &
+.PHONY: gui
+
+clean:
+	rm -rf $(DESIGN)_platform
+	rm -rf $(DESIGN)_standalone
+	rm -rf $(DESIGN)_standalone_bsp
+	rm -rf RemoteSystemsTempFiles
+	rm -rf .Xil
+	rm -rf .metadata
+	rm -f *.log
+.PHONY: clean
+
+ultraclean: clean
+	rm -rf hdf/*.hdf
+.PHONY: ultraclean
diff --git a/hls4ml/templates/vivado_accelerator/pynq-z2/c_drivers/sdk/common/main.c b/hls4ml/templates/vivado_accelerator/pynq-z2/c_drivers/sdk/common/main.c
new file mode 100644
index 0000000000..7dd2be22a8
--- /dev/null
+++ b/hls4ml/templates/vivado_accelerator/pynq-z2/c_drivers/sdk/common/main.c
@@ -0,0 +1,262 @@
+/**
+ *
+ * Set Heap Size in ldscript.ld to 0x1000000 (16MB)
+ *
+ */
+
+#include "xmyproject_axi.h"  /* TODO: design-dependent name */
+#include "stdio.h"       /* PRINTF */
+#include "unistd.h"      /* sleep */
+#include "stdlib.h"
+#include "malloc.h"
+#include "assert.h"
+#include "xil_io.h"      /* peripheral read/write wrappers */
+#include "xtime_l.h"     /* to measure performance of the system */
+#include "platform.h"    /* platform init/cleanup functions */
+#include "xil_cache.h"   /* enable/disable caches etc */
+#include "xil_printf.h"  /* UART debug print functions */
+#include "xparameters.h" /* peripherals base addresses */
+
+#include "data.h"
+
+//#define __DEBUG__
+
+#define MAX_PRINT_ELEMENTS (16)
+
+#define PRINTF printf
+
+const unsigned INPUT_N_ELEMENTS = N_SAMPLES * N_X_INPUTS;
+const unsigned OUTPUT_N_ELEMENTS = N_SAMPLES * N_Y_OUTPUTS;
+
+#if 1
+/* Accelerator verification */
+#define REFERENCE_OUTPUTS data_y_hls_outputs
+#else
+/* Accelerator validation */
+#define REFERENCE_OUTPUTS data_y_outputs
+//#define REFERENCE_OUTPUTS data_y_keras_outputs
+#endif
+
+/*
+ * Return the index of the largest element in data[0..n_elements-1].
+ *
+ * When several elements share the maximum value, the index of the last one
+ * is returned. An empty (or NULL) input yields index 0.
+ */
+unsigned get_max(float *data, unsigned n_elements) {
+	unsigned max_index = 0;
+	float max_value;
+
+	if (data == NULL || n_elements == 0)
+		return 0;
+
+	/* Seed with the first element rather than 0.0 so that all-negative data
+	 * still reports the position of its true maximum. */
+	max_value = data[0];
+	for (unsigned i = 1; i < n_elements; i++)
+		if (data[i] >= max_value) {
+			max_index = i;
+			max_value = data[i];
+		}
+	return max_index;
+}
+
+float *inputs_mem = NULL;
+float *outputs_mem = NULL;
+float *reference_mem = NULL;
+
+/* Accelerator configuration */
+XMyproject_axi accelerator; /* TODO: design-dependent name */
+XMyproject_axi_Config *accelerator_cfg; /* TODO: design-dependent name */
+
+/*
+ * Accelerator initialization routine.
+ *
+ * Looks up the driver configuration for XPAR_MYPROJECT_AXI_0_DEVICE_ID and
+ * initializes the global `accelerator` instance from it. Failures are
+ * reported on the console; the function itself returns nothing.
+ */
+void init_accelerators() {
+    PRINTF("INFO: Initializing accelerator\r\n");
+    accelerator_cfg = XMyproject_axi_LookupConfig(XPAR_MYPROJECT_AXI_0_DEVICE_ID); /* TODO: design-dependent name */
+    if (!accelerator_cfg) {
+        /* Previously a failed lookup was silent, leaving `accelerator` uninitialized. */
+        PRINTF("ERROR: Accelerator configuration not found\r\n");
+        return;
+    }
+
+    int status = XMyproject_axi_CfgInitialize(&accelerator, accelerator_cfg); /* TODO: design-dependent name */
+    if (status != XST_SUCCESS) {
+        PRINTF("ERROR: Initializing accelerator\r\n");
+    }
+}
+
+/*
+ * Software "reference" for the accelerator.
+ *
+ * No model is evaluated: the first n_samples * n_y_ouputs entries of
+ * REFERENCE_OUTPUTS (see data.h) are copied into sw_outputs_mem.
+ * sw_inputs_mem and n_X_inputs are accepted but not used. Always returns 0.
+ */
+int sw_reference_implementation(float *sw_inputs_mem, float *sw_outputs_mem, unsigned n_samples, unsigned n_X_inputs, unsigned n_y_ouputs) {
+    const unsigned total_outputs = n_samples * n_y_ouputs;
+    unsigned idx = 0;
+
+#ifdef __DEBUG__
+    PRINTF("INFO: Reference outputs are pre-compiled. It would be nice to run a software model here.\r\n");
+#endif
+
+    while (idx < total_outputs) {
+        sw_outputs_mem[idx] = REFERENCE_OUTPUTS[idx];
+        idx++;
+    }
+
+    return 0;
+}
+
+/* Profiling helper: convert the tick interval [start, stop] to seconds
+ * by dividing it by COUNTS_PER_SECOND. */
+double get_elapsed_time(XTime start, XTime stop) {
+    const double elapsed_ticks = 1.0 * (stop - start);
+
+    return elapsed_ticks / (COUNTS_PER_SECOND);
+}
+
+/*
+ * Print a row-major [n_samples][feature_count] float array to the console.
+ *
+ * The header line shows the full dimensions; at most MAX_PRINT_ELEMENTS
+ * rows are printed, one row per line.
+ */
+void dump_data(const char* label, float* data, unsigned n_samples, unsigned feature_count) {
+    /* Cap the number of printed rows. */
+    const unsigned row_limit = (n_samples < MAX_PRINT_ELEMENTS) ? n_samples : MAX_PRINT_ELEMENTS;
+    unsigned row;
+    unsigned col;
+
+    PRINTF("INFO:   %s[%u][%u]:\r\n", label, n_samples, feature_count);
+
+    for (row = 0; row < row_limit; ++row) {
+        PRINTF("INFO:     [%u] ", row);
+        for (col = 0; col < feature_count; ++col) {
+            PRINTF("%f ", data[row * feature_count + col]);
+        }
+        PRINTF("\r\n");
+    }
+}
+
+/*
+ * Application entry point.
+ *
+ * Copies the test vectors from data.h into heap buffers, fills the reference
+ * buffer, then runs every sample through the accelerator (polling for
+ * completion), accumulating the elapsed time. Finally the accelerator outputs
+ * are compared with the reference outputs and a summary is printed.
+ *
+ * Returns 0 on completion, 1 if the working buffers cannot be allocated.
+ */
+int main(int argc, char** argv) {
+
+    XTime start, stop;
+    double calibration_time;
+    double sw_elapsed = 0;
+    double hw_elapsed = 0;
+    double cache_elapsed = 0;
+    unsigned hw_errors;
+
+    /* Buffer sizes in bytes; kept as `unsigned` so they match the %u format. */
+    const unsigned input_bytes = INPUT_N_ELEMENTS * sizeof(float);
+    const unsigned output_bytes = OUTPUT_N_ELEMENTS * sizeof(float);
+
+    char __attribute__ ((unused)) dummy; /* dummy input */
+
+    /* Initialize platform (uart and caches) */
+    init_platform();
+
+    PRINTF("\r\n");
+    PRINTF("INFO: ==================================================\r\n");
+    PRINTF("INFO: XMyproject_axi (w/ polling)\r\n"); /* TODO: design-dependent name */
+    PRINTF("INFO: ==================================================\r\n");
+
+    init_accelerators();
+
+    inputs_mem = malloc(input_bytes);
+    outputs_mem = malloc(output_bytes);
+    reference_mem = malloc(output_bytes);
+    if (inputs_mem == NULL || outputs_mem == NULL || reference_mem == NULL) {
+        /* The buffers are written right below; bail out instead of
+         * dereferencing NULL when the heap is too small. */
+        PRINTF("ERROR: Memory allocation failed (check the heap size)\r\n");
+        free(inputs_mem);
+        free(outputs_mem);
+        free(reference_mem);
+        inputs_mem = outputs_mem = reference_mem = NULL;
+        cleanup_platform();
+        return 1;
+    }
+
+    /* Calibration */
+    XTime_GetTime(&start);
+    sleep(1);
+    XTime_GetTime(&stop);
+    calibration_time = get_elapsed_time(start, stop);
+    PRINTF("INFO: Time calibration for one second (%lf sec)\r\n", calibration_time);
+
+    /* Initialize memory */
+    PRINTF("INFO: Initialize memory\r\n");
+    PRINTF("INFO:   - Samples count: %u\r\n", N_SAMPLES); /* Same as dst_SAMPLE_COUNT */
+    PRINTF("INFO:   - Inputs count: %u\r\n", N_X_INPUTS);
+    PRINTF("INFO:   - Outputs count: %u\r\n", N_Y_OUTPUTS);
+    /* sizeof yields size_t, which does not match %u on every target. */
+    PRINTF("INFO:   - Data size: %u B\r\n", (unsigned)sizeof(float));
+    PRINTF("INFO:   - Total input size: %u B, %.2f KB, %.2f MB\r\n", input_bytes, input_bytes / (float)1024, input_bytes / (float)(1024*1024));
+    PRINTF("INFO:   - Total output size: %u B, %.2f KB, %.2f MB\r\n", output_bytes, output_bytes / (float)1024, output_bytes / (float)(1024*1024));
+
+    // Set Heap Size in ldscript.ld to 0x1000000 (16MB)
+    //malloc_stats();
+
+    for (unsigned i = 0; i < INPUT_N_ELEMENTS; i++) {
+        inputs_mem[i] = data_X_inputs[i];
+    }
+    for (unsigned i = 0; i < OUTPUT_N_ELEMENTS; i++) {
+        outputs_mem[i] = 0x0;
+    }
+
+    /* ****** SW REFERENCE ****** */
+    PRINTF("INFO: ==================================================\r\n");
+    PRINTF("INFO: Start SW reference implementation\r\n");
+    XTime_GetTime(&start);
+    sw_reference_implementation(inputs_mem, reference_mem, N_SAMPLES, N_X_INPUTS, N_Y_OUTPUTS);
+    XTime_GetTime(&stop);
+    sw_elapsed = get_elapsed_time(start, stop);
+    PRINTF("INFO: ==================================================\r\n");
+    PRINTF("INFO: Press any key to start:\r\n");
+    dummy = inbyte();
+    //PRINTF("INFO:");
+
+    /* ****** HW ACCELERATOR ****** */
+    PRINTF("INFO: Start HW accelerator\r\n");
+
+    /* Flush the buffers from the data cache before the accelerator runs. */
+    XTime_GetTime(&start);
+    Xil_DCacheFlushRange((UINTPTR)inputs_mem, input_bytes);
+    Xil_DCacheFlushRange((UINTPTR)outputs_mem, output_bytes);
+    Xil_DCacheFlushRange((UINTPTR)reference_mem, output_bytes);
+    XTime_GetTime(&stop);
+    cache_elapsed = get_elapsed_time(start, stop);
+
+    for (unsigned j = 0; j < N_SAMPLES; j++) {
+        float *inputs_mem_i = inputs_mem + j * N_X_INPUTS;
+        float *outputs_mem_i = outputs_mem + j * N_Y_OUTPUTS;
+
+        /* Configure the accelerator */
+        XTime_GetTime(&start);
+        /* UINTPTR (already used for the cache calls) is pointer-sized, so the
+         * cast itself never narrows the address. */
+        XMyproject_axi_Set_in_r(&accelerator, (UINTPTR)inputs_mem_i); /* TODO: design-dependent name */
+        XMyproject_axi_Set_out_r(&accelerator, (UINTPTR)outputs_mem_i); /* TODO: design-dependent name */
+
+        XMyproject_axi_Start(&accelerator); /* TODO: design-dependent name */
+
+        /* Polling */
+        while (!XMyproject_axi_IsDone(&accelerator)); /* TODO: design-dependent name */
+
+        /* Get error status */
+        //hw_flags = XMyproject_axi_Get_return(&accelerator); /* TODO: design-dependent name */
+        XTime_GetTime(&stop);
+        hw_elapsed += get_elapsed_time(start, stop);
+    }
+
+    XTime_GetTime(&start);
+    Xil_DCacheFlushRange((UINTPTR)outputs_mem, output_bytes);
+    XTime_GetTime(&stop);
+    cache_elapsed += get_elapsed_time(start, stop);
+
+    PRINTF("INFO: HW accelerator done!\r\n");
+
+    /* ****** VALIDATION ****** */
+    PRINTF("INFO: ================== Verification ==================\r\n");
+#ifdef __DEBUG__
+    PRINTF("INFO: Dump data\r\n");
+    dump_data("inputs_mem", inputs_mem, N_SAMPLES, N_X_INPUTS);
+    dump_data("outputs_mem", outputs_mem, N_SAMPLES, N_Y_OUTPUTS);
+    dump_data("reference_mem", reference_mem, N_SAMPLES, N_Y_OUTPUTS);
+#endif
+
+#ifdef __DEBUG__
+    PRINTF("INFO: SW execution time: %f sec\r\n", sw_elapsed);
+#endif
+    PRINTF("INFO: HW-acceleration exec. time (%d inferences):\r\n", N_SAMPLES);
+    PRINTF("INFO:   - total %f sec\r\n", hw_elapsed);
+    /* seconds -> nanoseconds is a factor of 1e9 (the former factor of 1000
+     * produced milliseconds under an "ns" label). */
+    PRINTF("INFO:   - per-inference %.12f sec (%f ns)\r\n", hw_elapsed / (N_SAMPLES), (hw_elapsed*1.0e9) / (N_SAMPLES));
+    PRINTF("INFO: Cache flush time: %f sec\r\n", cache_elapsed);
+#ifdef __DEBUG__
+    PRINTF("INFO: HW/SW speedup (the software is fake so this does not count...): %.2f X\r\n", (sw_elapsed >= (hw_elapsed+cache_elapsed))?(sw_elapsed/(hw_elapsed+cache_elapsed)):-((hw_elapsed+cache_elapsed)/sw_elapsed));
+#endif
+
+    hw_errors = 0;
+#if 1
+    /* Accelerator verification: element-wise exact comparison */
+    for (unsigned i = 0; i < OUTPUT_N_ELEMENTS; i++) {
+        if (outputs_mem[i] != reference_mem[i]) {
+            PRINTF("ERROR: [%u]: Accelerator HW %f != SW %f\r\n", i, outputs_mem[i], reference_mem[i]);
+            hw_errors++;
+        }
+    }
+    PRINTF("INFO: Total errors = %u (out of %u elements)\r\n", hw_errors, OUTPUT_N_ELEMENTS);
+    if (hw_errors > 0)
+        PRINTF("INFO: Verification: FAIL\r\n");
+    else
+        PRINTF("INFO: Verification: PASS!\r\n");
+#else
+    /* Accelerator validation: compare the arg-max of each sample */
+    for (unsigned s = 0; s < N_SAMPLES; s++) {
+        unsigned ref_digit = get_max(reference_mem + s * N_Y_OUTPUTS, N_Y_OUTPUTS);
+        unsigned hw_digit = get_max(outputs_mem + s * N_Y_OUTPUTS, N_Y_OUTPUTS);
+        if (hw_digit != ref_digit) {
+#ifdef __DEBUG__
+            PRINTF("ERROR: [%u]: Accelerator HW %u != SW %u\r\n", s, hw_digit, ref_digit);
+#endif
+            hw_errors++;
+        }
+    }
+    float error_rate = (hw_errors / (float)(N_SAMPLES)) * 100.0;
+    float accuracy = 100 - ((hw_errors / (float)(N_SAMPLES)) * 100.0);
+    PRINTF("INFO: Total errors = %u (out of %d digits)\r\n", hw_errors, N_SAMPLES);
+    PRINTF("INFO: Error rate = %.2f %%\r\n", error_rate);
+    PRINTF("INFO: Accuracy = %.2f %%\r\n", accuracy);
+#endif
+    PRINTF("INFO: ==================================================\r\n");
+
+    /* Release the working buffers before shutting the platform down. */
+    free(inputs_mem);
+    free(outputs_mem);
+    free(reference_mem);
+    inputs_mem = outputs_mem = reference_mem = NULL;
+
+    cleanup_platform();
+
+    return 0;
+}
+
+
diff --git a/hls4ml/templates/vivado_accelerator/pynq-z2/c_drivers/sdk/setup.tcl b/hls4ml/templates/vivado_accelerator/pynq-z2/c_drivers/sdk/setup.tcl
new file mode 100644
index 0000000000..5e9e92d501
--- /dev/null
+++ b/hls4ml/templates/vivado_accelerator/pynq-z2/c_drivers/sdk/setup.tcl
@@ -0,0 +1,14 @@
+# XSCT script that creates and builds the standalone SDK projects.
+# Usage: xsct ./setup.tcl <design_name>
+# See
+# https://www.xilinx.com/html_docs/xilinx2019_1/SDK_Doc/xsct/intro/xsct_introduction.html
+
+# Use the current directory as the SDK workspace.
+setws .
+# The script does nothing unless exactly one argument (the design name) is given.
+if { $::argc == 1 } {
+    set myproject [lindex $::argv 0]
+    # Hardware platform project, created from the HDF found in ../hdf/.
+    createhw -name ${myproject}\_platform -hwspec ../hdf/${myproject}\_wrapper.hdf
+    # Standalone application based on the "Hello World" template, targeting ps7_cortexa9_0.
+    createapp -name ${myproject}\_standalone -app {Hello World} -proc ps7_cortexa9_0 -hwproject ${myproject}\_platform -os standalone
+    configapp -app ${myproject}\_standalone build-config release
+    # Linker symbols: _HEAP_SIZE = 0x1000000 (16 MB), _STACK_SIZE = 0x40000 (256 KB).
+    configapp -app ${myproject}\_standalone -add linker-misc {-Wl,--defsym=_HEAP_SIZE=0x1000000}
+    configapp -app ${myproject}\_standalone -add linker-misc {-Wl,--defsym=_STACK_SIZE=0x40000}
+    # Build all projects in the workspace.
+    projects -build
+    #configapp -app ${myproject}\_standalone -add define-compiler-symbols {FLAG=VALUE}
+}
diff --git a/hls4ml/templates/vivado_accelerator/pynq-z2/tcl_scripts/axi_master_design.tcl b/hls4ml/templates/vivado_accelerator/pynq-z2/tcl_scripts/axi_master_design.tcl
new file mode 100644
index 0000000000..b3c3ba9c0d
--- /dev/null
+++ b/hls4ml/templates/vivado_accelerator/pynq-z2/tcl_scripts/axi_master_design.tcl
@@ -0,0 +1,88 @@
+# Vivado script: builds a block design with the processing system and the
+# HLS accelerator (AXI master interfaces), runs implementation up to the
+# bitstream and exports the hardware description for the SDK flow.
+
+# project.tcl (next to this script) is sourced first; ${myproject} is used
+# below and is presumably defined there -- confirm.
+set tcldir [file dirname [info script]]
+source [file join $tcldir project.tcl]
+
+# Project names
+# NOTE(review): ps_name and acc_name are only used when creating the cells;
+# the wiring, interrupt and memory-mapping commands below hard-code
+# "processing_system7_0" and "myproject_axi_0" -- confirm this is intended
+# for projects whose name is not "myproject".
+set project_name "project_1"
+set design_name "design_1"
+set hls_solution_name "solution1"
+set ps_name "processing_system7_0"
+set acc_name "${myproject}_axi_0"
+set part_name "xc7z020clg400-1"
+set board_name "tul.com.tw:pynq-z2:part0:1.0"
+
+# Set board and chip part names
+create_project ${project_name} ${myproject}_vivado_accelerator -part ${part_name} -force
+set_property board_part ${board_name} [current_project]
+
+# Create block design
+create_bd_design ${design_name}
+
+# Setup IP repo (points at the HLS solution's exported IP directory)
+#set_property  ip_repo_paths ${myproject}_prj [current_project]
+set_property  ip_repo_paths ${myproject}_prj/${hls_solution_name}/impl/ip [current_project]
+update_ip_catalog
+
+# Create and setup PS: apply the board preset, then enable the S_AXI_GP0
+# port and the fabric interrupt input
+create_bd_cell -type ip -vlnv xilinx.com:ip:processing_system7:5.5 ${ps_name}
+apply_bd_automation -rule xilinx.com:bd_rule:processing_system7 -config {make_external "FIXED_IO, DDR" apply_board_preset "1" Master "Disable" Slave "Disable" }  [get_bd_cells ${ps_name}]
+set_property -dict [list CONFIG.PCW_USE_S_AXI_GP0 {1} CONFIG.PCW_USE_FABRIC_INTERRUPT {1} CONFIG.PCW_IRQ_F2P_INTR {1}] [get_bd_cells ${ps_name}]
+
+# Create accelerator
+# NOTE(review): the VLNV is fixed to "myproject_axi" although the cell name
+# is derived from ${myproject} -- confirm.
+create_bd_cell -type ip -vlnv xilinx.com:hls:myproject_axi:1.0 ${acc_name}
+
+# Wiring
+# 1) accelerator input master (m_axi_IN_BUS) -> PS slave port S_AXI_GP0
+apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { \
+    Clk_master {Auto} \
+    Clk_slave {Auto} \
+    Clk_xbar {Auto} \
+    Master {/myproject_axi_0/m_axi_IN_BUS} \
+    Slave {/processing_system7_0/S_AXI_GP0} \
+    intc_ip {Auto} \
+    master_apm {0}} [get_bd_intf_pins processing_system7_0/S_AXI_GP0]
+
+# 2) PS master port M_AXI_GP0 -> accelerator control slave (s_axi_CTRL_BUS)
+apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { \
+    Clk_master {Auto} \
+    Clk_slave {Auto} \
+    Clk_xbar {Auto} \
+    Master {/processing_system7_0/M_AXI_GP0} \
+    Slave {/myproject_axi_0/s_axi_CTRL_BUS} \
+    intc_ip {New AXI Interconnect} \
+    master_apm {0}} [get_bd_intf_pins myproject_axi_0/s_axi_CTRL_BUS]
+
+# 3) accelerator output master (m_axi_OUT_BUS) -> PS slave port S_AXI_GP0,
+#    through the existing /axi_smc interconnect
+apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { \
+    Clk_master {/processing_system7_0/FCLK_CLK0 (100 MHz)} \
+    Clk_slave {/processing_system7_0/FCLK_CLK0 (100 MHz)} \
+    Clk_xbar {/processing_system7_0/FCLK_CLK0 (100 MHz)} \
+    Master {/myproject_axi_0/m_axi_OUT_BUS} \
+    Slave {/processing_system7_0/S_AXI_GP0} \
+    intc_ip {/axi_smc} \
+    master_apm {0}} [get_bd_intf_pins myproject_axi_0/m_axi_OUT_BUS]
+
+# Wiring interrupt signal
+connect_bd_net [get_bd_pins myproject_axi_0/interrupt] [get_bd_pins processing_system7_0/IRQ_F2P]
+
+# Top level wrapper
+make_wrapper -files [get_files ./${myproject}_vivado_accelerator/${project_name}.srcs/sources_1/bd/${design_name}/${design_name}.bd] -top
+add_files -norecurse ./${myproject}_vivado_accelerator/${project_name}.srcs/sources_1/bd/${design_name}/hdl/${design_name}_wrapper.v
+
+# Memory mapping: remove the QSPI_LINEAR, IOP and M_AXI_GP0 segments from
+# both accelerator master address spaces
+# (presumably so that only the DDR segment stays mapped -- confirm)
+delete_bd_objs [get_bd_addr_segs myproject_axi_0/Data_m_axi_IN_BUS/SEG_processing_system7_0_GP0_QSPI_LINEAR]
+delete_bd_objs [get_bd_addr_segs -excluded myproject_axi_0/Data_m_axi_IN_BUS/SEG_processing_system7_0_GP0_IOP]
+delete_bd_objs [get_bd_addr_segs -excluded myproject_axi_0/Data_m_axi_IN_BUS/SEG_processing_system7_0_GP0_M_AXI_GP0]
+delete_bd_objs [get_bd_addr_segs myproject_axi_0/Data_m_axi_OUT_BUS/SEG_processing_system7_0_GP0_QSPI_LINEAR]
+delete_bd_objs [get_bd_addr_segs -excluded myproject_axi_0/Data_m_axi_OUT_BUS/SEG_processing_system7_0_GP0_IOP]
+delete_bd_objs [get_bd_addr_segs -excluded myproject_axi_0/Data_m_axi_OUT_BUS/SEG_processing_system7_0_GP0_M_AXI_GP0]
+
+# Run synthesis and implementation (through write_bitstream) and wait for it
+reset_run impl_1
+reset_run synth_1
+launch_runs impl_1 -to_step write_bitstream -jobs 6
+wait_on_run -timeout 360 impl_1
+
+# Reporting
+open_run impl_1
+report_utilization -file util.rpt -hierarchical -hierarchical_percentages
+
+# Export HDF file for SDK flow (the .sysdef of impl_1 is copied as .hdf)
+file mkdir ./hdf
+file copy -force ${myproject}_vivado_accelerator/${project_name}.runs/impl_1/${design_name}_wrapper.sysdef ./hdf/${design_name}_wrapper.hdf
diff --git a/hls4ml/templates/vivado_accelerator/ultra96v2/c_drivers/axi_master_driver.c b/hls4ml/templates/vivado_accelerator/ultra96v2/c_drivers/axi_master_driver.c
new file mode 100644
index 0000000000..8a46df8bde
--- /dev/null
+++ b/hls4ml/templates/vivado_accelerator/ultra96v2/c_drivers/axi_master_driver.c
@@ -0,0 +1,6 @@
+#include "xil_printf.h"
+
+/* Placeholder driver: prints a greeting through xil_printf and exits with 0. */
+int main(void)
+{
+    const int exit_code = 0;
+
+    xil_printf("Hello world!\r\n");
+
+    return exit_code;
+}
diff --git a/hls4ml/templates/vivado_accelerator/ultra96v2/c_drivers/axi_master_driver.h b/hls4ml/templates/vivado_accelerator/ultra96v2/c_drivers/axi_master_driver.h
new file mode 100644
index 0000000000..8a46df8bde
--- /dev/null
+++ b/hls4ml/templates/vivado_accelerator/ultra96v2/c_drivers/axi_master_driver.h
@@ -0,0 +1,6 @@
+#include "xil_printf.h"
+
+/* Placeholder: prints a greeting through xil_printf and exits with 0.
+ * NOTE(review): this header defines main() and has the same content as
+ * axi_master_driver.c -- confirm that it is only a stub. */
+int main(void)
+{
+    const int exit_code = 0;
+
+    xil_printf("Hello world!\r\n");
+
+    return exit_code;
+}
diff --git a/hls4ml/templates/vivado_accelerator/ultra96v2/c_drivers/sdk/Makefile b/hls4ml/templates/vivado_accelerator/ultra96v2/c_drivers/sdk/Makefile
new file mode 100644
index 0000000000..03ab9b8de7
--- /dev/null
+++ b/hls4ml/templates/vivado_accelerator/ultra96v2/c_drivers/sdk/Makefile
@@ -0,0 +1,33 @@
+# Prefix shared by the SDK projects that setup.tcl creates
+# (<DESIGN>_platform, <DESIGN>_standalone, <DESIGN>_standalone_bsp).
+DESIGN := design_1
+
+# Default target: only prints a usage hint.
+help:
+	@echo "INFO: make <TAB> to show targets"
+.PHONY: help
+
+# Internal target: run the XSCT script that creates and builds the SDK projects.
+--setup:
+	xsct ./setup.tcl $(DESIGN)
+.PHONY: --setup
+
+# Swap the generated Hello World source for links to the common driver sources.
+# "ln -sf" keeps this target re-runnable when the links already exist
+# (plain "ln -s" fails with "File exists" on a second invocation).
+sdk: --setup
+	rm -f $(DESIGN)_standalone/src/helloworld.c
+	cd  $(DESIGN)_standalone/src && ln -sf ../../common/main.c main.c
+	cd  $(DESIGN)_standalone/src && ln -sf ../../common/data.h data.h
+.PHONY: sdk
+
+# Open the SDK GUI on this directory as workspace (runs in the background).
+gui:
+	xsdk --workspace . &
+.PHONY: gui
+
+# Remove the generated SDK projects, tool metadata and logs.
+clean:
+	rm -rf $(DESIGN)_platform
+	rm -rf $(DESIGN)_standalone
+	rm -rf $(DESIGN)_standalone_bsp
+	rm -rf RemoteSystemsTempFiles
+	rm -rf .Xil
+	rm -rf .metadata
+	rm -f *.log
+.PHONY: clean
+
+# "clean" plus removal of the hardware description files.
+# NOTE(review): setup.tcl reads the HDF from ../hdf/, while this rule removes
+# hdf/*.hdf relative to this directory -- confirm which location is intended.
+ultraclean: clean
+	rm -rf hdf/*.hdf
+.PHONY: ultraclean
diff --git a/hls4ml/templates/vivado_accelerator/ultra96v2/c_drivers/sdk/common/main.c b/hls4ml/templates/vivado_accelerator/ultra96v2/c_drivers/sdk/common/main.c
new file mode 100644
index 0000000000..7dd2be22a8
--- /dev/null
+++ b/hls4ml/templates/vivado_accelerator/ultra96v2/c_drivers/sdk/common/main.c
@@ -0,0 +1,262 @@
+/**
+ *
+ * Set Heap Size in ldscript.ld to 0x1000000 (16MB)
+ *
+ */
+
+#include "xmyproject_axi.h"  /* TODO: design-dependent name */
+#include "stdio.h"       /* PRINTF */
+#include "unistd.h"      /* sleep */
+#include "stdlib.h"
+#include "malloc.h"
+#include "assert.h"
+#include "xil_io.h"      /* peripheral read/write wrappers */
+#include "xtime_l.h"     /* to measure performance of the system */
+#include "platform.h"    /* platform init/cleanup functions */
+#include "xil_cache.h"   /* enable/disable caches etc */
+#include "xil_printf.h"  /* UART debug print functions */
+#include "xparameters.h" /* peripherals base addresses */
+
+#include "data.h"
+
+//#define __DEBUG__
+
+#define MAX_PRINT_ELEMENTS (16)
+
+#define PRINTF printf
+
+const unsigned INPUT_N_ELEMENTS = N_SAMPLES * N_X_INPUTS;
+const unsigned OUTPUT_N_ELEMENTS = N_SAMPLES * N_Y_OUTPUTS;
+
+#if 1
+/* Accelerator verification */
+#define REFERENCE_OUTPUTS data_y_hls_outputs
+#else
+/* Accelerator validation */
+#define REFERENCE_OUTPUTS data_y_outputs
+//#define REFERENCE_OUTPUTS data_y_keras_outputs
+#endif
+
+/*
+ * Return the index of the largest element in data[0..n_elements-1].
+ *
+ * When several elements share the maximum value, the index of the last one
+ * is returned. An empty (or NULL) input yields index 0.
+ */
+unsigned get_max(float *data, unsigned n_elements) {
+	unsigned max_index = 0;
+	float max_value;
+
+	if (data == NULL || n_elements == 0)
+		return 0;
+
+	/* Seed with the first element rather than 0.0 so that all-negative data
+	 * still reports the position of its true maximum. */
+	max_value = data[0];
+	for (unsigned i = 1; i < n_elements; i++)
+		if (data[i] >= max_value) {
+			max_index = i;
+			max_value = data[i];
+		}
+	return max_index;
+}
+
+float *inputs_mem = NULL;
+float *outputs_mem = NULL;
+float *reference_mem = NULL;
+
+/* Accelerator configuration */
+XMyproject_axi accelerator; /* TODO: design-dependent name */
+XMyproject_axi_Config *accelerator_cfg; /* TODO: design-dependent name */
+
+/*
+ * Accelerator initialization routine.
+ *
+ * Looks up the driver configuration for XPAR_MYPROJECT_AXI_0_DEVICE_ID and
+ * initializes the global `accelerator` instance from it. Failures are
+ * reported on the console; the function itself returns nothing.
+ */
+void init_accelerators() {
+    PRINTF("INFO: Initializing accelerator\r\n");
+    accelerator_cfg = XMyproject_axi_LookupConfig(XPAR_MYPROJECT_AXI_0_DEVICE_ID); /* TODO: design-dependent name */
+    if (!accelerator_cfg) {
+        /* Previously a failed lookup was silent, leaving `accelerator` uninitialized. */
+        PRINTF("ERROR: Accelerator configuration not found\r\n");
+        return;
+    }
+
+    int status = XMyproject_axi_CfgInitialize(&accelerator, accelerator_cfg); /* TODO: design-dependent name */
+    if (status != XST_SUCCESS) {
+        PRINTF("ERROR: Initializing accelerator\r\n");
+    }
+}
+
+/*
+ * Software "reference" for the accelerator.
+ *
+ * No model is evaluated: the first n_samples * n_y_ouputs entries of
+ * REFERENCE_OUTPUTS (see data.h) are copied into sw_outputs_mem.
+ * sw_inputs_mem and n_X_inputs are accepted but not used. Always returns 0.
+ */
+int sw_reference_implementation(float *sw_inputs_mem, float *sw_outputs_mem, unsigned n_samples, unsigned n_X_inputs, unsigned n_y_ouputs) {
+    const unsigned total_outputs = n_samples * n_y_ouputs;
+    unsigned idx = 0;
+
+#ifdef __DEBUG__
+    PRINTF("INFO: Reference outputs are pre-compiled. It would be nice to run a software model here.\r\n");
+#endif
+
+    while (idx < total_outputs) {
+        sw_outputs_mem[idx] = REFERENCE_OUTPUTS[idx];
+        idx++;
+    }
+
+    return 0;
+}
+
+/* Profiling helper: convert the tick interval [start, stop] to seconds
+ * by dividing it by COUNTS_PER_SECOND. */
+double get_elapsed_time(XTime start, XTime stop) {
+    const double elapsed_ticks = 1.0 * (stop - start);
+
+    return elapsed_ticks / (COUNTS_PER_SECOND);
+}
+
+/*
+ * Print a row-major [n_samples][feature_count] float array to the console.
+ *
+ * The header line shows the full dimensions; at most MAX_PRINT_ELEMENTS
+ * rows are printed, one row per line.
+ */
+void dump_data(const char* label, float* data, unsigned n_samples, unsigned feature_count) {
+    /* Cap the number of printed rows. */
+    const unsigned row_limit = (n_samples < MAX_PRINT_ELEMENTS) ? n_samples : MAX_PRINT_ELEMENTS;
+    unsigned row;
+    unsigned col;
+
+    PRINTF("INFO:   %s[%u][%u]:\r\n", label, n_samples, feature_count);
+
+    for (row = 0; row < row_limit; ++row) {
+        PRINTF("INFO:     [%u] ", row);
+        for (col = 0; col < feature_count; ++col) {
+            PRINTF("%f ", data[row * feature_count + col]);
+        }
+        PRINTF("\r\n");
+    }
+}
+
+/*
+ * Application entry point.
+ *
+ * Copies the test vectors from data.h into heap buffers, fills the reference
+ * buffer, then runs every sample through the accelerator (polling for
+ * completion), accumulating the elapsed time. Finally the accelerator outputs
+ * are compared with the reference outputs and a summary is printed.
+ *
+ * Returns 0 on completion, 1 if the working buffers cannot be allocated.
+ */
+int main(int argc, char** argv) {
+
+    XTime start, stop;
+    double calibration_time;
+    double sw_elapsed = 0;
+    double hw_elapsed = 0;
+    double cache_elapsed = 0;
+    unsigned hw_errors;
+
+    /* Buffer sizes in bytes; kept as `unsigned` so they match the %u format. */
+    const unsigned input_bytes = INPUT_N_ELEMENTS * sizeof(float);
+    const unsigned output_bytes = OUTPUT_N_ELEMENTS * sizeof(float);
+
+    char __attribute__ ((unused)) dummy; /* dummy input */
+
+    /* Initialize platform (uart and caches) */
+    init_platform();
+
+    PRINTF("\r\n");
+    PRINTF("INFO: ==================================================\r\n");
+    PRINTF("INFO: XMyproject_axi (w/ polling)\r\n"); /* TODO: design-dependent name */
+    PRINTF("INFO: ==================================================\r\n");
+
+    init_accelerators();
+
+    inputs_mem = malloc(input_bytes);
+    outputs_mem = malloc(output_bytes);
+    reference_mem = malloc(output_bytes);
+    if (inputs_mem == NULL || outputs_mem == NULL || reference_mem == NULL) {
+        /* The buffers are written right below; bail out instead of
+         * dereferencing NULL when the heap is too small. */
+        PRINTF("ERROR: Memory allocation failed (check the heap size)\r\n");
+        free(inputs_mem);
+        free(outputs_mem);
+        free(reference_mem);
+        inputs_mem = outputs_mem = reference_mem = NULL;
+        cleanup_platform();
+        return 1;
+    }
+
+    /* Calibration */
+    XTime_GetTime(&start);
+    sleep(1);
+    XTime_GetTime(&stop);
+    calibration_time = get_elapsed_time(start, stop);
+    PRINTF("INFO: Time calibration for one second (%lf sec)\r\n", calibration_time);
+
+    /* Initialize memory */
+    PRINTF("INFO: Initialize memory\r\n");
+    PRINTF("INFO:   - Samples count: %u\r\n", N_SAMPLES); /* Same as dst_SAMPLE_COUNT */
+    PRINTF("INFO:   - Inputs count: %u\r\n", N_X_INPUTS);
+    PRINTF("INFO:   - Outputs count: %u\r\n", N_Y_OUTPUTS);
+    /* sizeof yields size_t, which does not match %u on every target. */
+    PRINTF("INFO:   - Data size: %u B\r\n", (unsigned)sizeof(float));
+    PRINTF("INFO:   - Total input size: %u B, %.2f KB, %.2f MB\r\n", input_bytes, input_bytes / (float)1024, input_bytes / (float)(1024*1024));
+    PRINTF("INFO:   - Total output size: %u B, %.2f KB, %.2f MB\r\n", output_bytes, output_bytes / (float)1024, output_bytes / (float)(1024*1024));
+
+    // Set Heap Size in ldscript.ld to 0x1000000 (16MB)
+    //malloc_stats();
+
+    for (unsigned i = 0; i < INPUT_N_ELEMENTS; i++) {
+        inputs_mem[i] = data_X_inputs[i];
+    }
+    for (unsigned i = 0; i < OUTPUT_N_ELEMENTS; i++) {
+        outputs_mem[i] = 0x0;
+    }
+
+    /* ****** SW REFERENCE ****** */
+    PRINTF("INFO: ==================================================\r\n");
+    PRINTF("INFO: Start SW reference implementation\r\n");
+    XTime_GetTime(&start);
+    sw_reference_implementation(inputs_mem, reference_mem, N_SAMPLES, N_X_INPUTS, N_Y_OUTPUTS);
+    XTime_GetTime(&stop);
+    sw_elapsed = get_elapsed_time(start, stop);
+    PRINTF("INFO: ==================================================\r\n");
+    PRINTF("INFO: Press any key to start:\r\n");
+    dummy = inbyte();
+    //PRINTF("INFO:");
+
+    /* ****** HW ACCELERATOR ****** */
+    PRINTF("INFO: Start HW accelerator\r\n");
+
+    /* Flush the buffers from the data cache before the accelerator runs. */
+    XTime_GetTime(&start);
+    Xil_DCacheFlushRange((UINTPTR)inputs_mem, input_bytes);
+    Xil_DCacheFlushRange((UINTPTR)outputs_mem, output_bytes);
+    Xil_DCacheFlushRange((UINTPTR)reference_mem, output_bytes);
+    XTime_GetTime(&stop);
+    cache_elapsed = get_elapsed_time(start, stop);
+
+    for (unsigned j = 0; j < N_SAMPLES; j++) {
+        float *inputs_mem_i = inputs_mem + j * N_X_INPUTS;
+        float *outputs_mem_i = outputs_mem + j * N_Y_OUTPUTS;
+
+        /* Configure the accelerator */
+        XTime_GetTime(&start);
+        /* UINTPTR (already used for the cache calls) is pointer-sized, so the
+         * cast itself never narrows the address. */
+        XMyproject_axi_Set_in_r(&accelerator, (UINTPTR)inputs_mem_i); /* TODO: design-dependent name */
+        XMyproject_axi_Set_out_r(&accelerator, (UINTPTR)outputs_mem_i); /* TODO: design-dependent name */
+
+        XMyproject_axi_Start(&accelerator); /* TODO: design-dependent name */
+
+        /* Polling */
+        while (!XMyproject_axi_IsDone(&accelerator)); /* TODO: design-dependent name */
+
+        /* Get error status */
+        //hw_flags = XMyproject_axi_Get_return(&accelerator); /* TODO: design-dependent name */
+        XTime_GetTime(&stop);
+        hw_elapsed += get_elapsed_time(start, stop);
+    }
+
+    XTime_GetTime(&start);
+    Xil_DCacheFlushRange((UINTPTR)outputs_mem, output_bytes);
+    XTime_GetTime(&stop);
+    cache_elapsed += get_elapsed_time(start, stop);
+
+    PRINTF("INFO: HW accelerator done!\r\n");
+
+    /* ****** VALIDATION ****** */
+    PRINTF("INFO: ================== Verification ==================\r\n");
+#ifdef __DEBUG__
+    PRINTF("INFO: Dump data\r\n");
+    dump_data("inputs_mem", inputs_mem, N_SAMPLES, N_X_INPUTS);
+    dump_data("outputs_mem", outputs_mem, N_SAMPLES, N_Y_OUTPUTS);
+    dump_data("reference_mem", reference_mem, N_SAMPLES, N_Y_OUTPUTS);
+#endif
+
+#ifdef __DEBUG__
+    PRINTF("INFO: SW execution time: %f sec\r\n", sw_elapsed);
+#endif
+    PRINTF("INFO: HW-acceleration exec. time (%d inferences):\r\n", N_SAMPLES);
+    PRINTF("INFO:   - total %f sec\r\n", hw_elapsed);
+    /* seconds -> nanoseconds is a factor of 1e9 (the former factor of 1000
+     * produced milliseconds under an "ns" label). */
+    PRINTF("INFO:   - per-inference %.12f sec (%f ns)\r\n", hw_elapsed / (N_SAMPLES), (hw_elapsed*1.0e9) / (N_SAMPLES));
+    PRINTF("INFO: Cache flush time: %f sec\r\n", cache_elapsed);
+#ifdef __DEBUG__
+    PRINTF("INFO: HW/SW speedup (the software is fake so this does not count...): %.2f X\r\n", (sw_elapsed >= (hw_elapsed+cache_elapsed))?(sw_elapsed/(hw_elapsed+cache_elapsed)):-((hw_elapsed+cache_elapsed)/sw_elapsed));
+#endif
+
+    hw_errors = 0;
+#if 1
+    /* Accelerator verification: element-wise exact comparison */
+    for (unsigned i = 0; i < OUTPUT_N_ELEMENTS; i++) {
+        if (outputs_mem[i] != reference_mem[i]) {
+            PRINTF("ERROR: [%u]: Accelerator HW %f != SW %f\r\n", i, outputs_mem[i], reference_mem[i]);
+            hw_errors++;
+        }
+    }
+    PRINTF("INFO: Total errors = %u (out of %u elements)\r\n", hw_errors, OUTPUT_N_ELEMENTS);
+    if (hw_errors > 0)
+        PRINTF("INFO: Verification: FAIL\r\n");
+    else
+        PRINTF("INFO: Verification: PASS!\r\n");
+#else
+    /* Accelerator validation: compare the arg-max of each sample */
+    for (unsigned s = 0; s < N_SAMPLES; s++) {
+        unsigned ref_digit = get_max(reference_mem + s * N_Y_OUTPUTS, N_Y_OUTPUTS);
+        unsigned hw_digit = get_max(outputs_mem + s * N_Y_OUTPUTS, N_Y_OUTPUTS);
+        if (hw_digit != ref_digit) {
+#ifdef __DEBUG__
+            PRINTF("ERROR: [%u]: Accelerator HW %u != SW %u\r\n", s, hw_digit, ref_digit);
+#endif
+            hw_errors++;
+        }
+    }
+    float error_rate = (hw_errors / (float)(N_SAMPLES)) * 100.0;
+    float accuracy = 100 - ((hw_errors / (float)(N_SAMPLES)) * 100.0);
+    PRINTF("INFO: Total errors = %u (out of %d digits)\r\n", hw_errors, N_SAMPLES);
+    PRINTF("INFO: Error rate = %.2f %%\r\n", error_rate);
+    PRINTF("INFO: Accuracy = %.2f %%\r\n", accuracy);
+#endif
+    PRINTF("INFO: ==================================================\r\n");
+
+    /* Release the working buffers before shutting the platform down. */
+    free(inputs_mem);
+    free(outputs_mem);
+    free(reference_mem);
+    inputs_mem = outputs_mem = reference_mem = NULL;
+
+    cleanup_platform();
+
+    return 0;
+}
+
+
diff --git a/hls4ml/templates/vivado_accelerator/ultra96v2/c_drivers/sdk/setup.tcl b/hls4ml/templates/vivado_accelerator/ultra96v2/c_drivers/sdk/setup.tcl
new file mode 100644
index 0000000000..ea386d4281
--- /dev/null
+++ b/hls4ml/templates/vivado_accelerator/ultra96v2/c_drivers/sdk/setup.tcl
@@ -0,0 +1,18 @@
+# XSCT script that creates and builds the standalone SDK projects.
+# Usage: xsct ./setup.tcl <design_name>
+# See
+# https://www.xilinx.com/html_docs/xilinx2019_1/SDK_Doc/xsct/intro/xsct_introduction.html
+
+# Use the current directory as the SDK workspace.
+setws .
+# The script does nothing unless exactly one argument (the design name) is given.
+if { $::argc == 1 } {
+    set myproject [lindex $::argv 0]
+    # Hardware platform project, created from the HDF found in ../hdf/.
+    createhw -name ${myproject}\_platform -hwspec ../hdf/${myproject}\_wrapper.hdf
+    # 64-bit standalone application based on the "Hello World" template, targeting psu_cortexa53_0.
+    createapp -name ${myproject}\_standalone -app {Hello World} -proc psu_cortexa53_0 -hwproject ${myproject}\_platform -os standalone -arch 64
+    # Route the BSP's stdin/stdout to psu_uart_1, then update the MSS and regenerate the BSP.
+    configbsp -bsp ${myproject}\_standalone_bsp stdin psu_uart_1
+    configbsp -bsp ${myproject}\_standalone_bsp stdout psu_uart_1
+    updatemss -mss ${myproject}\_standalone_bsp/system.mss
+    regenbsp -bsp ${myproject}\_standalone_bsp 
+    configapp -app ${myproject}\_standalone build-config release
+    # Linker symbols: _HEAP_SIZE = 0x1000000 (16 MB), _STACK_SIZE = 0x40000 (256 KB).
+    configapp -app ${myproject}\_standalone -add linker-misc {-Wl,--defsym=_HEAP_SIZE=0x1000000}
+    configapp -app ${myproject}\_standalone -add linker-misc {-Wl,--defsym=_STACK_SIZE=0x40000}
+    # Build all projects in the workspace.
+    projects -build
+    #configapp -app ${myproject}\_standalone -add define-compiler-symbols {FLAG=VALUE}
+}
diff --git a/hls4ml/templates/vivado_accelerator/ultra96v2/python_drivers/axi_stream_driver.py b/hls4ml/templates/vivado_accelerator/ultra96v2/python_drivers/axi_stream_driver.py
new file mode 100644
index 0000000000..4adb187ab4
--- /dev/null
+++ b/hls4ml/templates/vivado_accelerator/ultra96v2/python_drivers/axi_stream_driver.py
@@ -0,0 +1,75 @@
+from pynq import DefaultHierarchy, DefaultIP, allocate
+from pynq import Overlay
+from datetime import datetime
+import pynq.lib.dma
+import numpy as np
+
+
+class NeuralNetworkOverlay(Overlay):
+    def __init__(self, bitfile_name, x_shape, y_shape, dtype=np.float32, dtbo=None, download=True, ignore_version=False,
+                 device=None):
+        super().__init__(bitfile_name, dtbo=None, download=True, ignore_version=False, device=None)
+        self.sendchannel = self.hier_0.axi_dma_0.sendchannel
+        self.recvchannel = self.hier_0.axi_dma_0.recvchannel
+        self.input_buffer = allocate(shape=x_shape, dtype=dtype)
+        self.output_buffer = allocate(shape=y_shape, dtype=dtype)
+
+    def _print_dt(self, timea, timeb, N):
+        dt = (timeb - timea)
+        dts = dt.seconds + dt.microseconds * 10 ** -6
+        rate = N / dts
+        print("Classified {} samples in {} seconds ({} inferences / s)".format(N, dts, rate))
+        return dts, rate
+
+    def predict(self, X, debug=False, profile=False, encode=None, decode=None):
+        """
+        Obtain the predictions of the NN implemented in the FPGA.
+        Parameters:
+        - X : the input vector. Should be numpy ndarray.
+        - dtype : the data type of the elements of the input/output vectors.
+                  Note: it should be set depending on the interface of the accelerator; if it uses 'float'
+                  types for the 'data' AXI-Stream field, 'np.float32' dtype is the correct one to use.
+                  Instead if it uses 'ap_fixed<A,B>', 'np.intA' is the correct one to use (note that A cannot
+                  any integer value, but it can assume {..., 8, 16, 32, ...} values. Check `numpy`
+                  doc for more info).
+                  In this case the encoding/decoding has to be computed by the PS. For example for
+                  'ap_fixed<16,6>' type the following 2 functions are the correct one to use for encode/decode
+                  'float' -> 'ap_fixed<16,6>':
+                  ```
+                    def encode(xi):
+                        return np.int16(round(xi * 2**10)) # note 2**10 = 2**(A-B)
+                    def decode(yi):
+                        return yi * 2**-10
+                    encode_v = np.vectorize(encode) # to apply them element-wise
+                    decode_v = np.vectorize(decode)
+                  ```
+        - profile : boolean. Set it to `True` to print the performance of the algorithm in term of `inference/s`.
+        - encode/decode: function pointers. See `dtype` section for more information.
+        - return: an output array based on `np.ndarray` with a shape equal to `y_shape` and a `dtype` equal to
+                  the namesake parameter.
+        """
+        if profile:
+            timea = datetime.now()
+        if encode is not None:
+            X = encode(X)
+        self.input_buffer[:] = X
+        self.sendchannel.transfer(self.input_buffer)
+        self.recvchannel.transfer(self.output_buffer)
+        if debug:
+            print("Transfer OK")
+        self.sendchannel.wait()
+        if debug:
+            print("Send OK")
+        self.recvchannel.wait()
+        if debug:
+            print("Receive OK")
+        # result = self.output_buffer.copy()
+        if decode is not None:
+            self.output_buffer = decode(self.output_buffer)
+
+        if profile:
+            timeb = datetime.now()
+            dts, rate = self._print_dt(timea, timeb, len(X))
+            return self.output_buffer, dts, rate
+        else:
+            return self.output_buffer
\ No newline at end of file
diff --git a/hls4ml/templates/vivado_accelerator/ultra96v2/tcl_scripts/axi_lite_design.tcl b/hls4ml/templates/vivado_accelerator/ultra96v2/tcl_scripts/axi_lite_design.tcl
new file mode 100644
index 0000000000..2df93afca5
--- /dev/null
+++ b/hls4ml/templates/vivado_accelerator/ultra96v2/tcl_scripts/axi_lite_design.tcl
@@ -0,0 +1,26 @@
+set tcldir [file dirname [info script]]
+source [file join $tcldir project.tcl]
+# project.tcl (sourced above) supplies ${myproject}, the HLS project name used throughout this script.
+create_project project_1 ${myproject}_vivado_accelerator -part xczu3eg-sbva484-1-e -force
+
+set_property board_part em.avnet.com:ultra96:part0:1.2 [current_project]
+set_property  ip_repo_paths  ${myproject}_prj [current_project]
+update_ip_catalog
+
+# Create Block Designer design
+create_bd_design "design_1"
+create_bd_cell -type ip -vlnv xilinx.com:ip:zynq_ultra_ps_e:3.3 zynq_ultra_ps_e
+apply_bd_automation -rule xilinx.com:bd_rule:zynq_ultra_ps_e -config {make_external "FIXED_IO, DDR" apply_board_preset "1" Master "Disable" Slave "Disable" }  [get_bd_cells zynq_ultra_ps_e]
+create_bd_cell -type ip -vlnv xilinx.com:hls:${myproject}_axi:1.0 ${myproject}_axi_0
+apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {Auto} Clk_xbar {Auto} Master {/zynq_ultra_ps_e/M_AXI_HPM0_FPD} Slave {/${myproject}_axi_0/s_axi_AXILiteS} ddr_seg {Auto} intc_ip {New AXI Interconnect} master_apm {0}}  [get_bd_intf_pins ${myproject}_axi_0/s_axi_AXILiteS]
+# The AXI-Lite master above is the PS cell created in this script (zynq_ultra_ps_e); there is no processing_system7_0 here.
+make_wrapper -files [get_files ./${myproject}_vivado_accelerator/project_1.srcs/sources_1/bd/design_1/design_1.bd] -top
+add_files -norecurse ./${myproject}_vivado_accelerator/project_1.srcs/sources_1/bd/design_1/hdl/design_1_wrapper.v
+# NOTE(review): the sibling axi_master/axi_stream scripts also wire M_AXI_HPM1_FPD; confirm whether that is needed here.
+reset_run impl_1
+reset_run synth_1
+launch_runs impl_1 -to_step write_bitstream -jobs 6
+wait_on_run -timeout 360 impl_1
+
+open_run impl_1
+report_utilization -file util.rpt -hierarchical -hierarchical_percentages
diff --git a/hls4ml/templates/vivado_accelerator/ultra96v2/tcl_scripts/axi_master_design.tcl b/hls4ml/templates/vivado_accelerator/ultra96v2/tcl_scripts/axi_master_design.tcl
new file mode 100644
index 0000000000..bb91ba9ee2
--- /dev/null
+++ b/hls4ml/templates/vivado_accelerator/ultra96v2/tcl_scripts/axi_master_design.tcl
@@ -0,0 +1,91 @@
+set tcldir [file dirname [info script]]
+source [file join $tcldir project.tcl]
+
+# Project names
+set project_name "project_1"
+set design_name "design_1"
+set hls_solution_name "solution1"
+set ps_name "zynq_ultra_ps_e_0"
+set acc_name "${myproject}_axi_0"
+
+# Board and chip part names (NOTE(review): axi_lite_design.tcl for this board uses part xczu3eg-sbva484-1-e; confirm xczu9eg is intended)
+create_project ${project_name} ${myproject}_vivado_accelerator -part xczu9eg-ffvb1156-2-e -force
+set_property board_part avnet.com:ultra96v2:part0:1.2 [current_project]
+
+# Create block design
+create_bd_design ${design_name}
+
+# Setup IP repo
+# (the packaged IP is taken from the HLS solution's impl/ip directory rather than the HLS project root)
+set_property  ip_repo_paths ${myproject}_prj/${hls_solution_name}/impl/ip [current_project]
+update_ip_catalog
+
+# Create and setup PS
+create_bd_cell -type ip -vlnv xilinx.com:ip:zynq_ultra_ps_e:3.3 ${ps_name}
+apply_bd_automation -rule xilinx.com:bd_rule:zynq_ultra_ps_e -config {apply_board_preset "1" }  [get_bd_cells ${ps_name}]
+set_property -dict [list CONFIG.PSU__USE__S_AXI_GP0 {1} CONFIG.PSU__SAXIGP0__DATA_WIDTH {32}] [get_bd_cells ${ps_name}]
+
+# Create accelerator
+create_bd_cell -type ip -vlnv xilinx.com:hls:${myproject}_axi:1.0 ${acc_name}
+
+# Wiring: each -config is built with [list ...] so that ${ps_name} and ${acc_name} are substituted (a braced literal would not be)
+apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config [list \
+    Clk_master {Auto} \
+    Clk_slave {Auto} \
+    Clk_xbar {Auto} \
+    Master "/${ps_name}/M_AXI_HPM0_FPD" \
+    Slave "/${acc_name}/s_axi_CTRL_BUS" \
+    intc_ip {New AXI Interconnect} \
+    master_apm {0}] [get_bd_intf_pins ${acc_name}/s_axi_CTRL_BUS]
+
+apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config [list \
+    Clk_master {Auto} \
+    Clk_slave "/${ps_name}/pl_clk0 (100 MHz)" \
+    Clk_xbar "/${ps_name}/pl_clk0 (100 MHz)" \
+    Master "/${ps_name}/M_AXI_HPM1_FPD" \
+    Slave "/${acc_name}/s_axi_CTRL_BUS" \
+    intc_ip {/ps8_0_axi_periph} \
+    master_apm {0}] [get_bd_intf_pins ${ps_name}/M_AXI_HPM1_FPD]
+
+apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config [list \
+    Clk_master "/${ps_name}/pl_clk0 (100 MHz)" \
+    Clk_slave {Auto} \
+    Clk_xbar {Auto} \
+    Master "/${acc_name}/m_axi_IN_BUS" \
+    Slave "/${ps_name}/S_AXI_HPC0_FPD" \
+    intc_ip {Auto} \
+    master_apm {0}] [get_bd_intf_pins ${ps_name}/S_AXI_HPC0_FPD]
+
+apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config [list \
+    Clk_master "/${ps_name}/pl_clk0 (100 MHz)" \
+    Clk_slave "/${ps_name}/pl_clk0 (100 MHz)" \
+    Clk_xbar "/${ps_name}/pl_clk0 (100 MHz)" \
+    Master "/${acc_name}/m_axi_OUT_BUS" \
+    Slave "/${ps_name}/S_AXI_HPC0_FPD" \
+    intc_ip {/axi_smc} \
+    master_apm {0}] [get_bd_intf_pins ${acc_name}/m_axi_OUT_BUS]
+
+# Wiring interrupt signal
+connect_bd_net [get_bd_pins ${acc_name}/interrupt] [get_bd_pins ${ps_name}/pl_ps_irq0]
+
+# Top level wrapper
+make_wrapper -files [get_files ./${myproject}_vivado_accelerator/${project_name}.srcs/sources_1/bd/${design_name}/${design_name}.bd] -top
+add_files -norecurse ./${myproject}_vivado_accelerator/${project_name}.srcs/sources_1/bd/${design_name}/hdl/${design_name}_wrapper.v
+
+# Memory mapping
+delete_bd_objs [get_bd_addr_segs -excluded ${acc_name}/Data_m_axi_IN_BUS/SEG_${ps_name}_HPC0_LPS_OCM]
+delete_bd_objs [get_bd_addr_segs -excluded ${acc_name}/Data_m_axi_OUT_BUS/SEG_${ps_name}_HPC0_LPS_OCM]
+
+# Run synthesis and implementation
+reset_run impl_1
+reset_run synth_1
+launch_runs impl_1 -to_step write_bitstream -jobs 6
+wait_on_run -timeout 360 impl_1
+
+# Reporting
+open_run impl_1
+report_utilization -file util.rpt -hierarchical -hierarchical_percentages
+
+# Export HDF file for SDK flow
+file mkdir ./hdf
+file copy -force ${myproject}_vivado_accelerator/${project_name}.runs/impl_1/${design_name}_wrapper.sysdef ./hdf/${design_name}_wrapper.hdf
diff --git a/hls4ml/templates/vivado_accelerator/ultra96v2/tcl_scripts/axi_stream_design.tcl b/hls4ml/templates/vivado_accelerator/ultra96v2/tcl_scripts/axi_stream_design.tcl
new file mode 100644
index 0000000000..4721b59941
--- /dev/null
+++ b/hls4ml/templates/vivado_accelerator/ultra96v2/tcl_scripts/axi_stream_design.tcl
@@ -0,0 +1,58 @@
+# TODO: try to remove the startgroup/endgroup pairs and check that the script still works
+set tcldir [file dirname [info script]]
+source [file join $tcldir project.tcl]
+# project.tcl is expected to define ${myproject}, ${bit_width_hls_input} and ${bit_width_hls_output} (all used below).
+create_project project_1 ${myproject}_vivado_accelerator -part xczu9eg-ffvb1156-2-e -force
+# NOTE(review): the part (xczu9eg) and board_part (ultra96) differ from the sibling ultra96v2 scripts -- confirm they are intended.
+set_property board_part em.avnet.com:ultra96:part0:1.2 [current_project]
+set_property  ip_repo_paths  ${myproject}_prj [current_project]
+update_ip_catalog
+# Block design; the IP repo path is then re-pointed at the HLS solution's impl/ip directory (replaces the path set above).
+create_bd_design "design_1"
+set_property  ip_repo_paths ${myproject}_prj/solution1/impl/ip [current_project]
+update_ip_catalog
+# Zynq UltraScale+ processing system, configured from the board preset
+startgroup
+create_bd_cell -type ip -vlnv xilinx.com:ip:zynq_ultra_ps_e:3.3 zynq_ultra_ps_e_0
+endgroup
+
+apply_bd_automation -rule xilinx.com:bd_rule:zynq_ultra_ps_e -config {apply_board_preset "1" }  [get_bd_cells zynq_ultra_ps_e_0]
+
+set_property -dict [list CONFIG.PSU__USE__S_AXI_GP0 {1} CONFIG.PSU__SAXIGP0__DATA_WIDTH {32}] [get_bd_cells zynq_ultra_ps_e_0]
+# AXI DMA with scatter-gather disabled (c_include_sg 0); data widths come from the bit_width_hls_* variables
+startgroup
+create_bd_cell -type ip -vlnv xilinx.com:ip:axi_dma:7.1 axi_dma_0
+endgroup
+set_property -dict [list CONFIG.c_m_axi_s2mm_data_width.VALUE_SRC USER CONFIG.c_s_axis_s2mm_tdata_width.VALUE_SRC USER] [get_bd_cells axi_dma_0]
+set_property -dict [list CONFIG.c_include_sg {0} CONFIG.c_sg_length_width {26} CONFIG.c_sg_include_stscntrl_strm {0} CONFIG.c_m_axi_mm2s_data_width ${bit_width_hls_input} CONFIG.c_m_axis_mm2s_tdata_width ${bit_width_hls_input} CONFIG.c_mm2s_burst_size {256} CONFIG.c_m_axi_s2mm_data_width ${bit_width_hls_output} CONFIG.c_s_axis_s2mm_tdata_width ${bit_width_hls_output} CONFIG.c_s2mm_burst_size {256}] [get_bd_cells axi_dma_0]
+# AXI memory-mapped wiring between the PS and the DMA (control on S_AXI_LITE, data on S_AXI_HPC0_FPD)
+startgroup
+apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {Auto} Clk_xbar {Auto} Master {/zynq_ultra_ps_e_0/M_AXI_HPM0_FPD} Slave {/axi_dma_0/S_AXI_LITE} ddr_seg {Auto} intc_ip {New AXI Interconnect} master_apm {0}}  [get_bd_intf_pins axi_dma_0/S_AXI_LITE]
+apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {Auto} Clk_xbar {Auto} Master {/axi_dma_0/M_AXI_MM2S} Slave {/zynq_ultra_ps_e_0/S_AXI_HPC0_FPD} ddr_seg {Auto} intc_ip {New AXI SmartConnect} master_apm {0}}  [get_bd_intf_pins zynq_ultra_ps_e_0/S_AXI_HPC0_FPD]
+endgroup
+
+startgroup
+apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {/zynq_ultra_ps_e_0/pl_clk0 (99 MHz)} Clk_xbar {/zynq_ultra_ps_e_0/pl_clk0 (99 MHz)} Master {/axi_dma_0/M_AXI_S2MM} Slave {/zynq_ultra_ps_e_0/S_AXI_HPC0_FPD} ddr_seg {Auto} intc_ip {/axi_smc} master_apm {0}}  [get_bd_intf_pins axi_dma_0/M_AXI_S2MM]
+apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {/zynq_ultra_ps_e_0/pl_clk0 (99 MHz)} Clk_xbar {/zynq_ultra_ps_e_0/pl_clk0 (99 MHz)} Master {/zynq_ultra_ps_e_0/M_AXI_HPM1_FPD} Slave {/axi_dma_0/S_AXI_LITE} ddr_seg {Auto} intc_ip {/ps8_0_axi_periph} master_apm {0}}  [get_bd_intf_pins zynq_ultra_ps_e_0/M_AXI_HPM1_FPD]
+endgroup
+# HLS accelerator: the DMA MM2S stream is connected to in_r, and out_r to the DMA S2MM stream
+startgroup
+create_bd_cell -type ip -vlnv xilinx.com:hls:${myproject}_axi:1.0 ${myproject}_axi_0
+endgroup
+connect_bd_intf_net [get_bd_intf_pins axi_dma_0/M_AXIS_MM2S] [get_bd_intf_pins ${myproject}_axi_0/in_r]
+connect_bd_intf_net [get_bd_intf_pins axi_dma_0/S_AXIS_S2MM] [get_bd_intf_pins ${myproject}_axi_0/out_r]
+# Clock/reset automation for the accelerator, then group the DMA and the accelerator into one hierarchy cell
+apply_bd_automation -rule xilinx.com:bd_rule:clkrst -config { Clk {/zynq_ultra_ps_e_0/pl_clk0 (99 MHz)} Freq {100} Ref_Clk0 {} Ref_Clk1 {} Ref_Clk2 {}}  [get_bd_pins ${myproject}_axi_0/ap_clk]
+group_bd_cells hier_0 [get_bd_cells axi_dma_0] [get_bd_cells ${myproject}_axi_0]
+# Generate the top-level HDL wrapper for the block design and add it to the project
+make_wrapper -files [get_files ./${myproject}_vivado_accelerator/project_1.srcs/sources_1/bd/design_1/design_1.bd] -top
+
+add_files -norecurse ./${myproject}_vivado_accelerator/project_1.srcs/sources_1/bd/design_1/hdl/design_1_wrapper.v
+# Run implementation through write_bitstream, then write the utilization report
+reset_run impl_1
+reset_run synth_1
+launch_runs impl_1 -to_step write_bitstream -jobs 6
+wait_on_run -timeout 360 impl_1
+
+open_run impl_1
+report_utilization -file util.rpt -hierarchical -hierarchical_percentages
diff --git a/hls4ml/templates/vivado_accelerator_config.py b/hls4ml/templates/vivado_accelerator_config.py
index 2896d3d144..9b524120f6 100644
--- a/hls4ml/templates/vivado_accelerator_config.py
+++ b/hls4ml/templates/vivado_accelerator_config.py
@@ -14,7 +14,7 @@ def __init__(self, config, model_inputs, model_outputs):
             self.part = board_info['part']
         else:
             raise Exception('The board does not appear in supported_boards.json file')
-        
+
         if self.config.get('XilinxPart') is not None:
             if self.config.get('XilinxPart') != self.part:
                 print('WARNING: You set a XilinxPart that does not correspond to the Board you specified. The correct '
@@ -29,7 +29,7 @@ def __init__(self, config, model_inputs, model_outputs):
                 if prec.get('Input') is None or prec.get('Output') is None:
                     raise Exception('Input and Output fields must be provided in the AcceleratorConfig->Precision')
         else:
-            accel_config = {'Precision': 
+            accel_config = {'Precision':
                                 {
                                     'Input': 'float',
                                     'Output': 'float'
@@ -61,16 +61,16 @@ def __init__(self, config, model_inputs, model_outputs):
         if out_axi_t not in ['float', 'double']:
             self.output_type = self._next_factor8_type(config.backend.convert_precision_string(out_axi_t))
 
-        if self.input_type is 'float':
+        if inp_axi_t == 'float':
             self.input_bitwidth = 32
-        elif self.input_type is 'double':
+        elif inp_axi_t == 'double':
             self.input_bitwidth = 64
         else:
             self.input_bitwidth = config.backend.convert_precision_string(inp_axi_t).width
 
-        if out_axi_t is 'float':
+        if out_axi_t == 'float':
             self.output_bitwidth = 32
-        elif out_axi_t is 'double':
+        elif out_axi_t == 'double':
             self.output_bitwidth = 64
         else:
             self.output_bitwidth = config.backend.convert_precision_string(out_axi_t).width
@@ -120,11 +120,21 @@ def get_board(self):
 
     def get_driver_path(self):
         return '../templates/vivado_accelerator/' + self.board + '/' + self.driver + '_drivers/' + \
-               self.get_driver_file()
+               self.get_driver_files()
+
+    def get_vivado_ip_wrapper_path(self):  # relative path of this board's Verilog wrapper directory
+        return '/'.join(('../templates/vivado_accelerator', self.board, 'verilog_wrappers'))
+
+    def get_vivado_constraints_path(self):  # relative path of this board's XDC constraints directory
+        return '/'.join(('../templates/vivado_accelerator', self.board, 'xdc_constraints'))
 
-    def get_driver_file(self):
-        driver_ext = '.py' if self.driver == 'python' else '.h'
-        return self.interface + '_driver' + driver_ext
+    def get_driver_files(self):
+        """Return the driver artifact name: the 'sdk' directory for 'c', '<interface>_driver.py' for 'python'."""
+        if self.driver == 'c':
+            return 'sdk'
+        if self.driver == 'python':
+            return self.interface + '_driver.py'
+        raise Exception('Unsupported driver "{}": expected "c" or "python"'.format(self.driver))
 
     def get_input_type(self):
         return self.input_type
diff --git a/hls4ml/templates/vivado_template.py b/hls4ml/templates/vivado_template.py
index 149b52f1d5..6170ff5f9e 100644
--- a/hls4ml/templates/vivado_template.py
+++ b/hls4ml/templates/vivado_template.py
@@ -14,11 +14,13 @@
     static const unsigned reuse_factor = {reuse};
     static const unsigned n_zeros = {nzeros};
     static const unsigned n_nonzeros = {nonzeros};
+    static const bool merged_relu = {merged_relu};
     static const bool store_weights_in_bram = false;
     typedef {accum_t} accum_t;
     typedef {bias_t} bias_t;
     typedef {weight_t} weight_t;
     typedef {index_t} index_t;
+    typedef {out_t}:: value_type out_t;
     template<class x_T, class y_T, class res_T>
     using product = nnet::product::{product_type}<x_T, y_T, res_T>;
 }};\n"""
@@ -65,9 +67,11 @@
     static const unsigned n_out = {n_out};
     static const unsigned reuse_factor = {reuse};
     static const unsigned strategy = nnet::{strategy};
+    static const bool merged_relu = {merged_relu};
     typedef {accum_t} accum_t;
     typedef {bias_t} bias_t;
     typedef {weight_t} weight_t;
+    typedef {out_t}:: value_type out_t;
     template<class x_T, class y_T, class res_T>
     using product = nnet::product::{product_type}<x_T, y_T, res_T>;
 }};\n"""
@@ -386,6 +390,7 @@ def __init__(self, name='Vivado'):
         super(VivadoBackend, self).__init__(name)
         self.register_templates('Dense', dense_function_template, dense_config_template, dense_include_list)
         self.register_templates('BinaryDense'            , dense_function_template,       dense_config_template, dense_include_list)
+        self.register_templates('DenseBatchnorm'         , dense_function_template,       dense_config_template, dense_include_list)
         self.register_templates('BatchNormalization'     , batchnorm_function_template,   batchnorm_config_template, batchnorm_include_list)
         self.register_templates('Conv1D'                 , conv1d_function_template,      [conv1d_config_template, conv_mult_config_template], conv1d_include_list)
         self.register_templates('Conv2D'                 , conv2d_function_template,      [conv2d_config_template, conv_mult_config_template], conv2d_include_list)
diff --git a/hls4ml/utils/config.py b/hls4ml/utils/config.py
index b907350bb4..1a99b888b2 100644
--- a/hls4ml/utils/config.py
+++ b/hls4ml/utils/config.py
@@ -108,7 +108,7 @@ def config_from_keras_model(model, granularity='model', default_precision='ap_fi
     norm_layers = ['BatchNormalization']
     activation_layers = ['Activation', 'LeakyReLU', 'ThresholdedReLU', 'ELU', 'PReLU', 'Softmax', 'ReLU']
     merge_layers = ['Add', 'Subtract', 'Multiply', 'Average', 'Maximum', 'Minimum', 'Concatenate', 'Dot']
-    qkeras_layers = ['QDense', 'QActivation', 'QConv1D', 'QConv2D', 'QBatchNormalization', 'QConv2DBatchnorm']
+    qkeras_layers = ['QDense', 'QActivation', 'QConv1D', 'QConv2D', 'QBatchNormalization', 'QConv2DBatchnorm', 'QDenseBatchnorm']
     #Define layers to skip because they're not configurable or not converted to HLS
     skip_layers = ['Dropout', 'Flatten', 'Reshape', 'Permute']
     #All supported layers
diff --git a/hls4ml/writer/vivado_accelerator_writer.py b/hls4ml/writer/vivado_accelerator_writer.py
index c5206e002d..d1f84a5db6 100644
--- a/hls4ml/writer/vivado_accelerator_writer.py
+++ b/hls4ml/writer/vivado_accelerator_writer.py
@@ -1,5 +1,6 @@
 import os
 from shutil import copyfile
+from shutil import copytree
 
 from hls4ml.templates.vivado_accelerator_config import VivadoAcceleratorConfig
 from hls4ml.writer.vivado_writer import VivadoWriter
@@ -98,8 +99,21 @@ def write_axi_wrapper(self, model):
                 elif io_type == 'io_stream':
                     newline += indent + 'hls::stream<' + inp.type.name + '> in_local("input_1");\n'
                     newline += indent + 'hls::stream<' + out.type.name + '> out_local("output_1");\n\n'
-                    newline += indent + '#pragma HLS STREAM variable=in_local depth=N_IN\n'
-                    newline += indent + '#pragma HLS STREAM variable=out_local depth=N_OUT\n'
+                    # Optional depth overrides: HLSConfig -> LayerName -> in_local/out_local -> StreamDepth.
+                    # The two streams are looked up independently, so configuring only one of them is honoured
+                    # (a single try/except around both lookups drops out_local whenever in_local is absent).
+                    layer_config = model.config.config.get('HLSConfig', {}).get('LayerName', {})
+                    in_local_depth = layer_config.get('in_local', {}).get('StreamDepth')
+                    out_local_depth = layer_config.get('out_local', {}).get('StreamDepth')
+                    # A missing or zero depth falls back to the N_IN / N_OUT macros.
+                    if in_local_depth:
+                        newline += indent + '#pragma HLS STREAM variable=in_local depth={}\n'.format(in_local_depth)
+                    else:
+                        newline += indent + '#pragma HLS STREAM variable=in_local depth=N_IN\n'
+                    if out_local_depth:
+                        newline += indent + '#pragma HLS STREAM variable=out_local depth={}\n'.format(out_local_depth)
+                    else:
+                        newline += indent + '#pragma HLS STREAM variable=out_local depth=N_OUT\n'
             elif '//hls-fpga-machine-learning insert call' in line:
                 newline = indent + '{}(in_local, out_local, in_size, out_size);\n'.format(
                     model.config.get_project_name())
@@ -196,7 +210,9 @@ def modify_build_script(self, model):
         fout = open(newfile, 'w')
 
         for line in f.readlines():
-            if 'set_top' in line:
+            if 'set filename myproject_prj/solution1/sim/verilog/myproject.tcl' in line:
+                newline = line[:-5] + '_axi.tcl\n'  # strip '.tcl\n', then restore the extension after the _axi suffix
+            elif 'set_top' in line:
                 newline = line[:-1] + '_axi\n'  # remove the newline from the line end and append _axi for the new top
                 newline += 'add_files firmware/{}_axi.cpp -cflags "-std=c++0x"\n'.format(
                     model.config.get_project_name())
@@ -317,6 +333,13 @@ def write_board_script(self, model):
         filedir = os.path.dirname(os.path.abspath(__file__))
         copyfile(os.path.join(filedir, self.vivado_accelerator_config.get_tcl_file_path()),
                  '{}/design.tcl'.format(model.config.get_output_dir()))
+        if self.vivado_accelerator_config.get_interface() == 'axi_master' and self.vivado_accelerator_config.board == "arty-a7-100t":
+            copytree(os.path.join(filedir, self.vivado_accelerator_config.get_vivado_ip_wrapper_path()),
+                     '{}/'.format(model.config.get_output_dir()),
+                     dirs_exist_ok=True)
+            copytree(os.path.join(filedir, self.vivado_accelerator_config.get_vivado_constraints_path()),
+                     '{}/'.format(model.config.get_output_dir()),
+                     dirs_exist_ok=True)
         f = open('{}/project.tcl'.format(model.config.get_output_dir()), 'w')
         f.write('variable myproject\n')
         f.write('set myproject "{}"\n'.format(model.config.get_project_name()))
@@ -324,17 +347,138 @@ def write_board_script(self, model):
             in_bit, out_bit = self.vivado_accelerator_config.get_io_bitwidth()
             f.write('set bit_width_hls_output {}\n'.format(in_bit))
             f.write('set bit_width_hls_input {}\n'.format(out_bit))
+        if model.config.config['HLSConfig']['Model'].get('FIFO_opt'):
+            f.write('set fifo_opt 1\n')
+        else:
+            f.write('set fifo_opt 0\n')
+        if model.config.config['HLSConfig']['Model'].get('EEMBC_power'):
+            f.write('set eembc_power 1\n')
+        else:
+            f.write('set eembc_power 0\n')
         f.close()
 
+    def write_header_file(model, X, y, y_keras, y_hls, n_samples, filename='data.h'):
+        """Write the first `n_samples` rows of the inputs and reference outputs to a C header.
+
+        NOTE(review): declared in the class body without `self`; the signature is left untouched.
+        - model: provides `config`, `get_input_variables()` and `get_output_variables()` for the AXI types.
+        - X, y, y_keras, y_hls: 2-D arrays (their `.shape` is unpacked into two values), indexed as a[s][i].
+        - n_samples: number of leading rows emitted for every array (also written as N_SAMPLES).
+        - filename: path of the header to create or overwrite.
+
+        Generated layout, in order:
+          - include guard `__DATA_H__`, a comment with the row count of X, and `N_SAMPLES`
+          - `N_X_INPUTS` / `data_X_inputs` (first type returned by get_corrected_types())
+          - `N_Y_OUTPUTS` / `data_y_outputs` (ground truth, float)
+          - `N_Y_KERAS_OUTPUTS` / `data_y_keras_outputs` (float)
+          - `N_Y_HLS_OUTPUTS` / `data_y_hls_outputs` (csim, second type returned by get_corrected_types())
+
+        Shape unpacking raises ValueError for arrays that are not 2-D; an out-of-range row or column
+        index propagates from the array indexing.
+        """
+        vivado_accelerator_config = VivadoAcceleratorConfig(model.config, model.get_input_variables(),
+                                                            model.get_output_variables())
+        inp_axi_t, out_axi_t, _, _ = vivado_accelerator_config.get_corrected_types()
+        n_X_samples, n_X_inputs = X.shape
+        _, n_y_outputs = y.shape
+        _, n_y_keras_outputs = y_keras.shape
+        _, n_y_hls_outputs = y_hls.shape
+
+        def write_array(header_file, ctype, name, width_macro, width, data):
+            # Emit '#define <width_macro> <width>' followed by a flat const array of n_samples rows.
+            header_file.write('#define {} {}\n'.format(width_macro, width))
+            header_file.write('const {} {}[N_SAMPLES*{}] = {{\n'.format(ctype, name, width_macro))
+            for s in range(n_samples):
+                row = ''.join('{}, '.format(data[s][i]) for i in range(width))
+                header_file.write('    ' + row + '\n')
+            header_file.write('};\n')
+
+        # `with` guarantees the header is closed even if a row lookup raises part-way through.
+        with open(filename, 'w') as header_file:
+            header_file.write('#ifndef __DATA_H__\n')
+            header_file.write('#define __DATA_H__\n')
+            header_file.write('/* out of {} */\n'.format(n_X_samples))
+            header_file.write('#define N_SAMPLES {}\n'.format(n_samples))
+            header_file.write('\n')
+            write_array(header_file, inp_axi_t, 'data_X_inputs', 'N_X_INPUTS', n_X_inputs, X)
+            header_file.write('\n')
+
+            header_file.write('/* Ground truth - for validation */\n')
+            write_array(header_file, 'float', 'data_y_outputs', 'N_Y_OUTPUTS', n_y_outputs, y)
+            header_file.write('\n')
+
+            header_file.write('/* Keras outputs - for validation */\n')
+            write_array(header_file, 'float', 'data_y_keras_outputs', 'N_Y_KERAS_OUTPUTS', n_y_keras_outputs, y_keras)
+            header_file.write('\n')
+
+            header_file.write('/* csim outputs - for verification */\n')
+            write_array(header_file, out_axi_t, 'data_y_hls_outputs', 'N_Y_HLS_OUTPUTS', n_y_hls_outputs, y_hls)
+            header_file.write('#endif\n')
+ 
     def write_driver(self, model):
         filedir = os.path.dirname(os.path.abspath(__file__))
-        copyfile(os.path.join(filedir, self.vivado_accelerator_config.get_driver_path()),
-                 ('{}/' + self.vivado_accelerator_config.get_driver_file()).format(model.config.get_output_dir()))
+        srcfiles = os.path.join(filedir, self.vivado_accelerator_config.get_driver_path())
+        dstfiles = ('{}/' + self.vivado_accelerator_config.get_driver_files()).format(model.config.get_output_dir())
+        if os.path.isdir(srcfiles):
+            copytree(srcfiles, dstfiles, dirs_exist_ok=True)
+        else:
+            copyfile(srcfiles, dstfiles)
         
     def write_new_tar(self, model):
         os.remove(model.config.get_output_dir() + '.tar.gz')
         super(VivadoAcceleratorWriter, self).write_tar(model)
-        
+
+    def apply_patches(self, model):
+        '''
+        Apply hard-coded source patches to the generated AXI wrapper.
+
+        Rewrites `<output_dir>/firmware/<project>_axi.h` so that `input_axi_t` is `ap_uint<8>`, and
+        `<project>_axi.cpp` so that each input is loaded into an `ap_ufixed<16,8>` and shifted right by 8
+        before conversion. Each file is written to a `*_patch` sibling and then moved over the original.
+
+        - model: only `config.get_output_dir()` and `config.get_project_name()` are used.
+        NOTE(review): the 8-bit input type and the shift amount are fixed, not derived from the model.
+        '''
+        indent = '    '
+        axi_base = '{}/firmware/{}_axi'.format(model.config.get_output_dir(), model.config.get_project_name())
+
+        def rewrite(extension, patch_line):
+            # Stream <axi_base><extension> through patch_line() into a temporary sibling, then swap it in.
+            oldfile = axi_base + extension
+            newfile = axi_base + '_patch' + extension
+            # `with` closes both handles even if reading or writing fails part-way.
+            with open(oldfile, 'r') as f, open(newfile, 'w') as fout:
+                for line in f:
+                    fout.write(patch_line(line))
+            # os.replace overwrites an existing destination on every platform,
+            # whereas os.rename raises on Windows when the destination exists.
+            os.replace(newfile, oldfile)
+
+        ###################
+        # patch myproject_axi.h
+        ###################
+        def patch_header(line):
+            if 'typedef' in line and 'input_axi_t;' in line:
+                # hardcoded ap_uint<8> input
+                return 'typedef ap_uint<8> input_axi_t;\n'
+            return line
+
+        rewrite('.h', patch_header)
+
+        ###################
+        # patch myproject_axi.cpp
+        ###################
+        def patch_source(line):
+            if 'ctype[j] = typename input_t::value_type' in line:
+                # the matched line is replaced by two hardcoded lines that widen the input
+                # and shift it right by 8 bits (divide by 256)
+                newline = indent * 3 + 'ap_ufixed<16,8> tmp = in[i * input_t::size + j]; // store 8 bit input in a larger temp variable\n'
+                newline += indent * 3 + 'ctype[j] = typename input_t::value_type(tmp >> 8); // shift right by 8 (div by 256) and select only the decimal of the larger temp variable\n'
+                return newline
+            return line
+
+        rewrite('.cpp', patch_source)
+
     def write_hls(self, model):
         """
         Write the HLS project. Calls the VivadoBackend writer, and extra steps for VivadoAccelerator/AXI interface
@@ -347,5 +491,7 @@ def write_hls(self, model):
         self.write_wrapper_test(model)
         self.write_axi_wrapper(model)
         self.modify_build_script(model)
+        if model.config.get_config_value('ApplyPatches'):
+            self.apply_patches(model)
         self.write_new_tar(model)
 
diff --git a/hls4ml/writer/vivado_writer.py b/hls4ml/writer/vivado_writer.py
index de7ff65543..b77e09a97b 100644
--- a/hls4ml/writer/vivado_writer.py
+++ b/hls4ml/writer/vivado_writer.py
@@ -663,7 +663,8 @@ def write_tar(self, model):
         ###################
         # Tarball output
         ###################
-
+        if os.path.isfile(model.config.get_output_dir() + '.tar.gz'):
+            return
         with tarfile.open(model.config.get_output_dir() + '.tar.gz', mode='w:gz') as archive:
             archive.add(model.config.get_output_dir(), recursive=True)
 
diff --git a/setup.py b/setup.py
index ebdda86d79..0d16f6acb8 100644
--- a/setup.py
+++ b/setup.py
@@ -30,7 +30,8 @@ def get_version(rel_path):
                         'six',
                         'pyyaml',
                         'h5py',
-                        'onnx>=1.4.0'],
+                        'onnx>=1.4.0',
+                        'pyDigitalWaveTools'],
       extras_require={
         'profiling': [
             'pandas',