diff --git a/example-models b/example-models index 0d4cc7277e..ff74f73dbc 160000 --- a/example-models +++ b/example-models @@ -1 +1 @@ -Subproject commit 0d4cc7277eac9bb9020e3d73a992dc15dbdcce4e +Subproject commit ff74f73dbc253d1aa7de1603ee10ede551919548 diff --git a/hls4ml/__init__.py b/hls4ml/__init__.py index 7c450cd347..0e55ee713e 100644 --- a/hls4ml/__init__.py +++ b/hls4ml/__init__.py @@ -1,6 +1,6 @@ from __future__ import absolute_import -__version__ = '0.5.1' +__version__ = '0.6.0' from hls4ml import converters from hls4ml import report diff --git a/hls4ml/converters/keras/core.py b/hls4ml/converters/keras/core.py index bb5ac5ec97..c284ddd3f9 100644 --- a/hls4ml/converters/keras/core.py +++ b/hls4ml/converters/keras/core.py @@ -104,7 +104,7 @@ def parse_activation_layer(keras_layer, input_names, input_shapes, data_reader, @keras_handler('BatchNormalization') def parse_batchnorm_layer(keras_layer, input_names, input_shapes, data_reader, config): - assert('BatchNormalization' in keras_layer['class_name'] or 'QConv2DBatchnorm' in keras_layer['class_name']) + assert('BatchNormalization' in keras_layer['class_name'] or 'QConv2DBatchnorm' in keras_layer['class_name'] or 'QDenseBatchnorm' in keras_layer['class_name']) layer = parse_default_keras_layer(keras_layer, input_names) diff --git a/hls4ml/converters/keras/qkeras_layers.py b/hls4ml/converters/keras/qkeras_layers.py index eecacd84bb..fed2daec0b 100644 --- a/hls4ml/converters/keras/qkeras_layers.py +++ b/hls4ml/converters/keras/qkeras_layers.py @@ -110,3 +110,12 @@ def parse_qconv2dbatchnorm_layer(keras_layer, input_names, input_shapes, data_re temp_shape = intermediate_shape batch_layer, out_shape = parse_batchnorm_layer(keras_layer, input_names, temp_shape, data_reader, config) return {**conv_layer, **batch_layer}, out_shape + +@keras_handler('QDenseBatchnorm') +def parse_qdensebatchnorm_layer(keras_layer, input_names, input_shapes, data_reader, config): + intermediate_shape = list() + dense_layer, 
shape_qdense = parse_qdense_layer(keras_layer, input_names, input_shapes, data_reader, config) + intermediate_shape.append(shape_qdense) + temp_shape = intermediate_shape + batch_layer, out_shape = parse_batchnorm_layer(keras_layer, input_names, temp_shape, data_reader, config) + return {**dense_layer, **batch_layer}, out_shape diff --git a/hls4ml/model/hls_layers.py b/hls4ml/model/hls_layers.py index c730d60ffd..1ec8cc7050 100644 --- a/hls4ml/model/hls_layers.py +++ b/hls4ml/model/hls_layers.py @@ -199,6 +199,7 @@ def __init__(self, shape, dim_names, proxy, **kwargs): self.shape = shape self.dim_names = dim_names self.type = proxy.type + self.cppname = proxy.name self.name = proxy.name self.size = proxy.size @@ -365,6 +366,7 @@ def __init__(self, model, name, attributes, inputs, outputs=None): self.set_attr('accum_t', accum_t.precision) self.reuse_factor = self.model.config.get_reuse_factor(self) self.target_cycles = self.model.config.get_target_cycles(self) + self.merged_relu = False layer_config = self.model.config.get_layer_config(self) for config_key, config_value in layer_config.items(): @@ -410,6 +412,10 @@ def get_output_variable(self, output_name=None): else: return next(iter(self.variables.values())) + def set_output_variable(self, output_name, output_value): + self.variables[output_name] = output_value + + def get_weights(self, var_name=None): if var_name: return self.weights[var_name] @@ -450,6 +456,8 @@ def make_array_variable(self, shape, dim_names, var_name='layer{index}_out', typ def make_stream_variable(self, shape, dim_names, var_name='layer{index}_out', type_name='layer{index}_t', precision=None, depth=0): pack_factor = self.model.config.get_layer_config_value(self, 'PackFactor', default=1) + if depth == 0: + depth = self.model.config.get_layer_config_value(self, 'StreamDepth', default=0) return StreamVariable(shape, dim_names, var_name=var_name, type_name=type_name, precision=precision, n_pack=pack_factor, depth=depth, index=self.index) @@ 
-541,6 +549,12 @@ def _default_config_params(self): def get_layer_precision(self): return self.precision + def get_merged_relu(self): + return self.merged_relu + + def set_merged_relu(self, merged_relu): + self.merged_relu = merged_relu # Bool flag to set merged_relu + # myproject.cpp/h def function_cpp(self): raise NotImplementedError @@ -589,7 +603,6 @@ def initialize(self): out_name = self.outputs[0] proxy = self.get_input_variable() out = InplaceVariable(shape, dims, proxy, index=self.get_input_node().index) - self.variables[out_name] = out self.model.register_output_variable(out_name, out) @@ -646,9 +659,61 @@ def config_cpp(self): params['nonzeros'] = self.get_weights('weight').nonzeros params['product_type'] = self.model.config.backend.product_type(self.get_input_variable().type.precision, self.get_weights('weight').type.precision) params['strategy'] = self.get_attr('strategy') - + params['merged_relu'] = "true" if self.get_merged_relu() else "false" + params['out_t'] = self.get_output_variable().type.name return self._config_template.format(**params) +class DenseBatchnorm(Dense): + def _get_folded_weights(self): + """ + Function to get the batchnorm folded weights. + This function converts the weights by folding batchnorm parameters into + the weight of QDense. 
The high-level equation: + W_fold = gamma * W / sqrt(variance + epsilon) + bias_fold = gamma * (bias - moving_mean) / sqrt(variance + epsilon) + beta + """ + kernel = self.model.get_weights_data(self.name, 'kernel') + bias = self.model.get_weights_data(self.name, 'bias') + if bias is None: + bias = 0 + + # get batchnorm weights and moving stats + gamma = self.model.get_weights_data(self.name, 'gamma') + beta = self.model.get_weights_data(self.name, 'beta') + moving_mean = self.model.get_weights_data(self.name, 'moving_mean') + moving_variance = self.model.get_weights_data(self.name, 'moving_variance') + # get the inversion factor so that we replace division by multiplication + inv = np.reciprocal(np.sqrt(moving_variance + self.get_attr('epsilon'))) + if gamma is not None: + inv *= gamma + + # wrap conv kernel and bias with bn parameters + folded_kernel = inv * kernel + folded_bias = inv * (bias - moving_mean) + beta + + return [folded_kernel, folded_bias] + + def initialize(self): + super(DenseBatchnorm, self).initialize() + folded_weights, folded_bias = self._get_folded_weights() + if self.model.config.is_resource_strategy(self) and self.model.config.backend.name in ['Vivado', 'VivadoAccelerator']: + self.weights['weight'].data_unquantized = np.transpose(folded_weights) + self.weights['weight'].data = self.get_attr('weight_quantizer')(self.weights['weight'].data_unquantized) + + else: + self.weights['weight'].data_unquantized = folded_weights + self.weights['weight'].data = self.get_attr('weight_quantizer')(folded_weights) + self.weights['bias'].data_unquantized = folded_bias + bias_q = self.get_attr('bias_quantizer') + if bias_q is not None: + self.weights['bias'].data = bias_q(folded_bias) + + def function_cpp(self): + return super(DenseBatchnorm, self).function_cpp() + + def config_cpp(self): + return super(DenseBatchnorm, self).config_cpp() + class Conv1D(Layer): def initialize(self): if self.get_attr('data_format') == 'channels_last': @@ -854,7 +919,9 @@ def 
initialize(self): else: shape = [self.attributes['n_filt'], self.attributes['out_height'], self.attributes['out_width']] dims = ['N_FILT_{}'.format(self.index), 'OUT_HEIGHT_{}'.format(self.index), 'OUT_WIDTH_{}'.format(self.index)] + self.attributes['intermediate_index'] = self.index self.add_output_variable(shape, dims) + self.intermediate_op = self.get_output_variable() self.add_weights(quantizer=self.get_attr('weight_quantizer')) self.add_bias(quantizer=self.get_attr('bias_quantizer')) if len(self.weights['weight'].data.shape) == 2: # This can happen if we assign weights of Dense layer to 1x1 Conv2D @@ -921,6 +988,8 @@ def config_cpp(self): mult_params['n_in'] = self.get_attr('n_chan') * self.get_attr('filt_height') * self.get_attr('filt_width') mult_params['n_out'] = self.get_attr('n_filt') mult_params['product_type'] = self.model.config.backend.product_type(self.get_input_variable().type.precision, self.get_weights('weight').type.precision) + mult_params['merged_relu'] = "true" if self.get_merged_relu() else "false" + mult_params['out_t'] = self.intermediate_op.type.name mult_config = self._config_template[1].format(**mult_params) return mult_config + '\n' + conv_config @@ -1865,6 +1934,7 @@ def _get_transforms_config(self, params): 'BinaryDense' : Dense, 'TernaryDense' : Dense, 'QDense' : Dense, + 'QDenseBatchnorm' : DenseBatchnorm, 'Conv1D' : Conv1D, 'QConv1D' : Conv1D, 'Conv2D' : Conv2D, diff --git a/hls4ml/model/hls_model.py b/hls4ml/model/hls_model.py index 0f9c11ae3a..aeec6158ea 100644 --- a/hls4ml/model/hls_model.py +++ b/hls4ml/model/hls_model.py @@ -59,6 +59,18 @@ def get_project_name(self): def get_output_dir(self): return self.get_config_value('OutputDir') + def get_merged_relu(self, default=None): + hls_config = self.config['HLSConfig'] + + model_config = hls_config.get('Model', None) + key = 'MergedRelu' + + if model_config is not None: + tempbool = model_config.get(key, default) + return tempbool + + return default + def 
get_layer_config_value(self, layer, key, default=None): hls_config = self.config['HLSConfig'] diff --git a/hls4ml/model/optimizer/__init__.py b/hls4ml/model/optimizer/__init__.py index 19915b553e..dc312e4121 100644 --- a/hls4ml/model/optimizer/__init__.py +++ b/hls4ml/model/optimizer/__init__.py @@ -12,6 +12,7 @@ from hls4ml.model.optimizer.passes.conv_same_pad import InsertZeroPaddingBeforeConv2D from hls4ml.model.optimizer.passes.pointwise import OptimizePointwiseConv from hls4ml.model.optimizer.passes.clone import CloneOutput +from hls4ml.model.optimizer.passes.relu_merge import MergeRelu from hls4ml.model.optimizer.passes.repack_stream import ReshapeStream, BroadcastStream, RemoveFinalReshape from hls4ml.model.optimizer.passes.transpose_opt import RemoveUselessTranspose from hls4ml.model.optimizer.passes.multi_dense import ReplaceMultidimensionalDenseWithConv @@ -40,6 +41,7 @@ register_pass('conv2d_same_pad', InsertZeroPaddingBeforeConv2D) register_pass('optimize_pointwise_conv', OptimizePointwiseConv) register_pass('clone_output', CloneOutput) +register_pass('relu_merge', MergeRelu) register_pass('remove_final_reshape', RemoveFinalReshape) register_pass('reshape_stream', ReshapeStream) register_pass('remove_useless_transpose', RemoveUselessTranspose) diff --git a/hls4ml/model/optimizer/passes/relu_merge.py b/hls4ml/model/optimizer/passes/relu_merge.py new file mode 100644 index 0000000000..9c98eaa714 --- /dev/null +++ b/hls4ml/model/optimizer/passes/relu_merge.py @@ -0,0 +1,48 @@ +from hls4ml.model.optimizer import OptimizerPass + +class MergeRelu(OptimizerPass): + def match(self, node): + supported_layers = ['Conv2D', 'Conv2DBatchnorm', 'Dense'] + is_match = node.get_input_node().__class__.__name__ in supported_layers + + # hls4ml names ReLU activations 'Activation' + is_match = is_match and (node.__class__.__name__ == 'Activation') + return is_match + + def transform(self, model, node): + # Merge ReLU and Convolution/Dense layer + previous_node = 
node.get_input_node() + previous_node.index = node.index + previous_node.set_merged_relu(True) # Turn on merged_relu flag for this Conv/Dense layer + if 'Conv2D' in previous_node.__class__.__name__: + if previous_node.get_attr('data_format') == 'channels_last': + shape = [previous_node.attributes['out_height'], previous_node.attributes['out_width'], previous_node.attributes['n_filt']] + dims = ['OUT_HEIGHT_{}'.format(previous_node.index), 'OUT_WIDTH_{}'.format(previous_node.index), 'N_FILT_{}'.format(previous_node.index)] + else: + shape = [previous_node.attributes['n_filt'], previous_node.attributes['out_height'], previous_node.attributes['out_width']] + dims = ['N_FILT_{}'.format(previous_node.index), 'OUT_HEIGHT_{}'.format(previous_node.index), 'OUT_WIDTH_{}'.format(previous_node.index)] + activation_precision, _ = model.config.get_precision(node, var='result') + previous_node.add_output_variable(shape, dims, precision=activation_precision) + if not node.get_output_nodes(): + print("WARNING: {} is the output layer! No rewiring performed.".format(node.name)) + model.remove_node(node, rewire=False) + else: + model.remove_node(node, rewire=True) + return True + elif 'Dense' in previous_node.__class__.__name__: + shape = previous_node.get_input_variable().shape[:] + shape[-1] = previous_node.attributes['n_out'] + if len(shape) > 1: + dims = ['N_LAYER_{}_{}'.format(i, previous_node.index) for i in range(1, len(shape) + 1)] + else: + dims = ['N_LAYER_{}'.format(previous_node.index)] + print('shape: {}'.format(shape)) + print('dims: {}'.format(dims)) + activation_precision, _ = model.config.get_precision(node, var='result') + previous_node.add_output_variable(shape, dims, precision=activation_precision) + if not node.get_output_nodes(): + print("WARNING: {} is the output layer! 
No rewiring performed.".format(node.name)) + model.remove_node(node, rewire=False) + else: + model.remove_node(node, rewire=True) + return True \ No newline at end of file diff --git a/hls4ml/model/profiling.py b/hls4ml/model/profiling.py index 9aeb38be98..961153ab3e 100644 --- a/hls4ml/model/profiling.py +++ b/hls4ml/model/profiling.py @@ -1,3 +1,6 @@ +from pyDigitalWaveTools.vcd.parser import VcdParser + +import hls4ml from hls4ml.model.hls_model import HLSModel from hls4ml.model.hls_layers import IntegerPrecisionType, FixedPrecisionType import matplotlib.pyplot as plt @@ -26,6 +29,101 @@ __torch_profiling_enabled__ = False +def optimize_fifos_depth(hls_model, init_large_fifo=True, reset=True, csim=True, synth=True, + cosim=True, validation=True, export=True, vsynth=True, **kwargs,): + + cfg = hls_model.config.config.copy() + hls_config = cfg['HLSConfig'] + out_dir = hls_model.config.get_output_dir() + + values = [] + + def populate_values(name, data, depth): + values.append({'name': name, 'data': [], 'max': 0, 'depth': 0}) + get_values = lambda x: int(x[1][1:], 2) + values[-1]['data'] = [get_values(x) for x in data] + values[-1]['max'] = max(values[-1]['data']) + values[-1]['depth'] = int(depth[1:], 2) + + if not hls_config['Model']['FIFO_opt']: + raise Exception('To use this optimization you have to set `FIFO_opt` field to True in the HLS config') + + + # initialize all the fifos to 10000 so that they will be automatically implemented in BRAMs and so they will be + # profiled + + if init_large_fifo: + + for k,_ in hls_model.output_vars.items(): + if k not in hls_config['LayerName']: + hls_config['LayerName'][k] = {'StreamDepth': 10000} + else: + hls_config['LayerName'][k]['StreamDepth'] = 10000 + + if hls_model.config.get_config_value('Backend') == 'VivadoAccelerator': + hls_config['LayerName']['in_local'] = {'StreamDepth' : 10000} + hls_config['LayerName']['out_local'] = {'StreamDepth': 10000} + + cfg['OutputDir'] = out_dir + "_LARGE_FIFO" + cfg['HLSConfig'] = 
hls_config + hls_model = hls4ml.converters.keras_to_hls(cfg) + + + # run the build with FIFO_opt param set to 1 in order to generate the vcd file + hls_model.write() + hls_model.build(csim=True, cosim=True, synth=True, vsynth=False, export=False, validation=True) + + with open(hls_model.config.get_output_dir() + '/' + hls_model.config.get_project_name() + '_prj' + '/solution1/sim/verilog/fifo_opt.vcd') as vcd_file: + vcd = VcdParser() + vcd.parse(vcd_file) + data = vcd.scope.toJson() + + # wrapper fifos - useful only with VivadoAccelerator backend + if hls_model.config.get_config_value('Backend') == 'VivadoAccelerator': + for i in range(1, len(data['children'][0]['children'][0]['children'])): + populate_values(data['children'][0]['children'][0]['children'][i]['name'], + data['children'][0]['children'][0]['children'][i]['children'][0]['data'], + data['children'][0]['children'][0]['children'][i]['children'][1]['data'][0][1]) + + # model layers fifos + n_elem = len(data['children'][0]['children'][0]['children'][0]['children']) + for i in range(n_elem): + populate_values(data['children'][0]['children'][0]['children'][0]['children'][i]['name'], + data['children'][0]['children'][0]['children'][0]['children'][i]['children'][0]['data'], + data['children'][0]['children'][0]['children'][0]['children'][i]['children'][1]['data'][0][1]) + + maxs = [{'name': i['name'], 'max': i['max'], 'depth': i['depth']} for i in values] + + with open(hls_model.config.get_output_dir() + '/max_depth.json', 'w') as f: + json.dump(maxs, f, indent=4) + + new_config = cfg.copy()['HLSConfig'] + new_config['Model']['FIFO_opt'] = 0 + for k, v in hls_model.output_vars.items(): + filtered_max = [x['max'] for x in maxs if v.cppname in x['name']] + if len(filtered_max) == 0: + continue + if len(filtered_max) > 1: + print('WARNING! 
Check names of FIFOs') + if k not in new_config['LayerName']: + new_config['LayerName'][k] = {'StreamDepth': filtered_max[0] + 1} + else: + new_config['LayerName'][k]['StreamDepth'] = filtered_max[0] + 1 + for x in maxs: + if 'in_local' in x['name']: + new_config['LayerName']['in_local'] = {'StreamDepth': x['max'] + 1} + elif 'out_local' in x['name']: + new_config['LayerName']['out_local'] = {'StreamDepth': x['max'] + 1} + + cfg['OutputDir'] = out_dir + '_FIFO_OPT' + cfg['HLSConfig'] = new_config + hls_model = hls4ml.converters.keras_to_hls(cfg) + hls_model.write() + hls_model.build(reset=reset, csim=csim, synth=synth, cosim=cosim, validation=validation, export=export, vsynth=vsynth) + print('[hls4ml] - FIFO optimization completed') + return hls_model + + def get_unoptimized_hlsmodel(model): from hls4ml.converters import convert_from_config diff --git a/hls4ml/templates/supported_boards.json b/hls4ml/templates/supported_boards.json index 34d676d9cf..8f45dbad27 100644 --- a/hls4ml/templates/supported_boards.json +++ b/hls4ml/templates/supported_boards.json @@ -1,14 +1,32 @@ { + "pynq-z1": { + "part": "xc7z020clg400-1", + "tcl_scripts": {"axi_lite": "axi_lite_design.tcl", "axi_stream": "axi_stream_design.tcl", "axi_master": "axi_master_design.tcl"}, + "python_drivers": {"axi_stream": "axi_stream_driver.py"}, + "c_drivers": { "axi_master": "axi_master_design.c"} + }, "pynq-z2": { "part": "xc7z020clg400-1", - "tcl_scripts": {"axi_lite": "axi_lite_design.tcl", "axi_stream": "axi_stream_design.tcl"}, - "python_drivers": {"axi_stream": "axi_stream_driver.py"}, - "c_drivers": {} + "tcl_scripts": {"axi_lite": "axi_lite_design.tcl", "axi_stream": "axi_stream_design.tcl", "axi_master": "axi_master_design.tcl"}, + "python_drivers": {"axi_stream": "axi_stream_driver.py"}, + "c_drivers": { "axi_master": "axi_master_design.c"} }, "zcu102": { "part": "xczu9eg-ffvb1156-2-e", - "tcl_scripts": { "axi_stream": "axi_stream_design.tcl"}, - "python_drivers": {"axi_stream": 
"axi_stream_driver.py"}, + "tcl_scripts": {"axi_stream": "axi_stream_design.tcl"}, + "python_drivers": {"axi_stream": "axi_stream_driver.py"}, "c_drivers": {} + }, + "ultra96v2": { + "part": "xczu3eg-sbva484-1-e", + "tcl_scripts": {"axi_lite": "axi_lite_design.tcl", "axi_stream": "axi_stream_design.tcl", "axi_master": "axi_master_design.tcl"}, + "python_drivers": {"axi_stream": "axi_stream_driver.py"}, + "c_drivers": { "axi_master": "axi_master_design.c"} + }, + "arty-a7-100t": { + "part": "xc7a100tcsg324-1", + "tcl_scripts": {"axi_master": "axi_master_design.tcl"}, + "python_drivers": {}, + "c_drivers": { "axi_master": "axi_master_design.c"} } -} \ No newline at end of file +} diff --git a/hls4ml/templates/vivado/build_prj.tcl b/hls4ml/templates/vivado/build_prj.tcl index 0ec992cd4c..e0a47ab25b 100644 --- a/hls4ml/templates/vivado/build_prj.tcl +++ b/hls4ml/templates/vivado/build_prj.tcl @@ -11,6 +11,94 @@ array set opt { vsynth 0 } +set tcldir [file dirname [info script]] +source [file join $tcldir project.tcl] + +proc remove_recursive_log_wave {} { + global myproject + set timestamp [clock format [clock seconds] -format {%Y%m%d%H%M%S}] + + set filename ${myproject}_prj/solution1/sim/verilog/${myproject}_axi.tcl + set temp $filename.new.$timestamp + # set backup $filename.bak.$timestamp + + set in [open $filename r] + set out [open $temp w] + + # line-by-line, read the original file + while {[gets $in line] != -1} { + if {[string equal "$line" "log_wave -r /"]} { + set line { } + } + puts $out $line + } + + close $in + close $out + + # move the new data to the proper filename + file delete -force $filename + file rename -force $temp $filename +} + +proc add_vcd_instructions_tcl {} { + global myproject + set timestamp [clock format [clock seconds] -format {%Y%m%d%H%M%S}] + + set filename ${myproject}_prj/solution1/sim/verilog/${myproject}_axi.tcl + set temp $filename.new.$timestamp + # set backup $filename.bak.$timestamp + + set in [open $filename r] + set out 
[open $temp w] + + # line-by-line, read the original file + while {[gets $in line] != -1} { + if {[string equal "$line" "log_wave -r /"]} { + set line {current_scope [get_scopes -regex /apatb_myproject_axi_top/AESL_inst_myproject_axi/grp_myproject_fu_.*] +set scopes [get_scopes -regexp {layer(\d*)_.*data_0_V_U.*}] +current_scope /apatb_myproject_axi_top/AESL_inst_myproject_axi +append scopes { } +append scopes [get_scopes -regexp {.*local_V_data_0.*}] +open_vcd fifo_opt.vcd +foreach scope $scopes { + current_scope $scope + if {[catch [get_objects usedw]] == 0} { + puts "$scope skipped" + continue + } + set usedw [get_objects usedw] + set depth [get_objects DEPTH] + add_wave $usedw + log_vcd $usedw + log_wave $usedw + add_wave $depth + log_vcd $depth + log_wave $depth + } + } + + set line [string map [list "myproject" $myproject] $line] + } + + if {[string equal "$line" "quit"]} { + set line {flush_vcd +close_vcd +quit +} + } + # then write the transformed line + puts $out $line + } + + close $in + close $out + + # move the new data to the proper filename + file delete -force $filename + file rename -force $temp $filename +} + foreach arg $::argv { foreach o [lsort [array names opt]] { regexp "$o=+(\\w+)" $arg unused opt($o) @@ -91,7 +179,20 @@ if {$opt(cosim)} { # TODO: This is a workaround (Xilinx defines __RTL_SIMULATION__ only for SystemC testbenches). 
add_files -tb myproject_test.cpp -cflags "-std=c++0x -DRTL_SIM" set time_start [clock clicks -milliseconds] - cosim_design -trace_level all + + cosim_design -trace_level all -setup + + if {$fifo_opt} { + puts "\[hls4ml\] - FIFO optimization started" + add_vcd_instructions_tcl + } + + remove_recursive_log_wave + set old_pwd [pwd] + cd ${myproject}_prj/solution1/sim/verilog/ + source run_sim.tcl + cd $old_pwd + set time_end [clock clicks -milliseconds] puts "INFO:" puts [read [open myproject_prj/solution1/sim/report/myproject_cosim.rpt r]] diff --git a/hls4ml/templates/vivado/nnet_utils/nnet_dense_resource.h b/hls4ml/templates/vivado/nnet_utils/nnet_dense_resource.h index 756a627434..c6ee9479aa 100644 --- a/hls4ml/templates/vivado/nnet_utils/nnet_dense_resource.h +++ b/hls4ml/templates/vivado/nnet_utils/nnet_dense_resource.h @@ -261,6 +261,247 @@ void dense_resource_rf_gt_nin( } } +// Dense (with ReLU) +template +void dense_relu_resource_rf_leq_nin( + data_T data[CONFIG_T::n_in], + res_T res[CONFIG_T::n_out], + typename CONFIG_T::weight_t weights[CONFIG_T::n_in*CONFIG_T::n_out], + typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + + const int rufactor = CONFIG_T::reuse_factor; + const int multfactor = MIN(CONFIG_T::n_in,CONFIG_T::reuse_factor); + const int multiplier_limit = DIV_ROUNDUP(CONFIG_T::n_in*CONFIG_T::n_out, multfactor); + const int block_factor = DIV_ROUNDUP(CONFIG_T::n_in*CONFIG_T::n_out, CONFIG_T::reuse_factor); + const int multscale = multiplier_limit/CONFIG_T::n_out; + const int nin = CONFIG_T::n_in; + const int nout = CONFIG_T::n_out; + + assert((multiplier_limit % nout == 0 || rufactor >= nin) && "The current Reuse Factor is not allowed"); + assert((multiplier_limit == block_factor) && "This function is correct only for RF <= N_IN"); + + #pragma HLS function_instantiate variable=weights,biases + //#pragma HLS RESOURCE variable=weights core=RAM_2P_BRAM Commenting out the deisgnation HLS seems to choose correctly + #pragma HLS ARRAY_RESHAPE 
variable=weights block factor=block_factor + #pragma HLS ARRAY_PARTITION variable=biases complete + + typename CONFIG_T::accum_t acc[CONFIG_T::n_out]; + #pragma HLS ARRAY_PARTITION variable=acc complete + + InitAccum: + for (int iacc = 0; iacc < nout; iacc++) { + #pragma HLS UNROLL + acc[iacc] = (typename CONFIG_T::accum_t) biases[iacc]; + } + + ReuseLoop: + for (int ir = 0; ir < rufactor; ir++) { + #pragma HLS PIPELINE II=1 rewind + + int w_index = ir; + int in_index = ir; + int out_index = 0; + int acc_step = 0; + + MultLoop: + for (int im = 0; im < block_factor; im++) { + #pragma HLS UNROLL + + acc[out_index] += CONFIG_T::template product::product(data[in_index], weights[w_index]); + + // Increment w_index + w_index += rufactor; + // Increment in_index + in_index += rufactor; + if (in_index >= nin) { + in_index = ir; + } + // Increment out_index + if (acc_step + 1 >= multscale) { + acc_step = 0; + out_index++; + } else { + acc_step++; + } + } + } + + // Cast to "res_t" type + Result: + for (int ires = 0; ires < CONFIG_T::n_out; ires++) { + #pragma HLS UNROLL + typename CONFIG_T::out_t act = cast(acc[ires]); + if (act > 0) res[ires] = act; + else res[ires] = 0; + } +} + +template +void dense_relu_resource_rf_gt_nin_rem0( + data_T data[CONFIG_T::n_in], + res_T res[CONFIG_T::n_out], + typename CONFIG_T::weight_t weights[CONFIG_T::n_in*CONFIG_T::n_out], + typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + + const int rufactor = MIN(CONFIG_T::reuse_factor, CONFIG_T::n_in * CONFIG_T::n_out); + const int multfactor = MIN(CONFIG_T::n_in,CONFIG_T::reuse_factor); + const int multiplier_limit = DIV_ROUNDUP(CONFIG_T::n_in*CONFIG_T::n_out, multfactor); + const int block_factor = DIV_ROUNDUP(CONFIG_T::n_in*CONFIG_T::n_out, CONFIG_T::reuse_factor); + const int multscale = multiplier_limit/CONFIG_T::n_out; + const int nin = CONFIG_T::n_in; + const int nout = CONFIG_T::n_out; + + assert((multiplier_limit % nout == 0 || rufactor >= nin) && "The current Reuse Factor is not 
allowed"); + assert((rufactor > nin && rufactor % nin == 0) && "This function is correct only for RF > N_IN && RF % N_IN == 0"); + + #pragma HLS function_instantiate variable=weights,biases + //#pragma HLS RESOURCE variable=weights core=RAM_2P_BRAM Commenting out the deisgnation HLS seems to choose correctly + #pragma HLS ARRAY_RESHAPE variable=weights block factor=block_factor + #pragma HLS ARRAY_PARTITION variable=biases complete + + typename CONFIG_T::accum_t acc[CONFIG_T::n_out]; + #pragma HLS ARRAY_PARTITION variable=acc complete + + InitAccum: + for (int iacc = 0; iacc < nout; iacc++) { + #pragma HLS UNROLL + acc[iacc] = (typename CONFIG_T::accum_t) biases[iacc]; + } + + int w_index; + int in_index = 0; + int out_index; + int outstep = 0; + const int outscale = rufactor / nin; + + int outidx[rufactor]; + IndexLoop: + for (int ir = 0; ir < rufactor; ir++) { + outidx[ir] = outstep; + if ((ir + 1) % nin == 0) { + outstep++; + } + } + + ReuseLoop: + for (int ir = 0; ir < rufactor; ir++) { + #pragma HLS PIPELINE II=1 rewind + + w_index = ir; + out_index = outidx[ir]/*outstep*/; + + MultLoop: + for (int im = 0; im < block_factor; im++) { + #pragma HLS UNROLL + acc[out_index] += CONFIG_T::template product::product(data[in_index], weights[w_index]); + + w_index += rufactor; + if (w_index >= CONFIG_T::n_in * CONFIG_T::n_out) break; // check out of bounds + out_index += outscale; + } + + in_index++; + if (in_index >= nin) { + in_index = 0; + //outstep++; // This causes a huge increase in scheduling and RTL generation times, hence the above workaround. 
+ } + } + + // Cast to "res_t" type + Result: + for (int ires = 0; ires < CONFIG_T::n_out; ires++) { + #pragma HLS UNROLL + typename CONFIG_T::out_t act = cast(acc[ires]); + if (act > 0) res[ires] = act; + else res[ires] = 0; + } +} + +template +void dense_relu_resource_rf_gt_nin( + data_T data[CONFIG_T::n_in], + res_T res[CONFIG_T::n_out], + typename CONFIG_T::weight_t weights[CONFIG_T::n_in*CONFIG_T::n_out], + typename CONFIG_T::bias_t biases[CONFIG_T::n_out]) { + + const int rufactor = CONFIG_T::reuse_factor; + const int multfactor = MIN(CONFIG_T::n_in,CONFIG_T::reuse_factor); + const int multiplier_limit = DIV_ROUNDUP(CONFIG_T::n_in*CONFIG_T::n_out, multfactor); + const int block_factor = DIV_ROUNDUP(CONFIG_T::n_in*CONFIG_T::n_out, CONFIG_T::reuse_factor); + const int multscale = multiplier_limit/CONFIG_T::n_out; + const int nin = CONFIG_T::n_in; + const int nout = CONFIG_T::n_out; + + assert((multiplier_limit % nout == 0 || rufactor >= nin) && "The current Reuse Factor is not allowed"); + assert((rufactor > nin) && "This function is correct only for RF > N_IN"); + + #pragma HLS function_instantiate variable=weights,biases + //#pragma HLS RESOURCE variable=weights core=RAM_2P_BRAM Commenting out the deisgnation HLS seems to choose correctly + #pragma HLS ARRAY_RESHAPE variable=weights block factor=block_factor + #pragma HLS ARRAY_PARTITION variable=biases complete + + typename CONFIG_T::accum_t acc[CONFIG_T::n_out]; + #pragma HLS ARRAY_PARTITION variable=acc complete + + InitAccum: + for (int iacc = 0; iacc < nout; iacc++) { + #pragma HLS UNROLL + acc[iacc] = (typename CONFIG_T::accum_t) biases[iacc]; + } + + ReuseLoop: + for (int ir = 0; ir < rufactor; ir++) { + #pragma HLS PIPELINE II=1 rewind + typename CONFIG_T::accum_t tmpmult[block_factor]; + #pragma HLS ARRAY_PARTITION variable=tmpmult complete + + MultLoop: + for (int im = 0; im < block_factor; im++) { + #pragma HLS UNROLL + int w_index = ir + rufactor * im; + int in_index = w_index % nin; + if (w_index 
>= CONFIG_T::n_in*CONFIG_T::n_out) continue; // check out of bounds + tmpmult[im] = CONFIG_T::template product::product(data[in_index], weights[w_index]); + } + + typename CONFIG_T::accum_t mult[multiplier_limit]; + #pragma HLS ARRAY_PARTITION variable=mult complete + + ResetMult: + for (int imult = 0; imult < multiplier_limit; imult++) { + #pragma HLS UNROLL + mult[imult] = 0; + } + + AccumLoop1: + for (int im = 0; im < block_factor; im++) { + #pragma HLS UNROLL + int w_index = ir + rufactor * im; + int out_index = w_index / multfactor; + if (out_index >= multiplier_limit) continue; // check out of bounds + mult[out_index] += tmpmult[im]; + } + + AccumLoop2: + for (int im = 0; im < multiplier_limit; im++) { + #pragma HLS UNROLL + //int out_index = im/multscale; // This is the general case + //acc[out_index] += mult[im]; + acc[im] += mult[im]; // If RF > N_IN then multiplier_limit == n_out + } + } + + // Cast to "res_t" type + Result: + for (int ires = 0; ires < CONFIG_T::n_out; ires++) { + #pragma HLS UNROLL + typename CONFIG_T::out_t act = cast(acc[ires]); + if (act > 0) res[ires] = act; + else res[ires] = 0; + } +} + + template void dense_resource( data_T data[CONFIG_T::n_in], @@ -270,15 +511,25 @@ void dense_resource( #pragma HLS INLINE region - if (CONFIG_T::reuse_factor <= CONFIG_T::n_in) { - dense_resource_rf_leq_nin(data, res, weights, biases); - } else if (CONFIG_T::reuse_factor % CONFIG_T::n_in == 0) { - dense_resource_rf_gt_nin_rem0(data, res, weights, biases); - } else { - dense_resource_rf_gt_nin(data, res, weights, biases); - } + if (CONFIG_T::merged_relu) { + if (CONFIG_T::reuse_factor <= CONFIG_T::n_in) { + dense_relu_resource_rf_leq_nin(data, res, weights, biases); + } else if (CONFIG_T::reuse_factor % CONFIG_T::n_in == 0) { + dense_relu_resource_rf_gt_nin_rem0(data, res, weights, biases); + } else { + dense_relu_resource_rf_gt_nin(data, res, weights, biases); + } + } else { + if (CONFIG_T::reuse_factor <= CONFIG_T::n_in) { + 
dense_resource_rf_leq_nin(data, res, weights, biases); + } else if (CONFIG_T::reuse_factor % CONFIG_T::n_in == 0) { + dense_resource_rf_gt_nin_rem0(data, res, weights, biases); + } else { + dense_resource_rf_gt_nin(data, res, weights, biases); + } + } } } -#endif +#endif \ No newline at end of file diff --git a/hls4ml/templates/vivado_accelerator/arty-a7-100t/c_drivers/sdk/Makefile b/hls4ml/templates/vivado_accelerator/arty-a7-100t/c_drivers/sdk/Makefile new file mode 100644 index 0000000000..03ab9b8de7 --- /dev/null +++ b/hls4ml/templates/vivado_accelerator/arty-a7-100t/c_drivers/sdk/Makefile @@ -0,0 +1,33 @@ +DESIGN := design_1 + +help: + @echo "INFO: make to show targets" +.PHONY: help + +--setup: + xsct ./setup.tcl $(DESIGN) +.PHONY: --setup + +sdk: --setup + rm -f $(DESIGN)_standalone/src/helloworld.c + cd $(DESIGN)_standalone/src && ln -s ../../common/main.c main.c + cd $(DESIGN)_standalone/src && ln -s ../../common/data.h data.h +.PHONY: sdk + +gui: + xsdk --workspace . & +.PHONY: gui + +clean: + rm -rf $(DESIGN)_platform + rm -rf $(DESIGN)_standalone + rm -rf $(DESIGN)_standalone_bsp + rm -rf RemoteSystemsTempFiles + rm -rf .Xil + rm -rf .metadata + rm -f *.log +.PHONY: clean + +ultraclean: clean + rm -rf hdf/*.hdf +.PHONY: ultraclean diff --git a/hls4ml/templates/vivado_accelerator/arty-a7-100t/c_drivers/sdk/common/main.c b/hls4ml/templates/vivado_accelerator/arty-a7-100t/c_drivers/sdk/common/main.c new file mode 100644 index 0000000000..41f5dca282 --- /dev/null +++ b/hls4ml/templates/vivado_accelerator/arty-a7-100t/c_drivers/sdk/common/main.c @@ -0,0 +1,351 @@ +/** + * + * Set Heap Size in ldscript.ld to 0x1000000 (16MB) + * + */ + +#include "xmyproject_axi.h" /* TODO: design-dependent name */ +#include "stdio.h" /* PRINTF */ +#include "unistd.h" /* sleep */ +#include "stdlib.h" +#include "malloc.h" +#include "assert.h" +#include "xil_io.h" /* peripheral read/write wrappers */ +#include "platform.h" /* platform init/cleanup functions */ +#include 
"xil_cache.h" /* enable/disable caches etc */ +#include "xil_printf.h" /* UART debug print functions */ +#include "xparameters.h" /* peripherals base addresses */ +#include "xtmrctr.h" /* timer, Xilinx IP Timer Counter */ + +#include "data.h" + +#define EEMBC_POWER 1 + +#ifdef EEMBC_POWER +#include "xgpio.h" /* AXI GPIO drivers */ + +#define PIN 0x01 +#define GPIO_PMOD_PIN_DEVICE_ID XPAR_GPIO_0_DEVICE_ID + +#define set_pin_high(InstancePtr, Mask) \ + XGpio_DiscreteWrite(InstancePtr, 1, Mask) + +#define set_pin_low(InstancePtr, Mask) \ + XGpio_DiscreteClear(InstancePtr, 1, Mask) + +XGpio Gpio; +#endif + + +//#define __DEBUG__ + +#define MAX_PRINT_ELEMENTS (16) + +#define PRINTF printf + +const unsigned INPUT_N_ELEMENTS = N_SAMPLES * N_X_INPUTS; +const unsigned OUTPUT_N_ELEMENTS = N_SAMPLES * N_Y_OUTPUTS; + +#if 1 +/* Accelerator verification */ +#define REFERENCE_OUTPUTS data_y_hls_outputs +#else +/* Accelerator validation */ +#define REFERENCE_OUTPUTS data_y_outputs +//#define REFERENCE_OUTPUTS data_y_keras_outputs +#endif + +unsigned get_max(float *data, unsigned n_elements) { + float max_value = 0.0; + unsigned max_index = 0; + for (unsigned i = 0; i < n_elements; i++) + if (data[i] >= max_value) { + max_index = i; + max_value = data[i]; + } + return max_index; +} + +float *inputs_mem = NULL; +float *outputs_mem = NULL; +float *reference_mem = NULL; + +/* Accelerator configuration */ +XMyproject_axi accelerator; /* TODO: design-dependent name */ +XMyproject_axi_Config *accelerator_cfg; /* TODO: design-dependent name */ + +/* Accelerator initialization routine */ +void init_accelerators() { + PRINTF("INFO: Initializing accelerator\r\n"); + accelerator_cfg = XMyproject_axi_LookupConfig(XPAR_MYPROJECT_AXI_DEVICE_ID); /* TODO: design-dependent name */ + if (accelerator_cfg) { + int status = XMyproject_axi_CfgInitialize(&accelerator, accelerator_cfg); /* TODO: design-dependent name */ + if (status != XST_SUCCESS) { + PRINTF("ERROR: Initializing accelerator\r\n"); + } 
+ } +} + +/* Reference implementation of the accelerator in software */ +int sw_reference_implementation(float *sw_inputs_mem, float *sw_outputs_mem, unsigned n_samples, unsigned n_X_inputs, unsigned n_y_ouputs) { +#ifdef __DEBUG__ + PRINTF("INFO: Reference outputs are pre-compiled. It would be nice to run a software model here.\r\n"); +#endif + /* See data.h for inputs and outputs */ + for (unsigned i = 0; i < n_samples * n_y_ouputs; i++) { + sw_outputs_mem[i] = REFERENCE_OUTPUTS[i]; + } + return 0; +} + +/* Profiling utilities */ +static XTmrCtr TimerCounterInst; +#define TMRCTR_DEVICE_ID XPAR_TMRCTR_0_DEVICE_ID +#define TIMER_CNTR_0 0 +#define TIMER_CNTR_1 1 + +void start_64b_counter() { + XTmrCtr_Start(&TimerCounterInst, TIMER_CNTR_0); + XTmrCtr_Start(&TimerCounterInst, TIMER_CNTR_1); +} + +void stop_64b_counter() { + XTmrCtr_Stop(&TimerCounterInst, TIMER_CNTR_0); + XTmrCtr_Stop(&TimerCounterInst, TIMER_CNTR_1); +} + +u64 get_64b_counter_value() { + //printf("bytes %u\n\r", sizeof(u64)); + u64 lo_counter = XTmrCtr_GetValue(&TimerCounterInst, TIMER_CNTR_0); + u64 hi_counter = XTmrCtr_GetValue(&TimerCounterInst, TIMER_CNTR_1); + u64 counter = (hi_counter << 32) | lo_counter; + //printf("INFO: hi = %lu, lo = %lu, total = %lu\n\r", hi_counter, lo_counter, counter); + return counter; +} + +#if 0 +double get_elapsed_time(u64 clk_start, u64 clk_stop) { + return ((clk_stop-clk_start) * (1.0/XPAR_AXI_TIMER_MCU_CLOCK_FREQ_HZ)); +} +#endif + +float get_elapsed_time_ns(u64 clks) { + return clks * 1000000000.0/XPAR_AXI_TIMER_MCU_CLOCK_FREQ_HZ; +} + + +/* Dump data to the console */ +void dump_data(const char* label, float* data, unsigned n_samples, unsigned feature_count) { + PRINTF("INFO: %s[%u][%u]:\r\n", label, n_samples, feature_count); + /* Print at most MAX_PRINT_ELEMENTS */ + for (unsigned i = 0; i < n_samples && i < MAX_PRINT_ELEMENTS; i++) { + PRINTF("INFO: [%u] ", i); + for (unsigned j = 0; j < feature_count; j++) { + unsigned index = i * feature_count + j; + 
PRINTF("%f ", data[index]); + } + PRINTF("\r\n"); + } +} + +/* The top of the hill :-) */ +int main(int argc, char** argv) { + + int status; + u64 calibration_time; + double __attribute__ ((unused)) sw_elapsed = 0; + u64 hw_elapsed = 0; + u64 cache_elapsed = 0; + unsigned hw_errors; + + char __attribute__ ((unused)) dummy; /* dummy input */ + + /* Initialize platform (uart and caches) */ + init_platform(); + + PRINTF("\r\n"); + PRINTF("INFO: ==================================================\r\n"); + PRINTF("INFO: XMyproject_axi (w/ polling)\r\n"); /* TODO: design-dependent name */ + PRINTF("INFO: ==================================================\r\n"); + + init_accelerators(); + + /* Timer Counter */ + status = XTmrCtr_Initialize(&TimerCounterInst, TMRCTR_DEVICE_ID); + if (status != XST_SUCCESS){ + print("ERROR: Timer counter initialization failed \r\n"); + return status; + } + + XTmrCtr_SetOptions(&TimerCounterInst, TIMER_CNTR_0, + XTC_AUTO_RELOAD_OPTION | + XTC_CASCADE_MODE_OPTION); + + print("INFO: Timer counter initialized\r\n"); + + inputs_mem = malloc(INPUT_N_ELEMENTS * sizeof(float)); + outputs_mem = malloc(OUTPUT_N_ELEMENTS * sizeof(float)); + reference_mem = malloc(OUTPUT_N_ELEMENTS * sizeof(float)); + + /* Calibration */ + start_64b_counter(); + sleep(1); + stop_64b_counter(); + calibration_time = get_64b_counter_value(); + PRINTF("INFO: Time calibration for one second (%lf sec, %llu clk)\r\n", get_elapsed_time_ns(calibration_time), calibration_time); + + /* Initialize memory */ + PRINTF("INFO: Initialize memory\r\n"); + PRINTF("INFO: - Samples count: %u\r\n", N_SAMPLES); /* Same as dst_SAMPLE_COUNT */ + PRINTF("INFO: - Inputs count: %u\r\n", N_X_INPUTS); + PRINTF("INFO: - Outputs count: %u\r\n", N_Y_OUTPUTS); + PRINTF("INFO: - Data size: %u B\r\n", sizeof(float)); + PRINTF("INFO: - Total input size: %u B, %.2f KB, %.2f MB\r\n", N_X_INPUTS * N_SAMPLES * sizeof(float), (N_X_INPUTS * N_SAMPLES * sizeof(float)) / (float)1024, (N_X_INPUTS * N_SAMPLES * 
sizeof(float)) / (float)(1024*1024)); + PRINTF("INFO: - Total output size: %u B, %.2f KB, %.2f MB\r\n", N_Y_OUTPUTS * N_SAMPLES * sizeof(float), (N_Y_OUTPUTS * N_SAMPLES * sizeof(float)) / (float)1024, (N_Y_OUTPUTS * N_SAMPLES * sizeof(float)) / (float)(1024*1024)); + + // Set Heap Size in ldscript.ld to 0x1000000 (16MB) + //malloc_stats(); + + for (int i = 0; i < INPUT_N_ELEMENTS; i++) { + inputs_mem[i] = data_X_inputs[i]; + } + for (int i = 0; i < OUTPUT_N_ELEMENTS; i++) { + outputs_mem[i] = 0x0; + } + + /* ****** SW REFERENCE ****** */ + PRINTF("INFO: ==================================================\r\n"); + PRINTF("INFO: Start SW reference implementation\r\n"); + start_64b_counter(); + sw_reference_implementation(inputs_mem, reference_mem, N_SAMPLES, N_X_INPUTS, N_Y_OUTPUTS); + stop_64b_counter(); + sw_elapsed = get_64b_counter_value(); + PRINTF("INFO: ==================================================\r\n"); + PRINTF("INFO: Press any key to start:\r\n"); + dummy = inbyte(); + //PRINTF("INFO:"); + + /* ****** HW ACCELERATOR ****** */ + PRINTF("INFO: Start HW accelerator\r\n"); + start_64b_counter(); + Xil_DCacheFlushRange((UINTPTR)inputs_mem, INPUT_N_ELEMENTS * sizeof(float)); + Xil_DCacheFlushRange((UINTPTR)outputs_mem, OUTPUT_N_ELEMENTS * sizeof(float)); + Xil_DCacheFlushRange((UINTPTR)reference_mem, OUTPUT_N_ELEMENTS * sizeof(float)); + stop_64b_counter(); + cache_elapsed = get_64b_counter_value(); + + for (unsigned j = 0; j < N_SAMPLES; j++) { + float *inputs_mem_i = inputs_mem + j * N_X_INPUTS; + float *outputs_mem_i = outputs_mem + j * N_Y_OUTPUTS; + + /* Configure the accelerator */ + start_64b_counter(); + XMyproject_axi_Set_in_r(&accelerator, (unsigned)inputs_mem_i); /* TODO: design-dependent name */ + XMyproject_axi_Set_out_r(&accelerator, (unsigned)outputs_mem_i); /* TODO: design-dependent name */ + + XMyproject_axi_Start(&accelerator); /* TODO: design-dependent name */ + + /* Polling */ + while (!XMyproject_axi_IsDone(&accelerator)); /* TODO: 
design-dependent name */ + + /* Get error status */ + //hw_flags = XMyproject_axi_Get_return(&accelerator); /* TODO: design-dependent name */ + stop_64b_counter(); + hw_elapsed += get_64b_counter_value(); + } + + start_64b_counter(); + Xil_DCacheFlushRange((UINTPTR)outputs_mem, OUTPUT_N_ELEMENTS * sizeof(float)); + stop_64b_counter(); + cache_elapsed += get_64b_counter_value(); + + PRINTF("INFO: HW accelerator done!\r\n"); + + /* ****** VALIDATION ****** */ + PRINTF("INFO: ================== Verification ==================\r\n"); +#ifdef __DEBUG__ + PRINTF("INFO: Dump data\r\n"); + dump_data("inputs_mem", inputs_mem, N_SAMPLES, N_X_INPUTS); + dump_data("outputs_mem", outputs_mem, N_SAMPLES, N_Y_OUTPUTS); + dump_data("reference_mem", reference_mem, N_SAMPLES, N_Y_OUTPUTS); +#endif + +#ifdef __DEBUG__ + PRINTF("INFO: SW execution time: %f sec\r\n", sw_elapsed); +#endif + PRINTF("INFO: HW-acceleration exec. time (%d inferences):\r\n", N_SAMPLES); + PRINTF("INFO: - total %f sec\r\n", get_elapsed_time_ns(hw_elapsed)); + PRINTF("INFO: - per-inference %.12f sec (%f ns)\r\n", get_elapsed_time_ns(hw_elapsed) / (N_SAMPLES), (get_elapsed_time_ns(hw_elapsed)*1000.0) / (N_SAMPLES)); + PRINTF("INFO: Cache flush time: %f sec\r\n", get_elapsed_time_ns(cache_elapsed)); +#ifdef __DEBUG__ + PRINTF("INFO: HW/SW speedup (the software is fake so this does not count...): %.2f X\r\n", (sw_elapsed >= (hw_elapsed+cache_elapsed))?(sw_elapsed/(hw_elapsed+cache_elapsed)):-((hw_elapsed+cache_elapsed)/sw_elapsed)); +#endif + + hw_errors = 0; +#if 1 + /* Accelerator verification */ + for (int i = 0; i < OUTPUT_N_ELEMENTS; i++) { + if (outputs_mem[i] != reference_mem[i]) { + PRINTF("ERROR: [%d]: Accelerator HW %f != SW %f\r\n", i, outputs_mem[i], reference_mem[i]); + hw_errors++; + } + } + PRINTF("INFO: Total errors = %d (out of %d elements)\r\n", hw_errors, OUTPUT_N_ELEMENTS); + if (hw_errors > 0) + PRINTF("INFO: Verification: FAIL\r\n"); + else + PRINTF("INFO: Verification: PASS!\r\n"); +#else + 
/* Accelerator validation */ + for (unsigned s = 0; s < N_SAMPLES; s++) { + unsigned ref_digit = get_max(reference_mem + s * N_Y_OUTPUTS, N_Y_OUTPUTS); + unsigned hw_digit = get_max(outputs_mem + s * N_Y_OUTPUTS, N_Y_OUTPUTS); + if (hw_digit != ref_digit) { +#ifdef __DEBUG__ + PRINTF("ERROR: [%d]: Accelerator HW %u != SW %u\r\n", s, hw_digit, ref_digit); +#endif + hw_errors++; + } + } + float error_rate = (hw_errors / (float)(N_SAMPLES)) * 100.0; + float accuracy = 100 - ((hw_errors / (float)(N_SAMPLES)) * 100.0); + PRINTF("INFO: Total errors = %d (out of %d digits)\r\n", hw_errors, N_SAMPLES); + PRINTF("INFO: Error rate = %.2f %%\r\n", error_rate); + PRINTF("INFO: Accuracy = %.2f %%\r\n", accuracy); +#endif + + PRINTF("INFO: ==================================================\r\n"); + + +#ifdef EEMBC_POWER + /* Initialize the GPIO driver */ + status = XGpio_Initialize(&Gpio, GPIO_PMOD_PIN_DEVICE_ID); + if (status != XST_SUCCESS) { + xil_printf("GPIO Initialization Failed\r\n"); + return XST_FAILURE; + } + + set_pin_low(&Gpio, PIN); + + PRINTF("INFO: Connect logic analyzer to the pin 3 of Pmod D\r\n"); + PRINTF("INFO: Press any key to start:\r\n"); + dummy = inbyte(); + + /* Loop forever */ + for (unsigned i; i < 100; i++) { + set_pin_high(&Gpio, PIN); + + sleep(1); + + set_pin_low(&Gpio, PIN); + + sleep(1); + } +#endif + + cleanup_platform(); + + return 0; +} + + diff --git a/hls4ml/templates/vivado_accelerator/arty-a7-100t/c_drivers/sdk/setup.tcl b/hls4ml/templates/vivado_accelerator/arty-a7-100t/c_drivers/sdk/setup.tcl new file mode 100644 index 0000000000..383bf39cf7 --- /dev/null +++ b/hls4ml/templates/vivado_accelerator/arty-a7-100t/c_drivers/sdk/setup.tcl @@ -0,0 +1,14 @@ +# See +# https://www.xilinx.com/html_docs/xilinx2019_1/SDK_Doc/xsct/intro/xsct_introduction.html + +setws . 
+if { $::argc == 1 } { + set myproject [lindex $::argv 0] + createhw -name ${myproject}\_platform -hwspec ../hdf/${myproject}\_wrapper.hdf + createapp -name ${myproject}\_standalone -app {Hello World} -proc microblaze_mcu -hwproject ${myproject}\_platform -os standalone + configapp -app ${myproject}\_standalone build-config release + #configapp -app ${myproject}\_standalone -add linker-misc {-Wl,--defsym=_HEAP_SIZE=0x1000000} + #configapp -app ${myproject}\_standalone -add linker-misc {-Wl,--defsym=_STACK_SIZE=0x40000} + projects -build + #configapp -app ${myproject}\_standalone -add define-compiler-symbols {FLAG=VALUE} +} diff --git a/hls4ml/templates/vivado_accelerator/arty-a7-100t/tcl_scripts/axi_master_design.tcl b/hls4ml/templates/vivado_accelerator/arty-a7-100t/tcl_scripts/axi_master_design.tcl new file mode 100644 index 0000000000..67d667b063 --- /dev/null +++ b/hls4ml/templates/vivado_accelerator/arty-a7-100t/tcl_scripts/axi_master_design.tcl @@ -0,0 +1,193 @@ +set tcldir [file dirname [info script]] +source [file join $tcldir project.tcl] + +# Project names +set project_name "project_1" +set design_name "design_1" +set hls_solution_name "solution1" +set acc_name "${myproject}_axi" +set part_name "xc7a100tcsg324-1" +set board_name "digilentinc.com:arty-a7-100:part0:1.0" + +# Set board and chip part names +create_project ${project_name} ${myproject}_vivado_accelerator -part ${part_name} -force +set_property board_part ${board_name} [current_project] + +# Create block design +create_bd_design ${design_name} + +# Setup IP repo +#set_property ip_repo_paths ${myproject}_prj [current_project] +set_property ip_repo_paths ${myproject}_prj/${hls_solution_name}/impl/ip [current_project] +update_ip_catalog + +# Create clock wizard +create_bd_cell -type ip -vlnv xilinx.com:ip:clk_wiz:6.0 clk_wiz_0 +apply_board_connection -board_interface "sys_clock" -ip_intf "clk_wiz_0/clock_CLK_IN1" -diagram ${design_name} +set_property name clk_wizard [get_bd_cells clk_wiz_0] 
+set_property -dict [list CONFIG.CLKOUT2_USED {true} CONFIG.CLKOUT1_REQUESTED_OUT_FREQ {166.667} CONFIG.CLKOUT2_REQUESTED_OUT_FREQ {200.00} CONFIG.MMCM_CLKOUT0_DIVIDE_F {6.000} CONFIG.MMCM_CLKOUT1_DIVIDE {5} CONFIG.NUM_OUT_CLKS {2} CONFIG.CLKOUT1_JITTER {118.758} CONFIG.CLKOUT2_JITTER {114.829} CONFIG.CLKOUT2_PHASE_ERROR {98.575}] [get_bd_cells clk_wizard] +#set_property -dict [list CONFIG.RESET_TYPE {ACTIVE_LOW} CONFIG.RESET_PORT {resetn}] [get_bd_cells clk_wizard] + +# Create MIG +create_bd_cell -type ip -vlnv xilinx.com:ip:mig_7series:4.2 mig_7series_0 +apply_board_connection -board_interface "ddr3_sdram" -ip_intf "mig_7series_0/mig_ddr_interface" -diagram ${design_name} + +# Wire MIG and clock wizard +delete_bd_objs [get_bd_nets clk_ref_i_1] [get_bd_ports clk_ref_i] +delete_bd_objs [get_bd_nets sys_clk_i_1] [get_bd_ports sys_clk_i] +connect_bd_net [get_bd_pins clk_wizard/clk_out2] [get_bd_pins mig_7series_0/clk_ref_i] +connect_bd_net [get_bd_pins clk_wizard/clk_out1] [get_bd_pins mig_7series_0/sys_clk_i] + +# Setup reset +#set_property -dict [list CONFIG.RESET_BOARD_INTERFACE {reset}] [get_bd_cells clk_wizard] +apply_bd_automation -rule xilinx.com:bd_rule:board -config { Board_Interface {reset ( System Reset ) } Manual_Source {New External Port (ACTIVE_LOW)}} [get_bd_pins mig_7series_0/sys_rst] + +# Create instance of MicroBlaze +create_bd_cell -type ip -vlnv xilinx.com:ip:microblaze:11.0 microblaze_mcu +apply_bd_automation -rule xilinx.com:bd_rule:microblaze -config { \ + axi_intc {0} \ + axi_periph {Enabled} \ + cache {16KB} \ + clk {/mig_7series_0/ui_clk (83 MHz)} \ + debug_module {Debug Only} \ + ecc {None} \ + local_mem {32KB} \ + preset {None} } [get_bd_cells microblaze_mcu] + +# Resize data and instruction caches +set_property -dict [list CONFIG.C_ADDR_TAG_BITS {18} CONFIG.C_CACHE_BYTE_SIZE {1024} CONFIG.C_DCACHE_ADDR_TAG {18} CONFIG.C_DCACHE_BYTE_SIZE {1024}] [get_bd_cells microblaze_mcu] + +# Enable full FPU +set_property -dict [list CONFIG.C_USE_FPU 
{2}] [get_bd_cells microblaze_mcu] + +# Create UART interface +#create_bd_cell -type ip -vlnv xilinx.com:ip:axi_uart16550:2.0 axi_uart +#apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {/mig_7series_0/ui_clk (83 MHz)} Clk_slave {Auto} Clk_xbar {Auto} Master {/microblaze_mcu (Periph)} Slave {/axi_uart/S_AXI} intc_ip {New AXI Interconnect} master_apm {0}} [get_bd_intf_pins axi_uart/S_AXI] +#apply_bd_automation -rule xilinx.com:bd_rule:board -config { Board_Interface {usb_uart ( USB UART ) } Manual_Source {Auto}} [get_bd_intf_pins axi_uart/UART] + +# Create UART-lite interface +create_bd_cell -type ip -vlnv xilinx.com:ip:axi_uartlite:2.0 axi_uart +if { ${eembc_power} } { + set_property -dict [list CONFIG.C_BAUDRATE {9600}] [get_bd_cells axi_uart] +} else { + apply_board_connection -board_interface "usb_uart" -ip_intf "axi_uart/UART" -diagram ${design_name} + set_property -dict [list CONFIG.C_BAUDRATE {115200}] [get_bd_cells axi_uart] +} +apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { \ + Clk_master {/mig_7series_0/ui_clk (83 MHz)} \ + Clk_slave {Auto} \ + Clk_xbar {Auto} \ + Master {/microblaze_mcu (Periph)} \ + Slave {/axi_uart/S_AXI} \ + intc_ip {New AXI Interconnect} \ + master_apm {0}} [get_bd_intf_pins axi_uart/S_AXI] + +# Forward UART interface to PMOD pins +if { ${eembc_power} } { + create_bd_port -dir O pmod_uart_txd + create_bd_port -dir I pmod_uart_rxd + connect_bd_net [get_bd_pins /axi_uart/tx] [get_bd_ports pmod_uart_txd] + connect_bd_net [get_bd_pins /axi_uart/rx] [get_bd_ports pmod_uart_rxd] + add_files -fileset constrs_1 -norecurse uart_pmod.xdc +} + +apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { \ + Clk_master {/mig_7series_0/ui_clk (83 MHz)} \ + Clk_slave {/mig_7series_0/ui_clk (83 MHz)} \ + Clk_xbar {/mig_7series_0/ui_clk (83 MHz)} \ + Master {/microblaze_mcu (Cached)} \ + Slave {/mig_7series_0/S_AXI} \ + intc_ip {Auto} master_apm {0} } [get_bd_intf_pins mig_7series_0/S_AXI] + +apply_bd_automation 
-rule xilinx.com:bd_rule:axi4 -config { \ + Clk_master {/mig_7series_0/ui_clk (83 MHz)} \ + Clk_slave {Auto} \ + Clk_xbar {Auto} \ + Master {/microblaze_mcu (Periph)} \ + Slave {/axi_uart/S_AXI} \ + intc_ip {New AXI Interconnect} \ + master_apm {0} } [get_bd_intf_pins axi_uart/S_AXI] + +# Add accelerator and connect s-axi interface +create_bd_cell -type ip -vlnv xilinx.com:hls:${acc_name}:1.0 ${acc_name} +apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {/mig_7series_0/ui_clk (83 MHz)} Clk_slave {Auto} Clk_xbar {/mig_7series_0/ui_clk (83 MHz)} Master {/microblaze_mcu (Periph)} Slave {/${acc_name}/s_axi_CTRL_BUS} intc_ip {/microblaze_mcu_axi_periph} master_apm {0}} [get_bd_intf_pins ${acc_name}/s_axi_CTRL_BUS] + +# Connect m-axi interfaces +apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {/mig_7series_0/ui_clk (83 MHz)} Clk_slave {/mig_7series_0/ui_clk (83 MHz)} Clk_xbar {/mig_7series_0/ui_clk (83 MHz)} Master {/${acc_name}/m_axi_IN_BUS} Slave {/mig_7series_0/S_AXI} intc_ip {/axi_smc} master_apm {0}} [get_bd_intf_pins ${acc_name}/m_axi_IN_BUS] +apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {/mig_7series_0/ui_clk (83 MHz)} Clk_slave {/mig_7series_0/ui_clk (83 MHz)} Clk_xbar {/mig_7series_0/ui_clk (83 MHz)} Master {/${acc_name}/m_axi_OUT_BUS} Slave {/mig_7series_0/S_AXI} intc_ip {/axi_smc} master_apm {0}} [get_bd_intf_pins ${acc_name}/m_axi_OUT_BUS] + +# Reset +apply_bd_automation -rule xilinx.com:bd_rule:board -config { Board_Interface {reset ( System Reset ) } Manual_Source {Auto}} [get_bd_pins clk_wizard/reset] + +# Add timer +create_bd_cell -type ip -vlnv xilinx.com:ip:axi_timer:2.0 axi_timer_mcu +set_property -dict [list CONFIG.enable_timer2 {1}] [get_bd_cells axi_timer_mcu] + +# Wire timer +apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {/mig_7series_0/ui_clk (83 MHz)} Clk_slave {Auto} Clk_xbar {/mig_7series_0/ui_clk (83 MHz)} Master {/microblaze_mcu (Periph)} Slave 
{/axi_timer_mcu/S_AXI} intc_ip {/microblaze_mcu_axi_periph} master_apm {0}} [get_bd_intf_pins axi_timer_mcu/S_AXI] + +# Add AXI GPIO controlled pin +if { ${eembc_power} } { + # Add AXI GPIO IP + create_bd_cell -type ip -vlnv xilinx.com:ip:axi_gpio:2.0 axi_gpio_0 + # Wire it up to a single output pin (to a PMOD) + set_property -dict [list CONFIG.C_GPIO_WIDTH {1} CONFIG.C_ALL_OUTPUTS {1}] [get_bd_cells axi_gpio_0] + apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { \ + Clk_master {/mig_7series_0/ui_clk (83 MHz)} \ + Clk_slave {Auto} \ + Clk_xbar {/mig_7series_0/ui_clk (83 MHz)} \ + Master {/microblaze_mcu (Periph)} \ + Slave {/axi_gpio_0/S_AXI} \ + intc_ip {/microblaze_mcu_axi_periph} \ + master_apm {0}} [get_bd_intf_pins axi_gpio_0/S_AXI] + create_bd_port -dir O pmod_pin + connect_bd_net [get_bd_ports pmod_pin] [get_bd_pins axi_gpio_0/gpio_io_o] + + add_files -fileset constrs_1 -norecurse pin_pmod.xdc +} + +# Add Quad SPI for cold boot +if { ${eembc_power} } { + create_bd_cell -type ip -vlnv xilinx.com:ip:axi_quad_spi:3.2 axi_quad_spi_0 + set_property -dict [list CONFIG.C_SPI_MEMORY {3}] [get_bd_cells axi_quad_spi_0] + apply_bd_automation -rule xilinx.com:bd_rule:board -config { Board_Interface {qspi_flash ( Quad SPI Flash ) } Manual_Source {Auto}} [get_bd_intf_pins axi_quad_spi_0/SPI_0] + apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {/mig_7series_0/ui_clk (83 MHz)} Clk_slave {Auto} Clk_xbar {/mig_7series_0/ui_clk (83 MHz)} Master {/microblaze_mcu (Periph)} Slave {/axi_quad_spi_0/AXI_LITE} intc_ip {/microblaze_mcu_axi_periph} master_apm {0}} [get_bd_intf_pins axi_quad_spi_0/AXI_LITE] + set_property -dict [list CONFIG.CLKOUT3_USED {true} CONFIG.CLKOUT3_REQUESTED_OUT_FREQ {50} CONFIG.MMCM_CLKOUT2_DIVIDE {20} CONFIG.NUM_OUT_CLKS {3} CONFIG.CLKOUT3_JITTER {151.636} CONFIG.CLKOUT3_PHASE_ERROR {98.575}] [get_bd_cells clk_wizard] + connect_bd_net [get_bd_pins clk_wizard/clk_out3] [get_bd_pins axi_quad_spi_0/ext_spi_clk] + + # BUG FIX + 
delete_bd_objs [get_bd_nets clk_wizard_clk_out3] + connect_bd_net [get_bd_pins axi_quad_spi_0/ext_spi_clk] [get_bd_pins mig_7series_0/ui_clk] + + add_files -fileset constrs_1 -norecurse qspi.xdc +} + +# Validate the design block we created +validate_bd_design + +# Save design +save_bd_design + +# Top level wrapper +#make_wrapper -files [get_files ./${myproject}_vivado_accelerator/${project_name}.srcs/sources_1/bd/${design_name}/${design_name}.bd] -top +#add_files -norecurse ./${myproject}_vivado_accelerator/${project_name}.srcs/sources_1/bd/${design_name}/hdl/${design_name}_wrapper.v +add_files -norecurse $design_name\_wrapper.v + +# In the Verilog wrapper, enable configuration for the EEMBC power setup +if { ${eembc_power} } { + set_property verilog_define EEMBC_POWER=1 [current_fileset] +} + +# Run synthesis and implementation +reset_run impl_1 +reset_run synth_1 +launch_runs impl_1 -to_step write_bitstream -jobs 6 +wait_on_run -timeout 360 impl_1 + +# Reporting +open_run impl_1 +report_utilization -file util.rpt -hierarchical -hierarchical_percentages + +# Export HDF file for SDK flow +file mkdir ./hdf +file copy -force ${myproject}_vivado_accelerator/${project_name}.runs/impl_1/${design_name}_wrapper.sysdef ./hdf/${design_name}_wrapper.hdf diff --git a/hls4ml/templates/vivado_accelerator/arty-a7-100t/verilog_wrappers/design_1_wrapper.v b/hls4ml/templates/vivado_accelerator/arty-a7-100t/verilog_wrappers/design_1_wrapper.v new file mode 100644 index 0000000000..3bbaf5f9be --- /dev/null +++ b/hls4ml/templates/vivado_accelerator/arty-a7-100t/verilog_wrappers/design_1_wrapper.v @@ -0,0 +1,209 @@ +`timescale 1 ps / 1 ps + +module design_1_wrapper + (ddr3_sdram_addr, + ddr3_sdram_ba, + ddr3_sdram_cas_n, + ddr3_sdram_ck_n, + ddr3_sdram_ck_p, + ddr3_sdram_cke, + ddr3_sdram_cs_n, + ddr3_sdram_dm, + ddr3_sdram_dq, + ddr3_sdram_dqs_n, + ddr3_sdram_dqs_p, + ddr3_sdram_odt, + ddr3_sdram_ras_n, + ddr3_sdram_reset_n, + ddr3_sdram_we_n +`ifdef EEMBC_POWER + , + 
qspi_flash_io0_io, + qspi_flash_io1_io, + qspi_flash_io2_io, + qspi_flash_io3_io, + qspi_flash_sck_io, + qspi_flash_ss_io + `endif + , + reset, + sys_clock +`ifdef EEMBC_POWER + , + pmod_uart_rxd, + pmod_uart_txd, + pmod_pin +`else + , + usb_uart_rxd, + usb_uart_txd +`endif + ); + output [13:0]ddr3_sdram_addr; + output [2:0]ddr3_sdram_ba; + output ddr3_sdram_cas_n; + output [0:0]ddr3_sdram_ck_n; + output [0:0]ddr3_sdram_ck_p; + output [0:0]ddr3_sdram_cke; + output [0:0]ddr3_sdram_cs_n; + output [1:0]ddr3_sdram_dm; + inout [15:0]ddr3_sdram_dq; + inout [1:0]ddr3_sdram_dqs_n; + inout [1:0]ddr3_sdram_dqs_p; + output [0:0]ddr3_sdram_odt; + output ddr3_sdram_ras_n; + output ddr3_sdram_reset_n; + output ddr3_sdram_we_n; +`ifdef EEMBC_POWER + inout qspi_flash_io0_io; + inout qspi_flash_io1_io; + inout qspi_flash_io2_io; + inout qspi_flash_io3_io; + inout qspi_flash_sck_io; + inout qspi_flash_ss_io; + `endif + input reset; + input sys_clock; +`ifdef EEMBC_POWER + input pmod_uart_rxd; + output pmod_uart_txd; + output pmod_pin; +`else + input usb_uart_rxd; + output usb_uart_txd; +`endif + + + wire [13:0]ddr3_sdram_addr; + wire [2:0]ddr3_sdram_ba; + wire ddr3_sdram_cas_n; + wire [0:0]ddr3_sdram_ck_n; + wire [0:0]ddr3_sdram_ck_p; + wire [0:0]ddr3_sdram_cke; + wire [0:0]ddr3_sdram_cs_n; + wire [1:0]ddr3_sdram_dm; + wire [15:0]ddr3_sdram_dq; + wire [1:0]ddr3_sdram_dqs_n; + wire [1:0]ddr3_sdram_dqs_p; + wire [0:0]ddr3_sdram_odt; + wire ddr3_sdram_ras_n; + wire ddr3_sdram_reset_n; + wire ddr3_sdram_we_n; +`ifdef EEMBC_POWER + wire qspi_flash_io0_i; + wire qspi_flash_io0_io; + wire qspi_flash_io0_o; + wire qspi_flash_io0_t; + wire qspi_flash_io1_i; + wire qspi_flash_io1_io; + wire qspi_flash_io1_o; + wire qspi_flash_io1_t; + wire qspi_flash_io2_i; + wire qspi_flash_io2_io; + wire qspi_flash_io2_o; + wire qspi_flash_io2_t; + wire qspi_flash_io3_i; + wire qspi_flash_io3_io; + wire qspi_flash_io3_o; + wire qspi_flash_io3_t; + wire qspi_flash_sck_i; + wire qspi_flash_sck_io; + wire 
qspi_flash_sck_o; + wire qspi_flash_sck_t; + wire qspi_flash_ss_i; + wire qspi_flash_ss_io; + wire qspi_flash_ss_o; + wire qspi_flash_ss_t; +`else + wire usb_uart_rxd; + wire usb_uart_txd; +`endif + wire reset; + wire sys_clock; + +`ifdef EEMBC_POWER + IOBUF qspi_flash_io0_iobuf + (.I(qspi_flash_io0_o), + .IO(qspi_flash_io0_io), + .O(qspi_flash_io0_i), + .T(qspi_flash_io0_t)); + IOBUF qspi_flash_io1_iobuf + (.I(qspi_flash_io1_o), + .IO(qspi_flash_io1_io), + .O(qspi_flash_io1_i), + .T(qspi_flash_io1_t)); + IOBUF qspi_flash_io2_iobuf + (.I(qspi_flash_io2_o), + .IO(qspi_flash_io2_io), + .O(qspi_flash_io2_i), + .T(qspi_flash_io2_t)); + IOBUF qspi_flash_io3_iobuf + (.I(qspi_flash_io3_o), + .IO(qspi_flash_io3_io), + .O(qspi_flash_io3_i), + .T(qspi_flash_io3_t)); + IOBUF qspi_flash_sck_iobuf + (.I(qspi_flash_sck_o), + .IO(qspi_flash_sck_io), + .O(qspi_flash_sck_i), + .T(qspi_flash_sck_t)); + IOBUF qspi_flash_ss_iobuf + (.I(qspi_flash_ss_o), + .IO(qspi_flash_ss_io), + .O(qspi_flash_ss_i), + .T(qspi_flash_ss_t)); +`endif + + design_1 design_1_i + (.ddr3_sdram_addr(ddr3_sdram_addr), + .ddr3_sdram_ba(ddr3_sdram_ba), + .ddr3_sdram_cas_n(ddr3_sdram_cas_n), + .ddr3_sdram_ck_n(ddr3_sdram_ck_n), + .ddr3_sdram_ck_p(ddr3_sdram_ck_p), + .ddr3_sdram_cke(ddr3_sdram_cke), + .ddr3_sdram_cs_n(ddr3_sdram_cs_n), + .ddr3_sdram_dm(ddr3_sdram_dm), + .ddr3_sdram_dq(ddr3_sdram_dq), + .ddr3_sdram_dqs_n(ddr3_sdram_dqs_n), + .ddr3_sdram_dqs_p(ddr3_sdram_dqs_p), + .ddr3_sdram_odt(ddr3_sdram_odt), + .ddr3_sdram_ras_n(ddr3_sdram_ras_n), + .ddr3_sdram_reset_n(ddr3_sdram_reset_n), + .ddr3_sdram_we_n(ddr3_sdram_we_n) +`ifdef EEMBC_POWER + , + .qspi_flash_io0_i(qspi_flash_io0_i), + .qspi_flash_io0_o(qspi_flash_io0_o), + .qspi_flash_io0_t(qspi_flash_io0_t), + .qspi_flash_io1_i(qspi_flash_io1_i), + .qspi_flash_io1_o(qspi_flash_io1_o), + .qspi_flash_io1_t(qspi_flash_io1_t), + .qspi_flash_io2_i(qspi_flash_io2_i), + .qspi_flash_io2_o(qspi_flash_io2_o), + .qspi_flash_io2_t(qspi_flash_io2_t), + 
.qspi_flash_io3_i(qspi_flash_io3_i), + .qspi_flash_io3_o(qspi_flash_io3_o), + .qspi_flash_io3_t(qspi_flash_io3_t), + .qspi_flash_sck_i(qspi_flash_sck_i), + .qspi_flash_sck_o(qspi_flash_sck_o), + .qspi_flash_sck_t(qspi_flash_sck_t), + .qspi_flash_ss_i(qspi_flash_ss_i), + .qspi_flash_ss_o(qspi_flash_ss_o), + .qspi_flash_ss_t(qspi_flash_ss_t) + `endif + , + .reset(reset), + .sys_clock(sys_clock) +`ifdef EEMBC_POWER + , + .pmod_uart_rxd(pmod_uart_rxd), + .pmod_uart_txd(pmod_uart_txd), + .pmod_pin(pmod_pin) +`else + , + .usb_uart_rxd(usb_uart_rxd), + .usb_uart_txd(usb_uart_txd) +`endif + + ); +endmodule diff --git a/hls4ml/templates/vivado_accelerator/arty-a7-100t/xdc_constraints/pin_pmod.xdc b/hls4ml/templates/vivado_accelerator/arty-a7-100t/xdc_constraints/pin_pmod.xdc new file mode 100644 index 0000000000..321279b709 --- /dev/null +++ b/hls4ml/templates/vivado_accelerator/arty-a7-100t/xdc_constraints/pin_pmod.xdc @@ -0,0 +1,4 @@ +# AXI GPIO controlled pin on Pmod Header JD + +# Output pin, PMOD D pin 3 (JD4), IO_L13N_T2_MRCC_35, F4, Blue cable +set_property -dict { PACKAGE_PIN F4 IOSTANDARD LVCMOS33 } [get_ports { pmod_pin }]; diff --git a/hls4ml/templates/vivado_accelerator/arty-a7-100t/xdc_constraints/qspi.xdc b/hls4ml/templates/vivado_accelerator/arty-a7-100t/xdc_constraints/qspi.xdc new file mode 100644 index 0000000000..6019da47bd --- /dev/null +++ b/hls4ml/templates/vivado_accelerator/arty-a7-100t/xdc_constraints/qspi.xdc @@ -0,0 +1,13 @@ +# +# See also +# https://github.com/Digilent/digilent-xdc/blob/master/Arty-A7-100-Master.xdc +# + +set_property BITSTREAM.GENERAL.COMPRESS TRUE [current_design] + +# Quad SPI Flash +set_property -dict { PACKAGE_PIN L13 IOSTANDARD LVCMOS33 } [get_ports { qspi_flash_ss_io }]; #IO_L6P_T0_FCS_B_14 Sch=qspi_cs +set_property -dict { PACKAGE_PIN K17 IOSTANDARD LVCMOS33 } [get_ports { qspi_flash_io0_io }]; #IO_L1P_T0_D00_MOSI_14 Sch=qspi_dq[0] +set_property -dict { PACKAGE_PIN K18 IOSTANDARD LVCMOS33 } [get_ports { qspi_flash_io1_io 
}]; #IO_L1N_T0_D01_DIN_14 Sch=qspi_dq[1] +set_property -dict { PACKAGE_PIN L14 IOSTANDARD LVCMOS33 } [get_ports { qspi_flash_io2_io }]; #IO_L2P_T0_D02_14 Sch=qspi_dq[2] +set_property -dict { PACKAGE_PIN M14 IOSTANDARD LVCMOS33 } [get_ports { qspi_flash_io3_io }]; #IO_L2N_T0_D03_14 Sch=qspi_dq[3] diff --git a/hls4ml/templates/vivado_accelerator/arty-a7-100t/xdc_constraints/uart_pmod.xdc b/hls4ml/templates/vivado_accelerator/arty-a7-100t/xdc_constraints/uart_pmod.xdc new file mode 100644 index 0000000000..2cf181a20a --- /dev/null +++ b/hls4ml/templates/vivado_accelerator/arty-a7-100t/xdc_constraints/uart_pmod.xdc @@ -0,0 +1,8 @@ +# Expose UART Interface on Pmod Header JA +# You may need https://www.sparkfun.com/products/9873 + +# RX uart, PMOD A pin 2 (JA2), IO_L4P_T0_15, B11, BROWN cable +set_property -dict { PACKAGE_PIN B11 IOSTANDARD LVCMOS33 } [get_ports { pmod_uart_rxd }]; + +# TX uart, PMOD A pin 3 (JA3), IO_L4N_T0_15, A11, RED cable +set_property -dict { PACKAGE_PIN A11 IOSTANDARD LVCMOS33 } [get_ports { pmod_uart_txd }]; diff --git a/hls4ml/templates/vivado_accelerator/pynq-z1/c_drivers/sdk/Makefile b/hls4ml/templates/vivado_accelerator/pynq-z1/c_drivers/sdk/Makefile new file mode 100644 index 0000000000..03ab9b8de7 --- /dev/null +++ b/hls4ml/templates/vivado_accelerator/pynq-z1/c_drivers/sdk/Makefile @@ -0,0 +1,33 @@ +DESIGN := design_1 + +help: + @echo "INFO: make to show targets" +.PHONY: help + +--setup: + xsct ./setup.tcl $(DESIGN) +.PHONY: --setup + +sdk: --setup + rm -f $(DESIGN)_standalone/src/helloworld.c + cd $(DESIGN)_standalone/src && ln -s ../../common/main.c main.c + cd $(DESIGN)_standalone/src && ln -s ../../common/data.h data.h +.PHONY: sdk + +gui: + xsdk --workspace . 
&
+.PHONY: gui
+
+clean:
+	rm -rf $(DESIGN)_platform
+	rm -rf $(DESIGN)_standalone
+	rm -rf $(DESIGN)_standalone_bsp
+	rm -rf RemoteSystemsTempFiles
+	rm -rf .Xil
+	rm -rf .metadata
+	rm -f *.log
+.PHONY: clean
+
+ultraclean: clean
+	rm -rf hdf/*.hdf
+.PHONY: ultraclean
diff --git a/hls4ml/templates/vivado_accelerator/pynq-z1/c_drivers/sdk/common/main.c b/hls4ml/templates/vivado_accelerator/pynq-z1/c_drivers/sdk/common/main.c
new file mode 100644
index 0000000000..7dd2be22a8
--- /dev/null
+++ b/hls4ml/templates/vivado_accelerator/pynq-z1/c_drivers/sdk/common/main.c
@@ -0,0 +1,270 @@
+/**
+ *
+ * Set Heap Size in ldscript.ld to 0x1000000 (16MB)
+ *
+ */
+
+#include "xmyproject_axi.h" /* TODO: design-dependent name */
+#include "stdio.h" /* PRINTF */
+#include "unistd.h" /* sleep */
+#include "stdlib.h"
+#include "malloc.h"
+#include "assert.h"
+#include "xil_io.h" /* peripheral read/write wrappers */
+#include "xtime_l.h" /* to measure performance of the system */
+#include "platform.h" /* platform init/cleanup functions */
+#include "xil_cache.h" /* enable/disable caches etc */
+#include "xil_printf.h" /* UART debug print functions */
+#include "xparameters.h" /* peripherals base addresses */
+
+#include "data.h"
+
+//#define __DEBUG__
+
+#define MAX_PRINT_ELEMENTS (16)
+
+#define PRINTF printf
+
+const unsigned INPUT_N_ELEMENTS = N_SAMPLES * N_X_INPUTS;
+const unsigned OUTPUT_N_ELEMENTS = N_SAMPLES * N_Y_OUTPUTS;
+
+#if 1
+/* Accelerator verification */
+#define REFERENCE_OUTPUTS data_y_hls_outputs
+#else
+/* Accelerator validation */
+#define REFERENCE_OUTPUTS data_y_outputs
+//#define REFERENCE_OUTPUTS data_y_keras_outputs
+#endif
+
+/* Return the index of the largest of the n_elements values in data; ties
+ * resolve to the highest index and an empty array yields 0. The running
+ * maximum is seeded from data[0], not 0.0, so all-negative inputs work. */
+unsigned get_max(float *data, unsigned n_elements) {
+    unsigned max_index = 0;
+    for (unsigned i = 1; i < n_elements; i++)
+        if (data[i] >= data[max_index])
+            max_index = i;
+    return max_index;
+}
+
+float *inputs_mem = NULL;
+float *outputs_mem = NULL;
+float *reference_mem = NULL;
+
+/* Accelerator configuration */
+XMyproject_axi accelerator; /* TODO: design-dependent name */
+XMyproject_axi_Config *accelerator_cfg; /* TODO: design-dependent name */
+
+/* Accelerator initialization routine */
+void init_accelerators() {
+    PRINTF("INFO: Initializing accelerator\r\n");
+    accelerator_cfg = XMyproject_axi_LookupConfig(XPAR_MYPROJECT_AXI_0_DEVICE_ID); /* TODO: design-dependent name */
+    if (accelerator_cfg) {
+        int status = XMyproject_axi_CfgInitialize(&accelerator, accelerator_cfg); /* TODO: design-dependent name */
+        if (status != XST_SUCCESS) {
+            PRINTF("ERROR: Initializing accelerator\r\n");
+        }
+    } else {
+        PRINTF("ERROR: Accelerator configuration not found\r\n");
+    }
+}
+
+/* Reference implementation of the accelerator in software */
+int sw_reference_implementation(float *sw_inputs_mem, float *sw_outputs_mem, unsigned n_samples, unsigned n_X_inputs, unsigned n_y_ouputs) {
+#ifdef __DEBUG__
+    PRINTF("INFO: Reference outputs are pre-compiled. It would be nice to run a software model here.\r\n");
+#endif
+    /* See data.h for inputs and outputs */
+    for (unsigned i = 0; i < n_samples * n_y_ouputs; i++) {
+        sw_outputs_mem[i] = REFERENCE_OUTPUTS[i];
+    }
+    return 0;
+}
+
+/* Profiling function */
+double get_elapsed_time(XTime start, XTime stop) {
+    return 1.0 * (stop - start) / (COUNTS_PER_SECOND);
+}
+
+/* Dump data to the console */
+void dump_data(const char* label, float* data, unsigned n_samples, unsigned feature_count) {
+    PRINTF("INFO: %s[%u][%u]:\r\n", label, n_samples, feature_count);
+    /* Print at most MAX_PRINT_ELEMENTS */
+    for (unsigned i = 0; i < n_samples && i < MAX_PRINT_ELEMENTS; i++) {
+        PRINTF("INFO: [%u] ", i);
+        for (unsigned j = 0; j < feature_count; j++) {
+            unsigned index = i * feature_count + j;
+            PRINTF("%f ", data[index]);
+        }
+        PRINTF("\r\n");
+    }
+}
+
+/* The top of the hill :-) */
+int main(int argc, char** argv) {
+
+    XTime start, stop;
+    double calibration_time;
+    double sw_elapsed = 0;
+    double hw_elapsed = 0;
+    double cache_elapsed = 0;
+    unsigned hw_errors;
+
+    char __attribute__ ((unused)) dummy; /* dummy input */
+
+    /* Initialize platform (uart and caches) */
+    init_platform();
+
+    PRINTF("\r\n");
+    PRINTF("INFO: ==================================================\r\n");
+    PRINTF("INFO: XMyproject_axi (w/ polling)\r\n"); /* TODO: design-dependent name */
+    PRINTF("INFO: ==================================================\r\n");
+
+    init_accelerators();
+
+    inputs_mem = malloc(INPUT_N_ELEMENTS * sizeof(float));
+    outputs_mem = malloc(OUTPUT_N_ELEMENTS * sizeof(float));
+    reference_mem = malloc(OUTPUT_N_ELEMENTS * sizeof(float));
+    /* The buffers rely on the enlarged heap requested at the top of this file; stop before dereferencing NULL */
+    if (!inputs_mem || !outputs_mem || !reference_mem) {
+        PRINTF("ERROR: Cannot allocate the sample buffers (check the heap size)\r\n");
+        cleanup_platform();
+        return 1;
+    }
+
+    /* Calibration */
+    XTime_GetTime(&start);
+    sleep(1);
+    XTime_GetTime(&stop);
+    calibration_time = get_elapsed_time(start, stop);
+    PRINTF("INFO: Time calibration for one second (%lf sec)\r\n", calibration_time);
+
+    /* Initialize memory */
+    PRINTF("INFO: Initialize memory\r\n");
+    PRINTF("INFO: - Samples count: %u\r\n", N_SAMPLES); /* Same as dst_SAMPLE_COUNT */
+    PRINTF("INFO: - Inputs count: %u\r\n", N_X_INPUTS);
+    PRINTF("INFO: - Outputs count: %u\r\n", N_Y_OUTPUTS);
+    PRINTF("INFO: - Data size: %u B\r\n", sizeof(float));
+    PRINTF("INFO: - Total input size: %u B, %.2f KB, %.2f MB\r\n", N_X_INPUTS * N_SAMPLES * sizeof(float), (N_X_INPUTS * N_SAMPLES * sizeof(float)) / (float)1024, (N_X_INPUTS * N_SAMPLES * sizeof(float)) / (float)(1024*1024));
+    PRINTF("INFO: - Total output size: %u B, %.2f KB, %.2f MB\r\n", N_Y_OUTPUTS * N_SAMPLES * sizeof(float), (N_Y_OUTPUTS * N_SAMPLES * sizeof(float)) / (float)1024, (N_Y_OUTPUTS * N_SAMPLES * sizeof(float)) / (float)(1024*1024));
+
+    // Set Heap Size in ldscript.ld to 0x1000000 (16MB)
+    //malloc_stats();
+
+    for (int i = 0; i < INPUT_N_ELEMENTS; i++) {
+        inputs_mem[i] = data_X_inputs[i];
+    }
+    for (int i = 0; i < OUTPUT_N_ELEMENTS; i++) {
+        outputs_mem[i] = 0x0;
+    }
+
+    /* ****** SW REFERENCE ****** */
+    PRINTF("INFO: ==================================================\r\n");
+    PRINTF("INFO: Start SW reference implementation\r\n");
+    XTime_GetTime(&start);
+    sw_reference_implementation(inputs_mem, reference_mem, N_SAMPLES, N_X_INPUTS, N_Y_OUTPUTS);
+    XTime_GetTime(&stop);
+    sw_elapsed = get_elapsed_time(start, stop);
+    PRINTF("INFO: ==================================================\r\n");
+    PRINTF("INFO: Press any key to start:\r\n");
+    dummy = inbyte();
+    //PRINTF("INFO:");
+
+    /* ****** HW ACCELERATOR ****** */
+    PRINTF("INFO: Start HW accelerator\r\n");
+
+    XTime_GetTime(&start);
+    Xil_DCacheFlushRange((UINTPTR)inputs_mem, INPUT_N_ELEMENTS * sizeof(float));
+    Xil_DCacheFlushRange((UINTPTR)outputs_mem, OUTPUT_N_ELEMENTS * sizeof(float));
+    Xil_DCacheFlushRange((UINTPTR)reference_mem, OUTPUT_N_ELEMENTS * sizeof(float));
+    XTime_GetTime(&stop);
+    cache_elapsed = get_elapsed_time(start, stop);
+
+    for (unsigned j = 0; j < N_SAMPLES; j++) {
+        float *inputs_mem_i = inputs_mem + j * N_X_INPUTS;
+        float *outputs_mem_i = outputs_mem + j * N_Y_OUTPUTS;
+
+        /* Configure the accelerator */
+        XTime_GetTime(&start);
+        XMyproject_axi_Set_in_r(&accelerator, (unsigned)inputs_mem_i); /* TODO: design-dependent name */
+        XMyproject_axi_Set_out_r(&accelerator, (unsigned)outputs_mem_i); /* TODO: design-dependent name */
+
+        XMyproject_axi_Start(&accelerator); /* TODO: design-dependent name */
+
+        /* Polling */
+        while (!XMyproject_axi_IsDone(&accelerator)); /* TODO: design-dependent name */
+
+        /* Get error status */
+        //hw_flags = XMyproject_axi_Get_return(&accelerator); /* TODO: design-dependent name */
+        XTime_GetTime(&stop);
+        hw_elapsed += get_elapsed_time(start, stop);
+    }
+
+    XTime_GetTime(&start);
+    Xil_DCacheFlushRange((UINTPTR)outputs_mem, OUTPUT_N_ELEMENTS * sizeof(float));
+    XTime_GetTime(&stop);
+    cache_elapsed += get_elapsed_time(start, stop);
+
+    PRINTF("INFO: HW accelerator done!\r\n");
+
+    /* ****** VALIDATION ****** */
+    PRINTF("INFO: ================== Verification ==================\r\n");
+#ifdef __DEBUG__
+    PRINTF("INFO: Dump data\r\n");
+    dump_data("inputs_mem", inputs_mem, N_SAMPLES, N_X_INPUTS);
+    dump_data("outputs_mem", outputs_mem, N_SAMPLES, N_Y_OUTPUTS);
+    dump_data("reference_mem", reference_mem, N_SAMPLES, N_Y_OUTPUTS);
+#endif
+
+#ifdef __DEBUG__
+    PRINTF("INFO: SW execution time: %f sec\r\n", sw_elapsed);
+#endif
+    PRINTF("INFO: HW-acceleration exec. time (%d inferences):\r\n", N_SAMPLES);
+    PRINTF("INFO: - total %f sec\r\n", hw_elapsed);
+    PRINTF("INFO: - per-inference %.12f sec (%f ns)\r\n", hw_elapsed / (N_SAMPLES), (hw_elapsed*1e9) / (N_SAMPLES));
+    PRINTF("INFO: Cache flush time: %f sec\r\n", cache_elapsed);
+#ifdef __DEBUG__
+    PRINTF("INFO: HW/SW speedup (the software is fake so this does not count...): %.2f X\r\n", (sw_elapsed >= (hw_elapsed+cache_elapsed))?(sw_elapsed/(hw_elapsed+cache_elapsed)):-((hw_elapsed+cache_elapsed)/sw_elapsed));
+#endif
+
+    hw_errors = 0;
+#if 1
+    /* Accelerator verification */
+    for (int i = 0; i < OUTPUT_N_ELEMENTS; i++) {
+        if (outputs_mem[i] != reference_mem[i]) {
+            PRINTF("ERROR: [%d]: Accelerator HW %f != SW %f\r\n", i, outputs_mem[i], reference_mem[i]);
+            hw_errors++;
+        }
+    }
+    PRINTF("INFO: Total errors = %u (out of %u elements)\r\n", hw_errors, OUTPUT_N_ELEMENTS);
+    if (hw_errors > 0)
+        PRINTF("INFO: Verification: FAIL\r\n");
+    else
+        PRINTF("INFO: Verification: PASS!\r\n");
+#else
+    /* Accelerator validation */
+    for (unsigned s = 0; s < N_SAMPLES; s++) {
+        unsigned ref_digit = get_max(reference_mem + s * N_Y_OUTPUTS, N_Y_OUTPUTS);
+        unsigned hw_digit = get_max(outputs_mem + s * N_Y_OUTPUTS, N_Y_OUTPUTS);
+        if (hw_digit != ref_digit) {
+#ifdef __DEBUG__
+            PRINTF("ERROR: [%u]: Accelerator HW %u != SW %u\r\n", s, hw_digit, ref_digit);
+#endif
+            hw_errors++;
+        }
+    }
+    float error_rate = (hw_errors / (float)(N_SAMPLES)) * 100.0;
+    float accuracy = 100 - ((hw_errors / (float)(N_SAMPLES)) * 100.0);
+    PRINTF("INFO: Total errors = %u (out of %d digits)\r\n", hw_errors, N_SAMPLES);
+    PRINTF("INFO: Error rate = %.2f %%\r\n", error_rate);
+    PRINTF("INFO: Accuracy = %.2f %%\r\n", accuracy);
+#endif
+    PRINTF("INFO: 
==================================================\r\n");
+
+    cleanup_platform();
+
+    return 0;
+}
+
+
diff --git a/hls4ml/templates/vivado_accelerator/pynq-z1/c_drivers/sdk/setup.tcl b/hls4ml/templates/vivado_accelerator/pynq-z1/c_drivers/sdk/setup.tcl
new file mode 100644
index 0000000000..5e9e92d501
--- /dev/null
+++ b/hls4ml/templates/vivado_accelerator/pynq-z1/c_drivers/sdk/setup.tcl
@@ -0,0 +1,14 @@
+# See
+# https://www.xilinx.com/html_docs/xilinx2019_1/SDK_Doc/xsct/intro/xsct_introduction.html
+
+setws .
+if { $::argc == 1 } {
+    set myproject [lindex $::argv 0]
+    createhw -name ${myproject}\_platform -hwspec ../hdf/${myproject}\_wrapper.hdf
+    createapp -name ${myproject}\_standalone -app {Hello World} -proc ps7_cortexa9_0 -hwproject ${myproject}\_platform -os standalone
+    configapp -app ${myproject}\_standalone build-config release
+    configapp -app ${myproject}\_standalone -add linker-misc {-Wl,--defsym=_HEAP_SIZE=0x1000000}
+    configapp -app ${myproject}\_standalone -add linker-misc {-Wl,--defsym=_STACK_SIZE=0x40000}
+    projects -build
+    #configapp -app ${myproject}\_standalone -add define-compiler-symbols {FLAG=VALUE}
+}
diff --git a/hls4ml/templates/vivado_accelerator/pynq-z1/python_drivers/axi_stream_driver.py b/hls4ml/templates/vivado_accelerator/pynq-z1/python_drivers/axi_stream_driver.py
new file mode 100644
index 0000000000..4adb187ab4
--- /dev/null
+++ b/hls4ml/templates/vivado_accelerator/pynq-z1/python_drivers/axi_stream_driver.py
@@ -0,0 +1,89 @@
+from pynq import DefaultHierarchy, DefaultIP, allocate
+from pynq import Overlay
+from datetime import datetime
+import pynq.lib.dma
+import numpy as np
+
+
+class NeuralNetworkOverlay(Overlay):
+    """Overlay that drives the accelerator through the DMA engine `hier_0.axi_dma_0`."""
+
+    def __init__(self, bitfile_name, x_shape, y_shape, dtype=np.float32, dtbo=None, download=True, ignore_version=False,
+                 device=None):
+        """
+        Load the overlay and allocate the DMA input/output buffers.
+        Parameters:
+        - bitfile_name, dtbo, download, ignore_version, device : forwarded to `pynq.Overlay`.
+        - x_shape/y_shape : shapes of the input and output buffers.
+        - dtype : element type of both buffers (see `predict` for how to choose it).
+        """
+        # Forward the caller's options instead of the defaults, which silently discarded them.
+        super().__init__(bitfile_name, dtbo=dtbo, download=download, ignore_version=ignore_version, device=device)
+        self.sendchannel = self.hier_0.axi_dma_0.sendchannel
+        self.recvchannel = self.hier_0.axi_dma_0.recvchannel
+        self.input_buffer = allocate(shape=x_shape, dtype=dtype)
+        self.output_buffer = allocate(shape=y_shape, dtype=dtype)
+
+    def _print_dt(self, timea, timeb, N):
+        """Print, then return, the elapsed seconds and the inferences per second for N samples."""
+        dts = (timeb - timea).total_seconds()
+        rate = N / dts
+        print("Classified {} samples in {} seconds ({} inferences / s)".format(N, dts, rate))
+        return dts, rate
+
+    def predict(self, X, debug=False, profile=False, encode=None, decode=None):
+        """
+        Obtain the predictions of the NN implemented in the FPGA.
+        Parameters:
+        - X : the input vector. Should be numpy ndarray.
+        - debug : boolean. Set it to `True` to print a message after each transfer step.
+        - profile : boolean. Set it to `True` to print the performance of the algorithm in term of `inference/s`.
+        - encode/decode: function pointers, applied to the input before sending and to the output after
+          receiving. See the `dtype` note below for more information.
+        - return: the output array, or the tuple `(output, seconds, rate)` if `profile` is `True`. Without
+          `decode`, the output is the DMA output buffer itself (`y_shape` and `dtype` of the constructor),
+          which the next call overwrites.
+        Note on the constructor's `dtype`: it should be set depending on the interface of the accelerator;
+        if it uses 'float' types for the 'data' AXI-Stream field, 'np.float32' dtype is the correct one to use.
+        Instead if it uses 'ap_fixed', 'np.intA' is the correct one to use (note that A cannot be
+        any integer value, but it can assume {..., 8, 16, 32, ...} values. Check `numpy`
+        doc for more info).
+        In this case the encoding/decoding has to be computed by the PS. For example for
+        'ap_fixed<16,6>' type the following 2 functions are the correct ones to use for encode/decode
+        'float' -> 'ap_fixed<16,6>':
+        ```
+        def encode(xi):
+            return np.int16(round(xi * 2**10)) # note 2**10 = 2**(A-B)
+        def decode(yi):
+            return yi * 2**-10
+        encode_v = np.vectorize(encode) # to apply them element-wise
+        decode_v = np.vectorize(decode)
+        ```
+        """
+        if profile:
+            timea = datetime.now()
+        if encode is not None:
+            X = encode(X)
+        self.input_buffer[:] = X
+        self.sendchannel.transfer(self.input_buffer)
+        self.recvchannel.transfer(self.output_buffer)
+        if debug:
+            print("Transfer OK")
+        self.sendchannel.wait()
+        if debug:
+            print("Send OK")
+        self.recvchannel.wait()
+        if debug:
+            print("Receive OK")
+        # decode() may return an ordinary array, so keep self.output_buffer bound to the DMA
+        # allocation; rebinding it would pass that array to recvchannel.transfer() on the next call.
+        result = self.output_buffer
+        if decode is not None:
+            result = decode(result)
+
+        if profile:
+            timeb = datetime.now()
+            dts, rate = self._print_dt(timea, timeb, len(X))
+            return result, dts, rate
+        else:
+            return result
\ No newline at end of file
diff --git a/hls4ml/templates/vivado_accelerator/pynq-z1/tcl_scripts/axi_lite_design.tcl b/hls4ml/templates/vivado_accelerator/pynq-z1/tcl_scripts/axi_lite_design.tcl
new file mode 100644
index 0000000000..4f6847ae70
--- /dev/null
+++ b/hls4ml/templates/vivado_accelerator/pynq-z1/tcl_scripts/axi_lite_design.tcl
@@ -0,0 +1,26 @@
+set tcldir [file dirname [info script]]
+source [file join $tcldir project.tcl]
+
+create_project project_1 ${myproject}_vivado_accelerator -part xc7z020clg400-1 -force
+
+set_property board_part tul.com.tw:pynq-z2:part0:1.0 [current_project]
+set_property ip_repo_paths ${myproject}_prj [current_project]
+update_ip_catalog
+
+# Create Block Designer design
+create_bd_design "design_1"
+create_bd_cell -type ip -vlnv xilinx.com:ip:processing_system7:5.5 processing_system7_0
+apply_bd_automation -rule xilinx.com:bd_rule:processing_system7 -config {make_external "FIXED_IO, DDR" apply_board_preset "1" Master "Disable" Slave "Disable" } [get_bd_cells processing_system7_0]
+create_bd_cell -type ip -vlnv xilinx.com:hls:${myproject}_axi:1.0 ${myproject}_axi_0
+apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {Auto} Clk_xbar {Auto} Master {/processing_system7_0/M_AXI_GP0} Slave {/${myproject}_axi_0/s_axi_AXILiteS} ddr_seg {Auto} intc_ip {New AXI 
Interconnect} master_apm {0}} [get_bd_intf_pins ${myproject}_axi_0/s_axi_AXILiteS] + +make_wrapper -files [get_files ./${myproject}_vivado_accelerator/project_1.srcs/sources_1/bd/design_1/design_1.bd] -top +add_files -norecurse ./${myproject}_vivado_accelerator/project_1.srcs/sources_1/bd/design_1/hdl/design_1_wrapper.v + +reset_run impl_1 +reset_run synth_1 +launch_runs impl_1 -to_step write_bitstream -jobs 6 +wait_on_run -timeout 360 impl_1 + +open_run impl_1 +report_utilization -file util.rpt -hierarchical -hierarchical_percentages diff --git a/hls4ml/templates/vivado_accelerator/pynq-z1/tcl_scripts/axi_master_design.tcl b/hls4ml/templates/vivado_accelerator/pynq-z1/tcl_scripts/axi_master_design.tcl new file mode 100644 index 0000000000..6de05e15a7 --- /dev/null +++ b/hls4ml/templates/vivado_accelerator/pynq-z1/tcl_scripts/axi_master_design.tcl @@ -0,0 +1,88 @@ +set tcldir [file dirname [info script]] +source [file join $tcldir project.tcl] + +# Project names +set project_name "project_1" +set design_name "design_1" +set hls_solution_name "solution1" +set ps_name "processing_system7_0" +set acc_name "${myproject}_axi_0" +set part_name "xc7z020clg400-1" +set board_name "www.digilentinc.com:pynq-z1:part0:1.0" + +# Set board and chip part names +create_project ${project_name} ${myproject}_vivado_accelerator -part ${part_name} -force +set_property board_part ${board_name} [current_project] + +# Create block design +create_bd_design ${design_name} + +# Setup IP repo +#set_property ip_repo_paths ${myproject}_prj [current_project] +set_property ip_repo_paths ${myproject}_prj/${hls_solution_name}/impl/ip [current_project] +update_ip_catalog + +# Create and setup PS +create_bd_cell -type ip -vlnv xilinx.com:ip:processing_system7:5.5 ${ps_name} +apply_bd_automation -rule xilinx.com:bd_rule:processing_system7 -config {make_external "FIXED_IO, DDR" apply_board_preset "1" Master "Disable" Slave "Disable" } [get_bd_cells ${ps_name}] +set_property -dict [list 
CONFIG.PCW_USE_S_AXI_GP0 {1} CONFIG.PCW_USE_FABRIC_INTERRUPT {1} CONFIG.PCW_IRQ_F2P_INTR {1}] [get_bd_cells ${ps_name}] + +# Create accelerator +create_bd_cell -type ip -vlnv xilinx.com:hls:myproject_axi:1.0 ${acc_name} + +# Wiring +apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { \ + Clk_master {Auto} \ + Clk_slave {Auto} \ + Clk_xbar {Auto} \ + Master {/myproject_axi_0/m_axi_IN_BUS} \ + Slave {/processing_system7_0/S_AXI_GP0} \ + intc_ip {Auto} \ + master_apm {0}} [get_bd_intf_pins processing_system7_0/S_AXI_GP0] + +apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { \ + Clk_master {Auto} \ + Clk_slave {Auto} \ + Clk_xbar {Auto} \ + Master {/processing_system7_0/M_AXI_GP0} \ + Slave {/myproject_axi_0/s_axi_CTRL_BUS} \ + intc_ip {New AXI Interconnect} \ + master_apm {0}} [get_bd_intf_pins myproject_axi_0/s_axi_CTRL_BUS] + +apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { \ + Clk_master {/processing_system7_0/FCLK_CLK0 (100 MHz)} \ + Clk_slave {/processing_system7_0/FCLK_CLK0 (100 MHz)} \ + Clk_xbar {/processing_system7_0/FCLK_CLK0 (100 MHz)} \ + Master {/myproject_axi_0/m_axi_OUT_BUS} \ + Slave {/processing_system7_0/S_AXI_GP0} \ + intc_ip {/axi_smc} \ + master_apm {0}} [get_bd_intf_pins myproject_axi_0/m_axi_OUT_BUS] + +# Wiring interrupt signal +connect_bd_net [get_bd_pins myproject_axi_0/interrupt] [get_bd_pins processing_system7_0/IRQ_F2P] + +# Top level wrapper +make_wrapper -files [get_files ./${myproject}_vivado_accelerator/${project_name}.srcs/sources_1/bd/${design_name}/${design_name}.bd] -top +add_files -norecurse ./${myproject}_vivado_accelerator/${project_name}.srcs/sources_1/bd/${design_name}/hdl/${design_name}_wrapper.v + +# Memory mapping +delete_bd_objs [get_bd_addr_segs myproject_axi_0/Data_m_axi_IN_BUS/SEG_processing_system7_0_GP0_QSPI_LINEAR] +delete_bd_objs [get_bd_addr_segs -excluded myproject_axi_0/Data_m_axi_IN_BUS/SEG_processing_system7_0_GP0_IOP] +delete_bd_objs [get_bd_addr_segs -excluded 
myproject_axi_0/Data_m_axi_IN_BUS/SEG_processing_system7_0_GP0_M_AXI_GP0] +delete_bd_objs [get_bd_addr_segs myproject_axi_0/Data_m_axi_OUT_BUS/SEG_processing_system7_0_GP0_QSPI_LINEAR] +delete_bd_objs [get_bd_addr_segs -excluded myproject_axi_0/Data_m_axi_OUT_BUS/SEG_processing_system7_0_GP0_IOP] +delete_bd_objs [get_bd_addr_segs -excluded myproject_axi_0/Data_m_axi_OUT_BUS/SEG_processing_system7_0_GP0_M_AXI_GP0] + +# Run synthesis and implementation +reset_run impl_1 +reset_run synth_1 +launch_runs impl_1 -to_step write_bitstream -jobs 6 +wait_on_run -timeout 360 impl_1 + +# Reporting +open_run impl_1 +report_utilization -file util.rpt -hierarchical -hierarchical_percentages + +# Export HDF file for SDK flow +file mkdir ./hdf +file copy -force ${myproject}_vivado_accelerator/${project_name}.runs/impl_1/${design_name}_wrapper.sysdef ./hdf/${design_name}_wrapper.hdf diff --git a/hls4ml/templates/vivado_accelerator/pynq-z1/tcl_scripts/axi_stream_design.tcl b/hls4ml/templates/vivado_accelerator/pynq-z1/tcl_scripts/axi_stream_design.tcl new file mode 100644 index 0000000000..f5901c7f37 --- /dev/null +++ b/hls4ml/templates/vivado_accelerator/pynq-z1/tcl_scripts/axi_stream_design.tcl @@ -0,0 +1,59 @@ +#@todo: try to remove startgroup and endgroup and see if it work +set tcldir [file dirname [info script]] +source [file join $tcldir project.tcl] + +create_project project_1 ${myproject}_vivado_accelerator -part xc7z020clg400-1 -force + +set_property board_part tul.com.tw:pynq-z2:part0:1.0 [current_project] +set_property ip_repo_paths ${myproject}_prj [current_project] +update_ip_catalog + +create_bd_design "design_1" + +startgroup +create_bd_cell -type ip -vlnv xilinx.com:ip:processing_system7:5.5 processing_system7_0 +endgroup + +apply_bd_automation -rule xilinx.com:bd_rule:processing_system7 -config {make_external "FIXED_IO, DDR" apply_board_preset "1" Master "Disable" Slave "Disable" } [get_bd_cells processing_system7_0] + +startgroup +set_property -dict [list 
CONFIG.PCW_USE_S_AXI_HP0 {1}] [get_bd_cells processing_system7_0] +endgroup + +startgroup +create_bd_cell -type ip -vlnv xilinx.com:ip:axi_dma:7.1 axi_dma_0 +endgroup + +set_property -dict [list CONFIG.c_s_axis_s2mm_tdata_width.VALUE_SRC USER] [get_bd_cells axi_dma_0] +set_property -dict [list CONFIG.c_include_sg {0} CONFIG.c_sg_length_width {26} CONFIG.c_sg_include_stscntrl_strm {0} CONFIG.c_m_axi_mm2s_data_width ${bit_width_hls_input} CONFIG.c_m_axis_mm2s_tdata_width ${bit_width_hls_input} CONFIG.c_mm2s_burst_size {256} CONFIG.c_s_axis_s2mm_tdata_width ${bit_width_hls_output} CONFIG.c_s_axis_s2mm_data_width ${bit_width_hls_output} CONFIG.c_s2mm_burst_size {256}] [get_bd_cells axi_dma_0] + +startgroup +apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {Auto} Clk_xbar {Auto} Master {/processing_system7_0/M_AXI_GP0} Slave {/axi_dma_0/S_AXI_LITE} ddr_seg {Auto} intc_ip {New AXI Interconnect} master_apm {0}} [get_bd_intf_pins axi_dma_0/S_AXI_LITE] + +apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {Auto} Clk_xbar {Auto} Master {/axi_dma_0/M_AXI_MM2S} Slave {/processing_system7_0/S_AXI_HP0} ddr_seg {Auto} intc_ip {New AXI Interconnect} master_apm {0}} [get_bd_intf_pins processing_system7_0/S_AXI_HP0] +endgroup + +apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {/processing_system7_0/FCLK_CLK0 (100 MHz)} Clk_xbar {/processing_system7_0/FCLK_CLK0 (100 MHz)} Master {/axi_dma_0/M_AXI_S2MM} Slave {/processing_system7_0/S_AXI_HP0} ddr_seg {Auto} intc_ip {/axi_mem_intercon} master_apm {0}} [get_bd_intf_pins axi_dma_0/M_AXI_S2MM] + +startgroup +create_bd_cell -type ip -vlnv xilinx.com:hls:${myproject}_axi:1.0 ${myproject}_axi_0 +endgroup + +connect_bd_intf_net [get_bd_intf_pins axi_dma_0/M_AXIS_MM2S] [get_bd_intf_pins ${myproject}_axi_0/in_r] +connect_bd_intf_net [get_bd_intf_pins ${myproject}_axi_0/out_r] [get_bd_intf_pins axi_dma_0/S_AXIS_S2MM] + 
+apply_bd_automation -rule xilinx.com:bd_rule:clkrst -config { Clk {/processing_system7_0/FCLK_CLK0 (100 MHz)} Freq {100} Ref_Clk0 {} Ref_Clk1 {} Ref_Clk2 {}} [get_bd_pins ${myproject}_axi_0/ap_clk] + +group_bd_cells hier_0 [get_bd_cells axi_dma_0] [get_bd_cells ${myproject}_axi_0] + +make_wrapper -files [get_files ./${myproject}_vivado_accelerator/project_1.srcs/sources_1/bd/design_1/design_1.bd] -top + +add_files -norecurse ./${myproject}_vivado_accelerator/project_1.srcs/sources_1/bd/design_1/hdl/design_1_wrapper.v + +reset_run impl_1 +reset_run synth_1 +launch_runs impl_1 -to_step write_bitstream -jobs 6 +wait_on_run -timeout 360 impl_1 + +open_run impl_1 +report_utilization -file util.rpt -hierarchical -hierarchical_percentages diff --git a/hls4ml/templates/vivado_accelerator/pynq-z2/c_drivers/sdk/Makefile b/hls4ml/templates/vivado_accelerator/pynq-z2/c_drivers/sdk/Makefile new file mode 100644 index 0000000000..03ab9b8de7 --- /dev/null +++ b/hls4ml/templates/vivado_accelerator/pynq-z2/c_drivers/sdk/Makefile @@ -0,0 +1,33 @@ +DESIGN := design_1 + +help: + @echo "INFO: make to show targets" +.PHONY: help + +--setup: + xsct ./setup.tcl $(DESIGN) +.PHONY: --setup + +sdk: --setup + rm -f $(DESIGN)_standalone/src/helloworld.c + cd $(DESIGN)_standalone/src && ln -s ../../common/main.c main.c + cd $(DESIGN)_standalone/src && ln -s ../../common/data.h data.h +.PHONY: sdk + +gui: + xsdk --workspace . 
&
+.PHONY: gui
+
+clean:
+	rm -rf $(DESIGN)_platform
+	rm -rf $(DESIGN)_standalone
+	rm -rf $(DESIGN)_standalone_bsp
+	rm -rf RemoteSystemsTempFiles
+	rm -rf .Xil
+	rm -rf .metadata
+	rm -f *.log
+.PHONY: clean
+
+ultraclean: clean
+	rm -rf hdf/*.hdf
+.PHONY: ultraclean
diff --git a/hls4ml/templates/vivado_accelerator/pynq-z2/c_drivers/sdk/common/main.c b/hls4ml/templates/vivado_accelerator/pynq-z2/c_drivers/sdk/common/main.c
new file mode 100644
index 0000000000..7dd2be22a8
--- /dev/null
+++ b/hls4ml/templates/vivado_accelerator/pynq-z2/c_drivers/sdk/common/main.c
@@ -0,0 +1,270 @@
+/**
+ *
+ * Set Heap Size in ldscript.ld to 0x1000000 (16MB)
+ *
+ */
+
+#include "xmyproject_axi.h" /* TODO: design-dependent name */
+#include "stdio.h" /* PRINTF */
+#include "unistd.h" /* sleep */
+#include "stdlib.h"
+#include "malloc.h"
+#include "assert.h"
+#include "xil_io.h" /* peripheral read/write wrappers */
+#include "xtime_l.h" /* to measure performance of the system */
+#include "platform.h" /* platform init/cleanup functions */
+#include "xil_cache.h" /* enable/disable caches etc */
+#include "xil_printf.h" /* UART debug print functions */
+#include "xparameters.h" /* peripherals base addresses */
+
+#include "data.h"
+
+//#define __DEBUG__
+
+#define MAX_PRINT_ELEMENTS (16)
+
+#define PRINTF printf
+
+const unsigned INPUT_N_ELEMENTS = N_SAMPLES * N_X_INPUTS;
+const unsigned OUTPUT_N_ELEMENTS = N_SAMPLES * N_Y_OUTPUTS;
+
+#if 1
+/* Accelerator verification */
+#define REFERENCE_OUTPUTS data_y_hls_outputs
+#else
+/* Accelerator validation */
+#define REFERENCE_OUTPUTS data_y_outputs
+//#define REFERENCE_OUTPUTS data_y_keras_outputs
+#endif
+
+/* Return the index of the largest of the n_elements values in data; ties
+ * resolve to the highest index and an empty array yields 0. The running
+ * maximum is seeded from data[0], not 0.0, so all-negative inputs work. */
+unsigned get_max(float *data, unsigned n_elements) {
+    unsigned max_index = 0;
+    for (unsigned i = 1; i < n_elements; i++)
+        if (data[i] >= data[max_index])
+            max_index = i;
+    return max_index;
+}
+
+float *inputs_mem = NULL;
+float *outputs_mem = NULL;
+float *reference_mem = NULL;
+
+/* Accelerator configuration */
+XMyproject_axi accelerator; /* TODO: design-dependent name */
+XMyproject_axi_Config *accelerator_cfg; /* TODO: design-dependent name */
+
+/* Accelerator initialization routine */
+void init_accelerators() {
+    PRINTF("INFO: Initializing accelerator\r\n");
+    accelerator_cfg = XMyproject_axi_LookupConfig(XPAR_MYPROJECT_AXI_0_DEVICE_ID); /* TODO: design-dependent name */
+    if (accelerator_cfg) {
+        int status = XMyproject_axi_CfgInitialize(&accelerator, accelerator_cfg); /* TODO: design-dependent name */
+        if (status != XST_SUCCESS) {
+            PRINTF("ERROR: Initializing accelerator\r\n");
+        }
+    } else {
+        PRINTF("ERROR: Accelerator configuration not found\r\n");
+    }
+}
+
+/* Reference implementation of the accelerator in software */
+int sw_reference_implementation(float *sw_inputs_mem, float *sw_outputs_mem, unsigned n_samples, unsigned n_X_inputs, unsigned n_y_ouputs) {
+#ifdef __DEBUG__
+    PRINTF("INFO: Reference outputs are pre-compiled. It would be nice to run a software model here.\r\n");
+#endif
+    /* See data.h for inputs and outputs */
+    for (unsigned i = 0; i < n_samples * n_y_ouputs; i++) {
+        sw_outputs_mem[i] = REFERENCE_OUTPUTS[i];
+    }
+    return 0;
+}
+
+/* Profiling function */
+double get_elapsed_time(XTime start, XTime stop) {
+    return 1.0 * (stop - start) / (COUNTS_PER_SECOND);
+}
+
+/* Dump data to the console */
+void dump_data(const char* label, float* data, unsigned n_samples, unsigned feature_count) {
+    PRINTF("INFO: %s[%u][%u]:\r\n", label, n_samples, feature_count);
+    /* Print at most MAX_PRINT_ELEMENTS */
+    for (unsigned i = 0; i < n_samples && i < MAX_PRINT_ELEMENTS; i++) {
+        PRINTF("INFO: [%u] ", i);
+        for (unsigned j = 0; j < feature_count; j++) {
+            unsigned index = i * feature_count + j;
+            PRINTF("%f ", data[index]);
+        }
+        PRINTF("\r\n");
+    }
+}
+
+/* The top of the hill :-) */
+int main(int argc, char** argv) {
+
+    XTime start, stop;
+    double calibration_time;
+    double sw_elapsed = 0;
+    double hw_elapsed = 0;
+    double cache_elapsed = 0;
+    unsigned hw_errors;
+
+    char __attribute__ ((unused)) dummy; /* dummy input */
+
+    /* Initialize platform (uart and caches) */
+    init_platform();
+
+    PRINTF("\r\n");
+    PRINTF("INFO: ==================================================\r\n");
+    PRINTF("INFO: XMyproject_axi (w/ polling)\r\n"); /* TODO: design-dependent name */
+    PRINTF("INFO: ==================================================\r\n");
+
+    init_accelerators();
+
+    inputs_mem = malloc(INPUT_N_ELEMENTS * sizeof(float));
+    outputs_mem = malloc(OUTPUT_N_ELEMENTS * sizeof(float));
+    reference_mem = malloc(OUTPUT_N_ELEMENTS * sizeof(float));
+    /* The buffers rely on the enlarged heap requested at the top of this file; stop before dereferencing NULL */
+    if (!inputs_mem || !outputs_mem || !reference_mem) {
+        PRINTF("ERROR: Cannot allocate the sample buffers (check the heap size)\r\n");
+        cleanup_platform();
+        return 1;
+    }
+
+    /* Calibration */
+    XTime_GetTime(&start);
+    sleep(1);
+    XTime_GetTime(&stop);
+    calibration_time = get_elapsed_time(start, stop);
+    PRINTF("INFO: Time calibration for one second (%lf sec)\r\n", calibration_time);
+
+    /* Initialize memory */
+    PRINTF("INFO: Initialize memory\r\n");
+    PRINTF("INFO: - Samples count: %u\r\n", N_SAMPLES); /* Same as dst_SAMPLE_COUNT */
+    PRINTF("INFO: - Inputs count: %u\r\n", N_X_INPUTS);
+    PRINTF("INFO: - Outputs count: %u\r\n", N_Y_OUTPUTS);
+    PRINTF("INFO: - Data size: %u B\r\n", sizeof(float));
+    PRINTF("INFO: - Total input size: %u B, %.2f KB, %.2f MB\r\n", N_X_INPUTS * N_SAMPLES * sizeof(float), (N_X_INPUTS * N_SAMPLES * sizeof(float)) / (float)1024, (N_X_INPUTS * N_SAMPLES * sizeof(float)) / (float)(1024*1024));
+    PRINTF("INFO: - Total output size: %u B, %.2f KB, %.2f MB\r\n", N_Y_OUTPUTS * N_SAMPLES * sizeof(float), (N_Y_OUTPUTS * N_SAMPLES * sizeof(float)) / (float)1024, (N_Y_OUTPUTS * N_SAMPLES * sizeof(float)) / (float)(1024*1024));
+
+    // Set Heap Size in ldscript.ld to 0x1000000 (16MB)
+    //malloc_stats();
+
+    for (int i = 0; i < INPUT_N_ELEMENTS; i++) {
+        inputs_mem[i] = data_X_inputs[i];
+    }
+    for (int i = 0; i < OUTPUT_N_ELEMENTS; i++) {
+        outputs_mem[i] = 0x0;
+    }
+
+    /* ****** SW REFERENCE ****** */
+    PRINTF("INFO: ==================================================\r\n");
+    PRINTF("INFO: Start SW reference implementation\r\n");
+    XTime_GetTime(&start);
+    sw_reference_implementation(inputs_mem, reference_mem, N_SAMPLES, N_X_INPUTS, N_Y_OUTPUTS);
+    XTime_GetTime(&stop);
+    sw_elapsed = get_elapsed_time(start, stop);
+    PRINTF("INFO: ==================================================\r\n");
+    PRINTF("INFO: Press any key to start:\r\n");
+    dummy = inbyte();
+    //PRINTF("INFO:");
+
+    /* ****** HW ACCELERATOR ****** */
+    PRINTF("INFO: Start HW accelerator\r\n");
+
+    XTime_GetTime(&start);
+    Xil_DCacheFlushRange((UINTPTR)inputs_mem, INPUT_N_ELEMENTS * sizeof(float));
+    Xil_DCacheFlushRange((UINTPTR)outputs_mem, OUTPUT_N_ELEMENTS * sizeof(float));
+    Xil_DCacheFlushRange((UINTPTR)reference_mem, OUTPUT_N_ELEMENTS * sizeof(float));
+    XTime_GetTime(&stop);
+    cache_elapsed = get_elapsed_time(start, stop);
+
+    for (unsigned j = 0; j < N_SAMPLES; j++) {
+        float *inputs_mem_i = inputs_mem + j * N_X_INPUTS;
+        float *outputs_mem_i = outputs_mem + j * N_Y_OUTPUTS;
+
+        /* Configure the accelerator */
+        XTime_GetTime(&start);
+        XMyproject_axi_Set_in_r(&accelerator, (unsigned)inputs_mem_i); /* TODO: design-dependent name */
+        XMyproject_axi_Set_out_r(&accelerator, (unsigned)outputs_mem_i); /* TODO: design-dependent name */
+
+        XMyproject_axi_Start(&accelerator); /* TODO: design-dependent name */
+
+        /* Polling */
+        while (!XMyproject_axi_IsDone(&accelerator)); /* TODO: design-dependent name */
+
+        /* Get error status */
+        //hw_flags = XMyproject_axi_Get_return(&accelerator); /* TODO: design-dependent name */
+        XTime_GetTime(&stop);
+        hw_elapsed += get_elapsed_time(start, stop);
+    }
+
+    XTime_GetTime(&start);
+    Xil_DCacheFlushRange((UINTPTR)outputs_mem, OUTPUT_N_ELEMENTS * sizeof(float));
+    XTime_GetTime(&stop);
+    cache_elapsed += get_elapsed_time(start, stop);
+
+    PRINTF("INFO: HW accelerator done!\r\n");
+
+    /* ****** VALIDATION ****** */
+    PRINTF("INFO: ================== Verification ==================\r\n");
+#ifdef __DEBUG__
+    PRINTF("INFO: Dump data\r\n");
+    dump_data("inputs_mem", inputs_mem, N_SAMPLES, N_X_INPUTS);
+    dump_data("outputs_mem", outputs_mem, N_SAMPLES, N_Y_OUTPUTS);
+    dump_data("reference_mem", reference_mem, N_SAMPLES, N_Y_OUTPUTS);
+#endif
+
+#ifdef __DEBUG__
+    PRINTF("INFO: SW execution time: %f sec\r\n", sw_elapsed);
+#endif
+    PRINTF("INFO: HW-acceleration exec. time (%d inferences):\r\n", N_SAMPLES);
+    PRINTF("INFO: - total %f sec\r\n", hw_elapsed);
+    PRINTF("INFO: - per-inference %.12f sec (%f ns)\r\n", hw_elapsed / (N_SAMPLES), (hw_elapsed*1e9) / (N_SAMPLES));
+    PRINTF("INFO: Cache flush time: %f sec\r\n", cache_elapsed);
+#ifdef __DEBUG__
+    PRINTF("INFO: HW/SW speedup (the software is fake so this does not count...): %.2f X\r\n", (sw_elapsed >= (hw_elapsed+cache_elapsed))?(sw_elapsed/(hw_elapsed+cache_elapsed)):-((hw_elapsed+cache_elapsed)/sw_elapsed));
+#endif
+
+    hw_errors = 0;
+#if 1
+    /* Accelerator verification */
+    for (int i = 0; i < OUTPUT_N_ELEMENTS; i++) {
+        if (outputs_mem[i] != reference_mem[i]) {
+            PRINTF("ERROR: [%d]: Accelerator HW %f != SW %f\r\n", i, outputs_mem[i], reference_mem[i]);
+            hw_errors++;
+        }
+    }
+    PRINTF("INFO: Total errors = %u (out of %u elements)\r\n", hw_errors, OUTPUT_N_ELEMENTS);
+    if (hw_errors > 0)
+        PRINTF("INFO: Verification: FAIL\r\n");
+    else
+        PRINTF("INFO: Verification: PASS!\r\n");
+#else
+    /* Accelerator validation */
+    for (unsigned s = 0; s < N_SAMPLES; s++) {
+        unsigned ref_digit = get_max(reference_mem + s * N_Y_OUTPUTS, N_Y_OUTPUTS);
+        unsigned hw_digit = get_max(outputs_mem + s * N_Y_OUTPUTS, N_Y_OUTPUTS);
+        if (hw_digit != ref_digit) {
+#ifdef __DEBUG__
+            PRINTF("ERROR: [%u]: Accelerator HW %u != SW %u\r\n", s, hw_digit, ref_digit);
+#endif
+            hw_errors++;
+        }
+    }
+    float error_rate = (hw_errors / (float)(N_SAMPLES)) * 100.0;
+    float accuracy = 100 - ((hw_errors / (float)(N_SAMPLES)) * 100.0);
+    PRINTF("INFO: Total errors = %u (out of %d digits)\r\n", hw_errors, N_SAMPLES);
+    PRINTF("INFO: Error rate = %.2f %%\r\n", error_rate);
+    PRINTF("INFO: Accuracy = %.2f %%\r\n", accuracy);
+#endif
+    PRINTF("INFO: 
# Vivado batch script: builds the PYNQ-Z2 block design around the HLS
# AXI-master accelerator, runs implementation up to the bitstream, writes a
# utilization report and exports the hardware description for the SDK flow.
#
# project.tcl (sourced below) is expected to define `myproject`
# -- TODO confirm against the generator that writes project.tcl.
set tcldir [file dirname [info script]]
source [file join $tcldir project.tcl]

# Project names
set project_name "project_1"
set design_name "design_1"
set hls_solution_name "solution1"
set ps_name "processing_system7_0"
set acc_name "${myproject}_axi_0"
set part_name "xc7z020clg400-1"
set board_name "tul.com.tw:pynq-z2:part0:1.0"

# Set board and chip part names
create_project ${project_name} ${myproject}_vivado_accelerator -part ${part_name} -force
set_property board_part ${board_name} [current_project]

# Create block design
create_bd_design ${design_name}

# Setup IP repo
#set_property ip_repo_paths ${myproject}_prj [current_project]
set_property ip_repo_paths ${myproject}_prj/${hls_solution_name}/impl/ip [current_project]
update_ip_catalog

# Create and setup PS
create_bd_cell -type ip -vlnv xilinx.com:ip:processing_system7:5.5 ${ps_name}
apply_bd_automation -rule xilinx.com:bd_rule:processing_system7 -config {make_external "FIXED_IO, DDR" apply_board_preset "1" Master "Disable" Slave "Disable" } [get_bd_cells ${ps_name}]
set_property -dict [list CONFIG.PCW_USE_S_AXI_GP0 {1} CONFIG.PCW_USE_FABRIC_INTERRUPT {1} CONFIG.PCW_IRQ_F2P_INTR {1}] [get_bd_cells ${ps_name}]

# Create accelerator. The IP name is derived from ${myproject} so the script
# also works for projects that are not literally called "myproject".
create_bd_cell -type ip -vlnv xilinx.com:hls:${myproject}_axi:1.0 ${acc_name}

# Wiring
# The -config dictionaries are built with `subst` because plain braces would
# suppress the ${acc_name}/${ps_name} substitution.
apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config [subst -nocommands -nobackslashes {
    Clk_master {Auto}
    Clk_slave {Auto}
    Clk_xbar {Auto}
    Master {/${acc_name}/m_axi_IN_BUS}
    Slave {/${ps_name}/S_AXI_GP0}
    intc_ip {Auto}
    master_apm {0}}] [get_bd_intf_pins ${ps_name}/S_AXI_GP0]

apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config [subst -nocommands -nobackslashes {
    Clk_master {Auto}
    Clk_slave {Auto}
    Clk_xbar {Auto}
    Master {/${ps_name}/M_AXI_GP0}
    Slave {/${acc_name}/s_axi_CTRL_BUS}
    intc_ip {New AXI Interconnect}
    master_apm {0}}] [get_bd_intf_pins ${acc_name}/s_axi_CTRL_BUS]

apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config [subst -nocommands -nobackslashes {
    Clk_master {/${ps_name}/FCLK_CLK0 (100 MHz)}
    Clk_slave {/${ps_name}/FCLK_CLK0 (100 MHz)}
    Clk_xbar {/${ps_name}/FCLK_CLK0 (100 MHz)}
    Master {/${acc_name}/m_axi_OUT_BUS}
    Slave {/${ps_name}/S_AXI_GP0}
    intc_ip {/axi_smc}
    master_apm {0}}] [get_bd_intf_pins ${acc_name}/m_axi_OUT_BUS]

# Wiring interrupt signal
connect_bd_net [get_bd_pins ${acc_name}/interrupt] [get_bd_pins ${ps_name}/IRQ_F2P]

# Top level wrapper
make_wrapper -files [get_files ./${myproject}_vivado_accelerator/${project_name}.srcs/sources_1/bd/${design_name}/${design_name}.bd] -top
add_files -norecurse ./${myproject}_vivado_accelerator/${project_name}.srcs/sources_1/bd/${design_name}/hdl/${design_name}_wrapper.v

# Memory mapping: drop the address segments the accelerator masters do not use.
foreach bus {IN_BUS OUT_BUS} {
    delete_bd_objs [get_bd_addr_segs ${acc_name}/Data_m_axi_${bus}/SEG_${ps_name}_GP0_QSPI_LINEAR]
    delete_bd_objs [get_bd_addr_segs -excluded ${acc_name}/Data_m_axi_${bus}/SEG_${ps_name}_GP0_IOP]
    delete_bd_objs [get_bd_addr_segs -excluded ${acc_name}/Data_m_axi_${bus}/SEG_${ps_name}_GP0_M_AXI_GP0]
}

# Run synthesis and implementation
reset_run impl_1
reset_run synth_1
launch_runs impl_1 -to_step write_bitstream -jobs 6
wait_on_run -timeout 360 impl_1

# Reporting
open_run impl_1
report_utilization -file util.rpt -hierarchical -hierarchical_percentages

# Export HDF file for SDK flow
file mkdir ./hdf
file copy -force ${myproject}_vivado_accelerator/${project_name}.runs/impl_1/${design_name}_wrapper.sysdef ./hdf/${design_name}_wrapper.hdf
b/hls4ml/templates/vivado_accelerator/ultra96v2/c_drivers/axi_master_driver.h new file mode 100644 index 0000000000..8a46df8bde --- /dev/null +++ b/hls4ml/templates/vivado_accelerator/ultra96v2/c_drivers/axi_master_driver.h @@ -0,0 +1,6 @@ +#include "xil_printf.h" + +int main(void) { + xil_printf("Hello world!\r\n"); + return 0; +} diff --git a/hls4ml/templates/vivado_accelerator/ultra96v2/c_drivers/sdk/Makefile b/hls4ml/templates/vivado_accelerator/ultra96v2/c_drivers/sdk/Makefile new file mode 100644 index 0000000000..03ab9b8de7 --- /dev/null +++ b/hls4ml/templates/vivado_accelerator/ultra96v2/c_drivers/sdk/Makefile @@ -0,0 +1,33 @@ +DESIGN := design_1 + +help: + @echo "INFO: make to show targets" +.PHONY: help + +--setup: + xsct ./setup.tcl $(DESIGN) +.PHONY: --setup + +sdk: --setup + rm -f $(DESIGN)_standalone/src/helloworld.c + cd $(DESIGN)_standalone/src && ln -s ../../common/main.c main.c + cd $(DESIGN)_standalone/src && ln -s ../../common/data.h data.h +.PHONY: sdk + +gui: + xsdk --workspace . 
/*
 * Return the index of the largest value in data[0 .. n_elements-1].
 *
 * The running maximum is seeded with the first element rather than 0.0, so
 * vectors whose values are all negative are handled correctly (a 0.0 seed
 * never matched any element and index 0 was always returned).
 * When several elements share the maximum, the last one wins, as before.
 * Returns 0 for a NULL or empty input.
 */
unsigned get_max(float *data, unsigned n_elements) {
    if (data == NULL || n_elements == 0)
        return 0;

    float max_value = data[0];
    unsigned max_index = 0;
    for (unsigned i = 1; i < n_elements; i++) {
        if (data[i] >= max_value) {
            max_index = i;
            max_value = data[i];
        }
    }
    return max_index;
}
NULL; + +/* Accelerator configuration */ +XMyproject_axi accelerator; /* TODO: design-dependent name */ +XMyproject_axi_Config *accelerator_cfg; /* TODO: design-dependent name */ + +/* Accelerator initialization routine */ +void init_accelerators() { + PRINTF("INFO: Initializing accelerator\r\n"); + accelerator_cfg = XMyproject_axi_LookupConfig(XPAR_MYPROJECT_AXI_0_DEVICE_ID); /* TODO: design-dependent name */ + if (accelerator_cfg) { + int status = XMyproject_axi_CfgInitialize(&accelerator, accelerator_cfg); /* TODO: design-dependent name */ + if (status != XST_SUCCESS) { + PRINTF("ERROR: Initializing accelerator\r\n"); + } + } +} + +/* Reference implementation of the accelerator in software */ +int sw_reference_implementation(float *sw_inputs_mem, float *sw_outputs_mem, unsigned n_samples, unsigned n_X_inputs, unsigned n_y_ouputs) { +#ifdef __DEBUG__ + PRINTF("INFO: Reference outputs are pre-compiled. It would be nice to run a software model here.\r\n"); +#endif + /* See data.h for inputs and outputs */ + for (unsigned i = 0; i < n_samples * n_y_ouputs; i++) { + sw_outputs_mem[i] = REFERENCE_OUTPUTS[i]; + } + return 0; +} + +/* Profiling function */ +double get_elapsed_time(XTime start, XTime stop) { + return 1.0 * (stop - start) / (COUNTS_PER_SECOND); +} + +/* Dump data to the console */ +void dump_data(const char* label, float* data, unsigned n_samples, unsigned feature_count) { + PRINTF("INFO: %s[%u][%u]:\r\n", label, n_samples, feature_count); + /* Print at most MAX_PRINT_ELEMENTS */ + for (unsigned i = 0; i < n_samples && i < MAX_PRINT_ELEMENTS; i++) { + PRINTF("INFO: [%u] ", i); + for (unsigned j = 0; j < feature_count; j++) { + unsigned index = i * feature_count + j; + PRINTF("%f ", data[index]); + } + PRINTF("\r\n"); + } +} + +/* The top of the hill :-) */ +int main(int argc, char** argv) { + + XTime start, stop; + double calibration_time; + double sw_elapsed = 0; + double hw_elapsed = 0; + double cache_elapsed = 0; + unsigned hw_errors; + + char 
__attribute__ ((unused)) dummy; /* dummy input */ + + /* Initialize platform (uart and caches) */ + init_platform(); + + PRINTF("\r\n"); + PRINTF("INFO: ==================================================\r\n"); + PRINTF("INFO: XMyproject_axi (w/ polling)\r\n"); /* TODO: design-dependent name */ + PRINTF("INFO: ==================================================\r\n"); + + init_accelerators(); + + inputs_mem = malloc(INPUT_N_ELEMENTS * sizeof(float)); + outputs_mem = malloc(OUTPUT_N_ELEMENTS * sizeof(float)); + reference_mem = malloc(OUTPUT_N_ELEMENTS * sizeof(float)); + + /* Calibration */ + XTime_GetTime(&start); + sleep(1); + XTime_GetTime(&stop); + calibration_time = get_elapsed_time(start, stop); + PRINTF("INFO: Time calibration for one second (%lf sec)\r\n", calibration_time); + + /* Initialize memory */ + PRINTF("INFO: Initialize memory\r\n"); + PRINTF("INFO: - Samples count: %u\r\n", N_SAMPLES); /* Same as dst_SAMPLE_COUNT */ + PRINTF("INFO: - Inputs count: %u\r\n", N_X_INPUTS); + PRINTF("INFO: - Outputs count: %u\r\n", N_Y_OUTPUTS); + PRINTF("INFO: - Data size: %u B\r\n", sizeof(float)); + PRINTF("INFO: - Total input size: %u B, %.2f KB, %.2f MB\r\n", N_X_INPUTS * N_SAMPLES * sizeof(float), (N_X_INPUTS * N_SAMPLES * sizeof(float)) / (float)1024, (N_X_INPUTS * N_SAMPLES * sizeof(float)) / (float)(1024*1024)); + PRINTF("INFO: - Total output size: %u B, %.2f KB, %.2f MB\r\n", N_Y_OUTPUTS * N_SAMPLES * sizeof(float), (N_Y_OUTPUTS * N_SAMPLES * sizeof(float)) / (float)1024, (N_Y_OUTPUTS * N_SAMPLES * sizeof(float)) / (float)(1024*1024)); + + // Set Heap Size in ldscript.ld to 0x1000000 (16MB) + //malloc_stats(); + + for (int i = 0; i < INPUT_N_ELEMENTS; i++) { + inputs_mem[i] = data_X_inputs[i]; + } + for (int i = 0; i < OUTPUT_N_ELEMENTS; i++) { + outputs_mem[i] = 0x0; + } + + /* ****** SW REFERENCE ****** */ + PRINTF("INFO: ==================================================\r\n"); + PRINTF("INFO: Start SW reference implementation\r\n"); + XTime_GetTime(&start); 
+ sw_reference_implementation(inputs_mem, reference_mem, N_SAMPLES, N_X_INPUTS, N_Y_OUTPUTS); + XTime_GetTime(&stop); + sw_elapsed = get_elapsed_time(start, stop); + PRINTF("INFO: ==================================================\r\n"); + PRINTF("INFO: Press any key to start:\r\n"); + dummy = inbyte(); + //PRINTF("INFO:"); + + /* ****** HW ACCELERATOR ****** */ + PRINTF("INFO: Start HW accelerator\r\n"); + + XTime_GetTime(&start); + Xil_DCacheFlushRange((UINTPTR)inputs_mem, INPUT_N_ELEMENTS * sizeof(float)); + Xil_DCacheFlushRange((UINTPTR)outputs_mem, OUTPUT_N_ELEMENTS * sizeof(float)); + Xil_DCacheFlushRange((UINTPTR)reference_mem, OUTPUT_N_ELEMENTS * sizeof(float)); + XTime_GetTime(&stop); + cache_elapsed = get_elapsed_time(start, stop); + + for (unsigned j = 0; j < N_SAMPLES; j++) { + float *inputs_mem_i = inputs_mem + j * N_X_INPUTS; + float *outputs_mem_i = outputs_mem + j * N_Y_OUTPUTS; + + /* Configure the accelerator */ + XTime_GetTime(&start); + XMyproject_axi_Set_in_r(&accelerator, (unsigned)inputs_mem_i); /* TODO: design-dependent name */ + XMyproject_axi_Set_out_r(&accelerator, (unsigned)outputs_mem_i); /* TODO: design-dependent name */ + + XMyproject_axi_Start(&accelerator); /* TODO: design-dependent name */ + + /* Polling */ + while (!XMyproject_axi_IsDone(&accelerator)); /* TODO: design-dependent name */ + + /* Get error status */ + //hw_flags = XMyproject_axi_Get_return(&accelerator); /* TODO: design-dependent name */ + XTime_GetTime(&stop); + hw_elapsed += get_elapsed_time(start, stop); + } + + XTime_GetTime(&start); + Xil_DCacheFlushRange((UINTPTR)outputs_mem, OUTPUT_N_ELEMENTS * sizeof(float)); + XTime_GetTime(&stop); + cache_elapsed += get_elapsed_time(start, stop); + + PRINTF("INFO: HW accelerator done!\r\n"); + + /* ****** VALIDATION ****** */ + PRINTF("INFO: ================== Verification ==================\r\n"); +#ifdef __DEBUG__ + PRINTF("INFO: Dump data\r\n"); + dump_data("inputs_mem", inputs_mem, N_SAMPLES, N_X_INPUTS); + 
dump_data("outputs_mem", outputs_mem, N_SAMPLES, N_Y_OUTPUTS); + dump_data("reference_mem", reference_mem, N_SAMPLES, N_Y_OUTPUTS); +#endif + +#ifdef __DEBUG__ + PRINTF("INFO: SW execution time: %f sec\r\n", sw_elapsed); +#endif + PRINTF("INFO: HW-acceleration exec. time (%d inferences):\r\n", N_SAMPLES); + PRINTF("INFO: - total %f sec\r\n", hw_elapsed); + PRINTF("INFO: - per-inference %.12f sec (%f ns)\r\n", hw_elapsed / (N_SAMPLES), (hw_elapsed*1000.0) / (N_SAMPLES)); + PRINTF("INFO: Cache flush time: %f sec\r\n", cache_elapsed); +#ifdef __DEBUG__ + PRINTF("INFO: HW/SW speedup (the software is fake so this does not count...): %.2f X\r\n", (sw_elapsed >= (hw_elapsed+cache_elapsed))?(sw_elapsed/(hw_elapsed+cache_elapsed)):-((hw_elapsed+cache_elapsed)/sw_elapsed)); +#endif + + hw_errors = 0; +#if 1 + /* Accelerator verification */ + for (int i = 0; i < OUTPUT_N_ELEMENTS; i++) { + if (outputs_mem[i] != reference_mem[i]) { + PRINTF("ERROR: [%d]: Accelerator HW %f != SW %f\r\n", i, outputs_mem[i], reference_mem[i]); + hw_errors++; + } + } + PRINTF("INFO: Total errors = %d (out of %d elements)\r\n", hw_errors, OUTPUT_N_ELEMENTS); + if (hw_errors > 0) + PRINTF("INFO: Verification: FAIL\r\n"); + else + PRINTF("INFO: Verification: PASS!\r\n"); +#else + /* Accelerator validation */ + for (unsigned s = 0; s < N_SAMPLES; s++) { + unsigned ref_digit = get_max(reference_mem + s * N_Y_OUTPUTS, N_Y_OUTPUTS); + unsigned hw_digit = get_max(outputs_mem + s * N_Y_OUTPUTS, N_Y_OUTPUTS); + if (hw_digit != ref_digit) { +#ifdef __DEBUG__ + PRINTF("ERROR: [%d]: Accelerator HW %u != SW %u\r\n", s, hw_digit, ref_digit); +#endif + hw_errors++; + } + } + float error_rate = (hw_errors / (float)(N_SAMPLES)) * 100.0; + float accuracy = 100 - ((hw_errors / (float)(N_SAMPLES)) * 100.0); + PRINTF("INFO: Total errors = %d (out of %d digits)\r\n", hw_errors, N_SAMPLES); + PRINTF("INFO: Error rate = %.2f %%\r\n", error_rate); + PRINTF("INFO: Accuracy = %.2f %%\r\n", accuracy); +#endif + PRINTF("INFO: 
class NeuralNetworkOverlay(Overlay):
    """PYNQ overlay that runs inference through the accelerator behind an AXI DMA.

    The DMA send/receive channels are taken from ``hier_0.axi_dma_0`` and one
    input and one output buffer are allocated once, at construction time.
    """

    def __init__(self, bitfile_name, x_shape, y_shape, dtype=np.float32, dtbo=None, download=True,
                 ignore_version=False, device=None):
        """Load the bitstream and allocate the DMA buffers.

        Parameters:
        - bitfile_name, dtbo, download, ignore_version, device: forwarded unchanged to
          ``Overlay.__init__``.
        - x_shape / y_shape: shapes of the input and output buffers.
        - dtype: element type of both buffers.
        """
        # Forward the caller's choices; hard-coded defaults here would silently
        # discard e.g. download=False or an explicit device.
        super().__init__(bitfile_name, dtbo=dtbo, download=download, ignore_version=ignore_version,
                         device=device)
        self.sendchannel = self.hier_0.axi_dma_0.sendchannel
        self.recvchannel = self.hier_0.axi_dma_0.recvchannel
        self.input_buffer = allocate(shape=x_shape, dtype=dtype)
        self.output_buffer = allocate(shape=y_shape, dtype=dtype)

    def _print_dt(self, timea, timeb, N):
        """Print and return the elapsed seconds between two datetimes and the rate N / seconds."""
        # total_seconds() also accounts for the `days` field of the timedelta.
        dts = (timeb - timea).total_seconds()
        # Two identical timestamps would otherwise raise ZeroDivisionError.
        rate = N / dts if dts > 0 else float('inf')
        print("Classified {} samples in {} seconds ({} inferences / s)".format(N, dts, rate))
        return dts, rate

    def predict(self, X, debug=False, profile=False, encode=None, decode=None):
        """
        Obtain the predictions of the NN implemented in the FPGA.
        Parameters:
        - X : the input vector. Should be numpy ndarray.
        - debug : boolean. Set it to `True` to print a message after each DMA step.
        - profile : boolean. Set it to `True` to print the performance of the algorithm in term of `inference/s`.
        - encode/decode: optional callables applied to the input before it is sent and to the
          output buffer after it is received. They are needed when the `dtype` given to the
          constructor is an integer type carrying fixed-point data.
          Note on `dtype`: it should be set depending on the interface of the accelerator; if it uses
          'float' types for the 'data' AXI-Stream field, 'np.float32' dtype is the correct one to use.
          Instead if it uses 'ap_fixed<A,B>', 'np.intA' is the correct one to use (note that A cannot
          be any integer value, but it can assume {..., 8, 16, 32, ...} values. Check `numpy`
          doc for more info).
          In this case the encoding/decoding has to be computed by the PS. For example for
          'ap_fixed<16,6>' type the following 2 functions are the correct one to use for encode/decode
          'float' -> 'ap_fixed<16,6>':
          ```
          def encode(xi):
              return np.int16(round(xi * 2**10)) # note 2**10 = 2**(A-B)
          def decode(yi):
              return yi * 2**-10
          encode_v = np.vectorize(encode) # to apply them element-wise
          decode_v = np.vectorize(decode)
          ```
        - return: the output buffer (or the result of `decode` applied to it). When `profile` is
          `True` a tuple `(output, seconds, rate)` is returned instead.
        """
        if profile:
            timea = datetime.now()
        if encode is not None:
            X = encode(X)
        self.input_buffer[:] = X
        self.sendchannel.transfer(self.input_buffer)
        self.recvchannel.transfer(self.output_buffer)
        if debug:
            print("Transfer OK")
        self.sendchannel.wait()
        if debug:
            print("Send OK")
        self.recvchannel.wait()
        if debug:
            print("Receive OK")
        # Keep the allocated buffer in self.output_buffer: rebinding the attribute to the
        # decoded array would hand a different object to recvchannel.transfer() on the next call.
        result = self.output_buffer
        if decode is not None:
            result = decode(self.output_buffer)

        if profile:
            timeb = datetime.now()
            dts, rate = self._print_dt(timea, timeb, len(X))
            return result, dts, rate
        else:
            return result
# Vivado batch script: builds the Ultra96-V2 block design around the HLS
# AXI-master accelerator, runs implementation up to the bitstream, writes a
# utilization report and exports the hardware description for the SDK flow.
#
# project.tcl (sourced below) is expected to define `myproject`
# -- TODO confirm against the generator that writes project.tcl.
set tcldir [file dirname [info script]]
source [file join $tcldir project.tcl]

# Project names
set project_name "project_1"
set design_name "design_1"
set hls_solution_name "solution1"
set ps_name "zynq_ultra_ps_e_0"
set acc_name "${myproject}_axi_0"

# Board and chip part names
# NOTE(review): the sibling axi_lite_design.tcl for this board uses part
# xczu3eg-sbva484-1-e; confirm which part matches the ultra96v2 board files.
create_project ${project_name} ${myproject}_vivado_accelerator -part xczu9eg-ffvb1156-2-e -force
set_property board_part avnet.com:ultra96v2:part0:1.2 [current_project]

# Create block design
create_bd_design ${design_name}

# Setup IP repo
#set_property ip_repo_paths ${myproject}_prj [current_project]
set_property ip_repo_paths ${myproject}_prj/${hls_solution_name}/impl/ip [current_project]
update_ip_catalog

# Create and setup PS
create_bd_cell -type ip -vlnv xilinx.com:ip:zynq_ultra_ps_e:3.3 ${ps_name}
apply_bd_automation -rule xilinx.com:bd_rule:zynq_ultra_ps_e -config {apply_board_preset "1" } [get_bd_cells ${ps_name}]
set_property -dict [list CONFIG.PSU__USE__S_AXI_GP0 {1} CONFIG.PSU__SAXIGP0__DATA_WIDTH {32}] [get_bd_cells ${ps_name}]

# Create accelerator. The IP name is derived from ${myproject} so the script
# also works for projects that are not literally called "myproject".
create_bd_cell -type ip -vlnv xilinx.com:hls:${myproject}_axi:1.0 ${acc_name}

# Wiring
# The -config dictionaries are built with `subst` because plain braces would
# suppress the ${acc_name}/${ps_name} substitution.
apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config [subst -nocommands -nobackslashes {
    Clk_master {Auto}
    Clk_slave {Auto}
    Clk_xbar {Auto}
    Master "/${ps_name}/M_AXI_HPM0_FPD"
    Slave "/${acc_name}/s_axi_CTRL_BUS"
    intc_ip {New AXI Interconnect}
    master_apm {0}}] [get_bd_intf_pins ${acc_name}/s_axi_CTRL_BUS]

apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config [subst -nocommands -nobackslashes {
    Clk_master {Auto}
    Clk_slave "/${ps_name}/pl_clk0 (100 MHz)"
    Clk_xbar "/${ps_name}/pl_clk0 (100 MHz)"
    Master "/${ps_name}/M_AXI_HPM1_FPD"
    Slave "/${acc_name}/s_axi_CTRL_BUS"
    intc_ip {/ps8_0_axi_periph}
    master_apm {0}}] [get_bd_intf_pins ${ps_name}/M_AXI_HPM1_FPD]

apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config [subst -nocommands -nobackslashes {
    Clk_master "/${ps_name}/pl_clk0 (100 MHz)"
    Clk_slave {Auto}
    Clk_xbar {Auto}
    Master "/${acc_name}/m_axi_IN_BUS"
    Slave "/${ps_name}/S_AXI_HPC0_FPD"
    intc_ip {Auto}
    master_apm {0}}] [get_bd_intf_pins ${ps_name}/S_AXI_HPC0_FPD]

apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config [subst -nocommands -nobackslashes {
    Clk_master "/${ps_name}/pl_clk0 (100 MHz)"
    Clk_slave "/${ps_name}/pl_clk0 (100 MHz)"
    Clk_xbar "/${ps_name}/pl_clk0 (100 MHz)"
    Master "/${acc_name}/m_axi_OUT_BUS"
    Slave "/${ps_name}/S_AXI_HPC0_FPD"
    intc_ip {/axi_smc}
    master_apm {0}}] [get_bd_intf_pins ${acc_name}/m_axi_OUT_BUS]

# Wiring interrupt signal
connect_bd_net [get_bd_pins ${acc_name}/interrupt] [get_bd_pins ${ps_name}/pl_ps_irq0]

# Top level wrapper
make_wrapper -files [get_files ./${myproject}_vivado_accelerator/${project_name}.srcs/sources_1/bd/${design_name}/${design_name}.bd] -top
add_files -norecurse ./${myproject}_vivado_accelerator/${project_name}.srcs/sources_1/bd/${design_name}/hdl/${design_name}_wrapper.v

# Memory mapping
delete_bd_objs [get_bd_addr_segs -excluded ${acc_name}/Data_m_axi_IN_BUS/SEG_${ps_name}_HPC0_LPS_OCM]
delete_bd_objs [get_bd_addr_segs -excluded ${acc_name}/Data_m_axi_OUT_BUS/SEG_${ps_name}_HPC0_LPS_OCM]

# Run synthesis and implementation
reset_run impl_1
reset_run synth_1
launch_runs impl_1 -to_step write_bitstream -jobs 6
wait_on_run -timeout 360 impl_1

# Reporting
open_run impl_1
report_utilization -file util.rpt -hierarchical -hierarchical_percentages

# Export HDF file for SDK flow
file mkdir ./hdf
file copy -force ${myproject}_vivado_accelerator/${project_name}.runs/impl_1/${design_name}_wrapper.sysdef ./hdf/${design_name}_wrapper.hdf
CONFIG.PSU__SAXIGP0__DATA_WIDTH {32}] [get_bd_cells zynq_ultra_ps_e_0] + +startgroup +create_bd_cell -type ip -vlnv xilinx.com:ip:axi_dma:7.1 axi_dma_0 +endgroup +set_property -dict [list CONFIG.c_m_axi_s2mm_data_width.VALUE_SRC USER CONFIG.c_s_axis_s2mm_tdata_width.VALUE_SRC USER] [get_bd_cells axi_dma_0] +set_property -dict [list CONFIG.c_include_sg {0} CONFIG.c_sg_length_width {26} CONFIG.c_sg_include_stscntrl_strm {0} CONFIG.c_m_axi_mm2s_data_width ${bit_width_hls_input} CONFIG.c_m_axis_mm2s_tdata_width ${bit_width_hls_input} CONFIG.c_mm2s_burst_size {256} CONFIG.c_m_axi_s2mm_data_width ${bit_width_hls_output} CONFIG.c_s_axis_s2mm_tdata_width ${bit_width_hls_output} CONFIG.c_s2mm_burst_size {256}] [get_bd_cells axi_dma_0] + +startgroup +apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {Auto} Clk_xbar {Auto} Master {/zynq_ultra_ps_e_0/M_AXI_HPM0_FPD} Slave {/axi_dma_0/S_AXI_LITE} ddr_seg {Auto} intc_ip {New AXI Interconnect} master_apm {0}} [get_bd_intf_pins axi_dma_0/S_AXI_LITE] +apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {Auto} Clk_xbar {Auto} Master {/axi_dma_0/M_AXI_MM2S} Slave {/zynq_ultra_ps_e_0/S_AXI_HPC0_FPD} ddr_seg {Auto} intc_ip {New AXI SmartConnect} master_apm {0}} [get_bd_intf_pins zynq_ultra_ps_e_0/S_AXI_HPC0_FPD] +endgroup + +startgroup +apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {/zynq_ultra_ps_e_0/pl_clk0 (99 MHz)} Clk_xbar {/zynq_ultra_ps_e_0/pl_clk0 (99 MHz)} Master {/axi_dma_0/M_AXI_S2MM} Slave {/zynq_ultra_ps_e_0/S_AXI_HPC0_FPD} ddr_seg {Auto} intc_ip {/axi_smc} master_apm {0}} [get_bd_intf_pins axi_dma_0/M_AXI_S2MM] +apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {/zynq_ultra_ps_e_0/pl_clk0 (99 MHz)} Clk_xbar {/zynq_ultra_ps_e_0/pl_clk0 (99 MHz)} Master {/zynq_ultra_ps_e_0/M_AXI_HPM1_FPD} Slave {/axi_dma_0/S_AXI_LITE} ddr_seg {Auto} intc_ip {/ps8_0_axi_periph} master_apm 
{0}} [get_bd_intf_pins zynq_ultra_ps_e_0/M_AXI_HPM1_FPD] +endgroup + +startgroup +create_bd_cell -type ip -vlnv xilinx.com:hls:${myproject}_axi:1.0 ${myproject}_axi_0 +endgroup +connect_bd_intf_net [get_bd_intf_pins axi_dma_0/M_AXIS_MM2S] [get_bd_intf_pins ${myproject}_axi_0/in_r] +connect_bd_intf_net [get_bd_intf_pins axi_dma_0/S_AXIS_S2MM] [get_bd_intf_pins ${myproject}_axi_0/out_r] + +apply_bd_automation -rule xilinx.com:bd_rule:clkrst -config { Clk {/zynq_ultra_ps_e_0/pl_clk0 (99 MHz)} Freq {100} Ref_Clk0 {} Ref_Clk1 {} Ref_Clk2 {}} [get_bd_pins ${myproject}_axi_0/ap_clk] +group_bd_cells hier_0 [get_bd_cells axi_dma_0] [get_bd_cells ${myproject}_axi_0] + +make_wrapper -files [get_files ./${myproject}_vivado_accelerator/project_1.srcs/sources_1/bd/design_1/design_1.bd] -top + +add_files -norecurse ./${myproject}_vivado_accelerator/project_1.srcs/sources_1/bd/design_1/hdl/design_1_wrapper.v + +reset_run impl_1 +reset_run synth_1 +launch_runs impl_1 -to_step write_bitstream -jobs 6 +wait_on_run -timeout 360 impl_1 + +open_run impl_1 +report_utilization -file util.rpt -hierarchical -hierarchical_percentages diff --git a/hls4ml/templates/vivado_accelerator_config.py b/hls4ml/templates/vivado_accelerator_config.py index 2896d3d144..9b524120f6 100644 --- a/hls4ml/templates/vivado_accelerator_config.py +++ b/hls4ml/templates/vivado_accelerator_config.py @@ -14,7 +14,7 @@ def __init__(self, config, model_inputs, model_outputs): self.part = board_info['part'] else: raise Exception('The board does not appear in supported_boards.json file') - + if self.config.get('XilinxPart') is not None: if self.config.get('XilinxPart') != self.part: print('WARNING: You set a XilinxPart that does not correspond to the Board you specified. 
The correct ' @@ -29,7 +29,7 @@ def __init__(self, config, model_inputs, model_outputs): if prec.get('Input') is None or prec.get('Output') is None: raise Exception('Input and Output fields must be provided in the AcceleratorConfig->Precision') else: - accel_config = {'Precision': + accel_config = {'Precision': { 'Input': 'float', 'Output': 'float' @@ -61,16 +61,16 @@ def __init__(self, config, model_inputs, model_outputs): if out_axi_t not in ['float', 'double']: self.output_type = self._next_factor8_type(config.backend.convert_precision_string(out_axi_t)) - if self.input_type is 'float': + if inp_axi_t == 'float': self.input_bitwidth = 32 - elif self.input_type is 'double': + elif inp_axi_t == 'double': self.input_bitwidth = 64 else: self.input_bitwidth = config.backend.convert_precision_string(inp_axi_t).width - if out_axi_t is 'float': + if out_axi_t == 'float': self.output_bitwidth = 32 - elif out_axi_t is 'double': + elif out_axi_t == 'double': self.output_bitwidth = 64 else: self.output_bitwidth = config.backend.convert_precision_string(out_axi_t).width @@ -120,11 +120,21 @@ def get_board(self): def get_driver_path(self): return '../templates/vivado_accelerator/' + self.board + '/' + self.driver + '_drivers/' + \ - self.get_driver_file() + self.get_driver_files() + + def get_vivado_ip_wrapper_path(self): + return '../templates/vivado_accelerator/' + self.board + '/verilog_wrappers' + + def get_vivado_constraints_path(self): + return '../templates/vivado_accelerator/' + self.board + '/xdc_constraints' - def get_driver_file(self): - driver_ext = '.py' if self.driver == 'python' else '.h' - return self.interface + '_driver' + driver_ext + def get_driver_files(self): + if self.driver == 'c': + driver_dir = 'sdk' + return driver_dir + elif self.driver == 'python': + driver_ext = '.py' + return self.interface + '_driver' + driver_ext def get_input_type(self): return self.input_type diff --git a/hls4ml/templates/vivado_template.py 
b/hls4ml/templates/vivado_template.py index 149b52f1d5..6170ff5f9e 100644 --- a/hls4ml/templates/vivado_template.py +++ b/hls4ml/templates/vivado_template.py @@ -14,11 +14,13 @@ static const unsigned reuse_factor = {reuse}; static const unsigned n_zeros = {nzeros}; static const unsigned n_nonzeros = {nonzeros}; + static const bool merged_relu = {merged_relu}; static const bool store_weights_in_bram = false; typedef {accum_t} accum_t; typedef {bias_t} bias_t; typedef {weight_t} weight_t; typedef {index_t} index_t; + typedef {out_t}:: value_type out_t; template<class x_T, class y_T, class res_T> using product = nnet::product::{product_type}<x_T, y_T, res_T>; }};\n""" @@ -65,9 +67,11 @@ static const unsigned n_out = {n_out}; static const unsigned reuse_factor = {reuse}; static const unsigned strategy = nnet::{strategy}; + static const bool merged_relu = {merged_relu}; typedef {accum_t} accum_t; typedef {bias_t} bias_t; typedef {weight_t} weight_t; + typedef {out_t}:: value_type out_t; template<class x_T, class y_T, class res_T> using product = nnet::product::{product_type}<x_T, y_T, res_T>; }};\n""" @@ -386,6 +390,7 @@ def __init__(self, name='Vivado'): super(VivadoBackend, self).__init__(name) self.register_templates('Dense', dense_function_template, dense_config_template, dense_include_list) self.register_templates('BinaryDense' , dense_function_template, dense_config_template, dense_include_list) + self.register_templates('DenseBatchnorm' , dense_function_template, dense_config_template, dense_include_list) self.register_templates('BatchNormalization' , batchnorm_function_template, batchnorm_config_template, batchnorm_include_list) self.register_templates('Conv1D' , conv1d_function_template, [conv1d_config_template, conv_mult_config_template], conv1d_include_list) self.register_templates('Conv2D' , conv2d_function_template, [conv2d_config_template, conv_mult_config_template], conv2d_include_list) diff --git a/hls4ml/utils/config.py b/hls4ml/utils/config.py index b907350bb4..1a99b888b2 100644 --- a/hls4ml/utils/config.py +++ b/hls4ml/utils/config.py @@ -108,7 +108,7 
@@ def config_from_keras_model(model, granularity='model', default_precision='ap_fi norm_layers = ['BatchNormalization'] activation_layers = ['Activation', 'LeakyReLU', 'ThresholdedReLU', 'ELU', 'PReLU', 'Softmax', 'ReLU'] merge_layers = ['Add', 'Subtract', 'Multiply', 'Average', 'Maximum', 'Minimum', 'Concatenate', 'Dot'] - qkeras_layers = ['QDense', 'QActivation', 'QConv1D', 'QConv2D', 'QBatchNormalization', 'QConv2DBatchnorm'] + qkeras_layers = ['QDense', 'QActivation', 'QConv1D', 'QConv2D', 'QBatchNormalization', 'QConv2DBatchnorm', 'QDenseBatchnorm'] #Define layers to skip because they're not configurable or not converted to HLS skip_layers = ['Dropout', 'Flatten', 'Reshape', 'Permute'] #All supported layers diff --git a/hls4ml/writer/vivado_accelerator_writer.py b/hls4ml/writer/vivado_accelerator_writer.py index c5206e002d..d1f84a5db6 100644 --- a/hls4ml/writer/vivado_accelerator_writer.py +++ b/hls4ml/writer/vivado_accelerator_writer.py @@ -1,5 +1,6 @@ import os from shutil import copyfile +from shutil import copytree from hls4ml.templates.vivado_accelerator_config import VivadoAcceleratorConfig from hls4ml.writer.vivado_writer import VivadoWriter @@ -98,8 +99,21 @@ def write_axi_wrapper(self, model): elif io_type == 'io_stream': newline += indent + 'hls::stream<' + inp.type.name + '> in_local("input_1");\n' newline += indent + 'hls::stream<' + out.type.name + '> out_local("output_1");\n\n' - newline += indent + '#pragma HLS STREAM variable=in_local depth=N_IN\n' - newline += indent + '#pragma HLS STREAM variable=out_local depth=N_OUT\n' + in_local_depth = 0 + out_local_depth = 0 + try: + in_local_depth = model.config.config['HLSConfig']['LayerName']['in_local']['StreamDepth'] + out_local_depth = model.config.config['HLSConfig']['LayerName']['out_local']['StreamDepth'] + except KeyError: + pass + if in_local_depth: + newline += indent + '#pragma HLS STREAM variable=in_local depth={}\n'.format(in_local_depth) + else: + newline += indent + '#pragma HLS STREAM 
variable=in_local depth=N_IN\n' + if out_local_depth: + newline += indent + '#pragma HLS STREAM variable=out_local depth={}\n'.format(out_local_depth) + else: + newline += indent + '#pragma HLS STREAM variable=out_local depth=N_OUT\n' elif '//hls-fpga-machine-learning insert call' in line: newline = indent + '{}(in_local, out_local, in_size, out_size);\n'.format( model.config.get_project_name()) @@ -196,7 +210,9 @@ def modify_build_script(self, model): fout = open(newfile, 'w') for line in f.readlines(): - if 'set_top' in line: + if 'set filename myproject_prj/solution1/sim/verilog/myproject.tcl' in line: + newline = line[:-5] + '_axi\n' + elif 'set_top' in line: newline = line[:-1] + '_axi\n' # remove the newline from the line end and append _axi for the new top newline += 'add_files firmware/{}_axi.cpp -cflags "-std=c++0x"\n'.format( model.config.get_project_name()) @@ -317,6 +333,13 @@ def write_board_script(self, model): filedir = os.path.dirname(os.path.abspath(__file__)) copyfile(os.path.join(filedir, self.vivado_accelerator_config.get_tcl_file_path()), '{}/design.tcl'.format(model.config.get_output_dir())) + if self.vivado_accelerator_config.get_interface() == 'axi_master' and self.vivado_accelerator_config.board == "arty-a7-100t": + copytree(os.path.join(filedir, self.vivado_accelerator_config.get_vivado_ip_wrapper_path()), + '{}/'.format(model.config.get_output_dir()), + dirs_exist_ok=True) + copytree(os.path.join(filedir, self.vivado_accelerator_config.get_vivado_constraints_path()), + '{}/'.format(model.config.get_output_dir()), + dirs_exist_ok=True) f = open('{}/project.tcl'.format(model.config.get_output_dir()), 'w') f.write('variable myproject\n') f.write('set myproject "{}"\n'.format(model.config.get_project_name())) @@ -324,17 +347,138 @@ def write_board_script(self, model): in_bit, out_bit = self.vivado_accelerator_config.get_io_bitwidth() f.write('set bit_width_hls_output {}\n'.format(in_bit)) f.write('set bit_width_hls_input 
{}\n'.format(out_bit)) + if model.config.config['HLSConfig']['Model'].get('FIFO_opt'): + f.write('set fifo_opt 1\n') + else: + f.write('set fifo_opt 0\n') + if model.config.config['HLSConfig']['Model'].get('EEMBC_power'): + f.write('set eembc_power 1\n') + else: + f.write('set eembc_power 0\n') f.close() + def write_header_file(model, X, y, y_keras, y_hls, n_samples, filename='data.h'): + vivado_accelerator_config = VivadoAcceleratorConfig(model.config, model.get_input_variables(), + model.get_output_variables()) + inp_axi_t, out_axi_t, inp, out = vivado_accelerator_config.get_corrected_types() + header_file = open(filename, 'w') + (n_X_samples, n_X_inputs) = X.shape + (n_y_samples, n_y_outputs) = y.shape + (n_y_keras_samples, n_y_keras_outputs) = y_keras.shape + (n_y_hls_samples, n_y_hls_outputs) = y_hls.shape + + header_file.write('#ifndef __DATA_H__\n') + header_file.write('#define __DATA_H__\n') + header_file.write('/* out of {} */\n'.format(n_X_samples)) + header_file.write('#define N_SAMPLES {}\n'.format(n_samples)) + header_file.write('\n') + header_file.write('#define N_X_INPUTS {}\n'.format(n_X_inputs)) + header_file.write('const {} data_X_inputs[N_SAMPLES*N_X_INPUTS] = {{\n'.format(inp_axi_t)) + for s in range(n_samples): + header_file.write(' ') + for i in range(n_X_inputs): + header_file.write('{}, '.format(X[s][i])) + header_file.write('\n') + header_file.write('};\n') + header_file.write('\n') + header_file.write('/* Ground truth - for validation */\n') + header_file.write('#define N_Y_OUTPUTS {}\n'.format(n_y_outputs)) + header_file.write('const float data_y_outputs[N_SAMPLES*N_Y_OUTPUTS] = {\n') + for s in range(n_samples): + header_file.write(' ') + for o in range(n_y_outputs): + header_file.write('{}, '.format(y[s][o])) + header_file.write('\n') + header_file.write('};\n') + header_file.write('\n') + header_file.write('/* Keras outputs - for validation */\n') + header_file.write('#define N_Y_KERAS_OUTPUTS {}\n'.format(n_y_keras_outputs)) + 
header_file.write('') + header_file.write('const float data_y_keras_outputs[N_SAMPLES*N_Y_KERAS_OUTPUTS] = {\n') + for s in range(n_samples): + header_file.write(' ') + for o in range(n_y_keras_outputs): + header_file.write('{}, '.format(y_keras[s][o])) + header_file.write('\n') + header_file.write('};\n') + header_file.write('\n') + header_file.write('/* csim outputs - for verification */\n') + header_file.write('#define N_Y_HLS_OUTPUTS {}\n'.format(n_y_hls_outputs)) + header_file.write('') + header_file.write('const {} data_y_hls_outputs[N_SAMPLES*N_Y_HLS_OUTPUTS] = {{\n'.format(out_axi_t)) + for s in range(n_samples): + header_file.write(' ') + for o in range(n_y_hls_outputs): + header_file.write('{}, '.format(y_hls[s][o])) + header_file.write('\n') + header_file.write('};\n') + header_file.write('#endif\n') + header_file.close() + def write_driver(self, model): filedir = os.path.dirname(os.path.abspath(__file__)) - copyfile(os.path.join(filedir, self.vivado_accelerator_config.get_driver_path()), - ('{}/' + self.vivado_accelerator_config.get_driver_file()).format(model.config.get_output_dir())) + srcfiles = os.path.join(filedir, self.vivado_accelerator_config.get_driver_path()) + dstfiles = ('{}/' + self.vivado_accelerator_config.get_driver_files()).format(model.config.get_output_dir()) + if os.path.isdir(srcfiles): + copytree(srcfiles, dstfiles, dirs_exist_ok=True) + else: + copyfile(srcfiles, dstfiles) def write_new_tar(self, model): os.remove(model.config.get_output_dir() + '.tar.gz') super(VivadoAcceleratorWriter, self).write_tar(model) - + + def apply_patches(self, model): + ''' + Apply patches. 
+ ''' + filedir = os.path.dirname(os.path.abspath(__file__)) + + indent = ' ' + + ################### + # patch myproject_axi.h + ################### + oldfile = '{}/firmware/{}_axi.h'.format(model.config.get_output_dir(), model.config.get_project_name()) + newfile = '{}/firmware/{}_axi_patch.h'.format(model.config.get_output_dir(), model.config.get_project_name()) + + f = open(oldfile,'r') + fout = open(newfile, 'w') + + for line in f.readlines(): + if 'typedef' in line and 'input_axi_t;' in line: + # hardcoded ap_uint<8> input + newline = 'typedef ap_uint<8> input_axi_t;\n' + else: + newline = line + fout.write(newline) + + f.close() + fout.close() + os.rename(newfile, oldfile) + + ################### + # patch myproject_axi.cpp + ################### + oldfile = '{}/firmware/{}_axi.cpp'.format(model.config.get_output_dir(), model.config.get_project_name()) + newfile = '{}/firmware/{}_axi_patch.cpp'.format(model.config.get_output_dir(), model.config.get_project_name()) + + f = open(oldfile,'r') + fout = open(newfile, 'w') + + for line in f.readlines(): + if 'ctype[j] = typename input_t::value_type' in line: + # these lines are hardcoded to do the bitshift by 256 + newline = indent + indent + indent + 'ap_ufixed<16,8> tmp = in[i * input_t::size + j]; // store 8 bit input in a larger temp variable\n' + newline += indent + indent + indent + 'ctype[j] = typename input_t::value_type(tmp >> 8); // shift right by 8 (div by 256) and select only the decimal of the larger temp variable\n' + else: + newline = line + fout.write(newline) + + f.close() + fout.close() + os.rename(newfile, oldfile) + def write_hls(self, model): """ Write the HLS project. 
Calls the VivadoBackend writer, and extra steps for VivadoAccelerator/AXI interface @@ -347,5 +491,7 @@ def write_hls(self, model): self.write_wrapper_test(model) self.write_axi_wrapper(model) self.modify_build_script(model) + if model.config.get_config_value('ApplyPatches'): + self.apply_patches(model) self.write_new_tar(model) diff --git a/hls4ml/writer/vivado_writer.py b/hls4ml/writer/vivado_writer.py index de7ff65543..b77e09a97b 100644 --- a/hls4ml/writer/vivado_writer.py +++ b/hls4ml/writer/vivado_writer.py @@ -663,7 +663,8 @@ def write_tar(self, model): ################### # Tarball output ################### - + if os.path.isfile(model.config.get_output_dir() + '.tar.gz'): + return with tarfile.open(model.config.get_output_dir() + '.tar.gz', mode='w:gz') as archive: archive.add(model.config.get_output_dir(), recursive=True) diff --git a/setup.py b/setup.py index ebdda86d79..0d16f6acb8 100644 --- a/setup.py +++ b/setup.py @@ -30,7 +30,8 @@ def get_version(rel_path): 'six', 'pyyaml', 'h5py', - 'onnx>=1.4.0'], + 'onnx>=1.4.0', + 'pyDigitalWaveTools'], extras_require={ 'profiling': [ 'pandas',