Whitney Knitter
Published © GPL3+

Debugging Accelerated Apps using Emulation in Vitis 2021.2

This project walks through how to set up a generic project in Vitis 2021.2 to debug FPGA accelerated applications using SW & HW emulation.

Advanced • Full instructions provided • 3 hours • 1,287
Debugging Accelerated Apps using Emulation in Vitis 2021.2

Things used in this project

Story

Read more

Code

vadd.cpp

C/C++
/*******************************************************************************
Vendor: Xilinx
Associated Filename: vadd.cpp
Purpose: VITIS vector addition

*******************************************************************************
Copyright (C) 2019 XILINX, Inc.

This file contains confidential and proprietary information of Xilinx, Inc. and
is protected under U.S. and international copyright and other intellectual
property laws.

DISCLAIMER
This disclaimer is not a license and does not grant any rights to the materials
distributed herewith. Except as otherwise provided in a valid license issued to
you by Xilinx, and to the maximum extent permitted by applicable law:
(1) THESE MATERIALS ARE MADE AVAILABLE "AS IS" AND WITH ALL FAULTS, AND XILINX
HEREBY DISCLAIMS ALL WARRANTIES AND CONDITIONS, EXPRESS, IMPLIED, OR STATUTORY,
INCLUDING BUT NOT LIMITED TO WARRANTIES OF MERCHANTABILITY, NON-INFRINGEMENT, OR
FITNESS FOR ANY PARTICULAR PURPOSE; and (2) Xilinx shall not be liable (whether
in contract or tort, including negligence, or under any other theory of
liability) for any loss or damage of any kind or nature related to, arising under
or in connection with these materials, including for any direct, or any indirect,
special, incidental, or consequential loss or damage (including loss of data,
profits, goodwill, or any type of loss or damage suffered as a result of any
action brought by a third party) even if such damage or loss was reasonably
foreseeable or Xilinx had been advised of the possibility of the same.

CRITICAL APPLICATIONS
Xilinx products are not designed or intended to be fail-safe, or for use in any
application requiring fail-safe performance, such as life-support or safety
devices or systems, Class III medical devices, nuclear facilities, applications
related to the deployment of airbags, or any other applications that could lead
to death, personal injury, or severe property or environmental damage
(individually and collectively, "Critical Applications"). Customer assumes the
sole risk and liability of any use of Xilinx products in Critical Applications,
subject only to applicable laws and regulations governing limitations on product
liability.

THIS COPYRIGHT NOTICE AND DISCLAIMER MUST BE RETAINED AS PART OF THIS FILE AT
ALL TIMES.

*******************************************************************************/
// OCL_CHECK(error, call): run `call`, then abort the process if `error` is not
// CL_SUCCESS.
//
//   error - an lvalue (a cl_int in this file) that `call` is expected to set,
//           either through an `&err` out-parameter or an `err = ...` assignment.
//           If `call` does not write to it, the check tests a stale value.
//   call  - any statement; it is also stringified into the error message.
//
// On failure it prints file, line, the text of `call` and the error code, then
// exits with EXIT_FAILURE.
//
// NOTE: this deliberately expands to two bare statements (not do { } while (0))
// so that `call` may be a declaration whose object stays in scope after the
// macro, e.g. OCL_CHECK(err, cl::Buffer buf(..., &err)). Consequently it must
// not be used as the un-braced body of an if/else/loop.
#define OCL_CHECK(error, call)                                                                   \
    call;                                                                                        \
    if (error != CL_SUCCESS) {                                                                   \
        printf("%s:%d Error calling " #call ", error code is: %d\n", __FILE__, __LINE__, error); \
        exit(EXIT_FAILURE);                                                                      \
    }

#include <fstream>
#include <stdio.h>
#include <stdlib.h>
#include <iostream>

#include "vadd.h"
#include "ap_int.h"


// Element count used for the vector-add buffers and the phase buffer.
static const int DATA_SIZE{4096};

// printf-style template printed for the first host/device result mismatch:
// arguments are the index, the host (CPU) sum and the device sum.
static const std::string error_message{
    "Error: Result mismatch:\ni = %d CPU result = %d Device result = %d\n"};

int main(int argc, char* argv[]) {

    //TARGET_DEVICE macro needs to be passed from gcc command line
    if(argc != 2) {
		std::cout << "Usage: " << argv[0] <<" <xclbin>" << std::endl;
		return EXIT_FAILURE;
	}

    std::string xclbinFilename = argv[1];
    
    // Compute the size of array in bytes
    size_t size_in_bytes = DATA_SIZE * sizeof(int);
    
    // Creates a vector of DATA_SIZE elements with an initial value of 10 and 32
    // using customized allocator for getting buffer alignment to 4k boundary
    
    std::vector<cl::Device> devices;
    cl::Device device;
    cl_int err;
    cl::Context context;
    cl::CommandQueue q;
    cl::Kernel krnl_vector_add;
    cl::Program program;
    std::vector<cl::Platform> platforms;
    bool found_device = false;

    //traversing all Platforms To find Xilinx Platform and targeted
    //Device in Xilinx Platform
    cl::Platform::get(&platforms);
    for(size_t i = 0; (i < platforms.size() ) & (found_device == false) ;i++){
        cl::Platform platform = platforms[i];
        std::string platformName = platform.getInfo<CL_PLATFORM_NAME>();
        if ( platformName == "Xilinx"){
            devices.clear();
            platform.getDevices(CL_DEVICE_TYPE_ACCELERATOR, &devices);
	    if (devices.size()){
		    device = devices[0];
		    found_device = true;
		    break;
	    }
        }
    }
    if (found_device == false){
       std::cout << "Error: Unable to find Target Device " 
           << device.getInfo<CL_DEVICE_NAME>() << std::endl;
       return EXIT_FAILURE; 
    }

    // Creating Context and Command Queue for selected device
    OCL_CHECK(err, context = cl::Context(device, NULL, NULL, NULL, &err));
    OCL_CHECK(err, q = cl::CommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &err));

    std::cout << "INFO: Reading " << xclbinFilename << std::endl;
    FILE* fp;
    if ((fp = fopen(xclbinFilename.c_str(), "r")) == nullptr) {
        printf("ERROR: %s xclbin not available please build\n", xclbinFilename.c_str());
        exit(EXIT_FAILURE);
    }
    // Load xclbin 
    std::cout << "Loading: '" << xclbinFilename << "'\n";
    std::ifstream bin_file(xclbinFilename, std::ifstream::binary);
    bin_file.seekg (0, bin_file.end);
    unsigned nb = bin_file.tellg();
    bin_file.seekg (0, bin_file.beg);
    char *buf = new char [nb];
    bin_file.read(buf, nb);
    
    // Creating Program from Binary File
    cl::Program::Binaries bins;
    bins.push_back({buf,nb});
    devices.resize(1);
    OCL_CHECK(err, program = cl::Program(context, devices, bins, NULL, &err));
    
    // This call will get the kernel object from program. A kernel is an 
    // OpenCL function that is executed on the FPGA. 
    OCL_CHECK(err, krnl_vector_add = cl::Kernel(program,"krnl_vadd", &err));

    // These commands will allocate memory on the Device. The cl::Buffer objects can
    // be used to reference the memory locations on the device. 
    std::cout << "Creating buffer objects for each variable..." << std::endl;
    OCL_CHECK(err, cl::Buffer buffer_a(context, CL_MEM_READ_ONLY, size_in_bytes, NULL, &err));
    OCL_CHECK(err, cl::Buffer buffer_b(context, CL_MEM_READ_ONLY, size_in_bytes, NULL, &err));
    OCL_CHECK(err, cl::Buffer buffer_result(context, CL_MEM_WRITE_ONLY, size_in_bytes, NULL, &err));

    OCL_CHECK(err, cl::Buffer buffer_phase(context, CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_ONLY, size_in_bytes, NULL, &err));
    OCL_CHECK(err, cl::Buffer buffer_waveout(context, CL_MEM_WRITE_ONLY, 1024*sizeof(int32_t), NULL, &err));

    //set the kernel Arguments
    std::cout << "Setting kernel arguments..." << std::endl;
    int narg=0;
    OCL_CHECK(err, err = krnl_vector_add.setArg(narg++,buffer_a));
    OCL_CHECK(err, err = krnl_vector_add.setArg(narg++,buffer_b));
    OCL_CHECK(err, err = krnl_vector_add.setArg(narg++,buffer_result));
    OCL_CHECK(err, err = krnl_vector_add.setArg(narg++,DATA_SIZE));

    OCL_CHECK(err, err = krnl_vector_add.setArg(narg++,buffer_phase));
    OCL_CHECK(err, err = krnl_vector_add.setArg(narg++,buffer_waveout));

    //We then need to map our OpenCL buffers to get the pointers
    std::cout << "Mapping buffers..." << std::endl;
    int *ptr_a;
    int *ptr_b;
    int *ptr_result;
    OCL_CHECK(err, ptr_a = (int*)q.enqueueMapBuffer (buffer_a, CL_TRUE, CL_MAP_WRITE, 0, size_in_bytes, NULL, NULL, &err));
    OCL_CHECK(err, ptr_b = (int*)q.enqueueMapBuffer (buffer_b, CL_TRUE, CL_MAP_WRITE, 0, size_in_bytes, NULL, NULL, &err));
    OCL_CHECK(err, ptr_result = (int*)q.enqueueMapBuffer (buffer_result, CL_TRUE, CL_MAP_READ, 0, size_in_bytes, NULL, NULL, &err));

    uint32_t *ptr_phase;
    int32_t *ptr_waveout;
    OCL_CHECK(err, ptr_phase = (uint32_t*)q.enqueueMapBuffer (buffer_phase, CL_TRUE, CL_MAP_WRITE, 0, size_in_bytes, NULL, NULL, &err));
    OCL_CHECK(err, ptr_waveout = (int32_t*) q.enqueueMapBuffer (buffer_waveout, CL_TRUE, CL_MAP_READ, 0, 1024*sizeof(int32_t), NULL, NULL, &err));

    std::cout << "Writing 1MHz phase increment to DDS..." << std::endl;
    uint32_t phase_1MHz = 0x0051EB85;
    for(int i=0; i<DATA_SIZE; i++){
    	ptr_phase[i] = phase_1MHz;
    }

    // Data will be migrated to kernel space
    std::cout << "Mirgrate data to kernel space." << std::endl;
    OCL_CHECK(err, err = q.enqueueMigrateMemObjects({buffer_a,buffer_b},0/* 0 means from host*/));
    OCL_CHECK(err, err = q.enqueueMigrateMemObjects({buffer_phase},0/* 0 means from host*/));

    //Launch the Kernel
    std::cout << "Launch the kernel." << std::endl;
    OCL_CHECK(err, err = q.enqueueTask(krnl_vector_add));

    // The result of the previous kernel execution will need to be retrieved in
    // order to view the results. This call will transfer the data from FPGA to
    // source_results vector
    std::cout << "Transfer the data from FPGA to source results vector." << std::endl;
    OCL_CHECK(err, q.enqueueMigrateMemObjects({buffer_result},CL_MIGRATE_MEM_OBJECT_HOST));
    OCL_CHECK(err, q.enqueueMigrateMemObjects({buffer_waveout},CL_MIGRATE_MEM_OBJECT_HOST));

    std::cout << "Verify the result." << std::endl;
    //Verify the result
    int match = 0;
    for (int i = 0; i < DATA_SIZE; i++) {
        int host_result = ptr_a[i] + ptr_b[i];
        std::cout << "ptr_result value = " << ptr_result[i] << std::endl;
        std::cout << "host_result value = " << host_result << std::endl;
        if (ptr_result[i] != host_result) {
            printf(error_message.c_str(), i, host_result, ptr_result[i]);
            match = 1;
            break;
        }
    }

    std::cout << "Opening text file to write output waveform to..." << std::endl;
    FILE *fp_wave;
    fp_wave=fopen("wave_out.txt","w");

    std::cout << "Writing the output waveform from the results buffer to the text file..." << std::endl;
    for (int i = 0; i < 1024; i++) {
    	fprintf(fp_wave,"%i\n",ptr_waveout[i]);
    }

    std::cout << "Closing text file..." << std::endl;
    fclose(fp_wave);

    std::cout << "Unmapping buffers..." << std::endl;
    OCL_CHECK(err, err = q.enqueueUnmapMemObject(buffer_a , ptr_a));
    OCL_CHECK(err, err = q.enqueueUnmapMemObject(buffer_b , ptr_b));
    OCL_CHECK(err, err = q.enqueueUnmapMemObject(buffer_result , ptr_result));

    q.enqueueUnmapMemObject(buffer_phase, ptr_phase);
    q.enqueueUnmapMemObject(buffer_waveout, ptr_waveout);

    std::cout << "Final error check..." << std::endl;
    OCL_CHECK(err, err = q.finish());

    std::cout << "TEST " << (match ? "FAILED" : "PASSED") << std::endl; 
    return (match ? EXIT_FAILURE :  EXIT_SUCCESS);

}

krnl_vadd.cpp

C/C++
/**
* Copyright (C) 2019-2021 Xilinx, Inc
*
* Licensed under the Apache License, Version 2.0 (the "License"). You may
* not use this file except in compliance with the License. A copy of the
* License is located at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations
* under the License.
*/

/*******************************************************************************
Description:

    This example uses the load/compute/store coding style which is generally
    the most efficient for implementing kernels using HLS. The load and store
    functions are responsible for moving data in and out of the kernel as
    efficiently as possible. The core functionality is decomposed across one
    or more compute functions. Whenever possible, the compute function should
    pass data through HLS streams and should contain a single set of nested loops.

    HLS stream objects are used to pass data between producer and consumer
    functions. Stream read and write operations have a blocking behavior which
    allows consumers and producers to synchronize with each other automatically.

    The dataflow pragma instructs the compiler to enable task-level pipelining.
    This is required for the load/compute/store functions to execute in a parallel
    and pipelined manner. Here the kernel loads, computes and stores NUM_WORDS integer values per
    clock cycle and is implemented as below:
                                       _____________
                                      |             |<----- Input Vector 1 from Global Memory
                                      |  load_input |       __
                                      |_____________|----->|  |
                                       _____________       |  | in1_stream
Input Vector 2 from Global Memory --->|             |      |__|
                               __     |  load_input |        |
                              |  |<---|_____________|        |
                   in2_stream |  |     _____________         |
                              |__|--->|             |<--------
                                      | compute_add |      __
                                      |_____________|---->|  |
                                       ______________     |  | out_stream
                                      |              |<---|__|
                                      | store_result |
                                      |______________|-----> Output result to Global Memory

*******************************************************************************/

// Includes
#include <fstream>
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <stdint.h>
#include <hls_stream.h>
#include "ap_int.h"
#include "ap_axi_sdata.h"
#include "hls_stream.h"

// Compile-time sizes for the kernel.
// BUFFER_SIZE is not referenced by the code visible in this file.
#define BUFFER_SIZE 256
// Expected number of elements per vector; only used below to seed c_size.
#define DATA_SIZE 4096
// Number of samples the kernel reads from the DDS stream into wave_out.
#define WAVE_SIZE 1024

// TRIPCOUNT identifier: the value given to the LOOP_TRIPCOUNT pragmas. The
// loops themselves iterate over the runtime `size` argument, not this constant.
const int c_size = DATA_SIZE;

// Load stage: copies `size` words from the array `in` into `inStream`,
// one word per loop iteration, in index order.
static void load_input(uint32_t* in, hls::stream<uint32_t>& inStream, int size) {
mem_rd:
    for (int idx = 0; idx < size; ++idx) {
#pragma HLS LOOP_TRIPCOUNT min = c_size max = c_size
        const uint32_t word = in[idx];
        inStream.write(word);
    }
}

// Compute stage: for each of `size` iterations, pops one word from each input
// stream and pushes their (unsigned 32-bit) sum onto `out_stream`.
static void compute_add(hls::stream<uint32_t>& in1_stream, hls::stream<uint32_t>& in2_stream, hls::stream<uint32_t>& out_stream, int size) {
execute:
    for (int idx = 0; idx < size; ++idx) {
#pragma HLS LOOP_TRIPCOUNT min = c_size max = c_size
        const uint32_t lhs = in1_stream.read();
        const uint32_t rhs = in2_stream.read();
        out_stream.write(lhs + rhs);
    }
}

// Store stage: drains `size` words from `out_stream` into the array `out`,
// writing them at consecutive indices starting from 0.
static void store_result(uint32_t* out, hls::stream<uint32_t>& out_stream, int size) {
mem_wr:
    for (int idx = 0; idx < size; ++idx) {
#pragma HLS LOOP_TRIPCOUNT min = c_size max = c_size
        const uint32_t word = out_stream.read();
        out[idx] = word;
    }
}

// Stream packet types used on the kernel's two stream ports.
// data_pkt  : element type of the dds_in stream (samples coming from the DDS).
// phase_pkt : element type of the phase_out stream (phase words sent to the DDS).
// NOTE(review): the first ap_axis template argument is presumably the data
// width in bits (16 and 31) with the remaining side-channel widths set to 0 -
// confirm against ap_axi_sdata.h and the DDS Compiler IP configuration.
typedef ap_axis<16, 0, 0, 0> data_pkt;
typedef ap_axis<31, 0, 0, 0> phase_pkt;

extern "C" {

/*
    Vector Addition Kernel

    Arguments:
        in1      (input)  --> Input vector 1
        in2      (input)  --> Input vector 2
        out      (output) --> Output vector
        size     (input)  --> Number of elements in vector
        wave_out () --> DDS output waveform
        dds_in   ()  --> DDS input waveform from platform DDS Compiler IP
*/

void krnl_vadd(uint32_t* in1, uint32_t* in2, uint32_t* out, int size,
	       uint32_t *phase, int32_t *wave_out,
	       hls::stream<data_pkt> &dds_in, hls::stream<phase_pkt> &phase_out) {
#pragma HLS INTERFACE m_axi port = in1 bundle = gmem0
#pragma HLS INTERFACE m_axi port = in2 bundle = gmem1
#pragma HLS INTERFACE m_axi port = out bundle = gmem0

    static hls::stream<uint32_t> in1_stream("input_stream_1");
    static hls::stream<uint32_t> in2_stream("input_stream_2");
    static hls::stream<uint32_t> out_stream("output_stream");

#pragma HLS dataflow
    // dataflow pragma instruct compiler to run following three APIs in parallel
    load_input(in1, in1_stream, size);
    load_input(in2, in2_stream, size);
    compute_add(in1_stream, in2_stream, out_stream, size);
    store_result(out, out_stream, size);

    for (int i = 0; i < size; i++) {
	#pragma HLS PIPELINE II = 1
    	phase_pkt val;
    	val.data = phase[i];
    	phase_out.write(val);
    }

    for (int i = 0; i < WAVE_SIZE; i++) {
    #pragma HLS PIPELINE II = 1
    	data_pkt value = dds_in.read();
    	wave_out[i] = value.data;
    }

}
}

Credits

Whitney Knitter

Whitney Knitter

156 projects • 1574 followers
All thoughts/opinions are my own and do not reflect those of any company/entity I currently/previously associate with.

Comments