#include <mbed.h>
 
#if 1
// Memory-mapped debug registers, accessed through volatile pointers so that
// every read and write actually reaches the hardware.
// DWT control register; bit 0 (CYCCNTENA) enables the cycle counter.
#define DWT_CONTROL ((volatile uint32_t *)0xE0001000)
// DWT cycle counter; a 32-bit count that can be read and written directly.
#define DWT_CYCCNT  ((volatile uint32_t *)0xE0001004)
// Debug exception and monitor control register; bit 24 (TRCENA) is set
// before the cycle counter is enabled.
#define SCB_DEMCR   ((volatile uint32_t *)0xE000EDFC)

/**
 * Read the current value of the DWT cycle counter register.
 *
 * @return the 32-bit value read from DWT_CYCCNT.
 */
static inline uint32_t getDwtCyccnt(void)
{
    const uint32_t cycles = *DWT_CYCCNT; // exactly one volatile read
    return cycles;
}

/**
 * Restart the DWT cycle counter from zero by writing 0 to DWT_CYCCNT.
 */
static inline void resetDwtCyccnt(void)
{
    *DWT_CYCCNT = 0u;
}

/**
 * Turn on the DWT cycle counter and start it from zero.
 *
 * The register accesses happen in this order: set TRCENA in SCB_DEMCR,
 * set CYCCNTENA in DWT_CONTROL, then clear DWT_CYCCNT.
 */
static inline void enableDwtCyccnt(void)
{
    // TRCENA (bit 24) = 1
    const uint32_t demcr = *SCB_DEMCR;
    *SCB_DEMCR = demcr | (1u << 24);

    // CYCCNTENA (bit 0) = 1: the counter starts running
    const uint32_t ctrl = *DWT_CONTROL;
    *DWT_CONTROL = ctrl | 1u;

    // begin counting from zero
    *DWT_CYCCNT = 0u;
}

#endif

// Serial port on the USBTX/USBRX pins; all benchmark output is printed through it.
Serial pc(USBTX, USBRX);

/**
 * Benchmark wait_us() against the DWT cycle counter.
 *
 * Enables and resets the cycle counter, takes a baseline reading, then takes
 * one reading after each of ten consecutive wait_us(1000) calls. Every
 * reading is printed on `pc` together with its delta from the previous
 * reading (0 for the baseline).
 */
static void benchmark_wait_us() {
    enum { SAMPLE_COUNT = 11, DELAY_US = 1000 };
    uint32_t count[SAMPLE_COUNT];

    enableDwtCyccnt();
    resetDwtCyccnt();
    count[0] = getDwtCyccnt();
    // Loop overhead is a handful of cycles, negligible next to a 1000 us delay.
    for (size_t i = 1; i < SAMPLE_COUNT; i++) {
        wait_us(DELAY_US);
        count[i] = getDwtCyccnt();
    }

    for (size_t i = 0; i < SAMPLE_COUNT; i++) {
        // Unsigned subtraction yields the correct delta even across a counter wrap.
        const uint32_t diff = (i >= 1) ? (count[i] - count[i - 1]) : 0;
        // uint32_t is unsigned: print via %lu with an explicit cast so the
        // format matches the argument type and large counts are not shown
        // as negative numbers.
        pc.printf("%lu (%lu)\r\n", (unsigned long)count[i], (unsigned long)diff);
    }
}

/**
 * Benchmark a single NOP instruction against the DWT cycle counter.
 *
 * Enables and resets the cycle counter, takes a baseline reading, then takes
 * one reading after each of ten __nop() calls. Every reading is printed on
 * `pc` together with its delta from the previous reading (0 for the
 * baseline).
 *
 * The sampling sequence is deliberately written out by hand: loop overhead
 * would be far larger than the NOP being measured.
 */
static void benchmark_nop() {
    uint32_t count[11];
    const size_t sampleCount = sizeof(count) / sizeof(count[0]);

    enableDwtCyccnt();
    resetDwtCyccnt();
    count[0] = getDwtCyccnt();
    __nop();
    count[1] = getDwtCyccnt();
    __nop();
    count[2] = getDwtCyccnt();
    __nop();
    count[3] = getDwtCyccnt();
    __nop();
    count[4] = getDwtCyccnt();
    __nop();
    count[5] = getDwtCyccnt();
    __nop();
    count[6] = getDwtCyccnt();
    __nop();
    count[7] = getDwtCyccnt();
    __nop();
    count[8] = getDwtCyccnt();
    __nop();
    count[9] = getDwtCyccnt();
    __nop();
    count[10] = getDwtCyccnt();

    for (size_t i = 0; i < sampleCount; i++) {
        // Unsigned subtraction yields the correct delta even across a counter wrap.
        const uint32_t diff = (i >= 1) ? (count[i] - count[i - 1]) : 0;
        // uint32_t is unsigned: print via %lu with an explicit cast so the
        // format matches the argument type.
        pc.printf("%lu (%lu)\r\n", (unsigned long)count[i], (unsigned long)diff);
    }
}

/**
 * Entry point: set the serial port to 115200 baud, print the core clock in
 * MHz and a banner, then run the wait_us benchmark followed by the NOP
 * benchmark.
 */
int main() {
    pc.baud(115200);

    const float coreClockMHz = (float)SystemCoreClock / 1000.0f / 1000.0f;
    pc.printf("CPU SystemCoreClock is %.2f MHz\r\n", coreClockMHz);
    pc.printf("Nucleo compare DWT_CYCCNT to wait_us\r\n");

    benchmark_wait_us();
    benchmark_nop();
}
 