#include "mbed.h"
#include "SDRAM_DISCO_F429ZI.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include "SDFileSystem.h"

#define M_PI           3.14159265358979323846

SDRAM_DISCO_F429ZI sdram;
SDFileSystem sd(PC_12, PC_11, PC_10, PB_2, "sd");// MOSI, MISO, SCK, CS
DigitalOut led_green(LED1);
DigitalOut led_red(LED2);
SPI spi(PC_12, PC_11, PC_10); // mosi, miso, sclk
DigitalOut cs(PB_2);
Serial pc(USBTX, USBRX);

#define line_limit 10 // limit on # of lines to be returned by HoughLines


typedef struct
{
    int x;
    int y;
} ptvec;

typedef struct
{
    float rho, theta;
    double a, b;
    double x0, y0;
    ptvec ptvec1, ptvec2;
    unsigned int strength;
} trans_line;

typedef struct
{
    unsigned int rows;
    unsigned int cols;
    unsigned int channels;
    uint32_t data;
} Image;

typedef struct
{
    int rho;
    int theta;
    unsigned int strength;
} line_polar;


int bmpread(char *filename, Image *dst)
{
    pc.printf("Reading Image file: %s\r\n", filename);
    FILE *file;
    unsigned char tempSize4[4];
    unsigned int width, height;
    unsigned short planes, bpp;
    
    if ((file = fopen(filename, "rb"))==NULL)
    {
        pc.printf("File Not Found : %s\r\n", filename);
        return 0;
    }
    
    // seek through the bmp header, up to the width/height:
    fseek(file, 18, SEEK_CUR);
    if ( fread( tempSize4, 1, 4, file) < 4 )
        return 0;
    width = (tempSize4[3]<<24) | (tempSize4[2]<<16) | (tempSize4[1]<<8) | tempSize4[0]; // big endian
    if ( fread( tempSize4, 1, 4, file) < 4 )
        return 0;
    height = (tempSize4[3]<<24) | (tempSize4[2]<<16) | (tempSize4[1]<<8) | tempSize4[0];;
    
    pc.printf("width: %d, height: %d\r\n", width, height);
    
    dst->rows = height;
    dst->cols = width;
    
    
    if ( fread( tempSize4, 1, 2, file) < 2 )
        return 0;
    planes = (tempSize4[1]<<8) | tempSize4[0]; // big endian
    dst->channels = planes;
    
    if ( fread( tempSize4, 1, 2, file) < 2 )
        return 0;
    bpp = (tempSize4[1]<<8) | tempSize4[0]; // big endian
    pc.printf("planes: %d, bpp: %d\r\n", planes, bpp);
    
    fseek(file, 24, SEEK_CUR);
    unsigned char *readline;
    unsigned int padding=0; while ((width*3+padding) % 4!=0) padding++;
    unsigned int linewidth = width * 3 + padding;
    readline = (unsigned char*)malloc(linewidth * sizeof(unsigned char));
    unsigned char R, G, B;
    int i, j;
    
    dst->data = 1;
    unsigned char *dst_buf= (unsigned char *) malloc(width * sizeof(unsigned char));
    uint32_t buffer[width/4];
    
    pc.printf("Converting to grayscale and Storing to SDRAM\r\n");
    for(i = height-1; i >= 0; i--)
    {
        fread(readline, sizeof(unsigned char), linewidth, file);
        for(j = 0; j < width; j++)
        {
            B = readline[3*j];
            G = readline[3*j+1];
            R = readline[3*j+2];
            dst_buf[j] =  (0.299*R) + (0.587*G) + (0.114*B);
        }
        for(j = 0; j < width/4; j++)
        {
            buffer[j] = (16777216*dst_buf[j*4]) + (65536*dst_buf[j*4+1]) + (dst_buf[j*4+2]*256) + dst_buf[j*4+3];
        }
        sdram.WriteData(SDRAM_DEVICE_ADDR + dst->data + (i*width), buffer, width/4);
        j = height / 10;
        if(i%j == 0)
            pc.printf("%d%%...",(100 - (i/j*10)));
    }
    pc.printf("\r\nImage loaded\r\n");
    free(readline);
    free(dst_buf);
    fclose(file);
    
    return 1;
}

int Round(double number)
{
    double diff_down, diff_up;
    diff_down = number - (int)number;
    diff_up = 1 - diff_down;
    if(diff_down < diff_up)
        return (int)number;
    else
        return 1+(int)number;
}

void matrix_convolution(Image *src, Image *dst, double **mask, int width)
{
    unsigned char lines[width][src->cols];
    unsigned char result[src->cols];
    uint32_t buffer[src->cols/4];
    int i, j, k, r, c, actual_r, actual_c, scope;
    double gaussian_sum;
    //int flag = 1;
    scope = width/2;
    for(i = 0; i < src->rows; i++)
    {
        if(i < scope || (src->rows - i <= scope))
        {
            sdram.ReadData(i*(src->cols) + SDRAM_DEVICE_ADDR + (src->data), buffer, src->cols/4);
            sdram.WriteData(i*(src->cols) + SDRAM_DEVICE_ADDR + (dst->data), buffer, src->cols/4);
            continue;
        }
        for(j = 0; j < width; j++)
        {
            sdram.ReadData((i-scope+j)*(src->cols) + SDRAM_DEVICE_ADDR + (src->data), buffer, src->cols/4);
            for(k = 0; k < src->cols/4; k++)
            {
                lines[j][k*4+3] = buffer[k];
                lines[j][k*4+2] = buffer[k] >> 8;
                lines[j][k*4+1] = buffer[k] >> 16;
                lines[j][k*4]   = buffer[k] >> 24;
            }
        }
        //pc.printf("line %d, data = %d\r\n", i, (int)lines[scope][0]);
        
        for(j = 0; j < src->cols; j++)
        {
            if( (j < scope) || (src->cols - j <= scope))
            {
                result[j] = lines[scope][j];
                continue;
            }
            gaussian_sum = 0;
            for(r = 0; r < width; r++)
            {
                for(c = 0; c < width; c++)
                {
                    actual_r = r;
                    actual_c = j + c - scope;
                    /*
                    if(flag == 1)
                    {
                        pc.printf("%d, %d mask = %.4f, value = %d\r\n", r,c,mask[r][c],(int)lines[actual_r][actual_c]);
                    }
                    */
                    double op1 = mask[r][c];
                    double op2 = (double) lines[actual_r][actual_c];
                    double temp = op1 * op2;
                    gaussian_sum = gaussian_sum + temp;
                }
            }
            //flag = 0;
            if(gaussian_sum > 255) gaussian_sum = 255;
            if(gaussian_sum < 0) gaussian_sum  = 0;
            result[j] = (unsigned char)Round(gaussian_sum);
            //pc.printf("%d ", result[j]);
        }
        //pc.printf("\r\n");
        for(j = 0; j < src->cols/4; j++)
        {
            buffer[j] = (16777216*result[j*4]) + (65536*result[j*4+1]) + (result[j*4+2]*256) + result[j*4+3];
        }
        sdram.WriteData(SDRAM_DEVICE_ADDR + dst->data + (i*src->cols), buffer, src->cols/4);
        j = src->rows / 10;
        if(i%j == 0)
            pc.printf("%d%%...",i/j*10);
    }
    pc.printf("\r\n");
    return;
}


void myGaussianBlur(Image *src, Image *dst, int width, double sigma)
{
    // build the Gaussian Kernal
    // ref: http://stackoverflow.com/questions/8204645/implementing-gaussian-blur-how-to-calculate-convolution-matrix-kernel
    
    pc.printf("\r\nGaussian Blur Module Invoked\r\n");
    dst->rows = src->rows;
    dst->cols = src->cols;
    dst->channels = src->channels;
    dst->data = src->data + (src->rows * src->cols);
    
    int W = width;
    int i, x, y;
    double **kernel;
    kernel = (double **) malloc(W * sizeof(double *));
    for(i = 0; i < W; i++)
        kernel[i] = (double *) malloc(W * sizeof(double));
    double mean = W/2;
    double sum = 0.0; // For accumulating the kernel values
    for (x = 0; x < W; ++x)
    {
        for (y = 0; y < W; ++y)
        {
            kernel[x][y] = exp( -0.5 * (pow((x-mean)/sigma, 2.0) + pow((y-mean)/sigma,2.0)) )
            / (2 * M_PI * sigma * sigma);
            
            // Accumulate the kernel values
            sum += kernel[x][y];
        }
    }
    // Normalize the kernel
    //pc.printf("Gaussian Kernal in use:\r\n");
    for (x = 0; x < W; ++x)
    {
        for (y = 0; y < W; ++y)
        {
            kernel[x][y] /= sum;
            //pc.printf("%.7f ", kernel[x][y]);
        }
        //pc.printf("\r\n");
    }
    
    // do matrix convolution
    matrix_convolution(src, dst, kernel, width);

    
    for(i = 0; i < W; i++)
        free(kernel[i]);
    free(kernel);
    return;
}

void myCanny(Image *src, Image *dst, double min_threshold, double max_threshold)
{
    // ref:
    // http://dasl.mem.drexel.edu/alumni/bGreen/www.pages.drexel.edu/_weg22/can_tut.html
    // https://en.wikipedia.org/wiki/Canny_edge_detector
    
    pc.printf("\r\nCanny Edge Detection Module Invoked\r\n");
    
    dst->rows = src->rows;
    dst->cols = src->cols;
    dst->channels = src->channels;
    dst->data = src->data + (src->rows * src->cols);
    
    // de-noise using local Gaussian Blur, simulating size = 5, sigma = 1.4
    pc.printf("Smoothing image...\r\n");
    double mask[5][5] =
    {
        {2,  4,  5,  4,  2},
        {4,  9, 12,  9,  4},
        {5, 12, 15, 12,  5},
        {4,  9, 12,  9,  4},
        {2,  4,  5,  4,  2}
    };
    double **kernel;
    int i, j;
    kernel = (double **) malloc(5 * sizeof(double *));
    for(i = 0; i < 5; i++)
        kernel[i] = (double *) malloc(5 * sizeof(double));
    for(i = 0; i < 5; i++)
    {
        for(j = 0; j < 5; j++)
        {
            kernel[i][j] = mask[i][j] / 159;
            //printf("%.7f ", kernel[i][j]);
        }
        //printf("\n");
    }
    matrix_convolution(src, dst, kernel, 5);
    for(i = 0; i < 5; i++)
        free(kernel[i]);
    free(kernel);
    
    // find gradient x and gradient y
    pc.printf("Finding gradient in x and y for each pixel...\r\n");
    Image Gx, Gy;   // Gx and Gy are used to store x-gradient and y-gradient
    Gx.cols = src->cols;
    Gx.rows = src->rows;
    Gx.data = dst->data + (dst->cols * dst->rows);
    Gy.cols = src->cols;
    Gy.rows = src->rows;
    Gy.data = Gx.data + (dst->cols * dst->rows);
    
    // size-3 Sobel mask
    double sobel_x[3][3] = {{-1, 0, 1}, {-2, 0, 2}, {-1, 0, 1}};
    double sobel_y[3][3] = {{-1, -2, -1}, {0, 0, 0}, {1, 2, 1}};
    double **mask_x, **mask_y;
    mask_x = (double **) malloc(3 * sizeof(double *));
    for(i = 0; i < 3; i++)
        mask_x[i] = (double *) malloc(3 * sizeof(double));
    mask_y = (double **) malloc(3 * sizeof(double *));
    for(i = 0; i < 3; i++)
        mask_y[i] = (double *) malloc(3 * sizeof(double));
    for(i = 0; i < 3; i++)
    {
        for(j = 0; j < 3; j++)
        {
            mask_x[i][j] = sobel_x[i][j];
            mask_y[i][j] = sobel_y[i][j];
        }
    }
    pc.printf("in x: ");
    matrix_convolution(src, &Gx, mask_x, 3);
    pc.printf("in y: ");
    matrix_convolution(src, &Gy, mask_y, 3);
    for(i = 0; i < 3; i++)
    {
        free(mask_x[i]);
        free(mask_y[i]);
    }
    free(mask_x);
    free(mask_y);
    //pc.printf("dst at %d, Gx at %d, Gy at %d\r\n", dst->data, Gx.data, Gy.data);
    
    // calculate gradient strength (store in Gx) and direction (store in Gy)
    pc.printf("Calculating gradient strength and direction...\r\n");
    double gx_val, gy_val;
    double arctan_val;
    int over_threshold_count = 0;  // number used for adaptive non-max suppression
    uint32_t buffer[src->cols/4];
    unsigned char Gx_buf[src->cols], Gy_buf[src->cols];
    //pc.printf("%d %d %d %d %d %d", src->rows, src->cols, Gx.cols, Gx.rows, Gy.cols, Gy.rows);
    for(i = 0; i < src->rows; i++)
    {
        sdram.ReadData(i*(Gx.cols) + SDRAM_DEVICE_ADDR + (Gx.data), buffer, Gx.cols/4);
        for(j = 0; j < Gx.cols/4; j++)
        {
            Gx_buf[j*4+3] = buffer[j];
            Gx_buf[j*4+2] = buffer[j] >> 8;
            Gx_buf[j*4+1] = buffer[j] >> 16;
            Gx_buf[j*4]   = buffer[j] >> 24;
        }
        sdram.ReadData(i*(Gy.cols) + SDRAM_DEVICE_ADDR + (Gy.data), buffer, Gy.cols/4);
        for(j = 0; j < Gy.cols/4; j++)
        {
            Gy_buf[j*4+3] = buffer[j];
            Gy_buf[j*4+2] = buffer[j] >> 8;
            Gy_buf[j*4+1] = buffer[j] >> 16;
            Gy_buf[j*4]   = buffer[j] >> 24;
        }
        
        for(j = 0; j < src->cols; j++)
        {
            gx_val = Gx_buf[j];
            gy_val = Gy_buf[j];
            
            Gx_buf[j] = (unsigned char)sqrt(pow(gx_val, 2) + pow(gy_val, 2));
            if(Gx_buf[j] > max_threshold)
                over_threshold_count++;
            arctan_val = atan(gy_val / gx_val);
            if(arctan_val >= 0.41421 && arctan_val < 2.414421)    // 22.5 to 67.5 deg
                Gy_buf[j] = 45;
            else if(arctan_val >= 2.41421 || arctan_val < -2.41421) // 67.5 to 112.5 deg
                Gy_buf[j] = 90;
            else if(arctan_val >= -2.41421 && arctan_val < 0.41421) // 112.5 to 157.5 deg
                Gy_buf[j] = 135;
            else    // 0 to 22.5 and 157.5 to 180 deg
                Gy_buf[j] = 0;
        }
        
        for(j = 0; j < Gx.cols/4; j++)
        {
            buffer[j] = (16777216*Gx_buf[j*4]) + (65536*Gx_buf[j*4+1]) + (Gx_buf[j*4+2]*256) + Gx_buf[j*4+3];
        }
        sdram.WriteData(SDRAM_DEVICE_ADDR + Gx.data + (i*Gx.cols), buffer, Gx.cols/4);
        for(j = 0; j < Gy.cols/4; j++)
        {
            buffer[j] = (16777216*Gy_buf[j*4]) + (65536*Gy_buf[j*4+1]) + (Gy_buf[j*4+2]*256) + Gy_buf[j*4+3];
        }
        sdram.WriteData(SDRAM_DEVICE_ADDR + Gy.data + (i*Gy.cols), buffer, Gy.cols/4);
    }
    
    // Non-maximum suppression. Aim is to shrink the edge width
    pc.printf("Non-maximum suppression...\r\n");
    int mask_radius = 1;    // hard-coded parameter
    int r, c, not_max_flag, vertical_neighbor;
    Image sup_matrix;      // maxtrix used to keep track of local maxima stutas for each pixel
    sup_matrix.cols = dst->cols;
    sup_matrix.rows = dst->rows;
    sup_matrix.data = Gy.data + (dst->cols * dst->rows);
    double skip_sup_ratio = (double)over_threshold_count/8800; // 10000 pixels should be found in a 800x600 pic
    if(skip_sup_ratio < 1)
        skip_sup_ratio = 1;
    else if(skip_sup_ratio > 1.8)
        skip_sup_ratio = 1.8;
    pc.printf("Over max threshold count: %d\r\n", over_threshold_count);
    //pc.printf("Skip suppression ratio: %.2f\r\n", skip_sup_ratio);
    unsigned char sup_buf[dst->cols], Gx_buf_array[3][dst->cols], Gy_buf_array[3][dst->cols];
    for(i = mask_radius; i < src->rows - mask_radius; i++)
    {
        sup_buf[j] = 0;   // in sup_matrix, 0 is default value
        for(r = 0; r < 3; r++)
        {
            sdram.ReadData((i+r-1)*(Gx.cols) + SDRAM_DEVICE_ADDR + (Gx.data), buffer, Gx.cols/4);
            for(j = 0; j < Gx.cols/4; j++)
            {
                Gx_buf_array[r][j*4+3] = buffer[j];
                Gx_buf_array[r][j*4+2] = buffer[j] >> 8;
                Gx_buf_array[r][j*4+1] = buffer[j] >> 16;
                Gx_buf_array[r][j*4]   = buffer[j] >> 24;
            }
        }
        for(r = 0; r < 3; r++)
        {
            sdram.ReadData((i+r-1)*(Gy.cols) + SDRAM_DEVICE_ADDR + (Gy.data), buffer, Gy.cols/4);
            for(j = 0; j < Gy.cols/4; j++)
            {
                Gy_buf_array[r][j*4+3] = buffer[j];
                Gy_buf_array[r][j*4+2] = buffer[j] >> 8;
                Gy_buf_array[r][j*4+1] = buffer[j] >> 16;
                Gy_buf_array[r][j*4]   = buffer[j] >> 24;
            }
        }
    
        for(j = mask_radius; j < src->cols - mask_radius; j++)
        {
            if( j < mask_radius || (j >= src->rows - mask_radius))
            {
                sup_buf[j] = 1;
                continue;
            }
            not_max_flag = 0;
            
            // pixels with significant high magnitute will never be suppressed
            if(Gx_buf_array[1][j] > 1.8 * max_threshold)
            {
                sup_buf[j] = 2;   // in sup matrix, 2 means maxima's
                continue;
            }
            
            // for other pixels, compare its gradient magnitute with its neighbor
            for(r = -mask_radius; r <= mask_radius; r++)
            {
                if(not_max_flag == 1)
                    break;
                for(c = -mask_radius; c <= mask_radius; c++)
                {
                    vertical_neighbor = 0;
                    // only neighbors vertical to the gradient direction will be checked
                    switch (Gy_buf_array[1][j])
                    {
                        case 0:
                            if(c == 0 && (r == -1 || r ==1))
                                vertical_neighbor = 1;
                            break;
                        case 45:
                            if((r == 1 && c == -1) || (r == -1 && c == 1))
                                vertical_neighbor = 1;
                            break;
                        case 90:
                            if(r == 0 && (c == -1 || c == 1))
                                vertical_neighbor = 1;
                        case 135:
                            if((r == 1 && c == 1) || (r == -1 && c == -1))
                                vertical_neighbor = 1;
                        default:
                            break;
                    }
                    if(vertical_neighbor != 1)
                        continue;
                    
                    if(Gy_buf_array[1][j] == Gy_buf_array[1+r][j+c])
                    {
                        if(Gx_buf_array[1][j] < Gx_buf_array[1+r][j+c])
                        {
                            not_max_flag = 1;
                            sup_buf[j] = 1;
                            break;
                        }
                    }
                }
            }
        }
        
        for(j = 0; j < sup_matrix.cols/4; j++)
        {
            buffer[j] = (16777216*sup_buf[j*4]) + (65536*sup_buf[j*4+1]) + (sup_buf[j*4+2]*256) + sup_buf[j*4+3];
        }
        sdram.WriteData(SDRAM_DEVICE_ADDR + sup_matrix.data + (i*sup_matrix.cols), buffer, sup_matrix.cols/4);
    }
    
        // Gy is no longer used
    
    // Double threshold
    pc.printf("Double threshold...\r\n");
    unsigned char dst_buf[dst->cols];
    int a = 0, b = 0, d = 0;
    for(i = 0; i < src->rows; i++)
    {
        sdram.ReadData(i*(Gx.cols) + SDRAM_DEVICE_ADDR + (Gx.data), buffer, Gx.cols/4);
        for(j = 0; j < Gx.cols/4; j++)
        {
            Gx_buf[j*4+3] = buffer[j];
            Gx_buf[j*4+2] = buffer[j] >> 8;
            Gx_buf[j*4+1] = buffer[j] >> 16;
            Gx_buf[j*4]   = buffer[j] >> 24;
        }
        sdram.ReadData(i*(sup_matrix.cols) + SDRAM_DEVICE_ADDR + (sup_matrix.data), buffer, sup_matrix.cols/4);
        for(j = 0; j < Gx.cols/4; j++)
        {
            sup_buf[j*4+3] = buffer[j];
            sup_buf[j*4+2] = buffer[j] >> 8;
            sup_buf[j*4+1] = buffer[j] >> 16;
            sup_buf[j*4]   = buffer[j] >> 24;
        }
        
        for(j = 0; j < src->cols; j++)
        {
            if(i == 0 || j == 0 || i == src->rows-1 || j == src->cols-1)
            {
                dst_buf[j] = 0;
                b++;
                continue;
            }
            if(sup_buf[j] == 1)
                Gx_buf[j] = 0;
            if(Gx_buf[j] > max_threshold)   // strong edge
            {
                dst_buf[j] = 255;
                a++;
            }
            else if(Gx_buf[j] < min_threshold)  // weak edge
            {
                dst_buf[j] = 0;
                b++;
            }
            else    // in middle: later check if it is connected to a strong edge
            {
                dst_buf[j] = Gx_buf[j];
                d++;
            }
        }
        for(j = 0; j < dst->cols/4; j++)
        {
            buffer[j] = (16777216*dst_buf[j*4]) + (65536*dst_buf[j*4+1]) + (dst_buf[j*4+2]*256) + dst_buf[j*4+3];
        }
        sdram.WriteData(SDRAM_DEVICE_ADDR + dst->data + (i*dst->cols), buffer, dst->cols/4);
    }
    pc.printf("strong = %d, weak = %d, middle = %d\r\n", a, b, d);
    
    int k;
    // Gx and sup_matrix is no longer used
    
    // pixel which has magnitude in-middle will be preserved if connected to a strong edge
    pc.printf("Edge tracking by hysteresis...\r\n");
    short min_neighbor;
    unsigned char dst_buf_array[3][dst->cols];
    for(i = 0; i < 10; i++)
    {
        for(j = 1; j < dst->rows-1; j++)
        {
            for(r = 0; r < 3; r++)
            {
                sdram.ReadData((i+r-1)*(dst->cols) + SDRAM_DEVICE_ADDR + (dst->data), buffer, dst->cols/4);
                for(k = 0; k < dst->cols/4; k++)
                {
                    dst_buf_array[r][k*4+3] = buffer[k];
                    dst_buf_array[r][k*4+2] = buffer[k] >> 8;
                    dst_buf_array[r][k*4+1] = buffer[k] >> 16;
                    dst_buf_array[r][k*4]   = buffer[k] >> 24;
                }
            }
            for(k = 1; k < dst->cols - 1; k++)
            {
                if(dst_buf_array[1][j] == 0 || dst_buf_array[1][j] == 255)
                    continue;
                min_neighbor = 0;
                for(r = -1; r <= 1; r++)
                {
                    for(c = -1; c <= 1; c++)
                    {
                        if(dst_buf_array[1+r][j+c] > max_threshold)
                            dst_buf_array[1][j] = 255;
                        else if(dst_buf_array[1+r][j+c] < min_threshold)
                            min_neighbor++;
                    }
                }
                if(min_neighbor == 8)
                    dst_buf_array[1][j] = 0;
            }
            for(k = 0; k < dst->cols/4; k++)
            {
                buffer[j] = (16777216*dst_buf_array[1][k*4]) + (65536*dst_buf_array[1][k*4+1]) + (dst_buf_array[1][k*4+2]*256) + dst_buf_array[1][k*4+3];
            }
            sdram.WriteData(SDRAM_DEVICE_ADDR + dst->data + (i*dst->cols), buffer, dst->cols/4);
        }
    }
    
    // those can't reach to a strong edge in 10 pixels will be supressed
    for(i = 0; i < src->rows; i++)
    {
        sdram.ReadData((i)*(dst->cols) + SDRAM_DEVICE_ADDR + (dst->data), buffer, dst->cols/4);
        for(k = 0; k < dst->cols/4; k++)
        {
            dst_buf[k*4+3] = buffer[k];
            dst_buf[k*4+2] = buffer[k] >> 8;
            dst_buf[k*4+1] = buffer[k] >> 16;
            dst_buf[k*4]   = buffer[k] >> 24;
        }
        for(j = 0; j < src->cols; j++)
        {
            if((dst_buf[j] == 255) || (dst_buf[j] == 0))
                continue;
            else
                dst_buf[j] = 0;
        }
        for(j = 0; j < dst->cols/4; j++)
        {
            buffer[j] = (16777216*dst_buf[j*4]) + (65536*dst_buf[j*4+1]) + (dst_buf[j*4+2]*256) + dst_buf[j*4+3];
        }
        sdram.WriteData(SDRAM_DEVICE_ADDR + dst->data + (i*dst->cols), buffer, dst->cols/4);
    }
    
    int zero = 0, max = 0, in_middle = 0;
    for(i = 0; i < src->rows; ++i)
    {

        sdram.ReadData((i)*(dst->cols) + SDRAM_DEVICE_ADDR + (dst->data), buffer, dst->cols/4);
        for(k = 0; k < dst->cols/4; k++)
        {
            dst_buf[k*4+3] = buffer[k];
            dst_buf[k*4+2] = buffer[k] >> 8;
            dst_buf[k*4+1] = buffer[k] >> 16;
            dst_buf[k*4]   = buffer[k] >> 24;
        }

        for ( j = 0; j < src->cols; ++j)
        {
            if(dst_buf[j] == 0)
                zero++;
            else if(dst_buf[j] == 255)
                max++;
            else
                in_middle++;
        }
    }
    pc.printf("zero = %ld, max = %ld, in_middle = %ld\r\n", zero, max, in_middle);
    pc.printf("SDRAM peak usage %ld\r\n", sup_matrix.data + (src->cols * src->rows) -1);
    
    return;
}

int myHoughLines(Image *src, line_polar *line_list, int threshold, int number)
{
    // ref:
    // http://www.keymolen.com/2013/05/hough-transformation-c-implementation.html
    // https://en.wikipedia.org/wiki/Hough_transform
    
    pc.printf("\r\nHoughLines Module Invoked\r\n");
    
    // initialize the accumulator for lines. rho and theta are resolutions in distance and angle (all set to 1)
    unsigned int dimension_r, dimension_t, i, j;
    //if(src->channels != 1)  // only accept grayscal images
    //    return -1;
    if(src->rows >= src->cols)
        dimension_r = src->rows;
    else
        dimension_r = src->cols;
    dimension_r = (unsigned int) ((double)dimension_r * sqrt((double)2)) + 1;
    dimension_t = 360;
    
    uint32_t error_count = 0, addr = src->data + (src->rows * src->cols);
    pc.printf("accumulator start at %ld\r\n", addr-1);
    uint32_t *accumulator = (uint32_t*)malloc(dimension_t*sizeof(uint32_t));
    for(i = 0; i < dimension_r; i++)
    {
        for(j = 0; j < dimension_t; j++)
            accumulator[j] = 0;
        sdram.WriteData(SDRAM_DEVICE_ADDR + (4*i*dimension_t) + addr, accumulator, dimension_t);
    }
    for(i = 0; i < dimension_r;i++)
    {
        sdram.ReadData(SDRAM_DEVICE_ADDR + (4*i*dimension_t) + addr, accumulator, dimension_t);
        for(j = 0; j < dimension_t; j++)
        {
            if(accumulator[j] != 0)
            {
                error_count++;
            }
        }
    }
    pc.printf("error_count = %ld\r\n", error_count);
    
    pc.printf("Accumulator dimension: %d(rho) by %d(theta)\r\n", dimension_r, dimension_t);
    
    // scan through all edge pixels.
    int t, neg_r = 0, round_up = 0, round_down = 0, pixel_count = 0;
    double new_x, new_y, theta_rad, r;
    unsigned char src_buf[src->cols];
    uint32_t buffer[src->cols/4], tmp_accu;
    
    for(i = 0; i < src->rows; i++)
    {
        sdram.ReadData(i*(src->cols) + SDRAM_DEVICE_ADDR + (src->data), buffer, src->cols/4);
        for(j = 0; j < src->cols/4; j++)
        {
            src_buf[j*4+3] = buffer[j];
            src_buf[j*4+2] = buffer[j] >> 8;
            src_buf[j*4+1] = buffer[j] >> 16;
            src_buf[j*4]   = buffer[j] >> 24;
        }
        for(j = 0; j < src->cols; j++)
        {
            if(src_buf[j] != 255)  // only consider edge pixels (who have value of 255)
                continue;
            pixel_count++;
            for(t = 0; t < dimension_t; t++)    // for each theta, calc the corresponding rho
            {
                new_x = (double)i;
                new_y = (double)j;
                theta_rad = (double)t * (M_PI / 180);
                r = (new_y * cos(theta_rad)) + (new_x * sin(theta_rad));
                
                if(r <= 0) // negative rho = positive rho, they are duplicated lines that can be discarded
                {
                    neg_r++;
                    continue;
                }
                
                if( r - (int)r >= 0.5)  // round up to the bin
                {
                    round_up++;
                    sdram.ReadData(4*dimension_t*((int)r +1) + SDRAM_DEVICE_ADDR + (addr) + (4*t), &tmp_accu, 1);
                    tmp_accu++;
                    if(tmp_accu > 503640)
                        printf("(%d, %d)\r\n", (int)r +1, t);
                    sdram.WriteData(4*dimension_t*((int)r +1) + SDRAM_DEVICE_ADDR + (addr) + (4*t), &tmp_accu, 1);
                }
                else    // round down to the bin
                {
                    round_down++;
                    sdram.ReadData(4*dimension_t*((int)r) + SDRAM_DEVICE_ADDR + (addr) + (4*t), &tmp_accu, 1);
                    tmp_accu++;
                    sdram.WriteData(4*dimension_t*((int)r) + SDRAM_DEVICE_ADDR + (addr) + (4*t), &tmp_accu, 1);
                }
            }
        }
        j = src->rows / 10;
        if(i%j == 0)
            pc.printf("%d%%...",i/j*10);
    }
    pc.printf("\r\n");
    pc.printf("negatve rho = %d\r\nround up = %d, round down = %d\r\n", neg_r, round_up, round_down);
    pc.printf("pixel count = %d\r\n", pixel_count);
    // find top x lines in rho-theta space
    int weakest = 0, count = 0, weakest_pt = 0;
    line_polar local_line_list[number];
    for(i = 0; i < dimension_r; i++)
    {
        sdram.ReadData(4*i*dimension_t + SDRAM_DEVICE_ADDR + (addr), accumulator, dimension_t);
        for(j = 0; j < dimension_t; j++)
        {
            if(accumulator[j] < threshold || accumulator[j] <= weakest)
                continue;
            
            local_line_list[weakest_pt].rho = i;
            local_line_list[weakest_pt].theta = j;
            sdram.ReadData(4*dimension_t*i + SDRAM_DEVICE_ADDR + (addr) + (4*j), &tmp_accu, 1);
            local_line_list[weakest_pt].strength = tmp_accu;
            count++;
            
            if(count < number)
            {
                weakest_pt++;
                weakest = 0;
            }
            else
            {
                weakest = local_line_list[0].strength;
                weakest_pt = 0;
                for(t = 1; t < number; t++)
                {
                    if(local_line_list[t].strength < weakest)
                    {
                        weakest = local_line_list[t].strength;
                        weakest_pt = t;
                    }
                }
            }
        }
    }
    
    // copy those found lines baack to main's stack
    memcpy(line_list, local_line_list, number*sizeof(line_polar));
    
    for(i = 0; i < number; i++)
    {
        if(count == i)
            break;
        printf("line %d: rho = %d, theta = %d, strength = %d\r\n", i, local_line_list[i].rho, local_line_list[i].theta, local_line_list[i].strength);
    }
    printf("\r\n");
    free(accumulator);
    // return the actual number of lines found above threshold
    return count;
}

int det(ptvec a, ptvec b)
{
    return (a.x * b.y) - (a.y * b.x);
}

int line_intersection(trans_line line1, trans_line line2, ptvec *result)
{
    ptvec xdiff, ydiff, d;
    xdiff.x = line1.ptvec1.x - line1.ptvec2.x;
    xdiff.y = line2.ptvec1.x - line2.ptvec2.x;
    ydiff.x = line1.ptvec1.y - line1.ptvec2.y;
    ydiff.y = line2.ptvec1.y - line2.ptvec2.y;
    
    int div = det(xdiff, ydiff);
    if(div == 0)
    {
        // Lines don't intersect
        return 0;
    }
    
    d.x = det(line1.ptvec1, line1.ptvec2);
    d.y = det(line2.ptvec1, line2.ptvec2);
    result->x = (int) det(d, xdiff) / div;
    result->y = (int) det(d, ydiff) / div;
    return 1;
}

double LineAnalysis(trans_line *line_list, int line_count, int dim_x, int dim_y)
{
    printf("Needle Position Detection Module Invoked.\r\n");
    int strongest_pt = 0, i = 0, j = 0, upper_count = 0, lower_count = 0;
    int upper_bin[line_count];
    int lower_bin[line_count];
    double average_deg = 0, diff, theta_deg;
    
    // find the line with highest confidence
    for(i = 0; i < line_count; i++)
    {
        if(line_list[i].strength > line_list[strongest_pt].strength)
            strongest_pt = i;
    }
    printf("Line used as reference of de-noise: rho = %d, ", (int)line_list[strongest_pt].rho);
    printf("theta = %d\r\n", (int) (line_list[strongest_pt].theta * 180 / M_PI));
    
    // delete lines whose theta varies > 10 deg from the strongest line
    for(i = 0; i < line_count; i++)
    {
        diff = 180 * line_list[strongest_pt].theta / M_PI;
        diff = diff - (180 * line_list[i].theta / M_PI);
        if(diff >= 10 || diff <= -10)
        {
            printf("Line with confidence of %d is suppressed.\r\n", line_list[i].strength);
            line_list[i].strength = 0;
            continue;
        }
        average_deg = (180 * line_list[i].theta / M_PI) + average_deg;
        j++;
    }
    average_deg = average_deg / j;
    printf("Theta used to divide lines into bins: %.2f\r\n", average_deg);
    
    // categorize all lines into 2 bins depends on whether its theta is higher than average.
    for(i = 0; i < line_count; i++)
    {
        if(line_list[i].strength == 0)
            continue;
        
        theta_deg = line_list[i].theta * 180 / M_PI;
        if(theta_deg > average_deg)
        {
            upper_bin[upper_count] = i;
            upper_count++;
        }
        else
        {
            lower_bin[lower_count] = i;
            lower_count++;
        }
    }
    
    // choose the bin with minority of lines.
    // find a line in the bin with highest / lowest theta if we chose upper / lower bin
    // the chosen line is considered as the line mostly diverted from the other lines but yet near needle
    // find all intersections between this line and the lines in the bin with majority of lines
    // since this line is well seperated from others, points found should be far enough from gauge center
    int ref_pt = 0, inter_count = 0;
    ptvec intersect_list[line_count];
    if(lower_count < upper_count)
    {
        for(i = 0; i < lower_count; i++)
        {
            if(line_list[lower_bin[i]].theta < line_list[lower_bin[ref_pt]].theta)
                ref_pt = i;
        }
        average_deg = (180 * line_list[lower_bin[ref_pt]].theta / M_PI);
        printf("Line in lower_bin is used to find intersections:\r\n");
        printf("rho = %d, ", (int) (line_list[lower_bin[ref_pt]].rho));
        printf("theta = %d ", (int) (line_list[lower_bin[ref_pt]].theta * 180 / (M_PI)));
        printf("strength = %d\r\n", (int) (line_list[lower_bin[ref_pt]].strength));
        for(i = 0; i < upper_count; i++)
        {
            if(line_intersection(line_list[lower_bin[ref_pt]], line_list[upper_bin[i]], &intersect_list[inter_count]) == 1)
                inter_count++;
        }
        ref_pt = 0;
        for(i = 0; i < upper_count; i++)
        {
            if(line_list[upper_bin[i]].theta > line_list[upper_bin[ref_pt]].theta)
                ref_pt = i;
        }
        average_deg = (180 * line_list[upper_bin[ref_pt]].theta / M_PI) + average_deg;
        average_deg = average_deg / 2;
    }
    else
    {
        for(i = 0; i < upper_count; i++)
        {
            if(line_list[upper_bin[i]].theta > line_list[upper_bin[ref_pt]].theta)
                ref_pt = i;
        }
        average_deg = (180 * line_list[upper_bin[ref_pt]].theta / M_PI);
        printf("Line in upper_bin is used to find intersections:\r\n");
        printf("rho = %d, ", (int) (line_list[upper_bin[ref_pt]].rho));
        printf("theta = %d ", (int) (line_list[upper_bin[ref_pt]].theta * 180 / (M_PI)));
        printf("strength = %d\r\n", (int) (line_list[upper_bin[ref_pt]].strength));
        for(i = 0; i < lower_count; i++)
        {
            if(line_intersection(line_list[upper_bin[ref_pt]], line_list[lower_bin[i]], &intersect_list[inter_count]) == 1)
                inter_count++;
        }
        ref_pt = 0;
        for(i = 0; i < lower_count; i++)
        {
            if(line_list[lower_bin[i]].theta < line_list[lower_bin[ref_pt]].theta)
                ref_pt = i;
        }
        average_deg = (180 * line_list[lower_bin[ref_pt]].theta / M_PI) + average_deg;
        average_deg = average_deg / 2;
    }
    
    // categorize all intersections points into 4 bins
    //  ----------------> x
    //  |       |
    //  |  III  |   IV
    //  |       |
    //  |---------------
    //  |       |
    //  |  II   |   I
    //  v       |
    //  y
    // the reason is that the center of the pic should be close to the center of the gauge.
    // the angle of the needle is already detected. we don't need actually find the center
    // to tell the direction of the needle. the distribution of intersections can provide
    // enough info to do the desicion.
    
    int cor_count[4] = {0, 0, 0, 0};
    int var_x = 0, var_y = 0;
    printf("Intersections:\r\n");
    for(i = 0; i < inter_count; i++)
    {
        if(intersect_list[i].x > dim_x /2)
        {
            if(intersect_list[i].y > dim_y/2)
                cor_count[0]++;
            else
                cor_count[3]++;
        }
        else
        {
            if(intersect_list[i].y > dim_y/2)
                cor_count[1]++;
            else
                cor_count[2]++;
        }
        var_x = var_x + abs(intersect_list[i].x - dim_x/2);
        var_y = var_y + abs(intersect_list[i].y - dim_y/2);
        printf("(%d, %d)\r\n", intersect_list[i].x, intersect_list[i].y);
    }
    printf("Variation in x: %d, in y: %d\r\n", var_x, var_y);
    
    int most_cor_pt = 0;
    printf("Intersection points distribution: \r\n");
    for(i = 0; i < 4; i++)
    {
        if(cor_count[i] > cor_count[most_cor_pt])
            most_cor_pt = i;
        printf("Cor %d: %d\r\n", i+1, cor_count[i]);
    }
    
    int correct_vote = 0, revert_vote = 0;
    if(average_deg >= 0 && average_deg < 90)    // intersections are expected to be in II or IV
    {
        if(var_x < var_y)   // variance in x is smaller, I+II should > III+IV
        {
            if((cor_count[0]+cor_count[1]) > (cor_count[2]+cor_count[3]))
                correct_vote++;
            else
                revert_vote++;
        }
        else            // variance in y is smaller, II+III should > IV+I
        {
            if((cor_count[1]+cor_count[2]) > (cor_count[3]+cor_count[0]))
                correct_vote++;
            else
                revert_vote++;
        }
        if(correct_vote > revert_vote)
            printf("Original detected angle is correct.\r\n");
        else
        {
            average_deg = average_deg + 180;
            printf("Reverted angle is correct.\r\n");
        }
    }
    else if(average_deg >= 90 && average_deg < 180)    // intersections are expected to be in III or I
    {
        if(var_x < var_y)   // variance in x is smaller, I+II should < III+IV
        {
            if((cor_count[0]+cor_count[1]) < (cor_count[2]+cor_count[3]))
                correct_vote++;
            else
                revert_vote++;
        }
        else            // variance in y is smaller, II+III should > IV+I
        {
            if((cor_count[1]+cor_count[2]) > (cor_count[3]+cor_count[0]))
                correct_vote++;
            else
                revert_vote++;
        }
        if(correct_vote > revert_vote)
            printf("Original detected angle is correct.\r\n");
        else
        {
            average_deg = average_deg + 180;
            printf("Reverted angle is correct.\r\n");
        }
    }
    else if(average_deg >= 180 && average_deg < 270)    // intersections are expected to be in II or IV
    {
        if(var_x < var_y)   // variance in x is smaller, I+II should < III+IV
        {
            if((cor_count[0]+cor_count[1]) < (cor_count[2]+cor_count[3]))
                correct_vote++;
            else
                revert_vote++;
        }
        else            // variance in y is smaller, II+III should < IV+I
        {
            if((cor_count[1]+cor_count[2]) < (cor_count[3]+cor_count[0]))
                correct_vote++;
            else
                revert_vote++;
        }
        if(correct_vote > revert_vote)
            printf("Original detected angle is correct.\r\n");
        else
        {
            average_deg = average_deg - 180;
            printf("Reverted angle is correct.\r\n");
        }
    }
    else if(average_deg >= 270 && average_deg < 360)    // intersections are expected to be in III or I
    {
        if(var_x < var_y)   // variance in x is smaller, I+II should > III+IV
        {
            if((cor_count[0]+cor_count[1]) > (cor_count[2]+cor_count[3]))
                correct_vote++;
            else
                revert_vote++;
        }
        else            // variance in y is smaller, II+III should < IV+I
        {
            if((cor_count[1]+cor_count[2]) < (cor_count[3]+cor_count[0]))
                correct_vote++;
            else
                revert_vote++;
        }
        if(correct_vote > revert_vote)
            printf("Original detected angle is correct.\r\n");
        else
        {
            average_deg = average_deg - 180;
            printf("Reverted angle is correct.\r\n");
        }
    }
    else
    {
        printf("Detection failed. The result may not be correct.\r\n");
    }
    return average_deg;
}


int main(void)
{
    pc.baud(115200);
    int i, j;
    led_red = 0;
    led_green = 0;
    
    FMC_SDRAM_CommandTypeDef SDRAMCommandStructure;
    SDRAMCommandStructure.CommandTarget          = FMC_SDRAM_CMD_TARGET_BANK2;

    pc.printf("\r\nSDRAM demo started\r\n");
    
    Image src, dst, bin;
    if (bmpread("/sd/1.bmp", &src) == 0)
        return 0;
    
    // smooth the picture using GaussianBlur: void GaussianBlur(Image *src, Image *dst, int width, double sigma)
    myGaussianBlur(&src, &dst, 5, 1.5);
    myCanny(&dst, &bin, 20, 60);
    line_polar my_line_list[10];
    int line_count;
    line_count = myHoughLines(&bin, my_line_list, 40, line_limit);
    if(line_count >= line_limit)
        line_count = line_limit;
        
    trans_line line_list[line_limit];
    unsigned int m = src.rows, n = src.cols;
    for(i = 0; i < line_count; i++)  // combined for loops at #64, #73, #83, #92, #101
    {
        line_list[i].rho = my_line_list[i].rho;
        line_list[i].theta = my_line_list[i].theta * (M_PI / 180);
        line_list[i].strength = my_line_list[i].strength;
        line_list[i].a = cos(line_list[i].theta);
        line_list[i].b = sin(line_list[i].theta);
        line_list[i].x0 = line_list[i].a * line_list[i].rho;
        line_list[i].y0 = line_list[i].b * line_list[i].rho;
        line_list[i].ptvec1.x = Round(line_list[i].x0 + (m+n)*(-line_list[i].b));
        line_list[i].ptvec1.y = Round(line_list[i].y0 + (m+n)*line_list[i].a);
        line_list[i].ptvec2.x = Round(line_list[i].x0 - (m+n)*(-line_list[i].b));
        line_list[i].ptvec2.y = Round(line_list[i].y0 - (m+n)*line_list[i].a);
    }

    double angle = LineAnalysis(line_list, line_count, src.cols, src.rows);
    double value = (angle - 42) / 270 * 50;
    pc.printf("Detected angle = %.2f\r\nEstimated value = %.2f\r\n", angle, value);
}
