#include <stddef.h>

#include "stm32f30x.h"
#include "video.h"

/* Machine word holding a run of horizontally adjacent cells, one cell per bit. */
typedef unsigned Unit;

/* Board size in cells; conway_demo() passes cols/bits and rows to step(). */
#define cols 416
#define rows 300
/*
 * Number of cells per Unit; col_step() shifts by (bits - 1) to carry a cell
 * across a word boundary.
 * NOTE(review): assumes `unsigned` is exactly 32 bits wide -- confirm for the
 * target toolchain.
 */
#define bits 32

/*
 * Table of char pointers defined elsewhere; conway_demo() uses fb[0] and
 * fb[rows] as the start of its two cell buffers.
 */
extern char *fb[VID_VSIZE*2];

/*
 * Output of a bitwise adder.  Every bit position of the Units is an
 * independent one-bit adder, so one AddResult carries a full word's worth of
 * separate 2-bit results.
 */
typedef struct {
  Unit sum;    /* low bit of the result at each bit position */
  Unit carry;  /* carry-out (high bit) of the result at each bit position */
} AddResult;

/*
 * Bitwise half adder: adds a and b independently at every bit position.
 * Writes the per-position sum bit and carry bit into *c.
 */
static void half_add(AddResult *c,Unit a, Unit b) {
  const Unit both_set = a & b;
  const Unit exactly_one_set = a ^ b;

  c->carry = both_set;
  c->sum = exactly_one_set;
}

/*
 * Bitwise full adder: adds a, b and c independently at every bit position.
 * Writes the per-position sum bit and carry bit into *d.
 */
static void full_add(AddResult *d, Unit a, Unit b, Unit c) {
  /* First stage: a + b. */
  const Unit ab_sum = a ^ b;
  const Unit ab_carry = a & b;

  /* Second stage: fold c into the first-stage sum.  At most one of the two
   * stages can carry at any bit position, so OR combines them. */
  d->sum = ab_sum ^ c;
  d->carry = ab_carry | (ab_sum & c);
}

/*
 * Compute the next generation for one Unit's worth of cells.
 *
 * Each argument is a three-word window over one row of the board: element 1
 * is the word at the column being updated, elements 0 and 2 are the words on
 * either side of it.  The side words matter only for the single bit each of
 * them contributes across the word boundary.
 *
 * Returns the new state of the cells held in current[1].
 */
static Unit col_step(Unit above[3],
                     Unit current[3],
                     Unit below[3]) {
  const unsigned top_bit = bits - 1;

  /*
   * For each row, the centre word shifted by one cell in either direction.
   * The bit vacated by the shift is filled from the adjacent window word, so
   * neighbours are seen across word boundaries.
   */
  const Unit above_lo   = (above[1] << 1)   | (above[0] >> top_bit);
  const Unit above_hi   = (above[1] >> 1)   | (above[2] << top_bit);
  const Unit current_lo = (current[1] << 1) | (current[0] >> top_bit);
  const Unit current_hi = (current[1] >> 1) | (current[2] << top_bit);
  const Unit below_lo   = (below[1] << 1)   | (below[0] >> top_bit);
  const Unit below_hi   = (below[1] >> 1)   | (below[2] << top_bit);

  /*
   * Per-row neighbour counts, two bits per cell.  The rows above and below
   * contribute three cells each; the cell's own row contributes only the two
   * cells beside it, because a cell is not its own neighbour.
   */
  AddResult row_above, row_same, row_below;
  full_add(&row_above, above_lo, above[1], above_hi);
  half_add(&row_same, current_lo, current_hi);
  full_add(&row_below, below_lo, below[1], below_hi);

  /*
   * Add the three row counts.  `ones.sum` is bit 0 of the total and
   * `twos_high.sum` is bit 1.  The higher bits are never materialised: any
   * carry out of bit 1 means the count is 4 or more, which clears the cell
   * whatever the exact value, so the two carries are kept only as a mask.
   */
  AddResult ones, twos_low, twos_high;
  full_add(&ones, row_above.sum, row_same.sum, row_below.sum);
  full_add(&twos_low, row_above.carry, ones.carry, row_below.carry);
  half_add(&twos_high, row_same.carry, twos_low.sum);

  const Unit four_or_more = twos_low.carry | twos_high.carry;

  /*
   * Niemiec's optimization: OR the cell's own state into bit 0 of the count.
   * The result is then set exactly when the count is 3, or the count is 2 and
   * the cell is currently alive.
   */
  return (ones.sum | current[1]) & twos_high.sum & ~four_or_more;
}



/*
 * Slide a three-word window one word along its row: the oldest word is
 * dropped and `next` becomes the newest.
 */
static void window_advance(Unit window[3], Unit next) {
  window[0] = window[1];
  window[1] = window[2];
  window[2] = next;
}

/* Fetch word `idx` of `row`; a NULL row stands for a row of dead cells. */
static Unit row_word(Unit const *row, unsigned idx) {
  return row ? row[idx] : 0;
}

/*
 * Compute one output row of `width` words (width must be at least 1).
 *
 * above_row / below_row may be NULL, meaning the row lies outside the board
 * and is treated as entirely dead.  Words to the left of the first column and
 * to the right of the last column are likewise treated as zero.
 */
static void step_row(Unit const *above_row,
                     Unit const *current_row,
                     Unit const *below_row,
                     Unit *next_row,
                     Unit width) {
  /* Prime each window so that the first advance puts word 0 in the centre. */
  Unit above[3]   = { 0, 0, row_word(above_row, 0) };
  Unit current[3] = { 0, 0, current_row[0] };
  Unit below[3]   = { 0, 0, row_word(below_row, 0) };
  unsigned x;

  /* Every column except the last, where word x + 1 still exists. */
  for (x = 0; x + 1 < width; ++x) {
    window_advance(above, row_word(above_row, x + 1));
    window_advance(current, current_row[x + 1]);
    window_advance(below, row_word(below_row, x + 1));
    next_row[x] = col_step(above, current, below);
  }

  /* Last column: there is no following word to fetch, so shift in zeros. */
  window_advance(above, 0);
  window_advance(current, 0);
  window_advance(below, 0);
  next_row[width - 1] = col_step(above, current, below);
}

/*
 * Advance the whole board by one generation.
 *
 * current_map  input board, `height` rows of `width` Units each, row-major.
 * next_map     output board with the same layout.  It must not overlap
 *              current_map: each input row is read again as the "above" row
 *              after the output row at the same index has been written.
 * width        row length in Units (not cells).
 * height       number of rows.
 *
 * Cells outside the board are treated as dead.  A board with zero width or
 * zero height is left untouched, and a single-row board is handled with both
 * the row above and the row below treated as dead.
 */
static void step(Unit const *current_map,
                 Unit *next_map,
                 Unit width,
                 Unit height) {
  unsigned y;

  /* Empty board: nothing to compute.  This also guarantees width >= 1 for
   * step_row(), whose last-column index would otherwise wrap around. */
  if (width == 0 || height == 0) {
    return;
  }

  for (y = 0; y < height; ++y) {
    unsigned offset = y * width;
    Unit const *row = current_map + offset;

    step_row(y > 0 ? row - width : NULL,
             row,
             y + 1 < height ? row + width : NULL,
             next_map + offset,
             width);
  }
}

/*
 * Defined elsewhere; conway_demo() stores 300*52 or 0 here after computing
 * each generation.
 * NOTE(review): presumably selects which of the two buffers the video code
 * scans out -- confirm against the video driver.
 */
extern unsigned fboffset;

void conway_demo() {
	Unit *current=(Unit *)fb[0];
	Unit *next=(Unit *)fb[rows];
	SPI1->CR1 |= SPI_FirstBit_LSB;

	while(1) {
		step(current,next,cols/bits,rows);
		fboffset=300*52;
		sysDelayMs(1);
		step(next,current,cols/bits,rows);
		fboffset=0;
		sysDelayMs(1);
	}
}

