Adafruit-RGB_matrix_Panel(32*16)

Dependencies:   Adafruit-GFX

Revision:
0:06d9443a018f
Child:
1:0078213d3fa4
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/RGBmatrixPanel.cpp	Fri May 23 15:08:14 2014 +0000
@@ -0,0 +1,536 @@
+#include "RGBmatrixPanel.h"
+#include "gamma.h"
+
+// Hard-wired AVR ports.  DATAPORT/DATADIR carry the six RGB data lines
+// (begin() configures the high six bits as outputs); SCLKPORT is the
+// port that the serial clock bit is toggled on.
+#define DATAPORT PORTD
+#define DATADIR  DDRD
+#define SCLKPORT PORTB
+
+// Number of bit planes per color channel (4 planes = 4/4/4 color).
+#define nPlanes 4
+
+// The fact that the display driver interrupt stuff is tied to the
+// singular Timer1 doesn't really take well to object orientation with
+// multiple RGBmatrixPanel instances.  The solution at present is to
+// allow instances, but only one is active at any given time, via its
+// begin() method.  The implementation is still incomplete in parts;
+// the prior active panel really should be gracefully disabled, and a
+// stop() method should perhaps be added...assuming multiple instances
+// are even an actual need.
+// Set by begin(); dereferenced by the Timer1 overflow ISR.
+static RGBmatrixPanel *activePanel = NULL;
+
+// Code common to both the 16x32 and 32x32 constructors.
+//
+//   rows              Number of multiplexed rows (panel height is 2X this).
+//   a, b, c           Row-address pin numbers.
+//   sclk, latch, oe   Serial clock, latch and output-enable pin numbers.
+//   dbuf              true to allocate a second (back) buffer.
+//
+// All pin/port state and scan counters are set up BEFORE the frame
+// buffer is allocated, so that a failed allocation never leaves those
+// members holding garbage.  On allocation failure both matrixbuff[]
+// entries are NULL; since they then compare equal, swapBuffers() treats
+// the panel as single-buffered and returns immediately rather than
+// waiting on an interrupt.
+void RGBmatrixPanel::init(uint8_t rows, uint8_t a, uint8_t b, uint8_t c, uint8_t sclk, uint8_t latch, uint8_t oe, bool dbuf)
+{
+    nRows = rows; // Number of multiplexed rows; actual height is 2X this
+
+    // Save pin numbers for use by begin() method later.
+    _a     = a;
+    _b     = b;
+    _c     = c;
+    _sclk  = sclk;
+    _latch = latch;
+    _oe    = oe;
+
+    // Look up port registers and pin masks ahead of time,
+    // avoids many slow digitalWrite() calls later.
+    sclkpin   = digitalPinToBitMask(sclk);
+    latport   = portOutputRegister(digitalPinToPort(latch));
+    latpin    = digitalPinToBitMask(latch);
+    oeport    = portOutputRegister(digitalPinToPort(oe));
+    oepin     = digitalPinToBitMask(oe);
+    addraport = portOutputRegister(digitalPinToPort(a));
+    addrapin  = digitalPinToBitMask(a);
+    addrbport = portOutputRegister(digitalPinToPort(b));
+    addrbpin  = digitalPinToBitMask(b);
+    addrcport = portOutputRegister(digitalPinToPort(c));
+    addrcpin  = digitalPinToBitMask(c);
+
+    // Scan state: start on the last plane/row so the first interrupt
+    // wraps around to plane 0, row 0.
+    plane     = nPlanes - 1;
+    row       = nRows   - 1;
+    swapflag  = false;
+    backindex = 0;     // Array index of back buffer
+
+    // Allocate and initialize matrix buffer:
+    int buffsize  = 32 * nRows * 3, // x3 = 3 bytes holds 4 planes "packed"
+        allocsize = dbuf ? (buffsize * 2) : buffsize;
+    matrixbuff[0] = static_cast<uint8_t *>(malloc(allocsize));
+    if(NULL == matrixbuff[0]) {
+        matrixbuff[1] = NULL; // Keep both entries in a defined state
+        return;
+    }
+    memset(matrixbuff[0], 0, allocsize);
+    // If not double-buffered, both buffers then point to the same address:
+    matrixbuff[1] = dbuf ? &matrixbuff[0][buffsize] : matrixbuff[0];
+}
+
+// 16x32 panel: the GFX canvas is 32 wide by 16 tall, scanned as eight
+// multiplexed rows addressed by lines a, b and c.
+RGBmatrixPanel::RGBmatrixPanel(uint8_t a, uint8_t b, uint8_t c, uint8_t sclk, uint8_t latch, uint8_t oe, bool dbuf)
+    : Adafruit_GFX(32, 16)
+{
+    const uint8_t multiplexedRows = 8;
+    init(multiplexedRows, a, b, c, sclk, latch, oe, dbuf);
+}
+
+// 32x32 panel: sixteen multiplexed rows, so a fourth address line (d)
+// is required in addition to a, b and c.
+RGBmatrixPanel::RGBmatrixPanel(uint8_t a, uint8_t b, uint8_t c, uint8_t d, uint8_t sclk, uint8_t latch, uint8_t oe, bool dbuf)
+    : Adafruit_GFX(32, 32)
+{
+    // Members specific to the 32x32 variant (independent of init()):
+    _d        = d;
+    addrdpin  = digitalPinToBitMask(d);
+    addrdport = portOutputRegister(digitalPinToPort(d));
+
+    // Everything shared with the 16x32 variant:
+    const uint8_t multiplexedRows = 16;
+    init(multiplexedRows, a, b, c, sclk, latch, oe, dbuf);
+}
+
+// Make this panel the active one: reset buffer pointers, configure the
+// control/address/data pins as outputs with their idle levels, then set
+// up Timer1 and enable its overflow interrupt, which drives
+// updateDisplay().
+void RGBmatrixPanel::begin(void)
+{
+
+    backindex   = 0;                         // Back buffer
+    buffptr     = matrixbuff[1 - backindex]; // -> front buffer
+    activePanel = this;                      // For interrupt handler
+
+    // Enable all comm & address pins as outputs, set default states:
+    pinMode(_sclk , OUTPUT);
+    SCLKPORT   &= ~sclkpin;  // Low
+    pinMode(_latch, OUTPUT);
+    *latport   &= ~latpin;   // Low
+    pinMode(_oe   , OUTPUT);
+    *oeport    |= oepin;     // High (disable output)
+    pinMode(_a    , OUTPUT);
+    *addraport &= ~addrapin; // Low
+    pinMode(_b    , OUTPUT);
+    *addrbport &= ~addrbpin; // Low
+    pinMode(_c    , OUTPUT);
+    *addrcport &= ~addrcpin; // Low
+    if(nRows > 8) {
+        // Fourth address line is only set up by the 32x32 constructor.
+        pinMode(_d  , OUTPUT);
+        *addrdport &= ~addrdpin; // Low
+    }
+
+    // The high six bits of the data port are set as outputs;
+    // Might make this configurable in the future, but not yet.
+    DATADIR  = B11111100;
+    DATAPORT = 0;
+
+    // Set up Timer1 for interrupt:
+    TCCR1A  = _BV(WGM11); // Mode 14 (fast PWM), OC1A off
+    TCCR1B  = _BV(WGM13) | _BV(WGM12) | _BV(CS10); // Mode 14, no prescale
+    ICR1    = 100;        // Initial period only; updateDisplay() rewrites ICR1 on every interrupt
+    TIMSK1 |= _BV(TOIE1); // Enable Timer1 interrupt
+    sei();                // Enable global interrupts
+}
+
+// Original RGBmatrixPanel library used 3/3/3 color.  Later version used
+// 4/4/4.  Then Adafruit_GFX (core library used across all Adafruit
+// display devices now) standardized on 5/6/5.  The matrix still operates
+// internally on 4/4/4 color, but all the graphics functions are written
+// to expect 5/6/5...the matrix lib will truncate the color components as
+// needed when drawing.  These next functions are mostly here for the
+// benefit of older code using one of the original color formats.
+
+// Expand a 3/3/3 RGB triple into Adafruit_GFX 5/6/5.  Each 3-bit
+// component fills the top of its field; the leftover low bits are
+// filled by repeating the component's upper bits.
+uint16_t RGBmatrixPanel::Color333(uint8_t r, uint8_t g, uint8_t b)
+{
+    // Result layout: RRRrrGGGgggBBBbb
+    const uint16_t red   = ((r & 0x7) << 13) | ((r & 0x6) << 10);
+    const uint16_t green = ((g & 0x7) <<  8) | ((g & 0x7) <<  5);
+    const uint16_t blue  = ((b & 0x7) <<  2) | ((b & 0x6) >>  1);
+    return red | green | blue;
+}
+
+// Expand a 4/4/4 RGB triple into Adafruit_GFX 5/6/5.  Each 4-bit
+// component fills the top of its field; the remaining low bit(s) are
+// filled by repeating the component's most significant bit(s).
+uint16_t RGBmatrixPanel::Color444(uint8_t r, uint8_t g, uint8_t b)
+{
+    // Result layout: RRRRrGGGGggBBBBb
+    const uint16_t red   = ((r & 0xF) << 12) | ((r & 0x8) << 8);
+    const uint16_t green = ((g & 0xF) <<  7) | ((g & 0xC) << 3);
+    const uint16_t blue  = ((b & 0xF) <<  1) | ((b & 0x8) >> 3);
+    return red | green | blue;
+}
+
+// Demote 8/8/8 to Adafruit_GFX 5/6/5
+// If no gamma flag passed, assume linear color
+//
+// 5/6/5 layout is RRRRRGGGGGGBBBBB: red occupies bits 15..11, green
+// bits 10..5, blue bits 4..0.  The top 5 bits of r (mask 0xF8) already
+// sit at bits 7..3, so they move up by 8; the top 6 bits of g (mask
+// 0xFC) sit at bits 7..2, so they move up by 3.  The casts keep the
+// shifts unsigned on targets where int is only 16 bits wide.
+uint16_t RGBmatrixPanel::Color888(uint8_t r, uint8_t g, uint8_t b)
+{
+    return (static_cast<uint16_t>(r & 0xF8) << 8) |
+           (static_cast<uint16_t>(g & 0xFC) << 3) |
+           (b >> 3);
+}
+
+// 8/8/8 -> (optional gamma) -> 5/6/5
+//
+// gflag true:  each component is looked up in the gamma table, which
+//              yields a 4-bit value, then promoted 4/4/4 -> 5/6/5.
+// gflag false: components are truncated directly to 5/6/5.
+uint16_t RGBmatrixPanel::Color888(
+    uint8_t r, uint8_t g, uint8_t b, bool gflag)
+{
+    if(gflag) { // Gamma-corrected color?
+        r = pgm_read_byte(&gamma[r]); // Gamma correction table maps
+        g = pgm_read_byte(&gamma[g]); // 8-bit input to 4-bit output
+        b = pgm_read_byte(&gamma[b]);
+        return (r << 12) | ((r & 0x8) << 8) | // 4/4/4 -> 5/6/5
+               (g <<  7) | ((g & 0xC) << 3) |
+               (b <<  1) | ( b        >> 3);
+    } // else linear (uncorrected) color
+
+    // Red's top 5 bits (7..3) go to 15..11: shift by 8.
+    // Green's top 6 bits (7..2) go to 10..5: shift by 3.
+    // Casts keep the shifts unsigned where int is 16 bits wide.
+    return (static_cast<uint16_t>(r & 0xF8) << 8) |
+           (static_cast<uint16_t>(g & 0xFC) << 3) |
+           (b >> 3);
+}
+
+// Convert hue/saturation/value to a packed 5/6/5 color.
+//   hue    Position on a 1536-step color wheel (six sextants of 256
+//          steps each); any value is accepted and wrapped into range.
+//   sat    Saturation, 0-255.
+//   val    Value (brightness), 0-255.
+//   gflag  true to run the result through the gamma table.
+uint16_t RGBmatrixPanel::ColorHSV(
+    long hue, uint8_t sat, uint8_t val, bool gflag)
+{
+    uint8_t r, g, b;
+
+    // Wrap hue: the modulo leaves -1535..+1535, the add makes it 0..1535.
+    hue %= 1536;
+    if(hue < 0) hue += 1536;
+
+    const uint8_t mix     = hue & 255; // Primary/secondary color mix
+    const uint8_t sextant = hue >> 8;  // Which sixth of the colorwheel
+
+    if(sextant == 0) {        // R to Y
+        r = 255;
+        g = mix;
+        b = 0;
+    } else if(sextant == 1) { // Y to G
+        r = 255 - mix;
+        g = 255;
+        b = 0;
+    } else if(sextant == 2) { // G to C
+        r = 0;
+        g = 255;
+        b = mix;
+    } else if(sextant == 3) { // C to B
+        r = 0;
+        g = 255 - mix;
+        b = 255;
+    } else if(sextant == 4) { // B to M
+        r = mix;
+        g = 0;
+        b = 255;
+    } else {                  // M to R
+        r = 255;
+        g = 0;
+        b = 255 - mix;
+    }
+
+    // Saturation: scale by sat+1 (range 1 to 256) so the division can
+    // be a shift by 8 instead of a costly divide; the wider type keeps
+    // the intermediate product from being truncated.
+    const uint16_t satScale = sat + 1;
+    r = 255 - (((255 - r) * satScale) >> 8);
+    g = 255 - (((255 - g) * satScale) >> 8);
+    b = 255 - (((255 - b) * satScale) >> 8);
+
+    // Value (brightness) and reduction to 4 bits per channel, using the
+    // same add-one-then-shift trick.
+    const uint16_t valScale = val + 1;
+    if(!gflag) {
+        // Linear (uncorrected): scale and drop straight to 4 bits.
+        r = (r * valScale) >> 12;
+        g = (g * valScale) >> 12;
+        b = (b * valScale) >> 12;
+    } else {
+        // Gamma table maps the scaled 8-bit value to a 4-bit output.
+        r = pgm_read_byte(&gamma[(r * valScale) >> 8]);
+        g = pgm_read_byte(&gamma[(g * valScale) >> 8]);
+        b = pgm_read_byte(&gamma[(b * valScale) >> 8]);
+    }
+
+    // 4/4/4 -> 5/6/5
+    return (r << 12) | ((r & 0x8) << 8) |
+           (g <<  7) | ((g & 0xC) << 3) |
+           (b <<  1) | ( b        >> 3);
+}
+
+// Store one pixel in the back buffer.
+//   x, y  Coordinates in the current rotation; out-of-range pixels are
+//         silently ignored.
+//   c     Color in 5/6/5 format; reduced to 4/4/4 before storing.
+// The pixel is written in the packed bit-plane layout that
+// updateDisplay() later clocks out to the panel.
+void RGBmatrixPanel::drawPixel(int16_t x, int16_t y, uint16_t c)
+{
+    uint8_t r, g, b, bit, limit, *ptr;
+    // Clip against the rotated dimensions before un-rotating.
+    if((x < 0) || (x >= _width) || (y < 0) || (y >= _height)) return;
+    // Map rotated coordinates back to raw panel coordinates.
+    switch(rotation) {
+        case 1:
+            swap(x, y);
+            x = _rawWidth  - 1 - x;
+            break;
+        case 2:
+            x = _rawWidth  - 1 - x;
+            y = _rawHeight - 1 - y;
+            break;
+        case 3:
+            swap(x, y);
+            y = _rawHeight - 1 - y;
+            break;
+    }
+
+    // Adafruit_GFX uses 16-bit color in 5/6/5 format, while matrix needs
+    // 4/4/4.  Pluck out relevant bits while separating into R,G,B:
+    r =  c >> 12;        // RRRRrggggggbbbbb
+    g = (c >>  7) & 0xF; // rrrrrGGGGggbbbbb
+    b = (c >>  1) & 0xF; // rrrrrggggggBBBBb
+    // Loop counter stuff
+    bit   = 2;             // Loops start at plane 1; plane 0 (bit value 1) is handled separately
+    limit = 1 << nPlanes;  // One past the highest plane's bit value
+
+    if(y < nRows) {
+        // Data for the upper half of the display is stored in the lower
+        // bits of each byte.
+        // NOTE(review): this branch indexes with _rawWidth while the
+        // lower-half branch uses WIDTH; presumably equal -- confirm.
+        ptr = &matrixbuff[backindex][y * _rawWidth * (nPlanes - 1) + x]; // Base addr
+        // Plane 0 is a tricky case -- its data is spread about,
+        // stored in least two bits not used by the other planes.
+        ptr[64] &= ~B00000011;            // Plane 0 R,G mask out in one op
+        if(r & 1) ptr[64] |=  B00000001;  // Plane 0 R: 64 bytes ahead, bit 0
+        if(g & 1) ptr[64] |=  B00000010;  // Plane 0 G: 64 bytes ahead, bit 1
+        if(b & 1) ptr[32] |=  B00000001;  // Plane 0 B: 32 bytes ahead, bit 0
+        else      ptr[32] &= ~B00000001;  // Plane 0 B unset; mask out
+        // The remaining three image planes are more normal-ish.
+        // Data is stored in the high 6 bits so it can be quickly
+        // copied to the DATAPORT register w/6 output lines.
+        for(; bit < limit; bit <<= 1) {
+            *ptr &= ~B00011100;             // Mask out R,G,B in one op
+            if(r & bit) *ptr |= B00000100;  // Plane N R: bit 2
+            if(g & bit) *ptr |= B00001000;  // Plane N G: bit 3
+            if(b & bit) *ptr |= B00010000;  // Plane N B: bit 4
+            ptr  += WIDTH;                  // Advance to next bit plane
+        }
+    } else {
+        // Data for the lower half of the display is stored in the upper
+        // bits, except for the plane 0 stuff, using 2 least bits.
+        ptr = &matrixbuff[backindex][(y - nRows) * WIDTH * (nPlanes - 1) + x];
+        *ptr &= ~B00000011;               // Plane 0 G,B mask out in one op
+        if(r & 1)  ptr[32] |=  B00000010; // Plane 0 R: 32 bytes ahead, bit 1
+        else       ptr[32] &= ~B00000010; // Plane 0 R unset; mask out
+        if(g & 1) *ptr     |=  B00000001; // Plane 0 G: bit 0
+        if(b & 1) *ptr     |=  B00000010; // Plane 0 B: bit 1
+        for(; bit < limit; bit <<= 1) {
+            *ptr &= ~B11100000;             // Mask out R,G,B in one op
+            if(r & bit) *ptr |= B00100000;  // Plane N R: bit 5
+            if(g & bit) *ptr |= B01000000;  // Plane N G: bit 6
+            if(b & bit) *ptr |= B10000000;  // Plane N B: bit 7
+            ptr  += WIDTH;                  // Advance to next bit plane
+        }
+    }
+}
+
+// Fill the whole back buffer with a single 5/6/5 color.
+void RGBmatrixPanel::fillScreen(uint16_t c)
+{
+    // Pure black and pure white set every bit of the frame buffer to the
+    // same value regardless of the odd bit packing, so only those two
+    // colors can take the memset shortcut.
+    const bool uniformBits = (c == 0x0000) || (c == 0xffff);
+
+    if(!uniformBits) {
+        // Any other color has to go through the generic per-pixel path.
+        Adafruit_GFX::fillScreen(c);
+        return;
+    }
+
+    memset(matrixbuff[backindex], c, 32 * nRows * 3);
+}
+
+// Expose the buffer that drawing currently targets, so callers can
+// read or write the packed frame data directly.
+uint8_t *RGBmatrixPanel::backBuffer()
+{
+    uint8_t *const drawTarget = matrixbuff[backindex];
+    return drawTarget;
+}
+
+// Present the back buffer.  Drawing always targets the "back" buffer;
+// this call requests that it become the "front" (displayed) buffer and
+// blocks until the interrupt handler has performed the exchange.
+//   copy == true   The newly shown contents are duplicated into the new
+//                  back buffer so they can be modified incrementally.
+//   copy == false  The new back buffer keeps the old front contents;
+//                  the caller should clear it or redraw every pixel.
+// Does nothing when double-buffering is not enabled.
+void RGBmatrixPanel::swapBuffers(bool copy)
+{
+    // Single-buffered: both entries alias the same memory, nothing to swap.
+    if(matrixbuff[0] == matrixbuff[1]) return;
+
+    // The exchange itself happens in the interrupt handler at the end of
+    // a complete refresh cycle, which avoids visible 'tearing'.  Raise
+    // the request flag, then poll until the handler lowers it again.
+    swapflag = true;
+    while(swapflag) {
+        delay(1);
+    }
+
+    if(copy) {
+        memcpy(matrixbuff[backindex], matrixbuff[1-backindex], 32 * nRows * 3);
+    }
+}
+
+// Print the back buffer to the Serial Monitor as a C array definition
+// that can be pasted into another sketch as a PROGMEM-embedded image.
+// Every dump uses the array name 'img', so rename it by hand when
+// combining several dumps.  The data can be loaded back into the
+// display with a pgm_read_byte() loop.
+void RGBmatrixPanel::dumpMatrix(void)
+{
+    const int total = 32 * nRows * 3;
+
+    Serial.print("\n\n"
+                 "#include <avr/pgmspace.h>\n\n"
+                 "static const uint8_t PROGMEM img[] = {\n  ");
+
+    int idx = 0;
+    while(idx < total) {
+        const uint8_t value = matrixbuff[backindex][idx];
+
+        // Always two hex digits: pad single-digit values with a zero.
+        Serial.print("0x");
+        if(value < 0x10) Serial.print('0');
+        Serial.print(value,HEX);
+
+        // Separator after every element but the last; line break after
+        // each group of eight.
+        const bool isLast = (idx == total - 1);
+        if(!isLast) {
+            const bool endOfLine = ((idx & 7) == 7);
+            if(endOfLine) Serial.print(",\n  ");
+            else          Serial.print(',');
+        }
+        ++idx;
+    }
+    Serial.println("\n};");
+}
+
+// -------------------- Interrupt handler stuff --------------------
+
+// Timer1 overflow interrupt: forwards to the active panel's refresh
+// routine.  activePanel is assigned in begin() before the interrupt is
+// enabled there.
+ISR(TIMER1_OVF_vect, ISR_BLOCK)   // ISR_BLOCK important -- see notes later
+{
+    activePanel->updateDisplay();   // Call refresh func for active display
+    // NOTE(review): TOV1 is normally a bit *number* rather than a mask in
+    // the AVR headers, in which case this OR changes nothing and the flag
+    // is presumably cleared by hardware on vector entry -- confirm against
+    // the datasheet before altering; frame timing may depend on it.
+    TIFR1 |= TOV1;                  // Clear Timer1 interrupt flag
+}
+
+// Two constants are used in timing each successive BCM interval.
+// These were found empirically, by checking the value of TCNT1 at
+// certain positions in the interrupt code.
+// CALLOVERHEAD is the number of CPU 'ticks' from the timer overflow
+// condition (triggering the interrupt) to the first line in the
+// updateDisplay() method.  It's then assumed (maybe not entirely 100%
+// accurately, but close enough) that a similar amount of time will be
+// needed at the opposite end, restoring regular program flow.
+// LOOPTIME is the number of 'ticks' spent inside the shortest data-
+// issuing loop (not actually a 'loop' because it's unrolled, but eh).
+// Both numbers are rounded up slightly to allow a little wiggle room
+// should different compilers produce slightly different results.
+#define CALLOVERHEAD 60   // Actual value measured = 56
+#define LOOPTIME     200  // Actual value measured = 188
+// The "on" time for bitplane 0 (with the shortest BCM interval) can
+// then be estimated as LOOPTIME + CALLOVERHEAD * 2.  Each successive
+// bitplane then doubles the prior amount of time.  We can then
+// estimate refresh rates from this:
+// 4 bitplanes = 320 + 640 + 1280 + 2560 = 4800 ticks per row.
+// 4800 ticks * 16 rows (for 32x32 matrix) = 76800 ticks/frame.
+// 16M CPU ticks/sec / 76800 ticks/frame = 208.33 Hz.
+// Actual frame rate will be slightly less due to work being done
+// during the brief "LEDs off" interval...it's reasonable to say
+// "about 200 Hz."  The 16x32 matrix only has to scan half as many
+// rows...so we could either double the refresh rate (keeping the CPU
+// load the same), or keep the same refresh rate but halve the CPU
+// load.  We opted for the latter.
+// Can also estimate CPU use: bitplanes 1-3 all use 320 ticks to
+// issue data (the increasing gaps in the timing intervals are then
+// available to other code), and bitplane 0 takes 920 ticks out of
+// the 2560 tick interval.
+// 320 * 3 + 920 = 1880 ticks spent in interrupt code, per row.
+// From prior calculations, about 4800 ticks happen per row.
+// CPU use = 1880 / 4800 = ~39% (actual use will be very slightly
+// higher, again due to code used in the LEDs off interval).
+// 16x32 matrix uses about half that CPU load.  CPU time could be
+// further adjusted by padding the LOOPTIME value, but refresh rates
+// will decrease proportionally, and 200 Hz is a decent target.
+
+// The flow of the interrupt can be awkward to grasp, because data is
+// being issued to the LED matrix for the *next* bitplane and/or row
+// while the *current* plane/row is being shown.  As a result, the
+// counter variables change between past/present/future tense in mid-
+// function...hopefully tenses are sufficiently commented.
+
+// Refresh step, called from the Timer1 overflow ISR.  Each call latches
+// the data shifted out on the previous call, programs the timer for how
+// long that data should stay lit, advances the plane/row counters
+// (performing any pending buffer swap at the end of a frame), and then
+// shifts out the data for the next plane.
+void RGBmatrixPanel::updateDisplay(void)
+{
+    uint8_t  i, tick, tock, *ptr;
+    uint16_t t, duration;
+
+    *oeport  |= oepin;  // Disable LED output during row/plane switchover
+    *latport |= latpin; // Latch data loaded during *prior* interrupt
+
+    // Calculate time to next interrupt BEFORE incrementing plane #.
+    // This is because duration is the display time for the data loaded
+    // on the PRIOR interrupt.  CALLOVERHEAD is subtracted from the
+    // result because that time is implicit between the timer overflow
+    // (interrupt triggered) and the initial LEDs-off line at the start
+    // of this method.
+    t = (nRows > 8) ? LOOPTIME : (LOOPTIME * 2); // Fewer rows: double the base interval
+    duration = ((t + CALLOVERHEAD * 2) << plane) - CALLOVERHEAD;
+
+    // Borrowing a technique here from Ray's Logic:
+    // www.rayslogic.com/propeller/Programming/AdafruitRGB/AdafruitRGB.htm
+    // This code cycles through all four planes for each scanline before
+    // advancing to the next line.  While it might seem beneficial to
+    // advance lines every time and interleave the planes to reduce
+    // vertical scanning artifacts, in practice with this panel it causes
+    // a green 'ghosting' effect on black pixels, a much worse artifact.
+
+    if(++plane >= nPlanes) {      // Advance plane counter.  Maxed out?
+        plane = 0;                  // Yes, reset to plane 0, and
+        if(++row >= nRows) {        // advance row counter.  Maxed out?
+            row     = 0;              // Yes, reset row counter, then...
+            if(swapflag == true) {    // Swap front/back buffers if requested
+                backindex = 1 - backindex;
+                swapflag  = false;    // Releases the wait loop in swapBuffers()
+            }
+            buffptr = matrixbuff[1-backindex]; // Reset into front buffer
+        }
+    } else if(plane == 1) {
+        // Plane 0 was loaded on prior interrupt invocation and is about to
+        // latch now, so update the row address lines before we do that:
+        if(row & 0x1)   *addraport |=  addrapin;
+        else            *addraport &= ~addrapin;
+        if(row & 0x2)   *addrbport |=  addrbpin;
+        else            *addrbport &= ~addrbpin;
+        if(row & 0x4)   *addrcport |=  addrcpin;
+        else            *addrcport &= ~addrcpin;
+        if(nRows > 8) {
+            if(row & 0x8) *addrdport |=  addrdpin;
+            else          *addrdport &= ~addrdpin;
+        }
+    }
+
+    // buffptr, being 'volatile' type, doesn't take well to optimization.
+    // A local register copy can speed some things up:
+    ptr = (uint8_t *)buffptr;
+
+    ICR1      = duration; // Set interval for next interrupt
+    TCNT1     = 0;        // Restart interrupt timer
+    *oeport  &= ~oepin;   // Re-enable output
+    *latport &= ~latpin;  // Latch down
+
+    // Record current state of SCLKPORT register, as well as a second
+    // copy with the clock bit set.  This makes the innermost data-
+    // pushing loops faster, as they can just set the PORT state and
+    // not have to load/modify/store bits every single time.  It's a
+    // somewhat rude trick that ONLY works because the interrupt
+    // handler is set ISR_BLOCK, halting any other interrupts that
+    // might otherwise also be twiddling the port at the same time
+    // (else this would clobber them).
+    tock = SCLKPORT;
+    tick = tock | sclkpin;
+
+    if(plane > 0) { // 188 ticks from TCNT1=0 (above) to end of function
+
+        // Planes 1-3 copy bytes directly from RAM to PORT without unpacking.
+        // The least 2 bits (used for plane 0 data) are presumed masked out
+        // by the port direction bits.
+
+        // A tiny bit of inline assembly is used; compiler doesn't pick
+        // up on opportunity for post-increment addressing mode.
+        // 5 instruction ticks per 'pew' = 160 ticks total
+#define pew asm volatile(                 \
+      "ld  __tmp_reg__, %a[ptr]+"    "\n\t"   \
+      "out %[data]    , __tmp_reg__" "\n\t"   \
+      "out %[clk]     , %[tick]"     "\n\t"   \
+      "out %[clk]     , %[tock]"     "\n"     \
+      :: [ptr]  "e" (ptr),                    \
+         [data] "I" (_SFR_IO_ADDR(DATAPORT)), \
+         [clk]  "I" (_SFR_IO_ADDR(SCLKPORT)), \
+         [tick] "r" (tick),                   \
+         [tock] "r" (tock));
+
+        // Loop is unrolled for speed (32 columns = 32 'pew's):
+        pew pew pew pew pew pew pew pew
+        pew pew pew pew pew pew pew pew
+        pew pew pew pew pew pew pew pew
+        pew pew pew pew pew pew pew pew
+
+        buffptr += 32; // Next plane's data follows 32 bytes on
+
+    } else { // 920 ticks from TCNT1=0 (above) to end of function
+
+        // Planes 1-3 (handled above) formatted their data "in place,"
+        // their layout matching that of the output PORT register (where
+        // 6 bits correspond to output data lines), maximizing throughput
+        // as no conversion or unpacking is needed.  Plane 0 then takes up
+        // the slack, with all its data packed into the 2 least bits not
+        // used by the other planes.  This works because the unpacking and
+        // output for plane 0 is handled while plane 3 is being displayed...
+        // because binary coded modulation is used (not PWM), that plane
+        // has the longest display interval, so the extra work fits.
+        for(i=0; i<32; i++) {
+            // Gather the two low bits from each of the row's three
+            // 32-byte plane blocks into the six data-line positions.
+            DATAPORT =
+                ( ptr[i]    << 6)         |
+                ((ptr[i+32] << 4) & 0x30) |
+                ((ptr[i+64] << 2) & 0x0C);
+            SCLKPORT = tick; // Clock bit set
+            SCLKPORT = tock; // Back to the recorded port state
+        }
+    }
+}