Text-to-speech through a DAC to an audio amplifier/speaker

Dependencies:   mbed

Embed: (wiki syntax)

« Back to documentation index

Show/hide line numbers TTS.cpp Source File

TTS.cpp

00001 /**
00002  * Text To Speech synthesis library
00003  * Copyright (c) 2008 Clive Webster.  All rights reserved.
00004  *
00005  * Nov. 29th 2009 - Modified to work with Arduino by Gabriel Petrut:
00006  * The Text To Speech library uses Timer1 to generate the PWM
00007  * output on digital pin 10. The output signal needs to be fed
00008  * to an RC filter then through an amplifier to the speaker.
00009  * http://www.tehnorama.ro/minieric-modulul-de-control-si-sinteza-vocala/
00010  *
00011  * Modified to allow use of different PWM pins by Stephen Crane.
00012  * Modified for Timer5 on Arduino Mega2560 by Peter Dambrowsky.
00013  */
00014 
00015 #include "TTS.h"
00016 
00017 AnalogOut dac(DACpin);
00018 
00019 // Random number seed
00020 static byte seed0;
00021 static byte seed1;
00022 static byte seed2;
00023 
00024 static char phonemes[128];
00025 static char modifier[128];  // must be same size as 'phonemes'
00026 static char g_text[128];
00027 
00028 static byte defaultPitch = 7;
00029 
00030 // Lookup user specified pitch changes
00031 static const byte PROGMEM PitchesP[] = { 1, 2, 4, 6, 8, 10, 13, 16 };
00032 
00033 /**
00034  * Find the single character 'token' in 'vocab'
00035  * and append its phonemes to dest[x]
00036  */
00037 static int copyToken(char token, char *dest, int x, const VOCAB * vocab)
00038 {
00039     for (unsigned int ph = 0; ph < numVocab; ph++) {
00040         const char *txt = (const char *) pgm_read_word(&vocab[ph].txt);
00041         if (pgm_read_byte(&txt[0]) == token && pgm_read_byte(&txt[1]) == 0) {
00042             const char *src =
00043                 (const char *) pgm_read_word(&vocab[ph].phoneme);
00044             while (pgm_read_byte(src)) {
00045                 dest[x++] = pgm_read_byte(src);
00046                 src++;
00047             }
00048             break;
00049         }
00050     }
00051     return x;
00052 }
00053 
00054 static byte whitespace(char c)
00055 {
00056     return (c == 0 || c == ' ' || c == ',' || c == '.' || c == '?'
00057             || c == '\'' || c == '!' || c == ':' || c == '/');
00058 }
00059 
00060 /**
00061  * Enter:
00062  * src => English text in upper case
00063  * vocab => VOCAB array
00064  * dest => address to return result
00065  * return 1 if ok, or 0 if error
00066  */
00067 static int textToPhonemes(const char *src, const VOCAB * vocab, char *dest)
00068 {
00069     int outIndex = 0;       // Current offset into dest
00070     int inIndex = -1;       // Starts at -1 so that a leading space is assumed
00071 
00072     while (inIndex == -1 || src[inIndex]) { // until end of text
00073         int maxMatch = 0;   // Max chars matched on input text
00074         int numOut = 0;     // Number of characters copied to output stream for the best match
00075         boolean endsInWhiteSpace = FALSE;
00076         int maxWildcardPos = 0;
00077 
00078         // Get next phoneme, P2
00079         for (unsigned int ph = 0; ph < numVocab; ph++) {
00080             int y, x;
00081             char wildcard = 0;  // modifier
00082             int wildcardInPos = 0;
00083             boolean hasWhiteSpace = FALSE;
00084             const char *text =
00085                 (const char *) pgm_read_word(&vocab[ph].txt);
00086             const char *phon =
00087                 (const char *) pgm_read_word(&vocab[ph].phoneme);
00088 
00089             for (y = 0;; y++) {
00090                 char nextVocabChar = pgm_read_byte(&text[y]);
00091                 char nextCharIn =
00092                     (y + inIndex == -1) ? ' ' : src[y + inIndex];
00093                 if (nextCharIn >= 'a' && nextCharIn <= 'z')
00094                     nextCharIn = nextCharIn - 'a' + 'A';
00095 
00096                 if (nextVocabChar == '#' && nextCharIn >= 'A'
00097                         && nextCharIn <= 'Z') {
00098                     wildcard = nextCharIn;  // The character equivalent to the '#'
00099                     wildcardInPos = y;
00100                     continue;
00101                 }
00102 
00103                 if (nextVocabChar == '_') {
00104                     // try to match against a white space
00105                     hasWhiteSpace = TRUE;
00106                     if (whitespace(nextCharIn))
00107                         continue;
00108                     y--;
00109                     break;
00110                 }
00111                 // check for end of either string
00112                 if (nextVocabChar == 0 || nextCharIn == 0)
00113                     break;
00114 
00115                 if (nextVocabChar != nextCharIn)
00116                     break;
00117             }
00118 
00119             // See if its the longest complete match so far
00120             if (y <= maxMatch || pgm_read_byte(&text[y]))
00121                 continue;
00122 
00123             // This is the longest complete match
00124             maxMatch = y;
00125             maxWildcardPos = 0;
00126             x = outIndex;   // offset into phoneme return data
00127 
00128             // Copy the matching phrase changing any '#' to the phoneme for the wildcard
00129             for (y = 0;; y++) {
00130                 char c = pgm_read_byte(&phon[y]);
00131                 if (c == 0)
00132                     break;
00133                 if (c == '#') {
00134                     if (pgm_read_byte(&phon[y + 1]) == 0) {
00135                         // replacement ends in wildcard
00136                         maxWildcardPos = wildcardInPos;
00137                     } else {
00138                         x = copyToken(wildcard, dest, x, vocab);    // Copy the phonemes for the wildcard character
00139                     }
00140                 } else {
00141                     dest[x++] = c;
00142                 }
00143             }
00144             dest[x] = 0;
00145             endsInWhiteSpace = hasWhiteSpace;
00146 
00147             // 14
00148             numOut = x - outIndex;  // The number of bytes added
00149         }
00150         // 15 - end of vocab table
00151 
00152         // 16
00153         if (endsInWhiteSpace)
00154             maxMatch--;
00155 
00156         // 17
00157         if (maxMatch == 0) {
00158             //loggerP(PSTR("Mistake in SAY, no token for "));
00159             //logger(&src[inIndex]);
00160             //loggerCRLF();
00161             return 0;
00162         }
00163         // 20
00164         outIndex += numOut;
00165         if (outIndex > 128 - 16) {
00166             //loggerP(PSTR("Mistake in SAY, text too long\n"));
00167             return 0;
00168         }
00169         // 21
00170         inIndex += (maxWildcardPos > 0) ? maxWildcardPos : maxMatch;
00171     }
00172     return 1;
00173 }
00174 
00175 /**
00176  * Convert phonemes to data string
00177  * Enter: textp = phonemes string
00178  * Return: phonemes = string of sound data
00179  *     modifier = 2 bytes per sound data
00180  */
00181 static int phonemesToData(const char *textp, const PHONEME * phoneme)
00182 {
00183     unsigned int phonemeOut = 0;    // offset into the phonemes array
00184     unsigned int modifierOut = 0;   // offset into the modifiers array
00185     unsigned int L81 = 0;   // attenuate
00186     unsigned int L80 = 16;
00187 
00188     while (*textp) {
00189         // P20: Get next phoneme
00190         boolean anyMatch = FALSE;
00191         int longestMatch = 0;
00192         int numOut = 0;     // The number of bytes copied to the output for the longest match
00193 
00194         // Get next phoneme, P2
00195         for (unsigned int ph = 0; ph < numPhoneme; ph++) {
00196             int numChars;
00197 
00198             // Locate start of next phoneme
00199             const char *ph_text =
00200                 (const char *) pgm_read_word(&phoneme[ph].txt);
00201 
00202             // Set 'numChars' to the number of characters
00203             // that we match against this phoneme
00204             for (numChars = 0; textp[numChars]; numChars++) {
00205 
00206                 // get next input character and make lower case
00207                 char nextChar = textp[numChars];
00208                 if (nextChar >= 'A' && nextChar <= 'Z')
00209                     nextChar = nextChar - 'A' + 'a';
00210 
00211                 if (nextChar != pgm_read_byte(&ph_text[numChars]))
00212                     break;
00213             }
00214 
00215             // if not the longest match so far then ignore
00216             if (numChars <= longestMatch)
00217                 continue;
00218 
00219             // partial phoneme match
00220             if (pgm_read_byte(&ph_text[numChars]))
00221                 continue;
00222 
00223             // P7: we have matched the whole phoneme
00224             longestMatch = numChars;
00225 
00226             // Copy phoneme data to 'phonemes'
00227             const char *ph_ph =
00228                 (const char *) pgm_read_word(&phoneme[ph].phoneme);
00229             for (numOut = 0; pgm_read_byte(&ph_ph[numOut]); numOut++)
00230                 phonemes[phonemeOut + numOut] =
00231                     pgm_read_byte(&ph_ph[numOut]);
00232 
00233             L81 = pgm_read_byte(&phoneme[ph].attenuate) + '0';
00234             anyMatch = TRUE;    // phoneme match found
00235 
00236             modifier[modifierOut] = -1;
00237             modifier[modifierOut + 1] = 0;
00238 
00239             // Get char from text after the phoneme and test if it is a numeric
00240             if (textp[longestMatch] >= '0' && textp[longestMatch] <= '9') {
00241                 // Pitch change requested
00242                 modifier[modifierOut] =
00243                     pgm_read_byte(&PitchesP[textp[longestMatch] - '1']);
00244                 modifier[modifierOut + 1] = L81;
00245                 longestMatch++;
00246             }
00247             // P10
00248             if (L81 != '0' && L81 != L80 && modifier[modifierOut] >= 0) {
00249                 modifier[modifierOut - 2] = modifier[modifierOut];
00250                 modifier[modifierOut - 1] = '0';
00251                 continue;
00252             }
00253             // P11
00254             if ((textp[longestMatch - 1] | 0x20) == 0x20) {
00255                 // end of input string or a space
00256                 modifier[modifierOut] =
00257                     (modifierOut == 0) ? 16 : modifier[modifierOut - 2];
00258             }
00259         }           // next phoneme
00260 
00261         // p13
00262         L80 = L81;
00263         if (longestMatch == 0 && !anyMatch) {
00264             //loggerP(PSTR("Mistake in speech at "));
00265             //logger(textp);
00266             //loggerCRLF();
00267             return 0;
00268         }
00269         // Move over the bytes we have copied to the output
00270         phonemeOut += numOut;
00271 
00272         if (phonemeOut > sizeof(phonemes) - 16) {
00273             //loggerP(PSTR("Line too long\n"));
00274             return 0;
00275         }
00276         // P16
00277 
00278         // Copy the modifier setting to each sound data element for this phoneme
00279         if (numOut > 2)
00280             for (int count = 0; count != numOut; count += 2) {
00281                 modifier[modifierOut + count + 2] =
00282                     modifier[modifierOut + count];
00283                 modifier[modifierOut + count + 3] = 0;
00284             }
00285         modifierOut += numOut;
00286 
00287         //p21
00288         textp += longestMatch;
00289     }
00290 
00291     phonemes[phonemeOut++] = 'z';
00292     phonemes[phonemeOut++] = 'z';
00293     phonemes[phonemeOut++] = 'z';
00294     phonemes[phonemeOut++] = 'z';
00295 
00296     while (phonemeOut < sizeof(phonemes))
00297         phonemes[phonemeOut++] = 0;
00298 
00299     while (modifierOut < sizeof(modifier)) {
00300         modifier[modifierOut++] = -1;
00301         modifier[modifierOut++] = 0;
00302     }
00303 
00304     return 1;
00305 }
00306 
00307 /*
00308  * A delay loop that doesn't change with different optimisation settings
00309  */
00310 
00311 
00312 static void pause(byte delays)
00313 {
00314     wait_us(delays*6);
00315 }
00316 
00317 static void delay2(byte d)
00318 {
00319     wait_us(d*3127);
00320 }
00321 
00322 /*
00323  * Generate a random number
00324  */
00325 static byte random2(void)
00326 {
00327     byte tmp = (seed0 & 0x48) + 0x38;
00328     seed0 <<= 1;
00329     if (seed1 & 0x80)
00330         seed0++;
00331     seed1 <<= 1;
00332     if (seed2 & 0x80)
00333         seed1++;
00334     seed2 <<= 1;
00335     if (tmp & 0x40)
00336         seed2++;
00337     return seed0;
00338 }
00339 
// NOTE(review): 'pin' is not read or written anywhere in the visible source
static int pin;

/*
 * Called when sayPhonemes() has finished. Currently does nothing; the
 * DAC reset below is disabled.
 */
static void soundOff(void)
{
    //dac.write(0);
}
00346 
00347 #define PWM_TOP (1200/2)
00348 
00349 //https://sites.google.com/site/qeewiki/books/avr-guide/pwm-on-the-atmega328
00350 static void soundOn(void)
00351 {
00352     // dac.write(0);
00353 
00354     // initialise random number seed
00355     seed0 = 0xecu;
00356     seed1 = 7;
00357     seed2 = 0xcfu;
00358 }
00359 
00360 // Logarithmic scale
00361 //static const int16_t PROGMEM Volume[8] =
00362 //{ 0, PWM_TOP * 0.01, PWM_TOP * 0.02, PWM_TOP * 0.03, PWM_TOP * 0.06,
00363 //PWM_TOP * 0.12, PWM_TOP * 0.25, PWM_TOP * 0.5 };
00364 
00365 // Linear scale
00366 static const int16_t PROGMEM Volume[8] = {
00367     0, (uint16_t)(PWM_TOP * 0.07), (uint16_t)(PWM_TOP * 0.14), (uint16_t)(PWM_TOP * 0.21), (uint16_t)(PWM_TOP * 0.29),
00368     (uint16_t)(PWM_TOP * 0.36), (uint16_t)(PWM_TOP * 0.43), (uint16_t)(PWM_TOP * 0.5)
00369 };
00370 
00371 static void sound(byte b)
00372 {
00373     // Update PWM volume
00374     b = (b & 15);
00375     dac.write(0.5*b/16.);
00376 }
00377 
00378 static byte playTone(byte soundNum, byte soundPos, char pitch1,
00379                      char pitch2, byte count, byte volume)
00380 {
00381     const byte *soundData = &SoundData[soundNum * 0x40];
00382     while (count-- > 0) {
00383         byte s = pgm_read_byte(&soundData[soundPos & 0x3fu]);
00384         sound((byte) (s & volume));
00385         pause(pitch1);
00386         sound((byte) ((s >> 4) & volume));
00387         pause(pitch2);
00388 
00389         soundPos++;
00390     }
00391     return soundPos & 0x3fu;
00392 }
00393 
00394 static void play(byte duration, byte soundNumber)
00395 {
00396     while (duration--)
00397         playTone(soundNumber, random2(), 7, 7, 10, 15);
00398 }
00399 
00400 /******************************************************************************
00401  * User API
00402  ******************************************************************************/
00403 TTS::TTS()
00404 {
00405 }
00406 
00407 void TTS::setPitch(byte pitch)
00408 {
00409     defaultPitch = pitch;
00410 }
00411 
00412 byte TTS::getPitch(void)
00413 {
00414     return defaultPitch;
00415 }
00416 
00417 /*
00418  * Speak a string of phonemes
00419  */
00420 void TTS::sayPhonemes(const char *textp)
00421 {
00422     byte phonemeIn,     // offset into text
00423          byte2, modifierIn,      // offset into stuff in modifier
00424          punctuationPitchDelta;  // change in pitch due to fullstop or question mark
00425     int8_t byte1;
00426     char phoneme;
00427     const SOUND_INDEX *soundIndex;
00428     byte sound1Num;     // Sound data for the current phoneme
00429     byte sound2Num;     // Sound data for the next phoneme
00430     byte sound2Stop;        // Where the second sound should stop
00431     char pitch1;        // pitch for the first sound
00432     char pitch2;        // pitch for the second sound
00433     short i;
00434     byte sound1Duration;    // the duration for sound 1
00435 
00436     if (phonemesToData(textp, s_phonemes)) {
00437         // phonemes has list of sound bytes
00438         soundOn();
00439 
00440         // _630C
00441         byte1 = 0;
00442         punctuationPitchDelta = 0;
00443 
00444         // Q19
00445         for (phonemeIn = 0, modifierIn = 0; phonemes[phonemeIn];
00446                 phonemeIn += 2, modifierIn += 2) {
00447             byte duration;  // duration from text line
00448             byte SoundPos;  // offset into sound data
00449             byte fadeSpeed = 0;
00450 
00451             phoneme = phonemes[phonemeIn];
00452             if (phoneme == 'z') {
00453                 delay2(15);
00454                 continue;
00455             } else if (phoneme == '#') {
00456                 continue;
00457             } else {
00458 
00459                 // Collect info on sound 1
00460                 soundIndex = &SoundIndex[phoneme - 'A'];
00461                 sound1Num = pgm_read_byte(&soundIndex->SoundNumber);
00462                 byte1 = pgm_read_byte(&soundIndex->byte1);
00463                 byte2 = pgm_read_byte(&soundIndex->byte2);
00464 
00465                 duration = phonemes[phonemeIn + 1] - '0';   // Get duration from the input line
00466                 if (duration != 1)
00467                     duration <<= 1;
00468 
00469                 duration += 6;  // scaled duration from the input line (at least 6)
00470                 sound2Stop = 0x40 >> 1;
00471 
00472                 pitch1 = modifier[modifierIn];
00473                 if (modifier[modifierIn + 1] == 0 || pitch1 == -1) {
00474                     pitch1 = 10;
00475                     duration -= 6;
00476                 } else if (modifier[modifierIn + 1] == '0'
00477                            || duration == 6) {
00478                     duration -= 6;
00479                 }
00480                 // q8
00481                 pitch2 = modifier[modifierIn + 2];
00482                 if (modifier[modifierIn + 3] == 0 || pitch2 == -1)
00483                     pitch2 = 10;
00484 
00485                 // q10
00486                 if (byte1 < 0) {
00487                     sound1Num = 0;
00488                     random2();
00489                     sound2Stop = (0x40 >> 1) + 2;
00490                 } else {
00491                     // is positive
00492                     if (byte1 == 2) {
00493                         // 64A4
00494                         // Make a white noise sound !
00495                         byte volume = (duration == 6) ? 15 : 1; // volume mask
00496                         for (duration <<= 2; duration > 0; duration--) {
00497                             playTone(sound1Num, random2(), 8, 12, 11,
00498                                      volume);
00499                             // Increase the volume
00500                             if (++volume == 16)
00501                                 volume = 15;    // full volume from now on
00502                         }
00503                         continue;
00504 
00505                     } else {
00506                         // q11
00507                         if (byte1)
00508                             delay2(25);
00509                     }
00510                 }
00511             }
00512 
00513             // 6186
00514             pitch1 += defaultPitch + punctuationPitchDelta;
00515             if (pitch1 < 1)
00516                 pitch1 = 1;
00517 
00518             pitch2 += defaultPitch + punctuationPitchDelta;
00519             if (pitch2 < 1)
00520                 pitch2 = 1;
00521 
00522             // get next phoneme
00523             phoneme = phonemes[phonemeIn + 2];
00524 
00525             if (phoneme == 0 || phoneme == 'z') {
00526                 if (duration == 1)
00527                     delay2(60);
00528                 phoneme = 'a';  // change to a pause
00529             } else {
00530                 // s6
00531                 if (byte2 != 1)
00532                     byte2 =
00533                         (byte2 +
00534                          pgm_read_byte(&SoundIndex[phoneme - 'A'].byte2))
00535                         >> 1;
00536 
00537                 if (byte1 < 0
00538                         || pgm_read_byte(&SoundIndex[phoneme - 'A'].byte1))
00539                     phoneme = 'a';  // change to a pause
00540             }
00541 
00542             // S10
00543             sound2Num =
00544                 pgm_read_byte(&SoundIndex[phoneme - 'A'].SoundNumber);
00545 
00546             sound1Duration = 0x80;  // play half of sound 1
00547             if (sound2Num == sound1Num)
00548                 byte2 = duration;
00549 
00550             // S11
00551             if ((byte2 >> 1) == 0) {
00552                 sound1Duration = 0xff;  // play all of sound 1
00553             } else {
00554                 // The fade speed between the two sounds
00555                 fadeSpeed = (sound1Duration + (byte2 >> 1)) / byte2;
00556 
00557                 if (duration == 1) {
00558                     sound2Stop = 0x40;  // dont play sound2
00559                     sound1Duration = 0xff;  // play all of sound 1
00560                     pitch1 = 12;
00561                 }
00562             }
00563 
00564             SoundPos = 0;
00565             do {
00566                 byte sound1Stop = (sound1Duration >> 2) & 0x3fu;
00567                 byte sound1End = sound1Stop;
00568                 if (sound2Stop < sound1End) sound1End = sound2Stop;  // min
00569 
00570                 if (sound1Stop)
00571                     SoundPos =
00572                         playTone(sound1Num, SoundPos, pitch1, pitch1,
00573                                  sound1End, 15);
00574 
00575                 // s18
00576                 if (sound2Stop != 0x40) {
00577                     SoundPos =
00578                         playTone(sound2Num, SoundPos, pitch2, pitch2,
00579                                  (byte) (sound2Stop - sound1End), 15);
00580                 }
00581                 // s23
00582                 if (sound1Duration != 0xff && duration < byte2) {
00583                     // Fade sound1 out
00584                     sound1Duration -= fadeSpeed;
00585                     if (sound1Duration >= (byte) 0xC8)
00586                         sound1Duration = 0; // stop playing sound 1
00587                 }
00588                 // Call any additional sound
00589                 if (byte1 == -1)
00590                     play(3, 30);    // make an 'f' sound
00591                 else if (byte1 == -2)
00592                     play(3, 29);    // make an 's' sound
00593                 else if (byte1 == -3)
00594                     play(3, 33);    // make a 'th' sound
00595                 else if (byte1 == -4)
00596                     play(3, 27);    // make a 'sh' sound
00597 
00598             } while (--duration);
00599 
00600             // Scan ahead to find a '.' or a '?' as this will change the pitch
00601             punctuationPitchDelta = 0;
00602             for (i = 6; i > 0; i--) {
00603                 char next = phonemes[phonemeIn + (i * 2)];
00604                 if (next == 'i')
00605                     // found a full stop
00606                     punctuationPitchDelta = 6 - i;  // Lower the pitch
00607                 else if (next == 'h')
00608                     // found a question mark
00609                     punctuationPitchDelta = i - 6;  // Raise the pitch
00610             }
00611 
00612             if (byte1 == 1)
00613                 delay2(25);
00614         }           // next phoneme
00615     }
00616     soundOff();
00617 }
00618 
00619 /*
00620  * Speak an English command line of text
00621  */
00622 void TTS::sayText(const char *original)
00623 {
00624     unsigned int i;
00625     if (textToPhonemes(original, s_vocab, g_text)) {
00626         sayPhonemes(g_text);
00627     }
00628 }