Important changes to repositories hosted on mbed.com
Mbed hosted mercurial repositories are deprecated and are due to be permanently deleted in July 2026.
To keep a copy of this software, download the repository Zip archive or clone it locally using Mercurial.
It is also possible to export all your personal repositories from the account settings page.
unicode.c
00001 /* 00002 * This file is part of the Micro Python project, http://micropython.org/ 00003 * 00004 * The MIT License (MIT) 00005 * 00006 * Copyright (c) 2013, 2014 Damien P. George 00007 * 00008 * Permission is hereby granted, free of charge, to any person obtaining a copy 00009 * of this software and associated documentation files (the "Software"), to deal 00010 * in the Software without restriction, including without limitation the rights 00011 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 00012 * copies of the Software, and to permit persons to whom the Software is 00013 * furnished to do so, subject to the following conditions: 00014 * 00015 * The above copyright notice and this permission notice shall be included in 00016 * all copies or substantial portions of the Software. 00017 * 00018 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 00019 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 00020 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 00021 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 00022 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 00023 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 00024 * THE SOFTWARE. 
00025 */ 00026 00027 #include <stdint.h> 00028 00029 #include "py/unicode.h" 00030 00031 // attribute flags 00032 #define FL_PRINT (0x01) 00033 #define FL_SPACE (0x02) 00034 #define FL_DIGIT (0x04) 00035 #define FL_ALPHA (0x08) 00036 #define FL_UPPER (0x10) 00037 #define FL_LOWER (0x20) 00038 #define FL_XDIGIT (0x40) 00039 00040 // shorthand character attributes 00041 #define AT_PR (FL_PRINT) 00042 #define AT_SP (FL_SPACE | FL_PRINT) 00043 #define AT_DI (FL_DIGIT | FL_PRINT | FL_XDIGIT) 00044 #define AT_AL (FL_ALPHA | FL_PRINT) 00045 #define AT_UP (FL_UPPER | FL_ALPHA | FL_PRINT) 00046 #define AT_LO (FL_LOWER | FL_ALPHA | FL_PRINT) 00047 #define AT_UX (FL_UPPER | FL_ALPHA | FL_PRINT | FL_XDIGIT) 00048 #define AT_LX (FL_LOWER | FL_ALPHA | FL_PRINT | FL_XDIGIT) 00049 00050 // table of attributes for ascii characters 00051 STATIC const uint8_t attr[] = { 00052 0, 0, 0, 0, 0, 0, 0, 0, 00053 0, AT_SP, AT_SP, AT_SP, AT_SP, AT_SP, 0, 0, 00054 0, 0, 0, 0, 0, 0, 0, 0, 00055 0, 0, 0, 0, 0, 0, 0, 0, 00056 AT_SP, AT_PR, AT_PR, AT_PR, AT_PR, AT_PR, AT_PR, AT_PR, 00057 AT_PR, AT_PR, AT_PR, AT_PR, AT_PR, AT_PR, AT_PR, AT_PR, 00058 AT_DI, AT_DI, AT_DI, AT_DI, AT_DI, AT_DI, AT_DI, AT_DI, 00059 AT_DI, AT_DI, AT_PR, AT_PR, AT_PR, AT_PR, AT_PR, AT_PR, 00060 AT_PR, AT_UX, AT_UX, AT_UX, AT_UX, AT_UX, AT_UX, AT_UP, 00061 AT_UP, AT_UP, AT_UP, AT_UP, AT_UP, AT_UP, AT_UP, AT_UP, 00062 AT_UP, AT_UP, AT_UP, AT_UP, AT_UP, AT_UP, AT_UP, AT_UP, 00063 AT_UP, AT_UP, AT_UP, AT_PR, AT_PR, AT_PR, AT_PR, AT_PR, 00064 AT_PR, AT_LX, AT_LX, AT_LX, AT_LX, AT_LX, AT_LX, AT_LO, 00065 AT_LO, AT_LO, AT_LO, AT_LO, AT_LO, AT_LO, AT_LO, AT_LO, 00066 AT_LO, AT_LO, AT_LO, AT_LO, AT_LO, AT_LO, AT_LO, AT_LO, 00067 AT_LO, AT_LO, AT_LO, AT_PR, AT_PR, AT_PR, AT_PR, 0 00068 }; 00069 00070 // TODO: Rename to str_get_char 00071 unichar utf8_get_char(const byte *s) { 00072 #if MICROPY_PY_BUILTINS_STR_UNICODE 00073 unichar ord = *s++; 00074 if (!UTF8_IS_NONASCII(ord)) return ord; 00075 ord &= 0x7F; 00076 for (unichar mask = 
0x40; ord & mask; mask >>= 1) { 00077 ord &= ~mask; 00078 } 00079 while (UTF8_IS_CONT(*s)) { 00080 ord = (ord << 6) | (*s++ & 0x3F); 00081 } 00082 return ord; 00083 #else 00084 return *s; 00085 #endif 00086 } 00087 00088 // TODO: Rename to str_next_char 00089 const byte *utf8_next_char(const byte *s) { 00090 #if MICROPY_PY_BUILTINS_STR_UNICODE 00091 ++s; 00092 while (UTF8_IS_CONT(*s)) { 00093 ++s; 00094 } 00095 return s; 00096 #else 00097 return s + 1; 00098 #endif 00099 } 00100 00101 mp_uint_t utf8_ptr_to_index(const byte *s, const byte *ptr) { 00102 mp_uint_t i = 0; 00103 while (ptr > s) { 00104 if (!UTF8_IS_CONT(*--ptr)) { 00105 i++; 00106 } 00107 } 00108 00109 return i; 00110 } 00111 00112 // TODO: Rename to str_charlen 00113 mp_uint_t unichar_charlen(const char *str, mp_uint_t len) { 00114 #if MICROPY_PY_BUILTINS_STR_UNICODE 00115 mp_uint_t charlen = 0; 00116 for (const char *top = str + len; str < top; ++str) { 00117 if (!UTF8_IS_CONT(*str)) { 00118 ++charlen; 00119 } 00120 } 00121 return charlen; 00122 #else 00123 return len; 00124 #endif 00125 } 00126 00127 // Be aware: These unichar_is* functions are actually ASCII-only! 
00128 bool unichar_isspace(unichar c) { 00129 return c < 128 && (attr[c] & FL_SPACE) != 0; 00130 } 00131 00132 bool unichar_isalpha(unichar c) { 00133 return c < 128 && (attr[c] & FL_ALPHA) != 0; 00134 } 00135 00136 bool unichar_isprint(unichar c) { 00137 return c < 128 && (attr[c] & FL_PRINT) != 0; 00138 } 00139 00140 bool unichar_isdigit(unichar c) { 00141 return c < 128 && (attr[c] & FL_DIGIT) != 0; 00142 } 00143 00144 bool unichar_isxdigit(unichar c) { 00145 return c < 128 && (attr[c] & FL_XDIGIT) != 0; 00146 } 00147 00148 bool unichar_isident(unichar c) { 00149 return c < 128 && ((attr[c] & (FL_ALPHA | FL_DIGIT)) != 0 || c == '_'); 00150 } 00151 00152 bool unichar_isupper(unichar c) { 00153 return c < 128 && (attr[c] & FL_UPPER) != 0; 00154 } 00155 00156 bool unichar_islower(unichar c) { 00157 return c < 128 && (attr[c] & FL_LOWER) != 0; 00158 } 00159 00160 unichar unichar_tolower(unichar c) { 00161 if (unichar_isupper(c)) { 00162 return c + 0x20; 00163 } 00164 return c; 00165 } 00166 00167 unichar unichar_toupper(unichar c) { 00168 if (unichar_islower(c)) { 00169 return c - 0x20; 00170 } 00171 return c; 00172 } 00173 00174 mp_uint_t unichar_xdigit_value(unichar c) { 00175 // c is assumed to be hex digit 00176 mp_uint_t n = c - '0'; 00177 if (n > 9) { 00178 n &= ~('a' - 'A'); 00179 n -= ('A' - ('9' + 1)); 00180 } 00181 return n; 00182 }
Generated on Tue Jul 12 2022 11:32:26 by Doxygen 1.7.2