unicode.c 5.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205
  1. /*
  2. * This file is part of the MicroPython project, http://micropython.org/
  3. *
  4. * The MIT License (MIT)
  5. *
  6. * Copyright (c) 2013, 2014 Damien P. George
  7. *
  8. * Permission is hereby granted, free of charge, to any person obtaining a copy
  9. * of this software and associated documentation files (the "Software"), to deal
  10. * in the Software without restriction, including without limitation the rights
  11. * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  12. * copies of the Software, and to permit persons to whom the Software is
  13. * furnished to do so, subject to the following conditions:
  14. *
  15. * The above copyright notice and this permission notice shall be included in
  16. * all copies or substantial portions of the Software.
  17. *
  18. * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  19. * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  20. * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  21. * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  22. * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  23. * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  24. * THE SOFTWARE.
  25. */
  26. #include <stdint.h>
  27. #include "py/unicode.h"
  28. // attribute flags
  29. #define FL_PRINT (0x01)
  30. #define FL_SPACE (0x02)
  31. #define FL_DIGIT (0x04)
  32. #define FL_ALPHA (0x08)
  33. #define FL_UPPER (0x10)
  34. #define FL_LOWER (0x20)
  35. #define FL_XDIGIT (0x40)
  36. // shorthand character attributes
  37. #define AT_PR (FL_PRINT)
  38. #define AT_SP (FL_SPACE | FL_PRINT)
  39. #define AT_DI (FL_DIGIT | FL_PRINT | FL_XDIGIT)
  40. #define AT_AL (FL_ALPHA | FL_PRINT)
  41. #define AT_UP (FL_UPPER | FL_ALPHA | FL_PRINT)
  42. #define AT_LO (FL_LOWER | FL_ALPHA | FL_PRINT)
  43. #define AT_UX (FL_UPPER | FL_ALPHA | FL_PRINT | FL_XDIGIT)
  44. #define AT_LX (FL_LOWER | FL_ALPHA | FL_PRINT | FL_XDIGIT)
  45. // table of attributes for ascii characters
  46. STATIC const uint8_t attr[] = {
  47. 0, 0, 0, 0, 0, 0, 0, 0,
  48. 0, AT_SP, AT_SP, AT_SP, AT_SP, AT_SP, 0, 0,
  49. 0, 0, 0, 0, 0, 0, 0, 0,
  50. 0, 0, 0, 0, 0, 0, 0, 0,
  51. AT_SP, AT_PR, AT_PR, AT_PR, AT_PR, AT_PR, AT_PR, AT_PR,
  52. AT_PR, AT_PR, AT_PR, AT_PR, AT_PR, AT_PR, AT_PR, AT_PR,
  53. AT_DI, AT_DI, AT_DI, AT_DI, AT_DI, AT_DI, AT_DI, AT_DI,
  54. AT_DI, AT_DI, AT_PR, AT_PR, AT_PR, AT_PR, AT_PR, AT_PR,
  55. AT_PR, AT_UX, AT_UX, AT_UX, AT_UX, AT_UX, AT_UX, AT_UP,
  56. AT_UP, AT_UP, AT_UP, AT_UP, AT_UP, AT_UP, AT_UP, AT_UP,
  57. AT_UP, AT_UP, AT_UP, AT_UP, AT_UP, AT_UP, AT_UP, AT_UP,
  58. AT_UP, AT_UP, AT_UP, AT_PR, AT_PR, AT_PR, AT_PR, AT_PR,
  59. AT_PR, AT_LX, AT_LX, AT_LX, AT_LX, AT_LX, AT_LX, AT_LO,
  60. AT_LO, AT_LO, AT_LO, AT_LO, AT_LO, AT_LO, AT_LO, AT_LO,
  61. AT_LO, AT_LO, AT_LO, AT_LO, AT_LO, AT_LO, AT_LO, AT_LO,
  62. AT_LO, AT_LO, AT_LO, AT_PR, AT_PR, AT_PR, AT_PR, 0
  63. };
  64. #if MICROPY_PY_BUILTINS_STR_UNICODE
  65. unichar utf8_get_char(const byte *s) {
  66. unichar ord = *s++;
  67. if (!UTF8_IS_NONASCII(ord)) return ord;
  68. ord &= 0x7F;
  69. for (unichar mask = 0x40; ord & mask; mask >>= 1) {
  70. ord &= ~mask;
  71. }
  72. while (UTF8_IS_CONT(*s)) {
  73. ord = (ord << 6) | (*s++ & 0x3F);
  74. }
  75. return ord;
  76. }
  77. const byte *utf8_next_char(const byte *s) {
  78. ++s;
  79. while (UTF8_IS_CONT(*s)) {
  80. ++s;
  81. }
  82. return s;
  83. }
  84. mp_uint_t utf8_ptr_to_index(const byte *s, const byte *ptr) {
  85. mp_uint_t i = 0;
  86. while (ptr > s) {
  87. if (!UTF8_IS_CONT(*--ptr)) {
  88. i++;
  89. }
  90. }
  91. return i;
  92. }
  93. size_t utf8_charlen(const byte *str, size_t len) {
  94. size_t charlen = 0;
  95. for (const byte *top = str + len; str < top; ++str) {
  96. if (!UTF8_IS_CONT(*str)) {
  97. ++charlen;
  98. }
  99. }
  100. return charlen;
  101. }
  102. #endif
  103. // Be aware: These unichar_is* functions are actually ASCII-only!
  104. bool unichar_isspace(unichar c) {
  105. return c < 128 && (attr[c] & FL_SPACE) != 0;
  106. }
  107. bool unichar_isalpha(unichar c) {
  108. return c < 128 && (attr[c] & FL_ALPHA) != 0;
  109. }
  110. /* unused
  111. bool unichar_isprint(unichar c) {
  112. return c < 128 && (attr[c] & FL_PRINT) != 0;
  113. }
  114. */
  115. bool unichar_isdigit(unichar c) {
  116. return c < 128 && (attr[c] & FL_DIGIT) != 0;
  117. }
  118. bool unichar_isxdigit(unichar c) {
  119. return c < 128 && (attr[c] & FL_XDIGIT) != 0;
  120. }
  121. bool unichar_isident(unichar c) {
  122. return c < 128 && ((attr[c] & (FL_ALPHA | FL_DIGIT)) != 0 || c == '_');
  123. }
  124. bool unichar_isupper(unichar c) {
  125. return c < 128 && (attr[c] & FL_UPPER) != 0;
  126. }
  127. bool unichar_islower(unichar c) {
  128. return c < 128 && (attr[c] & FL_LOWER) != 0;
  129. }
  130. unichar unichar_tolower(unichar c) {
  131. if (unichar_isupper(c)) {
  132. return c + 0x20;
  133. }
  134. return c;
  135. }
  136. unichar unichar_toupper(unichar c) {
  137. if (unichar_islower(c)) {
  138. return c - 0x20;
  139. }
  140. return c;
  141. }
  142. mp_uint_t unichar_xdigit_value(unichar c) {
  143. // c is assumed to be hex digit
  144. mp_uint_t n = c - '0';
  145. if (n > 9) {
  146. n &= ~('a' - 'A');
  147. n -= ('A' - ('9' + 1));
  148. }
  149. return n;
  150. }
  151. #if MICROPY_PY_BUILTINS_STR_UNICODE
  152. bool utf8_check(const byte *p, size_t len) {
  153. uint8_t need = 0;
  154. const byte *end = p + len;
  155. for (; p < end; p++) {
  156. byte c = *p;
  157. if (need) {
  158. if (c >= 0x80) {
  159. need--;
  160. } else {
  161. // mismatch
  162. return 0;
  163. }
  164. } else {
  165. if (c >= 0xc0) {
  166. if (c >= 0xf8) {
  167. // mismatch
  168. return 0;
  169. }
  170. need = (0xe5 >> ((c >> 3) & 0x6)) & 3;
  171. } else if (c >= 0x80) {
  172. // mismatch
  173. return 0;
  174. }
  175. }
  176. }
  177. return need == 0; // no pending fragments allowed
  178. }
  179. #endif