objstrunicode.c 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312
  1. /*
  2. * This file is part of the MicroPython project, http://micropython.org/
  3. *
  4. * The MIT License (MIT)
  5. *
  6. * Copyright (c) 2013, 2014 Damien P. George
  7. * Copyright (c) 2014 Paul Sokolovsky
  8. *
  9. * Permission is hereby granted, free of charge, to any person obtaining a copy
  10. * of this software and associated documentation files (the "Software"), to deal
  11. * in the Software without restriction, including without limitation the rights
  12. * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  13. * copies of the Software, and to permit persons to whom the Software is
  14. * furnished to do so, subject to the following conditions:
  15. *
  16. * The above copyright notice and this permission notice shall be included in
  17. * all copies or substantial portions of the Software.
  18. *
  19. * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  20. * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  21. * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  22. * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  23. * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  24. * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  25. * THE SOFTWARE.
  26. */
  27. #include <string.h>
  28. #include <assert.h>
  29. #include "py/objstr.h"
  30. #include "py/objlist.h"
  31. #include "py/runtime.h"
  32. #if MICROPY_PY_BUILTINS_STR_UNICODE
  // Forward declaration: the iterator constructor is defined near the end of this
  // file but is referenced earlier by the .getiter slot of mp_type_str.
  33. STATIC mp_obj_t mp_obj_new_str_iterator(mp_obj_t str, mp_obj_iter_buf_t *iter_buf);
  34. /******************************************************************************/
  35. /* str */
  36. STATIC void uni_print_quoted(const mp_print_t *print, const byte *str_data, uint str_len) {
  37. // this escapes characters, but it will be very slow to print (calling print many times)
  38. bool has_single_quote = false;
  39. bool has_double_quote = false;
  40. for (const byte *s = str_data, *top = str_data + str_len; !has_double_quote && s < top; s++) {
  41. if (*s == '\'') {
  42. has_single_quote = true;
  43. } else if (*s == '"') {
  44. has_double_quote = true;
  45. }
  46. }
  47. unichar quote_char = '\'';
  48. if (has_single_quote && !has_double_quote) {
  49. quote_char = '"';
  50. }
  51. mp_printf(print, "%c", quote_char);
  52. const byte *s = str_data, *top = str_data + str_len;
  53. while (s < top) {
  54. unichar ch;
  55. ch = utf8_get_char(s);
  56. s = utf8_next_char(s);
  57. if (ch == quote_char) {
  58. mp_printf(print, "\\%c", quote_char);
  59. } else if (ch == '\\') {
  60. mp_print_str(print, "\\\\");
  61. } else if (32 <= ch && ch <= 126) {
  62. mp_printf(print, "%c", ch);
  63. } else if (ch == '\n') {
  64. mp_print_str(print, "\\n");
  65. } else if (ch == '\r') {
  66. mp_print_str(print, "\\r");
  67. } else if (ch == '\t') {
  68. mp_print_str(print, "\\t");
  69. } else if (ch < 0x100) {
  70. mp_printf(print, "\\x%02x", ch);
  71. } else if (ch < 0x10000) {
  72. mp_printf(print, "\\u%04x", ch);
  73. } else {
  74. mp_printf(print, "\\U%08x", ch);
  75. }
  76. }
  77. mp_printf(print, "%c", quote_char);
  78. }
  79. STATIC void uni_print(const mp_print_t *print, mp_obj_t self_in, mp_print_kind_t kind) {
  80. GET_STR_DATA_LEN(self_in, str_data, str_len);
  81. #if MICROPY_PY_UJSON
  82. if (kind == PRINT_JSON) {
  83. mp_str_print_json(print, str_data, str_len);
  84. return;
  85. }
  86. #endif
  87. if (kind == PRINT_STR) {
  88. mp_printf(print, "%.*s", str_len, str_data);
  89. } else {
  90. uni_print_quoted(print, str_data, str_len);
  91. }
  92. }
  93. STATIC mp_obj_t uni_unary_op(mp_unary_op_t op, mp_obj_t self_in) {
  94. GET_STR_DATA_LEN(self_in, str_data, str_len);
  95. switch (op) {
  96. case MP_UNARY_OP_BOOL:
  97. return mp_obj_new_bool(str_len != 0);
  98. case MP_UNARY_OP_LEN:
  99. return MP_OBJ_NEW_SMALL_INT(utf8_charlen(str_data, str_len));
  100. default:
  101. return MP_OBJ_NULL; // op not supported
  102. }
  103. }
  104. // Convert an index into a pointer to its lead byte. Out of bounds indexing will raise IndexError or
  105. // be capped to the first/last character of the string, depending on is_slice.
  106. const byte *str_index_to_ptr(const mp_obj_type_t *type, const byte *self_data, size_t self_len,
  107. mp_obj_t index, bool is_slice) {
  108. // All str functions also handle bytes objects, and they call str_index_to_ptr(),
  109. // so it must handle bytes.
  110. if (type == &mp_type_bytes) {
  111. // Taken from objstr.c:str_index_to_ptr()
  112. size_t index_val = mp_get_index(type, self_len, index, is_slice);
  113. return self_data + index_val;
  114. }
  115. mp_int_t i;
  116. // Copied from mp_get_index; I don't want bounds checking, just give me
  117. // the integer as-is. (I can't bounds-check without scanning the whole
  118. // string; an out-of-bounds index will be caught in the loops below.)
  119. if (MP_OBJ_IS_SMALL_INT(index)) {
  120. i = MP_OBJ_SMALL_INT_VALUE(index);
  121. } else if (!mp_obj_get_int_maybe(index, &i)) {
  122. nlr_raise(mp_obj_new_exception_msg_varg(&mp_type_TypeError, "string indices must be integers, not %s", mp_obj_get_type_str(index)));
  123. }
  124. const byte *s, *top = self_data + self_len;
  125. if (i < 0)
  126. {
  127. // Negative indexing is performed by counting from the end of the string.
  128. for (s = top - 1; i; --s) {
  129. if (s < self_data) {
  130. if (is_slice) {
  131. return self_data;
  132. }
  133. mp_raise_msg(&mp_type_IndexError, "string index out of range");
  134. }
  135. if (!UTF8_IS_CONT(*s)) {
  136. ++i;
  137. }
  138. }
  139. ++s;
  140. } else {
  141. // Positive indexing, correspondingly, counts from the start of the string.
  142. // It's assumed that negative indexing will generally be used with small
  143. // absolute values (eg str[-1], not str[-1000000]), which means it'll be
  144. // more efficient this way.
  145. s = self_data;
  146. while (1) {
  147. // First check out-of-bounds
  148. if (s >= top) {
  149. if (is_slice) {
  150. return top;
  151. }
  152. mp_raise_msg(&mp_type_IndexError, "string index out of range");
  153. }
  154. // Then check completion
  155. if (i-- == 0) {
  156. break;
  157. }
  158. // Then skip UTF-8 char
  159. ++s;
  160. while (UTF8_IS_CONT(*s)) {
  161. ++s;
  162. }
  163. }
  164. }
  165. return s;
  166. }
  167. STATIC mp_obj_t str_subscr(mp_obj_t self_in, mp_obj_t index, mp_obj_t value) {
  168. mp_obj_type_t *type = mp_obj_get_type(self_in);
  169. assert(type == &mp_type_str);
  170. GET_STR_DATA_LEN(self_in, self_data, self_len);
  171. if (value == MP_OBJ_SENTINEL) {
  172. // load
  173. #if MICROPY_PY_BUILTINS_SLICE
  174. if (MP_OBJ_IS_TYPE(index, &mp_type_slice)) {
  175. mp_obj_t ostart, ostop, ostep;
  176. mp_obj_slice_get(index, &ostart, &ostop, &ostep);
  177. if (ostep != mp_const_none && ostep != MP_OBJ_NEW_SMALL_INT(1)) {
  178. mp_raise_NotImplementedError("only slices with step=1 (aka None) are supported");
  179. }
  180. const byte *pstart, *pstop;
  181. if (ostart != mp_const_none) {
  182. pstart = str_index_to_ptr(type, self_data, self_len, ostart, true);
  183. } else {
  184. pstart = self_data;
  185. }
  186. if (ostop != mp_const_none) {
  187. // pstop will point just after the stop character. This depends on
  188. // the \0 at the end of the string.
  189. pstop = str_index_to_ptr(type, self_data, self_len, ostop, true);
  190. } else {
  191. pstop = self_data + self_len;
  192. }
  193. if (pstop < pstart) {
  194. return MP_OBJ_NEW_QSTR(MP_QSTR_);
  195. }
  196. return mp_obj_new_str_of_type(type, (const byte *)pstart, pstop - pstart);
  197. }
  198. #endif
  199. const byte *s = str_index_to_ptr(type, self_data, self_len, index, false);
  200. int len = 1;
  201. if (UTF8_IS_NONASCII(*s)) {
  202. // Count the number of 1 bits (after the first)
  203. for (char mask = 0x40; *s & mask; mask >>= 1) {
  204. ++len;
  205. }
  206. }
  207. return mp_obj_new_str_via_qstr((const char*)s, len); // This will create a one-character string
  208. } else {
  209. return MP_OBJ_NULL; // op not supported
  210. }
  211. }
  // Method table for the str type defined in this file: each entry maps a method
  // name (as a qstr) to the function object that implements it. The str_*_obj
  // function objects are not defined in the visible part of this file.
  // Entries wrapped in #if are only present when the corresponding MICROPY_*
  // build option is enabled.
  212. STATIC const mp_rom_map_elem_t struni_locals_dict_table[] = {
  213. #if MICROPY_CPYTHON_COMPAT
  214. { MP_ROM_QSTR(MP_QSTR_encode), MP_ROM_PTR(&str_encode_obj) },
  215. #endif
  216. { MP_ROM_QSTR(MP_QSTR_find), MP_ROM_PTR(&str_find_obj) },
  217. { MP_ROM_QSTR(MP_QSTR_rfind), MP_ROM_PTR(&str_rfind_obj) },
  218. { MP_ROM_QSTR(MP_QSTR_index), MP_ROM_PTR(&str_index_obj) },
  219. { MP_ROM_QSTR(MP_QSTR_rindex), MP_ROM_PTR(&str_rindex_obj) },
  220. { MP_ROM_QSTR(MP_QSTR_join), MP_ROM_PTR(&str_join_obj) },
  221. { MP_ROM_QSTR(MP_QSTR_split), MP_ROM_PTR(&str_split_obj) },
  222. #if MICROPY_PY_BUILTINS_STR_SPLITLINES
  223. { MP_ROM_QSTR(MP_QSTR_splitlines), MP_ROM_PTR(&str_splitlines_obj) },
  224. #endif
  225. { MP_ROM_QSTR(MP_QSTR_rsplit), MP_ROM_PTR(&str_rsplit_obj) },
  226. { MP_ROM_QSTR(MP_QSTR_startswith), MP_ROM_PTR(&str_startswith_obj) },
  227. { MP_ROM_QSTR(MP_QSTR_endswith), MP_ROM_PTR(&str_endswith_obj) },
  228. { MP_ROM_QSTR(MP_QSTR_strip), MP_ROM_PTR(&str_strip_obj) },
  229. { MP_ROM_QSTR(MP_QSTR_lstrip), MP_ROM_PTR(&str_lstrip_obj) },
  230. { MP_ROM_QSTR(MP_QSTR_rstrip), MP_ROM_PTR(&str_rstrip_obj) },
  231. { MP_ROM_QSTR(MP_QSTR_format), MP_ROM_PTR(&str_format_obj) },
  232. { MP_ROM_QSTR(MP_QSTR_replace), MP_ROM_PTR(&str_replace_obj) },
  233. { MP_ROM_QSTR(MP_QSTR_count), MP_ROM_PTR(&str_count_obj) },
  234. #if MICROPY_PY_BUILTINS_STR_PARTITION
  235. { MP_ROM_QSTR(MP_QSTR_partition), MP_ROM_PTR(&str_partition_obj) },
  236. { MP_ROM_QSTR(MP_QSTR_rpartition), MP_ROM_PTR(&str_rpartition_obj) },
  237. #endif
  238. #if MICROPY_PY_BUILTINS_STR_CENTER
  239. { MP_ROM_QSTR(MP_QSTR_center), MP_ROM_PTR(&str_center_obj) },
  240. #endif
  241. { MP_ROM_QSTR(MP_QSTR_lower), MP_ROM_PTR(&str_lower_obj) },
  242. { MP_ROM_QSTR(MP_QSTR_upper), MP_ROM_PTR(&str_upper_obj) },
  243. { MP_ROM_QSTR(MP_QSTR_isspace), MP_ROM_PTR(&str_isspace_obj) },
  244. { MP_ROM_QSTR(MP_QSTR_isalpha), MP_ROM_PTR(&str_isalpha_obj) },
  245. { MP_ROM_QSTR(MP_QSTR_isdigit), MP_ROM_PTR(&str_isdigit_obj) },
  246. { MP_ROM_QSTR(MP_QSTR_isupper), MP_ROM_PTR(&str_isupper_obj) },
  247. { MP_ROM_QSTR(MP_QSTR_islower), MP_ROM_PTR(&str_islower_obj) },
  248. };
  // Define the dict object struni_locals_dict over the table above; it is the
  // object that the .locals_dict slot of mp_type_str points at.
  249. STATIC MP_DEFINE_CONST_DICT(struni_locals_dict, struni_locals_dict_table);
  // Type object for str in this build configuration (the whole file is compiled
  // only when MICROPY_PY_BUILTINS_STR_UNICODE is enabled).
  // The print, unary_op, subscr and getiter handlers are the ones defined in this
  // file; make_new, binary_op and get_buffer refer to mp_obj_str_* functions that
  // are not defined in the visible part of this file.
  250. const mp_obj_type_t mp_type_str = {
  251. { &mp_type_type },
  252. .name = MP_QSTR_str,
  253. .print = uni_print,
  254. .make_new = mp_obj_str_make_new,
  255. .unary_op = uni_unary_op,
  256. .binary_op = mp_obj_str_binary_op,
  257. .subscr = str_subscr,
  258. .getiter = mp_obj_new_str_iterator,
  259. .buffer_p = { .get_buffer = mp_obj_str_get_buffer },
  // Method lookup table: struni_locals_dict, built from struni_locals_dict_table.
  260. .locals_dict = (mp_obj_dict_t*)&struni_locals_dict,
  261. };
  262. /******************************************************************************/
  263. /* str iterator */
  // State of an in-progress iteration over a str object. It is laid out inside a
  // caller-provided mp_obj_iter_buf_t by mp_obj_new_str_iterator.
  264. typedef struct _mp_obj_str_it_t {
  // Object header; its type is set to mp_type_polymorph_iter by mp_obj_new_str_iterator.
  265. mp_obj_base_t base;
  // Function that produces the next item; set to str_it_iternext.
  266. mp_fun_1_t iternext;
  // The str object being iterated over.
  267. mp_obj_t str;
  // Byte offset into the string data of the next character to return; starts at 0.
  268. size_t cur;
  269. } mp_obj_str_it_t;
  270. STATIC mp_obj_t str_it_iternext(mp_obj_t self_in) {
  271. mp_obj_str_it_t *self = MP_OBJ_TO_PTR(self_in);
  272. GET_STR_DATA_LEN(self->str, str, len);
  273. if (self->cur < len) {
  274. const byte *cur = str + self->cur;
  275. const byte *end = utf8_next_char(str + self->cur);
  276. mp_obj_t o_out = mp_obj_new_str_via_qstr((const char*)cur, end - cur);
  277. self->cur += end - cur;
  278. return o_out;
  279. } else {
  280. return MP_OBJ_STOP_ITERATION;
  281. }
  282. }
  283. STATIC mp_obj_t mp_obj_new_str_iterator(mp_obj_t str, mp_obj_iter_buf_t *iter_buf) {
  284. assert(sizeof(mp_obj_str_it_t) <= sizeof(mp_obj_iter_buf_t));
  285. mp_obj_str_it_t *o = (mp_obj_str_it_t*)iter_buf;
  286. o->base.type = &mp_type_polymorph_iter;
  287. o->iternext = str_it_iternext;
  288. o->str = str;
  289. o->cur = 0;
  290. return MP_OBJ_FROM_PTR(o);
  291. }
  292. #endif // MICROPY_PY_BUILTINS_STR_UNICODE