modure.c 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462
  1. /*
  2. * This file is part of the MicroPython project, http://micropython.org/
  3. *
  4. * The MIT License (MIT)
  5. *
  6. * Copyright (c) 2014 Paul Sokolovsky
  7. *
  8. * Permission is hereby granted, free of charge, to any person obtaining a copy
  9. * of this software and associated documentation files (the "Software"), to deal
  10. * in the Software without restriction, including without limitation the rights
  11. * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  12. * copies of the Software, and to permit persons to whom the Software is
  13. * furnished to do so, subject to the following conditions:
  14. *
  15. * The above copyright notice and this permission notice shall be included in
  16. * all copies or substantial portions of the Software.
  17. *
  18. * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  19. * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  20. * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  21. * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  22. * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  23. * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  24. * THE SOFTWARE.
  25. */
  26. #include <stdio.h>
  27. #include <assert.h>
  28. #include <string.h>
  29. #include "py/runtime.h"
  30. #include "py/binary.h"
  31. #include "py/objstr.h"
  32. #include "py/stackctrl.h"
  33. #if MICROPY_PY_URE
  34. #define re1_5_stack_chk() MP_STACK_CHECK()
  35. #include "re1.5/re1.5.h"
  36. #define FLAG_DEBUG 0x1000
  37. typedef struct _mp_obj_re_t {
  38. mp_obj_base_t base;
  39. ByteProg re;
  40. } mp_obj_re_t;
  41. typedef struct _mp_obj_match_t {
  42. mp_obj_base_t base;
  43. int num_matches;
  44. mp_obj_t str;
  45. const char *caps[0];
  46. } mp_obj_match_t;
  47. STATIC void match_print(const mp_print_t *print, mp_obj_t self_in, mp_print_kind_t kind) {
  48. (void)kind;
  49. mp_obj_match_t *self = MP_OBJ_TO_PTR(self_in);
  50. mp_printf(print, "<match num=%d>", self->num_matches);
  51. }
  52. STATIC mp_obj_t match_group(mp_obj_t self_in, mp_obj_t no_in) {
  53. mp_obj_match_t *self = MP_OBJ_TO_PTR(self_in);
  54. mp_int_t no = mp_obj_get_int(no_in);
  55. if (no < 0 || no >= self->num_matches) {
  56. nlr_raise(mp_obj_new_exception_arg1(&mp_type_IndexError, no_in));
  57. }
  58. const char *start = self->caps[no * 2];
  59. if (start == NULL) {
  60. // no match for this group
  61. return mp_const_none;
  62. }
  63. return mp_obj_new_str_of_type(mp_obj_get_type(self->str),
  64. (const byte*)start, self->caps[no * 2 + 1] - start);
  65. }
  66. MP_DEFINE_CONST_FUN_OBJ_2(match_group_obj, match_group);
  67. #if MICROPY_PY_URE_MATCH_GROUPS
  68. STATIC mp_obj_t match_groups(mp_obj_t self_in) {
  69. mp_obj_match_t *self = MP_OBJ_TO_PTR(self_in);
  70. if (self->num_matches <= 1) {
  71. return mp_const_empty_tuple;
  72. }
  73. mp_obj_tuple_t *groups = MP_OBJ_TO_PTR(mp_obj_new_tuple(self->num_matches - 1, NULL));
  74. for (int i = 1; i < self->num_matches; ++i) {
  75. groups->items[i - 1] = match_group(self_in, MP_OBJ_NEW_SMALL_INT(i));
  76. }
  77. return MP_OBJ_FROM_PTR(groups);
  78. }
  79. MP_DEFINE_CONST_FUN_OBJ_1(match_groups_obj, match_groups);
  80. #endif
  81. #if MICROPY_PY_URE_MATCH_SPAN_START_END
  82. STATIC void match_span_helper(size_t n_args, const mp_obj_t *args, mp_obj_t span[2]) {
  83. mp_obj_match_t *self = MP_OBJ_TO_PTR(args[0]);
  84. mp_int_t no = 0;
  85. if (n_args == 2) {
  86. no = mp_obj_get_int(args[1]);
  87. if (no < 0 || no >= self->num_matches) {
  88. nlr_raise(mp_obj_new_exception_arg1(&mp_type_IndexError, args[1]));
  89. }
  90. }
  91. mp_int_t s = -1;
  92. mp_int_t e = -1;
  93. const char *start = self->caps[no * 2];
  94. if (start != NULL) {
  95. // have a match for this group
  96. const char *begin = mp_obj_str_get_str(self->str);
  97. s = start - begin;
  98. e = self->caps[no * 2 + 1] - begin;
  99. }
  100. span[0] = mp_obj_new_int(s);
  101. span[1] = mp_obj_new_int(e);
  102. }
  103. STATIC mp_obj_t match_span(size_t n_args, const mp_obj_t *args) {
  104. mp_obj_t span[2];
  105. match_span_helper(n_args, args, span);
  106. return mp_obj_new_tuple(2, span);
  107. }
  108. MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(match_span_obj, 1, 2, match_span);
  109. STATIC mp_obj_t match_start(size_t n_args, const mp_obj_t *args) {
  110. mp_obj_t span[2];
  111. match_span_helper(n_args, args, span);
  112. return span[0];
  113. }
  114. MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(match_start_obj, 1, 2, match_start);
  115. STATIC mp_obj_t match_end(size_t n_args, const mp_obj_t *args) {
  116. mp_obj_t span[2];
  117. match_span_helper(n_args, args, span);
  118. return span[1];
  119. }
  120. MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(match_end_obj, 1, 2, match_end);
  121. #endif
  122. STATIC const mp_rom_map_elem_t match_locals_dict_table[] = {
  123. { MP_ROM_QSTR(MP_QSTR_group), MP_ROM_PTR(&match_group_obj) },
  124. #if MICROPY_PY_URE_MATCH_GROUPS
  125. { MP_ROM_QSTR(MP_QSTR_groups), MP_ROM_PTR(&match_groups_obj) },
  126. #endif
  127. #if MICROPY_PY_URE_MATCH_SPAN_START_END
  128. { MP_ROM_QSTR(MP_QSTR_span), MP_ROM_PTR(&match_span_obj) },
  129. { MP_ROM_QSTR(MP_QSTR_start), MP_ROM_PTR(&match_start_obj) },
  130. { MP_ROM_QSTR(MP_QSTR_end), MP_ROM_PTR(&match_end_obj) },
  131. #endif
  132. };
  133. STATIC MP_DEFINE_CONST_DICT(match_locals_dict, match_locals_dict_table);
  134. STATIC const mp_obj_type_t match_type = {
  135. { &mp_type_type },
  136. .name = MP_QSTR_match,
  137. .print = match_print,
  138. .locals_dict = (void*)&match_locals_dict,
  139. };
  140. STATIC void re_print(const mp_print_t *print, mp_obj_t self_in, mp_print_kind_t kind) {
  141. (void)kind;
  142. mp_obj_re_t *self = MP_OBJ_TO_PTR(self_in);
  143. mp_printf(print, "<re %p>", self);
  144. }
  145. STATIC mp_obj_t ure_exec(bool is_anchored, uint n_args, const mp_obj_t *args) {
  146. (void)n_args;
  147. mp_obj_re_t *self = MP_OBJ_TO_PTR(args[0]);
  148. Subject subj;
  149. size_t len;
  150. subj.begin = mp_obj_str_get_data(args[1], &len);
  151. subj.end = subj.begin + len;
  152. int caps_num = (self->re.sub + 1) * 2;
  153. mp_obj_match_t *match = m_new_obj_var(mp_obj_match_t, char*, caps_num);
  154. // cast is a workaround for a bug in msvc: it treats const char** as a const pointer instead of a pointer to pointer to const char
  155. memset((char*)match->caps, 0, caps_num * sizeof(char*));
  156. int res = re1_5_recursiveloopprog(&self->re, &subj, match->caps, caps_num, is_anchored);
  157. if (res == 0) {
  158. m_del_var(mp_obj_match_t, char*, caps_num, match);
  159. return mp_const_none;
  160. }
  161. match->base.type = &match_type;
  162. match->num_matches = caps_num / 2; // caps_num counts start and end pointers
  163. match->str = args[1];
  164. return MP_OBJ_FROM_PTR(match);
  165. }
  166. STATIC mp_obj_t re_match(size_t n_args, const mp_obj_t *args) {
  167. return ure_exec(true, n_args, args);
  168. }
  169. MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(re_match_obj, 2, 4, re_match);
  170. STATIC mp_obj_t re_search(size_t n_args, const mp_obj_t *args) {
  171. return ure_exec(false, n_args, args);
  172. }
  173. MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(re_search_obj, 2, 4, re_search);
  174. STATIC mp_obj_t re_split(size_t n_args, const mp_obj_t *args) {
  175. mp_obj_re_t *self = MP_OBJ_TO_PTR(args[0]);
  176. Subject subj;
  177. size_t len;
  178. const mp_obj_type_t *str_type = mp_obj_get_type(args[1]);
  179. subj.begin = mp_obj_str_get_data(args[1], &len);
  180. subj.end = subj.begin + len;
  181. int caps_num = (self->re.sub + 1) * 2;
  182. int maxsplit = 0;
  183. if (n_args > 2) {
  184. maxsplit = mp_obj_get_int(args[2]);
  185. }
  186. mp_obj_t retval = mp_obj_new_list(0, NULL);
  187. const char **caps = mp_local_alloc(caps_num * sizeof(char*));
  188. while (true) {
  189. // cast is a workaround for a bug in msvc: it treats const char** as a const pointer instead of a pointer to pointer to const char
  190. memset((char**)caps, 0, caps_num * sizeof(char*));
  191. int res = re1_5_recursiveloopprog(&self->re, &subj, caps, caps_num, false);
  192. // if we didn't have a match, or had an empty match, it's time to stop
  193. if (!res || caps[0] == caps[1]) {
  194. break;
  195. }
  196. mp_obj_t s = mp_obj_new_str_of_type(str_type, (const byte*)subj.begin, caps[0] - subj.begin);
  197. mp_obj_list_append(retval, s);
  198. if (self->re.sub > 0) {
  199. mp_raise_NotImplementedError("Splitting with sub-captures");
  200. }
  201. subj.begin = caps[1];
  202. if (maxsplit > 0 && --maxsplit == 0) {
  203. break;
  204. }
  205. }
  206. // cast is a workaround for a bug in msvc (see above)
  207. mp_local_free((char**)caps);
  208. mp_obj_t s = mp_obj_new_str_of_type(str_type, (const byte*)subj.begin, subj.end - subj.begin);
  209. mp_obj_list_append(retval, s);
  210. return retval;
  211. }
  212. MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(re_split_obj, 2, 3, re_split);
  213. #if MICROPY_PY_URE_SUB
  214. STATIC mp_obj_t re_sub_helper(mp_obj_t self_in, size_t n_args, const mp_obj_t *args) {
  215. mp_obj_re_t *self = MP_OBJ_TO_PTR(self_in);
  216. mp_obj_t replace = args[1];
  217. mp_obj_t where = args[2];
  218. mp_int_t count = 0;
  219. if (n_args > 3) {
  220. count = mp_obj_get_int(args[3]);
  221. // Note: flags are currently ignored
  222. }
  223. size_t where_len;
  224. const char *where_str = mp_obj_str_get_data(where, &where_len);
  225. Subject subj;
  226. subj.begin = where_str;
  227. subj.end = subj.begin + where_len;
  228. int caps_num = (self->re.sub + 1) * 2;
  229. vstr_t vstr_return;
  230. vstr_return.buf = NULL; // We'll init the vstr after the first match
  231. mp_obj_match_t *match = mp_local_alloc(sizeof(mp_obj_match_t) + caps_num * sizeof(char*));
  232. match->base.type = &match_type;
  233. match->num_matches = caps_num / 2; // caps_num counts start and end pointers
  234. match->str = where;
  235. for (;;) {
  236. // cast is a workaround for a bug in msvc: it treats const char** as a const pointer instead of a pointer to pointer to const char
  237. memset((char*)match->caps, 0, caps_num * sizeof(char*));
  238. int res = re1_5_recursiveloopprog(&self->re, &subj, match->caps, caps_num, false);
  239. // If we didn't have a match, or had an empty match, it's time to stop
  240. if (!res || match->caps[0] == match->caps[1]) {
  241. break;
  242. }
  243. // Initialise the vstr if it's not already
  244. if (vstr_return.buf == NULL) {
  245. vstr_init(&vstr_return, match->caps[0] - subj.begin);
  246. }
  247. // Add pre-match string
  248. vstr_add_strn(&vstr_return, subj.begin, match->caps[0] - subj.begin);
  249. // Get replacement string
  250. const char* repl = mp_obj_str_get_str((mp_obj_is_callable(replace) ? mp_call_function_1(replace, MP_OBJ_FROM_PTR(match)) : replace));
  251. // Append replacement string to result, substituting any regex groups
  252. while (*repl != '\0') {
  253. if (*repl == '\\') {
  254. ++repl;
  255. bool is_g_format = false;
  256. if (*repl == 'g' && repl[1] == '<') {
  257. // Group specified with syntax "\g<number>"
  258. repl += 2;
  259. is_g_format = true;
  260. }
  261. if ('0' <= *repl && *repl <= '9') {
  262. // Group specified with syntax "\g<number>" or "\number"
  263. unsigned int match_no = 0;
  264. do {
  265. match_no = match_no * 10 + (*repl++ - '0');
  266. } while ('0' <= *repl && *repl <= '9');
  267. if (is_g_format && *repl == '>') {
  268. ++repl;
  269. }
  270. if (match_no >= (unsigned int)match->num_matches) {
  271. nlr_raise(mp_obj_new_exception_arg1(&mp_type_IndexError, MP_OBJ_NEW_SMALL_INT(match_no)));
  272. }
  273. const char *start_match = match->caps[match_no * 2];
  274. if (start_match != NULL) {
  275. // Add the substring matched by group
  276. const char *end_match = match->caps[match_no * 2 + 1];
  277. vstr_add_strn(&vstr_return, start_match, end_match - start_match);
  278. }
  279. }
  280. } else {
  281. // Just add the current byte from the replacement string
  282. vstr_add_byte(&vstr_return, *repl++);
  283. }
  284. }
  285. // Move start pointer to end of last match
  286. subj.begin = match->caps[1];
  287. // Stop substitutions if count was given and gets to 0
  288. if (count > 0 && --count == 0) {
  289. break;
  290. }
  291. }
  292. mp_local_free(match);
  293. if (vstr_return.buf == NULL) {
  294. // Optimisation for case of no substitutions
  295. return where;
  296. }
  297. // Add post-match string
  298. vstr_add_strn(&vstr_return, subj.begin, subj.end - subj.begin);
  299. return mp_obj_new_str_from_vstr(mp_obj_get_type(where), &vstr_return);
  300. }
  301. STATIC mp_obj_t re_sub(size_t n_args, const mp_obj_t *args) {
  302. return re_sub_helper(args[0], n_args, args);
  303. }
  304. MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(re_sub_obj, 3, 5, re_sub);
  305. #endif
  306. STATIC const mp_rom_map_elem_t re_locals_dict_table[] = {
  307. { MP_ROM_QSTR(MP_QSTR_match), MP_ROM_PTR(&re_match_obj) },
  308. { MP_ROM_QSTR(MP_QSTR_search), MP_ROM_PTR(&re_search_obj) },
  309. { MP_ROM_QSTR(MP_QSTR_split), MP_ROM_PTR(&re_split_obj) },
  310. #if MICROPY_PY_URE_SUB
  311. { MP_ROM_QSTR(MP_QSTR_sub), MP_ROM_PTR(&re_sub_obj) },
  312. #endif
  313. };
  314. STATIC MP_DEFINE_CONST_DICT(re_locals_dict, re_locals_dict_table);
  315. STATIC const mp_obj_type_t re_type = {
  316. { &mp_type_type },
  317. .name = MP_QSTR_ure,
  318. .print = re_print,
  319. .locals_dict = (void*)&re_locals_dict,
  320. };
  321. STATIC mp_obj_t mod_re_compile(size_t n_args, const mp_obj_t *args) {
  322. const char *re_str = mp_obj_str_get_str(args[0]);
  323. int size = re1_5_sizecode(re_str);
  324. if (size == -1) {
  325. goto error;
  326. }
  327. mp_obj_re_t *o = m_new_obj_var(mp_obj_re_t, char, size);
  328. o->base.type = &re_type;
  329. int flags = 0;
  330. if (n_args > 1) {
  331. flags = mp_obj_get_int(args[1]);
  332. }
  333. int error = re1_5_compilecode(&o->re, re_str);
  334. if (error != 0) {
  335. error:
  336. mp_raise_ValueError("Error in regex");
  337. }
  338. if (flags & FLAG_DEBUG) {
  339. re1_5_dumpcode(&o->re);
  340. }
  341. return MP_OBJ_FROM_PTR(o);
  342. }
  343. MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(mod_re_compile_obj, 1, 2, mod_re_compile);
  344. STATIC mp_obj_t mod_re_exec(bool is_anchored, uint n_args, const mp_obj_t *args) {
  345. (void)n_args;
  346. mp_obj_t self = mod_re_compile(1, args);
  347. const mp_obj_t args2[] = {self, args[1]};
  348. mp_obj_t match = ure_exec(is_anchored, 2, args2);
  349. return match;
  350. }
  351. STATIC mp_obj_t mod_re_match(size_t n_args, const mp_obj_t *args) {
  352. return mod_re_exec(true, n_args, args);
  353. }
  354. MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(mod_re_match_obj, 2, 4, mod_re_match);
  355. STATIC mp_obj_t mod_re_search(size_t n_args, const mp_obj_t *args) {
  356. return mod_re_exec(false, n_args, args);
  357. }
  358. MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(mod_re_search_obj, 2, 4, mod_re_search);
  359. #if MICROPY_PY_URE_SUB
  360. STATIC mp_obj_t mod_re_sub(size_t n_args, const mp_obj_t *args) {
  361. mp_obj_t self = mod_re_compile(1, args);
  362. return re_sub_helper(self, n_args, args);
  363. }
  364. MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(mod_re_sub_obj, 3, 5, mod_re_sub);
  365. #endif
  366. STATIC const mp_rom_map_elem_t mp_module_re_globals_table[] = {
  367. { MP_ROM_QSTR(MP_QSTR___name__), MP_ROM_QSTR(MP_QSTR_ure) },
  368. { MP_ROM_QSTR(MP_QSTR_compile), MP_ROM_PTR(&mod_re_compile_obj) },
  369. { MP_ROM_QSTR(MP_QSTR_match), MP_ROM_PTR(&mod_re_match_obj) },
  370. { MP_ROM_QSTR(MP_QSTR_search), MP_ROM_PTR(&mod_re_search_obj) },
  371. #if MICROPY_PY_URE_SUB
  372. { MP_ROM_QSTR(MP_QSTR_sub), MP_ROM_PTR(&mod_re_sub_obj) },
  373. #endif
  374. { MP_ROM_QSTR(MP_QSTR_DEBUG), MP_ROM_INT(FLAG_DEBUG) },
  375. };
  376. STATIC MP_DEFINE_CONST_DICT(mp_module_re_globals, mp_module_re_globals_table);
  377. const mp_obj_module_t mp_module_ure = {
  378. .base = { &mp_type_module },
  379. .globals = (mp_obj_dict_t*)&mp_module_re_globals,
  380. };
  381. // Source files #include'd here to make sure they're compiled in
  382. // only if module is enabled by config setting.
  383. #define re1_5_fatal(x) assert(!x)
  384. #include "re1.5/compilecode.c"
  385. #include "re1.5/dumpcode.c"
  386. #include "re1.5/recursiveloop.c"
  387. #include "re1.5/charclass.c"
  388. #endif //MICROPY_PY_URE