regex_internal.c 47 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761277127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538153915401541154215431544154515461547154815491550155115521553155415551556155715581559156015611562156315641565156615671568156915701571157215731574157515761577157815791580158115821583158415851586158715881589159015911592159315941595159615971598159916001601160216031604160516061607160816091610161116121613161416151616161716181619162016211622162316241625162616271628162916301631163216331634163516361637163816391640164116421643164416451646164716481649165016511652165316541655165616571658165916601661166216631664166516661667166816691670167116721673167416751676167716781679168016811682168316841685168616871688168916901691169216931694169516961697169816991700170117021703170417051706170717081709171017111712171317141715171617171718171917201721172217231724172517261727172817291730173117321733173417351736173717381739174017411742
  1. /* Extended regular expression matching and search library.
  2. Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010 Free
  3. Software Foundation, Inc.
  4. This file is part of the GNU C Library.
  5. Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
  6. This program is free software; you can redistribute it and/or modify
  7. it under the terms of the GNU General Public License as published by
  8. the Free Software Foundation; either version 3, or (at your option)
  9. any later version.
  10. This program is distributed in the hope that it will be useful,
  11. but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13. GNU General Public License for more details.
  14. You should have received a copy of the GNU General Public License along
  15. with this program; if not, write to the Free Software Foundation,
  16. Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
  17. static void re_string_construct_common (const char *str, Idx len,
  18. re_string_t *pstr,
  19. RE_TRANSLATE_TYPE trans, bool icase,
  20. const re_dfa_t *dfa) internal_function;
  21. static re_dfastate_t *create_ci_newstate (const re_dfa_t *dfa,
  22. const re_node_set *nodes,
  23. re_hashval_t hash) internal_function;
  24. static re_dfastate_t *create_cd_newstate (const re_dfa_t *dfa,
  25. const re_node_set *nodes,
  26. unsigned int context,
  27. re_hashval_t hash) internal_function;
  28. /* Functions for string operation. */
  29. /* This function allocate the buffers. It is necessary to call
  30. re_string_reconstruct before using the object. */
  31. static reg_errcode_t
  32. internal_function __attribute_warn_unused_result__
  33. re_string_allocate (re_string_t *pstr, const char *str, Idx len, Idx init_len,
  34. RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
  35. {
  36. reg_errcode_t ret;
  37. Idx init_buf_len;
  38. /* Ensure at least one character fits into the buffers. */
  39. if (init_len < dfa->mb_cur_max)
  40. init_len = dfa->mb_cur_max;
  41. init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
  42. re_string_construct_common (str, len, pstr, trans, icase, dfa);
  43. ret = re_string_realloc_buffers (pstr, init_buf_len);
  44. if (BE (ret != REG_NOERROR, 0))
  45. return ret;
  46. pstr->word_char = dfa->word_char;
  47. pstr->word_ops_used = dfa->word_ops_used;
  48. pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
  49. pstr->valid_len = (pstr->mbs_allocated || dfa->mb_cur_max > 1) ? 0 : len;
  50. pstr->valid_raw_len = pstr->valid_len;
  51. return REG_NOERROR;
  52. }
  53. /* This function allocate the buffers, and initialize them. */
  54. static reg_errcode_t
  55. internal_function __attribute_warn_unused_result__
  56. re_string_construct (re_string_t *pstr, const char *str, Idx len,
  57. RE_TRANSLATE_TYPE trans, bool icase, const re_dfa_t *dfa)
  58. {
  59. reg_errcode_t ret;
  60. memset (pstr, '\0', sizeof (re_string_t));
  61. re_string_construct_common (str, len, pstr, trans, icase, dfa);
  62. if (len > 0)
  63. {
  64. ret = re_string_realloc_buffers (pstr, len + 1);
  65. if (BE (ret != REG_NOERROR, 0))
  66. return ret;
  67. }
  68. pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
  69. if (icase)
  70. {
  71. #ifdef RE_ENABLE_I18N
  72. if (dfa->mb_cur_max > 1)
  73. {
  74. while (1)
  75. {
  76. ret = build_wcs_upper_buffer (pstr);
  77. if (BE (ret != REG_NOERROR, 0))
  78. return ret;
  79. if (pstr->valid_raw_len >= len)
  80. break;
  81. if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
  82. break;
  83. ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
  84. if (BE (ret != REG_NOERROR, 0))
  85. return ret;
  86. }
  87. }
  88. else
  89. #endif /* RE_ENABLE_I18N */
  90. build_upper_buffer (pstr);
  91. }
  92. else
  93. {
  94. #ifdef RE_ENABLE_I18N
  95. if (dfa->mb_cur_max > 1)
  96. build_wcs_buffer (pstr);
  97. else
  98. #endif /* RE_ENABLE_I18N */
  99. {
  100. if (trans != NULL)
  101. re_string_translate_buffer (pstr);
  102. else
  103. {
  104. pstr->valid_len = pstr->bufs_len;
  105. pstr->valid_raw_len = pstr->bufs_len;
  106. }
  107. }
  108. }
  109. return REG_NOERROR;
  110. }
  111. /* Helper functions for re_string_allocate, and re_string_construct. */
  112. static reg_errcode_t
  113. internal_function __attribute_warn_unused_result__
  114. re_string_realloc_buffers (re_string_t *pstr, Idx new_buf_len)
  115. {
  116. #ifdef RE_ENABLE_I18N
  117. if (pstr->mb_cur_max > 1)
  118. {
  119. wint_t *new_wcs;
  120. /* Avoid overflow. */
  121. size_t max_object_size = MAX (sizeof (wint_t), sizeof (Idx));
  122. if (BE (SIZE_MAX / max_object_size < new_buf_len, 0))
  123. return REG_ESPACE;
  124. new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
  125. if (BE (new_wcs == NULL, 0))
  126. return REG_ESPACE;
  127. pstr->wcs = new_wcs;
  128. if (pstr->offsets != NULL)
  129. {
  130. Idx *new_offsets = re_realloc (pstr->offsets, Idx, new_buf_len);
  131. if (BE (new_offsets == NULL, 0))
  132. return REG_ESPACE;
  133. pstr->offsets = new_offsets;
  134. }
  135. }
  136. #endif /* RE_ENABLE_I18N */
  137. if (pstr->mbs_allocated)
  138. {
  139. unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
  140. new_buf_len);
  141. if (BE (new_mbs == NULL, 0))
  142. return REG_ESPACE;
  143. pstr->mbs = new_mbs;
  144. }
  145. pstr->bufs_len = new_buf_len;
  146. return REG_NOERROR;
  147. }
  148. static void
  149. internal_function
  150. re_string_construct_common (const char *str, Idx len, re_string_t *pstr,
  151. RE_TRANSLATE_TYPE trans, bool icase,
  152. const re_dfa_t *dfa)
  153. {
  154. pstr->raw_mbs = (const unsigned char *) str;
  155. pstr->len = len;
  156. pstr->raw_len = len;
  157. pstr->trans = trans;
  158. pstr->icase = icase;
  159. pstr->mbs_allocated = (trans != NULL || icase);
  160. pstr->mb_cur_max = dfa->mb_cur_max;
  161. pstr->is_utf8 = dfa->is_utf8;
  162. pstr->map_notascii = dfa->map_notascii;
  163. pstr->stop = pstr->len;
  164. pstr->raw_stop = pstr->stop;
  165. }
  166. #ifdef RE_ENABLE_I18N
  167. /* Build wide character buffer PSTR->WCS.
  168. If the byte sequence of the string are:
  169. <mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
  170. Then wide character buffer will be:
  171. <wc1> , WEOF , <wc2> , WEOF , <wc3>
  172. We use WEOF for padding, they indicate that the position isn't
  173. a first byte of a multibyte character.
  174. Note that this function assumes PSTR->VALID_LEN elements are already
  175. built and starts from PSTR->VALID_LEN. */
  176. static void
  177. internal_function
  178. build_wcs_buffer (re_string_t *pstr)
  179. {
  180. #ifdef _LIBC
  181. unsigned char buf[MB_LEN_MAX];
  182. assert (MB_LEN_MAX >= pstr->mb_cur_max);
  183. #else
  184. unsigned char buf[64];
  185. #endif
  186. mbstate_t prev_st;
  187. Idx byte_idx, end_idx, remain_len;
  188. size_t mbclen;
  189. /* Build the buffers from pstr->valid_len to either pstr->len or
  190. pstr->bufs_len. */
  191. end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
  192. for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
  193. {
  194. wchar_t wc;
  195. const char *p;
  196. remain_len = end_idx - byte_idx;
  197. prev_st = pstr->cur_state;
  198. /* Apply the translation if we need. */
  199. if (BE (pstr->trans != NULL, 0))
  200. {
  201. int i, ch;
  202. for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
  203. {
  204. ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
  205. buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
  206. }
  207. p = (const char *) buf;
  208. }
  209. else
  210. p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
  211. mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
  212. if (BE (mbclen == (size_t) -2, 0))
  213. {
  214. /* The buffer doesn't have enough space, finish to build. */
  215. pstr->cur_state = prev_st;
  216. break;
  217. }
  218. else if (BE (mbclen == (size_t) -1 || mbclen == 0, 0))
  219. {
  220. /* We treat these cases as a singlebyte character. */
  221. mbclen = 1;
  222. wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
  223. if (BE (pstr->trans != NULL, 0))
  224. wc = pstr->trans[wc];
  225. pstr->cur_state = prev_st;
  226. }
  227. /* Write wide character and padding. */
  228. pstr->wcs[byte_idx++] = wc;
  229. /* Write paddings. */
  230. for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
  231. pstr->wcs[byte_idx++] = WEOF;
  232. }
  233. pstr->valid_len = byte_idx;
  234. pstr->valid_raw_len = byte_idx;
  235. }
  236. /* Build wide character buffer PSTR->WCS like build_wcs_buffer,
  237. but for REG_ICASE. */
  238. static reg_errcode_t
  239. internal_function __attribute_warn_unused_result__
  240. build_wcs_upper_buffer (re_string_t *pstr)
  241. {
  242. mbstate_t prev_st;
  243. Idx src_idx, byte_idx, end_idx, remain_len;
  244. size_t mbclen;
  245. #ifdef _LIBC
  246. char buf[MB_LEN_MAX];
  247. assert (MB_LEN_MAX >= pstr->mb_cur_max);
  248. #else
  249. char buf[64];
  250. #endif
  251. byte_idx = pstr->valid_len;
  252. end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
  253. /* The following optimization assumes that ASCII characters can be
  254. mapped to wide characters with a simple cast. */
  255. if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
  256. {
  257. while (byte_idx < end_idx)
  258. {
  259. wchar_t wc;
  260. if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
  261. && mbsinit (&pstr->cur_state))
  262. {
  263. /* In case of a singlebyte character. */
  264. pstr->mbs[byte_idx]
  265. = toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
  266. /* The next step uses the assumption that wchar_t is encoded
  267. ASCII-safe: all ASCII values can be converted like this. */
  268. pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
  269. ++byte_idx;
  270. continue;
  271. }
  272. remain_len = end_idx - byte_idx;
  273. prev_st = pstr->cur_state;
  274. mbclen = __mbrtowc (&wc,
  275. ((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
  276. + byte_idx), remain_len, &pstr->cur_state);
  277. if (BE (mbclen < (size_t) -2, 1))
  278. {
  279. wchar_t wcu = wc;
  280. if (iswlower (wc))
  281. {
  282. size_t mbcdlen;
  283. wcu = towupper (wc);
  284. mbcdlen = wcrtomb (buf, wcu, &prev_st);
  285. if (BE (mbclen == mbcdlen, 1))
  286. memcpy (pstr->mbs + byte_idx, buf, mbclen);
  287. else
  288. {
  289. src_idx = byte_idx;
  290. goto offsets_needed;
  291. }
  292. }
  293. else
  294. memcpy (pstr->mbs + byte_idx,
  295. pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
  296. pstr->wcs[byte_idx++] = wcu;
  297. /* Write paddings. */
  298. for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
  299. pstr->wcs[byte_idx++] = WEOF;
  300. }
  301. else if (mbclen == (size_t) -1 || mbclen == 0)
  302. {
  303. /* It is an invalid character or '\0'. Just use the byte. */
  304. int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
  305. pstr->mbs[byte_idx] = ch;
  306. /* And also cast it to wide char. */
  307. pstr->wcs[byte_idx++] = (wchar_t) ch;
  308. if (BE (mbclen == (size_t) -1, 0))
  309. pstr->cur_state = prev_st;
  310. }
  311. else
  312. {
  313. /* The buffer doesn't have enough space, finish to build. */
  314. pstr->cur_state = prev_st;
  315. break;
  316. }
  317. }
  318. pstr->valid_len = byte_idx;
  319. pstr->valid_raw_len = byte_idx;
  320. return REG_NOERROR;
  321. }
  322. else
  323. for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
  324. {
  325. wchar_t wc;
  326. const char *p;
  327. offsets_needed:
  328. remain_len = end_idx - byte_idx;
  329. prev_st = pstr->cur_state;
  330. if (BE (pstr->trans != NULL, 0))
  331. {
  332. int i, ch;
  333. for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
  334. {
  335. ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
  336. buf[i] = pstr->trans[ch];
  337. }
  338. p = (const char *) buf;
  339. }
  340. else
  341. p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
  342. mbclen = __mbrtowc (&wc, p, remain_len, &pstr->cur_state);
  343. if (BE (mbclen < (size_t) -2, 1))
  344. {
  345. wchar_t wcu = wc;
  346. if (iswlower (wc))
  347. {
  348. size_t mbcdlen;
  349. wcu = towupper (wc);
  350. mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st);
  351. if (BE (mbclen == mbcdlen, 1))
  352. memcpy (pstr->mbs + byte_idx, buf, mbclen);
  353. else if (mbcdlen != (size_t) -1)
  354. {
  355. size_t i;
  356. if (byte_idx + mbcdlen > pstr->bufs_len)
  357. {
  358. pstr->cur_state = prev_st;
  359. break;
  360. }
  361. if (pstr->offsets == NULL)
  362. {
  363. pstr->offsets = re_malloc (Idx, pstr->bufs_len);
  364. if (pstr->offsets == NULL)
  365. return REG_ESPACE;
  366. }
  367. if (!pstr->offsets_needed)
  368. {
  369. for (i = 0; i < (size_t) byte_idx; ++i)
  370. pstr->offsets[i] = i;
  371. pstr->offsets_needed = 1;
  372. }
  373. memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
  374. pstr->wcs[byte_idx] = wcu;
  375. pstr->offsets[byte_idx] = src_idx;
  376. for (i = 1; i < mbcdlen; ++i)
  377. {
  378. pstr->offsets[byte_idx + i]
  379. = src_idx + (i < mbclen ? i : mbclen - 1);
  380. pstr->wcs[byte_idx + i] = WEOF;
  381. }
  382. pstr->len += mbcdlen - mbclen;
  383. if (pstr->raw_stop > src_idx)
  384. pstr->stop += mbcdlen - mbclen;
  385. end_idx = (pstr->bufs_len > pstr->len)
  386. ? pstr->len : pstr->bufs_len;
  387. byte_idx += mbcdlen;
  388. src_idx += mbclen;
  389. continue;
  390. }
  391. else
  392. memcpy (pstr->mbs + byte_idx, p, mbclen);
  393. }
  394. else
  395. memcpy (pstr->mbs + byte_idx, p, mbclen);
  396. if (BE (pstr->offsets_needed != 0, 0))
  397. {
  398. size_t i;
  399. for (i = 0; i < mbclen; ++i)
  400. pstr->offsets[byte_idx + i] = src_idx + i;
  401. }
  402. src_idx += mbclen;
  403. pstr->wcs[byte_idx++] = wcu;
  404. /* Write paddings. */
  405. for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
  406. pstr->wcs[byte_idx++] = WEOF;
  407. }
  408. else if (mbclen == (size_t) -1 || mbclen == 0)
  409. {
  410. /* It is an invalid character or '\0'. Just use the byte. */
  411. int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
  412. if (BE (pstr->trans != NULL, 0))
  413. ch = pstr->trans [ch];
  414. pstr->mbs[byte_idx] = ch;
  415. if (BE (pstr->offsets_needed != 0, 0))
  416. pstr->offsets[byte_idx] = src_idx;
  417. ++src_idx;
  418. /* And also cast it to wide char. */
  419. pstr->wcs[byte_idx++] = (wchar_t) ch;
  420. if (BE (mbclen == (size_t) -1, 0))
  421. pstr->cur_state = prev_st;
  422. }
  423. else
  424. {
  425. /* The buffer doesn't have enough space, finish to build. */
  426. pstr->cur_state = prev_st;
  427. break;
  428. }
  429. }
  430. pstr->valid_len = byte_idx;
  431. pstr->valid_raw_len = src_idx;
  432. return REG_NOERROR;
  433. }
  434. /* Skip characters until the index becomes greater than NEW_RAW_IDX.
  435. Return the index. */
  436. static Idx
  437. internal_function
  438. re_string_skip_chars (re_string_t *pstr, Idx new_raw_idx, wint_t *last_wc)
  439. {
  440. mbstate_t prev_st;
  441. Idx rawbuf_idx;
  442. size_t mbclen;
  443. wint_t wc = WEOF;
  444. /* Skip the characters which are not necessary to check. */
  445. for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
  446. rawbuf_idx < new_raw_idx;)
  447. {
  448. wchar_t wc2;
  449. Idx remain_len;
  450. remain_len = pstr->len - rawbuf_idx;
  451. prev_st = pstr->cur_state;
  452. mbclen = __mbrtowc (&wc2, (const char *) pstr->raw_mbs + rawbuf_idx,
  453. remain_len, &pstr->cur_state);
  454. if (BE (mbclen == (size_t) -2 || mbclen == (size_t) -1 || mbclen == 0, 0))
  455. {
  456. /* We treat these cases as a single byte character. */
  457. if (mbclen == 0 || remain_len == 0)
  458. wc = L'\0';
  459. else
  460. wc = *(unsigned char *) (pstr->raw_mbs + rawbuf_idx);
  461. mbclen = 1;
  462. pstr->cur_state = prev_st;
  463. }
  464. else
  465. wc = wc2;
  466. /* Then proceed the next character. */
  467. rawbuf_idx += mbclen;
  468. }
  469. *last_wc = wc;
  470. return rawbuf_idx;
  471. }
  472. #endif /* RE_ENABLE_I18N */
  473. /* Build the buffer PSTR->MBS, and apply the translation if we need.
  474. This function is used in case of REG_ICASE. */
  475. static void
  476. internal_function
  477. build_upper_buffer (re_string_t *pstr)
  478. {
  479. Idx char_idx, end_idx;
  480. end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
  481. for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
  482. {
  483. int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
  484. if (BE (pstr->trans != NULL, 0))
  485. ch = pstr->trans[ch];
  486. if (islower (ch))
  487. pstr->mbs[char_idx] = toupper (ch);
  488. else
  489. pstr->mbs[char_idx] = ch;
  490. }
  491. pstr->valid_len = char_idx;
  492. pstr->valid_raw_len = char_idx;
  493. }
  494. /* Apply TRANS to the buffer in PSTR. */
  495. static void
  496. internal_function
  497. re_string_translate_buffer (re_string_t *pstr)
  498. {
  499. Idx buf_idx, end_idx;
  500. end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
  501. for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
  502. {
  503. int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
  504. pstr->mbs[buf_idx] = pstr->trans[ch];
  505. }
  506. pstr->valid_len = buf_idx;
  507. pstr->valid_raw_len = buf_idx;
  508. }
  509. /* This function re-construct the buffers.
  510. Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
  511. convert to upper case in case of REG_ICASE, apply translation. */
  512. static reg_errcode_t
  513. internal_function __attribute_warn_unused_result__
  514. re_string_reconstruct (re_string_t *pstr, Idx idx, int eflags)
  515. {
  516. Idx offset;
  517. if (BE (pstr->raw_mbs_idx <= idx, 0))
  518. offset = idx - pstr->raw_mbs_idx;
  519. else
  520. {
  521. /* Reset buffer. */
  522. #ifdef RE_ENABLE_I18N
  523. if (pstr->mb_cur_max > 1)
  524. memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
  525. #endif /* RE_ENABLE_I18N */
  526. pstr->len = pstr->raw_len;
  527. pstr->stop = pstr->raw_stop;
  528. pstr->valid_len = 0;
  529. pstr->raw_mbs_idx = 0;
  530. pstr->valid_raw_len = 0;
  531. pstr->offsets_needed = 0;
  532. pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
  533. : CONTEXT_NEWLINE | CONTEXT_BEGBUF);
  534. if (!pstr->mbs_allocated)
  535. pstr->mbs = (unsigned char *) pstr->raw_mbs;
  536. offset = idx;
  537. }
  538. if (BE (offset != 0, 1))
  539. {
  540. /* Should the already checked characters be kept? */
  541. if (BE (offset < pstr->valid_raw_len, 1))
  542. {
  543. /* Yes, move them to the front of the buffer. */
  544. #ifdef RE_ENABLE_I18N
  545. if (BE (pstr->offsets_needed, 0))
  546. {
  547. Idx low = 0, high = pstr->valid_len, mid;
  548. do
  549. {
  550. mid = (high + low) / 2;
  551. if (pstr->offsets[mid] > offset)
  552. high = mid;
  553. else if (pstr->offsets[mid] < offset)
  554. low = mid + 1;
  555. else
  556. break;
  557. }
  558. while (low < high);
  559. if (pstr->offsets[mid] < offset)
  560. ++mid;
  561. pstr->tip_context = re_string_context_at (pstr, mid - 1,
  562. eflags);
  563. /* This can be quite complicated, so handle specially
  564. only the common and easy case where the character with
  565. different length representation of lower and upper
  566. case is present at or after offset. */
  567. if (pstr->valid_len > offset
  568. && mid == offset && pstr->offsets[mid] == offset)
  569. {
  570. memmove (pstr->wcs, pstr->wcs + offset,
  571. (pstr->valid_len - offset) * sizeof (wint_t));
  572. memmove (pstr->mbs, pstr->mbs + offset, pstr->valid_len - offset);
  573. pstr->valid_len -= offset;
  574. pstr->valid_raw_len -= offset;
  575. for (low = 0; low < pstr->valid_len; low++)
  576. pstr->offsets[low] = pstr->offsets[low + offset] - offset;
  577. }
  578. else
  579. {
  580. /* Otherwise, just find out how long the partial multibyte
  581. character at offset is and fill it with WEOF/255. */
  582. pstr->len = pstr->raw_len - idx + offset;
  583. pstr->stop = pstr->raw_stop - idx + offset;
  584. pstr->offsets_needed = 0;
  585. while (mid > 0 && pstr->offsets[mid - 1] == offset)
  586. --mid;
  587. while (mid < pstr->valid_len)
  588. if (pstr->wcs[mid] != WEOF)
  589. break;
  590. else
  591. ++mid;
  592. if (mid == pstr->valid_len)
  593. pstr->valid_len = 0;
  594. else
  595. {
  596. pstr->valid_len = pstr->offsets[mid] - offset;
  597. if (pstr->valid_len)
  598. {
  599. for (low = 0; low < pstr->valid_len; ++low)
  600. pstr->wcs[low] = WEOF;
  601. memset (pstr->mbs, 255, pstr->valid_len);
  602. }
  603. }
  604. pstr->valid_raw_len = pstr->valid_len;
  605. }
  606. }
  607. else
  608. #endif
  609. {
  610. pstr->tip_context = re_string_context_at (pstr, offset - 1,
  611. eflags);
  612. #ifdef RE_ENABLE_I18N
  613. if (pstr->mb_cur_max > 1)
  614. memmove (pstr->wcs, pstr->wcs + offset,
  615. (pstr->valid_len - offset) * sizeof (wint_t));
  616. #endif /* RE_ENABLE_I18N */
  617. if (BE (pstr->mbs_allocated, 0))
  618. memmove (pstr->mbs, pstr->mbs + offset,
  619. pstr->valid_len - offset);
  620. pstr->valid_len -= offset;
  621. pstr->valid_raw_len -= offset;
  622. #if DEBUG
  623. assert (pstr->valid_len > 0);
  624. #endif
  625. }
  626. }
  627. else
  628. {
  629. #ifdef RE_ENABLE_I18N
  630. /* No, skip all characters until IDX. */
  631. Idx prev_valid_len = pstr->valid_len;
  632. if (BE (pstr->offsets_needed, 0))
  633. {
  634. pstr->len = pstr->raw_len - idx + offset;
  635. pstr->stop = pstr->raw_stop - idx + offset;
  636. pstr->offsets_needed = 0;
  637. }
  638. #endif
  639. pstr->valid_len = 0;
  640. #ifdef RE_ENABLE_I18N
  641. if (pstr->mb_cur_max > 1)
  642. {
  643. Idx wcs_idx;
  644. wint_t wc = WEOF;
  645. if (pstr->is_utf8)
  646. {
  647. const unsigned char *raw, *p, *end;
  648. /* Special case UTF-8. Multi-byte chars start with any
  649. byte other than 0x80 - 0xbf. */
  650. raw = pstr->raw_mbs + pstr->raw_mbs_idx;
  651. end = raw + (offset - pstr->mb_cur_max);
  652. if (end < pstr->raw_mbs)
  653. end = pstr->raw_mbs;
  654. p = raw + offset - 1;
  655. #ifdef _LIBC
  656. /* We know the wchar_t encoding is UCS4, so for the simple
  657. case, ASCII characters, skip the conversion step. */
  658. if (isascii (*p) && BE (pstr->trans == NULL, 1))
  659. {
  660. memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
  661. /* pstr->valid_len = 0; */
  662. wc = (wchar_t) *p;
  663. }
  664. else
  665. #endif
  666. for (; p >= end; --p)
  667. if ((*p & 0xc0) != 0x80)
  668. {
  669. mbstate_t cur_state;
  670. wchar_t wc2;
  671. Idx mlen = raw + pstr->len - p;
  672. size_t mbclen;
  673. #if 0 /* dead code: buf is set but never used */
  674. unsigned char buf[6];
  675. if (BE (pstr->trans != NULL, 0))
  676. {
  677. int i = mlen < 6 ? mlen : 6;
  678. while (--i >= 0)
  679. buf[i] = pstr->trans[p[i]];
  680. }
  681. #endif
  682. /* XXX Don't use mbrtowc, we know which conversion
  683. to use (UTF-8 -> UCS4). */
  684. memset (&cur_state, 0, sizeof (cur_state));
  685. mbclen = __mbrtowc (&wc2, (const char *) p, mlen,
  686. &cur_state);
  687. if (raw + offset - p <= mbclen
  688. && mbclen < (size_t) -2)
  689. {
  690. memset (&pstr->cur_state, '\0',
  691. sizeof (mbstate_t));
  692. pstr->valid_len = mbclen - (raw + offset - p);
  693. wc = wc2;
  694. }
  695. break;
  696. }
  697. }
  698. if (wc == WEOF)
  699. pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
  700. if (wc == WEOF)
  701. pstr->tip_context
  702. = re_string_context_at (pstr, prev_valid_len - 1, eflags);
  703. else
  704. pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
  705. && IS_WIDE_WORD_CHAR (wc))
  706. ? CONTEXT_WORD
  707. : ((IS_WIDE_NEWLINE (wc)
  708. && pstr->newline_anchor)
  709. ? CONTEXT_NEWLINE : 0));
  710. if (BE (pstr->valid_len, 0))
  711. {
  712. for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
  713. pstr->wcs[wcs_idx] = WEOF;
  714. if (pstr->mbs_allocated)
  715. memset (pstr->mbs, 255, pstr->valid_len);
  716. }
  717. pstr->valid_raw_len = pstr->valid_len;
  718. }
  719. else
  720. #endif /* RE_ENABLE_I18N */
  721. {
  722. int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
  723. pstr->valid_raw_len = 0;
  724. if (pstr->trans)
  725. c = pstr->trans[c];
  726. pstr->tip_context = (bitset_contain (pstr->word_char, c)
  727. ? CONTEXT_WORD
  728. : ((IS_NEWLINE (c) && pstr->newline_anchor)
  729. ? CONTEXT_NEWLINE : 0));
  730. }
  731. }
  732. if (!BE (pstr->mbs_allocated, 0))
  733. pstr->mbs += offset;
  734. }
  735. pstr->raw_mbs_idx = idx;
  736. pstr->len -= offset;
  737. pstr->stop -= offset;
  738. /* Then build the buffers. */
  739. #ifdef RE_ENABLE_I18N
  740. if (pstr->mb_cur_max > 1)
  741. {
  742. if (pstr->icase)
  743. {
  744. reg_errcode_t ret = build_wcs_upper_buffer (pstr);
  745. if (BE (ret != REG_NOERROR, 0))
  746. return ret;
  747. }
  748. else
  749. build_wcs_buffer (pstr);
  750. }
  751. else
  752. #endif /* RE_ENABLE_I18N */
  753. if (BE (pstr->mbs_allocated, 0))
  754. {
  755. if (pstr->icase)
  756. build_upper_buffer (pstr);
  757. else if (pstr->trans != NULL)
  758. re_string_translate_buffer (pstr);
  759. }
  760. else
  761. pstr->valid_len = pstr->len;
  762. pstr->cur_idx = 0;
  763. return REG_NOERROR;
  764. }
  765. static unsigned char
  766. internal_function __attribute ((pure))
  767. re_string_peek_byte_case (const re_string_t *pstr, Idx idx)
  768. {
  769. int ch;
  770. Idx off;
  771. /* Handle the common (easiest) cases first. */
  772. if (BE (!pstr->mbs_allocated, 1))
  773. return re_string_peek_byte (pstr, idx);
  774. #ifdef RE_ENABLE_I18N
  775. if (pstr->mb_cur_max > 1
  776. && ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
  777. return re_string_peek_byte (pstr, idx);
  778. #endif
  779. off = pstr->cur_idx + idx;
  780. #ifdef RE_ENABLE_I18N
  781. if (pstr->offsets_needed)
  782. off = pstr->offsets[off];
  783. #endif
  784. ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
  785. #ifdef RE_ENABLE_I18N
  786. /* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
  787. this function returns CAPITAL LETTER I instead of first byte of
  788. DOTLESS SMALL LETTER I. The latter would confuse the parser,
  789. since peek_byte_case doesn't advance cur_idx in any way. */
  790. if (pstr->offsets_needed && !isascii (ch))
  791. return re_string_peek_byte (pstr, idx);
  792. #endif
  793. return ch;
  794. }
  795. static unsigned char
  796. internal_function __attribute ((pure))
  797. re_string_fetch_byte_case (re_string_t *pstr)
  798. {
  799. if (BE (!pstr->mbs_allocated, 1))
  800. return re_string_fetch_byte (pstr);
  801. #ifdef RE_ENABLE_I18N
  802. if (pstr->offsets_needed)
  803. {
  804. Idx off;
  805. int ch;
  806. /* For tr_TR.UTF-8 [[:islower:]] there is
  807. [[: CAPITAL LETTER I WITH DOT lower:]] in mbs. Skip
  808. in that case the whole multi-byte character and return
  809. the original letter. On the other side, with
  810. [[: DOTLESS SMALL LETTER I return [[:I, as doing
  811. anything else would complicate things too much. */
  812. if (!re_string_first_byte (pstr, pstr->cur_idx))
  813. return re_string_fetch_byte (pstr);
  814. off = pstr->offsets[pstr->cur_idx];
  815. ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
  816. if (! isascii (ch))
  817. return re_string_fetch_byte (pstr);
  818. re_string_skip_bytes (pstr,
  819. re_string_char_size_at (pstr, pstr->cur_idx));
  820. return ch;
  821. }
  822. #endif
  823. return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
  824. }
  825. static void
  826. internal_function
  827. re_string_destruct (re_string_t *pstr)
  828. {
  829. #ifdef RE_ENABLE_I18N
  830. re_free (pstr->wcs);
  831. re_free (pstr->offsets);
  832. #endif /* RE_ENABLE_I18N */
  833. if (pstr->mbs_allocated)
  834. re_free (pstr->mbs);
  835. }
  836. /* Return the context at IDX in INPUT. */
  837. static unsigned int
  838. internal_function
  839. re_string_context_at (const re_string_t *input, Idx idx, int eflags)
  840. {
  841. int c;
  842. if (BE (! REG_VALID_INDEX (idx), 0))
  843. /* In this case, we use the value stored in input->tip_context,
  844. since we can't know the character in input->mbs[-1] here. */
  845. return input->tip_context;
  846. if (BE (idx == input->len, 0))
  847. return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
  848. : CONTEXT_NEWLINE | CONTEXT_ENDBUF);
  849. #ifdef RE_ENABLE_I18N
  850. if (input->mb_cur_max > 1)
  851. {
  852. wint_t wc;
  853. Idx wc_idx = idx;
  854. while(input->wcs[wc_idx] == WEOF)
  855. {
  856. #ifdef DEBUG
  857. /* It must not happen. */
  858. assert (REG_VALID_INDEX (wc_idx));
  859. #endif
  860. --wc_idx;
  861. if (! REG_VALID_INDEX (wc_idx))
  862. return input->tip_context;
  863. }
  864. wc = input->wcs[wc_idx];
  865. if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc))
  866. return CONTEXT_WORD;
  867. return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
  868. ? CONTEXT_NEWLINE : 0);
  869. }
  870. else
  871. #endif
  872. {
  873. c = re_string_byte_at (input, idx);
  874. if (bitset_contain (input->word_char, c))
  875. return CONTEXT_WORD;
  876. return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
  877. }
  878. }
  879. /* Functions for set operation. */
  880. static reg_errcode_t
  881. internal_function __attribute_warn_unused_result__
  882. re_node_set_alloc (re_node_set *set, Idx size)
  883. {
  884. set->alloc = size;
  885. set->nelem = 0;
  886. set->elems = re_malloc (Idx, size);
  887. if (BE (set->elems == NULL, 0))
  888. return REG_ESPACE;
  889. return REG_NOERROR;
  890. }
  891. static reg_errcode_t
  892. internal_function __attribute_warn_unused_result__
  893. re_node_set_init_1 (re_node_set *set, Idx elem)
  894. {
  895. set->alloc = 1;
  896. set->nelem = 1;
  897. set->elems = re_malloc (Idx, 1);
  898. if (BE (set->elems == NULL, 0))
  899. {
  900. set->alloc = set->nelem = 0;
  901. return REG_ESPACE;
  902. }
  903. set->elems[0] = elem;
  904. return REG_NOERROR;
  905. }
  906. static reg_errcode_t
  907. internal_function __attribute_warn_unused_result__
  908. re_node_set_init_2 (re_node_set *set, Idx elem1, Idx elem2)
  909. {
  910. set->alloc = 2;
  911. set->elems = re_malloc (Idx, 2);
  912. if (BE (set->elems == NULL, 0))
  913. return REG_ESPACE;
  914. if (elem1 == elem2)
  915. {
  916. set->nelem = 1;
  917. set->elems[0] = elem1;
  918. }
  919. else
  920. {
  921. set->nelem = 2;
  922. if (elem1 < elem2)
  923. {
  924. set->elems[0] = elem1;
  925. set->elems[1] = elem2;
  926. }
  927. else
  928. {
  929. set->elems[0] = elem2;
  930. set->elems[1] = elem1;
  931. }
  932. }
  933. return REG_NOERROR;
  934. }
  935. static reg_errcode_t
  936. internal_function __attribute_warn_unused_result__
  937. re_node_set_init_copy (re_node_set *dest, const re_node_set *src)
  938. {
  939. dest->nelem = src->nelem;
  940. if (src->nelem > 0)
  941. {
  942. dest->alloc = dest->nelem;
  943. dest->elems = re_malloc (Idx, dest->alloc);
  944. if (BE (dest->elems == NULL, 0))
  945. {
  946. dest->alloc = dest->nelem = 0;
  947. return REG_ESPACE;
  948. }
  949. memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
  950. }
  951. else
  952. re_node_set_init_empty (dest);
  953. return REG_NOERROR;
  954. }
  955. /* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
  956. DEST. Return value indicate the error code or REG_NOERROR if succeeded.
  957. Note: We assume dest->elems is NULL, when dest->alloc is 0. */
  958. static reg_errcode_t
  959. internal_function __attribute_warn_unused_result__
  960. re_node_set_add_intersect (re_node_set *dest, const re_node_set *src1,
  961. const re_node_set *src2)
  962. {
  963. Idx i1, i2, is, id, delta, sbase;
  964. if (src1->nelem == 0 || src2->nelem == 0)
  965. return REG_NOERROR;
  966. /* We need dest->nelem + 2 * elems_in_intersection; this is a
  967. conservative estimate. */
  968. if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
  969. {
  970. Idx new_alloc = src1->nelem + src2->nelem + dest->alloc;
  971. Idx *new_elems = re_realloc (dest->elems, Idx, new_alloc);
  972. if (BE (new_elems == NULL, 0))
  973. return REG_ESPACE;
  974. dest->elems = new_elems;
  975. dest->alloc = new_alloc;
  976. }
  977. /* Find the items in the intersection of SRC1 and SRC2, and copy
  978. into the top of DEST those that are not already in DEST itself. */
  979. sbase = dest->nelem + src1->nelem + src2->nelem;
  980. i1 = src1->nelem - 1;
  981. i2 = src2->nelem - 1;
  982. id = dest->nelem - 1;
  983. for (;;)
  984. {
  985. if (src1->elems[i1] == src2->elems[i2])
  986. {
  987. /* Try to find the item in DEST. Maybe we could binary search? */
  988. while (REG_VALID_INDEX (id) && dest->elems[id] > src1->elems[i1])
  989. --id;
  990. if (! REG_VALID_INDEX (id) || dest->elems[id] != src1->elems[i1])
  991. dest->elems[--sbase] = src1->elems[i1];
  992. if (! REG_VALID_INDEX (--i1) || ! REG_VALID_INDEX (--i2))
  993. break;
  994. }
  995. /* Lower the highest of the two items. */
  996. else if (src1->elems[i1] < src2->elems[i2])
  997. {
  998. if (! REG_VALID_INDEX (--i2))
  999. break;
  1000. }
  1001. else
  1002. {
  1003. if (! REG_VALID_INDEX (--i1))
  1004. break;
  1005. }
  1006. }
  1007. id = dest->nelem - 1;
  1008. is = dest->nelem + src1->nelem + src2->nelem - 1;
  1009. delta = is - sbase + 1;
  1010. /* Now copy. When DELTA becomes zero, the remaining
  1011. DEST elements are already in place; this is more or
  1012. less the same loop that is in re_node_set_merge. */
  1013. dest->nelem += delta;
  1014. if (delta > 0 && REG_VALID_INDEX (id))
  1015. for (;;)
  1016. {
  1017. if (dest->elems[is] > dest->elems[id])
  1018. {
  1019. /* Copy from the top. */
  1020. dest->elems[id + delta--] = dest->elems[is--];
  1021. if (delta == 0)
  1022. break;
  1023. }
  1024. else
  1025. {
  1026. /* Slide from the bottom. */
  1027. dest->elems[id + delta] = dest->elems[id];
  1028. if (! REG_VALID_INDEX (--id))
  1029. break;
  1030. }
  1031. }
  1032. /* Copy remaining SRC elements. */
  1033. memcpy (dest->elems, dest->elems + sbase, delta * sizeof (Idx));
  1034. return REG_NOERROR;
  1035. }
  1036. /* Calculate the union set of the sets SRC1 and SRC2. And store it to
  1037. DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
  1038. static reg_errcode_t
  1039. internal_function __attribute_warn_unused_result__
  1040. re_node_set_init_union (re_node_set *dest, const re_node_set *src1,
  1041. const re_node_set *src2)
  1042. {
  1043. Idx i1, i2, id;
  1044. if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
  1045. {
  1046. dest->alloc = src1->nelem + src2->nelem;
  1047. dest->elems = re_malloc (Idx, dest->alloc);
  1048. if (BE (dest->elems == NULL, 0))
  1049. return REG_ESPACE;
  1050. }
  1051. else
  1052. {
  1053. if (src1 != NULL && src1->nelem > 0)
  1054. return re_node_set_init_copy (dest, src1);
  1055. else if (src2 != NULL && src2->nelem > 0)
  1056. return re_node_set_init_copy (dest, src2);
  1057. else
  1058. re_node_set_init_empty (dest);
  1059. return REG_NOERROR;
  1060. }
  1061. for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
  1062. {
  1063. if (src1->elems[i1] > src2->elems[i2])
  1064. {
  1065. dest->elems[id++] = src2->elems[i2++];
  1066. continue;
  1067. }
  1068. if (src1->elems[i1] == src2->elems[i2])
  1069. ++i2;
  1070. dest->elems[id++] = src1->elems[i1++];
  1071. }
  1072. if (i1 < src1->nelem)
  1073. {
  1074. memcpy (dest->elems + id, src1->elems + i1,
  1075. (src1->nelem - i1) * sizeof (Idx));
  1076. id += src1->nelem - i1;
  1077. }
  1078. else if (i2 < src2->nelem)
  1079. {
  1080. memcpy (dest->elems + id, src2->elems + i2,
  1081. (src2->nelem - i2) * sizeof (Idx));
  1082. id += src2->nelem - i2;
  1083. }
  1084. dest->nelem = id;
  1085. return REG_NOERROR;
  1086. }
  1087. /* Calculate the union set of the sets DEST and SRC. And store it to
  1088. DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
  1089. static reg_errcode_t
  1090. internal_function __attribute_warn_unused_result__
  1091. re_node_set_merge (re_node_set *dest, const re_node_set *src)
  1092. {
  1093. Idx is, id, sbase, delta;
  1094. if (src == NULL || src->nelem == 0)
  1095. return REG_NOERROR;
  1096. if (dest->alloc < 2 * src->nelem + dest->nelem)
  1097. {
  1098. Idx new_alloc = 2 * (src->nelem + dest->alloc);
  1099. Idx *new_buffer = re_realloc (dest->elems, Idx, new_alloc);
  1100. if (BE (new_buffer == NULL, 0))
  1101. return REG_ESPACE;
  1102. dest->elems = new_buffer;
  1103. dest->alloc = new_alloc;
  1104. }
  1105. if (BE (dest->nelem == 0, 0))
  1106. {
  1107. dest->nelem = src->nelem;
  1108. memcpy (dest->elems, src->elems, src->nelem * sizeof (Idx));
  1109. return REG_NOERROR;
  1110. }
  1111. /* Copy into the top of DEST the items of SRC that are not
  1112. found in DEST. Maybe we could binary search in DEST? */
  1113. for (sbase = dest->nelem + 2 * src->nelem,
  1114. is = src->nelem - 1, id = dest->nelem - 1;
  1115. REG_VALID_INDEX (is) && REG_VALID_INDEX (id); )
  1116. {
  1117. if (dest->elems[id] == src->elems[is])
  1118. is--, id--;
  1119. else if (dest->elems[id] < src->elems[is])
  1120. dest->elems[--sbase] = src->elems[is--];
  1121. else /* if (dest->elems[id] > src->elems[is]) */
  1122. --id;
  1123. }
  1124. if (REG_VALID_INDEX (is))
  1125. {
  1126. /* If DEST is exhausted, the remaining items of SRC must be unique. */
  1127. sbase -= is + 1;
  1128. memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (Idx));
  1129. }
  1130. id = dest->nelem - 1;
  1131. is = dest->nelem + 2 * src->nelem - 1;
  1132. delta = is - sbase + 1;
  1133. if (delta == 0)
  1134. return REG_NOERROR;
  1135. /* Now copy. When DELTA becomes zero, the remaining
  1136. DEST elements are already in place. */
  1137. dest->nelem += delta;
  1138. for (;;)
  1139. {
  1140. if (dest->elems[is] > dest->elems[id])
  1141. {
  1142. /* Copy from the top. */
  1143. dest->elems[id + delta--] = dest->elems[is--];
  1144. if (delta == 0)
  1145. break;
  1146. }
  1147. else
  1148. {
  1149. /* Slide from the bottom. */
  1150. dest->elems[id + delta] = dest->elems[id];
  1151. if (! REG_VALID_INDEX (--id))
  1152. {
  1153. /* Copy remaining SRC elements. */
  1154. memcpy (dest->elems, dest->elems + sbase,
  1155. delta * sizeof (Idx));
  1156. break;
  1157. }
  1158. }
  1159. }
  1160. return REG_NOERROR;
  1161. }
  1162. /* Insert the new element ELEM to the re_node_set* SET.
  1163. SET should not already have ELEM.
  1164. Return true if successful. */
  1165. static bool
  1166. internal_function __attribute_warn_unused_result__
  1167. re_node_set_insert (re_node_set *set, Idx elem)
  1168. {
  1169. Idx idx;
  1170. /* In case the set is empty. */
  1171. if (set->alloc == 0)
  1172. return BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1);
  1173. if (BE (set->nelem, 0) == 0)
  1174. {
  1175. /* We already guaranteed above that set->alloc != 0. */
  1176. set->elems[0] = elem;
  1177. ++set->nelem;
  1178. return true;
  1179. }
  1180. /* Realloc if we need. */
  1181. if (set->alloc == set->nelem)
  1182. {
  1183. Idx *new_elems;
  1184. set->alloc = set->alloc * 2;
  1185. new_elems = re_realloc (set->elems, Idx, set->alloc);
  1186. if (BE (new_elems == NULL, 0))
  1187. return false;
  1188. set->elems = new_elems;
  1189. }
  1190. /* Move the elements which follows the new element. Test the
  1191. first element separately to skip a check in the inner loop. */
  1192. if (elem < set->elems[0])
  1193. {
  1194. idx = 0;
  1195. for (idx = set->nelem; idx > 0; idx--)
  1196. set->elems[idx] = set->elems[idx - 1];
  1197. }
  1198. else
  1199. {
  1200. for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
  1201. set->elems[idx] = set->elems[idx - 1];
  1202. }
  1203. /* Insert the new element. */
  1204. set->elems[idx] = elem;
  1205. ++set->nelem;
  1206. return true;
  1207. }
  1208. /* Insert the new element ELEM to the re_node_set* SET.
  1209. SET should not already have any element greater than or equal to ELEM.
  1210. Return true if successful. */
  1211. static bool
  1212. internal_function __attribute_warn_unused_result__
  1213. re_node_set_insert_last (re_node_set *set, Idx elem)
  1214. {
  1215. /* Realloc if we need. */
  1216. if (set->alloc == set->nelem)
  1217. {
  1218. Idx *new_elems;
  1219. set->alloc = (set->alloc + 1) * 2;
  1220. new_elems = re_realloc (set->elems, Idx, set->alloc);
  1221. if (BE (new_elems == NULL, 0))
  1222. return false;
  1223. set->elems = new_elems;
  1224. }
  1225. /* Insert the new element. */
  1226. set->elems[set->nelem++] = elem;
  1227. return true;
  1228. }
  1229. /* Compare two node sets SET1 and SET2.
  1230. Return true if SET1 and SET2 are equivalent. */
  1231. static bool
  1232. internal_function __attribute ((pure))
  1233. re_node_set_compare (const re_node_set *set1, const re_node_set *set2)
  1234. {
  1235. Idx i;
  1236. if (set1 == NULL || set2 == NULL || set1->nelem != set2->nelem)
  1237. return false;
  1238. for (i = set1->nelem ; REG_VALID_INDEX (--i) ; )
  1239. if (set1->elems[i] != set2->elems[i])
  1240. return false;
  1241. return true;
  1242. }
  1243. /* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise. */
  1244. static Idx
  1245. internal_function __attribute ((pure))
  1246. re_node_set_contains (const re_node_set *set, Idx elem)
  1247. {
  1248. __re_size_t idx, right, mid;
  1249. if (! REG_VALID_NONZERO_INDEX (set->nelem))
  1250. return 0;
  1251. /* Binary search the element. */
  1252. idx = 0;
  1253. right = set->nelem - 1;
  1254. while (idx < right)
  1255. {
  1256. mid = (idx + right) / 2;
  1257. if (set->elems[mid] < elem)
  1258. idx = mid + 1;
  1259. else
  1260. right = mid;
  1261. }
  1262. return set->elems[idx] == elem ? idx + 1 : 0;
  1263. }
  1264. static void
  1265. internal_function
  1266. re_node_set_remove_at (re_node_set *set, Idx idx)
  1267. {
  1268. if (idx < 0 || idx >= set->nelem)
  1269. return;
  1270. --set->nelem;
  1271. for (; idx < set->nelem; idx++)
  1272. set->elems[idx] = set->elems[idx + 1];
  1273. }
  1274. /* Add the token TOKEN to dfa->nodes, and return the index of the token.
  1275. Or return REG_MISSING if an error occurred. */
  1276. static Idx
  1277. internal_function
  1278. re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
  1279. {
  1280. if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
  1281. {
  1282. size_t new_nodes_alloc = dfa->nodes_alloc * 2;
  1283. Idx *new_nexts, *new_indices;
  1284. re_node_set *new_edests, *new_eclosures;
  1285. re_token_t *new_nodes;
  1286. size_t max_object_size =
  1287. MAX (sizeof (re_token_t),
  1288. MAX (sizeof (re_node_set),
  1289. sizeof (Idx)));
  1290. /* Avoid overflows. */
  1291. if (BE (SIZE_MAX / 2 / max_object_size < dfa->nodes_alloc, 0))
  1292. return REG_MISSING;
  1293. new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc);
  1294. if (BE (new_nodes == NULL, 0))
  1295. return REG_MISSING;
  1296. dfa->nodes = new_nodes;
  1297. new_nexts = re_realloc (dfa->nexts, Idx, new_nodes_alloc);
  1298. new_indices = re_realloc (dfa->org_indices, Idx, new_nodes_alloc);
  1299. new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
  1300. new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
  1301. if (BE (new_nexts == NULL || new_indices == NULL
  1302. || new_edests == NULL || new_eclosures == NULL, 0))
  1303. return REG_MISSING;
  1304. dfa->nexts = new_nexts;
  1305. dfa->org_indices = new_indices;
  1306. dfa->edests = new_edests;
  1307. dfa->eclosures = new_eclosures;
  1308. dfa->nodes_alloc = new_nodes_alloc;
  1309. }
  1310. dfa->nodes[dfa->nodes_len] = token;
  1311. dfa->nodes[dfa->nodes_len].constraint = 0;
  1312. #ifdef RE_ENABLE_I18N
  1313. {
  1314. int type = token.type;
  1315. dfa->nodes[dfa->nodes_len].accept_mb =
  1316. (type == OP_PERIOD && dfa->mb_cur_max > 1) || type == COMPLEX_BRACKET;
  1317. }
  1318. #endif
  1319. dfa->nexts[dfa->nodes_len] = REG_MISSING;
  1320. re_node_set_init_empty (dfa->edests + dfa->nodes_len);
  1321. re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
  1322. return dfa->nodes_len++;
  1323. }
  1324. static inline re_hashval_t
  1325. internal_function
  1326. calc_state_hash (const re_node_set *nodes, unsigned int context)
  1327. {
  1328. re_hashval_t hash = nodes->nelem + context;
  1329. Idx i;
  1330. for (i = 0 ; i < nodes->nelem ; i++)
  1331. hash += nodes->elems[i];
  1332. return hash;
  1333. }
  1334. /* Search for the state whose node_set is equivalent to NODES.
  1335. Return the pointer to the state, if we found it in the DFA.
  1336. Otherwise create the new one and return it. In case of an error
  1337. return NULL and set the error code in ERR.
  1338. Note: - We assume NULL as the invalid state, then it is possible that
  1339. return value is NULL and ERR is REG_NOERROR.
  1340. - We never return non-NULL value in case of any errors, it is for
  1341. optimization. */
  1342. static re_dfastate_t *
  1343. internal_function __attribute_warn_unused_result__
  1344. re_acquire_state (reg_errcode_t *err, const re_dfa_t *dfa,
  1345. const re_node_set *nodes)
  1346. {
  1347. re_hashval_t hash;
  1348. re_dfastate_t *new_state;
  1349. struct re_state_table_entry *spot;
  1350. Idx i;
  1351. #ifdef lint
  1352. /* Suppress bogus uninitialized-variable warnings. */
  1353. *err = REG_NOERROR;
  1354. #endif
  1355. if (BE (nodes->nelem == 0, 0))
  1356. {
  1357. *err = REG_NOERROR;
  1358. return NULL;
  1359. }
  1360. hash = calc_state_hash (nodes, 0);
  1361. spot = dfa->state_table + (hash & dfa->state_hash_mask);
  1362. for (i = 0 ; i < spot->num ; i++)
  1363. {
  1364. re_dfastate_t *state = spot->array[i];
  1365. if (hash != state->hash)
  1366. continue;
  1367. if (re_node_set_compare (&state->nodes, nodes))
  1368. return state;
  1369. }
  1370. /* There are no appropriate state in the dfa, create the new one. */
  1371. new_state = create_ci_newstate (dfa, nodes, hash);
  1372. if (BE (new_state == NULL, 0))
  1373. *err = REG_ESPACE;
  1374. return new_state;
  1375. }
  1376. /* Search for the state whose node_set is equivalent to NODES and
  1377. whose context is equivalent to CONTEXT.
  1378. Return the pointer to the state, if we found it in the DFA.
  1379. Otherwise create the new one and return it. In case of an error
  1380. return NULL and set the error code in ERR.
  1381. Note: - We assume NULL as the invalid state, then it is possible that
  1382. return value is NULL and ERR is REG_NOERROR.
  1383. - We never return non-NULL value in case of any errors, it is for
  1384. optimization. */
  1385. static re_dfastate_t *
  1386. internal_function __attribute_warn_unused_result__
  1387. re_acquire_state_context (reg_errcode_t *err, const re_dfa_t *dfa,
  1388. const re_node_set *nodes, unsigned int context)
  1389. {
  1390. re_hashval_t hash;
  1391. re_dfastate_t *new_state;
  1392. struct re_state_table_entry *spot;
  1393. Idx i;
  1394. #ifdef lint
  1395. /* Suppress bogus uninitialized-variable warnings. */
  1396. *err = REG_NOERROR;
  1397. #endif
  1398. if (nodes->nelem == 0)
  1399. {
  1400. *err = REG_NOERROR;
  1401. return NULL;
  1402. }
  1403. hash = calc_state_hash (nodes, context);
  1404. spot = dfa->state_table + (hash & dfa->state_hash_mask);
  1405. for (i = 0 ; i < spot->num ; i++)
  1406. {
  1407. re_dfastate_t *state = spot->array[i];
  1408. if (state->hash == hash
  1409. && state->context == context
  1410. && re_node_set_compare (state->entrance_nodes, nodes))
  1411. return state;
  1412. }
  1413. /* There are no appropriate state in `dfa', create the new one. */
  1414. new_state = create_cd_newstate (dfa, nodes, context, hash);
  1415. if (BE (new_state == NULL, 0))
  1416. *err = REG_ESPACE;
  1417. return new_state;
  1418. }
  1419. /* Finish initialization of the new state NEWSTATE, and using its hash value
  1420. HASH put in the appropriate bucket of DFA's state table. Return value
  1421. indicates the error code if failed. */
  1422. static reg_errcode_t
  1423. __attribute_warn_unused_result__
  1424. register_state (const re_dfa_t *dfa, re_dfastate_t *newstate,
  1425. re_hashval_t hash)
  1426. {
  1427. struct re_state_table_entry *spot;
  1428. reg_errcode_t err;
  1429. Idx i;
  1430. newstate->hash = hash;
  1431. err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
  1432. if (BE (err != REG_NOERROR, 0))
  1433. return REG_ESPACE;
  1434. for (i = 0; i < newstate->nodes.nelem; i++)
  1435. {
  1436. Idx elem = newstate->nodes.elems[i];
  1437. if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
  1438. if (BE (! re_node_set_insert_last (&newstate->non_eps_nodes, elem), 0))
  1439. return REG_ESPACE;
  1440. }
  1441. spot = dfa->state_table + (hash & dfa->state_hash_mask);
  1442. if (BE (spot->alloc <= spot->num, 0))
  1443. {
  1444. Idx new_alloc = 2 * spot->num + 2;
  1445. re_dfastate_t **new_array = re_realloc (spot->array, re_dfastate_t *,
  1446. new_alloc);
  1447. if (BE (new_array == NULL, 0))
  1448. return REG_ESPACE;
  1449. spot->array = new_array;
  1450. spot->alloc = new_alloc;
  1451. }
  1452. spot->array[spot->num++] = newstate;
  1453. return REG_NOERROR;
  1454. }
  1455. static void
  1456. free_state (re_dfastate_t *state)
  1457. {
  1458. re_node_set_free (&state->non_eps_nodes);
  1459. re_node_set_free (&state->inveclosure);
  1460. if (state->entrance_nodes != &state->nodes)
  1461. {
  1462. re_node_set_free (state->entrance_nodes);
  1463. re_free (state->entrance_nodes);
  1464. }
  1465. re_node_set_free (&state->nodes);
  1466. re_free (state->word_trtable);
  1467. re_free (state->trtable);
  1468. re_free (state);
  1469. }
  1470. /* Create the new state which is independ of contexts.
  1471. Return the new state if succeeded, otherwise return NULL. */
  1472. static re_dfastate_t *
  1473. internal_function __attribute_warn_unused_result__
  1474. create_ci_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
  1475. re_hashval_t hash)
  1476. {
  1477. Idx i;
  1478. reg_errcode_t err;
  1479. re_dfastate_t *newstate;
  1480. newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
  1481. if (BE (newstate == NULL, 0))
  1482. return NULL;
  1483. err = re_node_set_init_copy (&newstate->nodes, nodes);
  1484. if (BE (err != REG_NOERROR, 0))
  1485. {
  1486. re_free (newstate);
  1487. return NULL;
  1488. }
  1489. newstate->entrance_nodes = &newstate->nodes;
  1490. for (i = 0 ; i < nodes->nelem ; i++)
  1491. {
  1492. re_token_t *node = dfa->nodes + nodes->elems[i];
  1493. re_token_type_t type = node->type;
  1494. if (type == CHARACTER && !node->constraint)
  1495. continue;
  1496. #ifdef RE_ENABLE_I18N
  1497. newstate->accept_mb |= node->accept_mb;
  1498. #endif /* RE_ENABLE_I18N */
  1499. /* If the state has the halt node, the state is a halt state. */
  1500. if (type == END_OF_RE)
  1501. newstate->halt = 1;
  1502. else if (type == OP_BACK_REF)
  1503. newstate->has_backref = 1;
  1504. else if (type == ANCHOR || node->constraint)
  1505. newstate->has_constraint = 1;
  1506. }
  1507. err = register_state (dfa, newstate, hash);
  1508. if (BE (err != REG_NOERROR, 0))
  1509. {
  1510. free_state (newstate);
  1511. newstate = NULL;
  1512. }
  1513. return newstate;
  1514. }
  1515. /* Create the new state which is depend on the context CONTEXT.
  1516. Return the new state if succeeded, otherwise return NULL. */
  1517. static re_dfastate_t *
  1518. internal_function __attribute_warn_unused_result__
  1519. create_cd_newstate (const re_dfa_t *dfa, const re_node_set *nodes,
  1520. unsigned int context, re_hashval_t hash)
  1521. {
  1522. Idx i, nctx_nodes = 0;
  1523. reg_errcode_t err;
  1524. re_dfastate_t *newstate;
  1525. newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
  1526. if (BE (newstate == NULL, 0))
  1527. return NULL;
  1528. err = re_node_set_init_copy (&newstate->nodes, nodes);
  1529. if (BE (err != REG_NOERROR, 0))
  1530. {
  1531. re_free (newstate);
  1532. return NULL;
  1533. }
  1534. newstate->context = context;
  1535. newstate->entrance_nodes = &newstate->nodes;
  1536. for (i = 0 ; i < nodes->nelem ; i++)
  1537. {
  1538. re_token_t *node = dfa->nodes + nodes->elems[i];
  1539. re_token_type_t type = node->type;
  1540. unsigned int constraint = node->constraint;
  1541. if (type == CHARACTER && !constraint)
  1542. continue;
  1543. #ifdef RE_ENABLE_I18N
  1544. newstate->accept_mb |= node->accept_mb;
  1545. #endif /* RE_ENABLE_I18N */
  1546. /* If the state has the halt node, the state is a halt state. */
  1547. if (type == END_OF_RE)
  1548. newstate->halt = 1;
  1549. else if (type == OP_BACK_REF)
  1550. newstate->has_backref = 1;
  1551. if (constraint)
  1552. {
  1553. if (newstate->entrance_nodes == &newstate->nodes)
  1554. {
  1555. newstate->entrance_nodes = re_malloc (re_node_set, 1);
  1556. if (BE (newstate->entrance_nodes == NULL, 0))
  1557. {
  1558. free_state (newstate);
  1559. return NULL;
  1560. }
  1561. if (re_node_set_init_copy (newstate->entrance_nodes, nodes)
  1562. != REG_NOERROR)
  1563. return NULL;
  1564. nctx_nodes = 0;
  1565. newstate->has_constraint = 1;
  1566. }
  1567. if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
  1568. {
  1569. re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
  1570. ++nctx_nodes;
  1571. }
  1572. }
  1573. }
  1574. err = register_state (dfa, newstate, hash);
  1575. if (BE (err != REG_NOERROR, 0))
  1576. {
  1577. free_state (newstate);
  1578. newstate = NULL;
  1579. }
  1580. return newstate;
  1581. }